Commit adc19ac5 authored by Marcel van Lohuizen's avatar Marcel van Lohuizen

exp/locale/collate: adjusted contraction trie to support Myanmar (Burmese),

which has a rather large contraction table. The value of the next state
offset now starts after the current block, instead of before.  This is
slightly less efficient (on extra addition per state change), but gives
some extra range for the offsets.
Also introduced constants for final (0) and noIndex (0xFF).
tables.go is updated in a separate CL.

R=r
CC=golang-dev
https://golang.org/cl/6346092
parent 656b192c
......@@ -41,6 +41,11 @@ import (
// also includes the length and offset to the next sequence of entries
// to check in case of a match.
const (
final = 0
noIndex = 0xFF
)
// ctEntry associates to a matching byte an offset and/or next sequence of
// bytes to check. A ctEntry c is called final if a match means that the
// longest suffix has been found. An entry c is final if c.n == 0.
......@@ -50,24 +55,24 @@ import (
// Examples:
// The suffix strings "ab" and "ac" can be represented as:
// []ctEntry{
// {'a', 1, 1, 0xFF}, // 'a' by itself does not match, so i is 0xFF.
// {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF.
// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2
// }
//
// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
// []ctEntry{
// {'a', 1, 1, 0xFF}, // 'a' must be followed by 'b'.
// {'b', 2, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
// {'d', 'd', 0, 3}, // "abd" -> 3
// {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
// {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
// {'d', 'd', final, 3}, // "abd" -> 3
// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'.
// {'d', 'd', 0, 4}, // "abcd" -> 4
// {'d', 'd', final, 4}, // "abcd" -> 4
// }
// See genStateTests in contract_test.go for more examples.
type ctEntry struct {
l uint8 // non-final: byte value to match; final: lowest match in range.
h uint8 // non-final: relative index to next block; final: highest match in range.
n uint8 // non-final: length of next block; final: 0
i uint8 // result offset. Will be 0xFF if more bytes are needed to complete.
n uint8 // non-final: length of next block; final: final
i uint8 // result offset. Will be noIndex if more bytes are needed to complete.
}
// contractTrieSet holds a set of contraction tries. The tries are stored
......@@ -124,7 +129,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
}
}
if !added {
*ct = append(*ct, ctEntry{l: c, i: 0xFF})
*ct = append(*ct, ctEntry{l: c, i: noIndex})
}
} else {
for j := len(*ct) - 1; j >= start; j-- {
......@@ -140,7 +145,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
}
}
if !added {
*ct = append(*ct, ctEntry{l: c, h: c, i: uint8(si.index)})
*ct = append(*ct, ctEntry{l: c, h: c, n: final, i: uint8(si.index)})
}
}
}
......@@ -150,7 +155,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) {
for i, end := start, len(*ct); i < end; i++ {
fe := (*ct)[i]
if fe.h == 0 { // uninitialized non-final
ln := len(*ct) - start
ln := len(*ct) - start - n
if ln > 0xFF {
return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
}
......@@ -238,16 +243,16 @@ func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) {
if c >= e.l {
p++
if e.l == c {
if e.i != 0xFF {
if e.i != noIndex {
index, ns = int(e.i), p
}
if e.n != 0 {
if e.n != final {
// set to new state
i, states, n = 0, states[e.h:], int(e.n)
i, states, n = 0, states[int(e.h)+n:], int(e.n)
} else {
return
}
} else if e.n == 0 && c <= e.h {
} else if e.n == final && c <= e.h {
return int(c-e.l) + int(e.i), p
}
} else {
......
......@@ -111,9 +111,9 @@ var genStateTests = []GenStateTest{
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'c', 'c', 0, 1},
{'a', 0, 1, noIndex},
{'b', 0, 1, noIndex},
{'c', 'c', final, 1},
},
},
{[]stridx{
......@@ -123,9 +123,9 @@ var genStateTests = []GenStateTest{
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'c', 'e', 0, 1},
{'a', 0, 1, noIndex},
{'b', 0, 1, noIndex},
{'c', 'e', final, 1},
},
},
{[]stridx{
......@@ -135,9 +135,9 @@ var genStateTests = []GenStateTest{
},
1,
contractTrieSet{
{'a', 1, 1, 3},
{'b', 1, 1, 2},
{'c', 'c', 0, 1},
{'a', 0, 1, 3},
{'b', 0, 1, 2},
{'c', 'c', final, 1},
},
},
{[]stridx{
......@@ -150,11 +150,11 @@ var genStateTests = []GenStateTest{
},
2,
contractTrieSet{
{'b', 'b', 0, 6},
{'a', 2, 2, 5},
{'c', 'c', 0, 4},
{'b', 2, 1, 3},
{'c', 'd', 0, 1},
{'b', 'b', final, 6},
{'a', 0, 2, 5},
{'c', 'c', final, 4},
{'b', 0, 1, 3},
{'c', 'd', final, 1},
},
},
{[]stridx{
......@@ -168,14 +168,14 @@ var genStateTests = []GenStateTest{
},
2,
contractTrieSet{
{'b', 5, 1, 0xFF},
{'a', 2, 1, 0xFF},
{'b', 1, 1, 6},
{'c', 1, 1, 4},
{'d', 'd', 0, 1},
{'c', 1, 1, 7},
{'d', 1, 1, 5},
{'e', 'f', 0, 2},
{'b', 3, 1, noIndex},
{'a', 0, 1, noIndex},
{'b', 0, 1, 6},
{'c', 0, 1, 4},
{'d', 'd', final, 1},
{'c', 0, 1, 7},
{'d', 0, 1, 5},
{'e', 'f', final, 2},
},
},
}
......@@ -251,13 +251,13 @@ func TestPrintContractionTrieSet(t *testing.T) {
const contractTrieOutput = `// testCTEntries: 8 entries, 32 bytes
var testCTEntries = [8]struct{l,h,n,i uint8}{
{0x62, 0x5, 1, 255},
{0x61, 0x2, 1, 255},
{0x62, 0x1, 1, 6},
{0x63, 0x1, 1, 4},
{0x62, 0x3, 1, 255},
{0x61, 0x0, 1, 255},
{0x62, 0x0, 1, 6},
{0x63, 0x0, 1, 4},
{0x64, 0x64, 0, 1},
{0x63, 0x1, 1, 7},
{0x64, 0x1, 1, 5},
{0x63, 0x0, 1, 7},
{0x64, 0x0, 1, 5},
{0x65, 0x66, 0, 2},
}
var testContractTrieSet = contractTrieSet( testCTEntries[:] )
......
......@@ -37,6 +37,11 @@ func (s *ctScanner) result() (i, p int) {
return s.index, s.pindex
}
const (
final = 0
noIndex = 0xFF
)
// scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed.
func (s *ctScanner) scan(p int) int {
......@@ -53,12 +58,12 @@ func (s *ctScanner) scan(p int) int {
if c >= e.l {
if e.l == c {
p++
if e.i != 0xFF {
if e.i != noIndex {
s.index = int(e.i)
s.pindex = p
}
if e.n != 0 {
i, states, n = 0, states[e.h:], int(e.n)
if e.n != final {
i, states, n = 0, states[int(e.h)+n:], int(e.n)
if p >= len(str) || utf8.RuneStart(str[p]) {
s.states, s.n, pr = states, n, p
}
......@@ -67,7 +72,7 @@ func (s *ctScanner) scan(p int) int {
return p
}
continue
} else if e.n == 0 && c <= e.h {
} else if e.n == final && c <= e.h {
p++
s.done = true
s.index = int(c-e.l) + int(e.i)
......
......@@ -30,8 +30,8 @@ var lookupTests = []LookupTest{
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'a', 0, 1, 0xFF},
{'b', 0, 1, 0xFF},
{'c', 'c', 0, 1},
},
},
......@@ -46,8 +46,8 @@ var lookupTests = []LookupTest{
},
1,
contractTrieSet{
{'a', 1, 1, 0xFF},
{'b', 1, 1, 0xFF},
{'a', 0, 1, 0xFF},
{'b', 0, 1, 0xFF},
{'c', 'e', 0, 1},
},
},
......@@ -60,8 +60,8 @@ var lookupTests = []LookupTest{
},
1,
contractTrieSet{
{'a', 1, 1, 3},
{'b', 1, 1, 2},
{'a', 0, 1, 3},
{'b', 0, 1, 2},
{'c', 'c', 0, 1},
},
},
......@@ -77,9 +77,9 @@ var lookupTests = []LookupTest{
2,
contractTrieSet{
{'b', 'b', 0, 6},
{'a', 2, 2, 5},
{'a', 0, 2, 5},
{'c', 'c', 0, 4},
{'b', 2, 1, 3},
{'b', 0, 1, 3},
{'c', 'd', 0, 1},
},
},
......@@ -94,13 +94,13 @@ var lookupTests = []LookupTest{
},
2,
contractTrieSet{
{'b', 5, 1, 0xFF},
{'a', 2, 1, 0xFF},
{'b', 1, 1, 6},
{'c', 1, 1, 4},
{'b', 3, 1, 0xFF},
{'a', 0, 1, 0xFF},
{'b', 0, 1, 6},
{'c', 0, 1, 4},
{'d', 'd', 0, 1},
{'c', 1, 1, 7},
{'d', 1, 1, 5},
{'c', 0, 1, 7},
{'d', 0, 1, 5},
{'e', 'f', 0, 2},
},
},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment