exp/locale/collate: fixed two bugs uncovered by regression tests.

The first bug was that tertiary ignorables had the same colElem as implicit colElems, yielding unexpected results. The current encoding ensures that a non-implicit colElem is never 0. This fix uncovered another bug in the trie that indexed incorrectly into the null block. This was caused by an unfinished optimization that would avoid the need to max out the most-significant bits of continuation bytes. This bug was also present in the trie used in exp/norm and has been fixed there as well. The appearance of the bug was rare, as the lower blocks happened to be nearly nil. R=r CC=golang-dev https://golang.org/cl/6127070

exp/locale/collate: fixed two bugs uncovered by regression tests.
The first bug was that tertiary ignorables had the same colElem as implicit colElems, yielding unexpected results. The current encoding ensures that a non-implicit colElem is never 0. This fix uncovered another bug in the trie that indexed incorrectly into the null block. This was caused by an unfinished optimization that would avoid the need to max out the most-significant bits of continuation bytes. This bug was also present in the trie used in exp/norm and has been fixed there as well. The appearance of the bug was rare, as the lower blocks happened to be nearly nil. R=r CC=golang-dev https://golang.org/cl/6127070
10838165 · Marcel van Lohuizen · 81d96215 · 10838165 · 10838165 · 10838165
Commit 10838165 authored May 02, 2012 by Marcel van Lohuizen
13 changed files
--- a/src/pkg/exp/locale/collate/build/colelem.go
+++ b/src/pkg/exp/locale/collate/build/colelem.go
@@ -25,11 +25,11 @@ const (
 // For normal collation elements, we assume that a collation element either has
 // a primary or non-default secondary value, not both.
 // Collation elements with a primary value are of the form
-// 010ppppp pppppppp pppppppp tttttttt, where
+// 000ppppp pppppppp pppppppp tttttttt, where
 //   - p* is primary collation value
 //   - t* is the tertiary collation value
 // Collation elements with a secondary value are of the form
-// 00000000 ssssssss ssssssss tttttttt, where
+// 01000000 ssssssss ssssssss tttttttt, where
 //   - s* is the secondary collation value
 //   - t* is the tertiary collation value
 const (
@@ -37,7 +37,7 @@ const (
 	maxSecondaryBits = 16
 	maxTertiaryBits  = 8

-	isPrimary = 0x40000000
+	isSecondary = 0x40000000
 )

 func makeCE(weights []int) (uint32, error) {
@@ -57,10 +57,10 @@ func makeCE(weights []int) (uint32, error) {
 			return 0, fmt.Errorf("makeCE: non-default secondary weight for non-zero primary: %X", weights)
 		}
 		ce = uint32(weights[0]<<maxTertiaryBits + weights[2])
-		ce |= isPrimary
 	} else {
 		// secondary weight form
 		ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
+		ce |= isSecondary
 	}
 	return ce, nil
 }
@@ -162,7 +162,6 @@ const (
 // http://unicode.org/reports/tr10/#Implicit_Weights,
 // but preserve the resulting relative ordering of the runes.
 func implicitPrimary(r rune) int {
-
 	if r >= minUnified && r <= maxUnified {
 		// The most common case for CJK.
 		return int(r) + commonUnifiedOffset

--- a/src/pkg/exp/locale/collate/build/colelem_test.go
+++ b/src/pkg/exp/locale/collate/build/colelem_test.go
@@ -29,9 +29,9 @@ func decompCE(in []int) (ce uint32, err error) {
 }

 var ceTests = []ceTest{
-	{normalCE, []int{0, 0, 0}, 000},
-	{normalCE, []int{0, 30, 3}, 0x1E03},
-	{normalCE, []int{100, defaultSecondary, 3}, 0x40006403},
+	{normalCE, []int{0, 0, 0}, 0x40000000},
+	{normalCE, []int{0, 30, 3}, 0x40001E03},
+	{normalCE, []int{100, defaultSecondary, 3}, 0x6403},
 	{normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-default secondary
 	{normalCE, []int{100, 1, 3}, 0xFFFF},
 	{normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},

--- a/src/pkg/exp/locale/collate/build/trie.go
+++ b/src/pkg/exp/locale/collate/build/trie.go
@@ -19,7 +19,10 @@ import (
 	"reflect"
 )

-const blockSize = 64
+const (
+	blockSize   = 64
+	blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes.
+)

 type trie struct {
 	index  []uint16
@@ -102,7 +105,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int64 {
 	if n.isInternal() {
 		v, ok := index.lookupBlockIdx[h]
 		if !ok {
-			v = int64(len(index.lookupBlocks))
+			v = int64(len(index.lookupBlocks)) - blockOffset
 			index.lookupBlocks = append(index.lookupBlocks, n)
 			index.lookupBlockIdx[h] = v
 		}
@@ -110,7 +113,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int64 {
 	} else {
 		v, ok := index.valueBlockIdx[h]
 		if !ok {
-			v = int64(len(index.valueBlocks))
+			v = int64(len(index.valueBlocks)) - blockOffset
 			index.valueBlocks = append(index.valueBlocks, n)
 			index.valueBlockIdx[h] = v
 		}

--- a/src/pkg/exp/locale/collate/build/trie_test.go
+++ b/src/pkg/exp/locale/collate/build/trie_test.go
@@ -79,24 +79,24 @@ var testLookup = [640]uint16 {
 	// Block 0x1, offset 0x40
 	// Block 0x2, offset 0x80
 	// Block 0x3, offset 0xc0
-	0x0c2:0x03, 0x0c4:0x04, 
-	0x0c8:0x05, 
-	0x0df:0x06, 
-	0x0e0:0x04, 
-	0x0ef:0x05, 
-	0x0f0:0x07, 0x0f4:0x09, 
+	0x0c2:0x01, 0x0c4:0x02, 
+	0x0c8:0x03, 
+	0x0df:0x04, 
+	0x0e0:0x02, 
+	0x0ef:0x03, 
+	0x0f0:0x05, 0x0f4:0x07, 
 	// Block 0x4, offset 0x100
-	0x120:0x07, 0x126:0x08, 
+	0x120:0x05, 0x126:0x06, 
 	// Block 0x5, offset 0x140
-	0x17f:0x09, 
+	0x17f:0x07, 
 	// Block 0x6, offset 0x180
-	0x180:0x0a, 0x184:0x0b, 
+	0x180:0x08, 0x184:0x09, 
 	// Block 0x7, offset 0x1c0
-	0x1d0:0x06, 
+	0x1d0:0x04, 
 	// Block 0x8, offset 0x200
-	0x23f:0x0c, 
+	0x23f:0x0a, 
 	// Block 0x9, offset 0x240
-	0x24f:0x08, 
+	0x24f:0x06, 
 }

 var testTrie = trie{ testLookup[:], testValues[:]}

--- a/src/pkg/exp/locale/collate/colelem.go
+++ b/src/pkg/exp/locale/collate/colelem.go
@@ -68,17 +68,18 @@ func (ce colElem) ctype() ceType {
 // For normal collation elements, we assume that a collation element either has
 // a primary or non-default secondary value, not both.
 // Collation elements with a primary value are of the form
-// 010ppppp pppppppp pppppppp tttttttt, where
+// 000ppppp pppppppp pppppppp tttttttt, where
 //   - p* is primary collation value
 //   - t* is the tertiary collation value
 // Collation elements with a secondary value are of the form
-// 00000000 ssssssss ssssssss tttttttt, where
+// 01000000 ssssssss ssssssss tttttttt, where
 //   - s* is the secondary collation value
 //   - t* is the tertiary collation value
 func splitCE(ce colElem) weights {
+	const secondaryMask = 0x40000000
 	w := weights{}
 	w.tertiary = uint8(ce)
-	if ce&0x40000000 != 0 {
+	if ce&secondaryMask == 0 {
 		// primary weight form
 		w.primary = uint32((ce >> 8) & 0x1FFFFF)
 		w.secondary = defaultSecondary

--- a/src/pkg/exp/locale/collate/colelem_test.go
+++ b/src/pkg/exp/locale/collate/colelem_test.go
@@ -20,14 +20,14 @@ func makeCE(weights []int) colElem {
 		maxPrimaryBits   = 21
 		maxSecondaryBits = 16
 		maxTertiaryBits  = 8
-		isPrimary        = 0x40000000
+		isSecondary      = 0x40000000
 	)
 	var ce colElem
 	if weights[0] != 0 {
 		ce = colElem(weights[0]<<maxTertiaryBits + weights[2])
-		ce |= isPrimary
 	} else {
 		ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
+		ce |= isSecondary
 	}
 	return ce
 }

--- a/src/pkg/exp/locale/collate/trie.go
+++ b/src/pkg/exp/locale/collate/trie.go
@@ -27,15 +27,10 @@ const (
 	t5 = 0xF8 // 1111 1000
 	t6 = 0xFC // 1111 1100
 	te = 0xFE // 1111 1110
-
-	maskx = 0x3F // 0011 1111
-	mask2 = 0x1F // 0001 1111
-	mask3 = 0x0F // 0000 1111
-	mask4 = 0x07 // 0000 0111
 )

 func (t *trie) lookupValue(n uint16, b byte) colElem {
-	return colElem(t.values[int(n)<<6+int(b&maskx)])
+	return colElem(t.values[int(n)<<6+int(b)])
 }

 // lookup returns the trie value for the first UTF-8 encoding in s and
@@ -67,7 +62,7 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
 		if c1 < tx || t2 <= c1 {
 			return 0, 1
 		}
-		o := int(i)<<6 + int(c1)&maskx
+		o := int(i)<<6 + int(c1)
 		i = t.index[o]
 		c2 := s[2]
 		if c2 < tx || t2 <= c2 {
@@ -83,13 +78,13 @@ func (t *trie) lookup(s []byte) (v colElem, sz int) {
 		if c1 < tx || t2 <= c1 {
 			return 0, 1
 		}
-		o := int(i)<<6 + int(c1)&maskx
+		o := int(i)<<6 + int(c1)
 		i = t.index[o]
 		c2 := s[2]
 		if c2 < tx || t2 <= c2 {
 			return 0, 2
 		}
-		o = int(i)<<6 + int(c2)&maskx
+		o = int(i)<<6 + int(c2)
 		i = t.index[o]
 		c3 := s[3]
 		if c3 < tx || t2 <= c3 {

--- a/src/pkg/exp/locale/collate/trie_test.go
+++ b/src/pkg/exp/locale/collate/trie_test.go
@@ -89,18 +89,18 @@ var testValues = [832]uint32{
 }

 var testLookup = [640]uint16{
-	0x0c2: 0x03, 0x0c4: 0x04,
-	0x0c8: 0x05,
-	0x0df: 0x06,
-	0x0e0: 0x04,
-	0x0ef: 0x05,
-	0x0f0: 0x07, 0x0f4: 0x09,
-	0x120: 0x07, 0x126: 0x08,
-	0x17f: 0x09,
-	0x180: 0x0a, 0x184: 0x0b,
-	0x1d0: 0x06,
-	0x23f: 0x0c,
-	0x24f: 0x08,
+	0x0c2: 0x01, 0x0c4: 0x02,
+	0x0c8: 0x03,
+	0x0df: 0x04,
+	0x0e0: 0x02,
+	0x0ef: 0x03,
+	0x0f0: 0x05, 0x0f4: 0x07,
+	0x120: 0x05, 0x126: 0x06,
+	0x17f: 0x07,
+	0x180: 0x08, 0x184: 0x09,
+	0x1d0: 0x04,
+	0x23f: 0x0a,
+	0x24f: 0x06,
 }

 var testTrie = trie{testLookup[:], testValues[:]}
--- a/src/pkg/exp/norm/tables.go
+++ b/src/pkg/exp/norm/tables.go
--- a/src/pkg/exp/norm/trie.go
+++ b/src/pkg/exp/norm/trie.go
@@ -23,7 +23,7 @@ type trie struct {
 // the value for b is by r.value + (b - r.lo) * stride.
 func (t *trie) lookupValue(n uint8, b byte) uint16 {
 	if n < t.cutoff {
-		return t.values[uint16(n)<<6+uint16(b&maskx)]
+		return t.values[uint16(n)<<6+uint16(b)]
 	}
 	offset := t.sparseOffset[n-t.cutoff]
 	header := t.sparse[offset]
@@ -53,11 +53,6 @@ const (
 	t5 = 0xF8 // 1111 1000
 	t6 = 0xFC // 1111 1100
 	te = 0xFE // 1111 1110
-
-	maskx = 0x3F // 0011 1111
-	mask2 = 0x1F // 0001 1111
-	mask3 = 0x0F // 0000 1111
-	mask4 = 0x07 // 0000 0111
 )

 // lookup returns the trie value for the first UTF-8 encoding in s and
@@ -89,7 +84,7 @@ func (t *trie) lookup(s []byte) (v uint16, sz int) {
 		if c1 < tx || t2 <= c1 {
 			return 0, 1
 		}
-		o := uint16(i)<<6 + uint16(c1)&maskx
+		o := uint16(i)<<6 + uint16(c1)
 		i = t.index[o]
 		c2 := s[2]
 		if c2 < tx || t2 <= c2 {
@@ -105,13 +100,13 @@ func (t *trie) lookup(s []byte) (v uint16, sz int) {
 		if c1 < tx || t2 <= c1 {
 			return 0, 1
 		}
-		o := uint16(i)<<6 + uint16(c1)&maskx
+		o := uint16(i)<<6 + uint16(c1)
 		i = t.index[o]
 		c2 := s[2]
 		if c2 < tx || t2 <= c2 {
 			return 0, 2
 		}
-		o = uint16(i)<<6 + uint16(c2)&maskx
+		o = uint16(i)<<6 + uint16(c2)
 		i = t.index[o]
 		c3 := s[3]
 		if c3 < tx || t2 <= c3 {
@@ -152,7 +147,7 @@ func (t *trie) lookupString(s string) (v uint16, sz int) {
 		if c1 < tx || t2 <= c1 {
 			return 0, 1
 		}
-		o := uint16(i)<<6 + uint16(c1)&maskx
+		o := uint16(i)<<6 + uint16(c1)
 		i = t.index[o]
 		c2 := s[2]
 		if c2 < tx || t2 <= c2 {
@@ -168,13 +163,13 @@ func (t *trie) lookupString(s string) (v uint16, sz int) {
 		if c1 < tx || t2 <= c1 {
 			return 0, 1
 		}
-		o := uint16(i)<<6 + uint16(c1)&maskx
+		o := uint16(i)<<6 + uint16(c1)
 		i = t.index[o]
 		c2 := s[2]
 		if c2 < tx || t2 <= c2 {
 			return 0, 2
 		}
-		o = uint16(i)<<6 + uint16(c2)&maskx
+		o = uint16(i)<<6 + uint16(c2)
 		i = t.index[o]
 		c3 := s[3]
 		if c3 < tx || t2 <= c3 {
@@ -200,11 +195,11 @@ func (t *trie) lookupUnsafe(s []byte) uint16 {
 	if c0 < t3 {
 		return t.lookupValue(i, s[1])
 	}
-	i = t.index[uint16(i)<<6+uint16(s[1])&maskx]
+	i = t.index[uint16(i)<<6+uint16(s[1])]
 	if c0 < t4 {
 		return t.lookupValue(i, s[2])
 	}
-	i = t.index[uint16(i)<<6+uint16(s[2])&maskx]
+	i = t.index[uint16(i)<<6+uint16(s[2])]
 	if c0 < t5 {
 		return t.lookupValue(i, s[3])
 	}
@@ -225,11 +220,11 @@ func (t *trie) lookupStringUnsafe(s string) uint16 {
 	if c0 < t3 {
 		return t.lookupValue(i, s[1])
 	}
-	i = t.index[uint16(i)<<6+uint16(s[1])&maskx]
+	i = t.index[uint16(i)<<6+uint16(s[1])]
 	if c0 < t4 {
 		return t.lookupValue(i, s[2])
 	}
-	i = t.index[uint16(i)<<6+uint16(s[2])&maskx]
+	i = t.index[uint16(i)<<6+uint16(s[2])]
 	if c0 < t5 {
 		return t.lookupValue(i, s[3])
 	}

--- a/src/pkg/exp/norm/trie_test.go
+++ b/src/pkg/exp/norm/trie_test.go
@@ -96,13 +96,17 @@ func TestLookup(t *testing.T) {
 	}
 	for i, tt := range tests {
 		v, sz := testdata.lookup(tt.bytes)
-		if int(v) != 0 {
+		if v != 0 {
 			t.Errorf("lookup of illegal rune, case %d: found value %#x, expected 0", i, v)
 		}
 		if sz != tt.size {
 			t.Errorf("lookup of illegal rune, case %d: found size %d, expected %d", i, sz, tt.size)
 		}
 	}
+	// Verify defaults.
+	if v, _ := testdata.lookup([]byte{0xC1, 0x8C}); v != 0 {
+		t.Errorf("lookup of non-existing rune should be 0; found %X", v)
+	}
 }

 func TestLookupUnsafe(t *testing.T) {

--- a/src/pkg/exp/norm/triedata_test.go
+++ b/src/pkg/exp/norm/triedata_test.go
@@ -4,7 +4,7 @@

 package norm

-var testRunes = []rune{1, 12, 127, 128, 256, 2047, 2048, 2457, 65535, 65536, 65793, 1114111, 512, 513, 514, 528, 533}
+var testRunes = []int32{1, 12, 127, 128, 256, 2047, 2048, 2457, 65535, 65536, 65793, 1114111, 512, 513, 514, 528, 533}

 // testdataValues: 192 entries, 384 bytes
 // Block 2 is the null block.
@@ -62,24 +62,24 @@ var testdataLookup = [640]uint8{
 	// Block 0x1, offset 0x40
 	// Block 0x2, offset 0x80
 	// Block 0x3, offset 0xc0
-	0x0c2: 0x03, 0x0c4: 0x04,
-	0x0c8: 0x05,
-	0x0df: 0x06,
-	0x0e0: 0x04,
-	0x0ef: 0x05,
-	0x0f0: 0x07, 0x0f4: 0x09,
+	0x0c2: 0x01, 0x0c4: 0x02,
+	0x0c8: 0x03,
+	0x0df: 0x04,
+	0x0e0: 0x02,
+	0x0ef: 0x03,
+	0x0f0: 0x05, 0x0f4: 0x07,
 	// Block 0x4, offset 0x100
-	0x120: 0x07, 0x126: 0x08,
+	0x120: 0x05, 0x126: 0x06,
 	// Block 0x5, offset 0x140
-	0x17f: 0x09,
+	0x17f: 0x07,
 	// Block 0x6, offset 0x180
-	0x180: 0x0a, 0x184: 0x0b,
+	0x180: 0x08, 0x184: 0x09,
 	// Block 0x7, offset 0x1c0
-	0x1d0: 0x06,
+	0x1d0: 0x04,
 	// Block 0x8, offset 0x200
-	0x23f: 0x0c,
+	0x23f: 0x0a,
 	// Block 0x9, offset 0x240
-	0x24f: 0x08,
+	0x24f: 0x06,
 }

-var testdataTrie = trie{testdataLookup[:], testdataValues[:], testdataSparseValues[:], testdataSparseOffset[:], 3}
+var testdataTrie = trie{testdataLookup[:], testdataValues[:], testdataSparseValues[:], testdataSparseOffset[:], 1}
--- a/src/pkg/exp/norm/triegen.go
+++ b/src/pkg/exp/norm/triegen.go
@@ -19,8 +19,11 @@ import (
 	"unicode/utf8"
 )

-const blockSize = 64
-const maxSparseEntries = 16
+const (
+	blockSize        = 64
+	blockOffset      = 2 // Subtract two blocks to compensate for the 0x80 added to continuation bytes.
+	maxSparseEntries = 16
+)

 // Intermediate trie structure
 type trieNode struct {
@@ -157,7 +160,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int {
 	if n.isInternal() {
 		v, ok := index.lookupBlockIdx[h]
 		if !ok {
-			v = len(index.lookupBlocks)
+			v = len(index.lookupBlocks) - blockOffset
 			index.lookupBlocks = append(index.lookupBlocks, n)
 			index.lookupBlockIdx[h] = v
 		}
@@ -166,7 +169,7 @@ func computeOffsets(index *nodeIndex, n *trieNode) int {
 		v, ok := index.valueBlockIdx[h]
 		if !ok {
 			if c := n.countSparseEntries(); c > maxSparseEntries {
-				v = len(index.valueBlocks)
+				v = len(index.valueBlocks) - blockOffset
 				index.valueBlocks = append(index.valueBlocks, n)
 				index.valueBlockIdx[h] = v
 			} else {
@@ -295,7 +298,7 @@ func (t *trieNode) printTables(name string) int {
 	}
 	fmt.Print("\n}\n\n")

-	cutoff := len(index.valueBlocks)
+	cutoff := len(index.valueBlocks) - blockOffset
 	ni := len(index.lookupBlocks) * blockSize
 	fmt.Printf("// %sLookup: %d bytes\n", name, ni)
 	fmt.Printf("// Block 0 is the null block.\n")