Commit 9ec0c01e authored by Rob Pike

unicode: guarantee that the 32-bit range tables contain only
values >= 1<<16 (that is, values that do not fit in 16 bits), so the
lookup code can be smaller in the common case.
Also make the Lo and Hi fields of CaseRange uint32s rather than ints,
so that if we go to 64-bit ints we don't waste more space.

R=rsc
CC=golang-dev
https://golang.org/cl/4550094
parent 378c806c
...@@ -15,6 +15,7 @@ const ( ...@@ -15,6 +15,7 @@ const (
// code points within the set. The ranges are listed in two slices // code points within the set. The ranges are listed in two slices
// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges. // to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
// The two slices must be in sorted order and non-overlapping. // The two slices must be in sorted order and non-overlapping.
// Also, R32 should contain only values >= 0x10000 (1<<16).
type RangeTable struct { type RangeTable struct {
R16 []Range16 R16 []Range16
R32 []Range32 R32 []Range32
...@@ -30,7 +31,7 @@ type Range16 struct { ...@@ -30,7 +31,7 @@ type Range16 struct {
// Range32 represents a range of Unicode code points and is used when one or // Range32 represents a range of Unicode code points and is used when one or
// more of the values will not fit in 16 bits. The range runs from Lo to Hi // more of the values will not fit in 16 bits. The range runs from Lo to Hi
// inclusive and has the specified stride. // inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
type Range32 struct { type Range32 struct {
Lo uint32 Lo uint32
Hi uint32 Hi uint32
...@@ -48,8 +49,8 @@ type Range32 struct { ...@@ -48,8 +49,8 @@ type Range32 struct {
// {UpperLower, UpperLower, UpperLower} // {UpperLower, UpperLower, UpperLower}
// The constant UpperLower has an otherwise impossible delta value. // The constant UpperLower has an otherwise impossible delta value.
type CaseRange struct { type CaseRange struct {
Lo int Lo uint32
Hi int Hi uint32
Delta d Delta d
} }
...@@ -121,6 +122,7 @@ func is32(ranges []Range32, rune uint32) bool { ...@@ -121,6 +122,7 @@ func is32(ranges []Range32, rune uint32) bool {
func Is(rangeTab *RangeTable, rune int) bool { func Is(rangeTab *RangeTable, rune int) bool {
// common case: rune is ASCII or Latin-1. // common case: rune is ASCII or Latin-1.
if rune < 0x100 { if rune < 0x100 {
// Only need to check R16, since R32 is always >= 1<<16.
r16 := uint16(rune) r16 := uint16(rune)
for _, r := range rangeTab.R16 { for _, r := range rangeTab.R16 {
if r16 > r.Hi { if r16 > r.Hi {
...@@ -131,16 +133,6 @@ func Is(rangeTab *RangeTable, rune int) bool { ...@@ -131,16 +133,6 @@ func Is(rangeTab *RangeTable, rune int) bool {
} }
return (r16-r.Lo)%r.Stride == 0 return (r16-r.Lo)%r.Stride == 0
} }
r32 := uint32(rune)
for _, r := range rangeTab.R32 {
if r32 > r.Hi {
continue
}
if r32 < r.Lo {
return false
}
return (r32-r.Lo)%r.Stride == 0
}
return false return false
} }
r16 := rangeTab.R16 r16 := rangeTab.R16
...@@ -210,7 +202,7 @@ func to(_case int, rune int, caseRange []CaseRange) int { ...@@ -210,7 +202,7 @@ func to(_case int, rune int, caseRange []CaseRange) int {
for lo < hi { for lo < hi {
m := lo + (hi-lo)/2 m := lo + (hi-lo)/2
r := caseRange[m] r := caseRange[m]
if r.Lo <= rune && rune <= r.Hi { if int(r.Lo) <= rune && rune <= int(r.Hi) {
delta := int(r.Delta[_case]) delta := int(r.Delta[_case])
if delta > MaxRune { if delta > MaxRune {
// In an Upper-Lower sequence, which always starts with // In an Upper-Lower sequence, which always starts with
...@@ -223,11 +215,11 @@ func to(_case int, rune int, caseRange []CaseRange) int { ...@@ -223,11 +215,11 @@ func to(_case int, rune int, caseRange []CaseRange) int {
// bit in the sequence offset. // bit in the sequence offset.
// The constants UpperCase and TitleCase are even while LowerCase // The constants UpperCase and TitleCase are even while LowerCase
// is odd so we take the low bit from _case. // is odd so we take the low bit from _case.
return r.Lo + ((rune-r.Lo)&^1 | _case&1) return int(r.Lo) + ((rune-int(r.Lo))&^1 | _case&1)
} }
return rune + delta return rune + delta
} }
if rune < r.Lo { if rune < int(r.Lo) {
hi = m hi = m
} else { } else {
lo = m + 1 lo = m + 1
......
...@@ -434,7 +434,28 @@ func dumpRange(header string, inCategory Op) { ...@@ -434,7 +434,28 @@ func dumpRange(header string, inCategory Op) {
break break
} }
} }
if size == 16 && (lo >= 1<<16 || hi >= 1<<16) { size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
// next range: start looking where this range ends
next = hi + 1
}
fmt.Print("\t},\n")
fmt.Print("}\n\n")
}
func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) {
if size == 16 && hi >= 1<<16 {
if lo < 1<<16 {
if lo+stride != hi {
log.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
}
// No range contains U+FFFF as an instance, so split
// the range into two entries. That way we can maintain
// the invariant that R32 contains only >= 1<<16.
fmt.Printf(format, lo, lo, 1)
lo = hi
stride = 1
*count++
}
fmt.Print("\t},\n") fmt.Print("\t},\n")
fmt.Print("\tR32: []Range32{\n") fmt.Print("\tR32: []Range32{\n")
size = 32 size = 32
...@@ -442,11 +463,7 @@ func dumpRange(header string, inCategory Op) { ...@@ -442,11 +463,7 @@ func dumpRange(header string, inCategory Op) {
} }
fmt.Printf(format, lo, hi, stride) fmt.Printf(format, lo, hi, stride)
*count++ *count++
// next range: start looking where this range ends return size, count
next = hi + 1
}
fmt.Print("\t},\n")
fmt.Print("}\n\n")
} }
func fullCategoryTest(list []string) { func fullCategoryTest(list []string) {
...@@ -634,14 +651,7 @@ func printScriptOrProperty(doProps bool) { ...@@ -634,14 +651,7 @@ func printScriptOrProperty(doProps bool) {
size := 16 size := 16
count := &range16Count count := &range16Count
for _, s := range ranges { for _, s := range ranges {
if size == 16 && (s.Lo >= 1<<16 || s.Hi >= 1<<16) { size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
fmt.Print("\t},\n")
fmt.Print("\tR32: []Range32{\n")
size = 32
count = &range32Count
}
*count++
fmt.Printf(format, s.Lo, s.Hi, s.Stride)
} }
fmt.Print("\t},\n") fmt.Print("\t},\n")
fmt.Print("}\n\n") fmt.Print("}\n\n")
...@@ -876,6 +886,9 @@ var range16Count = 0 // Number of entries in the 16-bit range tables. ...@@ -876,6 +886,9 @@ var range16Count = 0 // Number of entries in the 16-bit range tables.
var range32Count = 0 // Number of entries in the 32-bit range tables. var range32Count = 0 // Number of entries in the 32-bit range tables.
func printSizes() { func printSizes() {
if *test {
return
}
fmt.Println() fmt.Println()
fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
range16Bytes := range16Count * 3 * 2 range16Bytes := range16Count * 3 * 2
......
...@@ -331,9 +331,10 @@ var _Mc = &RangeTable{ ...@@ -331,9 +331,10 @@ var _Mc = &RangeTable{
{0xabe3, 0xabe4, 1}, {0xabe3, 0xabe4, 1},
{0xabe6, 0xabe7, 1}, {0xabe6, 0xabe7, 1},
{0xabe9, 0xabea, 1}, {0xabe9, 0xabea, 1},
{0xabec, 0xabec, 1},
}, },
R32: []Range32{ R32: []Range32{
{0xabec, 0x11000, 25620}, {0x11000, 0x11000, 1},
{0x11002, 0x11082, 128}, {0x11002, 0x11082, 128},
{0x110b0, 0x110b2, 1}, {0x110b0, 0x110b2, 1},
{0x110b7, 0x110b8, 1}, {0x110b7, 0x110b8, 1},
...@@ -1118,9 +1119,10 @@ var _Po = &RangeTable{ ...@@ -1118,9 +1119,10 @@ var _Po = &RangeTable{
{0xff1b, 0xff1f, 4}, {0xff1b, 0xff1f, 4},
{0xff20, 0xff3c, 28}, {0xff20, 0xff3c, 28},
{0xff61, 0xff64, 3}, {0xff61, 0xff64, 3},
{0xff65, 0xff65, 1},
}, },
R32: []Range32{ R32: []Range32{
{0xff65, 0x10100, 411}, {0x10100, 0x10100, 1},
{0x10101, 0x1039f, 670}, {0x10101, 0x1039f, 670},
{0x103d0, 0x10857, 1159}, {0x103d0, 0x10857, 1159},
{0x1091f, 0x1093f, 32}, {0x1091f, 0x1093f, 32},
...@@ -1439,9 +1441,10 @@ var _So = &RangeTable{ ...@@ -1439,9 +1441,10 @@ var _So = &RangeTable{
{0xfdfd, 0xffe4, 487}, {0xfdfd, 0xffe4, 487},
{0xffe8, 0xffed, 5}, {0xffe8, 0xffed, 5},
{0xffee, 0xfffc, 14}, {0xffee, 0xfffc, 14},
{0xfffd, 0xfffd, 1},
}, },
R32: []Range32{ R32: []Range32{
{0xfffd, 0x10102, 261}, {0x10102, 0x10102, 1},
{0x10137, 0x1013f, 1}, {0x10137, 0x1013f, 1},
{0x10179, 0x10189, 1}, {0x10179, 0x10189, 1},
{0x10190, 0x1019b, 1}, {0x10190, 0x1019b, 1},
...@@ -4762,5 +4765,5 @@ var _CaseRanges = []CaseRange{ ...@@ -4762,5 +4765,5 @@ var _CaseRanges = []CaseRange{
{0x10428, 0x1044F, d{-40, 0, -40}}, {0x10428, 0x1044F, d{-40, 0, -40}},
} }
// Range entries: 2712 16-bit, 545 32-bit, 3257 total. // Range entries: 2715 16-bit, 545 32-bit, 3260 total.
// Range bytes: 16272 16-bit, 6540 32-bit, 22812 total. // Range bytes: 16290 16-bit, 6540 32-bit, 22830 total.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment