Commit 3c098e27 authored by Rob Pike's avatar Rob Pike

add the Upper/Lower sequence optimization.

tables shrink 900 lines.
mapping code gets a little slower

R=rsc
DELTA=1124  (105 added, 952 deleted, 67 changed)
OCL=34079
CL=34089
parent be219c5e
......@@ -18,7 +18,15 @@ type Range struct {
// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
// are the number to add to the code point to reach the code point for a
// different case for that character. They may be negative. If zero, it
// means the character is in the corresponding case.
// means the character is in the corresponding case. There is a special
// case representing sequences of alternating corresponding Upper and Lower
// pairs. It appears with the usual Lo and Hi values and a Delta of
//	{UpperLower, UpperLower, UpperLower}
// The constant UpperLower is not a real delta: it is larger than MaxChar,
// so it cannot be a valid offset. Inside such a range the real deltas
// alternate, starting with the upper case letter at Lo, between
//	{0, 1, 0}    for the upper case letters and
//	{-1, 0, -1}  for the lower case letters.
// The constant LowerUpper is a second out-of-range marker, for a sequence
// that would start with a lower case letter.
type CaseRange struct {
Lo int;
Hi int;
......@@ -38,8 +46,9 @@ type d [MaxCase]int32 // to make the CaseRanges text shorter
// this CaseRange represents a sequence of the form (say)
// Upper Lower Upper Lower.
const (
	// MaxChar is the largest valid Unicode code point.
	MaxChar = 0x10FFFF
	// UpperLower and LowerUpper are markers, not offsets. They are larger
	// than MaxChar, so they cannot be valid deltas and can be told apart
	// from a real delta with a single comparison against MaxChar.
	UpperLower = MaxChar + 2 // cannot be a valid delta
	LowerUpper = MaxChar + 3
)
// Is tests whether rune is in the specified table of ranges.
......@@ -103,12 +112,28 @@ func IsTitle(rune int) bool {
// IsLetter reports whether the rune is a letter.
// Runes below 0x80 are answered directly with an ASCII range test; all
// other runes are looked up in the Letter table through Is.
func IsLetter(rune int) bool {
	if rune < 0x80 { // quick ASCII check
		// 'a'-'A' is the single bit that separates the two ASCII cases.
		// Clearing it folds 'a'..'z' onto 'A'..'Z', so one range test
		// covers both.
		rune &^= 'a' - 'A'
		return 'A' <= rune && rune <= 'Z'
	}
	return Is(Letter, rune)
}
// ulDelta holds the real deltas for runes inside an Upper-Lower sequence.
// Such a sequence always starts with an upper case letter, so its members
// alternate between two sets of deltas:
//	even offset (upper case letter):   0  1  0
//	odd offset  (lower case letter):  -1  0 -1
// The table is flattened into a single dimension. The index is the target
// case shifted up one bit, or'ed with the low bit of the rune's position
// in the sequence.
var ulDelta = [8]int{
	// The rune is the upper case member of its pair.
	UpperCase<<1 | 0: 0,
	LowerCase<<1 | 0: 1,
	TitleCase<<1 | 0: 0,
	// The rune is the lower case member of its pair.
	UpperCase<<1 | 1: -1,
	LowerCase<<1 | 1: 0,
	TitleCase<<1 | 1: -1,
}
// To maps the rune to the specified case, UpperCase, LowerCase, or TitleCase
func To(_case int, rune int) int {
if _case < 0 || MaxCase <= _case {
......@@ -121,7 +146,13 @@ func To(_case int, rune int) int {
m := lo + (hi - lo)/2;
r := CaseRanges[m];
if r.Lo <= rune && rune <= r.Hi {
return rune + int(r.Delta[_case]);
delta := int(r.Delta[_case]);
if delta > MaxChar {
// Somewhere inside an UpperLower sequence. Use
// the precomputed delta table to get our offset.
delta = ulDelta[((_case<<1) | ((rune-r.Lo)&1))];
}
return rune + delta;
}
if rune < r.Lo {
hi = m;
......@@ -136,7 +167,7 @@ func To(_case int, rune int) int {
func ToUpper(rune int) int {
if rune < 0x80 { // quick ASCII check
if 'a' <= rune && rune <= 'z' {
rune &^= ' '
rune -= 'a'-'A'
}
return rune
}
......@@ -147,7 +178,7 @@ func ToUpper(rune int) int {
func ToLower(rune int) int {
if rune < 0x80 { // quick ASCII check
if 'A' <= rune && rune <= 'Z' {
rune |= ' '
rune += 'a'-'A'
}
return rune
}
......@@ -158,7 +189,7 @@ func ToLower(rune int) int {
func ToTitle(rune int) int {
if rune < 0x80 { // quick ASCII check
if 'a' <= rune && rune <= 'z' { // title case is upper case for ASCII
rune &^= ' '
rune -= 'a'-'A'
}
return rune
}
......
......@@ -157,7 +157,7 @@ var caseTest = []caseT {
caseT{LowerCase, 0xA65F, 0xA65F},
caseT{TitleCase, 0xA65F, 0xA65E},
// From a LowerUpper sequence
// From another UpperLower sequence
// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
caseT{UpperCase, 0x0139, 0x0139},
caseT{LowerCase, 0x0139, 0x013A},
......
......@@ -636,13 +636,13 @@ type caseState struct {
// Is d a continuation of the state of c?
func (c *caseState) adjacent(d *caseState) bool {
if d.point < c.point {
return d.adjacent(c)
c, d = d, c
}
switch {
case d.point != c.point+1:
return false
case d._case != c._case:
case d.point != c.point+1: // code points not adjacent (shouldn't happen)
return false
case d._case != c._case: // different cases
return c.upperLowerAdjacent(d);
case c._case == CaseNone:
return false
case c._case == CaseMissing:
......@@ -657,6 +657,70 @@ func (c *caseState) adjacent(d *caseState) bool {
return true;
}
// upperLowerAdjacent reports whether c and d are the same letter in
// opposite upper/lower case, which would make them neighbouring elements
// of an UpperLower sequence. It only compares cases and deltas; the caller
// is expected to have established that the code points are adjacent.
func (c *caseState) upperLowerAdjacent(d *caseState) bool {
	// An upper case letter must be paired with a lower case one, and
	// vice versa.
	if c._case == CaseUpper && d._case != CaseLower {
		return false
	}
	if c._case == CaseLower && d._case != CaseUpper {
		return false
	}
	// Matched pair (at least in upper/lower). Name the two halves so the
	// delta checks below always read in Upper Lower order.
	upper, lower := c, d
	if c._case == CaseLower {
		upper, lower = d, c
	}
	// In an Upper Lower sequence the deltas must be exactly
	//	upper:  0  1  0
	//	lower: -1  0 -1
	upperOK := upper.deltaToUpper == 0 &&
		upper.deltaToLower == 1 &&
		upper.deltaToTitle == 0
	lowerOK := lower.deltaToUpper == -1 &&
		lower.deltaToLower == 0 &&
		lower.deltaToTitle == -1
	return upperOK && lowerOK
}
// isUpperLower reports whether this character could start an UpperLower
// sequence, that is, whether its deltas to upper, lower and title case
// are exactly 0, 1 and 0.
func (c *caseState) isUpperLower() bool {
	return c.deltaToUpper == 0 &&
		c.deltaToLower == 1 &&
		c.deltaToTitle == 0
}
// isLowerUpper reports whether this character could start a LowerUpper
// sequence, that is, whether its deltas to upper, lower and title case
// are exactly -1, 0 and -1.
func (c *caseState) isLowerUpper() bool {
	return c.deltaToUpper == -1 &&
		c.deltaToLower == 0 &&
		c.deltaToTitle == -1
}
func getCaseState(i int) (c *caseState) {
c = &caseState{ point: i, _case: CaseNone };
ch := &chars[i];
......@@ -729,9 +793,19 @@ func printCaseRange(lo, hi *caseState) {
// character represents itself in all cases - no need to mention it
return
}
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
lo.point, hi.point,
lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
switch {
case hi.point > lo.point && lo.isUpperLower():
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
lo.point, hi.point)
case hi.point > lo.point && lo.isLowerUpper():
die.Log("LowerUpper sequence: should not happen: U+%04X\n", lo.point);
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
lo.point, hi.point)
default:
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
lo.point, hi.point,
lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
}
}
// If the cased value in the Char is 0, it means use the rune itself.
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment