exp/regexp/syntax: case-folding in character classes

Also fix \x{123} parsing. R=r CC=golang-dev https://golang.org/cl/4632052

exp/regexp/syntax: case-folding in character classes
Also fix \x{123} parsing. R=r CC=golang-dev https://golang.org/cl/4632052
52cd055f · Russ Cox · 6b648caf · 52cd055f · 52cd055f
Commit 52cd055f authored Jun 28, 2011 by Russ Cox
Hide whitespace changes
Inline Side-by-side

Showing with 205 additions and 37 deletions

parse.go src/pkg/exp/regexp/syntax/parse.go +126 -31

parse_test.go src/pkg/exp/regexp/syntax/parse_test.go +79 -6

No files found.
--- a/src/pkg/exp/regexp/syntax/parse.go
+++ b/src/pkg/exp/regexp/syntax/parse.go
@@ -81,6 +81,7 @@ type parser struct {
 	stack       []*Regexp // stack of parsed expressions
 	numCap      int       // number of capturing groups seen
 	wholeRegexp string
+	tmpClass    []int // temporary char class work space
 }

 // Parse stack manipulation.
@@ -371,7 +372,6 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) {
 				if r != nil {
 					re.Rune = r
 					t = rest
-					// TODO: Handle FoldCase flag.
 					p.push(re)
 					break BigSwitch
 				}
@@ -729,6 +729,7 @@ Switch:
 				if r > unicode.MaxRune {
 					break Switch
 				}
+				nhex++
 			}
 			if nhex == 0 {
 				break Switch
@@ -801,12 +802,7 @@ func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string
 	if g.sign == 0 {
 		return
 	}
-	if g.sign < 0 {
-		r = appendNegatedClass(r, g.class)
-	} else {
-		r = appendClass(r, g.class)
-	}
-	return r, s[2:]
+	return p.appendGroup(r, g), s[2:]
 }

 // parseNamedClass parses a leading POSIX named character class like [:alnum:]
@@ -827,23 +823,40 @@ func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err
 	if g.sign == 0 {
 		return nil, "", &Error{ErrInvalidCharRange, name}
 	}
-	if g.sign < 0 {
-		r = appendNegatedClass(r, g.class)
+	return p.appendGroup(r, g), s, nil
+}
+
+func (p *parser) appendGroup(r []int, g charGroup) []int {
+	if p.flags&FoldCase == 0 {
+		if g.sign < 0 {
+			r = appendNegatedClass(r, g.class)
+		} else {
+			r = appendClass(r, g.class)
+		}
 	} else {
-		r = appendClass(r, g.class)
+		tmp := p.tmpClass[:0]
+		tmp = appendFoldedClass(tmp, g.class)
+		p.tmpClass = tmp
+		tmp = cleanClass(&p.tmpClass)
+		if g.sign < 0 {
+			r = appendNegatedClass(r, tmp)
+		} else {
+			r = appendClass(r, tmp)
+		}
 	}
-	return r, s, nil
+	return r
 }

-// unicodeTable returns the unicode.RangeTable identified by name.
-func unicodeTable(name string) *unicode.RangeTable {
+// unicodeTable returns the unicode.RangeTable identified by name
+// and the table of additional fold-equivalent code points.
+func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
 	if t := unicode.Categories[name]; t != nil {
-		return t
+		return t, unicode.FoldCategory[name]
 	}
 	if t := unicode.Scripts[name]; t != nil {
-		return t
+		return t, unicode.FoldScript[name]
 	}
-	return nil
+	return nil, nil
 }

 // parseUnicodeClass parses a leading Unicode character class like \p{Han}
@@ -891,14 +904,31 @@ func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, e
 		name = name[1:]
 	}

-	tab := unicodeTable(name)
+	tab, fold := unicodeTable(name)
 	if tab == nil {
 		return nil, "", &Error{ErrInvalidCharRange, seq}
 	}
-	if sign > 0 {
-		r = appendTable(r, tab)
+
+	if p.flags&FoldCase == 0 || fold == nil {
+		if sign > 0 {
+			r = appendTable(r, tab)
+		} else {
+			r = appendNegatedTable(r, tab)
+		}
 	} else {
-		r = appendNegatedTable(r, tab)
+		// Merge and clean tab and fold in a temporary buffer.
+		// This is necessary for the negative case and just tidy
+		// for the positive case.
+		tmp := p.tmpClass[:0]
+		tmp = appendTable(tmp, tab)
+		tmp = appendTable(tmp, fold)
+		p.tmpClass = tmp
+		tmp = cleanClass(&p.tmpClass)
+		if sign > 0 {
+			r = appendClass(r, tmp)
+		} else {
+			r = appendNegatedClass(r, tmp)
+		}
 	}
 	return r, t, nil
 }
@@ -979,7 +1009,11 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) {
 				return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
 			}
 		}
-		class = appendRange(class, lo, hi)
+		if p.flags&FoldCase == 0 {
+			class = appendRange(class, lo, hi)
+		} else {
+			class = appendFoldedRange(class, lo, hi)
+		}
 	}
 	t = t[1:] // chop ]

@@ -999,10 +1033,15 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) {
 // cleanClass sorts the ranges (pairs of elements of r),
 // merges them, and eliminates duplicates.
 func cleanClass(rp *[]int) []int {
+
 	// Sort by lo increasing, hi decreasing to break ties.
 	sort.Sort(ranges{rp})

 	r := *rp
+	if len(r) < 2 {
+		return r
+	}
+
 	// Merge abutting, overlapping.
 	w := 2 // write index
 	for i := 2; i < len(r); i += 2 {
@@ -1025,23 +1064,71 @@ func cleanClass(rp *[]int) []int {

 // appendRange returns the result of appending the range lo-hi to the class r.
 func appendRange(r []int, lo, hi int) []int {
-	// Expand last range if overlaps or abuts.
-	if n := len(r); n > 0 {
-		rlo, rhi := r[n-2], r[n-1]
-		if lo <= rhi+1 && rlo <= hi+1 {
-			if lo < rlo {
-				r[n-2] = lo
-			}
-			if hi > rhi {
-				r[n-1] = hi
+	// Expand last range or next to last range if it overlaps or abuts.
+	// Checking two ranges helps when appending case-folded
+	// alphabets, so that one range can be expanding A-Z and the
+	// other expanding a-z.
+	n := len(r)
+	for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
+		if n >= i {
+			rlo, rhi := r[n-i], r[n-i+1]
+			if lo <= rhi+1 && rlo <= hi+1 {
+				if lo < rlo {
+					r[n-i] = lo
+				}
+				if hi > rhi {
+					r[n-i+1] = hi
+				}
+				return r
 			}
-			return r
 		}
 	}

 	return append(r, lo, hi)
 }

+const (
+	// minimum and maximum runes involved in folding.
+	// checked during test.
+	minFold = 0x0041
+	maxFold = 0x1044f
+)
+
+// appendFoldedRange returns the result of appending the range lo-hi
+// and its case folding-equivalent runes to the class r.
+func appendFoldedRange(r []int, lo, hi int) []int {
+	// Optimizations.
+	if lo <= minFold && hi >= maxFold {
+		// Range is full: folding can't add more.
+		return appendRange(r, lo, hi)
+	}
+	if hi < minFold || lo > maxFold {
+		// Range is outside folding possibilities.
+		return appendRange(r, lo, hi)
+	}
+	if lo < minFold {
+		// [lo, minFold-1] needs no folding.
+		r = appendRange(r, lo, minFold-1)
+		lo = minFold
+	}
+	if hi > maxFold {
+		// [maxFold+1, hi] needs no folding.
+		r = appendRange(r, maxFold+1, hi)
+		hi = maxFold
+	}
+
+	// Brute force.  Depend on appendRange to coalesce ranges on the fly.
+	for c := lo; c <= hi; c++ {
+		r = appendRange(r, c, c)
+		f := unicode.SimpleFold(c)
+		for f != c {
+			r = appendRange(r, f, f)
+			f = unicode.SimpleFold(f)
+		}
+	}
+	return r
+}
+
 // appendClass returns the result of appending the class x to the class r.
 // It assume x is clean.
 func appendClass(r []int, x []int) []int {
@@ -1051,6 +1138,14 @@ func appendClass(r []int, x []int) []int {
 	return r
 }

+// appendFolded returns the result of appending the case folding of the class x to the class r.
+func appendFoldedClass(r []int, x []int) []int {
+	for i := 0; i < len(x); i += 2 {
+		r = appendFoldedRange(r, x[i], x[i+1])
+	}
+	return r
+}
+
 // appendNegatedClass returns the result of appending the negation of the class x to the class r.
 // It assumes x is clean.
 func appendNegatedClass(r []int, x []int) []int {

--- a/src/pkg/exp/regexp/syntax/parse_test.go
+++ b/src/pkg/exp/regexp/syntax/parse_test.go
@@ -74,18 +74,18 @@ var parseTests = []struct {
 	{"[a-z]", "cc{0x61-0x7a}"},
 	{"[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
 	{"[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
-	//	{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
-	//	{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
-	//	{ "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
-	//	{ "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+	{"(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
+	{"(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
+	{"(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
+	{"(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
 	{"\\d", "cc{0x30-0x39}"},
 	{"\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"},
 	{"\\s", "cc{0x9-0xa 0xc-0xd 0x20}"},
 	{"\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"},
 	{"\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"},
 	{"\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"},
-	//	{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
-	//	{ "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+	{"(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"},
+	{"(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
 	{"[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"},
 	//	{ "\\C", "byte{}" },

@@ -100,6 +100,13 @@ var parseTests = []struct {
 	{"[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
 	{"[\\P{^Braille}]", "cc{0x2800-0x28ff}"},
 	{"[\\pZ]", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"},
+	{"\\p{Lu}", mkCharClass(unicode.IsUpper)},
+	{"[\\p{Lu}]", mkCharClass(unicode.IsUpper)},
+	{"(?i)[\\p{Lu}]", mkCharClass(isUpperFold)},
+
+	// Hex, octal.
+	{"[\\012-\\234]\\141", "cat{cc{0xa-0x9c}lit{a}}"},
+	{"[\\x{41}-\\x7a]\\x61", "cat{cc{0x41-0x7a}lit{a}}"},

 	// More interesting regular expressions.
 	//	{ "a{,2}", "str{a{,2}}" },
@@ -270,3 +277,69 @@ func dumpRegexp(b *bytes.Buffer, re *Regexp) {
 	}
 	b.WriteByte('}')
 }
+
+func mkCharClass(f func(int) bool) string {
+	re := &Regexp{Op: OpCharClass}
+	lo := -1
+	for i := 0; i <= unicode.MaxRune; i++ {
+		if f(i) {
+			if lo < 0 {
+				lo = i
+			}
+		} else {
+			if lo >= 0 {
+				re.Rune = append(re.Rune, lo, i-1)
+				lo = -1
+			}
+		}
+	}
+	if lo >= 0 {
+		re.Rune = append(re.Rune, lo, unicode.MaxRune)
+	}
+	return dump(re)
+}
+
+func isUpperFold(rune int) bool {
+	if unicode.IsUpper(rune) {
+		return true
+	}
+	c := unicode.SimpleFold(rune)
+	for c != rune {
+		if unicode.IsUpper(c) {
+			return true
+		}
+		c = unicode.SimpleFold(c)
+	}
+	return false
+}
+
+func TestFoldConstants(t *testing.T) {
+	last := -1
+	for i := 0; i <= unicode.MaxRune; i++ {
+		if unicode.SimpleFold(i) == i {
+			continue
+		}
+		if last == -1 && minFold != i {
+			t.Errorf("minFold=%#U should be %#U", minFold, i)
+		}
+		last = i
+	}
+	if maxFold != last {
+		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
+	}
+}
+
+func TestAppendRangeCollapse(t *testing.T) {
+	// AppendRange should collapse each of the new ranges
+	// into the earlier ones (it looks back two ranges), so that
+	// the slice never grows very large.
+	// Note that we are not calling cleanClass.
+	var r []int
+	for i := 'A'; i <= 'Z'; i++ {
+		r = appendRange(r, i, i)
+		r = appendRange(r, i+'a'-'A', i+'a'-'A')
+	}
+	if string(r) != "AZaz" {
+		t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
+	}
+}