Commit aae02a18 authored by Kyle Consalus's avatar Kyle Consalus Committed by Rob Pike

Optimization to regexp _CharClass: keep track of overall range of

    charclass to avoid unnecessarily iterating over ranges.
    Also, use the fact that IntVector is an []int to avoid method calls.
    On my machine, this brings us from ~27500 ns/op to ~17500 ns/op in the benchmark I've added (it is also faster in the case where a range check
    doesn't help, added a benchmark for this too.)

    I'd also like to propose that "[]", and "[^]" be disallowed. They aren't useful as far as I can tell, they aren't widely supported, and they make reasoning about character classes a bit more complicated.

R=r
CC=golang-dev
https://golang.org/cl/1495041
parent 3d4c97b8
...@@ -519,7 +519,7 @@ var numSubexpCases = []numSubexpCase{ ...@@ -519,7 +519,7 @@ var numSubexpCases = []numSubexpCase{
func TestNumSubexp(t *testing.T) { func TestNumSubexp(t *testing.T) {
for _, c := range numSubexpCases { for _, c := range numSubexpCases {
re, _ := Compile(c.input) re := MustCompile(c.input)
n := re.NumSubexp() n := re.NumSubexp()
if n != c.expected { if n != c.expected {
t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected)
...@@ -530,7 +530,7 @@ func TestNumSubexp(t *testing.T) { ...@@ -530,7 +530,7 @@ func TestNumSubexp(t *testing.T) {
func BenchmarkLiteral(b *testing.B) { func BenchmarkLiteral(b *testing.B) {
x := strings.Repeat("x", 50) x := strings.Repeat("x", 50)
b.StopTimer() b.StopTimer()
re, _ := Compile(x) re := MustCompile(x)
b.StartTimer() b.StartTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
if !re.MatchString(x) { if !re.MatchString(x) {
...@@ -543,7 +543,35 @@ func BenchmarkLiteral(b *testing.B) { ...@@ -543,7 +543,35 @@ func BenchmarkLiteral(b *testing.B) {
func BenchmarkNotLiteral(b *testing.B) { func BenchmarkNotLiteral(b *testing.B) {
x := strings.Repeat("x", 49) x := strings.Repeat("x", 49)
b.StopTimer() b.StopTimer()
re, _ := Compile("^" + x) re := MustCompile("^" + x)
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!")
break
}
}
}
func BenchmarkMatchClass(b *testing.B) {
b.StopTimer()
x := strings.Repeat("xxxx", 20) + "w"
re := MustCompile("[abcdw]")
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!")
break
}
}
}
func BenchmarkMatchClass_InRange(b *testing.B) {
b.StopTimer()
// 'b' is betwen 'a' and 'c', so the charclass
// range checking is no help here.
x := strings.Repeat("bbbb", 20) + "c"
re := MustCompile("[ac]")
b.StartTimer() b.StartTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
if !re.MatchString(x) { if !re.MatchString(x) {
...@@ -556,7 +584,7 @@ func BenchmarkNotLiteral(b *testing.B) { ...@@ -556,7 +584,7 @@ func BenchmarkNotLiteral(b *testing.B) {
func BenchmarkReplaceAll(b *testing.B) { func BenchmarkReplaceAll(b *testing.B) {
x := "abcdefghijklmnopqrstuvwxyz" x := "abcdefghijklmnopqrstuvwxyz"
b.StopTimer() b.StopTimer()
re, _ := Compile("[cjrw]") re := MustCompile("[cjrw]")
b.StartTimer() b.StartTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
re.ReplaceAllString(x, "") re.ReplaceAllString(x, "")
......
...@@ -152,10 +152,10 @@ func newChar(char int) *_Char { ...@@ -152,10 +152,10 @@ func newChar(char int) *_Char {
type _CharClass struct { type _CharClass struct {
common common
char int
negate bool // is character class negated? ([^a-z]) negate bool // is character class negated? ([^a-z])
// vector of int, stored pairwise: [a-z] is (a,z); x is (x,x): // vector of int, stored pairwise: [a-z] is (a,z); x is (x,x):
ranges *vector.IntVector ranges *vector.IntVector
cmin, cmax int
} }
func (cclass *_CharClass) kind() int { return _CHARCLASS } func (cclass *_CharClass) kind() int { return _CHARCLASS }
...@@ -180,13 +180,21 @@ func (cclass *_CharClass) addRange(a, b int) { ...@@ -180,13 +180,21 @@ func (cclass *_CharClass) addRange(a, b int) {
// range is a through b inclusive // range is a through b inclusive
cclass.ranges.Push(a) cclass.ranges.Push(a)
cclass.ranges.Push(b) cclass.ranges.Push(b)
if a < cclass.cmin {
cclass.cmin = a
}
if b > cclass.cmax {
cclass.cmax = b
}
} }
func (cclass *_CharClass) matches(c int) bool { func (cclass *_CharClass) matches(c int) bool {
for i := 0; i < cclass.ranges.Len(); i = i + 2 { if c < cclass.cmin || c > cclass.cmax {
min := cclass.ranges.At(i) return cclass.negate
max := cclass.ranges.At(i + 1) }
if min <= c && c <= max { ranges := []int(*cclass.ranges)
for i := 0; i < len(ranges); i = i + 2 {
if ranges[i] <= c && c <= ranges[i+1] {
return !cclass.negate return !cclass.negate
} }
} }
...@@ -196,6 +204,8 @@ func (cclass *_CharClass) matches(c int) bool { ...@@ -196,6 +204,8 @@ func (cclass *_CharClass) matches(c int) bool {
func newCharClass() *_CharClass { func newCharClass() *_CharClass {
c := new(_CharClass) c := new(_CharClass)
c.ranges = new(vector.IntVector) c.ranges = new(vector.IntVector)
c.cmin = 0x10FFFF + 1 // MaxRune + 1
c.cmax = -1
return c return c
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment