Commit b4cae4ae authored by Russ Cox

exp/regexp/syntax: finish Regexp manipulation

Except for the inevitable bug fixes, the Regexp code is done.

R=sam.thorogood, r
CC=golang-dev
https://golang.org/cl/4635082
parent a809abaf
This diff is collapsed.
......@@ -39,8 +39,7 @@ var parseTests = []struct {
{`a{2,3}?`, `nrep{2,3 lit{a}}`},
{`a{2,}?`, `nrep{2,-1 lit{a}}`},
{``, `emp{}`},
// { `|`, `emp{}` }, // alt{emp{}emp{}} but got factored
{`|`, `alt{emp{}emp{}}`},
{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
{`|x|`, `alt{emp{}lit{x}emp{}}`},
{`.`, `dot{}`},
{`^`, `bol{}`},
......@@ -64,6 +63,9 @@ var parseTests = []struct {
{`\-`, `lit{-}`},
{`-`, `lit{-}`},
{`\_`, `lit{_}`},
{`abc`, `str{abc}`},
{`abc|def`, `alt{str{abc}str{def}}`},
{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
// Posix and Perl extensions
{`[[:lower:]]`, `cc{0x61-0x7a}`},
......@@ -156,6 +158,10 @@ var parseTests = []struct {
// Strings
{`abcde`, `str{abcde}`},
{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
// Factoring.
{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
}
const testFlags = MatchNL | PerlX | UnicodeGroups
......
......@@ -60,6 +60,59 @@ const (
const opPseudo Op = 128 // where pseudo-ops start
// Equal reports whether x and y have identical structure.
// Two nil regexps are equal; a nil regexp never equals a non-nil one.
// Only the fields that are meaningful for the operator are compared.
func (x *Regexp) Equal(y *Regexp) bool {
	switch {
	case x == nil || y == nil:
		return x == y
	case x.Op != y.Op:
		return false
	}

	switch x.Op {
	case OpEndText:
		// The parse flags remember whether this is \z or \Z.
		return x.Flags&WasDollar == y.Flags&WasDollar

	case OpLiteral, OpCharClass:
		// Rune lists must match element for element.
		if len(x.Rune) != len(y.Rune) {
			return false
		}
		for i := range x.Rune {
			if x.Rune[i] != y.Rune[i] {
				return false
			}
		}
		return true

	case OpAlternate, OpConcat:
		// Subexpressions must match pairwise, in order.
		if len(x.Sub) != len(y.Sub) {
			return false
		}
		for i := range x.Sub {
			if !x.Sub[i].Equal(y.Sub[i]) {
				return false
			}
		}
		return true

	case OpStar, OpPlus, OpQuest:
		return x.Flags&NonGreedy == y.Flags&NonGreedy &&
			x.Sub[0].Equal(y.Sub[0])

	case OpRepeat:
		return x.Flags&NonGreedy == y.Flags&NonGreedy &&
			x.Min == y.Min &&
			x.Max == y.Max &&
			x.Sub[0].Equal(y.Sub[0])

	case OpCapture:
		return x.Cap == y.Cap &&
			x.Name == y.Name &&
			x.Sub[0].Equal(y.Sub[0])
	}

	// Every other operator carries no additional state to compare.
	return true
}
// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *bytes.Buffer, re *Regexp) {
switch re.Op {
......@@ -70,16 +123,24 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
if re.Flags&FoldCase != 0 {
b.WriteString(`(?i:`)
}
for _, r := range re.Rune {
escape(b, r, false)
}
if re.Flags&FoldCase != 0 {
b.WriteString(`)`)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
break
}
b.WriteRune('[')
if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
if len(re.Rune) == 0 {
b.WriteString(`^\x00-\x{10FFFF}`)
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
// Contains 0 and MaxRune. Probably a negated class.
// Print the gaps.
b.WriteRune('^')
......@@ -126,7 +187,9 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
} else {
b.WriteRune('(')
}
writeRegexp(b, re.Sub[0])
if re.Sub[0].Op != OpEmptyMatch {
writeRegexp(b, re.Sub[0])
}
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
if sub := re.Sub[0]; sub.Op > OpCapture {
......@@ -205,6 +268,15 @@ func escape(b *bytes.Buffer, r int, force bool) {
case '\v':
b.WriteString(`\v`)
default:
if r < 0x100 {
b.WriteString(`\x`)
s := strconv.Itob(r, 16)
if len(s) == 1 {
b.WriteRune('0')
}
b.WriteString(s)
break
}
b.WriteString(`\x{`)
b.WriteString(strconv.Itob(r, 16))
b.WriteString(`}`)
......
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Simplify returns a regexp equivalent to re but without counted repetitions
// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
// The resulting regexp will execute correctly but its string representation
// will not produce the same parse tree, because capturing parentheses
// may have been duplicated or removed. For example, the simplified form
// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
// The returned regexp may share structure with or be the original.
// A nil receiver yields nil.
func (re *Regexp) Simplify() *Regexp {
	if re == nil {
		return nil
	}

	switch re.Op {
	case OpCapture, OpConcat, OpAlternate:
		// Simplify the children copy-on-write: keep returning re itself
		// until some child actually changes, then switch to a fresh node.
		out := re
		for idx, child := range re.Sub {
			simple := child.Simplify()
			if out != re {
				// Already copying; just accumulate.
				out.Sub = append(out.Sub, simple)
				continue
			}
			if simple == child {
				// Nothing has changed so far.
				continue
			}
			// First changed child: clone re and carry over the
			// untouched children that precede it.
			out = new(Regexp)
			*out = *re
			out.Rune = nil
			out.Sub = append(out.Sub0[:0], re.Sub[:idx]...)
			out.Sub = append(out.Sub, simple)
		}
		return out

	case OpStar, OpPlus, OpQuest:
		return simplify1(re.Op, re.Flags, re.Sub[0].Simplify(), re)

	case OpRepeat:
		// x{0} matches only the empty string, so x itself is irrelevant.
		if re.Min == 0 && re.Max == 0 {
			return &Regexp{Op: OpEmptyMatch}
		}

		body := re.Sub[0].Simplify()

		// Open-ended repetition x{n,}: at least n matches of x.
		if re.Max == -1 {
			switch re.Min {
			case 0:
				// x{0,} is x*.
				return simplify1(OpStar, re.Flags, body, nil)
			case 1:
				// x{1,} is x+.
				return simplify1(OpPlus, re.Flags, body, nil)
			}
			// x{4,} is xxxx+: n-1 plain copies followed by x+.
			cat := &Regexp{Op: OpConcat}
			cat.Sub = cat.Sub0[:0]
			for k := 0; k < re.Min-1; k++ {
				cat.Sub = append(cat.Sub, body)
			}
			cat.Sub = append(cat.Sub, simplify1(OpPlus, re.Flags, body, nil))
			return cat
		}

		// x{1} is just x. (x{0} was dealt with above.)
		if re.Min == 1 && re.Max == 1 {
			return body
		}

		// Bounded repetition x{n,m}: n mandatory copies of x followed by
		// m-n optional ones. The optional copies are nested so the
		// machine does less work: x{2,5} = xx(x(x(x)?)?)?

		// Mandatory part: xx.
		var head *Regexp
		if re.Min > 0 {
			head = &Regexp{Op: OpConcat}
			head.Sub = head.Sub0[:0]
			for k := 0; k < re.Min; k++ {
				head.Sub = append(head.Sub, body)
			}
		}

		// Optional part: (x(x(x)?)?)?, built from the inside out.
		if re.Max > re.Min {
			tail := simplify1(OpQuest, re.Flags, body, nil)
			for k := re.Min + 1; k < re.Max; k++ {
				pair := &Regexp{Op: OpConcat}
				pair.Sub = append(pair.Sub0[:0], body, tail)
				tail = simplify1(OpQuest, re.Flags, pair, nil)
			}
			if head == nil {
				return tail
			}
			head.Sub = append(head.Sub, tail)
		}
		if head != nil {
			return head
		}

		// No mandatory part and no optional part: a degenerate bound
		// such as max < min <= 0. Treat it as an impossible match.
		return &Regexp{Op: OpNoMatch}
	}

	// All remaining operators are already simple.
	return re
}
// simplify1 implements Simplify for the unary OpStar,
// OpPlus, and OpQuest operators. It returns the simple regexp
// equivalent to
//
//	Regexp{Op: op, Flags: flags, Sub: {sub}}
//
// under the assumption that sub is already simple, and
// without first allocating that structure. If the regexp
// to be returned turns out to be equivalent to re, simplify1
// returns re instead. re may be nil when the caller has no
// existing node to reuse.
//
// simplify1 is factored out of Simplify because the implementation
// for other operators generates these unary expressions.
// Letting them call simplify1 makes sure the expressions they
// generate are simple.
func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
	greed := flags & NonGreedy

	switch {
	case sub.Op == OpEmptyMatch:
		// Repeating the empty string any number of times
		// still yields the empty string.
		return sub
	case sub.Op == op && sub.Flags&NonGreedy == greed:
		// Applying the same operator twice with the same
		// greediness adds nothing.
		return sub
	case re != nil && re.Op == op && re.Flags&NonGreedy == greed && re.Sub[0] == sub:
		// The caller's node already has exactly this shape; reuse it.
		return re
	}

	fresh := &Regexp{Op: op, Flags: flags}
	fresh.Sub = append(fresh.Sub0[:0], sub)
	return fresh
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "testing"
// simplifyTests is the table driving TestSimplify. Each entry pairs an
// input pattern (Regexp) with the string form expected after the pattern
// is parsed, simplified, and converted back to a string (Simple).
var simplifyTests = []struct {
Regexp string
Simple string
}{
// Already-simple constructs
{`a`, `a`},
{`ab`, `ab`},
{`a|b`, `[a-b]`},
{`ab|cd`, `ab|cd`},
{`(ab)*`, `(ab)*`},
{`(ab)+`, `(ab)+`},
{`(ab)?`, `(ab)?`},
{`.`, `.`},
{`^`, `^`},
{`$`, `$`},
{`[ac]`, `[ac]`},
{`[^ac]`, `[^ac]`},
// Posix character classes
{`[[:alnum:]]`, `[0-9A-Za-z]`},
{`[[:alpha:]]`, `[A-Za-z]`},
{`[[:blank:]]`, `[\t ]`},
{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
{`[[:digit:]]`, `[0-9]`},
{`[[:graph:]]`, `[!-~]`},
{`[[:lower:]]`, `[a-z]`},
{`[[:print:]]`, `[ -~]`},
{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
{`[[:space:]]`, `[\t-\r ]`},
{`[[:upper:]]`, `[A-Z]`},
{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
// Perl character classes
{`\d`, `[0-9]`},
{`\s`, `[\t-\n\f-\r ]`},
{`\w`, `[0-9A-Z_a-z]`},
{`\D`, `[^0-9]`},
{`\S`, `[^\t-\n\f-\r ]`},
{`\W`, `[^0-9A-Z_a-z]`},
{`[\d]`, `[0-9]`},
{`[\s]`, `[\t-\n\f-\r ]`},
{`[\w]`, `[0-9A-Z_a-z]`},
{`[\D]`, `[^0-9]`},
{`[\S]`, `[^\t-\n\f-\r ]`},
{`[\W]`, `[^0-9A-Z_a-z]`},
// Posix repetitions
{`a{1}`, `a`},
{`a{2}`, `aa`},
{`a{5}`, `aaaaa`},
{`a{0,1}`, `a?`},
// The next three are illegible because Simplify inserts (?:)
// parens instead of () parens to avoid creating extra
// captured subexpressions. The comments show a version with fewer parens.
{`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)?
{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)?
{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
{`a{0,2}`, `(?:aa?)?`}, // (aa?)?
{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)?
{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)?
{`a{0,}`, `a*`},
{`a{1,}`, `a+`},
{`a{2,}`, `aa+`},
{`a{5,}`, `aaaaa+`},
// Test that operators simplify their arguments.
{`(?:a{1,}){1,}`, `a+`},
{`(a{1,}b{1,})`, `(a+b+)`},
{`a{1,}|b{1,}`, `a+|b+`},
{`(?:a{1,})*`, `(?:a+)*`},
{`(?:a{1,})+`, `a+`},
{`(?:a{1,})?`, `(?:a+)?`},
{``, `(?:)`},
{`a{0}`, `(?:)`},
// Character class simplification
{`[ab]`, `[a-b]`},
{`[a-za-za-z]`, `[a-z]`},
{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
{`[ABCDEFGH]`, `[A-H]`},
{`[AB-CD-EF-GH]`, `[A-H]`},
{`[W-ZP-XE-R]`, `[E-Z]`},
{`[a-ee-gg-m]`, `[a-m]`},
{`[a-ea-ha-m]`, `[a-m]`},
{`[a-ma-ha-e]`, `[a-m]`},
{`[a-zA-Z0-9 -~]`, `[ -~]`},
// Empty character classes
{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
// Full character classes
{`[[:cntrl:][:^cntrl:]]`, `.`},
// Unicode case folding.
{`(?i)A`, `(?i:A)`},
{`(?i)a`, `(?i:a)`},
{`(?i)[A]`, `(?i:A)`},
{`(?i)[a]`, `(?i:A)`},
{`(?i)K`, `(?i:K)`},
{`(?i)k`, `(?i:k)`},
{`(?i)\x{212a}`, "(?i:\u212A)"},
{`(?i)[K]`, "[Kk\u212A]"},
{`(?i)[k]`, "[Kk\u212A]"},
{`(?i)[\x{212a}]`, "[Kk\u212A]"},
{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
{`(?i)[\x00-\x{10FFFF}]`, `.`},
// Empty string as a regular expression.
// The empty string must be preserved inside parens in order
// to make submatches work right, so these tests are less
// interesting than they might otherwise be. String inserts
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{`(a|b|)`, `([a-b]|(?:))`},
{`(|)`, `()`},
{`a()`, `a()`},
{`(()|())`, `(()|())`},
{`(a|)`, `(a|(?:))`},
{`ab()cd()`, `ab()cd()`},
{`()`, `()`},
{`()*`, `()*`},
{`()+`, `()+`},
{`()?`, `()?`},
{`(){0}`, `(?:)`},
{`(){1}`, `()`},
{`(){1,}`, `()+`},
{`(){0,2}`, `(?:()()?)?`},
}
func TestSimplify(t *testing.T) {
for _, tt := range simplifyTests {
re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
if err != nil {
t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
continue
}
s := re.Simplify().String()
if s != tt.Simple {
t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment