Commit 0b05e91f authored by Rob Pike

add some tests

fix some bugs in () ordering and rune processing

R=rsc
DELTA=72  (27 added, 5 deleted, 40 changed)
OCL=17147
CL=17147
parent 82e41cc5
...@@ -10,22 +10,22 @@ import ( ...@@ -10,22 +10,22 @@ import (
) )
var good_re = []string{ var good_re = []string{
`` ``,
, `.` `.`,
, `^.$` `^.$`,
, `a` `a`,
, `a*` `a*`,
, `a+` `a+`,
, `a?` `a?`,
, `a|b` `a|b`,
, `a*|b*` `a*|b*`,
, `(a*|b)(c*|d)` `(a*|b)(c*|d)`,
, `[a-z]` `[a-z]`,
, `[a-abc-c\-\]\[]` `[a-abc-c\-\]\[]`,
, `[a-z]+` `[a-z]+`,
, `[]` `[]`,
, `[abc]` `[abc]`,
, `[^1234]` `[^1234]`,
} }
// TODO: nice to do this with a map but we don't have an iterator // TODO: nice to do this with a map but we don't have an iterator
...@@ -45,7 +45,7 @@ var bad_re = []StringError{ ...@@ -45,7 +45,7 @@ var bad_re = []StringError{
StringError{ `a*+`, regexp.ErrBadClosure }, StringError{ `a*+`, regexp.ErrBadClosure },
StringError{ `a??`, regexp.ErrBadClosure }, StringError{ `a??`, regexp.ErrBadClosure },
StringError{ `*`, regexp.ErrBareClosure }, StringError{ `*`, regexp.ErrBareClosure },
StringError{ `\x`, regexp.ErrBadBackslash } StringError{ `\x`, regexp.ErrBadBackslash },
} }
type Vec [20]int; type Vec [20]int;
...@@ -56,17 +56,33 @@ type Tester struct { ...@@ -56,17 +56,33 @@ type Tester struct {
match Vec; match Vec;
} }
const END = -1000
var matches = []Tester { var matches = []Tester {
Tester{ ``, "", Vec{0,0, -1,-1} }, Tester{ ``, "", Vec{0,0, END} },
Tester{ `a`, "a", Vec{0,1, -1,-1} }, Tester{ `a`, "a", Vec{0,1, END} },
Tester{ `b`, "abc", Vec{1,2, -1,-1} }, Tester{ `b`, "abc", Vec{1,2, END} },
Tester{ `.`, "a", Vec{0,1, -1,-1} }, Tester{ `.`, "a", Vec{0,1, END} },
Tester{ `.*`, "abcdef", Vec{0,6, -1,-1} }, Tester{ `.*`, "abcdef", Vec{0,6, END} },
Tester{ `^abcd$`, "abcd", Vec{0,4, -1,-1} }, Tester{ `^abcd$`, "abcd", Vec{0,4, END} },
Tester{ `^bcd'`, "abcdef", Vec{-1,-1} }, Tester{ `^bcd'`, "abcdef", Vec{END} },
Tester{ `^abcd$`, "abcde", Vec{-1,-1} }, Tester{ `^abcd$`, "abcde", Vec{END} },
Tester{ `a+`, "baaab", Vec{1, 4, -1,-1} }, Tester{ `a+`, "baaab", Vec{1,4, END} },
Tester{ `a*`, "baaab", Vec{0, 0, -1,-1} } Tester{ `a*`, "baaab", Vec{0,0, END} },
Tester{ `[a-z]+`, "abcd", Vec{0,4, END} },
Tester{ `[^a-z]+`, "ab1234cd", Vec{2,6, END} },
Tester{ `[a\-\]z]+`, "az]-bcz", Vec{0,4, END} },
Tester{ `[日本語]+`, "日本語日本語", Vec{0,18, END} },
Tester{ `()`, "", Vec{0,0, 0,0, END} },
Tester{ `(a)`, "a", Vec{0,1, 0,1, END} },
Tester{ `(.)(.)`, "日a", Vec{0,4, 0,3, 3,4, END} },
Tester{ `(.*)`, "", Vec{0,0, 0,0, END} },
Tester{ `(.*)`, "abcd", Vec{0,4, 0,4, END} },
Tester{ `(..)(..)`, "abcd", Vec{0,4, 0,2, 2,4, END} },
Tester{ `(([^xyz]*)(d))`, "abcd", Vec{0,4, 0,4, 0,3, 3,4, END} },
Tester{ `((a|b|c)*(d))`, "abcd", Vec{0,4, 0,4, 2,3, 3,4, END} },
Tester{ `(((a|b|c)*)(d))`, "abcd", Vec{0,4, 0,4, 0,3, 2,3, 3,4, END} },
Tester{ `a*(|(b))c*`, "aacc", Vec{0,4, 2,2, -1,-1, END} },
} }
func Compile(expr string, error *os.Error) regexp.Regexp { func Compile(expr string, error *os.Error) regexp.Regexp {
...@@ -83,16 +99,20 @@ func MarkedLen(m *[] int) int { ...@@ -83,16 +99,20 @@ func MarkedLen(m *[] int) int {
return 0 return 0
} }
var i int; var i int;
for i = 0; i < len(m) && m[i] >= 0; i = i+2 { for i = 0; i < len(m) && m[i] != END; i = i+2 {
} }
return i return i
} }
func PrintVec(m *[] int) { func PrintVec(m *[] int) {
l := MarkedLen(m); l := MarkedLen(m);
for i := 0; i < l && m[i] >= 0; i = i+2 { if l == 0 {
print("<no match>");
} else {
for i := 0; i < l && m[i] != END; i = i+2 {
print(m[i], ",", m[i+1], " ") print(m[i], ",", m[i+1], " ")
} }
}
} }
func Equal(m1, m2 *[]int) bool { func Equal(m1, m2 *[]int) bool {
...@@ -122,6 +142,7 @@ func Match(expr string, str string, match *[]int) { ...@@ -122,6 +142,7 @@ func Match(expr string, str string, match *[]int) {
} }
func main() { func main() {
//regexp.debug = true;
if sys.argc() > 1 { if sys.argc() > 1 {
Compile(sys.argv(1), nil); Compile(sys.argv(1), nil);
sys.exit(0); sys.exit(0);
......
...@@ -287,7 +287,6 @@ func (p *Parser) nextc() int { ...@@ -287,7 +287,6 @@ func (p *Parser) nextc() int {
if p.pos >= len(p.re.expr) { if p.pos >= len(p.re.expr) {
p.ch = EOF p.ch = EOF
} else { } else {
// TODO: stringotorune should take a string*
c, w := sys.stringtorune(p.re.expr, p.pos); c, w := sys.stringtorune(p.re.expr, p.pos);
p.ch = c; p.ch = c;
p.pos += w; p.pos += w;
...@@ -433,6 +432,8 @@ func (p *Parser) Term() (start, end Inst) { ...@@ -433,6 +432,8 @@ func (p *Parser) Term() (start, end Inst) {
case '(': case '(':
p.nextc(); p.nextc();
p.nlpar++; p.nlpar++;
p.re.nbra++; // increment first so first subexpr is \1
nbra := p.re.nbra;
start, end = p.Regexp(); start, end = p.Regexp();
if p.c() != ')' { if p.c() != ')' {
p.re.Error(ErrUnmatchedLpar); p.re.Error(ErrUnmatchedLpar);
...@@ -443,9 +444,8 @@ func (p *Parser) Term() (start, end Inst) { ...@@ -443,9 +444,8 @@ func (p *Parser) Term() (start, end Inst) {
p.re.Add(bra); p.re.Add(bra);
ebra := new(Ebra); ebra := new(Ebra);
p.re.Add(ebra); p.re.Add(ebra);
p.re.nbra++; // increment first so first subexpr is \1 bra.n = nbra;
bra.n = p.re.nbra; ebra.n = nbra;
ebra.n = p.re.nbra;
if start == NULL { if start == NULL {
if end == NULL { p.re.Error(ErrInternal) } if end == NULL { p.re.Error(ErrInternal) }
start = ebra start = ebra
...@@ -479,7 +479,7 @@ func (p *Parser) Term() (start, end Inst) { ...@@ -479,7 +479,7 @@ func (p *Parser) Term() (start, end Inst) {
func (p *Parser) Closure() (start, end Inst) { func (p *Parser) Closure() (start, end Inst) {
start, end = p.Term(); start, end = p.Term();
if start == NULL { if start == NULL {
return start, end return
} }
switch p.c() { switch p.c() {
case '*': case '*':
...@@ -509,13 +509,13 @@ func (p *Parser) Closure() (start, end Inst) { ...@@ -509,13 +509,13 @@ func (p *Parser) Closure() (start, end Inst) {
start = alt; // start is now alt start = alt; // start is now alt
end = nop; // end is nop pointed to by both branches end = nop; // end is nop pointed to by both branches
default: default:
return start, end; return
} }
switch p.nextc() { switch p.nextc() {
case '*', '+', '?': case '*', '+', '?':
p.re.Error(ErrBadClosure); p.re.Error(ErrBadClosure);
} }
return start, end; return
} }
func (p *Parser) Concatenation() (start, end Inst) { func (p *Parser) Concatenation() (start, end Inst) {
...@@ -528,7 +528,7 @@ func (p *Parser) Concatenation() (start, end Inst) { ...@@ -528,7 +528,7 @@ func (p *Parser) Concatenation() (start, end Inst) {
nop := p.re.Add(new(Nop)); nop := p.re.Add(new(Nop));
return nop, nop; return nop, nop;
} }
return start, end; return;
case start == NULL: // this is first element of concatenation case start == NULL: // this is first element of concatenation
start, end = nstart, nend; start, end = nstart, nend;
default: default:
...@@ -544,7 +544,7 @@ func (p *Parser) Regexp() (start, end Inst) { ...@@ -544,7 +544,7 @@ func (p *Parser) Regexp() (start, end Inst) {
for { for {
switch p.c() { switch p.c() {
default: default:
return start, end; return;
case '|': case '|':
p.nextc(); p.nextc();
nstart, nend := p.Concatenation(); nstart, nend := p.Concatenation();
...@@ -683,6 +683,9 @@ func (re *RE) DoExecute(str string, pos int) *[]int { ...@@ -683,6 +683,9 @@ func (re *RE) DoExecute(str string, pos int) *[]int {
if !found { if !found {
// prime the pump if we haven't seen a match yet // prime the pump if we haven't seen a match yet
match := new([]int, 2*(re.nbra+1)); match := new([]int, 2*(re.nbra+1));
for i := 0; i < len(match); i++ {
match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac"
}
match[0] = pos; match[0] = pos;
s[out] = AddState(s[out], re.start.Next(), match); s[out] = AddState(s[out], re.start.Next(), match);
} }
...@@ -692,14 +695,13 @@ func (re *RE) DoExecute(str string, pos int) *[]int { ...@@ -692,14 +695,13 @@ func (re *RE) DoExecute(str string, pos int) *[]int {
// machine has completed // machine has completed
break; break;
} }
charwidth := 1;
c := EOF; c := EOF;
if pos < len(str) { if pos < len(str) {
c = int(str[pos]) c, charwidth = sys.stringtorune(str, pos);
} }
//println("position ", pos, "char", string(c), "in", in, "out", out, "len in", len(s[in]));
for i := 0; i < len(s[in]); i++ { for i := 0; i < len(s[in]); i++ {
state := s[in][i]; state := s[in][i];
//state.inst.Print(); print("\n");
switch s[in][i].inst.Type() { switch s[in][i].inst.Type() {
case BOT: case BOT:
if pos == 0 { if pos == 0 {
...@@ -751,12 +753,11 @@ func (re *RE) DoExecute(str string, pos int) *[]int { ...@@ -751,12 +753,11 @@ func (re *RE) DoExecute(str string, pos int) *[]int {
panic("unknown instruction in execute"); panic("unknown instruction in execute");
} }
} }
pos++; pos += charwidth;
} }
if !found { if !found {
return nil return nil
} }
//if found { println("found: from ", final.match[0], "to", final.match[1] )}
return final.match; return final.match;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment