Commit b96c3477 authored by Russ Cox's avatar Russ Cox

exp/regexp/syntax: syntax data structures, parser

Parser is a work in progress but can populate most of the
interesting parts of the data structure, so a good checkpoint.
All the complicated Perl syntax is missing, as are various
important optimizations made during parsing to the
syntax tree.

The plan is that exp/regexp's API will mimic regexp,
and exp/regexp/syntax provides the parser directly
for programs that need it (and for implementing exp/regexp).

Once finished, exp/regexp will replace regexp.

R=r, sam.thorogood, kevlar, edsrzf
CC=golang-dev
https://golang.org/cl/4538123
parent b4bb970e
......@@ -81,6 +81,7 @@ DIRS=\
exp/eval\
exp/gui\
exp/gui/x11\
exp/regexp/syntax\
expvar\
flag\
fmt\
......
# Copyright 2011 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
include ../../../../Make.inc
TARG=exp/regexp/syntax
GOFILES=\
parse.go\
regexp.go\
include ../../../../Make.pkg
This diff is collapsed.
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import (
"bytes"
"fmt"
"testing"
"unicode"
)
var parseTests = []struct {
Regexp string
Dump string
}{
// Base cases
{"a", "lit{a}"},
{"a.", "cat{lit{a}dot{}}"},
{"a.b", "cat{lit{a}dot{}lit{b}}"},
// { "ab", "str{ab}" },
{"ab", "cat{lit{a}lit{b}}"},
{"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"},
// { "abc", "str{abc}" },
{"abc", "cat{lit{a}lit{b}lit{c}}"},
{"a|^", "alt{lit{a}bol{}}"},
// { "a|b", "cc{0x61-0x62}" },
{"a|b", "alt{lit{a}lit{b}}"},
{"(a)", "cap{lit{a}}"},
{"(a)|b", "alt{cap{lit{a}}lit{b}}"},
{"a*", "star{lit{a}}"},
{"a+", "plus{lit{a}}"},
{"a?", "que{lit{a}}"},
// { "a{2}", "rep{2,2 lit{a}}" },
// { "a{2,3}", "rep{2,3 lit{a}}" },
// { "a{2,}", "rep{2,-1 lit{a}}" },
// { "a*?", "nstar{lit{a}}" },
// { "a+?", "nplus{lit{a}}" },
// { "a??", "nque{lit{a}}" },
// { "a{2}?", "nrep{2,2 lit{a}}" },
// { "a{2,3}?", "nrep{2,3 lit{a}}" },
// { "a{2,}?", "nrep{2,-1 lit{a}}" },
{"", "emp{}"},
// { "|", "emp{}" }, // alt{emp{}emp{}} but got factored
// { "|", "alt{emp{}emp{}}" },
{"|x|", "alt{emp{}lit{x}emp{}}"},
{".", "dot{}"},
{"^", "bol{}"},
{"$", "eol{}"},
// { "\\|", "lit{|}" },
// { "\\(", "lit{(}" },
// { "\\)", "lit{)}" },
// { "\\*", "lit{*}" },
// { "\\+", "lit{+}" },
// { "\\?", "lit{?}" },
// { "{", "lit{{}" },
{"}", "lit{}}"},
// { "\\.", "lit{.}" },
// { "\\^", "lit{^}" },
// { "\\$", "lit{$}" },
// { "\\\\", "lit{\\}" },
{"[ace]", "cc{0x61 0x63 0x65}"},
{"[abc]", "cc{0x61-0x63}"},
{"[a-z]", "cc{0x61-0x7a}"},
// { "[a]", "lit{a}" },
{"[a]", "cc{0x61}"},
// { "\\-", "lit{-}" },
{"-", "lit{-}"},
// { "\\_", "lit{_}" },
// Posix and Perl extensions
// { "[[:lower:]]", "cc{0x61-0x7a}" },
// { "[a-z]", "cc{0x61-0x7a}" },
// { "[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
// { "[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
// { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
// { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
// { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
// { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
// { "\\d", "cc{0x30-0x39}" },
// { "\\D", "cc{0x0-0x2f 0x3a-0x10ffff}" },
// { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
// { "\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
// { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
// { "\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
// { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
// { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
// { "[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}" },
// { "\\C", "byte{}" },
// Unicode, negatives, and a double negative.
// { "\\p{Braille}", "cc{0x2800-0x28ff}" },
// { "\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
// { "\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
// { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
// More interesting regular expressions.
// { "a{,2}", "str{a{,2}}" },
// { "\\.\\^\\$\\\\", "str{.^$\\}" },
{"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"},
{"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"},
{"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8
// { "a*{", "cat{star{lit{a}}lit{{}}" },
// Test precedences
// { "(?:ab)*", "star{str{ab}}" },
// { "(ab)*", "star{cap{str{ab}}}" },
// { "ab|cd", "alt{str{ab}str{cd}}" },
// { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
{"(?:ab)*", "star{cat{lit{a}lit{b}}}"},
{"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"},
{"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"},
{"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"},
// Test flattening.
// { "(?:a)", "lit{a}" },
// { "(?:ab)(?:cd)", "str{abcd}" },
// { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
// { "a|.", "dot{}" },
// { ".|a", "dot{}" },
// Test Perl quoted literals
// { "\\Q+|*?{[\\E", "str{+|*?{[}" },
// { "\\Q+\\E+", "plus{lit{+}}" },
// { "\\Q\\\\E", "lit{\\}" },
// { "\\Q\\\\\\E", "str{\\\\}" },
// Test Perl \A and \z
// { "(?m)^", "bol{}" },
// { "(?m)$", "eol{}" },
// { "(?-m)^", "bot{}" },
// { "(?-m)$", "eot{}" },
// { "(?m)\\A", "bot{}" },
// { "(?m)\\z", "eot{\\z}" },
// { "(?-m)\\A", "bot{}" },
// { "(?-m)\\z", "eot{\\z}" },
// Test named captures
// { "(?P<name>a)", "cap{name:lit{a}}" },
// Case-folded literals
// { "[Aa]", "litfold{a}" },
// Strings
// { "abcde", "str{abcde}" },
// { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
}
const testFlags = MatchNL | PerlX | UnicodeGroups
// Test Parse -> Dump.
func TestParseDump(t *testing.T) {
for _, tt := range parseTests {
re, err := Parse(tt.Regexp, testFlags)
if err != nil {
t.Errorf("Parse(%#q): %v", tt.Regexp, err)
continue
}
d := dump(re)
if d != tt.Dump {
t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
}
}
}
// dump prints a string representation of the regexp showing
// the structure explicitly.
func dump(re *Regexp) string {
var b bytes.Buffer
dumpRegexp(&b, re)
return b.String()
}
var opNames = []string{
OpNoMatch: "no",
OpEmptyMatch: "emp",
OpLiteral: "lit",
OpCharClass: "cc",
OpAnyCharNotNL: "dnl",
OpAnyChar: "dot",
OpBeginLine: "bol",
OpEndLine: "eol",
OpBeginText: "bot",
OpEndText: "eot",
OpWordBoundary: "wb",
OpNoWordBoundary: "nwb",
OpCapture: "cap",
OpStar: "star",
OpPlus: "plus",
OpQuest: "que",
OpRepeat: "rep",
OpConcat: "cat",
OpAlternate: "alt",
}
// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
// It is used during testing to distinguish between parses that might print
// the same using re's String method.
func dumpRegexp(b *bytes.Buffer, re *Regexp) {
if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
fmt.Fprintf(b, "op%d", re.Op)
} else {
switch re.Op {
default:
b.WriteString(opNames[re.Op])
case OpStar, OpPlus, OpQuest, OpRepeat:
if re.Flags&NonGreedy != 0 {
b.WriteByte('n')
}
b.WriteString(opNames[re.Op])
case OpLiteral:
if len(re.Rune) > 1 {
b.WriteString("str")
} else {
b.WriteString("lit")
}
if re.Flags&FoldCase != 0 {
for _, r := range re.Rune {
if unicode.ToUpper(r) != r {
b.WriteString("fold")
}
}
}
}
}
b.WriteByte('{')
switch re.Op {
case OpEndText:
if re.Flags&WasDollar == 0 {
b.WriteString(`\z`)
}
case OpLiteral:
for _, r := range re.Rune {
b.WriteRune(r)
}
case OpConcat, OpAlternate:
for _, sub := range re.Sub {
dumpRegexp(b, sub)
}
case OpStar, OpPlus, OpQuest:
dumpRegexp(b, re.Sub[0])
case OpRepeat:
fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
dumpRegexp(b, re.Sub[0])
case OpCapture:
if re.Name != "" {
b.WriteString(re.Name)
b.WriteByte(':')
}
dumpRegexp(b, re.Sub[0])
case OpCharClass:
sep := ""
for i := 0; i < len(re.Rune); i += 2 {
b.WriteString(sep)
sep = " "
lo, hi := re.Rune[i], re.Rune[i+1]
if lo == hi {
fmt.Fprintf(b, "%#x", lo)
} else {
fmt.Fprintf(b, "%#x-%#x", lo, hi)
}
}
}
b.WriteByte('}')
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package syntax parses regular expressions into syntax trees.
// WORK IN PROGRESS.
package syntax
// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.
import (
"bytes"
"strconv"
"strings"
"unicode"
)
// A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
Op Op // operator
Flags Flags
Sub []*Regexp // subexpressions, if any
Sub0 [1]*Regexp // storage for short Sub
Rune []int // matched runes, for OpLiteral, OpCharClass
Rune0 [2]int // storage for short Rune
Min, Max int // min, max for OpRepeat
Cap int // capturing index, for OpCapture
Name string // capturing name, for OpCapture
}
// An Op is a single regular expression operator.
type Op uint8
// Operators are listed in precedence order, tightest binding to weakest.
const (
OpNoMatch Op = 1 + iota // matches no strings
OpEmptyMatch // matches empty string
OpLiteral // matches Runes sequence
OpCharClass // matches Runes interpreted as range pair list
OpAnyCharNotNL // matches any character
OpAnyChar // matches any character
OpBeginLine // matches empty string at beginning of line
OpEndLine // matches empty string at end of line
OpBeginText // matches empty string at beginning of text
OpEndText // matches empty string at end of text
OpWordBoundary // matches word boundary `\b`
OpNoWordBoundary // matches word non-boundary `\B`
OpCapture // capturing subexpression with index Cap, optional name Name
OpStar // matches Sub[0] zero or more times
OpPlus // matches Sub[0] one or more times
OpQuest // matches Sub[0] zero or one times
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
OpConcat // matches concatenation of Subs
OpAlternate // matches alternation of Subs
)
const opPseudo Op = 128 // where pseudo-ops start
// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *bytes.Buffer, re *Regexp) {
switch re.Op {
default:
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
case OpNoMatch:
b.WriteString(`[^\x00-\x{10FFFF}]`)
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
for _, r := range re.Rune {
escape(b, r, false)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
break
}
b.WriteRune('[')
if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
// Contains 0 and MaxRune. Probably a negated class.
// Print the gaps.
b.WriteRune('^')
for i := 1; i < len(re.Rune)-1; i += 2 {
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
escape(b, hi, hi == '-')
}
}
} else {
for i := 0; i < len(re.Rune); i += 2 {
lo, hi := re.Rune[i], re.Rune[i+1]
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
escape(b, hi, hi == '-')
}
}
}
b.WriteRune(']')
case OpAnyCharNotNL:
b.WriteString(`[^\n]`)
case OpAnyChar:
b.WriteRune('.')
case OpBeginLine:
b.WriteRune('^')
case OpEndLine:
b.WriteRune('$')
case OpBeginText:
b.WriteString(`\A`)
case OpEndText:
b.WriteString(`\z`)
case OpWordBoundary:
b.WriteString(`\b`)
case OpNoWordBoundary:
b.WriteString(`\B`)
case OpCapture:
if re.Name != "" {
b.WriteString(`(?P<`)
b.WriteString(re.Name)
b.WriteRune('>')
} else {
b.WriteRune('(')
}
writeRegexp(b, re.Sub[0])
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
if sub := re.Sub[0]; sub.Op > OpCapture {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
}
switch re.Op {
case OpStar:
b.WriteRune('*')
case OpPlus:
b.WriteRune('+')
case OpQuest:
b.WriteRune('?')
case OpRepeat:
b.WriteRune('{')
b.WriteString(strconv.Itoa(re.Min))
if re.Max != re.Min {
b.WriteRune(',')
if re.Max >= 0 {
b.WriteString(strconv.Itoa(re.Max))
}
}
b.WriteRune('}')
}
case OpConcat:
for _, sub := range re.Sub {
if sub.Op == OpAlternate {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
}
}
case OpAlternate:
for i, sub := range re.Sub {
if i > 0 {
b.WriteRune('|')
}
writeRegexp(b, sub)
}
}
}
func (re *Regexp) String() string {
var b bytes.Buffer
writeRegexp(&b, re)
return b.String()
}
const meta = `\.+*?()|[]{}^$`
func escape(b *bytes.Buffer, r int, force bool) {
if unicode.IsPrint(r) {
if strings.IndexRune(meta, r) >= 0 || force {
b.WriteRune('\\')
}
b.WriteRune(r)
return
}
switch r {
case '\a':
b.WriteString(`\a`)
case '\f':
b.WriteString(`\f`)
case '\n':
b.WriteString(`\n`)
case '\r':
b.WriteString(`\r`)
case '\t':
b.WriteString(`\t`)
case '\v':
b.WriteString(`\v`)
default:
b.WriteString(`\x{`)
b.WriteString(strconv.Itob(r, 16))
b.WriteString(`}`)
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment