Commit 0da66a2e authored by Robert Griesemer's avatar Robert Griesemer

ebnf: use scanner instead of go/scanner

R=rsc, r
CC=golang-dev
https://golang.org/cl/5192043
parent b2f1eba3
...@@ -98,12 +98,12 @@ func main() { ...@@ -98,12 +98,12 @@ func main() {
src = extractEBNF(src) src = extractEBNF(src)
} }
grammar, err := ebnf.Parse(fset, filename, src) grammar, err := ebnf.Parse(filename, bytes.NewBuffer(src))
if err != nil { if err != nil {
report(err) report(err)
} }
if err = ebnf.Verify(fset, grammar, *start); err != nil { if err = ebnf.Verify(grammar, *start); err != nil {
report(err) report(err)
} }
} }
...@@ -23,13 +23,39 @@ ...@@ -23,13 +23,39 @@
package ebnf package ebnf
import ( import (
"go/scanner" "fmt"
"go/token"
"os" "os"
"scanner"
"unicode" "unicode"
"utf8" "utf8"
) )
// ----------------------------------------------------------------------------
// Error handling
type errorList []os.Error
func (list errorList) Error() os.Error {
if len(list) == 0 {
return nil
}
return list
}
func (list errorList) String() string {
switch len(list) {
case 0:
return "no errors"
case 1:
return list[0].String()
}
return fmt.Sprintf("%s (and %d more errors)", list[0], len(list)-1)
}
func newError(pos scanner.Position, msg string) os.Error {
return os.NewError(fmt.Sprintf("%s: %s", pos, msg))
}
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Internal representation // Internal representation
...@@ -37,7 +63,7 @@ type ( ...@@ -37,7 +63,7 @@ type (
// An Expression node represents a production expression. // An Expression node represents a production expression.
Expression interface { Expression interface {
// Pos is the position of the first character of the syntactic construct // Pos is the position of the first character of the syntactic construct
Pos() token.Pos Pos() scanner.Position
} }
// An Alternative node represents a non-empty list of alternative expressions. // An Alternative node represents a non-empty list of alternative expressions.
...@@ -48,13 +74,13 @@ type ( ...@@ -48,13 +74,13 @@ type (
// A Name node represents a production name. // A Name node represents a production name.
Name struct { Name struct {
StringPos token.Pos StringPos scanner.Position
String string String string
} }
// A Token node represents a literal. // A Token node represents a literal.
Token struct { Token struct {
StringPos token.Pos StringPos scanner.Position
String string String string
} }
...@@ -65,50 +91,50 @@ type ( ...@@ -65,50 +91,50 @@ type (
// A Group node represents a grouped expression. // A Group node represents a grouped expression.
Group struct { Group struct {
Lparen token.Pos Lparen scanner.Position
Body Expression // (body) Body Expression // (body)
} }
// An Option node represents an optional expression. // An Option node represents an optional expression.
Option struct { Option struct {
Lbrack token.Pos Lbrack scanner.Position
Body Expression // [body] Body Expression // [body]
} }
// A Repetition node represents a repeated expression. // A Repetition node represents a repeated expression.
Repetition struct { Repetition struct {
Lbrace token.Pos Lbrace scanner.Position
Body Expression // {body} Body Expression // {body}
} }
// A Bad node stands for pieces of source code that lead to a parse error.
Bad struct {
TokPos token.Pos
Error string // parser error message
}
// A Production node represents an EBNF production. // A Production node represents an EBNF production.
Production struct { Production struct {
Name *Name Name *Name
Expr Expression Expr Expression
} }
// A Bad node stands for pieces of source code that lead to a parse error.
Bad struct {
TokPos scanner.Position
Error string // parser error message
}
// A Grammar is a set of EBNF productions. The map // A Grammar is a set of EBNF productions. The map
// is indexed by production name. // is indexed by production name.
// //
Grammar map[string]*Production Grammar map[string]*Production
) )
func (x Alternative) Pos() token.Pos { return x[0].Pos() } // the parser always generates non-empty Alternative func (x Alternative) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Alternative
func (x Sequence) Pos() token.Pos { return x[0].Pos() } // the parser always generates non-empty Sequences func (x Sequence) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Sequences
func (x *Name) Pos() token.Pos { return x.StringPos } func (x *Name) Pos() scanner.Position { return x.StringPos }
func (x *Token) Pos() token.Pos { return x.StringPos } func (x *Token) Pos() scanner.Position { return x.StringPos }
func (x *Range) Pos() token.Pos { return x.Begin.Pos() } func (x *Range) Pos() scanner.Position { return x.Begin.Pos() }
func (x *Group) Pos() token.Pos { return x.Lparen } func (x *Group) Pos() scanner.Position { return x.Lparen }
func (x *Option) Pos() token.Pos { return x.Lbrack } func (x *Option) Pos() scanner.Position { return x.Lbrack }
func (x *Repetition) Pos() token.Pos { return x.Lbrace } func (x *Repetition) Pos() scanner.Position { return x.Lbrace }
func (x *Bad) Pos() token.Pos { return x.TokPos } func (x *Production) Pos() scanner.Position { return x.Name.Pos() }
func (x *Production) Pos() token.Pos { return x.Name.Pos() } func (x *Bad) Pos() scanner.Position { return x.TokPos }
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Grammar verification // Grammar verification
...@@ -119,15 +145,14 @@ func isLexical(name string) bool { ...@@ -119,15 +145,14 @@ func isLexical(name string) bool {
} }
type verifier struct { type verifier struct {
fset *token.FileSet errors errorList
scanner.ErrorVector
worklist []*Production worklist []*Production
reached Grammar // set of productions reached from (and including) the root production reached Grammar // set of productions reached from (and including) the root production
grammar Grammar grammar Grammar
} }
func (v *verifier) error(pos token.Pos, msg string) { func (v *verifier) error(pos scanner.Position, msg string) {
v.Error(v.fset.Position(pos), msg) v.errors = append(v.errors, newError(pos, msg))
} }
func (v *verifier) push(prod *Production) { func (v *verifier) push(prod *Production) {
...@@ -187,24 +212,23 @@ func (v *verifier) verifyExpr(expr Expression, lexical bool) { ...@@ -187,24 +212,23 @@ func (v *verifier) verifyExpr(expr Expression, lexical bool) {
v.verifyExpr(x.Body, lexical) v.verifyExpr(x.Body, lexical)
case *Repetition: case *Repetition:
v.verifyExpr(x.Body, lexical) v.verifyExpr(x.Body, lexical)
case *Bad:
v.error(x.Pos(), x.Error)
default: default:
panic("unreachable") panic(fmt.Sprintf("internal error: unexpected type %T", expr))
} }
} }
func (v *verifier) verify(fset *token.FileSet, grammar Grammar, start string) { func (v *verifier) verify(grammar Grammar, start string) {
// find root production // find root production
root, found := grammar[start] root, found := grammar[start]
if !found { if !found {
// token.NoPos doesn't require a file set; var noPos scanner.Position
// ok to set v.fset only afterwards v.error(noPos, "no start production "+start)
v.error(token.NoPos, "no start production "+start)
return return
} }
// initialize verifier // initialize verifier
v.fset = fset
v.ErrorVector.Reset()
v.worklist = v.worklist[0:0] v.worklist = v.worklist[0:0]
v.reached = make(Grammar) v.reached = make(Grammar)
v.grammar = grammar v.grammar = grammar
...@@ -238,8 +262,8 @@ func (v *verifier) verify(fset *token.FileSet, grammar Grammar, start string) { ...@@ -238,8 +262,8 @@ func (v *verifier) verify(fset *token.FileSet, grammar Grammar, start string) {
// //
// Position information is interpreted relative to the file set fset. // Position information is interpreted relative to the file set fset.
// //
func Verify(fset *token.FileSet, grammar Grammar, start string) os.Error { func Verify(grammar Grammar, start string) os.Error {
var v verifier var v verifier
v.verify(fset, grammar, start) v.verify(grammar, start)
return v.GetError(scanner.Sorted) return v.errors.Error()
} }
...@@ -5,13 +5,10 @@ ...@@ -5,13 +5,10 @@
package ebnf package ebnf
import ( import (
"go/token" "bytes"
"io/ioutil"
"testing" "testing"
) )
var fset = token.NewFileSet()
var goodGrammars = []string{ var goodGrammars = []string{
`Program = .`, `Program = .`,
...@@ -46,18 +43,19 @@ var badGrammars = []string{ ...@@ -46,18 +43,19 @@ var badGrammars = []string{
`Program = {} .`, `Program = {} .`,
} }
func checkGood(t *testing.T, filename string, src []byte) { func checkGood(t *testing.T, src string) {
grammar, err := Parse(fset, filename, src) grammar, err := Parse("", bytes.NewBuffer([]byte(src)))
if err != nil { if err != nil {
t.Errorf("Parse(%s) failed: %v", src, err) t.Errorf("Parse(%s) failed: %v", src, err)
return
} }
if err = Verify(fset, grammar, "Program"); err != nil { if err = Verify(grammar, "Program"); err != nil {
t.Errorf("Verify(%s) failed: %v", src, err) t.Errorf("Verify(%s) failed: %v", src, err)
} }
} }
func checkBad(t *testing.T, filename string, src []byte) { func checkBad(t *testing.T, src string) {
_, err := Parse(fset, filename, src) _, err := Parse("", bytes.NewBuffer([]byte(src)))
if err == nil { if err == nil {
t.Errorf("Parse(%s) should have failed", src) t.Errorf("Parse(%s) should have failed", src)
} }
...@@ -65,23 +63,9 @@ func checkBad(t *testing.T, filename string, src []byte) { ...@@ -65,23 +63,9 @@ func checkBad(t *testing.T, filename string, src []byte) {
func TestGrammars(t *testing.T) { func TestGrammars(t *testing.T) {
for _, src := range goodGrammars { for _, src := range goodGrammars {
checkGood(t, "", []byte(src)) checkGood(t, src)
} }
for _, src := range badGrammars { for _, src := range badGrammars {
checkBad(t, "", []byte(src)) checkBad(t, src)
}
}
var files = []string{
// TODO(gri) add some test files
}
func TestFiles(t *testing.T) {
for _, filename := range files {
src, err := ioutil.ReadFile(filename)
if err != nil {
t.Fatal(err)
}
checkGood(t, filename, src)
} }
} }
...@@ -5,51 +5,47 @@ ...@@ -5,51 +5,47 @@
package ebnf package ebnf
import ( import (
"go/scanner" "io"
"go/token"
"os" "os"
"scanner"
"strconv" "strconv"
) )
type parser struct { type parser struct {
fset *token.FileSet errors errorList
scanner.ErrorVector
scanner scanner.Scanner scanner scanner.Scanner
pos token.Pos // token position pos scanner.Position // token position
tok token.Token // one token look-ahead tok int // one token look-ahead
lit string // token literal lit string // token literal
} }
func (p *parser) next() { func (p *parser) next() {
p.pos, p.tok, p.lit = p.scanner.Scan() p.tok = p.scanner.Scan()
if p.tok.IsKeyword() { p.pos = p.scanner.Position
// TODO Should keyword mapping always happen outside scanner? p.lit = p.scanner.TokenText()
// Or should there be a flag to scanner to enable keyword mapping?
p.tok = token.IDENT
}
} }
func (p *parser) error(pos token.Pos, msg string) { func (p *parser) error(pos scanner.Position, msg string) {
p.Error(p.fset.Position(pos), msg) p.errors = append(p.errors, newError(pos, msg))
} }
func (p *parser) errorExpected(pos token.Pos, msg string) { func (p *parser) errorExpected(pos scanner.Position, msg string) {
msg = "expected " + msg msg = `expected "` + msg + `"`
if pos == p.pos { if pos.Offset == p.pos.Offset {
// the error happened at the current position; // the error happened at the current position;
// make the error message more specific // make the error message more specific
msg += ", found '" + p.tok.String() + "'" msg += ", found " + scanner.TokenString(p.tok)
if p.tok.IsLiteral() { if p.tok < 0 {
msg += " " + p.lit msg += " " + p.lit
} }
} }
p.error(pos, msg) p.error(pos, msg)
} }
func (p *parser) expect(tok token.Token) token.Pos { func (p *parser) expect(tok int) scanner.Position {
pos := p.pos pos := p.pos
if p.tok != tok { if p.tok != tok {
p.errorExpected(pos, "'"+tok.String()+"'") p.errorExpected(pos, scanner.TokenString(tok))
} }
p.next() // make progress in any case p.next() // make progress in any case
return pos return pos
...@@ -58,21 +54,21 @@ func (p *parser) expect(tok token.Token) token.Pos { ...@@ -58,21 +54,21 @@ func (p *parser) expect(tok token.Token) token.Pos {
func (p *parser) parseIdentifier() *Name { func (p *parser) parseIdentifier() *Name {
pos := p.pos pos := p.pos
name := p.lit name := p.lit
p.expect(token.IDENT) p.expect(scanner.Ident)
return &Name{pos, name} return &Name{pos, name}
} }
func (p *parser) parseToken() *Token { func (p *parser) parseToken() *Token {
pos := p.pos pos := p.pos
value := "" value := ""
if p.tok == token.STRING { if p.tok == scanner.String {
value, _ = strconv.Unquote(p.lit) value, _ = strconv.Unquote(p.lit)
// Unquote may fail with an error, but only if the scanner found // Unquote may fail with an error, but only if the scanner found
// an illegal string in the first place. In this case the error // an illegal string in the first place. In this case the error
// has already been reported. // has already been reported.
p.next() p.next()
} else { } else {
p.expect(token.STRING) p.expect(scanner.String)
} }
return &Token{pos, value} return &Token{pos, value}
} }
...@@ -82,32 +78,32 @@ func (p *parser) parseTerm() (x Expression) { ...@@ -82,32 +78,32 @@ func (p *parser) parseTerm() (x Expression) {
pos := p.pos pos := p.pos
switch p.tok { switch p.tok {
case token.IDENT: case scanner.Ident:
x = p.parseIdentifier() x = p.parseIdentifier()
case token.STRING: case scanner.String:
tok := p.parseToken() tok := p.parseToken()
x = tok x = tok
const ellipsis = "…" // U+2026, the horizontal ellipsis character const ellipsis = '…' // U+2026, the horizontal ellipsis character
if p.tok == token.ILLEGAL && p.lit == ellipsis { if p.tok == ellipsis {
p.next() p.next()
x = &Range{tok, p.parseToken()} x = &Range{tok, p.parseToken()}
} }
case token.LPAREN: case '(':
p.next() p.next()
x = &Group{pos, p.parseExpression()} x = &Group{pos, p.parseExpression()}
p.expect(token.RPAREN) p.expect(')')
case token.LBRACK: case '[':
p.next() p.next()
x = &Option{pos, p.parseExpression()} x = &Option{pos, p.parseExpression()}
p.expect(token.RBRACK) p.expect(']')
case token.LBRACE: case '{':
p.next() p.next()
x = &Repetition{pos, p.parseExpression()} x = &Repetition{pos, p.parseExpression()}
p.expect(token.RBRACE) p.expect('}')
} }
return x return x
...@@ -137,7 +133,7 @@ func (p *parser) parseExpression() Expression { ...@@ -137,7 +133,7 @@ func (p *parser) parseExpression() Expression {
for { for {
list = append(list, p.parseSequence()) list = append(list, p.parseSequence())
if p.tok != token.OR { if p.tok != '|' {
break break
} }
p.next() p.next()
...@@ -154,24 +150,22 @@ func (p *parser) parseExpression() Expression { ...@@ -154,24 +150,22 @@ func (p *parser) parseExpression() Expression {
func (p *parser) parseProduction() *Production { func (p *parser) parseProduction() *Production {
name := p.parseIdentifier() name := p.parseIdentifier()
p.expect(token.ASSIGN) p.expect('=')
var expr Expression var expr Expression
if p.tok != token.PERIOD { if p.tok != '.' {
expr = p.parseExpression() expr = p.parseExpression()
} }
p.expect(token.PERIOD) p.expect('.')
return &Production{name, expr} return &Production{name, expr}
} }
func (p *parser) parse(fset *token.FileSet, filename string, src []byte) Grammar { func (p *parser) parse(filename string, src io.Reader) Grammar {
// initialize parser p.scanner.Init(src)
p.fset = fset p.scanner.Filename = filename
p.ErrorVector.Reset()
p.scanner.Init(fset.AddFile(filename, fset.Base(), len(src)), src, p, scanner.AllowIllegalChars)
p.next() // initializes pos, tok, lit p.next() // initializes pos, tok, lit
grammar := make(Grammar) grammar := make(Grammar)
for p.tok != token.EOF { for p.tok != scanner.EOF {
prod := p.parseProduction() prod := p.parseProduction()
name := prod.Name.String name := prod.Name.String
if _, found := grammar[name]; !found { if _, found := grammar[name]; !found {
...@@ -187,11 +181,11 @@ func (p *parser) parse(fset *token.FileSet, filename string, src []byte) Grammar ...@@ -187,11 +181,11 @@ func (p *parser) parse(fset *token.FileSet, filename string, src []byte) Grammar
// Parse parses a set of EBNF productions from source src. // Parse parses a set of EBNF productions from source src.
// It returns a set of productions. Errors are reported // It returns a set of productions. Errors are reported
// for incorrect syntax and if a production is declared // for incorrect syntax and if a production is declared
// more than once. Position information is recorded relative // more than once; the filename is used only for error
// to the file set fset. // positions.
// //
func Parse(fset *token.FileSet, filename string, src []byte) (Grammar, os.Error) { func Parse(filename string, src io.Reader) (Grammar, os.Error) {
var p parser var p parser
grammar := p.parse(fset, filename, src) grammar := p.parse(filename, src)
return grammar, p.GetError(scanner.Sorted) return grammar, p.errors.Error()
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment