ebnf: use scanner instead of go/scanner

R=rsc, r CC=golang-dev https://golang.org/cl/5192043

ebnf: use scanner instead of go/scanner
R=rsc, r CC=golang-dev https://golang.org/cl/5192043
0da66a2e · Robert Griesemer · b2f1eba3 · 0da66a2e · 0da66a2e · 0da66a2e
Commit 0da66a2e authored Oct 05, 2011 by Robert Griesemer
Showing with 116 additions and 114 deletions

ebnflint.go src/cmd/ebnflint/ebnflint.go +2 -2

ebnf.go src/pkg/ebnf/ebnf.go +62 -38

ebnf_test.go src/pkg/ebnf/ebnf_test.go +9 -25

parser.go src/pkg/ebnf/parser.go +43 -49

No files found.
--- a/src/cmd/ebnflint/ebnflint.go
+++ b/src/cmd/ebnflint/ebnflint.go
@@ -98,12 +98,12 @@ func main() {
 		src = extractEBNF(src)
 	}

-	grammar, err := ebnf.Parse(fset, filename, src)
+	grammar, err := ebnf.Parse(filename, bytes.NewBuffer(src))
 	if err != nil {
 		report(err)
 	}

-	if err = ebnf.Verify(fset, grammar, *start); err != nil {
+	if err = ebnf.Verify(grammar, *start); err != nil {
 		report(err)
 	}
 }
--- a/src/pkg/ebnf/ebnf.go
+++ b/src/pkg/ebnf/ebnf.go
@@ -23,13 +23,39 @@
 package ebnf

 import (
-	"go/scanner"
-	"go/token"
+	"fmt"
 	"os"
+	"scanner"
 	"unicode"
 	"utf8"
 )

+// ----------------------------------------------------------------------------
+// Error handling
+
+type errorList []os.Error
+
+func (list errorList) Error() os.Error {
+	if len(list) == 0 {
+		return nil
+	}
+	return list
+}
+
+func (list errorList) String() string {
+	switch len(list) {
+	case 0:
+		return "no errors"
+	case 1:
+		return list[0].String()
+	}
+	return fmt.Sprintf("%s (and %d more errors)", list[0], len(list)-1)
+}
+
+func newError(pos scanner.Position, msg string) os.Error {
+	return os.NewError(fmt.Sprintf("%s: %s", pos, msg))
+}
+
 // ----------------------------------------------------------------------------
 // Internal representation

@@ -37,7 +63,7 @@ type (
 	// An Expression node represents a production expression.
 	Expression interface {
 		// Pos is the position of the first character of the syntactic construct
-		Pos() token.Pos
+		Pos() scanner.Position
 	}

 	// An Alternative node represents a non-empty list of alternative expressions.
@@ -48,13 +74,13 @@ type (

 	// A Name node represents a production name.
 	Name struct {
-		StringPos token.Pos
+		StringPos scanner.Position
 		String    string
 	}

 	// A Token node represents a literal.
 	Token struct {
-		StringPos token.Pos
+		StringPos scanner.Position
 		String    string
 	}

@@ -65,50 +91,50 @@ type (

 	// A Group node represents a grouped expression.
 	Group struct {
-		Lparen token.Pos
+		Lparen scanner.Position
 		Body   Expression // (body)
 	}

 	// An Option node represents an optional expression.
 	Option struct {
-		Lbrack token.Pos
+		Lbrack scanner.Position
 		Body   Expression // [body]
 	}

 	// A Repetition node represents a repeated expression.
 	Repetition struct {
-		Lbrace token.Pos
+		Lbrace scanner.Position
 		Body   Expression // {body}
 	}

-	// A Bad node stands for pieces of source code that lead to a parse error.
-	Bad struct {
-		TokPos token.Pos
-		Error  string // parser error message
-	}
-
 	// A Production node represents an EBNF production.
 	Production struct {
 		Name *Name
 		Expr Expression
 	}

+	// A Bad node stands for pieces of source code that lead to a parse error.
+	Bad struct {
+		TokPos scanner.Position
+		Error  string // parser error message
+	}
+
 	// A Grammar is a set of EBNF productions. The map
 	// is indexed by production name.
 	//
 	Grammar map[string]*Production
 )

-func (x Alternative) Pos() token.Pos { return x[0].Pos() } // the parser always generates non-empty Alternative
-func (x Sequence) Pos() token.Pos    { return x[0].Pos() } // the parser always generates non-empty Sequences
-func (x *Name) Pos() token.Pos       { return x.StringPos }
-func (x *Token) Pos() token.Pos      { return x.StringPos }
-func (x *Range) Pos() token.Pos      { return x.Begin.Pos() }
-func (x *Group) Pos() token.Pos      { return x.Lparen }
-func (x *Option) Pos() token.Pos     { return x.Lbrack }
-func (x *Repetition) Pos() token.Pos { return x.Lbrace }
-func (x *Bad) Pos() token.Pos        { return x.TokPos }
-func (x *Production) Pos() token.Pos { return x.Name.Pos() }
+func (x Alternative) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Alternative
+func (x Sequence) Pos() scanner.Position    { return x[0].Pos() } // the parser always generates non-empty Sequences
+func (x *Name) Pos() scanner.Position       { return x.StringPos }
+func (x *Token) Pos() scanner.Position      { return x.StringPos }
+func (x *Range) Pos() scanner.Position      { return x.Begin.Pos() }
+func (x *Group) Pos() scanner.Position      { return x.Lparen }
+func (x *Option) Pos() scanner.Position     { return x.Lbrack }
+func (x *Repetition) Pos() scanner.Position { return x.Lbrace }
+func (x *Production) Pos() scanner.Position { return x.Name.Pos() }
+func (x *Bad) Pos() scanner.Position        { return x.TokPos }

 // ----------------------------------------------------------------------------
 // Grammar verification
@@ -119,15 +145,14 @@ func isLexical(name string) bool {
 }

 type verifier struct {
-	fset *token.FileSet
-	scanner.ErrorVector
+	errors   errorList
 	worklist []*Production
 	reached  Grammar // set of productions reached from (and including) the root production
 	grammar  Grammar
 }

-func (v *verifier) error(pos token.Pos, msg string) {
-	v.Error(v.fset.Position(pos), msg)
+func (v *verifier) error(pos scanner.Position, msg string) {
+	v.errors = append(v.errors, newError(pos, msg))
 }

 func (v *verifier) push(prod *Production) {
@@ -187,24 +212,23 @@ func (v *verifier) verifyExpr(expr Expression, lexical bool) {
 		v.verifyExpr(x.Body, lexical)
 	case *Repetition:
 		v.verifyExpr(x.Body, lexical)
+	case *Bad:
+		v.error(x.Pos(), x.Error)
 	default:
-		panic("unreachable")
+		panic(fmt.Sprintf("internal error: unexpected type %T", expr))
 	}
 }

-func (v *verifier) verify(fset *token.FileSet, grammar Grammar, start string) {
+func (v *verifier) verify(grammar Grammar, start string) {
 	// find root production
 	root, found := grammar[start]
 	if !found {
-		// token.NoPos doesn't require a file set;
-		// ok to set v.fset only afterwards
-		v.error(token.NoPos, "no start production "+start)
+		var noPos scanner.Position
+		v.error(noPos, "no start production "+start)
 		return
 	}

 	// initialize verifier
-	v.fset = fset
-	v.ErrorVector.Reset()
 	v.worklist = v.worklist[0:0]
 	v.reached = make(Grammar)
 	v.grammar = grammar
@@ -238,8 +262,8 @@ func (v *verifier) verify(fset *token.FileSet, grammar Grammar, start string) {
 //
 // Position information is interpreted relative to the file set fset.
 //
-func Verify(fset *token.FileSet, grammar Grammar, start string) os.Error {
+func Verify(grammar Grammar, start string) os.Error {
 	var v verifier
-	v.verify(fset, grammar, start)
-	return v.GetError(scanner.Sorted)
+	v.verify(grammar, start)
+	return v.errors.Error()
 }
--- a/src/pkg/ebnf/ebnf_test.go
+++ b/src/pkg/ebnf/ebnf_test.go
@@ -5,13 +5,10 @@
 package ebnf

 import (
-	"go/token"
-	"io/ioutil"
+	"bytes"
 	"testing"
 )

-var fset = token.NewFileSet()
-
 var goodGrammars = []string{
 	`Program = .`,

@@ -46,18 +43,19 @@ var badGrammars = []string{
 	`Program = {} .`,
 }

-func checkGood(t *testing.T, filename string, src []byte) {
-	grammar, err := Parse(fset, filename, src)
+func checkGood(t *testing.T, src string) {
+	grammar, err := Parse("", bytes.NewBuffer([]byte(src)))
 	if err != nil {
 		t.Errorf("Parse(%s) failed: %v", src, err)
+		return
 	}
-	if err = Verify(fset, grammar, "Program"); err != nil {
+	if err = Verify(grammar, "Program"); err != nil {
 		t.Errorf("Verify(%s) failed: %v", src, err)
 	}
 }

-func checkBad(t *testing.T, filename string, src []byte) {
-	_, err := Parse(fset, filename, src)
+func checkBad(t *testing.T, src string) {
+	_, err := Parse("", bytes.NewBuffer([]byte(src)))
 	if err == nil {
 		t.Errorf("Parse(%s) should have failed", src)
 	}
@@ -65,23 +63,9 @@ func checkBad(t *testing.T, filename string, src []byte) {

 func TestGrammars(t *testing.T) {
 	for _, src := range goodGrammars {
-		checkGood(t, "", []byte(src))
+		checkGood(t, src)
 	}
 	for _, src := range badGrammars {
-		checkBad(t, "", []byte(src))
-	}
-}
-
-var files = []string{
-// TODO(gri) add some test files
-}
-
-func TestFiles(t *testing.T) {
-	for _, filename := range files {
-		src, err := ioutil.ReadFile(filename)
-		if err != nil {
-			t.Fatal(err)
-		}
-		checkGood(t, filename, src)
+		checkBad(t, src)
 	}
 }
--- a/src/pkg/ebnf/parser.go
+++ b/src/pkg/ebnf/parser.go
@@ -5,51 +5,47 @@
 package ebnf

 import (
-	"go/scanner"
-	"go/token"
+	"io"
 	"os"
+	"scanner"
 	"strconv"
 )

 type parser struct {
-	fset *token.FileSet
-	scanner.ErrorVector
+	errors  errorList
 	scanner scanner.Scanner
-	pos     token.Pos   // token position
-	tok     token.Token // one token look-ahead
+	pos     scanner.Position // token position
+	tok     int              // one token look-ahead
 	lit     string           // token literal
 }

 func (p *parser) next() {
-	p.pos, p.tok, p.lit = p.scanner.Scan()
-	if p.tok.IsKeyword() {
-		// TODO Should keyword mapping always happen outside scanner?
-		//      Or should there be a flag to scanner to enable keyword mapping?
-		p.tok = token.IDENT
-	}
+	p.tok = p.scanner.Scan()
+	p.pos = p.scanner.Position
+	p.lit = p.scanner.TokenText()
 }

-func (p *parser) error(pos token.Pos, msg string) {
-	p.Error(p.fset.Position(pos), msg)
+func (p *parser) error(pos scanner.Position, msg string) {
+	p.errors = append(p.errors, newError(pos, msg))
 }

-func (p *parser) errorExpected(pos token.Pos, msg string) {
-	msg = "expected " + msg
-	if pos == p.pos {
+func (p *parser) errorExpected(pos scanner.Position, msg string) {
+	msg = `expected "` + msg + `"`
+	if pos.Offset == p.pos.Offset {
 		// the error happened at the current position;
 		// make the error message more specific
-		msg += ", found '" + p.tok.String() + "'"
-		if p.tok.IsLiteral() {
+		msg += ", found " + scanner.TokenString(p.tok)
+		if p.tok < 0 {
 			msg += " " + p.lit
 		}
 	}
 	p.error(pos, msg)
 }

-func (p *parser) expect(tok token.Token) token.Pos {
+func (p *parser) expect(tok int) scanner.Position {
 	pos := p.pos
 	if p.tok != tok {
-		p.errorExpected(pos, "'"+tok.String()+"'")
+		p.errorExpected(pos, scanner.TokenString(tok))
 	}
 	p.next() // make progress in any case
 	return pos
@@ -58,21 +54,21 @@ func (p *parser) expect(tok token.Token) token.Pos {
 func (p *parser) parseIdentifier() *Name {
 	pos := p.pos
 	name := p.lit
-	p.expect(token.IDENT)
+	p.expect(scanner.Ident)
 	return &Name{pos, name}
 }

 func (p *parser) parseToken() *Token {
 	pos := p.pos
 	value := ""
-	if p.tok == token.STRING {
+	if p.tok == scanner.String {
 		value, _ = strconv.Unquote(p.lit)
 		// Unquote may fail with an error, but only if the scanner found
 		// an illegal string in the first place. In this case the error
 		// has already been reported.
 		p.next()
 	} else {
-		p.expect(token.STRING)
+		p.expect(scanner.String)
 	}
 	return &Token{pos, value}
 }
@@ -82,32 +78,32 @@ func (p *parser) parseTerm() (x Expression) {
 	pos := p.pos

 	switch p.tok {
-	case token.IDENT:
+	case scanner.Ident:
 		x = p.parseIdentifier()

-	case token.STRING:
+	case scanner.String:
 		tok := p.parseToken()
 		x = tok
-		const ellipsis = "…" // U+2026, the horizontal ellipsis character
-		if p.tok == token.ILLEGAL && p.lit == ellipsis {
+		const ellipsis = '…' // U+2026, the horizontal ellipsis character
+		if p.tok == ellipsis {
 			p.next()
 			x = &Range{tok, p.parseToken()}
 		}

-	case token.LPAREN:
+	case '(':
 		p.next()
 		x = &Group{pos, p.parseExpression()}
-		p.expect(token.RPAREN)
+		p.expect(')')

-	case token.LBRACK:
+	case '[':
 		p.next()
 		x = &Option{pos, p.parseExpression()}
-		p.expect(token.RBRACK)
+		p.expect(']')

-	case token.LBRACE:
+	case '{':
 		p.next()
 		x = &Repetition{pos, p.parseExpression()}
-		p.expect(token.RBRACE)
+		p.expect('}')
 	}

 	return x
@@ -137,7 +133,7 @@ func (p *parser) parseExpression() Expression {

 	for {
 		list = append(list, p.parseSequence())
-		if p.tok != token.OR {
+		if p.tok != '|' {
 			break
 		}
 		p.next()
@@ -154,24 +150,22 @@ func (p *parser) parseExpression() Expression {

 func (p *parser) parseProduction() *Production {
 	name := p.parseIdentifier()
-	p.expect(token.ASSIGN)
+	p.expect('=')
 	var expr Expression
-	if p.tok != token.PERIOD {
+	if p.tok != '.' {
 		expr = p.parseExpression()
 	}
-	p.expect(token.PERIOD)
+	p.expect('.')
 	return &Production{name, expr}
 }

-func (p *parser) parse(fset *token.FileSet, filename string, src []byte) Grammar {
-	// initialize parser
-	p.fset = fset
-	p.ErrorVector.Reset()
-	p.scanner.Init(fset.AddFile(filename, fset.Base(), len(src)), src, p, scanner.AllowIllegalChars)
+func (p *parser) parse(filename string, src io.Reader) Grammar {
+	p.scanner.Init(src)
+	p.scanner.Filename = filename
 	p.next() // initializes pos, tok, lit

 	grammar := make(Grammar)
-	for p.tok != token.EOF {
+	for p.tok != scanner.EOF {
 		prod := p.parseProduction()
 		name := prod.Name.String
 		if _, found := grammar[name]; !found {
@@ -187,11 +181,11 @@ func (p *parser) parse(fset *token.FileSet, filename string, src []byte) Grammar
 // Parse parses a set of EBNF productions from source src.
 // It returns a set of productions. Errors are reported
 // for incorrect syntax and if a production is declared
-// more than once. Position information is recorded relative
-// to the file set fset.
+// more than once; the filename is used only for error
+// positions.
 //
-func Parse(fset *token.FileSet, filename string, src []byte) (Grammar, os.Error) {
+func Parse(filename string, src io.Reader) (Grammar, os.Error) {
 	var p parser
-	grammar := p.parse(fset, filename, src)
-	return grammar, p.GetError(scanner.Sorted)
+	grammar := p.parse(filename, src)
+	return grammar, p.errors.Error()
 }