Commit ef4347f1 authored by Robert Griesemer

Basic EBNF package:

- parsing of EBNF grammars
- basic consistency checks

R=rsc
DELTA=695  (695 added, 0 deleted, 0 changed)
OCL=31479
CL=31516
parent 092d6290
......@@ -13,6 +13,7 @@ crypto/hmac.install: crypto/md5.install crypto/sha1.install hash.install os.inst
crypto/md5.install: hash.install os.install
crypto/sha1.install: hash.install os.install
datafmt.install: bytes.install container/vector.install fmt.install go/scanner.install go/token.install io.install os.install reflect.install runtime.install strconv.install strings.install
ebnf.install: container/vector.install fmt.install go/scanner.install go/token.install os.install strconv.install strings.install unicode.install utf8.install
exec.install: os.install strings.install
exvar.install: bytes.install fmt.install http.install io.install log.install strconv.install sync.install
flag.install: fmt.install os.install strconv.install
......
......@@ -27,6 +27,7 @@ DIRS=\
crypto/md5\
crypto/sha1\
datafmt\
ebnf\
exec\
exvar\
flag\
......@@ -84,6 +85,7 @@ TEST=\
crypto/md5\
crypto/sha1\
datafmt\
ebnf\
exec\
exvar\
flag\
......
# Copyright 2009 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# DO NOT EDIT. Automatically generated by gobuild.
# gobuild -m ebnf.go parser.go >Makefile
D=
include $(GOROOT)/src/Make.$(GOARCH)
AR=gopack
default: packages
clean:
rm -rf *.[$(OS)] *.a [$(OS)].out _obj
test: packages
gotest
coverage: packages
gotest
6cov -g $$(pwd) | grep -v '_test\.go:'
%.$O: %.go
$(GC) -I_obj $*.go
%.$O: %.c
$(CC) $*.c
%.$O: %.s
$(AS) $*.s
O1=\
ebnf.$O\
O2=\
parser.$O\
phases: a1 a2
_obj$D/ebnf.a: phases
a1: $(O1)
$(AR) grc _obj$D/ebnf.a ebnf.$O
rm -f $(O1)
a2: $(O2)
$(AR) grc _obj$D/ebnf.a parser.$O
rm -f $(O2)
newpkg: clean
mkdir -p _obj$D
$(AR) grc _obj$D/ebnf.a
$(O1): newpkg
$(O2): a1
$(O3): a2
nuke: clean
rm -f $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a
packages: _obj$D/ebnf.a
install: packages
test -d $(GOROOT)/pkg && mkdir -p $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D
cp _obj$D/ebnf.a $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// A library for EBNF grammars. The input is text ([]byte) satisfying
// the following grammar (represented itself in EBNF):
//
// Production = name "=" [ Expression ] "." .
// Expression = Alternative { "|" Alternative } .
// Alternative = Term { Term } .
// Term = name | token [ "..." token ] | Group | Option | Repetition .
// Group = "(" Expression ")" .
// Option = "[" Expression "]" .
// Repetition = "{" Expression "}" .
//
// A name is a Go identifier, a token is a Go string, and comments
// and white space follow the same rules as for the Go language.
// Production names starting with an uppercase Unicode letter denote
// non-terminal productions (i.e., productions which allow white-space
// and comments between tokens); all other production names denote
// lexical productions.
//
package ebnf
import (
"container/vector";
"fmt";
"go/scanner";
"go/token";
"os";
"strconv";
"strings";
"unicode";
"utf8";
)
// ----------------------------------------------------------------------------
// Internal representation
type (
// An Expression node represents a production expression.
// All node types below implement it.
Expression interface {
// Pos is the position of the first character of the syntactic construct.
Pos() token.Position;
};
// An Alternative node represents a non-empty list of alternative expressions.
Alternative []Expression; // x | y | z
// A Sequence node represents a non-empty list of sequential expressions.
Sequence []Expression; // x y z
// A Name node represents a production name.
Name struct {
token.Position; // position of the name
String string; // the identifier text
};
// A Token node represents a literal.
Token struct {
token.Position; // position of the literal
String string; // the literal's value; the parser stores it unquoted
};
// A Range node represents a range of characters.
Range struct {
Begin, End *Token; // begin ... end
};
// A Group node represents a grouped expression.
Group struct {
token.Position; // position of the opening "("
Body Expression; // (body)
};
// An Option node represents an optional expression.
Option struct {
token.Position; // position of the opening "["
Body Expression; // [body]
};
// A Repetition node represents a repeated expression.
Repetition struct {
token.Position; // position of the opening "{"
Body Expression; // {body}
};
// A Production node represents an EBNF production.
Production struct {
Name *Name; // left-hand side
Expr Expression; // right-hand side; nil for an empty expression
};
// A Grammar is a set of EBNF productions. The map
// is indexed by production name.
//
Grammar map [string] *Production;
)
// Pos returns the position of the first alternative.
// The parser never produces an empty Alternative, so x[0] exists.
func (x Alternative) Pos() token.Position {
	first := x[0];
	return first.Pos();
}

// Pos returns the position of the first element of the sequence.
// The parser never produces an empty Sequence, so x[0] exists.
func (x Sequence) Pos() token.Position {
	first := x[0];
	return first.Pos();
}

// Pos returns the position of the token that begins the range.
func (x Range) Pos() token.Position {
	begin := x.Begin;
	return begin.Pos();
}

// Pos returns the position of the production's name.
func (p *Production) Pos() token.Position {
	name := p.Name;
	return name.Pos();
}
// ----------------------------------------------------------------------------
// Error handling
// TODO(gri) This is the same code as in datafmt and go/parser.
// Should factor this out as part of some parsing framework
// that could also deal with reading various input sources.
// Error describes an individual error. The position Pos, if valid,
// indicates the source position the error relates to. The
// error is specified with the Msg string.
//
type Error struct {
Pos token.Position; // source position of the error; may be invalid (unset)
Msg string; // error message text, without position prefix
}
// String formats the error as text. When the error carries a valid
// position the result is "line:column: " followed by the message;
// otherwise it is a single blank followed by the message.
//
func (e *Error) String() string {
	if !e.Pos.IsValid() {
		return " " + e.Msg;
	}
	prefix := fmt.Sprintf("%d:%d: ", e.Pos.Line, e.Pos.Column);
	return prefix + e.Msg;
}
// An ErrorList is a list of errors encountered during parsing.
type ErrorList []*Error

// ErrorList implements SortInterface and the os.Error interface.

// Len returns the number of errors in the list.
func (p ErrorList) Len() int {
	return len(p);
}

// Swap exchanges the errors at indices i and j.
func (p ErrorList) Swap(i, j int) {
	tmp := p[i];
	p[i] = p[j];
	p[j] = tmp;
}

// Less orders errors by the byte offset of their source positions.
func (p ErrorList) Less(i, j int) bool {
	a, b := p[i].Pos.Offset, p[j].Pos.Offset;
	return a < b;
}

// String returns the text of the first error, followed by a count
// of the remaining errors if there is more than one. An empty list
// yields "unspecified error".
func (p ErrorList) String() string {
	n := len(p);
	if n == 0 {
		return "unspecified error";
	}
	head := p[0].String();
	if n == 1 {
		return head;
	}
	return fmt.Sprintf("%s (and %d more errors)", head, n - 1);
}
// ----------------------------------------------------------------------------
// Grammar verification
// isLexical reports whether name denotes a lexical production, i.e.
// whether its first character is not an uppercase Unicode letter.
// An empty name decodes to utf8.RuneError, which is not uppercase,
// so it is reported as lexical.
func isLexical(name string) bool {
	// only the first rune matters; its encoded width is not needed
	ch, _ := utf8.DecodeRuneInString(name);
	return !unicode.IsUpper(ch);
}
// A verifier holds the state used while checking a grammar
// for consistency (see verify).
type verifier struct {
errors vector.Vector; // collected *Error values
worklist vector.Vector; // *Production values reached but not yet verified
reached Grammar; // set of productions reached from (and including) the root production
grammar Grammar; // the grammar being verified
}
// error records a new error with the given position and message.
func (v *verifier) error(pos token.Position, msg string) {
	e := &Error{pos, msg};
	v.errors.Push(e);
}
// makeErrorList converts a vector of *Error values into an ErrorList.
// It returns nil if the vector is empty, so callers can test the
// result against nil to detect success.
func makeErrorList(v *vector.Vector) os.Error {
	n := v.Len();
	if n <= 0 {
		return nil;
	}
	list := make(ErrorList, n);
	for i := range list {
		list[i] = v.At(i).(*Error);
	}
	return list;
}
// push schedules prod for verification and marks it as reached.
// A production that has been reached before is ignored, so each
// production is put on the worklist at most once.
func (v *verifier) push(prod *Production) {
	key := prod.Name.String;
	if _, seen := v.reached[key]; seen {
		return;
	}
	v.worklist.Push(prod);
	v.reached[key] = prod;
}
// verifyChar checks that the token x consists of exactly one
// character and returns that character. Otherwise it records
// an error and returns 0.
func (v *verifier) verifyChar(x *Token) int {
	str := x.String;
	if utf8.RuneCountInString(str) == 1 {
		r, _ := utf8.DecodeRuneInString(str);
		return r;
	}
	v.error(x.Pos(), "single char expected, found " + str);
	return 0;
}
// verifyExpr recursively checks expr. The flag lexical tells whether
// expr belongs to a lexical production; in that case references to
// non-lexical productions are reported as errors. Productions that
// are referenced by name are scheduled for verification via push.
func (v *verifier) verifyExpr(expr Expression, lexical bool) {
	switch node := expr.(type) {
	case nil:
		// an empty expression needs no checking
	case Alternative:
		for i := 0; i < len(node); i++ {
			v.verifyExpr(node[i], lexical);
		}
	case Sequence:
		for i := 0; i < len(node); i++ {
			v.verifyExpr(node[i], lexical);
		}
	case *Name:
		// the referenced production must be defined; schedule
		// it for verification unless that happened already
		target, defined := v.grammar[node.String];
		if !defined {
			v.error(node.Pos(), "missing production " + node.String);
		} else {
			v.push(target);
		}
		// a lexical production must not refer
		// to a non-lexical production
		if lexical {
			if !isLexical(node.String) {
				v.error(node.Pos(), "reference to non-lexical production " + node.String);
			}
		}
	case *Token:
		// literals need no checking for now
	case *Range:
		lo := v.verifyChar(node.Begin);
		hi := v.verifyChar(node.End);
		if hi <= lo {
			v.error(node.Pos(), "decreasing character range");
		}
	case *Group:
		v.verifyExpr(node.Body, lexical);
	case *Option:
		v.verifyExpr(node.Body, lexical);
	case *Repetition:
		v.verifyExpr(node.Body, lexical);
	default:
		panic("unreachable");
	}
}
// verify checks grammar for consistency, starting at the production
// named start. It walks all productions reachable from start and
// afterwards reports every production that was never reached.
// Errors are accumulated in v.errors.
func (v *verifier) verify(grammar Grammar, start string) {
	// the start production must exist
	root, ok := grammar[start];
	if !ok {
		var none token.Position;
		v.error(none, "no start production " + start);
		return;
	}

	// reset the verifier state
	v.errors.Init(0);
	v.worklist.Init(0);
	v.reached = make(Grammar);
	v.grammar = grammar;

	// process productions until no new ones are discovered
	v.push(root);
	for v.worklist.Len() > 0 {
		next := v.worklist.Pop().(*Production);
		lexical := isLexical(next.Name.String);
		v.verifyExpr(next.Expr, lexical);
	}

	// report productions that cannot be reached from the root
	if len(v.reached) >= len(v.grammar) {
		return;
	}
	for name, prod := range v.grammar {
		if _, seen := v.reached[name]; seen {
			continue;
		}
		v.error(prod.Pos(), name + " is unreachable");
	}
}
// Verify checks the grammar, beginning at the production named start.
// It reports an error if
//	- a production that is used is not defined
//	- a production that is defined cannot be reached from start
//	- a lexical production refers to a non-lexical production
// The result is nil if no problems were found.
//
func Verify(grammar Grammar, start string) os.Error {
	v := new(verifier);
	v.verify(grammar, start);
	errs := makeErrorList(&v.errors);
	return errs;
}
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ebnf
import (
"ebnf";
"io";
"strings";
"testing";
)
// grammars lists EBNF sources that TestGrammars expects to parse
// and to verify without errors, using "Program" as start production.
var grammars = []string {
`Program = .
`,
`Program = foo .
foo = "foo" .
`,
`Program = "a" | "b" "c" .
`,
`Program = "a" ... "z" .
`,
`Program = Song .
Song = { Note } .
Note = Do | (Re | Mi | Fa | So | La) | Ti .
Do = "c" .
Re = "d" .
Mi = "e" .
Fa = "f" .
So = "g" .
La = "a" .
Ti = ti .
ti = "b" .
`,
}
// check parses src and verifies the resulting grammar with start
// production "Program". Parse and Verify failures are both reported
// through t; verification is attempted even if parsing failed.
func check(t *testing.T, src []byte) {
	grammar, perr := Parse(src);
	if perr != nil {
		t.Errorf("Parse(%s) failed: %v", src, perr);
	}
	verr := Verify(grammar, "Program");
	if verr != nil {
		t.Errorf("Verify(%s) failed: %v", src, verr);
	}
}
// TestGrammars runs check on every entry of grammars.
func TestGrammars(t *testing.T) {
	for i := 0; i < len(grammars); i++ {
		check(t, strings.Bytes(grammars[i]));
	}
}
// files lists the names of grammar files that TestFiles reads and
// checks. The list is currently empty.
var files = []string {
// TODO(gri) add some test files
}
// TestFiles reads each file named in files and runs check on its
// contents. A file that cannot be read aborts the test.
func TestFiles(t *testing.T) {
	for i := 0; i < len(files); i++ {
		data, err := io.ReadFile(files[i]);
		if err != nil {
			t.Fatal(err);
		}
		check(t, data);
	}
}
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ebnf
import (
"container/vector";
"ebnf";
"fmt";
"go/scanner";
"go/token";
"os";
"strconv";
"strings";
"unicode";
"utf8";
)
// A parser holds the state used while parsing an EBNF source.
type parser struct {
errors vector.Vector; // collected *Error values (see Error)
scanner scanner.Scanner; // source of tokens
pos token.Position; // token position
tok token.Token; // one token look-ahead
lit []byte; // token literal
}
// next advances to the next token and stores its position, kind,
// and literal in p. Go keywords have no special meaning in EBNF,
// so they are turned into ordinary identifiers.
func (p *parser) next() {
	pos, tok, lit := p.scanner.Scan();
	if tok.IsKeyword() {
		// TODO Should keyword mapping always happen outside scanner?
		// Or should there be a flag to scanner to enable keyword mapping?
		tok = token.IDENT;
	}
	p.pos, p.tok, p.lit = pos, tok, lit;
}
// init prepares p for parsing src: it resets the error list, sets up
// the scanner, and reads the first token.
func (p *parser) init(src []byte) {
p.errors.Init(0);
// p itself is passed as the scanner's second argument; the parser
// implements scanner.Error (see Error below), so presumably scan errors
// are reported through p.Error. The final 0 is presumably the scanner
// mode - confirm against go/scanner.
p.scanner.Init(src, p, 0);
p.next(); // initializes pos, tok, lit
}
// The parser implements scanner.Error.
//
// Error records an error at pos unless the most recently recorded
// error is on the same line. Dropping such follow-up errors reduces
// the number of spurious reports caused by incorrect parser
// synchronization.
func (p *parser) Error(pos token.Position, msg string) {
	if p.errors.Len() != 0 && p.errors.Last().(*Error).Pos.Line == pos.Line {
		return;
	}
	p.errors.Push(&Error{pos, msg});
}
// errorExpected reports that msg was expected at pos. If pos is the
// position of the current token, the message also names the token
// that was found instead, including its text if it is a literal.
func (p *parser) errorExpected(pos token.Position, msg string) {
	text := "expected " + msg;
	atCurrent := pos.Offset == p.pos.Offset;
	if atCurrent {
		// the offending token is known; mention it
		text += ", found '" + p.tok.String() + "'";
		if p.tok.IsLiteral() {
			text += " " + string(p.lit);
		}
	}
	p.Error(pos, text);
}
// expect checks that the current token is tok and reports an error
// if it is not. In either case the token is consumed, so the parser
// always makes progress. The result is the position of the consumed
// token.
func (p *parser) expect(tok token.Token) token.Position {
	start := p.pos;
	if p.tok != tok {
		want := "'" + tok.String() + "'";
		p.errorExpected(start, want);
	}
	p.next();
	return start;
}
// parseIdentifier consumes the current token, which should be an
// identifier, and returns a Name node for it. If the token is not
// an identifier an error is reported by expect.
func (p *parser) parseIdentifier() *Name {
	// capture the literal before expect advances to the next token
	text := string(p.lit);
	start := p.expect(token.IDENT);
	return &Name{start, text};
}
// parseToken consumes the current token, which should be a string
// literal, and returns a Token node holding its unquoted value.
// If the current token is not a string, an error is reported and
// the resulting Token has an empty value.
func (p *parser) parseToken() *Token {
	pos := p.pos;
	value := "";
	if p.tok == token.STRING {
		// Unquote may fail with an error, but only if the scanner found
		// an illegal string in the first place. In this case the error
		// has already been reported, so it is deliberately ignored here.
		value, _ = strconv.Unquote(string(p.lit));
		p.next();
	} else {
		p.expect(token.STRING);
	}
	return &Token{pos, value};
}
// Forward declaration; parseExpression is defined further below.
// parseTerm calls it for grouped, optional, and repeated expressions.
func (p *parser) parseExpression() Expression
// parseTerm parses a single term: a name, a token, a character
// range, a group, an option, or a repetition. If the current token
// cannot start a term, nothing is consumed and the result is nil.
func (p *parser) parseTerm() (x Expression) {
	start := p.pos;

	switch p.tok {
	case token.IDENT:
		x = p.parseIdentifier();

	case token.STRING:
		first := p.parseToken();
		if p.tok == token.ELLIPSIS {
			// first ... last
			p.next();
			last := p.parseToken();
			x = &Range{first, last};
		} else {
			x = first;
		}

	case token.LPAREN:
		p.next();
		body := p.parseExpression();
		x = &Group{start, body};
		p.expect(token.RPAREN);

	case token.LBRACK:
		p.next();
		body := p.parseExpression();
		x = &Option{start, body};
		p.expect(token.RBRACK);

	case token.LBRACE:
		p.next();
		body := p.parseExpression();
		x = &Repetition{start, body};
		p.expect(token.RBRACE);
	}

	return x;
}
// parseSequence parses terms for as long as parseTerm yields one.
// The result is nil if there is no term, the term itself if there
// is exactly one, and a Sequence node otherwise.
func (p *parser) parseSequence() Expression {
	var terms vector.Vector;
	terms.Init(0);
	for {
		t := p.parseTerm();
		if t == nil {
			break;
		}
		terms.Push(t);
	}

	// a Sequence node is only needed for two or more terms
	n := terms.Len();
	if n == 0 {
		return nil;
	}
	if n == 1 {
		return terms.At(0).(Expression);
	}

	result := make(Sequence, n);
	for i := range result {
		result[i] = terms.At(i).(Expression);
	}
	return result;
}
// parseExpression parses sequences separated by "|". Empty sequences
// are dropped. The result is nil if no sequence remains, the sequence
// itself if exactly one remains, and an Alternative node otherwise.
func (p *parser) parseExpression() Expression {
	var alts vector.Vector;
	alts.Init(0);
	for {
		if seq := p.parseSequence(); seq != nil {
			alts.Push(seq);
		}
		if p.tok == token.OR {
			p.next();
			continue;
		}
		break;
	}

	// an Alternative node is only needed for two or more sequences
	n := alts.Len();
	if n == 0 {
		return nil;
	}
	if n == 1 {
		return alts.At(0).(Expression);
	}

	result := make(Alternative, n);
	for i := range result {
		result[i] = alts.At(i).(Expression);
	}
	return result;
}
// parseProduction parses a single production of the form
//	name "=" expression "."
// and returns the corresponding Production node. The expression
// of the node is nil if the right-hand side is empty.
func (p *parser) parseProduction() *Production {
	lhs := p.parseIdentifier();
	p.expect(token.ASSIGN);
	rhs := p.parseExpression();
	p.expect(token.PERIOD);
	return &Production{lhs, rhs};
}
// parse parses all productions in src and returns them as a Grammar
// indexed by production name. Syntax errors and duplicate production
// names are recorded in p.errors. Of several productions with the
// same name only the first one is kept.
func (p *parser) parse(src []byte) Grammar {
	// initialize parser; init also reads the first token
	p.init(src);

	grammar := make(Grammar);
	for p.tok != token.EOF {
		prod := p.parseProduction();
		name := prod.Name.String;
		if _, found := grammar[name]; !found {
			grammar[name] = prod;
		} else {
			p.Error(prod.Pos(), name + " declared already");
		}
	}

	return grammar;
}
// Parse reads a set of EBNF productions from the source src and
// returns them as a Grammar. An error is returned if the syntax is
// incorrect or if a production is declared more than once; the
// error is nil if no problems were found.
//
func Parse(src []byte) (Grammar, os.Error) {
	p := new(parser);
	grammar := p.parse(src);
	errs := makeErrorList(&p.errors);
	return grammar, errs;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment