Commit 539aa05a authored by Robert Griesemer

cmd/compile: towards simpler and faster lexing: always use getr

Always reading runes (rather than bytes) has negligible overhead
(a simple if at the moment - it can be eliminated eventually) but
simplifies the lexer logic and opens up the door for speedups.
In the process remove many int conversions that are now not needed
anymore.

Also, because identifiers are now more easily recognized, remove
talph label and move identifier lexing "in place".

Also, instead of accepting all chars < 0x80 and then check for
"frogs", only permit valid characters in the first place. Removes
an extra call for common simple tokens and leads to simpler logic.

`time go build -a net/http` (best of 5 runs) seems 1% faster.
Assuming this is in the noise, there is no noticeable performance
degradation with this change.

Change-Id: I3454c9bf8b91808188cf7a5f559341749da9a1eb
Reviewed-on: https://go-review.googlesource.com/19847Reviewed-by: 's avatarMatthew Dempsky <mdempsky@google.com>
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 814978a0
...@@ -37,6 +37,8 @@ var ( ...@@ -37,6 +37,8 @@ var (
Debug_wb int Debug_wb int
) )
const BOM = 0xFEFF
// Debug arguments. // Debug arguments.
// These can be specified with the -d flag, as in "-d nil" // These can be specified with the -d flag, as in "-d nil"
// to set the debug_checknil variable. In general the list passed // to set the debug_checknil variable. In general the list passed
...@@ -310,7 +312,6 @@ func Main() { ...@@ -310,7 +312,6 @@ func Main() {
dclcontext = PEXTERN dclcontext = PEXTERN
nerrors = 0 nerrors = 0
lexlineno = 1 lexlineno = 1
const BOM = 0xFEFF
loadsys() loadsys()
...@@ -575,10 +576,14 @@ func addidir(dir string) { ...@@ -575,10 +576,14 @@ func addidir(dir string) {
} }
} }
func isDriveLetter(b byte) bool {
return 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z'
}
// is this path a local name? begins with ./ or ../ or / // is this path a local name? begins with ./ or ../ or /
func islocalname(name string) bool { func islocalname(name string) bool {
return strings.HasPrefix(name, "/") || return strings.HasPrefix(name, "/") ||
Ctxt.Windows != 0 && len(name) >= 3 && isAlpha(int(name[0])) && name[1] == ':' && name[2] == '/' || Ctxt.Windows != 0 && len(name) >= 3 && isDriveLetter(name[0]) && name[1] == ':' && name[2] == '/' ||
strings.HasPrefix(name, "./") || name == "." || strings.HasPrefix(name, "./") || name == "." ||
strings.HasPrefix(name, "../") || name == ".." strings.HasPrefix(name, "../") || name == ".."
} }
...@@ -829,20 +834,17 @@ func importfile(f *Val, indent []byte) { ...@@ -829,20 +834,17 @@ func importfile(f *Val, indent []byte) {
} }
} }
func isSpace(c int) bool { func isSpace(c rune) bool {
return c == ' ' || c == '\t' || c == '\n' || c == '\r' return c == ' ' || c == '\t' || c == '\n' || c == '\r'
} }
func isAlpha(c int) bool { func isLetter(c rune) bool {
return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
} }
func isDigit(c int) bool { func isDigit(c rune) bool {
return '0' <= c && c <= '9' return '0' <= c && c <= '9'
} }
func isAlnum(c int) bool {
return isAlpha(c) || isDigit(c)
}
func plan9quote(s string) string { func plan9quote(s string) string {
if s == "" { if s == "" {
...@@ -856,23 +858,11 @@ func plan9quote(s string) string { ...@@ -856,23 +858,11 @@ func plan9quote(s string) string {
return s return s
} }
func isfrog(c int) bool {
// complain about possibly invisible control characters
if c < ' ' {
return !isSpace(c) // exclude good white space
}
if 0x7f <= c && c <= 0xa0 { // DEL, unicode block including unbreakable space.
return true
}
return false
}
type lexer struct { type lexer struct {
// source // source
bin *obj.Biobuf bin *obj.Biobuf
peekc int peekr1 rune
peekc1 int // second peekc for ... peekr2 rune // second peekc for ...
nlsemi bool // if set, '\n' and EOF translate to ';' nlsemi bool // if set, '\n' and EOF translate to ';'
...@@ -932,7 +922,7 @@ const ( ...@@ -932,7 +922,7 @@ const (
) )
func (l *lexer) next() { func (l *lexer) next() {
var c1 int var c1 rune
var op Op var op Op
var escflag int var escflag int
var v int64 var v int64
...@@ -947,33 +937,73 @@ func (l *lexer) next() { ...@@ -947,33 +937,73 @@ func (l *lexer) next() {
l0: l0:
// skip white space // skip white space
c := l.getc() c := l.getr()
for isSpace(c) { for isSpace(c) {
if c == '\n' && nlsemi { if c == '\n' && nlsemi {
l.ungetc(c) // TODO(gri) we may be able avoid the ungetr and simply use lexlineno-1 below
l.ungetr(c) // for correct line number
if Debug['x'] != 0 { if Debug['x'] != 0 {
fmt.Printf("lex: implicit semi\n") fmt.Printf("lex: implicit semi\n")
} }
lineno = lexlineno
l.tok = ';' l.tok = ';'
return return
} }
c = l.getc() c = l.getr()
} }
// start of token // start of token
lineno = lexlineno lineno = lexlineno
if c >= utf8.RuneSelf { // identifiers and keywords
// all multibyte runes are alpha // (for better error messages consume all chars >= utf8.RuneSelf for identifiers)
if isLetter(c) || c >= utf8.RuneSelf {
cp = &lexbuf cp = &lexbuf
cp.Reset() cp.Reset()
goto talph
}
if isAlpha(c) { // accelerate common case (7bit ASCII)
cp = &lexbuf for isLetter(c) || isDigit(c) {
cp.Reset() cp.WriteByte(byte(c))
goto talph c = l.getr()
}
// general case
for {
if c >= utf8.RuneSelf {
if unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) || importpkg != nil && c == 0xb7 {
if cp.Len() == 0 && unicode.IsDigit(c) {
Yyerror("identifier cannot begin with digit %#U", c)
}
} else {
Yyerror("invalid identifier character %#U", c)
}
cp.WriteRune(c)
} else if isLetter(c) || isDigit(c) {
cp.WriteByte(byte(c))
} else {
break
}
c = l.getr()
}
cp = nil
l.ungetr(c)
s = LookupBytes(lexbuf.Bytes())
if s.Lexical == LIGNORE {
goto l0
}
if Debug['x'] != 0 {
fmt.Printf("lex: %s %s\n", s, lexname(rune(s.Lexical)))
}
l.sym_ = s
switch s.Lexical {
case LNAME, LRETURN, LBREAK, LCONTINUE, LFALL:
l.nlsemi = true
}
l.tok = int32(s.Lexical)
return
} }
if isDigit(c) { if isDigit(c) {
...@@ -982,7 +1012,7 @@ l0: ...@@ -982,7 +1012,7 @@ l0:
if c != '0' { if c != '0' {
for { for {
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
if isDigit(c) { if isDigit(c) {
continue continue
} }
...@@ -1000,11 +1030,11 @@ l0: ...@@ -1000,11 +1030,11 @@ l0:
} }
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
if c == 'x' || c == 'X' { if c == 'x' || c == 'X' {
for { for {
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
if isDigit(c) { if isDigit(c) {
continue continue
} }
...@@ -1037,7 +1067,7 @@ l0: ...@@ -1037,7 +1067,7 @@ l0:
c1 = 1 // not octal c1 = 1 // not octal
} }
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
} }
if c == '.' { if c == '.' {
...@@ -1057,8 +1087,7 @@ l0: ...@@ -1057,8 +1087,7 @@ l0:
switch c { switch c {
case EOF: case EOF:
lineno = prevlineno l.ungetr(EOF) // return EOF again in future next call
l.ungetc(EOF)
// Treat EOF as "end of line" for the purposes // Treat EOF as "end of line" for the purposes
// of inserting a semicolon. // of inserting a semicolon.
if nlsemi { if nlsemi {
...@@ -1071,13 +1100,8 @@ l0: ...@@ -1071,13 +1100,8 @@ l0:
l.tok = -1 l.tok = -1
return return
case '_':
cp = &lexbuf
cp.Reset()
goto talph
case '.': case '.':
c1 = l.getc() c1 = l.getr()
if isDigit(c1) { if isDigit(c1) {
cp = &lexbuf cp = &lexbuf
cp.Reset() cp.Reset()
...@@ -1087,13 +1111,13 @@ l0: ...@@ -1087,13 +1111,13 @@ l0:
} }
if c1 == '.' { if c1 == '.' {
c1 = l.getc() c1 = l.getr()
if c1 == '.' { if c1 == '.' {
c = LDDD c = LDDD
goto lx goto lx
} }
l.ungetc(c1) l.ungetr(c1)
c1 = '.' c1 = '.'
} }
...@@ -1127,7 +1151,7 @@ l0: ...@@ -1127,7 +1151,7 @@ l0:
cp.Reset() cp.Reset()
for { for {
c = int(l.getr()) c = l.getr()
if c == '\r' { if c == '\r' {
continue continue
} }
...@@ -1139,7 +1163,7 @@ l0: ...@@ -1139,7 +1163,7 @@ l0:
if c == '`' { if c == '`' {
break break
} }
cp.WriteRune(rune(c)) cp.WriteRune(c)
} }
goto strlit goto strlit
...@@ -1153,7 +1177,7 @@ l0: ...@@ -1153,7 +1177,7 @@ l0:
if !l.escchar('\'', &escflag, &v) { if !l.escchar('\'', &escflag, &v) {
Yyerror("missing '") Yyerror("missing '")
l.ungetc(int(v)) l.ungetr(rune(v))
} }
x := new(Mpint) x := new(Mpint)
...@@ -1163,25 +1187,25 @@ l0: ...@@ -1163,25 +1187,25 @@ l0:
if Debug['x'] != 0 { if Debug['x'] != 0 {
fmt.Printf("lex: codepoint literal\n") fmt.Printf("lex: codepoint literal\n")
} }
litbuf = "string literal" litbuf = "rune literal"
l.nlsemi = true l.nlsemi = true
l.tok = LLITERAL l.tok = LLITERAL
return return
case '/': case '/':
c1 = l.getc() c1 = l.getr()
if c1 == '*' { if c1 == '*' {
nl := false nl := false
for { for {
c = int(l.getr()) c = l.getr()
if c == '\n' { if c == '\n' {
nl = true nl = true
} }
for c == '*' { for c == '*' {
c = int(l.getr()) c = l.getr()
if c == '/' { if c == '/' {
if nl { if nl {
l.ungetc('\n') l.ungetr('\n')
} }
goto l0 goto l0
} }
...@@ -1202,11 +1226,11 @@ l0: ...@@ -1202,11 +1226,11 @@ l0:
c = l.getlinepragma() c = l.getlinepragma()
for { for {
if c == '\n' || c == EOF { if c == '\n' || c == EOF {
l.ungetc(c) l.ungetr(c)
goto l0 goto l0
} }
c = int(l.getr()) c = l.getr()
} }
} }
...@@ -1216,31 +1240,31 @@ l0: ...@@ -1216,31 +1240,31 @@ l0:
} }
case ':': case ':':
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
c = int(LCOLAS) c = LCOLAS
goto lx goto lx
} }
case '*': case '*':
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
op = OMUL op = OMUL
goto asop goto asop
} }
case '%': case '%':
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
op = OMOD op = OMOD
goto asop goto asop
} }
case '+': case '+':
c1 = l.getc() c1 = l.getr()
if c1 == '+' { if c1 == '+' {
l.nlsemi = true l.nlsemi = true
c = int(LINC) c = LINC
goto lx goto lx
} }
...@@ -1250,10 +1274,10 @@ l0: ...@@ -1250,10 +1274,10 @@ l0:
} }
case '-': case '-':
c1 = l.getc() c1 = l.getr()
if c1 == '-' { if c1 == '-' {
l.nlsemi = true l.nlsemi = true
c = int(LDEC) c = LDEC
goto lx goto lx
} }
...@@ -1263,10 +1287,10 @@ l0: ...@@ -1263,10 +1287,10 @@ l0:
} }
case '>': case '>':
c1 = l.getc() c1 = l.getr()
if c1 == '>' { if c1 == '>' {
c = int(LRSH) c = LRSH
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
op = ORSH op = ORSH
goto asop goto asop
...@@ -1276,17 +1300,17 @@ l0: ...@@ -1276,17 +1300,17 @@ l0:
} }
if c1 == '=' { if c1 == '=' {
c = int(LGE) c = LGE
goto lx goto lx
} }
c = int(LGT) c = LGT
case '<': case '<':
c1 = l.getc() c1 = l.getr()
if c1 == '<' { if c1 == '<' {
c = int(LLSH) c = LLSH
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
op = OLSH op = OLSH
goto asop goto asop
...@@ -1296,41 +1320,41 @@ l0: ...@@ -1296,41 +1320,41 @@ l0:
} }
if c1 == '=' { if c1 == '=' {
c = int(LLE) c = LLE
goto lx goto lx
} }
if c1 == '-' { if c1 == '-' {
c = int(LCOMM) c = LCOMM
goto lx goto lx
} }
c = int(LLT) c = LLT
case '=': case '=':
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
c = int(LEQ) c = LEQ
goto lx goto lx
} }
case '!': case '!':
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
c = int(LNE) c = LNE
goto lx goto lx
} }
case '&': case '&':
c1 = l.getc() c1 = l.getr()
if c1 == '&' { if c1 == '&' {
c = int(LANDAND) c = LANDAND
goto lx goto lx
} }
if c1 == '^' { if c1 == '^' {
c = int(LANDNOT) c = LANDNOT
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
op = OANDNOT op = OANDNOT
goto asop goto asop
...@@ -1345,9 +1369,9 @@ l0: ...@@ -1345,9 +1369,9 @@ l0:
} }
case '|': case '|':
c1 = l.getc() c1 = l.getr()
if c1 == '|' { if c1 == '|' {
c = int(LOROR) c = LOROR
goto lx goto lx
} }
...@@ -1357,21 +1381,32 @@ l0: ...@@ -1357,21 +1381,32 @@ l0:
} }
case '^': case '^':
c1 = l.getc() c1 = l.getr()
if c1 == '=' { if c1 == '=' {
op = OXOR op = OXOR
goto asop goto asop
} }
case '(', '[', '{', ',', ';':
goto lx
case ')', ']', '}': case ')', ']', '}':
l.nlsemi = true l.nlsemi = true
goto lx goto lx
case '#', '$', '?', '@', '\\':
if importpkg != nil {
goto lx
}
fallthrough
default: default:
goto lx // anything else is illegal
Yyerror("syntax error: illegal character %#U", c)
goto l0
} }
l.ungetc(c1) l.ungetr(c1)
lx: lx:
if Debug['x'] != 0 { if Debug['x'] != 0 {
...@@ -1381,17 +1416,8 @@ lx: ...@@ -1381,17 +1416,8 @@ lx:
fmt.Printf("%v lex: TOKEN '%c'\n", Ctxt.Line(int(lexlineno)), c) fmt.Printf("%v lex: TOKEN '%c'\n", Ctxt.Line(int(lexlineno)), c)
} }
} }
if isfrog(c) {
Yyerror("illegal character 0x%x", uint(c))
goto l0
}
if importpkg == nil && (c == '#' || c == '$' || c == '?' || c == '@' || c == '\\') {
Yyerror("%s: unexpected %c", "syntax error", c)
goto l0
}
l.tok = int32(c) l.tok = c
return return
asop: asop:
...@@ -1402,52 +1428,9 @@ asop: ...@@ -1402,52 +1428,9 @@ asop:
l.tok = LASOP l.tok = LASOP
return return
// cp is set to lexbuf and some
// prefix has been stored
talph:
for {
if c >= utf8.RuneSelf {
l.ungetc(c)
r := rune(l.getr())
// 0xb7 · is used for internal names
if !unicode.IsLetter(r) && !unicode.IsDigit(r) && (importpkg == nil || r != 0xb7) {
Yyerror("invalid identifier character U+%04x", r)
}
if cp.Len() == 0 && unicode.IsDigit(r) {
Yyerror("identifier cannot begin with digit U+%04x", r)
}
cp.WriteRune(r)
} else if !isAlnum(c) && c != '_' {
break
} else {
cp.WriteByte(byte(c))
}
c = l.getc()
}
cp = nil
l.ungetc(c)
s = LookupBytes(lexbuf.Bytes())
if s.Lexical == LIGNORE {
goto l0
}
if Debug['x'] != 0 {
fmt.Printf("lex: %s %s\n", s, lexname(int(s.Lexical)))
}
l.sym_ = s
switch s.Lexical {
case LNAME, LRETURN, LBREAK, LCONTINUE, LFALL:
l.nlsemi = true
}
l.tok = int32(s.Lexical)
return
ncu: ncu:
cp = nil cp = nil
l.ungetc(c) l.ungetr(c)
str = lexbuf.String() str = lexbuf.String()
l.val.U = new(Mpint) l.val.U = new(Mpint)
...@@ -1468,7 +1451,7 @@ ncu: ...@@ -1468,7 +1451,7 @@ ncu:
casedot: casedot:
for { for {
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
if !isDigit(c) { if !isDigit(c) {
break break
} }
...@@ -1488,10 +1471,10 @@ caseep: ...@@ -1488,10 +1471,10 @@ caseep:
Yyerror("malformed floating point constant") Yyerror("malformed floating point constant")
} }
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
if c == '+' || c == '-' { if c == '+' || c == '-' {
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
} }
if !isDigit(c) { if !isDigit(c) {
...@@ -1499,7 +1482,7 @@ caseep: ...@@ -1499,7 +1482,7 @@ caseep:
} }
for isDigit(c) { for isDigit(c) {
cp.WriteByte(byte(c)) cp.WriteByte(byte(c))
c = l.getc() c = l.getr()
} }
if c == 'i' { if c == 'i' {
...@@ -1530,7 +1513,7 @@ casei: ...@@ -1530,7 +1513,7 @@ casei:
caseout: caseout:
cp = nil cp = nil
l.ungetc(c) l.ungetr(c)
str = lexbuf.String() str = lexbuf.String()
l.val.U = newMpflt() l.val.U = newMpflt()
...@@ -1571,7 +1554,7 @@ func internString(b []byte) string { ...@@ -1571,7 +1554,7 @@ func internString(b []byte) string {
func more(pp *string) bool { func more(pp *string) bool {
p := *pp p := *pp
for p != "" && isSpace(int(p[0])) { for p != "" && isSpace(rune(p[0])) {
p = p[1:] p = p[1:]
} }
*pp = p *pp = p
...@@ -1582,16 +1565,16 @@ func more(pp *string) bool { ...@@ -1582,16 +1565,16 @@ func more(pp *string) bool {
// //line parse.y:15 // //line parse.y:15
// as a discontinuity in sequential line numbers. // as a discontinuity in sequential line numbers.
// the next line of input comes from parse.y:15 // the next line of input comes from parse.y:15
func (l *lexer) getlinepragma() int { func (l *lexer) getlinepragma() rune {
var cmd, verb, name string var cmd, verb, name string
c := int(l.getr()) c := l.getr()
if c == 'g' { if c == 'g' {
cp := &lexbuf cp := &lexbuf
cp.Reset() cp.Reset()
cp.WriteByte('g') // already read cp.WriteByte('g') // already read
for { for {
c = int(l.getr()) c = l.getr()
if c == EOF || c >= utf8.RuneSelf { if c == EOF || c >= utf8.RuneSelf {
return c return c
} }
...@@ -1683,8 +1666,8 @@ func (l *lexer) getlinepragma() int { ...@@ -1683,8 +1666,8 @@ func (l *lexer) getlinepragma() int {
return c return c
} }
for i := 1; i < 5; i++ { for i := 1; i < 5; i++ {
c = int(l.getr()) c = l.getr()
if c != int("line "[i]) { if c != rune("line "[i]) {
return c return c
} }
} }
...@@ -1693,7 +1676,7 @@ func (l *lexer) getlinepragma() int { ...@@ -1693,7 +1676,7 @@ func (l *lexer) getlinepragma() int {
cp.Reset() cp.Reset()
linep := 0 linep := 0
for { for {
c = int(l.getr()) c = l.getr()
if c == EOF { if c == EOF {
return c return c
} }
...@@ -1746,7 +1729,7 @@ func getimpsym(pp *string) string { ...@@ -1746,7 +1729,7 @@ func getimpsym(pp *string) string {
return "" return ""
} }
i := 0 i := 0
for i < len(p) && !isSpace(int(p[i])) && p[i] != '"' { for i < len(p) && !isSpace(rune(p[i])) && p[i] != '"' {
i++ i++
} }
sym := p[:i] sym := p[:i]
...@@ -1874,79 +1857,72 @@ func pragcgo(text string) { ...@@ -1874,79 +1857,72 @@ func pragcgo(text string) {
} }
} }
func (l *lexer) getc() int { func (l *lexer) getr() rune {
c := l.peekc // unread rune != 0 available
if c != 0 { if r := l.peekr1; r != 0 {
l.peekc = l.peekc1 l.peekr1 = l.peekr2
l.peekc1 = 0 l.peekr2 = 0
goto check if r == '\n' && importpkg == nil {
lexlineno++
}
return r
} }
loop: redo:
c = obj.Bgetc(l.bin) // common case: 7bit ASCII
// recognize BOM (U+FEFF): UTF-8 encoding is 0xef 0xbb 0xbf c := obj.Bgetc(l.bin)
if c == 0xef { if c < utf8.RuneSelf {
buf, err := l.bin.Peek(2) if c == 0 {
if err != nil { // TODO(gri) do we need lineno = lexlineno here? Why not?
yyerrorl(int(lexlineno), "illegal UTF-8 sequence ef % x followed by read error (%v)", string(buf), err) Yyerror("illegal NUL byte")
errorexit() return 0
} }
if buf[0] == 0xbb && buf[1] == 0xbf { if c == '\n' && importpkg == nil {
yyerrorl(int(lexlineno), "Unicode (UTF-8) BOM in middle of file") lexlineno++
// consume BOM bytes
obj.Bgetc(l.bin)
obj.Bgetc(l.bin)
goto loop
} }
return rune(c)
} }
// c >= utf8.RuneSelf
check: // uncommon case: non-ASCII
if c == 0 { var buf [utf8.UTFMax]byte
Yyerror("illegal NUL byte") buf[0] = byte(c)
return 0 buf[1] = byte(obj.Bgetc(l.bin))
i := 2
for ; i < len(buf) && !utf8.FullRune(buf[:i]); i++ {
buf[i] = byte(obj.Bgetc(l.bin))
} }
if c == '\n' && importpkg == nil {
lexlineno++ r, w := utf8.DecodeRune(buf[:i])
if r == utf8.RuneError && w == 1 {
lineno = lexlineno
// The string conversion here makes a copy for passing
// to fmt.Printf, so that buf itself does not escape and
// can be allocated on the stack.
Yyerror("illegal UTF-8 sequence % x", string(buf[:i+1]))
} }
return c
}
func (l *lexer) ungetc(c int) { if r == BOM {
l.peekc1 = l.peekc // TODO(gri) can we use Yyerror here? Why not?
l.peekc = c yyerrorl(int(lexlineno), "Unicode (UTF-8) BOM in middle of file")
if c == '\n' && importpkg == nil { goto redo
lexlineno--
} }
}
func (l *lexer) getr() int32 { return r
var buf [utf8.UTFMax]byte }
for i := 0; ; i++ { func (l *lexer) ungetr(r rune) {
c := l.getc() l.peekr2 = l.peekr1
if i == 0 && c < utf8.RuneSelf { l.peekr1 = r
return int32(c) if r == '\n' && importpkg == nil {
} lexlineno--
buf[i] = byte(c)
if i+1 == len(buf) || utf8.FullRune(buf[:i+1]) {
r, w := utf8.DecodeRune(buf[:i+1])
if r == utf8.RuneError && w == 1 {
lineno = lexlineno
// The string conversion here makes a copy for passing
// to fmt.Printf, so that buf itself does not escape and can
// be allocated on the stack.
Yyerror("illegal UTF-8 sequence % x", string(buf[:i+1]))
}
return int32(r)
}
} }
} }
func (l *lexer) escchar(e int, escflg *int, val *int64) bool { func (l *lexer) escchar(e rune, escflg *int, val *int64) bool {
*escflg = 0 *escflg = 0
c := int(l.getr()) c := l.getr()
switch c { switch c {
case EOF: case EOF:
Yyerror("eof in string") Yyerror("eof in string")
...@@ -1968,7 +1944,7 @@ func (l *lexer) escchar(e int, escflg *int, val *int64) bool { ...@@ -1968,7 +1944,7 @@ func (l *lexer) escchar(e int, escflg *int, val *int64) bool {
} }
u := 0 u := 0
c = int(l.getr()) c = l.getr()
var i int var i int
switch c { switch c {
case 'x': case 'x':
...@@ -1997,14 +1973,14 @@ func (l *lexer) escchar(e int, escflg *int, val *int64) bool { ...@@ -1997,14 +1973,14 @@ func (l *lexer) escchar(e int, escflg *int, val *int64) bool {
*escflg = 1 // it's a byte *escflg = 1 // it's a byte
x := int64(c) - '0' x := int64(c) - '0'
for i := 2; i > 0; i-- { for i := 2; i > 0; i-- {
c = l.getc() c = l.getr()
if c >= '0' && c <= '7' { if c >= '0' && c <= '7' {
x = x*8 + int64(c) - '0' x = x*8 + int64(c) - '0'
continue continue
} }
Yyerror("non-octal character in escape sequence: %c", c) Yyerror("non-octal character in escape sequence: %c", c)
l.ungetc(c) l.ungetr(c)
} }
if x > 255 { if x > 255 {
...@@ -2043,7 +2019,7 @@ func (l *lexer) escchar(e int, escflg *int, val *int64) bool { ...@@ -2043,7 +2019,7 @@ func (l *lexer) escchar(e int, escflg *int, val *int64) bool {
hex: hex:
x := int64(0) x := int64(0)
for ; i > 0; i-- { for ; i > 0; i-- {
c = l.getc() c = l.getr()
if c >= '0' && c <= '9' { if c >= '0' && c <= '9' {
x = x*16 + int64(c) - '0' x = x*16 + int64(c) - '0'
continue continue
...@@ -2060,7 +2036,7 @@ hex: ...@@ -2060,7 +2036,7 @@ hex:
} }
Yyerror("non-hex character in escape sequence: %c", c) Yyerror("non-hex character in escape sequence: %c", c)
l.ungetc(c) l.ungetr(c)
break break
} }
...@@ -2377,7 +2353,7 @@ func lexfini() { ...@@ -2377,7 +2353,7 @@ func lexfini() {
nodfp.Sym = Lookup(".fp") nodfp.Sym = Lookup(".fp")
} }
var lexn = map[int]string{ var lexn = map[rune]string{
LANDAND: "ANDAND", LANDAND: "ANDAND",
LANDNOT: "ANDNOT", LANDNOT: "ANDNOT",
LASOP: "ASOP", LASOP: "ASOP",
...@@ -2424,7 +2400,7 @@ var lexn = map[int]string{ ...@@ -2424,7 +2400,7 @@ var lexn = map[int]string{
LVAR: "VAR", LVAR: "VAR",
} }
func lexname(lex int) string { func lexname(lex rune) string {
if s, ok := lexn[lex]; ok { if s, ok := lexn[lex]; ok {
return s return s
} }
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
package a package a
import"" // ERROR "import path is empty" import"" // ERROR "import path is empty"
var? // ERROR "unexpected \?" var? // ERROR "illegal character U\+003F '\?'"
var x int // ERROR "unexpected var" "cannot declare name" var x int // ERROR "unexpected var" "cannot declare name"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment