Commit 1161d190 authored by Robert Griesemer

scanner: error handler must be provided to Init

Init may report an error on the first character, so
there must be a way to set the error handler before that
character is read. This was a design bug.

Added corresponding test cases and better documentation.
Also: Fixed a subtle infinite loop exposed by one of the
new test cases.

Fixes #1380.

R=rsc, gri
CC=golang-dev
https://golang.org/cl/4094041
parent ab036abd
...@@ -144,7 +144,7 @@ type Scanner struct { ...@@ -144,7 +144,7 @@ type Scanner struct {
// the token text's head may be buffered in tokBuf while the token text's // the token text's head may be buffered in tokBuf while the token text's
// tail is stored in srcBuf. // tail is stored in srcBuf.
tokBuf bytes.Buffer // token text head that is not in srcBuf anymore tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
tokPos int // token text tail position (srcBuf index) tokPos int // token text tail position (srcBuf index); valid if >= 0
tokEnd int // token text tail end (srcBuf index) tokEnd int // token text tail end (srcBuf index)
// One character look-ahead // One character look-ahead
...@@ -175,13 +175,14 @@ type Scanner struct { ...@@ -175,13 +175,14 @@ type Scanner struct {
} }
// Init initializes a Scanner with a new source and returns itself. // Init initializes a Scanner with a new source and returns s.
// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
// and Whitespace is set to GoWhitespace. // and Whitespace is set to GoWhitespace.
func (s *Scanner) Init(src io.Reader) *Scanner { func (s *Scanner) Init(src io.Reader) *Scanner {
s.src = src s.src = src
// initialize source buffer // initialize source buffer
// (the first call to next() will fill it by calling src.Read)
s.srcBuf[0] = utf8.RuneSelf // sentinel s.srcBuf[0] = utf8.RuneSelf // sentinel
s.srcPos = 0 s.srcPos = 0
s.srcEnd = 0 s.srcEnd = 0
...@@ -192,10 +193,11 @@ func (s *Scanner) Init(src io.Reader) *Scanner { ...@@ -192,10 +193,11 @@ func (s *Scanner) Init(src io.Reader) *Scanner {
s.column = 0 s.column = 0
// initialize token text buffer // initialize token text buffer
// (required for first call to next()).
s.tokPos = -1 s.tokPos = -1
// initialize one character look-ahead // initialize one character look-ahead
s.ch = s.next() s.ch = -1 // no char read yet
// initialize public fields // initialize public fields
s.Error = nil s.Error = nil
...@@ -222,15 +224,20 @@ func (s *Scanner) next() int { ...@@ -222,15 +224,20 @@ func (s *Scanner) next() int {
if s.tokPos >= 0 { if s.tokPos >= 0 {
s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos]) s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
s.tokPos = 0 s.tokPos = 0
// s.tokEnd is set by Scan()
} }
// move unread bytes to beginning of buffer // move unread bytes to beginning of buffer
copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd]) copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
s.srcBufOffset += s.srcPos s.srcBufOffset += s.srcPos
// read more bytes // read more bytes
// (an io.Reader must return os.EOF when it reaches
// the end of what it is reading - simply returning
// n == 0 will make this loop retry forever; but the
// error is in the reader implementation in that case)
i := s.srcEnd - s.srcPos i := s.srcEnd - s.srcPos
n, err := s.src.Read(s.srcBuf[i:bufLen]) n, err := s.src.Read(s.srcBuf[i:bufLen])
s.srcEnd = i + n
s.srcPos = 0 s.srcPos = 0
s.srcEnd = i + n
s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
if err != nil { if err != nil {
if s.srcEnd == 0 { if s.srcEnd == 0 {
...@@ -238,8 +245,12 @@ func (s *Scanner) next() int { ...@@ -238,8 +245,12 @@ func (s *Scanner) next() int {
} }
if err != os.EOF { if err != os.EOF {
s.error(err.String()) s.error(err.String())
break
} }
// If err == EOF, we won't be getting more
// bytes; break to avoid infinite loop. If
// err is something else, we don't know if
// we can get more bytes; thus also break.
break
} }
} }
// at least one byte // at least one byte
...@@ -251,7 +262,7 @@ func (s *Scanner) next() int { ...@@ -251,7 +262,7 @@ func (s *Scanner) next() int {
if ch == utf8.RuneError && width == 1 { if ch == utf8.RuneError && width == 1 {
s.error("illegal UTF-8 encoding") s.error("illegal UTF-8 encoding")
} }
s.srcPos += width - 1 s.srcPos += width - 1 // -1 because of s.srcPos++ below
} }
} }
...@@ -272,13 +283,13 @@ func (s *Scanner) next() int { ...@@ -272,13 +283,13 @@ func (s *Scanner) next() int {
// Next reads and returns the next Unicode character. // Next reads and returns the next Unicode character.
// It returns EOF at the end of the source. It reports // It returns EOF at the end of the source. It reports
// a read error by calling s.Error, if set, or else // a read error by calling s.Error, if not nil; otherwise
// prints an error message to os.Stderr. Next does not // it prints an error message to os.Stderr. Next does not
// update the Scanner's Position field; use Pos() to // update the Scanner's Position field; use Pos() to
// get the current position. // get the current position.
func (s *Scanner) Next() int { func (s *Scanner) Next() int {
s.tokPos = -1 // don't collect token text s.tokPos = -1 // don't collect token text
ch := s.ch ch := s.Peek()
s.ch = s.next() s.ch = s.next()
return ch return ch
} }
...@@ -288,6 +299,9 @@ func (s *Scanner) Next() int { ...@@ -288,6 +299,9 @@ func (s *Scanner) Next() int {
// the scanner. It returns EOF if the scanner's position is at the last // the scanner. It returns EOF if the scanner's position is at the last
// character of the source. // character of the source.
func (s *Scanner) Peek() int { func (s *Scanner) Peek() int {
if s.ch < 0 {
s.ch = s.next()
}
return s.ch return s.ch
} }
...@@ -511,10 +525,10 @@ func (s *Scanner) scanComment(ch int) { ...@@ -511,10 +525,10 @@ func (s *Scanner) scanComment(ch int) {
// Scan reads the next token or Unicode character from source and returns it. // Scan reads the next token or Unicode character from source and returns it.
// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
// It returns EOF at the end of the source. It reports scanner errors (read and // It returns EOF at the end of the source. It reports scanner errors (read and
// token errors) by calling s.Error, if set; otherwise it prints an error message // token errors) by calling s.Error, if not nil; otherwise it prints an error
// to os.Stderr. // message to os.Stderr.
func (s *Scanner) Scan() int { func (s *Scanner) Scan() int {
ch := s.ch ch := s.Peek()
// reset token text position // reset token text position
s.tokPos = -1 s.tokPos = -1
......
...@@ -10,6 +10,7 @@ import ( ...@@ -10,6 +10,7 @@ import (
"os" "os"
"strings" "strings"
"testing" "testing"
"utf8"
) )
...@@ -408,7 +409,7 @@ func TestScanWhitespace(t *testing.T) { ...@@ -408,7 +409,7 @@ func TestScanWhitespace(t *testing.T) {
func testError(t *testing.T, src, msg string, tok int) { func testError(t *testing.T, src, msg string, tok int) {
s := new(Scanner).Init(bytes.NewBufferString(src)) s := new(Scanner).Init(bytes.NewBufferString(src))
errorCalled := false errorCalled := false
s.Error = func(s *Scanner, m string) { s.Error = func(_ *Scanner, m string) {
if !errorCalled { if !errorCalled {
// only look at first error // only look at first error
if m != msg { if m != msg {
...@@ -431,6 +432,8 @@ func testError(t *testing.T, src, msg string, tok int) { ...@@ -431,6 +432,8 @@ func testError(t *testing.T, src, msg string, tok int) {
func TestError(t *testing.T) { func TestError(t *testing.T) {
testError(t, "\x00", "illegal character NUL", 0)
testError(t, "\xff", "illegal UTF-8 encoding", utf8.RuneError)
testError(t, `01238`, "illegal octal number", Int) testError(t, `01238`, "illegal octal number", Int)
testError(t, `'\"'`, "illegal char escape", Char) testError(t, `'\"'`, "illegal char escape", Char)
testError(t, `'aa'`, "illegal char literal", Char) testError(t, `'aa'`, "illegal char literal", Char)
...@@ -467,6 +470,7 @@ func TestPos(t *testing.T) { ...@@ -467,6 +470,7 @@ func TestPos(t *testing.T) {
s := new(Scanner).Init(bytes.NewBufferString("abc\n012\n\nx")) s := new(Scanner).Init(bytes.NewBufferString("abc\n012\n\nx"))
s.Mode = 0 s.Mode = 0
s.Whitespace = 0 s.Whitespace = 0
s.Peek() // get a defined position
checkPos(t, s, 0, 1, 1, 'a') checkPos(t, s, 0, 1, 1, 'a')
checkPos(t, s, 1, 1, 2, 'b') checkPos(t, s, 1, 1, 2, 'b')
checkPos(t, s, 2, 1, 3, 'c') checkPos(t, s, 2, 1, 3, 'c')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment