implemented InsertSemis mode for go/scanner

R=rsc https://golang.org/cl/175047

implemented InsertSemis mode for go/scanner
R=rsc https://golang.org/cl/175047
a70caf44 · Robert Griesemer · 2b3813d0 · a70caf44 · a70caf44
Commit a70caf44 authored Dec 10, 2009 by Robert Griesemer
Hide whitespace changes
Inline Side-by-side

Showing with 236 additions and 20 deletions

scanner.go src/pkg/go/scanner/scanner.go +79 -17

scanner_test.go src/pkg/go/scanner/scanner_test.go +157 -3

No files found.
--- a/src/pkg/go/scanner/scanner.go
+++ b/src/pkg/go/scanner/scanner.go
@@ -29,9 +29,11 @@ type Scanner struct {
 	mode	uint;		// scanning mode
 	// scanning state
-	pos	token.Position;	// previous reading position (position before ch)
+	pos		token.Position;	// previous reading position (position before ch)
-	offset	int;		// current reading offset (position after ch)
+	offset		int;		// current reading offset (position after ch)
-	ch	int;		// one char look-ahead
+	ch		int;		// one char look-ahead
+	insertSemi	bool;		// insert a semicolon before next newline
+	pendingComment	token.Position;	// valid if pendingComment.Line > 0
 	// public state - ok to modify
 	ErrorCount	int;	// number of errors encountered
@@ -69,6 +71,7 @@ func (S *Scanner) next() {
 const (
 	ScanComments		= 1 << iota;	// return comments as COMMENT tokens
 	AllowIllegalChars;	// do not report an error for illegal chars
+	InsertSemis;		// automatically insert semicolons
 )
@@ -420,6 +423,8 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke
 }
+var semicolon = []byte{';'}
 // Scan scans the next token and returns the token position pos,
 // the token tok, and the literal text lit corresponding to the
 // token. The source end is indicated by token.EOF.
@@ -432,40 +437,63 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Toke
 // of the error handler, if there was one installed.
 //
 func (S *Scanner) Scan() (pos token.Position, tok token.Token, lit []byte) {
-scan_again:
+	if S.pendingComment.Line > 0 {
+		// "consume" pending comment
+		S.pos = S.pendingComment;
+		S.offset = S.pos.Offset + 1;
+		S.ch = '/';
+		S.pendingComment.Line = 0;
+	}
+scanAgain:
 	// skip white space
-	for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' || S.ch == '\r' {
+	for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
 		S.next()
 	}
 	// current token start
+	insertSemi := false;
 	pos, tok = S.pos, token.ILLEGAL;
 	// determine token value
 	switch ch := S.ch; {
 	case isLetter(ch):
-		tok = S.scanIdentifier()
+		tok = S.scanIdentifier();
+		switch tok {
+		case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
+			insertSemi = true
+		default:
+			insertSemi = false
+		}
 	case digitVal(ch) < 10:
-		tok = S.scanNumber(false)
+		insertSemi = true;
+		tok = S.scanNumber(false);
 	default:
 		S.next();	// always make progress
 		switch ch {
 		case -1:
 			tok = token.EOF
+		case '\n':
+			S.insertSemi = false;
+			return pos, token.SEMICOLON, semicolon;
 		case '"':
+			insertSemi = true;
 			tok = token.STRING;
 			S.scanString(pos);
 		case '\'':
+			insertSemi = true;
 			tok = token.CHAR;
 			S.scanChar(pos);
 		case '`':
+			insertSemi = true;
 			tok = token.STRING;
 			S.scanRawString(pos);
 		case ':':
 			tok = S.switch2(token.COLON, token.DEFINE)
 		case '.':
 			if digitVal(S.ch) < 10 {
-				tok = S.scanNumber(true)
+				insertSemi = true;
+				tok = S.scanNumber(true);
 			} else if S.ch == '.' {
 				S.next();
 				if S.ch == '.' {
@@ -482,27 +510,57 @@ scan_again:
 		case '(':
 			tok = token.LPAREN
 		case ')':
-			tok = token.RPAREN
+			insertSemi = true;
+			tok = token.RPAREN;
 		case '[':
 			tok = token.LBRACK
 		case ']':
-			tok = token.RBRACK
+			insertSemi = true;
+			tok = token.RBRACK;
 		case '{':
 			tok = token.LBRACE
 		case '}':
-			tok = token.RBRACE
+			insertSemi = true;
+			tok = token.RBRACE;
 		case '+':
-			tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
+			tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC);
+			if tok == token.INC {
+				insertSemi = true
+			}
 		case '-':
-			tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
+			tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC);
+			if tok == token.DEC {
+				insertSemi = true
+			}
 		case '*':
 			tok = S.switch2(token.MUL, token.MUL_ASSIGN)
 		case '/':
 			if S.ch == '/' || S.ch == '*' {
-				S.scanComment(pos);
+				// comment
-				tok = token.COMMENT;
+				newline := false;
-				if S.mode&ScanComments == 0 {
+				if S.insertSemi {
-					goto scan_again
+					if S.ch == '/' {
+						// a line comment acts like a newline
+						newline = true
+					} else {
+						// a general comment may act like a newline
+						S.scanComment(pos);
+						newline = pos.Line < S.pos.Line;
+					}
+				} else {
+					S.scanComment(pos)
+				}
+				if newline {
+					// insert a semicolon and retain pending comment
+					S.insertSemi = false;
+					S.pendingComment = pos;
+					return pos, token.SEMICOLON, semicolon;
+				} else if S.mode&ScanComments == 0 {
+					// skip comment
+					goto scanAgain
+				} else {
+					insertSemi = S.insertSemi;	// preserve insertSemi info
+					tok = token.COMMENT;
 				}
 			} else {
 				tok = S.switch2(token.QUO, token.QUO_ASSIGN)
@@ -537,9 +595,13 @@ scan_again:
 			if S.mode&AllowIllegalChars == 0 {
 				S.error(pos, "illegal character "+charString(ch))
 			}
+			insertSemi = S.insertSemi;	// preserve insertSemi info
 		}
 	}
+	if S.mode&InsertSemis != 0 {
+		S.insertSemi = insertSemi
+	}
 	return pos, tok, S.src[pos.Offset:S.pos.Offset];
 }

--- a/src/pkg/go/scanner/scanner_test.go
+++ b/src/pkg/go/scanner/scanner_test.go
@@ -225,13 +225,13 @@ func TestScan(t *testing.T) {
 			}
 			checkPos(t, lit, pos, epos);
 			if tok != e.tok {
-				t.Errorf("bad token for %s: got %s, expected %s", lit, tok.String(), e.tok.String())
+				t.Errorf("bad token for %q: got %s, expected %s", lit, tok.String(), e.tok.String())
 			}
 			if e.tok.IsLiteral() && lit != e.lit {
-				t.Errorf("bad literal for %s: got %s, expected %s", lit, lit, e.lit)
+				t.Errorf("bad literal for %q: got %q, expected %q", lit, lit, e.lit)
 			}
 			if tokenclass(tok) != e.class {
-				t.Errorf("bad class for %s: got %d, expected %d", lit, tokenclass(tok), e.class)
+				t.Errorf("bad class for %q: got %d, expected %d", lit, tokenclass(tok), e.class)
 			}
 			epos.Offset += len(lit) + len(whitespace);
 			epos.Line += NewlineCount(lit) + whitespace_linecount;
@@ -249,6 +249,160 @@ func TestScan(t *testing.T) {
 }
+func getTok(_ token.Position, tok token.Token, _ []byte) token.Token {
+	return tok
+}
+func checkSemi(t *testing.T, line string, mode uint) {
+	var S Scanner;
+	S.Init("TestSemis", strings.Bytes(line), nil, mode);
+	pos, tok, lit := S.Scan();
+	for tok != token.EOF {
+		if tok == token.ILLEGAL {
+			// next token must be a semicolon
+			offs := pos.Offset + 1;
+			pos, tok, lit = S.Scan();
+			if tok == token.SEMICOLON {
+				if pos.Offset != offs {
+					t.Errorf("bad offset for %q: got %d, expected %d", line, pos.Offset, offs)
+				}
+				if string(lit) != ";" {
+					t.Errorf(`bad literal for %q: got %q, expected ";"`, line, lit)
+				}
+			} else {
+				t.Errorf("bad token for %q: got %s, expected ;", line, tok.String())
+			}
+		} else if tok == token.SEMICOLON {
+			t.Errorf("bad token for %q: got ;, expected no ;", line)
+		}
+		pos, tok, lit = S.Scan();
+	}
+}
+var lines = []string{
+	// the $ character indicates where a semicolon is expected
+	"",
+	"foo$\n",
+	"123$\n",
+	"1.2$\n",
+	"'x'$\n",
+	`"x"` + "$\n",
+	"`x`$\n",
+	"+\n",
+	"-\n",
+	"*\n",
+	"/\n",
+	"%\n",
+	"&\n",
+	"|\n",
+	"^\n",
+	"<<\n",
+	">>\n",
+	"&^\n",
+	"+=\n",
+	"-=\n",
+	"*=\n",
+	"/=\n",
+	"%=\n",
+	"&=\n",
+	"|=\n",
+	"^=\n",
+	"<<=\n",
+	">>=\n",
+	"&^=\n",
+	"&&\n",
+	"||\n",
+	"<-\n",
+	"++$\n",
+	"--$\n",
+	"==\n",
+	"<\n",
+	">\n",
+	"=\n",
+	"!\n",
+	"!=\n",
+	"<=\n",
+	">=\n",
+	":=\n",
+	"...\n",
+	"(\n",
+	"[\n",
+	"{\n",
+	",\n",
+	".\n",
+	")$\n",
+	"]$\n",
+	"}$\n",
+	"$;\n",
+	":\n",
+	"break$\n",
+	"case\n",
+	"chan\n",
+	"const\n",
+	"continue$\n",
+	"default\n",
+	"defer\n",
+	"else\n",
+	"fallthrough$\n",
+	"for\n",
+	"func\n",
+	"go\n",
+	"goto\n",
+	"if\n",
+	"import\n",
+	"interface\n",
+	"map\n",
+	"package\n",
+	"range\n",
+	"return$\n",
+	"select\n",
+	"struct\n",
+	"switch\n",
+	"type\n",
+	"var\n",
+	"foo$//comment\n",
+	"foo$/*comment*/\n",
+	"foo$/*\n*/",
+	"foo    $// comment\n",
+	"foo    $/*comment*/\n",
+	"foo    $/*\n*/",
+	// TODO(gri): These need to insert the semicolon *before* the
+	//            first comment which requires arbitrary far look-
+	//            ahead. Only relevant for gofmt placement of
+	//            comments.
+	"foo    /*comment*/    $\n",
+	"foo    /*0*/ /*1*/ $/*2*/\n",
+}
+func TestSemis(t *testing.T) {
+	for _, line := range lines {
+		checkSemi(t, line, AllowIllegalChars|InsertSemis)
+	}
+	for _, line := range lines {
+		checkSemi(t, line, AllowIllegalChars|InsertSemis|ScanComments)
+	}
+}
 type seg struct {
 	srcline		string;	// a line of source text
 	filename	string;	// filename for current token