Commit 60c0b3b5 authored by Robert Griesemer

text/scanner: provide facility for custom identifiers

LGTM=r
R=golang-codereviews, r
CC=golang-codereviews
https://golang.org/cl/108030044
parent 54bc760a
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
// By default, a Scanner skips white space and Go comments and recognizes all // By default, a Scanner skips white space and Go comments and recognizes all
// literals as defined by the Go language specification. It may be // literals as defined by the Go language specification. It may be
// customized to recognize only a subset of those literals and to recognize // customized to recognize only a subset of those literals and to recognize
// different white space characters. // different identifier and white space characters.
// //
// Basic usage pattern: // Basic usage pattern:
// //
...@@ -34,8 +34,6 @@ import ( ...@@ -34,8 +34,6 @@ import (
"unicode/utf8" "unicode/utf8"
) )
// TODO(gri): Consider changing this to use the new (token) Position package.
// A source position is represented by a Position value. // A source position is represented by a Position value.
// A position is valid if Line > 0. // A position is valid if Line > 0.
type Position struct { type Position struct {
...@@ -164,6 +162,13 @@ type Scanner struct { ...@@ -164,6 +162,13 @@ type Scanner struct {
// for values ch > ' '). The field may be changed at any time. // for values ch > ' '). The field may be changed at any time.
Whitespace uint64 Whitespace uint64
// IsIdentRune is a predicate controlling the characters accepted
// as the ith rune in an identifier. The set of valid characters
// must not intersect with the set of white space characters.
// If no IsIdentRune function is set, regular Go identifiers are
// accepted instead. The field may be changed at any time.
IsIdentRune func(ch rune, i int) bool
// Start position of most recently scanned token; set by Scan. // Start position of most recently scanned token; set by Scan.
// Calling Init or Next invalidates the position (Line == 0). // Calling Init or Next invalidates the position (Line == 0).
// The Filename field is always left untouched by the Scanner. // The Filename field is always left untouched by the Scanner.
...@@ -334,9 +339,17 @@ func (s *Scanner) error(msg string) { ...@@ -334,9 +339,17 @@ func (s *Scanner) error(msg string) {
fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
} }
// isIdentRune reports whether ch is acceptable as the i'th rune
// (counting from zero) of an identifier. If the IsIdentRune field
// is set, its verdict is returned unchanged. Otherwise the default
// rule applies: an underscore or a letter is accepted at any
// position, and a digit is accepted at any position except the first.
func (s *Scanner) isIdentRune(ch rune, i int) bool {
	if custom := s.IsIdentRune; custom != nil {
		return custom(ch, i)
	}
	switch {
	case ch == '_', unicode.IsLetter(ch):
		return true
	case i > 0:
		// digits may continue an identifier but never start one
		return unicode.IsDigit(ch)
	}
	return false
}
func (s *Scanner) scanIdentifier() rune { func (s *Scanner) scanIdentifier() rune {
ch := s.next() // read character after first '_' or letter // we know the zero'th rune is OK; start with 2nd one
for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) { ch := s.next()
for i := 1; s.isIdentRune(ch, i); i++ {
ch = s.next() ch = s.next()
} }
return ch return ch
...@@ -563,7 +576,7 @@ redo: ...@@ -563,7 +576,7 @@ redo:
// determine token value // determine token value
tok := ch tok := ch
switch { switch {
case unicode.IsLetter(ch) || ch == '_': case s.isIdentRune(ch, 0):
if s.Mode&ScanIdents != 0 { if s.Mode&ScanIdents != 0 {
tok = Ident tok = Ident
ch = s.scanIdentifier() ch = s.scanIdentifier()
......
...@@ -357,6 +357,28 @@ func TestScanSelectedMask(t *testing.T) { ...@@ -357,6 +357,28 @@ func TestScanSelectedMask(t *testing.T) {
testScanSelectedMode(t, ScanComments, Comment) testScanSelectedMode(t, ScanComments, Comment)
} }
// TestScanCustomIdent verifies that a user-supplied IsIdentRune predicate
// replaces the default identifier rule. The custom grammar used here is:
//
//	ident = ( 'a' | 'b' ) { digit } .
//	digit = '0' .. '3' .
//
// with identifiers limited to at most 4 runes.
func TestScanCustomIdent(t *testing.T) {
	const src = "faab12345 a12b123 a12 3b"
	s := new(Scanner).Init(strings.NewReader(src))
	s.IsIdentRune = func(ch rune, i int) bool {
		if i == 0 {
			// an identifier must start with 'a' or 'b'
			return ch == 'a' || ch == 'b'
		}
		// up to three trailing digits in the range '0'..'3'
		return i > 0 && i < 4 && ch >= '0' && ch <= '3'
	}

	// Tokens expected from src, in scan order; everything is on line 1.
	expected := []struct {
		tok  rune
		text string
	}{
		{'f', "f"},
		{Ident, "a"},
		{Ident, "a"},
		{Ident, "b123"},
		{Int, "45"},
		{Ident, "a12"},
		{Ident, "b123"},
		{Ident, "a12"},
		{Int, "3"},
		{Ident, "b"},
		{EOF, ""},
	}
	for _, e := range expected {
		checkTok(t, s, 1, s.Scan(), e.tok, e.text)
	}
}
func TestScanNext(t *testing.T) { func TestScanNext(t *testing.T) {
const BOM = '\uFEFF' const BOM = '\uFEFF'
BOMs := string(BOM) BOMs := string(BOM)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment