Commit 60c0b3b5 authored by Robert Griesemer

text/scanner: provide facility for custom identifiers

LGTM=r
R=golang-codereviews, r
CC=golang-codereviews
https://golang.org/cl/108030044
parent 54bc760a
......@@ -11,7 +11,7 @@
// By default, a Scanner skips white space and Go comments and recognizes all
// literals as defined by the Go language specification. It may be
// customized to recognize only a subset of those literals and to recognize
// different white space characters.
// different identifier and white space characters.
//
// Basic usage pattern:
//
......@@ -34,8 +34,6 @@ import (
"unicode/utf8"
)
// TODO(gri): Consider changing this to use the new (token) Position package.
// A source position is represented by a Position value.
// A position is valid if Line > 0.
type Position struct {
......@@ -164,6 +162,13 @@ type Scanner struct {
// for values ch > ' '). The field may be changed at any time.
Whitespace uint64
// IsIdentRune is a predicate controlling the characters accepted
// as the ith rune in an identifier. The set of valid characters
// must not intersect with the set of white space characters.
// If no IsIdentRune function is set, regular Go identifiers are
// accepted instead. The field may be changed at any time.
IsIdentRune func(ch rune, i int) bool
// Start position of most recently scanned token; set by Scan.
// Calling Init or Next invalidates the position (Line == 0).
// The Filename field is always left untouched by the Scanner.
......@@ -334,9 +339,17 @@ func (s *Scanner) error(msg string) {
fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
}
// isIdentRune reports whether ch is acceptable as the i'th rune of an
// identifier. If the IsIdentRune field is set, that predicate alone decides.
// Otherwise the default rule applies: an underscore or a Unicode letter is
// accepted at any position, and a Unicode digit only at positions i > 0.
func (s *Scanner) isIdentRune(ch rune, i int) bool {
	if custom := s.IsIdentRune; custom != nil {
		return custom(ch, i)
	}
	switch {
	case ch == '_', unicode.IsLetter(ch):
		return true
	case i > 0:
		// digits may continue an identifier but never start one
		return unicode.IsDigit(ch)
	}
	return false
}
func (s *Scanner) scanIdentifier() rune {
ch := s.next() // read character after first '_' or letter
for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
// we know the zero'th rune is OK; start with 2nd one
ch := s.next()
for i := 1; s.isIdentRune(ch, i); i++ {
ch = s.next()
}
return ch
......@@ -563,7 +576,7 @@ redo:
// determine token value
tok := ch
switch {
case unicode.IsLetter(ch) || ch == '_':
case s.isIdentRune(ch, 0):
if s.Mode&ScanIdents != 0 {
tok = Ident
ch = s.scanIdentifier()
......
......@@ -357,6 +357,28 @@ func TestScanSelectedMask(t *testing.T) {
testScanSelectedMode(t, ScanComments, Comment)
}
// TestScanCustomIdent installs a custom IsIdentRune predicate on a Scanner
// and checks the resulting token stream against a fixed list of expected
// tokens and token texts.
func TestScanCustomIdent(t *testing.T) {
	const src = "faab12345 a12b123 a12 3b"

	// Custom identifier grammar:
	//	ident = ( 'a' | 'b' ) { digit } .
	//	digit = '0' .. '3' .
	// with a maximum length of 4
	customIdent := func(ch rune, i int) bool {
		switch {
		case i == 0:
			return ch == 'a' || ch == 'b'
		case 0 < i && i < 4:
			return '0' <= ch && ch <= '3'
		}
		return false
	}

	s := new(Scanner).Init(strings.NewReader(src))
	s.IsIdentRune = customIdent

	// Expected tokens, in scan order; everything is on line 1.
	expected := []struct {
		tok  rune
		text string
	}{
		{'f', "f"},
		{Ident, "a"},
		{Ident, "a"},
		{Ident, "b123"},
		{Int, "45"},
		{Ident, "a12"},
		{Ident, "b123"},
		{Ident, "a12"},
		{Int, "3"},
		{Ident, "b"},
		{EOF, ""},
	}
	for _, want := range expected {
		checkTok(t, s, 1, s.Scan(), want.tok, want.text)
	}
}
func TestScanNext(t *testing.T) {
const BOM = '\uFEFF'
BOMs := string(BOM)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment