cmd/yacc: allow utf-8 token values

Also clean up the code and allow \U. Fixes #3007. R=golang-dev, rsc, 0xjnml CC=golang-dev https://golang.org/cl/6492105

cmd/yacc: allow utf-8 token values
Also clean up the code and allow \U. Fixes #3007. R=golang-dev, rsc, 0xjnml CC=golang-dev https://golang.org/cl/6492105
f269f9a3 · Rob Pike · f3fc0090 · f269f9a3 · f269f9a3
Commit f269f9a3 authored Sep 13, 2012 by Rob Pike
Hide whitespace changes
Inline Side-by-side

Showing with 63 additions and 53 deletions

units.y src/cmd/yacc/units.y +3 -3

yacc.go src/cmd/yacc/yacc.go +60 -50

No files found.
--- a/src/cmd/yacc/units.y
+++ b/src/cmd/yacc/units.y
@@ -76,7 +76,7 @@ var vflag bool

 %type	<node>	prog expr expr0 expr1 expr2 expr3 expr4

-%token	<vval>	VAL
+%token	<vval>	VÄL // dieresis to test UTF-8
 %token	<vvar>	VAR
 %token	<numb>	_SUP // tests leading underscore in token name
 %%
@@ -199,7 +199,7 @@ expr0:
 			$$ = $1.node
 		}
 	}
-|	VAL
+|	VÄL
 	{
 		$$ = one
 		$$.vval = $1
@@ -275,7 +275,7 @@ numb:
 		f = 0
 	}
 	yylval.vval = f
-	return VAL
+	return VÄL
 }

 func (UnitsLex) Error(s string) {

--- a/src/cmd/yacc/yacc.go
+++ b/src/cmd/yacc/yacc.go
@@ -52,6 +52,7 @@ import (
 	"os"
 	"strings"
 	"unicode"
+	"unicode/utf8"
 )

 // the following are adjustable
@@ -326,7 +327,6 @@ var resrv = []Resrv{
 var zznewstate = 0

 const EOF = -1
-const UTFmax = 0x3f

 func main() {

@@ -719,8 +719,8 @@ func moreprod() {
 }

 //
-// define s to be a terminal if t=0
-// or a nonterminal if t=1
+// define s to be a terminal if nt==0
+// or a nonterminal if nt==1
 //
 func defin(nt int, s string) int {
 	val := 0
@@ -753,56 +753,66 @@ func defin(nt int, s string) int {

 	// establish value for token
 	// single character literal
-	if s[0] == ' ' && len(s) == 1+1 {
-		val = int(s[1])
-	} else if s[0] == ' ' && s[1] == '\\' { // escape sequence
-		if len(s) == 2+1 {
-			// single character escape sequence
-			switch s[2] {
-			case '\'':
-				val = '\''
-			case '"':
-				val = '"'
-			case '\\':
-				val = '\\'
-			case 'a':
-				val = '\a'
-			case 'b':
-				val = '\b'
-			case 'n':
-				val = '\n'
-			case 'r':
-				val = '\r'
-			case 't':
-				val = '\t'
-			case 'v':
-				val = '\v'
-			default:
-				errorf("invalid escape %v", s[1:3])
-			}
-		} else if s[2] == 'u' && len(s) == 2+1+4 { // \unnnn sequence
-			val = 0
-			s = s[3:]
-			for s != "" {
-				c := int(s[0])
-				switch {
-				case c >= '0' && c <= '9':
-					c -= '0'
-				case c >= 'a' && c <= 'f':
-					c -= 'a' - 10
-				case c >= 'A' && c <= 'F':
-					c -= 'A' - 10
+	if s[0] == ' ' {
+		s = s[1:]
+		r, size := utf8.DecodeRuneInString(s)
+		if r == utf8.RuneError && size == 1 {
+			errorf("invalid UTF-8 sequence %q", s)
+		}
+		val = int(r)
+		if val == '\\' { // escape sequence
+			switch {
+			case len(s) == 2:
+				// single character escape sequence
+				switch s[1] {
+				case '\'':
+					val = '\''
+				case '"':
+					val = '"'
+				case '\\':
+					val = '\\'
+				case 'a':
+					val = '\a'
+				case 'b':
+					val = '\b'
+				case 'f':
+					val = '\f'
+				case 'n':
+					val = '\n'
+				case 'r':
+					val = '\r'
+				case 't':
+					val = '\t'
+				case 'v':
+					val = '\v'
 				default:
-					errorf("illegal \\unnnn construction")
+					errorf("invalid escape %s", s)
 				}
-				val = val*16 + c
-				s = s[1:]
-			}
-			if val == 0 {
-				errorf("'\\u0000' is illegal")
+			case s[1] == 'u' && len(s) == 2+4, // \unnnn sequence
+				s[1] == 'U' && len(s) == 2+8: // \Unnnnnnnn sequence
+				val = 0
+				s = s[2:]
+				for s != "" {
+					c := int(s[0])
+					switch {
+					case c >= '0' && c <= '9':
+						c -= '0'
+					case c >= 'a' && c <= 'f':
+						c -= 'a' - 10
+					case c >= 'A' && c <= 'F':
+						c -= 'A' - 10
+					default:
+						errorf(`illegal \u or \U construction`)
+					}
+					val = val*16 + c
+					s = s[1:]
+				}
+			default:
+				errorf("invalid escape %s", s)
 			}
-		} else {
-			errorf("unknown escape")
+		}
+		if val == 0 {
+			errorf("token value 0 is illegal")
 		}
 	} else {
 		val = extval