http: add lexing functions

In particular, add field-value tokenizer which respects quoting rules. The code is intended for use in tokenizing the Transfer-Encoding and Trailer fields. The lexing function is not connected to the main parsing code yet (in the next CL). R=rsc CC=golang-dev https://golang.org/cl/190085

http: add lexing functions
In particular, add field-value tokenizer which respects quoting rules. The code is intended for use in tokenizing the Transfer-Encoding and Trailer fields. The lexing function is not connected to the main parsing code yet (in the next CL). R=rsc CC=golang-dev https://golang.org/cl/190085
dd77c63d · Petar Maymounkov · Russ Cox · a0e6f03a · dd77c63d · dd77c63d
Commit dd77c63d authored Jan 28, 2010 by Petar Maymounkov Committed by Russ Cox Jan 28, 2010
Hide whitespace changes
Inline Side-by-side

Showing with 223 additions and 0 deletions

Makefile src/pkg/http/Makefile +1 -0

lex.go src/pkg/http/lex.go +152 -0

lex_test.go src/pkg/http/lex_test.go +70 -0

No files found.
--- a/src/pkg/http/Makefile
+++ b/src/pkg/http/Makefile
@@ -9,6 +9,7 @@ GOFILES=\
 	chunked.go\
 	client.go\
 	fs.go\
+	lex.go\
 	request.go\
 	response.go\
 	server.go\

--- a/src/pkg/http/lex.go
+++ b/src/pkg/http/lex.go
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+package http
+import (
+	"strings"
+)
+// This file deals with lexical matters of HTTP
+func isSeparator(c byte) bool {
+	switch c {
+	case '(', ')', '<', '>', '@', ',', ';', ':', '\\', '"', '/', '[', ']', '?', '=', '{', '}', ' ', '\t':
+		return true
+	default:
+		return false
+	}
+	panic()
+}
+func isSpace(c byte) bool {
+	switch c {
+	case ' ', '\t', '\r', '\n':
+		return true
+	default:
+		return false
+	}
+	panic()
+}
+func isCtl(c byte) bool { return (0 <= c && c <= 31) || c == 127 }
+func isChar(c byte) bool { return 0 <= c && c <= 127 }
+func isAnyText(c byte) bool { return !isCtl(c) }
+func isQdText(c byte) bool { return isAnyText(c) && c != '"' }
+func isToken(c byte) bool { return isChar(c) && !isCtl(c) && !isSeparator(c) }
+// Valid escaped sequences are not specified in RFC 2616, so for now, we assume
+// that they coincide with the common sense ones used by GO. Malformed
+// characters should probably not be treated as errors by a robust (forgiving)
+// parser, so we replace them with the '?' character.
+func httpUnquotePair(b byte) byte {
+	// skip the first byte, which should always be '\'
+	switch b {
+	case 'a':
+		return '\a'
+	case 'b':
+		return '\b'
+	case 'f':
+		return '\f'
+	case 'n':
+		return '\n'
+	case 'r':
+		return '\r'
+	case 't':
+		return '\t'
+	case 'v':
+		return '\v'
+	case '\\':
+		return '\\'
+	case '\'':
+		return '\''
+	case '"':
+		return '"'
+	}
+	return '?'
+}
+// raw must begin with a valid quoted string. Only the first quoted string is
+// parsed and is unquoted in result. eaten is the number of bytes parsed, or -1
+// upon failure.
+func httpUnquote(raw []byte) (eaten int, result string) {
+	buf := make([]byte, len(raw))
+	if raw[0] != '"' {
+		return -1, ""
+	}
+	eaten = 1
+	j := 0 // # of bytes written in buf
+	for i := 1; i < len(raw); i++ {
+		switch b := raw[i]; b {
+		case '"':
+			eaten++
+			buf = buf[0:j]
+			return i + 1, string(buf)
+		case '\\':
+			if len(raw) < i+2 {
+				return -1, ""
+			}
+			buf[j] = httpUnquotePair(raw[i+1])
+			eaten += 2
+			j++
+			i++
+		default:
+			if isQdText(b) {
+				buf[j] = b
+			} else {
+				buf[j] = '?'
+			}
+			eaten++
+			j++
+		}
+	}
+	return -1, ""
+}
+// This is a best effort parse, so errors are not returned, instead not all of
+// the input string might be parsed. result is always non-nil.
+func httpSplitFieldValue(fv string) (eaten int, result []string) {
+	result = make([]string, 0, len(fv))
+	raw := strings.Bytes(fv)
+	i := 0
+	chunk := ""
+	for i < len(raw) {
+		b := raw[i]
+		switch {
+		case b == '"':
+			eaten, unq := httpUnquote(raw[i:len(raw)])
+			if eaten < 0 {
+				return i, result
+			} else {
+				i += eaten
+				chunk += unq
+			}
+		case isSeparator(b):
+			if chunk != "" {
+				result = result[0 : len(result)+1]
+				result[len(result)-1] = chunk
+				chunk = ""
+			}
+			i++
+		case isToken(b):
+			chunk += string(b)
+			i++
+		case b == '\n' || b == '\r':
+			i++
+		default:
+			chunk += "?"
+			i++
+		}
+	}
+	if chunk != "" {
+		result = result[0 : len(result)+1]
+		result[len(result)-1] = chunk
+		chunk = ""
+	}
+	return i, result
+}
--- a/src/pkg/http/lex_test.go
+++ b/src/pkg/http/lex_test.go
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+package http
+import (
+	"testing"
+)
+type lexTest struct {
+	Raw    string
+	Parsed int // # of parsed characters
+	Result []string
+}
+var lexTests = []lexTest{
+	lexTest{
+		Raw: `"abc"def,:ghi`,
+		Parsed: 13,
+		Result: []string{"abcdef", "ghi"},
+	},
+	// My understanding of the RFC is that escape sequences outside of
+	// quotes are not interpreted?
+	lexTest{
+		Raw: `"\t"\t"\t"`,
+		Parsed: 10,
+		Result: []string{"\t", "t\t"},
+	},
+	lexTest{
+		Raw: `"\yab"\r\n`,
+		Parsed: 10,
+		Result: []string{"?ab", "r", "n"},
+	},
+	lexTest{
+		Raw: "ab\f",
+		Parsed: 3,
+		Result: []string{"ab?"},
+	},
+	lexTest{
+		Raw: "\"ab \" c,de f, gh, ij\n\t\r",
+		Parsed: 23,
+		Result: []string{"ab ", "c", "de", "f", "gh", "ij"},
+	},
+}
+func min(x, y int) int {
+	if x <= y {
+		return x
+	}
+	return y
+}
+func TestSplitFieldValue(t *testing.T) {
+	for k, l := range lexTests {
+		parsed, result := httpSplitFieldValue(l.Raw)
+		if parsed != l.Parsed {
+			t.Errorf("#%d: Parsed %d, expected %d", k, parsed, l.Parsed)
+		}
+		if len(result) != len(l.Result) {
+			t.Errorf("#%d: Result len  %d, expected %d", k, len(result), len(l.Result))
+		}
+		for i := 0; i < min(len(result), len(l.Result)); i++ {
+			if result[i] != l.Result[i] {
+				t.Errorf("#%d: %d-th entry mismatch. Have {%s}, expect {%s}",
+					k, i, result[i], l.Result[i])
+			}
+		}
+	}
+}