Commit 480e7b06 authored by Michael Piatek, committed by Brad Fitzpatrick

go.net/html: Tokenizer.Raw returns the original input when tokenizer errors occur.

Two tweaks enable this:
1) Updating the raw and data span pointers when Tokenizer.Next is called, even
if an error has occurred. This prevents duplicate data from being returned by
Raw in the common case of an EOF.

2) Treating '</>' as an empty comment token to expose the raw text as a
tokenization event. (This matches the semantics of other non-token events,
e.g., '</ >' is treated as '<!-- -->'.)

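As an illustration (not part of this CL), here is a minimal sketch of the passthrough pattern these two tweaks enable, written against the current golang.org/x/net/html import path rather than the original go.net path. Concatenating Raw across every event, including the final ErrorToken, reproduces the input byte for byte:

package main

import (
	"bytes"
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	const input = "a</>b"
	z := html.NewTokenizer(strings.NewReader(input))
	var out bytes.Buffer
	for {
		tt := z.Next()
		// Raw is captured even for the final ErrorToken so no bytes are dropped.
		out.Write(z.Raw())
		if tt == html.ErrorToken {
			break
		}
		// With this change, "</>" surfaces as a CommentToken whose Raw is "</>".
		fmt.Printf("%v: %q\n", tt, z.Raw())
	}
	fmt.Println(out.String() == input) // prints: true
}
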
Fixes golang/go#7029.

R=golang-codereviews, r, bradfitz
CC=golang-codereviews
https://golang.org/cl/46370043
parent 16ae4622
@@ -734,7 +734,6 @@ func (z *Tokenizer) readCDATA() bool {
 			brackets = 0
 		}
 	}
-	panic("unreachable")
 }
 
 // startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
@@ -934,13 +933,13 @@ func (z *Tokenizer) readTagAttrVal() {
 
 // Next scans the next token and returns its type.
 func (z *Tokenizer) Next() TokenType {
+	z.raw.start = z.raw.end
+	z.data.start = z.raw.end
+	z.data.end = z.raw.end
 	if z.err != nil {
 		z.tt = ErrorToken
 		return z.tt
 	}
-	z.raw.start = z.raw.end
-	z.data.start = z.raw.end
-	z.data.end = z.raw.end
 	if z.rawTag != "" {
 		if z.rawTag == "plaintext" {
 			// Read everything up to EOF.
@@ -1010,12 +1009,11 @@ loop:
 				break loop
 			}
 			if c == '>' {
-				// "</>" does not generate a token at all.
+				// "</>" does not generate a token at all. Generate an empty comment
+				// to allow passthrough clients to pick up the data using Raw.
 				// Reset the tokenizer state and start again.
-				z.raw.start = z.raw.end
-				z.data.start = z.raw.end
-				z.data.end = z.raw.end
-				continue loop
+				z.tt = CommentToken
+				return z.tt
 			}
 			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
 				z.readTag(false)
@@ -63,12 +63,12 @@ var tokenTests = []tokenTest{
 	{
 		"not a tag #2",
 		"</>",
-		"",
+		"<!---->",
 	},
 	{
 		"not a tag #3",
 		"a</>b",
-		"a$b",
+		"a$<!---->$b",
 	},
 	{
 		"not a tag #4",
@@ -469,6 +469,25 @@ loop:
 	}
 }
 
+func TestPassthrough(t *testing.T) {
+	// Accumulating the raw output for each parse event should reconstruct the
+	// original input.
+	for _, test := range tokenTests {
+		z := NewTokenizer(strings.NewReader(test.html))
+		var parsed bytes.Buffer
+		for {
+			tt := z.Next()
+			parsed.Write(z.Raw())
+			if tt == ErrorToken {
+				break
+			}
+		}
+		if got, want := parsed.String(), test.html; got != want {
+			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
+		}
+	}
+}
+
 func TestBufAPI(t *testing.T) {
 	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
 	z := NewTokenizer(bytes.NewBufferString(s))