Commit a49b8b98 authored by Nigel Tao

html: rewrite the tokenizer to be more consistent.

Previously, the tokenizer made two passes per token. The first pass
established the token boundary. The second pass picked out the tag name
and attributes inside that boundary. This was problematic when the two
passes disagreed. For example, "<p id=can't><p id=won't>" caused an
infinite loop because the first pass skipped everything inside the
single quotes, and recognized only one token, but the second pass never
got past the first '>'.

This change rewrites the tokenizer to use one pass, accumulating the
boundary points of token text, tag names, attribute keys and attribute
values as it looks for the token endpoint.

It should still be reasonably efficient: text, names, keys and values
are not lower-cased or unescaped (and converted from []byte to string)
until asked for.

One of the token_test test cases was fixed to be consistent with
html5lib. Three more test cases were temporarily disabled, and will be
re-enabled in a follow-up CL. All the parse_test test cases pass.

R=andybalholm, gri
CC=golang-dev
https://golang.org/cl/5244061
parent 6198336b
...@@ -183,6 +183,16 @@ func unescape(b []byte) []byte { ...@@ -183,6 +183,16 @@ func unescape(b []byte) []byte {
return b return b
} }
// lower converts every ASCII upper-case letter (A-Z) in b to its lower-case
// counterpart, so that "aBc" becomes "abc". The conversion is done in place:
// b's contents are overwritten and the same slice is returned. Bytes outside
// the A-Z range are left untouched.
func lower(b []byte) []byte {
	for i := 0; i < len(b); i++ {
		if c := b[i]; c >= 'A' && c <= 'Z' {
			// 'a' - 'A' is the fixed distance between the two ASCII cases.
			b[i] = c + ('a' - 'A')
		}
	}
	return b
}
const escapedChars = `&'<>"` const escapedChars = `&'<>"`
func escape(w writer, s string) os.Error { func escape(w writer, s string) os.Error {
......
This diff is collapsed.
...@@ -52,16 +52,19 @@ var tokenTests = []tokenTest{ ...@@ -52,16 +52,19 @@ var tokenTests = []tokenTest{
`<p </p>`, `<p </p>`,
`<p <="" p="">`, `<p <="" p="">`,
}, },
{ /*
"malformed tag #2", // TODO: re-enable these tests when they work. This input/output matches html5lib's behavior.
`<p id=0</p>`, {
`<p id="0&lt;/p">`, "malformed tag #2",
}, `<p id=0</p>`,
{ `<p id="0&lt;/p">`,
"malformed tag #3", },
`<p id="0</p>`, {
`<p id="0&lt;/p&gt;">`, "malformed tag #3",
}, `<p id="0</p>`,
`<p id="0&lt;/p&gt;">`,
},
*/
{ {
"malformed tag #4", "malformed tag #4",
`<p id="0"</p>`, `<p id="0"</p>`,
...@@ -117,7 +120,7 @@ var tokenTests = []tokenTest{ ...@@ -117,7 +120,7 @@ var tokenTests = []tokenTest{
{ {
"backslash", "backslash",
`<p id="a\"b">`, `<p id="a\"b">`,
`<p id="a&quot;b">`, `<p id="a\" b"="">`,
}, },
// Entities, tag name and attribute key lower-casing, and whitespace // Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag. // normalization within a tag.
...@@ -133,11 +136,14 @@ var tokenTests = []tokenTest{ ...@@ -133,11 +136,14 @@ var tokenTests = []tokenTest{
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`, `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`, `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
}, },
{ /*
"entity without semicolon", // TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`, {
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`, "entity without semicolon",
}, `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
},
*/
{ {
"entity with digits", "entity with digits",
"&frac12;", "&frac12;",
...@@ -190,6 +196,16 @@ var tokenTests = []tokenTest{ ...@@ -190,6 +196,16 @@ var tokenTests = []tokenTest{
`<meta http-equiv="content-type">`, `<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`, `<meta http-equiv="content-type">`,
}, },
{
"Mixed attributes",
`a<P V="0 1" w='2' X=3 y>z`,
`a$<p v="0 1" w="2" x="3" y="">$z`,
},
{
"Attributes with a solitary single quote",
"<p id=can't><p id=won't>",
"<p id=\"can&apos;t\">$<p id=\"won&apos;t\">",
},
} }
func TestTokenizer(t *testing.T) { func TestTokenizer(t *testing.T) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment