Commit 9f3b0057 authored by Andrew Balholm, committed by Nigel Tao

exp/html: tokenize attributes of end tags

If an end tag had an attribute whose quoted value contained '>', the
tokenizer would end the tag prematurely. Now it reads the attributes
on end tags just as it does on start tags, but the high-level interface
still doesn't return them, because their presence is a parse error.

Pass 1 additional test.

R=nigeltao
CC=golang-dev
https://golang.org/cl/6457060
parent 695024b8
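As a usage illustration of the behavior this change enables, the sketch below tokenizes the test input from the diff. It is written against golang.org/x/net/html, the later import path of this package, on the assumption that the Tokenizer API and the end-tag handling shown here carried over unchanged from exp/html.

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html" // later home of exp/html; assumed to keep the same Tokenizer API
)

func main() {
	// The case fixed by this change: an end tag whose quoted attribute value
	// contains '>'. Before the fix, the tokenizer ended the tag at that '>'.
	const input = `FOO<script></script foo=">" dd>BAR`

	z := html.NewTokenizer(strings.NewReader(input))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break // io.EOF once the input is exhausted
		}
		t := z.Token()
		// Expected: Text "FOO", StartTag "script", EndTag "script" (its
		// attributes are consumed but not returned by Token), Text "BAR".
		fmt.Printf("%-8v %q attrs=%v\n", tt, t.Data, t.Attr)
	}
}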
@@ -4,7 +4,7 @@ PASS "FOO<script></script >BAR"
 PASS "FOO<script></script/>BAR"
 PASS "FOO<script></script/ >BAR"
 PASS "FOO<script type=\"text/plain\"></scriptx>BAR"
-FAIL "FOO<script></script foo=\">\" dd>BAR"
+PASS "FOO<script></script foo=\">\" dd>BAR"
 PASS "FOO<script>'<'</script>BAR"
 PASS "FOO<script>'<!'</script>BAR"
 PASS "FOO<script>'<!-'</script>BAR"
@@ -468,29 +468,10 @@ loop:
 // readStartTag reads the next start tag token. The opening "<a" has already
 // been consumed, where 'a' means anything in [A-Za-z].
 func (z *Tokenizer) readStartTag() TokenType {
-	z.attr = z.attr[:0]
-	z.nAttrReturned = 0
-	// Read the tag name and attribute key/value pairs.
-	z.readTagName()
-	if z.skipWhiteSpace(); z.err != nil {
+	z.readTag()
+	if z.err != nil && len(z.attr) == 0 {
 		return ErrorToken
 	}
-	for {
-		c := z.readByte()
-		if z.err != nil || c == '>' {
-			break
-		}
-		z.raw.end--
-		z.readTagAttrKey()
-		z.readTagAttrVal()
-		// Save pendingAttr if it has a non-empty key.
-		if z.pendingAttr[0].start != z.pendingAttr[0].end {
-			z.attr = append(z.attr, z.pendingAttr)
-		}
-		if z.skipWhiteSpace(); z.err != nil {
-			break
-		}
-	}
 	// Several tags flag the tokenizer's next token as raw.
 	c, raw := z.buf[z.data.start], false
 	if 'A' <= c && c <= 'Z' {
@@ -520,16 +501,30 @@ func (z *Tokenizer) readStartTag() TokenType {
 	return StartTagToken
 }
 
-// readEndTag reads the next end tag token. The opening "</a" has already
-// been consumed, where 'a' means anything in [A-Za-z].
-func (z *Tokenizer) readEndTag() {
+// readTag reads the next tag token. The opening "<a" or "</a" has already been
+// consumed, where 'a' means anything in [A-Za-z].
+func (z *Tokenizer) readTag() {
 	z.attr = z.attr[:0]
 	z.nAttrReturned = 0
+	// Read the tag name and attribute key/value pairs.
 	z.readTagName()
+	if z.skipWhiteSpace(); z.err != nil {
+		return
+	}
 	for {
 		c := z.readByte()
 		if z.err != nil || c == '>' {
-			return
+			break
+		}
+		z.raw.end--
+		z.readTagAttrKey()
+		z.readTagAttrVal()
+		// Save pendingAttr if it has a non-empty key.
+		if z.pendingAttr[0].start != z.pendingAttr[0].end {
+			z.attr = append(z.attr, z.pendingAttr)
+		}
+		if z.skipWhiteSpace(); z.err != nil {
+			break
 		}
 	}
 }
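For readers unfamiliar with the span bookkeeping that readTag relies on, here is a minimal standalone sketch of the same idea: attributes are recorded as index pairs into the underlying buffer rather than copied strings, and an attribute is kept only when its key span is non-empty (compare the "Save pendingAttr if it has a non-empty key" step above). The span type, the readAttrs function, and the simplified scanning rules are hypothetical illustrations, not the package's internals.

package main

import "fmt"

// span marks a half-open [start, end) byte range in the scanned buffer,
// mirroring the idea behind the tokenizer's pendingAttr/attr fields.
type span struct{ start, end int }

// readAttrs is a deliberately simplified scanner: it records each
// space-separated key=value pair as a pair of spans into buf. Quoting and
// error handling are omitted, unlike the real tokenizer.
func readAttrs(buf string) [][2]span {
	var attrs [][2]span
	i := 0
	for i < len(buf) {
		for i < len(buf) && buf[i] == ' ' { // skip whitespace between attributes
			i++
		}
		keyStart := i
		for i < len(buf) && buf[i] != '=' && buf[i] != ' ' { // key runs to '=' or space
			i++
		}
		key := span{keyStart, i}
		val := span{i, i} // empty value unless '=' follows
		if i < len(buf) && buf[i] == '=' {
			i++
			valStart := i
			for i < len(buf) && buf[i] != ' ' {
				i++
			}
			val = span{valStart, i}
		}
		if key.start != key.end { // keep only attributes with a non-empty key
			attrs = append(attrs, [2]span{key, val})
		}
	}
	return attrs
}

func main() {
	buf := `foo=bar baz  qux=1`
	for _, a := range readAttrs(buf) {
		fmt.Printf("key=%q val=%q\n", buf[a[0].start:a[0].end], buf[a[1].start:a[1].end])
	}
}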
@@ -727,7 +722,7 @@ loop:
 				continue loop
 			}
 			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
-				z.readEndTag()
+				z.readTag()
 				z.tt = EndTagToken
 				return z.tt
 			}
@@ -858,22 +853,18 @@ func (z *Tokenizer) Token() Token {
 	switch z.tt {
 	case TextToken, CommentToken, DoctypeToken:
 		t.Data = string(z.Text())
-	case StartTagToken, SelfClosingTagToken:
-		var attr []Attribute
+	case StartTagToken, SelfClosingTagToken, EndTagToken:
 		name, moreAttr := z.TagName()
-		for moreAttr {
-			var key, val []byte
-			key, val, moreAttr = z.TagAttr()
-			attr = append(attr, Attribute{"", atom.String(key), string(val)})
-		}
-		if a := atom.Lookup(name); a != 0 {
-			t.DataAtom, t.Data = a, a.String()
-		} else {
-			t.DataAtom, t.Data = 0, string(name)
+		// Since end tags should not have attributes, the high-level tokenizer
+		// interface will not return attributes for an end tag token even if
+		// it looks like </br foo="bar">.
+		if z.tt != EndTagToken {
+			for moreAttr {
+				var key, val []byte
+				key, val, moreAttr = z.TagAttr()
+				t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
+			}
 		}
-		t.Attr = attr
-	case EndTagToken:
-		name, _ := z.TagName()
 		if a := atom.Lookup(name); a != 0 {
 			t.DataAtom, t.Data = a, a.String()
 		} else {