Commit 1f577d26 authored by Mike Samuel

exp/template/html: simplify transition functions

This simplifies transition functions to make it easier to reliably
elide comments in a later CL.

Before:
- transition functions are responsible for detecting special end tags.
After:
- special end tags are detected in one place.

We were relying on end tags being skipped, which meant we were
not noticing comments inside script/style elements that contain no
substitutions.
This change means we will notice all such comments where necessary,
but stripTags will notice none since it does not need to. This speeds
up stripTags.

R=nigeltao
CC=golang-dev
https://golang.org/cl/5074041
parent 9169c27e
...@@ -175,6 +175,15 @@ func isComment(s state) bool { ...@@ -175,6 +175,15 @@ func isComment(s state) bool {
return false return false
} }
// isInTag reports whether s is one of the states that occur solely inside an
// HTML tag: stateTag, stateAttrName, stateAfterName, stateBeforeValue, or
// stateAttr.
func isInTag(s state) bool {
	return s == stateTag ||
		s == stateAttrName ||
		s == stateAfterName ||
		s == stateBeforeValue ||
		s == stateAttr
}
// delim is the delimiter that will end the current HTML attribute. // delim is the delimiter that will end the current HTML attribute.
type delim uint8 type delim uint8
......
...@@ -583,7 +583,14 @@ func (e *escaper) escapeText(c context, n *parse.TextNode) context { ...@@ -583,7 +583,14 @@ func (e *escaper) escapeText(c context, n *parse.TextNode) context {
// s, then returns the context after those tokens and the unprocessed suffix. // s, then returns the context after those tokens and the unprocessed suffix.
func contextAfterText(c context, s []byte) (context, int) { func contextAfterText(c context, s []byte) (context, int) {
if c.delim == delimNone { if c.delim == delimNone {
return transitionFunc[c.state](c, s) c1, i := tSpecialTagEnd(c, s)
if i == 0 {
// A special end tag (`</script>`) has been seen and
// all content preceding it has been consumed.
return c1, 0
}
// Consider all content up to any end tag.
return transitionFunc[c.state](c, s[:i])
} }
i := bytes.IndexAny(s, delimEnds[c.delim]) i := bytes.IndexAny(s, delimEnds[c.delim])
......
...@@ -814,7 +814,7 @@ func TestErrors(t *testing.T) { ...@@ -814,7 +814,7 @@ func TestErrors(t *testing.T) {
}, },
{ {
`<a onclick='alert(/x+\`, `<a onclick='alert(/x+\`,
`unfinished escape sequence in JS regexp: "x+\\"`, `unfinished escape sequence in JS string: "x+\\"`,
}, },
{ {
`<a onclick="/foo[\]/`, `<a onclick="/foo[\]/`,
......
...@@ -165,12 +165,17 @@ func htmlReplacer(s string, replacementTable []string, badRunes bool) string { ...@@ -165,12 +165,17 @@ func htmlReplacer(s string, replacementTable []string, badRunes bool) string {
// For example, `<b>&iexcl;Hi!</b> <script>...</script>` -> `&iexcl;Hi! `. // For example, `<b>&iexcl;Hi!</b> <script>...</script>` -> `&iexcl;Hi! `.
func stripTags(html string) string { func stripTags(html string) string {
var b bytes.Buffer var b bytes.Buffer
s, c, i := []byte(html), context{}, 0 s, c, i, allText := []byte(html), context{}, 0, true
// Using the transition funcs helps us avoid mangling // Using the transition funcs helps us avoid mangling
// `<div title="1>2">` or `I <3 Ponies!`. // `<div title="1>2">` or `I <3 Ponies!`.
for i != len(s) { for i != len(s) {
if c.delim == delimNone { if c.delim == delimNone {
d, nread := transitionFunc[c.state](c, s[i:]) st := c.state
// Use RCDATA instead of parsing into JS or CSS styles.
if c.element != elementNone && !isInTag(st) {
st = stateRCDATA
}
d, nread := transitionFunc[st](c, s[i:])
i1 := i + nread i1 := i + nread
if c.state == stateText || c.state == stateRCDATA { if c.state == stateText || c.state == stateRCDATA {
// Emit text up to the start of the tag or comment. // Emit text up to the start of the tag or comment.
...@@ -184,6 +189,8 @@ func stripTags(html string) string { ...@@ -184,6 +189,8 @@ func stripTags(html string) string {
} }
} }
b.Write(s[i:j]) b.Write(s[i:j])
} else {
allText = false
} }
c, i = d, i1 c, i = d, i1
continue continue
...@@ -198,10 +205,9 @@ func stripTags(html string) string { ...@@ -198,10 +205,9 @@ func stripTags(html string) string {
} }
c, i = context{state: stateTag, element: c.element}, i1 c, i = context{state: stateTag, element: c.element}, i1
} }
if c.state == stateText { if allText {
if b.Len() == 0 { return html
return html } else if c.state == stateText || c.state == stateRCDATA {
}
b.Write(s[i:]) b.Write(s[i:])
} }
return b.String() return b.String()
......
...@@ -59,6 +59,7 @@ func TestStripTags(t *testing.T) { ...@@ -59,6 +59,7 @@ func TestStripTags(t *testing.T) {
{`Foo<script type="text/javascript">alert(1337)</script>Bar`, "FooBar"}, {`Foo<script type="text/javascript">alert(1337)</script>Bar`, "FooBar"},
{`Foo<div title="1>2">Bar`, "FooBar"}, {`Foo<div title="1>2">Bar`, "FooBar"},
{`I <3 Ponies!`, `I <3 Ponies!`}, {`I <3 Ponies!`, `I <3 Ponies!`},
{`<script>foo()</script>`, ``},
} }
for _, test := range tests { for _, test := range tests {
......
...@@ -27,9 +27,9 @@ var transitionFunc = [...]func(context, []byte) (context, int){ ...@@ -27,9 +27,9 @@ var transitionFunc = [...]func(context, []byte) (context, int){
stateAttr: tAttr, stateAttr: tAttr,
stateURL: tURL, stateURL: tURL,
stateJS: tJS, stateJS: tJS,
stateJSDqStr: tJSStr, stateJSDqStr: tJSDelimited,
stateJSSqStr: tJSStr, stateJSSqStr: tJSDelimited,
stateJSRegexp: tJSRegexp, stateJSRegexp: tJSDelimited,
stateJSBlockCmt: tBlockCmt, stateJSBlockCmt: tBlockCmt,
stateJSLineCmt: tLineCmt, stateJSLineCmt: tLineCmt,
stateCSS: tCSS, stateCSS: tCSS,
...@@ -57,14 +57,18 @@ func tText(c context, s []byte) (context, int) { ...@@ -57,14 +57,18 @@ func tText(c context, s []byte) (context, int) {
return context{state: stateHTMLCmt}, i + 4 return context{state: stateHTMLCmt}, i + 4
} }
i++ i++
end := false
if s[i] == '/' { if s[i] == '/' {
if i+1 == len(s) { if i+1 == len(s) {
return c, len(s) return c, len(s)
} }
i++ end, i = true, i+1
} }
j, e := eatTagName(s, i) j, e := eatTagName(s, i)
if j != i { if j != i {
if end {
e = elementNone
}
// We've found an HTML tag. // We've found an HTML tag.
return context{state: stateTag, element: e}, j return context{state: stateTag, element: e}, j
} }
...@@ -122,10 +126,9 @@ func tAttrName(c context, s []byte) (context, int) { ...@@ -122,10 +126,9 @@ func tAttrName(c context, s []byte) (context, int) {
i, err := eatAttrName(s, 0) i, err := eatAttrName(s, 0)
if err != nil { if err != nil {
return context{state: stateError, err: err}, len(s) return context{state: stateError, err: err}, len(s)
} else if i == len(s) { } else if i != len(s) {
return c, len(s) c.state = stateAfterName
} }
c.state = stateAfterName
return c, i return c, i
} }
...@@ -172,8 +175,7 @@ func tBeforeValue(c context, s []byte) (context, int) { ...@@ -172,8 +175,7 @@ func tBeforeValue(c context, s []byte) (context, int) {
// tHTMLCmt is the context transition function for stateHTMLCmt. // tHTMLCmt is the context transition function for stateHTMLCmt.
func tHTMLCmt(c context, s []byte) (context, int) { func tHTMLCmt(c context, s []byte) (context, int) {
i := bytes.Index(s, commentEnd) if i := bytes.Index(s, commentEnd); i != -1 {
if i != -1 {
return context{}, i + 3 return context{}, i + 3
} }
return c, len(s) return c, len(s)
...@@ -192,10 +194,8 @@ var specialTagEndMarkers = [...]string{ ...@@ -192,10 +194,8 @@ var specialTagEndMarkers = [...]string{
// element states. // element states.
func tSpecialTagEnd(c context, s []byte) (context, int) { func tSpecialTagEnd(c context, s []byte) (context, int) {
if c.element != elementNone { if c.element != elementNone {
end := specialTagEndMarkers[c.element] if i := strings.Index(strings.ToLower(string(s)), specialTagEndMarkers[c.element]); i != -1 {
i := strings.Index(strings.ToLower(string(s)), end) return context{}, i
if i != -1 {
return context{state: stateTag}, i + len(end)
} }
} }
return c, len(s) return c, len(s)
...@@ -220,10 +220,6 @@ func tURL(c context, s []byte) (context, int) { ...@@ -220,10 +220,6 @@ func tURL(c context, s []byte) (context, int) {
// tJS is the context transition function for the JS state. // tJS is the context transition function for the JS state.
func tJS(c context, s []byte) (context, int) { func tJS(c context, s []byte) (context, int) {
if d, i := tSpecialTagEnd(c, s); i != len(s) {
return d, i
}
i := bytes.IndexAny(s, `"'/`) i := bytes.IndexAny(s, `"'/`)
if i == -1 { if i == -1 {
// Entire input is non string, comment, regexp tokens. // Entire input is non string, comment, regexp tokens.
...@@ -258,64 +254,30 @@ func tJS(c context, s []byte) (context, int) { ...@@ -258,64 +254,30 @@ func tJS(c context, s []byte) (context, int) {
return c, i + 1 return c, i + 1
} }
// tJSStr is the context transition function for the JS string states. // tJSDelimited is the context transition function for the JS string and regexp
func tJSStr(c context, s []byte) (context, int) { // states.
if d, i := tSpecialTagEnd(c, s); i != len(s) { func tJSDelimited(c context, s []byte) (context, int) {
return d, i specials := `\"`
} switch c.state {
case stateJSSqStr:
quoteAndEsc := `\"` specials = `\'`
if c.state == stateJSSqStr { case stateJSRegexp:
quoteAndEsc = `\'` specials = `\/[]`
}
k := 0
for {
i := k + bytes.IndexAny(s[k:], quoteAndEsc)
if i < k {
return c, len(s)
}
if s[i] == '\\' {
i++
if i == len(s) {
return context{
state: stateError,
err: errorf(ErrPartialEscape, 0, "unfinished escape sequence in JS string: %q", s),
}, len(s)
}
} else {
c.state, c.jsCtx = stateJS, jsCtxDivOp
return c, i + 1
}
k = i + 1
}
panic("unreachable")
}
// tJSRegexp is the context transition function for the /RegExp/ literal state.
func tJSRegexp(c context, s []byte) (context, int) {
if d, i := tSpecialTagEnd(c, s); i != len(s) {
return d, i
} }
k, inCharset := 0, false k, inCharset := 0, false
for { for {
i := k + bytes.IndexAny(s[k:], `\/[]`) i := k + bytes.IndexAny(s[k:], specials)
if i < k { if i < k {
break break
} }
switch s[i] { switch s[i] {
case '/':
if !inCharset {
c.state, c.jsCtx = stateJS, jsCtxDivOp
return c, i + 1
}
case '\\': case '\\':
i++ i++
if i == len(s) { if i == len(s) {
return context{ return context{
state: stateError, state: stateError,
err: errorf(ErrPartialEscape, 0, "unfinished escape sequence in JS regexp: %q", s), err: errorf(ErrPartialEscape, 0, "unfinished escape sequence in JS string: %q", s),
}, len(s) }, len(s)
} }
case '[': case '[':
...@@ -323,7 +285,11 @@ func tJSRegexp(c context, s []byte) (context, int) { ...@@ -323,7 +285,11 @@ func tJSRegexp(c context, s []byte) (context, int) {
case ']': case ']':
inCharset = false inCharset = false
default: default:
panic("unreachable") // end delimiter
if !inCharset {
c.state, c.jsCtx = stateJS, jsCtxDivOp
return c, i + 1
}
} }
k = i + 1 k = i + 1
} }
...@@ -344,9 +310,6 @@ var blockCommentEnd = []byte("*/") ...@@ -344,9 +310,6 @@ var blockCommentEnd = []byte("*/")
// tBlockCmt is the context transition function for /*comment*/ states. // tBlockCmt is the context transition function for /*comment*/ states.
func tBlockCmt(c context, s []byte) (context, int) { func tBlockCmt(c context, s []byte) (context, int) {
if d, i := tSpecialTagEnd(c, s); i != len(s) {
return d, i
}
i := bytes.Index(s, blockCommentEnd) i := bytes.Index(s, blockCommentEnd)
if i == -1 { if i == -1 {
return c, len(s) return c, len(s)
...@@ -364,9 +327,6 @@ func tBlockCmt(c context, s []byte) (context, int) { ...@@ -364,9 +327,6 @@ func tBlockCmt(c context, s []byte) (context, int) {
// tLineCmt is the context transition function for //comment states. // tLineCmt is the context transition function for //comment states.
func tLineCmt(c context, s []byte) (context, int) { func tLineCmt(c context, s []byte) (context, int) {
if d, i := tSpecialTagEnd(c, s); i != len(s) {
return d, i
}
var lineTerminators string var lineTerminators string
var endState state var endState state
switch c.state { switch c.state {
...@@ -400,10 +360,6 @@ func tLineCmt(c context, s []byte) (context, int) { ...@@ -400,10 +360,6 @@ func tLineCmt(c context, s []byte) (context, int) {
// tCSS is the context transition function for the CSS state. // tCSS is the context transition function for the CSS state.
func tCSS(c context, s []byte) (context, int) { func tCSS(c context, s []byte) (context, int) {
if d, i := tSpecialTagEnd(c, s); i != len(s) {
return d, i
}
// CSS quoted strings are almost never used except for: // CSS quoted strings are almost never used except for:
// (1) URLs as in background: "/foo.png" // (1) URLs as in background: "/foo.png"
// (2) Multiword font-names as in font-family: "Times New Roman" // (2) Multiword font-names as in font-family: "Times New Roman"
...@@ -478,10 +434,6 @@ func tCSS(c context, s []byte) (context, int) { ...@@ -478,10 +434,6 @@ func tCSS(c context, s []byte) (context, int) {
// tCSSStr is the context transition function for the CSS string and URL states. // tCSSStr is the context transition function for the CSS string and URL states.
func tCSSStr(c context, s []byte) (context, int) { func tCSSStr(c context, s []byte) (context, int) {
if d, i := tSpecialTagEnd(c, s); i != len(s) {
return d, i
}
var endAndEsc string var endAndEsc string
switch c.state { switch c.state {
case stateCSSDqStr, stateCSSDqURL: case stateCSSDqStr, stateCSSDqURL:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment