Commit 8fcdc6a1, authored by Kyle Consalus, committed by Nigel Tao

Small performance improvements to the HTML tokenizer based on your 'TODO's.

R=nigeltao_golang
CC=golang-dev
https://golang.org/cl/1941042
parent bca31510
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
package html package html
import ( import (
"bytes"
"strings" "strings"
"utf8" "utf8"
) )
...@@ -60,18 +61,45 @@ func unescape(b []byte) []byte { ...@@ -60,18 +61,45 @@ func unescape(b []byte) []byte {
return b return b
} }
// escapedChars lists the bytes that escape rewrites as HTML entities.
const escapedChars = `&'<>"`

// escape writes s to buf, replacing every byte listed in escapedChars with
// its entity (&amp;, &apos;, &lt;, &gt; or &quot;). All other bytes are
// copied through unchanged.
func escape(buf *bytes.Buffer, s string) {
	for {
		pos := strings.IndexAny(s, escapedChars)
		if pos < 0 {
			break
		}
		// Emit the literal run that precedes the special byte.
		buf.WriteString(s[:pos])
		var entity string
		switch s[pos] {
		case '"':
			entity = "&quot;"
		case '>':
			entity = "&gt;"
		case '<':
			entity = "&lt;"
		case '\'':
			entity = "&apos;"
		case '&':
			entity = "&amp;"
		default:
			// Only reachable if escapedChars and this switch fall out of sync.
			panic("unrecognized escape character")
		}
		buf.WriteString(entity)
		// Continue scanning just past the byte that was replaced.
		s = s[pos+1:]
	}
	// Whatever remains contains no special bytes.
	buf.WriteString(s)
}
// EscapeString escapes special characters like "<" to become "&lt;". It // EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: amp, apos, lt, gt and quot. // escapes only five such characters: amp, apos, lt, gt and quot.
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true. // always true.
func EscapeString(s string) string { func EscapeString(s string) string {
// TODO(nigeltao): Do this much more efficiently. if strings.IndexAny(s, escapedChars) == -1 {
s = strings.Replace(s, `&`, `&amp;`, -1) return s
s = strings.Replace(s, `'`, `&apos;`, -1) }
s = strings.Replace(s, `<`, `&lt;`, -1) buf := bytes.NewBuffer(nil)
s = strings.Replace(s, `>`, `&gt;`, -1) escape(buf, s)
s = strings.Replace(s, `"`, `&quot;`, -1) return buf.String()
return s
} }
// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
package html package html
import ( import (
"bytes"
"io" "io"
"log" "log"
"os" "os"
...@@ -68,12 +69,19 @@ type Token struct { ...@@ -68,12 +69,19 @@ type Token struct {
// tagString returns a string representation of a tag Token's Data and Attr. // tagString returns a string representation of a tag Token's Data and Attr.
func (t Token) tagString() string { func (t Token) tagString() string {
// TODO(nigeltao): Don't use string concatenation; it is inefficient. if len(t.Attr) == 0 {
s := string(t.Data) return t.Data
}
buf := bytes.NewBuffer(nil)
buf.WriteString(t.Data)
for _, a := range t.Attr { for _, a := range t.Attr {
s += ` ` + a.Key + `="` + EscapeString(a.Val) + `"` buf.WriteByte(' ')
buf.WriteString(a.Key)
buf.WriteString(`="`)
escape(buf, a.Val)
buf.WriteByte('"')
} }
return s return buf.String()
} }
// String returns a string representation of the Token. // String returns a string representation of the Token.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment