Commit 5b920281 authored by Ingo Oeser's avatar Ingo Oeser Committed by Brad Fitzpatrick

html: speed up UnescapeString

Add benchmarks for for sparsely escaped and densely escaped strings.
Then speed up the sparse unescaping part heavily by using IndexByte and
copy to skip the parts containing no escaping very fast.

Unescaping densely escaped strings slower because of
the new function call overhead. But sparsely encoded strings are seen
more often in the utf8 enabled web.

We win part of the speed back by looking up entityName differently.

	benchmark                  old ns/op    new ns/op    delta
	BenchmarkEscape                31680        31396   -0.90%
	BenchmarkEscapeNone             6507         6872   +5.61%
	BenchmarkUnescape              36481        48298  +32.39%
	BenchmarkUnescapeNone            332          325   -2.11%
	BenchmarkUnescapeSparse         8836         3221  -63.55%
	BenchmarkUnescapeDense         30639        32224   +5.17%

Change-Id: If606cb01897a40eefe35ba98f2ff23bb25251606
Reviewed-on: https://go-review.googlesource.com/10172Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 5f859ba8
......@@ -57,8 +57,9 @@ var replacementTable = [...]rune{
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
// attribute should be true if parsing an attribute value.
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
const attribute = false
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
// i starts at 1 because we already know that s[0] == '&'.
......@@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
break
}
entityName := string(s[1:i])
if entityName == "" {
entityName := s[1:i]
if len(entityName) == 0 {
// No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op.
} else if x := entity[entityName]; x != 0 {
} else if x := entity[string(entityName)]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 {
} else if x := entity2[string(entityName)]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if !attribute {
......@@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
maxLen = longestEntityWithoutSemicolon
}
for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 {
if x := entity[string(entityName[:j])]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
}
}
......@@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
return dst1, src1
}
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
func unescape(b []byte) []byte {
for i, c := range b {
if c == '&' {
dst, src := unescapeEntity(b, i, i, false)
for src < len(b) {
c := b[src]
if c == '&' {
dst, src = unescapeEntity(b, dst, src, false)
} else {
b[dst] = c
dst, src = dst+1, src+1
}
}
return b[0:dst]
}
}
return b
}
var htmlEscaper = strings.NewReplacer(
`&`, "&amp;",
`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
......@@ -208,8 +189,29 @@ func EscapeString(s string) string {
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func UnescapeString(s string) string {
if !strings.Contains(s, "&") {
i := strings.IndexByte(s, '&')
if i < 0 {
return s
}
return string(unescape([]byte(s)))
b := []byte(s)
dst, src := unescapeEntity(b, i, i)
for len(s[src:]) > 0 {
if s[src] == '&' {
i = 0
} else {
i = strings.IndexByte(s[src:], '&')
}
if i < 0 {
dst += copy(b[dst:], s[src:])
break
}
if i > 0 {
copy(b[dst:], s[src:src+i])
}
dst, src = unescapeEntity(b, dst+i, src+i)
}
return string(b[:dst])
}
......@@ -120,6 +120,8 @@ func TestUnescapeEscape(t *testing.T) {
var (
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)
benchUnescapeDense = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
)
func BenchmarkEscape(b *testing.B) {
......@@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) {
n += len(UnescapeString(s))
}
}
func BenchmarkUnescapeSparse(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(benchUnescapeSparse))
}
}
func BenchmarkUnescapeDense(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(benchUnescapeDense))
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment