Commit 2d9a50b9 authored by Didier Spezia's avatar Didier Spezia Committed by Brad Fitzpatrick

html: simplify and optimize escape/unescape

The html package uses some specific code to escape special characters.
Actually, the strings.Replacer can be used instead, and is much more
efficient. The converse operation is more complex but can still be
slightly optimized.

Credits to Ken Bloom (kabloom@google.com), who first submitted a
similar patch at https://codereview.appspot.com/141930043

Added benchmarks and slightly optimized UnescapeString.

benchmark                   old ns/op     new ns/op     delta
BenchmarkEscape-4           118713        19825         -83.30%
BenchmarkEscapeNone-4       87653         3784          -95.68%
BenchmarkUnescape-4         24888         23417         -5.91%
BenchmarkUnescapeNone-4     14423         157           -98.91%

benchmark                   old allocs     new allocs     delta
BenchmarkEscape-4           9              2              -77.78%
BenchmarkEscapeNone-4       0              0              +0.00%
BenchmarkUnescape-4         2              2              +0.00%
BenchmarkUnescapeNone-4     0              0              +0.00%

benchmark                   old bytes     new bytes     delta
BenchmarkEscape-4           24800         12288         -50.45%
BenchmarkEscapeNone-4       0             0             +0.00%
BenchmarkUnescape-4         10240         10240         +0.00%
BenchmarkUnescapeNone-4     0             0             +0.00%

Fixes #8697

Change-Id: I208261ed7cbe9b3dee6317851f8c0cf15528bce4
Reviewed-on: https://go-review.googlesource.com/9808
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 2b833666
...@@ -6,7 +6,6 @@ ...@@ -6,7 +6,6 @@
package html package html
import ( import (
"bytes"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
) )
...@@ -187,52 +186,20 @@ func unescape(b []byte) []byte { ...@@ -187,52 +186,20 @@ func unescape(b []byte) []byte {
return b return b
} }
const escapedChars = `&'<>"` var htmlEscaper = strings.NewReplacer(
`&`, "&amp;",
func escape(w writer, s string) error { `'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
i := strings.IndexAny(s, escapedChars) `<`, "&lt;",
for i != -1 { `>`, "&gt;",
if _, err := w.WriteString(s[:i]); err != nil { `"`, "&#34;", // "&#34;" is shorter than "&quot;".
return err )
}
var esc string
switch s[i] {
case '&':
esc = "&amp;"
case '\'':
// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
esc = "&#39;"
case '<':
esc = "&lt;"
case '>':
esc = "&gt;"
case '"':
// "&#34;" is shorter than "&quot;".
esc = "&#34;"
default:
panic("unrecognized escape character")
}
s = s[i+1:]
if _, err := w.WriteString(esc); err != nil {
return err
}
i = strings.IndexAny(s, escapedChars)
}
_, err := w.WriteString(s)
return err
}
// EscapeString escapes special characters like "<" to become "&lt;". It // EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: <, >, &, ' and ". // escapes only five such characters: <, >, &, ' and ".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true. // always true.
func EscapeString(s string) string { func EscapeString(s string) string {
if strings.IndexAny(s, escapedChars) == -1 { return htmlEscaper.Replace(s)
return s
}
var buf bytes.Buffer
escape(&buf, s)
return buf.String()
} }
// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
...@@ -241,10 +208,8 @@ func EscapeString(s string) string { ...@@ -241,10 +208,8 @@ func EscapeString(s string) string {
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true. // always true.
func UnescapeString(s string) string { func UnescapeString(s string) string {
for _, c := range s { if !strings.Contains(s, "&") {
if c == '&' { return s
return string(unescape([]byte(s)))
}
} }
return s return string(unescape([]byte(s)))
} }
...@@ -4,7 +4,10 @@ ...@@ -4,7 +4,10 @@
package html package html
import "testing" import (
"strings"
"testing"
)
type unescapeTest struct { type unescapeTest struct {
// A short description of the test case. // A short description of the test case.
...@@ -113,3 +116,38 @@ func TestUnescapeEscape(t *testing.T) { ...@@ -113,3 +116,38 @@ func TestUnescapeEscape(t *testing.T) {
} }
} }
} }
var (
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
)
func BenchmarkEscape(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(EscapeString(benchEscapeData))
}
}
func BenchmarkEscapeNone(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(EscapeString(benchEscapeNone))
}
}
func BenchmarkUnescape(b *testing.B) {
s := EscapeString(benchEscapeData)
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(s))
}
}
func BenchmarkUnescapeNone(b *testing.B) {
s := EscapeString(benchEscapeNone)
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(s))
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment