Commit f75ff01f authored by Brad Fitzpatrick

strings: implement a faster byte->byte Replacer

When all old & new string values are single bytes,
byteReplacer is now used, instead of the generic
algorithm.

BenchmarkGenericMatch       10000  102519 ns/op
BenchmarkByteByteMatch    1000000    2178 ns/op

fast path, when nothing matches:
BenchmarkByteByteNoMatch  1000000    1109 ns/op

comparisons to multiple Replace calls:
BenchmarkByteByteReplaces  100000   16164 ns/op

comparison to strings.Map:
BenchmarkByteByteMap       500000    5454 ns/op

R=rsc
CC=golang-dev
https://golang.org/cl/5175050
parent 85916146
...@@ -207,17 +207,16 @@ func readCookies(h Header, filter string) []*Cookie { ...@@ -207,17 +207,16 @@ func readCookies(h Header, filter string) []*Cookie {
return cookies return cookies
} }
var cookieNameSanitizer = strings.NewReplacer("\n", "-", "\r", "-")
func sanitizeName(n string) string { func sanitizeName(n string) string {
n = strings.Replace(n, "\n", "-", -1) return cookieNameSanitizer.Replace(n)
n = strings.Replace(n, "\r", "-", -1)
return n
} }
var cookieValueSanitizer = strings.NewReplacer("\n", " ", "\r", " ", ";", " ")
func sanitizeValue(v string) string { func sanitizeValue(v string) string {
v = strings.Replace(v, "\n", " ", -1) return cookieValueSanitizer.Replace(v)
v = strings.Replace(v, "\r", " ", -1)
v = strings.Replace(v, ";", " ", -1)
return v
} }
func unquoteCookieValue(v string) string { func unquoteCookieValue(v string) string {
......
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings
// Replacer returns the unexported replacement algorithm stored in r.r.
// It lets tests inspect which concrete implementation NewReplacer
// selected (for example by formatting the result with %T).
func (r *Replacer) Replacer() interface{} {
return r.r
}
...@@ -9,20 +9,24 @@ import ( ...@@ -9,20 +9,24 @@ import (
"os" "os"
) )
// Can't import ioutil for ioutil.Discard, due to ioutil/tempfile.go -> strconv -> strings // A Replacer replaces a list of strings with replacements.
var discard io.Writer = devNull(0) type Replacer struct {
r replacer
type devNull int }
func (devNull) Write(p []byte) (int, os.Error) { // replacer is the interface that a replacement algorithm needs to implement.
return len(p), nil type replacer interface {
Replace(s string) string
WriteString(w io.Writer, s string) (n int, err os.Error)
} }
type pair struct{ old, new string } // byteBitmap represents bytes which are sought for replacement.
// byteBitmap is 256 bits wide, with a bit set for each old byte to be
// replaced.
type byteBitmap [256 / 32]uint32
// A Replacer replaces a list of strings with replacements. func (m *byteBitmap) set(b byte) {
type Replacer struct { m[b>>5] |= uint32(1 << (b & 31))
p []pair
} }
// NewReplacer returns a new Replacer from a list of old, new string pairs. // NewReplacer returns a new Replacer from a list of old, new string pairs.
...@@ -31,14 +35,51 @@ func NewReplacer(oldnew ...string) *Replacer { ...@@ -31,14 +35,51 @@ func NewReplacer(oldnew ...string) *Replacer {
if len(oldnew)%2 == 1 { if len(oldnew)%2 == 1 {
panic("strings.NewReplacer: odd argument count") panic("strings.NewReplacer: odd argument count")
} }
r := new(Replacer)
for len(oldnew) >= 2 { var bb byteReplacer
r.p = append(r.p, pair{oldnew[0], oldnew[1]}) var gen genericReplacer
allOldBytes, allNewBytes := true, true
for len(oldnew) > 0 {
old, new := oldnew[0], oldnew[1]
oldnew = oldnew[2:] oldnew = oldnew[2:]
if len(old) != 1 {
allOldBytes = false
}
if len(new) != 1 {
allNewBytes = false
}
gen.p = append(gen.p, pair{old, new})
if allOldBytes && allNewBytes {
bb.old.set(old[0])
bb.new[old[0]] = new[0]
}
} }
return r
if allOldBytes && allNewBytes {
return &Replacer{r: &bb}
}
return &Replacer{r: &gen}
}
// Replace returns a copy of s with all replacements performed.
// The work is delegated to the replacement algorithm stored in r.r.
func (r *Replacer) Replace(s string) string {
return r.r.Replace(s)
}
// WriteString writes s to w with all replacements performed.
// The work is delegated to the replacement algorithm stored in r.r;
// n and err are exactly what that algorithm's WriteString returns.
func (r *Replacer) WriteString(w io.Writer, s string) (n int, err os.Error) {
return r.r.WriteString(w, s)
}
// genericReplacer is the fully generic (and least optimized) algorithm.
// It's used as a fallback when nothing faster can be used.
type genericReplacer struct {
p []pair
} }
type pair struct{ old, new string }
type appendSliceWriter struct { type appendSliceWriter struct {
b []byte b []byte
} }
...@@ -48,8 +89,7 @@ func (w *appendSliceWriter) Write(p []byte) (int, os.Error) { ...@@ -48,8 +89,7 @@ func (w *appendSliceWriter) Write(p []byte) (int, os.Error) {
return len(p), nil return len(p), nil
} }
// Replace returns a copy of s with all replacements performed. func (r *genericReplacer) Replace(s string) string {
func (r *Replacer) Replace(s string) string {
// TODO(bradfitz): optimized version // TODO(bradfitz): optimized version
n, _ := r.WriteString(discard, s) n, _ := r.WriteString(discard, s)
w := appendSliceWriter{make([]byte, 0, n)} w := appendSliceWriter{make([]byte, 0, n)}
...@@ -57,19 +97,28 @@ func (r *Replacer) Replace(s string) string { ...@@ -57,19 +97,28 @@ func (r *Replacer) Replace(s string) string {
return string(w.b) return string(w.b)
} }
// WriteString writes s to w with all replacements performed. func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err os.Error) {
func (r *Replacer) WriteString(w io.Writer, s string) (n int, err os.Error) { lastEmpty := false // the last replacement was of the empty string
Input: Input:
// TODO(bradfitz): optimized version // TODO(bradfitz): optimized version
for i := 0; i < len(s); { for i := 0; i < len(s); {
for _, p := range r.p { for _, p := range r.p {
if p.old == "" && lastEmpty {
// Don't let old match twice in a row.
// (it doesn't advance the input and
// would otherwise loop forever)
continue
}
if HasPrefix(s[i:], p.old) { if HasPrefix(s[i:], p.old) {
wn, err := w.Write([]byte(p.new)) if p.new != "" {
n += wn wn, err := w.Write([]byte(p.new))
if err != nil { n += wn
return n, err if err != nil {
return n, err
}
} }
i += len(p.old) i += len(p.old)
lastEmpty = p.old == ""
continue Input continue Input
} }
} }
...@@ -80,5 +129,81 @@ Input: ...@@ -80,5 +129,81 @@ Input:
} }
i++ i++
} }
// Final empty match at end.
for _, p := range r.p {
if p.old == "" {
if p.new != "" {
wn, err := w.Write([]byte(p.new))
n += wn
if err != nil {
return n, err
}
}
break
}
}
return n, nil
}
// byteReplacer is the implementation that's used when all the "old"
// and "new" values are exactly one byte long. The byte may have any
// value (0-255); it is not restricted to ASCII.
type byteReplacer struct {
// old has a bit set for each old byte that should be replaced.
old byteBitmap
// new holds the replacement byte, indexed by old byte. An entry is
// only valid if the corresponding bit in old is set.
new [256]byte
}
// Replace returns s with every byte whose bit is set in r.old swapped
// for the matching entry of r.new. When no byte of s needs replacing,
// s itself is returned and nothing is allocated.
func (r *byteReplacer) Replace(s string) string {
	var out []byte // stays nil until the first byte that must change
	for pos := 0; pos < len(s); pos++ {
		c := s[pos]
		mask := uint32(1) << (c & 31)
		if r.old[c>>5]&mask == 0 {
			// Not one of the sought bytes; keep it as is.
			continue
		}
		if out == nil {
			// First hit: take a mutable copy of the whole input.
			out = []byte(s)
		}
		out[pos] = r.new[c]
	}
	if out != nil {
		return string(out)
	}
	return s
}
func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err os.Error) {
bufsize := 32 << 10
if len(s) < bufsize {
bufsize = len(s)
}
buf := make([]byte, bufsize)
for len(s) > 0 {
ncopy := copy(buf, s[:])
s = s[ncopy:]
for i, b := range buf[:ncopy] {
if r.old[b>>5]&uint32(1<<(b&31)) != 0 {
buf[i] = r.new[b]
}
}
wn, err := w.Write(buf[:ncopy])
n += wn
if err != nil {
return n, err
}
}
return n, nil return n, nil
} }
// strings is too low-level to import io/ioutil
// discard is an io.Writer that accepts and drops everything written to
// it. genericReplacer.Replace writes to it first to learn the output
// length before allocating the result buffer.
var discard io.Writer = devNull(0)
// devNull is the type behind discard; it carries no state.
type devNull int
// Write reports all of p as written without storing it and never
// returns an error.
func (devNull) Write(p []byte) (int, os.Error) {
return len(p), nil
}
...@@ -5,12 +5,17 @@ ...@@ -5,12 +5,17 @@
package strings_test package strings_test
import ( import (
"bytes"
"fmt"
"log"
. "strings" . "strings"
"testing" "testing"
) )
var _ = log.Printf
type ReplacerTest struct { type ReplacerTest struct {
m *Replacer r *Replacer
in string in string
out string out string
} }
...@@ -31,6 +36,10 @@ var replacer = NewReplacer("aaa", "3[aaa]", "aa", "2[aa]", "a", "1[a]", "i", "i" ...@@ -31,6 +36,10 @@ var replacer = NewReplacer("aaa", "3[aaa]", "aa", "2[aa]", "a", "1[a]", "i", "i"
"longerst", "most long", "longer", "medium", "long", "short", "longerst", "most long", "longer", "medium", "long", "short",
"X", "Y", "Y", "Z") "X", "Y", "Y", "Z")
var capitalLetters = NewReplacer("a", "A", "b", "B")
var blankToXReplacer = NewReplacer("", "X", "o", "O")
var ReplacerTests = []ReplacerTest{ var ReplacerTests = []ReplacerTest{
{htmlEscaper, "No changes", "No changes"}, {htmlEscaper, "No changes", "No changes"},
{htmlEscaper, "I <3 escaping & stuff", "I &lt;3 escaping &amp; stuff"}, {htmlEscaper, "I <3 escaping & stuff", "I &lt;3 escaping &amp; stuff"},
...@@ -38,38 +47,98 @@ var ReplacerTests = []ReplacerTest{ ...@@ -38,38 +47,98 @@ var ReplacerTests = []ReplacerTest{
{replacer, "fooaaabar", "foo3[aaa]b1[a]r"}, {replacer, "fooaaabar", "foo3[aaa]b1[a]r"},
{replacer, "long, longerst, longer", "short, most long, medium"}, {replacer, "long, longerst, longer", "short, most long, medium"},
{replacer, "XiX", "YiY"}, {replacer, "XiX", "YiY"},
{capitalLetters, "brad", "BrAd"},
{capitalLetters, Repeat("a", (32<<10)+123), Repeat("A", (32<<10)+123)},
{blankToXReplacer, "oo", "XOXOX"},
} }
func TestReplacer(t *testing.T) { func TestReplacer(t *testing.T) {
for i, tt := range ReplacerTests { for i, tt := range ReplacerTests {
if s := tt.m.Replace(tt.in); s != tt.out { if s := tt.r.Replace(tt.in); s != tt.out {
t.Errorf("%d. Replace(%q) = %q, want %q", i, tt.in, s, tt.out) t.Errorf("%d. Replace(%q) = %q, want %q", i, tt.in, s, tt.out)
} }
var buf bytes.Buffer
n, err := tt.r.WriteString(&buf, tt.in)
if err != nil {
t.Errorf("%d. WriteString: %v", i, err)
continue
}
got := buf.String()
if got != tt.out {
t.Errorf("%d. WriteString(%q) wrote %q, want %q", i, tt.in, got, tt.out)
continue
}
if n != len(tt.out) {
t.Errorf("%d. WriteString(%q) wrote correct string but reported %d bytes; want %d (%q)",
i, tt.in, n, len(tt.out), tt.out)
}
} }
} }
var slowReplacer = NewReplacer("&&", "&amp;", "<<", "&lt;", ">>", "&gt;", "\"\"", "&quot;", "''", "&apos;") // pickAlgorithmTest is a test that verifies that given input for a
// Replacer that we pick the correct algorithm.
type pickAlgorithmTest struct {
r *Replacer
want string // name of algorithm
}
// pickAlgorithmTests pairs a Replacer with the %T name of the
// implementation NewReplacer is expected to have chosen for it.
var pickAlgorithmTests = []pickAlgorithmTest{
{capitalLetters, "*strings.byteReplacer"},
// The new value "Bb" is two bytes long, so the byte->byte fast path
// does not apply and the generic algorithm must be selected.
{NewReplacer("a", "A", "b", "Bb"), "*strings.genericReplacer"},
}
func BenchmarkReplacerSingleByte(b *testing.B) { func TestPickAlgorithm(t *testing.T) {
str := "I <3 benchmarking html & other stuff too >:D" for i, tt := range pickAlgorithmTests {
n := 0 got := fmt.Sprintf("%T", tt.r.Replacer())
if got != tt.want {
t.Errorf("%d. algorithm = %s, want %s", i, got, tt.want)
}
}
}
// BenchmarkGenericMatch measures Replace with the generic algorithm on
// input in which every byte matches one of the old strings. It uses the
// same input as BenchmarkByteByteMatch so the two results can be
// compared directly.
func BenchmarkGenericMatch(b *testing.B) {
	// The input must be lowercase: the old strings are "a", "b" and "12",
	// so an uppercase input would never match and this would silently
	// measure the no-match path instead.
	str := Repeat("a", 100) + Repeat("b", 100)
	generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic
	for i := 0; i < b.N; i++ {
		generic.Replace(str)
	}
}
func BenchmarkByteByteNoMatch(b *testing.B) {
str := Repeat("A", 100) + Repeat("B", 100)
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
n += len(htmlEscaper.Replace(str)) capitalLetters.Replace(str)
} }
} }
func BenchmarkReplaceMap(b *testing.B) { func BenchmarkByteByteMatch(b *testing.B) {
str := "I <<3 benchmarking html && other stuff too >>:D" str := Repeat("a", 100) + Repeat("b", 100)
n := 0
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
n += len(slowReplacer.Replace(str)) capitalLetters.Replace(str)
} }
} }
func BenchmarkOldHTTPHTMLReplace(b *testing.B) { // BenchmarkByteByteReplaces compares byteByteImpl against multiple Replaces.
str := "I <3 benchmarking html & other stuff too >:D" func BenchmarkByteByteReplaces(b *testing.B) {
n := 0 str := Repeat("a", 100) + Repeat("b", 100)
for i := 0; i < b.N; i++ {
Replace(Replace(str, "a", "A", -1), "b", "B", -1)
}
}
// BenchmarkByteByteMap compares byteReplacer against Map.
func BenchmarkByteByteMap(b *testing.B) {
str := Repeat("a", 100) + Repeat("b", 100)
fn := func(r int) int {
switch r {
case 'a':
return int('A')
case 'b':
return int('B')
}
return r
}
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
n += len(oldhtmlEscape(str)) Map(fn, str)
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment