Commit 0aad3cdc authored by Eric Eisner's avatar Eric Eisner Committed by Nigel Tao

strings: implement a faster generic Replacer

This also fixes the semantics of some corner cases with the empty
match. TODOs for genericReplacer in the tests are fixed.

benchmark                  old ns/op    new ns/op    delta
BenchmarkGenericNoMatch        71395         3132  -95.61%
BenchmarkGenericMatch1         75610        20280  -73.18%
BenchmarkGenericMatch2        837995        86725  -89.65%

R=nigeltao, rsc
CC=golang-dev
https://golang.org/cl/6492076
parent a1a414e6
...@@ -7,3 +7,30 @@ package strings ...@@ -7,3 +7,30 @@ package strings
func (r *Replacer) Replacer() interface{} { func (r *Replacer) Replacer() interface{} {
return r.r return r.r
} }
func (r *Replacer) PrintTrie() string {
gen := r.r.(*genericReplacer)
return gen.printNode(&gen.root, 0)
}
func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) {
if t.priority > 0 {
s += "+"
} else {
s += "-"
}
s += "\n"
if t.prefix != "" {
s += Repeat(".", depth) + t.prefix
s += r.printNode(t.next, depth+len(t.prefix))
} else if t.table != nil {
for b, m := range r.mapping {
if int(m) != r.tableSize && t.table[m] != nil {
s += Repeat(".", depth) + string([]byte{byte(b)})
s += r.printNode(t.table[m], depth+1)
}
}
}
return
}
...@@ -80,89 +80,264 @@ func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) { ...@@ -80,89 +80,264 @@ func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
return r.r.WriteString(w, s) return r.r.WriteString(w, s)
} }
// genericReplacer is the fully generic (and least optimized) algorithm. // trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
// and values may be empty. For example, the trie containing keys "ax", "ay",
// "bcbc", "x" and "xy" could have eight nodes:
//
// n0 -
// n1 a-
// n2 .x+
// n3 .y+
// n4 b-
// n5 .cbc+
// n6 x+
// n7 .y+
//
// n0 is the root node, and its children are n1, n4 and n6; n1's children are
// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
// (marked with a trailing "+") are complete keys.
type trieNode struct {
// value is the value of the trie node's key/value pair. It is empty if
// this node is not a complete key.
value string
// priority is the priority (higher is more important) of the trie node's
// key/value pair; keys are not necessarily matched shortest- or longest-
// first. Priority is positive if this node is a complete key, and zero
// otherwise. In the example above, positive/zero priorities are marked
// with a trailing "+" or "-".
priority int
// A trie node may have zero, one or more child nodes:
// * if the remaining fields are zero, there are no children.
// * if prefix and next are non-zero, there is one child in next.
// * if table is non-zero, it defines all the children.
//
// Prefixes are preferred over tables when there is one child, but the
// root node always uses a table for lookup efficiency.
// prefix is the difference in keys between this trie node and the next.
// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
// Node n5 has no children and so has zero prefix, next and table fields.
prefix string
next *trieNode
// table is a lookup table indexed by the next byte in the key, after
// remapping that byte through genericReplacer.mapping to create a dense
// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
// genericReplacer.tableSize will be 5. Node n0's table will be
// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
// 'a', 'b' and 'x'.
table []*trieNode
}
func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
if key == "" {
if t.priority == 0 {
t.value = val
t.priority = priority
}
return
}
if t.prefix != "" {
// Need to split the prefix among multiple nodes.
var n int // length of the longest common prefix
for ; n < len(t.prefix) && n < len(key); n++ {
if t.prefix[n] != key[n] {
break
}
}
if n == len(t.prefix) {
t.next.add(key[n:], val, priority, r)
} else if n == 0 {
// First byte differs, start a new lookup table here. Looking up
// what is currently t.prefix[0] will lead to prefixNode, and
// looking up key[0] will lead to keyNode.
var prefixNode *trieNode
if len(t.prefix) == 1 {
prefixNode = t.next
} else {
prefixNode = &trieNode{
prefix: t.prefix[1:],
next: t.next,
}
}
keyNode := new(trieNode)
t.table = make([]*trieNode, r.tableSize)
t.table[r.mapping[t.prefix[0]]] = prefixNode
t.table[r.mapping[key[0]]] = keyNode
t.prefix = ""
t.next = nil
keyNode.add(key[1:], val, priority, r)
} else {
// Insert new node after the common section of the prefix.
next := &trieNode{
prefix: t.prefix[n:],
next: t.next,
}
t.prefix = t.prefix[:n]
t.next = next
next.add(key[n:], val, priority, r)
}
} else if t.table != nil {
// Insert into existing table.
m := r.mapping[key[0]]
if t.table[m] == nil {
t.table[m] = new(trieNode)
}
t.table[m].add(key[1:], val, priority, r)
} else {
t.prefix = key
t.next = new(trieNode)
t.next.add("", val, priority, r)
}
}
func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
// Iterate down the trie to the end, and grab the value and keylen with
// the highest priority.
bestPriority := 0
node := &r.root
n := 0
for node != nil {
if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
bestPriority = node.priority
val = node.value
keylen = n
found = true
}
if s == "" {
break
}
if node.table != nil {
index := r.mapping[s[0]]
if int(index) == r.tableSize {
break
}
node = node.table[index]
s = s[1:]
n++
} else if node.prefix != "" && HasPrefix(s, node.prefix) {
n += len(node.prefix)
s = s[len(node.prefix):]
node = node.next
} else {
break
}
}
return
}
// genericReplacer is the fully generic algorithm.
// It's used as a fallback when nothing faster can be used. // It's used as a fallback when nothing faster can be used.
type genericReplacer struct { type genericReplacer struct {
p []pair root trieNode
// tableSize is the size of a trie node's lookup table. It is the number
// of unique key bytes.
tableSize int
// mapping maps from key bytes to a dense index for trieNode.table.
mapping [256]byte
} }
type pair struct{ old, new string }
func makeGenericReplacer(oldnew []string) *genericReplacer { func makeGenericReplacer(oldnew []string) *genericReplacer {
gen := &genericReplacer{ r := new(genericReplacer)
p: make([]pair, len(oldnew)/2), // Find each byte used, then assign them each an index.
for i := 0; i < len(oldnew); i += 2 {
key := oldnew[i]
for j := 0; j < len(key); j++ {
r.mapping[key[j]] = 1
}
}
for _, b := range r.mapping {
r.tableSize += int(b)
} }
var index byte
for i, b := range r.mapping {
if b == 0 {
r.mapping[i] = byte(r.tableSize)
} else {
r.mapping[i] = index
index++
}
}
// Ensure root node uses a lookup table (for performance).
r.root.table = make([]*trieNode, r.tableSize)
for i := 0; i < len(oldnew); i += 2 { for i := 0; i < len(oldnew); i += 2 {
gen.p[i/2] = pair{oldnew[i], oldnew[i+1]} r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
} }
return gen return r
} }
type appendSliceWriter struct { type appendSliceWriter []byte
b []byte
}
// Write writes to the buffer to satisfy io.Writer.
func (w *appendSliceWriter) Write(p []byte) (int, error) { func (w *appendSliceWriter) Write(p []byte) (int, error) {
w.b = append(w.b, p...) *w = append(*w, p...)
return len(p), nil return len(p), nil
} }
// WriteString writes to the buffer without string->[]byte->string allocations.
func (w *appendSliceWriter) WriteString(s string) (int, error) {
*w = append(*w, s...)
return len(s), nil
}
type stringWriter struct {
w io.Writer
}
func (w stringWriter) WriteString(s string) (int, error) {
return w.w.Write([]byte(s))
}
func (r *genericReplacer) Replace(s string) string { func (r *genericReplacer) Replace(s string) string {
// TODO(bradfitz): optimized version buf := make(appendSliceWriter, 0, len(s))
n, _ := r.WriteString(discard, s) r.WriteString(&buf, s)
w := appendSliceWriter{make([]byte, 0, n)} return string(buf)
r.WriteString(&w, s)
return string(w.b)
} }
func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) { func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
lastEmpty := false // the last replacement was of the empty string sw, ok := w.(interface {
Input: WriteString(string) (int, error)
// TODO(bradfitz): optimized version })
for i := 0; i < len(s); { if !ok {
for _, p := range r.p { sw = stringWriter{w}
if p.old == "" && lastEmpty { }
// Don't let old match twice in a row.
// (it doesn't advance the input and var last, wn int
// would otherwise loop forever) var prevMatchEmpty bool
continue for i := 0; i <= len(s); {
// Ignore the empty match iff the previous loop found the empty match.
val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
prevMatchEmpty = match && keylen == 0
if match {
wn, err = sw.WriteString(s[last:i])
n += wn
if err != nil {
return
} }
if HasPrefix(s[i:], p.old) { wn, err = sw.WriteString(val)
if p.new != "" { n += wn
wn, err := w.Write([]byte(p.new)) if err != nil {
n += wn return
if err != nil {
return n, err
}
}
i += len(p.old)
lastEmpty = p.old == ""
continue Input
} }
} i += keylen
wn, err := w.Write([]byte{s[i]}) last = i
n += wn continue
if err != nil {
return n, err
} }
i++ i++
} }
if last != len(s) {
// Final empty match at end. wn, err = sw.WriteString(s[last:])
for _, p := range r.p { n += wn
if p.old == "" {
if p.new != "" {
wn, err := w.Write([]byte(p.new))
n += wn
if err != nil {
return n, err
}
}
break
}
} }
return
return n, nil
} }
// byteReplacer is the implementation that's used when all the "old" // byteReplacer is the implementation that's used when all the "old"
...@@ -305,12 +480,3 @@ func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err erro ...@@ -305,12 +480,3 @@ func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err erro
} }
return n, nil return n, nil
} }
// strings is too low-level to import io/ioutil
var discard io.Writer = devNull(0)
type devNull int
func (devNull) Write(p []byte) (int, error) {
return len(p), nil
}
...@@ -219,21 +219,29 @@ func TestReplacer(t *testing.T) { ...@@ -219,21 +219,29 @@ func TestReplacer(t *testing.T) {
blankToX1 := NewReplacer("", "X") blankToX1 := NewReplacer("", "X")
blankToX2 := NewReplacer("", "X", "", "") blankToX2 := NewReplacer("", "X", "", "")
blankToXOToO := NewReplacer("", "X", "o", "O") blankHighPriority := NewReplacer("", "X", "o", "O")
blankLowPriority := NewReplacer("o", "O", "", "X")
blankNoOp1 := NewReplacer("", "") blankNoOp1 := NewReplacer("", "")
blankNoOp2 := NewReplacer("", "", "", "A") blankNoOp2 := NewReplacer("", "", "", "A")
blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z") blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z")
testCases = append(testCases, testCases = append(testCases,
testCase{blankToX1, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"? testCase{blankToX1, "foo", "XfXoXoX"},
testCase{blankToX1, "", "X"}, testCase{blankToX1, "", "X"},
testCase{blankToX2, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"? testCase{blankToX2, "foo", "XfXoXoX"},
testCase{blankToX2, "", "X"}, testCase{blankToX2, "", "X"},
testCase{blankToXOToO, "oo", "XOXOX"}, testCase{blankHighPriority, "oo", "XOXOX"},
testCase{blankToXOToO, "ii", "XiiX"}, // TODO: should this be "XiXiX"? testCase{blankHighPriority, "ii", "XiXiX"},
testCase{blankToXOToO, "iooi", "XiOXOXiX"}, // TODO: should this be "XiXOXOXiX"? testCase{blankHighPriority, "oiio", "XOXiXiXOX"},
testCase{blankToXOToO, "", "X"}, testCase{blankHighPriority, "iooi", "XiXOXOXiX"},
testCase{blankHighPriority, "", "X"},
testCase{blankLowPriority, "oo", "OOX"},
testCase{blankLowPriority, "ii", "XiXiX"},
testCase{blankLowPriority, "oiio", "OXiXiOX"},
testCase{blankLowPriority, "iooi", "XiOOXiX"},
testCase{blankLowPriority, "", "X"},
testCase{blankNoOp1, "foo", "foo"}, testCase{blankNoOp1, "foo", "foo"},
testCase{blankNoOp1, "", ""}, testCase{blankNoOp1, "", ""},
...@@ -242,7 +250,7 @@ func TestReplacer(t *testing.T) { ...@@ -242,7 +250,7 @@ func TestReplacer(t *testing.T) {
testCase{blankNoOp2, "", ""}, testCase{blankNoOp2, "", ""},
testCase{blankFoo, "foobarfoobaz", "XRXZX"}, testCase{blankFoo, "foobarfoobaz", "XRXZX"},
testCase{blankFoo, "foobar-foobaz", "XRX-ZX"}, // TODO: should this be "XRX-XZX"? testCase{blankFoo, "foobar-foobaz", "XRX-XZX"},
testCase{blankFoo, "", "X"}, testCase{blankFoo, "", "X"},
) )
...@@ -298,6 +306,78 @@ func TestPickAlgorithm(t *testing.T) { ...@@ -298,6 +306,78 @@ func TestPickAlgorithm(t *testing.T) {
} }
} }
// TestGenericTrieBuilding verifies the structure of the generated trie. There
// is one node per line, and the key ending with the current line is in the
// trie if it ends with a "+".
func TestGenericTrieBuilding(t *testing.T) {
testCases := []struct{ in, out string }{
{"abc;abdef;abdefgh;xx;xy;z", `-
a-
.b-
..c+
..d-
...ef+
.....gh+
x-
.x+
.y+
z+
`},
{"abracadabra;abracadabrakazam;abraham;abrasion", `-
a-
.bra-
....c-
.....adabra+
...........kazam+
....h-
.....am+
....s-
.....ion+
`},
{"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `-
X+
Y+
a+
.a+
..a+
i+
l-
.ong+
....er+
......st+
x+
.x+
`},
{"foo;;foo;foo1", `+
f-
.oo+
...1+
`},
}
for _, tc := range testCases {
keys := Split(tc.in, ";")
args := make([]string, len(keys)*2)
for i, key := range keys {
args[i*2] = key
}
got := NewReplacer(args...).PrintTrie()
// Remove tabs from tc.out
wantbuf := make([]byte, 0, len(tc.out))
for i := 0; i < len(tc.out); i++ {
if tc.out[i] != '\t' {
wantbuf = append(wantbuf, tc.out[i])
}
}
want := string(wantbuf)
if got != want {
t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want)
}
}
}
func BenchmarkGenericNoMatch(b *testing.B) { func BenchmarkGenericNoMatch(b *testing.B) {
str := Repeat("A", 100) + Repeat("B", 100) str := Repeat("A", 100) + Repeat("B", 100)
generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment