Commit cfcc3ebf authored by Marcel van Lohuizen's avatar Marcel van Lohuizen

exp/norm: changed API of Iter.

Motivations:
 - Simpler UI. Previous API proved a bit awkward for practical purposes.
 - Iter is often used in cases where one wants to be able to bail out early.
   The old implementation had too much look-ahead to be efficient.
Disadvantages:
 - ASCII performance is bad. This is unavoidable for tiny iterations.
   Example is included to show how to work around this.

Description:
Iter now iterates per boundary/segment. It returns a slice of bytes that
either points to the input bytes, the internal decomposition strings,
or the small internal buffer that each iterator has. In many cases, copying
bytes is avoided.
The method Seek was added to support jumping around the input without
having to reinitialize.

Details:
 - Table adjustments: some decompositions consist of multiple segments.
   Decompositions that are of this type are now marked so that Iter can
   handle them separately.
 - The old iterator had a different next function for different normal forms
   that was assigned to a function pointer called by Next.
   The new iterator uses this mechanism to switch between different modes
   for handling different types of input as well.  This greatly improves
   performance for Hangul and ASCII. It is also used for multi-segment
   decompositions.
 - input is now a struct of string and []byte, instead of an interface.
   This simplifies optimizing the ASCII case.

R=rsc
CC=golang-dev
https://golang.org/cl/6873072
parent 9aa70984
......@@ -30,22 +30,18 @@ type reorderBuffer struct {
src input
nsrc int
srcBytes inputBytes
srcString inputString
tmpBytes inputBytes
tmpBytes input
}
func (rb *reorderBuffer) init(f Form, src []byte) {
rb.f = *formTable[f]
rb.srcBytes = inputBytes(src)
rb.src = &rb.srcBytes
rb.src.setBytes(src)
rb.nsrc = len(src)
}
func (rb *reorderBuffer) initString(f Form, src string) {
rb.f = *formTable[f]
rb.srcString = inputString(src)
rb.src = &rb.srcString
rb.src.setString(src)
rb.nsrc = len(src)
}
......@@ -121,9 +117,9 @@ func (rb *reorderBuffer) insert(src input, i int, info Properties) bool {
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
saveNrune, saveNbyte := rb.nrune, rb.nbyte
rb.tmpBytes = inputBytes(dcomp)
rb.tmpBytes.setBytes(dcomp)
for i := 0; i < len(dcomp); {
info := rb.f.info(&rb.tmpBytes, i)
info := rb.f.info(rb.tmpBytes, i)
pos := rb.nbyte
if !rb.insertOrdered(info) {
rb.nrune, rb.nbyte = saveNrune, saveNbyte
......
......@@ -81,7 +81,7 @@ func flushF(rb *reorderBuffer) []byte {
}
func flushCopyF(rb *reorderBuffer) []byte {
out := make([]byte, MaxSegmentSize)
out := make([]byte, maxByteBufferSize)
n := rb.flushCopy(out)
return out[:n]
}
......
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm_test
import (
"bytes"
"exp/norm"
"fmt"
"unicode/utf8"
)
// EqualSimple reports whether a and b are equivalent once both are
// decomposed with norm.NFKD. Neither string needs to be normalized up
// front: two norm.Iter values walk the inputs in lockstep and the
// segments they return are compared pairwise.
func EqualSimple(a, b string) bool {
	var ia, ib norm.Iter
	ia.InitString(norm.NFKD, a)
	ib.InitString(norm.NFKD, b)
	for !(ia.Done() || ib.Done()) {
		segA := ia.Next()
		segB := ib.Next()
		if !bytes.Equal(segA, segB) {
			return false
		}
	}
	// Equivalent only if both inputs ran out at the same time.
	return ia.Done() && ib.Done()
}
// FindPrefix returns the length, in bytes, of the longest prefix shared
// by a and b that consists only of ASCII characters (bytes below
// utf8.RuneSelf).
func FindPrefix(a, b string) int {
	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}
	for n := 0; n < limit; n++ {
		// Stop at the first non-ASCII byte or the first mismatch.
		if c := a[n]; c >= utf8.RuneSelf || c != b[n] {
			return n
		}
	}
	return limit
}
// EqualOpt gives the same answer as EqualSimple but avoids running the
// iterators over stretches of identical ASCII. The common ASCII prefix is
// cut off before the iterators are initialized, and after every compared
// segment any further shared ASCII run is skipped with Seek.
func EqualOpt(a, b string) bool {
	common := FindPrefix(a, b)
	a = a[common:]
	b = b[common:]

	var ia, ib norm.Iter
	ia.InitString(norm.NFKD, a)
	ib.InitString(norm.NFKD, b)
	for !(ia.Done() || ib.Done()) {
		segA := ia.Next()
		segB := ib.Next()
		if !bytes.Equal(segA, segB) {
			return false
		}
		skip := int64(FindPrefix(a[ia.Pos():], b[ib.Pos():]))
		if skip == 0 {
			continue
		}
		// NOTE(review): whence 1 presumably means "relative to the current
		// position", as with io.Seeker; confirm against Iter.Seek.
		ia.Seek(skip, 1)
		ib.Seek(skip, 1)
	}
	// Equivalent only if both inputs ran out at the same time.
	return ia.Done() && ib.Done()
}
// compareTests lists the string pairs that ExampleIter passes to
// EqualSimple and EqualOpt. According to the expected output of
// ExampleIter, every pair except {"aaa", "aab"} compares as equivalent.
var compareTests = []struct{ a, b string }{
{"aaa", "aaa"},
{"aaa", "aab"},
// 'a' followed by a combining mark versus the single rune U+00E0.
{"a\u0300a", "\u00E0a"},
// The same two combining marks, written in opposite order.
{"a\u0300\u0320b", "a\u0320\u0300b"},
{"\u1E0A\u0323", "\x44\u0323\u0307"},
// A character that decomposes into multiple segments
// spans several iterations.
{"\u3304", "\u30A4\u30CB\u30F3\u30AF\u3099"},
}
// ExampleIter runs every pair in compareTests through both EqualSimple and
// EqualOpt and prints the two results next to the pair's index.
func ExampleIter() {
	for idx := range compareTests {
		pair := compareTests[idx]
		simple := EqualSimple(pair.a, pair.b)
		optimized := EqualOpt(pair.a, pair.b)
		fmt.Printf("%d: %v %v\n", idx, simple, optimized)
	}
	// Output:
	// 0: true true
	// 1: false false
	// 2: true true
	// 3: true true
	// 4: true true
	// 5: true true
}
......@@ -50,6 +50,7 @@ type formInfo struct {
form Form
composing, compatibility bool // form type
info lookupFunc
nextMain iterFunc
}
var formTable []*formInfo
......@@ -67,7 +68,9 @@ func init() {
} else {
f.info = lookupInfoNFC
}
f.nextMain = nextDecomposed
if Form(i) == NFC || Form(i) == NFKC {
f.nextMain = nextComposed
f.composing = true
}
}
......@@ -117,6 +120,10 @@ func (p Properties) isInert() bool {
return p.flags&0xf == 0 && p.ccc == 0
}
// multiSegment reports whether p.index lies in the half-open range
// [firstMulti, endMulti).
// NOTE(review): per the commit description this range presumably marks
// decompositions made up of several segments; confirm against the
// generated tables.
func (p Properties) multiSegment() bool {
	if p.index < firstMulti {
		return false
	}
	return p.index < endMulti
}
// Decomposition returns the decomposition for the underlying rune
// or nil if there is none.
func (p Properties) Decomposition() []byte {
......
......@@ -6,91 +6,100 @@ package norm
import "unicode/utf8"
type input interface {
skipASCII(p, max int) int
skipNonStarter(p int) int
appendSlice(buf []byte, s, e int) []byte
copySlice(buf []byte, s, e int)
charinfoNFC(p int) (uint16, int)
charinfoNFKC(p int) (uint16, int)
hangul(p int) rune
type input struct {
str string
bytes []byte
}
type inputString string
func (s inputString) skipASCII(p, max int) int {
for ; p < max && s[p] < utf8.RuneSelf; p++ {
}
return p
}
func (s inputString) skipNonStarter(p int) int {
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
}
return p
func inputBytes(str []byte) input {
return input{bytes: str}
}
func (s inputString) appendSlice(buf []byte, b, e int) []byte {
for i := b; i < e; i++ {
buf = append(buf, s[i])
}
return buf
func inputString(str string) input {
return input{str: str}
}
func (s inputString) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e])
func (in *input) setBytes(str []byte) {
in.str = ""
in.bytes = str
}
func (s inputString) charinfoNFC(p int) (uint16, int) {
return nfcTrie.lookupString(string(s[p:]))
func (in *input) setString(str string) {
in.str = str
in.bytes = nil
}
func (s inputString) charinfoNFKC(p int) (uint16, int) {
return nfkcTrie.lookupString(string(s[p:]))
}
func (s inputString) hangul(p int) rune {
if !isHangulString(string(s[p:])) {
return 0
func (in *input) _byte(p int) byte {
if in.bytes == nil {
return in.str[p]
}
rune, _ := utf8.DecodeRuneInString(string(s[p:]))
return rune
return in.bytes[p]
}
type inputBytes []byte
func (s inputBytes) skipASCII(p, max int) int {
for ; p < max && s[p] < utf8.RuneSelf; p++ {
func (in *input) skipASCII(p, max int) int {
if in.bytes == nil {
for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
}
} else {
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
}
}
return p
}
func (s inputBytes) skipNonStarter(p int) int {
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
func (in *input) skipNonStarter(p int) int {
if in.bytes == nil {
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
}
} else {
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
}
}
return p
}
func (s inputBytes) appendSlice(buf []byte, b, e int) []byte {
return append(buf, s[b:e]...)
func (in *input) appendSlice(buf []byte, b, e int) []byte {
if in.bytes != nil {
return append(buf, in.bytes[b:e]...)
}
for i := b; i < e; i++ {
buf = append(buf, in.str[i])
}
return buf
}
func (s inputBytes) copySlice(buf []byte, b, e int) {
copy(buf, s[b:e])
func (in *input) copySlice(buf []byte, b, e int) int {
if in.bytes == nil {
return copy(buf, in.str[b:e])
}
return copy(buf, in.bytes[b:e])
}
func (s inputBytes) charinfoNFC(p int) (uint16, int) {
return nfcTrie.lookup(s[p:])
func (in *input) charinfoNFC(p int) (uint16, int) {
if in.bytes == nil {
return nfcTrie.lookupString(in.str[p:])
}
return nfcTrie.lookup(in.bytes[p:])
}
func (s inputBytes) charinfoNFKC(p int) (uint16, int) {
return nfkcTrie.lookup(s[p:])
func (in *input) charinfoNFKC(p int) (uint16, int) {
if in.bytes == nil {
return nfkcTrie.lookupString(in.str[p:])
}
return nfkcTrie.lookup(in.bytes[p:])
}
func (s inputBytes) hangul(p int) rune {
if !isHangul(s[p:]) {
func (in *input) hangul(p int) (r rune) {
if in.bytes == nil {
if !isHangulString(in.str[p:]) {
return 0
}
rune, _ := utf8.DecodeRune(s[p:])
return rune
r, _ = utf8.DecodeRuneInString(in.str[p:])
} else {
if !isHangul(in.bytes[p:]) {
return 0
}
r, _ = utf8.DecodeRune(in.bytes[p:])
}
return r
}
This diff is collapsed.
......@@ -9,21 +9,12 @@ import (
"testing"
)
var iterBufSizes = []int{
MaxSegmentSize,
1.5 * MaxSegmentSize,
2 * MaxSegmentSize,
3 * MaxSegmentSize,
100 * MaxSegmentSize,
}
func doIterNorm(f Form, buf []byte, s string) []byte {
func doIterNorm(f Form, s string) []byte {
acc := []byte{}
i := Iter{}
i.SetInputString(f, s)
i.InitString(f, s)
for !i.Done() {
n := i.Next(buf)
acc = append(acc, buf[:n]...)
acc = append(acc, i.Next()...)
}
return acc
}
......@@ -35,17 +26,16 @@ func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bo
if norm {
gold = string(f.AppendString(nil, test.out))
}
for _, sz := range iterBufSizes {
buf := make([]byte, sz)
out := string(doIterNorm(f, buf, in))
out := string(doIterNorm(f, in))
if len(out) != len(gold) {
const msg = "%s:%d:%d: length is %d; want %d"
t.Errorf(msg, name, i, sz, len(out), len(gold))
const msg = "%s:%d: length is %d; want %d"
t.Errorf(msg, name, i, len(out), len(gold))
}
if out != gold {
// Find first rune that differs and show context.
ir := []rune(out)
ig := []rune(gold)
t.Errorf("\n%X != \n%X", ir, ig)
for j := 0; j < len(ir) && j < len(ig); j++ {
if ir[j] == ig[j] {
continue
......@@ -54,56 +44,57 @@ func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bo
j = 0
}
for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
const msg = "%s:%d:%d: runeAt(%d) = %U; want %U"
t.Errorf(msg, name, i, sz, j, ir[j], ig[j])
const msg = "%s:%d: runeAt(%d) = %U; want %U"
t.Errorf(msg, name, i, j, ir[j], ig[j])
}
break
}
}
}
}
}
// rep returns a string made of n copies of the UTF-8 encoding of r.
func rep(r rune, n int) string {
	unit := string(r)
	return strings.Repeat(unit, n)
}
const segSize = maxByteBufferSize
var iterTests = []AppendTest{
{"", ascii, ascii},
{"", txt_all, txt_all},
{"", "a" + rep(0x0300, MaxSegmentSize/2), "a" + rep(0x0300, MaxSegmentSize/2)},
{"", "a" + rep(0x0300, segSize/2), "a" + rep(0x0300, segSize/2)},
}
var iterTestsD = []AppendTest{
{ // segment overflow on unchanged character
"",
"a" + rep(0x0300, MaxSegmentSize/2) + "\u0316",
"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0316\u0300",
"a" + rep(0x0300, segSize/2) + "\u0316",
"a" + rep(0x0300, segSize/2-1) + "\u0316\u0300",
},
{ // segment overflow on unchanged character + start value
"",
"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars+4) + "\u0316",
"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
"a" + rep(0x0300, segSize/2+maxCombiningChars+4) + "\u0316",
"a" + rep(0x0300, segSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
},
{ // segment overflow on decomposition
"",
"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340",
"a" + rep(0x0300, MaxSegmentSize/2),
"a" + rep(0x0300, segSize/2-1) + "\u0340",
"a" + rep(0x0300, segSize/2),
},
{ // segment overflow on decomposition + start value
"",
"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
"a" + rep(0x0300, MaxSegmentSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
"a" + rep(0x0300, segSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
"a" + rep(0x0300, segSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
},
{ // start value after ASCII overflow
"",
rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
rep('a', segSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
rep('a', segSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
},
{ // start value after Hangul overflow
"",
rep(0xAC00, MaxSegmentSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
strings.Repeat("\u1100\u1161", MaxSegmentSize/6) + rep(0x300, maxCombiningChars-1) + "\u0320" + rep(0x300, 3),
rep(0xAC00, segSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
strings.Repeat("\u1100\u1161", segSize/6) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 1),
},
{ // start value after cc=0
"",
......@@ -125,8 +116,8 @@ var iterTestsC = []AppendTest{
},
{ // segment overflow
"",
"a" + rep(0x0305, MaxSegmentSize/2+4) + "\u0316",
"a" + rep(0x0305, MaxSegmentSize/2-1) + "\u0316" + rep(0x305, 5),
"a" + rep(0x0305, segSize/2+4) + "\u0316",
"a" + rep(0x0305, segSize/2-1) + "\u0316" + rep(0x305, 5),
},
}
......@@ -148,27 +139,39 @@ type SegmentTest struct {
}
var segmentTests = []SegmentTest{
{rep('a', MaxSegmentSize), []string{rep('a', MaxSegmentSize), ""}},
{rep('a', MaxSegmentSize+2), []string{rep('a', MaxSegmentSize-1), "aaa", ""}},
{rep('a', MaxSegmentSize) + "\u0300aa", []string{rep('a', MaxSegmentSize-1), "a\u0300", "aa", ""}},
{"\u1E0A\u0323a", []string{"\x44\u0323\u0307", "a", ""}},
{rep('a', segSize), append(strings.Split(rep('a', segSize), ""), "")},
{rep('a', segSize+2), append(strings.Split(rep('a', segSize+2), ""), "")},
{rep('a', segSize) + "\u0300aa",
append(strings.Split(rep('a', segSize-1), ""), "a\u0300", "a", "a", "")},
}
var segmentTestsK = []SegmentTest{
{"\u3332", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u3099", ""}},
// last segment of multi-segment decomposition needs normalization
{"\u3332\u093C", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u093C\u3099", ""}},
// Hangul and Jamo are grouped together.
{"\uAC00", []string{"\u1100\u1161", ""}},
{"\uAC01", []string{"\u1100\u1161\u11A8", ""}},
{"\u1100\u1161", []string{"\u1100\u1161", ""}},
}
// TestIterSegmentation checks the segments produced by Iter for each
// normal form: segmentTests is run under NFD and NFC, segmentTestsK under
// NFKD and NFKC.
//
// Note that, by design, segmentation is equal for composing and decomposing forms.
//
// Each run gets its own label so that a failure message, which is prefixed
// with the label and the table index, identifies both the form and the
// table it came from.
func TestIterSegmentation(t *testing.T) {
	segmentTest(t, "SegmentTestD", NFD, segmentTests)
	segmentTest(t, "SegmentTestC", NFC, segmentTests)
	segmentTest(t, "SegmentTestKD", NFKD, segmentTestsK)
	segmentTest(t, "SegmentTestKC", NFKC, segmentTestsK)
}
func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
iter := Iter{}
for i, tt := range segmentTests {
buf := make([]byte, MaxSegmentSize)
iter.SetInputString(f, tt.in)
for i, tt := range tests {
iter.InitString(f, tt.in)
for j, seg := range tt.out {
if seg == "" {
if !iter.Done() {
n := iter.Next(buf)
res := string(buf[:n])
res := string(iter.Next())
t.Errorf(`%s:%d:%d: expected Done()==true, found segment "%s"`, name, i, j, res)
}
continue
......@@ -176,10 +179,9 @@ func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
if iter.Done() {
t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j)
}
n := iter.Next(buf)
seg = f.String(seg)
if res := string(buf[:n]); res != seg {
t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d)`, name, i, j, res, len(res), seg, len(seg))
if res := string(iter.Next()); res != seg {
t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d) %X %X`, name, i, j, res, len(res), seg, len(seg), []rune(res), []rune(seg))
}
}
}
......
......@@ -574,7 +574,19 @@ func makeEntry(f *FormInfo) uint16 {
// decompSet keeps track of unique decompositions, grouped by whether
// the decomposition is followed by a trailing and/or leading CCC.
type decompSet [4]map[string]bool
type decompSet [6]map[string]bool
// Categories assigned to a decomposition by the classification code that
// follows: it starts from normalDecomp and switches to firstMulti,
// firstCCC, endMulti, firstLeadingCCC or firstCCCZeroExcept depending on
// the decomposition's length and its leading/trailing CCC.
// NOTE(review): the values below lastDecomp presumably double as indices
// into decompSet; confirm against the code that fills it.
const (
normalDecomp = iota
firstMulti
firstCCC
endMulti
firstLeadingCCC
firstCCCZeroExcept
lastDecomp
)
// cname lists the names of the decomposition category constants from
// firstMulti through lastDecomp, in declaration order.
// NOTE(review): presumably used when printing the generated "const (" block;
// confirm against printCharInfoTables.
var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
func makeDecompSet() decompSet {
m := decompSet{}
......@@ -614,20 +626,30 @@ func printCharInfoTables() int {
const msg = "%U: lccc (%d) must be <= tcc (%d)"
logger.Fatalf(msg, r, lccc, tccc)
}
index := 0
index := normalDecomp
if tccc > 0 || lccc > 0 {
s += string([]byte{tccc})
index = 1
index = endMulti
for _, r := range d[1:] {
if ccc(r) == 0 {
index = firstCCC
}
}
if lccc > 0 {
s += string([]byte{lccc})
index = 2
if index == firstCCC {
logger.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
}
index = firstLeadingCCC
}
if cc != lccc {
if cc != 0 {
logger.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
}
index = 3
index = firstCCCZeroExcept
}
} else if len(d) > 1 {
index = firstMulti
}
return index, s
}
......@@ -653,7 +675,6 @@ func printCharInfoTables() int {
size := 0
positionMap := make(map[string]uint16)
decompositions.WriteString("\000")
cname := []string{"firstCCC", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
fmt.Println("const (")
for i, m := range decompSet {
sa := []string{}
......
......@@ -6,6 +6,7 @@ package norm
import (
"bytes"
"io"
"strings"
"testing"
)
......@@ -504,12 +505,35 @@ func appendBench(f Form, in []byte) func() {
}
func iterBench(f Form, in []byte) func() {
buf := make([]byte, 4*len(in))
iter := Iter{}
return func() {
iter.SetInput(f, in)
iter.Init(f, in)
for !iter.Done() {
iter.Next(buf)
iter.Next()
}
}
}
// readerBench returns a benchmark body that reads all of in through
// f.Reader. A scratch buffer of 4*len(in) bytes is allocated once and
// reused by every run. A run ends when Read reports io.EOF; any other
// error makes it panic.
func readerBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		r := f.Reader(bytes.NewReader(in))
		for {
			if _, err := r.Read(buf); err != nil {
				if err != io.EOF {
					panic("")
				}
				return
			}
		}
	}
}
// writerBench returns a benchmark body that writes in through f.Writer
// into a bytes.Buffer. The buffer is created anew on every run from the
// same zero-length slice with capacity 4*len(in). A run panics if Write
// returns an error.
func writerBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		w := f.Writer(bytes.NewBuffer(buf))
		_, err := w.Write(in)
		if err != nil {
			panic("")
		}
	}
}
......@@ -517,6 +541,8 @@ func iterBench(f Form, in []byte) func() {
// appendBenchmarks appends the benchmark bodies for form f and input in to
// bm and returns the extended slice. Only the iterator benchmark is
// currently enabled; the other variants are left commented out.
func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
	//bm = append(bm, appendBench(f, in))
	//bm = append(bm, readerBench(f, in))
	//bm = append(bm, writerBench(f, in))
	return append(bm, iterBench(f, in))
}
......
......@@ -223,13 +223,11 @@ func doTest(t *Test, f norm.Form, gold, test string) {
cmpResult(t, "Bytes", f, gold, test, string(result))
sresult := f.String(test)
cmpResult(t, "String", f, gold, test, sresult)
buf := make([]byte, norm.MaxSegmentSize)
acc := []byte{}
i := norm.Iter{}
i.SetInputString(f, test)
i.InitString(f, test)
for !i.Done() {
n := i.Next(buf)
acc = append(acc, buf[:n]...)
acc = append(acc, i.Next()...)
}
cmpResult(t, "Iter.Next", f, gold, test, string(acc))
for i := range test {
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment