Commit cfcc3ebf authored by Marcel van Lohuizen

exp/norm: changed API of Iter.

Motivations:
 - Simpler UI. Previous API proved a bit awkward for practical purposes.
 - Iter is often used in cases where one wants to be able to bail out early.
   The old implementation had too much look-ahead to be efficient.
Disadvantages:
 - ASCII performance is bad. This is unavoidable for tiny iterations.
   Example is included to show how to work around this.

Description:
Iter now iterates per boundary/segment. It returns a slice of bytes that
either points to the input bytes, the internal decomposition strings,
or the small internal buffer that each iterator has. In many cases, copying
bytes is avoided.
The method Seek was added to support jumping around the input without
having to reinitialize.

Details:
 - Table adjustments: some decompositions consist of multiple segments.
   Decompositions that are of this type are now marked so that Iter can
   handle them separately.
 - The old iterator had a different next function for different normal forms
   that was assigned to a function pointer called by Next.
   The new iterator uses this mechanism to switch between different modes
   for handling different types of input as well. This greatly improves
   performance for Hangul and ASCII. It is also used for multi-segment
   decompositions.
 - input is now a struct of string and []byte, instead of an interface.
   This simplifies optimizing the ASCII case.

R=rsc
CC=golang-dev
https://golang.org/cl/6873072
parent 9aa70984
...@@ -28,24 +28,20 @@ type reorderBuffer struct { ...@@ -28,24 +28,20 @@ type reorderBuffer struct {
nbyte uint8 // Number or bytes. nbyte uint8 // Number or bytes.
f formInfo f formInfo
src input src input
nsrc int nsrc int
srcBytes inputBytes tmpBytes input
srcString inputString
tmpBytes inputBytes
} }
func (rb *reorderBuffer) init(f Form, src []byte) { func (rb *reorderBuffer) init(f Form, src []byte) {
rb.f = *formTable[f] rb.f = *formTable[f]
rb.srcBytes = inputBytes(src) rb.src.setBytes(src)
rb.src = &rb.srcBytes
rb.nsrc = len(src) rb.nsrc = len(src)
} }
func (rb *reorderBuffer) initString(f Form, src string) { func (rb *reorderBuffer) initString(f Form, src string) {
rb.f = *formTable[f] rb.f = *formTable[f]
rb.srcString = inputString(src) rb.src.setString(src)
rb.src = &rb.srcString
rb.nsrc = len(src) rb.nsrc = len(src)
} }
...@@ -121,9 +117,9 @@ func (rb *reorderBuffer) insert(src input, i int, info Properties) bool { ...@@ -121,9 +117,9 @@ func (rb *reorderBuffer) insert(src input, i int, info Properties) bool {
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes. // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool { func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
saveNrune, saveNbyte := rb.nrune, rb.nbyte saveNrune, saveNbyte := rb.nrune, rb.nbyte
rb.tmpBytes = inputBytes(dcomp) rb.tmpBytes.setBytes(dcomp)
for i := 0; i < len(dcomp); { for i := 0; i < len(dcomp); {
info := rb.f.info(&rb.tmpBytes, i) info := rb.f.info(rb.tmpBytes, i)
pos := rb.nbyte pos := rb.nbyte
if !rb.insertOrdered(info) { if !rb.insertOrdered(info) {
rb.nrune, rb.nbyte = saveNrune, saveNbyte rb.nrune, rb.nbyte = saveNrune, saveNbyte
......
...@@ -81,7 +81,7 @@ func flushF(rb *reorderBuffer) []byte { ...@@ -81,7 +81,7 @@ func flushF(rb *reorderBuffer) []byte {
} }
func flushCopyF(rb *reorderBuffer) []byte { func flushCopyF(rb *reorderBuffer) []byte {
out := make([]byte, MaxSegmentSize) out := make([]byte, maxByteBufferSize)
n := rb.flushCopy(out) n := rb.flushCopy(out)
return out[:n] return out[:n]
} }
......
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm_test
import (
"bytes"
"exp/norm"
"fmt"
"unicode/utf8"
)
// EqualSimple reports whether a and b are equivalent once both are
// decomposed with NFKD. Neither string needs to be normalized beforehand:
// each is walked with its own norm.Iter and the results of successive Next
// calls are compared pairwise.
func EqualSimple(a, b string) bool {
	var ia, ib norm.Iter
	ia.InitString(norm.NFKD, a)
	ib.InitString(norm.NFKD, b)
	for {
		if ia.Done() || ib.Done() {
			break
		}
		segA, segB := ia.Next(), ib.Next()
		if !bytes.Equal(segA, segB) {
			return false
		}
	}
	// The strings match only if both iterators ran out at the same time;
	// otherwise one input has leftover segments.
	return ia.Done() && ib.Done()
}
// FindPrefix returns the length, in bytes, of the longest prefix that a and b
// share and that consists solely of ASCII bytes (values below utf8.RuneSelf).
func FindPrefix(a, b string) int {
	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}
	for i := 0; i < limit; i++ {
		// Stop at the first non-ASCII byte or the first mismatch.
		if a[i] >= utf8.RuneSelf || a[i] != b[i] {
			return i
		}
	}
	return limit
}
// EqualOpt performs the same comparison as EqualSimple but avoids running
// the iterators over runs of identical ASCII bytes: such runs are measured
// with FindPrefix and skipped, both before iteration starts and after every
// compared pair of segments.
func EqualOpt(a, b string) bool {
	// Drop the shared leading ASCII run so the iterators never see it.
	start := FindPrefix(a, b)
	a = a[start:]
	b = b[start:]

	var ia, ib norm.Iter
	ia.InitString(norm.NFKD, a)
	ib.InitString(norm.NFKD, b)
	for !ia.Done() && !ib.Done() {
		segA, segB := ia.Next(), ib.Next()
		if !bytes.Equal(segA, segB) {
			return false
		}
		// Measure the identical ASCII run that follows the current positions.
		skip := int64(FindPrefix(a[ia.Pos():], b[ib.Pos():]))
		if skip == 0 {
			continue
		}
		// Whence 1 is presumably "relative to the current position",
		// as with io.Seeker — confirm against Iter.Seek.
		ia.Seek(skip, 1)
		ib.Seek(skip, 1)
	}
	return ia.Done() && ib.Done()
}
// compareTests holds the string pairs that ExampleIter feeds to EqualSimple
// and EqualOpt.
var compareTests = []struct{ a, b string }{
	// Identical ASCII strings.
	{a: "aaa", b: "aaa"},
	// ASCII strings that differ in their last byte.
	{a: "aaa", b: "aab"},
	// 'a' followed by U+0300 versus the single code point U+00E0.
	{a: "a\u0300a", b: "\u00E0a"},
	// The same two combining marks written in opposite order.
	{a: "a\u0300\u0320b", b: "a\u0320\u0300b"},
	// U+1E0A followed by U+0323 versus 'D' followed by U+0323 and U+0307.
	{a: "\u1E0A\u0323", b: "\x44\u0323\u0307"},
	// A character that decomposes into multiple segments
	// spans several iterations.
	{a: "\u3304", b: "\u30A4\u30CB\u30F3\u30AF\u3099"},
}
// ExampleIter runs every pair in compareTests through both EqualSimple and
// EqualOpt and prints the index of the pair followed by the two verdicts.
func ExampleIter() {
	for idx := range compareTests {
		pair := compareTests[idx]
		simple := EqualSimple(pair.a, pair.b)
		opt := EqualOpt(pair.a, pair.b)
		fmt.Printf("%d: %v %v\n", idx, simple, opt)
	}
	// Output:
	// 0: true true
	// 1: false false
	// 2: true true
	// 3: true true
	// 4: true true
	// 5: true true
}
...@@ -50,6 +50,7 @@ type formInfo struct { ...@@ -50,6 +50,7 @@ type formInfo struct {
form Form form Form
composing, compatibility bool // form type composing, compatibility bool // form type
info lookupFunc info lookupFunc
nextMain iterFunc
} }
var formTable []*formInfo var formTable []*formInfo
...@@ -67,7 +68,9 @@ func init() { ...@@ -67,7 +68,9 @@ func init() {
} else { } else {
f.info = lookupInfoNFC f.info = lookupInfoNFC
} }
f.nextMain = nextDecomposed
if Form(i) == NFC || Form(i) == NFKC { if Form(i) == NFC || Form(i) == NFKC {
f.nextMain = nextComposed
f.composing = true f.composing = true
} }
} }
...@@ -117,6 +120,10 @@ func (p Properties) isInert() bool { ...@@ -117,6 +120,10 @@ func (p Properties) isInert() bool {
return p.flags&0xf == 0 && p.ccc == 0 return p.flags&0xf == 0 && p.ccc == 0
} }
// multiSegment reports whether p.index lies in the half-open range
// [firstMulti, endMulti).
// NOTE(review): per the commit description, decompositions consisting of
// multiple segments are marked in the tables; this range presumably is that
// marking — confirm against the generated table constants.
func (p Properties) multiSegment() bool {
return p.index >= firstMulti && p.index < endMulti
}
// Decomposition returns the decomposition for the underlying rune // Decomposition returns the decomposition for the underlying rune
// or nil if there is none. // or nil if there is none.
func (p Properties) Decomposition() []byte { func (p Properties) Decomposition() []byte {
......
...@@ -6,91 +6,100 @@ package norm ...@@ -6,91 +6,100 @@ package norm
import "unicode/utf8" import "unicode/utf8"
type input interface { type input struct {
skipASCII(p, max int) int str string
skipNonStarter(p int) int bytes []byte
appendSlice(buf []byte, s, e int) []byte
copySlice(buf []byte, s, e int)
charinfoNFC(p int) (uint16, int)
charinfoNFKC(p int) (uint16, int)
hangul(p int) rune
} }
type inputString string func inputBytes(str []byte) input {
return input{bytes: str}
func (s inputString) skipASCII(p, max int) int {
for ; p < max && s[p] < utf8.RuneSelf; p++ {
}
return p
}
func (s inputString) skipNonStarter(p int) int {
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
}
return p
}
func (s inputString) appendSlice(buf []byte, b, e int) []byte {
for i := b; i < e; i++ {
buf = append(buf, s[i])
}
return buf
} }
func (s inputString) copySlice(buf []byte, b, e int) { func inputString(str string) input {
copy(buf, s[b:e]) return input{str: str}
} }
func (s inputString) charinfoNFC(p int) (uint16, int) { func (in *input) setBytes(str []byte) {
return nfcTrie.lookupString(string(s[p:])) in.str = ""
in.bytes = str
} }
func (s inputString) charinfoNFKC(p int) (uint16, int) { func (in *input) setString(str string) {
return nfkcTrie.lookupString(string(s[p:])) in.str = str
in.bytes = nil
} }
func (s inputString) hangul(p int) rune { func (in *input) _byte(p int) byte {
if !isHangulString(string(s[p:])) { if in.bytes == nil {
return 0 return in.str[p]
} }
rune, _ := utf8.DecodeRuneInString(string(s[p:])) return in.bytes[p]
return rune
} }
type inputBytes []byte func (in *input) skipASCII(p, max int) int {
if in.bytes == nil {
func (s inputBytes) skipASCII(p, max int) int { for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
for ; p < max && s[p] < utf8.RuneSelf; p++ { }
} else {
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
}
} }
return p return p
} }
func (s inputBytes) skipNonStarter(p int) int { func (in *input) skipNonStarter(p int) int {
for ; p < len(s) && !utf8.RuneStart(s[p]); p++ { if in.bytes == nil {
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
}
} else {
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
}
} }
return p return p
} }
func (s inputBytes) appendSlice(buf []byte, b, e int) []byte { func (in *input) appendSlice(buf []byte, b, e int) []byte {
return append(buf, s[b:e]...) if in.bytes != nil {
return append(buf, in.bytes[b:e]...)
}
for i := b; i < e; i++ {
buf = append(buf, in.str[i])
}
return buf
} }
func (s inputBytes) copySlice(buf []byte, b, e int) { func (in *input) copySlice(buf []byte, b, e int) int {
copy(buf, s[b:e]) if in.bytes == nil {
return copy(buf, in.str[b:e])
}
return copy(buf, in.bytes[b:e])
} }
func (s inputBytes) charinfoNFC(p int) (uint16, int) { func (in *input) charinfoNFC(p int) (uint16, int) {
return nfcTrie.lookup(s[p:]) if in.bytes == nil {
return nfcTrie.lookupString(in.str[p:])
}
return nfcTrie.lookup(in.bytes[p:])
} }
func (s inputBytes) charinfoNFKC(p int) (uint16, int) { func (in *input) charinfoNFKC(p int) (uint16, int) {
return nfkcTrie.lookup(s[p:]) if in.bytes == nil {
return nfkcTrie.lookupString(in.str[p:])
}
return nfkcTrie.lookup(in.bytes[p:])
} }
func (s inputBytes) hangul(p int) rune { func (in *input) hangul(p int) (r rune) {
if !isHangul(s[p:]) { if in.bytes == nil {
return 0 if !isHangulString(in.str[p:]) {
return 0
}
r, _ = utf8.DecodeRuneInString(in.str[p:])
} else {
if !isHangul(in.bytes[p:]) {
return 0
}
r, _ = utf8.DecodeRune(in.bytes[p:])
} }
rune, _ := utf8.DecodeRune(s[p:]) return r
return rune
} }
This diff is collapsed.
...@@ -9,21 +9,12 @@ import ( ...@@ -9,21 +9,12 @@ import (
"testing" "testing"
) )
var iterBufSizes = []int{ func doIterNorm(f Form, s string) []byte {
MaxSegmentSize,
1.5 * MaxSegmentSize,
2 * MaxSegmentSize,
3 * MaxSegmentSize,
100 * MaxSegmentSize,
}
func doIterNorm(f Form, buf []byte, s string) []byte {
acc := []byte{} acc := []byte{}
i := Iter{} i := Iter{}
i.SetInputString(f, s) i.InitString(f, s)
for !i.Done() { for !i.Done() {
n := i.Next(buf) acc = append(acc, i.Next()...)
acc = append(acc, buf[:n]...)
} }
return acc return acc
} }
...@@ -35,30 +26,28 @@ func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bo ...@@ -35,30 +26,28 @@ func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bo
if norm { if norm {
gold = string(f.AppendString(nil, test.out)) gold = string(f.AppendString(nil, test.out))
} }
for _, sz := range iterBufSizes { out := string(doIterNorm(f, in))
buf := make([]byte, sz) if len(out) != len(gold) {
out := string(doIterNorm(f, buf, in)) const msg = "%s:%d: length is %d; want %d"
if len(out) != len(gold) { t.Errorf(msg, name, i, len(out), len(gold))
const msg = "%s:%d:%d: length is %d; want %d" }
t.Errorf(msg, name, i, sz, len(out), len(gold)) if out != gold {
} // Find first rune that differs and show context.
if out != gold { ir := []rune(out)
// Find first rune that differs and show context. ig := []rune(gold)
ir := []rune(out) t.Errorf("\n%X != \n%X", ir, ig)
ig := []rune(gold) for j := 0; j < len(ir) && j < len(ig); j++ {
for j := 0; j < len(ir) && j < len(ig); j++ { if ir[j] == ig[j] {
if ir[j] == ig[j] { continue
continue }
} if j -= 3; j < 0 {
if j -= 3; j < 0 { j = 0
j = 0
}
for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
const msg = "%s:%d:%d: runeAt(%d) = %U; want %U"
t.Errorf(msg, name, i, sz, j, ir[j], ig[j])
}
break
} }
for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
const msg = "%s:%d: runeAt(%d) = %U; want %U"
t.Errorf(msg, name, i, j, ir[j], ig[j])
}
break
} }
} }
} }
...@@ -68,42 +57,44 @@ func rep(r rune, n int) string { ...@@ -68,42 +57,44 @@ func rep(r rune, n int) string {
return strings.Repeat(string(r), n) return strings.Repeat(string(r), n)
} }
const segSize = maxByteBufferSize
var iterTests = []AppendTest{ var iterTests = []AppendTest{
{"", ascii, ascii}, {"", ascii, ascii},
{"", txt_all, txt_all}, {"", txt_all, txt_all},
{"", "a" + rep(0x0300, MaxSegmentSize/2), "a" + rep(0x0300, MaxSegmentSize/2)}, {"", "a" + rep(0x0300, segSize/2), "a" + rep(0x0300, segSize/2)},
} }
var iterTestsD = []AppendTest{ var iterTestsD = []AppendTest{
{ // segment overflow on unchanged character { // segment overflow on unchanged character
"", "",
"a" + rep(0x0300, MaxSegmentSize/2) + "\u0316", "a" + rep(0x0300, segSize/2) + "\u0316",
"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0316\u0300", "a" + rep(0x0300, segSize/2-1) + "\u0316\u0300",
}, },
{ // segment overflow on unchanged character + start value { // segment overflow on unchanged character + start value
"", "",
"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars+4) + "\u0316", "a" + rep(0x0300, segSize/2+maxCombiningChars+4) + "\u0316",
"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4), "a" + rep(0x0300, segSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
}, },
{ // segment overflow on decomposition { // segment overflow on decomposition
"", "",
"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340", "a" + rep(0x0300, segSize/2-1) + "\u0340",
"a" + rep(0x0300, MaxSegmentSize/2), "a" + rep(0x0300, segSize/2),
}, },
{ // segment overflow on decomposition + start value { // segment overflow on decomposition + start value
"", "",
"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320", "a" + rep(0x0300, segSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
"a" + rep(0x0300, MaxSegmentSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4), "a" + rep(0x0300, segSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
}, },
{ // start value after ASCII overflow { // start value after ASCII overflow
"", "",
rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars+2) + "\u0320", rep('a', segSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300", rep('a', segSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
}, },
{ // start value after Hangul overflow { // start value after Hangul overflow
"", "",
rep(0xAC00, MaxSegmentSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320", rep(0xAC00, segSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
strings.Repeat("\u1100\u1161", MaxSegmentSize/6) + rep(0x300, maxCombiningChars-1) + "\u0320" + rep(0x300, 3), strings.Repeat("\u1100\u1161", segSize/6) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 1),
}, },
{ // start value after cc=0 { // start value after cc=0
"", "",
...@@ -125,8 +116,8 @@ var iterTestsC = []AppendTest{ ...@@ -125,8 +116,8 @@ var iterTestsC = []AppendTest{
}, },
{ // segment overflow { // segment overflow
"", "",
"a" + rep(0x0305, MaxSegmentSize/2+4) + "\u0316", "a" + rep(0x0305, segSize/2+4) + "\u0316",
"a" + rep(0x0305, MaxSegmentSize/2-1) + "\u0316" + rep(0x305, 5), "a" + rep(0x0305, segSize/2-1) + "\u0316" + rep(0x305, 5),
}, },
} }
...@@ -148,27 +139,39 @@ type SegmentTest struct { ...@@ -148,27 +139,39 @@ type SegmentTest struct {
} }
var segmentTests = []SegmentTest{ var segmentTests = []SegmentTest{
{rep('a', MaxSegmentSize), []string{rep('a', MaxSegmentSize), ""}}, {"\u1E0A\u0323a", []string{"\x44\u0323\u0307", "a", ""}},
{rep('a', MaxSegmentSize+2), []string{rep('a', MaxSegmentSize-1), "aaa", ""}}, {rep('a', segSize), append(strings.Split(rep('a', segSize), ""), "")},
{rep('a', MaxSegmentSize) + "\u0300aa", []string{rep('a', MaxSegmentSize-1), "a\u0300", "aa", ""}}, {rep('a', segSize+2), append(strings.Split(rep('a', segSize+2), ""), "")},
{rep('a', segSize) + "\u0300aa",
append(strings.Split(rep('a', segSize-1), ""), "a\u0300", "a", "a", "")},
}
// segmentTestsK lists inputs together with the sequence of segments Iter is
// expected to return for them. TestIterSegmentation runs it for NFKD and
// NFKC. A trailing "" entry means the iterator must report Done at that
// point.
var segmentTestsK = []SegmentTest{
{"\u3332", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u3099", ""}},
// last segment of multi-segment decomposition needs normalization
{"\u3332\u093C", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u093C\u3099", ""}},
// Hangul and Jamo are grouped together.
{"\uAC00", []string{"\u1100\u1161", ""}},
{"\uAC01", []string{"\u1100\u1161\u11A8", ""}},
{"\u1100\u1161", []string{"\u1100\u1161", ""}},
}
// Note that, by design, segmentation is equal for composing and decomposing forms. // Note that, by design, segmentation is equal for composing and decomposing forms.
func TestIterSegmentation(t *testing.T) { func TestIterSegmentation(t *testing.T) {
segmentTest(t, "SegmentTestD", NFD, segmentTests) segmentTest(t, "SegmentTestD", NFD, segmentTests)
segmentTest(t, "SegmentTestC", NFC, segmentTests) segmentTest(t, "SegmentTestC", NFC, segmentTests)
segmentTest(t, "SegmentTestD", NFKD, segmentTestsK)
segmentTest(t, "SegmentTestC", NFKC, segmentTestsK)
} }
func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) { func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
iter := Iter{} iter := Iter{}
for i, tt := range segmentTests { for i, tt := range tests {
buf := make([]byte, MaxSegmentSize) iter.InitString(f, tt.in)
iter.SetInputString(f, tt.in)
for j, seg := range tt.out { for j, seg := range tt.out {
if seg == "" { if seg == "" {
if !iter.Done() { if !iter.Done() {
n := iter.Next(buf) res := string(iter.Next())
res := string(buf[:n])
t.Errorf(`%s:%d:%d: expected Done()==true, found segment "%s"`, name, i, j, res) t.Errorf(`%s:%d:%d: expected Done()==true, found segment "%s"`, name, i, j, res)
} }
continue continue
...@@ -176,10 +179,9 @@ func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) { ...@@ -176,10 +179,9 @@ func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
if iter.Done() { if iter.Done() {
t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j) t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j)
} }
n := iter.Next(buf)
seg = f.String(seg) seg = f.String(seg)
if res := string(buf[:n]); res != seg { if res := string(iter.Next()); res != seg {
t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d)`, name, i, j, res, len(res), seg, len(seg)) t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d) %X %X`, name, i, j, res, len(res), seg, len(seg), []rune(res), []rune(seg))
} }
} }
} }
......
...@@ -574,7 +574,19 @@ func makeEntry(f *FormInfo) uint16 { ...@@ -574,7 +574,19 @@ func makeEntry(f *FormInfo) uint16 {
// decompSet keeps track of unique decompositions, grouped by whether // decompSet keeps track of unique decompositions, grouped by whether
// the decomposition is followed by a trailing and/or leading CCC. // the decomposition is followed by a trailing and/or leading CCC.
type decompSet [4]map[string]bool type decompSet [6]map[string]bool
// Group indices under which a decomposition is classified; the
// classification code in printCharInfoTables returns one of these as its
// index result.
// NOTE(review): decompSet is declared with 6 maps while lastDecomp is 6, so
// lastDecomp is presumably an end marker rather than a usable group — confirm.
const (
// Default group: no trailing or leading CCC and a single-rune decomposition.
normalDecomp = iota
// No trailing or leading CCC, but the decomposition has more than one rune.
firstMulti
// Trailing or leading CCC is non-zero and some rune after the first has ccc 0.
firstCCC
// Initial group when the trailing or leading CCC is non-zero.
endMulti
// Leading CCC is non-zero.
firstLeadingCCC
// The rune's own ccc differs from the leading CCC (its ccc must then be 0).
firstCCCZeroExcept
lastDecomp
)
var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
func makeDecompSet() decompSet { func makeDecompSet() decompSet {
m := decompSet{} m := decompSet{}
...@@ -614,20 +626,30 @@ func printCharInfoTables() int { ...@@ -614,20 +626,30 @@ func printCharInfoTables() int {
const msg = "%U: lccc (%d) must be <= tcc (%d)" const msg = "%U: lccc (%d) must be <= tcc (%d)"
logger.Fatalf(msg, r, lccc, tccc) logger.Fatalf(msg, r, lccc, tccc)
} }
index := 0 index := normalDecomp
if tccc > 0 || lccc > 0 { if tccc > 0 || lccc > 0 {
s += string([]byte{tccc}) s += string([]byte{tccc})
index = 1 index = endMulti
for _, r := range d[1:] {
if ccc(r) == 0 {
index = firstCCC
}
}
if lccc > 0 { if lccc > 0 {
s += string([]byte{lccc}) s += string([]byte{lccc})
index = 2 if index == firstCCC {
logger.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
}
index = firstLeadingCCC
} }
if cc != lccc { if cc != lccc {
if cc != 0 { if cc != 0 {
logger.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc) logger.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
} }
index = 3 index = firstCCCZeroExcept
} }
} else if len(d) > 1 {
index = firstMulti
} }
return index, s return index, s
} }
...@@ -653,7 +675,6 @@ func printCharInfoTables() int { ...@@ -653,7 +675,6 @@ func printCharInfoTables() int {
size := 0 size := 0
positionMap := make(map[string]uint16) positionMap := make(map[string]uint16)
decompositions.WriteString("\000") decompositions.WriteString("\000")
cname := []string{"firstCCC", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
fmt.Println("const (") fmt.Println("const (")
for i, m := range decompSet { for i, m := range decompSet {
sa := []string{} sa := []string{}
......
...@@ -6,6 +6,7 @@ package norm ...@@ -6,6 +6,7 @@ package norm
import ( import (
"bytes" "bytes"
"io"
"strings" "strings"
"testing" "testing"
) )
...@@ -504,12 +505,35 @@ func appendBench(f Form, in []byte) func() { ...@@ -504,12 +505,35 @@ func appendBench(f Form, in []byte) func() {
} }
func iterBench(f Form, in []byte) func() { func iterBench(f Form, in []byte) func() {
buf := make([]byte, 4*len(in))
iter := Iter{} iter := Iter{}
return func() { return func() {
iter.SetInput(f, in) iter.Init(f, in)
for !iter.Done() { for !iter.Done() {
iter.Next(buf) iter.Next()
}
}
}
// readerBench returns a benchmark body that wraps in in a bytes.Reader,
// passes it through f.Reader and reads the result into a scratch buffer until
// Read fails. Any error other than io.EOF makes the body panic.
func readerBench(f Form, in []byte) func() {
	// The scratch buffer is allocated once and shared by every run of the body.
	buf := make([]byte, 4*len(in))
	return func() {
		r := f.Reader(bytes.NewReader(in))
		for {
			_, err := r.Read(buf)
			if err == nil {
				continue
			}
			if err != io.EOF {
				panic("")
			}
			return
		}
	}
}
func writerBench(f Form, in []byte) func() {
buf := make([]byte, 0, 4*len(in))
return func() {
r := f.Writer(bytes.NewBuffer(buf))
if _, err := r.Write(in); err != nil {
panic("")
} }
} }
} }
...@@ -517,6 +541,8 @@ func iterBench(f Form, in []byte) func() { ...@@ -517,6 +541,8 @@ func iterBench(f Form, in []byte) func() {
func appendBenchmarks(bm []func(), f Form, in []byte) []func() { func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
//bm = append(bm, appendBench(f, in)) //bm = append(bm, appendBench(f, in))
bm = append(bm, iterBench(f, in)) bm = append(bm, iterBench(f, in))
//bm = append(bm, readerBench(f, in))
//bm = append(bm, writerBench(f, in))
return bm return bm
} }
......
...@@ -223,13 +223,11 @@ func doTest(t *Test, f norm.Form, gold, test string) { ...@@ -223,13 +223,11 @@ func doTest(t *Test, f norm.Form, gold, test string) {
cmpResult(t, "Bytes", f, gold, test, string(result)) cmpResult(t, "Bytes", f, gold, test, string(result))
sresult := f.String(test) sresult := f.String(test)
cmpResult(t, "String", f, gold, test, sresult) cmpResult(t, "String", f, gold, test, sresult)
buf := make([]byte, norm.MaxSegmentSize)
acc := []byte{} acc := []byte{}
i := norm.Iter{} i := norm.Iter{}
i.SetInputString(f, test) i.InitString(f, test)
for !i.Done() { for !i.Done() {
n := i.Next(buf) acc = append(acc, i.Next()...)
acc = append(acc, buf[:n]...)
} }
cmpResult(t, "Iter.Next", f, gold, test, string(acc)) cmpResult(t, "Iter.Next", f, gold, test, string(acc))
for i := range test { for i := range test {
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment