exp/norm: changed API of Iter.

Motivations: - Simpler UI. Previous API proved a bit awkward for practical purposes. - Iter is often used in cases where one wants to be able to bail out early. The old implementation had too much look-ahead to be efficient. Disadvantages: - ASCII performance is bad. This is unavoidable for tiny iterations. Example is included to show how to work around this. Description: Iter now iterates per boundary/segment. It returns a slice of bytes that either points to the input bytes, the internal decomposition strings, or the small internal buffer that each iterator has. In many cases, copying bytes is avoided. The method Seek was added to support jumping around the input without having to reinitialize. Details: - Table adjustments: some decompositions consist of multiple segments. Decompositions that are of this type are now marked so that Iter can handle them separately. - The old iterator had a different next function for different normal forms that was assigned to a function pointer called by Next. The new iterator uses this mechanism to switch between different modes for handling different types of input as well. This greatly improves performance for Hangul and ASCII. It is also used for multi-segment decompositions. - input is now a struct of string and []byte, instead of an interface. This simplifies optimizing the ASCII case. R=rsc CC=golang-dev https://golang.org/cl/6873072

exp/norm: changed API of Iter.
Motivations: - Simpler UI. Previous API proved a bit awkward for practical purposes. - Iter is often used in cases where one wants to be able to bail out early. The old implementation had too much look-ahead to be efficient. Disadvantages: - ASCII performance is bad. This is unavoidable for tiny iterations. Example is included to show how to work around this. Description: Iter now iterates per boundary/segment. It returns a slice of bytes that either points to the input bytes, the internal decomposition strings, or the small internal buffer that each iterator has. In many cases, copying bytes is avoided. The method Seek was added to support jumping around the input without having to reinitialize. Details: - Table adjustments: some decompositions consist of multiple segments. Decompositions that are of this type are now marked so that Iter can handle them separately. - The old iterator had a different next function for different normal forms that was assigned to a function pointer called by Next. The new iterator uses this mechanism to switch between different modes for handling different types of input as well. This greatly improves performance for Hangul and ASCII. It is also used for multi-segment decompositions. - input is now a struct of string and []byte, instead of an interface. This simplifies optimizing the ASCII case. R=rsc CC=golang-dev https://golang.org/cl/6873072
cfcc3ebf · Marcel van Lohuizen · 9aa70984 · cfcc3ebf · cfcc3ebf · cfcc3ebf
Commit cfcc3ebf authored Dec 24, 2012 by Marcel van Lohuizen
11 changed files
--- a/src/pkg/exp/norm/composition.go
+++ b/src/pkg/exp/norm/composition.go
@@ -28,24 +28,20 @@ type reorderBuffer struct {
 	nbyte uint8                     // Number or bytes.
 	f     formInfo
-	src       input
+	src      input
-	nsrc      int
+	nsrc     int
-	srcBytes  inputBytes
+	tmpBytes input
-	srcString inputString
-	tmpBytes  inputBytes
 }
 func (rb *reorderBuffer) init(f Form, src []byte) {
 	rb.f = *formTable[f]
-	rb.srcBytes = inputBytes(src)
+	rb.src.setBytes(src)
-	rb.src = &rb.srcBytes
 	rb.nsrc = len(src)
 }
 func (rb *reorderBuffer) initString(f Form, src string) {
 	rb.f = *formTable[f]
-	rb.srcString = inputString(src)
+	rb.src.setString(src)
-	rb.src = &rb.srcString
 	rb.nsrc = len(src)
 }
@@ -121,9 +117,9 @@ func (rb *reorderBuffer) insert(src input, i int, info Properties) bool {
 // in dcomp.  dcomp must be a sequence of decomposed UTF-8-encoded runes.
 func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
 	saveNrune, saveNbyte := rb.nrune, rb.nbyte
-	rb.tmpBytes = inputBytes(dcomp)
+	rb.tmpBytes.setBytes(dcomp)
 	for i := 0; i < len(dcomp); {
-		info := rb.f.info(&rb.tmpBytes, i)
+		info := rb.f.info(rb.tmpBytes, i)
 		pos := rb.nbyte
 		if !rb.insertOrdered(info) {
 			rb.nrune, rb.nbyte = saveNrune, saveNbyte

--- a/src/pkg/exp/norm/composition_test.go
+++ b/src/pkg/exp/norm/composition_test.go
@@ -81,7 +81,7 @@ func flushF(rb *reorderBuffer) []byte {
 }
 func flushCopyF(rb *reorderBuffer) []byte {
-	out := make([]byte, MaxSegmentSize)
+	out := make([]byte, maxByteBufferSize)
 	n := rb.flushCopy(out)
 	return out[:n]
 }

--- a/src/pkg/exp/norm/example_iter_test.go
+++ b/src/pkg/exp/norm/example_iter_test.go
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+package norm_test
+import (
+	"bytes"
+	"exp/norm"
+	"fmt"
+	"unicode/utf8"
+)
+// EqualSimple uses a norm.Iter to compare two non-normalized
+// strings for equivalence.
+func EqualSimple(a, b string) bool {
+	var ia, ib norm.Iter
+	ia.InitString(norm.NFKD, a)
+	ib.InitString(norm.NFKD, b)
+	for !ia.Done() && !ib.Done() {
+		if !bytes.Equal(ia.Next(), ib.Next()) {
+			return false
+		}
+	}
+	return ia.Done() && ib.Done()
+}
+// FindPrefix finds the longest common prefix of ASCII characters
+// of a and b.
+func FindPrefix(a, b string) int {
+	i := 0
+	for ; i < len(a) && i < len(b) && a[i] < utf8.RuneSelf && a[i] == b[i]; i++ {
+	}
+	return i
+}
+// EqualOpt is like EqualSimple, but optimizes the special
+// case for ASCII characters.
+func EqualOpt(a, b string) bool {
+	n := FindPrefix(a, b)
+	a, b = a[n:], b[n:]
+	var ia, ib norm.Iter
+	ia.InitString(norm.NFKD, a)
+	ib.InitString(norm.NFKD, b)
+	for !ia.Done() && !ib.Done() {
+		if !bytes.Equal(ia.Next(), ib.Next()) {
+			return false
+		}
+		if n := int64(FindPrefix(a[ia.Pos():], b[ib.Pos():])); n != 0 {
+			ia.Seek(n, 1)
+			ib.Seek(n, 1)
+		}
+	}
+	return ia.Done() && ib.Done()
+}
+var compareTests = []struct{ a, b string }{
+	{"aaa", "aaa"},
+	{"aaa", "aab"},
+	{"a\u0300a", "\u00E0a"},
+	{"a\u0300\u0320b", "a\u0320\u0300b"},
+	{"\u1E0A\u0323", "\x44\u0323\u0307"},
+	// A character that decomposes into multiple segments
+	// spans several iterations.
+	{"\u3304", "\u30A4\u30CB\u30F3\u30AF\u3099"},
+}
+func ExampleIter() {
+	for i, t := range compareTests {
+		r0 := EqualSimple(t.a, t.b)
+		r1 := EqualOpt(t.a, t.b)
+		fmt.Printf("%d: %v %v\n", i, r0, r1)
+	}
+	// Output:
+	// 0: true true
+	// 1: false false
+	// 2: true true
+	// 3: true true
+	// 4: true true
+	// 5: true true
+}
--- a/src/pkg/exp/norm/forminfo.go
+++ b/src/pkg/exp/norm/forminfo.go
@@ -50,6 +50,7 @@ type formInfo struct {
 	form                     Form
 	composing, compatibility bool // form type
 	info                     lookupFunc
+	nextMain                 iterFunc
 }
 var formTable []*formInfo
@@ -67,7 +68,9 @@ func init() {
 		} else {
 			f.info = lookupInfoNFC
 		}
+		f.nextMain = nextDecomposed
 		if Form(i) == NFC || Form(i) == NFKC {
+			f.nextMain = nextComposed
 			f.composing = true
 		}
 	}
@@ -117,6 +120,10 @@ func (p Properties) isInert() bool {
 	return p.flags&0xf == 0 && p.ccc == 0
 }
+func (p Properties) multiSegment() bool {
+	return p.index >= firstMulti && p.index < endMulti
+}
 // Decomposition returns the decomposition for the underlying rune
 // or nil if there is none.
 func (p Properties) Decomposition() []byte {

--- a/src/pkg/exp/norm/input.go
+++ b/src/pkg/exp/norm/input.go
@@ -6,91 +6,100 @@ package norm
 import "unicode/utf8"
-type input interface {
+type input struct {
-	skipASCII(p, max int) int
+	str   string
-	skipNonStarter(p int) int
+	bytes []byte
-	appendSlice(buf []byte, s, e int) []byte
-	copySlice(buf []byte, s, e int)
-	charinfoNFC(p int) (uint16, int)
-	charinfoNFKC(p int) (uint16, int)
-	hangul(p int) rune
 }
-type inputString string
+func inputBytes(str []byte) input {
+	return input{bytes: str}
-func (s inputString) skipASCII(p, max int) int {
-	for ; p < max && s[p] < utf8.RuneSelf; p++ {
-	}
-	return p
-}
-func (s inputString) skipNonStarter(p int) int {
-	for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
-	}
-	return p
-}
-func (s inputString) appendSlice(buf []byte, b, e int) []byte {
-	for i := b; i < e; i++ {
-		buf = append(buf, s[i])
-	}
-	return buf
 }
-func (s inputString) copySlice(buf []byte, b, e int) {
+func inputString(str string) input {
-	copy(buf, s[b:e])
+	return input{str: str}
 }
-func (s inputString) charinfoNFC(p int) (uint16, int) {
+func (in *input) setBytes(str []byte) {
-	return nfcTrie.lookupString(string(s[p:]))
+	in.str = ""
+	in.bytes = str
 }
-func (s inputString) charinfoNFKC(p int) (uint16, int) {
+func (in *input) setString(str string) {
-	return nfkcTrie.lookupString(string(s[p:]))
+	in.str = str
+	in.bytes = nil
 }
-func (s inputString) hangul(p int) rune {
+func (in *input) _byte(p int) byte {
-	if !isHangulString(string(s[p:])) {
+	if in.bytes == nil {
-		return 0
+		return in.str[p]
 	}
-	rune, _ := utf8.DecodeRuneInString(string(s[p:]))
+	return in.bytes[p]
-	return rune
 }
-type inputBytes []byte
+func (in *input) skipASCII(p, max int) int {
+	if in.bytes == nil {
-func (s inputBytes) skipASCII(p, max int) int {
+		for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
-	for ; p < max && s[p] < utf8.RuneSelf; p++ {
+		}
+	} else {
+		for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
+		}
 	}
 	return p
 }
-func (s inputBytes) skipNonStarter(p int) int {
+func (in *input) skipNonStarter(p int) int {
-	for ; p < len(s) && !utf8.RuneStart(s[p]); p++ {
+	if in.bytes == nil {
+		for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
+		}
+	} else {
+		for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
+		}
 	}
 	return p
 }
-func (s inputBytes) appendSlice(buf []byte, b, e int) []byte {
+func (in *input) appendSlice(buf []byte, b, e int) []byte {
-	return append(buf, s[b:e]...)
+	if in.bytes != nil {
+		return append(buf, in.bytes[b:e]...)
+	}
+	for i := b; i < e; i++ {
+		buf = append(buf, in.str[i])
+	}
+	return buf
 }
-func (s inputBytes) copySlice(buf []byte, b, e int) {
+func (in *input) copySlice(buf []byte, b, e int) int {
-	copy(buf, s[b:e])
+	if in.bytes == nil {
+		return copy(buf, in.str[b:e])
+	}
+	return copy(buf, in.bytes[b:e])
 }
-func (s inputBytes) charinfoNFC(p int) (uint16, int) {
+func (in *input) charinfoNFC(p int) (uint16, int) {
-	return nfcTrie.lookup(s[p:])
+	if in.bytes == nil {
+		return nfcTrie.lookupString(in.str[p:])
+	}
+	return nfcTrie.lookup(in.bytes[p:])
 }
-func (s inputBytes) charinfoNFKC(p int) (uint16, int) {
+func (in *input) charinfoNFKC(p int) (uint16, int) {
-	return nfkcTrie.lookup(s[p:])
+	if in.bytes == nil {
+		return nfkcTrie.lookupString(in.str[p:])
+	}
+	return nfkcTrie.lookup(in.bytes[p:])
 }
-func (s inputBytes) hangul(p int) rune {
+func (in *input) hangul(p int) (r rune) {
-	if !isHangul(s[p:]) {
+	if in.bytes == nil {
-		return 0
+		if !isHangulString(in.str[p:]) {
+			return 0
+		}
+		r, _ = utf8.DecodeRuneInString(in.str[p:])
+	} else {
+		if !isHangul(in.bytes[p:]) {
+			return 0
+		}
+		r, _ = utf8.DecodeRune(in.bytes[p:])
 	}
-	rune, _ := utf8.DecodeRune(s[p:])
+	return r
-	return rune
 }
--- a/src/pkg/exp/norm/iter.go
+++ b/src/pkg/exp/norm/iter.go
--- a/src/pkg/exp/norm/iter_test.go
+++ b/src/pkg/exp/norm/iter_test.go
@@ -9,21 +9,12 @@ import (
 	"testing"
 )
-var iterBufSizes = []int{
+func doIterNorm(f Form, s string) []byte {
-	MaxSegmentSize,
-	1.5 * MaxSegmentSize,
-	2 * MaxSegmentSize,
-	3 * MaxSegmentSize,
-	100 * MaxSegmentSize,
-}
-func doIterNorm(f Form, buf []byte, s string) []byte {
 	acc := []byte{}
 	i := Iter{}
-	i.SetInputString(f, s)
+	i.InitString(f, s)
 	for !i.Done() {
-		n := i.Next(buf)
+		acc = append(acc, i.Next()...)
-		acc = append(acc, buf[:n]...)
 	}
 	return acc
 }
@@ -35,30 +26,28 @@ func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bo
 		if norm {
 			gold = string(f.AppendString(nil, test.out))
 		}
-		for _, sz := range iterBufSizes {
+		out := string(doIterNorm(f, in))
-			buf := make([]byte, sz)
+		if len(out) != len(gold) {
-			out := string(doIterNorm(f, buf, in))
+			const msg = "%s:%d: length is %d; want %d"
-			if len(out) != len(gold) {
+			t.Errorf(msg, name, i, len(out), len(gold))
-				const msg = "%s:%d:%d: length is %d; want %d"
+		}
-				t.Errorf(msg, name, i, sz, len(out), len(gold))
+		if out != gold {
-			}
+			// Find first rune that differs and show context.
-			if out != gold {
+			ir := []rune(out)
-				// Find first rune that differs and show context.
+			ig := []rune(gold)
-				ir := []rune(out)
+			t.Errorf("\n%X != \n%X", ir, ig)
-				ig := []rune(gold)
+			for j := 0; j < len(ir) && j < len(ig); j++ {
-				for j := 0; j < len(ir) && j < len(ig); j++ {
+				if ir[j] == ig[j] {
-					if ir[j] == ig[j] {
+					continue
-						continue
+				}
-					}
+				if j -= 3; j < 0 {
-					if j -= 3; j < 0 {
+					j = 0
-						j = 0
-					}
-					for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
-						const msg = "%s:%d:%d: runeAt(%d) = %U; want %U"
-						t.Errorf(msg, name, i, sz, j, ir[j], ig[j])
-					}
-					break
 				}
+				for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
+					const msg = "%s:%d: runeAt(%d) = %U; want %U"
+					t.Errorf(msg, name, i, j, ir[j], ig[j])
+				}
+				break
 			}
 		}
 	}
@@ -68,42 +57,44 @@ func rep(r rune, n int) string {
 	return strings.Repeat(string(r), n)
 }
+const segSize = maxByteBufferSize
 var iterTests = []AppendTest{
 	{"", ascii, ascii},
 	{"", txt_all, txt_all},
-	{"", "a" + rep(0x0300, MaxSegmentSize/2), "a" + rep(0x0300, MaxSegmentSize/2)},
+	{"", "a" + rep(0x0300, segSize/2), "a" + rep(0x0300, segSize/2)},
 }
 var iterTestsD = []AppendTest{
 	{ // segment overflow on unchanged character
 		"",
-		"a" + rep(0x0300, MaxSegmentSize/2) + "\u0316",
+		"a" + rep(0x0300, segSize/2) + "\u0316",
-		"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0316\u0300",
+		"a" + rep(0x0300, segSize/2-1) + "\u0316\u0300",
 	},
 	{ // segment overflow on unchanged character + start value
 		"",
-		"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars+4) + "\u0316",
+		"a" + rep(0x0300, segSize/2+maxCombiningChars+4) + "\u0316",
-		"a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
+		"a" + rep(0x0300, segSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
 	},
 	{ // segment overflow on decomposition
 		"",
-		"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340",
+		"a" + rep(0x0300, segSize/2-1) + "\u0340",
-		"a" + rep(0x0300, MaxSegmentSize/2),
+		"a" + rep(0x0300, segSize/2),
 	},
 	{ // segment overflow on decomposition + start value
 		"",
-		"a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
+		"a" + rep(0x0300, segSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
-		"a" + rep(0x0300, MaxSegmentSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
+		"a" + rep(0x0300, segSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
 	},
 	{ // start value after ASCII overflow
 		"",
-		rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
+		rep('a', segSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
-		rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
+		rep('a', segSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
 	},
 	{ // start value after Hangul overflow
 		"",
-		rep(0xAC00, MaxSegmentSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
+		rep(0xAC00, segSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
-		strings.Repeat("\u1100\u1161", MaxSegmentSize/6) + rep(0x300, maxCombiningChars-1) + "\u0320" + rep(0x300, 3),
+		strings.Repeat("\u1100\u1161", segSize/6) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 1),
 	},
 	{ // start value after cc=0
 		"",
@@ -125,8 +116,8 @@ var iterTestsC = []AppendTest{
 	},
 	{ // segment overflow
 		"",
-		"a" + rep(0x0305, MaxSegmentSize/2+4) + "\u0316",
+		"a" + rep(0x0305, segSize/2+4) + "\u0316",
-		"a" + rep(0x0305, MaxSegmentSize/2-1) + "\u0316" + rep(0x305, 5),
+		"a" + rep(0x0305, segSize/2-1) + "\u0316" + rep(0x305, 5),
 	},
 }
@@ -148,27 +139,39 @@ type SegmentTest struct {
 }
 var segmentTests = []SegmentTest{
-	{rep('a', MaxSegmentSize), []string{rep('a', MaxSegmentSize), ""}},
+	{"\u1E0A\u0323a", []string{"\x44\u0323\u0307", "a", ""}},
-	{rep('a', MaxSegmentSize+2), []string{rep('a', MaxSegmentSize-1), "aaa", ""}},
+	{rep('a', segSize), append(strings.Split(rep('a', segSize), ""), "")},
-	{rep('a', MaxSegmentSize) + "\u0300aa", []string{rep('a', MaxSegmentSize-1), "a\u0300", "aa", ""}},
+	{rep('a', segSize+2), append(strings.Split(rep('a', segSize+2), ""), "")},
+	{rep('a', segSize) + "\u0300aa",
+		append(strings.Split(rep('a', segSize-1), ""), "a\u0300", "a", "a", "")},
+}
+var segmentTestsK = []SegmentTest{
+	{"\u3332", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u3099", ""}},
+	// last segment of multi-segment decomposition needs normalization
+	{"\u3332\u093C", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u093C\u3099", ""}},
+	// Hangul and Jamo are grouped together.
+	{"\uAC00", []string{"\u1100\u1161", ""}},
+	{"\uAC01", []string{"\u1100\u1161\u11A8", ""}},
+	{"\u1100\u1161", []string{"\u1100\u1161", ""}},
 }
 // Note that, by design, segmentation is equal for composing and decomposing forms.
 func TestIterSegmentation(t *testing.T) {
 	segmentTest(t, "SegmentTestD", NFD, segmentTests)
 	segmentTest(t, "SegmentTestC", NFC, segmentTests)
+	segmentTest(t, "SegmentTestD", NFKD, segmentTestsK)
+	segmentTest(t, "SegmentTestC", NFKC, segmentTestsK)
 }
 func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
 	iter := Iter{}
-	for i, tt := range segmentTests {
+	for i, tt := range tests {
-		buf := make([]byte, MaxSegmentSize)
+		iter.InitString(f, tt.in)
-		iter.SetInputString(f, tt.in)
 		for j, seg := range tt.out {
 			if seg == "" {
 				if !iter.Done() {
-					n := iter.Next(buf)
+					res := string(iter.Next())
-					res := string(buf[:n])
 					t.Errorf(`%s:%d:%d: expected Done()==true, found segment "%s"`, name, i, j, res)
 				}
 				continue
@@ -176,10 +179,9 @@ func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
 			if iter.Done() {
 				t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j)
 			}
-			n := iter.Next(buf)
 			seg = f.String(seg)
-			if res := string(buf[:n]); res != seg {
+			if res := string(iter.Next()); res != seg {
-				t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d)`, name, i, j, res, len(res), seg, len(seg))
+				t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d) %X %X`, name, i, j, res, len(res), seg, len(seg), []rune(res), []rune(seg))
 			}
 		}
 	}

--- a/src/pkg/exp/norm/maketables.go
+++ b/src/pkg/exp/norm/maketables.go
@@ -574,7 +574,19 @@ func makeEntry(f *FormInfo) uint16 {
 // decompSet keeps track of unique decompositions, grouped by whether
 // the decomposition is followed by a trailing and/or leading CCC.
-type decompSet [4]map[string]bool
+type decompSet [6]map[string]bool
+const (
+	normalDecomp = iota
+	firstMulti
+	firstCCC
+	endMulti
+	firstLeadingCCC
+	firstCCCZeroExcept
+	lastDecomp
+)
+var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
 func makeDecompSet() decompSet {
 	m := decompSet{}
@@ -614,20 +626,30 @@ func printCharInfoTables() int {
 			const msg = "%U: lccc (%d) must be <= tcc (%d)"
 			logger.Fatalf(msg, r, lccc, tccc)
 		}
-		index := 0
+		index := normalDecomp
 		if tccc > 0 || lccc > 0 {
 			s += string([]byte{tccc})
-			index = 1
+			index = endMulti
+			for _, r := range d[1:] {
+				if ccc(r) == 0 {
+					index = firstCCC
+				}
+			}
 			if lccc > 0 {
 				s += string([]byte{lccc})
-				index = 2
+				if index == firstCCC {
+					logger.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
+				}
+				index = firstLeadingCCC
 			}
 			if cc != lccc {
 				if cc != 0 {
 					logger.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
 				}
-				index = 3
+				index = firstCCCZeroExcept
 			}
+		} else if len(d) > 1 {
+			index = firstMulti
 		}
 		return index, s
 	}
@@ -653,7 +675,6 @@ func printCharInfoTables() int {
 	size := 0
 	positionMap := make(map[string]uint16)
 	decompositions.WriteString("\000")
-	cname := []string{"firstCCC", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
 	fmt.Println("const (")
 	for i, m := range decompSet {
 		sa := []string{}

--- a/src/pkg/exp/norm/normalize_test.go
+++ b/src/pkg/exp/norm/normalize_test.go
@@ -6,6 +6,7 @@ package norm
 import (
 	"bytes"
+	"io"
 	"strings"
 	"testing"
 )
@@ -504,12 +505,35 @@ func appendBench(f Form, in []byte) func() {
 }
 func iterBench(f Form, in []byte) func() {
-	buf := make([]byte, 4*len(in))
 	iter := Iter{}
 	return func() {
-		iter.SetInput(f, in)
+		iter.Init(f, in)
 		for !iter.Done() {
-			iter.Next(buf)
+			iter.Next()
+		}
+	}
+}
+func readerBench(f Form, in []byte) func() {
+	buf := make([]byte, 4*len(in))
+	return func() {
+		r := f.Reader(bytes.NewReader(in))
+		var err error
+		for err == nil {
+			_, err = r.Read(buf)
+		}
+		if err != io.EOF {
+			panic("")
+		}
+	}
+}
+func writerBench(f Form, in []byte) func() {
+	buf := make([]byte, 0, 4*len(in))
+	return func() {
+		r := f.Writer(bytes.NewBuffer(buf))
+		if _, err := r.Write(in); err != nil {
+			panic("")
 		}
 	}
 }
@@ -517,6 +541,8 @@ func iterBench(f Form, in []byte) func() {
 func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
 	//bm = append(bm, appendBench(f, in))
 	bm = append(bm, iterBench(f, in))
+	//bm = append(bm, readerBench(f, in))
+	//bm = append(bm, writerBench(f, in))
 	return bm
 }

--- a/src/pkg/exp/norm/normregtest.go
+++ b/src/pkg/exp/norm/normregtest.go
@@ -223,13 +223,11 @@ func doTest(t *Test, f norm.Form, gold, test string) {
 	cmpResult(t, "Bytes", f, gold, test, string(result))
 	sresult := f.String(test)
 	cmpResult(t, "String", f, gold, test, sresult)
-	buf := make([]byte, norm.MaxSegmentSize)
 	acc := []byte{}
 	i := norm.Iter{}
-	i.SetInputString(f, test)
+	i.InitString(f, test)
 	for !i.Done() {
-		n := i.Next(buf)
+		acc = append(acc, i.Next()...)
-		acc = append(acc, buf[:n]...)
 	}
 	cmpResult(t, "Iter.Next", f, gold, test, string(acc))
 	for i := range test {

--- a/src/pkg/exp/norm/tables.go
+++ b/src/pkg/exp/norm/tables.go