Commit d9c9c487 authored by Marcel van Lohuizen

exp/norm: added implementation for []byte versions of methods.

R=r
CC=golang-dev
https://golang.org/cl/4925041
parent 45b7084b
......@@ -5,6 +5,8 @@
// Package norm contains types and functions for normalizing Unicode strings.
package norm
import "utf8"
// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
......@@ -32,17 +34,57 @@ const (
// Bytes returns f(b). May return b if f(b) = b.
//
// When b is already in form f according to QuickSpan, b itself is returned
// without copying. Otherwise a new buffer is allocated, so b is never
// modified.
func (f Form) Bytes(b []byte) []byte {
	n := f.QuickSpan(b)
	if n == len(b) {
		return b
	}
	// Seed the output with the prefix that QuickSpan accepted as-is and let
	// Append normalize the remainder onto it.
	out := make([]byte, n, len(b))
	copy(out, b[0:n])
	return f.Append(out, b[n:]...)
}
// String returns f(s).
//
// When s is already in form f according to QuickSpanString, s itself is
// returned. Otherwise the prefix accepted by QuickSpanString is copied
// verbatim and the remainder is normalized by AppendString.
func (f Form) String(s string) string {
	n := f.QuickSpanString(s)
	if n == len(s) {
		return s
	}
	// The buffer must have length n (not 0) so that copy actually stores
	// the already-normalized prefix; capacity len(s) avoids a reallocation
	// in the common case.
	out := make([]byte, n, len(s))
	copy(out, s[0:n])
	return string(f.AppendString(out, s[n:]))
}
// IsNormal returns true if b == f(b).
//
// The prefix accepted by quickSpan is skipped. Every following segment is
// normalized into a reorderBuffer and compared byte for byte against the
// corresponding bytes of b; the first mismatch yields false.
func (f Form) IsNormal(b []byte) bool {
	fd := formTable[f]
	bp := quickSpan(fd, b)
	if bp == len(b) {
		return true
	}
	rb := reorderBuffer{f: *fd}
	for bp < len(b) {
		// Normalize the next segment of b into rb.
		decomposeSegment(&rb, b[bp:])
		if fd.composing {
			rb.compose()
		}
		// Compare the normalized runes with the input, advancing bp.
		for i := 0; i < rb.nrune; i++ {
			info := rb.rune[i]
			if bp+int(info.size) > len(b) {
				// The normalized form is longer than what is left of b.
				return false
			}
			p := info.pos
			pe := p + info.size
			for ; p < pe; p++ {
				if b[bp] != rb.byte[p] {
					return false
				}
				bp++
			}
		}
		rb.reset()
		// Skip ahead over the next stretch that needs no work.
		bp += quickSpan(fd, b[bp:])
	}
	return true
}
// IsNormalString returns true if s == f(s).
......@@ -50,14 +92,92 @@ func (f Form) IsNormalString(s string) bool {
panic("not implemented")
}
// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It returns the
// patched buffer and the number of trailing continuation bytes that
// have been dropped.
//
// If buf contains no rune start, or the last rune start does not begin a
// well-formed rune (info.size == 0), there is no rune to patch: buf is
// returned unchanged with a drop count of 0. Without the size check the
// ill-formed starter byte itself would be counted as a dropped
// continuation byte.
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, int) {
	info, p := lastRuneStart(&rb.f, buf)
	if p == -1 || info.size == 0 {
		return buf, 0
	}
	end := p + int(info.size)
	extra := len(buf) - end
	if extra <= 0 {
		// The last rune reaches the end of buf; nothing trails it.
		return buf, 0
	}
	// Cut off the stray continuation bytes, re-normalize the last segment
	// and write it back. The caller re-appends the dropped bytes.
	buf = decomposeToLastBoundary(rb, buf[:end])
	if rb.f.composing {
		rb.compose()
	}
	return rb.flush(buf), extra
}
// appendQuick copies the leading part of src that quickSpan accepts for f
// onto dst. It returns the extended slice and the number of bytes of src
// that were copied. An empty src leaves dst untouched and reports 0.
func appendQuick(f *formInfo, dst, src []byte) ([]byte, int) {
	n := 0
	if len(src) > 0 {
		n = quickSpan(f, src)
		dst = append(dst, src[:n]...)
	}
	return dst, n
}
// Append returns f(append(out, b...)).
// The buffer out must be empty or equal to f(out).
func (f Form) Append(out, b []byte) []byte {
panic("not implemented")
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
if len(src) == 0 {
return out
}
fd := formTable[f]
rb := &reorderBuffer{f: *fd}
doMerge := len(out) > 0
p := 0
if !utf8.RuneStart(src[0]) {
// Move leading non-starters to destination.
for p++; p < len(src) && !utf8.RuneStart(src[p]); p++ {
}
out = append(out, src[:p]...)
buf, ndropped := patchTail(rb, out)
if ndropped > 0 {
out = append(buf, src[p-ndropped:p]...)
doMerge = false // no need to merge, ends with illegal UTF-8
} else {
out = decomposeToLastBoundary(rb, buf) // force decomposition
}
}
if doMerge {
var info runeInfo
if p < len(src[p:]) {
info = fd.info(src[p:])
if p == 0 && !fd.boundaryBefore(fd, info) {
out = decomposeToLastBoundary(rb, out)
}
}
if info.size == 0 {
if fd.composing {
rb.compose()
}
// Append incomplete UTF-8 encoding.
return append(rb.flush(out), src[p:]...)
}
}
if rb.nrune == 0 {
src = src[p:]
out, p = appendQuick(fd, out, src)
}
for n := 0; p < len(src); p += n {
p += decomposeSegment(rb, src[p:])
if fd.composing {
rb.compose()
}
out = rb.flush(out)
out, n = appendQuick(fd, out, src[p:])
}
return out
}
// AppendString returns f(append(out, []byte(s)...)).
// The buffer out must be nil, empty, or equal to f(out).
// It is not implemented yet and panics when called.
func (f Form) AppendString(out []byte, s string) []byte {
panic("not implemented")
}
......@@ -65,7 +185,51 @@ func (f Form) AppendString(out []byte, s string) []byte {
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
	return quickSpan(formTable[f], b)
}
// quickSpan scans b from the front and reports how many leading bytes can
// be kept as they are for the form described by fd.
//
// The scan stops at the first rune whose quick-check flag for the form is
// not "yes", and returns early when combining classes appear out of order
// or when fd.info cannot size a rune. If the whole input is consumed,
// len(b) is returned. Otherwise a composing form falls back to the start
// of the last recorded segment and a decomposing form returns the position
// where the scan stopped.
func quickSpan(fd *formInfo, b []byte) int {
	var (
		prevCCC  uint8 // combining class of the preceding rune
		segStart int   // start of the most recently recorded segment
		pos      int   // current scan position in b
	)
	for pos < len(b) {
		if b[pos] < utf8.RuneSelf {
			// ASCII always starts a new segment and resets the class.
			segStart, prevCCC = pos, 0
			pos++
			continue
		}
		ri := fd.info(b[pos:])
		if ri.size == 0 {
			// include incomplete runes
			return len(b)
		}
		ccc := ri.ccc
		if ccc != 0 && prevCCC > ccc {
			// Combining marks are not in canonical order.
			return segStart
		}
		if fd.composing && !ri.flags.isYesC() || !fd.composing && !ri.flags.isYesD() {
			break
		}
		if ccc == 0 && !fd.composing {
			segStart = pos
		}
		prevCCC = ccc
		pos += int(ri.size)
	}
	switch {
	case pos == len(b):
		return len(b)
	case fd.composing:
		return segStart
	}
	return pos
}
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
......@@ -74,26 +238,157 @@ func (f Form) QuickSpanString(s string) int {
panic("not implemented")
}
// FirstBoundary returns the position i of the first boundary in b.
// It returns len(b), false if b contains no boundaries.
func (f Form) FirstBoundary(b []byte) (i int, ok bool) {
panic("not implemented")
// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
i := 0
for ; i < len(b) && !utf8.RuneStart(b[i]); i++ {
}
if i >= len(b) {
return -1
}
fd := formTable[f]
info := fd.info(b[i:])
for info.size != 0 && !fd.boundaryBefore(fd, info) {
i += int(info.size)
if i >= len(b) {
if !fd.boundaryAfter(fd, info) {
return -1
}
return len(b)
}
info = fd.info(b[i:])
}
if info.size == 0 {
return -1
}
return i
}
// FirstBoundaryInString returns the position i of the first boundary in s.
// It is not implemented yet and panics when called.
// NOTE(review): the (i int, ok bool) result list differs from the single
// int result (-1 for "no boundary") that FirstBoundary uses; confirm which
// contract is intended before implementing.
func (f Form) FirstBoundaryInString(s string) (i int, ok bool) {
panic("not implemented")
}
// LastBoundaryIn returns the position i of the last boundary in b.
// It returns 0, false if b contains no boundary.
func (f Form) LastBoundary(b []byte) (i int, ok bool) {
panic("not implemented")
// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
fd := formTable[f]
i := len(b)
if i == 0 {
return -1
}
info, p := lastRuneStart(fd, b)
if int(info.size) != len(b)-p {
if p != -1 {
return i
}
return -1
}
if fd.boundaryAfter(fd, info) {
return i
}
i = p
for i >= 0 && !fd.boundaryBefore(fd, info) {
info, p = lastRuneStart(fd, b[:i])
if int(info.size) != i-p {
if p != -1 {
return i
}
return -1
}
i = p
}
return i
}
// LastBoundaryInString returns the position i of the last boundary in s.
// It returns 0, false if s contains no boundary.
func (f Form) LastBoundaryInString(s string) (i int, ok bool) {
// LastBoundaryInString returns the position i of the last boundary in s
// or -1 if s contains no boundary.
func (f Form) LastBoundaryInString(s string) int {
panic("not implemented")
}
// decomposeSegment feeds the first segment of src into rb and reports how
// many bytes of src were consumed. The first rune is always inserted, even
// if it is not a starter. Scanning stops when rb.insert refuses a rune, src
// is exhausted, the next rune has a boundary before it, or the next rune
// cannot be sized. If the very first rune cannot be sized, nothing is
// inserted and 0 is returned.
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
// when we detect a sequence of 30+ non-starter chars.
func decomposeSegment(rb *reorderBuffer, src []byte) int {
	ri := rb.f.info(src)
	if ri.size == 0 {
		return 0
	}
	consumed := 0
	for {
		if !rb.insert(src[consumed:], ri) {
			break
		}
		consumed += int(ri.size)
		if consumed >= len(src) {
			break
		}
		ri = rb.f.info(src[consumed:])
		if rb.f.boundaryBefore(&rb.f, ri) || ri.size == 0 {
			break
		}
	}
	return consumed
}
// lastRuneStart searches buf backwards for the last byte that can start a
// rune. It returns fd.info for the bytes from that position on together
// with the position itself. If buf holds no such byte, it returns the zero
// runeInfo and -1.
func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
	for i := len(buf) - 1; i >= 0; i-- {
		if utf8.RuneStart(buf[i]) {
			return fd.info(buf[i:]), i
		}
	}
	return runeInfo{}, -1
}
// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. Returns the buffer minus the last segment.
// If buf does not end exactly at the end of its last rune, or if that rune
// already has a boundary after it, buf is returned unchanged and nothing
// is inserted into rb.
func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
info, i := lastRuneStart(&rb.f, buf)
if int(info.size) != len(buf)-i {
// illegal trailing continuation bytes
return buf
}
if rb.f.boundaryAfter(&rb.f, info) {
return buf
}
var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
add[0] = info
// padd is the number of entries recorded in add.
padd := 1
// n is a running rune count used as an overflow guard: a rune without a
// decomposition counts once, a rune with one counts once per rune of its
// decomposition.
n := 1
// p is the start of the earliest rune recorded so far.
p := len(buf) - int(info.size)
// Walk backwards one rune at a time until the earliest recorded rune has a
// boundary before it. The post statement moves p to the start of the rune
// that was just recorded.
for ; p >= 0 && !rb.f.boundaryBefore(&rb.f, info); p -= int(info.size) {
info, i = lastRuneStart(&rb.f, buf[:p])
if int(info.size) != p-i {
// The bytes before p do not end in a complete rune (or there are none).
break
}
// Check that decomposition doesn't result in overflow.
if info.flags.hasDecomposition() {
// p has not been moved back yet, so the rune starts at p-info.size.
dcomp := rb.f.decompose(buf[p-int(info.size):])
for i := 0; i < len(dcomp); {
inf := rb.f.info(dcomp[i:])
i += int(inf.size)
n++
}
} else {
n++
}
if n > maxBackRunes {
break
}
add[padd] = info
padd++
}
// Replay the recorded runes in forward order, starting at p.
pp := p
for padd--; padd >= 0; padd-- {
info = add[padd]
rb.insert(buf[pp:], info)
pp += int(info.size)
}
return buf[:p]
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment