Commit 2fd95497 authored by Marcel van Lohuizen's avatar Marcel van Lohuizen

unicode: update to Unicode 10.0.0

Also includes all derived values as well as
vendored packages.

Generated by running
    UNICODE_VERSION=10.0.0 go generate
in golang.org/x/text

and modified by hand to add the tests and
entries in next.txt for new script and properties.

Closes Issue #21471

Change-Id: I1d10ee3887bd1fd3d5a756ee0d04bd6ec2814ba1
Reviewed-on: https://go-review.googlesource.com/63953
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarAndrew Bonventre <andybons@golang.org>
parent 8c532f5f
......@@ -344,3 +344,4 @@ pkg syscall (openbsd-386), const SYS_KILL = 37
pkg syscall (openbsd-386-cgo), const SYS_KILL = 37
pkg syscall (openbsd-amd64), const SYS_KILL = 37
pkg syscall (openbsd-amd64-cgo), const SYS_KILL = 37
pkg unicode, const Version = "9.0.0"
pkg math/big, const MaxBase = 62
pkg unicode, const Version = "10.0.0"
pkg unicode, var Masaram_Gondi *RangeTable
pkg unicode, var Nushu *RangeTable
pkg unicode, var Soyombo *RangeTable
pkg unicode, var Zanabazar_Square *RangeTable
pkg unicode, var Regional_Indicator *RangeTable
......@@ -7,7 +7,7 @@
package strconv
// (462+139+82)*2 + (378)*4 = 2878 bytes
// (456+140+86)*2 + (396)*4 = 2948 bytes
var isPrint16 = []uint16{
0x0020, 0x007e,
......@@ -25,7 +25,7 @@ var isPrint16 = []uint16{
0x07c0, 0x07fa,
0x0800, 0x082d,
0x0830, 0x085b,
0x085e, 0x085e,
0x085e, 0x086a,
0x08a0, 0x08bd,
0x08d4, 0x098c,
0x098f, 0x0990,
......@@ -36,7 +36,7 @@ var isPrint16 = []uint16{
0x09cb, 0x09ce,
0x09d7, 0x09d7,
0x09dc, 0x09e3,
0x09e6, 0x09fb,
0x09e6, 0x09fd,
0x0a01, 0x0a0a,
0x0a0f, 0x0a10,
0x0a13, 0x0a39,
......@@ -51,8 +51,7 @@ var isPrint16 = []uint16{
0x0ad0, 0x0ad0,
0x0ae0, 0x0ae3,
0x0ae6, 0x0af1,
0x0af9, 0x0af9,
0x0b01, 0x0b0c,
0x0af9, 0x0b0c,
0x0b0f, 0x0b10,
0x0b13, 0x0b39,
0x0b3c, 0x0b44,
......@@ -82,8 +81,7 @@ var isPrint16 = []uint16{
0x0cd5, 0x0cd6,
0x0cde, 0x0ce3,
0x0ce6, 0x0cf2,
0x0d01, 0x0d3a,
0x0d3d, 0x0d4f,
0x0d00, 0x0d4f,
0x0d54, 0x0d63,
0x0d66, 0x0d7f,
0x0d82, 0x0d96,
......@@ -154,8 +152,7 @@ var isPrint16 = []uint16{
0x1c4d, 0x1c88,
0x1cc0, 0x1cc7,
0x1cd0, 0x1cf9,
0x1d00, 0x1df5,
0x1dfb, 0x1f15,
0x1d00, 0x1f15,
0x1f18, 0x1f1d,
0x1f20, 0x1f45,
0x1f48, 0x1f4d,
......@@ -167,7 +164,7 @@ var isPrint16 = []uint16{
0x2030, 0x205e,
0x2070, 0x2071,
0x2074, 0x209c,
0x20a0, 0x20be,
0x20a0, 0x20bf,
0x20d0, 0x20f0,
0x2100, 0x218b,
0x2190, 0x2426,
......@@ -175,7 +172,7 @@ var isPrint16 = []uint16{
0x2460, 0x2b73,
0x2b76, 0x2b95,
0x2b98, 0x2bb9,
0x2bbd, 0x2bd1,
0x2bbd, 0x2bd2,
0x2bec, 0x2bef,
0x2c00, 0x2cf3,
0x2cf9, 0x2d27,
......@@ -183,17 +180,17 @@ var isPrint16 = []uint16{
0x2d30, 0x2d67,
0x2d6f, 0x2d70,
0x2d7f, 0x2d96,
0x2da0, 0x2e44,
0x2da0, 0x2e49,
0x2e80, 0x2ef3,
0x2f00, 0x2fd5,
0x2ff0, 0x2ffb,
0x3001, 0x3096,
0x3099, 0x30ff,
0x3105, 0x312d,
0x3105, 0x312e,
0x3131, 0x31ba,
0x31c0, 0x31e3,
0x31f0, 0x4db5,
0x4dc0, 0x9fd5,
0x4dc0, 0x9fea,
0xa000, 0xa48c,
0xa490, 0xa4c6,
0xa4d0, 0xa62b,
......@@ -254,6 +251,7 @@ var isNotPrint16 = []uint16{
0x0590,
0x06dd,
0x083f,
0x085f,
0x08b5,
0x08e2,
0x0984,
......@@ -275,6 +273,7 @@ var isNotPrint16 = []uint16{
0x0ab4,
0x0ac6,
0x0aca,
0x0b00,
0x0b04,
0x0b29,
0x0b31,
......@@ -341,7 +340,7 @@ var isNotPrint16 = []uint16{
0x1771,
0x191f,
0x1a5f,
0x1cf7,
0x1dfa,
0x1f58,
0x1f5a,
0x1f5c,
......@@ -351,7 +350,6 @@ var isNotPrint16 = []uint16{
0x1fdc,
0x1ff5,
0x208f,
0x23ff,
0x2bc9,
0x2c2f,
0x2c5f,
......@@ -398,7 +396,7 @@ var isPrint32 = []uint32{
0x0102a0, 0x0102d0,
0x0102e0, 0x0102fb,
0x010300, 0x010323,
0x010330, 0x01034a,
0x01032d, 0x01034a,
0x010350, 0x01037a,
0x010380, 0x0103c3,
0x0103c8, 0x0103d5,
......@@ -481,11 +479,17 @@ var isPrint32 = []uint32{
0x011730, 0x01173f,
0x0118a0, 0x0118f2,
0x0118ff, 0x0118ff,
0x011a00, 0x011a47,
0x011a50, 0x011a83,
0x011a86, 0x011aa2,
0x011ac0, 0x011af8,
0x011c00, 0x011c45,
0x011c50, 0x011c6c,
0x011c70, 0x011c8f,
0x011c92, 0x011cb6,
0x011d00, 0x011d36,
0x011d3a, 0x011d47,
0x011d50, 0x011d59,
0x012000, 0x012399,
0x012400, 0x012474,
0x012480, 0x012543,
......@@ -502,10 +506,11 @@ var isPrint32 = []uint32{
0x016f00, 0x016f44,
0x016f50, 0x016f7e,
0x016f8f, 0x016f9f,
0x016fe0, 0x016fe0,
0x016fe0, 0x016fe1,
0x017000, 0x0187ec,
0x018800, 0x018af2,
0x01b000, 0x01b001,
0x01b000, 0x01b11e,
0x01b170, 0x01b2fb,
0x01bc00, 0x01bc6a,
0x01bc70, 0x01bc7c,
0x01bc80, 0x01bc88,
......@@ -553,9 +558,10 @@ var isPrint32 = []uint32{
0x01f210, 0x01f23b,
0x01f240, 0x01f248,
0x01f250, 0x01f251,
0x01f300, 0x01f6d2,
0x01f260, 0x01f265,
0x01f300, 0x01f6d4,
0x01f6e0, 0x01f6ec,
0x01f6f0, 0x01f6f6,
0x01f6f0, 0x01f6f8,
0x01f700, 0x01f773,
0x01f780, 0x01f7d4,
0x01f800, 0x01f80b,
......@@ -563,16 +569,17 @@ var isPrint32 = []uint32{
0x01f850, 0x01f859,
0x01f860, 0x01f887,
0x01f890, 0x01f8ad,
0x01f910, 0x01f927,
0x01f930, 0x01f930,
0x01f933, 0x01f94b,
0x01f950, 0x01f95e,
0x01f980, 0x01f991,
0x01f900, 0x01f90b,
0x01f910, 0x01f94c,
0x01f950, 0x01f96b,
0x01f980, 0x01f997,
0x01f9c0, 0x01f9c0,
0x01f9d0, 0x01f9e6,
0x020000, 0x02a6d6,
0x02a700, 0x02b734,
0x02b740, 0x02b81d,
0x02b820, 0x02cea1,
0x02ceb0, 0x02ebe0,
0x02f800, 0x02fa1d,
0x0e0100, 0x0e01ef,
}
......@@ -605,9 +612,14 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0x1334,
0x145a,
0x145c,
0x1a9d,
0x1c09,
0x1c37,
0x1ca8,
0x1d07,
0x1d0a,
0x1d3b,
0x1d3e,
0x246f,
0x6a5f,
0x6b5a,
......@@ -658,7 +670,6 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0xf0c0,
0xf0d0,
0xf12f,
0xf91f,
0xf93f,
}
......
......@@ -14,8 +14,13 @@ type T struct {
script string
}
// Hand-chosen tests from Unicode 5.1.0, 6.0.0, 6.2.0, 6.3.0, 7.0.0 and 8.0.0
// Hand-chosen tests from Unicode 5.1.0, 6.0.0, 6.2.0, 6.3.0, 7.0.0, 8.0.0,
// 9.0.0, 10.0.0.
// mostly to discover when new scripts and categories arise.
// If this tests fails, add the missing scripts to the test and add entries
// of the form
// pkg unicode, var <new script> *RangeTable
// to api/next.txt.
var inTest = []T{
{0x11711, "Ahom"},
{0x1e900, "Adlam"},
......@@ -92,6 +97,7 @@ var inTest = []T{
{0x0843, "Mandaic"},
{0x10ac8, "Manichaean"},
{0x11cB6, "Marchen"},
{0x11d59, "Masaram_Gondi"},
{0xabd0, "Meetei_Mayek"},
{0x1e800, "Mende_Kikakui"},
{0x1099f, "Meroitic_Hieroglyphs"},
......@@ -106,6 +112,7 @@ var inTest = []T{
{0x11400, "Newa"},
{0x19c3, "New_Tai_Lue"},
{0x07f8, "Nko"},
{0x1b170, "Nushu"},
{0x169b, "Ogham"},
{0x1c6a, "Ol_Chiki"},
{0x10C80, "Old_Hungarian"},
......@@ -134,6 +141,7 @@ var inTest = []T{
{0x1D920, "SignWriting"},
{0x0dbd, "Sinhala"},
{0x110d0, "Sora_Sompeng"},
{0x11a99, "Soyombo"},
{0x1ba3, "Sundanese"},
{0xa803, "Syloti_Nagri"},
{0x070f, "Syriac"},
......@@ -155,6 +163,7 @@ var inTest = []T{
{0xa60e, "Vai"},
{0x118ff, "Warang_Citi"},
{0xa216, "Yi"},
{0x11a0a, "Zanabazar_Square"},
}
var outTest = []T{ // not really worth being thorough
......@@ -229,6 +238,7 @@ var inPropTest = []T{
{0x06DD, "Prepended_Concatenation_Mark"},
{0x300D, "Quotation_Mark"},
{0x2EF3, "Radical"},
{0x1f1ff, "Regional_Indicator"},
{0x061F, "STerm"}, // Deprecated alias of Sentence_Terminal
{0x061F, "Sentence_Terminal"},
{0x2071, "Soft_Dotted"},
......
This diff is collapsed.
......@@ -21,6 +21,7 @@ import (
"unicode/utf8"
"golang_org/x/text/secure/bidirule"
"golang_org/x/text/unicode/bidi"
"golang_org/x/text/unicode/norm"
)
......@@ -67,6 +68,15 @@ func VerifyDNSLength(verify bool) Option {
return func(o *options) { o.verifyDNSLength = verify }
}
// RemoveLeadingDots removes leading label separators. Leading runes that map to
// dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well.
//
// This is the behavior suggested by the UTS #46 and is adopted by some
// browsers.
func RemoveLeadingDots(remove bool) Option {
return func(o *options) { o.removeLeadingDots = remove }
}
// ValidateLabels sets whether to check the mandatory label validation criteria
// as defined in Section 5.4 of RFC 5891. This includes testing for correct use
// of hyphens ('-'), normalization, validity of runes, and the context rules.
......@@ -83,7 +93,7 @@ func ValidateLabels(enable bool) Option {
}
}
// StrictDomainName limits the set of permissable ASCII characters to those
// StrictDomainName limits the set of permissible ASCII characters to those
// allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the
// hyphen). This is set by default for MapForLookup and ValidateForRegistration.
//
......@@ -137,10 +147,11 @@ func MapForLookup() Option {
}
type options struct {
transitional bool
useSTD3Rules bool
validateLabels bool
verifyDNSLength bool
transitional bool
useSTD3Rules bool
validateLabels bool
verifyDNSLength bool
removeLeadingDots bool
trie *idnaTrie
......@@ -149,14 +160,14 @@ type options struct {
// mapping implements a validation and mapping step as defined in RFC 5895
// or UTS 46, tailored to, for example, domain registration or lookup.
mapping func(p *Profile, s string) (string, error)
mapping func(p *Profile, s string) (mapped string, isBidi bool, err error)
// bidirule, if specified, checks whether s conforms to the Bidi Rule
// defined in RFC 5893.
bidirule func(s string) bool
}
// A Profile defines the configuration of a IDNA mapper.
// A Profile defines the configuration of an IDNA mapper.
type Profile struct {
options
}
......@@ -289,12 +300,16 @@ func (e runeError) Error() string {
// see http://www.unicode.org/reports/tr46.
func (p *Profile) process(s string, toASCII bool) (string, error) {
var err error
var isBidi bool
if p.mapping != nil {
s, err = p.mapping(p, s)
s, isBidi, err = p.mapping(p, s)
}
// Remove leading empty labels.
for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
if p.removeLeadingDots {
for ; len(s) > 0 && s[0] == '.'; s = s[1:] {
}
}
// TODO: allow for a quick check the tables data.
// It seems like we should only create this error on ToASCII, but the
// UTS 46 conformance tests suggests we should always check this.
if err == nil && p.verifyDNSLength && s == "" {
......@@ -320,6 +335,7 @@ func (p *Profile) process(s string, toASCII bool) (string, error) {
// Spec says keep the old label.
continue
}
isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight
labels.set(u)
if err == nil && p.validateLabels {
err = p.fromPuny(p, u)
......@@ -334,6 +350,14 @@ func (p *Profile) process(s string, toASCII bool) (string, error) {
err = p.validateLabel(label)
}
}
if isBidi && p.bidirule != nil && err == nil {
for labels.reset(); !labels.done(); labels.next() {
if !p.bidirule(labels.label()) {
err = &labelError{s, "B"}
break
}
}
}
if toASCII {
for labels.reset(); !labels.done(); labels.next() {
label := labels.label()
......@@ -365,41 +389,65 @@ func (p *Profile) process(s string, toASCII bool) (string, error) {
return s, err
}
func normalize(p *Profile, s string) (string, error) {
return norm.NFC.String(s), nil
func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) {
// TODO: consider first doing a quick check to see if any of these checks
// need to be done. This will make it slower in the general case, but
// faster in the common case.
mapped = norm.NFC.String(s)
isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft
return mapped, isBidi, nil
}
func validateRegistration(p *Profile, s string) (string, error) {
func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) {
// TODO: filter need for normalization in loop below.
if !norm.NFC.IsNormalString(s) {
return s, &labelError{s, "V1"}
return s, false, &labelError{s, "V1"}
}
var err error
for i := 0; i < len(s); {
v, sz := trie.lookupString(s[i:])
i += sz
bidi = bidi || info(v).isBidi(s[i:])
// Copy bytes not copied so far.
switch p.simplify(info(v).category()) {
// TODO: handle the NV8 defined in the Unicode idna data set to allow
// for strict conformance to IDNA2008.
case valid, deviation:
case disallowed, mapped, unknown, ignored:
if err == nil {
r, _ := utf8.DecodeRuneInString(s[i:])
err = runeError(r)
}
r, _ := utf8.DecodeRuneInString(s[i:])
return s, bidi, runeError(r)
}
i += sz
}
return s, err
return s, bidi, nil
}
func validateAndMap(p *Profile, s string) (string, error) {
func (c info) isBidi(s string) bool {
if !c.isMapped() {
return c&attributesMask == rtl
}
// TODO: also store bidi info for mapped data. This is possible, but a bit
// cumbersome and not for the common case.
p, _ := bidi.LookupString(s)
switch p.Class() {
case bidi.R, bidi.AL, bidi.AN:
return true
}
return false
}
func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) {
var (
err error
b []byte
k int
b []byte
k int
)
// combinedInfoBits contains the or-ed bits of all runes. We use this
// to derive the mayNeedNorm bit later. This may trigger normalization
// overeagerly, but it will not do so in the common case. The end result
// is another 10% saving on BenchmarkProfile for the common case.
var combinedInfoBits info
for i := 0; i < len(s); {
v, sz := trie.lookupString(s[i:])
combinedInfoBits |= info(v)
bidi = bidi || info(v).isBidi(s[i:])
start := i
i += sz
// Copy bytes not copied so far.
......@@ -408,7 +456,7 @@ func validateAndMap(p *Profile, s string) (string, error) {
continue
case disallowed:
if err == nil {
r, _ := utf8.DecodeRuneInString(s[i:])
r, _ := utf8.DecodeRuneInString(s[start:])
err = runeError(r)
}
continue
......@@ -426,7 +474,9 @@ func validateAndMap(p *Profile, s string) (string, error) {
}
if k == 0 {
// No changes so far.
s = norm.NFC.String(s)
if combinedInfoBits&mayNeedNorm != 0 {
s = norm.NFC.String(s)
}
} else {
b = append(b, s[k:]...)
if norm.NFC.QuickSpan(b) != len(b) {
......@@ -435,7 +485,7 @@ func validateAndMap(p *Profile, s string) (string, error) {
// TODO: the punycode converters require strings as input.
s = string(b)
}
return s, err
return s, bidi, err
}
// A labelIter allows iterating over domain name labels.
......@@ -530,6 +580,8 @@ func validateFromPunycode(p *Profile, s string) error {
if !norm.NFC.IsNormalString(s) {
return &labelError{s, "V1"}
}
// TODO: detect whether string may have to be normalized in the following
// loop.
for i := 0; i < len(s); {
v, sz := trie.lookupString(s[i:])
if c := p.simplify(info(v).category()); c != valid && c != deviation {
......@@ -604,16 +656,13 @@ var joinStates = [][numJoinTypes]joinState{
// validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are
// already implicitly satisfied by the overall implementation.
func (p *Profile) validateLabel(s string) error {
func (p *Profile) validateLabel(s string) (err error) {
if s == "" {
if p.verifyDNSLength {
return &labelError{s, "A4"}
}
return nil
}
if p.bidirule != nil && !p.bidirule(s) {
return &labelError{s, "B"}
}
if !p.validateLabels {
return nil
}
......
This diff is collapsed.
......@@ -28,9 +28,9 @@ package idna
// 15..3 index into xor or mapping table
// }
// } else {
// 15..13 unused
// 12 modifier (including virama)
// 11 virama modifier
// 15..14 unused
// 13 mayNeedNorm
// 12..11 attributes
// 10..8 joining type
// 7..3 category type
// }
......@@ -51,15 +51,20 @@ const (
joinShift = 8
joinMask = 0x07
viramaModifier = 0x0800
// Attributes
attributesMask = 0x1800
viramaModifier = 0x1800
modifier = 0x1000
rtl = 0x0800
mayNeedNorm = 0x2000
)
// A category corresponds to a category defined in the IDNA mapping table.
type category uint16
const (
unknown category = 0 // not defined currently in unicode.
unknown category = 0 // not currently defined in unicode.
mapped category = 1
disallowedSTD3Mapped category = 2
deviation category = 3
......@@ -112,5 +117,5 @@ func (c info) isModifier() bool {
}
func (c info) isViramaModifier() bool {
return c&(viramaModifier|catSmallMask) == viramaModifier
return c&(attributesMask|catSmallMask) == viramaModifier
}
......@@ -157,6 +157,7 @@ func DirectionString(s string) bidi.Direction {
e, sz := bidi.LookupString(s[i:])
if sz == 0 {
i++
continue
}
c := e.Class()
if c == bidi.R || c == bidi.AL || c == bidi.AN {
......@@ -205,9 +206,6 @@ func (t *Transformer) isRTL() bool {
}
func (t *Transformer) isFinal() bool {
if !t.isRTL() {
return true
}
return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial
}
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -35,17 +35,9 @@ const (
// streamSafe implements the policy of when a CGJ should be inserted.
type streamSafe uint8
// mkStreamSafe is a shorthand for declaring a streamSafe var and calling
// first on it.
func mkStreamSafe(p Properties) streamSafe {
return streamSafe(p.nTrailingNonStarters())
}
// first inserts the first rune of a segment.
// first inserts the first rune of a segment. It is a faster version of next if
// it is known p represents the first rune in a segment.
func (ss *streamSafe) first(p Properties) {
if *ss != 0 {
panic("!= 0")
}
*ss = streamSafe(p.nTrailingNonStarters())
}
......@@ -68,7 +60,7 @@ func (ss *streamSafe) next(p Properties) ssState {
// be a non-starter. Note that it always hold that if nLead > 0 then
// nLead == nTrail.
if n == 0 {
*ss = 0
*ss = streamSafe(p.nTrailingNonStarters())
return ssStarter
}
return ssSuccess
......@@ -144,7 +136,6 @@ func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
func (rb *reorderBuffer) reset() {
rb.nrune = 0
rb.nbyte = 0
rb.ss = 0
}
func (rb *reorderBuffer) doFlush() bool {
......@@ -259,6 +250,9 @@ func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
// It flushes the buffer on each new segment start.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
rb.tmpBytes.setBytes(dcomp)
// As the streamSafe accounting already handles the counting for modifiers,
// we don't have to call next. However, we do need to keep the accounting
// intact when flushing the buffer.
for i := 0; i < len(dcomp); {
info := rb.f.info(rb.tmpBytes, i)
if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
......
......@@ -92,16 +92,20 @@ func (in *input) charinfoNFKC(p int) (uint16, int) {
}
func (in *input) hangul(p int) (r rune) {
var size int
if in.bytes == nil {
if !isHangulString(in.str[p:]) {
return 0
}
r, _ = utf8.DecodeRuneInString(in.str[p:])
r, size = utf8.DecodeRuneInString(in.str[p:])
} else {
if !isHangul(in.bytes[p:]) {
return 0
}
r, _ = utf8.DecodeRune(in.bytes[p:])
r, size = utf8.DecodeRune(in.bytes[p:])
}
if size != hangulUTF8Size {
return 0
}
return r
}
......@@ -43,6 +43,7 @@ func (i *Iter) Init(f Form, src []byte) {
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIBytes
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
}
// InitString initializes i to iterate over src after normalizing it to Form f.
......@@ -58,11 +59,12 @@ func (i *Iter) InitString(f Form, src string) {
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIString
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
}
// Seek sets the segment to be returned by the next call to Next to start
// at position p. It is the responsibility of the caller to set p to the
// start of a UTF8 rune.
// start of a segment.
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
var abs int64
switch whence {
......@@ -86,6 +88,7 @@ func (i *Iter) Seek(offset int64, whence int) (int64, error) {
i.multiSeg = nil
i.next = i.rb.f.nextMain
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
return abs, nil
}
......@@ -163,6 +166,7 @@ func nextHangul(i *Iter) []byte {
if next >= i.rb.nsrc {
i.setDone()
} else if i.rb.src.hangul(next) == 0 {
i.rb.ss.next(i.info)
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
......@@ -206,12 +210,10 @@ func nextMultiNorm(i *Iter) []byte {
if info.BoundaryBefore() {
i.rb.compose()
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
i.rb.ss.first(info)
i.rb.insertUnsafe(input{bytes: d}, j, info)
i.multiSeg = d[j+int(info.size):]
return seg
}
i.rb.ss.next(info)
i.rb.insertUnsafe(input{bytes: d}, j, info)
j += int(info.size)
}
......@@ -224,9 +226,9 @@ func nextMultiNorm(i *Iter) []byte {
func nextDecomposed(i *Iter) (next []byte) {
outp := 0
inCopyStart, outCopyStart := i.p, 0
ss := mkStreamSafe(i.info)
for {
if sz := int(i.info.size); sz <= 1 {
i.rb.ss = 0
p := i.p
i.p++ // ASCII or illegal byte. Either way, advance by 1.
if i.p >= i.rb.nsrc {
......@@ -245,6 +247,8 @@ func nextDecomposed(i *Iter) (next []byte) {
p := outp + len(d)
if outp > 0 {
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
// TODO: this condition should not be possible, but we leave it
// in for defensive purposes.
if p > len(i.buf) {
return i.buf[:outp]
}
......@@ -268,7 +272,7 @@ func nextDecomposed(i *Iter) (next []byte) {
} else {
i.info = i.rb.f.info(i.rb.src, i.p)
}
switch ss.next(i.info) {
switch i.rb.ss.next(i.info) {
case ssOverflow:
i.next = nextCGJDecompose
fallthrough
......@@ -311,7 +315,7 @@ func nextDecomposed(i *Iter) (next []byte) {
}
prevCC := i.info.tccc
i.info = i.rb.f.info(i.rb.src, i.p)
if v := ss.next(i.info); v == ssStarter {
if v := i.rb.ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJDecompose
......@@ -337,10 +341,6 @@ doNorm:
func doNormDecomposed(i *Iter) []byte {
for {
if s := i.rb.ss.next(i.info); s == ssOverflow {
i.next = nextCGJDecompose
break
}
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
i.setDone()
......@@ -350,6 +350,10 @@ func doNormDecomposed(i *Iter) []byte {
if i.info.ccc == 0 {
break
}
if s := i.rb.ss.next(i.info); s == ssOverflow {
i.next = nextCGJDecompose
break
}
}
// new segment or too many combining characters: exit normalization
return i.buf[:i.rb.flushCopy(i.buf[:])]
......@@ -359,6 +363,7 @@ func nextCGJDecompose(i *Iter) []byte {
i.rb.ss = 0
i.rb.insertCGJ()
i.next = nextDecomposed
i.rb.ss.first(i.info)
buf := doNormDecomposed(i)
return buf
}
......@@ -367,7 +372,6 @@ func nextCGJDecompose(i *Iter) []byte {
func nextComposed(i *Iter) []byte {
outp, startp := 0, i.p
var prevCC uint8
ss := mkStreamSafe(i.info)
for {
if !i.info.isYesC() {
goto doNorm
......@@ -387,11 +391,12 @@ func nextComposed(i *Iter) []byte {
i.setDone()
break
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
i.rb.ss = 0
i.next = i.asciiF
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if v := ss.next(i.info); v == ssStarter {
if v := i.rb.ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJCompose
......@@ -403,8 +408,10 @@ func nextComposed(i *Iter) []byte {
}
return i.returnSlice(startp, i.p)
doNorm:
// reset to start position
i.p = startp
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
if i.info.multiSegment() {
d := i.info.Decomposition()
info := i.rb.f.info(input{bytes: d}, 0)
......
......@@ -324,7 +324,6 @@ func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool)
// have an overflow for runes that are starters (e.g. with U+FF9E).
switch ss.next(info) {
case ssStarter:
ss.first(info)
lastSegStart = i
case ssOverflow:
return lastSegStart, false
......@@ -441,6 +440,8 @@ func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
}
return -1
}
// TODO: Using streamSafe to determine the boundary isn't the same as
// using BoundaryBefore. Determine which should be used.
if s := ss.next(info); s != ssSuccess {
return i
}
......@@ -505,15 +506,14 @@ func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
if info.size == 0 {
return 0
}
if rb.nrune > 0 {
if s := rb.ss.next(info); s == ssStarter {
goto end
} else if s == ssOverflow {
rb.insertCGJ()
if s := rb.ss.next(info); s == ssStarter {
// TODO: this could be removed if we don't support merging.
if rb.nrune > 0 {
goto end
}
} else {
rb.ss.first(info)
} else if s == ssOverflow {
rb.insertCGJ()
goto end
}
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
return int(err)
......
......@@ -42,7 +42,7 @@ func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
}
func flushTransform(rb *reorderBuffer) bool {
// Write out (must fully fit in dst, or else it is a ErrShortDst).
// Write out (must fully fit in dst, or else it is an ErrShortDst).
if len(rb.out) < rb.nrune*utf8.UTFMax {
return false
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment