Commit fe15da62 authored by Marcel van Lohuizen, committed by Russ Cox

unicode: upgrade to 8.0.0

Not sure if I'm on time for 1.5; Unicode 8 just got released.

Straightforward upgrade. Only changed maketables.go to prevent it from adding
the Cherokee upper and lower case mappings. This change causes the caseOrbit
table to NOT change. Added tests to verify that the relevant functions still
produce the correct result, even for Cherokee.

Fixes #11309

Change-Id: I42850f5b3399bde125b002efc78eff96dbd86a08
Reviewed-on: https://go-review.googlesource.com/11286
Reviewed-by: Russ Cox <rsc@golang.org>
parent 834fef80
......@@ -328,3 +328,4 @@ pkg syscall (netbsd-arm), type IfMsghdr struct, Pad_cgo_1 [4]uint8
pkg syscall (netbsd-arm-cgo), const SizeofIfData = 132
pkg syscall (netbsd-arm-cgo), type IfMsghdr struct, Pad_cgo_1 [4]uint8
pkg unicode, const Version = "6.3.0"
pkg unicode, const Version = "7.0.0"
......@@ -948,3 +948,10 @@ pkg syscall (openbsd-amd64-cgo), type SysProcAttr struct, Pgid int
pkg text/template, method (*Template) DefinedTemplates() string
pkg text/template, method (*Template) Option(...string) *Template
pkg time, method (Time) AppendFormat([]uint8, string) []uint8
pkg unicode, const Version = "8.0.0"
pkg unicode, var Ahom *RangeTable
pkg unicode, var Anatolian_Hieroglyphs *RangeTable
pkg unicode, var Hatran *RangeTable
pkg unicode, var Multani *RangeTable
pkg unicode, var Old_Hungarian *RangeTable
pkg unicode, var SignWriting *RangeTable
......@@ -7,7 +7,7 @@
package strconv
// (468+138+67)*2 + (326)*4 = 2650 bytes
// (470+136+73)*2 + (342)*4 = 2726 bytes
var isPrint16 = []uint16{
0x0020, 0x007e,
......@@ -26,8 +26,8 @@ var isPrint16 = []uint16{
0x0800, 0x082d,
0x0830, 0x085b,
0x085e, 0x085e,
0x08a0, 0x08b2,
0x08e4, 0x098c,
0x08a0, 0x08b4,
0x08e3, 0x098c,
0x098f, 0x0990,
0x0993, 0x09b2,
0x09b6, 0x09b9,
......@@ -51,6 +51,7 @@ var isPrint16 = []uint16{
0x0ad0, 0x0ad0,
0x0ae0, 0x0ae3,
0x0ae6, 0x0af1,
0x0af9, 0x0af9,
0x0b01, 0x0b0c,
0x0b0f, 0x0b10,
0x0b13, 0x0b39,
......@@ -73,7 +74,7 @@ var isPrint16 = []uint16{
0x0be6, 0x0bfa,
0x0c00, 0x0c39,
0x0c3d, 0x0c4d,
0x0c55, 0x0c59,
0x0c55, 0x0c5a,
0x0c60, 0x0c63,
0x0c66, 0x0c6f,
0x0c78, 0x0cb9,
......@@ -84,7 +85,7 @@ var isPrint16 = []uint16{
0x0d01, 0x0d3a,
0x0d3d, 0x0d4e,
0x0d57, 0x0d57,
0x0d60, 0x0d63,
0x0d5f, 0x0d63,
0x0d66, 0x0d75,
0x0d79, 0x0d7f,
0x0d82, 0x0d96,
......@@ -117,7 +118,8 @@ var isPrint16 = []uint16{
0x1318, 0x135a,
0x135d, 0x137c,
0x1380, 0x1399,
0x13a0, 0x13f4,
0x13a0, 0x13f5,
0x13f8, 0x13fd,
0x1400, 0x169c,
0x16a0, 0x16f8,
0x1700, 0x1714,
......@@ -167,9 +169,9 @@ var isPrint16 = []uint16{
0x2030, 0x205e,
0x2070, 0x2071,
0x2074, 0x209c,
0x20a0, 0x20bd,
0x20a0, 0x20be,
0x20d0, 0x20f0,
0x2100, 0x2189,
0x2100, 0x218b,
0x2190, 0x23fa,
0x2400, 0x2426,
0x2440, 0x244a,
......@@ -177,6 +179,7 @@ var isPrint16 = []uint16{
0x2b76, 0x2b95,
0x2b98, 0x2bb9,
0x2bbd, 0x2bd1,
0x2bec, 0x2bef,
0x2c00, 0x2cf3,
0x2cf9, 0x2d27,
0x2d2d, 0x2d2d,
......@@ -193,19 +196,19 @@ var isPrint16 = []uint16{
0x3131, 0x31ba,
0x31c0, 0x31e3,
0x31f0, 0x4db5,
0x4dc0, 0x9fcc,
0x4dc0, 0x9fd5,
0xa000, 0xa48c,
0xa490, 0xa4c6,
0xa4d0, 0xa62b,
0xa640, 0xa6f7,
0xa700, 0xa7ad,
0xa7b0, 0xa7b1,
0xa7b0, 0xa7b7,
0xa7f7, 0xa82b,
0xa830, 0xa839,
0xa840, 0xa877,
0xa880, 0xa8c4,
0xa8ce, 0xa8d9,
0xa8e0, 0xa8fb,
0xa8e0, 0xa8fd,
0xa900, 0xa953,
0xa95f, 0xa97c,
0xa980, 0xa9d9,
......@@ -217,9 +220,8 @@ var isPrint16 = []uint16{
0xab01, 0xab06,
0xab09, 0xab0e,
0xab11, 0xab16,
0xab20, 0xab5f,
0xab64, 0xab65,
0xabc0, 0xabed,
0xab20, 0xab65,
0xab70, 0xabed,
0xabf0, 0xabf9,
0xac00, 0xd7a3,
0xd7b0, 0xd7c6,
......@@ -234,8 +236,7 @@ var isPrint16 = []uint16{
0xfd92, 0xfdc7,
0xfdf0, 0xfdfd,
0xfe00, 0xfe19,
0xfe20, 0xfe2d,
0xfe30, 0xfe6b,
0xfe20, 0xfe6b,
0xfe70, 0xfefc,
0xff01, 0xffbe,
0xffc2, 0xffc7,
......@@ -370,8 +371,6 @@ var isNotPrint16 = []uint16{
0x318f,
0x321f,
0x32ff,
0xa69e,
0xa78f,
0xa9ce,
0xa9ff,
0xab27,
......@@ -418,12 +417,13 @@ var isPrint32 = []uint32{
0x01083c, 0x01083c,
0x01083f, 0x01089e,
0x0108a7, 0x0108af,
0x010900, 0x01091b,
0x0108e0, 0x0108f5,
0x0108fb, 0x01091b,
0x01091f, 0x010939,
0x01093f, 0x01093f,
0x010980, 0x0109b7,
0x0109be, 0x0109bf,
0x010a00, 0x010a06,
0x0109bc, 0x0109cf,
0x0109d2, 0x010a06,
0x010a0c, 0x010a33,
0x010a38, 0x010a3a,
0x010a3f, 0x010a47,
......@@ -438,6 +438,9 @@ var isPrint32 = []uint32{
0x010b99, 0x010b9c,
0x010ba9, 0x010baf,
0x010c00, 0x010c48,
0x010c80, 0x010cb2,
0x010cc0, 0x010cf2,
0x010cfa, 0x010cff,
0x010e60, 0x010e7e,
0x011000, 0x01104d,
0x011052, 0x01106f,
......@@ -446,19 +449,19 @@ var isPrint32 = []uint32{
0x0110f0, 0x0110f9,
0x011100, 0x011143,
0x011150, 0x011176,
0x011180, 0x0111c8,
0x0111cd, 0x0111cd,
0x0111d0, 0x0111da,
0x0111e1, 0x0111f4,
0x011180, 0x0111cd,
0x0111d0, 0x0111f4,
0x011200, 0x01123d,
0x011280, 0x0112a9,
0x0112b0, 0x0112ea,
0x0112f0, 0x0112f9,
0x011301, 0x01130c,
0x011300, 0x01130c,
0x01130f, 0x011310,
0x011313, 0x011339,
0x01133c, 0x011344,
0x011347, 0x011348,
0x01134b, 0x01134d,
0x011350, 0x011350,
0x011357, 0x011357,
0x01135d, 0x011363,
0x011366, 0x01136c,
......@@ -466,17 +469,22 @@ var isPrint32 = []uint32{
0x011480, 0x0114c7,
0x0114d0, 0x0114d9,
0x011580, 0x0115b5,
0x0115b8, 0x0115c9,
0x0115b8, 0x0115dd,
0x011600, 0x011644,
0x011650, 0x011659,
0x011680, 0x0116b7,
0x0116c0, 0x0116c9,
0x011700, 0x011719,
0x01171d, 0x01172b,
0x011730, 0x01173f,
0x0118a0, 0x0118f2,
0x0118ff, 0x0118ff,
0x011ac0, 0x011af8,
0x012000, 0x012398,
0x012000, 0x012399,
0x012400, 0x012474,
0x012480, 0x012543,
0x013000, 0x01342e,
0x014400, 0x014646,
0x016800, 0x016a38,
0x016a40, 0x016a69,
0x016a6e, 0x016a6f,
......@@ -497,7 +505,7 @@ var isPrint32 = []uint32{
0x01d000, 0x01d0f5,
0x01d100, 0x01d126,
0x01d129, 0x01d172,
0x01d17b, 0x01d1dd,
0x01d17b, 0x01d1e8,
0x01d200, 0x01d245,
0x01d300, 0x01d356,
0x01d360, 0x01d371,
......@@ -508,7 +516,8 @@ var isPrint32 = []uint32{
0x01d50d, 0x01d546,
0x01d54a, 0x01d6a5,
0x01d6a8, 0x01d7cb,
0x01d7ce, 0x01d7ff,
0x01d7ce, 0x01da8b,
0x01da9b, 0x01daaf,
0x01e800, 0x01e8c4,
0x01e8c7, 0x01e8d6,
0x01ee00, 0x01ee24,
......@@ -530,13 +539,7 @@ var isPrint32 = []uint32{
0x01f210, 0x01f23a,
0x01f240, 0x01f248,
0x01f250, 0x01f251,
0x01f300, 0x01f32c,
0x01f330, 0x01f37d,
0x01f380, 0x01f3ce,
0x01f3d4, 0x01f3f7,
0x01f400, 0x01f54a,
0x01f550, 0x01f642,
0x01f645, 0x01f6cf,
0x01f300, 0x01f6d0,
0x01f6e0, 0x01f6ec,
0x01f6f0, 0x01f6f3,
0x01f700, 0x01f773,
......@@ -546,9 +549,13 @@ var isPrint32 = []uint32{
0x01f850, 0x01f859,
0x01f860, 0x01f887,
0x01f890, 0x01f8ad,
0x01f910, 0x01f918,
0x01f980, 0x01f984,
0x01f9c0, 0x01f9c0,
0x020000, 0x02a6d6,
0x02a700, 0x02b734,
0x02b740, 0x02b81d,
0x02b820, 0x02cea1,
0x02f800, 0x02fa1d,
0x0e0100, 0x0e01ef,
}
......@@ -562,12 +569,18 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0x0809,
0x0836,
0x0856,
0x08f3,
0x0a04,
0x0a14,
0x0a18,
0x10bd,
0x1135,
0x11e0,
0x1212,
0x1287,
0x1289,
0x128e,
0x129e,
0x1304,
0x1329,
0x1331,
......@@ -589,6 +602,7 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0xd53f,
0xd545,
0xd551,
0xdaa0,
0xee04,
0xee20,
0xee23,
......@@ -618,7 +632,6 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0xf0c0,
0xf0d0,
0xf12f,
0xf4ff,
0xf57a,
0xf5a4,
}
......@@ -24,6 +24,7 @@ var upperTest = []rune{
0x181,
0x376,
0x3cf,
0x13bd,
0x1f2a,
0x2102,
0x2c00,
......@@ -46,6 +47,7 @@ var notupperTest = []rune{
0x377,
0x387,
0x2150,
0xab7d,
0xffff,
0x10000,
}
......@@ -194,6 +196,15 @@ var caseTest = []caseT{
{LowerCase, 0x0148, 0x0148},
{TitleCase, 0x0148, 0x0147},
// Lowercase lower than uppercase.
// AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8
{UpperCase, 0xab78, 0x13a8},
{LowerCase, 0xab78, 0xab78},
{TitleCase, 0xab78, 0x13a8},
{UpperCase, 0x13a8, 0x13a8},
{LowerCase, 0x13a8, 0xab78},
{TitleCase, 0x13a8, 0x13a8},
// Last block in the 5.1.0 table
// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
{UpperCase, 0x10400, 0x10400},
......@@ -405,6 +416,9 @@ var simpleFoldTests = []string{
// Extra special cases: has lower/upper but no case fold.
"İ",
"ı",
// Upper comes before lower (Cherokee).
"\u13b0\uab80",
}
func TestSimpleFold(t *testing.T) {
......
......@@ -44,7 +44,7 @@ func main() {
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
var url = flag.String("url",
"http://www.unicode.org/Public/7.0.0/ucd/",
"http://www.unicode.org/Public/8.0.0/ucd/",
"URL of Unicode database directory")
var tablelist = flag.String("tables",
"all",
......@@ -1152,11 +1152,14 @@ func printCasefold() {
}
}
// Delete the groups for which assuming [lower, upper] is right.
// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
for i, orb := range caseOrbit {
if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
caseOrbit[i] = nil
}
if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
caseOrbit[i] = nil
}
}
// Record orbit information in chars.
......
......@@ -14,9 +14,11 @@ type T struct {
script string
}
// Hand-chosen tests from Unicode 5.1.0, 6.0.0, 6.2.0, 6.3.0 and 7.0.0 mostly to
// discover when new scripts and categories arise.
// Hand-chosen tests from Unicode 5.1.0, 6.0.0, 6.2.0, 6.3.0, 7.0.0 and 8.0.0
// mostly to discover when new scripts and categories arise.
var inTest = []T{
{0x11711, "Ahom"},
{0x14646, "Anatolian_Hieroglyphs"},
{0x06e2, "Arabic"},
{0x0567, "Armenian"},
{0x10b20, "Avestan"},
......@@ -58,6 +60,7 @@ var inTest = []T{
{0x3028, "Han"},
{0x11b8, "Hangul"},
{0x1727, "Hanunoo"},
{0x108FF, "Hatran"},
{0x05a0, "Hebrew"},
{0x3058, "Hiragana"},
{0x10841, "Imperial_Aramaic"},
......@@ -94,12 +97,14 @@ var inTest = []T{
{0x11611, "Modi"},
{0x1822, "Mongolian"},
{0x16a60, "Mro"},
{0x11293, "Multani"},
{0x104c, "Myanmar"},
{0x10880, "Nabataean"},
{0x19c3, "New_Tai_Lue"},
{0x07f8, "Nko"},
{0x169b, "Ogham"},
{0x1c6a, "Ol_Chiki"},
{0x10C80, "Old_Hungarian"},
{0x10310, "Old_Italic"},
{0x10a80, "Old_North_Arabian"},
{0x10350, "Old_Permic"},
......@@ -121,6 +126,7 @@ var inTest = []T{
{0x111a0, "Sharada"},
{0x10463, "Shavian"},
{0x115c1, "Siddham"},
{0x1D920, "SignWriting"},
{0x0dbd, "Sinhala"},
{0x110d0, "Sora_Sompeng"},
{0x1ba3, "Sundanese"},
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment