exp/norm: Added regression test tool for the standard Unicode test set.

R=r CC=golang-dev https://golang.org/cl/4973064

exp/norm: Added regression test tool for the standard Unicode test set.
R=r CC=golang-dev https://golang.org/cl/4973064
efea5d0f · Marcel van Lohuizen · 40d85fb0 · efea5d0f · efea5d0f
Commit efea5d0f authored Sep 13, 2011 by Marcel van Lohuizen
Hide whitespace changes
Inline Side-by-side

Showing with 304 additions and 1 deletion

Makefile src/pkg/exp/norm/Makefile +8 -1

normregtest.go src/pkg/exp/norm/normregtest.go +296 -0

No files found.
--- a/src/pkg/exp/norm/Makefile
+++ b/src/pkg/exp/norm/Makefile
@@ -25,6 +25,10 @@ maketesttables: maketesttables.go triegen.go
 	$(GC) maketesttables.go triegen.go
 	$(LD) -o maketesttables maketesttables.$O

+normregtest: normregtest.go
+	$(GC) normregtest.go
+	$(LD) -o normregtest normregtest.$O
+
 tables:	maketables
 	./maketables > tables.go
 	gofmt -w tables.go
@@ -39,7 +43,10 @@ testshort: maketables maketesttables

 # Downloads from www.unicode.org, so not part
 # of standard test scripts.
-test: testtables
+test: testtables regtest

 testtables: maketables
 	./maketables -test -tables=
+
+regtest: normregtest
+	./normregtest
--- a/src/pkg/exp/norm/normregtest.go
+++ b/src/pkg/exp/norm/normregtest.go
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"exp/norm"
+	"exp/regexp"
+	"flag"
+	"fmt"
+	"http"
+	"log"
+	"os"
+	"path"
+	"runtime"
+	"strings"
+	"strconv"
+	"time"
+	"utf8"
+)
+
+func main() {
+	flag.Parse()
+	loadTestData()
+	CharacterByCharacterTests()
+	StandardTests()
+	PerformanceTest()
+	if errorCount == 0 {
+		fmt.Println("PASS")
+	}
+}
+
+const file = "NormalizationTest.txt"
+
+var url = flag.String("url",
+	"http://www.unicode.org/Public/6.0.0/ucd/"+file,
+	"URL of Unicode database directory")
+var localFiles = flag.Bool("local",
+	false,
+	"data files have been copied to the current directory; for debugging only")
+
+var logger = log.New(os.Stderr, "", log.Lshortfile)
+
+// This regression test runs the test set in NormalizationTest.txt
+// (taken from http://www.unicode.org/Public/6.0.0/ucd/).
+//
+// NormalizationTest.txt has form:
+// @Part0 # Specific cases
+// #
+// 1E0A;1E0A;0044 0307;1E0A;0044 0307; # (Ḋ; Ḋ; D◌̇; Ḋ; D◌̇; ) LATIN CAPITAL LETTER D WITH DOT ABOVE
+// 1E0C;1E0C;0044 0323;1E0C;0044 0323; # (Ḍ; Ḍ; D◌̣; Ḍ; D◌̣; ) LATIN CAPITAL LETTER D WITH DOT BELOW
+//
+// Each test has 5 columns (c1, c2, c3, c4, c5), where 
+// (c1, c2, c3, c4, c5) == (c1, NFC(c1), NFD(c1), NFKC(c1), NFKD(c1))
+//
+// CONFORMANCE:
+// 1. The following invariants must be true for all conformant implementations
+//
+//    NFC
+//      c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
+//      c4 ==  NFC(c4) ==  NFC(c5)
+//
+//    NFD
+//      c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
+//      c5 ==  NFD(c4) ==  NFD(c5)
+//
+//    NFKC
+//      c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
+//
+//    NFKD
+//      c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
+//
+// 2. For every code point X assigned in this version of Unicode that is not
+//    specifically listed in Part 1, the following invariants must be true
+//    for all conformant implementations:
+//
+//      X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
+//
+
+// Column types.
+const (
+	cRaw = iota
+	cNFC
+	cNFD
+	cNFKC
+	cNFKD
+	cMaxColumns
+)
+
+// Holds data from NormalizationTest.txt
+var part []Part
+
+type Part struct {
+	name   string
+	number int
+	tests  []Test
+}
+
+type Test struct {
+	name   string
+	partnr int
+	number int
+	rune   int                 // used for character by character test
+	cols   [cMaxColumns]string // Each has 5 entries, see below.
+}
+
+func (t Test) Name() string {
+	if t.number < 0 {
+		return part[t.partnr].name
+	}
+	return fmt.Sprintf("%s:%d", part[t.partnr].name, t.number)
+}
+
+var partRe = regexp.MustCompile(`@Part(\d) # (.*)\n`) // TODO: using $ iso \n does not work
+// TODO: the following regexp does not work: `^(?:([\dA-F ]+);){5} # (.*)\n`
+var testRe = regexp.MustCompile(`^(?:([\dA-F ]+);)(?:([\dA-F ]+);)(?:([\dA-F ]+);)(?:([\dA-F ]+);)(?:([\dA-F ]+);) # (.*)\n`)
+
+var counter int
+
+// Load the data form NormalizationTest.txt
+func loadTestData() {
+	if *localFiles {
+		pwd, _ := os.Getwd()
+		*url = "file://" + path.Join(pwd, file)
+	}
+	t := &http.Transport{}
+	t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
+	c := &http.Client{Transport: t}
+	resp, err := c.Get(*url)
+	if err != nil {
+		logger.Fatal(err)
+	}
+	if resp.StatusCode != 200 {
+		logger.Fatal("bad GET status for "+file, resp.Status)
+	}
+	f := resp.Body
+	defer f.Close()
+	input := bufio.NewReader(f)
+	for {
+		line, err := input.ReadString('\n')
+		if err != nil {
+			if err == os.EOF {
+				break
+			}
+			logger.Fatal(err)
+		}
+		if len(line) == 0 || line[0] == '#' {
+			continue
+		}
+		m := partRe.FindStringSubmatch(line)
+		if m != nil {
+			if len(m) < 3 {
+				logger.Fatal("Failed to parse Part: ", line)
+			}
+			i, err := strconv.Atoi(m[1])
+			if err != nil {
+				logger.Fatal(err)
+			}
+			name := m[2]
+			part = append(part, Part{name: name[:len(name)-1], number: i})
+			continue
+		}
+		m = testRe.FindStringSubmatch(line)
+		if m == nil || len(m) < 7 {
+			logger.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
+		}
+		test := Test{name: m[6], partnr: len(part) - 1, number: counter}
+		counter++
+		for j := 1; j < len(m)-1; j++ {
+			for _, split := range strings.Split(m[j], " ") {
+				r, err := strconv.Btoui64(split, 16)
+				if err != nil {
+					logger.Fatal(err)
+				}
+				if test.rune == 0 {
+					// save for CharacterByCharacterTests
+					test.rune = int(r)
+				}
+				var buf [utf8.UTFMax]byte
+				sz := utf8.EncodeRune(buf[:], int(r))
+				test.cols[j-1] += string(buf[:sz])
+			}
+		}
+		part := &part[len(part)-1]
+		part.tests = append(part.tests, test)
+	}
+}
+
+var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
+
+var errorCount int
+
+func cmpResult(t *Test, name string, f norm.Form, gold, test, result string) {
+	if gold != result {
+		errorCount++
+		if errorCount > 20 {
+			return
+		}
+		st, sr, sg := []int(test), []int(result), []int(gold)
+		logger.Printf("%s:%s: %s(%X)=%X; want:%X: %s",
+			t.Name(), name, fstr[f], st, sr, sg, t.name)
+	}
+}
+
+func cmpIsNormal(t *Test, name string, f norm.Form, test string, result, want bool) {
+	if result != want {
+		errorCount++
+		if errorCount > 20 {
+			return
+		}
+		logger.Printf("%s:%s: %s(%X)=%v; want: %v", t.Name(), name, fstr[f], []int(test), result, want)
+	}
+}
+
+func doTest(t *Test, f norm.Form, gold, test string) {
+	result := f.Bytes([]byte(test))
+	cmpResult(t, "Bytes", f, gold, test, string(result))
+	for i := range test {
+		out := f.Append(f.Bytes([]byte(test[:i])), []byte(test[i:])...)
+		cmpResult(t, fmt.Sprintf(":Append:%d", i), f, gold, test, string(out))
+	}
+	cmpIsNormal(t, "IsNormal", f, test, f.IsNormal([]byte(test)), test == gold)
+}
+
+func doConformanceTests(t *Test, partn int) {
+	for i := 0; i <= 2; i++ {
+		doTest(t, norm.NFC, t.cols[1], t.cols[i])
+		doTest(t, norm.NFD, t.cols[2], t.cols[i])
+		doTest(t, norm.NFKC, t.cols[3], t.cols[i])
+		doTest(t, norm.NFKD, t.cols[4], t.cols[i])
+	}
+	for i := 3; i <= 4; i++ {
+		doTest(t, norm.NFC, t.cols[3], t.cols[i])
+		doTest(t, norm.NFD, t.cols[4], t.cols[i])
+		doTest(t, norm.NFKC, t.cols[3], t.cols[i])
+		doTest(t, norm.NFKD, t.cols[4], t.cols[i])
+	}
+}
+
+func CharacterByCharacterTests() {
+	tests := part[1].tests
+	last := 0
+	for i := 0; i <= len(tests); i++ { // last one is special case
+		var rune int
+		if i == len(tests) {
+			rune = 0x2FA1E // Don't have to go to 0x10FFFF
+		} else {
+			rune = tests[i].rune
+		}
+		for last++; last < rune; last++ {
+			// Check all characters that were not explicitly listed in the test.
+			t := &Test{partnr: 1, number: -1}
+			char := string(last)
+			doTest(t, norm.NFC, char, char)
+			doTest(t, norm.NFD, char, char)
+			doTest(t, norm.NFKC, char, char)
+			doTest(t, norm.NFKD, char, char)
+		}
+		if i < len(tests) {
+			doConformanceTests(&tests[i], 1)
+		}
+	}
+}
+
+func StandardTests() {
+	for _, j := range []int{0, 2, 3} {
+		for _, test := range part[j].tests {
+			doConformanceTests(&test, j)
+		}
+	}
+}
+
+// PerformanceTest verifies that normalization is O(n). If any of the
+// code does not properly check for maxCombiningChars, normalization
+// may exhibit O(n**2) behavior.
+func PerformanceTest() {
+	runtime.GOMAXPROCS(2)
+	success := make(chan bool, 1)
+	go func() {
+		buf := bytes.Repeat([]byte("\u035D"), 1024*1024)
+		buf = append(buf, []byte("\u035B")...)
+		norm.NFC.Append(nil, buf...)
+		success <- true
+	}()
+	timeout := time.After(1e9)
+	select {
+	case <-success:
+		// test completed before the timeout
+	case <-timeout:
+		errorCount++
+		logger.Printf(`unexpectedly long time to complete PerformanceTest`)
+	}
+}