Commit 5e47b779 authored by Marcel van Lohuizen's avatar Marcel van Lohuizen

exp/locale/collate/tools/colcmp: implementation of colcmp tool used for comparing

various implementations of collation.  The tool provides commands for sorting,
regressing one implementation against another, and benchmarking.
Currently it includes collation implementations for the Go collator, ICU,
and one using Darwin's CoreFoundation framework.
To avoid building this tool in the default build, the colcmp tag has been
added to all files. This allows other tools/colcmp in this directory (e.g. it may make
sense to move maketables here) to be put in this directory as well.

R=r, rsc, mpvl
CC=golang-dev
https://golang.org/cl/6496118
parent 0d82e698
# Copyright 2012 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
chars:
go run ../maketables.go -tables=chars -package=main > chars.go
gofmt -w chars.go
This source diff could not be displayed because it is too large. You can view the blob instead.
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
"exp/locale/collate"
"log"
"unicode/utf16"
)
// Input carries a single test string in the two encodings consumed by the
// collators under comparison: UTF-8 bytes and UTF-16 code units.
type Input struct {
	index int // used for restoring to original random order
	UTF8  []byte
	UTF16 []uint16
	key   []byte // used for sorting
}

// String returns the UTF-8 form of the input as a Go string.
func (in Input) String() string {
	return string(in.UTF8)
}

// makeInput wraps already-encoded UTF-8 and UTF-16 data in an Input.
// The slices are stored as given; they are not copied.
func makeInput(s8 []byte, s16 []uint16) Input {
	var in Input
	in.UTF8 = s8
	in.UTF16 = s16
	return in
}

// makeInputString builds an Input from s, encoding it both as UTF-8 and
// as UTF-16.
func makeInputString(s string) Input {
	runes := []rune(s)
	return makeInput([]byte(s), utf16.Encode(runes))
}
// Collator is an interface for architecture-specific implementations of collation.
// Each implementation receives an Input, which carries both the UTF-8 and the
// UTF-16 form of the string, and uses whichever encoding it needs.
type Collator interface {
// Key generates a sort key for the given input. Implementations
// may return nil if a collator does not support sort keys.
Key(s Input) []byte
// Compare returns -1 if a < b, 1 if a > b and 0 if a == b.
Compare(a, b Input) int
}
// CollatorFactory creates a Collator for a given locale.
// name is the identifier getCollator matches against when selecting an
// implementation, makeFn constructs the Collator for a locale, and
// description is free-form text about the implementation (presumably shown
// to the user; its consumer is not visible here).
type CollatorFactory struct {
name string
makeFn func(locale string) (Collator, error)
description string
}
// collators holds every registered factory, in registration order.
var collators = make([]CollatorFactory, 0)

// AddFactory registers f as a factory for an implementation of Collator.
// Factories are appended in call order; no check for duplicate names is made.
func AddFactory(f CollatorFactory) {
	registered := append(collators, f)
	collators = registered
}
// getCollator finds the first factory registered under name and uses it to
// build a Collator for locale. The process is terminated through log.Fatal
// if the factory reports an error and through log.Fatalf if no factory is
// registered under name.
func getCollator(name, locale string) Collator {
	for i := range collators {
		factory := &collators[i]
		if factory.name != name {
			continue
		}
		col, err := factory.makeFn(locale)
		if err != nil {
			log.Fatal(err)
		}
		return col
	}
	log.Fatalf("collator of type %q not found", name)
	return nil
}
// goCollator is an implementation of Collator using Go's own collator.
// buf is the scratch buffer handed to every Key and Compare call.
type goCollator struct {
	c   *collate.Collator
	buf collate.Buffer
}

// init registers the Go collator under the name "go".
func init() {
	AddFactory(CollatorFactory{
		name:        "go",
		makeFn:      newGoCollator,
		description: "Go's native collator implementation.",
	})
}

// newGoCollator returns a goCollator for locale. It never returns an error.
func newGoCollator(locale string) (Collator, error) {
	return &goCollator{c: collate.New(locale)}, nil
}

// Key returns the sort key of the UTF-8 form of b.
func (gc *goCollator) Key(b Input) []byte {
	key := gc.c.Key(&gc.buf, b.UTF8)
	return key
}

// Compare compares the UTF-8 forms of a and b.
func (gc *goCollator) Compare(a, b Input) int {
	order := gc.c.Compare(&gc.buf, a.UTF8, b.UTF8)
	return order
}
This diff is collapsed.
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build darwin
package main
/*
#cgo LDFLAGS: -framework CoreFoundation
#include <CoreFoundation/CFBase.h>
#include <CoreFoundation/CoreFoundation.h>
*/
import "C"
import (
"unsafe"
)
// init registers the two CoreFoundation-backed collators: "osx" compares the
// UTF-16 form of the inputs and "osx8" compares the UTF-8 form.
func init() {
	AddFactory(CollatorFactory{
		name:        "osx",
		makeFn:      newOSX16Collator,
		description: "OS X/Darwin collator, using native strings.",
	})
	AddFactory(CollatorFactory{
		name:        "osx8",
		makeFn:      newOSX8Collator,
		description: "OS X/Darwin collator for UTF-8.",
	})
}
// osxUInt8P returns a C pointer to the first byte of s.
// s must be non-empty: taking the address of s[0] panics otherwise.
func osxUInt8P(s []byte) *C.UInt8 {
	first := &s[0]
	return (*C.UInt8)(unsafe.Pointer(first))
}
// osxCharP returns a C pointer to the first UTF-16 code unit of s.
// s must be non-empty: taking the address of s[0] panics otherwise.
func osxCharP(s []uint16) *C.UniChar {
	first := &s[0]
	return (*C.UniChar)(unsafe.Pointer(first))
}
// osxCollator holds the state shared by the collators based on OS X's
// CoreFoundation. loc is the locale created by init; opt holds the flags
// passed to CFStringCompareWithOptionsAndLocale. No code visible here
// assigns opt, so it keeps its zero value.
type osxCollator struct {
loc C.CFLocaleRef
opt C.CFStringCompareFlags
}
// init creates the CoreFoundation locale named by locale and stores it in
// c.loc.
//
// locale must be non-empty, because osxUInt8P takes the address of its
// first byte.
func (c *osxCollator) init(locale string) {
	l := C.CFStringCreateWithBytes(
		nil,
		osxUInt8P([]byte(locale)),
		C.CFIndex(len(locale)),
		C.kCFStringEncodingUTF8,
		C.Boolean(0),
	)
	c.loc = C.CFLocaleCreate(nil, l)
	// The identifier string was obtained from a Create function, so this
	// function owns it and has to release it; previously it was leaked.
	if l != nil {
		C.CFRelease(C.CFTypeRef(l))
	}
}
// newOSX8Collator returns a CoreFoundation collator that compares the UTF-8
// form of its inputs. It never returns an error.
func newOSX8Collator(locale string) (Collator, error) {
	col := new(osx8Collator)
	col.init(locale)
	return col, nil
}
// newOSX16Collator returns a CoreFoundation collator that compares the
// UTF-16 form of its inputs. It never returns an error.
func newOSX16Collator(locale string) (Collator, error) {
	col := new(osx16Collator)
	col.init(locale)
	return col, nil
}
// Key always returns nil. The Collator interface allows this for
// implementations that do not support sort keys.
func (c osxCollator) Key(s Input) []byte {
return nil // sort keys not supported by OS X CoreFoundation
}
// osx8Collator is the CoreFoundation collator whose Compare method works on
// the UTF-8 form of its inputs.
type osx8Collator struct {
osxCollator
}
// osx16Collator is the CoreFoundation collator whose Compare method works on
// the UTF-16 form of its inputs.
type osx16Collator struct {
osxCollator
}
// Compare compares the UTF-16 forms of a and b with
// CFStringCompareWithOptionsAndLocale and returns its result as an int.
//
// The CFStrings are created without copying (kCFAllocatorNull as contents
// deallocator), so they only reference the Go-owned buffers. Both a.UTF16
// and b.UTF16 must be non-empty because osxCharP indexes the first element.
func (c osx16Collator) Compare(a, b Input) int {
	sa := C.CFStringCreateWithCharactersNoCopy(
		nil,
		osxCharP(a.UTF16),
		C.CFIndex(len(a.UTF16)),
		C.kCFAllocatorNull,
	)
	sb := C.CFStringCreateWithCharactersNoCopy(
		nil,
		osxCharP(b.UTF16),
		C.CFIndex(len(b.UTF16)),
		C.kCFAllocatorNull,
	)
	_range := C.CFRangeMake(0, C.CFStringGetLength(sa))
	res := int(C.CFStringCompareWithOptionsAndLocale(sa, sb, _range, c.opt, c.loc))
	// Both wrappers come from Create functions and are owned by this call.
	// Release them so that every comparison does not leak two CFString
	// objects; the underlying Go buffers are untouched.
	if sa != nil {
		C.CFRelease(C.CFTypeRef(sa))
	}
	if sb != nil {
		C.CFRelease(C.CFTypeRef(sb))
	}
	return res
}
// Compare compares the UTF-8 forms of a and b with
// CFStringCompareWithOptionsAndLocale and returns its result as an int.
//
// The CFStrings are created without copying (kCFAllocatorNull as contents
// deallocator), so they only reference the Go-owned buffers. Both a.UTF8
// and b.UTF8 must be non-empty because osxUInt8P indexes the first element.
func (c osx8Collator) Compare(a, b Input) int {
	sa := C.CFStringCreateWithBytesNoCopy(
		nil,
		osxUInt8P(a.UTF8),
		C.CFIndex(len(a.UTF8)),
		C.kCFStringEncodingUTF8,
		C.Boolean(0),
		C.kCFAllocatorNull,
	)
	sb := C.CFStringCreateWithBytesNoCopy(
		nil,
		osxUInt8P(b.UTF8),
		C.CFIndex(len(b.UTF8)),
		C.kCFStringEncodingUTF8,
		C.Boolean(0),
		C.kCFAllocatorNull,
	)
	_range := C.CFRangeMake(0, C.CFStringGetLength(sa))
	res := int(C.CFStringCompareWithOptionsAndLocale(sa, sb, _range, c.opt, c.loc))
	// Both wrappers come from Create functions and are owned by this call.
	// Release them so that every comparison does not leak two CFString
	// objects; the underlying Go buffers are untouched.
	if sa != nil {
		C.CFRelease(C.CFTypeRef(sa))
	}
	if sb != nil {
		C.CFRelease(C.CFTypeRef(sb))
	}
	return res
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
"exp/norm"
"math"
"math/rand"
"strings"
"unicode"
"unicode/utf16"
"unicode/utf8"
)
// parent computes the parent locale for the given locale by dropping the
// last "_"-separated component; a locale without "_" has parent "root".
// It returns ok == false (and an empty parent) if locale is already "root".
func parent(locale string) (parent string, ok bool) {
	switch cut := strings.LastIndex(locale, "_"); {
	case locale == "root":
		return "", false
	case cut != -1:
		return locale[:cut], true
	default:
		return "root", true
	}
}
// rewriter is used both to remove duplicate strings and to create variants
// of strings to add to the test set. The seen map is shared across calls, so
// a string is emitted at most once over the lifetime of a rewriter.
type rewriter struct {
	seen     map[string]bool
	addCases bool // if set, rewrite adds case variants of the first rune
}

// newRewriter returns a rewriter with an empty seen set and addCases unset.
func newRewriter() *rewriter {
	return &rewriter{
		seen: make(map[string]bool),
	}
}

// insert appends s to a unless s was seen before, and returns the result.
func (r *rewriter) insert(a []string, s string) []string {
	if r.seen == nil {
		// Keep the zero value usable: writing to a nil map would panic.
		r.seen = make(map[string]bool)
	}
	if !r.seen[s] {
		r.seen[s] = true
		a = append(a, s)
	}
	return a
}

// rewrite takes a sequence of strings in, adds variants of these strings
// based on options and removes duplicates.
//
// With addCases set, every string is followed by the variants obtained by
// replacing its first rune with each rune in that rune's simple case-folding
// orbit. Empty strings are passed through without variants.
func (r *rewriter) rewrite(ss []string) []string {
	ns := []string{}
	for _, s := range ss {
		ns = r.insert(ns, s)
		// An empty string has no first rune to fold; such strings occur
		// when a space-separated set contains consecutive spaces.
		if !r.addCases || s == "" {
			continue
		}
		rs := []rune(s)
		first := rs[0]
		for c := unicode.SimpleFold(first); c != first; c = unicode.SimpleFold(c) {
			rs[0] = c
			ns = r.insert(ns, string(rs))
		}
	}
	return ns
}
// exemplarySet holds a parsed set of characters from the exemplarCharacters table.
// set contains the individual phrases; charIndex is filled in by
// phraseGenerator.init. typ is not assigned by any code visible here.
type exemplarySet struct {
typ exemplarType
set []string
charIndex int // cumulative total of phrases, including this set
}
// phraseGenerator produces test inputs from the exemplar sets of a locale.
// sets holds one exemplarySet per exemplar type; n is the number of phrases
// addressable through phrase (init sets it to the total over all sets and
// generate may lower it).
type phraseGenerator struct {
sets [exN]exemplarySet
n int
}
// init loads the exemplar sets for locale, removes duplicate phrases across
// all sets (optionally adding case variants, controlled by the cases flag)
// and computes the cumulative indexes used by phrase.
func (g *phraseGenerator) init(locale string) {
	// For each set type, walk from locale up through its parents and use
	// the first locale that defines a non-empty set.
	for i := range g.sets {
		for p, more := locale, true; more; p, more = parent(p) {
			set, found := exemplarCharacters[p]
			if !found || set[i] == "" {
				continue
			}
			g.sets[i].set = strings.Split(set[i], " ")
			break
		}
	}

	// A single rewriter is shared so that a phrase occurring in several
	// sets is kept only in the first one.
	rw := newRewriter()
	rw.addCases = *cases
	for i := range g.sets {
		cur := &g.sets[i]
		cur.set = rw.rewrite(cur.set)
		// charIndex is the running total of phrases up to and including
		// this set.
		g.n += len(cur.set)
		cur.charIndex = g.n
	}
}
// phrase returns the ith phrase, where i < g.n, counting through the sets in
// order. It panics with "index out of range" if i is not below the
// cumulative total of the last set.
func (g *phraseGenerator) phrase(i int) string {
	for k := range g.sets {
		cur := &g.sets[k]
		if i >= cur.charIndex {
			continue
		}
		// Index of this set's first phrase in the overall numbering.
		first := cur.charIndex - len(cur.set)
		return cur.set[i-first]
	}
	panic("index out of range")
}
// generate generates inputs by combining all pairs of exemplar strings.
// Every phrase is emitted once on its own and once concatenated with each
// phrase (itself included), so g.n*(g.n+1) inputs are produced. The result
// is shuffled, and each Input records its position in the shuffled order.
// If doNorm is true, all input strings are normalized to NFC.
//
// As a side effect g.n is lowered to the square root of *limit if it
// exceeds that value.
// TODO: allow other variations, statistical models, and random
// trailing sequences.
func (g *phraseGenerator) generate(doNorm bool) []Input {
	const (
		M         = 1024 * 1024
		buf8Size  = 30 * M
		buf16Size = 10 * M
	)
	// TODO: use a better way to limit the input size.
	if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
		g.n = sq
	}
	// Each of the g.n phrases contributes itself plus g.n pairs. Sizing for
	// only g.n*g.n forced a reallocation of the whole slice near the end.
	size := g.n * (g.n + 1)
	a := make([]Input, 0, size)
	buf8 := make([]byte, 0, buf8Size)
	buf16 := make([]uint16, 0, buf16Size)

	// addInput encodes str into the unused tail of the shared buffers, so
	// that most inputs do not need an allocation of their own.
	addInput := func(str string) {
		// Continue in the capacity left after the previous input.
		buf8 = buf8[len(buf8):]
		buf16 = buf16[len(buf16):]
		// Start a fresh chunk when the remaining capacity is too small.
		if len(str) > cap(buf8) {
			buf8 = make([]byte, 0, buf8Size)
		}
		if len(str) > cap(buf16) {
			buf16 = make([]uint16, 0, buf16Size)
		}
		if doNorm {
			buf8 = norm.NFC.AppendString(buf8, str)
		} else {
			buf8 = append(buf8, str...)
		}
		buf16 = appendUTF16(buf16, buf8)
		a = append(a, makeInput(buf8, buf16))
	}
	for i := 0; i < g.n; i++ {
		p1 := g.phrase(i)
		addInput(p1)
		for j := 0; j < g.n; j++ {
			p2 := g.phrase(j)
			addInput(p1 + p2)
		}
	}
	// permutate (Fisher-Yates shuffle); the seed is taken from the global
	// math/rand source.
	rnd := rand.New(rand.NewSource(int64(rand.Int())))
	for i := range a {
		j := i + rnd.Intn(len(a)-i)
		a[i], a[j] = a[j], a[i]
		a[i].index = i // allow restoring this order if input is used multiple times.
	}
	return a
}
// appendUTF16 decodes the UTF-8 bytes in s and appends their UTF-16 encoding
// to buf, returning the extended slice. Runes that need a surrogate pair
// contribute two code units, all other runes one; an invalid UTF-8 byte
// contributes U+FFFD.
func appendUTF16(buf []uint16, s []byte) []uint16 {
	for pos := 0; pos < len(s); {
		r, width := utf8.DecodeRune(s[pos:])
		pos += width
		// EncodeRune yields U+FFFD for runes that do not need a surrogate
		// pair; those are stored as a single code unit.
		if hi, lo := utf16.EncodeRune(r); hi != 0xFFFD {
			buf = append(buf, uint16(hi), uint16(lo))
			continue
		}
		buf = append(buf, uint16(r))
	}
	return buf
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build icu
package main
/*
#cgo LDFLAGS: -licui18n -licuuc
#include <stdlib.h>
#include <unicode/ucol.h>
#include <unicode/uiter.h>
#include <unicode/utypes.h>
*/
import "C"
import (
"fmt"
"log"
"unicode/utf16"
"unicode/utf8"
"unsafe"
)
// init registers the three ICU-backed collators: "icu" uses the UTF-16 form
// of the inputs, "icu8" iterates over the UTF-8 form, and "icu16" converts
// the UTF-8 form to UTF-16 on every call.
func init() {
	AddFactory(CollatorFactory{
		name:        "icu",
		makeFn:      newUTF16,
		description: "Main ICU collator, using native strings.",
	})
	AddFactory(CollatorFactory{
		name:        "icu8",
		makeFn:      newUTF8iter,
		description: "ICU collator using ICU iterators to process UTF8.",
	})
	AddFactory(CollatorFactory{
		name:        "icu16",
		makeFn:      newUTF8conv,
		description: "ICU collation by first converting UTF8 to UTF16.",
	})
}
// icuCharP returns a C char pointer to the first byte of s.
// s must be non-empty: taking the address of s[0] panics otherwise.
func icuCharP(s []byte) *C.char {
	first := &s[0]
	return (*C.char)(unsafe.Pointer(first))
}
// icuUInt8P returns a C uint8_t pointer to the first byte of s.
// s must be non-empty: taking the address of s[0] panics otherwise.
func icuUInt8P(s []byte) *C.uint8_t {
	first := &s[0]
	return (*C.uint8_t)(unsafe.Pointer(first))
}
// icuUCharP returns a C UChar pointer to the first UTF-16 code unit of s.
// s must be non-empty: taking the address of s[0] panics otherwise.
func icuUCharP(s []uint16) *C.UChar {
	first := &s[0]
	return (*C.UChar)(unsafe.Pointer(first))
}
// icuULen returns the number of UTF-16 code units in s as a C int32_t.
func icuULen(s []uint16) C.int32_t {
	n := len(s)
	return C.int32_t(n)
}
// icuSLen returns the number of bytes in s as a C int32_t.
func icuSLen(s []byte) C.int32_t {
	n := len(s)
	return C.int32_t(n)
}
// icuCollator implements a Collator based on ICU.
// loc is the C copy of the locale name and col the ICU collator opened for
// it; both are released by Close. keyBuf is a chunk of memory from which the
// sort keys returned by the Key methods are sliced.
type icuCollator struct {
loc *C.char
col *C.UCollator
keyBuf []byte
}
// growBufSize is the capacity of every keyBuf chunk, and therefore also the
// largest sort key that can be produced.
const growBufSize = 10 * 1024 * 1024
// init opens an ICU collator for locale and allocates the key buffer.
//
// A positive ICU status is reported as an error. A negative status is a
// warning: for -127 (default collator used) and -128 (fallback collator
// used) the locale reported by ucol_getLocaleByType is logged; other
// warnings are ignored.
func (c *icuCollator) init(locale string) error {
	status := C.UErrorCode(0)
	c.loc = C.CString(locale)
	c.col = C.ucol_open(c.loc, &status)
	if status > 0 {
		return fmt.Errorf("failed opening collator for %q", locale)
	}
	if status < 0 {
		loc := C.ucol_getLocaleByType(c.col, 0, &status)
		switch int(status) {
		case -127:
			log.Printf("warning: using default collator: %s", C.GoString(loc))
		case -128:
			log.Printf("warning: using fallback collator: %s", C.GoString(loc))
		}
	}
	c.keyBuf = make([]byte, 0, growBufSize)
	return nil
}
// buf returns a pointer to, and the size of, the unused tail of keyBuf, for
// ICU to write a sort key into. A completely full keyBuf is replaced by a
// fresh chunk first, so the returned region is never empty.
func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) {
	if full := len(c.keyBuf) == cap(c.keyBuf); full {
		c.keyBuf = make([]byte, 0, growBufSize)
	}
	free := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)]
	return icuUInt8P(free), icuSLen(free)
}
// extendBuf claims the next n bytes of keyBuf and returns them.
//
// If n bytes do not fit in the remaining capacity, keyBuf is replaced by a
// fresh chunk and nil is returned, so that the caller can retry. If they do
// not even fit in an empty chunk, the process is terminated.
func (c *icuCollator) extendBuf(n C.int32_t) []byte {
	start := len(c.keyBuf)
	end := start + int(n)
	if end <= cap(c.keyBuf) {
		claimed := c.keyBuf[start:end]
		c.keyBuf = c.keyBuf[:end]
		return claimed
	}
	if start == 0 {
		log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize)
	}
	c.keyBuf = make([]byte, 0, growBufSize)
	return nil
}
// Close closes the ICU collator and frees the C copy of the locale name
// allocated in init. It always returns nil.
func (c *icuCollator) Close() error {
C.ucol_close(c.col)
C.free(unsafe.Pointer(c.loc))
return nil
}
// icuUTF16 implements the Collator interface by passing the UTF-16 form of
// the inputs directly to ICU.
type icuUTF16 struct {
	icuCollator
}

// newUTF16 returns an icuUTF16 collator for locale, together with any error
// reported while opening the ICU collator.
func newUTF16(locale string) (Collator, error) {
	col := new(icuUTF16)
	err := col.init(locale)
	return col, err
}

// Compare compares the UTF-16 forms of a and b with ucol_strcoll.
// Both must be non-empty, because icuUCharP indexes the first element.
func (c *icuUTF16) Compare(a, b Input) int {
	pa, na := icuUCharP(a.UTF16), icuULen(a.UTF16)
	pb, nb := icuUCharP(b.UTF16), icuULen(b.UTF16)
	return int(C.ucol_strcoll(c.col, pa, na, pb, nb))
}

// Key returns the ICU sort key for the UTF-16 form of s. The key is a slice
// of the collator's shared key buffer.
func (c *icuUTF16) Key(s Input) []byte {
	for {
		bp, bn := c.buf()
		n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn)
		// extendBuf returns nil after switching to a fresh buffer when the
		// key did not fit; in that case generate the key again.
		if key := c.extendBuf(n); key != nil {
			return key
		}
	}
}
// icuUTF8iter implements the Collator interface.
// This implementation wraps the UTF-8 string in an iterator
// which is passed to the collator. The iterators a and b are reused
// across calls.
type icuUTF8iter struct {
	icuCollator
	a, b C.UCharIterator
}

// newUTF8iter returns an icuUTF8iter collator for locale, together with any
// error reported while opening the ICU collator.
func newUTF8iter(locale string) (Collator, error) {
	col := new(icuUTF8iter)
	err := col.init(locale)
	return col, err
}

// Compare compares the UTF-8 forms of a and b with ucol_strcollIter. The ICU
// status of the comparison is not inspected. Both inputs must be non-empty,
// because icuCharP indexes the first element.
func (c *icuUTF8iter) Compare(a, b Input) int {
	status := C.UErrorCode(0)
	C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8))
	C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8))
	order := C.ucol_strcollIter(c.col, &c.a, &c.b, &status)
	return int(order)
}

// Key returns the ICU sort key for the UTF-8 form of s, produced with
// ucol_nextSortKeyPart. The key is a slice of the collator's shared key
// buffer. The ICU status is not inspected.
func (c *icuUTF8iter) Key(s Input) []byte {
	for {
		status := C.UErrorCode(0)
		state := [2]C.uint32_t{}
		C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8))
		bp, bn := c.buf()
		n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &status)
		if n < bn {
			return c.extendBuf(n)
		}
		// The buffer was filled completely, so the key may be truncated.
		// Force failure: asking for one byte more than is available makes
		// extendBuf switch to a fresh buffer, after which the key is
		// generated again.
		if c.extendBuf(n+1) != nil {
			log.Fatal("expected extension to fail")
		}
	}
}
// icuUTF8conv implements the Collator interface.
// This implementation first converts the given UTF-8 string
// to UTF-16 and then calls the main ICU collation function.
type icuUTF8conv struct {
	icuCollator
}

// newUTF8conv returns an icuUTF8conv collator for locale, together with any
// error reported while opening the ICU collator.
func newUTF8conv(locale string) (Collator, error) {
	col := new(icuUTF8conv)
	err := col.init(locale)
	return col, err
}

// Compare converts the UTF-8 forms of sa and sb to UTF-16 and compares the
// results with ucol_strcoll. Both inputs must be non-empty, because
// icuUCharP indexes the first element of the converted slices.
func (c *icuUTF8conv) Compare(sa, sb Input) int {
	a := encodeUTF16(sa.UTF8)
	b := encodeUTF16(sb.UTF8)
	pa, na := icuUCharP(a), icuULen(a)
	pb, nb := icuUCharP(b), icuULen(b)
	return int(C.ucol_strcoll(c.col, pa, na, pb, nb))
}

// Key converts the UTF-8 form of s to UTF-16 and returns its ICU sort key.
// The key is a slice of the collator's shared key buffer.
func (c *icuUTF8conv) Key(s Input) []byte {
	a := encodeUTF16(s.UTF8)
	for {
		bp, bn := c.buf()
		n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn)
		// extendBuf returns nil after switching to a fresh buffer when the
		// key did not fit; in that case generate the key again.
		if key := c.extendBuf(n); key != nil {
			return key
		}
	}
}
// encodeUTF16 decodes the UTF-8 bytes in b and returns their UTF-16
// encoding in a newly allocated, never nil, slice. Runes that need a
// surrogate pair contribute two code units, all other runes one; an invalid
// UTF-8 byte contributes U+FFFD.
func encodeUTF16(b []byte) []uint16 {
	out := make([]uint16, 0)
	for rest := b; len(rest) > 0; {
		r, width := utf8.DecodeRune(rest)
		rest = rest[width:]
		// EncodeRune yields U+FFFD for runes that do not need a surrogate
		// pair; those are stored as a single code unit.
		switch hi, lo := utf16.EncodeRune(r); hi {
		case 0xFFFD:
			out = append(out, uint16(r))
		default:
			out = append(out, uint16(hi), uint16(lo))
		}
	}
	return out
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment