Commit 0355a717 authored by Marcel van Lohuizen

exp/locale/collate: Add maketables tool and generated tables.

Also set maxContractLen automatically.
Note that the table size is much bigger than it needs to be.
That optimization is best done, though, once the language-specific
tables are added.

R=r
CC=golang-dev
https://golang.org/cl/6167044
parent 25a8a8da
# Copyright 2012 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Register the generator binary in CLEANFILES (presumably consumed by a
# clean rule defined outside this fragment -- TODO confirm).
CLEANFILES+=maketables
# Build the table generator from its single Go source file.
maketables: maketables.go
go build $^
# Regenerate tables.go: run the generator, capture its standard output,
# then format the result in place with gofmt.
tables: maketables
./maketables > tables.go
gofmt -w tables.go
# Build (but do not run) maketables during testing,
# just to make sure it still compiles.
testshort: maketables
......@@ -412,6 +412,9 @@ func (b *Builder) processContractions() {
cm := make(map[rune][]*entry)
for _, e := range b.entry {
if e.contraction() {
if len(e.str) > b.t.maxContractLen {
b.t.maxContractLen = len(e.str)
}
r := e.runes[0]
if _, ok := cm[r]; !ok {
starters = append(starters, r)
......
......@@ -72,6 +72,7 @@ func (t *table) print(w io.Writer, name string) (n, size int, err error) {
update(t.contractTries.printStruct(w, name))
p(",\n")
p("%sContractElem[:],\n", name)
p("%d,\n", t.maxContractLen)
p("}\n\n")
// Write arrays needed for the structure.
......
......@@ -92,10 +92,6 @@ func (c *Collator) SetVariableTop(r rune) {
// TODO: implement
}
var (
Root = Collator{}
)
// Buffer holds reusable buffers that can be used during collation.
// Reusing a Buffer for the various calls that accept it may avoid
// unnecessary memory allocations.
......
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Collation table generator.
// Data read from the web.
package main
import (
"bufio"
"exp/locale/collate"
"exp/locale/collate/build"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"path"
"regexp"
"strconv"
"strings"
"unicode"
)
// Command-line flags controlling where the input data is read from.
var (
	// ducet is the location of the allkeys.txt file to parse. The default
	// URL embeds unicode.Version.
	ducet = flag.String(
		"ducet",
		"http://unicode.org/Public/UCA/"+unicode.Version+"/allkeys.txt",
		"URL of the Default Unicode Collation Element Table (DUCET).",
	)

	// localFiles makes openReader look for the data file in the current
	// working directory instead of fetching it from the URL.
	localFiles = flag.Bool(
		"local",
		false,
		"data files have been copied to the current directory; for debugging only",
	)
)
// failonerror terminates the program through log.Fatal when err is
// non-nil; a nil error is a no-op.
func failonerror(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
// openReader opens the url or file given by url and returns its contents
// as an io.ReadCloser. When the -local flag is set, only the base name of
// url is used and it is resolved against the current working directory.
//
// On failure it returns a nil reader and a non-nil error. On success the
// caller is responsible for closing the returned reader.
func openReader(url string) (io.ReadCloser, error) {
	if *localFiles {
		pwd, err := os.Getwd()
		if err != nil {
			return nil, err
		}
		url = "file://" + path.Join(pwd, path.Base(url))
	}
	// Register a handler for the "file" scheme rooted at "/" so that
	// file:// URLs go through the same client as http:// URLs.
	t := &http.Transport{}
	t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
	c := &http.Client{Transport: t}
	resp, err := c.Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != 200 {
		// The body is not handed to the caller on this path, so it must be
		// closed here to avoid leaking the underlying connection or file.
		resp.Body.Close()
		return nil, fmt.Errorf(`bad GET status for "%s": %s`, url, resp.Status)
	}
	return resp.Body, nil
}
// parseUCA parses a Default Unicode Collation Element Table of the format
// specified in http://www.unicode.org/reports/tr10/#File_Format, read from
// the location given by the -ducet flag, and adds every entry to builder.
// It returns the variable top, computed here as the largest first weight
// seen on a collation element marked with '*'.
//
// Malformed input is reported with log.Fatalf, which terminates the program.
func parseUCA(builder *build.Builder) int {
	// maxVar is the largest first weight of any '*' element; minNonVar is
	// the smallest non-zero first weight of any other element.
	maxVar, minNonVar := 0, 1<<30
	r, err := openReader(*ducet)
	failonerror(err)
	defer r.Close()
	input := bufio.NewReader(r)
	// Matches one collation element such as [.0000.0021.0002.0000]:
	// group 1 is the leading '.' or '*', group 2 the dot-separated weights.
	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
	// i is the 1-based line number used in diagnostics. The loop ends after
	// the iteration in which ReadLine reports an error (io.EOF included).
	for i := 1; err == nil; i++ {
		l, prefix, e := input.ReadLine()
		err = e
		line := string(l)
		if prefix {
			// The line did not fit in the reader's buffer.
			log.Fatalf("%d: buffer overflow", i)
		}
		if err != nil && err != io.EOF {
			log.Fatalf("%d: %v", i, err)
		}
		// Skip blank lines and comments.
		if len(line) == 0 || line[0] == '#' {
			continue
		}
		if line[0] == '@' {
			// parse properties
			switch {
			case strings.HasPrefix(line[1:], "version "):
				a := strings.Split(line[1:], " ")
				if a[1] != unicode.Version {
					log.Fatalf("incompatible version %s; want %s", a[1], unicode.Version)
				}
			case strings.HasPrefix(line[1:], "backwards "):
				log.Fatalf("%d: unsupported option backwards", i)
			default:
				log.Printf("%d: unknown option %s", i, line[1:])
			}
		} else {
			// parse entries
			part := strings.Split(line, " ; ")
			if len(part) != 2 {
				log.Fatalf("%d: production rule without ';': %v", i, line)
			}
			// Left-hand side: space-separated hexadecimal code points.
			lhs := []rune{}
			for _, v := range strings.Split(part[0], " ") {
				if v == "" {
					continue
				}
				lhs = append(lhs, rune(convHex(i, v)))
			}
			// n accumulates the total length of the matched collation
			// elements so the trailing comment can be located afterwards.
			var n int
			rhs := [][]int{}
			for _, m := range colelem.FindAllStringSubmatch(part[1], -1) {
				n += len(m[0])
				elem := []int{}
				for _, h := range strings.Split(m[2], ".") {
					elem = append(elem, convHex(i, h))
				}
				if p := elem[0]; m[1] == "*" {
					if p > maxVar {
						maxVar = p
					}
				} else if p > 0 && p < minNonVar {
					minNonVar = p
				}
				rhs = append(rhs, elem)
			}
			// The elements must be followed by " #" and a comment.
			if len(part[1]) < n+3 || part[1][n+1] != '#' {
				log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
			}
			builder.Add(lhs, rhs)
		}
	}
	// Every '*' element must have a first weight strictly below that of
	// every other element with a non-zero first weight. The message uses
	// ">=" so that it matches the condition actually being checked.
	if maxVar >= minNonVar {
		log.Fatalf("found maxVar >= minNonVar (%d >= %d)", maxVar, minNonVar)
	}
	return maxVar
}
// convHex parses s as a hexadecimal number that fits in 32 bits and
// returns it as an int. If s cannot be parsed, the error is reported
// together with the given line number and the program terminates.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err == nil {
		return int(v)
	}
	log.Fatalf("%d: %v", line, err)
	return int(v) // not reached: log.Fatalf exits the program
}
// printCollators writes to standard output the Go source that declares the
// _Root Collator value and the exported Root variable initialized from it.
// The Strength field is taken from c and variableTop from vartop.
//
// TODO: move this functionality to exp/locale/collate/build.
func printCollators(c *collate.Collator, vartop int) {
	const name = "Root"
	p := fmt.Printf
	lowerName := strings.ToLower(name)

	// The Collator literal itself.
	p("var _%s = Collator{\n", name)
	p("\tStrength: %v,\n", c.Strength)
	p("\tvariableTop: 0x%X,\n", vartop)
	p("\tf: norm.NFD,\n")
	p("\tt: &%sTable,\n", lowerName)
	p("}\n\n")

	// The exported variable referring to it.
	p("var (\n")
	p("\t%s = _%s\n", name, name)
	p(")\n\n")
}
// main parses the DUCET, builds the collation tables from it and writes
// the generated Go source for package collate to standard output.
func main() {
	flag.Parse()

	builder := build.NewBuilder()
	variableTop := parseUCA(builder)
	_, err := builder.Build("")
	failonerror(err)

	// File header: provenance comment, package clause and imports.
	fmt.Println("// Generated by running")
	fmt.Printf("// maketables --ducet=%s\n", *ducet)
	for _, line := range []string{
		"// DO NOT EDIT",
		"// TODO: implement more compact representation for sparse blocks.",
		"",
		"package collate",
		"",
		`import "exp/norm"`,
		"",
	} {
		fmt.Println(line)
	}

	// The Collator declaration comes first, followed by the tables.
	root := &collate.Collator{Strength: collate.Quaternary}
	printCollators(root, variableTop)

	_, err = builder.Print(os.Stdout)
	failonerror(err)
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment