Commit a1f5f3f1 authored by Brad Fitzpatrick

xml: Parser hook for non-UTF-8 charset converters

Adds an optional hook to Parser to let charset
converters step in when a processing directive
with a non-UTF-8 encoding is specified.

(Open to alternative proposals too...)

R=rsc
CC=golang-dev
https://golang.org/cl/4437061
parent f367c13c
...@@ -163,6 +163,13 @@ type Parser struct { ...@@ -163,6 +163,13 @@ type Parser struct {
// "quot": `"`, // "quot": `"`,
Entity map[string]string Entity map[string]string
// CharsetReader, if non-nil, defines a function to generate
// charset-conversion readers, converting from the provided
// non-UTF-8 charset into UTF-8. If CharsetReader is nil or
// returns an error, parsing stops with an error. One of
// the CharsetReader's result values must be non-nil.
CharsetReader func(charset string, input io.Reader) (io.Reader, os.Error)
r io.ByteReader r io.ByteReader
buf bytes.Buffer buf bytes.Buffer
saved *bytes.Buffer saved *bytes.Buffer
...@@ -186,17 +193,7 @@ func NewParser(r io.Reader) *Parser { ...@@ -186,17 +193,7 @@ func NewParser(r io.Reader) *Parser {
line: 1, line: 1,
Strict: true, Strict: true,
} }
p.switchToReader(r)
// Get efficient byte at a time reader.
// Assume that if reader has its own
// ReadByte, it's efficient enough.
// Otherwise, use bufio.
if rb, ok := r.(io.ByteReader); ok {
p.r = rb
} else {
p.r = bufio.NewReader(r)
}
return p return p
} }
...@@ -290,6 +287,18 @@ func (p *Parser) translate(n *Name, isElementName bool) { ...@@ -290,6 +287,18 @@ func (p *Parser) translate(n *Name, isElementName bool) {
} }
} }
// switchToReader makes r the source that the parser reads from.
//
// The parser stores its input as an io.ByteReader. A reader that already
// provides ReadByte is assumed to be efficient enough and is used as is;
// any other reader is wrapped in a bufio.Reader.
func (p *Parser) switchToReader(r io.Reader) {
	byteReader, hasReadByte := r.(io.ByteReader)
	if !hasReadByte {
		byteReader = bufio.NewReader(r)
	}
	p.r = byteReader
}
// Parsing state - stack holds old name space translations // Parsing state - stack holds old name space translations
// and the current set of open elements. The translations to pop when // and the current set of open elements. The translations to pop when
// ending a given tag are *below* it on the stack, which is // ending a given tag are *below* it on the stack, which is
...@@ -487,6 +496,25 @@ func (p *Parser) RawToken() (Token, os.Error) { ...@@ -487,6 +496,25 @@ func (p *Parser) RawToken() (Token, os.Error) {
} }
data := p.buf.Bytes() data := p.buf.Bytes()
data = data[0 : len(data)-2] // chop ?> data = data[0 : len(data)-2] // chop ?>
if target == "xml" {
enc := procInstEncoding(string(data))
if enc != "" && enc != "utf-8" && enc != "UTF-8" {
if p.CharsetReader == nil {
p.err = fmt.Errorf("xml: encoding %q declared but Parser.CharsetReader is nil", enc)
return nil, p.err
}
newr, err := p.CharsetReader(enc, p.r.(io.Reader))
if err != nil {
p.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
return nil, p.err
}
if newr == nil {
panic("CharsetReader returned a nil Reader for charset " + enc)
}
p.switchToReader(newr)
}
}
return ProcInst{target, data}, nil return ProcInst{target, data}, nil
case '!': case '!':
...@@ -1633,3 +1661,26 @@ func Escape(w io.Writer, s []byte) { ...@@ -1633,3 +1661,26 @@ func Escape(w io.Writer, s []byte) {
} }
w.Write(s[last:]) w.Write(s[last:])
} }
// procInstEncoding parses the `encoding="..."` or `encoding='...'`
// value out of the provided string, returning "" if not found.
//
// White space (space, tab, CR, LF) is accepted on either side of the '=',
// so `encoding = "x"` yields "x". An occurrence of the word "encoding"
// that is not followed by '=' is skipped and the search continues. Once
// an "encoding" followed by '=' has been found, "" is returned if the
// value is missing, is not quoted, or has no closing quote.
func procInstEncoding(s string) string {
	// TODO: this parsing is still not exact: "encoding" is matched as a
	// plain substring, not as a complete pseudo-attribute name.
	const name = "encoding"
	isSpace := func(c byte) bool {
		return c == ' ' || c == '\t' || c == '\r' || c == '\n'
	}

	from := 0
	for {
		idx := strings.Index(s[from:], name)
		if idx == -1 {
			return ""
		}
		i := from + idx + len(name)
		// Always resume after this occurrence so the loop makes progress.
		from = i

		for i < len(s) && isSpace(s[i]) {
			i++
		}
		if i == len(s) || s[i] != '=' {
			// Not a name=value pair; look for a later occurrence.
			continue
		}
		i++
		for i < len(s) && isSpace(s[i]) {
			i++
		}
		if i == len(s) {
			return ""
		}

		quote := s[i]
		if quote != '\'' && quote != '"' {
			return ""
		}
		i++
		// The value runs up to the next occurrence of the opening quote.
		for j := i; j < len(s); j++ {
			if s[j] == quote {
				return s[i:j]
			}
		}
		return ""
	}
}
...@@ -9,6 +9,7 @@ import ( ...@@ -9,6 +9,7 @@ import (
"io" "io"
"os" "os"
"reflect" "reflect"
"strings"
"testing" "testing"
) )
...@@ -96,6 +97,19 @@ var cookedTokens = []Token{ ...@@ -96,6 +97,19 @@ var cookedTokens = []Token{
Comment([]byte(" missing final newline ")), Comment([]byte(" missing final newline ")),
} }
// testInputAltEncoding is a small document whose XML declaration names the
// made-up charset "x-testing-uppercase"; its element name and text are
// written in upper case.
const testInputAltEncoding = `
<?xml version="1.0" encoding="x-testing-uppercase"?>
<TAG>VALUE</TAG>`
// rawTokensAltEncoding lists the raw tokens that TestRawTokenAltEncoding
// expects from testInputAltEncoding. The ProcInst keeps its original text,
// while the element name and character data that follow it are expected in
// lower case.
var rawTokensAltEncoding = []Token{
CharData([]byte("\n")),
ProcInst{"xml", []byte(`version="1.0" encoding="x-testing-uppercase"`)},
CharData([]byte("\n")),
StartElement{Name{"", "tag"}, nil},
CharData([]byte("value")),
EndElement{Name{"", "tag"}},
}
var xmlInput = []string{ var xmlInput = []string{
// unexpected EOF cases // unexpected EOF cases
"<", "<",
...@@ -173,7 +187,64 @@ func StringReader(s string) io.Reader { return &stringReader{s, 0} } ...@@ -173,7 +187,64 @@ func StringReader(s string) io.Reader { return &stringReader{s, 0} }
func TestRawToken(t *testing.T) { func TestRawToken(t *testing.T) {
p := NewParser(StringReader(testInput)) p := NewParser(StringReader(testInput))
testRawToken(t, p, rawTokens)
}
// downCaser is a test reader that wraps an io.ByteReader. Its ReadByte
// method lowercases ASCII upper-case letters, and its Read method fails
// the test if it is ever called.
type downCaser struct {
t *testing.T // used by Read to fail the test
r io.ByteReader // underlying source of bytes for ReadByte
}
// ReadByte reads one byte from the wrapped reader and returns it, mapping
// the ASCII letters 'A'..'Z' to 'a'..'z'. The error from the wrapped
// reader is passed through unchanged.
func (d *downCaser) ReadByte() (c byte, err os.Error) {
	b, readErr := d.r.ReadByte()
	if 'A' <= b && b <= 'Z' {
		b = b - 'A' + 'a'
	}
	return b, readErr
}
// Read is present so that downCaser can be returned as an io.Reader. It is
// not expected to be called: it fails the test immediately, and the return
// values only satisfy the signature.
func (d *downCaser) Read(p []byte) (int, os.Error) {
	const msg = "unexpected Read call on downCaser reader"
	d.t.Fatalf(msg)
	return 0, os.EINVAL
}
// TestRawTokenAltEncoding parses testInputAltEncoding with a CharsetReader
// installed and checks the resulting raw tokens against
// rawTokensAltEncoding.
//
// The CharsetReader accepts only the charset "x-testing-uppercase" and
// answers with a downCaser wrapped around the parser's current input. After
// the token comparison, the test also verifies that the hook was actually
// invoked with that charset.
func TestRawTokenAltEncoding(t *testing.T) {
	const wantCharset = "x-testing-uppercase"
	sawEncoding := ""
	p := NewParser(StringReader(testInputAltEncoding))
	p.CharsetReader = func(charset string, input io.Reader) (io.Reader, os.Error) {
		sawEncoding = charset
		if charset != wantCharset {
			t.Fatalf("unexpected charset %q", charset)
		}
		// The assertion fails loudly if the parser hands over a reader
		// without ReadByte.
		return &downCaser{t, input.(io.ByteReader)}, nil
	}
	testRawToken(t, p, rawTokensAltEncoding)
	// sawEncoding was previously recorded but never checked.
	if sawEncoding != wantCharset {
		t.Errorf("CharsetReader saw charset %q; want %q", sawEncoding, wantCharset)
	}
}
// TestRawTokenAltEncodingNoConverter parses testInputAltEncoding without
// installing a CharsetReader. The first RawToken call must return a token
// and no error; the second call must return a nil token together with an
// error whose text mentions the declared encoding.
func TestRawTokenAltEncodingNoConverter(t *testing.T) {
	const encoding = "x-testing-uppercase"

	parser := NewParser(StringReader(testInputAltEncoding))

	first, err := parser.RawToken()
	switch {
	case first == nil:
		t.Fatalf("expected a token on first RawToken call")
	case err != nil:
		t.Fatal(err)
	}

	second, err := parser.RawToken()
	if second != nil {
		t.Errorf("expected a nil token; got %#v", second)
	}
	if err == nil {
		t.Fatalf("expected an error on second RawToken call")
	}
	if msg := err.String(); !strings.Contains(msg, encoding) {
		t.Errorf("expected error to contain %q; got error: %v",
			encoding, err)
	}
}
func testRawToken(t *testing.T, p *Parser, rawTokens []Token) {
for i, want := range rawTokens { for i, want := range rawTokens {
have, err := p.RawToken() have, err := p.RawToken()
if err != nil { if err != nil {
...@@ -483,3 +554,26 @@ func TestDisallowedCharacters(t *testing.T) { ...@@ -483,3 +554,26 @@ func TestDisallowedCharacters(t *testing.T) {
} }
} }
} }
// procInstEncodingTest holds a pair of strings named expect and got.
// NOTE(review): nothing visible in this change refers to this type
// (procInstTests declares its own anonymous struct) — confirm whether it
// is still needed.
type procInstEncodingTest struct {
expect, got string
}
// procInstTests is the table used by TestProcInstEncoding. Each entry pairs
// an input string with the value procInstEncoding is expected to return for
// it; the unquoted `encoding=utf-8` case expects "".
var procInstTests = []struct {
input, expect string
}{
{`version="1.0" encoding="utf-8"`, "utf-8"},
{`version="1.0" encoding='utf-8'`, "utf-8"},
{`version="1.0" encoding='utf-8' `, "utf-8"},
{`version="1.0" encoding=utf-8`, ""},
{`encoding="FOO" `, "FOO"},
}
// TestProcInstEncoding runs procInstEncoding over every entry of
// procInstTests and reports each input whose result differs from the
// expected value.
func TestProcInstEncoding(t *testing.T) {
	for i := range procInstTests {
		tc := &procInstTests[i]
		if got := procInstEncoding(tc.input); got != tc.expect {
			t.Errorf("procInstEncoding(%q) = %q; want %q", tc.input, got, tc.expect)
		}
	}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment