Commit 3f04d1ff, authored by Andrew Balholm, committed by Nigel Tao

go.net/html/charset: add NewReader

NewReader is a convenience function for finding the encoding of
an io.Reader and making a UTF-8 version of that Reader.

R=nigeltao
CC=golang-dev
https://golang.org/cl/43510043
parent 74213743
...@@ -6,6 +6,7 @@ package charset ...@@ -6,6 +6,7 @@ package charset
import ( import (
"bytes" "bytes"
"io"
"mime" "mime"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
...@@ -13,6 +14,7 @@ import ( ...@@ -13,6 +14,7 @@ import (
"code.google.com/p/go.net/html" "code.google.com/p/go.net/html"
"code.google.com/p/go.text/encoding" "code.google.com/p/go.text/encoding"
"code.google.com/p/go.text/encoding/charmap" "code.google.com/p/go.text/encoding/charmap"
"code.google.com/p/go.text/transform"
) )
// Lookup returns the encoding with the specified label, and its canonical // Lookup returns the encoding with the specified label, and its canonical
...@@ -83,6 +85,27 @@ func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, ...@@ -83,6 +85,27 @@ func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding,
return charmap.Windows1252, "windows-1252", false return charmap.Windows1252, "windows-1252", false
} }
// NewReader returns an io.Reader that converts the content of r to UTF-8.
// It calls DetermineEncoding to find out what r's encoding is, passing it up
// to the first 1024 bytes of r together with contentType.
//
// An r that yields fewer than 1024 bytes, including none at all, is not an
// error: the returned Reader serves whatever was read. Any other error hit
// while reading the preview is returned together with a nil Reader.
func NewReader(r io.Reader, contentType string) (io.Reader, error) {
	preview := make([]byte, 1024)
	n, err := io.ReadFull(r, preview)
	switch {
	case err == io.EOF || err == io.ErrUnexpectedEOF:
		// r is exhausted. io.ReadFull reports io.EOF when it read nothing
		// and io.ErrUnexpectedEOF when it read only part of the buffer;
		// either way everything r had to offer is in preview[:n], so serve
		// that slice instead of reading from r again.
		preview = preview[:n]
		r = bytes.NewReader(preview)
	case err != nil:
		return nil, err
	default:
		// The preview buffer was filled completely; replay it ahead of
		// whatever is still unread in r.
		r = io.MultiReader(bytes.NewReader(preview), r)
	}

	// Only wrap r in a decoding transform when the detected encoding is
	// something other than encoding.Nop.
	if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {
		r = transform.NewReader(r, e.NewDecoder())
	}
	return r, nil
}
func prescan(content []byte) (e encoding.Encoding, name string) { func prescan(content []byte) (e encoding.Encoding, name string) {
z := html.NewTokenizer(bytes.NewReader(content)) z := html.NewTokenizer(bytes.NewReader(content))
for { for {
......
package charset package charset
import ( import (
"bytes"
"io/ioutil" "io/ioutil"
"strings" "strings"
"testing" "testing"
...@@ -143,6 +144,40 @@ func TestSniff(t *testing.T) { ...@@ -143,6 +144,40 @@ func TestSniff(t *testing.T) {
} }
} }
// TestReader checks that NewReader, given each sniff test file and its
// declared content type, produces the same bytes as decoding that file
// directly with the encoding named by the test case's want field.
func TestReader(t *testing.T) {
	for _, tc := range sniffTestCases {
		content, err := ioutil.ReadFile("testdata/" + tc.filename)
		if err != nil {
			t.Errorf("%s: error reading file: %v", tc.filename, err)
			continue
		}

		r, err := NewReader(bytes.NewReader(content), tc.declared)
		if err != nil {
			t.Errorf("%s: error creating reader: %v", tc.filename, err)
			continue
		}

		got, err := ioutil.ReadAll(r)
		if err != nil {
			t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err)
			continue
		}

		// NOTE(review): Lookup is expected to return a nil encoding for a
		// label it does not recognise (confirm against its doc comment).
		// Guard here so a bad table entry fails this case cleanly instead
		// of panicking on e.NewDecoder() and aborting the whole test run.
		e, _ := Lookup(tc.want)
		if e == nil {
			t.Errorf("%s: no encoding found for label %q", tc.filename, tc.want)
			continue
		}

		// Build the reference output by decoding the raw file with the
		// expected encoding, bypassing NewReader's detection entirely.
		want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder()))
		if err != nil {
			t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err)
			continue
		}

		if !bytes.Equal(got, want) {
			t.Errorf("%s: got %q, want %q", tc.filename, got, want)
		}
	}
}
var metaTestCases = []struct { var metaTestCases = []struct {
meta, want string meta, want string
}{ }{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment