Commit 00f7cd4b authored by Paul Borman's avatar Paul Borman Committed by Russ Cox

csv: new package

csv reader/writer based on RFC 4180

R=rsc, mattn.jp, r, dchest
CC=golang-dev
https://golang.org/cl/4629085
parent 21752bc1
......@@ -62,6 +62,7 @@ DIRS=\
crypto/x509\
crypto/x509/pkix\
crypto/xtea\
csv\
debug/dwarf\
debug/macho\
debug/elf\
......
# Copyright 2011 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Build description for the csv package.
# NOTE(review): Make.inc and Make.pkg live outside this directory; presumably
# they provide the shared build settings and package rules - confirm there.
include ../../Make.inc
# TARG names the package being built; GOFILES lists its Go source files.
TARG=csv
GOFILES=\
reader.go\
writer.go\
include ../../Make.pkg
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package csv reads and writes comma-separated values (CSV) files.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
// field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
// normal string,"quoted-field"
//
// results in the fields
//
// {`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
// "the ""word"" is true","a ""quoted-field"""
//
// results in
//
// {`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field
//
// "Multi-line
// field","comma is ,"
//
// results in
//
// {`Multi-line
// field`, `comma is ,`}
package csv
import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"unicode"
)
// A ParseError is returned for parsing errors.
// It pairs the position at which the Reader detected a problem with the
// underlying error describing that problem.
// The first line is 1. The first column is 0.
type ParseError struct {
Line int // Line where the error occurred
Column int // Column (rune index) where the error occurred
Error os.Error // The actual error
}
// String renders the error as its position followed by the text of the
// underlying error, in the form "line L, column C: message".
func (e *ParseError) String() string {
	const layout = "line %d, column %d: %s"
	return fmt.Sprintf(layout, e.Line, e.Column, e.Error)
}
// These are the errors that can be returned in ParseError.Error
var (
// ErrTrailingComma: a delimiter was followed directly by end of line or
// end of input while Reader.TrailingComma was false.
ErrTrailingComma = os.NewError("extra delimiter at end of line")
// ErrBareQuote: a quote appeared inside an unquoted field while
// Reader.LazyQuotes was false.
ErrBareQuote = os.NewError("bare \" in non-quoted-field")
// ErrQuote: a quoted field was left unterminated at end of input, or a
// quote inside it was not doubled, while Reader.LazyQuotes was false.
ErrQuote = os.NewError("extraneous \" in field")
// ErrFieldCount: a record did not have Reader.FieldsPerRecord fields.
ErrFieldCount = os.NewError("wrong number of fields in line")
)
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// Comma is the field delimiter. It defaults to ','.
//
// Comment, if not 0, is the comment character. Lines beginning with the
// Comment character are ignored.
//
// If FieldsPerRecord is positive, Read requires each record to
// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
// the number of fields in the first record, so that future records must
// have the same field count. If FieldsPerRecord is negative, no check is
// made and records may have a variable number of fields.
//
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.
//
// If TrailingComma is true, the last field may be an unquoted empty field.
//
// If TrimLeadingSpace is true, leading white space in a field is ignored.
type Reader struct {
Comma int // Field delimiter (set to ',' by NewReader)
Comment int // Comment character for start of line
FieldsPerRecord int // Number of expected fields per record
LazyQuotes bool // Allow lazy quotes
TrailingComma bool // Allow trailing comma
TrimLeadingSpace bool // Trim leading space
// line and column track the current read position; they are copied into
// any ParseError that is reported.
line int
column int
// r is the buffered source the runes are read from.
r *bufio.Reader
// field accumulates the text of the field currently being parsed.
field bytes.Buffer
}
// NewReader returns a new Reader that reads from r.
// The input is wrapped in a bufio.Reader and the field delimiter is set
// to ','; every other option is left at its zero value.
func NewReader(r io.Reader) *Reader {
	rd := new(Reader)
	rd.Comma = ','
	rd.r = bufio.NewReader(r)
	return rd
}
// error wraps err in a ParseError stamped with the Reader's current
// line and column.
func (r *Reader) error(err os.Error) os.Error {
	pe := new(ParseError)
	pe.Line, pe.Column = r.line, r.column
	pe.Error = err
	return pe
}
// Read reads one record from r.  The record is a slice of strings with each
// string representing one field.
//
// Lines that yield no record (blank lines and comment lines) are skipped.
// When the record's field count disagrees with a positive FieldsPerRecord,
// the record is still returned, together with a ParseError wrapping
// ErrFieldCount.
func (r *Reader) Read() (record []string, err os.Error) {
	// Keep parsing until a line actually produces a record.
	record, err = r.parseRecord()
	for record == nil {
		if err != nil {
			return nil, err
		}
		record, err = r.parseRecord()
	}

	switch {
	case r.FieldsPerRecord == 0:
		// First record seen: it fixes the expected field count.
		r.FieldsPerRecord = len(record)
	case r.FieldsPerRecord > 0 && len(record) != r.FieldsPerRecord:
		r.column = 0 // report at start of record
		return record, r.error(ErrFieldCount)
	}
	return record, nil
}
// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// Reaching the end of the input is the normal way to finish and is reported
// as a nil error; any other error discards the records read so far.
func (r *Reader) ReadAll() (records [][]string, err os.Error) {
	for {
		rec, rerr := r.Read()
		switch {
		case rerr == os.EOF:
			return records, nil
		case rerr != nil:
			return nil, rerr
		}
		records = append(records, rec)
	}
	panic("unreachable")
}
// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read.  r.column will point to the start
// of this rune, not the end of this rune.
func (r *Reader) readRune() (int, os.Error) {
	c, _, err := r.r.ReadRune()

	// A \r immediately followed by \n is collapsed into a single \n.
	// Any other rune after the \r is pushed back and the \r is returned
	// unchanged.  Files mixing \r\n and bare \n are not detected.
	if c == '\r' {
		c, _, err = r.r.ReadRune()
		if err == nil && c != '\n' {
			r.r.UnreadRune()
			c = '\r'
		}
	}

	r.column++
	return c, err
}
// unreadRune pushes the most recently read rune back onto r and moves the
// column counter back by one to match.
func (r *Reader) unreadRune() {
	r.column--
	r.r.UnreadRune()
}
// skip discards runes up to and including the first occurrence of delim.
// It returns nil once delim has been consumed, or the read error that
// stopped it first.
func (r *Reader) skip(delim int) os.Error {
	c, err := r.readRune()
	for err == nil && c != delim {
		c, err = r.readRune()
	}
	return err
}
// parseRecord reads and parses a single csv record from r.
// A nil slice with a nil error means the line held no record (a blank line
// or a comment line) and the caller should try again.
func (r *Reader) parseRecord() (fields []string, err os.Error) {
	// Every record begins on a fresh line.  Lines are numbered from 1, and
	// the column starts at -1 so that the increment in readRune leaves it
	// pointing at the rune just read.
	r.line++
	r.column = -1

	// Peek at the first rune: an error ends the record immediately, and a
	// comment character (when comments are enabled) discards the whole line.
	first, _, err := r.r.ReadRune()
	switch {
	case err != nil:
		return nil, err
	case r.Comment != 0 && first == r.Comment:
		return nil, r.skip('\n')
	}
	r.r.UnreadRune()

	// At this point we have at least one field.
	for {
		ok, end, ferr := r.parseField()
		if ok {
			fields = append(fields, r.field.String())
		}
		switch {
		case end == '\n', ferr == os.EOF:
			// End of the record; an EOF is passed along with the fields.
			return fields, ferr
		case ferr != nil:
			return nil, ferr
		}
	}
	panic("unreachable")
}
// parseField parses the next field in the record.  The read field is
// located in r.field.  Delim is the first character not part of the field
// (r.Comma or '\n'); it is 0 when the field was ended by end of input or
// by an error.
//
// haveField reports whether r.field holds a field to be added to the
// record.  It is false for a blank line and whenever a parse error is
// returned.  os.EOF may be returned together with haveField == true when
// the input ends right after the final field.
func (r *Reader) parseField() (haveField bool, delim int, err os.Error) {
	r.field.Reset()

	rune, err := r.readRune()
	if err != nil {
		// If we have EOF and are not at the start of a line
		// then we return the empty field.  We have already
		// checked for trailing commas if needed.
		if err == os.EOF && r.column != 0 {
			return true, 0, err
		}
		return false, 0, err
	}

	if r.TrimLeadingSpace {
		// unicode.IsSpace also matches '\n', but a newline terminates the
		// record and must never be trimmed away: doing so would merge the
		// following line into this record.
		for rune != '\n' && unicode.IsSpace(rune) {
			rune, err = r.readRune()
			if err != nil {
				// At least one space has been consumed, so we are not
				// at the start of a line: end of input here leaves an
				// empty final field, exactly as in the untrimmed case.
				if err == os.EOF {
					return true, 0, err
				}
				return false, 0, err
			}
		}
	}

	switch rune {
	case r.Comma:
		// Empty field; the trailing comma check is done below.

	case '\n':
		// We are a trailing empty field or a blank line
		if r.column == 0 {
			return false, rune, nil
		}
		return true, rune, nil

	case '"':
		// quoted field
	Quoted:
		for {
			rune, err = r.readRune()
			if err != nil {
				if err == os.EOF {
					// Unterminated quoted field.
					if r.LazyQuotes {
						return true, 0, err
					}
					return false, 0, r.error(ErrQuote)
				}
				return false, 0, err
			}
			switch rune {
			case '"':
				// Either the closing quote or the first half of a
				// doubled quote; the next rune decides which.
				rune, err = r.readRune()
				if err != nil || rune == r.Comma {
					break Quoted
				}
				if rune == '\n' {
					return true, rune, nil
				}
				if rune != '"' {
					if !r.LazyQuotes {
						r.column-- // report the offending quote
						return false, 0, r.error(ErrQuote)
					}
					// accept the bare quote
					r.field.WriteRune('"')
				}
			case '\n':
				// A newline inside quotes is field data, but it
				// still starts a new input line.
				r.line++
				r.column = -1
			}
			r.field.WriteRune(rune)
		}

	default:
		// unquoted field
		for {
			r.field.WriteRune(rune)
			rune, err = r.readRune()
			if err != nil || rune == r.Comma {
				break
			}
			if rune == '\n' {
				return true, rune, nil
			}
			if !r.LazyQuotes && rune == '"' {
				return false, 0, r.error(ErrBareQuote)
			}
		}
	}

	if err != nil {
		if err == os.EOF {
			return true, 0, err
		}
		return false, 0, err
	}

	// From here on the field was terminated by r.Comma, held in rune.
	if !r.TrailingComma {
		// We don't allow trailing commas.  See if we
		// are at the end of the line (being mindful
		// of trimming spaces).  The peeked rune is kept in its own
		// variable so that the delimiter returned below stays r.Comma.
		c := r.column
		next, perr := r.readRune()
		if r.TrimLeadingSpace {
			// As above, a newline must not be skipped as white space.
			for next != '\n' && unicode.IsSpace(next) {
				next, perr = r.readRune()
				if perr != nil {
					break
				}
			}
		}
		if perr == os.EOF || next == '\n' {
			r.column = c // report the comma
			return false, 0, r.error(ErrTrailingComma)
		}
		r.unreadRune()
	}
	return true, rune, nil
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package csv
import (
"reflect"
"strings"
"testing"
)
// readTests is the table driving TestRead. Each entry supplies an input,
// the Reader options to apply, and either the expected records or the
// expected error (optionally with its line and column).
var readTests = []struct {
Name string // test name, used in failure messages
Input string // CSV text fed to the Reader
Output [][]string // expected records when no error is expected
UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1
// These fields are copied into the Reader
Comma int
Comment int
FieldsPerRecord int
LazyQuotes bool
TrailingComma bool
TrimLeadingSpace bool
Error string // expected substring of the error text; "" means no error
Line int // Expected error line if != 0
Column int // Expected error column if line != 0
}{
{
Name: "Simple",
Input: "a,b,c\n",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "CRLF",
Input: "a,b\r\nc,d\r\n",
Output: [][]string{{"a", "b"}, {"c", "d"}},
},
{
Name: "BareCR",
Input: "a,b\rc,d\r\n",
Output: [][]string{{"a", "b\rc", "d"}},
},
{
Name: "RFC4180test",
UseFieldsPerRecord: true,
Input: `#field1,field2,field3
"aaa","bb
b","ccc"
"a,a","b""bb","ccc"
zzz,yyy,xxx
`,
Output: [][]string{
{"#field1", "field2", "field3"},
{"aaa", "bb\nb", "ccc"},
{"a,a", `b"bb`, "ccc"},
{"zzz", "yyy", "xxx"},
},
},
{
Name: "NoEOLTest",
Input: "a,b,c",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "Semicolon",
Comma: ';',
Input: "a;b;c\n",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "MultiLine",
Input: `"two
line","one line","three
line
field"`,
Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}},
},
{
Name: "BlankLine",
Input: "a,b,c\n\nd,e,f\n\n",
Output: [][]string{
{"a", "b", "c"},
{"d", "e", "f"},
},
},
{
Name: "TrimSpace",
Input: " a, b, c\n",
TrimLeadingSpace: true,
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "LeadingSpace",
Input: " a, b, c\n",
Output: [][]string{{" a", " b", " c"}},
},
{
Name: "Comment",
Comment: '#',
Input: "#1,2,3\na,b,c\n#comment",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "NoComment",
Input: "#1,2,3\na,b,c",
Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}},
},
{
Name: "LazyQuotes",
LazyQuotes: true,
Input: `a "word","1"2",a","b`,
Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}},
},
{
Name: "BareQuotes",
LazyQuotes: true,
Input: `a "word","1"2",a"`,
Output: [][]string{{`a "word"`, `1"2`, `a"`}},
},
{
Name: "BareDoubleQuotes",
LazyQuotes: true,
Input: `a""b,c`,
Output: [][]string{{`a""b`, `c`}},
},
{
Name: "BadDoubleQuotes",
Input: `a""b,c`,
Output: [][]string{{`a""b`, `c`}},
Error: `bare " in non-quoted-field`, Line: 1, Column: 1,
},
{
Name: "TrimQuote",
Input: ` "a"," b",c`,
TrimLeadingSpace: true,
Output: [][]string{{"a", " b", "c"}},
},
{
Name: "BadBareQuote",
Input: `a "word","b"`,
Error: `bare " in non-quoted-field`, Line: 1, Column: 2,
},
{
Name: "BadTrailingQuote",
Input: `"a word",b"`,
Error: `bare " in non-quoted-field`, Line: 1, Column: 10,
},
{
Name: "ExtraneousQuote",
Input: `"a "word","b"`,
Error: `extraneous " in field`, Line: 1, Column: 3,
},
{
Name: "BadFieldCount",
UseFieldsPerRecord: true,
Input: "a,b,c\nd,e",
Error: "wrong number of fields", Line: 2,
},
{
Name: "BadFieldCount1",
UseFieldsPerRecord: true,
FieldsPerRecord: 2,
Input: `a,b,c`,
Error: "wrong number of fields", Line: 1,
},
{
Name: "FieldCount",
Input: "a,b,c\nd,e",
Output: [][]string{{"a", "b", "c"}, {"d", "e"}},
},
{
Name: "BadTrailingCommaEOF",
Input: "a,b,c,",
Error: "extra delimiter at end of line", Line: 1, Column: 5,
},
{
Name: "BadTrailingCommaEOL",
Input: "a,b,c,\n",
Error: "extra delimiter at end of line", Line: 1, Column: 5,
},
{
Name: "BadTrailingCommaSpaceEOF",
TrimLeadingSpace: true,
Input: "a,b,c, ",
Error: "extra delimiter at end of line", Line: 1, Column: 5,
},
{
Name: "BadTrailingCommaSpaceEOL",
TrimLeadingSpace: true,
Input: "a,b,c, \n",
Error: "extra delimiter at end of line", Line: 1, Column: 5,
},
{
Name: "BadTrailingCommaLine3",
TrimLeadingSpace: true,
Input: "a,b,c\nd,e,f\ng,hi,",
Error: "extra delimiter at end of line", Line: 3, Column: 4,
},
{
Name: "NotTrailingComma3",
Input: "a,b,c, \n",
Output: [][]string{{"a", "b", "c", " "}},
},
{
Name: "CommaFieldTest",
TrailingComma: true,
Input: `x,y,z,w
x,y,z,
x,y,,
x,,,
,,,
"x","y","z","w"
"x","y","z",""
"x","y","",""
"x","","",""
"","","",""
`,
Output: [][]string{
{"x", "y", "z", "w"},
{"x", "y", "z", ""},
{"x", "y", "", ""},
{"x", "", "", ""},
{"", "", "", ""},
{"x", "y", "z", "w"},
{"x", "y", "z", ""},
{"x", "y", "", ""},
{"x", "", "", ""},
{"", "", "", ""},
},
},
}
// TestRead runs every entry of readTests through a freshly configured
// Reader and checks either the records produced or the error reported,
// including the error position when the entry specifies one.
func TestRead(t *testing.T) {
	for _, tt := range readTests {
		r := NewReader(strings.NewReader(tt.Input))
		r.Comment = tt.Comment
		if tt.UseFieldsPerRecord {
			r.FieldsPerRecord = tt.FieldsPerRecord
		} else {
			// Disable the field-count check unless the test asks for it.
			r.FieldsPerRecord = -1
		}
		r.LazyQuotes = tt.LazyQuotes
		r.TrailingComma = tt.TrailingComma
		r.TrimLeadingSpace = tt.TrimLeadingSpace
		if tt.Comma != 0 {
			r.Comma = tt.Comma
		}

		out, err := r.ReadAll()
		switch {
		case tt.Error != "":
			if err == nil || !strings.Contains(err.String(), tt.Error) {
				t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error)
				continue
			}
			if tt.Line == 0 {
				continue // no position to verify
			}
			// Only a *ParseError carries a position.  Check the assertion
			// so that any other error type is reported as a failure
			// instead of causing a nil pointer dereference.
			perr, ok := err.(*ParseError)
			if !ok {
				t.Errorf("%s: error %v is not a *ParseError, expected position %d:%d", tt.Name, err, tt.Line, tt.Column)
				continue
			}
			if tt.Line != perr.Line || tt.Column != perr.Column {
				t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column)
			}
		case err != nil:
			t.Errorf("%s: unexpected error %v", tt.Name, err)
		case !reflect.DeepEqual(out, tt.Output):
			t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output)
		}
	}
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package csv
import (
"bufio"
"io"
"os"
"strings"
"unicode"
"utf8"
)
// A Writer writes records to a CSV encoded file.
//
// As returned by NewWriter, a Writer writes records terminated by a
// newline and uses ',' as the field delimiter. The exported fields can be
// changed to customize the details before the first call to Write or WriteAll.
//
// Comma is the field delimiter.
//
// If UseCRLF is true, the Writer ends each record with \r\n instead of \n.
type Writer struct {
Comma int // Field delimiter (set to ',' by NewWriter)
UseCRLF bool // True to use \r\n as the line terminator
// w buffers the output; it is flushed by Flush and WriteAll.
w *bufio.Writer
}
// NewWriter returns a new Writer that writes to w.
// Output goes through a bufio.Writer and the field delimiter is set to ',';
// UseCRLF is left false.
func NewWriter(w io.Writer) *Writer {
	cw := new(Writer)
	cw.Comma = ','
	cw.w = bufio.NewWriter(w)
	return cw
}
// Write writes a single CSV record to w along with any necessary quoting.
// A record is a slice of strings with each string being one field.
// The output is buffered; it reaches the underlying writer on Flush.
func (w *Writer) Write(record []string) (err os.Error) {
	for i, f := range record {
		// Every field except the first is preceded by the delimiter.
		if i > 0 {
			_, err = w.w.WriteRune(w.Comma)
			if err != nil {
				return err
			}
		}

		// Fields that need no quoting are copied through verbatim.
		if !w.fieldNeedsQuotes(f) {
			_, err = w.w.WriteString(f)
			if err != nil {
				return err
			}
			continue
		}

		err = w.w.WriteByte('"')
		if err != nil {
			return err
		}
		for _, c := range f {
			if c == '"' {
				// A quote inside a quoted field is doubled.
				_, err = w.w.WriteString(`""`)
			} else if c == '\r' {
				// In CRLF mode a carriage return is dropped; the \r\n
				// pair is produced when the newline is written.
				if !w.UseCRLF {
					err = w.w.WriteByte('\r')
				}
			} else if c == '\n' {
				if w.UseCRLF {
					_, err = w.w.WriteString("\r\n")
				} else {
					err = w.w.WriteByte('\n')
				}
			} else {
				_, err = w.w.WriteRune(c)
			}
			if err != nil {
				return err
			}
		}
		err = w.w.WriteByte('"')
		if err != nil {
			return err
		}
	}

	// Terminate the record.
	eol := "\n"
	if w.UseCRLF {
		eol = "\r\n"
	}
	_, err = w.w.WriteString(eol)
	return err
}
// Flush writes any buffered data to the underlying io.Writer.
// The result of the underlying bufio.Writer Flush call is not checked
// here, so a failed flush is not reported to the caller.
func (w *Writer) Flush() {
w.w.Flush()
}
// WriteAll writes multiple CSV records to w using Write and then flushes
// the buffered output.
//
// Writing stops at the first record that fails and that error is returned.
// The buffer is flushed even then, so everything written before the failure
// still reaches the underlying writer.  If every record was written, the
// error from the flush (if any) is returned.
func (w *Writer) WriteAll(records [][]string) (err os.Error) {
	for _, record := range records {
		if err = w.Write(record); err != nil {
			break
		}
	}
	// Flush through the bufio.Writer directly so that its error is visible;
	// a flush failure must not hide an earlier write failure.
	ferr := w.w.Flush()
	if err == nil {
		err = ferr
	}
	return err
}
// fieldNeedsQuotes returns true if our field must be enclosed in quotes.
// Empty fields, fields with a Comma, fields with a quote, carriage return
// or newline, and fields which start with a space must be enclosed in quotes.
func (w *Writer) fieldNeedsQuotes(field string) bool {
	switch {
	case field == "":
		return true
	case strings.IndexRune(field, w.Comma) >= 0:
		return true
	case strings.IndexAny(field, "\"\r\n") >= 0:
		return true
	}
	// Otherwise only leading white space forces quoting.
	first, _ := utf8.DecodeRuneInString(field)
	return unicode.IsSpace(first)
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package csv
import (
"bytes"
"testing"
)
// writeTests is the table driving TestWrite: each entry lists the records
// to write, the exact text expected, and whether UseCRLF is enabled.
var writeTests = []struct {
Input [][]string // records passed to WriteAll
Output string // exact expected output
UseCRLF bool // copied into the Writer
}{
{Input: [][]string{{"abc"}}, Output: "abc\n"},
{Input: [][]string{{"abc"}}, Output: "abc\r\n", UseCRLF: true},
{Input: [][]string{{`"abc"`}}, Output: `"""abc"""` + "\n"},
{Input: [][]string{{`a"b`}}, Output: `"a""b"` + "\n"},
{Input: [][]string{{`"a"b"`}}, Output: `"""a""b"""` + "\n"},
{Input: [][]string{{" abc"}}, Output: `" abc"` + "\n"},
{Input: [][]string{{"abc,def"}}, Output: `"abc,def"` + "\n"},
{Input: [][]string{{"abc", "def"}}, Output: "abc,def\n"},
{Input: [][]string{{"abc"}, {"def"}}, Output: "abc\ndef\n"},
{Input: [][]string{{"abc\ndef"}}, Output: "\"abc\ndef\"\n"},
{Input: [][]string{{"abc\ndef"}}, Output: "\"abc\r\ndef\"\r\n", UseCRLF: true},
}
// TestWrite writes each writeTests entry into an in-memory buffer with
// WriteAll and compares the bytes produced with the expected output.
func TestWrite(t *testing.T) {
	for i := range writeTests {
		tc := writeTests[i]

		var buf bytes.Buffer
		w := NewWriter(&buf)
		w.UseCRLF = tc.UseCRLF

		if err := w.WriteAll(tc.Input); err != nil {
			t.Errorf("Unexpected error: %s\n", err)
		}
		if got := buf.String(); got != tc.Output {
			t.Errorf("#%d: out=%q want %q", i, got, tc.Output)
		}
	}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment