Commit 2181653b authored by Justin Nuß's avatar Justin Nuß Committed by Brad Fitzpatrick

encoding/csv: add option to reuse slices returned by Read

In many cases the records returned by Reader.Read will only be used between calls
to Read and become garbage once a new record is read. In this case, instead of
allocating a new slice on each call to Read, we can reuse the last allocated slice
for successive calls to avoid unnecessary allocations.

This change adds a new field ReuseRecord to the Reader struct to enable this reuse.

ReuseRecord is false by default to avoid breaking existing code which dependss on
the current behaviour.

I also added 4 new benchmarks, corresponding to the existing Read benchmarks, which
set ReuseRecord to true.

Benchstat on my local machine (old is ReuseRecord = false, new is ReuseRecord = true)

name                          old time/op    new time/op    delta
Read-8                          2.75µs ± 2%    1.88µs ± 1%  -31.52%  (p=0.000 n=14+15)
ReadWithFieldsPerRecord-8       2.75µs ± 0%    1.89µs ± 1%  -31.43%  (p=0.000 n=13+13)
ReadWithoutFieldsPerRecord-8    2.77µs ± 1%    1.88µs ± 1%  -32.06%  (p=0.000 n=15+15)
ReadLargeFields-8               55.4µs ± 1%    54.2µs ± 0%   -2.07%  (p=0.000 n=15+14)

name                          old alloc/op   new alloc/op   delta
Read-8                            664B ± 0%       24B ± 0%  -96.39%  (p=0.000 n=15+15)
ReadWithFieldsPerRecord-8         664B ± 0%       24B ± 0%  -96.39%  (p=0.000 n=15+15)
ReadWithoutFieldsPerRecord-8      664B ± 0%       24B ± 0%  -96.39%  (p=0.000 n=15+15)
ReadLargeFields-8               3.94kB ± 0%    2.98kB ± 0%  -24.39%  (p=0.000 n=15+15)

name                          old allocs/op  new allocs/op  delta
Read-8                            18.0 ± 0%       8.0 ± 0%  -55.56%  (p=0.000 n=15+15)
ReadWithFieldsPerRecord-8         18.0 ± 0%       8.0 ± 0%  -55.56%  (p=0.000 n=15+15)
ReadWithoutFieldsPerRecord-8      18.0 ± 0%       8.0 ± 0%  -55.56%  (p=0.000 n=15+15)
ReadLargeFields-8                 24.0 ± 0%      12.0 ± 0%  -50.00%  (p=0.000 n=15+15)

Fixes #19721

Change-Id: I79b14128bb9bb3465f53f40f93b1b528a9da6f58
Reviewed-on: https://go-review.googlesource.com/41730Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent ba8ff87d
......@@ -110,6 +110,10 @@ type Reader struct {
// If TrimLeadingSpace is true, leading white space in a field is ignored.
// This is done even if the field delimiter, Comma, is white space.
TrimLeadingSpace bool
// ReuseRecord controls whether calls to Read may return a slice sharing
// the backing array of the previous call's returned slice for performance.
// By default, each call to Read returns newly allocated memory owned by the caller.
ReuseRecord bool
line int
column int
......@@ -122,6 +126,9 @@ type Reader struct {
// Indexes of fields inside lineBuffer
// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
fieldIndexes []int
// only used when ReuseRecord == true
lastRecord []string
}
// NewReader returns a new Reader that reads from r.
......@@ -147,26 +154,17 @@ func (r *Reader) error(err error) error {
// Except for that case, Read always returns either a non-nil
// record or a non-nil error, but not both.
// If there is no data left to be read, Read returns nil, io.EOF.
// If ReuseRecord is true, the returned slice may be shared
// between multiple calls to Read.
func (r *Reader) Read() (record []string, err error) {
for {
record, err = r.parseRecord()
if record != nil {
break
}
if err != nil {
return nil, err
}
if r.ReuseRecord {
record, err = r.readRecord(r.lastRecord)
r.lastRecord = record
} else {
record, err = r.readRecord(nil)
}
if r.FieldsPerRecord > 0 {
if len(record) != r.FieldsPerRecord {
r.column = 0 // report at start of record
return record, r.error(ErrFieldCount)
}
} else if r.FieldsPerRecord == 0 {
r.FieldsPerRecord = len(record)
}
return record, nil
return record, err
}
// ReadAll reads all the remaining records from r.
......@@ -176,7 +174,7 @@ func (r *Reader) Read() (record []string, err error) {
// reported.
func (r *Reader) ReadAll() (records [][]string, err error) {
for {
record, err := r.Read()
record, err := r.readRecord(nil)
if err == io.EOF {
return records, nil
}
......@@ -187,6 +185,31 @@ func (r *Reader) ReadAll() (records [][]string, err error) {
}
}
// readRecord reads and parses a single csv record from r.
// Unlike parseRecord, readRecord handles FieldsPerRecord.
// If dst has enough capacity it will be used for the returned record.
func (r *Reader) readRecord(dst []string) (record []string, err error) {
for {
record, err = r.parseRecord(dst)
if record != nil {
break
}
if err != nil {
return nil, err
}
}
if r.FieldsPerRecord > 0 {
if len(record) != r.FieldsPerRecord {
r.column = 0 // report at start of record
return record, r.error(ErrFieldCount)
}
} else if r.FieldsPerRecord == 0 {
r.FieldsPerRecord = len(record)
}
return record, nil
}
// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read. r.column will point to the start
// of this rune, not the end of this rune.
......@@ -223,7 +246,8 @@ func (r *Reader) skip(delim rune) error {
}
// parseRecord reads and parses a single csv record from r.
func (r *Reader) parseRecord() (fields []string, err error) {
// If dst has enough capacity it will be used for the returned fields.
func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
// Each record starts on a new line. We increment our line
// number (lines start at 1, not 0) and set column to -1
// so as we increment in readRune it points to the character we read.
......@@ -275,7 +299,12 @@ func (r *Reader) parseRecord() (fields []string, err error) {
// minimal and a tradeoff for better performance through the combined
// allocations.
line := r.lineBuffer.String()
fields = make([]string, fieldCount)
if cap(dst) >= fieldCount {
fields = dst[:fieldCount]
} else {
fields = make([]string, fieldCount)
}
for i, idx := range r.fieldIndexes {
if i == fieldCount-1 {
......
......@@ -24,6 +24,7 @@ var readTests = []struct {
LazyQuotes bool
TrailingComma bool
TrimLeadingSpace bool
ReuseRecord bool
Error string
Line int // Expected error line if != 0
......@@ -260,6 +261,15 @@ x,,,
{"c", "d", "e"},
},
},
{
Name: "ReadAllReuseRecord",
ReuseRecord: true,
Input: "a,b\nc,d",
Output: [][]string{
{"a", "b"},
{"c", "d"},
},
},
}
func TestRead(t *testing.T) {
......@@ -274,6 +284,7 @@ func TestRead(t *testing.T) {
r.LazyQuotes = tt.LazyQuotes
r.TrailingComma = tt.TrailingComma
r.TrimLeadingSpace = tt.TrimLeadingSpace
r.ReuseRecord = tt.ReuseRecord
if tt.Comma != 0 {
r.Comma = tt.Comma
}
......@@ -369,3 +380,23 @@ xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzz
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
`, 3))
}
func BenchmarkReadReuseRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData)
}
func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData)
}
func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData)
}
func BenchmarkReadReuseRecordLargeFields(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv
,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
`, 3))
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment