Commit bd06d482 authored by Justin Nuß's avatar Justin Nuß Committed by Brad Fitzpatrick

encoding/csv: avoid allocations when reading records

This commit changes parseRecord to allocate a single string per record,
instead of per field, by using indexes into the raw record.

Benchstat (done with f69991c1)

name                          old time/op    new time/op    delta
Read-8                          3.17µs ± 0%    2.78µs ± 1%  -12.35%  (p=0.016 n=4+5)
ReadWithFieldsPerRecord-8       3.18µs ± 1%    2.79µs ± 1%  -12.23%  (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8    4.59µs ± 0%    2.77µs ± 0%  -39.58%  (p=0.016 n=4+5)
ReadLargeFields-8               57.0µs ± 0%    55.7µs ± 0%   -2.18%  (p=0.008 n=5+5)

name                          old alloc/op   new alloc/op   delta
Read-8                            660B ± 0%      664B ± 0%   +0.61%  (p=0.008 n=5+5)
ReadWithFieldsPerRecord-8         660B ± 0%      664B ± 0%   +0.61%  (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8    1.14kB ± 0%    0.66kB ± 0%  -41.75%  (p=0.008 n=5+5)
ReadLargeFields-8               3.86kB ± 0%    3.94kB ± 0%   +1.86%  (p=0.008 n=5+5)

name                          old allocs/op  new allocs/op  delta
Read-8                            30.0 ± 0%      18.0 ± 0%  -40.00%  (p=0.008 n=5+5)
ReadWithFieldsPerRecord-8         30.0 ± 0%      18.0 ± 0%  -40.00%  (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8      50.0 ± 0%      18.0 ± 0%  -64.00%  (p=0.008 n=5+5)
ReadLargeFields-8                 66.0 ± 0%      24.0 ± 0%  -63.64%  (p=0.008 n=5+5)

For a simple application that I wrote, which reads in a CSV file (via
ReadAll) and outputs the number of rows read (15857625 rows), this change
reduces the total time on my notebook from ~58 seconds to ~48 seconds.

This reduces time and allocations (bytes) each by ~6% for a real world
CSV file at work (~230000 rows, 13 colums).

Updates #16791

Change-Id: Ia07177c94624e55cdd3064a7d2751fb69322d3e4
Reviewed-on: https://go-review.googlesource.com/24723Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent dce0df29
......@@ -114,7 +114,14 @@ type Reader struct {
line int
column int
r *bufio.Reader
field bytes.Buffer
// lineBuffer holds the unescaped fields read by readField, one after another.
// The fields can be accessed by using the indexes in fieldIndexes.
// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
// fieldIndexes will contain the indexes 0, 1, 2, 5.
lineBuffer bytes.Buffer
// Indexes of fields inside lineBuffer
// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
fieldIndexes []int
}
// NewReader returns a new Reader that reads from r.
......@@ -233,31 +240,54 @@ func (r *Reader) parseRecord() (fields []string, err error) {
}
r.r.UnreadRune()
r.lineBuffer.Reset()
r.fieldIndexes = r.fieldIndexes[:0]
// At this point we have at least one field.
for {
idx := r.lineBuffer.Len()
haveField, delim, err := r.parseField()
if haveField {
// If FieldsPerRecord is greater than 0 we can assume the final
// length of fields to be equal to FieldsPerRecord.
if r.FieldsPerRecord > 0 && fields == nil {
fields = make([]string, 0, r.FieldsPerRecord)
}
fields = append(fields, r.field.String())
r.fieldIndexes = append(r.fieldIndexes, idx)
}
if delim == '\n' || err == io.EOF {
return fields, err
} else if err != nil {
if len(r.fieldIndexes) == 0 {
return nil, err
}
break
}
if err != nil {
return nil, err
}
}
fieldCount := len(r.fieldIndexes)
// Using this approach (creating a single string and taking slices of it)
// means that a single reference to any of the fields will retain the whole
// string. The risk of a nontrivial space leak caused by this is considered
// minimal and a tradeoff for better performance through the combined
// allocations.
line := r.lineBuffer.String()
fields = make([]string, fieldCount)
for i, idx := range r.fieldIndexes {
if i == fieldCount-1 {
fields[i] = line[idx:]
} else {
fields[i] = line[idx:r.fieldIndexes[i+1]]
}
}
return fields, nil
}
// parseField parses the next field in the record. The read field is
// located in r.field. Delim is the first character not part of the field
// appended to r.lineBuffer. Delim is the first character not part of the field
// (r.Comma or '\n').
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
r.field.Reset()
r1, err := r.readRune()
for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
r1, err = r.readRune()
......@@ -310,19 +340,19 @@ func (r *Reader) parseField() (haveField bool, delim rune, err error) {
return false, 0, r.error(ErrQuote)
}
// accept the bare quote
r.field.WriteRune('"')
r.lineBuffer.WriteRune('"')
}
case '\n':
r.line++
r.column = -1
}
r.field.WriteRune(r1)
r.lineBuffer.WriteRune(r1)
}
default:
// unquoted field
for {
r.field.WriteRune(r1)
r.lineBuffer.WriteRune(r1)
r1, err = r.readRune()
if err != nil || r1 == r.Comma {
break
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment