Commit 13be616e authored by Rob Pike

bufio: allow Scanner to accept a user-provided buffer

Add Scanner.Buffer, which lets the user give a buffer to
the scanner and set the maximum token size.

We call it Buffer not SetBuffer for consistency with Split, which
perhaps should have been called SetSplit; too late regardless.

Both Buffer and Split panic if they are called after Scan. The
panic in Split is new, but the comment on the method already
said it needed to be called first, so we might as well add the
verification while we're doing it for Buffer.

This method allows precise user control of storage.

Fixes #11702.

Change-Id: I80e3d0e3830562fdabd4f7b08f322e1378248c39
Reviewed-on: https://go-review.googlesource.com/14599
Reviewed-by: Andrew Gerrand <adg@golang.org>
Reviewed-by: roger peppe <rogpeppe@gmail.com>
parent 1536c2e0
......@@ -37,6 +37,7 @@ type Scanner struct {
end int // End of data in buf.
err error // Sticky error.
empties int // Count of successive empty tokens.
scanCalled bool // Scan has been called; buffer is in use.
}
// SplitFunc is the signature of the split function used to tokenize the
......@@ -65,10 +66,13 @@ var (
)
const (
// MaxScanTokenSize is the maximum size used to buffer a token.
// MaxScanTokenSize is the maximum size used to buffer a token
// unless the user provides an explicit buffer with Scanner.Buffer.
// The actual maximum token size may be smaller as the buffer
// may need to include, for instance, a newline.
MaxScanTokenSize = 64 * 1024
startBufSize = 4096 // Size of initial allocation for buffer; Scan falls back to it when the current buffer is empty.
)
// NewScanner returns a new Scanner to read from r.
......@@ -78,7 +82,6 @@ func NewScanner(r io.Reader) *Scanner {
r: r,
split: ScanLines,
maxTokenSize: MaxScanTokenSize,
buf: make([]byte, 4096), // Plausible starting size; needn't be large.
}
}
......@@ -112,6 +115,7 @@ func (s *Scanner) Text() string {
// Scan panics if the split function returns 100 empty tokens without
// advancing the input. This is a common error mode for scanners.
func (s *Scanner) Scan() bool {
s.scanCalled = true
// Loop until we have a token.
for {
// See if we can get a token with what we already have.
......@@ -162,7 +166,10 @@ func (s *Scanner) Scan() bool {
s.setErr(ErrTooLong)
return false
}
newSize := len(s.buf) * 2
newSize := len(s.buf) * 2 // See protection against overflow in Buffer.
if newSize == 0 {
newSize = startBufSize
}
if newSize > s.maxTokenSize {
newSize = s.maxTokenSize
}
......@@ -217,9 +224,37 @@ func (s *Scanner) setErr(err error) {
}
}
// Split sets the split function for the Scanner. If called, it must be
// called before Scan. The default split function is ScanLines.
// Buffer sets the initial buffer to use when scanning and the maximum
// size of buffer that may be allocated during scanning. The maximum
// token size is the larger of max and cap(buf). If max <= cap(buf),
// Scan will use this buffer only and do no allocation.
//
// The full capacity of buf is used, not just its length. So that the
// buffer-doubling arithmetic in Scan cannot overflow an int, max is
// capped at half the largest int value.
//
// By default, Scan uses an internal buffer and sets the
// maximum token size to MaxScanTokenSize.
//
// Buffer panics if it is called after scanning has started.
func (s *Scanner) Buffer(buf []byte, max int) {
	if s.scanCalled {
		panic("Buffer called after Scan")
	}
	// Expose the whole backing array so Scan can fill all of it.
	s.buf = buf[0:cap(buf)]
	// Guarantee no overflow: Scan grows the buffer by computing
	// len(s.buf)*2. Comparing max against maxInt itself can never be
	// true for an int, so the limit has to be maxInt/2 for the doubling
	// to stay in range.
	// NOTE(review): this relies on Scan reporting ErrTooLong instead of
	// doubling once len(s.buf) has reached maxTokenSize — confirm
	// against the guard that precedes the doubling in Scan.
	const maxInt = int(^uint(0) >> 1)
	if max > maxInt/2 {
		max = maxInt / 2
	}
	s.maxTokenSize = max
}
// Split installs split as the function the Scanner uses to break its
// input into tokens. When Split is never called, the Scanner uses
// ScanLines.
//
// Calling Split once scanning has started is a programming error and
// panics.
func (s *Scanner) Split(split SplitFunc) {
	if !s.scanCalled {
		s.split = split
		return
	}
	panic("Split called after Scan")
}
......
......@@ -522,3 +522,19 @@ func TestEmptyLinesOK(t *testing.T) {
t.Fatalf("stopped with %d left to process", c)
}
}
// TestHugeBuffer makes sure we can read a huge token if a big enough
// buffer is provided.
//
// The input is a single line twice as long as MaxScanTokenSize, so it only
// fits because Buffer raises the limit to three times MaxScanTokenSize.
// The supplied buffer is just 100 bytes, far smaller than the token.
func TestHugeBuffer(t *testing.T) {
	text := strings.Repeat("x", 2*MaxScanTokenSize)
	s := NewScanner(strings.NewReader(text + "\n"))
	s.Buffer(make([]byte, 100), 3*MaxScanTokenSize)
	tokens := 0
	for s.Scan() {
		tokens++
		if token := s.Text(); token != text {
			t.Errorf("scan got incorrect token of length %d", len(token))
		}
	}
	if err := s.Err(); err != nil {
		t.Fatal("after scan:", err)
	}
	// Without this check the test would pass vacuously if Scan stopped
	// before delivering the line and reported no error.
	if tokens != 1 {
		t.Errorf("scan produced %d tokens; want 1", tokens)
	}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment