bufio: allow Scanner to accept a user-provided buffer

Add Scanner.Buffer, which lets the user give a buffer to the scanner and set the maximum token size. We call it Buffer not SetBuffer for consistency with Split, which perhaps should have been called SetSplit; too late regardless. Both Buffer and Split panic if they are called after Scan. The panic in Split is new, but the comment on the method already said it needed to be called first, so we might as well add the verification while we're doing it for Buffer. This method allows precise user control of storage. Fixes #11702. Change-Id: I80e3d0e3830562fdabd4f7b08f322e1378248c39 Reviewed-on: https://go-review.googlesource.com/14599Reviewed-by: Andrew Gerrand <adg@golang.org> Reviewed-by: roger peppe <rogpeppe@gmail.com>

bufio: allow Scanner to accept a user-provided buffer
Add Scanner.Buffer, which lets the user give a buffer to the scanner and set the maximum token size. We call it Buffer not SetBuffer for consistency with Split, which perhaps should have been called SetSplit; too late regardless. Both Buffer and Split panic if they are called after Scan. The panic in Split is new, but the comment on the method already said it needed to be called first, so we might as well add the verification while we're doing it for Buffer. This method allows precise user control of storage. Fixes #11702. Change-Id: I80e3d0e3830562fdabd4f7b08f322e1378248c39 Reviewed-on: https://go-review.googlesource.com/14599Reviewed-by: Andrew Gerrand <adg@golang.org> Reviewed-by: roger peppe <rogpeppe@gmail.com>
13be616e · Rob Pike · 1536c2e0 · 13be616e · 13be616e
Commit 13be616e authored Sep 15, 2015 by Rob Pike
Hide whitespace changes
Inline Side-by-side

Showing with 56 additions and 5 deletions

scan.go src/bufio/scan.go +40 -5

scan_test.go src/bufio/scan_test.go +16 -0

No files found.
--- a/src/bufio/scan.go
+++ b/src/bufio/scan.go
@@ -37,6 +37,7 @@ type Scanner struct {
 	end          int       // End of data in buf.
 	err          error     // Sticky error.
 	empties      int       // Count of successive empty tokens.
+	scanCalled   bool      // Scan has been called; buffer is in use.
 }

 // SplitFunc is the signature of the split function used to tokenize the
@@ -65,10 +66,13 @@ var (
 )

 const (
-	// MaxScanTokenSize is the maximum size used to buffer a token.
+	// MaxScanTokenSize is the maximum size used to buffer a token
+	// unless the user provides an explicit buffer with Scan.Buffer.
 	// The actual maximum token size may be smaller as the buffer
 	// may need to include, for instance, a newline.
 	MaxScanTokenSize = 64 * 1024
+
+	startBufSize = 4096 // Size of initial allocation for buffer.
 )

 // NewScanner returns a new Scanner to read from r.
@@ -78,7 +82,6 @@ func NewScanner(r io.Reader) *Scanner {
 		r:            r,
 		split:        ScanLines,
 		maxTokenSize: MaxScanTokenSize,
-		buf:          make([]byte, 4096), // Plausible starting size; needn't be large.
 	}
 }

@@ -112,6 +115,7 @@ func (s *Scanner) Text() string {
 // Scan panics if the split function returns 100 empty tokens without
 // advancing the input. This is a common error mode for scanners.
 func (s *Scanner) Scan() bool {
+	s.scanCalled = true
 	// Loop until we have a token.
 	for {
 		// See if we can get a token with what we already have.
@@ -162,7 +166,10 @@ func (s *Scanner) Scan() bool {
 				s.setErr(ErrTooLong)
 				return false
 			}
-			newSize := len(s.buf) * 2
+			newSize := len(s.buf) * 2 // See protection against overflow in Buffer.
+			if newSize == 0 {
+				newSize = startBufSize
+			}
 			if newSize > s.maxTokenSize {
 				newSize = s.maxTokenSize
 			}
@@ -217,9 +224,37 @@ func (s *Scanner) setErr(err error) {
 	}
 }

-// Split sets the split function for the Scanner. If called, it must be
-// called before Scan. The default split function is ScanLines.
+// Buffer sets the initial buffer to use when scanning and the maximum
+// size of buffer that may be allocated during scanning. The maximum
+// token size is the larger of max and cap(buf). If max <= cap(buf),
+// Scan will use this buffer only and do no allocation.
+//
+// By default, Scan uses an internal buffer and sets the
+// maximum token size to MaxScanTokenSize.
+//
+// Buffer panics if it is called after scanning has started.
+func (s *Scanner) Buffer(buf []byte, max int) {
+	if s.scanCalled {
+		panic("Buffer called after Scan")
+	}
+	s.buf = buf[0:cap(buf)]
+	// Guarantee no overflow: we multiply len(s.buf) by two in Scan,
+	// but only if it exceeds maxTokenSize.
+	const maxInt = int(^uint(0) >> 1)
+	if max > maxInt {
+		max = maxInt
+	}
+	s.maxTokenSize = max
+}
+
+// Split sets the split function for the Scanner.
+// The default split function is ScanLines.
+//
+// Split panics if it is called after scanning has started.
 func (s *Scanner) Split(split SplitFunc) {
+	if s.scanCalled {
+		panic("Split called after Scan")
+	}
 	s.split = split
 }


--- a/src/bufio/scan_test.go
+++ b/src/bufio/scan_test.go
@@ -522,3 +522,19 @@ func TestEmptyLinesOK(t *testing.T) {
 		t.Fatalf("stopped with %d left to process", c)
 	}
 }
+
+// Make sure we can read a huge token if a big enough buffer is provided.
+func TestHugeBuffer(t *testing.T) {
+	text := strings.Repeat("x", 2*MaxScanTokenSize)
+	s := NewScanner(strings.NewReader(text + "\n"))
+	s.Buffer(make([]byte, 100), 3*MaxScanTokenSize)
+	for s.Scan() {
+		token := s.Text()
+		if token != text {
+			t.Errorf("scan got incorrect token of length %d", len(token))
+		}
+	}
+	if s.Err() != nil {
+		t.Fatal("after scan:", s.Err())
+	}
+}