bufio: fix scanning with a final empty token.

The Scan function's interface to the split function was not sufficient to handle an empty final token in a pure function; state was required. This was ugly. We introduce a special error value that a split function can return that signals that this token is OK, but is the last one and scanning should stop immediately _after_ this token. The same effect could be achieved using the same trick (a special error value) and checking for that error after Scan finishes, but it's a little clumsy. Providing a published sentinel value in bufio is cleaner and means everyone can use the same trick. The result is an error-free scan. Rewrite the test (that was only barely working) to use the value and be more robust. Also write a new example showing how to do it. Fixes #11836 Change-Id: Iaae77d0f95b4a2efa0175ced94d93c66353079e8 Reviewed-on: https://go-review.googlesource.com/14924Reviewed-by: Ian Lance Taylor <iant@golang.org>

bufio: fix scanning with a final empty token.
The Scan function's interface to the split function was not sufficient to handle an empty final token in a pure function; state was required. This was ugly. We introduce a special error value that a split function can return that signals that this token is OK, but is the last one and scanning should stop immediately _after_ this token. The same effect could be achieved using the same trick (a special error value) and checking for that error after Scan finishes, but it's a little clumsy. Providing a published sentinel value in bufio is cleaner and means everyone can use the same trick. The result is an error-free scan. Rewrite the test (that was only barely working) to use the value and be more robust. Also write a new example showing how to do it. Fixes #11836 Change-Id: Iaae77d0f95b4a2efa0175ced94d93c66353079e8 Reviewed-on: https://go-review.googlesource.com/14924Reviewed-by: Ian Lance Taylor <iant@golang.org>
ec127547 · Rob Pike · dc6df1b0 · ec127547 · ec127547 · ec127547
Commit ec127547 authored Sep 24, 2015 by Rob Pike
Hide whitespace changes
Inline Side-by-side

Showing with 63 additions and 11 deletions

example_test.go src/bufio/example_test.go +29 -0

scan.go src/bufio/scan.go +19 -0

scan_test.go src/bufio/scan_test.go +15 -11

No files found.
--- a/src/bufio/example_test.go
+++ b/src/bufio/example_test.go
@@ -80,3 +80,32 @@ func ExampleScanner_custom() {
 	// 5678
 	// Invalid input: strconv.ParseInt: parsing "1234567901234567890": value out of range
 }
+
+// Use a Scanner with a custom split function to parse a comma-separated
+// list with an empty final value.
+func ExampleScanner_emptyFinalToken() {
+	// Comma-separated list; last entry is empty.
+	const input = "1,2,3,4,"
+	scanner := bufio.NewScanner(strings.NewReader(input))
+	// Define a split function that separates on commas.
+	onComma := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
+		for i := 0; i < len(data); i++ {
+			if data[i] == ',' {
+				return i + 1, data[:i], nil
+			}
+		}
+		// There is one final token to be delivered, which may be the empty string.
+		// Returning bufio.ErrFinalToken here tells Scan there are no more tokens after this
+		// but does not trigger an error to be returned from Scan itself.
+		return 0, data, bufio.ErrFinalToken
+	}
+	scanner.Split(onComma)
+	// Scan.
+	for scanner.Scan() {
+		fmt.Printf("%q ", scanner.Text())
+	}
+	if err := scanner.Err(); err != nil {
+		fmt.Fprintln(os.Stderr, "reading input:", err)
+	}
+	// Output: "1" "2" "3" "4" ""
+}
--- a/src/bufio/scan.go
+++ b/src/bufio/scan.go
@@ -38,6 +38,7 @@ type Scanner struct {
 	err          error     // Sticky error.
 	empties      int       // Count of successive empty tokens.
 	scanCalled   bool      // Scan has been called; buffer is in use.
+	done         bool      // Scan has finished.
 }

 // SplitFunc is the signature of the split function used to tokenize the
@@ -106,6 +107,16 @@ func (s *Scanner) Text() string {
 	return string(s.token)
 }

+// ErrFinalToken is a special sentinel error value. It is intended to be
+// returned by a Split function to indicate that the token being delivered
+// with the error is the last token and scanning should stop after this one.
+// After ErrFinalToken is received by Scan, scanning stops with no error.
+// The value is useful to stop processing early or when it is necessary to
+// deliver a final empty token. One could achieve the same behavior
+// with a custom error value but providing one here is tidier.
+// See the emptyFinalToken example for a use of this value.
+var ErrFinalToken = errors.New("final token")
+
 // Scan advances the Scanner to the next token, which will then be
 // available through the Bytes or Text method. It returns false when the
 // scan stops, either by reaching the end of the input or an error.
@@ -115,6 +126,9 @@ func (s *Scanner) Text() string {
 // Scan panics if the split function returns 100 empty tokens without
 // advancing the input. This is a common error mode for scanners.
 func (s *Scanner) Scan() bool {
+	if s.done {
+		return false
+	}
 	s.scanCalled = true
 	// Loop until we have a token.
 	for {
@@ -124,6 +138,11 @@ func (s *Scanner) Scan() bool {
 		if s.end > s.start || s.err != nil {
 			advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil)
 			if err != nil {
+				if err == ErrFinalToken {
+					s.token = token
+					s.done = true
+					return true
+				}
 				s.setErr(err)
 				return false
 			}

--- a/src/bufio/scan_test.go
+++ b/src/bufio/scan_test.go
@@ -429,33 +429,37 @@ func commaSplit(data []byte, atEOF bool) (advance int, token []byte, err error)
 			return i + 1, data[:i], nil
 		}
 	}
-	if !atEOF {
-		return 0, nil, nil
-	}
-	return 0, data, nil
+	return 0, data, ErrFinalToken
 }

-func TestEmptyTokens(t *testing.T) {
-	s := NewScanner(strings.NewReader("1,2,3,"))
-	values := []string{"1", "2", "3", ""}
+func testEmptyTokens(t *testing.T, text string, values []string) {
+	s := NewScanner(strings.NewReader(text))
 	s.Split(commaSplit)
 	var i int
-	for i = 0; i < len(values); i++ {
-		if !s.Scan() {
-			break
+	for i = 0; s.Scan(); i++ {
+		if i >= len(values) {
+			t.Fatalf("got %d fields, expected %d", i+1, len(values))
 		}
 		if s.Text() != values[i] {
 			t.Errorf("%d: expected %q got %q", i, values[i], s.Text())
 		}
 	}
 	if i != len(values) {
-		t.Errorf("got %d fields, expected %d", i, len(values))
+		t.Fatalf("got %d fields, expected %d", i, len(values))
 	}
 	if err := s.Err(); err != nil {
 		t.Fatal(err)
 	}
 }

+func TestEmptyTokens(t *testing.T) {
+	testEmptyTokens(t, "1,2,3,", []string{"1", "2", "3", ""})
+}
+
+func TestWithNoEmptyTokens(t *testing.T) {
+	testEmptyTokens(t, "1,2,3", []string{"1", "2", "3"})
+}
+
 func loopAtEOFSplit(data []byte, atEOF bool) (advance int, token []byte, err error) {
 	if len(data) > 0 {
 		return 1, data[:1], nil