Commit 384e4d29 authored by Michael Piatek, committed by Brad Fitzpatrick

html: limit buffering during tokenization.

This is optional. By default, buffering is unlimited.

Fixes golang/go#7053

R=bradfitz
CC=golang-codereviews
https://golang.org/cl/43190044
parent 480e7b06
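
For context, here is how a caller might opt in to the new limit (an illustrative sketch, not part of this commit; it assumes the package's current golang.org/x/net/html import path):

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader("<div>hello</div>"))
	// Cap buffering at 4 KiB; the default of 0 means unlimited.
	z.SetMaxBuf(4096)
	for {
		if z.Next() == html.ErrorToken {
			// z.Err() is io.EOF on normal completion, or ErrBufferExceeded
			// when a single token needs more than the configured buffer.
			fmt.Println("stopped:", z.Err())
			return
		}
	}
}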
--- a/html/token.go
+++ b/html/token.go
@@ -6,6 +6,7 @@ package html
 
 import (
 	"bytes"
+	"errors"
 	"io"
 	"strconv"
 	"strings"
@@ -33,6 +34,9 @@ const (
 	DoctypeToken
 )
 
+// ErrBufferExceeded means that the buffering limit was exceeded.
+var ErrBufferExceeded = errors.New("max buffer exceeded")
+
 // String returns a string representation of the TokenType.
 func (t TokenType) String() string {
 	switch t {
@@ -142,6 +146,8 @@ type Tokenizer struct {
 	// buf[raw.end:] is buffered input that will yield future tokens.
 	raw span
 	buf []byte
+	// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
+	maxBuf int
 	// buf[data.start:data.end] holds the raw bytes of the current token's data:
 	// a text token's text, a tag token's tag name, etc.
 	data span
@@ -273,6 +279,10 @@ func (z *Tokenizer) readByte() byte {
 	}
 	x := z.buf[z.raw.end]
 	z.raw.end++
+	if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
+		z.err = ErrBufferExceeded
+		return 0
+	}
 	return x
 }
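
Note that the comparison is >= over the current token's raw span (raw.start to raw.end), so the limit counts the bytes of the token being assembled: with SetMaxBuf(5), exactly five bytes ("<tttt") are buffered before ErrBufferExceeded is reported, a boundary that TestMaxBuffer below pins down.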
@@ -1167,6 +1177,12 @@ func (z *Tokenizer) Token() Token {
 	return t
 }
 
+// SetMaxBuf sets a limit on the amount of data buffered during tokenization.
+// A value of 0 means unlimited.
+func (z *Tokenizer) SetMaxBuf(n int) {
+	z.maxBuf = n
+}
+
 // NewTokenizer returns a new HTML Tokenizer for the given Reader.
 // The input is assumed to be UTF-8 encoded.
 func NewTokenizer(r io.Reader) *Tokenizer {
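
Because readByte consults z.maxBuf on every byte it consumes, the limit takes effect immediately; a caller would typically set it once, right after NewTokenizer and before the first call to Next.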
--- a/html/token_test.go
+++ b/html/token_test.go
@@ -469,6 +469,63 @@ loop:
 	}
 }
 
+func TestMaxBuffer(t *testing.T) {
+	// Exceeding the maximum buffer size generates ErrBufferExceeded.
+	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
+	z.SetMaxBuf(5)
+	tt := z.Next()
+	if got, want := tt, ErrorToken; got != want {
+		t.Fatalf("token type: got: %v want: %v", got, want)
+	}
+	if got, want := z.Err(), ErrBufferExceeded; got != want {
+		t.Errorf("error type: got: %v want: %v", got, want)
+	}
+	if got, want := string(z.Raw()), "<tttt"; got != want {
+		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
+	}
+}
+
+func TestMaxBufferReconstruction(t *testing.T) {
+	// Exceeding the maximum buffer size at any point while tokenizing permits
+	// reconstructing the original input.
+tests:
+	for _, test := range tokenTests {
+	buffer:
+		for maxBuf := 1; ; maxBuf++ {
+			r := strings.NewReader(test.html)
+			z := NewTokenizer(r)
+			z.SetMaxBuf(maxBuf)
+			var tokenized bytes.Buffer
+			for {
+				tt := z.Next()
+				tokenized.Write(z.Raw())
+				if tt == ErrorToken {
+					if z.Err() == ErrBufferExceeded {
+						continue buffer
+					}
+					// Otherwise io.EOF is expected: this maxBuf was large enough to
+					// tokenize the entire input, so verify reconstruction below.
+					if err := z.Err(); err != io.EOF {
+						t.Errorf("%s: unexpected error: %v", test.desc, err)
+					}
+					break
+				}
+			}
+			// Everything tokenized, followed by input left in the reader, must reconstruct the original input.
+			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, r))
+			if err != nil {
+				t.Errorf("%s: ReadAll: %v", test.desc, err)
+				continue tests
+			}
+			if got, want := string(assembled), test.html; got != want {
+				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
+				continue tests
+			}
+			break
+		} // buffer sizes
+	} // tests
+}
+
 func TestPassthrough(t *testing.T) {
 	// Accumulating the raw output for each parse event should reconstruct the
 	// original input.