html: first cut at a parser.

R=gri CC=golang-dev https://golang.org/cl/3355041

html: first cut at a parser.
R=gri CC=golang-dev https://golang.org/cl/3355041
08a47d6f · Nigel Tao · 2fd2991e · 08a47d6f · 08a47d6f · 08a47d6f
Commit 08a47d6f authored Dec 07, 2010 by Nigel Tao
6 changed files
--- a/src/pkg/html/Makefile
+++ b/src/pkg/html/Makefile
@@ -9,6 +9,7 @@ GOFILES=\
 	doc.go\
 	entity.go\
 	escape.go\
+	parse.go\
 	token.go\

 include ../../Make.pkg
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -15,7 +15,7 @@ which parses the next token and returns its type, or an error:

 	for {
 		tt := z.Next()
-		if tt == html.Error {
+		if tt == html.ErrorToken {
 			// ...
 			return ...
 		}
@@ -34,7 +34,7 @@ Entities (such as "&lt;") are unescaped, tag names and attribute keys are
 lower-cased, and attributes are collected into a []Attribute. For example:

 	for {
-		if z.Next() == html.Error {
+		if z.Next() == html.ErrorToken {
 			// Returning os.EOF indicates success.
 			return z.Error()
 		}
@@ -49,15 +49,15 @@ call to Next. For example, to extract an HTML page's anchor text:
 	for {
 		tt := z.Next()
 		switch tt {
-		case Error:
+		case ErrorToken:
 			return z.Error()
-		case Text:
+		case TextToken:
 			if depth > 0 {
 				// emitBytes should copy the []byte it receives,
 				// if it doesn't process it immediately.
 				emitBytes(z.Text())
 			}
-		case StartTag, EndTag:
+		case StartTagToken, EndTagToken:
 			tn, _ := z.TagName()
 			if len(tn) == 1 && tn[0] == 'a' {
 				if tt == StartTag {
@@ -69,6 +69,26 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}

+Parsing is done by calling Parse with an io.Reader, which returns the root of
+the parse tree (the document element) as a *Node. It is the caller's
+responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
+example, to process each anchor node in depth-first order:
+
+	doc, err := html.Parse(r)
+	if err != nil {
+		// ...
+	}
+	var f func(*html.Node)
+	f = func(n *html.Node) {
+		if n.Type == html.ElementNode && n.Data == "a" {
+			// Do something with n...
+		}
+		for _, c := range n.Child {
+			f(c)
+		}
+	}
+	f(doc)
+
 The relevant specifications include:
 http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html and
 http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
@@ -82,6 +102,5 @@ package html
 // node. Specification compliance is verified by checking expected and actual
 // outputs over a test suite rather than aiming for algorithmic fidelity.

-// TODO(nigeltao): Implement a parser, not just a tokenizer.
 // TODO(nigeltao): Does a DOM API belong in this package or a separate one?
 // TODO(nigeltao): How does parsing interact with a JavaScript engine?
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"io"
+	"os"
+)
+
+// A NodeType is the type of a Node.
+type NodeType int
+
+const (
+	ErrorNode NodeType = iota
+	TextNode
+	DocumentNode
+	ElementNode
+	CommentNode
+)
+
+// A Node consists of a NodeType and some Data (tag name for element nodes,
+// content for text) and are part of a tree of Nodes. Element nodes may also
+// contain a slice of Attributes. Data is unescaped, so that it looks like
+// "a<b" rather than "a&lt;b".
+type Node struct {
+	Parent *Node
+	Child  []*Node
+	Type   NodeType
+	Data   string
+	Attr   []Attribute
+}
+
+// An insertion mode (section 10.2.3.1) is the state transition function from
+// a particular state in the HTML5 parser's state machine. In addition to
+// returning the next state, it also returns whether the token was consumed.
+type insertionMode func(*parser) (insertionMode, bool)
+
+// A parser implements the HTML5 parsing algorithm:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
+type parser struct {
+	// tokenizer provides the tokens for the parser.
+	tokenizer *Tokenizer
+	// tok is the most recently read token.
+	tok Token
+	// Self-closing tags like <hr/> are re-interpreted as a two-token sequence:
+	// <hr> followed by </hr>. hasSelfClosingToken is true if we have just read
+	// the synthetic start tag and the next one due is the matching end tag.
+	hasSelfClosingToken bool
+	// doc is the document root element.
+	doc *Node
+	// The stack of open elements (section 10.2.3.2).
+	stack []*Node
+	// Element pointers (section 10.2.3.4).
+	head, form *Node
+	// Other parsing state flags (section 10.2.3.5).
+	scripting, framesetOK bool
+}
+
+// pop pops the top of the stack of open elements.
+// It will panic if the stack is empty.
+func (p *parser) pop() *Node {
+	n := len(p.stack)
+	ret := p.stack[n-1]
+	p.stack = p.stack[:n-1]
+	return ret
+}
+
+// push pushes onto the stack of open elements.
+func (p *parser) push(n *Node) {
+	p.stack = append(p.stack, n)
+}
+
+// top returns the top of the stack of open elements.
+// This is also known as the current node.
+func (p *parser) top() *Node {
+	if n := len(p.stack); n > 0 {
+		return p.stack[n-1]
+	}
+	return p.doc
+}
+
+// addChild adds a child node n to the top element, and pushes n
+// if it is an element node (text nodes do not have children).
+func (p *parser) addChild(n *Node) {
+	m := p.top()
+	m.Child = append(m.Child, n)
+	if n.Type == ElementNode {
+		p.push(n)
+	}
+}
+
+// addText adds text to the current node.
+func (p *parser) addText(s string) {
+	// TODO(nigeltao): merge s with previous text, if the preceding node is a text node.
+	// TODO(nigeltao): distinguish whitespace text from others.
+	p.addChild(&Node{
+		Type: TextNode,
+		Data: s,
+	})
+}
+
+// Section 10.2.3.3.
+func (p *parser) addFormattingElement(n *Node) {
+	p.addChild(n)
+	// TODO.
+}
+
+// Section 10.2.3.3.
+func (p *parser) reconstructActiveFormattingElements() {
+	// TODO.
+}
+
+// read reads the next token. This is usually from the tokenizer, but it may
+// be the synthesized end tag implied by a self-closing tag.
+func (p *parser) read() os.Error {
+	if p.hasSelfClosingToken {
+		p.hasSelfClosingToken = false
+		p.tok.Type = EndTagToken
+		p.tok.Attr = nil
+		return nil
+	}
+	if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
+		return p.tokenizer.Error()
+	}
+	p.tok = p.tokenizer.Token()
+	if p.tok.Type == SelfClosingTagToken {
+		p.hasSelfClosingToken = true
+		p.tok.Type = StartTagToken
+	}
+	return nil
+}
+
+// Section 10.2.4.
+func (p *parser) acknowledgeSelfClosingTag() {
+	p.hasSelfClosingToken = false
+}
+
+// Section 10.2.5.4.
+func initialInsertionMode(p *parser) (insertionMode, bool) {
+	// TODO(nigeltao): check p.tok for DOCTYPE.
+	return beforeHTMLInsertionMode, false
+}
+
+// Section 10.2.5.5.
+func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		add     bool
+		attr    []Attribute
+		implied bool
+	)
+	switch p.tok.Type {
+	case TextToken:
+		// TODO(nigeltao): distinguish whitespace text from others.
+		implied = true
+	case StartTagToken:
+		if p.tok.Data == "html" {
+			add = true
+			attr = p.tok.Attr
+		} else {
+			implied = true
+		}
+	case EndTagToken:
+		// TODO.
+	}
+	if add || implied {
+		p.addChild(&Node{
+			Type: ElementNode,
+			Data: "html",
+			Attr: attr,
+		})
+	}
+	return beforeHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.6.
+func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		add     bool
+		attr    []Attribute
+		implied bool
+	)
+	switch p.tok.Type {
+	case TextToken:
+		// TODO(nigeltao): distinguish whitespace text from others.
+		implied = true
+	case StartTagToken:
+		switch p.tok.Data {
+		case "head":
+			add = true
+			attr = p.tok.Attr
+		case "html":
+			// TODO.
+		default:
+			implied = true
+		}
+	case EndTagToken:
+		// TODO.
+	}
+	if add || implied {
+		p.addChild(&Node{
+			Type: ElementNode,
+			Data: "head",
+			Attr: attr,
+		})
+	}
+	return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.7.
+func inHeadInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		pop     bool
+		implied bool
+	)
+	switch p.tok.Type {
+	case TextToken:
+		implied = true
+	case StartTagToken:
+		switch p.tok.Data {
+		case "meta":
+			// TODO.
+		case "script":
+			// TODO.
+		default:
+			implied = true
+		}
+	case EndTagToken:
+		if p.tok.Data == "head" {
+			pop = true
+		}
+		// TODO.
+	}
+	if pop || implied {
+		n := p.pop()
+		if n.Data != "head" {
+			panic("html: bad parser state")
+		}
+		return afterHeadInsertionMode, !implied
+	}
+	return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.9.
+func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		add        bool
+		attr       []Attribute
+		framesetOK bool
+		implied    bool
+	)
+	switch p.tok.Type {
+	case TextToken:
+		implied = true
+		framesetOK = true
+	case StartTagToken:
+		switch p.tok.Data {
+		case "html":
+			// TODO.
+		case "body":
+			add = true
+			attr = p.tok.Attr
+			framesetOK = false
+		case "frameset":
+			// TODO.
+		case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
+			// TODO.
+		case "head":
+			// TODO.
+		default:
+			implied = true
+			framesetOK = true
+		}
+	case EndTagToken:
+		// TODO.
+	}
+	if add || implied {
+		p.addChild(&Node{
+			Type: ElementNode,
+			Data: "body",
+			Attr: attr,
+		})
+		p.framesetOK = framesetOK
+	}
+	return inBodyInsertionMode, !implied
+}
+
+// Section 10.2.5.10.
+func inBodyInsertionMode(p *parser) (insertionMode, bool) {
+	var endP bool
+	switch p.tok.Type {
+	case TextToken:
+		p.addText(p.tok.Data)
+		p.framesetOK = false
+	case StartTagToken:
+		switch p.tok.Data {
+		case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
+			// TODO(nigeltao): Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 10.2.3.2.
+			n := p.top()
+			if n.Type == ElementNode && n.Data == "p" {
+				endP = true
+			} else {
+				p.addChild(&Node{
+					Type: ElementNode,
+					Data: p.tok.Data,
+					Attr: p.tok.Attr,
+				})
+			}
+		case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
+			p.reconstructActiveFormattingElements()
+			p.addFormattingElement(&Node{
+				Type: ElementNode,
+				Data: p.tok.Data,
+				Attr: p.tok.Attr,
+			})
+		case "area", "br", "embed", "img", "input", "keygen", "wbr":
+			p.reconstructActiveFormattingElements()
+			p.addChild(&Node{
+				Type: ElementNode,
+				Data: p.tok.Data,
+				Attr: p.tok.Attr,
+			})
+			p.pop()
+			p.acknowledgeSelfClosingTag()
+			p.framesetOK = false
+		case "hr":
+			// TODO(nigeltao): auto-insert </p> if necessary.
+			p.addChild(&Node{
+				Type: ElementNode,
+				Data: p.tok.Data,
+				Attr: p.tok.Attr,
+			})
+			p.pop()
+			p.acknowledgeSelfClosingTag()
+			p.framesetOK = false
+		default:
+			// TODO.
+		}
+	case EndTagToken:
+		switch p.tok.Data {
+		case "body":
+			// TODO(nigeltao): autoclose the stack of open elements.
+			return afterBodyInsertionMode, true
+		case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
+			// TODO(nigeltao): implement the "adoption agency" algorithm:
+			// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
+			p.pop()
+		default:
+			// TODO.
+		}
+	}
+	if endP {
+		// TODO(nigeltao): do the proper algorithm.
+		n := p.pop()
+		if n.Type != ElementNode || n.Data != "p" {
+			panic("unreachable")
+		}
+	}
+	return inBodyInsertionMode, !endP
+}
+
+// Section 10.2.5.22.
+func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
+	switch p.tok.Type {
+	case TextToken:
+		// TODO.
+	case StartTagToken:
+		// TODO.
+	case EndTagToken:
+		switch p.tok.Data {
+		case "html":
+			// TODO(nigeltao): autoclose the stack of open elements.
+			return afterAfterBodyInsertionMode, true
+		default:
+			// TODO.
+		}
+	}
+	return afterBodyInsertionMode, true
+}
+
+// Section 10.2.5.25.
+func afterAfterBodyInsertionMode(p *parser) (insertionMode, bool) {
+	return inBodyInsertionMode, false
+}
+
+// Parse returns the parse tree for the HTML from the given Reader.
+// The input is assumed to be UTF-8 encoded.
+func Parse(r io.Reader) (*Node, os.Error) {
+	p := &parser{
+		tokenizer: NewTokenizer(r),
+		doc: &Node{
+			Type: DocumentNode,
+		},
+		scripting:  true,
+		framesetOK: true,
+	}
+	im, consumed := initialInsertionMode, true
+	for {
+		if consumed {
+			if err := p.read(); err != nil {
+				if err == os.EOF {
+					break
+				}
+				return nil, err
+			}
+		}
+		im, consumed = im(p)
+	}
+	// TODO(nigeltao): clean up, depending on the value of im.
+	// The specification's algorithm does clean up on reading an EOF 'token',
+	// but in go we represent EOF by an os.Error instead.
+	return p.doc, nil
+}
--- a/src/pkg/html/parse_test.go
+++ b/src/pkg/html/parse_test.go
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+type devNull struct{}
+
+func (devNull) Write(p []byte) (int, os.Error) {
+	return len(p), nil
+}
+
+func pipeErr(err os.Error) io.Reader {
+	pr, pw := io.Pipe()
+	pw.CloseWithError(err)
+	return pr
+}
+
+func readDat(filename string, c chan io.Reader) {
+	f, err := os.Open("testdata/webkit/"+filename, os.O_RDONLY, 0600)
+	if err != nil {
+		c <- pipeErr(err)
+		return
+	}
+	defer f.Close()
+
+	// Loop through the lines of the file. Each line beginning with "#" denotes
+	// a new section, which is returned as a separate io.Reader.
+	r := bufio.NewReader(f)
+	var pw *io.PipeWriter
+	for {
+		line, err := r.ReadSlice('\n')
+		if err != nil {
+			if pw != nil {
+				pw.CloseWithError(err)
+				pw = nil
+			} else {
+				c <- pipeErr(err)
+			}
+			return
+		}
+		if len(line) == 0 {
+			continue
+		}
+		if line[0] == '#' {
+			if pw != nil {
+				pw.Close()
+			}
+			var pr *io.PipeReader
+			pr, pw = io.Pipe()
+			c <- pr
+			continue
+		}
+		if line[0] != '|' {
+			// Strip the trailing '\n'.
+			line = line[:len(line)-1]
+		}
+		if pw != nil {
+			if _, err := pw.Write(line); err != nil {
+				pw.CloseWithError(err)
+				pw = nil
+			}
+		}
+	}
+}
+
+func dumpLevel(w io.Writer, n *Node, level int) os.Error {
+	io.WriteString(w, "| ")
+	for i := 0; i < level; i++ {
+		io.WriteString(w, "  ")
+	}
+	switch n.Type {
+	case ErrorNode:
+		return os.NewError("unexpected ErrorNode")
+	case DocumentNode:
+		return os.NewError("unexpected DocumentNode")
+	case ElementNode:
+		fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
+	case TextNode:
+		fmt.Fprintf(w, "%q", EscapeString(n.Data))
+	case CommentNode:
+		return os.NewError("COMMENT")
+	default:
+		return os.NewError("unknown node type")
+	}
+	io.WriteString(w, "\n")
+	for _, c := range n.Child {
+		if err := dumpLevel(w, c, level+1); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func dump(n *Node) (string, os.Error) {
+	if n == nil || len(n.Child) == 0 {
+		return "", nil
+	}
+	if len(n.Child) > 1 {
+		return "too many children", nil
+	}
+	b := bytes.NewBuffer(nil)
+	if err := dumpLevel(b, n.Child[0], 0); err != nil {
+		return "", err
+	}
+	return b.String(), nil
+}
+
+func TestParser(t *testing.T) {
+	// TODO(nigeltao): Process all the .dat files, not just the first one.
+	filenames := []string{
+		"tests1.dat",
+	}
+	for _, filename := range filenames {
+		rc := make(chan io.Reader)
+		go readDat(filename, rc)
+		// TODO(nigeltao): Process all test cases, not just the first three.
+		for i := 0; i < 3; i++ {
+			// Parse the #data section.
+			doc, err := Parse(<-rc)
+			if err != nil {
+				t.Fatal(err)
+			}
+			actual, err := dump(doc)
+			if err != nil {
+				t.Fatal(err)
+			}
+			// Skip the #error section.
+			if _, err := io.Copy(devNull{}, <-rc); err != nil {
+				t.Fatal(err)
+			}
+			// Compare the parsed tree to the #document section.
+			b, err := ioutil.ReadAll(<-rc)
+			if err != nil {
+				t.Fatal(err)
+			}
+			expected := string(b)
+			if actual != expected {
+				t.Errorf("%s test #%d, actual vs expected:\n----\n%s----\n%s----", filename, i, actual, expected)
+			}
+		}
+	}
+}
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -15,30 +15,30 @@ import (
 type TokenType int

 const (
-	// Error means that an error occurred during tokenization.
-	Error TokenType = iota
-	// Text means a text node.
-	Text
-	// A StartTag looks like <a>.
-	StartTag
-	// An EndTag looks like </a>.
-	EndTag
-	// A SelfClosingTag tag looks like <br/>.
-	SelfClosingTag
+	// ErrorToken means that an error occurred during tokenization.
+	ErrorToken TokenType = iota
+	// TextToken means a text node.
+	TextToken
+	// A StartTagToken looks like <a>.
+	StartTagToken
+	// An EndTagToken looks like </a>.
+	EndTagToken
+	// A SelfClosingTagToken tag looks like <br/>.
+	SelfClosingTagToken
 )

 // String returns a string representation of the TokenType.
 func (t TokenType) String() string {
 	switch t {
-	case Error:
+	case ErrorToken:
 		return "Error"
-	case Text:
+	case TextToken:
 		return "Text"
-	case StartTag:
+	case StartTagToken:
 		return "StartTag"
-	case EndTag:
+	case EndTagToken:
 		return "EndTag"
-	case SelfClosingTag:
+	case SelfClosingTagToken:
 		return "SelfClosingTag"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
@@ -81,15 +81,15 @@ func (t Token) tagString() string {
 // String returns a string representation of the Token.
 func (t Token) String() string {
 	switch t.Type {
-	case Error:
+	case ErrorToken:
 		return ""
-	case Text:
+	case TextToken:
 		return EscapeString(t.Data)
-	case StartTag:
+	case StartTagToken:
 		return "<" + t.tagString() + ">"
-	case EndTag:
+	case EndTagToken:
 		return "</" + t.tagString() + ">"
-	case SelfClosingTag:
+	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
@@ -109,10 +109,10 @@ type Tokenizer struct {
 	buf    []byte
 }

-// Error returns the error associated with the most recent Error token. This is
-// typically os.EOF, meaning the end of tokenization.
+// Error returns the error associated with the most recent ErrorToken token.
+// This is typically os.EOF, meaning the end of tokenization.
 func (z *Tokenizer) Error() os.Error {
-	if z.tt != Error {
+	if z.tt != ErrorToken {
 		return nil
 	}
 	return z.err
@@ -180,40 +180,40 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	c, err := z.readByte()
 	if err != nil {
-		return Error, err
+		return ErrorToken, err
 	}
 	switch {
 	case c == '/':
-		tt = EndTag
+		tt = EndTagToken
 	// Lower-cased characters are more common in tag names, so we check for them first.
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-		tt = StartTag
+		tt = StartTagToken
 	case c == '!':
-		return Error, os.NewError("html: TODO(nigeltao): implement comments")
+		return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
 	case c == '?':
-		return Error, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
+		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
 	default:
-		return Error, os.NewError("html: TODO(nigeltao): handle malformed tags")
+		return ErrorToken, os.NewError("html: TODO(nigeltao): handle malformed tags")
 	}
 	for {
 		c, err := z.readByte()
 		if err != nil {
-			return Text, err
+			return TextToken, err
 		}
 		switch c {
 		case '"':
 			err = z.readTo('"')
 			if err != nil {
-				return Text, err
+				return TextToken, err
 			}
 		case '\'':
 			err = z.readTo('\'')
 			if err != nil {
-				return Text, err
+				return TextToken, err
 			}
 		case '>':
-			if z.buf[z.p1-2] == '/' && tt == StartTag {
-				return SelfClosingTag, nil
+			if z.buf[z.p1-2] == '/' && tt == StartTagToken {
+				return SelfClosingTagToken, nil
 			}
 			return tt, nil
 		}
@@ -224,13 +224,13 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 // Next scans the next token and returns its type.
 func (z *Tokenizer) Next() TokenType {
 	if z.err != nil {
-		z.tt = Error
+		z.tt = ErrorToken
 		return z.tt
 	}
 	z.p0 = z.p1
 	c, err := z.readByte()
 	if err != nil {
-		z.tt, z.err = Error, err
+		z.tt, z.err = ErrorToken, err
 		return z.tt
 	}
 	if c == '<' {
@@ -240,15 +240,15 @@ func (z *Tokenizer) Next() TokenType {
 	for {
 		c, err := z.readByte()
 		if err != nil {
-			z.tt, z.err = Error, err
+			z.tt, z.err = ErrorToken, err
 			if err == os.EOF {
-				z.tt = Text
+				z.tt = TextToken
 			}
 			return z.tt
 		}
 		if c == '<' {
 			z.p1--
-			z.tt = Text
+			z.tt = TextToken
 			return z.tt
 		}
 	}
@@ -371,9 +371,9 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case Text:
+	case TextToken:
 		t.Data = string(z.Text())
-	case StartTag, EndTag, SelfClosingTag:
+	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute
 		name, remaining := z.TagName()
 		for remaining {

--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -88,7 +88,7 @@ loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
 		for i, s := range tt.tokens {
-			if z.Next() == Error {
+			if z.Next() == ErrorToken {
 				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
 				continue loop
 			}
@@ -134,19 +134,19 @@ loop:
 	for {
 		tt := z.Next()
 		switch tt {
-		case Error:
+		case ErrorToken:
 			if z.Error() != os.EOF {
 				t.Error(z.Error())
 			}
 			break loop
-		case Text:
+		case TextToken:
 			if depth > 0 {
 				result.Write(z.Text())
 			}
-		case StartTag, EndTag:
+		case StartTagToken, EndTagToken:
 			tn, _ := z.TagName()
 			if len(tn) == 1 && tn[0] == 'a' {
-				if tt == StartTag {
+				if tt == StartTagToken {
 					depth++
 				} else {
 					depth--