Commit cd21eff7 authored by Nigel Tao

exp/html: make the tokenizer return atoms for tag tokens.

This is part 1 of a 2-part changelist. Part 2 contains the mechanical
change to parse.go to compare atoms (ints) instead of strings.

The overall effect of the two changes is:
benchmark                      old ns/op    new ns/op    delta
BenchmarkParser                  4462274      4058254   -9.05%
BenchmarkRawLevelTokenizer        913202       912917   -0.03%
BenchmarkLowLevelTokenizer       1268626      1267836   -0.06%
BenchmarkHighLevelTokenizer      1947305      1968944   +1.11%

R=rsc
CC=andybalholm, golang-dev, r
https://golang.org/cl/6305053
parent 64236820
...@@ -4,8 +4,12 @@ ...@@ -4,8 +4,12 @@
package html package html
import (
"exp/html/atom"
)
// A NodeType is the type of a Node. // A NodeType is the type of a Node.
type NodeType int type NodeType uint32
const ( const (
ErrorNode NodeType = iota ErrorNode NodeType = iota
...@@ -25,7 +29,8 @@ var scopeMarker = Node{Type: scopeMarkerNode} ...@@ -25,7 +29,8 @@ var scopeMarker = Node{Type: scopeMarkerNode}
// A Node consists of a NodeType and some Data (tag name for element nodes, // A Node consists of a NodeType and some Data (tag name for element nodes,
// content for text) and are part of a tree of Nodes. Element nodes may also // content for text) and are part of a tree of Nodes. Element nodes may also
// have a Namespace and contain a slice of Attributes. Data is unescaped, so // have a Namespace and contain a slice of Attributes. Data is unescaped, so
// that it looks like "a<b" rather than "a&lt;b". // that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
// is the atom for Data, or zero if Data is not a known tag name.
// //
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace. // An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and // Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
...@@ -34,6 +39,7 @@ type Node struct { ...@@ -34,6 +39,7 @@ type Node struct {
Parent *Node Parent *Node
Child []*Node Child []*Node
Type NodeType Type NodeType
DataAtom atom.Atom
Data string Data string
Namespace string Namespace string
Attr []Attribute Attr []Attribute
...@@ -83,9 +89,10 @@ func reparentChildren(dst, src *Node) { ...@@ -83,9 +89,10 @@ func reparentChildren(dst, src *Node) {
// The clone has no parent and no children. // The clone has no parent and no children.
func (n *Node) clone() *Node { func (n *Node) clone() *Node {
m := &Node{ m := &Node{
Type: n.Type, Type: n.Type,
Data: n.Data, DataAtom: n.DataAtom,
Attr: make([]Attribute, len(n.Attr)), Data: n.Data,
Attr: make([]Attribute, len(n.Attr)),
} }
copy(m.Attr, n.Attr) copy(m.Attr, n.Attr)
return m return m
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
package html package html
import ( import (
a "exp/html/atom"
"io" "io"
"strings" "strings"
) )
...@@ -280,7 +281,7 @@ func (p *parser) addText(text string) { ...@@ -280,7 +281,7 @@ func (p *parser) addText(text string) {
func (p *parser) addElement(tag string, attr []Attribute) { func (p *parser) addElement(tag string, attr []Attribute) {
p.addChild(&Node{ p.addChild(&Node{
Type: ElementNode, Type: ElementNode,
Data: tag, Data: tag, // TODO: also set DataAtom.
Attr: attr, Attr: attr,
}) })
} }
...@@ -310,9 +311,9 @@ findIdenticalElements: ...@@ -310,9 +311,9 @@ findIdenticalElements:
continue continue
} }
compareAttributes: compareAttributes:
for _, a := range n.Attr { for _, t0 := range n.Attr {
for _, b := range attr { for _, t1 := range attr {
if a.Key == b.Key && a.Namespace == b.Namespace && a.Val == b.Val { if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
// Found a match for this attribute, continue with the next attribute. // Found a match for this attribute, continue with the next attribute.
continue compareAttributes continue compareAttributes
} }
...@@ -676,13 +677,13 @@ func copyAttributes(dst *Node, src Token) { ...@@ -676,13 +677,13 @@ func copyAttributes(dst *Node, src Token) {
return return
} }
attr := map[string]string{} attr := map[string]string{}
for _, a := range dst.Attr { for _, t := range dst.Attr {
attr[a.Key] = a.Val attr[t.Key] = t.Val
} }
for _, a := range src.Attr { for _, t := range src.Attr {
if _, ok := attr[a.Key]; !ok { if _, ok := attr[t.Key]; !ok {
dst.Attr = append(dst.Attr, a) dst.Attr = append(dst.Attr, t)
attr[a.Key] = a.Val attr[t.Key] = t.Val
} }
} }
} }
...@@ -843,9 +844,9 @@ func inBodyIM(p *parser) bool { ...@@ -843,9 +844,9 @@ func inBodyIM(p *parser) bool {
p.oe.pop() p.oe.pop()
p.acknowledgeSelfClosingTag() p.acknowledgeSelfClosingTag()
if p.tok.Data == "input" { if p.tok.Data == "input" {
for _, a := range p.tok.Attr { for _, t := range p.tok.Attr {
if a.Key == "type" { if t.Key == "type" {
if strings.ToLower(a.Val) == "hidden" { if strings.ToLower(t.Val) == "hidden" {
// Skip setting framesetOK = false // Skip setting framesetOK = false
return true return true
} }
...@@ -874,16 +875,16 @@ func inBodyIM(p *parser) bool { ...@@ -874,16 +875,16 @@ func inBodyIM(p *parser) bool {
action := "" action := ""
prompt := "This is a searchable index. Enter search keywords: " prompt := "This is a searchable index. Enter search keywords: "
attr := []Attribute{{Key: "name", Val: "isindex"}} attr := []Attribute{{Key: "name", Val: "isindex"}}
for _, a := range p.tok.Attr { for _, t := range p.tok.Attr {
switch a.Key { switch t.Key {
case "action": case "action":
action = a.Val action = t.Val
case "name": case "name":
// Ignore the attribute. // Ignore the attribute.
case "prompt": case "prompt":
prompt = a.Val prompt = t.Val
default: default:
attr = append(attr, a) attr = append(attr, t)
} }
} }
p.acknowledgeSelfClosingTag() p.acknowledgeSelfClosingTag()
...@@ -1231,8 +1232,8 @@ func inTableIM(p *parser) bool { ...@@ -1231,8 +1232,8 @@ func inTableIM(p *parser) bool {
case "style", "script": case "style", "script":
return inHeadIM(p) return inHeadIM(p)
case "input": case "input":
for _, a := range p.tok.Attr { for _, t := range p.tok.Attr {
if a.Key == "type" && strings.ToLower(a.Val) == "hidden" { if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
p.addElement(p.tok.Data, p.tok.Attr) p.addElement(p.tok.Data, p.tok.Attr)
p.oe.pop() p.oe.pop()
return true return true
...@@ -1863,6 +1864,7 @@ func parseForeignContent(p *parser) bool { ...@@ -1863,6 +1864,7 @@ func parseForeignContent(p *parser) bool {
// Adjust SVG tag names. The tokenizer lower-cases tag names, but // Adjust SVG tag names. The tokenizer lower-cases tag names, but
// SVG wants e.g. "foreignObject" with a capital second "O". // SVG wants e.g. "foreignObject" with a capital second "O".
if x := svgTagNameAdjustments[p.tok.Data]; x != "" { if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
p.tok.DataAtom = a.Lookup([]byte(x))
p.tok.Data = x p.tok.Data = x
} }
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
...@@ -1929,7 +1931,7 @@ func (p *parser) parseImpliedToken(t TokenType, data string, attr []Attribute) { ...@@ -1929,7 +1931,7 @@ func (p *parser) parseImpliedToken(t TokenType, data string, attr []Attribute) {
realToken, selfClosing := p.tok, p.hasSelfClosingToken realToken, selfClosing := p.tok, p.hasSelfClosingToken
p.tok = Token{ p.tok = Token{
Type: t, Type: t,
Data: data, Data: data, // TODO: also set DataAtom.
Attr: attr, Attr: attr,
} }
p.hasSelfClosingToken = false p.hasSelfClosingToken = false
...@@ -2014,7 +2016,7 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { ...@@ -2014,7 +2016,7 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
root := &Node{ root := &Node{
Type: ElementNode, Type: ElementNode,
Data: "html", Data: "html", // TODO: also set DataAtom.
} }
p.doc.Add(root) p.doc.Add(root)
p.oe = nodeStack{root} p.oe = nodeStack{root}
......
...@@ -8,6 +8,7 @@ import ( ...@@ -8,6 +8,7 @@ import (
"bufio" "bufio"
"bytes" "bytes"
"errors" "errors"
"exp/html/atom"
"flag" "flag"
"fmt" "fmt"
"io" "io"
...@@ -320,8 +321,9 @@ func testParseCase(text, want, context string) (result parseTestResult, err erro ...@@ -320,8 +321,9 @@ func testParseCase(text, want, context string) (result parseTestResult, err erro
} }
} else { } else {
contextNode := &Node{ contextNode := &Node{
Type: ElementNode, Type: ElementNode,
Data: context, DataAtom: atom.Lookup([]byte(context)),
Data: context,
} }
nodes, err := ParseFragment(strings.NewReader(text), contextNode) nodes, err := ParseFragment(strings.NewReader(text), contextNode)
if err != nil { if err != nil {
......
...@@ -13,7 +13,7 @@ import ( ...@@ -13,7 +13,7 @@ import (
) )
// A TokenType is the type of a Token. // A TokenType is the type of a Token.
type TokenType int type TokenType uint32
const ( const (
// ErrorToken means that an error occurred during tokenization. // ErrorToken means that an error occurred during tokenization.
...@@ -66,11 +66,13 @@ type Attribute struct { ...@@ -66,11 +66,13 @@ type Attribute struct {
// A Token consists of a TokenType and some Data (tag name for start and end // A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain // tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b" // a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b"). // rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
// zero if Data is not a known tag name.
type Token struct { type Token struct {
Type TokenType Type TokenType
Data string DataAtom atom.Atom
Attr []Attribute Data string
Attr []Attribute
} }
// tagString returns a string representation of a tag Token's Data and Attr. // tagString returns a string representation of a tag Token's Data and Attr.
...@@ -794,11 +796,19 @@ func (z *Tokenizer) Token() Token { ...@@ -794,11 +796,19 @@ func (z *Tokenizer) Token() Token {
key, val, moreAttr = z.TagAttr() key, val, moreAttr = z.TagAttr()
attr = append(attr, Attribute{"", atom.String(key), string(val)}) attr = append(attr, Attribute{"", atom.String(key), string(val)})
} }
t.Data = atom.String(name) if a := atom.Lookup(name); a != 0 {
t.DataAtom, t.Data = a, a.String()
} else {
t.DataAtom, t.Data = 0, string(name)
}
t.Attr = attr t.Attr = attr
case EndTagToken: case EndTagToken:
name, _ := z.TagName() name, _ := z.TagName()
t.Data = atom.String(name) if a := atom.Lookup(name); a != 0 {
t.DataAtom, t.Data = a, a.String()
} else {
t.DataAtom, t.Data = 0, string(name)
}
} }
return t return t
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment