Commit 4e0749a4 authored by Andrew Balholm's avatar Andrew Balholm Committed by Nigel Tao

exp/html: Convert \r and \r\n to \n when tokenizing

Also escape "\r" as "
" when rendering HTML.

Pass 2 additional tests.

R=nigeltao
CC=golang-dev
https://golang.org/cl/6260046
parent afe0e97a
...@@ -192,7 +192,7 @@ func lower(b []byte) []byte { ...@@ -192,7 +192,7 @@ func lower(b []byte) []byte {
return b return b
} }
const escapedChars = `&'<>"` const escapedChars = "&'<>\"\r"
func escape(w writer, s string) error { func escape(w writer, s string) error {
i := strings.IndexAny(s, escapedChars) i := strings.IndexAny(s, escapedChars)
...@@ -214,6 +214,8 @@ func escape(w writer, s string) error { ...@@ -214,6 +214,8 @@ func escape(w writer, s string) error {
case '"': case '"':
// "&#34;" is shorter than "&quot;". // "&#34;" is shorter than "&quot;".
esc = "&#34;" esc = "&#34;"
case '\r':
esc = "&#13;"
default: default:
panic("unrecognized escape character") panic("unrecognized escape character")
} }
......
...@@ -21,8 +21,8 @@ PASS "<svg>\x00 </svg><frameset>" ...@@ -21,8 +21,8 @@ PASS "<svg>\x00 </svg><frameset>"
FAIL "<svg>\x00a</svg><frameset>" FAIL "<svg>\x00a</svg><frameset>"
PASS "<svg><path></path></svg><frameset>" PASS "<svg><path></path></svg><frameset>"
PASS "<svg><p><frameset>" PASS "<svg><p><frameset>"
FAIL "<!DOCTYPE html><pre>\r\n\r\nA</pre>" PASS "<!DOCTYPE html><pre>\r\n\r\nA</pre>"
FAIL "<!DOCTYPE html><pre>\r\rA</pre>" PASS "<!DOCTYPE html><pre>\r\rA</pre>"
PASS "<!DOCTYPE html><pre>\rA</pre>" PASS "<!DOCTYPE html><pre>\rA</pre>"
PASS "<!DOCTYPE html><table><tr><td><math><mtext>\x00a" PASS "<!DOCTYPE html><table><tr><td><math><mtext>\x00a"
PASS "<!DOCTYPE html><table><tr><td><svg><foreignObject>\x00a" PASS "<!DOCTYPE html><table><tr><td><svg><foreignObject>\x00a"
......
...@@ -696,6 +696,38 @@ func (z *Tokenizer) Raw() []byte { ...@@ -696,6 +696,38 @@ func (z *Tokenizer) Raw() []byte {
return z.buf[z.raw.start:z.raw.end] return z.buf[z.raw.start:z.raw.end]
} }
// convertNewlines converts "\r" and "\r\n" in s to "\n".
// The conversion happens in place, but the resulting slice may be shorter.
func convertNewlines(s []byte) []byte {
for i, c := range s {
if c != '\r' {
continue
}
src := i + 1
if src >= len(s) || s[src] != '\n' {
s[i] = '\n'
continue
}
dst := i
for src < len(s) {
if s[src] == '\r' {
if src+1 < len(s) && s[src+1] == '\n' {
src++
}
s[dst] = '\n'
} else {
s[dst] = s[src]
}
src++
dst++
}
return s[:dst]
}
return s
}
// Text returns the unescaped text of a text, comment or doctype token. The // Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next. // contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte { func (z *Tokenizer) Text() []byte {
...@@ -704,6 +736,7 @@ func (z *Tokenizer) Text() []byte { ...@@ -704,6 +736,7 @@ func (z *Tokenizer) Text() []byte {
s := z.buf[z.data.start:z.data.end] s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end z.data.start = z.raw.end
z.data.end = z.raw.end z.data.end = z.raw.end
s = convertNewlines(s)
if !z.textIsRaw { if !z.textIsRaw {
s = unescape(s) s = unescape(s)
} }
...@@ -739,7 +772,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { ...@@ -739,7 +772,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
z.nAttrReturned++ z.nAttrReturned++
key = z.buf[x[0].start:x[0].end] key = z.buf[x[0].start:x[0].end]
val = z.buf[x[1].start:x[1].end] val = z.buf[x[1].start:x[1].end]
return lower(key), unescape(val), z.nAttrReturned < len(z.attr) return lower(key), unescape(convertNewlines(val)), z.nAttrReturned < len(z.attr)
} }
} }
return nil, nil, false return nil, nil, false
......
...@@ -592,6 +592,33 @@ loop: ...@@ -592,6 +592,33 @@ loop:
} }
} }
func TestConvertNewlines(t *testing.T) {
testCases := map[string]string{
"Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
"Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
"": "",
"\n": "\n",
"\n\r": "\n\n",
"\r": "\n",
"\r\n": "\n",
"\r\n\n": "\n\n",
"\r\n\r": "\n\n",
"\r\n\r\n": "\n\n",
"\r\r": "\n\n",
"\r\r\n": "\n\n",
"\r\r\n\n": "\n\n\n",
"\r\r\r\n": "\n\n\n",
"\r \n": "\n \n",
"xyz": "xyz",
}
for in, want := range testCases {
if got := string(convertNewlines([]byte(in))); got != want {
t.Errorf("input %q: got %q, want %q", in, got, want)
}
}
}
const ( const (
rawLevel = iota rawLevel = iota
lowLevel lowLevel
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment