Commit f17e3d22 authored by Mike Samuel's avatar Mike Samuel

exp/template/html: handle custom attrs and HTML5 embedded elements.

HTML5 allows embedded SVG and MathML.
Code searches show SVG is used for graphing.

This changes transition to deal with constructs like
   <svg xmlns:xlink="http://www.w3.org/1999/xlink">
It changes attr and clients to call a single function that combines
the name lookup and "on" prefix check to determine an attribute
value type given an attribute name.

That function uses heuristics to recognize that
     xlink:href and svg:href
have URL content, and that data-url is likely contains URL content,
since "javascript:" injection is such a problem.

I did a code search over a closure templates codebase to determine
patterns of custom attribute usage.  I did something like

$ find . -name \*.soy | \
    xargs egrep perl -ne 'while (s/\b((data-|\w+:)\w+)\s*=//) { print "$1\n"; }' | \
    sort | uniq

to produce the list at the bottom.

Filtering that by egrep -i 'src|url|uri' produces

data-docConsumptionUri
data-docIconUrl
data-launchUrl
data-lazySrc
data-pageUrl
data-shareurl
data-suggestServerUrl
data-tweetUrl
g:secondaryurls
g:url

which seem to match all the ones that are likely URL content.
There are some short words that match that heuristic, but I still think it decent since
any custom attribute that has a numeric or enumerated keyword value will be unaffected by
the URL assumption.
Counterexamples from /usr/share/dict:
during, hourly, maturity, nourish, purloin, security, surly

Custom attributes present in existing closure templates codebase:
buzz:aid
data-a
data-action
data-actor
data-allowEqualityOps
data-analyticsId
data-bid
data-c
data-cartId
data-categoryId
data-cid
data-command
data-count
data-country
data-creativeId
data-cssToken
data-dest
data-docAttribution
data-docConsumptionUri
data-docCurrencyCode
data-docIconUrl
data-docId
data-docPrice
data-docPriceMicros
data-docTitle
data-docType
data-docid
data-email
data-entityid
data-errorindex
data-f
data-feature
data-fgid
data-filter
data-fireEvent
data-followable
data-followed
data-hashChange
data-height
data-hover
data-href
data-id
data-index
data-invitable
data-isFree
data-isPurchased
data-jid
data-jumpid
data-launchUrl
data-lazySrc
data-listType
data-maxVisiblePages
data-name
data-nid
data-nodeid
data-numItems
data-numPerPage
data-offerType
data-oid
data-opUsesEquality
data-overflowclass
data-packageName
data-pageId
data-pageUrl
data-pos
data-priceBrief
data-profileIds
data-query
data-rating
data-ref
data-rentalGrantPeriodDays
data-rentalactivePeriodHours
data-reviewId
data-role
data-score
data-shareurl
data-showGeLe
data-showLineInclude
data-size
data-sortval
data-suggestServerType
data-suggestServerUrl
data-suggestionIndex
data-tabBarId
data-tabBarIndex
data-tags
data-target
data-textColor
data-theme
data-title
data-toggletarget
data-tooltip
data-trailerId
data-transactionId
data-transition
data-ts
data-tweetContent
data-tweetUrl
data-type
data-useAjax
data-value
data-width
data-x
dm:index
dm:type
g:aspects
g:decorateusingsecondary
g:em
g:entity
g:groups
g:id
g:istoplevel
g:li
g:numresults
g:oid
g:parentId
g:pl
g:pt
g:rating_override
g:secondaryurls
g:sortby
g:startindex
g:target
g:type
g:url
g:value
ga:barsize
ga:css
ga:expandAfterCharsExceed
ga:initialNumRows
ga:nocancelicon
ga:numRowsToExpandTo
ga:type
ga:unlockwhenrated
gw:address
gw:businessname
gw:comment
gw:phone
gw:source
ng:controller
xlink:href
xml:lang
xmlns:atom
xmlns:dc
xmlns:jstd
xmlns:ng
xmlns:og
xmlns:webstore
xmlns:xlink

R=nigeltao
CC=golang-dev
https://golang.org/cl/5119041
parent 582bb304
......@@ -4,16 +4,19 @@
package html
// attrType[n] describes the value of the given attribute.
import (
"strings"
)
// attrTypeMap[n] describes the value of the given attribute.
// If an attribute affects (or can mask) the encoding or interpretation of
// other content, or affects the contents, idempotency, or credentials of a
// network message, then the value in this map is contentTypeUnsafe.
// This map is derived from HTML5, specifically
// http://www.w3.org/TR/html5/Overview.html#attributes-1 and
// http://www.w3.org/TR/html5/Overview.html#event-handlers-on-elements-document-objects-and-window-objects
// http://www.w3.org/TR/html5/Overview.html#attributes-1
// as well as "%URI"-typed attributes from
// http://www.w3.org/TR/html4/index/attributes.html
var attrType = map[string]contentType{
var attrTypeMap = map[string]contentType{
"accept": contentTypePlain,
"accept-charset": contentTypeUnsafe,
"action": contentTypeURL,
......@@ -86,60 +89,9 @@ var attrType = map[string]contentType{
"multiple": contentTypePlain,
"name": contentTypePlain,
"novalidate": contentTypeUnsafe,
"onabort": contentTypeJS,
"onblur": contentTypeJS,
"oncanplay": contentTypeJS,
"oncanplaythrough": contentTypeJS,
"onchange": contentTypeJS,
"onclick": contentTypeJS,
"oncontextmenu": contentTypeJS,
"oncuechange": contentTypeJS,
"ondblclick": contentTypeJS,
"ondrag": contentTypeJS,
"ondragend": contentTypeJS,
"ondragenter": contentTypeJS,
"ondragleave": contentTypeJS,
"ondragover": contentTypeJS,
"ondragstart": contentTypeJS,
"ondrop": contentTypeJS,
"ondurationchange": contentTypeJS,
"onemptied": contentTypeJS,
"onended": contentTypeJS,
"onerror": contentTypeJS,
"onfocus": contentTypeJS,
"oninput": contentTypeJS,
"oninvalid": contentTypeJS,
"onkeydown": contentTypeJS,
"onkeypress": contentTypeJS,
"onkeyup": contentTypeJS,
"onload": contentTypeJS,
"onloadeddata": contentTypeJS,
"onloadedmetadata": contentTypeJS,
"onloadstart": contentTypeJS,
"onmousedown": contentTypeJS,
"onmousemove": contentTypeJS,
"onmouseout": contentTypeJS,
"onmouseover": contentTypeJS,
"onmouseup": contentTypeJS,
"onmousewheel": contentTypeJS,
"onpause": contentTypeJS,
"onplay": contentTypeJS,
"onplaying": contentTypeJS,
"onprogress": contentTypeJS,
"onratechange": contentTypeJS,
"onreadystatechange": contentTypeJS,
"onreset": contentTypeJS,
"onscroll": contentTypeJS,
"onseeked": contentTypeJS,
"onseeking": contentTypeJS,
"onselect": contentTypeJS,
"onshow": contentTypeJS,
"onstalled": contentTypeJS,
"onsubmit": contentTypeJS,
"onsuspend": contentTypeJS,
"ontimeupdate": contentTypeJS,
"onvolumechange": contentTypeJS,
"onwaiting": contentTypeJS,
// Skip handler names from
// http://www.w3.org/TR/html5/Overview.html#event-handlers-on-elements-document-objects-and-window-objects
// since we have special handling in attrType.
"open": contentTypePlain,
"optimum": contentTypePlain,
"pattern": contentTypeUnsafe,
......@@ -179,6 +131,45 @@ var attrType = map[string]contentType{
"value": contentTypeUnsafe,
"width": contentTypePlain,
"wrap": contentTypePlain,
"xmlns": contentTypeURL,
}
// attrType returns a conservative (upper-bound on authority) guess at the
// type of the named attribute.
func attrType(name string) contentType {
name = strings.ToLower(name)
if strings.HasPrefix(name, "data-") {
// Strip data- so that custom attribute heuristics below are
// widely applied.
// Treat data-action as URL below.
name = name[5:]
} else if colon := strings.IndexRune(name, ':'); colon != -1 {
if name[:colon] == "xmlns" {
return contentTypeURL
}
// Treat svg:href and xlink:href as href below.
name = name[colon+1:]
}
if t, ok := attrTypeMap[name]; ok {
return t
}
// Treat partial event handler names as script.
if strings.HasPrefix(name, "on") {
return contentTypeJS
}
// TODO: data-* attrs? Recognize data-foo-url and similar.
// Heuristics to prevent "javascript:..." injection in custom
// data attributes and custom attributes like g:tweetUrl.
// http://www.w3.org/TR/html5/elements.html#embedding-custom-non-visible-data-with-the-data-attributes:
// "Custom data attributes are intended to store custom data
// private to the page or application, for which there are no
// more appropriate attributes or elements."
// Developers seem to store URL content in data URLs that start
// or end with "URI" or "URL".
if strings.Contains(name, "src") ||
strings.Contains(name, "uri") ||
strings.Contains(name, "url") {
return contentTypeURL
}
return contentTypePlain
}
......@@ -1400,6 +1400,66 @@ func TestEscapeText(t *testing.T) {
`<style>value`,
context{state: stateCSS, element: elementStyle},
},
{
`<a xlink:href`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a xmlns`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a xmlns:foo`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a xmlnsxyz`,
context{state: stateAttrName},
},
{
`<a data-url`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a data-iconUri`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a data-urlItem`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a g:`,
context{state: stateAttrName},
},
{
`<a g:url`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a g:iconUri`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a g:urlItem`,
context{state: stateAttrName, attr: attrURL},
},
{
`<a g:value`,
context{state: stateAttrName},
},
{
`<a svg:style='`,
context{state: stateCSS, delim: delimSingleQuote},
},
{
`<svg:font-face`,
context{state: stateTag},
},
{
`<svg:a svg:onclick="`,
context{state: stateJS, delim: delimDoubleQuote},
},
}
for _, test := range tests {
......
......@@ -230,7 +230,7 @@ func htmlNameFilter(args ...interface{}) string {
return filterFailsafe
}
s = strings.ToLower(s)
if t := attrType[s]; t != contentTypePlain && attrType["on"+s] != contentTypeJS {
if t := attrType(s); t != contentTypePlain {
// TODO: Split attr and element name part filters so we can whitelist
// attributes.
return filterFailsafe
......
......@@ -106,18 +106,13 @@ func tTag(c context, s []byte) (context, int) {
err: errorf(ErrBadHTML, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
}, len(s)
}
canonAttrName := strings.ToLower(string(s[i:j]))
switch attrType[canonAttrName] {
switch attrType(string(s[i:j])) {
case contentTypeURL:
attr = attrURL
case contentTypeCSS:
attr = attrStyle
case contentTypeJS:
attr = attrScript
default:
if strings.HasPrefix(canonAttrName, "on") {
attr = attrScript
}
}
if j == len(s) {
state = stateAttrName
......@@ -512,16 +507,34 @@ var elementNameMap = map[string]element{
"title": elementTitle,
}
// asciiAlpha returns whether c is an ASCII letter.
func asciiAlpha(c byte) bool {
return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
}
// asciiAlphaNum returns whether c is an ASCII letter or digit.
func asciiAlphaNum(c byte) bool {
return asciiAlpha(c) || '0' <= c && c <= '9'
}
// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
func eatTagName(s []byte, i int) (int, element) {
j := i
for ; j < len(s); j++ {
if i == len(s) || !asciiAlpha(s[i]) {
return i, elementNone
}
j := i + 1
for j < len(s) {
x := s[j]
if !(('a' <= x && x <= 'z') ||
('A' <= x && x <= 'Z') ||
('0' <= x && x <= '9' && i != j)) {
break
if asciiAlphaNum(x) {
j++
continue
}
// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
j += 2
continue
}
break
}
return j, elementNameMap[strings.ToLower(string(s[i:j]))]
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment