Commit a6869d1c authored by Russ Cox, committed by Brad Fitzpatrick

net/url: accept non-ASCII bytes in URL per RFC 3986

Fixes #7991.
Fixes #12719.

Change-Id: I5650fa35ec5d49addeda6cc6e7fa93cfbe1cdfc0
Reviewed-on: https://go-review.googlesource.com/17385
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
parent 84a875ca
...@@ -71,6 +71,7 @@ type encoding int ...@@ -71,6 +71,7 @@ type encoding int
const ( const (
encodePath encoding = 1 + iota encodePath encoding = 1 + iota
encodeHost encodeHost
encodeZone
encodeUserPassword encodeUserPassword
encodeQueryComponent encodeQueryComponent
encodeFragment encodeFragment
...@@ -93,7 +94,7 @@ func shouldEscape(c byte, mode encoding) bool { ...@@ -93,7 +94,7 @@ func shouldEscape(c byte, mode encoding) bool {
return false return false
} }
if mode == encodeHost { if mode == encodeHost || mode == encodeZone {
// §3.2.2 Host allows // §3.2.2 Host allows
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
// as part of reg-name. // as part of reg-name.
...@@ -166,6 +167,27 @@ func unescape(s string, mode encoding) (string, error) { ...@@ -166,6 +167,27 @@ func unescape(s string, mode encoding) (string, error) {
} }
return "", EscapeError(s) return "", EscapeError(s)
} }
// Per https://tools.ietf.org/html/rfc3986#page-21
// in the host component %-encoding can only be used
// for non-ASCII bytes.
// But https://tools.ietf.org/html/rfc6874#section-2
// introduces %25 being allowed to escape a percent sign
// in IPv6 scoped-address literals. Yay.
if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
return "", EscapeError(s[i : i+3])
}
if mode == encodeZone {
// RFC 6874 says basically "anything goes" for zone identifiers
// and that even non-ASCII can be redundantly escaped,
// but it seems prudent to restrict %-escaped bytes here to those
// that are valid host name bytes in their unescaped form.
// That is, you can use escaping in the zone identifier but not
// to introduce bytes you couldn't just write directly.
v := unhex(s[i+1])<<4 | unhex(s[i+2])
if s[i:i+3] != "%25" && shouldEscape(v, encodeHost) {
return "", EscapeError(s[i : i+3])
}
}
i += 3 i += 3
case '+': case '+':
hasPlus = mode == encodeQueryComponent hasPlus = mode == encodeQueryComponent
...@@ -496,14 +518,9 @@ func parseAuthority(authority string) (user *Userinfo, host string, err error) { ...@@ -496,14 +518,9 @@ func parseAuthority(authority string) (user *Userinfo, host string, err error) {
// parseHost parses host as an authority without user // parseHost parses host as an authority without user
// information. That is, as host[:port]. // information. That is, as host[:port].
func parseHost(host string) (string, error) { func parseHost(host string) (string, error) {
litOrName := host
if strings.HasPrefix(host, "[") { if strings.HasPrefix(host, "[") {
// Parse an IP-Literal in RFC 3986 and RFC 6874. // Parse an IP-Literal in RFC 3986 and RFC 6874.
// E.g., "[fe80::1], "[fe80::1%25en0]" // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
//
// RFC 4007 defines "%" as a delimiter character in
// the textual representation of IPv6 addresses.
// Per RFC 6874, in URIs that "%" is encoded as "%25".
i := strings.LastIndex(host, "]") i := strings.LastIndex(host, "]")
if i < 0 { if i < 0 {
return "", errors.New("missing ']' in host") return "", errors.New("missing ']' in host")
...@@ -512,29 +529,31 @@ func parseHost(host string) (string, error) { ...@@ -512,29 +529,31 @@ func parseHost(host string) (string, error) {
if !validOptionalPort(colonPort) { if !validOptionalPort(colonPort) {
return "", fmt.Errorf("invalid port %q after host", colonPort) return "", fmt.Errorf("invalid port %q after host", colonPort)
} }
// Parse a host subcomponent without a ZoneID in RFC
// 6874 because the ZoneID is allowed to use the // RFC 6874 defines that %25 (%-encoded percent) introduces
// percent encoded form. // the zone identifier, and the zone identifier can use basically
j := strings.Index(host[:i], "%25") // any %-encoding it likes. That's different from the host, which
if j < 0 { // can only %-encode non-ASCII bytes.
litOrName = host[1:i] // We do impose some restrictions on the zone, to avoid stupidity
} else { // like newlines.
litOrName = host[1:j] zone := strings.Index(host[:i], "%25")
if zone >= 0 {
host1, err := unescape(host[:zone], encodeHost)
if err != nil {
return "", err
}
host2, err := unescape(host[zone:i], encodeZone)
if err != nil {
return "", err
}
host3, err := unescape(host[i:], encodeHost)
if err != nil {
return "", err
}
return host1 + host2 + host3, nil
} }
} }
// A URI containing an IP-Literal without a ZoneID or
// IPv4address in RFC 3986 and RFC 6847 must not be
// percent-encoded.
//
// A URI containing a DNS registered name in RFC 3986 is
// allowed to be percent-encoded, though we don't use it for
// now to avoid messing up with the gap between allowed
// characters in URI and allowed characters in DNS.
// See golang.org/issue/7991.
if strings.Contains(litOrName, "%") {
return "", errors.New("percent-encoded characters in host")
}
var err error var err error
if host, err = unescape(host, encodeHost); err != nil { if host, err = unescape(host, encodeHost); err != nil {
return "", err return "", err
......
...@@ -483,6 +483,34 @@ var urltests = []URLTest{ ...@@ -483,6 +483,34 @@ var urltests = []URLTest{
}, },
"", "",
}, },
// golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host)
{
"http://hello.世界.com/foo",
&URL{
Scheme: "http",
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%e4%b8%96%e7%95%8c.com/foo",
&URL{
Scheme: "http",
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
&URL{
Scheme: "http",
Host: "hello.世界.com",
Path: "/foo",
},
"",
},
} }
// more useful string for debugging than fmt's struct printer // more useful string for debugging than fmt's struct printer
...@@ -1184,11 +1212,11 @@ func TestParseAuthority(t *testing.T) { ...@@ -1184,11 +1212,11 @@ func TestParseAuthority(t *testing.T) {
{"http://[::1]%23", true}, {"http://[::1]%23", true},
{"http://[::1%25en0]", false}, // valid zone id {"http://[::1%25en0]", false}, // valid zone id
{"http://[::1]:", false}, // colon, but no port OK {"http://[::1]:", false}, // colon, but no port OK
{"http://[::1]:%38%30", true}, // no hex in port {"http://[::1]:%38%30", true}, // not allowed: % encoding only for non-ASCII
{"http://[::1%25%10]", false}, // TODO: reject the %10 after the valid zone %25 separator? {"http://[::1%25%41]", false}, // RFC 6874 allows over-escaping in zone
{"http://[%10::1]", true}, // no %xx escapes in IP address {"http://[%10::1]", true}, // no %xx escapes in IP address
{"http://[::1]/%48", false}, // %xx in path is fine {"http://[::1]/%48", false}, // %xx in path is fine
{"http://%41:8080/", true}, // TODO: arguably we should accept reg-name with %xx {"http://%41:8080/", true}, // not allowed: % encoding only for non-ASCII
{"mysql://x@y(z:123)/foo", false}, // golang.org/issue/12023 {"mysql://x@y(z:123)/foo", false}, // golang.org/issue/12023
{"mysql://x@y(1.2.3.4:123)/foo", false}, {"mysql://x@y(1.2.3.4:123)/foo", false},
{"mysql://x@y([2001:db8::1]:123)/foo", false}, {"mysql://x@y([2001:db8::1]:123)/foo", false},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment