Commit 74288f09 authored by Rob Pike

strconv: add QuoteToGraphic and friends

This version of quoting allows runes in category Zs, such as the
ideographic space characters, to be passed through unquoted.

Still to do (maybe): A way to access this from Printf.

Updates #11511.

Change-Id: I3bae84b1aa0bc1b885318d3f67c5f451099a2a5a
Reviewed-on: https://go-review.googlesource.com/14184
Reviewed-by: Marcel van Lohuizen <mpvl@golang.org>
parent 9ac0fff7
......@@ -635,3 +635,23 @@ var isNotPrint32 = []uint16{ // add 0x10000 to each entry
0xf57a,
0xf5a4,
}
// isGraphic lists the graphic runes not matched by IsPrint.
// The entries are in ascending order, which isInGraphicList relies on when
// it looks a rune up with bsearch16, and every value fits in 16 bits.
// NOTE(review): this table is emitted by the generator loop in
// makeisprint.go; presumably changes belong there rather than here.
var isGraphic = []uint16{
0x00a0,
0x1680,
0x2000,
0x2001,
0x2002,
0x2003,
0x2004,
0x2005,
0x2006,
0x2007,
0x2008,
0x2009,
0x200a,
0x202f,
0x205f,
0x3000,
}
......@@ -174,6 +174,23 @@ func main() {
}
fmt.Fprintf(&buf, "\t%#04x,\n", r-0x10000)
}
fmt.Fprintf(&buf, "}\n\n")
// The list of graphic but not "printable" runes is short. Just make one easy table.
fmt.Fprintf(&buf, "// isGraphic lists the graphic runes not matched by IsPrint.\n")
fmt.Fprintf(&buf, "var isGraphic = []uint16{\n")
for r := rune(0); r <= unicode.MaxRune; r++ {
if unicode.IsPrint(r) != unicode.IsGraphic(r) {
// Sanity check.
if !unicode.IsGraphic(r) {
log.Fatalf("%U is printable but not graphic\n", r)
}
if r > 0xFFFF { // We expect only 16-bit values.
log.Fatalf("%U too big for isGraphic\n", r)
}
fmt.Fprintf(&buf, "\t%#04x,\n", r)
}
}
fmt.Fprintf(&buf, "}\n")
data, err := format.Source(buf.Bytes())
......
......@@ -12,7 +12,7 @@ import (
const lowerhex = "0123456789abcdef"
func quoteWith(s string, quote byte, ASCIIonly bool) string {
func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
var runeTmp [utf8.UTFMax]byte
buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
buf = append(buf, quote)
......@@ -38,7 +38,7 @@ func quoteWith(s string, quote byte, ASCIIonly bool) string {
buf = append(buf, byte(r))
continue
}
} else if IsPrint(r) {
} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
n := utf8.EncodeRune(runeTmp[:], r)
buf = append(buf, runeTmp[:n]...)
continue
......@@ -90,7 +90,7 @@ func quoteWith(s string, quote byte, ASCIIonly bool) string {
// control characters and non-printable characters as defined by
// IsPrint.
func Quote(s string) string {
return quoteWith(s, '"', false)
return quoteWith(s, '"', false, false)
}
// AppendQuote appends a double-quoted Go string literal representing s,
......@@ -103,7 +103,7 @@ func AppendQuote(dst []byte, s string) []byte {
// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
// non-ASCII characters and non-printable characters as defined by IsPrint.
func QuoteToASCII(s string) string {
return quoteWith(s, '"', true)
return quoteWith(s, '"', true, false)
}
// AppendQuoteToASCII appends a double-quoted Go string literal representing s,
......@@ -112,12 +112,25 @@ func AppendQuoteToASCII(dst []byte, s string) []byte {
return append(dst, QuoteToASCII(s)...)
}
// QuoteToGraphic returns a double-quoted Go string literal representing s.
// The returned string leaves Unicode graphic characters, as defined by
// IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
// for non-graphic characters.
func QuoteToGraphic(s string) string {
	// Name the two mode flags so the call site documents itself: non-ASCII
	// runes are allowed through, and the graphic-only table is consulted in
	// addition to IsPrint.
	const (
		asciiOnly   = false
		graphicOnly = true
	)
	return quoteWith(s, '"', asciiOnly, graphicOnly)
}
// AppendQuoteToGraphic quotes s exactly as QuoteToGraphic does (a
// double-quoted Go string literal), appends the result to dst, and
// returns the extended buffer.
func AppendQuoteToGraphic(dst []byte, s string) []byte {
	quoted := QuoteToGraphic(s)
	return append(dst, quoted...)
}
// QuoteRune returns a single-quoted Go character literal representing the
// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
// for control characters and non-printable characters as defined by IsPrint.
func QuoteRune(r rune) string {
// TODO: avoid the allocation here.
return quoteWith(string(r), '\'', false)
return quoteWith(string(r), '\'', false, false)
}
// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
......@@ -127,12 +140,12 @@ func AppendQuoteRune(dst []byte, r rune) []byte {
}
// QuoteRuneToASCII returns a single-quoted Go character literal representing
// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
// \u0100) for non-ASCII characters and non-printable characters as defined
// by IsPrint.
func QuoteRuneToASCII(r rune) string {
// TODO: avoid the allocation here.
return quoteWith(string(r), '\'', true)
return quoteWith(string(r), '\'', true, false)
}
// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
......@@ -141,6 +154,21 @@ func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
return append(dst, QuoteRuneToASCII(r)...)
}
// QuoteRuneToGraphic returns a single-quoted Go character literal representing
// the rune. If the rune is a Unicode graphic character, as defined by
// IsGraphic, the returned string leaves it unchanged. Otherwise the string
// uses Go escape sequences (\t, \n, \xFF, \u0100).
func QuoteRuneToGraphic(r rune) string {
	// Name the two mode flags so the call site documents itself: non-ASCII
	// runes are allowed through, and the graphic-only table is consulted in
	// addition to IsPrint.
	const (
		asciiOnly   = false
		graphicOnly = true
	)
	// TODO: avoid the allocation here.
	return quoteWith(string(r), '\'', asciiOnly, graphicOnly)
}
// AppendQuoteRuneToGraphic quotes r exactly as QuoteRuneToGraphic does (a
// single-quoted Go character literal), appends the result to dst, and
// returns the extended buffer.
func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
	quoted := QuoteRuneToGraphic(r)
	return append(dst, quoted...)
}
// CanBackquote reports whether the string s can be represented
// unchanged as a single-line backquoted string without control
// characters other than tab.
......@@ -453,3 +481,26 @@ func IsPrint(r rune) bool {
j := bsearch16(isNotPrint, uint16(r))
return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
}
// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
// characters include letters, marks, numbers, punctuation, symbols, and
// spaces, from categories L, M, N, P, S, and Zs.
func IsGraphic(r rune) bool {
	// Everything IsPrint accepts is graphic; the extra table is consulted
	// only for runes IsPrint rejects.
	return IsPrint(r) || isInGraphicList(r)
}
// isInGraphicList reports whether the rune is in the isGraphic list. This separation
// from IsGraphic allows quoteWith to avoid two calls to IsPrint.
// Should be called only if IsPrint fails.
// Runes outside the range [0, 0xFFFF] are never in the list.
func isInGraphicList(r rune) bool {
	// The table holds 16-bit values only - see makeisprint.go - so anything
	// outside [0, 0xFFFF] cannot match. The lower bound matters: without it a
	// negative rune such as -0xFF60 would be truncated by the uint16
	// conversion to 0x00A0 and wrongly reported as graphic.
	if r < 0 || r > 0xFFFF {
		return false
	}
	rr := uint16(r)
	i := bsearch16(isGraphic, rr)
	return i < len(isGraphic) && rr == isGraphic[i]
}
......@@ -10,7 +10,7 @@ import (
"unicode"
)
// Verify that our isPrint agrees with unicode.IsPrint
// Verify that our IsPrint agrees with unicode.IsPrint.
func TestIsPrint(t *testing.T) {
n := 0
for r := rune(0); r <= unicode.MaxRune; r++ {
......@@ -24,19 +24,36 @@ func TestIsPrint(t *testing.T) {
}
}
// TestIsGraphic walks every rune from 0 through unicode.MaxRune and checks
// that our IsGraphic agrees with unicode.IsGraphic. It gives up once more
// than ten mismatches have been reported, to keep the output readable.
func TestIsGraphic(t *testing.T) {
	const maxReported = 10
	mismatches := 0
	for r := rune(0); r <= unicode.MaxRune; r++ {
		got := IsGraphic(r)
		if got == unicode.IsGraphic(r) {
			continue
		}
		t.Errorf("IsGraphic(%U)=%t incorrect", r, got)
		mismatches++
		if mismatches > maxReported {
			return
		}
	}
}
type quoteTest struct {
in string
out string
ascii string
in string
out string
ascii string
graphic string
}
var quotetests = []quoteTest{
{"\a\b\f\r\n\t\v", `"\a\b\f\r\n\t\v"`, `"\a\b\f\r\n\t\v"`},
{"\\", `"\\"`, `"\\"`},
{"abc\xffdef", `"abc\xffdef"`, `"abc\xffdef"`},
{"\u263a", `"☺"`, `"\u263a"`},
{"\U0010ffff", `"\U0010ffff"`, `"\U0010ffff"`},
{"\x04", `"\x04"`, `"\x04"`},
{"\a\b\f\r\n\t\v", `"\a\b\f\r\n\t\v"`, `"\a\b\f\r\n\t\v"`, `"\a\b\f\r\n\t\v"`},
{"\\", `"\\"`, `"\\"`, `"\\"`},
{"abc\xffdef", `"abc\xffdef"`, `"abc\xffdef"`, `"abc\xffdef"`},
{"\u263a", `"☺"`, `"\u263a"`, `"☺"`},
{"\U0010ffff", `"\U0010ffff"`, `"\U0010ffff"`, `"\U0010ffff"`},
{"\x04", `"\x04"`, `"\x04"`, `"\x04"`},
// Some non-printable but graphic runes. Final column is double-quoted.
{"!\u00a0!\u2000!\u3000!", `"!\u00a0!\u2000!\u3000!"`, `"!\u00a0!\u2000!\u3000!"`, "\"!\u00a0!\u2000!\u3000!\""},
}
func TestQuote(t *testing.T) {
......@@ -61,22 +78,38 @@ func TestQuoteToASCII(t *testing.T) {
}
}
// TestQuoteToGraphic runs every quotetests entry through QuoteToGraphic and
// AppendQuoteToGraphic and compares the result with the entry's graphic
// column; the append variant must also keep the existing buffer contents.
func TestQuoteToGraphic(t *testing.T) {
	const prefix = "abc"
	for _, tc := range quotetests {
		quoted := QuoteToGraphic(tc.in)
		if quoted != tc.graphic {
			t.Errorf("QuoteToGraphic(%s) = %s, want %s", tc.in, quoted, tc.graphic)
		}

		wantAppended := prefix + tc.graphic
		appended := AppendQuoteToGraphic([]byte(prefix), tc.in)
		if string(appended) != wantAppended {
			t.Errorf("AppendQuoteToGraphic(%q, %s) = %s, want %s", prefix, tc.in, appended, wantAppended)
		}
	}
}
type quoteRuneTest struct {
in rune
out string
ascii string
in rune
out string
ascii string
graphic string
}
var quoterunetests = []quoteRuneTest{
{'a', `'a'`, `'a'`},
{'\a', `'\a'`, `'\a'`},
{'\\', `'\\'`, `'\\'`},
{0xFF, `'ÿ'`, `'\u00ff'`},
{0x263a, `'☺'`, `'\u263a'`},
{0xfffd, `'�'`, `'\ufffd'`},
{0x0010ffff, `'\U0010ffff'`, `'\U0010ffff'`},
{0x0010ffff + 1, `'�'`, `'\ufffd'`},
{0x04, `'\x04'`, `'\x04'`},
{'a', `'a'`, `'a'`, `'a'`},
{'\a', `'\a'`, `'\a'`, `'\a'`},
{'\\', `'\\'`, `'\\'`, `'\\'`},
{0xFF, `'ÿ'`, `'\u00ff'`, `'ÿ'`},
{0x263a, `'☺'`, `'\u263a'`, `'☺'`},
{0xfffd, `'�'`, `'\ufffd'`, `'�'`},
{0x0010ffff, `'\U0010ffff'`, `'\U0010ffff'`, `'\U0010ffff'`},
{0x0010ffff + 1, `'�'`, `'\ufffd'`, `'�'`},
{0x04, `'\x04'`, `'\x04'`, `'\x04'`},
// Some differences between graphic and printable. Note the last column is double-quoted.
{'\u00a0', `'\u00a0'`, `'\u00a0'`, "'\u00a0'"},
{'\u2000', `'\u2000'`, `'\u2000'`, "'\u2000'"},
{'\u3000', `'\u3000'`, `'\u3000'`, "'\u3000'"},
}
func TestQuoteRune(t *testing.T) {
......@@ -101,6 +134,17 @@ func TestQuoteRuneToASCII(t *testing.T) {
}
}
// TestQuoteRuneToGraphic runs every quoterunetests entry through
// QuoteRuneToGraphic and AppendQuoteRuneToGraphic and compares the result
// with the entry's graphic column; the append variant must also keep the
// existing buffer contents.
func TestQuoteRuneToGraphic(t *testing.T) {
	const prefix = "abc"
	for _, tc := range quoterunetests {
		quoted := QuoteRuneToGraphic(tc.in)
		if quoted != tc.graphic {
			t.Errorf("QuoteRuneToGraphic(%U) = %s, want %s", tc.in, quoted, tc.graphic)
		}

		wantAppended := prefix + tc.graphic
		appended := AppendQuoteRuneToGraphic([]byte(prefix), tc.in)
		if string(appended) != wantAppended {
			t.Errorf("AppendQuoteRuneToGraphic(%q, %U) = %s, want %s", prefix, tc.in, appended, wantAppended)
		}
	}
}
type canBackquoteTest struct {
in string
out bool
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment