Commit 57d9de3a authored by Russ Cox's avatar Russ Cox

utf16: add DecodeRune, EncodeRune

R=r
CC=golang-dev
https://golang.org/cl/970041
parent aa2187ef
...@@ -18,6 +18,33 @@ const ( ...@@ -18,6 +18,33 @@ const (
surrSelf = 0x10000 surrSelf = 0x10000
) )
// IsSurrogate returns true if the specified Unicode code point
// can appear in a surrogate pair.
func IsSurrogate(rune int) bool {
return surr1 <= rune && rune < surr3
}
// DecodeRune returns the UTF-16 decoding of a surrogate pair.
// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns
// the Unicode replacement code point U+FFFD.
func DecodeRune(r1, r2 int) int {
if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
return (int(r1)-surr1)<<10 | (int(r2) - surr2) + 0x10000
}
return unicode.ReplacementChar
}
// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
// If the rune is not a valid Unicode code point or does not need encoding,
// EncodeRune returns U+FFFD, U+FFFD.
func EncodeRune(rune int) (r1, r2 int) {
if rune < surrSelf || rune > unicode.MaxRune || IsSurrogate(rune) {
return unicode.ReplacementChar, unicode.ReplacementChar
}
rune -= surrSelf
return surr1 + (rune>>10)&0x3ff, surr2 + rune&0x3ff
}
// Encode returns the UTF-16 encoding of the Unicode code point sequence s. // Encode returns the UTF-16 encoding of the Unicode code point sequence s.
func Encode(s []int) []uint16 { func Encode(s []int) []uint16 {
n := len(s) n := len(s)
...@@ -38,9 +65,9 @@ func Encode(s []int) []uint16 { ...@@ -38,9 +65,9 @@ func Encode(s []int) []uint16 {
a[n] = uint16(v) a[n] = uint16(v)
n++ n++
default: default:
v -= surrSelf r1, r2 := EncodeRune(v)
a[n] = uint16(surr1 + (v>>10)&0x3ff) a[n] = uint16(r1)
a[n+1] = uint16(surr2 + v&0x3ff) a[n+1] = uint16(r2)
n += 2 n += 2
} }
} }
...@@ -57,7 +84,7 @@ func Decode(s []uint16) []int { ...@@ -57,7 +84,7 @@ func Decode(s []uint16) []int {
case surr1 <= r && r < surr2 && i+1 < len(s) && case surr1 <= r && r < surr2 && i+1 < len(s) &&
surr2 <= s[i+1] && s[i+1] < surr3: surr2 <= s[i+1] && s[i+1] < surr3:
// valid surrogate sequence // valid surrogate sequence
a[n] = (int(r)-surr1)<<10 | (int(s[i+1]) - surr2) + 0x10000 a[n] = DecodeRune(int(r), int(s[i+1]))
i++ i++
n++ n++
case surr1 <= r && r < surr3: case surr1 <= r && r < surr3:
......
...@@ -8,6 +8,7 @@ import ( ...@@ -8,6 +8,7 @@ import (
"fmt" "fmt"
"reflect" "reflect"
"testing" "testing"
"unicode"
) )
type encodeTest struct { type encodeTest struct {
...@@ -32,6 +33,41 @@ func TestEncode(t *testing.T) { ...@@ -32,6 +33,41 @@ func TestEncode(t *testing.T) {
} }
} }
func TestEncodeRune(t *testing.T) {
for i, tt := range encodeTests {
j := 0
for _, r := range tt.in {
r1, r2 := EncodeRune(r)
if r < 0x10000 || r > unicode.MaxRune {
if j >= len(tt.out) {
t.Errorf("#%d: ran out of tt.out", i)
break
}
if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar {
t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, r1, r2)
}
j++
} else {
if j+1 >= len(tt.out) {
t.Errorf("#%d: ran out of tt.out", i)
break
}
if r1 != int(tt.out[j]) || r2 != int(tt.out[j+1]) {
t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, r1, r2, tt.out[j], tt.out[j+1])
}
j += 2
dec := DecodeRune(r1, r2)
if dec != r {
t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", r1, r2, dec, r)
}
}
}
if j != len(tt.out) {
t.Errorf("#%d: EncodeRune didn't generate enough output", i)
}
}
}
type decodeTest struct { type decodeTest struct {
in []uint16 in []uint16
out []int out []int
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment