Commit 54ec7193 authored by Rob Pike

fix string range to have full unicode range (up to 10FFFF).

add test for string range.

test has minor failure: after loop the index == len(s); should be len(s)-1
in this case.  according to spec, vars are left at position at last
iteration.

R=ken,rsc
DELTA=259  (161 added, 96 deleted, 2 changed)
OCL=27343
CL=27343
parent 9ddeb210
...@@ -52,6 +52,119 @@ enum ...@@ -52,6 +52,119 @@ enum
Runemax = 0x10FFFF, /* maximum rune value */ Runemax = 0x10FFFF, /* maximum rune value */
}; };
/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 * This is a slower but "safe" version of the old chartorune
 * that works on strings that are not necessarily null-terminated.
 *
 * If you know for sure that your string is null-terminated,
 * chartorune will be a bit faster.
 *
 * It is guaranteed not to read more than "length" bytes starting
 * at the incoming pointer.  This is to avoid possible access
 * violations.
 *
 * Arguments:
 *	rune	receives the decoded value, or Bad on any failure
 *	str	first byte of the sequence to decode
 *	length	number of bytes that may be read starting at str
 *
 * The return value is the number of bytes consumed:
 *	1-4	a well-formed sequence of that many bytes was decoded
 *	1	the bytes are malformed (impossible lead byte, bad
 *		continuation byte, non-shortest encoding, or a value
 *		above Runemax); *rune is set to Bad
 *	0	length <= 0, or the sequence appears to be well-formed
 *		but incomplete (i.e., to get the whole Rune we'd need
 *		to read past str+length); *rune is set to Bad
 *
 * A return of 0 consumes nothing, so a caller that advances by the
 * return value has to handle that case itself.
 */
int32
charntorune(int32 *rune, uint8 *str, int32 length)
{
	int32 c, c1, c2, c3, l;

	/* When we're not allowed to read anything */
	if(length <= 0)
		goto badlen;

	/*
	 * one character sequence (7-bit value)
	 *	00000-0007F => T1
	 */
	c = *(uint8*)str;
	if(c < Tx) {
		*rune = c;
		return 1;
	}

	/*
	 * A stray continuation byte (below T2) or the lead byte of a
	 * 5-byte or longer sequence (T5 and up) is malformed no matter
	 * what follows it.  Reject it before looking at length, so that
	 * such a byte near the end of the buffer is reported as a bad
	 * decoding (1) and not as an incomplete sequence (0).
	 */
	if(c < T2 || c >= T5)
		goto bad;

	/* If we can't read more than one character we must stop */
	if(length <= 1)
		goto badlen;

	/*
	 * two character sequence (11-bit value)
	 *	0080-07FF => T2 Tx
	 */
	c1 = *(uint8*)(str+1) ^ Tx;
	if(c1 & Testx)
		goto bad;
	if(c < T3) {
		l = ((c << Bitx) | c1) & Rune2;
		/* a value that fits in one byte is a non-shortest encoding */
		if(l <= Rune1)
			goto bad;
		*rune = l;
		return 2;
	}

	/* If we can't read more than two characters we must stop */
	if(length <= 2)
		goto badlen;

	/*
	 * three character sequence (16-bit value)
	 *	0800-FFFF => T3 Tx Tx
	 */
	c2 = *(uint8*)(str+2) ^ Tx;
	if(c2 & Testx)
		goto bad;
	if(c < T4) {
		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
		if(l <= Rune2)
			goto bad;
		*rune = l;
		return 3;
	}

	/* If we can't read more than three characters we must stop */
	if(length <= 3)
		goto badlen;

	/*
	 * four character sequence (21-bit value, limited to Runemax)
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 *
	 * c < T5 was established above, so c is a four-byte lead here.
	 */
	c3 = *(uint8*)(str+3) ^ Tx;
	if(c3 & Testx)
		goto bad;
	l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
	if(l <= Rune3 || l > Runemax)
		goto bad;
	*rune = l;
	return 4;

	/*
	 * bad decoding
	 */
bad:
	*rune = Bad;
	return 1;

	/*
	 * nothing readable, or a sequence cut short by length
	 */
badlen:
	*rune = Bad;
	return 0;
}
int32 int32
runetochar(byte *str, int32 rune) /* note: in original, arg2 was pointer */ runetochar(byte *str, int32 rune) /* note: in original, arg2 was pointer */
{ {
......
...@@ -272,6 +272,7 @@ int32 strcmp(byte*, byte*); ...@@ -272,6 +272,7 @@ int32 strcmp(byte*, byte*);
int32 findnull(byte*); int32 findnull(byte*);
void dump(byte*, int32); void dump(byte*, int32);
int32 runetochar(byte*, int32); int32 runetochar(byte*, int32);
int32 charntorune(int32*, uint8*, int32);
/* /*
* very low level c-called * very low level c-called
......
...@@ -189,11 +189,9 @@ sys·arraystring(Array b, String s) ...@@ -189,11 +189,9 @@ sys·arraystring(Array b, String s)
FLUSH(&s); FLUSH(&s);
} }
static int32 chartorune(int32 *rune, byte *str);
enum enum
{ {
Runeself = 0x80, Runeself = 0x80,
Runeerror = 0xfffd,
}; };
// func stringiter(string, int) (retk int); // func stringiter(string, int) (retk int);
...@@ -213,13 +211,7 @@ sys·stringiter(String s, int32 k, int32 retk) ...@@ -213,13 +211,7 @@ sys·stringiter(String s, int32 k, int32 retk)
if(l >= Runeself) { if(l >= Runeself) {
// multi-char rune // multi-char rune
n = chartorune(&l, s.str+k); n = charntorune(&l, s.str+k, s.len-k);
if(k+n > s.len) {
// special case of multi-char rune
// that ran off end of string
l = Runeerror;
n = 1;
}
} }
retk = k+n; retk = k+n;
...@@ -246,13 +238,7 @@ sys·stringiter2(String s, int32 k, int32 retk, int32 retv) ...@@ -246,13 +238,7 @@ sys·stringiter2(String s, int32 k, int32 retk, int32 retv)
if(l >= Runeself) { if(l >= Runeself) {
// multi-char rune // multi-char rune
n = chartorune(&l, s.str+k); n = charntorune(&l, s.str+k, s.len-k);
if(k+n > s.len) {
// special case of multi-char rune
// that ran off end of string
l = Runeerror;
n = 1;
}
} }
retk = k+n; retk = k+n;
...@@ -262,85 +248,3 @@ out: ...@@ -262,85 +248,3 @@ out:
FLUSH(&retk); FLUSH(&retk);
FLUSH(&retv); FLUSH(&retv);
} }
//
// copied from plan9 library
//
// Bit-layout constants used by chartorune below to decode
// one-, two- and three-byte UTF-8 sequences.
//
enum
{
// BitN is the number of value bits carried by the lead byte of an
// N-byte sequence; Bitx is the number carried by each continuation byte.
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
// Tx is the tag (high bits) of a continuation byte; TN is the tag of
// the lead byte of an N-byte sequence.  chartorune classifies a lead
// byte by comparing it against these values in increasing order, so
// T4 only serves as the upper bound for a three-byte lead.
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
// RuneN is the largest value an N-byte sequence can hold.  It is used
// both as the mask that strips the tag bits from the assembled value
// and, one size down, as the bound for rejecting non-shortest encodings.
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
// Maskx selects the value bits of a continuation byte.  Testx selects
// its tag bits: after a byte is XORed with Tx, any bit left under Testx
// means the byte was not a continuation byte.
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
};
/*
 * Decode the UTF-8 sequence starting at str into *rune and return the
 * number of bytes it occupies (1, 2 or 3).
 *
 * There is no length argument: str[1] is read whenever str[0] is not
 * a 7-bit character, and str[2] whenever str[1] is a continuation byte
 * and str[0] is at or above T3, so those bytes must be readable.
 *
 * Any malformed, non-shortest or longer-than-three-byte sequence
 * yields *rune = Runeerror and a return value of 1.
 */
static int32
chartorune(int32 *rune, byte *str)
{
	int32 lead, cont1, cont2, val;

	/*
	 * one character sequence
	 *	00000-0007F => T1
	 */
	lead = str[0];
	if(lead < Tx) {
		*rune = lead;
		return 1;
	}

	/* every longer form needs a continuation byte in second place */
	cont1 = str[1] ^ Tx;
	if((cont1 & Testx) == 0) {
		if(lead < T3) {
			/*
			 * two character sequence
			 *	0080-07FF => T2 Tx
			 * a lead below T2 is itself a continuation byte
			 */
			if(lead >= T2) {
				val = ((lead << Bitx) | cont1) & Rune2;
				if(val > Rune1) {
					*rune = val;
					return 2;
				}
			}
		} else {
			/*
			 * three character sequence
			 *	0800-FFFF => T3 Tx Tx
			 */
			cont2 = str[2] ^ Tx;
			if((cont2 & Testx) == 0 && lead < T4) {
				val = ((((lead << Bitx) | cont1) << Bitx) | cont2) & Rune3;
				if(val > Rune2) {
					*rune = val;
					return 3;
				}
			}
		}
	}

	/*
	 * bad decoding
	 */
	*rune = Runeerror;
	return 1;
}
...@@ -67,6 +67,10 @@ panic PC=xxx ...@@ -67,6 +67,10 @@ panic PC=xxx
=========== ./sigchld.go =========== ./sigchld.go
survived SIGCHLD survived SIGCHLD
=========== ./stringrange.go
after loop i is 18 not 17
FAIL
=========== ./turing.go =========== ./turing.go
Hello World! Hello World!
......
// $G $F.go && $L $F.$A && ./$A.out
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import(
"fmt";
"utf8";
)
func main() {
s := "\000\123\x00\xca\xFE\u0123\ubabe\U0000babe\U0010FFFFx";
expect := []int{ 0, 0123, 0, 0xFFFD, 0xFFFD, 0x123, 0xbabe, 0xbabe, 0x10FFFF, 'x' };
var rune, size int;
offset := 0;
var i, c int;
ok := true;
cnum := 0;
for i, c = range s {
rune, size := utf8.DecodeRuneInString(s, i); // check it another way
if i != offset {
fmt.Printf("unexpected offset %d not %d\n", i, offset);
ok = false;
}
if rune != expect[cnum] {
fmt.Printf("unexpected rune %d from DecodeRuneInString: %x not %x\n", i, rune, expect[cnum]);
ok = false;
}
if c != expect[cnum] {
fmt.Printf("unexpected rune %d from range: %x not %x\n", i, rune, expect[cnum]);
ok = false;
}
offset += size;
cnum++;
}
if i != len(s)-1 {
fmt.Println("after loop i is", i, "not", len(s)-1);
ok = false;
}
if !ok {
fmt.Println("FAIL");
sys.Exit(1)
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment