big.nat: Improved speed of nat-to-string conversion

Three optimizations: First, special-case power of two bases that partion a Word(), bases 2, 4, 16, and 256. These can be moved directly from internal Word() storage to the output without multiprecision operations. Next, same approach for the other power-of-two bases, 8, 32, 64, and 128. These don't fill a Word() evenly, so special handling is needed for those cases where input spans the high-bits of one Word and the low bis of the next one. Finally, implement the general case for others bases in 2 <= base <= 256 using superbases, the largest power of base representable in a Word(). For base ten, this is 9 digits and a superbase of 10^9 for 32-bit Words and 19 digits and 10^19 for 64-bit compiles. This way we do just 1/9th or 1/19th of the expensive multiprecision divisions, unpacking superdigits using fast native machine arithmetic. The resulting code runs 7x to 800x the speed of the previous approach, depending on the length of the number to be converted--longer is relatively faster. Also, extended the tests and benchmarks for string to nat (scan()) and nat to string (string()) functions. A further enhancement awaits the next CL to make general cases about 7x faster for long cases. R=gri CC=golang-dev https://golang.org/cl/4595041

big.nat: Improved speed of nat-to-string conversion
Three optimizations: First, special-case power of two bases that partion a Word(), bases 2, 4, 16, and 256. These can be moved directly from internal Word() storage to the output without multiprecision operations. Next, same approach for the other power-of-two bases, 8, 32, 64, and 128. These don't fill a Word() evenly, so special handling is needed for those cases where input spans the high-bits of one Word and the low bis of the next one. Finally, implement the general case for others bases in 2 <= base <= 256 using superbases, the largest power of base representable in a Word(). For base ten, this is 9 digits and a superbase of 10^9 for 32-bit Words and 19 digits and 10^19 for 64-bit compiles. This way we do just 1/9th or 1/19th of the expensive multiprecision divisions, unpacking superdigits using fast native machine arithmetic. The resulting code runs 7x to 800x the speed of the previous approach, depending on the length of the number to be converted--longer is relatively faster. Also, extended the tests and benchmarks for string to nat (scan()) and nat to string (string()) functions. A further enhancement awaits the next CL to make general cases about 7x faster for long cases. R=gri CC=golang-dev https://golang.org/cl/4595041
d5c45c54 · Michael T. Jones · Robert Griesemer · c5281d7a · d5c45c54 · d5c45c54
Commit d5c45c54 authored Jun 07, 2011 by Michael T. Jones Committed by Robert Griesemer Jun 07, 2011
Show whitespace changes
Inline Side-by-side

Showing with 322 additions and 16 deletions

nat.go src/pkg/big/nat.go +110 -15

nat_test.go src/pkg/big/nat_test.go +212 -1

No files found.
--- a/src/pkg/big/nat.go
+++ b/src/pkg/big/nat.go
@@ -615,6 +615,7 @@ func (x nat) bitLen() int {
 // MaxBase is the largest number base accepted for string conversions.
 const MaxBase = 'z' - 'a' + 10 + 1 // = hexValue('z') + 1

+
 func hexValue(ch int) Word {
 	d := MaxBase + 1 // illegal base
 	switch {
@@ -733,46 +734,137 @@ const (
 	uppercaseDigits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 )

+
+// decimalString returns a decimal representation of x.
+// It calls x.string with the charset "0123456789".
+func (x nat) decimalString() string {
+	return x.string(lowercaseDigits[0:10])
+}
+
+
 // string converts x to a string using digits from a charset; a digit with
 // value d is represented by charset[d]. The conversion base is determined
 // by len(charset), which must be >= 2.
 func (x nat) string(charset string) string {
-	base := len(charset)
+	b := Word(len(charset))

 	// special cases
 	switch {
-	case base < 2:
+	case b < 2 || b > 256:
 		panic("illegal base")
 	case len(x) == 0:
 		return string(charset[0])
 	}

 	// allocate buffer for conversion
-	i := x.bitLen()/log2(Word(base)) + 1 // +1: round up
+	i := x.bitLen()/log2(b) + 1 // +1: round up
 	s := make([]byte, i)

-	// don't destroy x
+	// special case: power of two bases can avoid divisions completely
+	if b == b&-b {
+		// shift is base-b digit size in bits
+		shift := uint(trailingZeroBits(b)) // shift > 0 because b >= 2
+		m := len(x)
+		mask := Word(1)<<shift - 1
+		w := x[0]
+		nbits := uint(_W) // number of unprocessed bits in w
+
+		// convert less-significant words
+		for k := 0; k < m-1; k++ {
+			// convert full digits
+			for nbits >= shift {
+				i--
+				s[i] = charset[w&mask]
+				w >>= shift
+				nbits -= shift
+			}
+
+			// convert any partial leading digit and advance to next word
+			if nbits == 0 {
+				// no partial digit remaining, just advance
+				w = x[k+1]
+				nbits = _W
+			} else {
+				// partial digit in current (k) and next (k+1) word
+				w |= x[k+1] << nbits
+				i--
+				s[i] = charset[w&mask]
+
+				// advance
+				w = x[k+1] >> (shift - nbits)
+				nbits = _W - (shift - nbits)
+			}
+		}
+
+		// convert digits of most-significant word (omit leading zeros)
+		for nbits >= 0 && w != 0 {
+			i--
+			s[i] = charset[w&mask]
+			w >>= shift
+			nbits -= shift
+		}
+
+		return string(s[i:])
+	}
+
+	// general case: extract groups of digits by multiprecision division
+
+	// maximize ndigits where b**ndigits < 2^_W; bb (big base) is b**ndigits
+	bb := Word(1)
+	ndigits := 0
+	for max := Word(_M / b); bb <= max; bb *= b {
+		ndigits++
+	}
+
+	// preserve x, create local copy for use in repeated divisions
 	q := nat(nil).set(x)
+	var r Word

 	// convert
+	if b == 10 { // hard-coding for 10 here speeds this up by 1.25x
 		for len(q) > 0 {
+			// extract least significant, base bb "digit"
+			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
+			if len(q) == 0 {
+				// skip leading zeros in most-significant group of digits
+				for j := 0; j < ndigits && r != 0; j++ {
 					i--
-		var r Word
-		q, r = q.divW(q, Word(base))
-		s[i] = charset[r]
+					s[i] = charset[r%10]
+					r /= 10
+				}
+			} else {
+				for j := 0; j < ndigits; j++ {
+					i--
+					s[i] = charset[r%10]
+					r /= 10
+				}
+			}
+		}
+	} else {
+		for len(q) > 0 {
+			// extract least significant group of digits
+			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
+			if len(q) == 0 {
+				// skip leading zeros in most-significant group of digits
+				for j := 0; j < ndigits && r != 0; j++ {
+					i--
+					s[i] = charset[r%b]
+					r /= b
+				}
+			} else {
+				for j := 0; j < ndigits; j++ {
+					i--
+					s[i] = charset[r%b]
+					r /= b
+				}
+			}
+		}
 	}

 	return string(s[i:])
 }


-// decimalString returns a decimal representation of x.
-// It calls x.string with the charset "0123456789".
-func (x nat) decimalString() string {
-	return x.string(lowercaseDigits[0:10])
-}
-
-
 const deBruijn32 = 0x077CB531

 var deBruijn32Lookup = []byte{
@@ -789,6 +881,7 @@ var deBruijn64Lookup = []byte{
 	54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
 }

+
 // trailingZeroBits returns the number of consecutive zero bits on the right
 // side of the given Word.
 // See Knuth, volume 4, section 7.3.1
@@ -960,7 +1053,9 @@ func (z nat) xor(x, y nat) nat {


 // greaterThan returns true iff (x1<<_W + x2) > (y1<<_W + y2)
-func greaterThan(x1, x2, y1, y2 Word) bool { return x1 > y1 || x1 == y1 && x2 > y2 }
+func greaterThan(x1, x2, y1, y2 Word) bool {
+	return x1 > y1 || x1 == y1 && x2 > y2
+}


 // modW returns x % d.

--- a/src/pkg/big/nat_test.go
+++ b/src/pkg/big/nat_test.go
@@ -5,6 +5,7 @@
 package big

 import (
+	"fmt"
 	"os"
 	"strings"
 	"testing"
@@ -171,6 +172,36 @@ func BenchmarkMul(b *testing.B) {
 }


+func toString(x nat, charset string) string {
+	base := len(charset)
+
+	// special cases
+	switch {
+	case base < 2:
+		panic("illegal base")
+	case len(x) == 0:
+		return string(charset[0])
+	}
+
+	// allocate buffer for conversion
+	i := x.bitLen()/log2(Word(base)) + 1 // +1: round up
+	s := make([]byte, i)
+
+	// don't destroy x
+	q := nat(nil).set(x)
+
+	// convert
+	for len(q) > 0 {
+		i--
+		var r Word
+		q, r = q.divW(q, Word(base))
+		s[i] = charset[r]
+	}
+
+	return string(s[i:])
+}
+
+
 var strTests = []struct {
 	x nat    // nat value to be converted
 	c string // conversion charset
@@ -360,6 +391,187 @@ func BenchmarkScanPi(b *testing.B) {
 }


+const (
+	// 314**271
+	// base  2: 2249 digits
+	// base  8:  751 digits
+	// base 10:  678 digits
+	// base 16:  563 digits
+	shortBase     = 314
+	shortExponent = 271
+
+	// 3141**2178
+	// base  2: 31577 digits
+	// base  8: 10527 digits
+	// base 10:  9507 digits
+	// base 16:  7895 digits
+	mediumBase     = 3141
+	mediumExponent = 2718
+
+	// 3141**2178
+	// base  2: 406078 digits
+	// base  8: 135360 digits
+	// base 10: 122243 digits
+	// base 16: 101521 digits
+	longBase     = 31415
+	longExponent = 27182
+)
+
+
+func BenchmarkScanShort2(b *testing.B) {
+	ScanHelper(b, 2, shortBase, shortExponent)
+}
+
+
+func BenchmarkScanShort8(b *testing.B) {
+	ScanHelper(b, 8, shortBase, shortExponent)
+}
+
+
+func BenchmarkScanSort10(b *testing.B) {
+	ScanHelper(b, 10, shortBase, shortExponent)
+}
+
+
+func BenchmarkScanShort16(b *testing.B) {
+	ScanHelper(b, 16, shortBase, shortExponent)
+}
+
+
+func BenchmarkScanMedium2(b *testing.B) {
+	ScanHelper(b, 2, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkScanMedium8(b *testing.B) {
+	ScanHelper(b, 8, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkScanMedium10(b *testing.B) {
+	ScanHelper(b, 10, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkScanMedium16(b *testing.B) {
+	ScanHelper(b, 16, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkScanLong2(b *testing.B) {
+	ScanHelper(b, 2, longBase, longExponent)
+}
+
+
+func BenchmarkScanLong8(b *testing.B) {
+	ScanHelper(b, 8, longBase, longExponent)
+}
+
+
+func BenchmarkScanLong10(b *testing.B) {
+	ScanHelper(b, 10, longBase, longExponent)
+}
+
+
+func BenchmarkScanLong16(b *testing.B) {
+	ScanHelper(b, 16, longBase, longExponent)
+}
+
+
+func ScanHelper(b *testing.B, base int, xv, yv Word) {
+	b.StopTimer()
+	var x, y, z nat
+	x = x.setWord(xv)
+	y = y.setWord(yv)
+	z = z.expNN(x, y, nil)
+
+	var s string
+	s = z.string(lowercaseDigits[0:base])
+	if t := toString(z, lowercaseDigits[0:base]); t != s {
+		panic(fmt.Sprintf("scanning: got %s; want %s", s, t))
+	}
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		x.scan(strings.NewReader(s), base)
+	}
+}
+
+
+func BenchmarkStringShort2(b *testing.B) {
+	StringHelper(b, 2, shortBase, shortExponent)
+}
+
+
+func BenchmarkStringShort8(b *testing.B) {
+	StringHelper(b, 8, shortBase, shortExponent)
+}
+
+
+func BenchmarkStringShort10(b *testing.B) {
+	StringHelper(b, 10, shortBase, shortExponent)
+}
+
+
+func BenchmarkStringShort16(b *testing.B) {
+	StringHelper(b, 16, shortBase, shortExponent)
+}
+
+
+func BenchmarkStringMedium2(b *testing.B) {
+	StringHelper(b, 2, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkStringMedium8(b *testing.B) {
+	StringHelper(b, 8, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkStringMedium10(b *testing.B) {
+	StringHelper(b, 10, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkStringMedium16(b *testing.B) {
+	StringHelper(b, 16, mediumBase, mediumExponent)
+}
+
+
+func BenchmarkStringLong2(b *testing.B) {
+	StringHelper(b, 2, longBase, longExponent)
+}
+
+
+func BenchmarkStringLong8(b *testing.B) {
+	StringHelper(b, 8, longBase, longExponent)
+}
+
+
+func BenchmarkStringLong10(b *testing.B) {
+	StringHelper(b, 10, longBase, longExponent)
+}
+
+
+func BenchmarkStringLong16(b *testing.B) {
+	StringHelper(b, 16, longBase, longExponent)
+}
+
+
+func StringHelper(b *testing.B, base int, xv, yv Word) {
+	b.StopTimer()
+	var x, y, z nat
+	x = x.setWord(xv)
+	y = y.setWord(yv)
+	z = z.expNN(x, y, nil)
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		z.string(lowercaseDigits[0:base])
+	}
+}
+
+
 func TestLeadingZeros(t *testing.T) {
 	var x Word = _B >> 1
 	for i := 0; i <= _W; i++ {
@@ -479,7 +691,6 @@ func TestTrailingZeroBits(t *testing.T) {
 	}
 }

-
 var expNNTests = []struct {
 	x, y, m string
 	out     string