Commit 067acd51 authored by Robert Griesemer's avatar Robert Griesemer

math/big: faster "pure Go" addition/subtraction for long vectors

          (platforms w/o corresponding assembly kernels)

For short vector adds there's some erradic slow-down, but overall
these routines have become significantly faster. This only matters
for platforms w/o native (assembly) versions of these kernels, so
we are not concerned about the minor slow-down for short vectors.

This code was already reviewed under Mercurial (golang.org/cl/172810043)
but wasn't submitted before the switch to git.

Benchmarks run on 2.3GHz Intel Core i7, running OS X 10.9.5,
with the respective AddVV and AddVW assembly routines disabled.

benchmark              old ns/op     new ns/op     delta
BenchmarkAddVV_1       6.59          7.09          +7.59%
BenchmarkAddVV_2       10.3          10.1          -1.94%
BenchmarkAddVV_3       10.9          12.6          +15.60%
BenchmarkAddVV_4       13.9          15.6          +12.23%
BenchmarkAddVV_5       16.8          17.3          +2.98%
BenchmarkAddVV_1e1     29.5          29.9          +1.36%
BenchmarkAddVV_1e2     246           232           -5.69%
BenchmarkAddVV_1e3     2374          2185          -7.96%
BenchmarkAddVV_1e4     58942         22292         -62.18%
BenchmarkAddVV_1e5     668622        225279        -66.31%
BenchmarkAddVW_1       6.81          5.58          -18.06%
BenchmarkAddVW_2       7.69          6.86          -10.79%
BenchmarkAddVW_3       9.56          8.32          -12.97%
BenchmarkAddVW_4       12.1          9.53          -21.24%
BenchmarkAddVW_5       13.2          10.9          -17.42%
BenchmarkAddVW_1e1     23.4          18.0          -23.08%
BenchmarkAddVW_1e2     175           141           -19.43%
BenchmarkAddVW_1e3     1568          1266          -19.26%
BenchmarkAddVW_1e4     15425         12596         -18.34%
BenchmarkAddVW_1e5     156737        133539        -14.80%
BenchmarkFibo          381678466     132958666     -65.16%

benchmark              old MB/s     new MB/s     speedup
BenchmarkAddVV_1       9715.25      9028.30      0.93x
BenchmarkAddVV_2       12461.72     12622.60     1.01x
BenchmarkAddVV_3       17549.64     15243.82     0.87x
BenchmarkAddVV_4       18392.54     16398.29     0.89x
BenchmarkAddVV_5       18995.23     18496.57     0.97x
BenchmarkAddVV_1e1     21708.98     21438.28     0.99x
BenchmarkAddVV_1e2     25956.53     27506.88     1.06x
BenchmarkAddVV_1e3     26947.93     29286.66     1.09x
BenchmarkAddVV_1e4     10857.96     28709.46     2.64x
BenchmarkAddVV_1e5     9571.91      28409.21     2.97x
BenchmarkAddVW_1       1175.28      1433.98      1.22x
BenchmarkAddVW_2       2080.01      2332.54      1.12x
BenchmarkAddVW_3       2509.28      2883.97      1.15x
BenchmarkAddVW_4       2646.09      3356.83      1.27x
BenchmarkAddVW_5       3020.69      3671.07      1.22x
BenchmarkAddVW_1e1     3425.76      4441.40      1.30x
BenchmarkAddVW_1e2     4553.17      5642.96      1.24x
BenchmarkAddVW_1e3     5100.14      6318.72      1.24x
BenchmarkAddVW_1e4     5186.15      6350.96      1.22x
BenchmarkAddVW_1e5     5104.07      5990.74      1.17x

Change-Id: I7a62023b1105248a0e85e5b9819d3fd4266123d4
Reviewed-on: https://go-review.googlesource.com/2480Reviewed-by: 's avatarRuss Cox <rsc@golang.org>
Reviewed-by: 's avatarAlan Donovan <adonovan@google.com>
parent 80b3ff9f
......@@ -70,7 +70,7 @@ func mulWW_g(x, y Word) (z1, z0 Word) {
// z1<<_W + z0 = x*y + c
func mulAddWWW_g(x, y, c Word) (z1, z0 Word) {
z1, zz0 := mulWW(x, y)
z1, zz0 := mulWW_g(x, y)
if z0 = zz0 + c; z0 < zz0 {
z1++
}
......@@ -154,32 +154,82 @@ func divWW_g(u1, u0, v Word) (q, r Word) {
return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s
}
// Keep for performance debugging.
// Using addWW_g is likely slower.
const use_addWW_g = false
// The resulting carry c is either 0 or 1.
func addVV_g(z, x, y []Word) (c Word) {
for i := range z {
c, z[i] = addWW_g(x[i], y[i], c)
if use_addWW_g {
for i := range z {
c, z[i] = addWW_g(x[i], y[i], c)
}
return
}
for i, xi := range x[:len(z)] {
yi := y[i]
zi := xi + yi + c
z[i] = zi
// see "Hacker's Delight", section 2-12 (overflow detection)
c = (xi&yi | (xi|yi)&^zi) >> (_W - 1)
}
return
}
// The resulting carry c is either 0 or 1.
func subVV_g(z, x, y []Word) (c Word) {
for i := range z {
c, z[i] = subWW_g(x[i], y[i], c)
if use_addWW_g {
for i := range z {
c, z[i] = subWW_g(x[i], y[i], c)
}
return
}
for i, xi := range x[:len(z)] {
yi := y[i]
zi := xi - yi - c
z[i] = zi
// see "Hacker's Delight", section 2-12 (overflow detection)
c = (yi&^xi | (yi|^xi)&zi) >> (_W - 1)
}
return
}
// Argument y must be either 0 or 1.
// The resulting carry c is either 0 or 1.
func addVW_g(z, x []Word, y Word) (c Word) {
if use_addWW_g {
c = y
for i := range z {
c, z[i] = addWW_g(x[i], c, 0)
}
return
}
c = y
for i := range z {
c, z[i] = addWW_g(x[i], c, 0)
for i, xi := range x[:len(z)] {
zi := xi + c
z[i] = zi
c = xi &^ zi >> (_W - 1)
}
return
}
func subVW_g(z, x []Word, y Word) (c Word) {
if use_addWW_g {
c = y
for i := range z {
c, z[i] = subWW_g(x[i], c, 0)
}
return
}
c = y
for i := range z {
c, z[i] = subWW_g(x[i], c, 0)
for i, xi := range x[:len(z)] {
zi := xi - c
z[i] = zi
c = (zi &^ xi) >> (_W - 1)
}
return
}
......@@ -222,6 +272,7 @@ func mulAddVWW_g(z, x []Word, y, r Word) (c Word) {
return
}
// TODO(gri) Remove use of addWW_g here and then we can remove addWW_g and subWW_g.
func addMulVVW_g(z, x []Word, y Word) (c Word) {
for i := range z {
z1, z0 := mulAddWWW_g(x[i], y, z[i])
......
......@@ -254,7 +254,7 @@ func benchmarkFunVW(b *testing.B, f funVW, n int) {
x := rndV(n)
y := rndW()
z := make([]Word, n)
b.SetBytes(int64(n * _W))
b.SetBytes(int64(n * _S))
b.ResetTimer()
for i := 0; i < b.N; i++ {
f(z, x, y)
......
......@@ -769,3 +769,55 @@ func BenchmarkExp3Power0x10000(b *testing.B) { ExpHelper(b, 3, 0x10000) }
func BenchmarkExp3Power0x40000(b *testing.B) { ExpHelper(b, 3, 0x40000) }
func BenchmarkExp3Power0x100000(b *testing.B) { ExpHelper(b, 3, 0x100000) }
func BenchmarkExp3Power0x400000(b *testing.B) { ExpHelper(b, 3, 0x400000) }
func fibo(n int) nat {
switch n {
case 0:
return nil
case 1:
return nat{1}
}
f0 := fibo(0)
f1 := fibo(1)
var f2 nat
for i := 1; i < n; i++ {
f2 = f2.add(f0, f1)
f0, f1, f2 = f1, f2, f0
}
return f1
}
var fiboNums = []string{
"0",
"55",
"6765",
"832040",
"102334155",
"12586269025",
"1548008755920",
"190392490709135",
"23416728348467685",
"2880067194370816120",
"354224848179261915075",
}
func TestFibo(t *testing.T) {
for i, want := range fiboNums {
n := i * 10
got := fibo(n).decimalString()
if got != want {
t.Errorf("fibo(%d) failed: got %s want %s", n, got, want)
}
}
}
func BenchmarkFibo(b *testing.B) {
for i := 0; i < b.N; i++ {
fibo(1e0)
fibo(1e1)
fibo(1e2)
fibo(1e3)
fibo(1e4)
fibo(1e5)
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment