Commit 654a185f authored by Robert Griesemer's avatar Robert Griesemer

math/big: faster assembly kernels for AddVx/SubVx for 386.

(analog to Change-Id: Ia473e9ab9c63a955c252426684176bca566645ae)

Fixes #9243.

benchmark              old ns/op     new ns/op     delta
BenchmarkAddVV_1       5.76          5.60          -2.78%
BenchmarkAddVV_2       7.17          6.98          -2.65%
BenchmarkAddVV_3       8.69          8.57          -1.38%
BenchmarkAddVV_4       10.5          10.5          +0.00%
BenchmarkAddVV_5       13.3          11.6          -12.78%
BenchmarkAddVV_1e1     20.4          19.3          -5.39%
BenchmarkAddVV_1e2     166           140           -15.66%
BenchmarkAddVV_1e3     1588          1278          -19.52%
BenchmarkAddVV_1e4     16138         12657         -21.57%
BenchmarkAddVV_1e5     167608        127836        -23.73%
BenchmarkAddVW_1       4.87          4.76          -2.26%
BenchmarkAddVW_2       6.10          6.07          -0.49%
BenchmarkAddVW_3       7.75          7.65          -1.29%
BenchmarkAddVW_4       9.30          9.39          +0.97%
BenchmarkAddVW_5       10.8          10.9          +0.93%
BenchmarkAddVW_1e1     18.8          18.8          +0.00%
BenchmarkAddVW_1e2     143           134           -6.29%
BenchmarkAddVW_1e3     1390          1266          -8.92%
BenchmarkAddVW_1e4     13877         12545         -9.60%
BenchmarkAddVW_1e5     155330        125432        -19.25%

benchmark              old MB/s     new MB/s     speedup
BenchmarkAddVV_1       5556.09      5715.12      1.03x
BenchmarkAddVV_2       8926.55      9170.64      1.03x
BenchmarkAddVV_3       11042.15     11201.77     1.01x
BenchmarkAddVV_4       12168.21     12245.50     1.01x
BenchmarkAddVV_5       12041.39     13805.73     1.15x
BenchmarkAddVV_1e1     15659.65     16548.18     1.06x
BenchmarkAddVV_1e2     19268.57     22728.64     1.18x
BenchmarkAddVV_1e3     20141.45     25033.36     1.24x
BenchmarkAddVV_1e4     19827.86     25281.92     1.28x
BenchmarkAddVV_1e5     19092.06     25031.92     1.31x
BenchmarkAddVW_1       822.12       840.92       1.02x
BenchmarkAddVW_2       1310.89      1317.89      1.01x
BenchmarkAddVW_3       1549.31      1568.26      1.01x
BenchmarkAddVW_4       1720.45      1703.77      0.99x
BenchmarkAddVW_5       1857.12      1828.66      0.98x
BenchmarkAddVW_1e1     2126.39      2132.38      1.00x
BenchmarkAddVW_1e2     2784.49      2969.21      1.07x
BenchmarkAddVW_1e3     2876.89      3157.35      1.10x
BenchmarkAddVW_1e4     2882.32      3188.51      1.11x
BenchmarkAddVW_1e5     2575.16      3188.96      1.24x

(measured on OS X 10.9.5, 2.3 GHz Intel Core i7, 8GB 1333 MHz DDR3)

Change-Id: I46698729d5e0bc3e277aa0146a9d7a086c0c26f1
Reviewed-on: https://go-review.googlesource.com/2560Reviewed-by: 's avatarKeith Randall <khr@golang.org>
parent 06ed8f0d
...@@ -7,8 +7,6 @@ ...@@ -7,8 +7,6 @@
// This file provides fast assembly versions for the elementary // This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go. // arithmetic operations on vectors implemented in arith.go.
// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
// func mulWW(x, y Word) (z1, z0 Word) // func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),NOSPLIT,$0 TEXT ·mulWW(SB),NOSPLIT,$0
MOVL x+0(FP), AX MOVL x+0(FP), AX
...@@ -39,15 +37,16 @@ TEXT ·addVV(SB),NOSPLIT,$0 ...@@ -39,15 +37,16 @@ TEXT ·addVV(SB),NOSPLIT,$0
JMP E1 JMP E1
L1: MOVL (SI)(BX*4), AX L1: MOVL (SI)(BX*4), AX
RCRL $1, DX ADDL DX, DX // restore CF
ADCL (CX)(BX*4), AX ADCL (CX)(BX*4), AX
RCLL $1, DX SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4) MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++ ADDL $1, BX // i++
E1: CMPL BX, BP // i < n E1: CMPL BX, BP // i < n
JL L1 JL L1
NEGL DX
MOVL DX, c+36(FP) MOVL DX, c+36(FP)
RET RET
...@@ -64,15 +63,16 @@ TEXT ·subVV(SB),NOSPLIT,$0 ...@@ -64,15 +63,16 @@ TEXT ·subVV(SB),NOSPLIT,$0
JMP E2 JMP E2
L2: MOVL (SI)(BX*4), AX L2: MOVL (SI)(BX*4), AX
RCRL $1, DX ADDL DX, DX // restore CF
SBBL (CX)(BX*4), AX SBBL (CX)(BX*4), AX
RCLL $1, DX SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4) MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++ ADDL $1, BX // i++
E2: CMPL BX, BP // i < n E2: CMPL BX, BP // i < n
JL L2 JL L2
NEGL DX
MOVL DX, c+36(FP) MOVL DX, c+36(FP)
RET RET
...@@ -88,8 +88,8 @@ TEXT ·addVW(SB),NOSPLIT,$0 ...@@ -88,8 +88,8 @@ TEXT ·addVW(SB),NOSPLIT,$0
L3: ADDL (SI)(BX*4), AX L3: ADDL (SI)(BX*4), AX
MOVL AX, (DI)(BX*4) MOVL AX, (DI)(BX*4)
RCLL $1, AX SBBL AX, AX // save CF
ANDL $1, AX NEGL AX
ADDL $1, BX // i++ ADDL $1, BX // i++
E3: CMPL BX, BP // i < n E3: CMPL BX, BP // i < n
...@@ -108,11 +108,11 @@ TEXT ·subVW(SB),NOSPLIT,$0 ...@@ -108,11 +108,11 @@ TEXT ·subVW(SB),NOSPLIT,$0
MOVL $0, BX // i = 0 MOVL $0, BX // i = 0
JMP E4 JMP E4
L4: MOVL (SI)(BX*4), DX // TODO(gri) is there a reverse SUBL? L4: MOVL (SI)(BX*4), DX
SUBL AX, DX SUBL AX, DX
MOVL DX, (DI)(BX*4) MOVL DX, (DI)(BX*4)
RCLL $1, AX SBBL AX, AX // save CF
ANDL $1, AX NEGL AX
ADDL $1, BX // i++ ADDL $1, BX // i++
E4: CMPL BX, BP // i < n E4: CMPL BX, BP // i < n
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment