Commit 80b3ff9f authored by Robert Griesemer

math/big: faster assembly kernels for AddVx/SubVx for amd64.

Replaced use of rotate instructions (RCRQ, RCLQ) with ADDQ/SBBQ
for restoring/saving the carry flag per suggestion from Torbjörn
Granlund (author of GMP bignum libs for C).
The rotate instructions tend to be slower on today's machines.
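
Illustration (not part of the CL): the new kernels keep the carry between iterations in a register as 0 or -1, which is exactly what SBBQ CX, CX leaves behind; ADDQ CX, CX restores the flag because adding the all-ones mask to itself carries out, and a final NEGQ turns the mask back into the 0/1 result word. A minimal Go sketch of that arithmetic, with math/bits standing in for the carry flag:

```go
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	for _, cf := range []uint64{0, 1} {
		// SBBQ CX, CX computes CX = CX - CX - CF, i.e. 0 or all ones (-1).
		mask := -cf
		// ADDQ CX, CX adds the mask to itself; the addition carries out
		// exactly when the mask is all ones, so the carry flag is restored.
		_, restored := bits.Add64(mask, mask, 0)
		// NEGQ CX at the end turns the 0/-1 mask back into the 0/1 word
		// that the kernels return as c.
		c := -mask
		fmt.Println(cf, restored, c) // restored == cf, c == cf
	}
}
```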

benchmark              old ns/op     new ns/op     delta
BenchmarkAddVV_1       5.69          5.51          -3.16%
BenchmarkAddVV_2       7.15          6.87          -3.92%
BenchmarkAddVV_3       8.69          8.06          -7.25%
BenchmarkAddVV_4       8.10          8.13          +0.37%
BenchmarkAddVV_5       8.37          8.47          +1.19%
BenchmarkAddVV_1e1     13.1          12.0          -8.40%
BenchmarkAddVV_1e2     78.1          69.4          -11.14%
BenchmarkAddVV_1e3     815           656           -19.51%
BenchmarkAddVV_1e4     8137          7345          -9.73%
BenchmarkAddVV_1e5     100127        93909         -6.21%
BenchmarkAddVW_1       4.86          4.71          -3.09%
BenchmarkAddVW_2       5.67          5.50          -3.00%
BenchmarkAddVW_3       6.51          6.34          -2.61%
BenchmarkAddVW_4       6.69          6.66          -0.45%
BenchmarkAddVW_5       7.20          7.21          +0.14%
BenchmarkAddVW_1e1     10.0          9.34          -6.60%
BenchmarkAddVW_1e2     45.4          52.3          +15.20%
BenchmarkAddVW_1e3     417           491           +17.75%
BenchmarkAddVW_1e4     4760          4852          +1.93%
BenchmarkAddVW_1e5     69107         67717         -2.01%

benchmark              old MB/s      new MB/s      speedup
BenchmarkAddVV_1       11241.82      11610.28      1.03x
BenchmarkAddVV_2       17902.68      18631.82      1.04x
BenchmarkAddVV_3       22082.43      23835.64      1.08x
BenchmarkAddVV_4       31588.18      31492.06      1.00x
BenchmarkAddVV_5       38229.90      37783.17      0.99x
BenchmarkAddVV_1e1     48891.67      53340.91      1.09x
BenchmarkAddVV_1e2     81940.61      92191.86      1.13x
BenchmarkAddVV_1e3     78443.09      97480.44      1.24x
BenchmarkAddVV_1e4     78644.18      87129.50      1.11x
BenchmarkAddVV_1e5     63918.48      68150.84      1.07x
BenchmarkAddVW_1       13165.09      13581.00      1.03x
BenchmarkAddVW_2       22588.04      23275.41      1.03x
BenchmarkAddVW_3       29483.82      30303.96      1.03x
BenchmarkAddVW_4       38286.54      38453.21      1.00x
BenchmarkAddVW_5       44414.57      44370.59      1.00x
BenchmarkAddVW_1e1     63816.84      68494.08      1.07x
BenchmarkAddVW_1e2     140885.41     122427.16     0.87x
BenchmarkAddVW_1e3     153258.31     130325.28     0.85x
BenchmarkAddVW_1e4     134447.63     131904.02     0.98x
BenchmarkAddVW_1e5     92609.41      94509.88      1.02x

Change-Id: Ia473e9ab9c63a955c252426684176bca566645ae
Reviewed-on: https://go-review.googlesource.com/2503
Reviewed-by: Keith Randall <khr@golang.org>
parent 878fa886
@@ -7,6 +7,8 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
+// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVL x+0(FP), AX
...
@@ -7,16 +7,6 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
-// Literal instruction for MOVQ $0, CX.
-// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
-#define ZERO_CX BYTE $0x48; \
-	BYTE $0xc7; \
-	BYTE $0xc1; \
-	BYTE $0x00; \
-	BYTE $0x00; \
-	BYTE $0x00; \
-	BYTE $0x00
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVQ x+0(FP), AX
@@ -35,6 +25,11 @@ TEXT ·divWW(SB),NOSPLIT,$0
 	MOVQ DX, r+32(FP)
 	RET
+// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
+// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
+// This is faster than using rotate instructions.
+//
+// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
 // func addVV(z, x, y []Word) (c Word)
 TEXT ·addVV(SB),NOSPLIT,$0
@@ -52,7 +47,7 @@ TEXT ·addVV(SB),NOSPLIT,$0
 U1: // n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -65,7 +60,7 @@ U1: // n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $4, SI // i += 4
 	SUBQ $4, DI // n -= 4
@@ -75,17 +70,18 @@ V1: ADDQ $4, DI // n += 4
 	JLE E1 // if n <= 0 goto E1
 L1: // n > 0
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	ADCQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
 	JG L1 // if n > 0 goto L1
-E1: MOVQ CX, c+72(FP) // return c
+E1: NEGQ CX
+	MOVQ CX, c+72(FP) // return c
 	RET
@@ -106,7 +102,7 @@ TEXT ·subVV(SB),NOSPLIT,$0
 U2: // n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -119,7 +115,7 @@ U2: // n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $4, SI // i += 4
 	SUBQ $4, DI // n -= 4
@@ -129,17 +125,18 @@ V2: ADDQ $4, DI // n += 4
 	JLE E2 // if n <= 0 goto E2
 L2: // n > 0
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	SBBQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
 	JG L2 // if n > 0 goto L2
-E2: MOVQ CX, c+72(FP) // return c
+E2: NEGQ CX
+	MOVQ CX, c+72(FP) // return c
 	RET
@@ -163,11 +160,11 @@ U3: // n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	ADDQ CX, R11
-	ZERO_CX
 	ADCQ $0, R12
 	ADCQ $0, R13
 	ADCQ $0, R14
-	SETCS CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -183,8 +180,8 @@ V3: ADDQ $4, DI // n += 4
 L3: // n > 0
 	ADDQ 0(R8)(SI*8), CX
 	MOVQ CX, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
@@ -201,7 +198,7 @@ TEXT ·subVW(SB),NOSPLIT,$0
 	MOVQ x+24(FP), R8
 	MOVQ y+48(FP), CX // c = y
 	MOVQ z+0(FP), R10
 	MOVQ $0, SI // i = 0
 	// s/JL/JMP/ below to disable the unrolled loop
@@ -215,11 +212,11 @@ U4: // n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	SUBQ CX, R11
-	ZERO_CX
 	SBBQ $0, R12
 	SBBQ $0, R13
 	SBBQ $0, R14
-	SETCS CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -236,8 +233,8 @@ L4: // n > 0
 	MOVQ 0(R8)(SI*8), R11
 	SUBQ CX, R11
 	MOVQ R11, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
@@ -306,7 +303,7 @@ L9: MOVQ AX, DX // w = w1
 	SHRQ CX, DX:AX // w>>s | w1<<ŝ
 	MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
 	ADDQ $1, BX // i++
 E9: CMPQ BX, R11
 	JL L9 // i < n-1
...
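
For reference (not part of the change): on amd64 a Word is a uint64, and the kernels above, together with the carry convention described in the comment block added before addVV, compute the same word-vector primitives as the portable code in arith.go, returning the final carry or borrow word. A hedged Go sketch of that contract, written with math/bits rather than taken from the actual arith.go source:

```go
package main

import (
	"fmt"
	"math/bits"
)

// addVV mirrors what the assembly kernel computes: z = x + y element-wise
// with carry propagation, returning the final carry word (the value the
// kernel stores at c+72(FP)). Portable sketch only, not the arith.go source.
func addVV(z, x, y []uint64) (c uint64) {
	for i := range z {
		z[i], c = bits.Add64(x[i], y[i], c)
	}
	return
}

// subVV is the matching sketch for the subtraction kernel: z = x - y,
// returning the final borrow word.
func subVV(z, x, y []uint64) (c uint64) {
	for i := range z {
		z[i], c = bits.Sub64(x[i], y[i], c)
	}
	return
}

func main() {
	x := []uint64{^uint64(0), ^uint64(0)} // the two-word value 2^128 - 1
	y := []uint64{1, 0}
	z := make([]uint64, 2)
	fmt.Println(addVV(z, x, y), z) // carry 1, z = [0 0]
	fmt.Println(subVV(z, x, y), z) // borrow 0, z = [2^64-2 2^64-1]
}
```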