Commit 80b3ff9f authored by Robert Griesemer

math/big: faster assembly kernels for AddVx/SubVx for amd64.

Replaced use of rotate instructions (RCRQ, RCLQ) with ADDQ/SBBQ
for restoring/saving the carry flag per suggestion from Torbjörn
Granlund (author of GMP bignum libs for C).
The rotate instructions tend to be slower on today's machines.
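
Illustration (not part of the CL): the new kernels keep the carry between iterations in a register as 0 or -1, which is exactly what SBBQ CX, CX leaves behind; ADDQ CX, CX restores the flag because adding the all-ones mask to itself carries out, and a final NEGQ turns the mask back into the 0/1 result word. A minimal Go sketch of that arithmetic, with math/bits standing in for the carry flag:

```go
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	for _, cf := range []uint64{0, 1} {
		// SBBQ CX, CX computes CX = CX - CX - CF, i.e. 0 or all ones (-1).
		mask := -cf
		// ADDQ CX, CX adds the mask to itself; the addition carries out
		// exactly when the mask is all ones, so the carry flag is restored.
		_, restored := bits.Add64(mask, mask, 0)
		// NEGQ CX at the end turns the 0/-1 mask back into the 0/1 word
		// that the kernels return as c.
		c := -mask
		fmt.Println(cf, restored, c) // restored == cf, c == cf
	}
}
```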

benchmark              old ns/op     new ns/op     delta
BenchmarkAddVV_1       5.69          5.51          -3.16%
BenchmarkAddVV_2       7.15          6.87          -3.92%
BenchmarkAddVV_3       8.69          8.06          -7.25%
BenchmarkAddVV_4       8.10          8.13          +0.37%
BenchmarkAddVV_5       8.37          8.47          +1.19%
BenchmarkAddVV_1e1     13.1          12.0          -8.40%
BenchmarkAddVV_1e2     78.1          69.4          -11.14%
BenchmarkAddVV_1e3     815           656           -19.51%
BenchmarkAddVV_1e4     8137          7345          -9.73%
BenchmarkAddVV_1e5     100127        93909         -6.21%
BenchmarkAddVW_1       4.86          4.71          -3.09%
BenchmarkAddVW_2       5.67          5.50          -3.00%
BenchmarkAddVW_3       6.51          6.34          -2.61%
BenchmarkAddVW_4       6.69          6.66          -0.45%
BenchmarkAddVW_5       7.20          7.21          +0.14%
BenchmarkAddVW_1e1     10.0          9.34          -6.60%
BenchmarkAddVW_1e2     45.4          52.3          +15.20%
BenchmarkAddVW_1e3     417           491           +17.75%
BenchmarkAddVW_1e4     4760          4852          +1.93%
BenchmarkAddVW_1e5     69107         67717         -2.01%

benchmark              old MB/s      new MB/s      speedup
BenchmarkAddVV_1       11241.82      11610.28      1.03x
BenchmarkAddVV_2       17902.68      18631.82      1.04x
BenchmarkAddVV_3       22082.43      23835.64      1.08x
BenchmarkAddVV_4       31588.18      31492.06      1.00x
BenchmarkAddVV_5       38229.90      37783.17      0.99x
BenchmarkAddVV_1e1     48891.67      53340.91      1.09x
BenchmarkAddVV_1e2     81940.61      92191.86      1.13x
BenchmarkAddVV_1e3     78443.09      97480.44      1.24x
BenchmarkAddVV_1e4     78644.18      87129.50      1.11x
BenchmarkAddVV_1e5     63918.48      68150.84      1.07x
BenchmarkAddVW_1       13165.09      13581.00      1.03x
BenchmarkAddVW_2       22588.04      23275.41      1.03x
BenchmarkAddVW_3       29483.82      30303.96      1.03x
BenchmarkAddVW_4       38286.54      38453.21      1.00x
BenchmarkAddVW_5       44414.57      44370.59      1.00x
BenchmarkAddVW_1e1     63816.84      68494.08      1.07x
BenchmarkAddVW_1e2     140885.41     122427.16     0.87x
BenchmarkAddVW_1e3     153258.31     130325.28     0.85x
BenchmarkAddVW_1e4     134447.63     131904.02     0.98x
BenchmarkAddVW_1e5     92609.41      94509.88      1.02x

Change-Id: Ia473e9ab9c63a955c252426684176bca566645ae
Reviewed-on: https://go-review.googlesource.com/2503
Reviewed-by: Keith Randall <khr@golang.org>
parent 878fa886
@@ -7,6 +7,8 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
+// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVL x+0(FP), AX
...
@@ -7,16 +7,6 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
-// Literal instruction for MOVQ $0, CX.
-// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
-#define ZERO_CX BYTE $0x48; \
-	BYTE $0xc7; \
-	BYTE $0xc1; \
-	BYTE $0x00; \
-	BYTE $0x00; \
-	BYTE $0x00; \
-	BYTE $0x00
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVQ x+0(FP), AX
@@ -35,6 +25,11 @@ TEXT ·divWW(SB),NOSPLIT,$0
 	MOVQ DX, r+32(FP)
 	RET
+// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
+// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
+// This is faster than using rotate instructions.
+//
+// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
 // func addVV(z, x, y []Word) (c Word)
 TEXT ·addVV(SB),NOSPLIT,$0
@@ -52,7 +47,7 @@ TEXT ·addVV(SB),NOSPLIT,$0
 U1: // n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -65,7 +60,7 @@ U1: // n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $4, SI // i += 4
 	SUBQ $4, DI // n -= 4
@@ -75,17 +70,18 @@ V1: ADDQ $4, DI // n += 4
 	JLE E1 // if n <= 0 goto E1
 L1: // n > 0
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	ADCQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
 	JG L1 // if n > 0 goto L1
-E1: MOVQ CX, c+72(FP) // return c
+E1: NEGQ CX
+	MOVQ CX, c+72(FP) // return c
 	RET
@@ -106,7 +102,7 @@ TEXT ·subVV(SB),NOSPLIT,$0
 U2: // n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -119,7 +115,7 @@ U2: // n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $4, SI // i += 4
 	SUBQ $4, DI // n -= 4
@@ -129,17 +125,18 @@ V2: ADDQ $4, DI // n += 4
 	JLE E2 // if n <= 0 goto E2
 L2: // n > 0
-	RCRQ $1, CX // CF = c
+	ADDQ CX, CX // restore CF
 	MOVQ 0(R8)(SI*8), R11
 	SBBQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
 	JG L2 // if n > 0 goto L2
-E2: MOVQ CX, c+72(FP) // return c
+E2: NEGQ CX
+	MOVQ CX, c+72(FP) // return c
 	RET
@@ -163,11 +160,11 @@ U3: // n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	ADDQ CX, R11
-	ZERO_CX
 	ADCQ $0, R12
 	ADCQ $0, R13
 	ADCQ $0, R14
-	SETCS CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -183,8 +180,8 @@ V3: ADDQ $4, DI // n += 4
 L3: // n > 0
 	ADDQ 0(R8)(SI*8), CX
 	MOVQ CX, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
@@ -201,7 +198,7 @@ TEXT ·subVW(SB),NOSPLIT,$0
 	MOVQ x+24(FP), R8
 	MOVQ y+48(FP), CX // c = y
 	MOVQ z+0(FP), R10
 	MOVQ $0, SI // i = 0
 	// s/JL/JMP/ below to disable the unrolled loop
@@ -215,11 +212,11 @@ U4: // n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	SUBQ CX, R11
-	ZERO_CX
 	SBBQ $0, R12
 	SBBQ $0, R13
 	SBBQ $0, R14
-	SETCS CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -236,8 +233,8 @@ L4: // n > 0
 	MOVQ 0(R8)(SI*8), R11
 	SUBQ CX, R11
 	MOVQ R11, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX // c = CF
+	SBBQ CX, CX // save CF
+	NEGQ CX
 	ADDQ $1, SI // i++
 	SUBQ $1, DI // n--
@@ -306,7 +303,7 @@ L9: MOVQ AX, DX // w = w1
 	SHRQ CX, DX:AX // w>>s | w1<<ŝ
 	MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
 	ADDQ $1, BX // i++
 E9: CMPQ BX, R11
 	JL L9 // i < n-1
...
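
For reference (not part of the change): on amd64 a Word is a uint64, and the kernels above, together with the carry convention described in the comment block added before addVV, compute the same word-vector primitives as the portable code in arith.go, returning the final carry or borrow word. A hedged Go sketch of that contract, written with math/bits rather than taken from the actual arith.go source:

```go
package main

import (
	"fmt"
	"math/bits"
)

// addVV mirrors what the assembly kernel computes: z = x + y element-wise
// with carry propagation, returning the final carry word (the value the
// kernel stores at c+72(FP)). Portable sketch only, not the arith.go source.
func addVV(z, x, y []uint64) (c uint64) {
	for i := range z {
		z[i], c = bits.Add64(x[i], y[i], c)
	}
	return
}

// subVV is the matching sketch for the subtraction kernel: z = x - y,
// returning the final borrow word.
func subVV(z, x, y []uint64) (c uint64) {
	for i := range z {
		z[i], c = bits.Sub64(x[i], y[i], c)
	}
	return
}

func main() {
	x := []uint64{^uint64(0), ^uint64(0)} // the two-word value 2^128 - 1
	y := []uint64{1, 0}
	z := make([]uint64, 2)
	fmt.Println(addVV(z, x, y), z) // carry 1, z = [0 0]
	fmt.Println(subVV(z, x, y), z) // borrow 0, z = [2^64-2 2^64-1]
}
```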