math/big: improve performance of addVW/subVW for ppc64x

This change adds a better implementation in asm for addVW/subVW for ppc64x, with speedups up to 3.11x. benchmark old ns/op new ns/op delta BenchmarkAddVW/1-16 6.87 5.71 -16.89% BenchmarkAddVW/2-16 7.72 5.94 -23.06% BenchmarkAddVW/3-16 8.74 6.56 -24.94% BenchmarkAddVW/4-16 9.66 7.26 -24.84% BenchmarkAddVW/5-16 10.8 7.26 -32.78% BenchmarkAddVW/10-16 17.4 9.97 -42.70% BenchmarkAddVW/100-16 164 56.0 -65.85% BenchmarkAddVW/1000-16 1638 524 -68.01% BenchmarkAddVW/10000-16 16421 5201 -68.33% BenchmarkAddVW/100000-16 165762 53324 -67.83% BenchmarkSubVW/1-16 6.76 5.62 -16.86% BenchmarkSubVW/2-16 7.69 6.02 -21.72% BenchmarkSubVW/3-16 8.85 6.61 -25.31% BenchmarkSubVW/4-16 10.0 7.34 -26.60% BenchmarkSubVW/5-16 11.3 7.33 -35.13% BenchmarkSubVW/10-16 19.5 18.7 -4.10% BenchmarkSubVW/100-16 153 55.9 -63.46% BenchmarkSubVW/1000-16 1502 519 -65.45% BenchmarkSubVW/10000-16 15005 5165 -65.58% BenchmarkSubVW/100000-16 150620 53124 -64.73% benchmark old MB/s new MB/s speedup BenchmarkAddVW/1-16 1165.12 1400.76 1.20x BenchmarkAddVW/2-16 2071.39 2693.25 1.30x BenchmarkAddVW/3-16 2744.72 3656.92 1.33x BenchmarkAddVW/4-16 3311.63 4407.34 1.33x BenchmarkAddVW/5-16 3700.52 5512.48 1.49x BenchmarkAddVW/10-16 4605.63 8026.37 1.74x BenchmarkAddVW/100-16 4856.15 14296.76 2.94x BenchmarkAddVW/1000-16 4883.96 15264.21 3.13x BenchmarkAddVW/10000-16 4871.52 15380.78 3.16x BenchmarkAddVW/100000-16 4826.17 15002.48 3.11x BenchmarkSubVW/1-16 1183.20 1423.03 1.20x BenchmarkSubVW/2-16 2081.92 2657.44 1.28x BenchmarkSubVW/3-16 2711.52 3632.30 1.34x BenchmarkSubVW/4-16 3198.30 4360.30 1.36x BenchmarkSubVW/5-16 3534.43 5460.40 1.54x BenchmarkSubVW/10-16 4106.34 4273.51 1.04x BenchmarkSubVW/100-16 5213.48 14306.32 2.74x BenchmarkSubVW/1000-16 5324.27 15391.21 2.89x BenchmarkSubVW/10000-16 5331.33 15486.57 2.90x BenchmarkSubVW/100000-16 5311.35 15059.01 2.84x Change-Id: Ibaa5b9b38d63fba8e01a9c327eb8bef1e6e908c1 Reviewed-on: https://go-review.googlesource.com/101975Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>

math/big: improve performance of addVW/subVW for ppc64x
This change adds a better implementation in asm for addVW/subVW for ppc64x, with speedups up to 3.11x. benchmark old ns/op new ns/op delta BenchmarkAddVW/1-16 6.87 5.71 -16.89% BenchmarkAddVW/2-16 7.72 5.94 -23.06% BenchmarkAddVW/3-16 8.74 6.56 -24.94% BenchmarkAddVW/4-16 9.66 7.26 -24.84% BenchmarkAddVW/5-16 10.8 7.26 -32.78% BenchmarkAddVW/10-16 17.4 9.97 -42.70% BenchmarkAddVW/100-16 164 56.0 -65.85% BenchmarkAddVW/1000-16 1638 524 -68.01% BenchmarkAddVW/10000-16 16421 5201 -68.33% BenchmarkAddVW/100000-16 165762 53324 -67.83% BenchmarkSubVW/1-16 6.76 5.62 -16.86% BenchmarkSubVW/2-16 7.69 6.02 -21.72% BenchmarkSubVW/3-16 8.85 6.61 -25.31% BenchmarkSubVW/4-16 10.0 7.34 -26.60% BenchmarkSubVW/5-16 11.3 7.33 -35.13% BenchmarkSubVW/10-16 19.5 18.7 -4.10% BenchmarkSubVW/100-16 153 55.9 -63.46% BenchmarkSubVW/1000-16 1502 519 -65.45% BenchmarkSubVW/10000-16 15005 5165 -65.58% BenchmarkSubVW/100000-16 150620 53124 -64.73% benchmark old MB/s new MB/s speedup BenchmarkAddVW/1-16 1165.12 1400.76 1.20x BenchmarkAddVW/2-16 2071.39 2693.25 1.30x BenchmarkAddVW/3-16 2744.72 3656.92 1.33x BenchmarkAddVW/4-16 3311.63 4407.34 1.33x BenchmarkAddVW/5-16 3700.52 5512.48 1.49x BenchmarkAddVW/10-16 4605.63 8026.37 1.74x BenchmarkAddVW/100-16 4856.15 14296.76 2.94x BenchmarkAddVW/1000-16 4883.96 15264.21 3.13x BenchmarkAddVW/10000-16 4871.52 15380.78 3.16x BenchmarkAddVW/100000-16 4826.17 15002.48 3.11x BenchmarkSubVW/1-16 1183.20 1423.03 1.20x BenchmarkSubVW/2-16 2081.92 2657.44 1.28x BenchmarkSubVW/3-16 2711.52 3632.30 1.34x BenchmarkSubVW/4-16 3198.30 4360.30 1.36x BenchmarkSubVW/5-16 3534.43 5460.40 1.54x BenchmarkSubVW/10-16 4106.34 4273.51 1.04x BenchmarkSubVW/100-16 5213.48 14306.32 2.74x BenchmarkSubVW/1000-16 5324.27 15391.21 2.89x BenchmarkSubVW/10000-16 5331.33 15486.57 2.90x BenchmarkSubVW/100000-16 5311.35 15059.01 2.84x Change-Id: Ibaa5b9b38d63fba8e01a9c327eb8bef1e6e908c1 Reviewed-on: https://go-review.googlesource.com/101975Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
a44c7282 · Carlos Eduardo Seo · Lynn Boger · a42ea51a · a44c7282
Commit a44c7282 authored Mar 19, 2018 by Carlos Eduardo Seo Committed by Lynn Boger Mar 27, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 146 additions and 2 deletions

arith_ppc64x.s src/math/big/arith_ppc64x.s +146 -2

No files found.
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -84,11 +84,155 @@ sublend:
 	MOVD  R4, c+72(FP)
 	RET

+// func addVW(z, x []Word, y Word) (c Word)
 TEXT ·addVW(SB), NOSPLIT, $0
-	BR ·addVW_g(SB)
+	MOVD z+0(FP), R10	// R10 = z[]
+	MOVD x+24(FP), R8	// R8 = x[]
+	MOVD y+48(FP), R4	// R4 = y = c
+	MOVD z_len+8(FP), R11	// R11 = z_len
+
+	CMP   R0, R11		// If z_len is zero, return
+	BEQ   done

+	// We will process the first iteration out of the loop so we capture
+	// the value of c. In the subsequent iterations, we will rely on the
+	// value of CA set here.
+	MOVD  0(R8), R20	// R20 = x[i]
+	ADD   $-1, R11		// R11 = z_len - 1
+	ADDC  R20, R4, R6	// R6 = x[i] + c
+	CMP   R0, R11		// If z_len was 1, we are done
+	MOVD  R6, 0(R10)	// z[i]
+	BEQ   final
+
+	// We will read 4 elements per iteration
+	SRD   $2, R11, R9	// R9 = z_len/4
+	DCBT  (R8)
+	CMP   R0, R9
+	MOVD  R9, CTR		// Set up the loop counter
+	BEQ   tail		// If R9 = 0, we can't use the loop
+
+loop:
+	MOVD  8(R8), R20	// R20 = x[i]
+	MOVD  16(R8), R21	// R21 = x[i+1]
+	MOVD  24(R8), R22	// R22 = x[i+2]
+	MOVDU 32(R8), R23	// R23 = x[i+3]
+	ADDZE R20, R24		// R24 = x[i] + CA
+	ADDZE R21, R25		// R25 = x[i+1] + CA
+	ADDZE R22, R26		// R26 = x[i+2] + CA
+	ADDZE R23, R27		// R27 = x[i+3] + CA
+	MOVD  R24, 8(R10)	// z[i]
+	MOVD  R25, 16(R10)	// z[i+1]
+	MOVD  R26, 24(R10)	// z[i+2]
+	MOVDU R27, 32(R10)	// z[i+3]
+	ADD   $-4, R11		// R11 = z_len - 4
+	BC    16, 0, loop	// bdnz
+
+	// We may have some elements to read
+	CMP R0, R11
+	BEQ final
+
+tail:
+	MOVDU 8(R8), R20
+	ADDZE R20, R24
+	ADD $-1, R11
+	MOVDU R24, 8(R10)
+	CMP R0, R11
+	BEQ final
+
+	MOVDU 8(R8), R20
+	ADDZE R20, R24
+	ADD $-1, R11
+	MOVDU R24, 8(R10)
+	CMP R0, R11
+	BEQ final
+
+	MOVD 8(R8), R20
+	ADDZE R20, R24
+	MOVD R24, 8(R10)
+
+final:
+	ADDZE R0, R4		// c = CA
+done:
+	MOVD  R4, c+56(FP)
+	RET
+
+// func subVW(z, x []Word, y Word) (c Word)
 TEXT ·subVW(SB), NOSPLIT, $0
-	BR ·subVW_g(SB)
+	MOVD  z+0(FP), R10	// R10 = z[]
+	MOVD  x+24(FP), R8	// R8 = x[]
+	MOVD  y+48(FP), R4	// R4 = y = c
+	MOVD  z_len+8(FP), R11	// R11 = z_len
+
+	CMP   R0, R11		// If z_len is zero, return
+	BEQ   done
+
+	// We will process the first iteration out of the loop so we capture
+	// the value of c. In the subsequent iterations, we will rely on the
+	// value of CA set here.
+	MOVD  0(R8), R20	// R20 = x[i]
+	ADD   $-1, R11		// R11 = z_len - 1
+	SUBC  R4, R20, R6	// R6 = x[i] - c
+	CMP   R0, R11		// If z_len was 1, we are done
+	MOVD  R6, 0(R10)	// z[i]
+	BEQ   final
+
+	// We will read 4 elements per iteration
+	SRD   $2, R11, R9	// R9 = z_len/4
+	DCBT  (R8)
+	CMP   R0, R9
+	MOVD  R9, CTR		// Set up the loop counter
+	BEQ   tail		// If R9 = 0, we can't use the loop
+
+	// The loop here is almost the same as the one used in s390x, but
+	// we don't need to capture CA every iteration because we've already
+	// done that above.
+loop:
+	MOVD  8(R8), R20
+	MOVD  16(R8), R21
+	MOVD  24(R8), R22
+	MOVDU 32(R8), R23
+	SUBE  R0, R20
+	SUBE  R0, R21
+	SUBE  R0, R22
+	SUBE  R0, R23
+	MOVD  R20, 8(R10)
+	MOVD  R21, 16(R10)
+	MOVD  R22, 24(R10)
+	MOVDU R23, 32(R10)
+	ADD   $-4, R11
+	BC    16, 0, loop	// bdnz
+
+	// We may have some elements to read
+	CMP   R0, R11
+	BEQ   final
+
+tail:
+	MOVDU 8(R8), R20
+	SUBE  R0, R20
+	ADD   $-1, R11
+	MOVDU R20, 8(R10)
+	CMP   R0, R11
+	BEQ   final
+
+	MOVDU 8(R8), R20
+	SUBE  R0, R20
+	ADD   $-1, R11
+	MOVDU R20, 8(R10)
+	CMP   R0, R11
+	BEQ   final
+
+	MOVD  8(R8), R20
+	SUBE  R0, R20
+	MOVD  R20, 8(R10)
+
+final:
+	// Capture CA
+	SUBE  R4, R4
+	NEG   R4, R4
+
+done:
+	MOVD  R4, c+56(FP)
+	RET

 TEXT ·shlVU(SB), NOSPLIT, $0
 	BR ·shlVU_g(SB)