Commit a44c7282 authored by Carlos Eduardo Seo's avatar Carlos Eduardo Seo Committed by Lynn Boger

math/big: improve performance of addVW/subVW for ppc64x

This change adds a better implementation in asm for addVW/subVW for
ppc64x, with speedups up to 3.11x.

benchmark                    old ns/op     new ns/op     delta
BenchmarkAddVW/1-16          6.87          5.71          -16.89%
BenchmarkAddVW/2-16          7.72          5.94          -23.06%
BenchmarkAddVW/3-16          8.74          6.56          -24.94%
BenchmarkAddVW/4-16          9.66          7.26          -24.84%
BenchmarkAddVW/5-16          10.8          7.26          -32.78%
BenchmarkAddVW/10-16         17.4          9.97          -42.70%
BenchmarkAddVW/100-16        164           56.0          -65.85%
BenchmarkAddVW/1000-16       1638          524           -68.01%
BenchmarkAddVW/10000-16      16421         5201          -68.33%
BenchmarkAddVW/100000-16     165762        53324         -67.83%
BenchmarkSubVW/1-16          6.76          5.62          -16.86%
BenchmarkSubVW/2-16          7.69          6.02          -21.72%
BenchmarkSubVW/3-16          8.85          6.61          -25.31%
BenchmarkSubVW/4-16          10.0          7.34          -26.60%
BenchmarkSubVW/5-16          11.3          7.33          -35.13%
BenchmarkSubVW/10-16         19.5          18.7          -4.10%
BenchmarkSubVW/100-16        153           55.9          -63.46%
BenchmarkSubVW/1000-16       1502          519           -65.45%
BenchmarkSubVW/10000-16      15005         5165          -65.58%
BenchmarkSubVW/100000-16     150620        53124         -64.73%

benchmark                    old MB/s     new MB/s     speedup
BenchmarkAddVW/1-16          1165.12      1400.76      1.20x
BenchmarkAddVW/2-16          2071.39      2693.25      1.30x
BenchmarkAddVW/3-16          2744.72      3656.92      1.33x
BenchmarkAddVW/4-16          3311.63      4407.34      1.33x
BenchmarkAddVW/5-16          3700.52      5512.48      1.49x
BenchmarkAddVW/10-16         4605.63      8026.37      1.74x
BenchmarkAddVW/100-16        4856.15      14296.76     2.94x
BenchmarkAddVW/1000-16       4883.96      15264.21     3.13x
BenchmarkAddVW/10000-16      4871.52      15380.78     3.16x
BenchmarkAddVW/100000-16     4826.17      15002.48     3.11x
BenchmarkSubVW/1-16          1183.20      1423.03      1.20x
BenchmarkSubVW/2-16          2081.92      2657.44      1.28x
BenchmarkSubVW/3-16          2711.52      3632.30      1.34x
BenchmarkSubVW/4-16          3198.30      4360.30      1.36x
BenchmarkSubVW/5-16          3534.43      5460.40      1.54x
BenchmarkSubVW/10-16         4106.34      4273.51      1.04x
BenchmarkSubVW/100-16        5213.48      14306.32     2.74x
BenchmarkSubVW/1000-16       5324.27      15391.21     2.89x
BenchmarkSubVW/10000-16      5331.33      15486.57     2.90x
BenchmarkSubVW/100000-16     5311.35      15059.01     2.84x

Change-Id: Ibaa5b9b38d63fba8e01a9c327eb8bef1e6e908c1
Reviewed-on: https://go-review.googlesource.com/101975Reviewed-by: 's avatarLynn Boger <laboger@linux.vnet.ibm.com>
parent a42ea51a
......@@ -84,11 +84,155 @@ sublend:
MOVD R4, c+72(FP)
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB), NOSPLIT, $0
BR ·addVW_g(SB)
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R4 // R4 = y = c
MOVD z_len+8(FP), R11 // R11 = z_len
CMP R0, R11 // If z_len is zero, return
BEQ done
// We will process the first iteration out of the loop so we capture
// the value of c. In the subsequent iterations, we will rely on the
// value of CA set here.
MOVD 0(R8), R20 // R20 = x[i]
ADD $-1, R11 // R11 = z_len - 1
ADDC R20, R4, R6 // R6 = x[i] + c
CMP R0, R11 // If z_len was 1, we are done
MOVD R6, 0(R10) // z[i]
BEQ final
// We will read 4 elements per iteration
SRD $2, R11, R9 // R9 = z_len/4
DCBT (R8)
CMP R0, R9
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
loop:
MOVD 8(R8), R20 // R20 = x[i]
MOVD 16(R8), R21 // R21 = x[i+1]
MOVD 24(R8), R22 // R22 = x[i+2]
MOVDU 32(R8), R23 // R23 = x[i+3]
ADDZE R20, R24 // R24 = x[i] + CA
ADDZE R21, R25 // R25 = x[i+1] + CA
ADDZE R22, R26 // R26 = x[i+2] + CA
ADDZE R23, R27 // R27 = x[i+3] + CA
MOVD R24, 8(R10) // z[i]
MOVD R25, 16(R10) // z[i+1]
MOVD R26, 24(R10) // z[i+2]
MOVDU R27, 32(R10) // z[i+3]
ADD $-4, R11 // R11 = z_len - 4
BC 16, 0, loop // bdnz
// We may have some elements to read
CMP R0, R11
BEQ final
tail:
MOVDU 8(R8), R20
ADDZE R20, R24
ADD $-1, R11
MOVDU R24, 8(R10)
CMP R0, R11
BEQ final
MOVDU 8(R8), R20
ADDZE R20, R24
ADD $-1, R11
MOVDU R24, 8(R10)
CMP R0, R11
BEQ final
MOVD 8(R8), R20
ADDZE R20, R24
MOVD R24, 8(R10)
final:
ADDZE R0, R4 // c = CA
done:
MOVD R4, c+56(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB), NOSPLIT, $0
BR ·subVW_g(SB)
MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R4 // R4 = y = c
MOVD z_len+8(FP), R11 // R11 = z_len
CMP R0, R11 // If z_len is zero, return
BEQ done
// We will process the first iteration out of the loop so we capture
// the value of c. In the subsequent iterations, we will rely on the
// value of CA set here.
MOVD 0(R8), R20 // R20 = x[i]
ADD $-1, R11 // R11 = z_len - 1
SUBC R4, R20, R6 // R6 = x[i] - c
CMP R0, R11 // If z_len was 1, we are done
MOVD R6, 0(R10) // z[i]
BEQ final
// We will read 4 elements per iteration
SRD $2, R11, R9 // R9 = z_len/4
DCBT (R8)
CMP R0, R9
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
// The loop here is almost the same as the one used in s390x, but
// we don't need to capture CA every iteration because we've already
// done that above.
loop:
MOVD 8(R8), R20
MOVD 16(R8), R21
MOVD 24(R8), R22
MOVDU 32(R8), R23
SUBE R0, R20
SUBE R0, R21
SUBE R0, R22
SUBE R0, R23
MOVD R20, 8(R10)
MOVD R21, 16(R10)
MOVD R22, 24(R10)
MOVDU R23, 32(R10)
ADD $-4, R11
BC 16, 0, loop // bdnz
// We may have some elements to read
CMP R0, R11
BEQ final
tail:
MOVDU 8(R8), R20
SUBE R0, R20
ADD $-1, R11
MOVDU R20, 8(R10)
CMP R0, R11
BEQ final
MOVDU 8(R8), R20
SUBE R0, R20
ADD $-1, R11
MOVDU R20, 8(R10)
CMP R0, R11
BEQ final
MOVD 8(R8), R20
SUBE R0, R20
MOVD R20, 8(R10)
final:
// Capture CA
SUBE R4, R4
NEG R4, R4
done:
MOVD R4, c+56(FP)
RET
TEXT ·shlVU(SB), NOSPLIT, $0
BR ·shlVU_g(SB)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment