Commit 56a7c5b9 authored by Josh Bleecher Snyder's avatar Josh Bleecher Snyder

math/big: add partial arm64 assembly support

benchmark                       old ns/op      new ns/op      delta
BenchmarkAddVV_1                18.7           14.8           -20.86%
BenchmarkAddVV_2                21.8           16.6           -23.85%
BenchmarkAddVV_3                26.1           17.1           -34.48%
BenchmarkAddVV_4                30.4           21.9           -27.96%
BenchmarkAddVV_5                35.5           19.8           -44.23%
BenchmarkAddVV_1e1              63.0           28.3           -55.08%
BenchmarkAddVV_1e2              593            178            -69.98%
BenchmarkAddVV_1e3              5691           1490           -73.82%
BenchmarkAddVV_1e4              56868          20761          -63.49%
BenchmarkAddVV_1e5              569062         207679         -63.51%
BenchmarkAddVW_1                15.8           12.6           -20.25%
BenchmarkAddVW_2                17.8           13.1           -26.40%
BenchmarkAddVW_3                21.2           13.9           -34.43%
BenchmarkAddVW_4                23.6           14.7           -37.71%
BenchmarkAddVW_5                26.0           15.8           -39.23%
BenchmarkAddVW_1e1              41.3           21.6           -47.70%
BenchmarkAddVW_1e2              383            145            -62.14%
BenchmarkAddVW_1e3              3703           1264           -65.87%
BenchmarkAddVW_1e4              36920          14359          -61.11%
BenchmarkAddVW_1e5              370345         143046         -61.37%
BenchmarkAddMulVVW_1            33.2           32.5           -2.11%
BenchmarkAddMulVVW_2            58.0           57.2           -1.38%
BenchmarkAddMulVVW_3            95.2           93.9           -1.37%
BenchmarkAddMulVVW_4            108            106            -1.85%
BenchmarkAddMulVVW_5            159            156            -1.89%
BenchmarkAddMulVVW_1e1          344            340            -1.16%
BenchmarkAddMulVVW_1e2          3644           3624           -0.55%
BenchmarkAddMulVVW_1e3          37344          37208          -0.36%
BenchmarkAddMulVVW_1e4          373295         372170         -0.30%
BenchmarkAddMulVVW_1e5          3438116        3425606        -0.36%
BenchmarkBitLen0                7.21           4.32           -40.08%
BenchmarkBitLen1                6.49           4.32           -33.44%
BenchmarkBitLen2                7.23           4.32           -40.25%
BenchmarkBitLen3                6.49           4.32           -33.44%
BenchmarkBitLen4                7.22           4.32           -40.17%
BenchmarkBitLen5                6.52           4.33           -33.59%
BenchmarkBitLen8                7.22           4.32           -40.17%
BenchmarkBitLen9                6.49           4.32           -33.44%
BenchmarkBitLen16               8.66           4.32           -50.12%
BenchmarkBitLen17               7.95           4.32           -45.66%
BenchmarkBitLen31               8.69           4.32           -50.29%
BenchmarkGCD10x10               5021           5033           +0.24%
BenchmarkGCD10x100              5571           5572           +0.02%
BenchmarkGCD10x1000             6707           6729           +0.33%
BenchmarkGCD10x10000            13526          13419          -0.79%
BenchmarkGCD10x100000           85668          83242          -2.83%
BenchmarkGCD100x100             24196          23936          -1.07%
BenchmarkGCD100x1000            28802          27309          -5.18%
BenchmarkGCD100x10000           64111          51704          -19.35%
BenchmarkGCD100x100000          385840         274385         -28.89%
BenchmarkGCD1000x1000           262892         236269         -10.13%
BenchmarkGCD1000x10000          371393         277883         -25.18%
BenchmarkGCD1000x100000         1311795        589055         -55.10%
BenchmarkGCD10000x10000         9596740        6123930        -36.19%
BenchmarkGCD10000x100000        16404000       7269610        -55.68%
BenchmarkGCD100000x100000       776660000      419270000      -46.02%
BenchmarkHilbert                13478980       13402270       -0.57%
BenchmarkBinomial               9802           9440           -3.69%
BenchmarkBitset                 142            142            +0.00%
BenchmarkBitsetNeg              328            279            -14.94%
BenchmarkBitsetOrig             853            861            +0.94%
BenchmarkBitsetNegOrig          1489           1444           -3.02%
BenchmarkMul                    420949000      410481000      -2.49%
BenchmarkExp3Power0x10          1148           1229           +7.06%
BenchmarkExp3Power0x40          1322           1376           +4.08%
BenchmarkExp3Power0x100         2437           2486           +2.01%
BenchmarkExp3Power0x400         9456           9346           -1.16%
BenchmarkExp3Power0x1000        113623         108701         -4.33%
BenchmarkExp3Power0x4000        1134933        1101481        -2.95%
BenchmarkExp3Power0x10000       10773570       10396160       -3.50%
BenchmarkExp3Power0x40000       101362100      97788300       -3.53%
BenchmarkExp3Power0x100000      921114000      885249000      -3.89%
BenchmarkExp3Power0x400000      8323094000     7969020000     -4.25%
BenchmarkFibo                   322021600      92554450       -71.26%
BenchmarkScanPi                 1264583        321065         -74.61%
BenchmarkStringPiParallel       1644661        554216         -66.30%
BenchmarkScan10Base2            1111           1080           -2.79%
BenchmarkScan100Base2           6645           6345           -4.51%
BenchmarkScan1000Base2          84084          62405          -25.78%
BenchmarkScan10000Base2         3105998        932551         -69.98%
BenchmarkScan100000Base2        257234800      40113333       -84.41%
BenchmarkScan10Base8            571            573            +0.35%
BenchmarkScan100Base8           2810           2543           -9.50%
BenchmarkScan1000Base8          47383          25834          -45.48%
BenchmarkScan10000Base8         2739518        567203         -79.30%
BenchmarkScan100000Base8        253952400      36495680       -85.63%
BenchmarkScan10Base10           553            556            +0.54%
BenchmarkScan100Base10          2640           2385           -9.66%
BenchmarkScan1000Base10         50865          24049          -52.72%
BenchmarkScan10000Base10        3279916        549313         -83.25%
BenchmarkScan100000Base10       309121000      36213140       -88.29%
BenchmarkScan10Base16           478            483            +1.05%
BenchmarkScan100Base16          2353           2144           -8.88%
BenchmarkScan1000Base16         48091          24246          -49.58%
BenchmarkScan10000Base16        2858886        586475         -79.49%
BenchmarkScan100000Base16       266320000      38190500       -85.66%
BenchmarkString10Base2          736            730            -0.82%
BenchmarkString100Base2         2695           2707           +0.45%
BenchmarkString1000Base2        20549          20388          -0.78%
BenchmarkString10000Base2       212638         210782         -0.87%
BenchmarkString100000Base2      1944963        1938033        -0.36%
BenchmarkString10Base8          524            517            -1.34%
BenchmarkString100Base8         1326           1320           -0.45%
BenchmarkString1000Base8        8213           8249           +0.44%
BenchmarkString10000Base8       72204          72092          -0.16%
BenchmarkString100000Base8      769068         765993         -0.40%
BenchmarkString10Base10         1018           982            -3.54%
BenchmarkString100Base10        3485           3206           -8.01%
BenchmarkString1000Base10       37102          18935          -48.97%
BenchmarkString10000Base10      188633         88637          -53.01%
BenchmarkString100000Base10     124490300      19700940       -84.17%
BenchmarkString10Base16         509            502            -1.38%
BenchmarkString100Base16        1084           1098           +1.29%
BenchmarkString1000Base16       5641           5650           +0.16%
BenchmarkString10000Base16      46900          46745          -0.33%
BenchmarkString100000Base16     508957         505840         -0.61%
BenchmarkLeafSize0              8934320        8149465        -8.78%
BenchmarkLeafSize1              237666         118381         -50.19%
BenchmarkLeafSize2              237807         117854         -50.44%
BenchmarkLeafSize3              1688640        353494         -79.07%
BenchmarkLeafSize4              235676         116196         -50.70%
BenchmarkLeafSize5              2121896        430325         -79.72%
BenchmarkLeafSize6              1682306        351775         -79.09%
BenchmarkLeafSize7              1051847        251436         -76.10%
BenchmarkLeafSize8              232697         115674         -50.29%
BenchmarkLeafSize9              2403616        488443         -79.68%
BenchmarkLeafSize10             2120975        429545         -79.75%
BenchmarkLeafSize11             2023789        426525         -78.92%
BenchmarkLeafSize12             1684830        351985         -79.11%
BenchmarkLeafSize13             1465529        337906         -76.94%
BenchmarkLeafSize14             1050498        253872         -75.83%
BenchmarkLeafSize15             683228         197384         -71.11%
BenchmarkLeafSize16             232496         116026         -50.10%
BenchmarkLeafSize32             245841         126671         -48.47%
BenchmarkLeafSize64             301728         190285         -36.93%

Change-Id: I63e63297896d96b89c9a275b893c2b405a7e105d
Reviewed-on: https://go-review.googlesource.com/9260Reviewed-by: 's avatarDavid Crawshaw <crawshaw@golang.org>
parent 1f65c9c1
...@@ -9,38 +9,169 @@ ...@@ -9,38 +9,169 @@
// This file provides fast assembly versions for the elementary // This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go. // arithmetic operations on vectors implemented in arith.go.
// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),NOSPLIT,$0 TEXT ·mulWW(SB),NOSPLIT,$0
B ·mulWW_g(SB) MOVD x+0(FP), R0
MOVD y+8(FP), R1
MUL R0, R1, R2
UMULH R0, R1, R3
MOVD R3, z1+16(FP)
MOVD R2, z0+24(FP)
RET
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB),NOSPLIT,$0 TEXT ·divWW(SB),NOSPLIT,$0
B ·divWW_g(SB) B ·divWW_g(SB) // ARM64 has no multiword division
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0 TEXT ·addVV(SB),NOSPLIT,$0
B ·addVV_g(SB) MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
ADDS $0, R0 // clear carry flag
loop:
CBZ R0, done // careful not to touch the carry flag
MOVD.P 8(R1), R4
MOVD.P 8(R2), R5
ADCS R4, R5
MOVD.P R5, 8(R3)
SUB $1, R0
B loop
done:
CSET HS, R0 // extract carry flag
MOVD R0, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
TEXT ·subVV(SB),NOSPLIT,$0 TEXT ·subVV(SB),NOSPLIT,$0
B ·subVV_g(SB) MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
CMP R0, R0 // set carry flag
loop:
CBZ R0, done // careful not to touch the carry flag
MOVD.P 8(R1), R4
MOVD.P 8(R2), R5
SBCS R5, R4
MOVD.P R4, 8(R3)
SUB $1, R0
B loop
done:
CSET LO, R0 // extract carry flag
MOVD R0, c+72(FP)
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0 TEXT ·addVW(SB),NOSPLIT,$0
B ·addVW_g(SB) MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
CBZ R0, return_y
MOVD.P 8(R1), R4
ADDS R2, R4
MOVD.P R4, 8(R3)
SUB $1, R0
loop:
CBZ R0, done // careful not to touch the carry flag
MOVD.P 8(R1), R4
ADCS $0, R4
MOVD.P R4, 8(R3)
SUB $1, R0
B loop
done:
CSET HS, R0 // extract carry flag
MOVD R0, c+56(FP)
RET
return_y: // z is empty; copy y to c
MOVD R2, c+56(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB),NOSPLIT,$0 TEXT ·subVW(SB),NOSPLIT,$0
B ·subVW_g(SB) MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
CBZ R0, rety
MOVD.P 8(R1), R4
SUBS R2, R4
MOVD.P R4, 8(R3)
SUB $1, R0
loop:
CBZ R0, done // careful not to touch the carry flag
MOVD.P 8(R1), R4
SBCS $0, R4
MOVD.P R4, 8(R3)
SUB $1, R0
B loop
done:
CSET LO, R0 // extract carry flag
MOVD R0, c+56(FP)
RET
rety: // z is empty; copy y to c
MOVD R2, c+56(FP)
RET
// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB),NOSPLIT,$0 TEXT ·shlVU(SB),NOSPLIT,$0
B ·shlVU_g(SB) B ·shlVU_g(SB)
// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB),NOSPLIT,$0 TEXT ·shrVU(SB),NOSPLIT,$0
B ·shrVU_g(SB) B ·shrVU_g(SB)
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·mulAddVWW(SB),NOSPLIT,$0
B ·mulAddVWW_g(SB) MOVD z+0(FP), R1
MOVD z_len+8(FP), R0
MOVD x+24(FP), R2
MOVD y+48(FP), R3
MOVD r+56(FP), R4
loop:
CBZ R0, done
MOVD.P 8(R2), R5
UMULH R5, R3, R7
MUL R5, R3, R6
ADDS R4, R6
ADC $0, R7
MOVD.P R6, 8(R1)
MOVD R7, R4
SUB $1, R0
B loop
done:
MOVD R4, c+64(FP)
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0
B ·addMulVVW_g(SB) B ·addMulVVW_g(SB)
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),NOSPLIT,$0 TEXT ·divWVW(SB),NOSPLIT,$0
B ·divWVW_g(SB) B ·divWVW_g(SB)
// func bitLen(x Word) (n int)
TEXT ·bitLen(SB),NOSPLIT,$0 TEXT ·bitLen(SB),NOSPLIT,$0
B ·bitLen_g(SB) MOVD x+0(FP), R0
CLZ R0, R0
MOVD $64, R1
SUB R0, R1, R0
MOVD R0, n+8(FP)
RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment