Commit 49551472 authored by Ethan Miller's avatar Ethan Miller Committed by Michael Munday

math/big: add assembly implementation of arith for ppc64{le}

The existing implementation used a pure go implementation, leading to slow
cryptographic performance.

Implemented mulWW, subVV, mulAddVWW, addMulVVW, and bitLen for
ppc64{le}.
Implemented divWW for ppc64le only, as the DIVDEU instruction is only
available on Power8 or newer.

benchcmp output:

benchmark                         old ns/op     new ns/op     delta
BenchmarkSignP384                 28934360      10877330      -62.41%
BenchmarkRSA2048Decrypt           41261033      5139930       -87.54%
BenchmarkRSA2048Sign              45231300      7610985       -83.17%
Benchmark3PrimeRSA2048Decrypt     20487300      2481408       -87.89%

Fixes #16621

Change-Id: If8b68963bb49909bde832f2bda08a3791c4f5b7a
Reviewed-on: https://go-review.googlesource.com/26951
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarMichael Munday <munday@ca.ibm.com>
parent 0a7c73b5
......@@ -483,6 +483,10 @@ const (
ACMPWU
ADIVD
ADIVDCC
ADIVDE
ADIVDECC
ADIVDEU
ADIVDEUCC
ADIVDVCC
ADIVDV
ADIVDU
......
......@@ -242,6 +242,10 @@ var Anames = []string{
"CMPWU",
"DIVD",
"DIVDCC",
"DIVDE",
"DIVDECC",
"DIVDEU",
"DIVDEUCC",
"DIVDVCC",
"DIVDV",
"DIVDU",
......
......@@ -1009,6 +1009,10 @@ func buildop(ctxt *obj.Link) {
opset(AMULLDV, r0)
opset(ADIVD, r0)
opset(ADIVDCC, r0)
opset(ADIVDE, r0)
opset(ADIVDEU, r0)
opset(ADIVDECC, r0)
opset(ADIVDEUCC, r0)
opset(ADIVDVCC, r0)
opset(ADIVDV, r0)
opset(ADIVDU, r0)
......@@ -2670,6 +2674,18 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
case AREMDCC, ADIVDCC:
return OPVCC(31, 489, 0, 1)
case ADIVDE:
return OPVCC(31, 425, 0, 0)
case ADIVDECC:
return OPVCC(31, 425, 0, 1)
case ADIVDEU:
return OPVCC(31, 393, 0, 0)
case ADIVDEUCC:
return OPVCC(31, 393, 0, 1)
case AREMDV, ADIVDV:
return OPVCC(31, 489, 1, 0)
......
......@@ -54,6 +54,18 @@ func BenchmarkSignP256(b *testing.B) {
}
}
func BenchmarkSignP384(b *testing.B) {
b.ResetTimer()
p384 := elliptic.P384()
hashed := []byte("testing")
priv, _ := GenerateKey(p384, rand.Reader)
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, _, _ = Sign(rand.Reader, priv, hashed)
}
}
func BenchmarkVerifyP256(b *testing.B) {
b.ResetTimer()
p256 := elliptic.P256()
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !math_big_pure_go,ppc64
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
TEXT ·divWW(SB), NOSPLIT, $0
BR ·divWW_g(SB)
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !math_big_pure_go,ppc64le
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func divWW(x1, x0, y Word) (q, r Word)
TEXT ·divWW(SB), NOSPLIT, $0
MOVD x1+0(FP), R4
MOVD x0+8(FP), R5
MOVD y+16(FP), R6
CMPU R4, R6
BGE divbigger
// from the programmer's note in ch. 3 of the ISA manual, p.74
DIVDEU R6, R4, R3
DIVDU R6, R5, R7
MULLD R6, R3, R8
MULLD R6, R7, R20
SUB R20, R5, R10
ADD R7, R3, R3
SUB R8, R10, R4
CMPU R4, R10
BLT adjust
CMPU R4, R6
BLT end
adjust:
MOVD $1, R21
ADD R21, R3, R3
SUB R6, R4, R4
end:
MOVD R3, q+24(FP)
MOVD R4, r+32(FP)
RET
divbigger:
MOVD $-1, R7
MOVD R7, q+24(FP)
MOVD R7, r+32(FP)
RET
......@@ -9,38 +9,178 @@
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
TEXT ·mulWW(SB),NOSPLIT,$0
BR ·mulWW_g(SB)
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB), NOSPLIT, $0
MOVD x+0(FP), R4
MOVD y+8(FP), R5
MULHDU R4, R5, R6
MULLD R4, R5, R7
MOVD R6, z1+16(FP)
MOVD R7, z0+24(FP)
RET
TEXT ·divWW(SB),NOSPLIT,$0
BR ·divWW_g(SB)
TEXT ·addVV(SB),NOSPLIT,$0
TEXT ·addVV(SB), NOSPLIT, $0
BR ·addVV_g(SB)
TEXT ·subVV(SB),NOSPLIT,$0
BR ·subVV_g(SB)
// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R7
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R10
MOVD $0, R4 // c = 0
MOVD $0, R5 // i = 0
MOVD $1, R29 // work around lack of ADDI
MOVD $8, R28 // work around lack of scaled addressing
SUBC R0, R0 // clear CA
JMP sublend
// amd64 saves and restores CF, but I believe they only have to do that because all of
// their math operations clobber it - we should just be able to recover it at the end.
subloop:
MULLD R5, R28, R6
MOVD (R8)(R6), R11 // x[i]
MOVD (R9)(R6), R12 // y[i]
SUBE R12, R11, R15
MOVD R15, (R10)(R6)
TEXT ·addVW(SB),NOSPLIT,$0
ADD R29, R5 // i++
sublend:
CMP R5, R7
BLT subloop
ADDZE R4
XOR R29, R4
MOVD R4, c+72(FP)
RET
TEXT ·addVW(SB), NOSPLIT, $0
BR ·addVW_g(SB)
TEXT ·subVW(SB),NOSPLIT,$0
TEXT ·subVW(SB), NOSPLIT, $0
BR ·subVW_g(SB)
TEXT ·shlVU(SB),NOSPLIT,$0
TEXT ·shlVU(SB), NOSPLIT, $0
BR ·shlVU_g(SB)
TEXT ·shrVU(SB),NOSPLIT,$0
TEXT ·shrVU(SB), NOSPLIT, $0
BR ·shrVU_g(SB)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
BR ·mulAddVWW_g(SB)
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R10
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD r+56(FP), R4 // c = r
MOVD z_len+8(FP), R11
MOVD $0, R3 // i = 0
MOVD $8, R18
MOVD $1, R19
JMP e5
l5:
MULLD R18, R3, R5
MOVD (R8)(R5), R20
MULLD R9, R20, R6
MULHDU R9, R20, R7
ADDC R4, R6
ADDZE R7
MOVD R6, (R10)(R5)
MOVD R7, R4
ADD R19, R3
e5:
CMP R3, R11
BLT l5
MOVD R4, c+64(FP)
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD z+0(FP), R10
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z_len+8(FP), R22
MOVD $0, R5 // i = 0
MOVD $0, R4 // c = 0
MOVD $8, R28
MOVD $-2, R23
AND R22, R23 // mask the last bit of z.len
MOVD $2, R24
CMP R23, R24
BGE unrolled
JMP end
unrolled:
MOVD $8, R19 // no (RA)(RB*8) on power
MULLD R5, R19
MOVD (R10)(R19), R11 // R11 = z[i]
MOVD (R8)(R19), R16 // R16 = x[i]
ADD R28, R19, R25
MOVD (R10)(R25), R17
MOVD (R8)(R25), R18
MULLD R9, R16, R12
MULHDU R9, R16, R14
MULLD R9, R18, R6
MULHDU R9, R18, R7
ADDC R4, R12
ADDZE R14
ADDC R11, R12 // z[i] = (x[i]*y) + z[i] + carry
ADDZE R14 // carry = high order bits + add carry
MOVD R12, (R10)(R19)
ADDC R14, R6
ADDZE R7
ADDC R17, R6
ADDZE R7
MOVD R6, (R10)(R25)
MOVD R7, R4
ADD R24, R5
CMP R5, R23
BLT unrolled
JMP end
loop:
MOVD $8, R19
MULLD R5, R19
MOVD (R10)(R19), R11
MOVD (R8)(R19), R16
MULLD R9, R16, R12
MULHDU R9, R16, R14
ADDC R4, R12
ADDZE R14
ADDC R11, R12
ADDZE R14
MOVD R12, (R10)(R19)
MOVD R14, R4
MOVD $1, R15
ADD R15, R5
end:
CMP R5, R22
BLT loop
TEXT ·addMulVVW(SB),NOSPLIT,$0
BR ·addMulVVW_g(SB)
MOVD R4, c+56(FP)
RET
TEXT ·divWVW(SB),NOSPLIT,$0
TEXT ·divWVW(SB), NOSPLIT, $0
BR ·divWVW_g(SB)
TEXT ·bitLen(SB),NOSPLIT,$0
BR ·bitLen_g(SB)
// func bitLen(x Word) int
TEXT ·bitLen(SB), NOSPLIT, $0
MOVD x+0(FP), R4
CNTLZD R4, R4
MOVD $64, R5
SUB R4, R5
MOVD R5, n+8(FP)
RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment