math/big: add assembly implementation of arith for ppc64{le}

The existing implementation was written in pure Go, leading to slow
cryptographic performance.

Implemented mulWW, subVV, mulAddVWW, addMulVVW, and bitLen for
ppc64{le}. Implemented divWW for ppc64le only, as the DIVDEU
instruction is only available on Power8 or newer.

benchcmp output:

benchmark                         old ns/op     new ns/op     delta
BenchmarkSignP384                 28934360      10877330      -62.41%
BenchmarkRSA2048Decrypt           41261033      5139930       -87.54%
BenchmarkRSA2048Sign              45231300      7610985       -83.17%
Benchmark3PrimeRSA2048Decrypt     20487300      2481408       -87.89%

Fixes #16621

Change-Id: If8b68963bb49909bde832f2bda08a3791c4f5b7a
Reviewed-on: https://go-review.googlesource.com/26951
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>

math/big: add assembly implementation of arith for ppc64{le}
The existing implementation was written in pure Go, leading to slow
cryptographic performance.

Implemented mulWW, subVV, mulAddVWW, addMulVVW, and bitLen for
ppc64{le}. Implemented divWW for ppc64le only, as the DIVDEU
instruction is only available on Power8 or newer.

benchcmp output:

benchmark                         old ns/op     new ns/op     delta
BenchmarkSignP384                 28934360      10877330      -62.41%
BenchmarkRSA2048Decrypt           41261033      5139930       -87.54%
BenchmarkRSA2048Sign              45231300      7610985       -83.17%
Benchmark3PrimeRSA2048Decrypt     20487300      2481408       -87.89%

Fixes #16621

Change-Id: If8b68963bb49909bde832f2bda08a3791c4f5b7a
Reviewed-on: https://go-review.googlesource.com/26951
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>
49551472 · Ethan Miller · Michael Munday · 0a7c73b5 · 49551472 · 49551472
Commit 49551472 authored Aug 12, 2016 by Ethan Miller Committed by Michael Munday Aug 29, 2016
7 changed files
--- a/src/cmd/internal/obj/ppc64/a.out.go
+++ b/src/cmd/internal/obj/ppc64/a.out.go
@@ -483,6 +483,10 @@ const (
 	ACMPWU
 	ADIVD
 	ADIVDCC
+	ADIVDE
+	ADIVDECC
+	ADIVDEU
+	ADIVDEUCC
 	ADIVDVCC
 	ADIVDV
 	ADIVDU

--- a/src/cmd/internal/obj/ppc64/anames.go
+++ b/src/cmd/internal/obj/ppc64/anames.go
@@ -242,6 +242,10 @@ var Anames = []string{
 	"CMPWU",
 	"DIVD",
 	"DIVDCC",
+	"DIVDE",
+	"DIVDECC",
+	"DIVDEU",
+	"DIVDEUCC",
 	"DIVDVCC",
 	"DIVDV",
 	"DIVDU",

--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -1009,6 +1009,10 @@ func buildop(ctxt *obj.Link) {
 			opset(AMULLDV, r0)
 			opset(ADIVD, r0)
 			opset(ADIVDCC, r0)
+			opset(ADIVDE, r0)
+			opset(ADIVDEU, r0)
+			opset(ADIVDECC, r0)
+			opset(ADIVDEUCC, r0)
 			opset(ADIVDVCC, r0)
 			opset(ADIVDV, r0)
 			opset(ADIVDU, r0)
@@ -2670,6 +2674,18 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
 	case AREMDCC, ADIVDCC:
 		return OPVCC(31, 489, 0, 1)

+	case ADIVDE:
+		return OPVCC(31, 425, 0, 0)
+
+	case ADIVDECC:
+		return OPVCC(31, 425, 0, 1)
+
+	case ADIVDEU:
+		return OPVCC(31, 393, 0, 0)
+
+	case ADIVDEUCC:
+		return OPVCC(31, 393, 0, 1)
+
 	case AREMDV, ADIVDV:
 		return OPVCC(31, 489, 1, 0)


--- a/src/crypto/ecdsa/ecdsa_test.go
+++ b/src/crypto/ecdsa/ecdsa_test.go
@@ -54,6 +54,18 @@ func BenchmarkSignP256(b *testing.B) {
 	}
 }

+func BenchmarkSignP384(b *testing.B) { // times Sign on the P-384 curve; key generation is setup only
+	priv, err := GenerateKey(elliptic.P384(), rand.Reader)
+	if err != nil {
+		b.Fatal(err) // fail fast instead of timing Sign with an unusable key
+	}
+	hashed := []byte("testing")
+	b.ResetTimer() // exclude the setup above from the measurement
+	for i := 0; i < b.N; i++ {
+		_, _, _ = Sign(rand.Reader, priv, hashed)
+	}
+}
+
 func BenchmarkVerifyP256(b *testing.B) {
 	b.ResetTimer()
 	p256 := elliptic.P256()

--- a/src/math/big/arith_ppc64.s
+++ b/src/math/big/arith_ppc64.s
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go,ppc64
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// func divWW(x1, x0, y Word) (q, r Word)
+// The ppc64 build has no assembly version of divWW: this stub branches
+// straight to divWW_g, which is defined outside this file (presumably the
+// Go implementation in arith.go).
+TEXT ·divWW(SB), NOSPLIT, $0
+	BR ·divWW_g(SB)
+
--- a/src/math/big/arith_ppc64le.s
+++ b/src/math/big/arith_ppc64le.s
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go,ppc64le
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// func divWW(x1, x0, y Word) (q, r Word)
+// Divides the two-word value x1:x0 (x1 is the high word) by y, storing the
+// quotient in q and the remainder in r. When x1 >= y (unsigned) no division
+// is performed and both q and r are set to all ones.
+TEXT ·divWW(SB), NOSPLIT, $0
+	MOVD x1+0(FP), R4 // R4 = x1
+	MOVD x0+8(FP), R5 // R5 = x0
+	MOVD y+16(FP), R6 // R6 = y
+
+	CMPU R4, R6    // unsigned compare of x1 with y
+	BGE  divbigger // x1 >= y: the quotient would not fit in one word
+
+	// from the programmer's note in ch. 3 of the ISA manual, p.74
+	DIVDEU R6, R4, R3   // R3 = q1 = (x1 << 64) / y (divide extended unsigned)
+	DIVDU  R6, R5, R7   // R7 = q2 = x0 / y
+	MULLD  R6, R3, R8   // R8 = low 64 bits of q1*y
+	MULLD  R6, R7, R20  // R20 = q2*y
+	SUB    R20, R5, R10 // R10 = r2 = x0 - q2*y
+	ADD    R7, R3, R3   // R3 = q1 + q2, the candidate quotient
+	SUB    R8, R10, R4  // R4 = r2 - low(q1*y), i.e. r1 + r2 modulo 2^64 (r1 = remainder of the extended divide)
+	CMPU   R4, R10
+	BLT    adjust       // R4 < r2 (unsigned): the remainder sum wrapped past 2^64
+	CMPU   R4, R6
+	BLT    end          // candidate remainder is already below y
+
+adjust:
+	// The candidate remainder is y or more: move one more y into the quotient.
+	MOVD $1, R21
+	ADD  R21, R3, R3 // q++
+	SUB  R6, R4, R4  // r -= y
+
+end:
+	MOVD R3, q+24(FP)
+	MOVD R4, r+32(FP)
+
+	RET
+
+divbigger:
+	MOVD $-1, R7      // all ones
+	MOVD R7, q+24(FP)
+	MOVD R7, r+32(FP)
+	RET
+
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -9,38 +9,178 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.

-TEXT ·mulWW(SB),NOSPLIT,$0
-	BR ·mulWW_g(SB)
+// func mulWW(x, y Word) (z1, z0 Word)
+TEXT ·mulWW(SB), NOSPLIT, $0 // z1:z0 = x * y, the full two-word unsigned product
+	MOVD   x+0(FP), R4
+	MOVD   y+8(FP), R5
+	MULHDU R4, R5, R6    // R6 = high 64 bits of the unsigned product
+	MULLD  R4, R5, R7    // R7 = low 64 bits of the product
+	MOVD   R6, z1+16(FP) // z1 = high word
+	MOVD   R7, z0+24(FP) // z0 = low word
+	RET

-TEXT ·divWW(SB),NOSPLIT,$0
-	BR ·divWW_g(SB)
-
-TEXT ·addVV(SB),NOSPLIT,$0
+TEXT ·addVV(SB), NOSPLIT, $0 // no assembly version here: branches to addVV_g, defined outside this file
 	BR ·addVV_g(SB)

-TEXT ·subVV(SB),NOSPLIT,$0
-	BR ·subVV_g(SB)
+// func subVV(z, x, y []Word) (c Word)
+// z[i] = x[i] - y[i] for all i < len(z), propagating the borrow; c is the final borrow (0 or 1)
+TEXT ·subVV(SB), NOSPLIT, $0
+	MOVD z_len+8(FP), R7 // R7 = len(z); x and y are indexed over the same range
+	MOVD x+24(FP), R8    // R8 = &x[0]
+	MOVD y+48(FP), R9    // R9 = &y[0]
+	MOVD z+0(FP), R10    // R10 = &z[0]
+
+	MOVD $0, R4  // c = 0
+	MOVD $0, R5  // i = 0
+	MOVD $1, R29 // work around lack of ADDI
+	MOVD $8, R28 // work around lack of scaled addressing
+
+	SUBC R0, R0  // R0 - R0 leaves CA = 1, which for subtract-with-carry means "no borrow pending"
+	JMP  sublend
+
+// amd64 saves and restores CF, but I believe they only have to do that because all of
+// their math operations clobber it - we should just be able to recover it at the end.
+subloop:
+	MULLD R5, R28, R6   // R6 = i*8, byte offset of element i
+	MOVD  (R8)(R6), R11 // x[i]
+	MOVD  (R9)(R6), R12 // y[i]
+
+	SUBE R12, R11, R15  // R15 = x[i] - y[i] - pending borrow; CA is updated for the next word
+	MOVD R15, (R10)(R6) // z[i] = R15

-TEXT ·addVW(SB),NOSPLIT,$0
+	ADD R29, R5 // i++
+
+sublend:
+	CMP R5, R7
+	BLT subloop // loop while i < len(z)
+
+	ADDZE R4          // R4 = 0 + CA
+	XOR   R29, R4     // c = CA ^ 1: CA is the inverse of the borrow
+	MOVD  R4, c+72(FP)
+	RET
+
+TEXT ·addVW(SB), NOSPLIT, $0 // no assembly version here: branches to addVW_g, defined outside this file
 	BR ·addVW_g(SB)

-TEXT ·subVW(SB),NOSPLIT,$0
+TEXT ·subVW(SB), NOSPLIT, $0 // no assembly version here: branches to subVW_g, defined outside this file
 	BR ·subVW_g(SB)

-TEXT ·shlVU(SB),NOSPLIT,$0
+TEXT ·shlVU(SB), NOSPLIT, $0 // no assembly version here: branches to shlVU_g, defined outside this file
 	BR ·shlVU_g(SB)

-TEXT ·shrVU(SB),NOSPLIT,$0
+TEXT ·shrVU(SB), NOSPLIT, $0 // no assembly version here: branches to shrVU_g, defined outside this file
 	BR ·shrVU_g(SB)

-TEXT ·mulAddVWW(SB),NOSPLIT,$0
-	BR ·mulAddVWW_g(SB)
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
+TEXT ·mulAddVWW(SB), NOSPLIT, $0 // for i < len(z): c:z[i] = x[i]*y + c, with c starting at r; returns the final c
+	MOVD z+0(FP), R10     // R10 = &z[0]
+	MOVD x+24(FP), R8     // R8 = &x[0]
+	MOVD y+48(FP), R9     // R9 = y
+	MOVD r+56(FP), R4     // c = r
+	MOVD z_len+8(FP), R11 // R11 = len(z)
+	MOVD $0, R3           // i = 0
+	MOVD $8, R18          // word size in bytes, used to scale i
+	MOVD $1, R19          // loop increment
+
+	JMP e5
+
+l5:
+	MULLD  R18, R3, R5    // R5 = i*8, byte offset of element i
+	MOVD   (R8)(R5), R20  // R20 = x[i]
+	MULLD  R9, R20, R6    // R6 = low 64 bits of x[i]*y
+	MULHDU R9, R20, R7    // R7 = high 64 bits of x[i]*y
+	ADDC   R4, R6         // low += c, carry out goes to CA
+	ADDZE  R7             // high += CA
+	MOVD   R6, (R10)(R5)  // z[i] = low word
+	MOVD   R7, R4         // c = high word
+	ADD    R19, R3        // i++
+
+e5:
+	CMP R3, R11
+	BLT l5 // loop while i < len(z)
+
+	MOVD R4, c+64(FP)
+	RET
+
+// func addMulVVW(z, x []Word, y Word) (c Word)
+TEXT ·addMulVVW(SB), NOSPLIT, $0 // for i < len(z): c:z[i] = z[i] + x[i]*y + c, with c starting at 0; returns the final c
+	MOVD z+0(FP), R10     // R10 = &z[0]
+	MOVD x+24(FP), R8     // R8 = &x[0]
+	MOVD y+48(FP), R9     // R9 = y
+	MOVD z_len+8(FP), R22 // R22 = len(z)
+
+	MOVD $0, R5   // i = 0
+	MOVD $0, R4   // c = 0
+	MOVD $8, R28  // word size in bytes
+	MOVD $-2, R23
+	AND  R22, R23 // mask the last bit of z.len
+	MOVD $2, R24
+	CMP  R23, R24
+	BGE  unrolled // even part of len(z) is at least 2: process two words per iteration
+	JMP  end      // otherwise go straight to the one-word loop test
+
+unrolled:
+	MOVD  $8, R19         // no (RA)(RB*8) on power
+	MULLD R5, R19         // R19 = i*8, byte offset of element i
+	MOVD  (R10)(R19), R11 // R11 = z[i]
+	MOVD  (R8)(R19), R16  // R16 = x[i]
+	ADD   R28, R19, R25   // R25 = byte offset of element i+1
+	MOVD  (R10)(R25), R17 // R17 = z[i+1]
+	MOVD  (R8)(R25), R18  // R18 = x[i+1]
+
+	MULLD  R9, R16, R12    // R12 = low 64 bits of x[i]*y
+	MULHDU R9, R16, R14    // R14 = high 64 bits of x[i]*y
+	MULLD  R9, R18, R6     // R6 = low 64 bits of x[i+1]*y
+	MULHDU R9, R18, R7     // R7 = high 64 bits of x[i+1]*y
+	ADDC   R4, R12         // low += incoming carry word c
+	ADDZE  R14             // high += CA
+	ADDC   R11, R12        // z[i] = (x[i]*y) + z[i] + carry
+	ADDZE  R14             // carry = high order bits + add carry
+	MOVD   R12, (R10)(R19) // store z[i]
+	ADDC   R14, R6         // low of the second product += carry word from the first
+	ADDZE  R7              // high += CA
+	ADDC   R17, R6         // low += z[i+1]
+	ADDZE  R7              // high += CA
+	MOVD   R6, (R10)(R25)  // store z[i+1]
+	MOVD   R7, R4          // c = high word of the second product
+
+	ADD R24, R5  // i += 2
+	CMP R5, R23
+	BLT unrolled // loop while i < even part of len(z)
+	JMP end      // a possible odd trailing word is handled by the one-word loop
+
+loop:
+	MOVD   $8, R19
+	MULLD  R5, R19         // R19 = i*8, byte offset of element i
+	MOVD   (R10)(R19), R11 // R11 = z[i]
+	MOVD   (R8)(R19), R16  // R16 = x[i]
+	MULLD  R9, R16, R12    // R12 = low 64 bits of x[i]*y
+	MULHDU R9, R16, R14    // R14 = high 64 bits of x[i]*y
+	ADDC   R4, R12         // low += c
+	ADDZE  R14             // high += CA
+	ADDC   R11, R12        // low += z[i]
+	ADDZE  R14             // high += CA
+	MOVD   R12, (R10)(R19) // store z[i]
+	MOVD   R14, R4         // c = high word
+
+	MOVD $1, R15
+	ADD  R15, R5 // i++
+
+end:
+	CMP R5, R22
+	BLT loop // loop while i < len(z)

-TEXT ·addMulVVW(SB),NOSPLIT,$0
-	BR ·addMulVVW_g(SB)
+	MOVD R4, c+56(FP)
+	RET

-TEXT ·divWVW(SB),NOSPLIT,$0
+TEXT ·divWVW(SB), NOSPLIT, $0 // no assembly version here: branches to divWVW_g, defined outside this file
 	BR ·divWVW_g(SB)

-TEXT ·bitLen(SB),NOSPLIT,$0
-	BR ·bitLen_g(SB)
+// func bitLen(x Word) int
+TEXT ·bitLen(SB), NOSPLIT, $0 // n = 64 - (number of leading zero bits in x)
+	MOVD   x+0(FP), R4
+	CNTLZD R4, R4      // R4 = count of leading zero bits in x
+	MOVD   $64, R5
+	SUB    R4, R5      // R5 = 64 - leading zeros
+	MOVD   R5, n+8(FP)
+	RET