Commit 897e0807 authored by bill_ofarrell's avatar bill_ofarrell Committed by Michael Munday

crypto/elliptic: utilize faster z14 multiply/square instructions (when available)

In the s390x assembly implementation of NIST P-256 curve, utilize faster multiply/square
instructions introduced in the z14. These new instructions are designed for crypto
and are constant time. The algorithm is unchanged except for faster
multiplication when run on a z14 or later. On z13, the original mutiplication
(also constant time) is used.

P-256 performance is critical in many applications, such as Blockchain.

name            old time      new time     delta
BaseMultP256    24396 ns/op   21564 ns/op  1.13x
ScalarMultP256  87546 ns/op   72813 ns/op. 1.20x

Change-Id: I7e6d8b420fac56d5f9cc13c9423e2080df854bac
Reviewed-on: https://go-review.googlesource.com/c/146022Reviewed-by: 's avatarMichael Munday <mike.munday@ibm.com>
Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Michael Munday <mike.munday@ibm.com>
parent 9be01c2e
......@@ -3,6 +3,8 @@
// license that can be found in the LICENSE file.
#include "textflag.h"
#include "go_asm.h"
DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
......@@ -44,28 +46,23 @@ GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160
// func hasVectorFacility() bool
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
MOVD $x-24(SP), R1
XC $24, 0(R1), 0(R1) // clear the storage
MOVD $2, R0 // R0 is the number of double words stored -1
WORD $0xB2B01000 // STFLE 0(R1)
XOR R0, R0 // reset the value of R0
MOVBZ z-8(SP), R1
AND $0x40, R1
BEQ novector
vectorinstalled:
// check if the vector instruction has been enabled
VLEIB $0, $0xF, V16
VLGVB $0, V16, R1
CMPBNE R1, $0xF, novector
MOVB $1, ret+0(FP) // have vx
RET
novector:
MOVB $0, ret+0(FP) // no vx
RET
DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f
DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f
DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a
DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a
DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203
DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a
DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a
DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203
GLOBL p256vmsl<>(SB), 8, $128
// ---------------------------------------
// iff cond == 1 val <- -val
......@@ -890,7 +887,7 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0
#undef K0
// ---------------------------------------
// p256MulInternal
// p256MulInternalVX
// V0-V3,V30,V31 - Not Modified
// V4-V15 - Volatile
......@@ -1033,7 +1030,7 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0
*
* Last 'group' needs to RED2||RED1 shifted less
*/
TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0
VL 32(CPOOL), SEL1
VL 48(CPOOL), SEL2
VL 64(CPOOL), SEL3
......@@ -1278,6 +1275,443 @@ TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
#undef CAR1
#undef CAR2
// ---------------------------------------
// p256MulInternalVMSL
// V0-V3,V30,V31 - Not Modified
// V4-V14 - Volatile
#define CPOOL R4
#define SCRATCH R9
// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4
#define T1 V5
#define T2 V6
#define P0 V30 // Not modified
#define P1 V31 // Not modified
// input: d0
// output: h0, h1
// temp: TEMP, ZERO, BORROW
#define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \
VZERO ZERO \
VSLDB $4, d0, ZERO, h0 \
VLR h0, BORROW \
VSLDB $12, ZERO, h0, TEMP \
VSQ TEMP, h0, h0 \
VSLDB $12, d0, BORROW, h1 \
VSLDB $8, ZERO, BORROW, TEMP \
VAQ TEMP, h0, h0 \
#define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \
VZERO ZERO \
VSLDB $8, d2, ZERO, TEMP \
VSLDB $8, d2, TEMP, h0 \
VSLDB $12, ZERO, TEMP, h1 \
VSQ h1, h0, h0 \
TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
VSTM V16, V19, (SCRATCH)
MOVD $p256vmsl<>+0x00(SB), CPOOL
// Divide input1 into 5 limbs
VGBM $0x007f, V14
VZERO V12
VSLDB $2, X1, X0, V13
VSLDB $2, Y1, Y0, V8
VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
VSLDB $4, V12, Y1, V6 // V6: 4 bytes limb
VN V14, X0, V5 // V5: first 7 bytes limb
VN V14, Y0, V10 // V10: first 7 bytes limb
VN V14, V13, V13 // v13: third 7 bytes limb
VN V14, V8, V8 // V8: third 7 bytes limb
VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1)
VMSLG V8, V5, V12, V8 // v8: l8 x l5
VMSLG V6, V13, V12, V13 // v13: l6 x l3
VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9)
VMSLG V6, V5, V12, V6 // v6: l6 x l5
MOVD $p256vmsl<>+0x00(SB), CPOOL
VGBM $0x7f7f, V14
VL 0(CPOOL), V4
VL 16(CPOOL), V7
VL 32(CPOOL), V9
VL 48(CPOOL), V5
VLM 64(CPOOL), V16, V19
VPERM V12, X0, V4, V4 // v4: limb4 | limb5
VPERM Y1, Y0, V7, V7
VPERM V12, Y0, V9, V9 // v9: limb10 | limb9
VPERM X1, X0, V5, V5
VPERM X1, X0, V16, V16
VPERM Y1, Y0, V17, V17
VPERM X1, V12, V18, V18 // v18: limb1 | limb2
VPERM Y1, V12, V19, V19 // v19: limb7 | limb6
VN V14, V7, V7 // v7: limb9 | limb8
VN V14, V5, V5 // v5: limb3 | limb4
VN V14, V16, V16 // v16: limb2 | limb3
VN V14, V17, V17 // v17: limb8 | limb7
VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2)
VMSLG V9, V5, V8, V8 // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3)
VMSLG V9, V16, V12, V16 // v16: l10 x l9 + l2 x l3
VMSLG V9, V18, V12, V9 // v9: l10 x l1 + l9 x l2
VMSLG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2
VMSLG V17, V4, V16, V16 // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4)
VMSLG V17, V5, V9, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4
VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2
VMSLG V19, V5, V7, V7 // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6)
VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8)
VAQ V9, V6, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
VAQ V17, V13, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
VSLDB $9, V12, V10, V4
VSLDB $9, V12, V7, V5
VAQ V4, V14, V14
VAQ V5, V13, V13
VSLDB $9, V12, V14, V4
VSLDB $9, V12, V13, V5
VAQ V4, V8, V8
VAQ V5, V19, V19
VSLDB $9, V12, V8, V4
VSLDB $9, V12, V19, V5
VAQ V4, V16, V16
VAQ V5, V11, V11
VSLDB $9, V12, V16, V4
VAQ V4, V9, V17
VGBM $0x007f, V4
VGBM $0x00ff, V5
VN V10, V4, V10
VN V14, V4, V14
VN V8, V4, V8
VN V16, V4, V16
VN V17, V4, V9
VN V7, V4, V7
VN V13, V4, V13
VN V19, V4, V19
VN V11, V5, V11
VSLDB $7, V14, V14, V14
VSLDB $14, V8, V12, V4
VSLDB $14, V12, V8, V8
VSLDB $5, V16, V16, V16
VSLDB $12, V9, V12, V5
VO V14, V10, V10
VO V8, V16, V16
VO V4, V10, V10 // first rightmost 128bits of the multiplication result
VO V5, V16, V16 // second rightmost 128bits of the multiplication result
// adjust v7, v13, v19, v11
VSLDB $7, V13, V13, V13
VSLDB $14, V19, V12, V4
VSLDB $14, V12, V19, V19
VSLDB $5, V11, V12, V5
VO V13, V7, V7
VO V4, V7, V7
VO V19, V5, V11
VSLDB $9, V12, V17, V14
VSLDB $12, V12, V9, V9
VACCQ V7, V14, V13
VAQ V7, V14, V7
VAQ V11, V13, V11
// First reduction, 96 bits
VSLDB $4, V16, V10, T0
VSLDB $4, V12, V16, T1
VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
VSLDB $3, V7, V12, V7
OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2
VO V7, V9, V7 // third rightmost 128bits of the multiplication result
VACCQ T0, T2, V9
VAQ T0, T2, T2
VACQ T1, V8, V9, V8
// Second reduction 96 bits
VSLDB $4, V8, T2, T0
VSLDB $4, V12, V8, T1
OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8
VACCQ T0, V8, T2
VAQ T0, V8, V8
VACQ T1, V9, T2, V9
// Third reduction 64 bits
VSLDB $8, V9, V8, T0
VSLDB $8, V12, V9, T1
OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
VACCQ T0, V13, V12
VAQ T0, V13, V13
VACQ T1, V14, V12, V14
VACCQ V13, V7, V12
VAQ V13, V7, T0
VACCCQ V14, V11, V12, T2
VACQ V14, V11, V12, T1 // results T2 | T1 | T0
// ---------------------------------------------------
MOVD $p256mul<>+0x00(SB), CPOOL
VZERO V12
VSCBIQ P0, T0, V8
VSQ P0, T0, V7
VSBCBIQ T1, P1, V8, V10
VSBIQ T1, P1, V8, V9
VSBIQ T2, V12, V10, T2
// what output to use, V9||V7 or T1||T0?
VSEL T0, V7, T2, T0
VSEL T1, V9, T2, T1
VLM (SCRATCH), V16, V19
RET
// ---------------------------------------
// p256SqrInternalVMSL
// V0-V1,V30,V31 - Not Modified
// V4-V14 - Volatile
TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
VSTM V16, V18, (SCRATCH)
MOVD $p256vmsl<>+0x00(SB), CPOOL
// Divide input into limbs
VGBM $0x007f, V14
VZERO V12
VSLDB $2, X1, X0, V13
VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
VN V14, X0, V10 // V10: first 7 bytes limb
VN V14, V13, V13 // v13: third 7 bytes limb
VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1)
VMSLG V13, V13, V12, V13 // v13: l8 x l3
VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9)
MOVD $p256vmsl<>+0x00(SB), CPOOL
VGBM $0x7f7f, V14
VL 0(CPOOL), V4
VL 16(CPOOL), V7
VL 32(CPOOL), V9
VL 48(CPOOL), V5
VLM 64(CPOOL), V16, V18
VL 112(CPOOL), V8
VPERM V12, X0, V4, V4 // v4: limb4 | limb5
VPERM X1, X0, V7, V7
VPERM V12, X0, V9, V9 // v9: limb10 | limb9
VPERM X1, X0, V5, V5
VPERM X1, X0, V16, V16
VPERM X1, X0, V17, V17
VPERM X1, V12, V18, V18 // v18: limb1 | limb2
VPERM X1, V12, V8, V8 // v8: limb7 | limb6
VN V14, V7, V7 // v7: limb9 | limb8
VN V14, V5, V5 // v5: limb3 | limb4
VN V14, V16, V16 // v16: limb2 | limb3
VN V14, V17, V17 // v17: limb8 | limb7
VMSLEOG V9, V18, V13, V6 // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2)
VMSLEOG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4)
VMSLEOG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 (column 6)
VMSLEG V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
VMSLG V8, V18, V12, V8 // v8: l7 x l1 + l6 x l2 (column 8)
VMSLEG V9, V5, V12, V18 // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3)
VSLDB $9, V12, V10, V4
VSLDB $9, V12, V7, V5
VAQ V4, V14, V14
VAQ V5, V13, V13
VSLDB $9, V12, V14, V4
VSLDB $9, V12, V13, V5
VAQ V4, V18, V18
VAQ V5, V8, V8
VSLDB $9, V12, V18, V4
VSLDB $9, V12, V8, V5
VAQ V4, V16, V16
VAQ V5, V11, V11
VSLDB $9, V12, V16, V4
VAQ V4, V6, V17
VGBM $0x007f, V4
VGBM $0x00ff, V5
VN V10, V4, V10
VN V14, V4, V14
VN V18, V4, V18
VN V16, V4, V16
VN V17, V4, V9
VN V7, V4, V7
VN V13, V4, V13
VN V8, V4, V8
VN V11, V5, V11
VSLDB $7, V14, V14, V14
VSLDB $14, V18, V12, V4
VSLDB $14, V12, V18, V18
VSLDB $5, V16, V16, V16
VSLDB $12, V9, V12, V5
VO V14, V10, V10
VO V18, V16, V16
VO V4, V10, V10 // first rightmost 128bits of the multiplication result
VO V5, V16, V16 // second rightmost 128bits of the multiplication result
// adjust v7, v13, v8, v11
VSLDB $7, V13, V13, V13
VSLDB $14, V8, V12, V4
VSLDB $14, V12, V8, V8
VSLDB $5, V11, V12, V5
VO V13, V7, V7
VO V4, V7, V7
VO V8, V5, V11
VSLDB $9, V12, V17, V14
VSLDB $12, V12, V9, V9
VACCQ V7, V14, V13
VAQ V7, V14, V7
VAQ V11, V13, V11
// First reduction, 96 bits
VSLDB $4, V16, V10, T0
VSLDB $4, V12, V16, T1
VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
VSLDB $3, V7, V12, V7
OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2
VO V7, V9, V7 // third rightmost 128bits of the multiplication result
VACCQ T0, T2, V9
VAQ T0, T2, T2
VACQ T1, V8, V9, V8
// Second reduction 96 bits
VSLDB $4, V8, T2, T0
VSLDB $4, V12, V8, T1
OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8
VACCQ T0, V8, T2
VAQ T0, V8, V8
VACQ T1, V9, T2, V9
// Third reduction 64 bits
VSLDB $8, V9, V8, T0
VSLDB $8, V12, V9, T1
OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
VACCQ T0, V13, V12
VAQ T0, V13, V13
VACQ T1, V14, V12, V14
VACCQ V13, V7, V12
VAQ V13, V7, T0
VACCCQ V14, V11, V12, T2
VACQ V14, V11, V12, T1 // results T2 | T1 | T0
// ---------------------------------------------------
MOVD $p256mul<>+0x00(SB), CPOOL
VZERO V12
VSCBIQ P0, T0, V8
VSQ P0, T0, V7
VSBCBIQ T1, P1, V8, V10
VSBIQ T1, P1, V8, V9
VSBIQ T2, V12, V10, T2
// what output to use, V9||V7 or T1||T0?
VSEL T0, V7, T2, T0
VSEL T1, V9, T2, T1
VLM (SCRATCH), V16, V18
RET
#undef CPOOL
#undef SCRATCH
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef T2
#undef P0
#undef P1
#define SCRATCH R9
TEXT p256MulInternal<>(SB),NOSPLIT,$64-0
MOVD $scratch-64(SP), SCRATCH
MOVD ·p256MulInternalFacility+0x00(SB),R7
CALL (R7)
RET
TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0
MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
MOVD $·p256MulInternalFacility+0x00(SB), R7
MOVD $·p256MulInternalVX(SB), R8
CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported
MOVD $·p256MulInternalVMSL(SB), R8
novmsl:
MOVD R8, 0(R7)
BR (R8)
GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8
DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB)
// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0
VLR X0, Y0
VLR X1, Y1
BR ·p256MulInternalVX(SB)
#undef X0
#undef X1
#undef Y0
#undef Y1
TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0
MOVD $scratch-48(SP), SCRATCH
MOVD ·p256SqrInternalFacility+0x00(SB),R7
CALL (R7)
RET
TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0
MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
MOVD $·p256SqrInternalFacility+0x00(SB), R7
MOVD $·p256SqrInternalVX(SB), R8
CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported
MOVD $·p256SqrInternalVMSL(SB), R8
novmsl:
MOVD R8, 0(R7)
BR (R8)
GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8
DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB)
#undef SCRATCH
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
VZERO ZER \
VSCBIQ Y0, X0, CAR1 \
......@@ -1385,6 +1819,52 @@ TEXT ·p256MulAsm(SB), NOSPLIT, $0
#undef P0
#undef P1
// ---------------------------------------
// func p256SqrAsm(res, in1 []byte)
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define CPOOL R4
// Parameters
#define X0 V0
#define X1 V1
#define T0 V4
#define T1 V5
// Constants
#define P0 V30
#define P1 V31
TEXT ·p256SqrAsm(SB), NOSPLIT, $0
MOVD res+0(FP), res_ptr
MOVD in1+24(FP), x_ptr
VL (1*16)(x_ptr), X0
VL (0*16)(x_ptr), X1
MOVD $p256mul<>+0x00(SB), CPOOL
VL 16(CPOOL), P0
VL 0(CPOOL), P1
CALL p256SqrInternal<>(SB)
VST T0, (1*16)(res_ptr)
VST T1, (0*16)(res_ptr)
RET
#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef X0
#undef X1
#undef T0
#undef T1
#undef P0
#undef P1
// Point add with P2 being affine point
// If sign == 1 -> P2 = -P2
// If sel == 0 -> P3 = P1
......@@ -1524,7 +2004,7 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
VL 80(P1ptr), X0 // Z1L
VLR X0, Y0
VLR X1, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
VLR T0, X0
......@@ -1570,7 +2050,7 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
// X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
VLR Y0, X0
VLR Y1, X1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
VLR T0, X0
VLR T1, X1
......@@ -1594,7 +2074,7 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
VLR T2H, X1
VLR T2L, Y0
VLR T2H, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
p256SubInternal(T1,T0,T1,T0,T1H,T1L)
......@@ -1827,7 +2307,7 @@ TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
VL 80(P1ptr), X0 // Z1L
VLR X0, Y0
VLR X1, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// SUB(X<X1-T) // T2 = X1-T1
VL 0(P1ptr), X1H
......@@ -1859,7 +2339,7 @@ TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
// X- ; Y=X ; MUL; T- // Y3 = Y3²
VLR X0, Y0
VLR X1, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
VLR T0, X0
......@@ -1873,7 +2353,7 @@ TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
// X- ; Y=X ; MUL; T- // Y3 = Y3²
VLR X0, Y0
VLR X1, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// HAL(Y3<T) // Y3 = half*Y3
p256HalfInternal(Y3H,Y3L, T1,T0)
......@@ -1883,7 +2363,7 @@ TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
VLR T2H, X1
VLR T2L, Y0
VLR T2H, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// ADD(T1<T3+T3) // T1 = 2*T3
p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
......@@ -2064,7 +2544,7 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
VL 80(P1ptr), X0 // Z1L
VLR X0, Y0
VLR X1, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// X- ; Y=T ; MUL; R=T // R = Z1*T1
VLR T0, Y0
......@@ -2085,7 +2565,7 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
VL 80(P2ptr), X0 // Z2L
VLR X0, Y0
VLR X1, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
VLR T0, Y0
......@@ -2175,7 +2655,7 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
VLR HH, X1
VLR HL, Y0
VLR HH, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// X- ; Y=T ; MUL; T2=T // T2 = H*T1
VLR T0, Y0
......@@ -2196,7 +2676,7 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
VLR RH, X1
VLR RL, Y0
VLR RH, Y1
CALL p256MulInternal<>(SB)
CALL p256SqrInternal<>(SB)
// SUB(T<T-T2) // X3 = X3-T2
p256SubInternal(T1,T0,T1,T0,T2H,T2L)
......
......@@ -8,7 +8,14 @@ package elliptic
import (
"crypto/subtle"
"internal/cpu"
"math/big"
"unsafe"
)
const (
offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
offsetS390xHasVE1 = unsafe.Offsetof(cpu.S390X.HasVE1)
)
type p256CurveFast struct {
......@@ -26,14 +33,26 @@ var (
p256PreFast *[37][64]p256Point
)
// hasVectorFacility reports whether the machine has the z/Architecture
// vector facility installed and enabled.
func hasVectorFacility() bool
//go:noescape
func p256MulInternalTrampolineSetup()
//go:noescape
func p256SqrInternalTrampolineSetup()
//go:noescape
func p256MulInternalVX()
//go:noescape
func p256MulInternalVMSL()
var hasVX = hasVectorFacility()
//go:noescape
func p256SqrInternalVX()
//go:noescape
func p256SqrInternalVMSL()
func initP256Arch() {
if hasVX {
if cpu.S390X.HasVX {
p256 = p256CurveFast{p256Params}
initTable()
return
......@@ -51,12 +70,15 @@ func (curve p256CurveFast) Params() *CurveParams {
// Functions implemented in p256_asm_s390x.s
// Montgomery multiplication modulo P256
//
//go:noescape
func p256SqrAsm(res, in1 []byte)
//go:noescape
func p256MulAsm(res, in1, in2 []byte)
// Montgomery square modulo P256
func p256Sqr(res, in []byte) {
p256MulAsm(res, in, in)
p256SqrAsm(res, in)
}
// Montgomery multiplication by 1
......
......@@ -126,6 +126,7 @@ type s390x struct {
HasSHA256 bool // K{I,L}MD-SHA-256 functions
HasSHA512 bool // K{I,L}MD-SHA-512 functions
HasVX bool // vector facility. Note: the runtime sets this when it processes auxv records.
HasVE1 bool // vector-enhancement 1
_ CacheLinePad
}
......
......@@ -71,6 +71,9 @@ const (
msa5 facility = 57 // message-security-assist extension 5
msa8 facility = 146 // message-security-assist extension 8
// vector facilities
ve1 facility = 135 // vector-enhancements 1
// Note: vx and highgprs are excluded because they require
// kernel support and so must be fetched from HWCAP.
)
......@@ -115,6 +118,7 @@ func doinit() {
{Name: "dfp", Feature: &S390X.HasDFP},
{Name: "etf3eh", Feature: &S390X.HasETF3Enhanced},
{Name: "vx", Feature: &S390X.HasVX},
{Name: "ve1", Feature: &S390X.HasVE1},
}
aes := []function{aes128, aes192, aes256}
......@@ -150,4 +154,7 @@ func doinit() {
S390X.HasSHA512 = kimd.Has(sha512) && klmd.Has(sha512)
S390X.HasGHASH = kimd.Has(ghash) // KLMD-GHASH does not exist
}
if S390X.HasVX {
S390X.HasVE1 = facilities.Has(ve1)
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment