Commit 73f284e2 authored by Ilya Tocar's avatar Ilya Tocar Committed by Brad Fitzpatrick

crypto/elliptic: reduce allocations on amd64

This is inspired by
https://blog.cloudflare.com/go-dont-collect-my-garbage/
This CL adds allocation tracking and parallelizes p256-related benchmarks.
Amount of allocations can be significantly reduced by marking amd64 asm
functions as noescape. This exposes a bug in p256MovCond:
PANDN with memory argument will fault if memory is not aligned, so they
are replaced with MOVDQU (which is ok with unaligned memory) and
register version of PANDN.

Results on 88-thread machine (2x 22 cores) below:
crypto/elliptic:
name               old time/op    new time/op    delta
BaseMultP256-88      1.50µs ±11%    1.19µs ± 5%  -20.20%  (p=0.000 n=10+10)
ScalarMultP256-88    5.47µs ± 5%    3.63µs ±10%  -33.66%  (p=0.000 n=9+10)

name               old alloc/op   new alloc/op   delta
BaseMultP256-88        800B ± 0%      288B ± 0%  -64.00%  (p=0.000 n=10+10)
ScalarMultP256-88    2.59kB ± 0%    0.26kB ± 0%  -90.12%  (p=0.000 n=10+10)

name               old allocs/op  new allocs/op  delta
BaseMultP256-88        13.0 ± 0%       6.0 ± 0%  -53.85%  (p=0.000 n=10+10)
ScalarMultP256-88      16.0 ± 0%       5.0 ± 0%  -68.75%  (p=0.000 n=10+10)

crypto/ecdsa:
name              old time/op    new time/op    delta
SignP256-88         8.63µs ±37%    7.55µs ±38%     ~     (p=0.393 n=10+10)
VerifyP256-88       13.9µs ± 8%     7.0µs ± 7%  -49.29%  (p=0.000 n=10+9)
KeyGeneration-88    2.77µs ±11%    2.34µs ±11%  -15.57%  (p=0.000 n=10+10)

name              old alloc/op   new alloc/op   delta
SignP256-88         4.14kB ± 1%    2.98kB ± 2%  -27.94%  (p=0.000 n=10+10)
VerifyP256-88       4.47kB ± 0%    0.99kB ± 0%  -77.84%  (p=0.000 n=9+10)
KeyGeneration-88    1.21kB ± 0%    0.69kB ± 0%  -42.78%  (p=0.000 n=10+10)

name              old allocs/op  new allocs/op  delta
SignP256-88           47.0 ± 0%      34.0 ± 0%  -27.66%  (p=0.000 n=10+10)
VerifyP256-88         38.0 ± 0%      17.0 ± 0%  -55.26%  (p=0.000 n=10+10)
KeyGeneration-88      20.0 ± 0%      13.0 ± 0%  -35.00%  (p=0.000 n=10+10)

On machine with only 4 cores, results are much less impressive:
around 2% performance gain.

Change-Id: I8a2f8168f83d27ad9ace1b4b1a1e11cb83edf717
Reviewed-on: https://go-review.googlesource.com/80757
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarIan Lance Taylor <iant@golang.org>
parent 19928933
...@@ -48,10 +48,13 @@ func BenchmarkSignP256(b *testing.B) { ...@@ -48,10 +48,13 @@ func BenchmarkSignP256(b *testing.B) {
hashed := []byte("testing") hashed := []byte("testing")
priv, _ := GenerateKey(p256, rand.Reader) priv, _ := GenerateKey(p256, rand.Reader)
b.ReportAllocs()
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { b.RunParallel(func(pb *testing.PB) {
_, _, _ = Sign(rand.Reader, priv, hashed) for pb.Next() {
} _, _, _ = Sign(rand.Reader, priv, hashed)
}
})
} }
func BenchmarkSignP384(b *testing.B) { func BenchmarkSignP384(b *testing.B) {
...@@ -60,10 +63,13 @@ func BenchmarkSignP384(b *testing.B) { ...@@ -60,10 +63,13 @@ func BenchmarkSignP384(b *testing.B) {
hashed := []byte("testing") hashed := []byte("testing")
priv, _ := GenerateKey(p384, rand.Reader) priv, _ := GenerateKey(p384, rand.Reader)
b.ReportAllocs()
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { b.RunParallel(func(pb *testing.PB) {
_, _, _ = Sign(rand.Reader, priv, hashed) for pb.Next() {
} _, _, _ = Sign(rand.Reader, priv, hashed)
}
})
} }
func BenchmarkVerifyP256(b *testing.B) { func BenchmarkVerifyP256(b *testing.B) {
...@@ -73,20 +79,26 @@ func BenchmarkVerifyP256(b *testing.B) { ...@@ -73,20 +79,26 @@ func BenchmarkVerifyP256(b *testing.B) {
priv, _ := GenerateKey(p256, rand.Reader) priv, _ := GenerateKey(p256, rand.Reader)
r, s, _ := Sign(rand.Reader, priv, hashed) r, s, _ := Sign(rand.Reader, priv, hashed)
b.ReportAllocs()
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { b.RunParallel(func(pb *testing.PB) {
Verify(&priv.PublicKey, hashed, r, s) for pb.Next() {
} Verify(&priv.PublicKey, hashed, r, s)
}
})
} }
func BenchmarkKeyGeneration(b *testing.B) { func BenchmarkKeyGeneration(b *testing.B) {
b.ResetTimer() b.ResetTimer()
p256 := elliptic.P256() p256 := elliptic.P256()
b.ReportAllocs()
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { b.RunParallel(func(pb *testing.PB) {
GenerateKey(p256, rand.Reader) for pb.Next() {
} GenerateKey(p256, rand.Reader)
}
})
} }
func testSignAndVerify(t *testing.T, c elliptic.Curve, tag string) { func testSignAndVerify(t *testing.T, c elliptic.Curve, tag string) {
......
...@@ -523,10 +523,13 @@ func BenchmarkBaseMult(b *testing.B) { ...@@ -523,10 +523,13 @@ func BenchmarkBaseMult(b *testing.B) {
p224 := P224() p224 := P224()
e := p224BaseMultTests[25] e := p224BaseMultTests[25]
k, _ := new(big.Int).SetString(e.k, 10) k, _ := new(big.Int).SetString(e.k, 10)
b.ReportAllocs()
b.StartTimer() b.StartTimer()
for i := 0; i < b.N; i++ { b.RunParallel(func(pb *testing.PB) {
p224.ScalarBaseMult(k.Bytes()) for pb.Next() {
} p224.ScalarBaseMult(k.Bytes())
}
})
} }
func BenchmarkBaseMultP256(b *testing.B) { func BenchmarkBaseMultP256(b *testing.B) {
...@@ -534,10 +537,13 @@ func BenchmarkBaseMultP256(b *testing.B) { ...@@ -534,10 +537,13 @@ func BenchmarkBaseMultP256(b *testing.B) {
p256 := P256() p256 := P256()
e := p224BaseMultTests[25] e := p224BaseMultTests[25]
k, _ := new(big.Int).SetString(e.k, 10) k, _ := new(big.Int).SetString(e.k, 10)
b.ReportAllocs()
b.StartTimer() b.StartTimer()
for i := 0; i < b.N; i++ { b.RunParallel(func(pb *testing.PB) {
p256.ScalarBaseMult(k.Bytes()) for pb.Next() {
} p256.ScalarBaseMult(k.Bytes())
}
})
} }
func BenchmarkScalarMultP256(b *testing.B) { func BenchmarkScalarMultP256(b *testing.B) {
...@@ -546,10 +552,13 @@ func BenchmarkScalarMultP256(b *testing.B) { ...@@ -546,10 +552,13 @@ func BenchmarkScalarMultP256(b *testing.B) {
_, x, y, _ := GenerateKey(p256, rand.Reader) _, x, y, _ := GenerateKey(p256, rand.Reader)
priv, _, _, _ := GenerateKey(p256, rand.Reader) priv, _, _, _ := GenerateKey(p256, rand.Reader)
b.ReportAllocs()
b.StartTimer() b.StartTimer()
for i := 0; i < b.N; i++ { b.RunParallel(func(pb *testing.PB) {
p256.ScalarMult(x, y, priv) for pb.Next() {
} p256.ScalarMult(x, y, priv)
}
})
} }
func TestMarshal(t *testing.T) { func TestMarshal(t *testing.T) {
......
...@@ -52,46 +52,62 @@ func (curve p256Curve) Params() *CurveParams { ...@@ -52,46 +52,62 @@ func (curve p256Curve) Params() *CurveParams {
// Functions implemented in p256_asm_amd64.s // Functions implemented in p256_asm_amd64.s
// Montgomery multiplication modulo P256 // Montgomery multiplication modulo P256
//go:noescape
func p256Mul(res, in1, in2 []uint64) func p256Mul(res, in1, in2 []uint64)
// Montgomery square modulo P256 // Montgomery square modulo P256
//go:noescape
func p256Sqr(res, in []uint64) func p256Sqr(res, in []uint64)
// Montgomery multiplication by 1 // Montgomery multiplication by 1
//go:noescape
func p256FromMont(res, in []uint64) func p256FromMont(res, in []uint64)
// iff cond == 1 val <- -val // iff cond == 1 val <- -val
//go:noescape
func p256NegCond(val []uint64, cond int) func p256NegCond(val []uint64, cond int)
// if cond == 0 res <- b; else res <- a // if cond == 0 res <- b; else res <- a
//go:noescape
func p256MovCond(res, a, b []uint64, cond int) func p256MovCond(res, a, b []uint64, cond int)
// Endianness swap // Endianness swap
//go:noescape
func p256BigToLittle(res []uint64, in []byte) func p256BigToLittle(res []uint64, in []byte)
//go:noescape
func p256LittleToBig(res []byte, in []uint64) func p256LittleToBig(res []byte, in []uint64)
// Constant time table access // Constant time table access
//go:noescape
func p256Select(point, table []uint64, idx int) func p256Select(point, table []uint64, idx int)
//go:noescape
func p256SelectBase(point, table []uint64, idx int) func p256SelectBase(point, table []uint64, idx int)
// Montgomery multiplication modulo Ord(G) // Montgomery multiplication modulo Ord(G)
//go:noescape
func p256OrdMul(res, in1, in2 []uint64) func p256OrdMul(res, in1, in2 []uint64)
// Montgomery square modulo Ord(G), repeated n times // Montgomery square modulo Ord(G), repeated n times
//go:noescape
func p256OrdSqr(res, in []uint64, n int) func p256OrdSqr(res, in []uint64, n int)
// Point add with in2 being affine point // Point add with in2 being affine point
// If sign == 1 -> in2 = -in2 // If sign == 1 -> in2 = -in2
// If sel == 0 -> res = in1 // If sel == 0 -> res = in1
// if zero == 0 -> res = in2 // if zero == 0 -> res = in2
//go:noescape
func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
// Point add. Returns one if the two input points were equal and zero // Point add. Returns one if the two input points were equal and zero
// otherwise. (Note that, due to the way that the equations work out, some // otherwise. (Note that, due to the way that the equations work out, some
// representations of ∞ are considered equal to everything by this function.) // representations of ∞ are considered equal to everything by this function.)
//go:noescape
func p256PointAddAsm(res, in1, in2 []uint64) int func p256PointAddAsm(res, in1, in2 []uint64) int
// Point double // Point double
//go:noescape
func p256PointDoubleAsm(res, in []uint64) func p256PointDoubleAsm(res, in []uint64)
func (curve p256Curve) Inverse(k *big.Int) *big.Int { func (curve p256Curve) Inverse(k *big.Int) *big.Int {
......
...@@ -81,17 +81,23 @@ TEXT ·p256MovCond(SB),NOSPLIT,$0 ...@@ -81,17 +81,23 @@ TEXT ·p256MovCond(SB),NOSPLIT,$0
PCMPEQL X13, X12 PCMPEQL X13, X12
MOVOU X12, X0 MOVOU X12, X0
PANDN (16*0)(x_ptr), X0 MOVOU (16*0)(x_ptr), X6
PANDN X6, X0
MOVOU X12, X1 MOVOU X12, X1
PANDN (16*1)(x_ptr), X1 MOVOU (16*1)(x_ptr), X7
PANDN X7, X1
MOVOU X12, X2 MOVOU X12, X2
PANDN (16*2)(x_ptr), X2 MOVOU (16*2)(x_ptr), X8
PANDN X8, X2
MOVOU X12, X3 MOVOU X12, X3
PANDN (16*3)(x_ptr), X3 MOVOU (16*3)(x_ptr), X9
PANDN X9, X3
MOVOU X12, X4 MOVOU X12, X4
PANDN (16*4)(x_ptr), X4 MOVOU (16*4)(x_ptr), X10
PANDN X10, X4
MOVOU X12, X5 MOVOU X12, X5
PANDN (16*5)(x_ptr), X5 MOVOU (16*5)(x_ptr), X11
PANDN X11, X5
MOVOU (16*0)(y_ptr), X6 MOVOU (16*0)(y_ptr), X6
MOVOU (16*1)(y_ptr), X7 MOVOU (16*1)(y_ptr), X7
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment