Commit 232a4e89 authored by Joel Sing's avatar Joel Sing Committed by Adam Langley

crypto/sha256: block implementation in 386 assembly

Benchmark on Intel(R) Core(TM) i5-2500S CPU @ 2.70GHz (albeit in a VM)

benchmark              old ns/op    new ns/op    delta
BenchmarkHash8Bytes         1606          699  -56.48%
BenchmarkHash1K            21920         7268  -66.84%
BenchmarkHash8K           165696        53756  -67.56%

benchmark               old MB/s     new MB/s  speedup
BenchmarkHash8Bytes         4.98        11.44    2.30x
BenchmarkHash1K            46.72       140.88    3.02x
BenchmarkHash8K            49.44       152.39    3.08x

R=agl, cldorian
CC=golang-codereviews
https://golang.org/cl/44800044
parent 939b3fa3
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64
// +build !386,!amd64
// SHA256 block step.
// In its own file so that a faster assembly or C version
......
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
// http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
// T2 = BIGSIGMA0(a) + Maj(a,b,c)
// h = g
// g = f
// f = e
// e = d + T1
// d = c
// c = b
// b = a
// a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
MOVL (index*4)(SI), AX; \
BSWAPL AX; \
MOVL AX, (index*4)(BP)
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
MOVL ((index-2)*4)(BP), AX; \
MOVL AX, CX; \
RORL $17, AX; \
MOVL CX, DX; \
RORL $19, CX; \
SHRL $10, DX; \
MOVL ((index-15)*4)(BP), BX; \
XORL CX, AX; \
MOVL BX, CX; \
XORL DX, AX; \
RORL $7, BX; \
MOVL CX, DX; \
SHRL $3, DX; \
RORL $18, CX; \
ADDL ((index-7)*4)(BP), AX; \
XORL CX, BX; \
XORL DX, BX; \
ADDL ((index-16)*4)(BP), BX; \
ADDL BX, AX; \
MOVL AX, ((index)*4)(BP)
// Calculate T1 in AX - uses AX, BX, CX and DX registers.
// Wt is passed in AX.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
MOVL (h*4)(DI), BX; \
ADDL AX, BX; \
MOVL (e*4)(DI), AX; \
ADDL $const, BX; \
MOVL (e*4)(DI), CX; \
RORL $6, AX; \
MOVL (e*4)(DI), DX; \
RORL $11, CX; \
XORL CX, AX; \
MOVL (e*4)(DI), CX; \
RORL $25, DX; \
ANDL (f*4)(DI), CX; \
XORL AX, DX; \
MOVL (e*4)(DI), AX; \
NOTL AX; \
ADDL DX, BX; \
ANDL (g*4)(DI), AX; \
XORL CX, AX; \
ADDL BX, AX
// Calculate T2 in BX - uses AX, BX, CX and DX registers.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
MOVL (a*4)(DI), AX; \
MOVL (c*4)(DI), BX; \
RORL $2, AX; \
MOVL (a*4)(DI), DX; \
ANDL (b*4)(DI), BX; \
RORL $13, DX; \
MOVL (a*4)(DI), CX; \
ANDL (c*4)(DI), CX; \
XORL DX, AX; \
XORL CX, BX; \
MOVL (a*4)(DI), DX; \
MOVL (b*4)(DI), CX; \
RORL $22, DX; \
ANDL (a*4)(DI), CX; \
XORL CX, BX; \
XORL DX, AX; \
ADDL AX, BX
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
SHA256T1(const, e, f, g, h); \
MOVL AX, 292(SP); \
SHA256T2(a, b, c); \
MOVL 292(SP), AX; \
ADDL AX, BX; \
ADDL AX, (d*4)(DI); \
MOVL BX, (h*4)(DI)
#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE0(index); \
SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE1(index); \
SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
TEXT ·block(SB),0,$296-12
MOVL p_base+4(FP), SI
MOVL p_len+8(FP), DX
SHRL $6, DX
SHLL $6, DX
LEAL (SI)(DX*1), DI
MOVL DI, 288(SP)
CMPL SI, DI
JEQ end
LEAL 256(SP), DI // variables
MOVL dig+0(FP), BP
MOVL (0*4)(BP), AX // a = H0
MOVL AX, (0*4)(DI)
MOVL (1*4)(BP), BX // b = H1
MOVL BX, (1*4)(DI)
MOVL (2*4)(BP), CX // c = H2
MOVL CX, (2*4)(DI)
MOVL (3*4)(BP), DX // d = H3
MOVL DX, (3*4)(DI)
MOVL (4*4)(BP), AX // e = H4
MOVL AX, (4*4)(DI)
MOVL (5*4)(BP), BX // f = H5
MOVL BX, (5*4)(DI)
MOVL (6*4)(BP), CX // g = H6
MOVL CX, (6*4)(DI)
MOVL (7*4)(BP), DX // h = H7
MOVL DX, (7*4)(DI)
loop:
MOVL SP, BP // message schedule
SHA256ROUND0(0, 0x428a2f98, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND0(1, 0x71374491, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND0(2, 0xb5c0fbcf, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND0(3, 0xe9b5dba5, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND0(4, 0x3956c25b, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND0(5, 0x59f111f1, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND0(6, 0x923f82a4, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND0(7, 0xab1c5ed5, 1, 2, 3, 4, 5, 6, 7, 0)
SHA256ROUND0(8, 0xd807aa98, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND0(9, 0x12835b01, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND0(10, 0x243185be, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND0(11, 0x550c7dc3, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND0(12, 0x72be5d74, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND0(13, 0x80deb1fe, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND0(14, 0x9bdc06a7, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND0(15, 0xc19bf174, 1, 2, 3, 4, 5, 6, 7, 0)
SHA256ROUND1(16, 0xe49b69c1, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND1(17, 0xefbe4786, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND1(18, 0x0fc19dc6, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND1(19, 0x240ca1cc, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND1(20, 0x2de92c6f, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND1(21, 0x4a7484aa, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND1(22, 0x5cb0a9dc, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND1(23, 0x76f988da, 1, 2, 3, 4, 5, 6, 7, 0)
SHA256ROUND1(24, 0x983e5152, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND1(25, 0xa831c66d, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND1(26, 0xb00327c8, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND1(27, 0xbf597fc7, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND1(28, 0xc6e00bf3, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND1(29, 0xd5a79147, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND1(30, 0x06ca6351, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND1(31, 0x14292967, 1, 2, 3, 4, 5, 6, 7, 0)
SHA256ROUND1(32, 0x27b70a85, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND1(33, 0x2e1b2138, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND1(34, 0x4d2c6dfc, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND1(35, 0x53380d13, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND1(36, 0x650a7354, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND1(37, 0x766a0abb, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND1(38, 0x81c2c92e, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND1(39, 0x92722c85, 1, 2, 3, 4, 5, 6, 7, 0)
SHA256ROUND1(40, 0xa2bfe8a1, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND1(41, 0xa81a664b, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND1(42, 0xc24b8b70, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND1(43, 0xc76c51a3, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND1(44, 0xd192e819, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND1(45, 0xd6990624, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND1(46, 0xf40e3585, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND1(47, 0x106aa070, 1, 2, 3, 4, 5, 6, 7, 0)
SHA256ROUND1(48, 0x19a4c116, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND1(49, 0x1e376c08, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND1(50, 0x2748774c, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND1(51, 0x34b0bcb5, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND1(52, 0x391c0cb3, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND1(53, 0x4ed8aa4a, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND1(54, 0x5b9cca4f, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND1(55, 0x682e6ff3, 1, 2, 3, 4, 5, 6, 7, 0)
SHA256ROUND1(56, 0x748f82ee, 0, 1, 2, 3, 4, 5, 6, 7)
SHA256ROUND1(57, 0x78a5636f, 7, 0, 1, 2, 3, 4, 5, 6)
SHA256ROUND1(58, 0x84c87814, 6, 7, 0, 1, 2, 3, 4, 5)
SHA256ROUND1(59, 0x8cc70208, 5, 6, 7, 0, 1, 2, 3, 4)
SHA256ROUND1(60, 0x90befffa, 4, 5, 6, 7, 0, 1, 2, 3)
SHA256ROUND1(61, 0xa4506ceb, 3, 4, 5, 6, 7, 0, 1, 2)
SHA256ROUND1(62, 0xbef9a3f7, 2, 3, 4, 5, 6, 7, 0, 1)
SHA256ROUND1(63, 0xc67178f2, 1, 2, 3, 4, 5, 6, 7, 0)
MOVL dig+0(FP), BP
MOVL (0*4)(BP), AX // H0 = a + H0
ADDL (0*4)(DI), AX
MOVL AX, (0*4)(DI)
MOVL AX, (0*4)(BP)
MOVL (1*4)(BP), BX // H1 = b + H1
ADDL (1*4)(DI), BX
MOVL BX, (1*4)(DI)
MOVL BX, (1*4)(BP)
MOVL (2*4)(BP), CX // H2 = c + H2
ADDL (2*4)(DI), CX
MOVL CX, (2*4)(DI)
MOVL CX, (2*4)(BP)
MOVL (3*4)(BP), DX // H3 = d + H3
ADDL (3*4)(DI), DX
MOVL DX, (3*4)(DI)
MOVL DX, (3*4)(BP)
MOVL (4*4)(BP), AX // H4 = e + H4
ADDL (4*4)(DI), AX
MOVL AX, (4*4)(DI)
MOVL AX, (4*4)(BP)
MOVL (5*4)(BP), BX // H5 = f + H5
ADDL (5*4)(DI), BX
MOVL BX, (5*4)(DI)
MOVL BX, (5*4)(BP)
MOVL (6*4)(BP), CX // H6 = g + H6
ADDL (6*4)(DI), CX
MOVL CX, (6*4)(DI)
MOVL CX, (6*4)(BP)
MOVL (7*4)(BP), DX // H7 = h + H7
ADDL (7*4)(DI), DX
MOVL DX, (7*4)(DI)
MOVL DX, (7*4)(BP)
ADDL $64, SI
CMPL SI, 288(SP)
JB loop
end:
RET
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64
// +build 386 amd64
package sha256
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment