Commit 7f998453 authored by Michael Munday's avatar Michael Munday

crypto/md5: simplify generic implementation

This change uses library functions such as bits.RotateLeft32 to
reduce the amount of code needed in the generic implementation.
Since the code is now shorter I've also removed the option to
generate a non-unrolled version of the code.

I've also tried to remove bounds checks where possible to make
the new version performant, however that is not the primary goal
of this change since most architectures have assembly
implementations already.

Assembly performance:

name                 old speed      new speed      delta
Hash8Bytes           50.3MB/s ± 1%  59.1MB/s ± 0%  +17.63%  (p=0.000 n=9+8)
Hash1K                590MB/s ± 0%   597MB/s ± 0%   +1.25%  (p=0.000 n=9+9)
Hash8K                636MB/s ± 1%   638MB/s ± 1%     ~     (p=0.072 n=10+10)
Hash8BytesUnaligned  50.5MB/s ± 0%  59.1MB/s ± 1%  +17.09%  (p=0.000 n=10+10)
Hash1KUnaligned       589MB/s ± 1%   596MB/s ± 1%   +1.23%  (p=0.000 n=9+10)
Hash8KUnaligned       638MB/s ± 1%   640MB/s ± 0%   +0.35%  (p=0.002 n=10+10)

Pure Go performance:

name                 old speed      new speed      delta
Hash8Bytes           30.3MB/s ± 1%  42.8MB/s ± 0%  +41.20%  (p=0.000 n=9+9)
Hash1K                364MB/s ± 4%   394MB/s ± 1%   +8.27%  (p=0.000 n=10+10)
Hash8K                404MB/s ± 1%   420MB/s ± 0%   +4.17%  (p=0.000 n=10+9)
Hash8BytesUnaligned  30.3MB/s ± 1%  42.8MB/s ± 1%  +40.92%  (p=0.000 n=9+10)
Hash1KUnaligned       368MB/s ± 0%   394MB/s ± 0%   +7.07%  (p=0.000 n=9+9)
Hash8KUnaligned       404MB/s ± 1%   411MB/s ± 3%   +1.91%  (p=0.026 n=9+10)

Change-Id: I9a91fb52ea8d62964d5351bdf121e9fbc9282852
Reviewed-on: https://go-review.googlesource.com/c/137355
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
parent f1081589
...@@ -7,10 +7,7 @@ ...@@ -7,10 +7,7 @@
// This program generates md5block.go // This program generates md5block.go
// Invoke as // Invoke as
// //
// go run gen.go [-full] -output md5block.go // go run gen.go -output md5block.go
//
// The -full flag causes the generated code to do a full
// (16x) unrolling instead of a 4x unrolling.
package main package main
...@@ -56,13 +53,14 @@ type Data struct { ...@@ -56,13 +53,14 @@ type Data struct {
Table2 []uint32 Table2 []uint32
Table3 []uint32 Table3 []uint32
Table4 []uint32 Table4 []uint32
Full bool
} }
var funcs = template.FuncMap{ var funcs = template.FuncMap{
"dup": dup, "dup": dup,
"relabel": relabel, "relabel": relabel,
"rotate": rotate, "rotate": rotate,
"idx": idx,
"seq": seq,
} }
func dup(count int, x []int) []int { func dup(count int, x []int) []int {
...@@ -74,7 +72,7 @@ func dup(count int, x []int) []int { ...@@ -74,7 +72,7 @@ func dup(count int, x []int) []int {
} }
func relabel(s string) string { func relabel(s string) string {
return strings.NewReplacer("a", data.a, "b", data.b, "c", data.c, "d", data.d).Replace(s) return strings.NewReplacer("arg0", data.a, "arg1", data.b, "arg2", data.c, "arg3", data.d).Replace(s)
} }
func rotate() string { func rotate() string {
...@@ -82,8 +80,27 @@ func rotate() string { ...@@ -82,8 +80,27 @@ func rotate() string {
return "" // no output return "" // no output
} }
func init() { func idx(round, index int) int {
flag.BoolVar(&data.Full, "full", false, "complete unrolling") v := 0
switch round {
case 1:
v = index
case 2:
v = (1 + 5*index) & 15
case 3:
v = (5 + 3*index) & 15
case 4:
v = (7 * index) & 15
}
return v
}
func seq(i int) []int {
s := make([]int, i)
for i := range s {
s[i] = i
}
return s
} }
var data = Data{ var data = Data{
...@@ -179,152 +196,64 @@ var program = `// Copyright 2013 The Go Authors. All rights reserved. ...@@ -179,152 +196,64 @@ var program = `// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by go run gen.go{{if .Full}} -full{{end}} -output md5block.go; DO NOT EDIT. // Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
package md5 package md5
import ( import (
"unsafe" "encoding/binary"
"runtime" "math/bits"
) )
{{if not .Full}}
var t1 = [...]uint32{
{{range .Table1}}{{printf "\t%#x,\n" .}}{{end}}
}
var t2 = [...]uint32{
{{range .Table2}}{{printf "\t%#x,\n" .}}{{end}}
}
var t3 = [...]uint32{
{{range .Table3}}{{printf "\t%#x,\n" .}}{{end}}
}
var t4 = [...]uint32{
{{range .Table4}}{{printf "\t%#x,\n" .}}{{end}}
}
{{end}}
const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
var littleEndian bool
func init() {
x := uint32(0x04030201)
y := [4]byte{0x1, 0x2, 0x3, 0x4}
littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
}
func blockGeneric(dig *digest, p []byte) { func blockGeneric(dig *digest, p []byte) {
a := dig.s[0] // load state
b := dig.s[1] a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
c := dig.s[2]
d := dig.s[3]
var X *[16]uint32
var xbuf [16]uint32
for len(p) >= chunk {
aa, bb, cc, dd := a, b, c, d
// This is a constant condition - it is not evaluated on each iteration. for i := 0; i <= len(p)-BlockSize; i += BlockSize {
if x86 { // eliminate bounds checks on p
// MD5 was designed so that x86 processors can just iterate q := p[i:]
// over the block data directly as uint32s, and we generate q = q[:BlockSize:BlockSize]
// less code and run 1.3x faster if we take advantage of that.
// My apologies.
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
} else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
} else {
X = &xbuf
j := 0
for i := 0; i < 16; i++ {
X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
j += 4
}
}
{{if .Full}} // save current state
// Round 1. aa, bb, cc, dd := a, b, c, d
{{range $i, $s := dup 4 .Shift1}}
{{index $.Table1 $i | printf "a += (((c^d)&b)^d) + X[%d] + %d" $i | relabel}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
{{rotate}}
{{end}}
// Round 2. // load input block
{{range $i, $s := dup 4 .Shift2}} {{range $i := seq 16 -}}
{{index $.Table2 $i | printf "a += (((b^c)&d)^c) + X[(1+5*%d)&15] + %d" $i | relabel}} {{printf "x%x := binary.LittleEndian.Uint32(q[4*%#x:])" $i $i}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
{{rotate}}
{{end}} {{end}}
// Round 3. // round 1
{{range $i, $s := dup 4 .Shift3}} {{range $i, $s := dup 4 .Shift1 -}}
{{index $.Table3 $i | printf "a += (b^c^d) + X[(5+3*%d)&15] + %d" $i | relabel}} {{printf "arg0 = arg1 + bits.RotateLeft32((((arg2^arg3)&arg1)^arg3)+arg0+x%x+%#08x, %d)" (idx 1 $i) (index $.Table1 $i) $s | relabel}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} {{rotate -}}
{{rotate}}
{{end}} {{end}}
// Round 4. // round 2
{{range $i, $s := dup 4 .Shift4}} {{range $i, $s := dup 4 .Shift2 -}}
{{index $.Table4 $i | printf "a += (c^(b|^d)) + X[(7*%d)&15] + %d" $i | relabel}} {{printf "arg0 = arg1 + bits.RotateLeft32((((arg1^arg2)&arg3)^arg2)+arg0+x%x+%#08x, %d)" (idx 2 $i) (index $.Table2 $i) $s | relabel}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}} {{rotate -}}
{{rotate}}
{{end}}
{{else}}
// Round 1.
for i := uint(0); i < 16; {
{{range $s := .Shift1}}
{{printf "a += (((c^d)&b)^d) + X[i&15] + t1[i&15]" | relabel}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
i++
{{rotate}}
{{end}}
}
// Round 2.
for i := uint(0); i < 16; {
{{range $s := .Shift2}}
{{printf "a += (((b^c)&d)^c) + X[(1+5*i)&15] + t2[i&15]" | relabel}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
i++
{{rotate}}
{{end}} {{end}}
}
// Round 3. // round 3
for i := uint(0); i < 16; { {{range $i, $s := dup 4 .Shift3 -}}
{{range $s := .Shift3}} {{printf "arg0 = arg1 + bits.RotateLeft32((arg1^arg2^arg3)+arg0+x%x+%#08x, %d)" (idx 3 $i) (index $.Table3 $i) $s | relabel}}
{{printf "a += (b^c^d) + X[(5+3*i)&15] + t3[i&15]" | relabel}} {{rotate -}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
i++
{{rotate}}
{{end}} {{end}}
}
// Round 4. // round 4
for i := uint(0); i < 16; { {{range $i, $s := dup 4 .Shift4 -}}
{{range $s := .Shift4}} {{printf "arg0 = arg1 + bits.RotateLeft32((arg2^(arg1|^arg3))+arg0+x%x+%#08x, %d)" (idx 4 $i) (index $.Table4 $i) $s | relabel}}
{{printf "a += (c^(b|^d)) + X[(7*i)&15] + t4[i&15]" | relabel}} {{rotate -}}
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
i++
{{rotate}}
{{end}}
}
{{end}} {{end}}
// add saved state
a += aa a += aa
b += bb b += bb
c += cc c += cc
d += dd d += dd
p = p[chunk:]
} }
dig.s[0] = a // save state
dig.s[1] = b dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
dig.s[2] = c
dig.s[3] = d
} }
` `
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
//go:generate go run gen.go -full -output md5block.go //go:generate go run gen.go -output md5block.go
// Package md5 implements the MD5 hash algorithm as defined in RFC 1321. // Package md5 implements the MD5 hash algorithm as defined in RFC 1321.
// //
...@@ -12,6 +12,7 @@ package md5 ...@@ -12,6 +12,7 @@ package md5
import ( import (
"crypto" "crypto"
"encoding/binary"
"errors" "errors"
"hash" "hash"
) )
...@@ -27,7 +28,6 @@ const Size = 16 ...@@ -27,7 +28,6 @@ const Size = 16
const BlockSize = 64 const BlockSize = 64
const ( const (
chunk = 64
init0 = 0x67452301 init0 = 0x67452301
init1 = 0xEFCDAB89 init1 = 0xEFCDAB89
init2 = 0x98BADCFE init2 = 0x98BADCFE
...@@ -37,7 +37,7 @@ const ( ...@@ -37,7 +37,7 @@ const (
// digest represents the partial evaluation of a checksum. // digest represents the partial evaluation of a checksum.
type digest struct { type digest struct {
s [4]uint32 s [4]uint32
x [chunk]byte x [BlockSize]byte
nx int nx int
len uint64 len uint64
} }
...@@ -53,7 +53,7 @@ func (d *digest) Reset() { ...@@ -53,7 +53,7 @@ func (d *digest) Reset() {
const ( const (
magic = "md5\x01" magic = "md5\x01"
marshaledSize = len(magic) + 4*4 + chunk + 8 marshaledSize = len(magic) + 4*4 + BlockSize + 8
) )
func (d *digest) MarshalBinary() ([]byte, error) { func (d *digest) MarshalBinary() ([]byte, error) {
...@@ -83,45 +83,28 @@ func (d *digest) UnmarshalBinary(b []byte) error { ...@@ -83,45 +83,28 @@ func (d *digest) UnmarshalBinary(b []byte) error {
b, d.s[3] = consumeUint32(b) b, d.s[3] = consumeUint32(b)
b = b[copy(d.x[:], b):] b = b[copy(d.x[:], b):]
b, d.len = consumeUint64(b) b, d.len = consumeUint64(b)
d.nx = int(d.len) % chunk d.nx = int(d.len) % BlockSize
return nil return nil
} }
func appendUint64(b []byte, x uint64) []byte { func appendUint64(b []byte, x uint64) []byte {
a := [8]byte{ var a [8]byte
byte(x >> 56), binary.BigEndian.PutUint64(a[:], x)
byte(x >> 48),
byte(x >> 40),
byte(x >> 32),
byte(x >> 24),
byte(x >> 16),
byte(x >> 8),
byte(x),
}
return append(b, a[:]...) return append(b, a[:]...)
} }
func appendUint32(b []byte, x uint32) []byte { func appendUint32(b []byte, x uint32) []byte {
a := [4]byte{ var a [4]byte
byte(x >> 24), binary.BigEndian.PutUint32(a[:], x)
byte(x >> 16),
byte(x >> 8),
byte(x),
}
return append(b, a[:]...) return append(b, a[:]...)
} }
func consumeUint64(b []byte) ([]byte, uint64) { func consumeUint64(b []byte) ([]byte, uint64) {
_ = b[7] return b[8:], binary.BigEndian.Uint64(b[0:8])
x := uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 |
uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56
return b[8:], x
} }
func consumeUint32(b []byte) ([]byte, uint32) { func consumeUint32(b []byte) ([]byte, uint32) {
_ = b[3] return b[4:], binary.BigEndian.Uint32(b[0:4])
x := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
return b[4:], x
} }
// New returns a new hash.Hash computing the MD5 checksum. The Hash also // New returns a new hash.Hash computing the MD5 checksum. The Hash also
...@@ -138,20 +121,31 @@ func (d *digest) Size() int { return Size } ...@@ -138,20 +121,31 @@ func (d *digest) Size() int { return Size }
func (d *digest) BlockSize() int { return BlockSize } func (d *digest) BlockSize() int { return BlockSize }
func (d *digest) Write(p []byte) (nn int, err error) { func (d *digest) Write(p []byte) (nn int, err error) {
// Note that we currently call block or blockGeneric
// directly (guarded using haveAsm) because this allows
// escape analysis to see that p and d don't escape.
nn = len(p) nn = len(p)
d.len += uint64(nn) d.len += uint64(nn)
if d.nx > 0 { if d.nx > 0 {
n := copy(d.x[d.nx:], p) n := copy(d.x[d.nx:], p)
d.nx += n d.nx += n
if d.nx == chunk { if d.nx == BlockSize {
if haveAsm {
block(d, d.x[:]) block(d, d.x[:])
} else {
blockGeneric(d, d.x[:])
}
d.nx = 0 d.nx = 0
} }
p = p[n:] p = p[n:]
} }
if len(p) >= chunk { if len(p) >= BlockSize {
n := len(p) &^ (chunk - 1) n := len(p) &^ (BlockSize - 1)
if haveAsm {
block(d, p[:n]) block(d, p[:n])
} else {
blockGeneric(d, p[:n])
}
p = p[n:] p = p[n:]
} }
if len(p) > 0 { if len(p) > 0 {
...@@ -168,35 +162,27 @@ func (d *digest) Sum(in []byte) []byte { ...@@ -168,35 +162,27 @@ func (d *digest) Sum(in []byte) []byte {
} }
func (d *digest) checkSum() [Size]byte { func (d *digest) checkSum() [Size]byte {
// Padding. Add a 1 bit and 0 bits until 56 bytes mod 64. // Append 0x80 to the end of the message and then append zeros
len := d.len // until the length is a multiple of 56 bytes. Finally append
var tmp [64]byte // 8 bytes representing the message length in bits.
tmp[0] = 0x80 //
if len%64 < 56 { // 1 byte end marker :: 0-63 padding bytes :: 8 byte length
d.Write(tmp[0 : 56-len%64]) tmp := [1 + 63 + 8]byte{0x80}
} else { pad := (55 - d.len) % 64 // calculate number of padding bytes
d.Write(tmp[0 : 64+56-len%64]) binary.LittleEndian.PutUint64(tmp[1+pad:], d.len<<3) // append length in bits
} d.Write(tmp[:1+pad+8])
// Length in bits. // The previous write ensures that a whole number of
len <<= 3 // blocks (i.e. a multiple of 64 bytes) have been hashed.
for i := uint(0); i < 8; i++ {
tmp[i] = byte(len >> (8 * i))
}
d.Write(tmp[0:8])
if d.nx != 0 { if d.nx != 0 {
panic("d.nx != 0") panic("d.nx != 0")
} }
var digest [Size]byte var digest [Size]byte
for i, s := range d.s { binary.LittleEndian.PutUint32(digest[0:], d.s[0])
digest[i*4] = byte(s) binary.LittleEndian.PutUint32(digest[4:], d.s[1])
digest[i*4+1] = byte(s >> 8) binary.LittleEndian.PutUint32(digest[8:], d.s[2])
digest[i*4+2] = byte(s >> 16) binary.LittleEndian.PutUint32(digest[12:], d.s[3])
digest[i*4+3] = byte(s >> 24)
}
return digest return digest
} }
......
...@@ -2,263 +2,124 @@ ...@@ -2,263 +2,124 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Code generated by go run gen.go -full -output md5block.go; DO NOT EDIT. // Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
package md5 package md5
import ( import (
"runtime" "encoding/binary"
"unsafe" "math/bits"
) )
const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
var littleEndian bool
func init() {
x := uint32(0x04030201)
y := [4]byte{0x1, 0x2, 0x3, 0x4}
littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
}
func blockGeneric(dig *digest, p []byte) { func blockGeneric(dig *digest, p []byte) {
a := dig.s[0] // load state
b := dig.s[1] a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
c := dig.s[2]
d := dig.s[3]
var X *[16]uint32
var xbuf [16]uint32
for len(p) >= chunk {
aa, bb, cc, dd := a, b, c, d
// This is a constant condition - it is not evaluated on each iteration.
if x86 {
// MD5 was designed so that x86 processors can just iterate
// over the block data directly as uint32s, and we generate
// less code and run 1.3x faster if we take advantage of that.
// My apologies.
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
} else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
} else {
X = &xbuf
j := 0
for i := 0; i < 16; i++ {
X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
j += 4
}
}
// Round 1.
a += (((c ^ d) & b) ^ d) + X[0] + 3614090360
a = a<<7 | a>>(32-7) + b
d += (((b ^ c) & a) ^ c) + X[1] + 3905402710
d = d<<12 | d>>(32-12) + a
c += (((a ^ b) & d) ^ b) + X[2] + 606105819
c = c<<17 | c>>(32-17) + d
b += (((d ^ a) & c) ^ a) + X[3] + 3250441966
b = b<<22 | b>>(32-22) + c
a += (((c ^ d) & b) ^ d) + X[4] + 4118548399
a = a<<7 | a>>(32-7) + b
d += (((b ^ c) & a) ^ c) + X[5] + 1200080426
d = d<<12 | d>>(32-12) + a
c += (((a ^ b) & d) ^ b) + X[6] + 2821735955
c = c<<17 | c>>(32-17) + d
b += (((d ^ a) & c) ^ a) + X[7] + 4249261313
b = b<<22 | b>>(32-22) + c
a += (((c ^ d) & b) ^ d) + X[8] + 1770035416
a = a<<7 | a>>(32-7) + b
d += (((b ^ c) & a) ^ c) + X[9] + 2336552879
d = d<<12 | d>>(32-12) + a
c += (((a ^ b) & d) ^ b) + X[10] + 4294925233
c = c<<17 | c>>(32-17) + d
b += (((d ^ a) & c) ^ a) + X[11] + 2304563134
b = b<<22 | b>>(32-22) + c
a += (((c ^ d) & b) ^ d) + X[12] + 1804603682
a = a<<7 | a>>(32-7) + b
d += (((b ^ c) & a) ^ c) + X[13] + 4254626195
d = d<<12 | d>>(32-12) + a
c += (((a ^ b) & d) ^ b) + X[14] + 2792965006
c = c<<17 | c>>(32-17) + d
b += (((d ^ a) & c) ^ a) + X[15] + 1236535329
b = b<<22 | b>>(32-22) + c
// Round 2.
a += (((b ^ c) & d) ^ c) + X[(1+5*0)&15] + 4129170786
a = a<<5 | a>>(32-5) + b
d += (((a ^ b) & c) ^ b) + X[(1+5*1)&15] + 3225465664
d = d<<9 | d>>(32-9) + a
c += (((d ^ a) & b) ^ a) + X[(1+5*2)&15] + 643717713
c = c<<14 | c>>(32-14) + d
b += (((c ^ d) & a) ^ d) + X[(1+5*3)&15] + 3921069994
b = b<<20 | b>>(32-20) + c
a += (((b ^ c) & d) ^ c) + X[(1+5*4)&15] + 3593408605
a = a<<5 | a>>(32-5) + b
d += (((a ^ b) & c) ^ b) + X[(1+5*5)&15] + 38016083
d = d<<9 | d>>(32-9) + a
c += (((d ^ a) & b) ^ a) + X[(1+5*6)&15] + 3634488961
c = c<<14 | c>>(32-14) + d
b += (((c ^ d) & a) ^ d) + X[(1+5*7)&15] + 3889429448
b = b<<20 | b>>(32-20) + c
a += (((b ^ c) & d) ^ c) + X[(1+5*8)&15] + 568446438
a = a<<5 | a>>(32-5) + b
d += (((a ^ b) & c) ^ b) + X[(1+5*9)&15] + 3275163606
d = d<<9 | d>>(32-9) + a
c += (((d ^ a) & b) ^ a) + X[(1+5*10)&15] + 4107603335
c = c<<14 | c>>(32-14) + d
b += (((c ^ d) & a) ^ d) + X[(1+5*11)&15] + 1163531501
b = b<<20 | b>>(32-20) + c
a += (((b ^ c) & d) ^ c) + X[(1+5*12)&15] + 2850285829 for i := 0; i <= len(p)-BlockSize; i += BlockSize {
a = a<<5 | a>>(32-5) + b // eliminate bounds checks on p
q := p[i:]
q = q[:BlockSize:BlockSize]
d += (((a ^ b) & c) ^ b) + X[(1+5*13)&15] + 4243563512 // save current state
d = d<<9 | d>>(32-9) + a aa, bb, cc, dd := a, b, c, d
c += (((d ^ a) & b) ^ a) + X[(1+5*14)&15] + 1735328473
c = c<<14 | c>>(32-14) + d
b += (((c ^ d) & a) ^ d) + X[(1+5*15)&15] + 2368359562
b = b<<20 | b>>(32-20) + c
// Round 3.
a += (b ^ c ^ d) + X[(5+3*0)&15] + 4294588738
a = a<<4 | a>>(32-4) + b
d += (a ^ b ^ c) + X[(5+3*1)&15] + 2272392833
d = d<<11 | d>>(32-11) + a
c += (d ^ a ^ b) + X[(5+3*2)&15] + 1839030562
c = c<<16 | c>>(32-16) + d
b += (c ^ d ^ a) + X[(5+3*3)&15] + 4259657740
b = b<<23 | b>>(32-23) + c
a += (b ^ c ^ d) + X[(5+3*4)&15] + 2763975236
a = a<<4 | a>>(32-4) + b
d += (a ^ b ^ c) + X[(5+3*5)&15] + 1272893353
d = d<<11 | d>>(32-11) + a
c += (d ^ a ^ b) + X[(5+3*6)&15] + 4139469664
c = c<<16 | c>>(32-16) + d
b += (c ^ d ^ a) + X[(5+3*7)&15] + 3200236656
b = b<<23 | b>>(32-23) + c
a += (b ^ c ^ d) + X[(5+3*8)&15] + 681279174
a = a<<4 | a>>(32-4) + b
d += (a ^ b ^ c) + X[(5+3*9)&15] + 3936430074
d = d<<11 | d>>(32-11) + a
c += (d ^ a ^ b) + X[(5+3*10)&15] + 3572445317
c = c<<16 | c>>(32-16) + d
b += (c ^ d ^ a) + X[(5+3*11)&15] + 76029189
b = b<<23 | b>>(32-23) + c
a += (b ^ c ^ d) + X[(5+3*12)&15] + 3654602809
a = a<<4 | a>>(32-4) + b
d += (a ^ b ^ c) + X[(5+3*13)&15] + 3873151461
d = d<<11 | d>>(32-11) + a
c += (d ^ a ^ b) + X[(5+3*14)&15] + 530742520
c = c<<16 | c>>(32-16) + d
b += (c ^ d ^ a) + X[(5+3*15)&15] + 3299628645
b = b<<23 | b>>(32-23) + c
// Round 4.
a += (c ^ (b | ^d)) + X[(7*0)&15] + 4096336452
a = a<<6 | a>>(32-6) + b
d += (b ^ (a | ^c)) + X[(7*1)&15] + 1126891415
d = d<<10 | d>>(32-10) + a
c += (a ^ (d | ^b)) + X[(7*2)&15] + 2878612391
c = c<<15 | c>>(32-15) + d
b += (d ^ (c | ^a)) + X[(7*3)&15] + 4237533241
b = b<<21 | b>>(32-21) + c
a += (c ^ (b | ^d)) + X[(7*4)&15] + 1700485571
a = a<<6 | a>>(32-6) + b
d += (b ^ (a | ^c)) + X[(7*5)&15] + 2399980690
d = d<<10 | d>>(32-10) + a
c += (a ^ (d | ^b)) + X[(7*6)&15] + 4293915773
c = c<<15 | c>>(32-15) + d
b += (d ^ (c | ^a)) + X[(7*7)&15] + 2240044497
b = b<<21 | b>>(32-21) + c
a += (c ^ (b | ^d)) + X[(7*8)&15] + 1873313359
a = a<<6 | a>>(32-6) + b
d += (b ^ (a | ^c)) + X[(7*9)&15] + 4264355552
d = d<<10 | d>>(32-10) + a
c += (a ^ (d | ^b)) + X[(7*10)&15] + 2734768916
c = c<<15 | c>>(32-15) + d
b += (d ^ (c | ^a)) + X[(7*11)&15] + 1309151649
b = b<<21 | b>>(32-21) + c
a += (c ^ (b | ^d)) + X[(7*12)&15] + 4149444226
a = a<<6 | a>>(32-6) + b
d += (b ^ (a | ^c)) + X[(7*13)&15] + 3174756917
d = d<<10 | d>>(32-10) + a
c += (a ^ (d | ^b)) + X[(7*14)&15] + 718787259
c = c<<15 | c>>(32-15) + d
b += (d ^ (c | ^a)) + X[(7*15)&15] + 3951481745
b = b<<21 | b>>(32-21) + c
// load input block
x0 := binary.LittleEndian.Uint32(q[4*0x0:])
x1 := binary.LittleEndian.Uint32(q[4*0x1:])
x2 := binary.LittleEndian.Uint32(q[4*0x2:])
x3 := binary.LittleEndian.Uint32(q[4*0x3:])
x4 := binary.LittleEndian.Uint32(q[4*0x4:])
x5 := binary.LittleEndian.Uint32(q[4*0x5:])
x6 := binary.LittleEndian.Uint32(q[4*0x6:])
x7 := binary.LittleEndian.Uint32(q[4*0x7:])
x8 := binary.LittleEndian.Uint32(q[4*0x8:])
x9 := binary.LittleEndian.Uint32(q[4*0x9:])
xa := binary.LittleEndian.Uint32(q[4*0xa:])
xb := binary.LittleEndian.Uint32(q[4*0xb:])
xc := binary.LittleEndian.Uint32(q[4*0xc:])
xd := binary.LittleEndian.Uint32(q[4*0xd:])
xe := binary.LittleEndian.Uint32(q[4*0xe:])
xf := binary.LittleEndian.Uint32(q[4*0xf:])
// round 1
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22)
// round 2
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20)
// round 3
a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23)
a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4)
d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23)
// round 4
a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21)
// add saved state
a += aa a += aa
b += bb b += bb
c += cc c += cc
d += dd d += dd
p = p[chunk:]
} }
dig.s[0] = a // save state
dig.s[1] = b dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
dig.s[2] = c
dig.s[3] = d
} }
...@@ -6,6 +6,8 @@ ...@@ -6,6 +6,8 @@
package md5 package md5
const haveAsm = true
//go:noescape //go:noescape
func block(dig *digest, p []byte) func block(dig *digest, p []byte)
...@@ -6,4 +6,6 @@ ...@@ -6,4 +6,6 @@
package md5 package md5
const haveAsm = false
var block = blockGeneric var block = blockGeneric
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment