crypto/md5: simplify generic implementation

This change uses library functions such as bits.RotateLeft32 to reduce the amount of code needed in the generic implementation. Since the code is now shorter I've also removed the option to generate a non-unrolled version of the code. I've also tried to remove bounds checks where possible to make the new version performant, however that is not the primary goal of this change since most architectures have assembly implementations already. Assembly performance: name old speed new speed delta Hash8Bytes 50.3MB/s ± 1% 59.1MB/s ± 0% +17.63% (p=0.000 n=9+8) Hash1K 590MB/s ± 0% 597MB/s ± 0% +1.25% (p=0.000 n=9+9) Hash8K 636MB/s ± 1% 638MB/s ± 1% ~ (p=0.072 n=10+10) Hash8BytesUnaligned 50.5MB/s ± 0% 59.1MB/s ± 1% +17.09% (p=0.000 n=10+10) Hash1KUnaligned 589MB/s ± 1% 596MB/s ± 1% +1.23% (p=0.000 n=9+10) Hash8KUnaligned 638MB/s ± 1% 640MB/s ± 0% +0.35% (p=0.002 n=10+10) Pure Go performance: name old speed new speed delta Hash8Bytes 30.3MB/s ± 1% 42.8MB/s ± 0% +41.20% (p=0.000 n=9+9) Hash1K 364MB/s ± 4% 394MB/s ± 1% +8.27% (p=0.000 n=10+10) Hash8K 404MB/s ± 1% 420MB/s ± 0% +4.17% (p=0.000 n=10+9) Hash8BytesUnaligned 30.3MB/s ± 1% 42.8MB/s ± 1% +40.92% (p=0.000 n=9+10) Hash1KUnaligned 368MB/s ± 0% 394MB/s ± 0% +7.07% (p=0.000 n=9+9) Hash8KUnaligned 404MB/s ± 1% 411MB/s ± 3% +1.91% (p=0.026 n=9+10) Change-Id: I9a91fb52ea8d62964d5351bdf121e9fbc9282852 Reviewed-on: https://go-review.googlesource.com/c/137355 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

crypto/md5: simplify generic implementation
This change uses library functions such as bits.RotateLeft32 to reduce the amount of code needed in the generic implementation. Since the code is now shorter I've also removed the option to generate a non-unrolled version of the code. I've also tried to remove bounds checks where possible to make the new version performant, however that is not the primary goal of this change since most architectures have assembly implementations already. Assembly performance: name old speed new speed delta Hash8Bytes 50.3MB/s ± 1% 59.1MB/s ± 0% +17.63% (p=0.000 n=9+8) Hash1K 590MB/s ± 0% 597MB/s ± 0% +1.25% (p=0.000 n=9+9) Hash8K 636MB/s ± 1% 638MB/s ± 1% ~ (p=0.072 n=10+10) Hash8BytesUnaligned 50.5MB/s ± 0% 59.1MB/s ± 1% +17.09% (p=0.000 n=10+10) Hash1KUnaligned 589MB/s ± 1% 596MB/s ± 1% +1.23% (p=0.000 n=9+10) Hash8KUnaligned 638MB/s ± 1% 640MB/s ± 0% +0.35% (p=0.002 n=10+10) Pure Go performance: name old speed new speed delta Hash8Bytes 30.3MB/s ± 1% 42.8MB/s ± 0% +41.20% (p=0.000 n=9+9) Hash1K 364MB/s ± 4% 394MB/s ± 1% +8.27% (p=0.000 n=10+10) Hash8K 404MB/s ± 1% 420MB/s ± 0% +4.17% (p=0.000 n=10+9) Hash8BytesUnaligned 30.3MB/s ± 1% 42.8MB/s ± 1% +40.92% (p=0.000 n=9+10) Hash1KUnaligned 368MB/s ± 0% 394MB/s ± 0% +7.07% (p=0.000 n=9+9) Hash8KUnaligned 404MB/s ± 1% 411MB/s ± 3% +1.91% (p=0.026 n=9+10) Change-Id: I9a91fb52ea8d62964d5351bdf121e9fbc9282852 Reviewed-on: https://go-review.googlesource.com/c/137355 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
7f998453 · Michael Munday · f1081589 · 7f998453 · 7f998453 · 7f998453
Commit 7f998453 authored May 29, 2018 by Michael Munday
5 changed files
--- a/src/crypto/md5/gen.go
+++ b/src/crypto/md5/gen.go
@@ -7,10 +7,7 @@
 // This program generates md5block.go
 // Invoke as
 //
-//	go run gen.go [-full] -output md5block.go
+//	go run gen.go -output md5block.go
-//
-// The -full flag causes the generated code to do a full
-// (16x) unrolling instead of a 4x unrolling.
 package main
@@ -56,13 +53,14 @@ type Data struct {
 	Table2     []uint32
 	Table3     []uint32
 	Table4     []uint32
-	Full       bool
 }
 var funcs = template.FuncMap{
 	"dup":     dup,
 	"relabel": relabel,
 	"rotate":  rotate,
+	"idx":     idx,
+	"seq":     seq,
 }
 func dup(count int, x []int) []int {
@@ -74,7 +72,7 @@ func dup(count int, x []int) []int {
 }
 func relabel(s string) string {
-	return strings.NewReplacer("a", data.a, "b", data.b, "c", data.c, "d", data.d).Replace(s)
+	return strings.NewReplacer("arg0", data.a, "arg1", data.b, "arg2", data.c, "arg3", data.d).Replace(s)
 }
 func rotate() string {
@@ -82,8 +80,27 @@ func rotate() string {
 	return "" // no output
 }
-func init() {
+func idx(round, index int) int {
-	flag.BoolVar(&data.Full, "full", false, "complete unrolling")
+	v := 0
+	switch round {
+	case 1:
+		v = index
+	case 2:
+		v = (1 + 5*index) & 15
+	case 3:
+		v = (5 + 3*index) & 15
+	case 4:
+		v = (7 * index) & 15
+	}
+	return v
+}
+func seq(i int) []int {
+	s := make([]int, i)
+	for i := range s {
+		s[i] = i
+	}
+	return s
 }
 var data = Data{
@@ -179,152 +196,64 @@ var program = `// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// Code generated by go run gen.go{{if .Full}} -full{{end}} -output md5block.go; DO NOT EDIT.
+// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
 package md5
 import (
-	"unsafe"
+	"encoding/binary"
-	"runtime"
+	"math/bits"
 )
-{{if not .Full}}
+func blockGeneric(dig *digest, p []byte) {
-	var t1 = [...]uint32{
+	// load state
-	{{range .Table1}}{{printf "\t%#x,\n" .}}{{end}}
+	a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
-	}
-	var t2 = [...]uint32{
-	{{range .Table2}}{{printf "\t%#x,\n" .}}{{end}}
-	}
-	var t3 = [...]uint32{
-	{{range .Table3}}{{printf "\t%#x,\n" .}}{{end}}
-	}
-	var t4 = [...]uint32{
-	{{range .Table4}}{{printf "\t%#x,\n" .}}{{end}}
-	}
-{{end}}
-const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
-var littleEndian bool
-func init() {
+	for i := 0; i <= len(p)-BlockSize; i += BlockSize {
-	x := uint32(0x04030201)
+		// eliminate bounds checks on p
-	y := [4]byte{0x1, 0x2, 0x3, 0x4}
+		q := p[i:]
-	littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
+		q = q[:BlockSize:BlockSize]
-}
-func blockGeneric(dig *digest, p []byte) {
+		// save current state
-	a := dig.s[0]
-	b := dig.s[1]
-	c := dig.s[2]
-	d := dig.s[3]
-	var X *[16]uint32
-	var xbuf [16]uint32
-	for len(p) >= chunk {
 		aa, bb, cc, dd := a, b, c, d
-		// This is a constant condition - it is not evaluated on each iteration.
+		// load input block
-		if x86 {
+		{{range $i := seq 16 -}}
-			// MD5 was designed so that x86 processors can just iterate
+			{{printf "x%x := binary.LittleEndian.Uint32(q[4*%#x:])" $i $i}}
-			// over the block data directly as uint32s, and we generate
+		{{end}}
-			// less code and run 1.3x faster if we take advantage of that.
-			// My apologies.
-			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
-		} else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
-			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
-		} else {
-			X = &xbuf
-			j := 0
-			for i := 0; i < 16; i++ {
-				X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
-				j += 4
-			}
-		}
-		{{if .Full}}
+		// round 1
-			// Round 1.
+		{{range $i, $s := dup 4 .Shift1 -}}
-			{{range $i, $s := dup 4 .Shift1}}
+			{{printf "arg0 = arg1 + bits.RotateLeft32((((arg2^arg3)&arg1)^arg3)+arg0+x%x+%#08x, %d)" (idx 1 $i) (index $.Table1 $i) $s | relabel}}
-				{{index $.Table1 $i | printf "a += (((c^d)&b)^d) + X[%d] + %d" $i | relabel}}
+			{{rotate -}}
-				{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+		{{end}}
-				{{rotate}}
-			{{end}}
-			// Round 2.
-			{{range $i, $s := dup 4 .Shift2}}
-				{{index $.Table2 $i | printf "a += (((b^c)&d)^c) + X[(1+5*%d)&15] + %d" $i | relabel}}
-				{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
-				{{rotate}}
-			{{end}}
-			// Round 3.
-			{{range $i, $s := dup 4 .Shift3}}
-				{{index $.Table3 $i | printf "a += (b^c^d) + X[(5+3*%d)&15] + %d" $i | relabel}}
-				{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
-				{{rotate}}
-			{{end}}
-			// Round 4.
-			{{range $i, $s := dup 4 .Shift4}}
-				{{index $.Table4 $i | printf "a += (c^(b|^d)) + X[(7*%d)&15] + %d" $i | relabel}}
-				{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
-				{{rotate}}
-			{{end}}
-		{{else}}
-			// Round 1.
-			for i := uint(0); i < 16; {
-				{{range $s := .Shift1}}
-					{{printf "a += (((c^d)&b)^d) + X[i&15] + t1[i&15]" | relabel}}
-					{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
-					i++
-					{{rotate}}
-				{{end}}
-			}
-			// Round 2.
+		// round 2
-			for i := uint(0); i < 16; {
+		{{range $i, $s := dup 4 .Shift2 -}}
-				{{range $s := .Shift2}}
+			{{printf "arg0 = arg1 + bits.RotateLeft32((((arg1^arg2)&arg3)^arg2)+arg0+x%x+%#08x, %d)" (idx 2 $i) (index $.Table2 $i) $s | relabel}}
-					{{printf "a += (((b^c)&d)^c) + X[(1+5*i)&15] + t2[i&15]" | relabel}}
+			{{rotate -}}
-					{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+		{{end}}
-					i++
-					{{rotate}}
-				{{end}}
-			}
-			// Round 3.
+		// round 3
-			for i := uint(0); i < 16; {
+		{{range $i, $s := dup 4 .Shift3 -}}
-				{{range $s := .Shift3}}
+			{{printf "arg0 = arg1 + bits.RotateLeft32((arg1^arg2^arg3)+arg0+x%x+%#08x, %d)" (idx 3 $i) (index $.Table3 $i) $s | relabel}}
-					{{printf "a += (b^c^d) + X[(5+3*i)&15] + t3[i&15]" | relabel}}
+			{{rotate -}}
-					{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
+		{{end}}
-					i++
-					{{rotate}}
-				{{end}}
-			}
-			// Round 4.
+		// round 4
-			for i := uint(0); i < 16; {
+		{{range $i, $s := dup 4 .Shift4 -}}
-				{{range $s := .Shift4}}
+			{{printf "arg0 = arg1 + bits.RotateLeft32((arg2^(arg1|^arg3))+arg0+x%x+%#08x, %d)" (idx 4 $i) (index $.Table4 $i) $s | relabel}}
-					{{printf "a += (c^(b|^d)) + X[(7*i)&15] + t4[i&15]" | relabel}}
+			{{rotate -}}
-					{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
-					i++
-					{{rotate}}
-				{{end}}
-			}
 		{{end}}
+		// add saved state
 		a += aa
 		b += bb
 		c += cc
 		d += dd
-		p = p[chunk:]
 	}
-	dig.s[0] = a
+	// save state
-	dig.s[1] = b
+	dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
-	dig.s[2] = c
-	dig.s[3] = d
 }
 `
--- a/src/crypto/md5/md5.go
+++ b/src/crypto/md5/md5.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-//go:generate go run gen.go -full -output md5block.go
+//go:generate go run gen.go -output md5block.go
 // Package md5 implements the MD5 hash algorithm as defined in RFC 1321.
 //
@@ -12,6 +12,7 @@ package md5
 import (
 	"crypto"
+	"encoding/binary"
 	"errors"
 	"hash"
 )
@@ -27,7 +28,6 @@ const Size = 16
 const BlockSize = 64
 const (
-	chunk = 64
 	init0 = 0x67452301
 	init1 = 0xEFCDAB89
 	init2 = 0x98BADCFE
@@ -37,7 +37,7 @@ const (
 // digest represents the partial evaluation of a checksum.
 type digest struct {
 	s   [4]uint32
-	x   [chunk]byte
+	x   [BlockSize]byte
 	nx  int
 	len uint64
 }
@@ -53,7 +53,7 @@ func (d *digest) Reset() {
 const (
 	magic         = "md5\x01"
-	marshaledSize = len(magic) + 4*4 + chunk + 8
+	marshaledSize = len(magic) + 4*4 + BlockSize + 8
 )
 func (d *digest) MarshalBinary() ([]byte, error) {
@@ -83,45 +83,28 @@ func (d *digest) UnmarshalBinary(b []byte) error {
 	b, d.s[3] = consumeUint32(b)
 	b = b[copy(d.x[:], b):]
 	b, d.len = consumeUint64(b)
-	d.nx = int(d.len) % chunk
+	d.nx = int(d.len) % BlockSize
 	return nil
 }
 func appendUint64(b []byte, x uint64) []byte {
-	a := [8]byte{
+	var a [8]byte
-		byte(x >> 56),
+	binary.BigEndian.PutUint64(a[:], x)
-		byte(x >> 48),
-		byte(x >> 40),
-		byte(x >> 32),
-		byte(x >> 24),
-		byte(x >> 16),
-		byte(x >> 8),
-		byte(x),
-	}
 	return append(b, a[:]...)
 }
 func appendUint32(b []byte, x uint32) []byte {
-	a := [4]byte{
+	var a [4]byte
-		byte(x >> 24),
+	binary.BigEndian.PutUint32(a[:], x)
-		byte(x >> 16),
-		byte(x >> 8),
-		byte(x),
-	}
 	return append(b, a[:]...)
 }
 func consumeUint64(b []byte) ([]byte, uint64) {
-	_ = b[7]
+	return b[8:], binary.BigEndian.Uint64(b[0:8])
-	x := uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 |
-		uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56
-	return b[8:], x
 }
 func consumeUint32(b []byte) ([]byte, uint32) {
-	_ = b[3]
+	return b[4:], binary.BigEndian.Uint32(b[0:4])
-	x := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
-	return b[4:], x
 }
 // New returns a new hash.Hash computing the MD5 checksum. The Hash also
@@ -138,20 +121,31 @@ func (d *digest) Size() int { return Size }
 func (d *digest) BlockSize() int { return BlockSize }
 func (d *digest) Write(p []byte) (nn int, err error) {
+	// Note that we currently call block or blockGeneric
+	// directly (guarded using haveAsm) because this allows
+	// escape analysis to see that p and d don't escape.
 	nn = len(p)
 	d.len += uint64(nn)
 	if d.nx > 0 {
 		n := copy(d.x[d.nx:], p)
 		d.nx += n
-		if d.nx == chunk {
+		if d.nx == BlockSize {
-			block(d, d.x[:])
+			if haveAsm {
+				block(d, d.x[:])
+			} else {
+				blockGeneric(d, d.x[:])
+			}
 			d.nx = 0
 		}
 		p = p[n:]
 	}
-	if len(p) >= chunk {
+	if len(p) >= BlockSize {
-		n := len(p) &^ (chunk - 1)
+		n := len(p) &^ (BlockSize - 1)
-		block(d, p[:n])
+		if haveAsm {
+			block(d, p[:n])
+		} else {
+			blockGeneric(d, p[:n])
+		}
 		p = p[n:]
 	}
 	if len(p) > 0 {
@@ -168,35 +162,27 @@ func (d *digest) Sum(in []byte) []byte {
 }
 func (d *digest) checkSum() [Size]byte {
-	// Padding. Add a 1 bit and 0 bits until 56 bytes mod 64.
+	// Append 0x80 to the end of the message and then append zeros
-	len := d.len
+	// until the length is a multiple of 56 bytes. Finally append
-	var tmp [64]byte
+	// 8 bytes representing the message length in bits.
-	tmp[0] = 0x80
+	//
-	if len%64 < 56 {
+	// 1 byte end marker :: 0-63 padding bytes :: 8 byte length
-		d.Write(tmp[0 : 56-len%64])
+	tmp := [1 + 63 + 8]byte{0x80}
-	} else {
+	pad := (55 - d.len) % 64                             // calculate number of padding bytes
-		d.Write(tmp[0 : 64+56-len%64])
+	binary.LittleEndian.PutUint64(tmp[1+pad:], d.len<<3) // append length in bits
-	}
+	d.Write(tmp[:1+pad+8])
-	// Length in bits.
+	// The previous write ensures that a whole number of
-	len <<= 3
+	// blocks (i.e. a multiple of 64 bytes) have been hashed.
-	for i := uint(0); i < 8; i++ {
-		tmp[i] = byte(len >> (8 * i))
-	}
-	d.Write(tmp[0:8])
 	if d.nx != 0 {
 		panic("d.nx != 0")
 	}
 	var digest [Size]byte
-	for i, s := range d.s {
+	binary.LittleEndian.PutUint32(digest[0:], d.s[0])
-		digest[i*4] = byte(s)
+	binary.LittleEndian.PutUint32(digest[4:], d.s[1])
-		digest[i*4+1] = byte(s >> 8)
+	binary.LittleEndian.PutUint32(digest[8:], d.s[2])
-		digest[i*4+2] = byte(s >> 16)
+	binary.LittleEndian.PutUint32(digest[12:], d.s[3])
-		digest[i*4+3] = byte(s >> 24)
-	}
 	return digest
 }

--- a/src/crypto/md5/md5block.go
+++ b/src/crypto/md5/md5block.go
@@ -2,263 +2,124 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// Code generated by go run gen.go -full -output md5block.go; DO NOT EDIT.
+// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
 package md5
 import (
-	"runtime"
+	"encoding/binary"
-	"unsafe"
+	"math/bits"
 )
-const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
-var littleEndian bool
-func init() {
-	x := uint32(0x04030201)
-	y := [4]byte{0x1, 0x2, 0x3, 0x4}
-	littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
-}
 func blockGeneric(dig *digest, p []byte) {
-	a := dig.s[0]
+	// load state
-	b := dig.s[1]
+	a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
-	c := dig.s[2]
-	d := dig.s[3]
-	var X *[16]uint32
-	var xbuf [16]uint32
-	for len(p) >= chunk {
-		aa, bb, cc, dd := a, b, c, d
-		// This is a constant condition - it is not evaluated on each iteration.
-		if x86 {
-			// MD5 was designed so that x86 processors can just iterate
-			// over the block data directly as uint32s, and we generate
-			// less code and run 1.3x faster if we take advantage of that.
-			// My apologies.
-			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
-		} else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
-			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
-		} else {
-			X = &xbuf
-			j := 0
-			for i := 0; i < 16; i++ {
-				X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
-				j += 4
-			}
-		}
-		// Round 1.
-		a += (((c ^ d) & b) ^ d) + X[0] + 3614090360
-		a = a<<7 | a>>(32-7) + b
-		d += (((b ^ c) & a) ^ c) + X[1] + 3905402710
-		d = d<<12 | d>>(32-12) + a
-		c += (((a ^ b) & d) ^ b) + X[2] + 606105819
-		c = c<<17 | c>>(32-17) + d
-		b += (((d ^ a) & c) ^ a) + X[3] + 3250441966
-		b = b<<22 | b>>(32-22) + c
-		a += (((c ^ d) & b) ^ d) + X[4] + 4118548399
-		a = a<<7 | a>>(32-7) + b
-		d += (((b ^ c) & a) ^ c) + X[5] + 1200080426
-		d = d<<12 | d>>(32-12) + a
-		c += (((a ^ b) & d) ^ b) + X[6] + 2821735955
-		c = c<<17 | c>>(32-17) + d
-		b += (((d ^ a) & c) ^ a) + X[7] + 4249261313
-		b = b<<22 | b>>(32-22) + c
-		a += (((c ^ d) & b) ^ d) + X[8] + 1770035416
-		a = a<<7 | a>>(32-7) + b
-		d += (((b ^ c) & a) ^ c) + X[9] + 2336552879
-		d = d<<12 | d>>(32-12) + a
-		c += (((a ^ b) & d) ^ b) + X[10] + 4294925233
-		c = c<<17 | c>>(32-17) + d
-		b += (((d ^ a) & c) ^ a) + X[11] + 2304563134
-		b = b<<22 | b>>(32-22) + c
-		a += (((c ^ d) & b) ^ d) + X[12] + 1804603682
-		a = a<<7 | a>>(32-7) + b
-		d += (((b ^ c) & a) ^ c) + X[13] + 4254626195
-		d = d<<12 | d>>(32-12) + a
-		c += (((a ^ b) & d) ^ b) + X[14] + 2792965006
-		c = c<<17 | c>>(32-17) + d
-		b += (((d ^ a) & c) ^ a) + X[15] + 1236535329
-		b = b<<22 | b>>(32-22) + c
-		// Round 2.
-		a += (((b ^ c) & d) ^ c) + X[(1+5*0)&15] + 4129170786
-		a = a<<5 | a>>(32-5) + b
-		d += (((a ^ b) & c) ^ b) + X[(1+5*1)&15] + 3225465664
-		d = d<<9 | d>>(32-9) + a
-		c += (((d ^ a) & b) ^ a) + X[(1+5*2)&15] + 643717713
-		c = c<<14 | c>>(32-14) + d
-		b += (((c ^ d) & a) ^ d) + X[(1+5*3)&15] + 3921069994
-		b = b<<20 | b>>(32-20) + c
-		a += (((b ^ c) & d) ^ c) + X[(1+5*4)&15] + 3593408605
-		a = a<<5 | a>>(32-5) + b
-		d += (((a ^ b) & c) ^ b) + X[(1+5*5)&15] + 38016083
-		d = d<<9 | d>>(32-9) + a
-		c += (((d ^ a) & b) ^ a) + X[(1+5*6)&15] + 3634488961
-		c = c<<14 | c>>(32-14) + d
-		b += (((c ^ d) & a) ^ d) + X[(1+5*7)&15] + 3889429448
-		b = b<<20 | b>>(32-20) + c
-		a += (((b ^ c) & d) ^ c) + X[(1+5*8)&15] + 568446438
-		a = a<<5 | a>>(32-5) + b
-		d += (((a ^ b) & c) ^ b) + X[(1+5*9)&15] + 3275163606
-		d = d<<9 | d>>(32-9) + a
-		c += (((d ^ a) & b) ^ a) + X[(1+5*10)&15] + 4107603335
-		c = c<<14 | c>>(32-14) + d
-		b += (((c ^ d) & a) ^ d) + X[(1+5*11)&15] + 1163531501
-		b = b<<20 | b>>(32-20) + c
-		a += (((b ^ c) & d) ^ c) + X[(1+5*12)&15] + 2850285829
-		a = a<<5 | a>>(32-5) + b
-		d += (((a ^ b) & c) ^ b) + X[(1+5*13)&15] + 4243563512
+	for i := 0; i <= len(p)-BlockSize; i += BlockSize {
-		d = d<<9 | d>>(32-9) + a
+		// eliminate bounds checks on p
+		q := p[i:]
+		q = q[:BlockSize:BlockSize]
-		c += (((d ^ a) & b) ^ a) + X[(1+5*14)&15] + 1735328473
+		// save current state
-		c = c<<14 | c>>(32-14) + d
+		aa, bb, cc, dd := a, b, c, d
-		b += (((c ^ d) & a) ^ d) + X[(1+5*15)&15] + 2368359562
-		b = b<<20 | b>>(32-20) + c
-		// Round 3.
-		a += (b ^ c ^ d) + X[(5+3*0)&15] + 4294588738
-		a = a<<4 | a>>(32-4) + b
-		d += (a ^ b ^ c) + X[(5+3*1)&15] + 2272392833
-		d = d<<11 | d>>(32-11) + a
-		c += (d ^ a ^ b) + X[(5+3*2)&15] + 1839030562
-		c = c<<16 | c>>(32-16) + d
-		b += (c ^ d ^ a) + X[(5+3*3)&15] + 4259657740
-		b = b<<23 | b>>(32-23) + c
-		a += (b ^ c ^ d) + X[(5+3*4)&15] + 2763975236
-		a = a<<4 | a>>(32-4) + b
-		d += (a ^ b ^ c) + X[(5+3*5)&15] + 1272893353
-		d = d<<11 | d>>(32-11) + a
-		c += (d ^ a ^ b) + X[(5+3*6)&15] + 4139469664
-		c = c<<16 | c>>(32-16) + d
-		b += (c ^ d ^ a) + X[(5+3*7)&15] + 3200236656
-		b = b<<23 | b>>(32-23) + c
-		a += (b ^ c ^ d) + X[(5+3*8)&15] + 681279174
-		a = a<<4 | a>>(32-4) + b
-		d += (a ^ b ^ c) + X[(5+3*9)&15] + 3936430074
-		d = d<<11 | d>>(32-11) + a
-		c += (d ^ a ^ b) + X[(5+3*10)&15] + 3572445317
-		c = c<<16 | c>>(32-16) + d
-		b += (c ^ d ^ a) + X[(5+3*11)&15] + 76029189
-		b = b<<23 | b>>(32-23) + c
-		a += (b ^ c ^ d) + X[(5+3*12)&15] + 3654602809
-		a = a<<4 | a>>(32-4) + b
-		d += (a ^ b ^ c) + X[(5+3*13)&15] + 3873151461
-		d = d<<11 | d>>(32-11) + a
-		c += (d ^ a ^ b) + X[(5+3*14)&15] + 530742520
-		c = c<<16 | c>>(32-16) + d
-		b += (c ^ d ^ a) + X[(5+3*15)&15] + 3299628645
-		b = b<<23 | b>>(32-23) + c
-		// Round 4.
-		a += (c ^ (b | ^d)) + X[(7*0)&15] + 4096336452
-		a = a<<6 | a>>(32-6) + b
-		d += (b ^ (a | ^c)) + X[(7*1)&15] + 1126891415
-		d = d<<10 | d>>(32-10) + a
-		c += (a ^ (d | ^b)) + X[(7*2)&15] + 2878612391
-		c = c<<15 | c>>(32-15) + d
-		b += (d ^ (c | ^a)) + X[(7*3)&15] + 4237533241
-		b = b<<21 | b>>(32-21) + c
-		a += (c ^ (b | ^d)) + X[(7*4)&15] + 1700485571
-		a = a<<6 | a>>(32-6) + b
-		d += (b ^ (a | ^c)) + X[(7*5)&15] + 2399980690
-		d = d<<10 | d>>(32-10) + a
-		c += (a ^ (d | ^b)) + X[(7*6)&15] + 4293915773
-		c = c<<15 | c>>(32-15) + d
-		b += (d ^ (c | ^a)) + X[(7*7)&15] + 2240044497
-		b = b<<21 | b>>(32-21) + c
-		a += (c ^ (b | ^d)) + X[(7*8)&15] + 1873313359
-		a = a<<6 | a>>(32-6) + b
-		d += (b ^ (a | ^c)) + X[(7*9)&15] + 4264355552
-		d = d<<10 | d>>(32-10) + a
-		c += (a ^ (d | ^b)) + X[(7*10)&15] + 2734768916
-		c = c<<15 | c>>(32-15) + d
-		b += (d ^ (c | ^a)) + X[(7*11)&15] + 1309151649
-		b = b<<21 | b>>(32-21) + c
-		a += (c ^ (b | ^d)) + X[(7*12)&15] + 4149444226
-		a = a<<6 | a>>(32-6) + b
-		d += (b ^ (a | ^c)) + X[(7*13)&15] + 3174756917
-		d = d<<10 | d>>(32-10) + a
-		c += (a ^ (d | ^b)) + X[(7*14)&15] + 718787259
-		c = c<<15 | c>>(32-15) + d
-		b += (d ^ (c | ^a)) + X[(7*15)&15] + 3951481745
-		b = b<<21 | b>>(32-21) + c
+		// load input block
+		x0 := binary.LittleEndian.Uint32(q[4*0x0:])
+		x1 := binary.LittleEndian.Uint32(q[4*0x1:])
+		x2 := binary.LittleEndian.Uint32(q[4*0x2:])
+		x3 := binary.LittleEndian.Uint32(q[4*0x3:])
+		x4 := binary.LittleEndian.Uint32(q[4*0x4:])
+		x5 := binary.LittleEndian.Uint32(q[4*0x5:])
+		x6 := binary.LittleEndian.Uint32(q[4*0x6:])
+		x7 := binary.LittleEndian.Uint32(q[4*0x7:])
+		x8 := binary.LittleEndian.Uint32(q[4*0x8:])
+		x9 := binary.LittleEndian.Uint32(q[4*0x9:])
+		xa := binary.LittleEndian.Uint32(q[4*0xa:])
+		xb := binary.LittleEndian.Uint32(q[4*0xb:])
+		xc := binary.LittleEndian.Uint32(q[4*0xc:])
+		xd := binary.LittleEndian.Uint32(q[4*0xd:])
+		xe := binary.LittleEndian.Uint32(q[4*0xe:])
+		xf := binary.LittleEndian.Uint32(q[4*0xf:])
+		// round 1
+		a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7)
+		d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12)
+		c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17)
+		b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22)
+		a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7)
+		d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12)
+		c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17)
+		b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22)
+		a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7)
+		d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12)
+		c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17)
+		b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22)
+		a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7)
+		d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12)
+		c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17)
+		b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22)
+		// round 2
+		a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5)
+		d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9)
+		c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14)
+		b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20)
+		a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5)
+		d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9)
+		c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14)
+		b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20)
+		a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5)
+		d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9)
+		c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14)
+		b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20)
+		a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5)
+		d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9)
+		c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14)
+		b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20)
+		// round 3
+		a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4)
+		d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11)
+		c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16)
+		b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23)
+		a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4)
+		d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11)
+		c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16)
+		b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23)
+		a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4)
+		d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11)
+		c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16)
+		b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23)
+		a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4)
+		d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11)
+		c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16)
+		b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23)
+		// round 4
+		a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6)
+		d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10)
+		c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15)
+		b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21)
+		a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6)
+		d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10)
+		c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15)
+		b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21)
+		a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6)
+		d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10)
+		c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15)
+		b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21)
+		a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6)
+		d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10)
+		c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15)
+		b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21)
+		// add saved state
 		a += aa
 		b += bb
 		c += cc
 		d += dd
-		p = p[chunk:]
 	}
-	dig.s[0] = a
+	// save state
-	dig.s[1] = b
+	dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
-	dig.s[2] = c
-	dig.s[3] = d
 }
--- a/src/crypto/md5/md5block_decl.go
+++ b/src/crypto/md5/md5block_decl.go
@@ -6,6 +6,8 @@
 package md5
+const haveAsm = true
 //go:noescape
 func block(dig *digest, p []byte)
--- a/src/crypto/md5/md5block_generic.go
+++ b/src/crypto/md5/md5block_generic.go
@@ -6,4 +6,6 @@
 package md5
+const haveAsm = false
 var block = blockGeneric