crypto/md5: native arm assembler version

An ARM version of md5block.go with a big improvement in throughput (up to 2.5x) and a reduction in object size (21%). Code size Before 3100 bytes After 2424 bytes 21% smaller Benchmarks on Rasperry Pi benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 11703 6636 -43.30% BenchmarkHash1K 38057 21881 -42.50% BenchmarkHash8K 208131 142735 -31.42% BenchmarkHash8BytesUnaligned 11457 6570 -42.66% BenchmarkHash1KUnaligned 69334 26841 -61.29% BenchmarkHash8KUnaligned 455120 182223 -59.96% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 0.68 1.21 1.78x BenchmarkHash1K 26.91 46.80 1.74x BenchmarkHash8K 39.36 57.39 1.46x BenchmarkHash8BytesUnaligned 0.70 1.22 1.74x BenchmarkHash1KUnaligned 14.77 38.15 2.58x BenchmarkHash8KUnaligned 18.00 44.96 2.50x benchmark old allocs new allocs delta BenchmarkHash8Bytes 1 0 -100.00% BenchmarkHash1K 2 0 -100.00% BenchmarkHash8K 2 0 -100.00% BenchmarkHash8BytesUnaligned 1 0 -100.00% BenchmarkHash1KUnaligned 2 0 -100.00% BenchmarkHash8KUnaligned 2 0 -100.00% benchmark old bytes new bytes delta BenchmarkHash8Bytes 64 0 -100.00% BenchmarkHash1K 128 0 -100.00% BenchmarkHash8K 128 0 -100.00% BenchmarkHash8BytesUnaligned 64 0 -100.00% BenchmarkHash1KUnaligned 128 0 -100.00% BenchmarkHash8KUnaligned 128 0 -100.00% This also adds another test which makes sure that the sums over larger blocks work properly. I wrote this test when I was worried about memory corruption. R=golang-dev, dave, bradfitz, rsc, ajstarks CC=golang-dev, minux.ma, remyoudompheng https://golang.org/cl/11648043

crypto/md5: native arm assembler version
An ARM version of md5block.go with a big improvement in throughput (up to 2.5x) and a reduction in object size (21%). Code size Before 3100 bytes After 2424 bytes 21% smaller Benchmarks on Rasperry Pi benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 11703 6636 -43.30% BenchmarkHash1K 38057 21881 -42.50% BenchmarkHash8K 208131 142735 -31.42% BenchmarkHash8BytesUnaligned 11457 6570 -42.66% BenchmarkHash1KUnaligned 69334 26841 -61.29% BenchmarkHash8KUnaligned 455120 182223 -59.96% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 0.68 1.21 1.78x BenchmarkHash1K 26.91 46.80 1.74x BenchmarkHash8K 39.36 57.39 1.46x BenchmarkHash8BytesUnaligned 0.70 1.22 1.74x BenchmarkHash1KUnaligned 14.77 38.15 2.58x BenchmarkHash8KUnaligned 18.00 44.96 2.50x benchmark old allocs new allocs delta BenchmarkHash8Bytes 1 0 -100.00% BenchmarkHash1K 2 0 -100.00% BenchmarkHash8K 2 0 -100.00% BenchmarkHash8BytesUnaligned 1 0 -100.00% BenchmarkHash1KUnaligned 2 0 -100.00% BenchmarkHash8KUnaligned 2 0 -100.00% benchmark old bytes new bytes delta BenchmarkHash8Bytes 64 0 -100.00% BenchmarkHash1K 128 0 -100.00% BenchmarkHash8K 128 0 -100.00% BenchmarkHash8BytesUnaligned 64 0 -100.00% BenchmarkHash1KUnaligned 128 0 -100.00% BenchmarkHash8KUnaligned 128 0 -100.00% This also adds another test which makes sure that the sums over larger blocks work properly. I wrote this test when I was worried about memory corruption. R=golang-dev, dave, bradfitz, rsc, ajstarks CC=golang-dev, minux.ma, remyoudompheng https://golang.org/cl/11648043
085159da · Nick Craig-Wood · Dave Cheney · 93c6d0ef · 085159da · 085159da
Commit 085159da authored Jul 25, 2013 by Nick Craig-Wood Committed by Dave Cheney Jul 25, 2013
5 changed files
--- a/src/pkg/crypto/md5/gen.go
+++ b/src/pkg/crypto/md5/gen.go
@@ -164,7 +164,7 @@ var program = `
 // DO NOT EDIT.
 // Generate with: go run gen.go{{if .Full}} -full{{end}} | gofmt >md5block.go

-// +build !amd64
+// +build !amd64,!386,!arm

 package md5


--- a/src/pkg/crypto/md5/md5_test.go
+++ b/src/pkg/crypto/md5/md5_test.go
@@ -81,6 +81,30 @@ func TestGolden(t *testing.T) {
 	}
 }

+func TestLarge(t *testing.T) {
+	const N = 10000
+	ok := "2bb571599a4180e1d542f76904adc3df" // md5sum of "0123456789" * 1000
+	block := make([]byte, 10004)
+	c := New()
+	for offset := 0; offset < 4; offset++ {
+		for i := 0; i < N; i++ {
+			block[offset+i] = '0' + byte(i%10)
+		}
+		for blockSize := 10; blockSize <= N; blockSize *= 10 {
+			blocks := N / blockSize
+			b := block[offset : offset+blockSize]
+			c.Reset()
+			for i := 0; i < blocks; i++ {
+				c.Write(b)
+			}
+			s := fmt.Sprintf("%x", c.Sum(nil))
+			if s != ok {
+				t.Fatalf("md5 TestLarge offset=%d, blockSize=%d = %s want %s", offset, blockSize, s, ok)
+			}
+		}
+	}
+}
+
 func ExampleNew() {
 	h := New()
 	io.WriteString(h, "The fog is getting thicker!")

--- a/src/pkg/crypto/md5/md5block.go
+++ b/src/pkg/crypto/md5/md5block.go
 // DO NOT EDIT.
 // Generate with: go run gen.go -full | gofmt >md5block.go

-// +build !amd64,!386
+// +build !amd64,!386,!arm

 package md5


--- a/src/pkg/crypto/md5/md5block_arm.s
+++ b/src/pkg/crypto/md5/md5block_arm.s
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// ARM version of md5block.go
+
+// Register definitions
+table = 0	// Pointer to MD5 constants table
+data = 1	// Pointer to data to hash
+a = 2		// MD5 accumulator
+b = 3		// MD5 accumulator
+c = 4		// MD5 accumulator
+d = 5		// MD5 accumulator
+c0 = 6		// MD5 constant
+c1 = 7		// MD5 constant
+c2 = 8		// MD5 constant
+// r9, r10 are forbidden
+// r11 is OK provided you check the assembler that no synthetic instructions use it
+c3 = 11		// MD5 constant
+t0 = 12		// temporary
+t1 = 14		// temporary
+
+// func block(dig *digest, p []byte)
+// 0(FP) is *digest
+// 4(FP) is p.array (struct Slice)
+// 8(FP) is p.len
+//12(FP) is p.cap
+//
+// Stack frame
+p_end = -4	// -4(SP) pointer to the end of data
+p_data = -8	// -8(SP) current data pointer
+buf = -8-4*16	//-72(SP) 16 words temporary buffer
+		// 3 words at 4..12(R13) for called routine parameters
+
+TEXT	·block(SB), 7, $84-16
+	MOVW	p+4(FP), R(data)	// pointer to the data
+	MOVW	p_len+8(FP), R(t0)	// number of bytes
+	ADD	R(data), R(t0)
+	MOVW	R(t0), p_end(SP)	// pointer to end of data
+
+loop:
+	MOVW	R(data), p_data(SP)	// Save R(data)
+	AND.S	$3, R(data), R(t0)	// TST $3, R(data) not working see issue 5921
+	BEQ	aligned			// aligned detected - skip copy
+
+	// Copy the unaligned source data into the aligned temporary buffer
+	// memove(to=4(R13), from=8(R13), n=12(R13)) - Corrupts all registers
+	MOVW	$buf(SP), R(table)	// to
+	MOVW	$64, R(c0)		// n
+	MOVM.IB	[R(table),R(data),R(c0)], (R13)
+	BL	runtime·memmove(SB)
+
+	// Point to the local aligned copy of the data
+	MOVW	$buf(SP), R(data)
+
+aligned:
+	// Point to the table of constants
+	// A PC relative add would be cheaper than this
+	MOVW	$·table(SB), R(table)
+
+	// Load up initial MD5 accumulator
+	MOVW	dig+0(FP), R(c0)
+	MOVM.IA (R(c0)), [R(a),R(b),R(c),R(d)]
+
+// a += (((c^d)&b)^d) + X[index] + const
+// a = a<<shift | a>>(32-shift) + b
+#define ROUND1(a, b, c, d, index, shift, const) \
+	EOR	R(c), R(d), R(t0)		; \
+	AND	R(b), R(t0)			; \
+	EOR	R(d), R(t0)			; \
+	MOVW	(index<<2)(R(data)), R(t1)	; \
+	ADD	R(t1), R(t0)			; \
+	ADD	R(const), R(t0)			; \
+	ADD	R(t0), R(a)			; \
+	ADD	R(a)@>(32-shift), R(b), R(a)	;
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND1(a, b, c, d,  0,	7, c0)
+	ROUND1(d, a, b, c,  1, 12, c1)
+	ROUND1(c, d, a, b,  2, 17, c2)
+	ROUND1(b, c, d, a,  3, 22, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND1(a, b, c, d,  4,	7, c0)
+	ROUND1(d, a, b, c,  5, 12, c1)
+	ROUND1(c, d, a, b,  6, 17, c2)
+	ROUND1(b, c, d, a,  7, 22, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND1(a, b, c, d,  8,	7, c0)
+	ROUND1(d, a, b, c,  9, 12, c1)
+	ROUND1(c, d, a, b, 10, 17, c2)
+	ROUND1(b, c, d, a, 11, 22, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND1(a, b, c, d, 12,	7, c0)
+	ROUND1(d, a, b, c, 13, 12, c1)
+	ROUND1(c, d, a, b, 14, 17, c2)
+	ROUND1(b, c, d, a, 15, 22, c3)
+
+// a += (((b^c)&d)^c) + X[index] + const
+// a = a<<shift | a>>(32-shift) + b
+#define ROUND2(a, b, c, d, index, shift, const) \
+	EOR	R(b), R(c), R(t0)		; \
+	AND	R(d), R(t0)			; \
+	EOR	R(c), R(t0)			; \
+	MOVW	(index<<2)(R(data)), R(t1)	; \
+	ADD	R(t1), R(t0)			; \
+	ADD	R(const), R(t0)			; \
+	ADD	R(t0), R(a)			; \
+	ADD	R(a)@>(32-shift), R(b), R(a)	;
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND2(a, b, c, d,  1,	5, c0)
+	ROUND2(d, a, b, c,  6,	9, c1)
+	ROUND2(c, d, a, b, 11, 14, c2)
+	ROUND2(b, c, d, a,  0, 20, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND2(a, b, c, d,  5,	5, c0)
+	ROUND2(d, a, b, c, 10,	9, c1)
+	ROUND2(c, d, a, b, 15, 14, c2)
+	ROUND2(b, c, d, a,  4, 20, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND2(a, b, c, d,  9,	5, c0)
+	ROUND2(d, a, b, c, 14,	9, c1)
+	ROUND2(c, d, a, b,  3, 14, c2)
+	ROUND2(b, c, d, a,  8, 20, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND2(a, b, c, d, 13,	5, c0)
+	ROUND2(d, a, b, c,  2,	9, c1)
+	ROUND2(c, d, a, b,  7, 14, c2)
+	ROUND2(b, c, d, a, 12, 20, c3)
+
+// a += (b^c^d) + X[index] + const
+// a = a<<shift | a>>(32-shift) + b
+#define ROUND3(a, b, c, d, index, shift, const) \
+	EOR	R(b), R(c), R(t0)		; \
+	EOR	R(d), R(t0)			; \
+	MOVW	(index<<2)(R(data)), R(t1)	; \
+	ADD	R(t1), R(t0)			; \
+	ADD	R(const), R(t0)			; \
+	ADD	R(t0), R(a)			; \
+	ADD	R(a)@>(32-shift), R(b), R(a)	;
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND3(a, b, c, d,  5,	4, c0)
+	ROUND3(d, a, b, c,  8, 11, c1)
+	ROUND3(c, d, a, b, 11, 16, c2)
+	ROUND3(b, c, d, a, 14, 23, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND3(a, b, c, d,  1,	4, c0)
+	ROUND3(d, a, b, c,  4, 11, c1)
+	ROUND3(c, d, a, b,  7, 16, c2)
+	ROUND3(b, c, d, a, 10, 23, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND3(a, b, c, d, 13,	4, c0)
+	ROUND3(d, a, b, c,  0, 11, c1)
+	ROUND3(c, d, a, b,  3, 16, c2)
+	ROUND3(b, c, d, a,  6, 23, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND3(a, b, c, d,  9,	4, c0)
+	ROUND3(d, a, b, c, 12, 11, c1)
+	ROUND3(c, d, a, b, 15, 16, c2)
+	ROUND3(b, c, d, a,  2, 23, c3)
+
+// a += (c^(b|^d)) + X[index] + const
+// a = a<<shift | a>>(32-shift) + b
+#define ROUND4(a, b, c, d, index, shift, const) \
+	MVN	R(d), R(t0)			; \
+	ORR	R(b), R(t0)			; \
+	EOR	R(c), R(t0)			; \
+	MOVW	(index<<2)(R(data)), R(t1)	; \
+	ADD	R(t1), R(t0)			; \
+	ADD	R(const), R(t0)			; \
+	ADD	R(t0), R(a)			; \
+	ADD	R(a)@>(32-shift), R(b), R(a)	;
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND4(a, b, c, d,  0,	6, c0)
+	ROUND4(d, a, b, c,  7, 10, c1)
+	ROUND4(c, d, a, b, 14, 15, c2)
+	ROUND4(b, c, d, a,  5, 21, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND4(a, b, c, d, 12,	6, c0)
+	ROUND4(d, a, b, c,  3, 10, c1)
+	ROUND4(c, d, a, b, 10, 15, c2)
+	ROUND4(b, c, d, a,  1, 21, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND4(a, b, c, d,  8,	6, c0)
+	ROUND4(d, a, b, c, 15, 10, c1)
+	ROUND4(c, d, a, b,  6, 15, c2)
+	ROUND4(b, c, d, a, 13, 21, c3)
+
+	MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)]
+	ROUND4(a, b, c, d,  4,	6, c0)
+	ROUND4(d, a, b, c, 11, 10, c1)
+	ROUND4(c, d, a, b,  2, 15, c2)
+	ROUND4(b, c, d, a,  9, 21, c3)
+
+	MOVW	dig+0(FP), R(t0)
+	MOVM.IA (R(t0)), [R(c0),R(c1),R(c2),R(c3)]
+
+	ADD	R(c0), R(a)
+	ADD	R(c1), R(b)
+	ADD	R(c2), R(c)
+	ADD	R(c3), R(d)
+
+	MOVM.IA [R(a),R(b),R(c),R(d)], (R(t0))
+
+	MOVW	p_data(SP), R(data)
+	MOVW	p_end(SP), R(t0)
+	ADD	$64, R(data)
+	CMP	R(t0), R(data)
+	BLO	loop
+
+	RET
+
+// MD5 constants table
+
+	// Round 1
+	DATA	·table+0x00(SB)/4, $0xd76aa478
+	DATA	·table+0x04(SB)/4, $0xe8c7b756
+	DATA	·table+0x08(SB)/4, $0x242070db
+	DATA	·table+0x0c(SB)/4, $0xc1bdceee
+	DATA	·table+0x10(SB)/4, $0xf57c0faf
+	DATA	·table+0x14(SB)/4, $0x4787c62a
+	DATA	·table+0x18(SB)/4, $0xa8304613
+	DATA	·table+0x1c(SB)/4, $0xfd469501
+	DATA	·table+0x20(SB)/4, $0x698098d8
+	DATA	·table+0x24(SB)/4, $0x8b44f7af
+	DATA	·table+0x28(SB)/4, $0xffff5bb1
+	DATA	·table+0x2c(SB)/4, $0x895cd7be
+	DATA	·table+0x30(SB)/4, $0x6b901122
+	DATA	·table+0x34(SB)/4, $0xfd987193
+	DATA	·table+0x38(SB)/4, $0xa679438e
+	DATA	·table+0x3c(SB)/4, $0x49b40821
+	// Round 2
+	DATA	·table+0x40(SB)/4, $0xf61e2562
+	DATA	·table+0x44(SB)/4, $0xc040b340
+	DATA	·table+0x48(SB)/4, $0x265e5a51
+	DATA	·table+0x4c(SB)/4, $0xe9b6c7aa
+	DATA	·table+0x50(SB)/4, $0xd62f105d
+	DATA	·table+0x54(SB)/4, $0x02441453
+	DATA	·table+0x58(SB)/4, $0xd8a1e681
+	DATA	·table+0x5c(SB)/4, $0xe7d3fbc8
+	DATA	·table+0x60(SB)/4, $0x21e1cde6
+	DATA	·table+0x64(SB)/4, $0xc33707d6
+	DATA	·table+0x68(SB)/4, $0xf4d50d87
+	DATA	·table+0x6c(SB)/4, $0x455a14ed
+	DATA	·table+0x70(SB)/4, $0xa9e3e905
+	DATA	·table+0x74(SB)/4, $0xfcefa3f8
+	DATA	·table+0x78(SB)/4, $0x676f02d9
+	DATA	·table+0x7c(SB)/4, $0x8d2a4c8a
+	// Round 3
+	DATA	·table+0x80(SB)/4, $0xfffa3942
+	DATA	·table+0x84(SB)/4, $0x8771f681
+	DATA	·table+0x88(SB)/4, $0x6d9d6122
+	DATA	·table+0x8c(SB)/4, $0xfde5380c
+	DATA	·table+0x90(SB)/4, $0xa4beea44
+	DATA	·table+0x94(SB)/4, $0x4bdecfa9
+	DATA	·table+0x98(SB)/4, $0xf6bb4b60
+	DATA	·table+0x9c(SB)/4, $0xbebfbc70
+	DATA	·table+0xa0(SB)/4, $0x289b7ec6
+	DATA	·table+0xa4(SB)/4, $0xeaa127fa
+	DATA	·table+0xa8(SB)/4, $0xd4ef3085
+	DATA	·table+0xac(SB)/4, $0x04881d05
+	DATA	·table+0xb0(SB)/4, $0xd9d4d039
+	DATA	·table+0xb4(SB)/4, $0xe6db99e5
+	DATA	·table+0xb8(SB)/4, $0x1fa27cf8
+	DATA	·table+0xbc(SB)/4, $0xc4ac5665
+	// Round 4
+	DATA	·table+0xc0(SB)/4, $0xf4292244
+	DATA	·table+0xc4(SB)/4, $0x432aff97
+	DATA	·table+0xc8(SB)/4, $0xab9423a7
+	DATA	·table+0xcc(SB)/4, $0xfc93a039
+	DATA	·table+0xd0(SB)/4, $0x655b59c3
+	DATA	·table+0xd4(SB)/4, $0x8f0ccc92
+	DATA	·table+0xd8(SB)/4, $0xffeff47d
+	DATA	·table+0xdc(SB)/4, $0x85845dd1
+	DATA	·table+0xe0(SB)/4, $0x6fa87e4f
+	DATA	·table+0xe4(SB)/4, $0xfe2ce6e0
+	DATA	·table+0xe8(SB)/4, $0xa3014314
+	DATA	·table+0xec(SB)/4, $0x4e0811a1
+	DATA	·table+0xf0(SB)/4, $0xf7537e82
+	DATA	·table+0xf4(SB)/4, $0xbd3af235
+	DATA	·table+0xf8(SB)/4, $0x2ad7d2bb
+	DATA	·table+0xfc(SB)/4, $0xeb86d391
+	// Global definition
+	GLOBL	·table(SB),8,$256
--- a/src/pkg/crypto/md5/md5block_decl.go
+++ b/src/pkg/crypto/md5/md5block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build amd64 386
+// +build amd64 386 arm

 package md5