hash/crc32: speedup crc32 of IEEE using slicingBy8

The Slicing-By-8 [1] algorithm has much performance improvements than current approach. This patch only uses it for IEEE, which is the most common case in practice. There is the benchmark on Mac OS X 10.9: benchmark old MB/s new MB/s speedup BenchmarkIEEECrc1KB 349.40 353.03 1.01x BenchmarkIEEECrc4KB 351.55 934.35 2.66x BenchmarkCastagnoliCrc1KB 7037.58 7392.63 1.05x This algorithm need 8K lookup table, so it's enabled only for block larger than 4K. We can see about 2.6x improvement for IEEE. Change-Id: I7f786d20f0949245e4aa101d7921669f496ed0f7 Reviewed-on: https://go-review.googlesource.com/1863Reviewed-by: Russ Cox <rsc@golang.org>

hash/crc32: speedup crc32 of IEEE using slicingBy8
The Slicing-By-8 [1] algorithm has much performance improvements than current approach. This patch only uses it for IEEE, which is the most common case in practice. There is the benchmark on Mac OS X 10.9: benchmark old MB/s new MB/s speedup BenchmarkIEEECrc1KB 349.40 353.03 1.01x BenchmarkIEEECrc4KB 351.55 934.35 2.66x BenchmarkCastagnoliCrc1KB 7037.58 7392.63 1.05x This algorithm need 8K lookup table, so it's enabled only for block larger than 4K. We can see about 2.6x improvement for IEEE. Change-Id: I7f786d20f0949245e4aa101d7921669f496ed0f7 Reviewed-on: https://go-review.googlesource.com/1863Reviewed-by: Russ Cox <rsc@golang.org>
1e076035 · Davies Liu · Russ Cox · d1e7980d · 1e076035 · 1e076035
Commit 1e076035 authored Dec 19, 2014 by Davies Liu Committed by Russ Cox Jun 18, 2015
Hide whitespace changes
Inline Side-by-side

Showing with 78 additions and 2 deletions

crc32.go src/hash/crc32/crc32.go +43 -1

crc32_test.go src/hash/crc32/crc32_test.go +35 -1

No files found.
--- a/src/hash/crc32/crc32.go
+++ b/src/hash/crc32/crc32.go
@@ -54,6 +54,13 @@ func castagnoliInit() {
 // IEEETable is the table for the IEEE polynomial.
 var IEEETable = makeTable(IEEE)

+// slicing8Table is array of 8 Tables
+type slicing8Table [8]Table
+
+// iEEETable8 is the slicing8Table for IEEE
+var iEEETable8 *slicing8Table
+var iEEETable8Once sync.Once
+
 // MakeTable returns the Table constructed from the specified polynomial.
 func MakeTable(poly uint32) *Table {
 	switch poly {
@@ -83,6 +90,20 @@ func makeTable(poly uint32) *Table {
 	return t
 }

+// makeTable8 returns slicing8Table constructed from the specified polynomial.
+func makeTable8(poly uint32) *slicing8Table {
+	t := new(slicing8Table)
+	t[0] = *makeTable(poly)
+	for i := 0; i < 256; i++ {
+		crc := t[0][i]
+		for j := 1; j < 8; j++ {
+			crc = t[0][crc&0xFF] ^ (crc >> 8)
+			t[j][i] = crc
+		}
+	}
+	return t
+}
+
 // digest represents the partial evaluation of a checksum.
 type digest struct {
 	crc uint32
@@ -111,11 +132,32 @@ func update(crc uint32, tab *Table, p []byte) uint32 {
 	return ^crc
 }

+// updateSlicingBy8 updates CRC using Slicing-by-8
+func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {
+	crc = ^crc
+	for len(p) > 8 {
+		crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
+		crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
+			tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
+			tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
+		p = p[8:]
+	}
+	crc = ^crc
+	return update(crc, &tab[0], p)
+}
+
 // Update returns the result of adding the bytes in p to the crc.
 func Update(crc uint32, tab *Table, p []byte) uint32 {
 	if tab == castagnoliTable {
 		return updateCastagnoli(crc, p)
 	}
+	// only use slicing-by-8 when input is larger than 4KB
+	if tab == IEEETable && len(p) >= 4096 {
+		iEEETable8Once.Do(func() {
+			iEEETable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, iEEETable8, p)
+	}
 	return update(crc, tab, p)
 }

@@ -137,4 +179,4 @@ func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }

 // ChecksumIEEE returns the CRC-32 checksum of data
 // using the IEEE polynomial.
-func ChecksumIEEE(data []byte) uint32 { return update(0, IEEETable, data) }
+func ChecksumIEEE(data []byte) uint32 { return Update(0, IEEETable, data) }
--- a/src/hash/crc32/crc32_test.go
+++ b/src/hash/crc32/crc32_test.go
@@ -81,7 +81,7 @@ func TestGolden(t *testing.T) {
 	}
 }

-func BenchmarkCrc32KB(b *testing.B) {
+func BenchmarkIEEECrc1KB(b *testing.B) {
 	b.SetBytes(1024)
 	data := make([]byte, 1024)
 	for i := range data {
@@ -97,3 +97,37 @@ func BenchmarkCrc32KB(b *testing.B) {
 		h.Sum(in)
 	}
 }
+
+func BenchmarkIEEECrc4KB(b *testing.B) {
+	b.SetBytes(4096)
+	data := make([]byte, 4096)
+	for i := range data {
+		data[i] = byte(i)
+	}
+	h := NewIEEE()
+	in := make([]byte, 0, h.Size())
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		h.Reset()
+		h.Write(data)
+		h.Sum(in)
+	}
+}
+
+func BenchmarkCastagnoliCrc1KB(b *testing.B) {
+	b.SetBytes(1024)
+	data := make([]byte, 1024)
+	for i := range data {
+		data[i] = byte(i)
+	}
+	h := New(MakeTable(Castagnoli))
+	in := make([]byte, 0, h.Size())
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		h.Reset()
+		h.Write(data)
+		h.Sum(in)
+	}
+}