hash/crc32: improve performance for ppc64le

This change improves the performance of crc32 for ppc64le by using vpmsum and other vector instructions in the algorithm. The testcase was updated to test more sizes. Fixes #19570 BenchmarkCRC32/poly=IEEE/size=15/align=0-8 90.5 81.8 -9.61% BenchmarkCRC32/poly=IEEE/size=15/align=1-8 89.7 81.7 -8.92% BenchmarkCRC32/poly=IEEE/size=40/align=0-8 93.2 61.1 -34.44% BenchmarkCRC32/poly=IEEE/size=40/align=1-8 92.8 60.9 -34.38% BenchmarkCRC32/poly=IEEE/size=512/align=0-8 501 55.8 -88.86% BenchmarkCRC32/poly=IEEE/size=512/align=1-8 502 132 -73.71% BenchmarkCRC32/poly=IEEE/size=1kB/align=0-8 947 69.9 -92.62% BenchmarkCRC32/poly=IEEE/size=1kB/align=1-8 946 144 -84.78% BenchmarkCRC32/poly=IEEE/size=4kB/align=0-8 3602 186 -94.84% BenchmarkCRC32/poly=IEEE/size=4kB/align=1-8 3603 263 -92.70% BenchmarkCRC32/poly=IEEE/size=32kB/align=0-8 28404 1338 -95.29% BenchmarkCRC32/poly=IEEE/size=32kB/align=1-8 28856 1405 -95.13% BenchmarkCRC32/poly=Castagnoli/size=15/align=0-8 89.7 81.8 -8.81% BenchmarkCRC32/poly=Castagnoli/size=15/align=1-8 89.8 81.9 -8.80% BenchmarkCRC32/poly=Castagnoli/size=40/align=0-8 93.8 61.4 -34.54% BenchmarkCRC32/poly=Castagnoli/size=40/align=1-8 94.3 61.3 -34.99% BenchmarkCRC32/poly=Castagnoli/size=512/align=0-8 503 56.4 -88.79% BenchmarkCRC32/poly=Castagnoli/size=512/align=1-8 502 132 -73.71% BenchmarkCRC32/poly=Castagnoli/size=1kB/align=0-8 941 70.2 -92.54% BenchmarkCRC32/poly=Castagnoli/size=1kB/align=1-8 943 145 -84.62% BenchmarkCRC32/poly=Castagnoli/size=4kB/align=0-8 3588 186 -94.82% BenchmarkCRC32/poly=Castagnoli/size=4kB/align=1-8 3595 264 -92.66% BenchmarkCRC32/poly=Castagnoli/size=32kB/align=0-8 28266 1323 -95.32% BenchmarkCRC32/poly=Castagnoli/size=32kB/align=1-8 28344 1404 -95.05% Change-Id: Ic4d8274c66e0e87bfba5f609f508a3877aee6bb5 Reviewed-on: https://go-review.googlesource.com/38184Reviewed-by: David Chase <drchase@google.com>

hash/crc32: improve performance for ppc64le
This change improves the performance of crc32 for ppc64le by using vpmsum and other vector instructions in the algorithm. The testcase was updated to test more sizes. Fixes #19570 BenchmarkCRC32/poly=IEEE/size=15/align=0-8 90.5 81.8 -9.61% BenchmarkCRC32/poly=IEEE/size=15/align=1-8 89.7 81.7 -8.92% BenchmarkCRC32/poly=IEEE/size=40/align=0-8 93.2 61.1 -34.44% BenchmarkCRC32/poly=IEEE/size=40/align=1-8 92.8 60.9 -34.38% BenchmarkCRC32/poly=IEEE/size=512/align=0-8 501 55.8 -88.86% BenchmarkCRC32/poly=IEEE/size=512/align=1-8 502 132 -73.71% BenchmarkCRC32/poly=IEEE/size=1kB/align=0-8 947 69.9 -92.62% BenchmarkCRC32/poly=IEEE/size=1kB/align=1-8 946 144 -84.78% BenchmarkCRC32/poly=IEEE/size=4kB/align=0-8 3602 186 -94.84% BenchmarkCRC32/poly=IEEE/size=4kB/align=1-8 3603 263 -92.70% BenchmarkCRC32/poly=IEEE/size=32kB/align=0-8 28404 1338 -95.29% BenchmarkCRC32/poly=IEEE/size=32kB/align=1-8 28856 1405 -95.13% BenchmarkCRC32/poly=Castagnoli/size=15/align=0-8 89.7 81.8 -8.81% BenchmarkCRC32/poly=Castagnoli/size=15/align=1-8 89.8 81.9 -8.80% BenchmarkCRC32/poly=Castagnoli/size=40/align=0-8 93.8 61.4 -34.54% BenchmarkCRC32/poly=Castagnoli/size=40/align=1-8 94.3 61.3 -34.99% BenchmarkCRC32/poly=Castagnoli/size=512/align=0-8 503 56.4 -88.79% BenchmarkCRC32/poly=Castagnoli/size=512/align=1-8 502 132 -73.71% BenchmarkCRC32/poly=Castagnoli/size=1kB/align=0-8 941 70.2 -92.54% BenchmarkCRC32/poly=Castagnoli/size=1kB/align=1-8 943 145 -84.62% BenchmarkCRC32/poly=Castagnoli/size=4kB/align=0-8 3588 186 -94.82% BenchmarkCRC32/poly=Castagnoli/size=4kB/align=1-8 3595 264 -92.66% BenchmarkCRC32/poly=Castagnoli/size=32kB/align=0-8 28266 1323 -95.32% BenchmarkCRC32/poly=Castagnoli/size=32kB/align=1-8 28344 1404 -95.05% Change-Id: Ic4d8274c66e0e87bfba5f609f508a3877aee6bb5 Reviewed-on: https://go-review.googlesource.com/38184Reviewed-by: David Chase <drchase@google.com>
b6cd22c2 · Lynn Boger · 16663a85 · b6cd22c2 · b6cd22c2 · b6cd22c2
Commit b6cd22c2 authored Mar 13, 2017 by Lynn Boger
6 changed files
--- a/src/hash/crc32/crc32_otherarch.go
+++ b/src/hash/crc32/crc32_otherarch.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64,!amd64p32,!s390x
+// +build !amd64,!amd64p32,!s390x,!ppc64le

 package crc32


--- a/src/hash/crc32/crc32_ppc64le.go
+++ b/src/hash/crc32/crc32_ppc64le.go
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package crc32
+
+import (
+	"unsafe"
+)
+
+const (
+	vecMinLen    = 16
+	vecAlignMask = 15 // align to 16 bytes
+	crcIEEE      = 1
+	crcCast      = 2
+)
+
+//go:noescape
+func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
+
+// this function requires the buffer to be 16 byte aligned and > 16 bytes long
+//go:noescape
+func vectorCrc32(crc uint32, poly uint32, p []byte) uint32
+
+var archCastagnoliTable8 *slicing8Table
+
+func archInitCastagnoli() {
+	archCastagnoliTable8 = slicingMakeTable(Castagnoli)
+}
+
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+	if len(p) >= 4*vecMinLen {
+		// If not aligned then process the initial unaligned bytes
+
+		if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
+			align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
+			newlen := vecMinLen - align
+			crc = ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p[:newlen])
+			p = p[newlen:]
+		}
+		// p should be aligned now
+		aligned := len(p) & ^vecAlignMask
+		crc = vectorCrc32(crc, crcCast, p[:aligned])
+		p = p[aligned:]
+	}
+	if len(p) == 0 {
+		return crc
+	}
+	return ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p)
+}
+
+func archAvailableIEEE() bool {
+	return true
+}
+func archAvailableCastagnoli() bool {
+	return true
+}
+
+var archIeeeTable8 *slicing8Table
+
+func archInitIEEE() {
+	// We still use slicing-by-8 for small buffers.
+	archIeeeTable8 = slicingMakeTable(IEEE)
+}
+
+// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
+func archUpdateIEEE(crc uint32, p []byte) uint32 {
+
+	// Check if vector code should be used.  If not aligned, then handle those
+	// first up to the aligned bytes.
+
+	if len(p) >= 4*vecMinLen {
+		if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
+			align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
+			newlen := vecMinLen - align
+			crc = ppc64SlicingUpdateBy8(crc, archIeeeTable8, p[:newlen])
+			p = p[newlen:]
+		}
+		aligned := len(p) & ^vecAlignMask
+		crc = vectorCrc32(crc, crcIEEE, p[:aligned])
+		p = p[aligned:]
+	}
+	if len(p) == 0 {
+		return crc
+	}
+	return ppc64SlicingUpdateBy8(crc, archIeeeTable8, p)
+}
--- a/src/hash/crc32/crc32_ppc64le.s
+++ b/src/hash/crc32/crc32_ppc64le.s
--- a/src/hash/crc32/crc32_table_ppc64le.s
+++ b/src/hash/crc32/crc32_table_ppc64le.s
--- a/src/hash/crc32/crc32_test.go
+++ b/src/hash/crc32/crc32_test.go
@@ -76,8 +76,9 @@ func testCrossCheck(t *testing.T, crcFunc1, crcFunc2 func(crc uint32, b []byte)
 	// The AMD64 implementation has some cutoffs at lengths 168*3=504 and
 	// 1344*3=4032. We should make sure lengths around these values are in the
 	// list.
-	lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 100, 128,
-		500, 501, 502, 503, 504, 505, 512, 1000, 1024, 2000,
+	lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 63, 64, 65, 100,
+		127, 128, 129, 255, 256, 257, 300, 312, 384, 416, 448, 480,
+		500, 501, 502, 503, 504, 505, 512, 513, 1000, 1024, 2000,
 		4030, 4031, 4032, 4033, 4036, 4040, 4048, 4096, 5000, 10000}
 	for _, length := range lengths {
 		p := make([]byte, length)

--- a/src/hash/crc32/gen_const_ppc64le.go
+++ b/src/hash/crc32/gen_const_ppc64le.go
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// Generate the constant table associated with the poly used by the
+// vpmsumd crc32 algorithm.
+//
+// go run gen_const_ppc64le.go
+//
+// generates crc32_table_ppc64le.s
+
+// The following is derived from code written by Anton Blanchard
+// <anton@au.ibm.com> found at https://github.com/antonblanchard/crc32-vpmsum.
+// The original is dual licensed under GPL and Apache 2.  As the copyright holder
+// for the work, IBM has contributed this new work under the golang license.
+
+// This code was written in Go based on the original C implementation.
+
+// This is a tool needed to generate the appropriate constants needed for
+// the vpmsum algorithm.  It is included to generate new constant tables if
+// new polynomial values are included in the future.
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+)
+
+var blocking = 32 * 1024
+
+func reflect_bits(b uint64, nr uint) uint64 {
+	var ref uint64
+
+	for bit := uint64(0); bit < uint64(nr); bit++ {
+		if (b & uint64(1)) == 1 {
+			ref |= (1 << (uint64(nr-1) - bit))
+		}
+		b = (b >> 1)
+	}
+	return ref
+}
+
+func get_remainder(poly uint64, deg uint, n uint) uint64 {
+
+	rem, _ := xnmodp(n, poly, deg)
+	return rem
+}
+
+func get_quotient(poly uint64, bits, n uint) uint64 {
+
+	_, div := xnmodp(n, poly, bits)
+	return div
+}
+
+// xnmodp returns two values, p and div:
+// p is the representation of the binary polynomial x**n mod (x ** deg + "poly")
+// That is p is the binary representation of the modulus polynomial except for its highest-order term.
+// div is the binary representation of the polynomial x**n / (x ** deg + "poly")
+func xnmodp(n uint, poly uint64, deg uint) (uint64, uint64) {
+
+	var mod, mask, high, div uint64
+
+	if n < deg {
+		div = 0
+		return poly, div
+	}
+	mask = 1<<deg - 1
+	poly &= mask
+	mod = poly
+	div = 1
+	deg--
+	n--
+	for n > deg {
+		high = (mod >> deg) & 1
+		div = (div << 1) | high
+		mod <<= 1
+		if high != 0 {
+			mod ^= poly
+		}
+		n--
+	}
+	return mod & mask, div
+}
+
+func main() {
+	w := new(bytes.Buffer)
+
+	fmt.Fprintf(w, "// autogenerated: do not edit!\n")
+	fmt.Fprintf(w, "// generated from crc32/gen_const_ppc64le.go\n")
+	fmt.Fprintln(w)
+	fmt.Fprintf(w, "#include \"textflag.h\"\n")
+
+	// These are the polynomials supported in vector now.
+	// If adding others, include the polynomial and a name
+	// to identify it.
+
+	genCrc32ConstTable(w, 0xedb88320, "IEEE")
+	genCrc32ConstTable(w, 0x82f63b78, "Cast")
+	genCrc32ConstTable(w, 0xeb31d82e, "Koop")
+	b := w.Bytes()
+
+	err := ioutil.WriteFile("crc32_table_ppc64le.s", b, 0666)
+	if err != nil {
+		fmt.Printf("can't write output: %s\n", err)
+	}
+}
+
+func genCrc32ConstTable(w *bytes.Buffer, poly uint32, polyid string) {
+
+	ref_poly := reflect_bits(uint64(poly), 32)
+	fmt.Fprintf(w, "\n\t/* Reduce %d kbits to 1024 bits */\n", blocking*8)
+	j := 0
+	for i := (blocking * 8) - 1024; i > 0; i -= 1024 {
+		a := reflect_bits(get_remainder(ref_poly, 32, uint(i)), 32) << 1
+		b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32) << 1
+
+		fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s */\n", uint(i+64), "", uint(i), "")
+		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, j*8, b)
+		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, (j+1)*8, a)
+
+		j += 2
+		fmt.Fprintf(w, "\n")
+	}
+
+	for i := (1024 * 2) - 128; i >= 0; i -= 128 {
+		a := reflect_bits(get_remainder(ref_poly, 32, uint(i+32)), 32)
+		b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32)
+		c := reflect_bits(get_remainder(ref_poly, 32, uint(i+96)), 32)
+		d := reflect_bits(get_remainder(ref_poly, 32, uint(i+128)), 32)
+
+		fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s */\n", i+128, "", i+96, "", i+64, "", i+32, "")
+		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, j*8, c, d)
+		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, (j+1)*8, a, b)
+
+		j += 2
+		fmt.Fprintf(w, "\n")
+	}
+
+	fmt.Fprintf(w, "GLOBL ·%sConst(SB),RODATA,$4336\n", polyid)
+	fmt.Fprintf(w, "\n /* Barrett constant m - (4^32)/n */\n")
+	fmt.Fprintf(w, "DATA ·%sBarConst(SB)/8,$0x%016x\n", polyid, reflect_bits(get_quotient(ref_poly, 32, 64), 33))
+	fmt.Fprintf(w, "DATA ·%sBarConst+8(SB)/8,$0x0000000000000000\n", polyid)
+	fmt.Fprintf(w, "DATA ·%sBarConst+16(SB)/8,$0x%016x\n", polyid, reflect_bits((uint64(1)<<32)|ref_poly, 33)) // reflected?
+	fmt.Fprintf(w, "DATA ·%sBarConst+24(SB)/8,$0x0000000000000000\n", polyid)
+	fmt.Fprintf(w, "GLOBL ·%sBarConst(SB),RODATA,$32\n", polyid)
+}