Commit b6cd22c2 authored by Lynn Boger

hash/crc32: improve performance for ppc64le

This change improves the performance of crc32 for ppc64le by using
vpmsum and other vector instructions in the algorithm.

The testcase was updated to test more sizes.

Fixes #19570

BenchmarkCRC32/poly=IEEE/size=15/align=0-8             90.5          81.8          -9.61%
BenchmarkCRC32/poly=IEEE/size=15/align=1-8             89.7          81.7          -8.92%
BenchmarkCRC32/poly=IEEE/size=40/align=0-8             93.2          61.1          -34.44%
BenchmarkCRC32/poly=IEEE/size=40/align=1-8             92.8          60.9          -34.38%
BenchmarkCRC32/poly=IEEE/size=512/align=0-8            501           55.8          -88.86%
BenchmarkCRC32/poly=IEEE/size=512/align=1-8            502           132           -73.71%
BenchmarkCRC32/poly=IEEE/size=1kB/align=0-8            947           69.9          -92.62%
BenchmarkCRC32/poly=IEEE/size=1kB/align=1-8            946           144           -84.78%
BenchmarkCRC32/poly=IEEE/size=4kB/align=0-8            3602          186           -94.84%
BenchmarkCRC32/poly=IEEE/size=4kB/align=1-8            3603          263           -92.70%
BenchmarkCRC32/poly=IEEE/size=32kB/align=0-8           28404         1338          -95.29%
BenchmarkCRC32/poly=IEEE/size=32kB/align=1-8           28856         1405          -95.13%
BenchmarkCRC32/poly=Castagnoli/size=15/align=0-8       89.7          81.8          -8.81%
BenchmarkCRC32/poly=Castagnoli/size=15/align=1-8       89.8          81.9          -8.80%
BenchmarkCRC32/poly=Castagnoli/size=40/align=0-8       93.8          61.4          -34.54%
BenchmarkCRC32/poly=Castagnoli/size=40/align=1-8       94.3          61.3          -34.99%
BenchmarkCRC32/poly=Castagnoli/size=512/align=0-8      503           56.4          -88.79%
BenchmarkCRC32/poly=Castagnoli/size=512/align=1-8      502           132           -73.71%
BenchmarkCRC32/poly=Castagnoli/size=1kB/align=0-8      941           70.2          -92.54%
BenchmarkCRC32/poly=Castagnoli/size=1kB/align=1-8      943           145           -84.62%
BenchmarkCRC32/poly=Castagnoli/size=4kB/align=0-8      3588          186           -94.82%
BenchmarkCRC32/poly=Castagnoli/size=4kB/align=1-8      3595          264           -92.66%
BenchmarkCRC32/poly=Castagnoli/size=32kB/align=0-8     28266         1323          -95.32%
BenchmarkCRC32/poly=Castagnoli/size=32kB/align=1-8     28344         1404          -95.05%

Change-Id: Ic4d8274c66e0e87bfba5f609f508a3877aee6bb5
Reviewed-on: https://go-review.googlesource.com/38184
Reviewed-by: David Chase <drchase@google.com>
parent 16663a85
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!amd64p32,!s390x
// +build !amd64,!amd64p32,!s390x,!ppc64le
package crc32
......
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crc32
import (
"unsafe"
)
// Parameters shared by the ppc64le update functions in this file.
const (
// vecMinLen is the vector unit size in bytes. The update functions only
// take the vector path when len(p) >= 4*vecMinLen, and they use
// vecMinLen minus the address misalignment as the number of leading
// bytes to process before the vector routine is called.
vecMinLen = 16
vecAlignMask = 15 // align to 16 bytes
// crcIEEE and crcCast are the values passed as the poly argument of
// vectorCrc32 by archUpdateIEEE and archUpdateCastagnoli respectively.
crcIEEE = 1
crcCast = 2
)
// ppc64SlicingUpdateBy8 updates crc with the bytes of p using the given
// slicing table. It is declared without a body, so the implementation lives
// outside this file (presumably in the package's ppc64le assembly — confirm).
//
//go:noescape
func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
// vectorCrc32 updates crc with the bytes of p; poly selects the polynomial
// (callers in this file pass crcIEEE or crcCast). It is declared without a
// body, so the implementation lives outside this file.
//
// This function requires the buffer to be 16-byte aligned and more than
// 16 bytes long.
//
//go:noescape
func vectorCrc32(crc uint32, poly uint32, p []byte) uint32
// archCastagnoliTable8 is the slicing table for the Castagnoli polynomial.
// It is nil until archInitCastagnoli has run.
var archCastagnoliTable8 *slicing8Table

// archInitCastagnoli builds the Castagnoli slicing table that
// archUpdateCastagnoli hands to ppc64SlicingUpdateBy8 for the bytes the
// vector routine does not cover.
func archInitCastagnoli() {
	archCastagnoliTable8 = slicingMakeTable(Castagnoli)
}
// archUpdateCastagnoli returns crc updated with the bytes of p using the
// Castagnoli polynomial.
//
// Buffers shorter than 4*vecMinLen bytes are handled entirely by
// ppc64SlicingUpdateBy8. Longer buffers are split in up to three parts: an
// unaligned head (slicing), a 16-byte aligned body whose length is a multiple
// of 16 (vectorCrc32), and whatever is left over as a tail (slicing).
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
	if len(p) < 4*vecMinLen {
		// Too short for the vector routine.
		if len(p) == 0 {
			return crc
		}
		return ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p)
	}

	// Consume the leading bytes up to the next 16-byte boundary, if any.
	misalign := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
	if misalign != 0 {
		head := vecMinLen - misalign
		crc = ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p[:head])
		p = p[head:]
	}

	// p is aligned now; give the vector routine a whole number of 16-byte units.
	body := len(p) &^ vecAlignMask
	crc = vectorCrc32(crc, crcCast, p[:body])

	if tail := p[body:]; len(tail) != 0 {
		crc = ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, tail)
	}
	return crc
}
// archAvailableIEEE reports whether an architecture-specific IEEE
// implementation is available. In this file it is unconditionally true.
func archAvailableIEEE() bool {
	return true
}
// archAvailableCastagnoli reports whether an architecture-specific Castagnoli
// implementation is available. In this file it is unconditionally true.
func archAvailableCastagnoli() bool {
	return true
}
// archIeeeTable8 is the slicing table for the IEEE polynomial. It is nil
// until archInitIEEE has run.
var archIeeeTable8 *slicing8Table

// archInitIEEE builds the IEEE slicing table. The table is still needed
// alongside the vector routine: archUpdateIEEE uses slicing for small
// buffers and for the unaligned head and tail of large ones.
func archInitIEEE() {
	archIeeeTable8 = slicingMakeTable(IEEE)
}
// archUpdateIEEE returns crc updated with the bytes of p using the IEEE
// polynomial.
//
// Buffers shorter than 4*vecMinLen bytes are handled entirely by
// ppc64SlicingUpdateBy8. Longer buffers are split in up to three parts: an
// unaligned head (slicing), a 16-byte aligned body whose length is a multiple
// of 16 (vectorCrc32), and whatever is left over as a tail (slicing).
func archUpdateIEEE(crc uint32, p []byte) uint32 {
	if len(p) < 4*vecMinLen {
		// Too short for the vector routine.
		if len(p) == 0 {
			return crc
		}
		return ppc64SlicingUpdateBy8(crc, archIeeeTable8, p)
	}

	// Consume the leading bytes up to the next 16-byte boundary, if any.
	misalign := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
	if misalign != 0 {
		head := vecMinLen - misalign
		crc = ppc64SlicingUpdateBy8(crc, archIeeeTable8, p[:head])
		p = p[head:]
	}

	// p is aligned now; give the vector routine a whole number of 16-byte units.
	body := len(p) &^ vecAlignMask
	crc = vectorCrc32(crc, crcIEEE, p[:body])

	if tail := p[body:]; len(tail) != 0 {
		crc = ppc64SlicingUpdateBy8(crc, archIeeeTable8, tail)
	}
	return crc
}
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -76,8 +76,9 @@ func testCrossCheck(t *testing.T, crcFunc1, crcFunc2 func(crc uint32, b []byte)
// The AMD64 implementation has some cutoffs at lengths 168*3=504 and
// 1344*3=4032. We should make sure lengths around these values are in the
// list.
lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 100, 128,
500, 501, 502, 503, 504, 505, 512, 1000, 1024, 2000,
lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 63, 64, 65, 100,
127, 128, 129, 255, 256, 257, 300, 312, 384, 416, 448, 480,
500, 501, 502, 503, 504, 505, 512, 513, 1000, 1024, 2000,
4030, 4031, 4032, 4033, 4036, 4040, 4048, 4096, 5000, 10000}
for _, length := range lengths {
p := make([]byte, length)
......
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Generate the constant table associated with the poly used by the
// vpmsumd crc32 algorithm.
//
// go run gen_const_ppc64le.go
//
// generates crc32_table_ppc64le.s
// The following is derived from code written by Anton Blanchard
// <anton@au.ibm.com> found at https://github.com/antonblanchard/crc32-vpmsum.
// The original is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under the golang license.
// This code was written in Go based on the original C implementation.
// This is a tool needed to generate the appropriate constants needed for
// the vpmsum algorithm. It is included to generate new constant tables if
// new polynomial values are included in the future.
package main
import (
"bytes"
"fmt"
"io/ioutil"
)
// blocking controls how many folding constants genCrc32ConstTable emits: its
// first loop starts at blocking*8 - 1024 and steps down by 1024.
// NOTE(review): presumably the number of bytes the assembly processes per
// block (blocking*8 being the bit count) — confirm against the assembly.
var blocking = 32 * 1024
// reflect_bits returns the low nr bits of b in reversed order: bit i of b
// becomes bit nr-1-i of the result. Bits of b at position nr and above are
// ignored, and nr == 0 yields 0.
func reflect_bits(b uint64, nr uint) uint64 {
	var out uint64
	for i := uint(0); i < nr; i++ {
		// Shift the result up and append the next-lowest bit of b, so the
		// first bit taken ends up in the highest position.
		out = out<<1 | (b>>i)&1
	}
	return out
}
// get_remainder returns the first result of xnmodp(n, poly, deg), i.e. the
// remainder of x**n divided by the polynomial described by poly and deg.
func get_remainder(poly uint64, deg uint, n uint) uint64 {
	remainder, _ := xnmodp(n, poly, deg)
	return remainder
}
// get_quotient returns the second result of xnmodp(n, poly, bits), i.e. the
// quotient of x**n divided by the polynomial described by poly and bits.
func get_quotient(poly uint64, bits, n uint) uint64 {
	_, quotient := xnmodp(n, poly, bits)
	return quotient
}
// xnmodp divides the binary polynomial x**n by P(x) = x**deg + poly and
// returns the remainder and the quotient, in that order.
//
// poly is the binary representation of the modulus polynomial without its
// highest-order term x**deg; only its low deg bits are used. Both results
// are binary representations in which bit i is the coefficient of x**i.
//
// When n < deg, x**n already has lower degree than P(x), so the remainder is
// x**n itself and the quotient is zero.
func xnmodp(n uint, poly uint64, deg uint) (uint64, uint64) {
	if n < deg {
		return 1 << n, 0
	}

	mask := uint64(1)<<deg - 1
	poly &= mask

	// Start from x**deg: its remainder is poly and its quotient is 1.
	mod, div := poly, uint64(1)

	// Each step multiplies the running remainder by x. If the coefficient of
	// x**(deg-1) was set, the product reaches degree deg and is reduced by
	// XORing in poly; that coefficient is also the next quotient bit.
	top := deg - 1
	for k := n - 1; k > top; k-- {
		high := (mod >> top) & 1
		div = div<<1 | high
		mod <<= 1
		if high != 0 {
			mod ^= poly
		}
	}

	// Bits at position deg and above are leftovers of the shifts; drop them.
	return mod & mask, div
}
// main generates crc32_table_ppc64le.s in the current directory. The file
// starts with a fixed header and is followed by one constant table per
// supported polynomial. A failure to write the file is reported on standard
// output.
func main() {
	// These are the polynomials supported in vector now.
	// If adding others, include the polynomial and a name
	// to identify it.
	polys := []struct {
		poly uint32
		id   string
	}{
		{0xedb88320, "IEEE"},
		{0x82f63b78, "Cast"},
		{0xeb31d82e, "Koop"},
	}

	var out bytes.Buffer
	out.WriteString("// autogenerated: do not edit!\n")
	out.WriteString("// generated from crc32/gen_const_ppc64le.go\n")
	out.WriteByte('\n')
	out.WriteString("#include \"textflag.h\"\n")

	for _, p := range polys {
		genCrc32ConstTable(&out, p.poly, p.id)
	}

	if err := ioutil.WriteFile("crc32_table_ppc64le.s", out.Bytes(), 0666); err != nil {
		fmt.Printf("can't write output: %s\n", err)
	}
}
// genCrc32ConstTable appends to w the assembler directives that define two
// read-only symbols for one polynomial:
//
//   - ·<polyid>Const: a table of 8-byte words holding x^n mod p(x) for the
//     exponents listed in the emitted comments, and
//   - ·<polyid>BarConst: the 32-byte block labelled "Barrett constant" in
//     the output.
//
// poly is bit-reversed over 32 bits before any constant is computed, and
// every remainder is bit-reversed over 32 bits again before being written.
// polyid is the prefix used for both symbol names.
//
// The GLOBL size of ·<polyid>Const is derived from the number of bytes
// actually emitted, so it stays correct if blocking changes.
func genCrc32ConstTable(w *bytes.Buffer, poly uint32, polyid string) {
	refPoly := reflect_bits(uint64(poly), 32)

	// rem returns x^n mod p(x), bit-reversed over 32 bits.
	rem := func(n int) uint64 {
		return reflect_bits(get_remainder(refPoly, 32, uint(n)), 32)
	}

	// off is the byte offset of the next 16-byte entry in ·<polyid>Const.
	off := 0

	// NOTE(review): the number printed here is blocking*8, a bit count,
	// although the emitted comment labels it "kbits". The text is left as is
	// so regenerated output matches the existing table file.
	fmt.Fprintf(w, "\n\t/* Reduce %d kbits to 1024 bits */\n", blocking*8)
	for i := blocking*8 - 1024; i > 0; i -= 1024 {
		a := rem(i) << 1
		b := rem(i+64) << 1

		fmt.Fprintf(w, "\t/* x^%d mod p(x), x^%d mod p(x) */\n", i+64, i)
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, off, b)
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, off+8, a)
		off += 16

		fmt.Fprintf(w, "\n")
	}

	// Sixteen further entries, each packing four 32-bit remainders.
	for i := 1024*2 - 128; i >= 0; i -= 128 {
		a := rem(i + 32)
		b := rem(i + 64)
		c := rem(i + 96)
		d := rem(i + 128)

		fmt.Fprintf(w, "\t/* x^%d mod p(x), x^%d mod p(x), x^%d mod p(x), x^%d mod p(x) */\n", i+128, i+96, i+64, i+32)
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, off, c, d)
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, off+8, a, b)
		off += 16

		fmt.Fprintf(w, "\n")
	}

	fmt.Fprintf(w, "GLOBL ·%sConst(SB),RODATA,$%d\n", polyid, off)

	fmt.Fprintf(w, "\n /* Barrett constant m - (4^32)/n */\n")
	// Quotient of x^64 divided by the polynomial, bit-reversed over 33 bits.
	fmt.Fprintf(w, "DATA ·%sBarConst(SB)/8,$0x%016x\n", polyid, reflect_bits(get_quotient(refPoly, 32, 64), 33))
	fmt.Fprintf(w, "DATA ·%sBarConst+8(SB)/8,$0x0000000000000000\n", polyid)
	// The full 33-bit polynomial (x^32 term included), bit-reversed over 33 bits.
	fmt.Fprintf(w, "DATA ·%sBarConst+16(SB)/8,$0x%016x\n", polyid, reflect_bits((uint64(1)<<32)|refPoly, 33))
	fmt.Fprintf(w, "DATA ·%sBarConst+24(SB)/8,$0x0000000000000000\n", polyid)
	fmt.Fprintf(w, "GLOBL ·%sBarConst(SB),RODATA,$32\n", polyid)
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment