hash/crc32: use vector instructions on s390x

The input buffer is aligned to a doubleword boundary to improve performance of the vector instructions. The pure Go implementation is used to align the input data, and is also used when the vector instructions are not available or the data length is less than 64 bytes. Change-Id: Ie259a5f2f1562bcc17961c99e5776c99091d6bed Reviewed-on: https://go-review.googlesource.com/22201Reviewed-by: Michael Munday <munday@ca.ibm.com> Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Bill O'Farrell <billotosyr@gmail.com> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

hash/crc32: use vector instructions on s390x
The input buffer is aligned to a doubleword boundary to improve performance of the vector instructions. The pure Go implementation is used to align the input data, and is also used when the vector instructions are not available or the data length is less than 64 bytes. Change-Id: Ie259a5f2f1562bcc17961c99e5776c99091d6bed Reviewed-on: https://go-review.googlesource.com/22201Reviewed-by: Michael Munday <munday@ca.ibm.com> Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Bill O'Farrell <billotosyr@gmail.com> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
5833d843 · Chris Zou · Brad Fitzpatrick · 7879e919 · 5833d843 · 5833d843
Commit 5833d843 authored Apr 18, 2016 by Chris Zou Committed by Brad Fitzpatrick Apr 22, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 347 additions and 1 deletion

crc32_generic.go src/hash/crc32/crc32_generic.go +1 -1

crc32_s390x.go src/hash/crc32/crc32_s390x.go +101 -0

crc32_s390x.s src/hash/crc32/crc32_s390x.s +245 -0

No files found.
--- a/src/hash/crc32/crc32_generic.go
+++ b/src/hash/crc32/crc32_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64,!amd64p32
+// +build !amd64,!amd64p32,!s390x

 package crc32


--- a/src/hash/crc32/crc32_s390x.go
+++ b/src/hash/crc32/crc32_s390x.go
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package crc32
+
+import (
+	"unsafe"
+)
+
+const (
+	vxMinLen    = 64
+	vxAlignment = 16
+	vxAlignMask = vxAlignment - 1
+)
+
+// hasVectorFacility reports whether the machine has the z/Architecture
+// vector facility installed and enabled.
+func hasVectorFacility() bool
+
+var hasVX = hasVectorFacility()
+
+// vectorizedCastagnoli implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedCastagnoli(crc uint32, p []byte) uint32
+
+// vectorizedIEEE implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedIEEE(crc uint32, p []byte) uint32
+
+func genericCastagnoli(crc uint32, p []byte) uint32 {
+	// Use slicing-by-8 on larger inputs.
+	if len(p) >= sliceBy8Cutoff {
+		return updateSlicingBy8(crc, castagnoliTable8, p)
+	}
+	return update(crc, castagnoliTable, p)
+}
+
+func genericIEEE(crc uint32, p []byte) uint32 {
+	// Use slicing-by-8 on larger inputs.
+	if len(p) >= sliceBy8Cutoff {
+		ieeeTable8Once.Do(func() {
+			ieeeTable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, ieeeTable8, p)
+	}
+	return update(crc, IEEETable, p)
+}
+
+// updateCastagnoli calculates the checksum of p using genericCastagnoli to
+// align the data appropriately for vectorCastagnoli. It avoids using
+// vectorCastagnoli entirely if the length of p is less than or equal to
+// vxMinLen.
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+	// Use vectorized function if vector facility is available and
+	// data length is above threshold.
+	if hasVX && len(p) > vxMinLen {
+		pAddr := uintptr(unsafe.Pointer(&p[0]))
+		if pAddr&vxAlignMask != 0 {
+			prealign := vxAlignment - int(pAddr&vxAlignMask)
+			crc = genericCastagnoli(crc, p[:prealign])
+			p = p[prealign:]
+		}
+		aligned := len(p) & ^vxAlignMask
+		crc = vectorizedCastagnoli(crc, p[:aligned])
+		p = p[aligned:]
+		// process remaining data
+		if len(p) > 0 {
+			crc = genericCastagnoli(crc, p)
+		}
+		return crc
+	}
+	return genericCastagnoli(crc, p)
+}
+
+// updateIEEE calculates the checksum of p using genericIEEE to align the data
+// appropriately for vectorIEEE. It avoids using vectorIEEE entirely if the length
+// of p is less than or equal to vxMinLen.
+func updateIEEE(crc uint32, p []byte) uint32 {
+	// Use vectorized function if vector facility is available and
+	// data length is above threshold.
+	if hasVX && len(p) > vxMinLen {
+		pAddr := uintptr(unsafe.Pointer(&p[0]))
+		if pAddr&vxAlignMask != 0 {
+			prealign := vxAlignment - int(pAddr&vxAlignMask)
+			crc = genericIEEE(crc, p[:prealign])
+			p = p[prealign:]
+		}
+		aligned := len(p) & ^vxAlignMask
+		crc = vectorizedIEEE(crc, p[:aligned])
+		p = p[aligned:]
+		// process remaining data
+		if len(p) > 0 {
+			crc = genericIEEE(crc, p)
+		}
+		return crc
+	}
+	return genericIEEE(crc, p)
+}
--- a/src/hash/crc32/crc32_s390x.s
+++ b/src/hash/crc32/crc32_s390x.s
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Vector register range containing CRC-32 constants
+
+#define CONST_PERM_LE2BE        V9
+#define CONST_R2R1              V10
+#define CONST_R4R3              V11
+#define CONST_R5                V12
+#define CONST_RU_POLY           V13
+#define CONST_CRC_POLY          V14
+
+
+// The CRC-32 constant block contains reduction constants to fold and
+// process particular chunks of the input data stream in parallel.
+//
+// Note that the constant definitions below are extended in order to compute
+// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+// The rightmost doubleword can be 0 to prevent contribution to the result or
+// can be multiplied by 1 to perform an XOR without the need for a separate
+// VECTOR EXCLUSIVE OR instruction.
+//
+// The polynomials used are bit-reflected:
+//
+//            IEEE: P'(x) = 0x0edb88320
+//      Castagnoli: P'(x) = 0x082f63b78
+
+
+// IEEE polynomial constants
+DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask
+DATA    ·crclecons+8(SB)/8,  $0x0706050403020100
+DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2
+DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1
+DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4
+DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3
+DATA    ·crclecons+48(SB)/8, $0x0000000000000000
+DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
+DATA    ·crclecons+64(SB)/8, $0x0000000000000000
+DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
+DATA    ·crclecons+80(SB)/8, $0x0000000000000000
+DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1
+
+GLOBL    ·crclecons(SB),RODATA, $144
+
+// Castagonli Polynomial constants
+DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask
+DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100
+DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2
+DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1
+DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4
+DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
+DATA    ·crcclecons+48(SB)/8, $0x0000000000000000
+DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
+DATA    ·crcclecons+64(SB)/8, $0x0000000000000000
+DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
+DATA    ·crcclecons+80(SB)/8, $0x0000000000000000
+DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1
+
+GLOBL   ·crcclecons(SB),RODATA, $144
+
+// func hasVectorFacility() bool
+TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
+	MOVD    $x-24(SP), R1
+	XC      $24, 0(R1), 0(R1) // clear the storage
+	MOVD    $2, R0            // R0 is the number of double words stored -1
+	WORD    $0xB2B01000       // STFLE 0(R1)
+	XOR     R0, R0            // reset the value of R0
+	MOVBZ   z-8(SP), R1
+	AND     $0x40, R1
+	BEQ     novector
+vectorinstalled:
+	// check if the vector instruction has been enabled
+	VLEIB   $0, $0xF, V16
+	VLGVB   $0, V16, R1
+	CMPBNE  R1, $0xF, novector
+	MOVB    $1, ret+0(FP) // have vx
+	RET
+novector:
+	MOVB    $0, ret+0(FP) // no vx
+	RET
+
+
+// The CRC-32 function(s) use these calling conventions:
+//
+// Parameters:
+//
+//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
+//      R3:    Input buffer pointer, performance might be improved if the
+//             buffer is on a doubleword boundary.
+//      R4:    Length of the buffer, must be 64 bytes or greater.
+//
+// Register usage:
+//
+//      R5:     CRC-32 constant pool base pointer.
+//      V0:     Initial CRC value and intermediate constants and results.
+//      V1..V4: Data for CRC computation.
+//      V5..V8: Next data chunks that are fetched from the input buffer.
+//
+//      V9..V14: CRC-32 constants.
+
+// func vectorizedIEEE(crc uint32, p []byte) uint32
+TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
+	MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
+	MOVD    p+8(FP), R3       // data pointer
+	MOVD    p_len+16(FP), R4  // len(p)
+
+	MOVD    $·crclecons(SB), R5
+	BR      vectorizedBody<>(SB)
+
+// func vectorizedCastagnoli(crc uint32, p []byte) uint32
+TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
+	MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
+	MOVD    p+8(FP), R3       // data pointer
+	MOVD    p_len+16(FP), R4  // len(p)
+
+	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
+	MOVD    $·crcclecons(SB), R5
+	BR      vectorizedBody<>(SB)
+
+TEXT vectorizedBody<>(SB),NOSPLIT,$0
+	XOR     $0xffffffff, R2 // NOTW R2
+	VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
+
+	// Load the initial CRC value into the rightmost word of V0
+	VZERO   V0
+	VLVGF   $3, R2, V0
+
+	// Load a 64-byte data chunk and XOR with CRC
+	VLM     0(R3), V1, V4    // 64-bytes into V1..V4
+
+	// Reflect the data if the CRC operation is in the bit-reflected domain
+	VPERM   V1, V1, CONST_PERM_LE2BE, V1
+	VPERM   V2, V2, CONST_PERM_LE2BE, V2
+	VPERM   V3, V3, CONST_PERM_LE2BE, V3
+	VPERM   V4, V4, CONST_PERM_LE2BE, V4
+
+	VX      V0, V1, V1     // V1 ^= CRC
+	ADD     $64, R3        // BUF = BUF + 64
+	ADD     $(-64), R4
+
+	// Check remaining buffer size and jump to proper folding method
+	CMP     R4, $64
+	BLT     less_than_64bytes
+
+fold_64bytes_loop:
+	// Load the next 64-byte data chunk into V5 to V8
+	VLM     0(R3), V5, V8
+	VPERM   V5, V5, CONST_PERM_LE2BE, V5
+	VPERM   V6, V6, CONST_PERM_LE2BE, V6
+	VPERM   V7, V7, CONST_PERM_LE2BE, V7
+	VPERM   V8, V8, CONST_PERM_LE2BE, V8
+
+
+	// Perform a GF(2) multiplication of the doublewords in V1 with
+	// the reduction constants in V0.  The intermediate result is
+	// then folded (accumulated) with the next data chunk in V5 and
+	// stored in V1.  Repeat this step for the register contents
+	// in V2, V3, and V4 respectively.
+
+	VGFMAG  CONST_R2R1, V1, V5, V1
+	VGFMAG  CONST_R2R1, V2, V6, V2
+	VGFMAG  CONST_R2R1, V3, V7, V3
+	VGFMAG  CONST_R2R1, V4, V8 ,V4
+
+	// Adjust buffer pointer and length for next loop
+	ADD     $64, R3                  // BUF = BUF + 64
+	ADD     $(-64), R4               // LEN = LEN - 64
+
+	CMP     R4, $64
+	BGE     fold_64bytes_loop
+
+less_than_64bytes:
+	// Fold V1 to V4 into a single 128-bit value in V1
+	VGFMAG  CONST_R4R3, V1, V2, V1
+	VGFMAG  CONST_R4R3, V1, V3, V1
+	VGFMAG  CONST_R4R3, V1, V4, V1
+
+	// Check whether to continue with 64-bit folding
+	CMP R4, $16
+	BLT final_fold
+
+fold_16bytes_loop:
+	VL      0(R3), V2               // Load next data chunk
+	VPERM   V2, V2, CONST_PERM_LE2BE, V2
+
+	VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk
+
+	// Adjust buffer pointer and size for folding next data chunk
+	ADD     $16, R3
+	ADD     $-16, R4
+
+	// Process remaining data chunks
+	CMP     R4 ,$16
+	BGE     fold_16bytes_loop
+
+final_fold:
+	VLEIB   $7, $0x40, V9
+	VSRLB   V9, CONST_R4R3, V0
+	VLEIG   $0, $1, V0
+
+	VGFMG   V0, V1, V1
+
+	VLEIB   $7, $0x20, V9         // Shift by words
+	VSRLB   V9, V1, V2            // Store remaining bits in V2
+	VUPLLF  V1, V1                // Split rightmost doubleword
+	VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
+
+
+	// The input values to the Barret reduction are the degree-63 polynomial
+	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
+	// constant u.  The Barret reduction result is the CRC value of R(x) mod
+	// P(x).
+	//
+	// The Barret reduction algorithm is defined as:
+	//
+	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+	//    3. C(x)  = R(x) XOR T2(x) mod x^32
+	//
+	// Note: To compensate the division by x^32, use the vector unpack
+	// instruction to move the leftmost word into the leftmost doubleword
+	// of the vector register.  The rightmost doubleword is multiplied
+	// with zero to not contribute to the intermedate results.
+
+
+	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
+	VUPLLF  V1, V2
+	VGFMG   CONST_RU_POLY, V2, V2
+
+
+	// Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
+	// V2 and XOR the intermediate result, T2(x),  with the value in V1.
+	// The final result is in the rightmost word of V2.
+
+	VUPLLF  V2 , V2
+	VGFMAG  CONST_CRC_POLY, V2, V1, V2
+
+done:
+	VLGVF   $2, V2, R2
+	XOR     $0xffffffff, R2 // NOTW R2
+	MOVWZ   R2, ret + 32(FP)
+	RET