Commit 5833d843 authored by Chris Zou's avatar Chris Zou Committed by Brad Fitzpatrick

hash/crc32: use vector instructions on s390x

The input buffer is aligned to a doubleword boundary to
improve performance of the vector instructions. The pure
Go implementation is used to align the input data, and is
also used when the vector instructions are not available
or the data length is less than 64 bytes.

Change-Id: Ie259a5f2f1562bcc17961c99e5776c99091d6bed
Reviewed-on: https://go-review.googlesource.com/22201Reviewed-by: 's avatarMichael Munday <munday@ca.ibm.com>
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarBill O'Farrell <billotosyr@gmail.com>
Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
parent 7879e919
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!amd64p32
// +build !amd64,!amd64p32,!s390x
package crc32
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crc32
import (
"unsafe"
)
const (
vxMinLen = 64
vxAlignment = 16
vxAlignMask = vxAlignment - 1
)
// hasVectorFacility reports whether the machine has the z/Architecture
// vector facility installed and enabled.
func hasVectorFacility() bool
var hasVX = hasVectorFacility()
// vectorizedCastagnoli implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedCastagnoli(crc uint32, p []byte) uint32
// vectorizedIEEE implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedIEEE(crc uint32, p []byte) uint32
func genericCastagnoli(crc uint32, p []byte) uint32 {
// Use slicing-by-8 on larger inputs.
if len(p) >= sliceBy8Cutoff {
return updateSlicingBy8(crc, castagnoliTable8, p)
}
return update(crc, castagnoliTable, p)
}
func genericIEEE(crc uint32, p []byte) uint32 {
// Use slicing-by-8 on larger inputs.
if len(p) >= sliceBy8Cutoff {
ieeeTable8Once.Do(func() {
ieeeTable8 = makeTable8(IEEE)
})
return updateSlicingBy8(crc, ieeeTable8, p)
}
return update(crc, IEEETable, p)
}
// updateCastagnoli calculates the checksum of p using genericCastagnoli to
// align the data appropriately for vectorCastagnoli. It avoids using
// vectorCastagnoli entirely if the length of p is less than or equal to
// vxMinLen.
func updateCastagnoli(crc uint32, p []byte) uint32 {
// Use vectorized function if vector facility is available and
// data length is above threshold.
if hasVX && len(p) > vxMinLen {
pAddr := uintptr(unsafe.Pointer(&p[0]))
if pAddr&vxAlignMask != 0 {
prealign := vxAlignment - int(pAddr&vxAlignMask)
crc = genericCastagnoli(crc, p[:prealign])
p = p[prealign:]
}
aligned := len(p) & ^vxAlignMask
crc = vectorizedCastagnoli(crc, p[:aligned])
p = p[aligned:]
// process remaining data
if len(p) > 0 {
crc = genericCastagnoli(crc, p)
}
return crc
}
return genericCastagnoli(crc, p)
}
// updateIEEE calculates the checksum of p using genericIEEE to align the data
// appropriately for vectorIEEE. It avoids using vectorIEEE entirely if the length
// of p is less than or equal to vxMinLen.
func updateIEEE(crc uint32, p []byte) uint32 {
// Use vectorized function if vector facility is available and
// data length is above threshold.
if hasVX && len(p) > vxMinLen {
pAddr := uintptr(unsafe.Pointer(&p[0]))
if pAddr&vxAlignMask != 0 {
prealign := vxAlignment - int(pAddr&vxAlignMask)
crc = genericIEEE(crc, p[:prealign])
p = p[prealign:]
}
aligned := len(p) & ^vxAlignMask
crc = vectorizedIEEE(crc, p[:aligned])
p = p[aligned:]
// process remaining data
if len(p) > 0 {
crc = genericIEEE(crc, p)
}
return crc
}
return genericIEEE(crc, p)
}
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// Vector register range containing CRC-32 constants
#define CONST_PERM_LE2BE V9
#define CONST_R2R1 V10
#define CONST_R4R3 V11
#define CONST_R5 V12
#define CONST_RU_POLY V13
#define CONST_CRC_POLY V14
// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// The rightmost doubleword can be 0 to prevent contribution to the result or
// can be multiplied by 1 to perform an XOR without the need for a separate
// VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
// IEEE: P'(x) = 0x0edb88320
// Castagnoli: P'(x) = 0x082f63b78
// IEEE polynomial constants
DATA ·crclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
DATA ·crclecons+8(SB)/8, $0x0706050403020100
DATA ·crclecons+16(SB)/8, $0x00000001c6e41596 // R2
DATA ·crclecons+24(SB)/8, $0x0000000154442bd4 // R1
DATA ·crclecons+32(SB)/8, $0x00000000ccaa009e // R4
DATA ·crclecons+40(SB)/8, $0x00000001751997d0 // R3
DATA ·crclecons+48(SB)/8, $0x0000000000000000
DATA ·crclecons+56(SB)/8, $0x0000000163cd6124 // R5
DATA ·crclecons+64(SB)/8, $0x0000000000000000
DATA ·crclecons+72(SB)/8, $0x00000001F7011641 // u'
DATA ·crclecons+80(SB)/8, $0x0000000000000000
DATA ·crclecons+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
GLOBL ·crclecons(SB),RODATA, $144
// Castagonli Polynomial constants
DATA ·crcclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
DATA ·crcclecons+8(SB)/8, $0x0706050403020100
DATA ·crcclecons+16(SB)/8, $0x000000009e4addf8 // R2
DATA ·crcclecons+24(SB)/8, $0x00000000740eef02 // R1
DATA ·crcclecons+32(SB)/8, $0x000000014cd00bd6 // R4
DATA ·crcclecons+40(SB)/8, $0x00000000f20c0dfe // R3
DATA ·crcclecons+48(SB)/8, $0x0000000000000000
DATA ·crcclecons+56(SB)/8, $0x00000000dd45aab8 // R5
DATA ·crcclecons+64(SB)/8, $0x0000000000000000
DATA ·crcclecons+72(SB)/8, $0x00000000dea713f1 // u'
DATA ·crcclecons+80(SB)/8, $0x0000000000000000
DATA ·crcclecons+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
GLOBL ·crcclecons(SB),RODATA, $144
// func hasVectorFacility() bool
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
MOVD $x-24(SP), R1
XC $24, 0(R1), 0(R1) // clear the storage
MOVD $2, R0 // R0 is the number of double words stored -1
WORD $0xB2B01000 // STFLE 0(R1)
XOR R0, R0 // reset the value of R0
MOVBZ z-8(SP), R1
AND $0x40, R1
BEQ novector
vectorinstalled:
// check if the vector instruction has been enabled
VLEIB $0, $0xF, V16
VLGVB $0, V16, R1
CMPBNE R1, $0xF, novector
MOVB $1, ret+0(FP) // have vx
RET
novector:
MOVB $0, ret+0(FP) // no vx
RET
// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
// R2: Initial CRC value, typically ~0; and final CRC (return) value.
// R3: Input buffer pointer, performance might be improved if the
// buffer is on a doubleword boundary.
// R4: Length of the buffer, must be 64 bytes or greater.
//
// Register usage:
//
// R5: CRC-32 constant pool base pointer.
// V0: Initial CRC value and intermediate constants and results.
// V1..V4: Data for CRC computation.
// V5..V8: Next data chunks that are fetched from the input buffer.
//
// V9..V14: CRC-32 constants.
// func vectorizedIEEE(crc uint32, p []byte) uint32
TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
MOVWZ crc+0(FP), R2 // R2 stores the CRC value
MOVD p+8(FP), R3 // data pointer
MOVD p_len+16(FP), R4 // len(p)
MOVD $·crclecons(SB), R5
BR vectorizedBody<>(SB)
// func vectorizedCastagnoli(crc uint32, p []byte) uint32
TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
MOVWZ crc+0(FP), R2 // R2 stores the CRC value
MOVD p+8(FP), R3 // data pointer
MOVD p_len+16(FP), R4 // len(p)
// R5: crc-32 constant pool base pointer, constant is used to reduce crc
MOVD $·crcclecons(SB), R5
BR vectorizedBody<>(SB)
TEXT vectorizedBody<>(SB),NOSPLIT,$0
XOR $0xffffffff, R2 // NOTW R2
VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
// Load the initial CRC value into the rightmost word of V0
VZERO V0
VLVGF $3, R2, V0
// Load a 64-byte data chunk and XOR with CRC
VLM 0(R3), V1, V4 // 64-bytes into V1..V4
// Reflect the data if the CRC operation is in the bit-reflected domain
VPERM V1, V1, CONST_PERM_LE2BE, V1
VPERM V2, V2, CONST_PERM_LE2BE, V2
VPERM V3, V3, CONST_PERM_LE2BE, V3
VPERM V4, V4, CONST_PERM_LE2BE, V4
VX V0, V1, V1 // V1 ^= CRC
ADD $64, R3 // BUF = BUF + 64
ADD $(-64), R4
// Check remaining buffer size and jump to proper folding method
CMP R4, $64
BLT less_than_64bytes
fold_64bytes_loop:
// Load the next 64-byte data chunk into V5 to V8
VLM 0(R3), V5, V8
VPERM V5, V5, CONST_PERM_LE2BE, V5
VPERM V6, V6, CONST_PERM_LE2BE, V6
VPERM V7, V7, CONST_PERM_LE2BE, V7
VPERM V8, V8, CONST_PERM_LE2BE, V8
// Perform a GF(2) multiplication of the doublewords in V1 with
// the reduction constants in V0. The intermediate result is
// then folded (accumulated) with the next data chunk in V5 and
// stored in V1. Repeat this step for the register contents
// in V2, V3, and V4 respectively.
VGFMAG CONST_R2R1, V1, V5, V1
VGFMAG CONST_R2R1, V2, V6, V2
VGFMAG CONST_R2R1, V3, V7, V3
VGFMAG CONST_R2R1, V4, V8 ,V4
// Adjust buffer pointer and length for next loop
ADD $64, R3 // BUF = BUF + 64
ADD $(-64), R4 // LEN = LEN - 64
CMP R4, $64
BGE fold_64bytes_loop
less_than_64bytes:
// Fold V1 to V4 into a single 128-bit value in V1
VGFMAG CONST_R4R3, V1, V2, V1
VGFMAG CONST_R4R3, V1, V3, V1
VGFMAG CONST_R4R3, V1, V4, V1
// Check whether to continue with 64-bit folding
CMP R4, $16
BLT final_fold
fold_16bytes_loop:
VL 0(R3), V2 // Load next data chunk
VPERM V2, V2, CONST_PERM_LE2BE, V2
VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
// Adjust buffer pointer and size for folding next data chunk
ADD $16, R3
ADD $-16, R4
// Process remaining data chunks
CMP R4 ,$16
BGE fold_16bytes_loop
final_fold:
VLEIB $7, $0x40, V9
VSRLB V9, CONST_R4R3, V0
VLEIG $0, $1, V0
VGFMG V0, V1, V1
VLEIB $7, $0x20, V9 // Shift by words
VSRLB V9, V1, V2 // Store remaining bits in V2
VUPLLF V1, V1 // Split rightmost doubleword
VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
// The input values to the Barret reduction are the degree-63 polynomial
// in V1 (R(x)), degree-32 generator polynomial, and the reduction
// constant u. The Barret reduction result is the CRC value of R(x) mod
// P(x).
//
// The Barret reduction algorithm is defined as:
//
// 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
// 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
// 3. C(x) = R(x) XOR T2(x) mod x^32
//
// Note: To compensate the division by x^32, use the vector unpack
// instruction to move the leftmost word into the leftmost doubleword
// of the vector register. The rightmost doubleword is multiplied
// with zero to not contribute to the intermedate results.
// T1(x) = floor( R(x) / x^32 ) GF2MUL u
VUPLLF V1, V2
VGFMG CONST_RU_POLY, V2, V2
// Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
// V2 and XOR the intermediate result, T2(x), with the value in V1.
// The final result is in the rightmost word of V2.
VUPLLF V2 , V2
VGFMAG CONST_CRC_POLY, V2, V1, V2
done:
VLGVF $2, V2, R2
XOR $0xffffffff, R2 // NOTW R2
MOVWZ R2, ret + 32(FP)
RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment