bytes: improve performance for bytes.Compare on ppc64x

This improves the performance for byte.Compare by rewriting the cmpbody function in runtime/asm_ppc64x.s. The previous code had a simple loop which loaded a pair of bytes and compared them, which is inefficient for long buffers. The updated function checks for 8 or 32 byte chunks and then loads and compares double words where possible. Because the byte.Compare result indicates greater or less than, the doubleword loads must take endianness into account, using a byte reversed load in the little endian case. Fixes #17433 benchmark old ns/op new ns/op delta BenchmarkBytesCompare/8-16 13.6 7.16 -47.35% BenchmarkBytesCompare/16-16 25.7 7.83 -69.53% BenchmarkBytesCompare/32-16 38.1 7.78 -79.58% BenchmarkBytesCompare/64-16 63.0 10.6 -83.17% BenchmarkBytesCompare/128-16 112 13.0 -88.39% BenchmarkBytesCompare/256-16 211 28.1 -86.68% BenchmarkBytesCompare/512-16 410 38.6 -90.59% BenchmarkBytesCompare/1024-16 807 60.2 -92.54% BenchmarkBytesCompare/2048-16 1601 103 -93.57% Change-Id: I121acc74fcd27c430797647b8d682eb0607c63eb Reviewed-on: https://go-review.googlesource.com/30949Reviewed-by: David Chase <drchase@google.com>

bytes: improve performance for bytes.Compare on ppc64x
This improves the performance for byte.Compare by rewriting the cmpbody function in runtime/asm_ppc64x.s. The previous code had a simple loop which loaded a pair of bytes and compared them, which is inefficient for long buffers. The updated function checks for 8 or 32 byte chunks and then loads and compares double words where possible. Because the byte.Compare result indicates greater or less than, the doubleword loads must take endianness into account, using a byte reversed load in the little endian case. Fixes #17433 benchmark old ns/op new ns/op delta BenchmarkBytesCompare/8-16 13.6 7.16 -47.35% BenchmarkBytesCompare/16-16 25.7 7.83 -69.53% BenchmarkBytesCompare/32-16 38.1 7.78 -79.58% BenchmarkBytesCompare/64-16 63.0 10.6 -83.17% BenchmarkBytesCompare/128-16 112 13.0 -88.39% BenchmarkBytesCompare/256-16 211 28.1 -86.68% BenchmarkBytesCompare/512-16 410 38.6 -90.59% BenchmarkBytesCompare/1024-16 807 60.2 -92.54% BenchmarkBytesCompare/2048-16 1601 103 -93.57% Change-Id: I121acc74fcd27c430797647b8d682eb0607c63eb Reviewed-on: https://go-review.googlesource.com/30949Reviewed-by: David Chase <drchase@google.com>
1e28dce8 · Lynn Boger · Michael Munday · 7c46f034 · 1e28dce8 · 1e28dce8
Commit 1e28dce8 authored Oct 13, 2016 by Lynn Boger Committed by Michael Munday Oct 17, 2016
Showing with 225 additions and 48 deletions

a.out.go src/cmd/internal/obj/ppc64/a.out.go +1 -0

anames.go src/cmd/internal/obj/ppc64/anames.go +1 -0

asm9.go src/cmd/internal/obj/ppc64/asm9.go +3 -0

asm_ppc64x.s src/runtime/asm_ppc64x.s +220 -48

No files found.
--- a/src/cmd/internal/obj/ppc64/a.out.go
+++ b/src/cmd/internal/obj/ppc64/a.out.go
@@ -424,6 +424,7 @@ const (
 	ALSW
 	ALWAR
 	ALWSYNC
+	AMOVDBR
 	AMOVWBR
 	AMOVB
 	AMOVBU

--- a/src/cmd/internal/obj/ppc64/anames.go
+++ b/src/cmd/internal/obj/ppc64/anames.go
@@ -125,6 +125,7 @@ var Anames = []string{
 	"LSW",
 	"LWAR",
 	"LWSYNC",
+	"MOVDBR",
 	"MOVWBR",
 	"MOVB",
 	"MOVBU",

--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -1513,6 +1513,7 @@ func buildop(ctxt *obj.Link) {

 		case AMOVHBR:
 			opset(AMOVWBR, r0)
+			opset(AMOVDBR, r0)

 		case ASLBMFEE:
 			opset(ASLBMFEV, r0)
@@ -3923,6 +3924,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 {
 		return OPVCC(31, 790, 0, 0) /* lhbrx */
 	case AMOVWBR:
 		return OPVCC(31, 534, 0, 0) /* lwbrx */
+	case AMOVDBR:
+		return OPVCC(31, 532, 0, 0) /* ldbrx */
 	case AMOVHZ:
 		return OPVCC(31, 279, 0, 0) /* lhzx */
 	case AMOVHZU:

--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -812,13 +812,220 @@ eq:
 	MOVB	R3, ret+16(FP)
 	RET

-// Do an efficieint memequal for ppc64
-// for reuse where possible.
+// Do an efficient memcmp for ppc64le
+// R3 = s1 len
+// R4 = s2 len
+// R5 = s1 addr
+// R6 = s2 addr
+// R7 = addr of return value
+TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+
+        // Special processing for 32 bytes or longer.
+        // Loading this way is faster and correct as long as the
+	// doublewords being compared are equal. Once they
+	// are found unequal, reload them in proper byte order
+	// to determine greater or less than.
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	MOVD	$0,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R4,R15		// bytes match?
+	MOVD	$8,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD    24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	MOVD	$16,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU    R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BNE	cmpne
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVDBR	(R5+R0),R9	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// 1st len > 2nd len must be greater
+simple:
+	MOVBZ	0(R5), R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6), R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9, R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+	BR	less		// must be less
+cmpne:				// only here is not equal
+	MOVDBR	(R5+R16),R8	// reload in reverse order
+	MOVDBR	(R6+R16),R9
+	CMPU	R8,R9		// compare correct endianness
+	BGT	greater		// here only if NE
+less:
+	MOVD	$-1,R3
+	MOVD	R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD	$0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
+
+// Do an efficient memcmp for ppc64 (BE)
+// R3 = s1 len
+// R4 = s2 len
+// R5 = s1 addr
+// R6 = s2 addr
+// R7 = addr of return value
+TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R14,R15		// bytes match?
+	BLT	less		// found less
+	BGT	greater		// found greater
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD	24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU	R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BLT	less		// with BE, byte ordering is
+	BGT	greater		// good for compare
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVD	(R5),R9
+	MOVD	(R6),R10
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC 	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// same len, must be equal
+simple:
+	MOVBZ	0(R5),R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6),R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9,R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+less:
+	MOVD	$-1,R3
+	MOVD    R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD    $0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
+
+// Do an efficient memequal for ppc64
 // R3 = s1
 // R4 = s2
 // R5 = len
 // R9 = return value
-// R6, R7 clobbered
 TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD    R5,CTR
 	CMP     R5,$8		// only optimize >=8
@@ -983,7 +1190,11 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
 	MOVD	s2_base+16(FP), R6
 	MOVD	s2_len+24(FP), R4
 	MOVD	$ret+32(FP), R7
-	BR	runtime·cmpbody<>(SB)
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif

 TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
 	MOVD	s1+0(FP), R5
@@ -991,50 +1202,11 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
 	MOVD	s2+24(FP), R6
 	MOVD	s2+32(FP), R4
 	MOVD	$ret+48(FP), R7
-	BR	runtime·cmpbody<>(SB)
-
-// On entry:
-// R3 is the length of s1
-// R4 is the length of s2
-// R5 points to the start of s1
-// R6 points to the start of s2
-// R7 points to return value (-1/0/1 will be written here)
-//
-// On exit:
-// R5, R6, R8, R9 and R10 are clobbered
-TEXT runtime·cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
-	CMP	R5, R6
-	BEQ	samebytes // same starting pointers; compare lengths
-	SUB	$1, R5
-	SUB	$1, R6
-	MOVD	R4, R8
-	CMP	R3, R4
-	BGE	2(PC)
-	MOVD	R3, R8	// R8 is min(R3, R4)
-	ADD	R5, R8	// R5 is current byte in s1, R8 is last byte in s1 to compare
-loop:
-	CMP	R5, R8
-	BEQ	samebytes // all compared bytes were the same; compare lengths
-	MOVBZU	1(R5), R9
-	MOVBZU	1(R6), R10
-	CMP	R9, R10
-	BEQ	loop
-	// bytes differed
-	MOVD	$1, R4
-	BGT	2(PC)
-	NEG	R4
-	MOVD	R4, (R7)
-	RET
-samebytes:
-	MOVD	$1, R8
-	CMP	R3, R4
-	BNE	3(PC)
-	MOVD	R0, (R7)
-	RET
-	BGT	2(PC)
-	NEG	R8
-	MOVD	R8, (R7)
-	RET
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif

 TEXT runtime·fastrand(SB), NOSPLIT, $0-4
 	MOVD	g_m(g), R4