bytes: Equal perf improvements on ppc64le/ppc64

The existing implementation for Equal and similar functions in the bytes package operate on one byte at at time. This performs poorly on ppc64/ppc64le especially when the byte buffers are large. This change improves those functions by loading and comparing double words where possible. The common code has been moved to a function that can be shared by the other functions in this file which perform the same type of comparison. Further optimizations are done for the case where >= 32 bytes are being compared. The new function memeqbody is used by memeq_varlen, Equal, and eqstring. When running the bytes test with -test.bench=Equal benchmark old MB/s new MB/s speedup BenchmarkEqual1 164.83 129.49 0.79x BenchmarkEqual6 563.51 445.47 0.79x BenchmarkEqual9 656.15 1099.00 1.67x BenchmarkEqual15 591.93 1024.30 1.73x BenchmarkEqual16 613.25 1914.12 3.12x BenchmarkEqual20 682.37 1687.04 2.47x BenchmarkEqual32 807.96 3843.29 4.76x BenchmarkEqual4K 1076.25 23280.51 21.63x BenchmarkEqual4M 1079.30 13120.14 12.16x BenchmarkEqual64M 1073.28 10876.92 10.13x It was determined that the degradation in the smaller byte tests were due to unfavorable code alignment of the single byte loop. Fixes #14368 Change-Id: I0dd87382c28887c70f4fbe80877a8ba03c31d7cd Reviewed-on: https://go-review.googlesource.com/20249Reviewed-by: Minux Ma <minux@golang.org>

bytes: Equal perf improvements on ppc64le/ppc64
The existing implementation for Equal and similar functions in the bytes package operate on one byte at at time. This performs poorly on ppc64/ppc64le especially when the byte buffers are large. This change improves those functions by loading and comparing double words where possible. The common code has been moved to a function that can be shared by the other functions in this file which perform the same type of comparison. Further optimizations are done for the case where >= 32 bytes are being compared. The new function memeqbody is used by memeq_varlen, Equal, and eqstring. When running the bytes test with -test.bench=Equal benchmark old MB/s new MB/s speedup BenchmarkEqual1 164.83 129.49 0.79x BenchmarkEqual6 563.51 445.47 0.79x BenchmarkEqual9 656.15 1099.00 1.67x BenchmarkEqual15 591.93 1024.30 1.73x BenchmarkEqual16 613.25 1914.12 3.12x BenchmarkEqual20 682.37 1687.04 2.47x BenchmarkEqual32 807.96 3843.29 4.76x BenchmarkEqual4K 1076.25 23280.51 21.63x BenchmarkEqual4M 1079.30 13120.14 12.16x BenchmarkEqual64M 1073.28 10876.92 10.13x It was determined that the degradation in the smaller byte tests were due to unfavorable code alignment of the single byte loop. Fixes #14368 Change-Id: I0dd87382c28887c70f4fbe80877a8ba03c31d7cd Reviewed-on: https://go-review.googlesource.com/20249Reviewed-by: Minux Ma <minux@golang.org>
baec1487 · Lynn Boger · Minux Ma · 516c6b40 · baec1487
Commit baec1487 authored Mar 07, 2016 by Lynn Boger Committed by Minux Ma Mar 23, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 106 additions and 72 deletions

asm_ppc64x.s src/runtime/asm_ppc64x.s +106 -72

No files found.
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -795,33 +795,13 @@ TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
 TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
 	MOVW	(R0), R1

-// memequal(p, q unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
-	MOVD	a+0(FP), R3
-	MOVD	b+8(FP), R4
-	CMP	R3, R4
-	BEQ	eq
-	MOVD	size+16(FP), R5
-	SUB	$1, R3
-	SUB	$1, R4
-	ADD	R3, R5, R8
-loop:
-	CMP	R3, R8
-	BNE	test
-	MOVD	$1, R3
-	MOVB	R3, ret+24(FP)
-	RET
-test:
-	MOVBZU	1(R3), R6
-	MOVBZU	1(R4), R7
-	CMP	R6, R7
-	BEQ	loop
+TEXT runtime·memequal(SB),NOSPLIT,$0-25
+	MOVD    a+0(FP), R3
+	MOVD    b+8(FP), R4
+	MOVD    size+16(FP), R5

-	MOVB	R0, ret+24(FP)
-	RET
-eq:
-	MOVD	$1, R1
-	MOVB	R1, ret+24(FP)
+	BL	runtime·memeqbody(SB)
+	MOVB    R9, ret+24(FP)
 	RET

 // memequal_varlen(a, b unsafe.Pointer) bool
@@ -831,75 +811,129 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
 	CMP	R3, R4
 	BEQ	eq
 	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
-	MOVD	R3, FIXED_FRAME+0(R1)
-	MOVD	R4, FIXED_FRAME+8(R1)
-	MOVD	R5, FIXED_FRAME+16(R1)
-	BL	runtime·memequal(SB)
-	MOVBZ	FIXED_FRAME+24(R1), R3
-	MOVB	R3, ret+16(FP)
+	BL	runtime·memeqbody(SB)
+	MOVB	R9, ret+16(FP)
 	RET
 eq:
 	MOVD	$1, R3
 	MOVB	R3, ret+16(FP)
 	RET

+// Do an efficieint memequal for ppc64
+// for reuse where possible.
+// R3 = s1
+// R4 = s2
+// R5 = len
+// R9 = return value
+// R6, R7 clobbered
+TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD    R5,CTR
+	CMP     R5,$8		// only optimize >=8
+	BLT     simplecheck
+	DCBT	(R3)		// cache hint
+	DCBT	(R4)
+	CMP	R5,$32		// optimize >= 32
+	MOVD	R5,R6		// needed if setup8a branch
+	BLT	setup8a		// 8 byte moves only
+setup32a:                       // 8 byte aligned, >= 32 bytes
+	SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
+	MOVD	R6,CTR
+loop32a:
+	MOVD    0(R3),R6        // doublewords to compare
+	MOVD    0(R4),R7
+	MOVD	8(R3),R8	//
+	MOVD	8(R4),R9
+	CMP     R6,R7           // bytes batch?
+	BNE     noteq
+	MOVD	16(R3),R6
+	MOVD	16(R4),R7
+	CMP     R8,R9		// bytes match?
+	MOVD	24(R3),R8
+	MOVD	24(R4),R9
+	BNE     noteq
+	CMP     R6,R7           // bytes match?
+	BNE	noteq
+	ADD     $32,R3		// bump up to next 32
+	ADD     $32,R4
+	CMP     R8,R9           // bytes match?
+	BC      8,2,loop32a	// br ctr and cr
+	BNE	noteq
+	ANDCC	$24,R5,R6       // Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC  $3,R6,R6        // get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD    R6,CTR
+loop8:
+	MOVD    0(R3),R6        // doublewords to compare
+	ADD	$8,R3
+	MOVD    0(R4),R7
+	ADD     $8,R4
+	CMP     R6,R7           // match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BNE     noteq
+leftover:
+	ANDCC   $7,R5,R6        // check for leftover bytes
+	BEQ     equal
+	MOVD    R6,CTR
+	BR	simple
+simplecheck:
+	CMP	R5,$0
+	BEQ	equal
+simple:
+	MOVBZ   0(R3), R6
+	ADD	$1,R3
+	MOVBZ   0(R4), R7
+	ADD     $1,R4
+	CMP     R6, R7
+	BNE     noteq
+	BC      8,2,simple
+	BNE	noteq
+	BR	equal
+noteq:
+	MOVD    $0, R9
+	RET
+equal:
+	MOVD    $1, R9
+	RET
+
 // eqstring tests whether two strings are equal.
 // The compiler guarantees that strings passed
 // to eqstring have equal length.
 // See runtime_test.go:eqstring_generic for
 // equivalent Go code.
 TEXT runtime·eqstring(SB),NOSPLIT,$0-33
-	MOVD	s1str+0(FP), R3
-	MOVD	s2str+16(FP), R4
-	MOVD	$1, R5
-	MOVB	R5, ret+32(FP)
-	CMP	R3, R4
-	BNE	2(PC)
+	MOVD    s1str+0(FP), R3
+	MOVD    s2str+16(FP), R4
+	MOVD    $1, R5
+	MOVB    R5, ret+32(FP)
+	CMP     R3, R4
+	BNE     2(PC)
 	RET
-	MOVD	s1len+8(FP), R5
-	SUB	$1, R3
-	SUB	$1, R4
-	ADD	R3, R5, R8
-loop:
-	CMP	R3, R8
-	BNE	2(PC)
-	RET
-	MOVBZU	1(R3), R6
-	MOVBZU	1(R4), R7
-	CMP	R6, R7
-	BEQ	loop
-	MOVB	R0, ret+32(FP)
+	MOVD    s1len+8(FP), R5
+	BL      runtime·memeqbody(SB)
+	MOVB    R9, ret+32(FP)
 	RET

-// TODO: share code with memequal?
 TEXT bytes·Equal(SB),NOSPLIT,$0-49
-	MOVD	a_len+8(FP), R3
-	MOVD	b_len+32(FP), R4
-
-	CMP	R3, R4		// unequal lengths are not equal
+	MOVD	a_len+8(FP), R4
+	MOVD	b_len+32(FP), R5
+	CMP	R5, R4		// unequal lengths are not equal
 	BNE	noteq
+	MOVD	a+0(FP), R3
+	MOVD	b+24(FP), R4
+	BL	runtime·memeqbody(SB)

-	MOVD	a+0(FP), R5
-	MOVD	b+24(FP), R6
-	SUB	$1, R5
-	SUB	$1, R6
-	ADD	R5, R3		// end-1
-
-loop:
-	CMP	R5, R3
-	BEQ	equal		// reached the end
-	MOVBZU	1(R5), R4
-	MOVBZU	1(R6), R7
-	CMP	R4, R7
-	BEQ	loop
+	MOVBZ	R9,ret+48(FP)
+	RET

 noteq:
-	MOVBZ	R0, ret+48(FP)
+	MOVBZ	$0,ret+48(FP)
 	RET

 equal:
-	MOVD	$1, R3
-	MOVBZ	R3, ret+48(FP)
+	MOVD	$1,R3
+	MOVBZ	R3,ret+48(FP)
 	RET

 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40