internal/bytealg: optimize Equal on arm64

Currently the 16-byte loop chunk16_loop is implemented with NEON instructions LD1, VMOV and VCMEQ. Using scalar instructions LDP and CMP to achieve this loop can reduce the number of clock cycles. For cases where the length of strings are between 4 to 15 bytes, loading the last 8 or 4 bytes at a time to reduce the number of comparisons. Benchmarks: name old time/op new time/op delta Equal/0-8 5.51ns ± 0% 5.84ns ±14% ~ (p=0.246 n=7+8) Equal/1-8 10.5ns ± 0% 10.5ns ± 0% ~ (all equal) Equal/6-8 14.0ns ± 0% 12.5ns ± 0% -10.71% (p=0.000 n=8+8) Equal/9-8 13.5ns ± 0% 12.5ns ± 0% -7.41% (p=0.000 n=8+8) Equal/15-8 15.5ns ± 0% 12.5ns ± 0% -19.35% (p=0.000 n=8+8) Equal/16-8 14.0ns ± 0% 13.0ns ± 0% -7.14% (p=0.000 n=8+8) Equal/20-8 16.5ns ± 0% 16.0ns ± 0% -3.03% (p=0.000 n=8+8) Equal/32-8 16.5ns ± 0% 15.3ns ± 0% -7.27% (p=0.000 n=8+8) Equal/4K-8 552ns ± 0% 553ns ± 0% ~ (p=0.315 n=8+8) Equal/4M-8 1.13ms ±23% 1.20ms ±27% ~ (p=0.442 n=8+8) Equal/64M-8 32.9ms ± 0% 32.6ms ± 0% -1.15% (p=0.000 n=8+8) CompareBytesEqual-8 12.0ns ± 0% 12.0ns ± 0% ~ (all equal) Change-Id: If317ecdcc98e31883d37fd7d42b113b548c5bd2a Reviewed-on: https://go-review.googlesource.com/112496Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com>

internal/bytealg: optimize Equal on arm64
Currently the 16-byte loop chunk16_loop is implemented with NEON instructions LD1, VMOV and VCMEQ. Using scalar instructions LDP and CMP to achieve this loop can reduce the number of clock cycles. For cases where the length of strings are between 4 to 15 bytes, loading the last 8 or 4 bytes at a time to reduce the number of comparisons. Benchmarks: name old time/op new time/op delta Equal/0-8 5.51ns ± 0% 5.84ns ±14% ~ (p=0.246 n=7+8) Equal/1-8 10.5ns ± 0% 10.5ns ± 0% ~ (all equal) Equal/6-8 14.0ns ± 0% 12.5ns ± 0% -10.71% (p=0.000 n=8+8) Equal/9-8 13.5ns ± 0% 12.5ns ± 0% -7.41% (p=0.000 n=8+8) Equal/15-8 15.5ns ± 0% 12.5ns ± 0% -19.35% (p=0.000 n=8+8) Equal/16-8 14.0ns ± 0% 13.0ns ± 0% -7.14% (p=0.000 n=8+8) Equal/20-8 16.5ns ± 0% 16.0ns ± 0% -3.03% (p=0.000 n=8+8) Equal/32-8 16.5ns ± 0% 15.3ns ± 0% -7.27% (p=0.000 n=8+8) Equal/4K-8 552ns ± 0% 553ns ± 0% ~ (p=0.315 n=8+8) Equal/4M-8 1.13ms ±23% 1.20ms ±27% ~ (p=0.442 n=8+8) Equal/64M-8 32.9ms ± 0% 32.6ms ± 0% -1.15% (p=0.000 n=8+8) CompareBytesEqual-8 12.0ns ± 0% 12.0ns ± 0% ~ (all equal) Change-Id: If317ecdcc98e31883d37fd7d42b113b548c5bd2a Reviewed-on: https://go-review.googlesource.com/112496Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com>
de28555c · erifan01 · Cherry Zhang · 21f3d581 · de28555c
Commit de28555c authored May 07, 2018 by erifan01 Committed by Cherry Zhang Sep 12, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 28 additions and 16 deletions

equal_arm64.s src/internal/bytealg/equal_arm64.s +28 -16

No files found.
--- a/src/internal/bytealg/equal_arm64.s
+++ b/src/internal/bytealg/equal_arm64.s
@@ -67,6 +67,7 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
 	CMP	R3, R4
 	BEQ	eq
 	MOVD	8(R26), R5    // compiler stores size at offset 8 in the closure
+	CBZ	R5, eq
 	MOVD	R3, 8(RSP)
 	MOVD	R4, 16(RSP)
 	MOVD	R5, 24(RSP)
@@ -119,30 +120,41 @@ chunk16:
 	CBZ	R3, tail
 	ADD	R3, R0, R6	// end of chunks
 chunk16_loop:
-	VLD1.P	(R0), [V0.D2]
-	VLD1.P	(R2), [V1.D2]
-	VCMEQ	V0.D2, V1.D2, V2.D2
+	LDP.P	16(R0), (R4, R5)
+	LDP.P	16(R2), (R7, R9)
+	EOR	R4, R7
+	CBNZ	R7, not_equal
+	EOR	R5, R9
+	CBNZ	R9, not_equal
 	CMP	R0, R6
-	VMOV	V2.D[0], R4
-	VMOV	V2.D[1], R5
-	CBZ	R4, not_equal
-	CBZ	R5, not_equal
 	BNE	chunk16_loop
 	AND	$0xf, R1, R1
 	CBZ	R1, equal
 tail:
 	// special compare of tail with length < 16
 	TBZ	$3, R1, lt_8
-	MOVD.P	8(R0), R4
-	MOVD.P	8(R2), R5
-	CMP	R4, R5
-	BNE	not_equal
+	MOVD	(R0), R4
+	MOVD	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$8, R1, R6	// offset of the last 8 bytes
+	MOVD	(R0)(R6), R4
+	MOVD	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
 lt_8:
 	TBZ	$2, R1, lt_4
-	MOVWU.P	4(R0), R4
-	MOVWU.P	4(R2), R5
-	CMP	R4, R5
-	BNE	not_equal
+	MOVWU	(R0), R4
+	MOVWU	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$4, R1, R6	// offset of the last 4 bytes
+	MOVWU	(R0)(R6), R4
+	MOVWU	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
 lt_4:
 	TBZ	$1, R1, lt_2
 	MOVHU.P	2(R0), R4
@@ -150,7 +162,7 @@ lt_4:
 	CMP	R4, R5
 	BNE	not_equal
 lt_2:
-	TBZ     $0, R1, equal
+	TBZ	$0, R1, equal
 one:
 	MOVBU	(R0), R4
 	MOVBU	(R2), R5