Commit a712e20a authored by Rui Ueyama

runtime: speed up amd64 memmove

MOV with SSE registers seems faster than REP MOVSQ when the size
being copied is less than about 2K. Previously we did not use MOV
if the memory region was larger than 256 bytes. This patch improves
the performance of 257 to 2048 byte non-overlapping copies by using MOV.
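
For reference, the BenchmarkMemmove* numbers below are produced by
benchmarks of roughly the following shape (an illustrative sketch, not
the exact benchmark source in the tree; for []byte, the built-in copy
on non-overlapping slices goes through runtime·memmove):

    package memmove_test

    import "testing"

    // bmMemmove copies n bytes per iteration between two non-overlapping
    // byte slices; for []byte, the built-in copy ends up in runtime·memmove.
    func bmMemmove(b *testing.B, n int) {
            dst := make([]byte, n)
            src := make([]byte, n)
            b.SetBytes(int64(n)) // lets the testing package report MB/s
            for i := 0; i < b.N; i++ {
                    copy(dst, src)
            }
    }

    func BenchmarkMemmove256(b *testing.B)  { bmMemmove(b, 256) }
    func BenchmarkMemmove512(b *testing.B)  { bmMemmove(b, 512) }
    func BenchmarkMemmove2048(b *testing.B) { bmMemmove(b, 2048) }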

Here are the benchmark results on an Intel Xeon 3.5GHz (Nehalem).

benchmark               old ns/op    new ns/op    delta
BenchmarkMemmove16              4            4   +0.42%
BenchmarkMemmove32              5            5   -0.20%
BenchmarkMemmove64              6            6   -0.81%
BenchmarkMemmove128             7            7   -0.82%
BenchmarkMemmove256            10           10   +1.92%
BenchmarkMemmove512            29           16  -44.90%
BenchmarkMemmove1024           37           25  -31.55%
BenchmarkMemmove2048           55           44  -19.46%
BenchmarkMemmove4096           92           91   -0.76%

benchmark                old MB/s     new MB/s  speedup
BenchmarkMemmove16        3370.61      3356.88    1.00x
BenchmarkMemmove32        6368.68      6386.99    1.00x
BenchmarkMemmove64       10367.37     10462.62    1.01x
BenchmarkMemmove128      17551.16     17713.48    1.01x
BenchmarkMemmove256      24692.81     24142.99    0.98x
BenchmarkMemmove512      17428.70     31687.72    1.82x
BenchmarkMemmove1024     27401.82     40009.45    1.46x
BenchmarkMemmove2048     36884.86     45766.98    1.24x
BenchmarkMemmove4096     44295.91     44627.86    1.01x
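
(The MB/s figures are simply the copy size divided by the time per
operation: for example, 512 bytes in roughly 16.2 ns works out to about
31.7 GB/s, matching the 31687 MB/s above.)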

LGTM=khr
R=golang-codereviews, gobot, khr
CC=golang-codereviews
https://golang.org/cl/90500043
parent e3e48cd0
@@ -36,10 +36,13 @@ TEXT runtime·memmove(SB), NOSPLIT, $0-24
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSQ instruction is really fast
// for large sizes. The cutover is approximately 1K. We implement up to
// 256 because that is the maximum SSE register load (loading all data
// into registers lets us ignore copy direction).
// for large sizes. The cutover is approximately 2K.
tail:
// move_129through256 or smaller work whether or not the source and the
// destination memory regions overlap because they load all data into
// registers before writing it back. move_256through2048 on the other
// hand can be used only when the memory regions don't overlap or the copy
// direction is forward.
TESTQ BX, BX
JEQ move_0
CMPQ BX, $2
@@ -70,10 +73,12 @@ tail:
* forward copy loop
*/
forward:
CMPQ BX, $2048
JLS move_256through2048
MOVQ BX, CX
SHRQ $3, CX
ANDQ $7, BX
REP; MOVSQ
JMP tail
@@ -205,3 +210,42 @@ move_129through256:
MOVOU X14, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET
move_256through2048:
SUBQ $256, BX
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU 64(SI), X4
MOVOU 80(SI), X5
MOVOU 96(SI), X6
MOVOU 112(SI), X7
MOVOU 128(SI), X8
MOVOU 144(SI), X9
MOVOU 160(SI), X10
MOVOU 176(SI), X11
MOVOU 192(SI), X12
MOVOU 208(SI), X13
MOVOU 224(SI), X14
MOVOU 240(SI), X15
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, 64(DI)
MOVOU X5, 80(DI)
MOVOU X6, 96(DI)
MOVOU X7, 112(DI)
MOVOU X8, 128(DI)
MOVOU X9, 144(DI)
MOVOU X10, 160(DI)
MOVOU X11, 176(DI)
MOVOU X12, 192(DI)
MOVOU X13, 208(DI)
MOVOU X14, 224(DI)
MOVOU X15, 240(DI)
CMPQ BX, $256
LEAQ 256(SI), SI
LEAQ 256(DI), DI
JGE move_256through2048
JMP tail
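
For readers less familiar with Plan 9 assembly, the following Go sketch
models what the new move_256through2048 path does (illustrative only;
the name forwardBlockCopy is ours and the real implementation is the
assembly above): forward copies of at most 2048 bytes are processed in
256-byte blocks, each block going through the sixteen SSE registers
X0..X15, and the sub-256-byte remainder is handed back to the tail code.

    package main

    import "fmt"

    // forwardBlockCopy is an illustrative Go model of move_256through2048:
    // copy forward in 256-byte blocks (the assembly does 16 MOVOU loads into
    // X0..X15 followed by 16 MOVOU stores per block), then let the remaining
    // tail of fewer than 256 bytes fall through to the small-size code
    // ("JMP tail"). Like the assembly path, it is valid only for
    // non-overlapping or forward-direction copies.
    func forwardBlockCopy(dst, src []byte) {
            for len(src) >= 256 {
                    copy(dst[:256], src[:256]) // one 256-byte block per iteration
                    dst, src = dst[256:], src[256:]
            }
            copy(dst, src) // tail of fewer than 256 bytes
    }

    func main() {
            src := make([]byte, 1000) // 257..2048 bytes is the range this patch targets
            for i := range src {
                    src[i] = byte(i)
            }
            dst := make([]byte, len(src))
            forwardBlockCopy(dst, src)
            fmt.Println(dst[999] == src[999]) // prints true
    }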