Commit a712e20a authored by Rui Ueyama

runtime: speed up amd64 memmove

MOV with SSE registers seems faster than REP MOVSQ when the size
being copied is less than about 2K. Previously we did not use MOV
if the memory region was larger than 256 bytes. This patch improves
the performance of 257 to 2048 byte non-overlapping copies by using MOV.
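
For reference, the BenchmarkMemmove* numbers below are produced by
benchmarks of roughly the following shape (an illustrative sketch, not
the exact benchmark source in the tree; for []byte, the built-in copy
on non-overlapping slices goes through runtime·memmove):

    package memmove_test

    import "testing"

    // bmMemmove copies n bytes per iteration between two non-overlapping
    // byte slices; for []byte, the built-in copy ends up in runtime·memmove.
    func bmMemmove(b *testing.B, n int) {
            dst := make([]byte, n)
            src := make([]byte, n)
            b.SetBytes(int64(n)) // lets the testing package report MB/s
            for i := 0; i < b.N; i++ {
                    copy(dst, src)
            }
    }

    func BenchmarkMemmove256(b *testing.B)  { bmMemmove(b, 256) }
    func BenchmarkMemmove512(b *testing.B)  { bmMemmove(b, 512) }
    func BenchmarkMemmove2048(b *testing.B) { bmMemmove(b, 2048) }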

Here are the benchmark results on an Intel Xeon 3.5GHz (Nehalem).

benchmark               old ns/op    new ns/op    delta
BenchmarkMemmove16              4            4   +0.42%
BenchmarkMemmove32              5            5   -0.20%
BenchmarkMemmove64              6            6   -0.81%
BenchmarkMemmove128             7            7   -0.82%
BenchmarkMemmove256            10           10   +1.92%
BenchmarkMemmove512            29           16  -44.90%
BenchmarkMemmove1024           37           25  -31.55%
BenchmarkMemmove2048           55           44  -19.46%
BenchmarkMemmove4096           92           91   -0.76%

benchmark                old MB/s     new MB/s  speedup
BenchmarkMemmove16        3370.61      3356.88    1.00x
BenchmarkMemmove32        6368.68      6386.99    1.00x
BenchmarkMemmove64       10367.37     10462.62    1.01x
BenchmarkMemmove128      17551.16     17713.48    1.01x
BenchmarkMemmove256      24692.81     24142.99    0.98x
BenchmarkMemmove512      17428.70     31687.72    1.82x
BenchmarkMemmove1024     27401.82     40009.45    1.46x
BenchmarkMemmove2048     36884.86     45766.98    1.24x
BenchmarkMemmove4096     44295.91     44627.86    1.01x
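
(The MB/s figures are simply the copy size divided by the time per
operation: for example, 512 bytes in roughly 16.2 ns works out to about
31.7 GB/s, matching the 31687 MB/s above.)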

LGTM=khr
R=golang-codereviews, gobot, khr
CC=golang-codereviews
https://golang.org/cl/90500043
parent e3e48cd0
@@ -36,10 +36,13 @@ TEXT runtime·memmove(SB), NOSPLIT, $0-24
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSQ instruction is really fast
// for large sizes. The cutover is approximately 1K. We implement up to
// 256 because that is the maximum SSE register load (loading all data
// into registers lets us ignore copy direction).
// for large sizes. The cutover is approximately 2K.
tail:
// move_129through256 or smaller work whether or not the source and the
// destination memory regions overlap because they load all data into
// registers before writing it back. move_256through2048 on the other
// hand can be used only when the memory regions don't overlap or the copy
// direction is forward.
TESTQ BX, BX
JEQ move_0
CMPQ BX, $2
@@ -70,10 +73,12 @@ tail:
* forward copy loop
*/
forward:
CMPQ BX, $2048
JLS move_256through2048
MOVQ BX, CX
SHRQ $3, CX
ANDQ $7, BX
REP; MOVSQ
JMP tail
@@ -205,3 +210,42 @@ move_129through256:
MOVOU X14, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET
move_256through2048:
SUBQ $256, BX
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU 64(SI), X4
MOVOU 80(SI), X5
MOVOU 96(SI), X6
MOVOU 112(SI), X7
MOVOU 128(SI), X8
MOVOU 144(SI), X9
MOVOU 160(SI), X10
MOVOU 176(SI), X11
MOVOU 192(SI), X12
MOVOU 208(SI), X13
MOVOU 224(SI), X14
MOVOU 240(SI), X15
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, 64(DI)
MOVOU X5, 80(DI)
MOVOU X6, 96(DI)
MOVOU X7, 112(DI)
MOVOU X8, 128(DI)
MOVOU X9, 144(DI)
MOVOU X10, 160(DI)
MOVOU X11, 176(DI)
MOVOU X12, 192(DI)
MOVOU X13, 208(DI)
MOVOU X14, 224(DI)
MOVOU X15, 240(DI)
CMPQ BX, $256
LEAQ 256(SI), SI
LEAQ 256(DI), DI
JGE move_256through2048
JMP tail
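
For readers less familiar with Plan 9 assembly, the following Go sketch
models what the new move_256through2048 path does (illustrative only;
the name forwardBlockCopy is ours and the real implementation is the
assembly above): forward copies of at most 2048 bytes are processed in
256-byte blocks, each block going through the sixteen SSE registers
X0..X15, and the sub-256-byte remainder is handed back to the tail code.

    package main

    import "fmt"

    // forwardBlockCopy is an illustrative Go model of move_256through2048:
    // copy forward in 256-byte blocks (the assembly does 16 MOVOU loads into
    // X0..X15 followed by 16 MOVOU stores per block), then let the remaining
    // tail of fewer than 256 bytes fall through to the small-size code
    // ("JMP tail"). Like the assembly path, it is valid only for
    // non-overlapping or forward-direction copies.
    func forwardBlockCopy(dst, src []byte) {
            for len(src) >= 256 {
                    copy(dst[:256], src[:256]) // one 256-byte block per iteration
                    dst, src = dst[256:], src[256:]
            }
            copy(dst, src) // tail of fewer than 256 bytes
    }

    func main() {
            src := make([]byte, 1000) // 257..2048 bytes is the range this patch targets
            for i := range src {
                    src[i] = byte(i)
            }
            dst := make([]byte, len(src))
            forwardBlockCopy(dst, src)
            fmt.Println(dst[999] == src[999]) // prints true
    }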