Commit 8916773a authored by Meng Zhuo's avatar Meng Zhuo Committed by Brad Fitzpatrick

runtime, cmd/compile: use ldp for DUFFCOPY on ARM64

name         old time/op  new time/op  delta
CopyFat8     2.15ns ± 1%  2.19ns ± 6%     ~     (p=0.171 n=8+9)
CopyFat12    2.15ns ± 0%  2.17ns ± 2%     ~     (p=0.137 n=8+10)
CopyFat16    2.17ns ± 3%  2.15ns ± 0%     ~     (p=0.211 n=10+10)
CopyFat24    2.16ns ± 1%  2.15ns ± 0%     ~     (p=0.087 n=10+10)
CopyFat32    11.5ns ± 0%  12.8ns ± 2%  +10.87%  (p=0.000 n=8+10)
CopyFat64    20.2ns ± 2%  12.9ns ± 0%  -36.11%  (p=0.000 n=10+10)
CopyFat128   37.2ns ± 0%  21.5ns ± 0%  -42.20%  (p=0.000 n=10+10)
CopyFat256   71.6ns ± 0%  38.7ns ± 0%  -45.95%  (p=0.000 n=10+10)
CopyFat512    140ns ± 0%    73ns ± 0%  -47.86%  (p=0.000 n=10+9)
CopyFat520    142ns ± 0%    74ns ± 0%  -47.54%  (p=0.000 n=10+10)
CopyFat1024   277ns ± 0%   141ns ± 0%  -49.10%  (p=0.000 n=10+10)

Change-Id: If54bc571add5db674d5e081579c87e80153d0a5a
Reviewed-on: https://go-review.googlesource.com/97395Reviewed-by: 's avatarCherry Zhang <cherryyz@google.com>
parent baf3eb16
......@@ -462,11 +462,21 @@
(Move [s-s%8] dst src mem))
// medium move uses a duff device
// 8 and 128 are magic constants, see runtime/mkduff.go
(Move [s] dst src mem)
&& s%8 == 0 && s > 24 && s <= 8*128
&& s > 32 && s <= 16*64 && s%16 == 8
&& !config.noDuffDevice ->
(DUFFCOPY [8 * (128 - s/8)] dst src mem)
(MOVDstore [s-8] dst (MOVDload [s-8] src mem)
(DUFFCOPY <types.TypeMem> [8*(64-(s-8)/16)] dst src mem))
(Move [s] dst src mem)
&& s > 32 && s <= 16*64 && s%16 == 0
&& !config.noDuffDevice ->
(DUFFCOPY [8 * (64 - s/16)] dst src mem)
// 8 is the number of bytes to encode:
//
// LDP.P 16(R16), (R26, R27)
// STP.P (R26, R27), 16(R17)
//
// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy
// large move uses a loop
(Move [s] dst src mem)
......
......@@ -426,7 +426,7 @@ func init() {
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R17"), buildReg("R16")},
clobbers: buildReg("R16 R17 R30"),
clobbers: buildReg("R16 R17 R26 R30"),
},
faultOnNilArg0: true,
faultOnNilArg1: true,
......
......@@ -14656,7 +14656,7 @@ var opcodeTable = [...]opInfo{
{0, 131072}, // R17
{1, 65536}, // R16
},
clobbers: 537067520, // R16 R17 R30
clobbers: 604176384, // R16 R17 R26 R30
},
},
{
......
......@@ -17371,19 +17371,47 @@ func rewriteValueARM64_OpMove_10(v *Value) bool {
return true
}
// match: (Move [s] dst src mem)
// cond: s%8 == 0 && s > 24 && s <= 8*128 && !config.noDuffDevice
// result: (DUFFCOPY [8 * (128 - s/8)] dst src mem)
// cond: s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice
// result: (MOVDstore [s-8] dst (MOVDload [s-8] src mem) (DUFFCOPY <types.TypeMem> [8*(64-(s-8)/16)] dst src mem))
for {
s := v.AuxInt
_ = v.Args[2]
dst := v.Args[0]
src := v.Args[1]
mem := v.Args[2]
if !(s%8 == 0 && s > 24 && s <= 8*128 && !config.noDuffDevice) {
if !(s > 32 && s <= 16*64 && s%16 == 8 && !config.noDuffDevice) {
break
}
v.reset(OpARM64MOVDstore)
v.AuxInt = s - 8
v.AddArg(dst)
v0 := b.NewValue0(v.Pos, OpARM64MOVDload, typ.UInt64)
v0.AuxInt = s - 8
v0.AddArg(src)
v0.AddArg(mem)
v.AddArg(v0)
v1 := b.NewValue0(v.Pos, OpARM64DUFFCOPY, types.TypeMem)
v1.AuxInt = 8 * (64 - (s-8)/16)
v1.AddArg(dst)
v1.AddArg(src)
v1.AddArg(mem)
v.AddArg(v1)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice
// result: (DUFFCOPY [8 * (64 - s/16)] dst src mem)
for {
s := v.AuxInt
_ = v.Args[2]
dst := v.Args[0]
src := v.Args[1]
mem := v.Args[2]
if !(s > 32 && s <= 16*64 && s%16 == 0 && !config.noDuffDevice) {
break
}
v.reset(OpARM64DUFFCOPY)
v.AuxInt = 8 * (128 - s/8)
v.AuxInt = 8 * (64 - s/16)
v.AddArg(dst)
v.AddArg(src)
v.AddArg(mem)
......
......@@ -71,389 +71,197 @@ TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0
STP (ZR, ZR), (R16)
RET
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
MOVD.P 8(R16), R27
MOVD.P R27, 8(R17)
LDP.P 16(R16), (R26, R27)
STP.P (R26, R27), 16(R17)
RET
......@@ -450,6 +450,13 @@ func BenchmarkCopyFat512(b *testing.B) {
_ = y
}
}
func BenchmarkCopyFat520(b *testing.B) {
var x [520 / 4]uint32
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat1024(b *testing.B) {
var x [1024 / 4]uint32
for i := 0; i < b.N; i++ {
......
......@@ -164,12 +164,13 @@ func zeroARM64(w io.Writer) {
func copyARM64(w io.Writer) {
// R16 (aka REGRT1): ptr to source memory
// R17 (aka REGRT2): ptr to destination memory
// R27 (aka REGTMP): scratch space
// R26, R27 (aka REGTMP): scratch space
// R16 and R17 are updated as a side effect
fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
for i := 0; i < 128; i++ {
fmt.Fprintln(w, "\tMOVD.P\t8(R16), R27")
fmt.Fprintln(w, "\tMOVD.P\tR27, 8(R17)")
fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
for i := 0; i < 64; i++ {
fmt.Fprintln(w, "\tLDP.P\t16(R16), (R26, R27)")
fmt.Fprintln(w, "\tSTP.P\t(R26, R27), 16(R17)")
fmt.Fprintln(w)
}
fmt.Fprintln(w, "\tRET")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment