Commit 6c7cbf08 authored by Keith Randall

runtime: get rid of most uses of REP for copying/zeroing.

REP MOVSQ and REP STOSQ have a really high startup overhead.
Use a Duff's device to do the repetition instead.
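
For the sizes involved, the compiler now picks among three strategies based on
the number of machine words q to move.  A minimal Go sketch of the selection
logic the amd64 hunks below implement (constants taken from the diff; the
helper name is illustrative):

package main

import "fmt"

// copyStrategy mirrors the new sgen logic in 6g; q counts 8-byte words.
func copyStrategy(q int64) string {
	switch {
	case q > 128: // more than 1KB: REP startup cost is amortized
		return "REP; MOVSQ"
	case q >= 4: // mid-size: jump into the unrolled duffcopy routine
		return fmt.Sprintf("CALL runtime.duffcopy+%d", 14*(128-q))
	default: // tiny: emit the MOVSQs inline
		return fmt.Sprintf("%d x MOVSQ", q)
	}
}

func main() {
	for _, q := range []int64{2, 16, 200} {
		fmt.Println(q, "->", copyStrategy(q))
	}
}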

benchmark                 old ns/op     new ns/op     delta
BenchmarkClearFat32       7.20          1.60          -77.78%
BenchmarkCopyFat32        6.88          2.38          -65.41%
BenchmarkClearFat64       7.15          3.20          -55.24%
BenchmarkCopyFat64        6.88          3.44          -50.00%
BenchmarkClearFat128      9.53          5.34          -43.97%
BenchmarkCopyFat128       9.27          5.56          -40.02%
BenchmarkClearFat256      13.8          9.53          -30.94%
BenchmarkCopyFat256       13.5          10.3          -23.70%
BenchmarkClearFat512      22.3          18.0          -19.28%
BenchmarkCopyFat512       22.0          19.7          -10.45%
BenchmarkClearFat1024     35.1          35.0          -0.28%
BenchmarkCopyFat1024      36.5          38.4          +5.21%

TODO: use for stack frame zeroing
TODO: REP prefixes are still used for "reverse" copying when src/dst
regions overlap.  Might be worth fixing.
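
On that second TODO: when dst overlaps src at a higher address, a forward copy
reads bytes it has already overwritten, which is why the compiler still falls
back to STD + REP MOVSQ there.  A small Go illustration of the hazard (not
from this CL):

package main

import "fmt"

// forwardCopy copies front-to-back, the direction duffcopy uses.
func forwardCopy(dst, src []byte) {
	for i := range src {
		dst[i] = src[i]
	}
}

func main() {
	buf := []byte("abcdef")
	forwardCopy(buf[2:], buf[:4]) // dst starts 2 bytes above src; regions overlap
	fmt.Println(string(buf))      // prints "ababab"; a backward copy would give "ababcd"
}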

LGTM=rsc
R=golang-codereviews, rsc
CC=golang-codereviews, r
https://golang.org/cl/81370046
parent cfb347fc
@@ -1345,6 +1345,7 @@ sgen(Node *n, Node *ns, int64 w)
Node nodl, nodr, nodsi, noddi, cx, oldcx, tmp;
vlong c, q, odst, osrc;
NodeList *l;
Prog *p;
if(debug['g']) {
print("\nsgen w=%lld\n", w);
@@ -1447,10 +1448,16 @@ sgen(Node *n, Node *ns, int64 w)
gins(ACLD, N, N);
} else {
// normal direction
if(q >= 4) {
if(q > 128) {
gconreg(movptr, q, D_CX);
gins(AREP, N, N); // repeat
gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+
} else if (q >= 4) {
p = gins(ADUFFCOPY, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 14 and 128 = magic constants: see ../../pkg/runtime/asm_amd64.s
p->to.offset = 14*(128-q);
} else
while(q > 0) {
gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+
......
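
The 14 above is the encoded size of one unrolled block in the new
runtime·duffcopy (MOVQ (SI),CX; ADDQ $8,SI; MOVQ CX,(DI); ADDQ $8,DI encodes
to 14 bytes on amd64), and the routine holds 128 such blocks, so calling
14*(128-q) bytes past its start leaves exactly q quadword copies before the
RET.  A sketch of the offset arithmetic (hypothetical helper name):

package main

import "fmt"

// duffcopyOffsetAMD64 returns the byte offset into runtime·duffcopy that
// copies q quadwords (4 <= q <= 128): each unrolled block is 14 bytes and
// there are 128 of them, so skip the first 128-q blocks.
func duffcopyOffsetAMD64(q int64) int64 {
	return 14 * (128 - q)
}

func main() {
	fmt.Println(duffcopyOffsetAMD64(4))   // 1736: 4 blocks (56 bytes) before the RET
	fmt.Println(duffcopyOffsetAMD64(128)) // 0: enter at the top, copy 1KB
}
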
@@ -1088,10 +1088,16 @@ clearfat(Node *nl)
savex(D_AX, &ax, &oldax, N, types[tptr]);
gconreg(AMOVL, 0, D_AX);
if(q >= 4) {
if(q > 128) {
gconreg(movptr, q, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSQ, N, N); // STOQ AL,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFZERO, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
// 2 and 128 = magic constants: see ../../pkg/runtime/asm_amd64.s
p->to.offset = 2*(128-q);
} else
while(q > 0) {
gins(ASTOSQ, N, N); // STOQ AL,*(DI)+
......
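
Same scheme for zeroing: STOSQ encodes to 2 bytes, so the 2*(128-q) offset
enters runtime·duffzero with exactly q STOSQ instructions left to run.
Sketch (hypothetical helper name):

package main

import "fmt"

// duffzeroOffsetAMD64 returns the byte offset into runtime·duffzero that
// zeroes q quadwords (4 <= q <= 128); each STOSQ is 2 bytes.
func duffzeroOffsetAMD64(q int64) int64 {
	return 2 * (128 - q)
}

func main() {
	fmt.Println(duffzeroOffsetAMD64(16)) // 224
}
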
@@ -170,6 +170,7 @@ static ProgInfo progtable[ALAST] = {
[AMOVSL]= {OK, DI|SI, DI|SI},
[AMOVSQ]= {OK, DI|SI, DI|SI},
[AMOVSW]= {OK, DI|SI, DI|SI},
[ADUFFCOPY]= {OK, DI|SI, DI|SI|CX},
[AMOVSD]= {SizeD | LeftRead | RightWrite | Move},
[AMOVSS]= {SizeF | LeftRead | RightWrite | Move},
@@ -257,6 +258,7 @@ static ProgInfo progtable[ALAST] = {
[ASTOSL]= {OK, AX|DI, DI},
[ASTOSQ]= {OK, AX|DI, DI},
[ASTOSW]= {OK, AX|DI, DI},
[ADUFFZERO]= {OK, AX|DI, DI},
[ASUBB]= {SizeB | LeftRead | RightRdwr | SetCarry},
[ASUBL]= {SizeL | LeftRead | RightRdwr | SetCarry},
......
@@ -764,6 +764,8 @@ enum as
ACHECKNIL,
AVARDEF,
AVARKILL,
ADUFFCOPY,
ADUFFZERO,
ALAST
};
......
@@ -1212,6 +1212,7 @@ sgen(Node *n, Node *res, int64 w)
Node dst, src, tdst, tsrc;
int32 c, q, odst, osrc;
NodeList *l;
Prog *p;
if(debug['g']) {
print("\nsgen w=%lld\n", w);
@@ -1314,10 +1315,16 @@ sgen(Node *n, Node *res, int64 w)
} else {
gins(ACLD, N, N); // paranoia. TODO(rsc): remove?
// normal direction
if(q >= 4) {
if(q > 128) {
gconreg(AMOVL, q, D_CX);
gins(AREP, N, N); // repeat
gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFCOPY, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 10 and 128 = magic constants: see ../../pkg/runtime/asm_386.s
p->to.offset = 10*(128-q);
} else
while(q > 0) {
gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+
......
@@ -130,6 +130,7 @@ clearfat(Node *nl)
{
uint32 w, c, q;
Node n1;
Prog *p;
/* clear a fat object */
if(debug['g'])
@@ -147,10 +148,16 @@ clearfat(Node *nl)
agen(nl, &n1);
gconreg(AMOVL, 0, D_AX);
if(q >= 4) {
if(q > 128) {
gconreg(AMOVL, q, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSL, N, N); // STOL AL,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFZERO, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
// 1 and 128 = magic constants: see ../../pkg/runtime/asm_386.s
p->to.offset = 1*(128-q);
} else
while(q > 0) {
gins(ASTOSL, N, N); // STOL AL,*(DI)+
......
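
The 386 hunks use the same trick with smaller encodings: the unrolled
MOVL/ADDL/MOVL/ADDL block in the 386 runtime·duffcopy is 10 bytes and STOSL is
a single byte, hence the 10*(128-q) and 1*(128-q) offsets above.  Sketch
(hypothetical helper names):

package main

import "fmt"

// Offsets into the 386 duff routines; q counts 4-byte words, 4 <= q <= 128.
func duffcopyOffset386(q int32) int32 { return 10 * (128 - q) }
func duffzeroOffset386(q int32) int32 { return 1 * (128 - q) }

func main() {
	fmt.Println(duffcopyOffset386(32), duffzeroOffset386(32)) // 960 96
}
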
@@ -195,6 +195,7 @@ static ProgInfo progtable[ALAST] = {
[AMOVSB]= {OK, DI|SI, DI|SI},
[AMOVSL]= {OK, DI|SI, DI|SI},
[AMOVSW]= {OK, DI|SI, DI|SI},
[ADUFFCOPY]= {OK, DI|SI, DI|SI|CX},
[AMOVSD]= {SizeD | LeftRead | RightWrite | Move},
[AMOVSS]= {SizeF | LeftRead | RightWrite | Move},
@@ -287,6 +288,7 @@ static ProgInfo progtable[ALAST] = {
[ASTOSB]= {OK, AX|DI, DI},
[ASTOSL]= {OK, AX|DI, DI},
[ASTOSW]= {OK, AX|DI, DI},
[ADUFFZERO]= {OK, AX|DI, DI},
[ASUBB]= {SizeB | LeftRead | RightRdwr | SetCarry},
[ASUBL]= {SizeL | LeftRead | RightRdwr | SetCarry},
......
@@ -581,6 +581,8 @@ enum as
ACHECKNIL,
AVARDEF,
AVARKILL,
ADUFFCOPY,
ADUFFZERO,
ALAST
};
......
@@ -507,6 +507,11 @@ static uchar ycall[] =
Ynone, Ybr, Zcall, 1,
0
};
static uchar yduff[] =
{
Ynone, Yi32, Zcall, 1,
0
};
static uchar yjmp[] =
{
Ynone, Yml, Zo_m64, 2,
@@ -1519,6 +1524,9 @@ Optab optab[] =
{ APCDATA, ypcdata, Px, 0,0 },
{ ACHECKNIL },
{ AVARDEF },
{ AVARKILL },
{ ADUFFCOPY, yduff, Px, 0xe8 },
{ ADUFFZERO, yduff, Px, 0xe8 },
{ AEND },
0
@@ -3030,6 +3038,7 @@ found:
r = addrel(ctxt->cursym);
r->off = p->pc + ctxt->andptr - ctxt->and;
r->sym = p->to.sym;
r->add = p->to.offset;
r->type = D_PCREL;
r->siz = 4;
put4(ctxt, 0);
......
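
The linker side: ADUFFCOPY/ADUFFZERO assemble like a CALL (opcode 0xe8 with a
4-byte PC-relative displacement), and the new r->add line carries the offset
into the duff routine through the relocation, so the call lands in the middle
of the function rather than at its entry.  A rough Go sketch of how such a
relocation resolves (names are illustrative, not liblink's):

package main

import "fmt"

// pcrelDisp computes the 32-bit displacement stored after the 0xe8 byte:
// target = symbol address + addend, measured from the end of the 4-byte field.
func pcrelDisp(symAddr, addend, fieldAddr int64) int32 {
	return int32(symAddr + addend - (fieldAddr + 4))
}

func main() {
	// e.g. duffcopy at 0x401000, addend 14*(128-16), displacement field at 0x400100
	fmt.Printf("%#x\n", uint32(pcrelDisp(0x401000, 14*(128-16), 0x400100)))
}
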
@@ -420,6 +420,11 @@ static uchar ycall[] =
Ynone, Yi32, Zcallcon, 1,
0
};
static uchar yduff[] =
{
Ynone, Yi32, Zcall, 1,
0
};
static uchar yjmp[] =
{
Ynone, Yml, Zo_m, 2,
@@ -1147,6 +1152,9 @@ static Optab optab[] =
{ APCDATA, ypcdata, Px, 0,0 },
{ ACHECKNIL },
{ AVARDEF },
{ AVARKILL },
{ ADUFFCOPY, yduff, Px, 0xe8 },
{ ADUFFZERO, yduff, Px, 0xe8 },
0
};
@@ -2377,6 +2385,7 @@ found:
r->type = D_PCREL;
r->siz = 4;
r->sym = p->to.sym;
r->add = p->to.offset;
put4(ctxt, 0);
break;
......
@@ -1348,3 +1348,797 @@ cmp_allsame:
SETEQ CX // 1 if alen == blen
LEAL -1(CX)(AX*2), AX // 1,0,-1 result
RET
// A Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// this routine to zero chunks of memory. Do not
// change this code without also changing the code
// in ../../cmd/8g/ggen.c:clearfat.
// AX: zero
// DI: ptr to memory to be zeroed
// DI is updated as a side effect.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
STOSL
RET
// A Duff's device for copying memory.
// The compiler jumps to computed addresses within
// this routine to copy chunks of memory. Source
// and destination must not overlap. Do not
// change this code without also changing the code
// in ../../cmd/6g/cgen.c:sgen.
// SI: ptr to source memory
// DI: ptr to destination memory
// SI and DI are updated as a side effect.
// NOTE: this is equivalent to a sequence of MOVSL but
// for some reason MOVSL is really slow.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
MOVL (SI),CX
ADDL $4,SI
MOVL CX,(DI)
ADDL $4,DI
RET
@@ -1380,3 +1380,798 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49
eqret:
MOVB AX, ret+48(FP)
RET
// A Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// this routine to zero chunks of memory. Do not
// change this code without also changing the code
// in ../../cmd/6g/ggen.c:clearfat.
// AX: zero
// DI: ptr to memory to be zeroed
// DI is updated as a side effect.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
RET
// A Duff's device for copying memory.
// The compiler jumps to computed addresses within
// this routine to copy chunks of memory. Source
// and destination must not overlap. Do not
// change this code without also changing the code
// in ../../cmd/6g/cgen.c:sgen.
// SI: ptr to source memory
// DI: ptr to destination memory
// SI and DI are updated as a side effect.
// NOTE: this is equivalent to a sequence of MOVSQ but
// for some reason that is 3.5x slower than this code.
// The STOSQ above seem fine, though.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
RET
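
The duffzero/duffcopy bodies above are purely mechanical (128 identical blocks
each), so they could be emitted by a throwaway generator rather than typed
out; this CL writes them by hand, but a sketch of such a generator looks like:

package main

import "fmt"

// Prints the amd64 duffzero and duffcopy bodies: 128 STOSQ instructions and
// 128 MOVQ/ADDQ/MOVQ/ADDQ blocks respectively, each followed by a RET.
func main() {
	fmt.Println("TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Println("\tSTOSQ")
	}
	fmt.Println("\tRET")
	fmt.Println()
	fmt.Println("TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 128; i++ {
		fmt.Println("\tMOVQ\t(SI),CX")
		fmt.Println("\tADDQ\t$8,SI")
		fmt.Println("\tMOVQ\tCX,(DI)")
		fmt.Println("\tADDQ\t$8,DI")
	}
	fmt.Println("\tRET")
}
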
@@ -161,3 +161,83 @@ func BenchmarkMemclr64(b *testing.B) { bmMemclr(b, 64) }
func BenchmarkMemclr256(b *testing.B) { bmMemclr(b, 256) }
func BenchmarkMemclr4096(b *testing.B) { bmMemclr(b, 4096) }
func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) }
func BenchmarkClearFat32(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [32]byte
_ = x
}
}
func BenchmarkClearFat64(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [64]byte
_ = x
}
}
func BenchmarkClearFat128(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [128]byte
_ = x
}
}
func BenchmarkClearFat256(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [256]byte
_ = x
}
}
func BenchmarkClearFat512(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [512]byte
_ = x
}
}
func BenchmarkClearFat1024(b *testing.B) {
for i := 0; i < b.N; i++ {
var x [1024]byte
_ = x
}
}
func BenchmarkCopyFat32(b *testing.B) {
var x [32]byte
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat64(b *testing.B) {
var x [64]byte
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat128(b *testing.B) {
var x [128]byte
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat256(b *testing.B) {
var x [256]byte
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat512(b *testing.B) {
var x [512]byte
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}
func BenchmarkCopyFat1024(b *testing.B) {
var x [1024]byte
for i := 0; i < b.N; i++ {
y := x
_ = y
}
}