Commit 747c8498 authored by Josh Bleecher Snyder

cmd/6g, cmd/8g: make 2/3 word sgen more efficient

When compiling the stdlib most of the calls
to sgen are for exactly 2 or 3 words:
85% for 6g and 70% for 8g.
Special case them for performance.
This optimization is not relevant to 5g and 9g.

6g

benchmark                old ns/op     new ns/op     delta
BenchmarkCopyFat16       3.25          0.82          -74.77%
BenchmarkCopyFat24       5.47          0.95          -82.63%

8g

benchmark               old ns/op     new ns/op     delta
BenchmarkCopyFat8       3.84          2.42          -36.98%
BenchmarkCopyFat12      4.94          2.15          -56.48%

Change-Id: I8bc60b453f12597dfd916df2d072a7d5fc33ab85
Reviewed-on: https://go-review.googlesource.com/2607
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
parent c1c3ce6b
......@@ -1457,6 +1457,18 @@ sgen(Node *n, Node *ns, int64 w)
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 14 and 128 = magic constants: see ../../runtime/asm_amd64.s
p->to.offset = 14*(128-q);
} else if(!nacl && c == 0) {
// We don't need the MOVSQ side-effect of updating SI and DI,
// and issuing a sequence of MOVQs directly is faster.
nodsi.op = OINDREG;
noddi.op = OINDREG;
while(q > 0) {
gmove(&nodsi, &cx); // MOVQ x+(SI),CX
gmove(&cx, &noddi); // MOVQ CX,x+(DI)
nodsi.xoffset += 8;
noddi.xoffset += 8;
q--;
}
} else
while(q > 0) {
gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+
......
......@@ -1213,7 +1213,7 @@ stkof(Node *n)
void
sgen(Node *n, Node *res, int64 w)
{
Node dst, src, tdst, tsrc;
Node dst, src, tdst, tsrc, cx;
int32 c, q, odst, osrc;
NodeList *l;
Prog *p;
......@@ -1329,6 +1329,19 @@ sgen(Node *n, Node *res, int64 w)
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 10 and 128 = magic constants: see ../../runtime/asm_386.s
p->to.offset = 10*(128-q);
} else if(!nacl && c == 0) {
nodreg(&cx, types[TINT32], REG_CX);
// We don't need the MOVSL side-effect of updating SI and DI,
// and issuing a sequence of MOVLs directly is faster.
src.op = OINDREG;
dst.op = OINDREG;
while(q > 0) {
gmove(&src, &cx); // MOVL x+(SI),CX
gmove(&cx, &dst); // MOVL CX,x+(DI)
src.xoffset += 4;
dst.xoffset += 4;
q--;
}
} else
while(q > 0) {
gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment