Commit 3c40ee0f authored by Russ Cox

cmd/gc: simplify compiled code for explicit zeroing

Among other things, *x = T{} does not need a write barrier.
The changes here avoid an unnecessary copy even when
no pointers are involved, so they may have larger effects.
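
As a concrete illustration of the first point, here is a small sketch of the
kind of statement this CL targets (the type T and the helper function are
invented for the example; this is not code from the CL):

	package main

	type T struct {
		p *int
		n int
	}

	func clear(x *T) {
		// Explicit zeroing: with this CL the compiler clears *x in
		// place, with no write barrier and without first building a
		// temporary zero T and copying it over.
		*x = T{}
	}

	func main() {
		v := T{n: 1}
		clear(&v)
		println(v.n) // 0
	}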

In 6g and 8g, avoid manually repeated STOSQ in favor of
writing explicit MOVs, under the theory that the MOVs
should have fewer dependencies and pipeline better.
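
A rough model of that lowering, in ordinary Go rather than compiler code
(movPlan and the example widths below are assumptions made for this sketch):
a small clear is split into 8-byte stores, at most one 4-byte store, and
byte stores for the tail.

	package main

	import "fmt"

	// movPlan mirrors the new small-clear decomposition on amd64:
	// q 8-byte MOVs, then one 4-byte MOV if at least 4 bytes remain,
	// then byte MOVs for the tail. Clears of 4 quads or more keep
	// using DUFFZERO or REP STOSQ.
	func movPlan(w int) (movq, movl, movb int) {
		q, c := w/8, w%8
		if q >= 4 {
			return 0, 0, 0 // not handled by the MOV path
		}
		movq = q
		if c >= 4 {
			movl = 1
			c -= 4
		}
		movb = c
		return
	}

	func main() {
		for _, w := range []int{1, 7, 12, 24, 27} {
			q, l, b := movPlan(w)
			fmt.Printf("w=%2d: %d MOVQ, %d MOVL, %d MOVB\n", w, q, l, b)
		}
	}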

Benchmarks compare best of 5 on a 2012 MacBook Pro Core i5
with TurboBoost disabled. Most improvements can be explained
by the changes in this CL.

The effect in Revcomp is real but harder to explain: none of
the instructions in the inner loop changed. I suspect loop
alignment but really have no idea.

benchmark                       old ns/op   new ns/op   delta
BenchmarkBinaryTree17           3809027371  3819907076  +0.29%
BenchmarkFannkuch11             3607547556  3686983012  +2.20%
BenchmarkFmtFprintfEmpty        118         103         -12.71%
BenchmarkFmtFprintfString       289         277         -4.15%
BenchmarkFmtFprintfInt          304         290         -4.61%
BenchmarkFmtFprintfIntInt       507         458         -9.66%
BenchmarkFmtFprintfPrefixedInt  425         408         -4.00%
BenchmarkFmtFprintfFloat        555         555         +0.00%
BenchmarkFmtManyArgs            1835        1733        -5.56%
BenchmarkGobDecode              14738209    14639331    -0.67%
BenchmarkGobEncode              14239039    13703571    -3.76%
BenchmarkGzip                   538211054   538701315   +0.09%
BenchmarkGunzip                 135430877   134818459   -0.45%
BenchmarkHTTPClientServer       116488      116618      +0.11%
BenchmarkJSONEncode             28923406    29294334    +1.28%
BenchmarkJSONDecode             105779820   104289543   -1.41%
BenchmarkMandelbrot200          5791758     5771964     -0.34%
BenchmarkGoParse                5376642     5310943     -1.22%
BenchmarkRegexpMatchEasy0_32    195         190         -2.56%
BenchmarkRegexpMatchEasy0_1K    477         455         -4.61%
BenchmarkRegexpMatchEasy1_32    170         165         -2.94%
BenchmarkRegexpMatchEasy1_1K    1410        1394        -1.13%
BenchmarkRegexpMatchMedium_32   336         329         -2.08%
BenchmarkRegexpMatchMedium_1K   108979      106328      -2.43%
BenchmarkRegexpMatchHard_32     5854        5821        -0.56%
BenchmarkRegexpMatchHard_1K     185089      182838      -1.22%
BenchmarkRevcomp                834920364   780202624   -6.55%
BenchmarkTemplate               137046937   129728756   -5.34%
BenchmarkTimeParse              600         594         -1.00%
BenchmarkTimeFormat             559         539         -3.58%

LGTM=r
R=r
CC=golang-codereviews, iant, khr, rlh
https://golang.org/cl/157910047
parent 2dcb6138
@@ -1102,26 +1102,54 @@ clearfat(Node *nl)
 	c = w % 8;	// bytes
 	q = w / 8;	// quads
 
+	if(q < 4) {
+		// Write sequence of MOV 0, off(base) instead of using STOSQ.
+		// The hope is that although the code will be slightly longer,
+		// the MOVs will have no dependencies and pipeline better
+		// than the unrolled STOSQ loop.
+		// NOTE: Must use agen, not igen, so that optimizer sees address
+		// being taken. We are not writing on field boundaries.
+		agenr(nl, &n1, N);
+		n1.op = OINDREG;
+		nodconst(&z, types[TUINT64], 0);
+		while(q-- > 0) {
+			n1.type = z.type;
+			gins(AMOVQ, &z, &n1);
+			n1.xoffset += 8;
+		}
+		if(c >= 4) {
+			nodconst(&z, types[TUINT32], 0);
+			n1.type = z.type;
+			gins(AMOVL, &z, &n1);
+			n1.xoffset += 4;
+			c -= 4;
+		}
+		nodconst(&z, types[TUINT8], 0);
+		while(c-- > 0) {
+			n1.type = z.type;
+			gins(AMOVB, &z, &n1);
+			n1.xoffset++;
+		}
+		regfree(&n1);
+		return;
+	}
+
 	savex(D_DI, &n1, &oldn1, N, types[tptr]);
 	agen(nl, &n1);
 
 	savex(D_AX, &ax, &oldax, N, types[tptr]);
 	gconreg(AMOVL, 0, D_AX);
 
-	if(q > 128 || (q >= 4 && nacl)) {
+	if(q > 128 || nacl) {
 		gconreg(movptr, q, D_CX);
 		gins(AREP, N, N);	// repeat
 		gins(ASTOSQ, N, N);	// STOQ AL,*(DI)+
-	} else if(q >= 4) {
+	} else {
 		p = gins(ADUFFZERO, N, N);
 		p->to.type = D_ADDR;
 		p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
 		// 2 and 128 = magic constants: see ../../runtime/asm_amd64.s
 		p->to.offset = 2*(128-q);
-	} else
-	while(q > 0) {
-		gins(ASTOSQ, N, N);	// STOQ AL,*(DI)+
-		q--;
 	}
 
 	z = ax;

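To summarize the strategy selection in the hunk above, here is an informal
model (illustrative Go, not compiler code; the function name is invented,
and the NaCl case, which still forces REP STOSQ once the MOV path does not
apply, is left to a comment):

	package main

	import "fmt"

	// clearStrategy models how the amd64 back end now picks a way to
	// zero w bytes: explicit MOVs for fewer than 4 quads, DUFFZERO up
	// to 128 quads, REP STOSQ beyond that (and always on NaCl once
	// the MOV path does not apply).
	func clearStrategy(w int) string {
		q := w / 8
		switch {
		case q < 4:
			return "explicit MOVs"
		case q <= 128:
			return "DUFFZERO"
		default:
			return "REP STOSQ"
		}
	}

	func main() {
		for _, w := range []int{24, 32, 1024, 2048} {
			fmt.Printf("%4d bytes: %s\n", w, clearStrategy(w))
		}
	}

The 8g hunks below apply the same idea with 4-byte MOVL stores.
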
@@ -157,7 +157,7 @@ void
 clearfat(Node *nl)
 {
 	uint32 w, c, q;
-	Node n1;
+	Node n1, z;
 	Prog *p;
 
 	/* clear a fat object */

@@ -172,6 +172,32 @@ clearfat(Node *nl)
 	c = w % 4;	// bytes
 	q = w / 4;	// quads
 
+	if(q < 4) {
+		// Write sequence of MOV 0, off(base) instead of using STOSL.
+		// The hope is that although the code will be slightly longer,
+		// the MOVs will have no dependencies and pipeline better
+		// than the unrolled STOSL loop.
+		// NOTE: Must use agen, not igen, so that optimizer sees address
+		// being taken. We are not writing on field boundaries.
+		regalloc(&n1, types[tptr], N);
+		agen(nl, &n1);
+		n1.op = OINDREG;
+		nodconst(&z, types[TUINT64], 0);
+		while(q-- > 0) {
+			n1.type = z.type;
+			gins(AMOVL, &z, &n1);
+			n1.xoffset += 4;
+		}
+		nodconst(&z, types[TUINT8], 0);
+		while(c-- > 0) {
+			n1.type = z.type;
+			gins(AMOVB, &z, &n1);
+			n1.xoffset++;
+		}
+		regfree(&n1);
+		return;
+	}
+
 	nodreg(&n1, types[tptr], D_DI);
 	agen(nl, &n1);
 	gconreg(AMOVL, 0, D_AX);

@@ -731,7 +731,7 @@ cgen_as(Node *nl, Node *nr)
 		return;
 	}
 
-	if(nr == N || isnil(nr)) {
+	if(nr == N || iszero(nr)) {
 		// externals and heaps should already be clear
 		if(nr == N) {
 			if(nl->class == PEXTERN)

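Roughly, the isnil -> iszero change above widens the "clear the destination"
case from nil assignments to assignment of any zero value. A hedged sketch
at the source level (the type is invented for the example):

	package main

	type T struct {
		buf [16]byte
		n   int
	}

	func main() {
		var v T
		var p *T

		p = nil     // nil RHS: already took the clearing path before
		v = T{}     // zero composite literal: now also compiled as a clear
		v = T{n: 1} // non-zero literal: still an ordinary assignment
		_, _ = v, p
	}
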
@@ -1374,6 +1374,7 @@ int isnilinter(Type *t);
 int	isptrto(Type *t, int et);
 int	isslice(Type *t);
 int	istype(Type *t, int et);
+int	iszero(Node *n);
 void	linehist(char *file, int32 off, int relative);
 NodeList*	list(NodeList *l, Node *n);
 NodeList*	list1(Node *n);

@@ -656,7 +656,7 @@ mpdivmodfixfix(Mpint *q, Mpint *r, Mpint *n, Mpint *d)
 }
 
 static int
-iszero(Mpint *a)
+mpiszero(Mpint *a)
 {
 	long *a1;
 	int i;

@@ -687,7 +687,7 @@ mpdivfract(Mpint *a, Mpint *b)
 		for(j=0; j<Mpscale; j++) {
 			x <<= 1;
 			if(mpcmp(&d, &n) <= 0) {
-				if(!iszero(&d))
+				if(!mpiszero(&d))
 					x |= 1;
 				mpsubfixfix(&n, &d);
 			}

@@ -17,7 +17,6 @@ enum
 	InitPending = 2,
 };
 
-static int iszero(Node*);
 static void initplan(Node*);
 static NodeList *initlist;
 static void init2(Node*, NodeList**);

@@ -1356,7 +1355,6 @@ no:
 	return 0;
 }
 
-static int iszero(Node*);
 static int isvaluelit(Node*);
 static InitEntry* entry(InitPlan*);
 static void addvalue(InitPlan*, vlong, Node*, Node*);

@@ -1440,7 +1438,7 @@ addvalue(InitPlan *p, vlong xoffset, Node *key, Node *n)
 	e->expr = n;
 }
 
-static int
+int
 iszero(Node *n)
 {
 	NodeList *l;

@@ -1390,7 +1390,12 @@ walkexpr(Node **np, NodeList **init)
 	case OMAPLIT:
 	case OSTRUCTLIT:
 	case OPTRLIT:
-		// XXX TODO do we need to clear var?
+		// NOTE(rsc): Race detector cannot handle seeing
+		// a STRUCTLIT or ARRAYLIT representing a zero value,
+		// so make a temporary for those always in race mode.
+		// Otherwise, leave zero values in place.
+		if(iszero(n) && !flag_race)
+			goto ret;
 		var = temp(n->type);
 		anylit(0, n, var, init);
 		n = var;

@@ -2009,8 +2014,8 @@ needwritebarrier(Node *l, Node *r)
 	if(isstack(l))
 		return 0;
 
-	// No write barrier for zeroing.
-	if(r == N)
+	// No write barrier for implicit or explicit zeroing.
+	if(r == N || iszero(r))
 		return 0;
 
 	// No write barrier for initialization to constant.

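In source-level terms, the needwritebarrier change distinguishes cases like
the following (a sketch; the type and helpers are invented, and stack-directed
stores are a separate case handled by the isstack check above):

	package main

	type node struct {
		next *node
		val  int
	}

	func zero(p *node) {
		*p = node{} // explicit zeroing: no write barrier needed after this CL
	}

	func link(p, q *node) {
		p.next = q // storing a live pointer: still goes through the write barrier
	}

	func main() {
		a, b := new(node), new(node)
		zero(a)
		link(a, b)
		_ = a.next
	}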