Commit 1afbceb5 authored by Russ Cox's avatar Russ Cox

cmd/6g: treat vardef-initialized fat variables as live at calls

This CL forces the optimizer to preserve some memory stores
that would be redundant except that a stack scan due to garbage
collection or stack copying might look at them during a function call.
As such, it forces additional memory writes and therefore slows
down the execution of some programs, especially garbage-heavy
programs that are already limited by memory bandwidth.

The slowdown can be as much as 7% for end-to-end benchmarks.

These numbers are from running go1.test -test.benchtime=5s three times,
taking the best (lowest) ns/op for each benchmark. I am excluding
benchmarks with time/op < 10us to focus on macro effects.
All benchmarks are on amd64.

Comparing tip (a27f34c771cb) against this CL on an Intel Core i5 MacBook Pro:

benchmark                          old ns/op      new ns/op      delta
BenchmarkBinaryTree17              3876500413     3856337341     -0.52%
BenchmarkFannkuch11                2965104777     2991182127     +0.88%
BenchmarkGobDecode                 8563026        8788340        +2.63%
BenchmarkGobEncode                 5050608        5267394        +4.29%
BenchmarkGzip                      431191816      434168065      +0.69%
BenchmarkGunzip                    107873523      110563792      +2.49%
BenchmarkHTTPClientServer          85036          86131          +1.29%
BenchmarkJSONEncode                22143764       22501647       +1.62%
BenchmarkJSONDecode                79646916       85658808       +7.55%
BenchmarkMandelbrot200             4720421        4700108        -0.43%
BenchmarkGoParse                   4651575        4712247        +1.30%
BenchmarkRegexpMatchMedium_1K      71986          73490          +2.09%
BenchmarkRegexpMatchHard_1K        111018         117495         +5.83%
BenchmarkRevcomp                   648798723      659352759      +1.63%
BenchmarkTemplate                  112673009      112819078      +0.13%

Comparing tip (a27f34c771cb) against this CL on an Intel Xeon E5520:

BenchmarkBinaryTree17              5461110720     5393104469     -1.25%
BenchmarkFannkuch11                4314677151     4327177615     +0.29%
BenchmarkGobDecode                 11065853       11235272       +1.53%
BenchmarkGobEncode                 65000655        6959837        +7.07%
BenchmarkGzip                      647478596      671769097      +3.75%
BenchmarkGunzip                    139348579      141096376      +1.25%
BenchmarkHTTPClientServer          69376          73610          +6.10%
BenchmarkJSONEncode                30172320       31796106       +5.38%
BenchmarkJSONDecode                113704905      114239137      +0.47%
BenchmarkMandelbrot200             6032730        6003077        -0.49%
BenchmarkGoParse                   6775251        6405995        -5.45%
BenchmarkRegexpMatchMedium_1K      111832         113895         +1.84%
BenchmarkRegexpMatchHard_1K        161112         168420         +4.54%
BenchmarkRevcomp                   876363406      892319935      +1.82%
BenchmarkTemplate                  146273096      148998339      +1.86%

Just to get a sense of where we are compared to the previous release,
here are the same benchmarks comparing Go 1.2 to this CL.

Comparing Go 1.2 against this CL on an Intel Core i5 MacBook Pro:

BenchmarkBinaryTree17              4370077662     3856337341     -11.76%
BenchmarkFannkuch11                3347052657     2991182127     -10.63%
BenchmarkGobDecode                 8791384        8788340        -0.03%
BenchmarkGobEncode                 4968759        5267394        +6.01%
BenchmarkGzip                      437815669      434168065      -0.83%
BenchmarkGunzip                    94604099       110563792      +16.87%
BenchmarkHTTPClientServer          87798          86131          -1.90%
BenchmarkJSONEncode                22818243       22501647       -1.39%
BenchmarkJSONDecode                97182444       85658808       -11.86%
BenchmarkMandelbrot200             4733516        4700108        -0.71%
BenchmarkGoParse                   5054384        4712247        -6.77%
BenchmarkRegexpMatchMedium_1K      67612          73490          +8.69%
BenchmarkRegexpMatchHard_1K        107321         117495         +9.48%
BenchmarkRevcomp                   733270055      659352759      -10.08%
BenchmarkTemplate                  109304977      112819078      +3.21%

Comparing Go 1.2 against this CL on an Intel Xeon E5520:

BenchmarkBinaryTree17              5986953594     5393104469     -9.92%
BenchmarkFannkuch11                4861139174     4327177615     -10.98%
BenchmarkGobDecode                 11830997       11235272       -5.04%
BenchmarkGobEncode                 6608722        6959837        +5.31%
BenchmarkGzip                      661875826      671769097      +1.49%
BenchmarkGunzip                    138630019      141096376      +1.78%
BenchmarkHTTPClientServer          71534          73610          +2.90%
BenchmarkJSONEncode                30393609       31796106       +4.61%
BenchmarkJSONDecode                139645860      114239137      -18.19%
BenchmarkMandelbrot200             5988660        6003077        +0.24%
BenchmarkGoParse                   6974092        6405995        -8.15%
BenchmarkRegexpMatchMedium_1K      111331         113895         +2.30%
BenchmarkRegexpMatchHard_1K        165961         168420         +1.48%
BenchmarkRevcomp                   995049292      892319935      -10.32%
BenchmarkTemplate                  145623363      148998339      +2.32%

Fixes #8036.

LGTM=khr
R=golang-codereviews, josharian, khr
CC=golang-codereviews, iant, r
https://golang.org/cl/99660044
parent aad4609c
......@@ -129,13 +129,15 @@ static char* regname[] = {
static Node* regnodes[NREGVAR];
static void walkvardef(Node *n, Reg *r, int active);
void
regopt(Prog *firstp)
{
Reg *r, *r1;
Prog *p;
Graph *g;
int i, z;
int i, z, active;
uint32 vreg;
Bits bit;
ProgInfo info;
......@@ -249,6 +251,26 @@ regopt(Prog *firstp)
if(debug['R'] && debug['v'])
dumpit("pass2", &firstr->f, 1);
/*
* pass 2.5
* iterate propagating fat vardef covering forward
* r->act records vars with a VARDEF since the last CALL.
* (r->act will be reused in pass 5 for something else,
* but we'll be done with it by then.)
*/
active = 0;
for(r = firstr; r != R; r = (Reg*)r->f.link) {
r->f.active = 0;
r->act = zbits;
}
for(r = firstr; r != R; r = (Reg*)r->f.link) {
p = r->f.prog;
if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
active++;
walkvardef(p->to.node, r, active);
}
}
/*
* pass 3
* iterate propagating usage
......@@ -524,6 +546,32 @@ brk:
}
}
static void
walkvardef(Node *n, Reg *r, int active)
{
Reg *r1, *r2;
int bn;
Var *v;
for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
if(r1->f.active == active)
break;
r1->f.active = active;
if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
break;
for(v=n->opt; v!=nil; v=v->nextinnode) {
bn = v - var;
r1->act.b[bn/32] |= 1L << (bn%32);
}
if(r1->f.prog->as == ABL)
break;
}
for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
if(r2->f.s2 != nil)
walkvardef(n, (Reg*)r2->f.s2, active);
}
void
addsplits(void)
{
......@@ -891,8 +939,13 @@ prop(Reg *r, Bits ref, Bits cal)
// Mark all input variables (ivar) as used, because that's what the
// liveness bitmaps say. The liveness bitmaps say that so that a
// panic will not show stale values in the parameter dump.
// Mark variables with a recent VARDEF (r1->act) as used,
// so that the optimizer flushes initializations to memory,
// so that if a garbage collection happens during this CALL,
// the collector will see initialized memory. Again this is to
// match what the liveness bitmaps say.
for(z=0; z<BITS; z++) {
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
ref.b[z] = 0;
}
......
......@@ -114,6 +114,8 @@ static char* regname[] = {
static Node* regnodes[NREGVAR];
static void walkvardef(Node *n, Reg *r, int active);
void
regopt(Prog *firstp)
{
......@@ -121,7 +123,7 @@ regopt(Prog *firstp)
Prog *p;
Graph *g;
ProgInfo info;
int i, z;
int i, z, active;
uint32 vreg;
Bits bit;
......@@ -234,6 +236,26 @@ regopt(Prog *firstp)
if(debug['R'] && debug['v'])
dumpit("pass2", &firstr->f, 1);
/*
* pass 2.5
* iterate propagating fat vardef covering forward
* r->act records vars with a VARDEF since the last CALL.
* (r->act will be reused in pass 5 for something else,
* but we'll be done with it by then.)
*/
active = 0;
for(r = firstr; r != R; r = (Reg*)r->f.link) {
r->f.active = 0;
r->act = zbits;
}
for(r = firstr; r != R; r = (Reg*)r->f.link) {
p = r->f.prog;
if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
active++;
walkvardef(p->to.node, r, active);
}
}
/*
* pass 3
* iterate propagating usage
......@@ -435,6 +457,32 @@ brk:
}
}
static void
walkvardef(Node *n, Reg *r, int active)
{
Reg *r1, *r2;
int bn;
Var *v;
for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
if(r1->f.active == active)
break;
r1->f.active = active;
if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
break;
for(v=n->opt; v!=nil; v=v->nextinnode) {
bn = v - var;
r1->act.b[bn/32] |= 1L << (bn%32);
}
if(r1->f.prog->as == ACALL)
break;
}
for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
if(r2->f.s2 != nil)
walkvardef(n, (Reg*)r2->f.s2, active);
}
/*
* add mov b,rn
* just after r
......@@ -745,8 +793,13 @@ prop(Reg *r, Bits ref, Bits cal)
// Mark all input variables (ivar) as used, because that's what the
// liveness bitmaps say. The liveness bitmaps say that so that a
// panic will not show stale values in the parameter dump.
// Mark variables with a recent VARDEF (r1->act) as used,
// so that the optimizer flushes initializations to memory,
// so that if a garbage collection happens during this CALL,
// the collector will see initialized memory. Again this is to
// match what the liveness bitmaps say.
for(z=0; z<BITS; z++) {
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
ref.b[z] = 0;
}
......
......@@ -84,6 +84,8 @@ static char* regname[] = {
static Node* regnodes[NREGVAR];
static void walkvardef(Node *n, Reg *r, int active);
void
regopt(Prog *firstp)
{
......@@ -91,7 +93,7 @@ regopt(Prog *firstp)
Prog *p;
Graph *g;
ProgInfo info;
int i, z;
int i, z, active;
uint32 vreg;
Bits bit;
......@@ -206,6 +208,26 @@ regopt(Prog *firstp)
if(debug['R'] && debug['v'])
dumpit("pass2", &firstr->f, 1);
/*
* pass 2.5
* iterate propagating fat vardef covering forward
* r->act records vars with a VARDEF since the last CALL.
* (r->act will be reused in pass 5 for something else,
* but we'll be done with it by then.)
*/
active = 0;
for(r = firstr; r != R; r = (Reg*)r->f.link) {
r->f.active = 0;
r->act = zbits;
}
for(r = firstr; r != R; r = (Reg*)r->f.link) {
p = r->f.prog;
if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
active++;
walkvardef(p->to.node, r, active);
}
}
/*
* pass 3
* iterate propagating usage
......@@ -404,6 +426,32 @@ brk:
}
}
static void
walkvardef(Node *n, Reg *r, int active)
{
Reg *r1, *r2;
int bn;
Var *v;
for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
if(r1->f.active == active)
break;
r1->f.active = active;
if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
break;
for(v=n->opt; v!=nil; v=v->nextinnode) {
bn = v - var;
r1->act.b[bn/32] |= 1L << (bn%32);
}
if(r1->f.prog->as == ACALL)
break;
}
for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
if(r2->f.s2 != nil)
walkvardef(n, (Reg*)r2->f.s2, active);
}
/*
* add mov b,rn
* just after r
......@@ -711,8 +759,13 @@ prop(Reg *r, Bits ref, Bits cal)
// Mark all input variables (ivar) as used, because that's what the
// liveness bitmaps say. The liveness bitmaps say that so that a
// panic will not show stale values in the parameter dump.
// Mark variables with a recent VARDEF (r1->act) as used,
// so that the optimizer flushes initializations to memory,
// so that if a garbage collection happens during this CALL,
// the collector will see initialized memory. Again this is to
// match what the liveness bitmaps say.
for(z=0; z<BITS; z++) {
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
ref.b[z] = 0;
}
......
// run
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Issue 8036. Stores necessary for stack scan being eliminated as redundant by optimizer.
package main
import "runtime"
type T struct {
X *int
Y *int
Z *int
}
type TI [3]uintptr
func G() (t TI) {
t[0] = 1
t[1] = 2
t[2] = 3
runtime.GC() // prevent inlining
return
}
func F() (t T) {
t.X = newint()
t.Y = t.X
t.Z = t.Y
runtime.GC() // prevent inlining
return
}
func newint() *int {
runtime.GC()
return nil
}
func main() {
G() // leave non-pointers where F's return values go
F()
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment