Commit c9152a85 authored by Dmitriy Vyukov's avatar Dmitriy Vyukov Committed by Russ Cox

runtime: eliminate contention during stack allocation

Standard-sized stack frames use plain malloc/free
instead of centralized lock-protected FixAlloc.
Benchmark results on HP Z600 (2 x Xeon E5620, 8 HT cores, 2.40GHz)
are as follows:
benchmark                                        old ns/op    new ns/op    delta
BenchmarkStackGrowth                               1045.00       949.00   -9.19%
BenchmarkStackGrowth-2                             3450.00       800.00  -76.81%
BenchmarkStackGrowth-4                             5076.00       513.00  -89.89%
BenchmarkStackGrowth-8                             7805.00       471.00  -93.97%
BenchmarkStackGrowth-16                           11751.00       321.00  -97.27%

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/4657091
parent daaf29cf
......@@ -358,26 +358,11 @@ func new(n uint32) (ret *uint8) {
ret = runtime·mal(n);
}
// Stack allocator uses malloc/free most of the time,
// but if we're in the middle of malloc and need stack,
// we have to do something else to avoid deadlock.
// In that case, we fall back on a fixed-size free-list
// allocator, assuming that inside malloc all the stack
// frames are small, so that all the stack allocations
// will be a single size, the minimum (right now, 5k).
static struct {
Lock;
FixAlloc;
} stacks;
enum {
FixedStack = StackMin,
};
void*
runtime·stackalloc(uint32 n)
{
void *v;
uintptr sys0;
// Stackalloc must be called on scheduler stack, so that we
// never try to grow the stack during the code that stackalloc runs.
......@@ -385,18 +370,22 @@ runtime·stackalloc(uint32 n)
if(g != m->g0)
runtime·throw("stackalloc not on scheduler stack");
// Stack allocator uses malloc/free most of the time,
// but if we're in the middle of malloc and need stack,
// we have to do something else to avoid deadlock.
// In that case, we fall back on a fixed-size free-list
// allocator, assuming that inside malloc all the stack
// frames are small, so that all the stack allocations
// will be a single size, the minimum (right now, 5k).
if(m->mallocing || m->gcing || n == FixedStack) {
runtime·lock(&stacks);
if(stacks.size == 0)
runtime·FixAlloc_Init(&stacks, n, runtime·SysAlloc, nil, nil);
if(stacks.size != n) {
runtime·printf("stackalloc: in malloc, size=%D want %d", (uint64)stacks.size, n);
if(n != FixedStack) {
runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
runtime·throw("stackalloc");
}
v = runtime·FixAlloc_Alloc(&stacks);
mstats.stacks_inuse = stacks.inuse;
mstats.stacks_sys = stacks.sys;
runtime·unlock(&stacks);
sys0 = m->stackalloc->sys;
v = runtime·FixAlloc_Alloc(m->stackalloc);
mstats.stacks_inuse += FixedStack;
mstats.stacks_sys += m->stackalloc->sys - sys0;
return v;
}
return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
......@@ -405,12 +394,13 @@ runtime·stackalloc(uint32 n)
void
runtime·stackfree(void *v, uintptr n)
{
uintptr sys0;
if(m->mallocing || m->gcing || n == FixedStack) {
runtime·lock(&stacks);
runtime·FixAlloc_Free(&stacks, v);
mstats.stacks_inuse = stacks.inuse;
mstats.stacks_sys = stacks.sys;
runtime·unlock(&stacks);
sys0 = m->stackalloc->sys;
runtime·FixAlloc_Free(m->stackalloc, v);
mstats.stacks_inuse -= FixedStack;
mstats.stacks_sys += m->stackalloc->sys - sys0;
return;
}
runtime·free(v);
......
......@@ -80,7 +80,6 @@
// This C code was written with an eye toward translating to Go
// in the future. Methods have the form Type_Method(Type *t, ...).
typedef struct FixAlloc FixAlloc;
typedef struct MCentral MCentral;
typedef struct MHeap MHeap;
typedef struct MSpan MSpan;
......
......@@ -97,6 +97,7 @@ static G* gfget(void);
static void matchmg(void); // match ms to gs
static void readylocked(G*); // ready, but sched is locked
static void mnextg(M*, G*);
static void mcommoninit(M*);
// The bootstrap sequence is:
//
......@@ -116,11 +117,10 @@ runtime·schedinit(void)
int32 n;
byte *p;
runtime·allm = m;
m->nomemprof++;
m->fastrand = 0x49f6428aUL + m->id;
runtime·mallocinit();
mcommoninit(m);
runtime·goargs();
runtime·goenvs();
......@@ -134,7 +134,6 @@ runtime·schedinit(void)
if(p != nil && (n = runtime·atoi(p)) != 0)
runtime·gomaxprocs = n;
runtime·sched.mcpumax = runtime·gomaxprocs;
runtime·sched.mcount = 1;
runtime·sched.predawn = 1;
m->nomemprof--;
......@@ -208,6 +207,17 @@ runtime·idlegoroutine(void)
g->idlem = m;
}
static void
mcommoninit(M *m)
{
m->alllink = runtime·allm;
runtime·allm = m;
m->id = runtime·sched.mcount++;
m->fastrand = 0x49f6428aUL + m->id;
m->stackalloc = runtime·malloc(sizeof(*m->stackalloc));
runtime·FixAlloc_Init(m->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
}
// Put on `g' queue. Sched must be locked.
static void
gput(G *g)
......@@ -494,10 +504,7 @@ matchmg(void)
m = runtime·malloc(sizeof(M));
// Add to runtime·allm so garbage collector doesn't free m
// when it is just in a register or thread-local storage.
m->alllink = runtime·allm;
runtime·allm = m;
m->id = runtime·sched.mcount++;
m->fastrand = 0x49f6428aUL + m->id;
mcommoninit(m);
if(runtime·iscgo) {
CgoThreadStart ts;
......
......@@ -6,6 +6,7 @@ package runtime_test
import (
"runtime"
"sync/atomic"
"testing"
)
......@@ -44,3 +45,31 @@ func TestStopTheWorldDeadlock(t *testing.T) {
stop <- true
runtime.GOMAXPROCS(maxprocs)
}
func stackGrowthRecursive(i int) {
var pad [128]uint64
if i != 0 && pad[0] == 0 {
stackGrowthRecursive(i - 1)
}
}
func BenchmarkStackGrowth(b *testing.B) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
N := int32(b.N / CallsPerSched)
c := make(chan bool, procs)
for p := 0; p < procs; p++ {
go func() {
for atomic.AddInt32(&N, -1) >= 0 {
runtime.Gosched()
for g := 0; g < CallsPerSched; g++ {
stackGrowthRecursive(10)
}
}
c <- true
}()
}
for p := 0; p < procs; p++ {
<-c
}
}
......@@ -57,6 +57,7 @@ typedef struct String String;
typedef struct Usema Usema;
typedef struct SigTab SigTab;
typedef struct MCache MCache;
typedef struct FixAlloc FixAlloc;
typedef struct Iface Iface;
typedef struct Itab Itab;
typedef struct Eface Eface;
......@@ -236,6 +237,7 @@ struct M
M* schedlink;
uint32 machport; // Return address for Mach IPC (OS X)
MCache *mcache;
FixAlloc *stackalloc;
G* lockedg;
G* idleg;
uint32 freglo[16]; // D[i] lsb and F[i]
......
......@@ -71,6 +71,7 @@ enum {
// If the amount needed for the splitting frame + StackExtra
// is less than this number, the stack will have this size instead.
StackMin = 4096,
FixedStack = StackMin,
// Functions that need frames bigger than this call morestack
// unconditionally. That is, on entry to a function it is assumed
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment