Commit c9152a85 authored by Dmitriy Vyukov, committed by Russ Cox

runtime: eliminate contention during stack allocation

Standard-sized stack segments are allocated from a per-M FixAlloc
cache instead of the centralized lock-protected FixAlloc;
oversized segments continue to use plain malloc/free.
Benchmark results on HP Z600 (2 x Xeon E5620, 8 HT cores, 2.40GHz)
are as follows:
benchmark                                        old ns/op    new ns/op    delta
BenchmarkStackGrowth                               1045.00       949.00   -9.19%
BenchmarkStackGrowth-2                             3450.00       800.00  -76.81%
BenchmarkStackGrowth-4                             5076.00       513.00  -89.89%
BenchmarkStackGrowth-8                             7805.00       471.00  -93.97%
BenchmarkStackGrowth-16                           11751.00       321.00  -97.27%

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/4657091
parent daaf29cf
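
The win in the table is classic lock contention: every standard-sized stack segment previously came from a single FixAlloc guarded by a single lock, so throughput fell as cores were added, while a per-M allocator keeps the hot path on thread-local state. The shape of the change as a standalone Go sketch (all names hypothetical; the runtime code itself is C, in the diff below):

package main

import (
	"fmt"
	"sync"
	"time"
)

const blockSize = 4096 // stand-in for FixedStack

// lockedPool models the old centralized FixAlloc: every allocation
// and free takes the same global lock.
type lockedPool struct {
	mu   sync.Mutex
	free [][]byte
}

func (p *lockedPool) alloc() []byte {
	p.mu.Lock()
	defer p.mu.Unlock()
	if n := len(p.free); n > 0 {
		b := p.free[n-1]
		p.free = p.free[:n-1]
		return b
	}
	return make([]byte, blockSize)
}

func (p *lockedPool) release(b []byte) {
	p.mu.Lock()
	p.free = append(p.free, b)
	p.mu.Unlock()
}

// localPool models the new per-M allocator: no lock at all, because
// each worker owns its pool exclusively.
type localPool struct{ free [][]byte }

func (p *localPool) alloc() []byte {
	if n := len(p.free); n > 0 {
		b := p.free[n-1]
		p.free = p.free[:n-1]
		return b
	}
	return make([]byte, blockSize)
}

func (p *localPool) release(b []byte) { p.free = append(p.free, b) }

func main() {
	const workers, iters = 8, 100000
	var wg sync.WaitGroup

	start := time.Now()
	shared := &lockedPool{}
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := 0; i < iters; i++ {
				shared.release(shared.alloc())
			}
		}()
	}
	wg.Wait()
	fmt.Println("centralized:", time.Since(start))

	start = time.Now()
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			local := &localPool{} // one pool per worker, like m->stackalloc
			for i := 0; i < iters; i++ {
				local.release(local.alloc())
			}
		}()
	}
	wg.Wait()
	fmt.Println("per-worker: ", time.Since(start))
}

On a multicore machine the per-worker variant scales with the worker count while the centralized one flattens or regresses, which is the same curve the BenchmarkStackGrowth rows above trace.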
src/pkg/runtime/malloc.goc

@@ -358,26 +358,11 @@ func new(n uint32) (ret *uint8) {
 	ret = runtime·mal(n);
 }
 
-// Stack allocator uses malloc/free most of the time,
-// but if we're in the middle of malloc and need stack,
-// we have to do something else to avoid deadlock.
-// In that case, we fall back on a fixed-size free-list
-// allocator, assuming that inside malloc all the stack
-// frames are small, so that all the stack allocations
-// will be a single size, the minimum (right now, 5k).
-static struct {
-	Lock;
-	FixAlloc;
-} stacks;
-
-enum {
-	FixedStack = StackMin,
-};
-
 void*
 runtime·stackalloc(uint32 n)
 {
 	void *v;
+	uintptr sys0;
 
 	// Stackalloc must be called on scheduler stack, so that we
 	// never try to grow the stack during the code that stackalloc runs.
@@ -385,18 +370,22 @@ runtime·stackalloc(uint32 n)
 	if(g != m->g0)
 		runtime·throw("stackalloc not on scheduler stack");
 
+	// Stack allocator uses malloc/free most of the time,
+	// but if we're in the middle of malloc and need stack,
+	// we have to do something else to avoid deadlock.
+	// In that case, we fall back on a fixed-size free-list
+	// allocator, assuming that inside malloc all the stack
+	// frames are small, so that all the stack allocations
+	// will be a single size, the minimum (right now, 5k).
	if(m->mallocing || m->gcing || n == FixedStack) {
-		runtime·lock(&stacks);
-		if(stacks.size == 0)
-			runtime·FixAlloc_Init(&stacks, n, runtime·SysAlloc, nil, nil);
-		if(stacks.size != n) {
-			runtime·printf("stackalloc: in malloc, size=%D want %d", (uint64)stacks.size, n);
+		if(n != FixedStack) {
+			runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
 			runtime·throw("stackalloc");
 		}
-		v = runtime·FixAlloc_Alloc(&stacks);
-		mstats.stacks_inuse = stacks.inuse;
-		mstats.stacks_sys = stacks.sys;
-		runtime·unlock(&stacks);
+		sys0 = m->stackalloc->sys;
+		v = runtime·FixAlloc_Alloc(m->stackalloc);
+		mstats.stacks_inuse += FixedStack;
+		mstats.stacks_sys += m->stackalloc->sys - sys0;
 		return v;
 	}
 	return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
@@ -405,12 +394,13 @@ runtime·stackalloc(uint32 n)
 void
 runtime·stackfree(void *v, uintptr n)
 {
+	uintptr sys0;
+
 	if(m->mallocing || m->gcing || n == FixedStack) {
-		runtime·lock(&stacks);
-		runtime·FixAlloc_Free(&stacks, v);
-		mstats.stacks_inuse = stacks.inuse;
-		mstats.stacks_sys = stacks.sys;
-		runtime·unlock(&stacks);
+		sys0 = m->stackalloc->sys;
+		runtime·FixAlloc_Free(m->stackalloc, v);
+		mstats.stacks_inuse -= FixedStack;
+		mstats.stacks_sys += m->stackalloc->sys - sys0;
 		return;
 	}
 	runtime·free(v);
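
FixAlloc is the runtime's fixed-size free-list allocator: Alloc pops a cached block, refills from SysAlloc when the list is empty, and advances a cumulative sys byte counter. That counter is why the new code samples sys0 = m->stackalloc->sys around the call and adds the delta to mstats.stacks_sys; and because there is now one FixAlloc per M instead of one global instance, the stats must be maintained as += and -= deltas rather than copied wholesale from the allocator's own fields. A simplified, single-threaded Go analog of that contract (illustrative names only, not the runtime's C definitions):

package main

import "fmt"

const fixedStack = 4096 // the single block size this allocator serves

// fixAlloc mimics FixAlloc's shape: a free list of fixed-size blocks,
// refilled in bulk, with a sys counter recording bytes obtained from
// the system.
type fixAlloc struct {
	free [][]byte
	sys  uintptr // cumulative bytes "from the OS"
}

func (f *fixAlloc) alloc() []byte {
	if n := len(f.free); n > 0 {
		b := f.free[n-1]
		f.free = f.free[:n-1]
		return b
	}
	// Free list empty: grab a chunk and carve it into fixed-size
	// blocks, the way FixAlloc refills from SysAlloc.
	const chunk = 16 * fixedStack
	f.sys += chunk
	for i := 0; i < chunk/fixedStack; i++ {
		f.free = append(f.free, make([]byte, fixedStack))
	}
	return f.alloc()
}

func (f *fixAlloc) release(b []byte) { f.free = append(f.free, b) }

func main() {
	var f fixAlloc
	sys0 := f.sys
	b := f.alloc()
	// Callers account growth exactly the way stackalloc now does:
	// stacks_sys += f.sys - sys0.
	fmt.Println("block:", len(b), "sys delta:", f.sys-sys0)
	f.release(b)
}

The deadlock the moved comment describes is also why the fallback must be a free list rather than malloc: stackalloc can run while m->mallocing is set, so it cannot recurse into the general allocator.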
src/pkg/runtime/malloc.h

@@ -80,7 +80,6 @@
 // This C code was written with an eye toward translating to Go
 // in the future.  Methods have the form Type_Method(Type *t, ...).
 
-typedef struct FixAlloc	FixAlloc;
 typedef struct MCentral	MCentral;
 typedef struct MHeap	MHeap;
 typedef struct MSpan	MSpan;
src/pkg/runtime/proc.c

@@ -97,6 +97,7 @@ static G* gfget(void);
 static void matchmg(void);	// match ms to gs
 static void readylocked(G*);	// ready, but sched is locked
 static void mnextg(M*, G*);
+static void mcommoninit(M*);
 
 // The bootstrap sequence is:
 //
@@ -116,11 +117,10 @@ runtime·schedinit(void)
 	int32 n;
 	byte *p;
 
-	runtime·allm = m;
 	m->nomemprof++;
-	m->fastrand = 0x49f6428aUL + m->id;
 	runtime·mallocinit();
+	mcommoninit(m);
 
 	runtime·goargs();
 	runtime·goenvs();
@@ -134,7 +134,6 @@ runtime·schedinit(void)
 	if(p != nil && (n = runtime·atoi(p)) != 0)
 		runtime·gomaxprocs = n;
 	runtime·sched.mcpumax = runtime·gomaxprocs;
-	runtime·sched.mcount = 1;
 	runtime·sched.predawn = 1;
 
 	m->nomemprof--;
@@ -208,6 +207,17 @@ runtime·idlegoroutine(void)
 	g->idlem = m;
 }
 
+static void
+mcommoninit(M *m)
+{
+	m->alllink = runtime·allm;
+	runtime·allm = m;
+	m->id = runtime·sched.mcount++;
+	m->fastrand = 0x49f6428aUL + m->id;
+	m->stackalloc = runtime·malloc(sizeof(*m->stackalloc));
+	runtime·FixAlloc_Init(m->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
+}
+
 // Put on `g' queue.  Sched must be locked.
 static void
 gput(G *g)
@@ -494,10 +504,7 @@ matchmg(void)
 			m = runtime·malloc(sizeof(M));
 			// Add to runtime·allm so garbage collector doesn't free m
 			// when it is just in a register or thread-local storage.
-			m->alllink = runtime·allm;
-			runtime·allm = m;
-			m->id = runtime·sched.mcount++;
-			m->fastrand = 0x49f6428aUL + m->id;
+			mcommoninit(m);
 
 			if(runtime·iscgo) {
 				CgoThreadStart ts;
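
mcommoninit gathers the M setup that schedinit and matchmg previously duplicated: link the M onto runtime·allm so the garbage collector keeps it alive, assign m->id from sched.mcount, seed fastrand, and create the per-M stackalloc. That consolidation is also why the explicit runtime·sched.mcount = 1 disappears from schedinit: the counter now advances inside the shared routine. The same pattern as a hypothetical Go sketch:

package main

import "fmt"

// All names here are illustrative. Every place that creates a worker
// must link it into a global list, number it, and set up its private
// allocator, so the shared steps live in one constructor instead of
// being repeated at each creation site.

type allocator struct{ free [][]byte }

type worker struct {
	id      int
	alllink *worker    // next in the global list, like M.alllink
	stack   *allocator // private pool, like M.stackalloc
}

var (
	allWorkers *worker // like runtime·allm
	nworkers   int     // like runtime·sched.mcount
)

// commonInit mirrors mcommoninit: one function, called for both the
// bootstrap worker and every later one, so the two paths cannot drift.
func commonInit(w *worker) {
	w.alllink = allWorkers
	allWorkers = w
	w.id = nworkers
	nworkers++
	w.stack = &allocator{}
}

func main() {
	boot := &worker{} // like the initial m in schedinit
	commonInit(boot)
	extra := &worker{} // like m = runtime·malloc(sizeof(M)) in matchmg
	commonInit(extra)
	fmt.Println("ids:", boot.id, extra.id, "count:", nworkers)
}

Keeping both creation paths on one constructor means a later per-M field, like stackalloc here, only has to be initialized in one place.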
src/pkg/runtime/proc_test.go

@@ -6,6 +6,7 @@ package runtime_test
 import (
 	"runtime"
+	"sync/atomic"
 	"testing"
 )
 
@@ -44,3 +45,31 @@ func TestStopTheWorldDeadlock(t *testing.T) {
 	stop <- true
 	runtime.GOMAXPROCS(maxprocs)
 }
+
+func stackGrowthRecursive(i int) {
+	var pad [128]uint64
+	if i != 0 && pad[0] == 0 {
+		stackGrowthRecursive(i - 1)
+	}
+}
+
+func BenchmarkStackGrowth(b *testing.B) {
+	const CallsPerSched = 1000
+	procs := runtime.GOMAXPROCS(-1)
+	N := int32(b.N / CallsPerSched)
+	c := make(chan bool, procs)
+	for p := 0; p < procs; p++ {
+		go func() {
+			for atomic.AddInt32(&N, -1) >= 0 {
+				runtime.Gosched()
+				for g := 0; g < CallsPerSched; g++ {
+					stackGrowthRecursive(10)
+				}
+			}
+			c <- true
+		}()
+	}
+	for p := 0; p < procs; p++ {
+		<-c
+	}
+}
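
BenchmarkStackGrowth hands out b.N iterations in batches of CallsPerSched through a shared atomic counter, so the GOMAXPROCS workers synchronize once per thousand calls rather than per iteration, and runtime.Gosched between batches keeps goroutines migrating across Ms. One way to reproduce the -2/-4/... rows of the table above is to drive the same body with testing.Benchmark at several GOMAXPROCS settings; the driver below is a hypothetical harness, not part of this CL:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
	"testing"
)

func stackGrowthRecursive(i int) {
	var pad [128]uint64
	if i != 0 && pad[0] == 0 {
		stackGrowthRecursive(i - 1)
	}
}

// benchmarkStackGrowth repeats the benchmark body from the CL.
func benchmarkStackGrowth(b *testing.B) {
	const callsPerSched = 1000
	procs := runtime.GOMAXPROCS(-1)
	n := int32(b.N / callsPerSched)
	c := make(chan bool, procs)
	for p := 0; p < procs; p++ {
		go func() {
			for atomic.AddInt32(&n, -1) >= 0 {
				runtime.Gosched()
				for g := 0; g < callsPerSched; g++ {
					stackGrowthRecursive(10)
				}
			}
			c <- true
		}()
	}
	for p := 0; p < procs; p++ {
		<-c
	}
}

func main() {
	// Mimic the -1/-2/... suffixes in the commit message table.
	for _, procs := range []int{1, 2, 4, 8, 16} {
		runtime.GOMAXPROCS(procs)
		r := testing.Benchmark(benchmarkStackGrowth)
		fmt.Printf("BenchmarkStackGrowth-%d\t%v\n", procs, r)
	}
}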
src/pkg/runtime/runtime.h

@@ -57,6 +57,7 @@ typedef struct String String;
 typedef	struct	Usema		Usema;
 typedef	struct	SigTab		SigTab;
 typedef	struct	MCache		MCache;
+typedef	struct	FixAlloc	FixAlloc;
 typedef	struct	Iface		Iface;
 typedef	struct	Itab		Itab;
 typedef	struct	Eface		Eface;
@@ -236,6 +237,7 @@ struct M
 	M*	schedlink;
 	uint32	machport;	// Return address for Mach IPC (OS X)
 	MCache	*mcache;
+	FixAlloc	*stackalloc;
 	G*	lockedg;
 	G*	idleg;
 	uint32	freglo[16];	// D[i] lsb and F[i]
src/pkg/runtime/stack.h

@@ -71,6 +71,7 @@ enum {
 	// If the amount needed for the splitting frame + StackExtra
 	// is less than this number, the stack will have this size instead.
 	StackMin = 4096,
+	FixedStack = StackMin,
 
 	// Functions that need frames bigger than this call morestack
 	// unconditionally.  That is, on entry to a function it is assumed
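
FixedStack is pinned to StackMin because, per the comment above it, any frame that fits under StackMin (after adding StackExtra) gets a segment of exactly this size: the overwhelmingly common allocation is one fixed size, which is what lets the per-M free list serve it, while oversized frames fall through to mallocgc. A hypothetical sketch of that sizing policy (the StackExtra value is made up for illustration):

package main

import "fmt"

const (
	stackMin   = 4096
	fixedStack = stackMin // FixedStack = StackMin in this CL
	stackExtra = 1024     // illustrative value, not the runtime's
)

// segmentSize models the policy the stack.h comment describes: small
// frames round up to one standard segment size (the fixed-size per-M
// allocator path), and only large frames get a custom-sized segment
// (the general malloc path in stackalloc).
func segmentSize(framesize int) int {
	if framesize+stackExtra < stackMin {
		return stackMin // n == FixedStack: per-M free list
	}
	return framesize + stackExtra // oversized: mallocgc
}

func main() {
	for _, f := range []int{128, 1024, 8192} {
		fmt.Printf("frame %5d -> segment %5d\n", f, segmentSize(f))
	}
}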