Commit d324f214 authored by Russ Cox's avatar Russ Cox

runtime: parallelize garbage collector mark + sweep

Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r 	 1 cpu, no atomics
32.27u 0.58s 32.86r 	 1 cpu, atomic instructions
33.04u 0.83s 27.47r 	 2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r 	 1 cpu, no atomics
34.87u 1.12s 29.60r 	 2 cpu
36.00u 1.87s 28.43r 	 3 cpu
36.46u 2.34s 27.10r 	 4 cpu
38.28u 3.85s 26.92r 	 5 cpu
37.72u 5.25s 26.73r	 6 cpu
39.63u 7.11s 26.95r	 7 cpu
39.67u 8.10s 26.68r	 8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r 	 1 cpu, no atomics
43.98u 2.95s 38.69r 	 2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r 	 1 cpu, no atomics
57.15u 4.72s 51.54r 	 2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to do something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082
parent b0cddb98
......@@ -97,7 +97,7 @@ TEXT runtime·sigtramp(SB),7,$40
// save g
MOVL g(CX), DI
MOVL DI, 20(SP)
// g = m->gsignal
MOVL m(CX), BP
MOVL m_gsignal(BP), BP
......@@ -111,7 +111,7 @@ TEXT runtime·sigtramp(SB),7,$40
MOVL context+16(FP), BX
MOVL BX, 8(SP)
MOVL DI, 12(SP)
MOVL handler+0(FP), BX
CALL BX
......@@ -138,6 +138,26 @@ TEXT runtime·sigaltstack(SB),7,$0
CALL runtime·notok(SB)
RET
// void runtime·usleep(uint32 usec)
// Sleep for usec microseconds by calling select(2) with no fds,
// only a timeout. Frame holds the syscall args plus a struct timeval.
TEXT runtime·usleep(SB),7,$32
MOVL $0, DX
MOVL usec+0(FP), AX
MOVL $1000000, CX
DIVL CX // AX = usec/1e6 (seconds), DX = usec%1e6 (microseconds)
MOVL AX, 24(SP) // sec
MOVL DX, 28(SP) // usec
// select(0, 0, 0, 0, &tv)
MOVL $0, 0(SP) // "return PC" - ignored
MOVL $0, 4(SP) // nfds
MOVL $0, 8(SP) // readfds
MOVL $0, 12(SP) // writefds
MOVL $0, 16(SP) // exceptfds
LEAL 24(SP), AX
MOVL AX, 20(SP) // timeout = &tv
MOVL $93, AX // select (Darwin syscall number)
INT $0x80
RET
// void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void))
// System call args are: func arg stack pthread flags.
TEXT runtime·bsdthread_create(SB),7,$32
......@@ -309,3 +329,12 @@ TEXT runtime·setldt(SB),7,$32
XORL AX, AX
MOVW GS, AX
RET
// int32 runtime·sysctl(uint32 *mib, uint32 cnt, byte *out, uintptr *size, byte *dst, uintptr ndst)
// Arguments are read by the kernel directly from the caller's stack
// (frame size 0). Returns 0 on success, -errno on failure.
TEXT runtime·sysctl(SB),7,$0
MOVL $202, AX // __sysctl (Darwin syscall number)
INT $0x80
JAE 3(PC) // carry clear: success; skip the error path
NEGL AX // failure: return -errno
RET
MOVL $0, AX // success: return 0
RET
......@@ -81,11 +81,11 @@ TEXT runtime·sigaction(SB),7,$0
TEXT runtime·sigtramp(SB),7,$64
get_tls(BX)
// save g
MOVQ g(BX), R10
MOVQ R10, 48(SP)
// g = m->gsignal
MOVQ m(BX), BP
MOVQ m_gsignal(BP), BP
......@@ -146,6 +146,24 @@ TEXT runtime·sigaltstack(SB),7,$0
CALL runtime·notok(SB)
RET
// void runtime·usleep(uint32 usec)
// Sleep for usec microseconds by calling select(2) with no fds,
// only a timeout. struct timeval lives at 0(SP).
TEXT runtime·usleep(SB),7,$16
MOVL $0, DX
MOVL usec+0(FP), AX
MOVL $1000000, CX
DIVL CX // AX = usec/1e6 (seconds), DX = usec%1e6 (microseconds)
MOVQ AX, 0(SP) // sec
MOVL DX, 8(SP) // usec
// select(0, 0, 0, 0, &tv)
MOVL $0, DI // nfds
MOVL $0, SI // readfds
MOVL $0, DX // writefds
MOVL $0, R10 // exceptfds
MOVQ SP, R8 // timeout = &tv
MOVL $(0x2000000+93), AX // select is 93 on Darwin (23 is Linux's number)
SYSCALL
RET
// void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void))
TEXT runtime·bsdthread_create(SB),7,$0
// Set up arguments to bsdthread_create system call.
......@@ -189,7 +207,7 @@ TEXT runtime·bsdthread_start(SB),7,$0
POPQ SI
POPQ CX
POPQ DX
get_tls(BX)
MOVQ CX, m(BX)
MOVQ SI, m_procid(CX) // thread port is m->procid
......@@ -293,3 +311,18 @@ TEXT runtime·settls(SB),7,$32
MOVL $(0x3000000+3), AX // thread_fast_set_cthread_self - machdep call #3
SYSCALL
RET
// int32 runtime·sysctl(uint32 *mib, uint32 cnt, byte *out, uintptr *size, byte *dst, uintptr ndst)
// Loads the six C arguments from the caller's stack into the
// SysV syscall registers. Returns 0 on success, -errno on failure.
TEXT runtime·sysctl(SB),7,$0
MOVQ 8(SP), DI // mib
MOVL 16(SP), SI // cnt
MOVQ 24(SP), DX // out
MOVQ 32(SP), R10 // size
MOVQ 40(SP), R8 // dst
MOVQ 48(SP), R9 // ndst
MOVL $(0x2000000+202), AX // syscall entry
SYSCALL
JCC 3(PC) // carry clear: success; skip the error path
NEGL AX // failure: return -errno
RET
MOVL $0, AX // success: return 0
RET
......@@ -18,6 +18,7 @@ uint32 runtime·mach_task_self(void);
uint32 runtime·mach_task_self(void);
uint32 runtime·mach_thread_self(void);
uint32 runtime·mach_thread_self(void);
int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr);
struct Sigaction;
void runtime·sigaction(uintptr, struct Sigaction*, struct Sigaction*);
......
......@@ -148,6 +148,20 @@ runtime·osinit(void)
if(!runtime·iscgo)
runtime·bsdthread_register();
runtime·destroylock = destroylock;
// Use sysctl to fetch hw.ncpu.
uint32 mib[2];
uint32 out;
int32 ret;
uintptr nout;
mib[0] = 6;
mib[1] = 3;
nout = sizeof out;
out = 0;
ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0);
if(ret >= 0)
runtime·ncpu = out;
}
void
......
......@@ -52,6 +52,25 @@ TEXT runtime·read(SB),7,$0
CALL *runtime·_vdso(SB)
RET
// void runtime·usleep(uint32 usec)
// Sleep for usec microseconds by calling select(2) with no fds,
// only a timeout. On linux/386 syscall 82 is old_select, which
// takes a single pointer (in BX) to a 5-word argument block
// {nfds, readfds, writefds, exceptfds, timeout}.
TEXT runtime·usleep(SB),7,$28
MOVL $0, DX
MOVL usec+0(FP), AX
MOVL $1000000, CX
DIVL CX // AX = usec/1e6 (seconds), DX = usec%1e6 (microseconds)
MOVL AX, 20(SP) // tv.tv_sec
MOVL DX, 24(SP) // tv.tv_usec
// select(0, 0, 0, 0, &tv)
MOVL $0, 0(SP) // nfds
MOVL $0, 4(SP) // readfds
MOVL $0, 8(SP) // writefds
MOVL $0, 12(SP) // exceptfds
LEAL 20(SP), AX
MOVL AX, 16(SP) // timeout = &tv
LEAL 0(SP), BX // old_select takes a pointer to the arg block in BX
MOVL $82, AX // sys_select (old_select)
INT $0x80 // SYSCALL is not a valid kernel entry on 386
RET
TEXT runtime·raisesigpipe(SB),7,$12
MOVL $224, AX // syscall - gettid
CALL *runtime·_vdso(SB)
......@@ -105,16 +124,16 @@ TEXT runtime·rt_sigaction(SB),7,$0
TEXT runtime·sigtramp(SB),7,$44
get_tls(CX)
// save g
MOVL g(CX), DI
MOVL DI, 20(SP)
// g = m->gsignal
MOVL m(CX), BX
MOVL m_gsignal(BX), BX
MOVL BX, g(CX)
// copy arguments for call to sighandler
MOVL sig+0(FP), BX
MOVL BX, 0(SP)
......@@ -125,12 +144,12 @@ TEXT runtime·sigtramp(SB),7,$44
MOVL DI, 12(SP)
CALL runtime·sighandler(SB)
// restore g
get_tls(CX)
MOVL 20(SP), BX
MOVL BX, g(CX)
RET
TEXT runtime·sigignore(SB),7,$0
......@@ -202,7 +221,7 @@ TEXT runtime·clone(SB),7,$0
MOVL $1234, 12(CX)
// cannot use CALL *runtime·_vdso(SB) here, because
// the stack changes during the system call (after
// the stack changes during the system call (after
// CALL *runtime·_vdso(SB), the child is still using
// the parent's stack when executing its RET instruction).
INT $0x80
......
......@@ -50,6 +50,24 @@ TEXT runtime·read(SB),7,$0-24
SYSCALL
RET
// void runtime·usleep(uint32 usec)
// Sleep for usec microseconds by calling select(2) with no fds,
// only a timeout. struct timeval lives at 0(SP).
TEXT runtime·usleep(SB),7,$16
MOVL $0, DX
MOVL usec+0(FP), AX
MOVL $1000000, CX
DIVL CX // AX = usec/1e6 (seconds), DX = usec%1e6 (microseconds)
MOVQ AX, 0(SP) // tv.tv_sec
MOVQ DX, 8(SP) // tv.tv_usec
// select(0, 0, 0, 0, &tv)
MOVL $0, DI // nfds
MOVL $0, SI // readfds
MOVL $0, DX // writefds
MOVL $0, R10 // exceptfds
MOVQ SP, R8 // timeout = &tv
MOVL $23, AX // sys_select on linux/amd64
SYSCALL
RET
TEXT runtime·raisesigpipe(SB),7,$12
MOVL $186, AX // syscall - gettid
SYSCALL
......@@ -195,10 +213,10 @@ TEXT runtime·clone(SB),7,$0
CMPQ AX, $0
JEQ 2(PC)
RET
// In child, on new stack.
MOVQ SI, SP
// Initialize m->procid to Linux tid
MOVL $186, AX // gettid
SYSCALL
......
......@@ -33,6 +33,7 @@
#define SYS_gettid (SYS_BASE + 224)
#define SYS_tkill (SYS_BASE + 238)
#define SYS_sched_yield (SYS_BASE + 158)
#define SYS_select (SYS_BASE + 82)
#define ARM_BASE (SYS_BASE + 0x0f0000)
#define SYS_ARM_cacheflush (ARM_BASE + 2)
......@@ -254,7 +255,7 @@ TEXT runtime·sigtramp(SB),7,$24
// save g
MOVW g, R3
MOVW g, 20(R13)
// g = m->gsignal
MOVW m_gsignal(m), g
......@@ -265,7 +266,7 @@ TEXT runtime·sigtramp(SB),7,$24
MOVW R3, 16(R13)
BL runtime·sighandler(SB)
// restore g
MOVW 20(R13), g
......@@ -285,6 +286,23 @@ TEXT runtime·sigreturn(SB),7,$0
SWI $0
RET
// void runtime·usleep(uint32 usec)
// Sleep for usec microseconds by calling select(2) with no fds,
// only a timeout. struct timeval lives at 4(SP).
// Original code computed DIV R1, R0 with R1 = copy of R0 (usec/usec)
// and then stored R1 (raw usec) as tv_sec and R2 (the constant
// 1000000) as tv_usec; fixed to divide/mod by 1000000 and store
// the quotient/remainder.
TEXT runtime·usleep(SB),7,$12
MOVW usec+0(FP), R0
MOVW R0, R1
MOVW $1000000, R2
DIV R2, R0 // R0 = usec / 1e6 (seconds)
MOD R2, R1 // R1 = usec % 1e6 (microseconds)
MOVW R0, 4(SP) // tv.tv_sec
MOVW R1, 8(SP) // tv.tv_usec
MOVW $0, R0 // nfds
MOVW $0, R1 // readfds
MOVW $0, R2 // writefds
MOVW $0, R3 // exceptfds
MOVW $4(SP), R4 // timeout = &tv
MOVW $SYS_select, R7
SWI $0
RET
// Use kernel version instead of native armcas in ../../arm.s.
// See ../../../sync/atomic/asm_linux_arm.s for details.
TEXT cas<>(SB),7,$0
......
......@@ -8,7 +8,6 @@
#include "stack.h"
extern SigTab runtime·sigtab[];
static int32 proccount;
int32 runtime·open(uint8*, int32, int32);
int32 runtime·close(int32);
......@@ -136,13 +135,10 @@ futexlock(Lock *l)
// its wakeup call.
wait = v;
if(proccount == 0)
proccount = getproccount();
// On uniprocessors, no point spinning.
// On multiprocessors, spin for ACTIVE_SPIN attempts.
spin = 0;
if(proccount > 1)
if(runtime·ncpu > 1)
spin = ACTIVE_SPIN;
for(;;) {
......@@ -276,6 +272,7 @@ runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void))
void
runtime·osinit(void)
{
runtime·ncpu = getproccount();
}
void
......
......@@ -120,6 +120,13 @@ enum
#else
MHeapMap_Bits = 20,
#endif
// Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending
// on the hardware details of the machine. The second
// proc is the one that helps the most (after the first),
// so start with just 2 for now.
MaxGcproc = 2,
};
// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)
......@@ -192,7 +199,7 @@ struct MStats
uint64 nlookup; // number of pointer lookups
uint64 nmalloc; // number of mallocs
uint64 nfree; // number of frees
// Statistics about malloc heap.
// protected by mheap.Lock
uint64 heap_alloc; // bytes allocated and still in use
......@@ -210,7 +217,7 @@ struct MStats
uint64 mcache_inuse; // MCache structures
uint64 mcache_sys;
uint64 buckhash_sys; // profiling bucket hash table
// Statistics about garbage collector.
// Protected by stopping the world during GC.
uint64 next_gc; // next GC (in heap_alloc time)
......@@ -219,7 +226,7 @@ struct MStats
uint32 numgc;
bool enablegc;
bool debuggc;
// Statistics about allocation size classes.
struct {
uint32 size;
......@@ -240,7 +247,7 @@ extern MStats mstats;
//
// class_to_size[i] = largest size in class i
// class_to_allocnpages[i] = number of pages to allocate when
// making new objects in class i
// making new objects in class i
// class_to_transfercount[i] = number of objects to move when
// taking a bunch of objects out of the central lists
// and putting them in the thread free list.
......@@ -279,7 +286,7 @@ struct MCache
int64 nmalloc;
int64 nfree;
} local_by_size[NumSizeClasses];
};
void* runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed);
......@@ -352,7 +359,7 @@ struct MHeap
byte *arena_start;
byte *arena_used;
byte *arena_end;
// central free lists for small size classes.
// the union makes sure that the MCentrals are
// spaced 64 bytes apart, so that each MCentral.Lock
......@@ -400,6 +407,8 @@ enum
void runtime·MProf_Malloc(void*, uintptr);
void runtime·MProf_Free(void*, uintptr);
int32 runtime·helpgc(void);
void runtime·gchelper(void);
// Malloc profiling settings.
// Must match definition in extern.go.
......
This diff is collapsed.
......@@ -51,7 +51,7 @@ vprintf(int8 *s, byte *base)
uintptr arg, narg;
byte *v;
// lock(&debuglock);
//runtime·lock(&debuglock);
lp = p = s;
arg = 0;
......@@ -152,7 +152,7 @@ vprintf(int8 *s, byte *base)
if(p > lp)
runtime·write(2, lp, p-lp);
// unlock(&debuglock);
//runtime·unlock(&debuglock);
}
#pragma textflag 7
......@@ -348,4 +348,4 @@ runtime·typestring(Eface e, String s)
s = *e.type->string;
FLUSH(&s);
}
......@@ -15,6 +15,7 @@ static void unwindstack(G*, byte*);
static void schedule(G*);
static void acquireproc(void);
static void releaseproc(void);
static M *startm(void);
typedef struct Sched Sched;
......@@ -323,6 +324,9 @@ mcommoninit(M *m)
m->fastrand = 0x49f6428aUL + m->id;
m->stackalloc = runtime·malloc(sizeof(*m->stackalloc));
runtime·FixAlloc_Init(m->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
if(m->mcache == nil)
m->mcache = runtime·allocmcache();
}
// Try to increment mcpu. Report whether succeeded.
......@@ -422,7 +426,7 @@ mget(G *g)
M *m;
// if g has its own m, use it.
if((m = g->lockedm) != nil)
if(g && (m = g->lockedm) != nil)
return m;
// otherwise use general m pool.
......@@ -507,6 +511,7 @@ nextgandunlock(void)
G *gp;
uint32 v;
top:
if(atomic_mcpu(runtime·sched.atomic) >= maxgomaxprocs)
runtime·throw("negative mcpu");
......@@ -584,12 +589,49 @@ nextgandunlock(void)
schedunlock();
runtime·notesleep(&m->havenextg);
if(m->helpgc) {
runtime·gchelper();
m->helpgc = 0;
runtime·lock(&runtime·sched);
goto top;
}
if((gp = m->nextg) == nil)
runtime·throw("bad m->nextg in nextgoroutine");
m->nextg = nil;
return gp;
}
// runtime·helpgc wakes up idle ms to assist with the garbage
// collection mark phase. Returns the number of helper ms woken,
// not counting the calling m (which always participates).
// Each woken m notices m->helpgc in its scheduler loop and calls
// runtime·gchelper before going back to looking for work.
int32
runtime·helpgc(void)
{
M *m;
int32 n, max;
// Figure out how many CPUs to use.
// Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
max = runtime·gomaxprocs;
if(max > runtime·ncpu)
max = runtime·ncpu;
if(max > MaxGcproc)
max = MaxGcproc;
// We're going to use one CPU no matter what.
// Figure out the max number of additional CPUs.
max--;
runtime·lock(&runtime·sched);
n = 0;
while(n < max && (m = mget(nil)) != nil) { // grab only idle ms from the pool
n++;
m->helpgc = 1; // signal: run runtime·gchelper, not a goroutine
m->waitnextg = 0;
runtime·notewakeup(&m->havenextg); // wake the sleeping m
}
runtime·unlock(&runtime·sched);
return n;
}
void
runtime·stoptheworld(void)
{
......@@ -626,15 +668,28 @@ runtime·stoptheworld(void)
schedunlock();
}
// TODO(rsc): Remove. This is only temporary,
// for the mark and sweep collector.
void
runtime·starttheworld(void)
runtime·starttheworld(bool extra)
{
M *m;
schedlock();
runtime·gcwaiting = 0;
setmcpumax(runtime·gomaxprocs);
matchmg();
if(extra && canaddmcpu()) {
// Start a new m that will (we hope) be idle
// and so available to help when the next
// garbage collection happens.
// canaddmcpu above did mcpu++
// (necessary, because m will be doing various
// initialization work so is definitely running),
// but m is not running a specific goroutine,
// so set the helpgc flag as a signal to m's
// first schedule(nil) to mcpu--.
m = startm();
m->helpgc = 1;
}
schedunlock();
}
......@@ -644,8 +699,6 @@ runtime·mstart(void)
{
if(g != m->g0)
runtime·throw("bad runtime·mstart");
if(m->mcache == nil)
m->mcache = runtime·allocmcache();
// Record top of stack for use by mcall.
// Once we call schedule we're never coming back,
......@@ -677,46 +730,55 @@ struct CgoThreadStart
static void
matchmg(void)
{
G *g;
G *gp;
M *mp;
if(m->mallocing || m->gcing)
return;
while(haveg() && canaddmcpu()) {
g = gget();
if(g == nil)
gp = gget();
if(gp == nil)
runtime·throw("gget inconsistency");
// Find the m that will run g.
M *m;
if((m = mget(g)) == nil){
m = runtime·malloc(sizeof(M));
mcommoninit(m);
if(runtime·iscgo) {
CgoThreadStart ts;
if(libcgo_thread_start == nil)
runtime·throw("libcgo_thread_start missing");
// pthread_create will make us a stack.
m->g0 = runtime·malg(-1);
ts.m = m;
ts.g = m->g0;
ts.fn = runtime·mstart;
runtime·asmcgocall(libcgo_thread_start, &ts);
} else {
if(Windows)
// windows will layout sched stack on os stack
m->g0 = runtime·malg(-1);
else
m->g0 = runtime·malg(8192);
runtime·newosproc(m, m->g0, m->g0->stackbase, runtime·mstart);
}
}
mnextg(m, g);
// Find the m that will run gp.
if((mp = mget(gp)) == nil)
mp = startm();
mnextg(mp, gp);
}
}
// startm allocates and starts a new OS thread (m).
// The new thread begins executing in runtime·mstart with no
// goroutine assigned; the caller is responsible for handing it
// work (mnextg) or marking it as a gc helper (m->helpgc).
static M*
startm(void)
{
M *m;
m = runtime·malloc(sizeof(M));
mcommoninit(m);
if(runtime·iscgo) {
CgoThreadStart ts;
if(libcgo_thread_start == nil)
runtime·throw("libcgo_thread_start missing");
// pthread_create will make us a stack.
m->g0 = runtime·malg(-1);
ts.m = m;
ts.g = m->g0;
ts.fn = runtime·mstart;
runtime·asmcgocall(libcgo_thread_start, &ts);
} else {
if(Windows)
// windows will layout sched stack on os stack
m->g0 = runtime·malg(-1);
else
m->g0 = runtime·malg(8192); // scheduler stack for the new m
runtime·newosproc(m, m->g0, m->g0->stackbase, runtime·mstart);
}
return m;
}
// One round of scheduler: find a goroutine and run it.
// The argument is the goroutine that was running before
// schedule was called, or nil if this is the first call.
......@@ -767,6 +829,12 @@ schedule(G *gp)
gp->readyonstop = 0;
readylocked(gp);
}
} else if(m->helpgc) {
// atomic { mcpu-- }
v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift);
if(atomic_mcpu(v) > maxgomaxprocs)
runtime·throw("negative mcpu in scheduler");
m->helpgc = 0;
}
// Find (or wait for) g to run. Unlocks runtime·sched.
......@@ -1097,7 +1165,7 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc)
//printf("newproc1 %p %p narg=%d nret=%d\n", fn, argp, narg, nret);
siz = narg + nret;
siz = (siz+7) & ~7;
// We could instead create a secondary stack frame
// and make it look like goexit was on the original but
// the call to the actual goroutine function was split.
......
......@@ -57,7 +57,7 @@ typedef struct String String;
typedef struct Usema Usema;
typedef struct SigTab SigTab;
typedef struct MCache MCache;
typedef struct FixAlloc FixAlloc;
typedef struct FixAlloc FixAlloc;
typedef struct Iface Iface;
typedef struct Itab Itab;
typedef struct Eface Eface;
......@@ -238,6 +238,7 @@ struct M
int32 waitnextg;
int32 dying;
int32 profilehz;
int32 helpgc;
uint32 fastrand;
uint64 ncgocall;
Note havenextg;
......@@ -406,6 +407,7 @@ extern bool runtime·singleproc;
extern uint32 runtime·panicking;
extern int32 runtime·gcwaiting; // gc is waiting to run
int8* runtime·goos;
int32 runtime·ncpu;
extern bool runtime·iscgo;
extern void (*runtime·destroylock)(Lock*);
......@@ -515,6 +517,7 @@ void runtime·startpanic(void);
void runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp);
void runtime·resetcpuprofiler(int32);
void runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32);
void runtime·usleep(uint32);
#pragma varargck argpos runtime·printf 1
#pragma varargck type "d" int32
......@@ -534,7 +537,7 @@ void runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32);
// TODO(rsc): Remove. These are only temporary,
// for the mark and sweep collector.
void runtime·stoptheworld(void);
void runtime·starttheworld(void);
void runtime·starttheworld(bool);
/*
* mutual exclusion locks. in the uncontended case,
......
......@@ -18,7 +18,7 @@ all: $(addsuffix .out, $(ALL))
$(LD) -o $@ $*.$O
%.bench: %.out
./$*.out
time ./$*.out
bench: $(addsuffix .bench, $(ALL))
......
......@@ -73,10 +73,6 @@ func parseDir(dirpath string) map[string]*ast.Package {
}
func main() {
runtime.GOMAXPROCS(4)
go func() {}()
go func() {}()
go func() {}()
st := &runtime.MemStats
packages = append(packages, packages...)
packages = append(packages, packages...)
......@@ -132,7 +128,6 @@ func main() {
}
}
var packages = []string{
"archive/tar",
"asn1",
......@@ -148,7 +143,6 @@ var packages = []string{
"container/ring",
"container/vector",
"crypto/aes",
"crypto/block",
"crypto/blowfish",
"crypto/hmac",
"crypto/md4",
......@@ -167,7 +161,6 @@ var packages = []string{
"debug/macho",
"debug/elf",
"debug/gosym",
"debug/proc",
"ebnf",
"encoding/ascii85",
"encoding/base64",
......@@ -177,9 +170,6 @@ var packages = []string{
"encoding/pem",
"exec",
"exp/datafmt",
"exp/draw",
"exp/eval",
"exp/iterable",
"expvar",
"flag",
"fmt",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment