Commit 8d03acce authored by Austin Clements

runtime: multi-threaded, utilization-scheduled background mark

Currently, the concurrent mark phase is performed by the main GC
goroutine. Prior to the previous commit enabling preemption, this
caused marking to always consume 1/GOMAXPROCS of the available CPU
time. If GOMAXPROCS=1, this meant background GC would consume 100% of
the CPU (effectively a STW). If GOMAXPROCS>4, background GC would use
less than the goal of 25%. If GOMAXPROCS=4, background GC would hit
the 25% goal, but if the mutator wasn't using the remaining 75%,
background marking wouldn't take advantage of the idle time. Enabling
preemption in the previous commit made GC miss CPU targets in
completely different ways, but set us up to bring everything back in
line.
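
For a rough sense of the arithmetic above (an illustration only, not
runtime code): with a single dedicated mark goroutine, marking consumes
1/GOMAXPROCS of the CPU, which only matches the 25% goal when
GOMAXPROCS is exactly 4.

// Illustration only: how one dedicated mark goroutine's CPU share
// compares to the 25% background GC goal at various GOMAXPROCS values.
package main

import "fmt"

func main() {
	const goal = 0.25
	for _, procs := range []int{1, 2, 4, 8, 16} {
		share := 1.0 / float64(procs)
		fmt.Printf("GOMAXPROCS=%d: one mark goroutine uses %5.1f%% of CPU (goal %.0f%%)\n",
			procs, share*100, goal*100)
	}
}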

This change replaces the fixed GC goroutine with per-P background mark
goroutines. Once started, these goroutines don't go in the standard
run queues; instead, they are scheduled specially such that the time
spent in mutator assists and the background mark goroutines totals 25%
of the CPU time available to the program. Furthermore, this lets
background marking take advantage of idle Ps, which significantly
boosts GC performance for applications that under-utilize the CPU.
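
As a sketch of the scheduling arithmetic (the names and exact policy
below are illustrative assumptions; the commit's actual gcController
scheduling code is not in the hunks shown), a 25% utilization goal can
be decomposed per cycle into whole dedicated worker Ps plus a
fractional remainder run part-time on one more P:

// Illustrative decomposition of a 25% CPU goal into dedicated and
// fractional mark workers. Names are hypothetical, not runtime fields.
package main

import "fmt"

func main() {
	const utilizationGoal = 0.25
	for _, procs := range []int{1, 2, 4, 8, 13} {
		target := utilizationGoal * float64(procs) // desired worker "P-equivalents"
		dedicated := int(target)                   // Ps that mark full-time
		fractional := target - float64(dedicated)  // part-time share left for one more P
		fmt.Printf("GOMAXPROCS=%d: %d dedicated + %.2f fractional worker\n",
			procs, dedicated, fractional)
	}
}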

This requires also changing how time is reported for gctrace, so this
change splits the concurrent mark CPU time into assist/background/idle
scanning.
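
The split might be modeled like this (a hedged sketch with made-up
field names; the actual gctrace accounting lives in code not shown in
this excerpt):

// Hypothetical bookkeeping sketch: mark-phase CPU time tracked in three
// buckets so gctrace can report assist, background, and idle scanning
// separately. Field names are illustrative, not the runtime's.
package gcsketch

type markCPUSketch struct {
	assistNs     int64 // mutator time spent in gcAssistAlloc
	backgroundNs int64 // background mark workers running on busy Ps
	idleNs       int64 // mark workers running on otherwise idle Ps
}

func (c markCPUSketch) totalNs() int64 {
	return c.assistNs + c.backgroundNs + c.idleNs
}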

This also requires increasing the size of the StackRecord slice used
in a GoroutineProfile test.

Change-Id: I0936ff907d2cee6cb687a208f2df47e8988e3157
Reviewed-on: https://go-review.googlesource.com/8850
Reviewed-by: Rick Hudson <rlh@golang.org>
parent af060c30
@@ -226,6 +226,8 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
		// just measure start and end time.
		startTime := nanotime()

		xadd(&work.nwait, -1)

		// drain own current wbuf first in the hopes that it
		// will be more cache friendly.
		var gcw gcWork
@@ -240,6 +242,16 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
		// write barrier wbuf cache).
		gcw.dispose()

		// If this is the last worker and we ran out of work,
		// signal a completion point.
		if xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0 {
			// This has reached a background completion
			// point. Is it the first this cycle?
			if cas(&work.bgMarkDone, 0, 1) {
				notewakeup(&work.bgMarkNote)
			}
		}

		duration := nanotime() - startTime
		_p_ := gp.m.p.ptr()
		_p_.gcAssistTime += duration
@@ -398,6 +410,8 @@ func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
	}
}

// TODO(austin): Can we consolidate the gcDrain* functions?

// gcDrain scans objects in work buffers, blackening grey
// objects until all work buffers have been drained.
// If flushScanCredit != -1, gcDrain flushes accumulated scan work
@@ -453,6 +467,58 @@ func gcDrain(gcw *gcWork, flushScanCredit int64) {
	checknocurrentwbuf()
}

// gcDrainUntilPreempt blackens grey objects until g.preempt is set.
// This is best-effort, so it will return as soon as it is unable to
// get work, even though there may be more work in the system.
//go:nowritebarrier
func gcDrainUntilPreempt(gcw *gcWork, flushScanCredit int64) {
	if gcphase != _GCmark {
		println("gcphase =", gcphase)
		throw("gcDrainUntilPreempt phase incorrect")
	}

	var lastScanFlush, nextScanFlush int64
	if flushScanCredit != -1 {
		lastScanFlush = gcw.scanWork
		nextScanFlush = lastScanFlush + flushScanCredit
	} else {
		nextScanFlush = int64(^uint64(0) >> 1)
	}

	gp := getg()
	for !gp.preempt {
		// If the work queue is empty, balance. During
		// concurrent mark we don't really know if anyone else
		// can make use of this work, but even if we're the
		// only worker, the total cost of this per cycle is
		// only O(_WorkbufSize) pointer copies.
		if work.full == 0 && work.partial == 0 {
			gcw.balance()
		}

		b := gcw.tryGet()
		if b == 0 {
			// No more work
			break
		}
		scanobject(b, 0, nil, gcw)

		// Flush background scan work credit to the global
		// account if we've accumulated enough locally so
		// mutator assists can draw on it.
		if gcw.scanWork >= nextScanFlush {
			credit := gcw.scanWork - lastScanFlush
			xaddint64(&gcController.bgScanCredit, credit)
			lastScanFlush = gcw.scanWork
			nextScanFlush = lastScanFlush + flushScanCredit
		}
	}
	if flushScanCredit != -1 {
		credit := gcw.scanWork - lastScanFlush
		xaddint64(&gcController.bgScanCredit, credit)
	}
}
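
For context, a simplified sketch of how a per-P background mark worker
might drive this function (the gcBgMarkWorker goroutine added by this
commit is not in the hunks shown; the flow and the constant below are
assumptions for illustration, relying on the runtime-internal gcWork
identifiers defined above):

// Sketch only: a background mark worker loop built on gcDrainUntilPreempt.
// sketchCreditSlack is a made-up flush threshold, not a runtime constant.
const sketchCreditSlack = 2000 // scan-work units between credit flushes

func bgMarkWorkerSketch(gcw *gcWork) {
	// Drain grey objects until the scheduler preempts this goroutine,
	// flushing scan-work credit periodically so mutator assists can
	// draw against it.
	gcDrainUntilPreempt(gcw, sketchCreditSlack)
	// Publish any remaining local work buffers so other workers and
	// assists can pick them up.
	gcw.dispose()
}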
// gcDrainN blackens grey objects until it has performed roughly
// scanWork units of scan work. This is best-effort, so it may perform
// less work if it fails to get a work buffer. Otherwise, it will
@@ -1335,6 +1335,18 @@ top:
	}

stop:
	// We have nothing to do. If we're in the GC mark phase, run
	// idle-time marking rather than give up the P.
	if _p_ := _g_.m.p.ptr(); gcphase == _GCmark && _p_.gcBgMarkWorker != nil {
		_p_.gcBgMarkIdle = true
		gp := _p_.gcBgMarkWorker
		casgstatus(gp, _Gwaiting, _Grunnable)
		if trace.enabled {
			traceGoUnpark(gp, 0)
		}
		return gp
	}

	// return P and block
	lock(&sched.lock)
	if sched.gcwaiting != 0 {
@@ -1474,6 +1486,12 @@ top:
			resetspinning()
		}
	}
	if gp == nil && gcphase == _GCmark {
		gp = gcController.findRunnable(_g_.m.p.ptr())
		if gp != nil {
			resetspinning()
		}
	}
	if gp == nil {
		// Check the global runnable queue once in a while to ensure fairness.
		// Otherwise two goroutines can completely occupy the local runqueue
@@ -2585,6 +2603,16 @@ func procresize(nprocs int32) *p {
			}
			sched.runqsize++
		}
		// if there's a background worker, make it runnable and put
		// it on the global queue so it can clean itself up
		if p.gcBgMarkWorker != nil {
			casgstatus(p.gcBgMarkWorker, _Gwaiting, _Grunnable)
			if trace.enabled {
				traceGoUnpark(p.gcBgMarkWorker, 0)
			}
			globrunqput(p.gcBgMarkWorker)
			p.gcBgMarkWorker = nil
		}
		for i := range p.sudogbuf {
			p.sudogbuf[i] = nil
		}
@@ -367,7 +367,9 @@ type p struct {
	palloc persistentAlloc // per-P to avoid mutex

	// Per-P GC state
-	gcAssistTime int64 // Nanoseconds in assistAlloc
+	gcAssistTime   int64 // Nanoseconds in assistAlloc
+	gcBgMarkWorker *g
+	gcBgMarkIdle   bool

	pad [64]byte
}
@@ -42,7 +42,7 @@ func TestGoroutineProfile(t *testing.T) {
	if testing.Short() {
		max = 100
	}
-	stk := make([]runtime.StackRecord, 100)
+	stk := make([]runtime.StackRecord, 128)
	for n := 0; n < max; n++ {
		_, ok := runtime.GoroutineProfile(stk)
		if !ok {
@@ -44,6 +44,7 @@ var (
	bgsweepPC            uintptr
	forcegchelperPC      uintptr
	timerprocPC          uintptr
	gcBgMarkWorkerPC     uintptr
	systemstack_switchPC uintptr

	externalthreadhandlerp uintptr // initialized elsewhere
@@ -66,6 +67,7 @@ func tracebackinit() {
	bgsweepPC = funcPC(bgsweep)
	forcegchelperPC = funcPC(forcegchelper)
	timerprocPC = funcPC(timerproc)
	gcBgMarkWorkerPC = funcPC(gcBgMarkWorker)
	systemstack_switchPC = funcPC(systemstack_switch)
}
@@ -654,5 +656,6 @@ func isSystemGoroutine(gp *g) bool {
		pc == backgroundgcPC ||
		pc == bgsweepPC ||
		pc == forcegchelperPC ||
-		pc == timerprocPC
+		pc == timerprocPC ||
+		pc == gcBgMarkWorkerPC
}