Commit 2a889b9d authored by Austin Clements

runtime: make stack re-scan O(# dirty stacks)

Currently the stack re-scan during mark termination is O(# stacks)
because we enqueue a root marking job for every goroutine. It takes
~34ns to process this root marking job for a valid (clean) stack, so
at around 300k goroutines we exceed the 10ms pause goal. A non-trivial
portion of this time is spent simply taking the cache miss to check
the gcscanvalid flag, so optimizing the path that handles clean
stacks can only improve things so much.
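(Arithmetic check: 34 ns × 300,000 goroutines ≈ 10.2 ms, so a few
hundred thousand clean stacks alone consume the entire pause budget.)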

Fix this by keeping an explicit list of goroutines with dirty stacks
that need to be rescanned. When a goroutine first transitions to
running after a stack scan and marks its stack dirty, it adds itself
to this list. We enqueue root marking jobs only for the goroutines in
this list, so this improves stack re-scanning asymptotically by
completely eliminating time spent on clean goroutines.
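
To make the mechanism concrete, here is a minimal, self-contained
sketch of the dirty-list idea. The types and names (G, markDirty,
rescan) are illustrative, not the runtime's:

    package main

    import "fmt"

    // G is a toy stand-in for the runtime's g, modeling only the
    // fields relevant to the rescan list.
    type G struct {
        id          int
        gcscanvalid bool // stack unchanged since its last scan
        rescanIdx   int  // index in rescan, or -1 if not queued
    }

    // rescan holds the goroutines whose stacks are dirty.
    var rescan []*G

    // markDirty records that g has run since its last stack scan,
    // mirroring the queueRescan path in this change.
    func markDirty(g *G) {
        if g.rescanIdx != -1 {
            return // already queued
        }
        g.gcscanvalid = false
        g.rescanIdx = len(rescan)
        rescan = append(rescan, g)
    }

    func main() {
        gs := make([]*G, 500000)
        for i := range gs {
            gs[i] = &G{id: i, gcscanvalid: true, rescanIdx: -1}
        }
        markDirty(gs[42]) // only one G ran since its last scan
        // Mark termination now visits len(rescan) Gs, not len(gs).
        fmt.Println("stacks to re-scan:", len(rescan))
    }

Storing each G's index in the list (gcRescan in the real change) is
what makes O(1) removal possible when a stack is scanned clean during
concurrent mark.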

This reduces mark termination time for 500k idle goroutines from 15ms
to 238µs. Overall performance effect is negligible.

name \ 95%ile-time/markTerm     old           new         delta
IdleGs/gs:500000/gomaxprocs:12  15000µs ± 0%  238µs ± 5%  -98.41% (p=0.000 n=10+10)

name              old time/op  new time/op  delta
XBenchGarbage-12  2.30ms ± 3%  2.29ms ± 1%  -0.43%  (p=0.049 n=17+18)

name                      old time/op    new time/op    delta
BinaryTree17-12              2.57s ± 3%     2.59s ± 2%    ~     (p=0.141 n=19+20)
Fannkuch11-12                2.09s ± 0%     2.10s ± 1%  +0.53%  (p=0.000 n=19+19)
FmtFprintfEmpty-12          45.3ns ± 3%    45.2ns ± 2%    ~     (p=0.845 n=20+20)
FmtFprintfString-12          129ns ± 0%     127ns ± 0%  -1.55%  (p=0.000 n=16+16)
FmtFprintfInt-12             123ns ± 0%     119ns ± 1%  -3.24%  (p=0.000 n=19+19)
FmtFprintfIntInt-12          195ns ± 1%     189ns ± 1%  -3.11%  (p=0.000 n=17+17)
FmtFprintfPrefixedInt-12     193ns ± 1%     187ns ± 1%  -3.06%  (p=0.000 n=19+19)
FmtFprintfFloat-12           254ns ± 0%     255ns ± 1%  +0.35%  (p=0.001 n=14+17)
FmtManyArgs-12               781ns ± 0%     770ns ± 0%  -1.48%  (p=0.000 n=16+19)
GobDecode-12                7.00ms ± 1%    6.98ms ± 1%    ~     (p=0.563 n=19+19)
GobEncode-12                5.91ms ± 1%    5.92ms ± 0%    ~     (p=0.118 n=19+18)
Gzip-12                      219ms ± 1%     215ms ± 1%  -1.81%  (p=0.000 n=18+18)
Gunzip-12                   37.2ms ± 0%    37.4ms ± 0%  +0.45%  (p=0.000 n=17+19)
HTTPClientServer-12         76.9µs ± 3%    77.5µs ± 2%  +0.81%  (p=0.030 n=20+19)
JSONEncode-12               15.0ms ± 0%    14.8ms ± 1%  -0.88%  (p=0.001 n=15+19)
JSONDecode-12               50.6ms ± 0%    53.2ms ± 2%  +5.07%  (p=0.000 n=17+19)
Mandelbrot200-12            4.05ms ± 0%    4.05ms ± 1%    ~     (p=0.581 n=16+17)
GoParse-12                  3.34ms ± 1%    3.30ms ± 1%  -1.21%  (p=0.000 n=15+20)
RegexpMatchEasy0_32-12      69.6ns ± 1%    69.8ns ± 2%    ~     (p=0.566 n=19+19)
RegexpMatchEasy0_1K-12       238ns ± 1%     236ns ± 0%  -0.91%  (p=0.000 n=17+13)
RegexpMatchEasy1_32-12      69.8ns ± 1%    70.0ns ± 1%  +0.23%  (p=0.026 n=17+16)
RegexpMatchEasy1_1K-12       371ns ± 1%     363ns ± 1%  -2.07%  (p=0.000 n=19+19)
RegexpMatchMedium_32-12      107ns ± 2%     106ns ± 1%  -0.51%  (p=0.031 n=18+20)
RegexpMatchMedium_1K-12     33.0µs ± 0%    32.9µs ± 0%  -0.30%  (p=0.004 n=16+16)
RegexpMatchHard_32-12       1.70µs ± 0%    1.70µs ± 0%  +0.45%  (p=0.000 n=16+17)
RegexpMatchHard_1K-12       51.1µs ± 2%    51.4µs ± 1%  +0.53%  (p=0.000 n=17+19)
Revcomp-12                   378ms ± 1%     385ms ± 1%  +1.92%  (p=0.000 n=19+18)
Template-12                 64.3ms ± 2%    65.0ms ± 2%  +1.09%  (p=0.001 n=19+19)
TimeParse-12                 315ns ± 1%     317ns ± 2%    ~     (p=0.108 n=18+20)
TimeFormat-12                360ns ± 1%     337ns ± 0%  -6.30%  (p=0.000 n=18+13)
[Geo mean]                  51.8µs         51.6µs       -0.48%

Change-Id: Icf8994671476840e3998236e15407a505d4c760c
Reviewed-on: https://go-review.googlesource.com/20700
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 5b765ce3
@@ -762,7 +762,7 @@ var work struct {
 	alldone note
 
 	// Number of roots of various root types. Set by gcMarkRootPrepare.
-	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
+	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots int
 
 	// markrootDone indicates that roots have been marked at least
 	// once during the current GC cycle. This is checked by root
@@ -830,6 +830,14 @@ var work struct {
 		head, tail guintptr
 	}
 
+	// rescan is a list of G's that need to be rescanned during
+	// mark termination. A G adds itself to this list when it
+	// first invalidates its stack scan.
+	rescan struct {
+		lock mutex
+		list []guintptr
+	}
+
 	// Timing/utilization stats for this cycle.
 	stwprocs, maxprocs                 int32
 	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
@@ -1736,14 +1744,22 @@ func gcCopySpans() {
 func gcResetMarkState() {
 	// This may be called during a concurrent phase, so make sure
 	// allgs doesn't change.
+	if !(gcphase == _GCoff || gcphase == _GCmarktermination) {
+		// Accessing gcRescan is unsafe.
+		throw("bad GC phase")
+	}
 	lock(&allglock)
 	for _, gp := range allgs {
 		gp.gcscandone = false  // set to true in gcphasework
 		gp.gcscanvalid = false // stack has not been scanned
+		gp.gcRescan = -1
 		gp.gcAssistBytes = 0
 	}
 	unlock(&allglock)
 
+	// Clear rescan list.
+	work.rescan.list = work.rescan.list[:0]
+
 	work.bytesMarked = 0
 	work.initialHeapLive = memstats.heap_live
 	work.markrootDone = false
...
@@ -32,6 +32,8 @@ const (
 //
 // The caller must have call gcCopySpans().
 //
+// The world must be stopped.
+//
 //go:nowritebarrier
 func gcMarkRootPrepare() {
 	// Compute how many data and BSS root blocks there are.
@@ -63,24 +65,31 @@ func gcMarkRootPrepare() {
 		// after concurrent mark. In STW GC, this will happen
 		// during mark termination.
 		work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans
+
+		// On the first markroot, we need to scan all Gs. Gs
+		// may be created after this point, but it's okay that
+		// we ignore them because they begin life without any
+		// roots, so there's nothing to scan, and any roots
+		// they create during the concurrent phase will be
+		// scanned during mark termination. During mark
+		// termination, allglen isn't changing, so we'll scan
+		// all Gs.
+		work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+		work.nRescanRoots = 0
 	} else {
 		// We've already scanned span roots and kept the scan
 		// up-to-date during concurrent mark.
 		work.nSpanRoots = 0
-	}
 
-	// Snapshot of allglen. During concurrent scan, we just need
-	// to be consistent about how many markroot jobs we create and
-	// how many Gs we check. Gs may be created after this point,
-	// but it's okay that we ignore them because they begin life
-	// without any roots, so there's nothing to scan, and any
-	// roots they create during the concurrent phase will be
-	// scanned during mark termination. During mark termination,
-	// allglen isn't changing, so we'll scan all Gs.
-	work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+		// On the second pass of markroot, we're just scanning
+		// dirty stacks. It's safe to access rescan since the
+		// world is stopped.
+		work.nStackRoots = 0
+		work.nRescanRoots = len(work.rescan.list)
+	}
 
 	work.markrootNext = 0
-	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
 }
 
 // gcMarkRootCheck checks that all roots have been scanned. It is
@@ -92,11 +101,24 @@ func gcMarkRootCheck() {
 	}
 
 	lock(&allglock)
-	// Check that gc work is done.
-	for i := 0; i < work.nStackRoots; i++ {
-		gp := allgs[i]
-		if !gp.gcscandone {
-			throw("scan missed a g")
+	// Check that stacks have been scanned.
+	if gcphase == _GCmarktermination {
+		for i := 0; i < len(allgs); i++ {
+			gp := allgs[i]
+			if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead {
+				println("gp", gp, "goid", gp.goid,
+					"status", readgstatus(gp),
+					"gcscandone", gp.gcscandone,
+					"gcscanvalid", gp.gcscanvalid)
+				throw("scan missed a g")
+			}
+		}
+	} else {
+		for i := 0; i < work.nStackRoots; i++ {
+			gp := allgs[i]
+			if !gp.gcscandone {
+				throw("scan missed a g")
+			}
 		}
 	}
 	unlock(&allglock)
@@ -109,12 +131,18 @@ var oneptrmask = [...]uint8{1}
 //
 // Preemption must be disabled (because this uses a gcWork).
 //
+// nowritebarrier is only advisory here.
+//
 //go:nowritebarrier
 func markroot(gcw *gcWork, i uint32) {
+	// TODO(austin): This is a bit ridiculous. Compute and store
+	// the bases in gcMarkRootPrepare instead of the counts.
 	baseData := uint32(fixedRootCount)
 	baseBSS := baseData + uint32(work.nDataRoots)
 	baseSpans := baseBSS + uint32(work.nBSSRoots)
 	baseStacks := baseSpans + uint32(work.nSpanRoots)
+	baseRescan := baseStacks + uint32(work.nStackRoots)
+	end := baseRescan + uint32(work.nRescanRoots)
 
 	// Note: if you add a case here, please also update heapdump.go:dumproots.
 	switch {
@@ -151,10 +179,14 @@ func markroot(gcw *gcWork, i uint32) {
 
 	default:
 		// the rest is scanning goroutine stacks
-		if uintptr(i-baseStacks) >= allglen {
+		var gp *g
+		if baseStacks <= i && i < baseRescan {
+			gp = allgs[i-baseStacks]
+		} else if baseRescan <= i && i < end {
+			gp = work.rescan.list[i-baseRescan].ptr()
+		} else {
 			throw("markroot: bad index")
 		}
-		gp := allgs[i-baseStacks]
 
 		// remember when we've first observed the G blocked
 		// needed only to output in traceback
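
An aside on the index arithmetic above: markroot's flat job index is
partitioned into contiguous ranges, one per root type, with the new
rescan range at the end. A standalone sketch with made-up counts
(none of these values are the runtime's):

    package main

    import "fmt"

    func main() {
        // Made-up counts. In the real code, the first markroot pass
        // has nRescanRoots = 0, and the mark-termination pass has
        // nStackRoots = 0 (shown here), so the two ranges never
        // coexist.
        const fixedRootCount = 2
        nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots := 3, 1, 4, 0, 2

        baseData := uint32(fixedRootCount)
        baseBSS := baseData + uint32(nDataRoots)
        baseSpans := baseBSS + uint32(nBSSRoots)
        baseStacks := baseSpans + uint32(nSpanRoots)
        baseRescan := baseStacks + uint32(nStackRoots)
        end := baseRescan + uint32(nRescanRoots)

        for i := uint32(0); i < end; i++ {
            switch {
            case i < baseData:
                fmt.Println(i, "fixed root")
            case i < baseBSS:
                fmt.Println(i, "data root", i-baseData)
            case i < baseSpans:
                fmt.Println(i, "BSS root", i-baseBSS)
            case i < baseStacks:
                fmt.Println(i, "span root", i-baseSpans)
            case i < baseRescan:
                fmt.Println(i, "stack root (allgs)", i-baseStacks)
            default:
                fmt.Println(i, "rescan root", i-baseRescan)
            }
        }
    }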
@@ -163,13 +195,14 @@ func markroot(gcw *gcWork, i uint32) {
 			gp.waitsince = work.tstart
 		}
 
-		if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC {
+		if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC && readgstatus(gp) != _Gdead {
 			// GC background workers may be
 			// non-preemptible, so we may deadlock if we
 			// try to scan them during a concurrent phase.
 			// They also have tiny stacks, so just ignore
 			// them until mark termination.
 			gp.gcscandone = true
+			queueRescan(gp)
 			break
 		}
@@ -721,6 +754,14 @@ func scanstack(gp *g) {
 		gcw.dispose()
 	}
 	gcUnlockStackBarriers(gp)
+	if gcphase == _GCmark {
+		// gp may have added itself to the rescan list between
+		// when GC started and now. It's clean now, so remove
+		// it. This isn't safe during mark termination because
+		// mark termination is consuming this list, but it's
+		// also not necessary.
+		dequeueRescan(gp)
+	}
 	gp.gcscanvalid = true
 }
@@ -797,6 +838,60 @@ func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
 	}
 }
 
+// queueRescan adds gp to the stack rescan list and clears
+// gp.gcscanvalid. The caller must own gp and ensure that gp isn't
+// already on the rescan list.
+func queueRescan(gp *g) {
+	if gcphase == _GCoff {
+		gp.gcscanvalid = false
+		return
+	}
+	if gp.gcRescan != -1 {
+		throw("g already on rescan list")
+	}
+
+	lock(&work.rescan.lock)
+	gp.gcscanvalid = false
+
+	// Recheck gcphase under the lock in case there was a phase change.
+	if gcphase == _GCoff {
+		unlock(&work.rescan.lock)
+		return
+	}
+	if len(work.rescan.list) == cap(work.rescan.list) {
+		throw("rescan list overflow")
+	}
+	n := len(work.rescan.list)
+	gp.gcRescan = int32(n)
+	work.rescan.list = work.rescan.list[:n+1]
+	work.rescan.list[n].set(gp)
+	unlock(&work.rescan.lock)
+}
+
+// dequeueRescan removes gp from the stack rescan list, if gp is on
+// the rescan list. The caller must own gp.
+func dequeueRescan(gp *g) {
+	if gp.gcRescan == -1 {
+		return
+	}
+	if gcphase == _GCoff {
+		gp.gcRescan = -1
+		return
+	}
+
+	lock(&work.rescan.lock)
+	if work.rescan.list[gp.gcRescan].ptr() != gp {
+		throw("bad dequeueRescan")
+	}
+	// Careful: gp may itself be the last G on the list.
+	last := work.rescan.list[len(work.rescan.list)-1]
+	work.rescan.list[gp.gcRescan] = last
+	last.ptr().gcRescan = gp.gcRescan
+	gp.gcRescan = -1
+	work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
+	unlock(&work.rescan.lock)
+}
+
 type gcDrainFlags int
 
 const (
...
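
A note on dequeueRescan above: since list order is irrelevant,
removal is O(1): the last element is swapped into the vacated slot
and its back-pointer (gcRescan) is updated. The same pattern on a
plain int slice, purely as an illustration:

    package main

    import "fmt"

    // swapRemove deletes s[i] in O(1) by moving the last element into
    // its slot. Order is not preserved, which is fine for a list that
    // is only ever iterated in full. When i is the last index, the
    // element is swapped with itself, which still works; that is the
    // "gp may itself be the last G" case in dequeueRescan.
    func swapRemove(s []int, i int) []int {
        s[i] = s[len(s)-1]
        return s[:len(s)-1]
    }

    func main() {
        s := []int{10, 20, 30, 40}
        s = swapRemove(s, 1)
        fmt.Println(s) // [10 40 30]
    }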
@@ -402,6 +402,16 @@ func allgadd(gp *g) {
 	lock(&allglock)
 	allgs = append(allgs, gp)
 	allglen = uintptr(len(allgs))
+
+	// Grow GC rescan list if necessary.
+	if len(allgs) > cap(work.rescan.list) {
+		lock(&work.rescan.lock)
+		l := work.rescan.list
+		// Let append do the heavy lifting, but keep the
+		// length the same.
+		work.rescan.list = append(l[:cap(l)], 0)[:len(l)]
+		unlock(&work.rescan.lock)
+	}
 	unlock(&allglock)
 }
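
The append(l[:cap(l)], 0)[:len(l)] line above is a compact idiom for
growing a slice's backing array while leaving its length unchanged.
Pre-growing here, under allglock, is what lets queueRescan later
extend the list up to capacity without allocating, and simply throw
on overflow. A small demonstration of the idiom:

    package main

    import "fmt"

    func main() {
        l := make([]int, 2, 4)
        // Extend to full capacity, append one element to force
        // reallocation, then re-slice back to the original length.
        l = append(l[:cap(l)], 0)[:len(l)]
        fmt.Println(len(l), cap(l) >= 5) // 2 true
    }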
@@ -754,8 +764,9 @@ func casgstatus(gp *g, oldval, newval uint32) {
 			nextYield = nanotime() + yieldDelay/2
 		}
 	}
-	if newval == _Grunning {
-		gp.gcscanvalid = false
+	if newval == _Grunning && gp.gcscanvalid {
+		// Run queueRescan on the system stack so it has more space.
+		systemstack(func() { queueRescan(gp) })
 	}
 }
@@ -1405,6 +1416,8 @@ func newextram() {
 	gp.syscallpc = gp.sched.pc
 	gp.syscallsp = gp.sched.sp
 	gp.stktopsp = gp.sched.sp
+	gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary
+	gp.gcRescan = -1
 	// malg returns status as Gidle, change to Gsyscall before adding to allg
 	// where GC will see it.
 	casgstatus(gp, _Gidle, _Gsyscall)
@@ -2210,6 +2223,10 @@ func goexit0(gp *g) {
 	gp.waitreason = ""
 	gp.param = nil
 
+	// Note that gp's stack scan is now "valid" because it has no
+	// stack. We could dequeueRescan, but that takes a lock and
+	// isn't really necessary.
+	gp.gcscanvalid = true
 	dropg()
 
 	if _g_.m.locked&^_LockExternal != 0 {
@@ -2700,6 +2717,7 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr
 	if newg == nil {
 		newg = malg(_StackMin)
 		casgstatus(newg, _Gidle, _Gdead)
+		newg.gcRescan = -1
 		allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
 	}
 	if newg.stack.hi == 0 {
@@ -2733,6 +2751,17 @@ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr
 	if isSystemGoroutine(newg) {
 		atomic.Xadd(&sched.ngsys, +1)
 	}
+	// The stack is dirty from the argument frame, so queue it for
+	// scanning. Do this before setting it to runnable so we still
+	// own the G. If we're recycling a G, it may already be on the
+	// rescan list.
+	if newg.gcRescan == -1 {
+		queueRescan(newg)
+	} else {
+		// The recycled G is already on the rescan list. Just
+		// mark the stack dirty.
+		newg.gcscanvalid = false
+	}
 	casgstatus(newg, _Gdead, _Grunnable)
 
 	if _p_.goidcache == _p_.goidcacheend {
...
@@ -336,7 +336,7 @@ type g struct {
 	paniconfault   bool // panic (instead of crash) on unexpected fault address
 	preemptscan    bool // preempted g does scan for gc
 	gcscandone     bool // g has scanned stack; protected by _Gscan bit in status
-	gcscanvalid    bool // false at start of gc cycle, true if G has not run since last scan
+	gcscanvalid    bool // false at start of gc cycle, true if G has not run since last scan; transition from true to false by calling queueRescan and false to true by calling dequeueRescan
 	throwsplit     bool // must not split stack
 	raceignore     int8 // ignore race detection events
 	sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine
@@ -354,7 +354,14 @@ type g struct {
 	racectx uintptr
 	waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
 
-	// Per-G gcController state
+	// Per-G GC state
+
+	// gcRescan is this G's index in work.rescan.list. If this is
+	// -1, this G is not on the rescan list.
+	//
+	// If gcphase != _GCoff and this G is visible to the garbage
+	// collector, writes to this are protected by work.rescan.lock.
+	gcRescan int32
 
 	// gcAssistBytes is this G's GC assist credit in terms of
 	// bytes allocated. If this is positive, then the G has credit
...