Commit 438b9544 authored by Austin Clements's avatar Austin Clements

runtime: check more work flushing races

This adds several new checks to help debug #27993. It adds a mechanism
for freezing write barriers and gcWork puts during the mark completion
algorithm. This way, if we do detect mark completion, we can catch any
puts that happened during the completion algorithm. Based on build
dashboard failures, this seems to be the window of time when these are
happening.

This also double-checks that all work buffers are empty immediately
upon entering mark termination (much earlier than the current check).
This is unlikely to trigger based on the current failures, but is a
good safety net.

Change-Id: I03f56c48c4322069e28c50fbc3c15b2fee2130c2
Reviewed-on: https://go-review.googlesource.com/c/151797
Run-TryBot: Austin Clements <austin@google.com>
Reviewed-by: 's avatarMichael Knyszek <mknyszek@google.com>
parent fbdaa965
......@@ -1363,6 +1363,19 @@ func gcStart(trigger gcTrigger) {
// This is protected by markDoneSema.
var gcMarkDoneFlushed uint32
// debugCachedWork enables extra checks for debugging premature mark
// termination.
//
// For debugging issue #27993.
const debugCachedWork = true
// gcWorkPauseGen is for debugging the mark completion algorithm.
// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen.
// Only used if debugCachedWork is true.
//
// For debugging issue #27993.
var gcWorkPauseGen uint32 = 1
// gcMarkDone transitions the GC from mark to mark termination if all
// reachable objects have been marked (that is, there are no grey
// objects and can be no more in the future). Otherwise, it flushes
......@@ -1408,6 +1421,14 @@ top:
// Flush the write barrier buffer, since this may add
// work to the gcWork.
wbBufFlush1(_p_)
// For debugging, shrink the write barrier
// buffer so it flushes immediately.
// wbBuf.reset will keep it at this size as
// long as throwOnGCWork is set.
if debugCachedWork {
b := &_p_.wbBuf
b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
}
// Flush the gcWork, since this may create global work
// and set the flushedWork flag.
//
......@@ -1418,11 +1439,23 @@ top:
if _p_.gcw.flushedWork {
atomic.Xadd(&gcMarkDoneFlushed, 1)
_p_.gcw.flushedWork = false
} else if debugCachedWork {
// For debugging, freeze the gcWork
// until we know whether we've reached
// completion or not. If we think
// we've reached completion, but
// there's a paused gcWork, then
// that's a bug.
_p_.gcw.pauseGen = gcWorkPauseGen
}
})
})
if gcMarkDoneFlushed != 0 {
if debugCachedWork {
// Release paused gcWorks.
atomic.Xadd(&gcWorkPauseGen, 1)
}
// More grey objects were discovered since the
// previous termination check, so there may be more
// work to do. Keep going. It's possible the
......@@ -1431,7 +1464,12 @@ top:
goto top
}
throwOnGCWork = true
if debugCachedWork {
throwOnGCWork = true
// Release paused gcWorks. If there are any, they
// should now observe throwOnGCWork and panic.
atomic.Xadd(&gcWorkPauseGen, 1)
}
// There was no global work, no local work, and no Ps
// communicated work since we took markDoneSema. Therefore
......@@ -1449,6 +1487,30 @@ top:
// below. The important thing is that the wb remains active until
// all marking is complete. This includes writes made by the GC.
if debugCachedWork {
// For debugging, double check that no work was added after we
// went around above and disable write barrier buffering.
for _, p := range allp {
gcw := &p.gcw
if !gcw.empty() {
printlock()
print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
if gcw.wbuf1 == nil {
print(" wbuf1=<nil>")
} else {
print(" wbuf1.n=", gcw.wbuf1.nobj)
}
if gcw.wbuf2 == nil {
print(" wbuf2=<nil>")
} else {
print(" wbuf2.n=", gcw.wbuf2.nobj)
}
print("\n")
throw("throwOnGCWork")
}
}
}
// Disable assists and background workers. We must do
// this before waking blocked assists.
atomic.Store(&gcBlackenEnabled, 0)
......
......@@ -93,6 +93,10 @@ type gcWork struct {
// termination check. Specifically, this indicates that this
// gcWork may have communicated work to another gcWork.
flushedWork bool
// pauseGen causes put operations to spin while pauseGen ==
// gcWorkPauseGen if debugCachedWork is true.
pauseGen uint32
}
// Most of the methods of gcWork are go:nowritebarrierrec because the
......@@ -111,13 +115,21 @@ func (w *gcWork) init() {
w.wbuf2 = wbuf2
}
func (w *gcWork) checkPut() {
if debugCachedWork {
for atomic.Load(&gcWorkPauseGen) == w.pauseGen {
}
if throwOnGCWork {
throw("throwOnGCWork")
}
}
}
// put enqueues a pointer for the garbage collector to trace.
// obj must point to the beginning of a heap object or an oblet.
//go:nowritebarrierrec
func (w *gcWork) put(obj uintptr) {
if throwOnGCWork {
throw("throwOnGCWork")
}
w.checkPut()
flushed := false
wbuf := w.wbuf1
......@@ -153,9 +165,7 @@ func (w *gcWork) put(obj uintptr) {
// otherwise it returns false and the caller needs to call put.
//go:nowritebarrierrec
func (w *gcWork) putFast(obj uintptr) bool {
if throwOnGCWork {
throw("throwOnGCWork")
}
w.checkPut()
wbuf := w.wbuf1
if wbuf == nil {
......@@ -178,9 +188,7 @@ func (w *gcWork) putBatch(obj []uintptr) {
return
}
if throwOnGCWork {
throw("throwOnGCWork")
}
w.checkPut()
flushed := false
wbuf := w.wbuf1
......@@ -303,16 +311,12 @@ func (w *gcWork) balance() {
return
}
if wbuf := w.wbuf2; wbuf.nobj != 0 {
if throwOnGCWork {
throw("throwOnGCWork")
}
w.checkPut()
putfull(wbuf)
w.flushedWork = true
w.wbuf2 = getempty()
} else if wbuf := w.wbuf1; wbuf.nobj > 4 {
if throwOnGCWork {
throw("throwOnGCWork")
}
w.checkPut()
w.wbuf1 = handoff(wbuf)
w.flushedWork = true // handoff did putfull
} else {
......
......@@ -79,7 +79,7 @@ const (
func (b *wbBuf) reset() {
start := uintptr(unsafe.Pointer(&b.buf[0]))
b.next = start
if writeBarrier.cgo {
if writeBarrier.cgo || (debugCachedWork && throwOnGCWork) {
// Effectively disable the buffer by forcing a flush
// on every barrier.
b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment