runtime: check more work flushing races

This adds several new checks to help debug #27993. It adds a mechanism for freezing write barriers and gcWork puts during the mark completion algorithm. This way, if we do detect mark completion, we can catch any puts that happened during the completion algorithm. Based on build dashboard failures, this seems to be the window of time when these are happening. This also double-checks that all work buffers are empty immediately upon entering mark termination (much earlier than the current check). This is unlikely to trigger based on the current failures, but is a good safety net. Change-Id: I03f56c48c4322069e28c50fbc3c15b2fee2130c2 Reviewed-on: https://go-review.googlesource.com/c/151797 Run-TryBot: Austin Clements <austin@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com>

runtime: check more work flushing races
This adds several new checks to help debug #27993. It adds a mechanism for freezing write barriers and gcWork puts during the mark completion algorithm. This way, if we do detect mark completion, we can catch any puts that happened during the completion algorithm. Based on build dashboard failures, this seems to be the window of time when these are happening. This also double-checks that all work buffers are empty immediately upon entering mark termination (much earlier than the current check). This is unlikely to trigger based on the current failures, but is a good safety net. Change-Id: I03f56c48c4322069e28c50fbc3c15b2fee2130c2 Reviewed-on: https://go-review.googlesource.com/c/151797 Run-TryBot: Austin Clements <austin@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com>
438b9544 · Austin Clements · fbdaa965 · 438b9544 · 438b9544 · 438b9544
Commit 438b9544 authored Nov 26, 2018 by Austin Clements
Hide whitespace changes
Inline Side-by-side

Showing with 83 additions and 17 deletions

mgc.go src/runtime/mgc.go +63 -1

mgcwork.go src/runtime/mgcwork.go +19 -15

mwbbuf.go src/runtime/mwbbuf.go +1 -1

No files found.
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -1363,6 +1363,19 @@ func gcStart(trigger gcTrigger) {
 // This is protected by markDoneSema.
 var gcMarkDoneFlushed uint32

+// debugCachedWork enables extra checks for debugging premature mark
+// termination.
+//
+// For debugging issue #27993.
+const debugCachedWork = true
+
+// gcWorkPauseGen is for debugging the mark completion algorithm.
+// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen.
+// Only used if debugCachedWork is true.
+//
+// For debugging issue #27993.
+var gcWorkPauseGen uint32 = 1
+
 // gcMarkDone transitions the GC from mark to mark termination if all
 // reachable objects have been marked (that is, there are no grey
 // objects and can be no more in the future). Otherwise, it flushes
@@ -1408,6 +1421,14 @@ top:
 			// Flush the write barrier buffer, since this may add
 			// work to the gcWork.
 			wbBufFlush1(_p_)
+			// For debugging, shrink the write barrier
+			// buffer so it flushes immediately.
+			// wbBuf.reset will keep it at this size as
+			// long as throwOnGCWork is set.
+			if debugCachedWork {
+				b := &_p_.wbBuf
+				b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
+			}
 			// Flush the gcWork, since this may create global work
 			// and set the flushedWork flag.
 			//
@@ -1418,11 +1439,23 @@ top:
 			if _p_.gcw.flushedWork {
 				atomic.Xadd(&gcMarkDoneFlushed, 1)
 				_p_.gcw.flushedWork = false
+			} else if debugCachedWork {
+				// For debugging, freeze the gcWork
+				// until we know whether we've reached
+				// completion or not. If we think
+				// we've reached completion, but
+				// there's a paused gcWork, then
+				// that's a bug.
+				_p_.gcw.pauseGen = gcWorkPauseGen
 			}
 		})
 	})

 	if gcMarkDoneFlushed != 0 {
+		if debugCachedWork {
+			// Release paused gcWorks.
+			atomic.Xadd(&gcWorkPauseGen, 1)
+		}
 		// More grey objects were discovered since the
 		// previous termination check, so there may be more
 		// work to do. Keep going. It's possible the
@@ -1431,7 +1464,12 @@ top:
 		goto top
 	}

-	throwOnGCWork = true
+	if debugCachedWork {
+		throwOnGCWork = true
+		// Release paused gcWorks. If there are any, they
+		// should now observe throwOnGCWork and panic.
+		atomic.Xadd(&gcWorkPauseGen, 1)
+	}

 	// There was no global work, no local work, and no Ps
 	// communicated work since we took markDoneSema. Therefore
@@ -1449,6 +1487,30 @@ top:
 	// below. The important thing is that the wb remains active until
 	// all marking is complete. This includes writes made by the GC.

+	if debugCachedWork {
+		// For debugging, double check that no work was added after we
+		// went around above and disable write barrier buffering.
+		for _, p := range allp {
+			gcw := &p.gcw
+			if !gcw.empty() {
+				printlock()
+				print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
+				if gcw.wbuf1 == nil {
+					print(" wbuf1=<nil>")
+				} else {
+					print(" wbuf1.n=", gcw.wbuf1.nobj)
+				}
+				if gcw.wbuf2 == nil {
+					print(" wbuf2=<nil>")
+				} else {
+					print(" wbuf2.n=", gcw.wbuf2.nobj)
+				}
+				print("\n")
+				throw("throwOnGCWork")
+			}
+		}
+	}
+
 	// Disable assists and background workers. We must do
 	// this before waking blocked assists.
 	atomic.Store(&gcBlackenEnabled, 0)

--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -93,6 +93,10 @@ type gcWork struct {
 	// termination check. Specifically, this indicates that this
 	// gcWork may have communicated work to another gcWork.
 	flushedWork bool
+
+	// pauseGen causes put operations to spin while pauseGen ==
+	// gcWorkPauseGen if debugCachedWork is true.
+	pauseGen uint32
 }

 // Most of the methods of gcWork are go:nowritebarrierrec because the
@@ -111,13 +115,21 @@ func (w *gcWork) init() {
 	w.wbuf2 = wbuf2
 }

+func (w *gcWork) checkPut() {
+	if debugCachedWork {
+		for atomic.Load(&gcWorkPauseGen) == w.pauseGen {
+		}
+		if throwOnGCWork {
+			throw("throwOnGCWork")
+		}
+	}
+}
+
 // put enqueues a pointer for the garbage collector to trace.
 // obj must point to the beginning of a heap object or an oblet.
 //go:nowritebarrierrec
 func (w *gcWork) put(obj uintptr) {
-	if throwOnGCWork {
-		throw("throwOnGCWork")
-	}
+	w.checkPut()

 	flushed := false
 	wbuf := w.wbuf1
@@ -153,9 +165,7 @@ func (w *gcWork) put(obj uintptr) {
 // otherwise it returns false and the caller needs to call put.
 //go:nowritebarrierrec
 func (w *gcWork) putFast(obj uintptr) bool {
-	if throwOnGCWork {
-		throw("throwOnGCWork")
-	}
+	w.checkPut()

 	wbuf := w.wbuf1
 	if wbuf == nil {
@@ -178,9 +188,7 @@ func (w *gcWork) putBatch(obj []uintptr) {
 		return
 	}

-	if throwOnGCWork {
-		throw("throwOnGCWork")
-	}
+	w.checkPut()

 	flushed := false
 	wbuf := w.wbuf1
@@ -303,16 +311,12 @@ func (w *gcWork) balance() {
 		return
 	}
 	if wbuf := w.wbuf2; wbuf.nobj != 0 {
-		if throwOnGCWork {
-			throw("throwOnGCWork")
-		}
+		w.checkPut()
 		putfull(wbuf)
 		w.flushedWork = true
 		w.wbuf2 = getempty()
 	} else if wbuf := w.wbuf1; wbuf.nobj > 4 {
-		if throwOnGCWork {
-			throw("throwOnGCWork")
-		}
+		w.checkPut()
 		w.wbuf1 = handoff(wbuf)
 		w.flushedWork = true // handoff did putfull
 	} else {

--- a/src/runtime/mwbbuf.go
+++ b/src/runtime/mwbbuf.go
@@ -79,7 +79,7 @@ const (
 func (b *wbBuf) reset() {
 	start := uintptr(unsafe.Pointer(&b.buf[0]))
 	b.next = start
-	if writeBarrier.cgo {
+	if writeBarrier.cgo || (debugCachedWork && throwOnGCWork) {
 		// Effectively disable the buffer by forcing a flush
 		// on every barrier.
 		b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))