Commit c4fe5031 authored by Rick Hudson

runtime: reduce thrashing of gs between ps

One important use case is a pipeline computation that passes values
from one Goroutine to the next and then exits or is placed in a
wait state. If GOMAXPROCS > 1, a Goroutine running on P1 will ready
another Goroutine and then immediately make P1 available to execute
it. We need to prevent other Ps from stealing the G that P1 is about
to execute. Otherwise the Gs can thrash between Ps, causing unneeded
synchronization and slowing down throughput.
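
As an illustration, here is a minimal sketch of that handoff pattern
(in the spirit of the GoroutineRing benchmark below, not its actual
source): at any moment exactly one goroutine is runnable, and each
send readies the next goroutine just before the sender blocks again.

	package main

	// ring passes a single value through a chain of goroutines connected
	// by unbuffered channels. Each stage wakes its successor and then
	// immediately blocks on its own input, so the scheduler sees a
	// steady stream of single-G handoffs.
	func ring(stages int) {
		first := make(chan int)
		in := first
		for i := 0; i < stages; i++ {
			out := make(chan int)
			go func(in, out chan int) {
				for v := range in {
					out <- v // readies the next goroutine, then blocks again
				}
				close(out)
			}(in, out)
			in = out
		}
		first <- 1 // only one goroutine is runnable at a time
		<-in       // the value has visited every stage
		close(first)
	}

	func main() { ring(10) }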

Fix this by changing the stealing logic so that when a P attempts to
steal the only G on some other P's run queue, it will pause
momentarily to allow the victim P to schedule the G.
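
The window being protected is visible in the runqgrab change below.
As a self-contained illustration of the back-off (the 100µs pause and
the compare-and-swap mirror the diff; the atomic.Int64 slot is a
stand-in for the runtime's guintptr runnext field, not runtime code):

	package main

	import (
		"sync/atomic"
		"time"
	)

	// trySteal models the hedged runnext steal: back off briefly so
	// the victim P can schedule the G itself, then attempt the swap.
	func trySteal(slot *atomic.Int64) (int64, bool) {
		v := slot.Load()
		if v == 0 {
			return 0, false // nothing to steal
		}
		time.Sleep(100 * time.Microsecond) // let the owner run it first
		if slot.CompareAndSwap(v, 0) {
			return v, true // owner never took it; the steal succeeds
		}
		return 0, false // owner (or another thief) got there first
	}

	func main() {
		var slot atomic.Int64
		slot.Store(42)
		if id, ok := trySteal(&slot); ok {
			println("stole g", id)
		}
	}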

As part of optimizing stealing we also use a per-P victim queue to
move stolen Gs. This eliminates the zeroing of a stack-local victim
queue, which turned out to be expensive.
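
The cost being avoided is the implicit clearing of a large
function-local array on every steal. Roughly, with illustrative
types rather than the runtime's own:

	package main

	type g struct{ id int }

	type p struct {
		// Reused across steals: zeroed once at allocation rather
		// than on every call.
		runqvictims [128]*g
	}

	// stealWithLocal stages victims in a stack-local buffer; the 1KB
	// array must be cleared on every call before it can be used.
	func stealWithLocal(src []*g) int {
		var batch [128]*g // implicitly zeroed on each call
		return copy(batch[:], src)
	}

	// stealWithPooled stages victims in the per-P buffer instead,
	// avoiding the per-call clearing.
	func stealWithPooled(pp *p, src []*g) int {
		return copy(pp.runqvictims[:], src)
	}

	func main() {
		pp := &p{}
		src := []*g{{id: 1}, {id: 2}}
		println(stealWithLocal(src), stealWithPooled(pp, src))
	}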

This CL is a necessary but not sufficient prerequisite to changing
the default value of GOMAXPROCS to something > 1, which is another
CL/discussion.

For highly serialized programs, such as GoroutineRing below, this
can make a large difference. For larger and more parallel programs,
such as the x/benchmarks, there is no noticeable detriment.

~/work/code/src/rsc.io/benchstat/benchstat old.txt new.txt
name                old mean              new mean              delta
GoroutineRing       30.2µs × (0.98,1.01)  30.1µs × (0.97,1.04)     ~    (p=0.941)
GoroutineRing-2      113µs × (0.91,1.07)    30µs × (0.98,1.03)  -73.17% (p=0.004)
GoroutineRing-4      144µs × (0.98,1.02)    32µs × (0.98,1.01)  -77.69% (p=0.000)
GoroutineRingBuf    32.7µs × (0.97,1.03)  32.5µs × (0.97,1.02)     ~    (p=0.795)
GoroutineRingBuf-2   120µs × (0.92,1.08)    33µs × (1.00,1.00)  -72.48% (p=0.004)
GoroutineRingBuf-4   138µs × (0.92,1.06)    33µs × (1.00,1.00)  -76.21% (p=0.003)

The bench benchmarks show little impact.
              old       new
garbage   7032879   7011696
httpold     25509     25301
splayold  1022073   1019499
jsonold  28230624  28081433

Change-Id: I228c48fed8d85c9bbef16a7edc53ab7898506f50
Reviewed-on: https://go-review.googlesource.com/9872
Reviewed-by: Austin Clements <austin@google.com>
parent 3b38626f
@@ -1420,7 +1420,7 @@ top:
 		xadd(&sched.nmspinning, 1)
 	}
 	// random steal from other P's
-	for i := 0; i < int(2*gomaxprocs); i++ {
+	for i := 0; i < int(4*gomaxprocs); i++ {
 		if sched.gcwaiting != 0 {
 			goto top
 		}
@@ -1429,15 +1429,17 @@ top:
 		if _p_ == _g_.m.p.ptr() {
 			gp, _ = runqget(_p_)
 		} else {
-			gp = runqsteal(_g_.m.p.ptr(), _p_)
+			stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g
+			gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG)
 		}
 		if gp != nil {
 			return gp, false
 		}
 	}
 
 stop:
-	// We have nothing to do. If we're in the GC mark phaseand can
+	// We have nothing to do. If we're in the GC mark phase and can
 	// safely scan and blacken objects, run idle-time marking
 	// rather than give up the P.
 	if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != nil {
@@ -3461,20 +3463,30 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
 // Grabs a batch of goroutines from local runnable queue.
 // batch array must be of size len(p->runq)/2. Returns number of grabbed goroutines.
 // Can be executed by any P.
-func runqgrab(_p_ *p, batch []*g) uint32 {
+func runqgrab(_p_ *p, batch []*g, stealRunNextG bool) uint32 {
 	for {
 		h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
 		t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer
 		n := t - h
 		n = n - n/2
 		if n == 0 {
-			// Try to steal from _p_.runnext.
-			if next := _p_.runnext; next != 0 {
-				if !_p_.runnext.cas(next, 0) {
-					continue
+			if stealRunNextG {
+				// Try to steal from _p_.runnext.
+				if next := _p_.runnext; next != 0 {
+					// Sleep to ensure that _p_ isn't about to run the g we
+					// are about to steal.
+					// The important use case here is when the g running on _p_
+					// ready()s another g and then almost immediately blocks.
+					// Instead of stealing runnext in this window, back off
+					// to give _p_ a chance to schedule runnext. This will avoid
+					// thrashing gs between different Ps.
+					usleep(100)
+					if !_p_.runnext.cas(next, 0) {
+						continue
+					}
+					batch[0] = next.ptr()
+					return 1
 				}
-				batch[0] = next.ptr()
-				return 1
 			}
 			return 0
 		}
@@ -3493,15 +3505,13 @@ func runqgrab(_p_ *p, batch []*g) uint32 {
 // Steal half of elements from local runnable queue of p2
 // and put onto local runnable queue of p.
 // Returns one of the stolen elements (or nil if failed).
-func runqsteal(_p_, p2 *p) *g {
-	var batch [len(_p_.runq) / 2]*g
-	n := runqgrab(p2, batch[:])
+func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
+	n := runqgrab(p2, _p_.runqvictims[:], stealRunNextG)
 	if n == 0 {
 		return nil
 	}
 	n--
-	gp := batch[n]
+	gp := _p_.runqvictims[n]
 	if n == 0 {
 		return gp
 	}
@@ -3511,7 +3521,7 @@ func runqsteal(_p_, p2 *p) *g {
 		throw("runqsteal: runq overflow")
 	}
 	for i := uint32(0); i < n; i++ {
-		_p_.runq[(t+i)%uint32(len(_p_.runq))] = batch[i]
+		_p_.runq[(t+i)%uint32(len(_p_.runq))] = _p_.runqvictims[i]
 	}
 	atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
 	return gp
@@ -3548,7 +3558,7 @@ func testSchedLocalQueueSteal() {
 			gs[j].sig = 0
 			runqput(p1, &gs[j], false)
 		}
-		gp := runqsteal(p2, p1)
+		gp := runqsteal(p2, p1, true)
 		s := 0
 		if gp != nil {
 			s++
@@ -353,9 +353,10 @@ type p struct {
 	goidcacheend uint64
 
 	// Queue of runnable goroutines. Accessed without lock.
-	runqhead uint32
-	runqtail uint32
-	runq     [256]*g
+	runqhead    uint32
+	runqtail    uint32
+	runq        [256]*g
+	runqvictims [128]*g // Used to stage victims from another p's runq
 	// runnext, if non-nil, is a runnable G that was ready'd by
 	// the current G and should be run next instead of what's in
 	// runq if there's time remaining in the running G's time