Commit c4fe5031 authored by Rick Hudson

runtime: reduce thrashing of gs between ps

One important use case is a pipeline computation that passes values
from one Goroutine to the next and then exits or is placed in a
wait state. If GOMAXPROCS > 1, a Goroutine running on P1 will ready
another Goroutine and then immediately make P1 available to execute
it. We need to prevent other Ps from stealing the G that P1 is about
to execute. Otherwise the Gs can thrash between Ps, causing unneeded
synchronization and slowing down throughput.
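
As an illustration, here is a minimal sketch of that handoff pattern
(in the spirit of the GoroutineRing benchmark below, not its actual
source): at any moment exactly one goroutine is runnable, and each
send readies the next goroutine just before the sender blocks again.

	package main

	// ring passes a single value through a chain of goroutines connected
	// by unbuffered channels. Each stage wakes its successor and then
	// immediately blocks on its own input, so the scheduler sees a
	// steady stream of single-G handoffs.
	func ring(stages int) {
		first := make(chan int)
		in := first
		for i := 0; i < stages; i++ {
			out := make(chan int)
			go func(in, out chan int) {
				for v := range in {
					out <- v // readies the next goroutine, then blocks again
				}
				close(out)
			}(in, out)
			in = out
		}
		first <- 1 // only one goroutine is runnable at a time
		<-in       // the value has visited every stage
		close(first)
	}

	func main() { ring(10) }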

Fix this by changing the stealing logic so that when a P attempts to
steal the only G on some other P's run queue, it will pause
momentarily to allow the victim P to schedule the G.
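
The window being protected is visible in the runqgrab change below.
As a self-contained illustration of the back-off (the 100µs pause and
the compare-and-swap mirror the diff; the atomic.Int64 slot is a
stand-in for the runtime's guintptr runnext field, not runtime code):

	package main

	import (
		"sync/atomic"
		"time"
	)

	// trySteal models the hedged runnext steal: back off briefly so
	// the victim P can schedule the G itself, then attempt the swap.
	func trySteal(slot *atomic.Int64) (int64, bool) {
		v := slot.Load()
		if v == 0 {
			return 0, false // nothing to steal
		}
		time.Sleep(100 * time.Microsecond) // let the owner run it first
		if slot.CompareAndSwap(v, 0) {
			return v, true // owner never took it; the steal succeeds
		}
		return 0, false // owner (or another thief) got there first
	}

	func main() {
		var slot atomic.Int64
		slot.Store(42)
		if id, ok := trySteal(&slot); ok {
			println("stole g", id)
		}
	}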

As part of optimizing stealing we also use a per-P victim queue to
move stolen Gs. This eliminates the zeroing of a stack-local victim
queue, which turned out to be expensive.
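
The cost being avoided is the implicit clearing of a large
function-local array on every steal. Roughly, with illustrative
types rather than the runtime's own:

	package main

	type g struct{ id int }

	type p struct {
		// Reused across steals: zeroed once at allocation rather
		// than on every call.
		runqvictims [128]*g
	}

	// stealWithLocal stages victims in a stack-local buffer; the 1KB
	// array must be cleared on every call before it can be used.
	func stealWithLocal(src []*g) int {
		var batch [128]*g // implicitly zeroed on each call
		return copy(batch[:], src)
	}

	// stealWithPooled stages victims in the per-P buffer instead,
	// avoiding the per-call clearing.
	func stealWithPooled(pp *p, src []*g) int {
		return copy(pp.runqvictims[:], src)
	}

	func main() {
		pp := &p{}
		src := []*g{{id: 1}, {id: 2}}
		println(stealWithLocal(src), stealWithPooled(pp, src))
	}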

This CL is a necessary but not sufficient prerequisite to changing
the default value of GOMAXPROCS to something > 1, which is another
CL/discussion.

For highly serialized programs, such as GoroutineRing below, this
can make a large difference. For larger and more parallel programs,
such as the x/benchmarks, there is no noticeable detriment.

~/work/code/src/rsc.io/benchstat/benchstat old.txt new.txt
name                old mean              new mean              delta
GoroutineRing       30.2µs × (0.98,1.01)  30.1µs × (0.97,1.04)     ~    (p=0.941)
GoroutineRing-2      113µs × (0.91,1.07)    30µs × (0.98,1.03)  -73.17% (p=0.004)
GoroutineRing-4      144µs × (0.98,1.02)    32µs × (0.98,1.01)  -77.69% (p=0.000)
GoroutineRingBuf    32.7µs × (0.97,1.03)  32.5µs × (0.97,1.02)     ~    (p=0.795)
GoroutineRingBuf-2   120µs × (0.92,1.08)    33µs × (1.00,1.00)  -72.48% (p=0.004)
GoroutineRingBuf-4   138µs × (0.92,1.06)    33µs × (1.00,1.00)  -76.21% (p=0.003)

The bench benchmarks show little impact.
              old       new
garbage   7032879   7011696
httpold     25509     25301
splayold  1022073   1019499
jsonold  28230624  28081433

Change-Id: I228c48fed8d85c9bbef16a7edc53ab7898506f50
Reviewed-on: https://go-review.googlesource.com/9872
Reviewed-by: Austin Clements <austin@google.com>
parent 3b38626f
@@ -1420,7 +1420,7 @@ top:
 		xadd(&sched.nmspinning, 1)
 	}
 	// random steal from other P's
-	for i := 0; i < int(2*gomaxprocs); i++ {
+	for i := 0; i < int(4*gomaxprocs); i++ {
 		if sched.gcwaiting != 0 {
 			goto top
 		}
@@ -1429,15 +1429,17 @@ top:
 		if _p_ == _g_.m.p.ptr() {
 			gp, _ = runqget(_p_)
 		} else {
-			gp = runqsteal(_g_.m.p.ptr(), _p_)
+			stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g
+			gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG)
 		}
 		if gp != nil {
 			return gp, false
 		}
 	}
 
 stop:
-	// We have nothing to do. If we're in the GC mark phaseand can
+	// We have nothing to do. If we're in the GC mark phase and can
 	// safely scan and blacken objects, run idle-time marking
 	// rather than give up the P.
 	if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != nil {
@@ -3461,20 +3463,30 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
 // Grabs a batch of goroutines from local runnable queue.
 // batch array must be of size len(p->runq)/2. Returns number of grabbed goroutines.
 // Can be executed by any P.
-func runqgrab(_p_ *p, batch []*g) uint32 {
+func runqgrab(_p_ *p, batch []*g, stealRunNextG bool) uint32 {
 	for {
 		h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
 		t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer
 		n := t - h
 		n = n - n/2
 		if n == 0 {
-			// Try to steal from _p_.runnext.
-			if next := _p_.runnext; next != 0 {
-				if !_p_.runnext.cas(next, 0) {
-					continue
+			if stealRunNextG {
+				// Try to steal from _p_.runnext.
+				if next := _p_.runnext; next != 0 {
+					// Sleep to ensure that _p_ isn't about to run the g we
+					// are about to steal.
+					// The important use case here is when the g running on _p_
+					// ready()s another g and then almost immediately blocks.
+					// Instead of stealing runnext in this window, back off
+					// to give _p_ a chance to schedule runnext. This will avoid
+					// thrashing gs between different Ps.
+					usleep(100)
+					if !_p_.runnext.cas(next, 0) {
+						continue
+					}
+					batch[0] = next.ptr()
+					return 1
 				}
-				batch[0] = next.ptr()
-				return 1
 			}
 			return 0
 		}
@@ -3493,15 +3505,13 @@ func runqgrab(_p_ *p, batch []*g) uint32 {
 // Steal half of elements from local runnable queue of p2
 // and put onto local runnable queue of p.
 // Returns one of the stolen elements (or nil if failed).
-func runqsteal(_p_, p2 *p) *g {
-	var batch [len(_p_.runq) / 2]*g
-	n := runqgrab(p2, batch[:])
+func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
+	n := runqgrab(p2, _p_.runqvictims[:], stealRunNextG)
 	if n == 0 {
 		return nil
 	}
 	n--
-	gp := batch[n]
+	gp := _p_.runqvictims[n]
 	if n == 0 {
 		return gp
 	}
@@ -3511,7 +3521,7 @@ func runqsteal(_p_, p2 *p) *g {
 		throw("runqsteal: runq overflow")
 	}
 	for i := uint32(0); i < n; i++ {
-		_p_.runq[(t+i)%uint32(len(_p_.runq))] = batch[i]
+		_p_.runq[(t+i)%uint32(len(_p_.runq))] = _p_.runqvictims[i]
 	}
 	atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
 	return gp
@@ -3548,7 +3558,7 @@ func testSchedLocalQueueSteal() {
 			gs[j].sig = 0
 			runqput(p1, &gs[j], false)
 		}
-		gp := runqsteal(p2, p1)
+		gp := runqsteal(p2, p1, true)
 		s := 0
 		if gp != nil {
 			s++
@@ -353,9 +353,10 @@ type p struct {
 	goidcacheend uint64
 
 	// Queue of runnable goroutines. Accessed without lock.
-	runqhead uint32
-	runqtail uint32
-	runq     [256]*g
+	runqhead    uint32
+	runqtail    uint32
+	runq        [256]*g
+	runqvictims [128]*g // Used to stage victims from another p's runq
 	// runnext, if non-nil, is a runnable G that was ready'd by
 	// the current G and should be run next instead of what's in
 	// runq if there's time remaining in the running G's time