Commit ea0386f8 authored by Dmitry Vyukov

runtime: improve randomized stealing logic

During random stealing we steal 4*GOMAXPROCS times from random procs.
One would expect that most of the time we check all procs this way,
but due to the low quality of the PRNG we actually miss procs with frightening
probability. Below are modelling experiment results for 1e6 tries:

GOMAXPROCS = 2 : missed 1 procs 7944 times

GOMAXPROCS = 3 : missed 1 procs 101620 times
GOMAXPROCS = 3 : missed 2 procs 3571 times

GOMAXPROCS = 4 : missed 1 procs 63916 times
GOMAXPROCS = 4 : missed 2 procs 61 times
GOMAXPROCS = 4 : missed 3 procs 16 times

GOMAXPROCS = 5 : missed 1 procs 133136 times
GOMAXPROCS = 5 : missed 2 procs 1025 times
GOMAXPROCS = 5 : missed 3 procs 101 times
GOMAXPROCS = 5 : missed 4 procs 15 times

GOMAXPROCS = 8 : missed 1 procs 151765 times
GOMAXPROCS = 8 : missed 2 procs 5057 times
GOMAXPROCS = 8 : missed 3 procs 1726 times
GOMAXPROCS = 8 : missed 4 procs 68 times

GOMAXPROCS = 12 : missed 1 procs 199081 times
GOMAXPROCS = 12 : missed 2 procs 27489 times
GOMAXPROCS = 12 : missed 3 procs 3113 times
GOMAXPROCS = 12 : missed 4 procs 233 times
GOMAXPROCS = 12 : missed 5 procs 9 times

GOMAXPROCS = 16 : missed 1 procs 237477 times
GOMAXPROCS = 16 : missed 2 procs 30037 times
GOMAXPROCS = 16 : missed 3 procs 9466 times
GOMAXPROCS = 16 : missed 4 procs 1334 times
GOMAXPROCS = 16 : missed 5 procs 192 times
GOMAXPROCS = 16 : missed 6 procs 5 times
GOMAXPROCS = 16 : missed 7 procs 1 times
GOMAXPROCS = 16 : missed 8 procs 1 times
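
The modelling harness itself is not part of this change. As a rough illustration only, the sketch below re-creates the experiment under the assumption that the old scheme behaves like picking a victim proc uniformly at random 4*GOMAXPROCS times per attempt; it uses math/rand rather than the runtime's fastrand1, so the counts will not reproduce the table exactly, but it shows that whole procs do get skipped.

// Hypothetical re-creation of the modelling experiment (not the original harness).
// For each of 1e6 tries, pick a proc uniformly at random 4*procs times and count
// how many procs were never visited.
package main

import (
	"fmt"
	"math/rand"
)

func main() {
	const tries = 1000000
	for _, procs := range []int{2, 3, 4, 5, 8, 12, 16} {
		// missCount[m] = number of tries that left exactly m procs unvisited.
		missCount := make([]int, procs)
		for t := 0; t < tries; t++ {
			seen := make([]bool, procs)
			for i := 0; i < 4*procs; i++ {
				seen[rand.Intn(procs)] = true
			}
			missed := 0
			for _, s := range seen {
				if !s {
					missed++
				}
			}
			missCount[missed]++
		}
		for m := 1; m < procs; m++ {
			if missCount[m] > 0 {
				fmt.Printf("GOMAXPROCS = %d : missed %d procs %d times\n", procs, m, missCount[m])
			}
		}
	}
}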

A missed proc won't lead to underutilization because we check all procs
again after dropping P. But it can lead to an unpleasant situation
when we miss a proc, drop P, check all procs, discover work, acquire P,
miss the proc again, repeat.

Improve stealing logic to cover all procs.
Also don't enter spinning mode and try to steal when there is nobody around.

Change-Id: Ibb6b122cc7fb836991bad7d0639b77c807aab4c2
Reviewed-on: https://go-review.googlesource.com/20836
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Dmitry Vyukov <dvyukov@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Marvin Stenger <marvin.stenger94@gmail.com>
parent 44189299
@@ -1787,11 +1787,12 @@ func findrunnable() (gp *g, inheritTime bool) {
 	// an M.
 top:
+	_p_ := _g_.m.p.ptr()
 	if sched.gcwaiting != 0 {
 		gcstopm()
 		goto top
 	}
-	if _g_.m.p.ptr().runSafePointFn != 0 {
+	if _p_.runSafePointFn != 0 {
 		runSafePointFn()
 	}
 	if fingwait && fingwake {
@@ -1801,14 +1802,14 @@ top:
 	}
 	// local runq
-	if gp, inheritTime := runqget(_g_.m.p.ptr()); gp != nil {
+	if gp, inheritTime := runqget(_p_); gp != nil {
 		return gp, inheritTime
 	}
 	// global runq
 	if sched.runqsize != 0 {
 		lock(&sched.lock)
-		gp := globrunqget(_g_.m.p.ptr(), 0)
+		gp := globrunqget(_p_, 0)
 		unlock(&sched.lock)
 		if gp != nil {
 			return gp, false
@@ -1833,31 +1834,33 @@ top:
 		}
 	}
 	// Steal work from other P's.
+	procs := uint32(gomaxprocs)
+	if atomic.Load(&sched.npidle) == procs-1 {
+		// Either GOMAXPROCS=1 or everybody, except for us, is idle already.
+		// New work can appear from returning syscall/cgocall, network or timers.
+		// Neither of that submits to local run queues, so no point in stealing.
+		goto stop
+	}
 	// If number of spinning M's >= number of busy P's, block.
 	// This is necessary to prevent excessive CPU consumption
 	// when GOMAXPROCS>>1 but the program parallelism is low.
-	if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= uint32(gomaxprocs)-atomic.Load(&sched.npidle) { // TODO: fast atomic
+	if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= procs-atomic.Load(&sched.npidle) { // TODO: fast atomic
 		goto stop
 	}
 	if !_g_.m.spinning {
 		_g_.m.spinning = true
 		atomic.Xadd(&sched.nmspinning, 1)
 	}
 	// random steal from other P's
-	for i := 0; i < int(4*gomaxprocs); i++ {
-		if sched.gcwaiting != 0 {
-			goto top
-		}
-		_p_ := allp[fastrand1()%uint32(gomaxprocs)]
-		var gp *g
-		if _p_ == _g_.m.p.ptr() {
-			gp, _ = runqget(_p_)
-		} else {
-			stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g
-			gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG)
-		}
-		if gp != nil {
-			return gp, false
+	for i := 0; i < 4; i++ {
+		for enum := stealOrder.start(fastrand1()); !enum.done(); enum.next() {
+			if sched.gcwaiting != 0 {
+				goto top
+			}
+			stealRunNextG := i > 2 // first look for ready queues with more than 1 g
+			if gp := runqsteal(_p_, allp[enum.position()], stealRunNextG); gp != nil {
+				return gp, false
+			}
 		}
 	}
@@ -1866,7 +1869,7 @@ stop:
 	// We have nothing to do. If we're in the GC mark phase, can
 	// safely scan and blacken objects, and have work to do, run
 	// idle-time marking rather than give up the P.
-	if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) {
+	if gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) {
 		_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
 		gp := _p_.gcBgMarkWorker.ptr()
 		casgstatus(gp, _Gwaiting, _Grunnable)
@@ -1878,16 +1881,18 @@ stop:
 	// return P and block
 	lock(&sched.lock)
-	if sched.gcwaiting != 0 || _g_.m.p.ptr().runSafePointFn != 0 {
+	if sched.gcwaiting != 0 || _p_.runSafePointFn != 0 {
 		unlock(&sched.lock)
 		goto top
 	}
 	if sched.runqsize != 0 {
-		gp := globrunqget(_g_.m.p.ptr(), 0)
+		gp := globrunqget(_p_, 0)
 		unlock(&sched.lock)
 		return gp, false
 	}
-	_p_ := releasep()
+	if releasep() != _p_ {
+		throw("findrunnable: wrong p")
+	}
 	pidleput(_p_)
 	unlock(&sched.lock)
@@ -3265,6 +3270,7 @@ func procresize(nprocs int32) *p {
 			runnablePs = p
 		}
 	}
+	stealOrder.reset(uint32(nprocs))
 	var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
 	atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
 	return runnablePs
@@ -4121,3 +4127,59 @@ func sync_runtime_canSpin(i int) bool {
 func sync_runtime_doSpin() {
 	procyield(active_spin_cnt)
 }
+
+var stealOrder randomOrder
+
+// randomOrder/randomEnum are helper types for randomized work stealing.
+// They allow to enumerate all Ps in different pseudo-random orders without repetitions.
+// The algorithm is based on the fact that if we have X such that X and GOMAXPROCS
+// are coprime, then a sequences of (i + X) % GOMAXPROCS gives the required enumeration.
+type randomOrder struct {
+	count    uint32
+	coprimes []uint32
+}
+
+type randomEnum struct {
+	i     uint32
+	count uint32
+	pos   uint32
+	inc   uint32
+}
+
+func (ord *randomOrder) reset(count uint32) {
+	ord.count = count
+	ord.coprimes = ord.coprimes[:0]
+	for i := uint32(1); i <= count; i++ {
+		if gcd(i, count) == 1 {
+			ord.coprimes = append(ord.coprimes, i)
+		}
+	}
+}
+
+func (ord *randomOrder) start(i uint32) randomEnum {
+	return randomEnum{
+		count: ord.count,
+		pos:   i % ord.count,
+		inc:   ord.coprimes[i%uint32(len(ord.coprimes))],
+	}
+}
+
+func (enum *randomEnum) done() bool {
+	return enum.i == enum.count
+}
+
+func (enum *randomEnum) next() {
+	enum.i++
+	enum.pos = (enum.pos + enum.inc) % enum.count
+}
+
+func (enum *randomEnum) position() uint32 {
+	return enum.pos
+}
+
+func gcd(a, b uint32) uint32 {
+	for b != 0 {
+		a, b = b, a%b
+	}
+	return a
+}
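
For intuition about the helper above, the standalone sketch below (not part of the commit) replays the same coprime-stride arithmetic outside the runtime: with a count of 8 the coprimes are {1, 3, 5, 7}, and starting from pos = seed % 8 with one of those strides visits every index exactly once before wrapping around.

// Illustration of the coprime-stride enumeration used by randomOrder/randomEnum.
// This is a hypothetical standalone mirror of the logic, not runtime code.
package main

import "fmt"

func gcd(a, b uint32) uint32 {
	for b != 0 {
		a, b = b, a%b
	}
	return a
}

func main() {
	const count = 8
	var coprimes []uint32
	for i := uint32(1); i <= count; i++ {
		if gcd(i, count) == 1 {
			coprimes = append(coprimes, i)
		}
	}
	seed := uint32(2) // stands in for the fastrand1() value passed to start()
	pos := seed % count
	inc := coprimes[seed%uint32(len(coprimes))]
	var order []uint32
	for i := uint32(0); i < count; i++ {
		order = append(order, pos)
		pos = (pos + inc) % count
	}
	fmt.Println("coprimes:", coprimes) // [1 3 5 7]
	fmt.Println("order:   ", order)    // [2 7 4 1 6 3 0 5]: each P visited exactly once
}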
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Proc unit tests. In runtime package so can use runtime guts.

package runtime

func RunStealOrderTest() {
	var ord randomOrder
	for procs := 1; procs <= 64; procs++ {
		ord.reset(uint32(procs))
		if procs >= 3 && len(ord.coprimes) < 2 {
			panic("too few coprimes")
		}
		for co := 0; co < len(ord.coprimes); co++ {
			enum := ord.start(uint32(co))
			checked := make([]bool, procs)
			for p := 0; p < procs; p++ {
				x := enum.position()
				if checked[x] {
					println("procs:", procs, "inc:", enum.inc)
					panic("duplicate during enumeration")
				}
				checked[x] = true
				enum.next()
			}
			if !enum.done() {
				panic("not done")
			}
		}
	}
}
@@ -689,3 +689,7 @@ func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, thres
 		done <- struct{}{}
 	}
 }
+
+func TestStealOrder(t *testing.T) {
+	runtime.RunStealOrderTest()
+}