Commit 32fef990 authored by Dmitriy Vyukov, committed by Russ Cox

runtime: fix CPU underutilization

runtime.newproc/ready are deliberately sloppy about waking new M's:
they only ensure that there is at least one spinning M.
Currently, to compensate for that, schedule() checks whether the current P
has local work while there are no spinning M's, and if so wakes up another one.
This does not work if goroutines do not call schedule().
With this change, a spinning M wakes up another M when it finds work to do.
This is still not ideal, but it fixes the underutilization.
A proper check would require knowing the exact number of runnable G's,
but that is too expensive to maintain.
Fixes #5586.
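For illustration, here is a minimal Go sketch of the wakeup decision described above, under assumed names (resetSpinning, wakep, and the two counters); it is a simplified model of the policy, not the runtime's code, which is the C diff below.

package main

import (
        "fmt"
        "sync/atomic"
)

// Simplified model of the two scheduler counters involved in the wakeup
// decision. The names mirror runtime·sched.nmspinning and npidle, but this
// is only an illustration of the policy, not the runtime's data structures.
var (
        nmspinning int32 // M's currently spinning in search of work
        npidle     int32 // P's currently idle
)

// resetSpinning sketches what the new resetspinning() does: the M that just
// found work stops spinning, and if it was the last spinning M while idle
// P's remain, it wakes up another M so that queued work is not stranded.
func resetSpinning(wasSpinning bool, wakep func()) {
        var n int32
        if wasSpinning {
                n = atomic.AddInt32(&nmspinning, -1)
        } else {
                n = atomic.LoadInt32(&nmspinning)
        }
        if n == 0 && atomic.LoadInt32(&npidle) > 0 {
                wakep()
        }
}

func main() {
        atomic.StoreInt32(&nmspinning, 1) // one M is spinning
        atomic.StoreInt32(&npidle, 1)     // one P is idle

        // The spinning M finds a G to run; with the fix it first makes sure
        // somebody else will pick up any remaining work.
        resetSpinning(true, func() { fmt.Println("wakep: starting another M") })
}

With both counters at 1, the call wakes another M; if nmspinning were left nonzero, as in the bug described next, it never would.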
This is a reincarnation of cl/9776044 with the bug fixed.
The bug was due to code added after cl/9776044 was created:
if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime·sched.runqsize > 0) {
        runtime·lock(&runtime·sched);
        gp = globrunqget(m->p, 1);
        runtime·unlock(&runtime·sched);
}
If M gets gp from global runq here, it does not reset m->spinning.
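In other words, nmspinning never drops back to zero on that path, so the "no spinning M's and an idle P exists" condition can never fire and the idle P never gets a worker. A toy, self-contained Go simulation of that failure mode (illustrative only, not runtime code):

package main

import "fmt"

// Toy simulation of the pre-fix bug: an M starts spinning, finds a G on the
// global run queue via the fairness check quoted above, but never clears its
// spinning state. The wakeup condition "no spinning M's and an idle P exists"
// then never becomes true, so the idle P stays idle.
func main() {
        nmspinning := 0
        npidle := 1 // one P is idle even though more work is queued

        nmspinning++ // an M starts spinning to look for work

        foundOnGlobalRunq := true
        if foundOnGlobalRunq {
                // BUG (pre-fix): m->spinning is not reset on this path,
                // so nmspinning is never decremented back to zero.
        }

        canWakeAnotherM := nmspinning == 0 && npidle > 0
        fmt.Println("nmspinning:", nmspinning, "npidle:", npidle, "wakep fires:", canWakeAnotherM)
}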

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/10743044
parent 735cf529
@@ -1133,6 +1133,25 @@ stop:
         goto top;
 }
 
+static void
+resetspinning(void)
+{
+        int32 nmspinning;
+
+        if(m->spinning) {
+                m->spinning = false;
+                nmspinning = runtime·xadd(&runtime·sched.nmspinning, -1);
+                if(nmspinning < 0)
+                        runtime·throw("findrunnable: negative nmspinning");
+        } else
+                nmspinning = runtime·atomicload(&runtime·sched.nmspinning);
+
+        // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
+        // so see if we need to wakeup another P here.
+        if (nmspinning == 0 && runtime·atomicload(&runtime·sched.npidle) > 0)
+                wakep();
+}
+
 // Injects the list of runnable G's into the scheduler.
 // Can run concurrently with GC.
 static void
@@ -1184,28 +1203,22 @@ top:
                 runtime·lock(&runtime·sched);
                 gp = globrunqget(m->p, 1);
                 runtime·unlock(&runtime·sched);
+                if(gp)
+                        resetspinning();
         }
         if(gp == nil) {
                 gp = runqget(m->p);
                 if(gp && m->spinning)
                         runtime·throw("schedule: spinning with local work");
         }
-        if(gp == nil)
-                gp = findrunnable();
-        if(m->spinning) {
-                m->spinning = false;
-                runtime·xadd(&runtime·sched.nmspinning, -1);
+        if(gp == nil) {
+                gp = findrunnable(); // blocks until work is available
+                resetspinning();
         }
-        // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
-        // so see if we need to wakeup another M here.
-        if (m->p->runqhead != m->p->runqtail &&
-                runtime·atomicload(&runtime·sched.nmspinning) == 0 &&
-                runtime·atomicload(&runtime·sched.npidle) > 0) // TODO: fast atomic
-                wakep();
 
         if(gp->lockedm) {
                 // Hands off own p to the locked m,
                 // then blocks waiting for a new p.
                 startlockedm(gp);
                 goto top;
         }
@@ -93,6 +93,30 @@ func TestYieldLocked(t *testing.T) {
         <-c
 }
 
+func TestGoroutineParallelism(t *testing.T) {
+        const P = 4
+        defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P))
+        for try := 0; try < 10; try++ {
+                done := make(chan bool)
+                x := uint32(0)
+                for p := 0; p < P; p++ {
+                        // Test that all P goroutines are scheduled at the same time
+                        go func(p int) {
+                                for i := 0; i < 3; i++ {
+                                        expected := uint32(P*i + p)
+                                        for atomic.LoadUint32(&x) != expected {
+                                        }
+                                        atomic.StoreUint32(&x, expected+1)
+                                }
+                                done <- true
+                        }(p)
+                }
+                for p := 0; p < P; p++ {
+                        <-done
+                }
+        }
+}
+
 func TestBlockLocked(t *testing.T) {
         const N = 10
         c := make(chan bool)