Commit 868c8b37 authored by Jamie Liu's avatar Jamie Liu Committed by Austin Clements

runtime: only sleep before stealing work from a running P

The sleep in question does not make sense if the stolen-from P cannot
run the stolen G. The usleep(3) has been observed delaying execution of
woken G's by ~60us; skipping it reduces the wakeup-to-execution latency
to ~7us in these cases, improving CPU utilization.

Benchmarks added by this change:

name                             old time/op  new time/op  delta
WakeupParallelSpinning/0s-12     14.4µs ± 1%  14.3µs ± 1%     ~     (p=0.227 n=19+20)
WakeupParallelSpinning/1µs-12    18.3µs ± 0%  18.3µs ± 1%     ~     (p=0.950 n=20+19)
WakeupParallelSpinning/2µs-12    22.3µs ± 1%  22.3µs ± 1%     ~     (p=0.670 n=20+18)
WakeupParallelSpinning/5µs-12    31.7µs ± 0%  31.7µs ± 0%     ~     (p=0.460 n=20+17)
WakeupParallelSpinning/10µs-12   51.8µs ± 0%  51.8µs ± 0%     ~     (p=0.883 n=20+20)
WakeupParallelSpinning/20µs-12   91.9µs ± 0%  91.9µs ± 0%     ~     (p=0.245 n=20+20)
WakeupParallelSpinning/50µs-12    214µs ± 0%   214µs ± 0%     ~     (p=0.509 n=19+20)
WakeupParallelSpinning/100µs-12   335µs ± 0%   335µs ± 0%   -0.05%  (p=0.006 n=17+15)
WakeupParallelSyscall/0s-12       228µs ± 2%   129µs ± 1%  -43.32%  (p=0.000 n=20+19)
WakeupParallelSyscall/1µs-12      232µs ± 1%   131µs ± 1%  -43.60%  (p=0.000 n=19+20)
WakeupParallelSyscall/2µs-12      236µs ± 1%   133µs ± 1%  -43.44%  (p=0.000 n=18+19)
WakeupParallelSyscall/5µs-12      248µs ± 2%   139µs ± 1%  -43.68%  (p=0.000 n=18+19)
WakeupParallelSyscall/10µs-12     263µs ± 3%   150µs ± 2%  -42.97%  (p=0.000 n=18+20)
WakeupParallelSyscall/20µs-12     281µs ± 2%   170µs ± 1%  -39.43%  (p=0.000 n=19+19)
WakeupParallelSyscall/50µs-12     345µs ± 4%   246µs ± 7%  -28.85%  (p=0.000 n=20+20)
WakeupParallelSyscall/100µs-12    460µs ± 5%   350µs ± 4%  -23.85%  (p=0.000 n=20+20)

Benchmarks associated with the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):

name        old time/op  new time/op  delta
Chain       19.4µs ± 2%  19.3µs ± 1%    ~     (p=0.101 n=19+20)
ChainBuf    19.5µs ± 2%  19.4µs ± 2%    ~     (p=0.840 n=19+19)
Chain-2     19.9µs ± 1%  19.9µs ± 2%    ~     (p=0.734 n=19+19)
ChainBuf-2  20.0µs ± 2%  20.0µs ± 2%    ~     (p=0.175 n=19+17)
Chain-4     20.3µs ± 1%  20.1µs ± 1%  -0.62%  (p=0.010 n=19+18)
ChainBuf-4  20.3µs ± 1%  20.2µs ± 1%  -0.52%  (p=0.023 n=19+19)
Powser       2.09s ± 1%   2.10s ± 3%    ~     (p=0.908 n=19+19)
Powser-2     2.21s ± 1%   2.20s ± 1%  -0.35%  (p=0.010 n=19+18)
Powser-4     2.31s ± 2%   2.31s ± 2%    ~     (p=0.578 n=18+19)
Sieve        13.6s ± 1%   13.6s ± 1%    ~     (p=0.909 n=17+18)
Sieve-2      8.02s ±52%   7.28s ±15%    ~     (p=0.336 n=20+16)
Sieve-4      4.00s ±35%   3.98s ±26%    ~     (p=0.654 n=20+18)

Change-Id: I58edd8ce01075859d871e2348fc0833e9c01f70f
Reviewed-on: https://go-review.googlesource.com/78538Reviewed-by: 's avatarAustin Clements <austin@google.com>
parent 2951f909
......@@ -4773,22 +4773,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
if stealRunNextG {
// Try to steal from _p_.runnext.
if next := _p_.runnext; next != 0 {
// Sleep to ensure that _p_ isn't about to run the g we
// are about to steal.
// The important use case here is when the g running on _p_
// ready()s another g and then almost immediately blocks.
// Instead of stealing runnext in this window, back off
// to give _p_ a chance to schedule runnext. This will avoid
// thrashing gs between different Ps.
// A sync chan send/recv takes ~50ns as of time of writing,
// so 3us gives ~50x overshoot.
if GOOS != "windows" {
usleep(3)
} else {
// On windows system timer granularity is 1-15ms,
// which is way too much for this optimization.
// So just yield.
osyield()
if _p_.status == _Prunning {
// Sleep to ensure that _p_ isn't about to run the g
// we are about to steal.
// The important use case here is when the g running
// on _p_ ready()s another g and then almost
// immediately blocks. Instead of stealing runnext
// in this window, back off to give _p_ a chance to
// schedule runnext. This will avoid thrashing gs
// between different Ps.
// A sync chan send/recv takes ~50ns as of time of
// writing, so 3us gives ~50x overshoot.
if GOOS != "windows" {
usleep(3)
} else {
// On windows system timer granularity is
// 1-15ms, which is way too much for this
// optimization. So just yield.
osyield()
}
}
if !_p_.runnext.cas(next, 0) {
continue
......
......@@ -655,6 +655,114 @@ func BenchmarkClosureCall(b *testing.B) {
_ = sum
}
func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) {
if runtime.GOMAXPROCS(0) == 1 {
b.Skip("skipping: GOMAXPROCS=1")
}
wakeDelay := 5 * time.Microsecond
for _, delay := range []time.Duration{
0,
1 * time.Microsecond,
2 * time.Microsecond,
5 * time.Microsecond,
10 * time.Microsecond,
20 * time.Microsecond,
50 * time.Microsecond,
100 * time.Microsecond,
} {
b.Run(delay.String(), func(b *testing.B) {
if b.N == 0 {
return
}
// Start two goroutines, which alternate between being
// sender and receiver in the following protocol:
//
// - The receiver spins for `delay` and then does a
// blocking receive on a channel.
//
// - The sender spins for `delay+wakeDelay` and then
// sends to the same channel. (The addition of
// `wakeDelay` improves the probability that the
// receiver will be blocking when the send occurs when
// the goroutines execute in parallel.)
//
// In each iteration of the benchmark, each goroutine
// acts once as sender and once as receiver, so each
// goroutine spins for delay twice.
//
// BenchmarkWakeupParallel is used to estimate how
// efficiently the scheduler parallelizes goroutines in
// the presence of blocking:
//
// - If both goroutines are executed on the same core,
// an increase in delay by N will increase the time per
// iteration by 4*N, because all 4 delays are
// serialized.
//
// - Otherwise, an increase in delay by N will increase
// the time per iteration by 2*N, and the time per
// iteration is 2 * (runtime overhead + chan
// send/receive pair + delay + wakeDelay). This allows
// the runtime overhead, including the time it takes
// for the unblocked goroutine to be scheduled, to be
// estimated.
ping, pong := make(chan struct{}), make(chan struct{})
start := make(chan struct{})
done := make(chan struct{})
go func() {
<-start
for i := 0; i < b.N; i++ {
// sender
spin(delay + wakeDelay)
ping <- struct{}{}
// receiver
spin(delay)
<-pong
}
done <- struct{}{}
}()
go func() {
for i := 0; i < b.N; i++ {
// receiver
spin(delay)
<-ping
// sender
spin(delay + wakeDelay)
pong <- struct{}{}
}
done <- struct{}{}
}()
b.ResetTimer()
start <- struct{}{}
<-done
<-done
})
}
}
func BenchmarkWakeupParallelSpinning(b *testing.B) {
benchmarkWakeupParallel(b, func(d time.Duration) {
end := time.Now().Add(d)
for time.Now().Before(end) {
// do nothing
}
})
}
func BenchmarkWakeupParallelSyscall(b *testing.B) {
benchmarkWakeupParallel(b, func(d time.Duration) {
// Invoke a blocking syscall directly; calling time.Sleep()
// would deschedule the goroutine instead.
ts := syscall.NsecToTimespec(d.Nanoseconds())
for {
if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR {
return
}
}
})
}
type Matrix [][]float64
func BenchmarkMatmult(b *testing.B) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment