Commit 990124da authored by Russ Cox's avatar Russ Cox

runtime: use balanced tree for addr lookup in semaphore implementation

CL 36792 fixed #17953, a linear scan caused by n goroutines piling into
two different locks that hashed to the same bucket in the semaphore table.
In that CL, n goroutines contending for 2 unfortunately chosen locks
went from O(n²) to O(n).

This CL fixes a different linear scan, when n goroutines are contending for
n/2 different locks that all hash to the same bucket in the semaphore table.
In this CL, n goroutines contending for n/2 unfortunately chosen locks
goes from O(n²) to O(n log n). This case is much less likely, but any linear
scan eventually hurts, so we might as well fix it while the problem is fresh
in our minds.

The new test in this CL checks for both linear scans.

The effect of this CL on the sync benchmarks is negligible
(but it fixes the new test).

name                      old time/op    new time/op    delta
Cond1-48                     576ns ±10%     575ns ±13%     ~     (p=0.679 n=71+71)
Cond2-48                    1.59µs ± 8%    1.61µs ± 9%     ~     (p=0.107 n=73+69)
Cond4-48                    4.56µs ± 7%    4.55µs ± 7%     ~     (p=0.670 n=74+72)
Cond8-48                    9.87µs ± 9%    9.90µs ± 7%     ~     (p=0.507 n=69+73)
Cond16-48                   20.4µs ± 7%    20.4µs ±10%     ~     (p=0.588 n=69+71)
Cond32-48                   45.4µs ±10%    45.4µs ±14%     ~     (p=0.944 n=73+73)
UncontendedSemaphore-48     19.7ns ±12%    19.7ns ± 8%     ~     (p=0.589 n=65+63)
ContendedSemaphore-48       55.4ns ±26%    54.9ns ±32%     ~     (p=0.441 n=75+75)
MutexUncontended-48         0.63ns ± 0%    0.63ns ± 0%     ~     (all equal)
Mutex-48                     210ns ± 6%     213ns ±10%   +1.30%  (p=0.035 n=70+74)
MutexSlack-48                210ns ± 7%     211ns ± 9%     ~     (p=0.184 n=71+72)
MutexWork-48                 299ns ± 5%     300ns ± 5%     ~     (p=0.678 n=73+75)
MutexWorkSlack-48            302ns ± 6%     300ns ± 5%     ~     (p=0.149 n=74+72)
MutexNoSpin-48               135ns ± 6%     135ns ±10%     ~     (p=0.788 n=67+75)
MutexSpin-48                 693ns ± 5%     689ns ± 6%     ~     (p=0.092 n=65+74)
Once-48                     0.22ns ±25%    0.22ns ±24%     ~     (p=0.882 n=74+73)
Pool-48                     5.88ns ±36%    5.79ns ±24%     ~     (p=0.655 n=69+69)
PoolOverflow-48             4.79µs ±18%    4.87µs ±20%     ~     (p=0.233 n=75+75)
SemaUncontended-48          0.80ns ± 1%    0.82ns ± 8%   +2.46%  (p=0.000 n=60+74)
SemaSyntNonblock-48          103ns ± 4%     102ns ± 5%   -1.11%  (p=0.003 n=75+75)
SemaSyntBlock-48             104ns ± 4%     104ns ± 5%     ~     (p=0.231 n=71+75)
SemaWorkNonblock-48          128ns ± 4%     129ns ± 6%   +1.51%  (p=0.000 n=63+75)
SemaWorkBlock-48             129ns ± 8%     130ns ± 7%     ~     (p=0.072 n=75+74)
RWMutexUncontended-48       2.35ns ± 1%    2.35ns ± 0%     ~     (p=0.144 n=70+55)
RWMutexWrite100-48           139ns ±18%     141ns ±21%     ~     (p=0.071 n=75+73)
RWMutexWrite10-48            145ns ± 9%     145ns ± 8%     ~     (p=0.553 n=75+75)
RWMutexWorkWrite100-48       297ns ±13%     297ns ±15%     ~     (p=0.519 n=75+74)
RWMutexWorkWrite10-48        588ns ± 7%     585ns ± 5%     ~     (p=0.173 n=73+70)
WaitGroupUncontended-48     0.87ns ± 0%    0.87ns ± 0%     ~     (all equal)
WaitGroupAddDone-48         63.2ns ± 4%    62.7ns ± 4%   -0.82%  (p=0.027 n=72+75)
WaitGroupAddDoneWork-48      109ns ± 5%     109ns ± 4%     ~     (p=0.233 n=75+75)
WaitGroupWait-48            0.17ns ± 0%    0.16ns ±16%   -8.55%  (p=0.000 n=56+75)
WaitGroupWaitWork-48        1.78ns ± 1%    2.08ns ± 5%  +16.92%  (p=0.000 n=74+70)
WaitGroupActuallyWait-48    52.0ns ± 3%    50.6ns ± 5%   -2.70%  (p=0.000 n=71+69)

https://perf.golang.org/search?q=upload:20170215.1

Change-Id: Ia29a8bd006c089e401ec4297c3038cca656bcd0a
Reviewed-on: https://go-review.googlesource.com/37103
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: 's avatarIan Lance Taylor <iant@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent fc456c7f
......@@ -286,6 +286,7 @@ type sudog struct {
acquiretime int64
releasetime int64
ticket uint32
parent *sudog // semaRoot binary tree
waitlink *sudog // g.waiting list or semaRoot
waittail *sudog // semaRoot
c *hchan // channel
......
......@@ -27,24 +27,19 @@ import (
// Asynchronous semaphore for sync.Mutex.
// A semaRoot holds a linked list of sudog with distinct addresses (s.elem).
// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem).
// Each of those sudog may in turn point (through s.waitlink) to a list
// of other sudogs waiting on the same address.
// The operations on the inner lists of sudogs with the same address
// are all O(1). Only the scanning of the top-level semaRoot list is O(n),
// are all O(1). The scanning of the top-level semaRoot list is O(log n),
// where n is the number of distinct addresses with goroutines blocked
// on them that hash to the given semaRoot.
// In systems with many goroutines, most queue up on a few addresses,
// so the linear search across unique addresses is probably OK.
// At least, we'll use this until it's not.
// The next step is probably to make the top-level list a treap instead
// of a linked list.
// See golang.org/issue/17953 for a program that worked badly
// before we introduced the second level of list.
// before we introduced the second level of list, and test/locklinear.go
// for a test that exercises this.
type semaRoot struct {
lock mutex
head *sudog
tail *sudog
treap *sudog // root of balanced tree of unique waiters.
nwait uint32 // Number of waiters. Read w/o the lock.
}
......@@ -205,8 +200,12 @@ func cansemacquire(addr *uint32) bool {
func (root *semaRoot) queue(addr *uint32, s *sudog) {
s.g = getg()
s.elem = unsafe.Pointer(addr)
s.next = nil
s.prev = nil
for t := root.head; t != nil; t = t.next {
var last *sudog
pt := &root.treap
for t := *pt; t != nil; t = *pt {
if t.elem == unsafe.Pointer(addr) {
// Already have addr in list; add s to end of per-addr list.
if t.waittail == nil {
......@@ -218,17 +217,37 @@ func (root *semaRoot) queue(addr *uint32, s *sudog) {
s.waitlink = nil
return
}
last = t
if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) {
pt = &t.prev
} else {
pt = &t.next
}
}
// Add s as new entry in list of unique addrs.
s.next = nil
s.prev = root.tail
if root.tail != nil {
root.tail.next = s
} else {
root.head = s
// Add s as new leaf in tree of unique addrs.
// The balanced tree is a treap using ticket as the random heap priority.
// That is, it is a binary tree ordered according to the elem addresses,
// but then among the space of possible binary trees respecting those
// addresses, it is kept balanced on average by maintaining a heap ordering
// on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket.
// https://en.wikipedia.org/wiki/Treap
// http://faculty.washington.edu/aragon/pubs/rst89.pdf
s.ticket = fastrand()
s.parent = last
*pt = s
// Rotate up into tree according to ticket (priority).
for s.parent != nil && s.parent.ticket > s.ticket {
if s.parent.prev == s {
root.rotateRight(s.parent)
} else {
if s.parent.next != s {
panic("semaRoot queue")
}
root.rotateLeft(s.parent)
}
}
root.tail = s
}
// dequeue searches for and finds the first goroutine
......@@ -236,11 +255,17 @@ func (root *semaRoot) queue(addr *uint32, s *sudog) {
// If the sudog was being profiled, dequeue returns the time
// at which it was woken up as now. Otherwise now is 0.
func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now int64) {
s := root.head
for ; s != nil; s = s.next {
ps := &root.treap
s := *ps
for ; s != nil; s = *ps {
if s.elem == unsafe.Pointer(addr) {
goto Found
}
if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) {
ps = &s.prev
} else {
ps = &s.next
}
}
return nil, 0
......@@ -250,18 +275,17 @@ Found:
now = cputicks()
}
if t := s.waitlink; t != nil {
// Substitute t, also waiting on addr, for s in root list of unique addrs.
// Substitute t, also waiting on addr, for s in root tree of unique addrs.
*ps = t
t.ticket = s.ticket
t.parent = s.parent
t.prev = s.prev
t.next = s.next
if t.prev != nil {
t.prev.next = t
} else {
root.head = t
t.prev.parent = t
}
t.next = s.next
if t.next != nil {
t.next.prev = t
} else {
root.tail = t
t.next.parent = t
}
if t.waitlink != nil {
t.waittail = s.waittail
......@@ -272,24 +296,104 @@ Found:
s.waitlink = nil
s.waittail = nil
} else {
// Remove s from list.
if s.next != nil {
s.next.prev = s.prev
} else {
root.tail = s.prev
// Rotate s down to be leaf of tree for removal, respecting priorities.
for s.next != nil || s.prev != nil {
if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket {
root.rotateRight(s)
} else {
root.rotateLeft(s)
}
}
if s.prev != nil {
s.prev.next = s.next
// Remove s, now a leaf.
if s.parent != nil {
if s.parent.prev == s {
s.parent.prev = nil
} else {
s.parent.next = nil
}
} else {
root.head = s.next
root.treap = nil
}
}
s.parent = nil
s.elem = nil
s.next = nil
s.prev = nil
return s, now
}
// rotateLeft rotates the tree rooted at node x.
// turning (x a (y b c)) into (y (x a b) c).
func (root *semaRoot) rotateLeft(x *sudog) {
// p -> (x a (y b c))
p := x.parent
a, y := x.prev, x.next
b, c := y.prev, y.next
y.prev = x
x.parent = y
y.next = c
if c != nil {
c.parent = y
}
x.prev = a
if a != nil {
a.parent = x
}
x.next = b
if b != nil {
b.parent = x
}
y.parent = p
if p == nil {
root.treap = y
} else if p.prev == x {
p.prev = y
} else {
if p.next != x {
throw("semaRoot rotateLeft")
}
p.next = y
}
}
// rotateRight rotates the tree rooted at node y.
// turning (y (x a b) c) into (x a (y b c)).
func (root *semaRoot) rotateRight(y *sudog) {
// p -> (y (x a b) c)
p := y.parent
x, c := y.prev, y.next
a, b := x.prev, x.next
x.prev = a
if a != nil {
a.parent = x
}
x.next = y
y.parent = x
y.prev = b
if b != nil {
b.parent = y
}
y.next = c
if c != nil {
c.parent = y
}
x.parent = p
if p == nil {
root.treap = x
} else if p.prev == y {
p.prev = x
} else {
if p.next != y {
throw("semaRoot rotateRight")
}
p.next = x
}
}
// notifyList is a ticket-based notification list used to implement sync.Cond.
//
// It must be kept in sync with the sync package.
......@@ -414,10 +518,22 @@ func notifyListNotifyOne(l *notifyList) {
return
}
// Update the next notify ticket number, and try to find the G that
// needs to be notified. If it hasn't made it to the list yet we won't
// find it, but it won't park itself once it sees the new notify number.
// Update the next notify ticket number.
atomic.Store(&l.notify, t+1)
// Try to find the g that needs to be notified.
// If it hasn't made it to the list yet we won't find it,
// but it won't park itself once it sees the new notify number.
//
// This scan looks linear but essentially always stops quickly.
// Because g's queue separately from taking numbers,
// there may be minor reorderings in the list, but we
// expect the g we're looking for to be near the front.
// The g has others in front of it on the list only to the
// extent that it lost the race, so the iteration will not
// be too long. This applies even when the g is missing:
// it hasn't yet gotten to sleep and has lost the race to
// the (few) other g's that we find on the list.
for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next {
if s.ticket == t {
n := s.next
......
// run
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test that locks don't go quadratic due to runtime hash table collisions.
package main
import (
"fmt"
"runtime"
"sync"
"time"
)
const debug = false
// checkLinear asserts that the running time of f(n) is at least linear but sub-quadratic.
// tries is the initial number of iterations.
func checkLinear(typ string, tries int, f func(n int)) {
// Depending on the machine and OS, this test might be too fast
// to measure with accurate enough granularity. On failure,
// make it run longer, hoping that the timing granularity
// is eventually sufficient.
timeF := func(n int) time.Duration {
t1 := time.Now()
f(n)
return time.Since(t1)
}
n := tries
fails := 0
for {
t1 := timeF(n)
t2 := timeF(2 * n)
if debug {
println(n, t1.String(), 2*n, t2.String())
}
// should be 2x (linear); allow up to 2.5x
if t1*3/2 < t2 && t2 < t1*5/2 {
return
}
// If 2n ops run in under a second and the ratio
// doesn't work out, make n bigger, trying to reduce
// the effect that a constant amount of overhead has
// on the computed ratio.
if t2 < 1*time.Second {
n *= 2
continue
}
// Once the test runs long enough for n ops,
// try to get the right ratio at least once.
// If five in a row all fail, give up.
if fails++; fails >= 5 {
panic(fmt.Sprintf("%s: too slow: %d ops: %v; %d ops: %v\n",
typ, n, t1, 2*n, t2))
}
}
}
const offset = 251 // known size of runtime hash table
func main() {
checkLinear("lockone", 1000, func(n int) {
ch := make(chan int)
locks := make([]sync.RWMutex, offset+1)
for i := 0; i < n; i++ {
go func() {
locks[0].Lock()
ch <- 1
}()
}
time.Sleep(1 * time.Millisecond)
go func() {
for j := 0; j < n; j++ {
locks[1].Lock()
locks[offset].Lock()
locks[1].Unlock()
runtime.Gosched()
locks[offset].Unlock()
}
}()
for j := 0; j < n; j++ {
locks[1].Lock()
locks[offset].Lock()
locks[1].Unlock()
runtime.Gosched()
locks[offset].Unlock()
}
for i := 0; i < n; i++ {
<-ch
locks[0].Unlock()
}
})
checkLinear("lockmany", 1000, func(n int) {
locks := make([]sync.RWMutex, n*offset+1)
var wg sync.WaitGroup
for i := 0; i < n; i++ {
wg.Add(1)
go func(i int) {
locks[(i+1)*offset].Lock()
wg.Done()
locks[(i+1)*offset].Lock()
locks[(i+1)*offset].Unlock()
}(i)
}
wg.Wait()
go func() {
for j := 0; j < n; j++ {
locks[1].Lock()
locks[0].Lock()
locks[1].Unlock()
runtime.Gosched()
locks[0].Unlock()
}
}()
for j := 0; j < n; j++ {
locks[1].Lock()
locks[0].Lock()
locks[1].Unlock()
runtime.Gosched()
locks[0].Unlock()
}
for i := 0; i < n; i++ {
locks[(i+1)*offset].Unlock()
}
})
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment