runtime: use balanced tree for addr lookup in semaphore implementation

CL 36792 fixed #17953, a linear scan caused by n goroutines piling into two different locks that hashed to the same bucket in the semaphore table. In that CL, n goroutines contending for 2 unfortunately chosen locks went from O(n²) to O(n). This CL fixes a different linear scan, when n goroutines are contending for n/2 different locks that all hash to the same bucket in the semaphore table. In this CL, n goroutines contending for n/2 unfortunately chosen locks goes from O(n²) to O(n log n). This case is much less likely, but any linear scan eventually hurts, so we might as well fix it while the problem is fresh in our minds. The new test in this CL checks for both linear scans. The effect of this CL on the sync benchmarks is negligible (but it fixes the new test). name old time/op new time/op delta Cond1-48 576ns ±10% 575ns ±13% ~ (p=0.679 n=71+71) Cond2-48 1.59µs ± 8% 1.61µs ± 9% ~ (p=0.107 n=73+69) Cond4-48 4.56µs ± 7% 4.55µs ± 7% ~ (p=0.670 n=74+72) Cond8-48 9.87µs ± 9% 9.90µs ± 7% ~ (p=0.507 n=69+73) Cond16-48 20.4µs ± 7% 20.4µs ±10% ~ (p=0.588 n=69+71) Cond32-48 45.4µs ±10% 45.4µs ±14% ~ (p=0.944 n=73+73) UncontendedSemaphore-48 19.7ns ±12% 19.7ns ± 8% ~ (p=0.589 n=65+63) ContendedSemaphore-48 55.4ns ±26% 54.9ns ±32% ~ (p=0.441 n=75+75) MutexUncontended-48 0.63ns ± 0% 0.63ns ± 0% ~ (all equal) Mutex-48 210ns ± 6% 213ns ±10% +1.30% (p=0.035 n=70+74) MutexSlack-48 210ns ± 7% 211ns ± 9% ~ (p=0.184 n=71+72) MutexWork-48 299ns ± 5% 300ns ± 5% ~ (p=0.678 n=73+75) MutexWorkSlack-48 302ns ± 6% 300ns ± 5% ~ (p=0.149 n=74+72) MutexNoSpin-48 135ns ± 6% 135ns ±10% ~ (p=0.788 n=67+75) MutexSpin-48 693ns ± 5% 689ns ± 6% ~ (p=0.092 n=65+74) Once-48 0.22ns ±25% 0.22ns ±24% ~ (p=0.882 n=74+73) Pool-48 5.88ns ±36% 5.79ns ±24% ~ (p=0.655 n=69+69) PoolOverflow-48 4.79µs ±18% 4.87µs ±20% ~ (p=0.233 n=75+75) SemaUncontended-48 0.80ns ± 1% 0.82ns ± 8% +2.46% (p=0.000 n=60+74) SemaSyntNonblock-48 103ns ± 4% 102ns ± 5% -1.11% (p=0.003 n=75+75) SemaSyntBlock-48 104ns ± 4% 104ns ± 5% ~ (p=0.231 n=71+75) SemaWorkNonblock-48 128ns ± 4% 129ns ± 6% +1.51% (p=0.000 n=63+75) SemaWorkBlock-48 129ns ± 8% 130ns ± 7% ~ (p=0.072 n=75+74) RWMutexUncontended-48 2.35ns ± 1% 2.35ns ± 0% ~ (p=0.144 n=70+55) RWMutexWrite100-48 139ns ±18% 141ns ±21% ~ (p=0.071 n=75+73) RWMutexWrite10-48 145ns ± 9% 145ns ± 8% ~ (p=0.553 n=75+75) RWMutexWorkWrite100-48 297ns ±13% 297ns ±15% ~ (p=0.519 n=75+74) RWMutexWorkWrite10-48 588ns ± 7% 585ns ± 5% ~ (p=0.173 n=73+70) WaitGroupUncontended-48 0.87ns ± 0% 0.87ns ± 0% ~ (all equal) WaitGroupAddDone-48 63.2ns ± 4% 62.7ns ± 4% -0.82% (p=0.027 n=72+75) WaitGroupAddDoneWork-48 109ns ± 5% 109ns ± 4% ~ (p=0.233 n=75+75) WaitGroupWait-48 0.17ns ± 0% 0.16ns ±16% -8.55% (p=0.000 n=56+75) WaitGroupWaitWork-48 1.78ns ± 1% 2.08ns ± 5% +16.92% (p=0.000 n=74+70) WaitGroupActuallyWait-48 52.0ns ± 3% 50.6ns ± 5% -2.70% (p=0.000 n=71+69) https://perf.golang.org/search?q=upload:20170215.1 Change-Id: Ia29a8bd006c089e401ec4297c3038cca656bcd0a Reviewed-on: https://go-review.googlesource.com/37103 Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>

runtime: use balanced tree for addr lookup in semaphore implementation
CL 36792 fixed #17953, a linear scan caused by n goroutines piling into two different locks that hashed to the same bucket in the semaphore table. In that CL, n goroutines contending for 2 unfortunately chosen locks went from O(n²) to O(n). This CL fixes a different linear scan, when n goroutines are contending for n/2 different locks that all hash to the same bucket in the semaphore table. In this CL, n goroutines contending for n/2 unfortunately chosen locks goes from O(n²) to O(n log n). This case is much less likely, but any linear scan eventually hurts, so we might as well fix it while the problem is fresh in our minds. The new test in this CL checks for both linear scans. The effect of this CL on the sync benchmarks is negligible (but it fixes the new test). name old time/op new time/op delta Cond1-48 576ns ±10% 575ns ±13% ~ (p=0.679 n=71+71) Cond2-48 1.59µs ± 8% 1.61µs ± 9% ~ (p=0.107 n=73+69) Cond4-48 4.56µs ± 7% 4.55µs ± 7% ~ (p=0.670 n=74+72) Cond8-48 9.87µs ± 9% 9.90µs ± 7% ~ (p=0.507 n=69+73) Cond16-48 20.4µs ± 7% 20.4µs ±10% ~ (p=0.588 n=69+71) Cond32-48 45.4µs ±10% 45.4µs ±14% ~ (p=0.944 n=73+73) UncontendedSemaphore-48 19.7ns ±12% 19.7ns ± 8% ~ (p=0.589 n=65+63) ContendedSemaphore-48 55.4ns ±26% 54.9ns ±32% ~ (p=0.441 n=75+75) MutexUncontended-48 0.63ns ± 0% 0.63ns ± 0% ~ (all equal) Mutex-48 210ns ± 6% 213ns ±10% +1.30% (p=0.035 n=70+74) MutexSlack-48 210ns ± 7% 211ns ± 9% ~ (p=0.184 n=71+72) MutexWork-48 299ns ± 5% 300ns ± 5% ~ (p=0.678 n=73+75) MutexWorkSlack-48 302ns ± 6% 300ns ± 5% ~ (p=0.149 n=74+72) MutexNoSpin-48 135ns ± 6% 135ns ±10% ~ (p=0.788 n=67+75) MutexSpin-48 693ns ± 5% 689ns ± 6% ~ (p=0.092 n=65+74) Once-48 0.22ns ±25% 0.22ns ±24% ~ (p=0.882 n=74+73) Pool-48 5.88ns ±36% 5.79ns ±24% ~ (p=0.655 n=69+69) PoolOverflow-48 4.79µs ±18% 4.87µs ±20% ~ (p=0.233 n=75+75) SemaUncontended-48 0.80ns ± 1% 0.82ns ± 8% +2.46% (p=0.000 n=60+74) SemaSyntNonblock-48 103ns ± 4% 102ns ± 5% -1.11% (p=0.003 n=75+75) SemaSyntBlock-48 104ns ± 4% 104ns ± 5% ~ (p=0.231 n=71+75) SemaWorkNonblock-48 128ns ± 4% 129ns ± 6% +1.51% (p=0.000 n=63+75) SemaWorkBlock-48 129ns ± 8% 130ns ± 7% ~ (p=0.072 n=75+74) RWMutexUncontended-48 2.35ns ± 1% 2.35ns ± 0% ~ (p=0.144 n=70+55) RWMutexWrite100-48 139ns ±18% 141ns ±21% ~ (p=0.071 n=75+73) RWMutexWrite10-48 145ns ± 9% 145ns ± 8% ~ (p=0.553 n=75+75) RWMutexWorkWrite100-48 297ns ±13% 297ns ±15% ~ (p=0.519 n=75+74) RWMutexWorkWrite10-48 588ns ± 7% 585ns ± 5% ~ (p=0.173 n=73+70) WaitGroupUncontended-48 0.87ns ± 0% 0.87ns ± 0% ~ (all equal) WaitGroupAddDone-48 63.2ns ± 4% 62.7ns ± 4% -0.82% (p=0.027 n=72+75) WaitGroupAddDoneWork-48 109ns ± 5% 109ns ± 4% ~ (p=0.233 n=75+75) WaitGroupWait-48 0.17ns ± 0% 0.16ns ±16% -8.55% (p=0.000 n=56+75) WaitGroupWaitWork-48 1.78ns ± 1% 2.08ns ± 5% +16.92% (p=0.000 n=74+70) WaitGroupActuallyWait-48 52.0ns ± 3% 50.6ns ± 5% -2.70% (p=0.000 n=71+69) https://perf.golang.org/search?q=upload:20170215.1 Change-Id: Ia29a8bd006c089e401ec4297c3038cca656bcd0a Reviewed-on: https://go-review.googlesource.com/37103 Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
990124da · Russ Cox · fc456c7f · 990124da · 990124da · 990124da
Commit 990124da authored Feb 12, 2017 by Russ Cox
Hide whitespace changes
Inline Side-by-side

Showing with 296 additions and 40 deletions

runtime2.go src/runtime/runtime2.go +1 -0

sema.go src/runtime/sema.go +156 -40

locklinear.go test/locklinear.go +139 -0

No files found.
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -286,6 +286,7 @@ type sudog struct {
 	acquiretime int64
 	releasetime int64
 	ticket      uint32
+	parent      *sudog // semaRoot binary tree
 	waitlink    *sudog // g.waiting list or semaRoot
 	waittail    *sudog // semaRoot
 	c           *hchan // channel

--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -27,24 +27,19 @@ import (

 // Asynchronous semaphore for sync.Mutex.

-// A semaRoot holds a linked list of sudog with distinct addresses (s.elem).
+// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem).
 // Each of those sudog may in turn point (through s.waitlink) to a list
 // of other sudogs waiting on the same address.
 // The operations on the inner lists of sudogs with the same address
-// are all O(1). Only the scanning of the top-level semaRoot list is O(n),
+// are all O(1). The scanning of the top-level semaRoot list is O(log n),
 // where n is the number of distinct addresses with goroutines blocked
 // on them that hash to the given semaRoot.
-// In systems with many goroutines, most queue up on a few addresses,
-// so the linear search across unique addresses is probably OK.
-// At least, we'll use this until it's not.
-// The next step is probably to make the top-level list a treap instead
-// of a linked list.
 // See golang.org/issue/17953 for a program that worked badly
-// before we introduced the second level of list.
+// before we introduced the second level of list, and test/locklinear.go
+// for a test that exercises this.
 type semaRoot struct {
 	lock  mutex
-	head  *sudog
-	tail  *sudog
+	treap *sudog // root of balanced tree of unique waiters.
 	nwait uint32 // Number of waiters. Read w/o the lock.
 }

@@ -205,8 +200,12 @@ func cansemacquire(addr *uint32) bool {
 func (root *semaRoot) queue(addr *uint32, s *sudog) {
 	s.g = getg()
 	s.elem = unsafe.Pointer(addr)
+	s.next = nil
+	s.prev = nil

-	for t := root.head; t != nil; t = t.next {
+	var last *sudog
+	pt := &root.treap
+	for t := *pt; t != nil; t = *pt {
 		if t.elem == unsafe.Pointer(addr) {
 			// Already have addr in list; add s to end of per-addr list.
 			if t.waittail == nil {
@@ -218,17 +217,37 @@ func (root *semaRoot) queue(addr *uint32, s *sudog) {
 			s.waitlink = nil
 			return
 		}
+		last = t
+		if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) {
+			pt = &t.prev
+		} else {
+			pt = &t.next
+		}
 	}

-	// Add s as new entry in list of unique addrs.
-	s.next = nil
-	s.prev = root.tail
-	if root.tail != nil {
-		root.tail.next = s
-	} else {
-		root.head = s
+	// Add s as new leaf in tree of unique addrs.
+	// The balanced tree is a treap using ticket as the random heap priority.
+	// That is, it is a binary tree ordered according to the elem addresses,
+	// but then among the space of possible binary trees respecting those
+	// addresses, it is kept balanced on average by maintaining a heap ordering
+	// on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket.
+	// https://en.wikipedia.org/wiki/Treap
+	// http://faculty.washington.edu/aragon/pubs/rst89.pdf
+	s.ticket = fastrand()
+	s.parent = last
+	*pt = s
+
+	// Rotate up into tree according to ticket (priority).
+	for s.parent != nil && s.parent.ticket > s.ticket {
+		if s.parent.prev == s {
+			root.rotateRight(s.parent)
+		} else {
+			if s.parent.next != s {
+				panic("semaRoot queue")
+			}
+			root.rotateLeft(s.parent)
+		}
 	}
-	root.tail = s
 }

 // dequeue searches for and finds the first goroutine
@@ -236,11 +255,17 @@ func (root *semaRoot) queue(addr *uint32, s *sudog) {
 // If the sudog was being profiled, dequeue returns the time
 // at which it was woken up as now. Otherwise now is 0.
 func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now int64) {
-	s := root.head
-	for ; s != nil; s = s.next {
+	ps := &root.treap
+	s := *ps
+	for ; s != nil; s = *ps {
 		if s.elem == unsafe.Pointer(addr) {
 			goto Found
 		}
+		if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) {
+			ps = &s.prev
+		} else {
+			ps = &s.next
+		}
 	}
 	return nil, 0

@@ -250,18 +275,17 @@ Found:
 		now = cputicks()
 	}
 	if t := s.waitlink; t != nil {
-		// Substitute t, also waiting on addr, for s in root list of unique addrs.
+		// Substitute t, also waiting on addr, for s in root tree of unique addrs.
+		*ps = t
+		t.ticket = s.ticket
+		t.parent = s.parent
 		t.prev = s.prev
-		t.next = s.next
 		if t.prev != nil {
-			t.prev.next = t
-		} else {
-			root.head = t
+			t.prev.parent = t
 		}
+		t.next = s.next
 		if t.next != nil {
-			t.next.prev = t
-		} else {
-			root.tail = t
+			t.next.parent = t
 		}
 		if t.waitlink != nil {
 			t.waittail = s.waittail
@@ -272,24 +296,104 @@ Found:
 		s.waitlink = nil
 		s.waittail = nil
 	} else {
-		// Remove s from list.
-		if s.next != nil {
-			s.next.prev = s.prev
-		} else {
-			root.tail = s.prev
+		// Rotate s down to be leaf of tree for removal, respecting priorities.
+		for s.next != nil || s.prev != nil {
+			if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket {
+				root.rotateRight(s)
+			} else {
+				root.rotateLeft(s)
+			}
 		}
-		if s.prev != nil {
-			s.prev.next = s.next
+		// Remove s, now a leaf.
+		if s.parent != nil {
+			if s.parent.prev == s {
+				s.parent.prev = nil
+			} else {
+				s.parent.next = nil
+			}
 		} else {
-			root.head = s.next
+			root.treap = nil
 		}
 	}
+	s.parent = nil
 	s.elem = nil
 	s.next = nil
 	s.prev = nil
 	return s, now
 }

+// rotateLeft rotates the tree rooted at node x.
+// turning (x a (y b c)) into (y (x a b) c).
+func (root *semaRoot) rotateLeft(x *sudog) {
+	// p -> (x a (y b c))
+	p := x.parent
+	a, y := x.prev, x.next
+	b, c := y.prev, y.next
+
+	y.prev = x
+	x.parent = y
+	y.next = c
+	if c != nil {
+		c.parent = y
+	}
+	x.prev = a
+	if a != nil {
+		a.parent = x
+	}
+	x.next = b
+	if b != nil {
+		b.parent = x
+	}
+
+	y.parent = p
+	if p == nil {
+		root.treap = y
+	} else if p.prev == x {
+		p.prev = y
+	} else {
+		if p.next != x {
+			throw("semaRoot rotateLeft")
+		}
+		p.next = y
+	}
+}
+
+// rotateRight rotates the tree rooted at node y.
+// turning (y (x a b) c) into (x a (y b c)).
+func (root *semaRoot) rotateRight(y *sudog) {
+	// p -> (y (x a b) c)
+	p := y.parent
+	x, c := y.prev, y.next
+	a, b := x.prev, x.next
+
+	x.prev = a
+	if a != nil {
+		a.parent = x
+	}
+	x.next = y
+	y.parent = x
+	y.prev = b
+	if b != nil {
+		b.parent = y
+	}
+	y.next = c
+	if c != nil {
+		c.parent = y
+	}
+
+	x.parent = p
+	if p == nil {
+		root.treap = x
+	} else if p.prev == y {
+		p.prev = x
+	} else {
+		if p.next != y {
+			throw("semaRoot rotateRight")
+		}
+		p.next = x
+	}
+}
+
 // notifyList is a ticket-based notification list used to implement sync.Cond.
 //
 // It must be kept in sync with the sync package.
@@ -414,10 +518,22 @@ func notifyListNotifyOne(l *notifyList) {
 		return
 	}

-	// Update the next notify ticket number, and try to find the G that
-	// needs to be notified. If it hasn't made it to the list yet we won't
-	// find it, but it won't park itself once it sees the new notify number.
+	// Update the next notify ticket number.
 	atomic.Store(&l.notify, t+1)
+
+	// Try to find the g that needs to be notified.
+	// If it hasn't made it to the list yet we won't find it,
+	// but it won't park itself once it sees the new notify number.
+	//
+	// This scan looks linear but essentially always stops quickly.
+	// Because g's queue separately from taking numbers,
+	// there may be minor reorderings in the list, but we
+	// expect the g we're looking for to be near the front.
+	// The g has others in front of it on the list only to the
+	// extent that it lost the race, so the iteration will not
+	// be too long. This applies even when the g is missing:
+	// it hasn't yet gotten to sleep and has lost the race to
+	// the (few) other g's that we find on the list.
 	for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next {
 		if s.ticket == t {
 			n := s.next

--- a/test/locklinear.go
+++ b/test/locklinear.go
+// run
+
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test that locks don't go quadratic due to runtime hash table collisions.
+
+package main
+
+import (
+	"fmt"
+	"runtime"
+	"sync"
+	"time"
+)
+
+const debug = false
+
+// checkLinear asserts that the running time of f(n) is at least linear but sub-quadratic.
+// tries is the initial number of iterations.
+func checkLinear(typ string, tries int, f func(n int)) {
+	// Depending on the machine and OS, this test might be too fast
+	// to measure with accurate enough granularity. On failure,
+	// make it run longer, hoping that the timing granularity
+	// is eventually sufficient.
+
+	timeF := func(n int) time.Duration {
+		t1 := time.Now()
+		f(n)
+		return time.Since(t1)
+	}
+
+	n := tries
+	fails := 0
+	for {
+		t1 := timeF(n)
+		t2 := timeF(2 * n)
+		if debug {
+			println(n, t1.String(), 2*n, t2.String())
+		}
+		// should be 2x (linear); allow up to 2.5x
+		if t1*3/2 < t2 && t2 < t1*5/2 {
+			return
+		}
+		// If 2n ops run in under a second and the ratio
+		// doesn't work out, make n bigger, trying to reduce
+		// the effect that a constant amount of overhead has
+		// on the computed ratio.
+		if t2 < 1*time.Second {
+			n *= 2
+			continue
+		}
+		// Once the test runs long enough for n ops,
+		// try to get the right ratio at least once.
+		// If five in a row all fail, give up.
+		if fails++; fails >= 5 {
+			panic(fmt.Sprintf("%s: too slow: %d ops: %v; %d ops: %v\n",
+				typ, n, t1, 2*n, t2))
+		}
+	}
+}
+
+const offset = 251 // known size of runtime hash table
+
+func main() {
+	checkLinear("lockone", 1000, func(n int) {
+		ch := make(chan int)
+		locks := make([]sync.RWMutex, offset+1)
+		for i := 0; i < n; i++ {
+			go func() {
+				locks[0].Lock()
+				ch <- 1
+			}()
+		}
+		time.Sleep(1 * time.Millisecond)
+
+		go func() {
+			for j := 0; j < n; j++ {
+				locks[1].Lock()
+				locks[offset].Lock()
+				locks[1].Unlock()
+				runtime.Gosched()
+				locks[offset].Unlock()
+			}
+		}()
+
+		for j := 0; j < n; j++ {
+			locks[1].Lock()
+			locks[offset].Lock()
+			locks[1].Unlock()
+			runtime.Gosched()
+			locks[offset].Unlock()
+		}
+
+		for i := 0; i < n; i++ {
+			<-ch
+			locks[0].Unlock()
+		}
+	})
+
+	checkLinear("lockmany", 1000, func(n int) {
+		locks := make([]sync.RWMutex, n*offset+1)
+
+		var wg sync.WaitGroup
+		for i := 0; i < n; i++ {
+			wg.Add(1)
+			go func(i int) {
+				locks[(i+1)*offset].Lock()
+				wg.Done()
+				locks[(i+1)*offset].Lock()
+				locks[(i+1)*offset].Unlock()
+			}(i)
+		}
+		wg.Wait()
+
+		go func() {
+			for j := 0; j < n; j++ {
+				locks[1].Lock()
+				locks[0].Lock()
+				locks[1].Unlock()
+				runtime.Gosched()
+				locks[0].Unlock()
+			}
+		}()
+
+		for j := 0; j < n; j++ {
+			locks[1].Lock()
+			locks[0].Lock()
+			locks[1].Unlock()
+			runtime.Gosched()
+			locks[0].Unlock()
+		}
+
+		for i := 0; i < n; i++ {
+			locks[(i+1)*offset].Unlock()
+		}
+	})
+}