Commit 5333550b authored by Austin Clements's avatar Austin Clements

runtime: implement efficient page reclaimer

When we attempt to allocate an N page span (either for a large
allocation or when an mcentral runs dry), we first try to sweep spans
to release N pages. Currently, this can be extremely expensive:
sweeping a span to emptiness is the hardest thing to ask for and the
sweeper generally doesn't know where to even look for potentially
fruitful results. Since this is on the critical path of many
allocations, this is unfortunate.

This CL changes how we reclaim empty spans. Instead of trying lots of
spans and hoping for the best, it uses the newly introduced span marks
to efficiently find empty spans. The span marks (and in-use bits) are
in a dense bitmap, so these spans can be found with an efficient
sequential memory scan. This approach can scan for unmarked spans at
about 300 GB/ms and can free unmarked spans at about 32 MB/ms. We
could probably significantly improve the rate at which is can free
unmarked spans, but that's a separate issue.

Like the current reclaimer, this is still linear in the number of
spans that are swept, but the constant factor is now so vanishingly
small that it doesn't matter.

The benchmark in #18155 demonstrates both significant page reclaiming
delays, and object reclaiming delays. With "-retain-count=20000000
-preallocate=true -loop-count=3", the benchmark demonstrates several
page reclaiming delays on the order of 40ms. After this change, the
page reclaims are insignificant. The longest sweeps are still ~150ms,
but are object reclaiming delays. We'll address those in the next
several CLs.

Updates #18155.

Fixes #21378 by completely replacing the logic that had that bug.

Change-Id: Iad80eec11d7fc262d02c8f0761ac6998425c4064
Reviewed-on: https://go-review.googlesource.com/c/138959
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarRick Hudson <rlh@golang.org>
parent ba1698e9
......@@ -1974,6 +1974,9 @@ func gcSweep(mode gcMode) {
throw("non-empty swept list")
}
mheap_.pagesSwept = 0
mheap_.sweepArenas = mheap_.allArenas
mheap_.reclaimIndex = 0
mheap_.reclaimCredit = 0
unlock(&mheap_.lock)
if !_ConcurrentSweep || mode == gcForceBlockMode {
......
......@@ -4,6 +4,24 @@
// Garbage collector: sweeping
// The sweeper consists of two different algorithms:
//
// * The object reclaimer finds and frees unmarked slots in spans. It
// can free a whole span if none of the objects are marked, but that
// isn't its goal. This can be driven either synchronously by
// mcentral.cacheSpan for mcentral spans, or asynchronously by
// sweepone from the list of all in-use spans in mheap_.sweepSpans.
//
// * The span reclaimer looks for spans that contain no marked objects
// and frees whole spans. This is a separate algorithm because
// freeing whole spans is the hardest task for the object reclaimer,
// but is critical when allocating new spans. The entry point for
// this is mheap_.reclaim and it's driven by a sequential scan of
// the page marks bitmap in the heap arenas.
//
// Both algorithms ultimately call mspan.sweep, which sweeps a single
// heap span.
package runtime
import (
......@@ -72,7 +90,7 @@ func bgsweep(c chan int) {
}
}
// sweepone sweeps one span and returns the number of pages returned
// sweepone sweeps some unswept heap span and returns the number of pages returned
// to the heap, or ^uintptr(0) if there was nothing to sweep.
func sweepone() uintptr {
_g_ := getg()
......@@ -115,7 +133,12 @@ func sweepone() uintptr {
npages := ^uintptr(0)
if s != nil {
npages = s.npages
if !s.sweep(false) {
if s.sweep(false) {
// Whole span was freed. Count it toward the
// page reclaimer credit since these pages can
// now be used for span allocation.
atomic.Xadduintptr(&mheap_.reclaimCredit, npages)
} else {
// Span is still in-use, so this returned no
// pages to the heap and the span needs to
// move to the swept in-use list.
......
......@@ -89,6 +89,25 @@ type mheap struct {
// TODO(austin): pagesInUse should be a uintptr, but the 386
// compiler can't 8-byte align fields.
// Page reclaimer state
// reclaimIndex is the page index in allArenas of next page to
// reclaim. Specifically, it refers to page (i %
// pagesPerArena) of arena allArenas[i / pagesPerArena].
//
// If this is >= 1<<63, the page reclaimer is done scanning
// the page marks.
//
// This is accessed atomically.
reclaimIndex uint64
// reclaimCredit is spare credit for extra pages swept. Since
// the page reclaimer works in large chunks, it may reclaim
// more than requested. Any spare pages released go to this
// credit pool.
//
// This is accessed atomically.
reclaimCredit uintptr
// Malloc stats.
largealloc uint64 // bytes allocated for large objects
nlargealloc uint64 // number of large object allocations
......@@ -142,6 +161,11 @@ type mheap struct {
// then release mheap_.lock.
allArenas []arenaIdx
// sweepArenas is a snapshot of allArenas taken at the
// beginning of the sweep cycle. This can be read safely by
// simply blocking GC (by disabling preemption).
sweepArenas []arenaIdx
_ uint32 // ensure 64-bit alignment of central
// central free lists for small size classes.
......@@ -658,61 +682,158 @@ func (h *mheap) init() {
}
}
// Sweeps spans in list until reclaims at least npages into heap.
// Returns the actual number of pages reclaimed.
func (h *mheap) reclaimList(list *mSpanList, npages uintptr) uintptr {
n := uintptr(0)
sg := mheap_.sweepgen
retry:
for s := list.first; s != nil; s = s.next {
if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
list.remove(s)
// swept spans are at the end of the list
list.insertBack(s) // Puts it back on a busy list. s is not in the treap at this point.
unlock(&h.lock)
snpages := s.npages
if s.sweep(false) {
n += snpages
// reclaim sweeps and reclaims at least npage pages into the heap.
// It is called before allocating npage pages to keep growth in check.
//
// reclaim implements the page-reclaimer half of the sweeper.
//
// h must NOT be locked.
func (h *mheap) reclaim(npage uintptr) {
// This scans pagesPerChunk at a time. Higher values reduce
// contention on h.reclaimPos, but increase the minimum
// latency of performing a reclaim.
//
// Must be a multiple of the pageInUse bitmap element size.
//
// The time required by this can vary a lot depending on how
// many spans are actually freed. Experimentally, it can scan
// for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only
// free spans at ~32 MB/ms. Using 512 pages bounds this at
// roughly 100µs.
//
// TODO(austin): Half of the time spent freeing spans is in
// locking/unlocking the heap (even with low contention). We
// could make the slow path here several times faster by
// batching heap frees.
const pagesPerChunk = 512
// Bail early if there's no more reclaim work.
if atomic.Load64(&h.reclaimIndex) >= 1<<63 {
return
}
// Disable preemption so the GC can't start while we're
// sweeping, so we can read h.sweepArenas, and so
// traceGCSweepStart/Done pair on the P.
mp := acquirem()
if trace.enabled {
traceGCSweepStart()
}
arenas := h.sweepArenas
locked := false
for npage > 0 {
// Pull from accumulated credit first.
if credit := atomic.Loaduintptr(&h.reclaimCredit); credit > 0 {
take := credit
if take > npage {
// Take only what we need.
take = npage
}
lock(&h.lock)
if n >= npages {
return n
if atomic.Casuintptr(&h.reclaimCredit, credit, credit-take) {
npage -= take
}
// the span could have been moved elsewhere
goto retry
}
if s.sweepgen == sg-1 {
// the span is being swept by background sweeper, skip
continue
}
// already swept empty span,
// all subsequent ones must also be either swept or in process of sweeping
break
// Claim a chunk of work.
idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerChunk) - pagesPerChunk)
if idx/pagesPerArena >= uintptr(len(arenas)) {
// Page reclaiming is done.
atomic.Store64(&h.reclaimIndex, 1<<63)
break
}
if !locked {
// Lock the heap for reclaimChunk.
lock(&h.lock)
locked = true
}
// Scan this chunk.
nfound := h.reclaimChunk(arenas, idx, pagesPerChunk)
if nfound <= npage {
npage -= nfound
} else {
// Put spare pages toward global credit.
atomic.Xadduintptr(&h.reclaimCredit, nfound-npage)
npage = 0
}
}
if locked {
unlock(&h.lock)
}
return n
}
// Sweeps and reclaims at least npage pages into heap.
// Called before allocating npage pages.
func (h *mheap) reclaim(npage uintptr) {
if h.reclaimList(&h.busy, npage) != 0 {
return // Bingo!
if trace.enabled {
traceGCSweepDone()
}
releasem(mp)
}
// Now sweep everything that is not yet swept.
var reclaimed uintptr
unlock(&h.lock)
for {
n := sweepone()
if n == ^uintptr(0) { // all spans are swept
break
// reclaimChunk sweeps unmarked spans that start at page indexes [pageIdx, pageIdx+n).
// It returns the number of pages returned to the heap.
//
// h.lock must be held and the caller must be non-preemptible.
func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
// The heap lock must be held because this accesses the
// heapArena.spans arrays using potentially non-live pointers.
// In particular, if a span were freed and merged concurrently
// with this probing heapArena.spans, it would be possible to
// observe arbitrary, stale span pointers.
n0 := n
var nFreed uintptr
sg := h.sweepgen
for n > 0 {
ai := arenas[pageIdx/pagesPerArena]
ha := h.arenas[ai.l1()][ai.l2()]
// Get a chunk of the bitmap to work on.
arenaPage := uint(pageIdx % pagesPerArena)
inUse := ha.pageInUse[arenaPage/8:]
marked := ha.pageMarks[arenaPage/8:]
if uintptr(len(inUse)) > n/8 {
inUse = inUse[:n/8]
marked = marked[:n/8]
}
reclaimed += n
if reclaimed >= npage {
break
// Scan this bitmap chunk for spans that are in-use
// but have no marked objects on them.
for i := range inUse {
inUseUnmarked := inUse[i] &^ marked[i]
if inUseUnmarked == 0 {
continue
}
for j := uint(0); j < 8; j++ {
if inUseUnmarked&(1<<j) != 0 {
s := ha.spans[arenaPage+uint(i)*8+j]
if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
npages := s.npages
unlock(&h.lock)
if s.sweep(false) {
nFreed += npages
}
lock(&h.lock)
// Reload inUse. It's possible nearby
// spans were freed when we dropped the
// lock and we don't want to get stale
// pointers from the spans array.
inUseUnmarked = inUse[i] &^ marked[i]
}
}
}
}
// Advance.
pageIdx += uintptr(len(inUse) * 8)
n -= uintptr(len(inUse) * 8)
}
lock(&h.lock)
if trace.enabled {
// Account for pages scanned but not reclaimed.
traceGCSweepSpan((n0 - nFreed) * pageSize)
}
return nFreed
}
// alloc_m is the internal implementation of mheap.alloc.
......@@ -723,27 +844,14 @@ func (h *mheap) reclaim(npage uintptr) {
//go:systemstack
func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan {
_g_ := getg()
lock(&h.lock)
// To prevent excessive heap growth, before allocating n pages
// we need to sweep and reclaim at least n pages.
if h.sweepdone == 0 {
// TODO(austin): This tends to sweep a large number of
// spans in order to find a few completely free spans
// (for example, in the garbage benchmark, this sweeps
// ~30x the number of pages it's trying to allocate).
// If GC kept a bit for whether there were any marks
// in a span, we could release these free spans
// at the end of GC and eliminate this entirely.
if trace.enabled {
traceGCSweepStart()
}
h.reclaim(npage)
if trace.enabled {
traceGCSweepDone()
}
}
lock(&h.lock)
// transfer stats from cache to global
memstats.heap_scan += uint64(_g_.m.mcache.local_scan)
_g_.m.mcache.local_scan = 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment