Commit 899a4ad4 authored by Rick Hudson

runtime: Speed up heapBitsForObject

Optimized heapBitsForObject by special casing
objects whose size is a power of two. When a
span holding such objects is initialized I
added a mask that when &ed with an interior pointer
results in the base of the pointer. For the garbage
benchmark this resulted in CPU_CLK_UNHALTED in
heapBitsForObject going from 7.7% down to 5.9%
of the total, INST_RETIRED went from 12.2 -> 8.7.

Here are the benchmarks that changed by at least plus or minus 1%.

benchmark                          old ns/op      new ns/op      delta
BenchmarkFmtFprintfString          249            221            -11.24%
BenchmarkFmtFprintfInt             247            223            -9.72%
BenchmarkFmtFprintfEmpty           76.5           69.6           -9.02%
BenchmarkBinaryTree17              4106631412     3744550160     -8.82%
BenchmarkFmtFprintfFloat           424            399            -5.90%
BenchmarkGoParse                   4484421        4242115        -5.40%
BenchmarkGobEncode                 8803668        8449107        -4.03%
BenchmarkFmtManyArgs               1494           1436           -3.88%
BenchmarkGobDecode                 10431051       10032606       -3.82%
BenchmarkFannkuch11                2591306713     2517400464     -2.85%
BenchmarkTimeParse                 361            371            +2.77%
BenchmarkJSONDecode                70620492       68830357       -2.53%
BenchmarkRegexpMatchMedium_1K      54693          53343          -2.47%
BenchmarkTemplate                  90008879       91929940       +2.13%
BenchmarkTimeFormat                380            387            +1.84%
BenchmarkRegexpMatchEasy1_32       111            113            +1.80%
BenchmarkJSONEncode                21359159       21007583       -1.65%
BenchmarkRegexpMatchEasy1_1K       603            613            +1.66%
BenchmarkRegexpMatchEasy0_32       127            129            +1.57%
BenchmarkFmtFprintfIntInt          399            393            -1.50%
BenchmarkRegexpMatchEasy0_1K       373            378            +1.34%

Change-Id: I78e297161026f8b5cc7507c965fd3e486f81ed29
Reviewed-on: https://go-review.googlesource.com/8980Reviewed-by: 's avatarAustin Clements <austin@google.com>
parent e7ffafdb
...@@ -154,17 +154,16 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) { ...@@ -154,17 +154,16 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) {
// return base == 0 // return base == 0
// otherwise return the base of the object. // otherwise return the base of the object.
func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) { func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
if p < mheap_.arena_start || p >= mheap_.arena_used { arenaStart := mheap_.arena_start
if p < arenaStart || p >= mheap_.arena_used {
return return
} }
off := p - arenaStart
idx := off >> _PageShift
// p points into the heap, but possibly to the middle of an object. // p points into the heap, but possibly to the middle of an object.
// Consult the span table to find the block beginning. // Consult the span table to find the block beginning.
// TODO(rsc): Factor this out.
k := p >> _PageShift k := p >> _PageShift
x := k s = h_spans[idx]
x -= mheap_.arena_start >> _PageShift
s = h_spans[x]
if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse { if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse {
if s == nil || s.state == _MSpanStack { if s == nil || s.state == _MSpanStack {
// If s is nil, the virtual address has never been part of the heap. // If s is nil, the virtual address has never been part of the heap.
...@@ -188,23 +187,23 @@ func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) { ...@@ -188,23 +187,23 @@ func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
printunlock() printunlock()
throw("objectstart: bad pointer in unexpected span") throw("objectstart: bad pointer in unexpected span")
} }
return
} }
base = s.base() // If this span holds object of a power of 2 size, just mask off the bits to
if p-base >= s.elemsize { // the interior of the object. Otherwise use the size to get the base.
// n := (p - base) / s.elemsize, using division by multiplication if s.baseMask != 0 {
n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2) // optimize for power of 2 sized objects.
base = s.base()
const debugMagic = false base = base + (p-base)&s.baseMask
if debugMagic { // base = p & s.baseMask is faster for small spans,
n2 := (p - base) / s.elemsize // but doesn't work for large spans.
if n != n2 { // Overall, it's faster to use the more general computation above.
println("runtime: bad div magic", (p - base), s.elemsize, s.divShift, s.divMul, s.divShift2) } else {
throw("bad div magic") base = s.base()
} if p-base >= s.elemsize {
// n := (p - base) / s.elemsize, using division by multiplication
n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
base += n * s.elemsize
} }
base += n * s.elemsize
} }
// Now that we know the actual base, compute heapBits to return to caller. // Now that we know the actual base, compute heapBits to return to caller.
hbits = heapBitsForAddr(base) hbits = heapBitsForAddr(base)
......
...@@ -24,7 +24,6 @@ type mheap struct { ...@@ -24,7 +24,6 @@ type mheap struct {
nspan uint32 nspan uint32
sweepgen uint32 // sweep generation, see comment in mspan sweepgen uint32 // sweep generation, see comment in mspan
sweepdone uint32 // all spans are swept sweepdone uint32 // all spans are swept
// span lookup // span lookup
spans **mspan spans **mspan
spans_mapped uintptr spans_mapped uintptr
...@@ -99,6 +98,7 @@ type mspan struct { ...@@ -99,6 +98,7 @@ type mspan struct {
// if sweepgen == h->sweepgen - 1, the span is currently being swept // if sweepgen == h->sweepgen - 1, the span is currently being swept
// if sweepgen == h->sweepgen, the span is swept and ready to use // if sweepgen == h->sweepgen, the span is swept and ready to use
// h->sweepgen is incremented by 2 after every GC // h->sweepgen is incremented by 2 after every GC
sweepgen uint32 sweepgen uint32
divMul uint32 // for divide by elemsize - divMagic.mul divMul uint32 // for divide by elemsize - divMagic.mul
ref uint16 // capacity - number of objects in freelist ref uint16 // capacity - number of objects in freelist
...@@ -114,6 +114,7 @@ type mspan struct { ...@@ -114,6 +114,7 @@ type mspan struct {
limit uintptr // end of data in span limit uintptr // end of data in span
speciallock mutex // guards specials list speciallock mutex // guards specials list
specials *special // linked list of special records sorted by offset. specials *special // linked list of special records sorted by offset.
baseMask uintptr // if non-0, elemsize is a power of 2, & this will get object allocation base
} }
func (s *mspan) base() uintptr { func (s *mspan) base() uintptr {
...@@ -384,12 +385,14 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan ...@@ -384,12 +385,14 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan
s.divShift = 0 s.divShift = 0
s.divMul = 0 s.divMul = 0
s.divShift2 = 0 s.divShift2 = 0
s.baseMask = 0
} else { } else {
s.elemsize = uintptr(class_to_size[sizeclass]) s.elemsize = uintptr(class_to_size[sizeclass])
m := &class_to_divmagic[sizeclass] m := &class_to_divmagic[sizeclass]
s.divShift = m.shift s.divShift = m.shift
s.divMul = m.mul s.divMul = m.mul
s.divShift2 = m.shift2 s.divShift2 = m.shift2
s.baseMask = m.baseMask
} }
// update stats, sweep lists // update stats, sweep lists
......
...@@ -215,14 +215,24 @@ func roundupsize(size uintptr) uintptr { ...@@ -215,14 +215,24 @@ func roundupsize(size uintptr) uintptr {
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html // http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
type divMagic struct { type divMagic struct {
shift uint8 shift uint8
mul uint32 mul uint32
shift2 uint8 shift2 uint8
baseMask uintptr
} }
func computeDivMagic(d uint32) divMagic { func computeDivMagic(d uint32) divMagic {
var m divMagic var m divMagic
// If the size is a power of two, heapBitsForObject can divide even faster by masking.
// Compute this mask.
if d&(d-1) == 0 {
// It is a power of 2 (assuming dinptr != 1)
m.baseMask = ^(uintptr(d) - 1)
} else {
m.baseMask = 0
}
// Compute pre-shift by factoring power of 2 out of d. // Compute pre-shift by factoring power of 2 out of d.
for d&1 == 0 { for d&1 == 0 {
m.shift++ m.shift++
...@@ -239,5 +249,6 @@ func computeDivMagic(d uint32) divMagic { ...@@ -239,5 +249,6 @@ func computeDivMagic(d uint32) divMagic {
} }
m.mul = uint32(((1 << k) + d64 - 1) / d64) // ⌈2^k / d⌉ m.mul = uint32(((1 << k) + d64 - 1) / d64) // ⌈2^k / d⌉
m.shift2 = k m.shift2 = k
return m return m
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment