Commit 5af0b28a authored by Josh Bleecher Snyder

runtime: iterate over set bits in adjustpointers

There are several things combined in this change.

First, eliminate the gobitvector type in favor
of adding a ptrbit method to bitvector.
In non-performance-critical code, use that method.
In performance-critical code, though, load the bitvector data
one byte at a time and iterate only over set bits.
To support that, add and use sys.Ctz8.

name                old time/op  new time/op  delta
StackCopyPtr-8      81.8ms ± 5%  78.9ms ± 3%   -3.58%  (p=0.000 n=97+96)
StackCopy-8         65.9ms ± 3%  62.8ms ± 3%   -4.67%  (p=0.000 n=96+92)
StackCopyNoCache-8   105ms ± 3%   102ms ± 3%   -3.38%  (p=0.000 n=96+95)

Change-Id: I00b80f45612708bd440b1a411a57fa6dfa24aa74
Reviewed-on: https://go-review.googlesource.com/109716
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
parent 13cd0061
...@@ -2980,6 +2980,8 @@ func init() { ...@@ -2980,6 +2980,8 @@ func init() {
alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas", p4...) alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas", p4...)
alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas64", p8...) alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas64", p8...)
alias("runtime/internal/sys", "Ctz8", "math/bits", "TrailingZeros8", all...)
/******** math ********/ /******** math ********/
addF("math", "Sqrt", addF("math", "Sqrt",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
......
...@@ -233,9 +233,8 @@ type childInfo struct { ...@@ -233,9 +233,8 @@ type childInfo struct {
// dump kinds & offsets of interesting fields in bv // dump kinds & offsets of interesting fields in bv
func dumpbv(cbv *bitvector, offset uintptr) { func dumpbv(cbv *bitvector, offset uintptr) {
bv := gobv(*cbv) for i := uintptr(0); i < uintptr(cbv.n); i++ {
for i := uintptr(0); i < bv.n; i++ { if cbv.ptrbit(i) == 1 {
if bv.bytedata[i/8]>>(i%8)&1 == 1 {
dumpint(fieldKindPtr) dumpint(fieldKindPtr)
dumpint(uint64(offset + i*sys.PtrSize)) dumpint(uint64(offset + i*sys.PtrSize))
} }
......
...@@ -50,6 +50,30 @@ func Ctz32(x uint32) int { ...@@ -50,6 +50,30 @@ func Ctz32(x uint32) int {
return i + z return i + z
} }
// Ctz8 reports how many consecutive zero bits sit at the low end of x.
// A zero input has no set bit at all, so the answer is 8 (the full width).
func Ctz8(x uint8) int {
	count := ntz8tab[x]
	return int(count)
}

// ntz8tab maps every possible byte value to its trailing-zero count,
// which lets Ctz8 answer with a single table load.
// Entry 0 holds 8; every odd index holds 0.
var ntz8tab = [256]uint8{
	0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
}
// Bswap64 returns its input with byte order reversed // Bswap64 returns its input with byte order reversed
// 0x0102030405060708 -> 0x0807060504030201 // 0x0102030405060708 -> 0x0807060504030201
func Bswap64(x uint64) uint64 { func Bswap64(x uint64) uint64 {
......
...@@ -34,6 +34,14 @@ TEXT runtime∕internal∕sys·Ctz32(SB), NOSPLIT, $0-8 ...@@ -34,6 +34,14 @@ TEXT runtime∕internal∕sys·Ctz32(SB), NOSPLIT, $0-8
MOVL AX, ret+4(FP) MOVL AX, ret+4(FP)
RET RET
// func Ctz8(x uint8) int
// Returns the index of the lowest set bit of x, or 8 when x is zero.
// Frame layout ($0-8): no locals; x is at 0(FP) and the result at 4(FP).
TEXT runtime∕internal∕sys·Ctz8(SB), NOSPLIT, $0-8
MOVBLZX x+0(FP), AX // load the byte argument, zero-extended to 32 bits
BSFL AX, AX // bit-scan forward: index of the lowest set bit; ZF is set when the source is zero
JNZ 2(PC) // nonzero input: skip the fallback on the next line
MOVL $8, AX // zero input: BSF leaves its destination undefined, so force the result to 8
MOVL AX, ret+4(FP) // store the result
RET
TEXT runtime∕internal∕sys·Bswap64(SB), NOSPLIT, $0-16 TEXT runtime∕internal∕sys·Bswap64(SB), NOSPLIT, $0-16
MOVL x_lo+0(FP), AX MOVL x_lo+0(FP), AX
MOVL x_hi+4(FP), BX MOVL x_hi+4(FP), BX
......
...@@ -8,5 +8,6 @@ package sys ...@@ -8,5 +8,6 @@ package sys
func Ctz64(x uint64) int func Ctz64(x uint64) int
func Ctz32(x uint32) int func Ctz32(x uint32) int
// Ctz8 is declared without a body: the implementation is supplied outside Go source.
func Ctz8(x uint8) int
func Bswap64(x uint64) uint64 func Bswap64(x uint64) uint64
func Bswap32(x uint32) uint32 func Bswap32(x uint32) uint32
...@@ -2021,9 +2021,8 @@ func getgcmask(ep interface{}) (mask []byte) { ...@@ -2021,9 +2021,8 @@ func getgcmask(ep interface{}) (mask []byte) {
n := (*ptrtype)(unsafe.Pointer(t)).elem.size n := (*ptrtype)(unsafe.Pointer(t)).elem.size
mask = make([]byte, n/sys.PtrSize) mask = make([]byte, n/sys.PtrSize)
for i := uintptr(0); i < n; i += sys.PtrSize { for i := uintptr(0); i < n; i += sys.PtrSize {
bitmap := bv.bytedata
off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1 mask[i/sys.PtrSize] = bv.ptrbit(off)
} }
} }
return return
......
...@@ -544,44 +544,39 @@ type bitvector struct { ...@@ -544,44 +544,39 @@ type bitvector struct {
bytedata *uint8 bytedata *uint8
} }
type gobitvector struct { // ptrbit returns the i'th bit in bv.
n uintptr // ptrbit is less efficient than iterating directly over bitvector bits,
bytedata []uint8 // and should only be used in non-performance-critical code.
} // See adjustpointers for an example of a high-efficiency walk of a bitvector.
func (bv *bitvector) ptrbit(i uintptr) uint8 {
func gobv(bv bitvector) gobitvector { b := *(addb(bv.bytedata, i/8))
return gobitvector{ return (b >> (i % 8)) & 1
uintptr(bv.n),
(*[1 << 30]byte)(unsafe.Pointer(bv.bytedata))[:(bv.n+7)/8],
}
}
func ptrbit(bv *gobitvector, i uintptr) uint8 {
return (bv.bytedata[i/8] >> (i % 8)) & 1
} }
// bv describes the memory starting at address scanp. // bv describes the memory starting at address scanp.
// Adjust any pointers contained therein. // Adjust any pointers contained therein.
func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f funcInfo) { func adjustpointers(scanp unsafe.Pointer, bv *bitvector, adjinfo *adjustinfo, f funcInfo) {
bv := gobv(*cbv)
minp := adjinfo.old.lo minp := adjinfo.old.lo
maxp := adjinfo.old.hi maxp := adjinfo.old.hi
delta := adjinfo.delta delta := adjinfo.delta
num := bv.n num := uintptr(bv.n)
// If this frame might contain channel receive slots, use CAS // If this frame might contain channel receive slots, use CAS
// to adjust pointers. If the slot hasn't been received into // to adjust pointers. If the slot hasn't been received into
// yet, it may contain stack pointers and a concurrent send // yet, it may contain stack pointers and a concurrent send
// could race with adjusting those pointers. (The sent value // could race with adjusting those pointers. (The sent value
// itself can never contain stack pointers.) // itself can never contain stack pointers.)
useCAS := uintptr(scanp) < adjinfo.sghi useCAS := uintptr(scanp) < adjinfo.sghi
for i := uintptr(0); i < num; i++ { for i := uintptr(0); i < num; i += 8 {
if stackDebug >= 4 { if stackDebug >= 4 {
print(" ", add(scanp, i*sys.PtrSize), ":", ptrnames[ptrbit(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*sys.PtrSize))), " # ", i, " ", bv.bytedata[i/8], "\n") for j := uintptr(0); j < 8; j++ {
print(" ", add(scanp, (i+j)*sys.PtrSize), ":", ptrnames[bv.ptrbit(i+j)], ":", hex(*(*uintptr)(add(scanp, (i+j)*sys.PtrSize))), " # ", i, " ", *addb(bv.bytedata, i/8), "\n")
} }
if ptrbit(&bv, i) != 1 {
continue
} }
pp := (*uintptr)(add(scanp, i*sys.PtrSize)) b := *(addb(bv.bytedata, i/8))
for b != 0 {
j := uintptr(sys.Ctz8(b))
b &= b - 1
pp := (*uintptr)(add(scanp, (i+j)*sys.PtrSize))
retry: retry:
p := *pp p := *pp
if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 { if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 {
...@@ -605,6 +600,7 @@ func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f ...@@ -605,6 +600,7 @@ func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f
} }
} }
} }
}
} }
// Note: the argument/return area is adjusted by the callee. // Note: the argument/return area is adjusted by the callee.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment