Commit aac7f1a0 authored by Dmitriy Vyukov

runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043
parent c5b2c370
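Note: the table above comes from the runtime's allocation microbenchmarks. As a minimal standalone sketch of how numbers like these are produced (invented names, not the runtime's own benchmark file; run with "go test -bench=."):

package mallocbench_test

import "testing"

type obj8 struct{ a, b int32 }   // 8 bytes, no pointers (no-scan path)
type obj16 struct{ p, q *int32 } // 16 bytes, pointer fields (type-info path)

var sink8 *obj8
var sink16 *obj16

// Analogous to BenchmarkMalloc8: small allocation with no pointer bits.
func BenchmarkAlloc8(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sink8 = new(obj8) // store to a global so the allocation is not optimized away
	}
}

// Analogous to BenchmarkMallocTypeInfo16: the allocator must also write
// the type's pointer bitmap into the GC bitmap.
func BenchmarkAlloc16Ptr(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sink16 = new(obj16)
	}
}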
@@ -9,6 +9,8 @@ import (
 )
 
 const (
+	debugMalloc = false
+
 	flagNoScan      = 1 << 0 // GC doesn't have to scan object
 	flagNoProfiling = 1 << 1 // must not profile
 	flagNoZero      = 1 << 2 // don't zero memory
 
@@ -29,6 +31,22 @@ const (
 	pageShift = 13
 	pageSize  = 1 << pageShift
 	pageMask  = pageSize - 1
+
+	wordsPerBitmapWord = ptrSize * 8 / 4
+	gcBits             = 4
+	bitsPerPointer     = 2
+	bitsMask           = 1<<bitsPerPointer - 1
+	pointersPerByte    = 8 / bitsPerPointer
+	bitPtrMask         = bitsMask << 2
+	maxGCMask          = 0 // disabled because wastes several bytes of memory
+	bitsDead           = 0
+	bitsPointer        = 2
+
+	bitMiddle    = 0
+	bitBoundary  = 1
+	bitAllocated = 2
+	bitMarked    = 3
+
+	bitMask = bitMiddle | bitBoundary | bitAllocated | bitMarked
 )
 
 // All zero-sized allocations return a pointer to this byte.
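Note: these constants describe the pre-1.4 heap bitmap: gcBits (4) bits of metadata per heap word, packed into pointer-sized bitmap words laid out downward from arena_start. A minimal sketch of the addressing arithmetic used throughout this change, assuming a 64-bit platform (the helper name is invented):

package main

import "fmt"

const (
	ptrSize            = 8 // assumes a 64-bit platform
	gcBits             = 4
	wordsPerBitmapWord = ptrSize * 8 / gcBits // 16 heap words per bitmap word
)

// bitmapWordAndShift (invented name) mirrors the arithmetic in gomallocgc:
// the bitmap lives just below arena_start and grows downward, with gcBits
// bits describing each heap word. It returns the address of the bitmap word
// covering p and the bit offset of p's 4-bit quadruple within that word.
func bitmapWordAndShift(p, arenaStart uintptr) (word, shift uintptr) {
	off := (p - arenaStart) / ptrSize // word index of p within the arena
	word = arenaStart - off/wordsPerBitmapWord*ptrSize - ptrSize
	shift = (off % wordsPerBitmapWord) * gcBits
	return
}

func main() {
	const arenaStart = 0x1000000
	for _, p := range []uintptr{arenaStart, arenaStart + 8, arenaStart + 16*8} {
		w, s := bitmapWordAndShift(p, arenaStart)
		fmt.Printf("p=%#x -> bitmap word at %#x, shift %d\n", p, w, s)
	}
}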
@@ -168,14 +186,112 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 		size = uintptr(s.elemsize)
 	}
 
-	// TODO: write markallocated in Go
-	mp.ptrarg[0] = x
-	mp.scalararg[0] = uint(size)
-	mp.scalararg[1] = uint(size0)
-	mp.ptrarg[1] = unsafe.Pointer(typ)
-	mp.scalararg[2] = uint(flags & flagNoScan)
-	onM(&markallocated_m)
+	// From here till marked label marking the object as allocated
+	// and storing type info in the GC bitmap.
+	arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
+	off := (uintptr(x) - arena_start) / ptrSize
+	xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+	shift := (off % wordsPerBitmapWord) * gcBits
+	if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary {
+		gothrow("bad bits in markallocated")
+	}
+
+	var ti, te uintptr
+	var ptrmask *uint8
+	if flags&flagNoScan != 0 {
+		// bitsDead in the first quadruple means don't scan.
+		if size == ptrSize {
+			*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
+		} else {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = bitAllocated + (bitsDead << 2)
+		}
+		goto marked
+	}
+	if size == ptrSize {
+		// It's one word and it has pointers, it must be a pointer.
+		*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
+		goto marked
+	}
+	if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
+		if typ.kind&kindGCProg != 0 {
+			nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+			masksize := nptr
+			if masksize%2 != 0 {
+				masksize *= 2 // repeated
+			}
+			masksize = masksize * pointersPerByte / 8 // 4 bits per word
+			masksize++                                // unroll flag in the beginning
+			if masksize > maxGCMask && typ.gc[1] != 0 {
+				// If the mask is too large, unroll the program directly
+				// into the GC bitmap. It's 7 times slower than copying
+				// from the pre-unrolled mask, but saves 1/16 of type size
+				// memory for the mask.
+				mp.ptrarg[0] = x
+				mp.ptrarg[1] = unsafe.Pointer(typ)
+				mp.scalararg[0] = uint(size)
+				mp.scalararg[1] = uint(size0)
+				onM(&unrollgcproginplace_m)
+				goto marked
+			}
+			ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+			// Check whether the program is already unrolled.
+			if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+				mp.ptrarg[0] = unsafe.Pointer(typ)
+				onM(&unrollgcprog_m)
+			}
+			ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+		} else {
+			ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
+		}
+		if size == 2*ptrSize {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = *ptrmask | bitAllocated
+			goto marked
+		}
+		te = uintptr(typ.size) / ptrSize
+		// If the type occupies odd number of words, its mask is repeated.
+		if te%2 == 0 {
+			te /= 2
+		}
+	}
+	if size == 2*ptrSize {
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
+		goto marked
+	}
+	// Copy pointer bitmask into the bitmap.
+	for i := uintptr(0); i < size0; i += 2 * ptrSize {
+		v := uint8((bitsPointer << 2) | (bitsPointer << 6))
+		if ptrmask != nil {
+			v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
+			ti++
+			if ti == te {
+				ti = 0
+			}
+		}
+		if i == 0 {
+			v |= bitAllocated
+		}
+		if i+ptrSize == size0 {
+			v &= ^uint8(bitPtrMask << 4)
+		}
+		off := (uintptr(x) + i - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = v
+	}
+	if size0%(2*ptrSize) == 0 && size0 < size {
+		// Mark the word after last object's word as bitsDead.
+		off := (uintptr(x) + size0 - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = bitsDead << 2
+	}
+marked:
 	mp.mallocing = 0
 
 	if raceenabled {
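Note: each bitmap byte describes two heap words: allocation-state bits in bits 0-1, the first word's type bits in bits 2-3, and the second word's type bits in bits 6-7. A self-contained sketch of the encoding the copy loop above performs (encodePair is an invented name):

package main

import "fmt"

const (
	bitAllocated = 2 // allocation state, stored in the low bits of a quadruple
	bitsDead     = 0 // type bits: no pointers, GC may stop scanning here
	bitsPointer  = 2 // type bits: this word is a pointer
)

// encodePair (invented name) builds one bitmap byte the way the copy loop in
// gomallocgc does: type bits for the first word in bits 2-3, type bits for
// the second word in bits 6-7, allocation state in the low bits.
func encodePair(bits0, bits1 uint8, firstByte bool) uint8 {
	v := bits0<<2 | bits1<<6
	if firstByte {
		v |= bitAllocated // only the object's first quadruple carries the state
	}
	return v
}

func main() {
	// 16-byte object, both words pointers: prints 0b10001010.
	fmt.Printf("%#010b\n", encodePair(bitsPointer, bitsPointer, true))
	// 16-byte no-scan object, bitsDead in the first quadruple: 0b00000010.
	fmt.Printf("%#010b\n", encodePair(bitsDead, bitsDead, true))
}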
...
@@ -517,7 +517,6 @@ void*	runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat);
 int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
 void	runtime·gc(int32 force);
 uintptr	runtime·sweepone(void);
-void	runtime·markallocated(void *v, uintptr size, uintptr size0, Type* typ, bool scan);
 void	runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
 void	runtime·unmarkspan(void *v, uintptr size);
 void	runtime·purgecachedstats(MCache*);
...
@@ -1741,7 +1741,7 @@ runtime·wakefing(void)
 	return res;
 }
 
-// Recursively GC program in prog.
+// Recursively unrolls GC program in prog.
 // mask is where to store the result.
 // ppos is a pointer to position in mask, in bits.
 // sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
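Note: the GC program encoding itself is not shown in this diff. Purely as a toy model of what "unrolling" means here, expanding a compact description into a flat per-word mask (format and names invented; bitsScalar assumed to be 1):

package main

import "fmt"

// run is one instruction of the toy program: repeat a 2-bit type value
// count times. (The real encoding also supports nested repeats for arrays.)
type run struct {
	bits  byte // 2-bit type descriptor for one word
	count int  // how many consecutive words it covers
}

// unroll expands the program into a flat per-word mask, which is the shape
// of what unrollgcprog produces into typ->gc[0].
func unroll(prog []run) []byte {
	var mask []byte
	for _, r := range prog {
		for i := 0; i < r.count; i++ {
			mask = append(mask, r.bits)
		}
	}
	return mask
}

func main() {
	const bitsScalar, bitsPointer = 1, 2
	// One scalar word followed by three pointer words:
	fmt.Println(unroll([]run{{bitsScalar, 1}, {bitsPointer, 3}})) // [1 2 2 2]
}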
@@ -1833,11 +1833,20 @@ unrollglobgcprog(byte *prog, uintptr size)
 	return mask;
 }
 
-static void
-unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
+void
+runtime·unrollgcproginplace_m(void)
 {
-	uintptr *b, off, shift, pos;
+	uintptr size, size0, *b, off, shift, pos;
 	byte *arena_start, *prog;
+	Type *typ;
+	void *v;
+
+	v = g->m->ptrarg[0];
+	typ = g->m->ptrarg[1];
+	size = g->m->scalararg[0];
+	size0 = g->m->scalararg[1];
+	g->m->ptrarg[0] = nil;
+	g->m->ptrarg[1] = nil;
 
 	pos = 0;
 	prog = (byte*)typ->gc[1];
@@ -1859,14 +1868,18 @@ unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
 }
 
 // Unrolls GC program in typ->gc[1] into typ->gc[0]
-static void
-unrollgcprog(Type *typ)
+void
+runtime·unrollgcprog_m(void)
 {
 	static Lock lock;
+	Type *typ;
 	byte *mask, *prog;
 	uintptr pos;
 	uint32 x;
 
+	typ = g->m->ptrarg[0];
+	g->m->ptrarg[0] = nil;
+
 	runtime·lock(&lock);
 	mask = (byte*)typ->gc[0];
 	if(mask[0] == 0) {
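Note: both functions above now use the m.ptrarg/m.scalararg convention: the Go caller stores arguments into fixed slots on the current M, switches to the g0 stack via onM, and the C-side _m function loads the slots and nils the pointer slots so the M does not retain stale references. A toy Go model of that hand-off (runtime details elided, names invented):

package main

import "fmt"

// m stands in for the runtime's M; only the argument slots are modeled.
type m struct {
	ptrarg    [2]interface{}
	scalararg [2]uintptr
}

// The Go-side caller: stash arguments, then run the worker that would
// normally execute on the g0 stack (the onM stack switch is elided).
func unrollGCProgInPlace(mp *m, v, typ interface{}, size, size0 uintptr) {
	mp.ptrarg[0] = v
	mp.ptrarg[1] = typ
	mp.scalararg[0] = size
	mp.scalararg[1] = size0
	unrollGCProgInPlace_m(mp) // stands in for onM(&unrollgcproginplace_m)
}

// The C-side worker: load the slots, then nil the pointer slots so the
// long-lived M does not keep the arguments reachable.
func unrollGCProgInPlace_m(mp *m) {
	v, typ := mp.ptrarg[0], mp.ptrarg[1]
	size, size0 := mp.scalararg[0], mp.scalararg[1]
	mp.ptrarg[0], mp.ptrarg[1] = nil, nil
	fmt.Println("unroll", v, typ, size, size0)
}

func main() {
	var mp m
	unrollGCProgInPlace(&mp, "objAddr", "typeDesc", 32, 24)
}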
@@ -1887,110 +1900,6 @@ unrollgcprog(Type *typ)
 	runtime·unlock(&lock);
 }
 
-void
-runtime·markallocated(void *v, uintptr size, uintptr size0, Type *typ, bool scan)
-{
-	uintptr *b, off, shift, i, ti, te, nptr, masksize;
-	byte *arena_start, x;
-	bool *ptrmask;
-
-	arena_start = runtime·mheap.arena_start;
-	off = (uintptr*)v - (uintptr*)arena_start;
-	b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-	shift = (off % wordsPerBitmapWord) * gcBits;
-	if(Debug && (((*b)>>shift)&bitMask) != bitBoundary) {
-		runtime·printf("runtime: bad bits in markallocated (%p) b=%p[%p]\n", v, b, *b);
-		runtime·throw("bad bits in markallocated");
-	}
-
-	if(!scan) {
-		// BitsDead in the first quadruple means don't scan.
-		if(size == PtrSize)
-			*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated+(BitsDead<<2))<<shift);
-		else
-			((byte*)b)[shift/8] = bitAllocated+(BitsDead<<2);
-		return;
-	}
-	if(size == PtrSize) {
-		// It's one word and it has pointers, it must be a pointer.
-		*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated | (BitsPointer<<2))<<shift);
-		return;
-	}
-	ti = te = 0;
-	ptrmask = nil;
-	if(typ != nil && (typ->gc[0]|typ->gc[1]) != 0 && typ->size > PtrSize) {
-		if(typ->kind&KindGCProg) {
-			nptr = ROUND(typ->size, PtrSize)/PtrSize;
-			masksize = nptr;
-			if(masksize%2)
-				masksize *= 2;	// repeated twice
-			masksize = masksize*PointersPerByte/8;	// 4 bits per word
-			masksize++;	// unroll flag in the beginning
-			if(masksize > MaxGCMask && typ->gc[1] != 0) {
-				// If the mask is too large, unroll the program directly
-				// into the GC bitmap. It's 7 times slower than copying
-				// from the pre-unrolled mask, but saves 1/16 of type size
-				// memory for the mask.
-				unrollgcproginplace(v, size, size0, typ);
-				return;
-			}
-			ptrmask = (byte*)typ->gc[0];
-			// check whether the program is already unrolled
-			if((runtime·atomicload((uint32*)ptrmask)&0xff) == 0)
-				unrollgcprog(typ);
-			ptrmask++;	// skip the unroll flag byte
-		} else
-			ptrmask = (byte*)&typ->gc[0];	// embed mask
-		if(size == 2*PtrSize) {
-			((byte*)b)[shift/8] = ptrmask[0] | bitAllocated;
-			return;
-		}
-		te = typ->size/PtrSize;
-		// if the type occupies odd number of words, its mask is repeated twice
-		if((te%2) == 0)
-			te /= 2;
-	}
-	if(size == 2*PtrSize) {
-		((byte*)b)[shift/8] = (BitsPointer<<2) | (BitsPointer<<6) | bitAllocated;
-		return;
-	}
-	// Copy pointer bitmask into the bitmap.
-	for(i=0; i<size0; i+=2*PtrSize) {
-		x = (BitsPointer<<2) | (BitsPointer<<6);
-		if(ptrmask != nil) {
-			x = ptrmask[ti++];
-			if(ti == te)
-				ti = 0;
-		}
-		off = (uintptr*)((byte*)v + i) - (uintptr*)arena_start;
-		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		if(i == 0)
-			x |= bitAllocated;
-		if(i+PtrSize == size0)
-			x &= ~(bitPtrMask<<4);
-		((byte*)b)[shift/8] = x;
-	}
-	if(size0 == i && size0 < size) {
-		// mark the word after last object's word as BitsDead
-		off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		((byte*)b)[shift/8] = 0;
-	}
-}
-
-void
-runtime·markallocated_m(void)
-{
-	M *mp;
-
-	mp = g->m;
-	runtime·markallocated(mp->ptrarg[0], mp->scalararg[0], mp->scalararg[1], mp->ptrarg[1], mp->scalararg[2] == 0);
-	mp->ptrarg[0] = nil;
-	mp->ptrarg[1] = nil;
-}
-
 // mark the span of memory at v as having n blocks of the given size.
 // if leftover is true, there is left over space at the end of the span.
 void
...
@@ -65,7 +65,9 @@ var (
 	mprofMalloc_m,
 	gc_m,
 	setFinalizer_m,
-	markallocated_m mFunction
+	markallocated_m,
+	unrollgcprog_m,
+	unrollgcproginplace_m mFunction
 )
 
 // memclr clears n bytes starting at ptr.
 
@@ -87,7 +89,12 @@ const (
 	concurrentSweep = true
 )
 
-// in asm_*.s
+// Atomic operations to read/write a pointer.
+// in stubs.goc
+func goatomicloadp(p unsafe.Pointer) unsafe.Pointer      // return *p
+func goatomicstorep(p unsafe.Pointer, v unsafe.Pointer)  // *p = v
+
+// in stubs.goc
 // if *p == x { *p = y; return true } else { return false }, atomically
 //go:noescape
 func gocas(p *uint32, x uint32, y uint32) bool
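Note: goatomicloadp is what lets gomallocgc test the mask's unroll flag without switching stacks in the common case. A sketch of the same publish-then-check idiom written against sync/atomic instead of the runtime-internal helpers (type and field names invented):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// gcMask (invented) models a lazily unrolled mask: flag stays zero until the
// bits are filled in and published, mirroring the unroll-flag byte that
// gomallocgc reads with goatomicloadp.
type gcMask struct {
	flag uint32 // 0 = not yet unrolled; set atomically once bits are valid
	bits []byte
	mu   sync.Mutex
}

func (m *gcMask) ensureUnrolled() []byte {
	if atomic.LoadUint32(&m.flag) == 0 { // cheap fast-path check
		m.mu.Lock() // slow path: one goroutine performs the unroll
		if m.flag == 0 {
			m.bits = []byte{1, 2, 2}       // pretend we interpreted the GC program
			atomic.StoreUint32(&m.flag, 1) // publish
		}
		m.mu.Unlock()
	}
	return m.bits
}

func main() {
	var m gcMask
	fmt.Println(m.ensureUnrolled())
	fmt.Println(m.ensureUnrolled()) // fast path: sees the published flag
}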
...
@@ -48,6 +48,16 @@ func gonanotime() (r int64) {
 	r = runtime·nanotime();
 }
 
+#pragma textflag NOSPLIT
+func goatomicloadp(p **byte) (v *byte) {
+	v = runtime·atomicloadp(p);
+}
+
+#pragma textflag NOSPLIT
+func goatomicstorep(p **byte, v *byte) {
+	runtime·atomicstorep(p, v);
+}
+
 #pragma textflag NOSPLIT
 func runtime·gocas(p *uint32, x uint32, y uint32) (ret bool) {
 	ret = runtime·cas(p, x, y);
...