Commit 1fa70294 authored by Dmitriy Vyukov

runtime: combine small NoScan allocations

Combine NoScan allocations < 16 bytes into a single memory block.
Reduces number of allocations on json/garbage benchmarks by 10+%.

json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047
parent f8e0057b
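
Before the diff, a minimal Go sketch (editorial illustration, not part of this change; the names box and ptrs are invented) of the allocation pattern the tiny allocator targets: small, pointer-free objects that escape to the heap.

package main

import "fmt"

// box forces its argument to escape to the heap. The resulting 4-byte,
// pointer-free allocation is the kind of request the tiny allocator
// can pack, several at a time, into one 16-byte block.
func box(x int32) *int32 {
	return &x
}

func main() {
	var ptrs []*int32
	for i := int32(0); i < 8; i++ {
		ptrs = append(ptrs, box(i))
	}
	fmt.Println(*ptrs[0], *ptrs[7]) // 0 7
}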
......@@ -5,6 +5,8 @@
// +build darwin dragonfly freebsd linux netbsd openbsd solaris windows
#include "runtime.h"
#include "arch_GOARCH.h"
#include "malloc.h"
Slice syscall·envs;
......@@ -44,15 +46,24 @@ void
syscall·setenv_c(String k, String v)
{
byte *arg[2];
uintptr len;
if(_cgo_setenv == nil)
return;
arg[0] = runtime·malloc(k.len + 1);
// Objects that are explicitly freed must be at least 16 bytes in size,
// so that they are not allocated using tiny alloc.
len = k.len + 1;
if(len < TinySize)
len = TinySize;
arg[0] = runtime·malloc(len);
runtime·memmove(arg[0], k.str, k.len);
arg[0][k.len] = 0;
arg[1] = runtime·malloc(v.len + 1);
len = v.len + 1;
if(len < TinySize)
len = TinySize;
arg[1] = runtime·malloc(len);
runtime·memmove(arg[1], v.str, v.len);
arg[1][v.len] = 0;
......
......@@ -26,6 +26,8 @@ extern MStats mstats; // defined in zruntime_def_$GOOS_$GOARCH.go
extern volatile intgo runtime·MemProfileRate;
static void* largealloc(uint32, uintptr*);
// Allocate an object of at least size bytes.
// Small objects are allocated from the per-thread cache's free lists.
// Large objects (> 32 kB) are allocated straight from the heap.
......@@ -34,12 +36,13 @@ void*
runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
{
int32 sizeclass;
uintptr tinysize, size1;
intgo rate;
MCache *c;
MCacheList *l;
uintptr npages;
MSpan *s;
MLink *v;
byte *tiny;
P *p;
if(size == 0) {
// All 0-length allocations use this pointer.
......@@ -59,6 +62,79 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
c = m->mcache;
if(!runtime·debug.efence && size <= MaxSmallSize) {
if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
// Tiny allocator.
//
// The tiny allocator combines several tiny allocation requests
// into a single memory block. The resulting memory block
// is freed when all subobjects are unreachable. The subobjects
// must be FlagNoScan (have no pointers); this ensures that
// the amount of potentially wasted memory is bounded.
//
// The size of the memory block used for combining (TinySize) is tunable.
// The current setting is 16 bytes, which gives a 2x worst-case memory
// wastage (when all but one of the subobjects are unreachable).
// 8 bytes would result in no wastage at all, but provides fewer
// opportunities for combining.
// 32 bytes provides more opportunities for combining,
// but can lead to 4x worst-case wastage.
// The best-case win is 8x regardless of block size.
//
// Objects obtained from the tiny allocator must not be freed explicitly,
// so when an object will be freed explicitly, we ensure that
// its size is >= TinySize.
//
// SetFinalizer has a special case for objects potentially coming
// from the tiny allocator; in that case it allows setting finalizers
// for an inner byte of a memory block.
//
// The main targets of the tiny allocator are small strings and
// standalone escaping variables. On a json benchmark
// the allocator reduces the number of allocations by ~12% and
// reduces the heap size by ~20%.
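//
// For example, with a 16-byte block: if sixteen 1-byte objects share one
// block, the heap holds 16 bytes instead of sixteen 8-byte allocations
// (the smallest size class), the 8x best case; if only one 1-byte
// subobject remains reachable, the whole 16-byte block is retained where
// an 8-byte allocation would have sufficed, the 2x worst case.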
p = m->p;
tinysize = p->tinysize;
if(size <= tinysize) {
tiny = p->tiny;
// Align tiny pointer for required (conservative) alignment.
if((size&7) == 0)
tiny = (byte*)ROUND((uintptr)tiny, 8);
else if((size&3) == 0)
tiny = (byte*)ROUND((uintptr)tiny, 4);
else if((size&1) == 0)
tiny = (byte*)ROUND((uintptr)tiny, 2);
size1 = size + (tiny - p->tiny);
if(size1 <= tinysize) {
// The object fits into existing tiny block.
v = (MLink*)tiny;
p->tiny += size1;
p->tinysize -= size1;
m->mallocing = 0;
m->locks--;
if(m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack
g->stackguard0 = StackPreempt;
return v;
}
}
// Allocate a new TinySize block.
l = &c->list[TinySizeClass];
if(l->list == nil)
runtime·MCache_Refill(c, TinySizeClass);
v = l->list;
l->list = v->next;
l->nlist--;
((uint64*)v)[0] = 0;
((uint64*)v)[1] = 0;
// See if we need to replace the existing tiny block with the new one
// based on the amount of remaining free space.
if(TinySize-size > tinysize) {
p->tiny = (byte*)v + size;
p->tinysize = TinySize - size;
}
size = TinySize;
goto done;
}
// Allocate from mcache free lists.
// Inlined version of SizeToClass().
if(size <= 1024-8)
......@@ -78,23 +154,11 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0)
runtime·memclr((byte*)v, size);
}
done:
c->local_cachealloc += size;
} else {
// TODO(rsc): Report tracebacks for very large allocations.
// Allocate directly from heap.
npages = size >> PageShift;
if((size & PageMask) != 0)
npages++;
s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
if(s == nil)
runtime·throw("out of memory");
s->limit = (byte*)(s->start<<PageShift) + size;
size = npages<<PageShift;
v = (void*)(s->start << PageShift);
// setup for mark sweep
runtime·markspan(v, 0, 0, true);
v = largealloc(flag, &size);
}
if(flag & FlagNoGC)
......@@ -151,6 +215,29 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
return v;
}
static void*
largealloc(uint32 flag, uintptr *sizep)
{
uintptr npages, size;
MSpan *s;
void *v;
// Allocate directly from heap.
size = *sizep;
npages = size >> PageShift;
if((size & PageMask) != 0)
npages++;
s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
if(s == nil)
runtime·throw("out of memory");
s->limit = (byte*)(s->start<<PageShift) + size;
*sizep = npages<<PageShift;
v = (void*)(s->start << PageShift);
// setup for mark sweep
runtime·markspan(v, 0, 0, true);
return v;
}
void*
runtime·malloc(uintptr size)
{
......@@ -182,6 +269,10 @@ runtime·free(void *v)
}
size = s->elemsize;
sizeclass = s->sizeclass;
// Objects that are smaller than TinySize can be allocated using tiny alloc;
// if such an object were combined with an object that has a finalizer, freeing it would crash.
if(size < TinySize)
runtime·throw("freeing too small block");
if(raceenabled)
runtime·racefree(v);
......@@ -347,6 +438,9 @@ runtime·mallocinit(void)
runtime·InitSizes();
if(runtime·class_to_size[TinySizeClass] != TinySize)
runtime·throw("bad TinySizeClass");
// limit = runtime·memlimit();
// See https://code.google.com/p/go/issues/detail?id=5049
// TODO(rsc): Fix after 1.1.
......@@ -450,7 +544,7 @@ runtime·mallocinit(void)
m->mcache = runtime·allocmcache();
// See if it works.
runtime·free(runtime·malloc(1));
runtime·free(runtime·malloc(TinySize));
}
void*
......@@ -760,12 +854,17 @@ func SetFinalizer(obj Eface, finalizer Eface) {
goto throw;
}
ot = (PtrType*)obj.type;
if(ot->elem != nil && ot->elem->size == 0) {
// As an implementation detail we do not run finalizers for zero-sized objects,
// because we use &runtime·zerobase for all such allocations.
if(ot->elem != nil && ot->elem->size == 0)
return;
}
if(!runtime·mlookup(obj.data, &base, &size, nil) || obj.data != base) {
runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
goto throw;
// As an implementation detail we allow setting finalizers for an inner byte
// of an object if it could come from tiny alloc (see mallocgc for details).
if(ot->elem == nil || (ot->elem->kind&KindNoPointers) == 0 || ot->elem->size >= TinySize) {
runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
goto throw;
}
}
if(finalizer.type != nil) {
if(finalizer.type->kind != KindFunc)
......
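
To make the control flow of the tiny path above easier to follow, here is a small, self-contained Go model of the per-P tiny cache bookkeeping (alignment, fit check, block replacement). It is an editorial sketch, not the runtime implementation: names such as tinyCache are invented, real blocks come from the TinySizeClass free list, and the garbage collector, not this code, reclaims them.

package main

import "fmt"

const tinySize = 16

// tinyCache models the per-P fields p->tiny / p->tinysize.
type tinyCache struct {
	block []byte // current combining block (nil if none)
	avail int    // free bytes remaining at the end of block
}

// align rounds off up to the conservative alignment mallocgc uses for size.
func align(off, size int) int {
	switch {
	case size&7 == 0:
		return (off + 7) &^ 7
	case size&3 == 0:
		return (off + 3) &^ 3
	case size&1 == 0:
		return (off + 1) &^ 1
	}
	return off
}

// alloc returns size bytes, packing the request into the current block when
// it fits and otherwise starting a fresh 16-byte block, keeping whichever
// block has more free space left (as the runtime does).
func (c *tinyCache) alloc(size int) []byte {
	if size <= 0 || size >= tinySize {
		panic("tinyCache.alloc: not a tiny request")
	}
	if c.block != nil && size <= c.avail {
		off := align(tinySize-c.avail, size)
		if off+size <= tinySize {
			c.avail = tinySize - (off + size)
			return c.block[off : off+size]
		}
	}
	nb := make([]byte, tinySize)
	if tinySize-size > c.avail {
		c.block, c.avail = nb, tinySize-size
	}
	return nb[:size]
}

func main() {
	var c tinyCache
	a := c.alloc(3) // starts a fresh block, bytes 0..2
	b := c.alloc(4) // rounded up to offset 4, bytes 4..7
	d := c.alloc(8) // rounded up to offset 8, bytes 8..15
	fmt.Println(len(a), len(b), len(d), c.avail) // 3 4 8 0
}

The real allocator additionally zeroes the fresh 16-byte block and accounts the whole TinySize against the cache before jumping to done, as shown in the hunk above.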
......@@ -108,6 +108,10 @@ enum
// Tunable constants.
MaxSmallSize = 32<<10,
// Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc.
TinySize = 16,
TinySizeClass = 2,
FixAllocChunk = 16<<10, // Chunk size for FixAlloc
MaxMHeapList = 1<<(20 - PageShift), // Maximum page length for fixed-size list in MHeap.
HeapAllocChunk = 1<<20, // Chunk size for heap growth
......
......@@ -84,8 +84,11 @@ clearpools(void)
}
pools.head = nil;
// clear defer pools
for(pp=runtime·allp; p=*pp; pp++) {
// clear tinyalloc pool
p->tiny = nil;
p->tinysize = 0;
// clear defer pools
for(i=0; i<nelem(p->deferpool); i++)
p->deferpool[i] = nil;
}
......@@ -1202,6 +1205,7 @@ markroot(ParFor *desc, uint32 i)
MSpan **allspans, *s;
uint32 spanidx;
G *gp;
void *p;
USED(&desc);
wbuf = getempty(nil);
......@@ -1241,7 +1245,9 @@ markroot(ParFor *desc, uint32 i)
// don't mark finalized object, but scan it so we
// retain everything it points to.
spf = (SpecialFinalizer*)sp;
enqueue1(&wbuf, (Obj){(void*)((s->start << PageShift) + spf->offset), s->elemsize, 0});
// A finalizer can be set for an inner byte of an object; find the object's beginning.
p = (void*)((s->start << PageShift) + spf->offset/s->elemsize*s->elemsize);
enqueue1(&wbuf, (Obj){p, s->elemsize, 0});
enqueue1(&wbuf, (Obj){(void*)&spf->fn, PtrSize, 0});
enqueue1(&wbuf, (Obj){(void*)&spf->fint, PtrSize, 0});
enqueue1(&wbuf, (Obj){(void*)&spf->ot, PtrSize, 0});
......@@ -1663,12 +1669,16 @@ sweepspan(ParFor *desc, uint32 idx)
specialp = &s->specials;
special = *specialp;
while(special != nil) {
p = (byte*)(s->start << PageShift) + special->offset;
// A finalizer can be set for an inner byte of an object; find the object's beginning.
p = (byte*)(s->start << PageShift) + special->offset/size*size;
off = (uintptr*)p - (uintptr*)arena_start;
bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
shift = off % wordsPerBitmapWord;
bits = *bitp>>shift;
if((bits & (bitAllocated|bitMarked)) == bitAllocated) {
// Find the exact byte for which the special was set up
// (as opposed to the object's beginning).
p = (byte*)(s->start << PageShift) + special->offset;
// about to free object: splice out special record
y = special;
special = special->next;
......
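
The offset/elemsize*elemsize expression used in both hunks above rounds an interior offset down to the start of the containing object. A tiny standalone sketch (the function name objectBase is invented for illustration):

package main

import "fmt"

// objectBase rounds a byte offset within a span down to the beginning of the
// elemsize-sized object containing it, mirroring what markroot and sweepspan
// do for finalizer specials that point at an inner byte of a tiny block.
func objectBase(offset, elemsize uintptr) uintptr {
	return offset / elemsize * elemsize
}

func main() {
	// A finalizer registered at byte 36 of a span of 16-byte objects
	// belongs to the object that starts at byte 32.
	fmt.Println(objectBase(36, 16)) // 32
}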
......@@ -605,6 +605,8 @@ removespecial(void *p, byte kind)
runtime·lock(&span->specialLock);
t = &span->specials;
while((s = *t) != nil) {
// This function is used for finalizers only, so we don't check for
// "interior" specials (p must be exactly equal to s->offset).
if(offset == s->offset && kind == s->kind) {
*t = s->next;
runtime·unlock(&span->specialLock);
......@@ -713,9 +715,9 @@ runtime·freeallspecials(MSpan *span, void *p, uintptr size)
runtime·lock(&span->specialLock);
t = &span->specials;
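// Specials are kept sorted by offset; since a finalizer special may now
// refer to an inner byte of the object, free every special whose offset
// falls in [offset, offset+size).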
while((s = *t) != nil) {
if(offset < s->offset)
if(offset + size <= s->offset)
break;
if(offset == s->offset) {
if(offset <= s->offset) {
*t = s->next;
s->next = list;
list = s;
......
......@@ -385,6 +385,11 @@ struct P
MCache* mcache;
Defer* deferpool[5]; // pool of available Defer structs of different sizes (see panic.c)
// Allocator cache for tiny objects w/o pointers.
// See "Tiny allocator" comment in malloc.goc.
byte* tiny;
uintptr tinysize;
// Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
uint64 goidcache;
uint64 goidcacheend;
......
......@@ -73,8 +73,8 @@ func TestPoolGC(t *testing.T) {
var fin uint32
const N = 100
for i := 0; i < N; i++ {
v := new(int)
runtime.SetFinalizer(v, func(vv *int) {
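// A *string contains a pointer, so the allocation is scannable and is not
// combined by the tiny allocator; its finalizer does not depend on the
// lifetime of unrelated neighboring objects.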
v := new(string)
runtime.SetFinalizer(v, func(vv *string) {
atomic.AddUint32(&fin, 1)
})
p.Put(v)
......
......@@ -34,17 +34,17 @@ func main() {
for i := 0; i < N; i++ {
go func() {
defer wg.Done()
v := new(int)
v := new(string)
f := func() {
if *v != 0 {
if *v != "" {
panic("oops")
}
}
if *v != 0 {
if *v != "" {
// let the compiler think f escapes
sink = f
}
runtime.SetFinalizer(v, func(p *int) {
runtime.SetFinalizer(v, func(p *string) {
atomic.AddInt32(&count, -1)
})
defer f()
......
......@@ -30,7 +30,7 @@ func G() {
func main() {
nf := testing.AllocsPerRun(100, F)
ng := testing.AllocsPerRun(100, G)
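// Allocations below TinySize may be combined, so a run can average fewer
// than one counted allocation; only more than one is treated as a failure.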
if int(nf) != 1 {
if int(nf) > 1 {
fmt.Printf("AllocsPerRun(100, F) = %v, want 1\n", nf)
os.Exit(1)
}
......
......@@ -26,11 +26,11 @@ func F() {
func main() {
nf := testing.AllocsPerRun(100, F)
ng := testing.AllocsPerRun(100, G)
if int(nf) != 1 {
if int(nf) > 1 {
fmt.Printf("AllocsPerRun(100, F) = %v, want 1\n", nf)
os.Exit(1)
}
if int(ng) != 1 {
if int(ng) > 1 {
fmt.Printf("AllocsPerRun(100, G) = %v, want 1\n", ng)
os.Exit(1)
}
......
// run
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test finalizers work for tiny (combined) allocations.
package main
import (
"runtime"
"sync/atomic"
"time"
)
func main() {
// Does not work on 32-bits due to partially conservative GC.
// Try to enable when we have fully precise GC.
if runtime.GOARCH != "amd64" {
return
}
// Likewise for gccgo.
if runtime.Compiler == "gccgo" {
return
}
N := int32(100)
count := N
done := make([]bool, N)
for i := int32(0); i < N; i++ {
x := i // subject to tiny alloc
// the closure must be big enough to be combined
runtime.SetFinalizer(&x, func(p *int32) {
// Check that p points to the correct subobject of the tiny allocation.
// It's a bit tricky, because we can't capture another variable
// with the expected value (it would be combined as well).
if *p < 0 || *p >= N {
println("got", *p)
panic("corrupted")
}
if done[*p] {
println("got", *p)
panic("already finalized")
}
done[*p] = true
atomic.AddInt32(&count, -1)
})
}
for i := 0; i < 4; i++ {
runtime.GC()
time.Sleep(10 * time.Millisecond)
}
// Some of the finalizers may not be executed if the outermost allocations
// are combined with something persistent.
// Currently four int32s are combined into a 16-byte block;
// ensure that most of them are finalized.
if count >= N/4 {
println(count, "out of", N, "finalizers are not called")
panic("not all finalizers are called")
}
}