runtime/pprof: use new profile buffers for CPU profiling

This doesn't change the functionality of the current code, but it sets us up for exporting the profiling labels into the profile. The old code had a hash table of profile samples maintained during the signal handler, with evictions going into a log. The new code just logs every sample directly, leaving the hash-based deduplication to an ordinary goroutine. The new code also avoids storing the entire profile in two forms in memory, an unfortunate regression introduced when binary profile support was added. After this CL the entire profile is only stored once in memory. We'd still like to get back down to storing it zero times (streaming it to the underlying io.Writer). Change-Id: I0893a1788267c564aa1af17970d47377b2a43457 Reviewed-on: https://go-review.googlesource.com/36712 Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Matloob <matloob@golang.org>

runtime/pprof: use new profile buffers for CPU profiling
This doesn't change the functionality of the current code, but it sets us up for exporting the profiling labels into the profile. The old code had a hash table of profile samples maintained during the signal handler, with evictions going into a log. The new code just logs every sample directly, leaving the hash-based deduplication to an ordinary goroutine. The new code also avoids storing the entire profile in two forms in memory, an unfortunate regression introduced when binary profile support was added. After this CL the entire profile is only stored once in memory. We'd still like to get back down to storing it zero times (streaming it to the underlying io.Writer). Change-Id: I0893a1788267c564aa1af17970d47377b2a43457 Reviewed-on: https://go-review.googlesource.com/36712 Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Matloob <matloob@golang.org>
1a680a90 · Russ Cox · a1261b8b · 1a680a90 · 1a680a90 · 1a680a90
Commit 1a680a90 authored Feb 09, 2017 by Russ Cox
6 changed files
--- a/src/runtime/cpuprof.go
+++ b/src/runtime/cpuprof.go
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -83,6 +83,7 @@ import (
 	"sync"
 	"text/tabwriter"
 	"time"
+	"unsafe"
 )

 // BUG(rsc): Profiles are only as good as the kernel support used to generate them.
@@ -696,30 +697,33 @@ func StartCPUProfile(w io.Writer) error {
 	return nil
 }

+// readProfile, provided by the runtime, returns the next chunk of
+// binary CPU profiling stack trace data, blocking until data is available.
+// If profiling is turned off and all the profile data accumulated while it was
+// on has been returned, readProfile returns eof=true.
+// The caller must save the returned data and tags before calling readProfile again.
+func readProfile() (data []uint64, tags []unsafe.Pointer, eof bool)
+
 func profileWriter(w io.Writer) {
-	startTime := time.Now()
-	// This will buffer the entire profile into buf and then
-	// translate it into a profile.Profile structure. This will
-	// create two copies of all the data in the profile in memory.
-	// TODO(matloob): Convert each chunk of the proto output and
-	// stream it out instead of converting the entire profile.
-	var buf bytes.Buffer
+	b := newProfileBuilder()
+	var err error
 	for {
-		data := runtime.CPUProfile()
-		if data == nil {
+		time.Sleep(100 * time.Millisecond)
+		data, _, eof := readProfile()
+		if e := b.addCPUData(data); e != nil && err == nil {
+			err = e
+		}
+		if eof {
 			break
 		}
-		buf.Write(data)
 	}
-
-	profile, err := translateCPUProfile(buf.Bytes(), startTime)
+	p := b.build()
 	if err != nil {
 		// The runtime should never produce an invalid or truncated profile.
 		// It drops records that can't fit into its log buffers.
-		panic(fmt.Errorf("could not translate binary profile to proto format: %v", err))
+		panic("runtime/pprof: converting profile: " + err.Error())
 	}
-
-	profile.Write(w)
+	p.Write(w)
 	cpu.done <- true
 }


--- a/src/runtime/pprof/proto.go
+++ b/src/runtime/pprof/proto.go
@@ -15,50 +15,125 @@ import (
 	"internal/pprof/profile"
 )

-// translateCPUProfile parses binary CPU profiling stack trace data
-// generated by runtime.CPUProfile() into a profile struct.
-func translateCPUProfile(b []byte, startTime time.Time) (*profile.Profile, error) {
-	const wordSize = unsafe.Sizeof(uintptr(0))
-	const minRawProfile = 5 * wordSize // Need a minimum of 5 words.
-	if uintptr(len(b)) < minRawProfile {
-		return nil, fmt.Errorf("truncated profile")
-	}
-	n := int(uintptr(len(b)) / wordSize)
-	data := ((*[1 << 28]uintptr)(unsafe.Pointer(&b[0])))[:n:n]
-	period := data[3]
-	data = data[5:] // skip header
+// lostProfileEvent is the function to which lost profiling
+// events are attributed.
+// (The name shows up in the pprof graphs.)
+func lostProfileEvent() { lostProfileEvent() }
+
+// funcPC returns the PC for the func value f.
+func funcPC(f interface{}) uintptr {
+	return *(*[2]*uintptr)(unsafe.Pointer(&f))[1]
+}

-	// profile initialization taken from pprof tool
+// A profileBuilder builds a profile.Profile incrementally from a
+// stream of profile samples delivered by the runtime.
+// TODO(rsc,matloob): In the long term, we'd like to avoid
+// storing the entire profile.Profile in memory, instead streaming
+// the encoded form out to an underlying writer.
+// Even so, this one copy is a step forward from Go 1.8,
+// which had two full copies of the data in memory.
+type profileBuilder struct {
+	p          *profile.Profile
+	start      time.Time
+	havePeriod bool
+	locs       map[uint64]*profile.Location
+	samples    map[sampleKey]*profile.Sample
+}
+
+// A sampleKey is the key for the map from stack to profile.Sample.
+// It is an unbounded array of profile.Location, broken into
+// fixed-size chunks. The chunks are chained by the next field,
+// which is an interface{} holding a sampleKey so that the default
+// Go equality will consider the whole array contents.
+// (In contrast, if next were *sampleKey or the interface{} held a
+// *sampleKey, equality would only look at the pointer, not the values
+// in the next sampleKey in the chain.)
+// This is a bit of a hack, but it has the right effect and is expedient.
+// At some point we will want to do a better job, so that lookups
+// of large stacks need not allocate just to build a key.
+type sampleKey struct {
+	loc  [8]*profile.Location
+	i    int
+	next interface{}
+}
+
+// newProfileBuilder returns a new profileBuilder.
+// CPU profiling data obtained from the runtime can be added
+// by calling b.addCPUData, and then the eventual profile
+// can be obtained by calling b.build.
+func newProfileBuilder() *profileBuilder {
+	start := time.Now()
 	p := &profile.Profile{
-		Period:     int64(period) * 1000,
 		PeriodType: &profile.ValueType{Type: "cpu", Unit: "nanoseconds"},
 		SampleType: []*profile.ValueType{
 			{Type: "samples", Unit: "count"},
 			{Type: "cpu", Unit: "nanoseconds"},
 		},
-		TimeNanos:     int64(startTime.UnixNano()),
-		DurationNanos: time.Since(startTime).Nanoseconds(),
+		TimeNanos: int64(start.UnixNano()),
+	}
+	return &profileBuilder{
+		p:       p,
+		start:   start,
+		locs:    make(map[uint64]*profile.Location),
+		samples: make(map[sampleKey]*profile.Sample),
 	}
+}
+
+// addCPUData adds the CPU profiling data to the profile.
+// The data must be a whole number of records,
+// as delivered by the runtime.
+func (b *profileBuilder) addCPUData(data []uint64) error {
+	p := b.p
+	if !b.havePeriod {
+		// first record is period
+		if len(data) < 3 {
+			return fmt.Errorf("truncated profile")
+		}
+		if data[0] != 3 || data[2] == 0 {
+			return fmt.Errorf("malformed profile")
+		}
+		period := int64(data[2])
+		p.Period = period * 1000
+		data = data[3:]
+		b.havePeriod = true
+	}
+
 	// Parse CPU samples from the profile.
-	locs := make(map[uint64]*profile.Location)
-	for len(b) > 0 {
-		if len(data) < 2 || uintptr(len(data)) < 2+data[1] {
-			return nil, fmt.Errorf("truncated profile")
+	// Each sample is 3+n uint64s:
+	//	data[0] = 3+n
+	//	data[1] = time stamp (ignored)
+	//	data[2] = count
+	//	data[3:3+n] = stack
+	// If the count is 0 and the stack has length 1,
+	// that's an overflow record inserted by the runtime
+	// to indicate that stack[0] samples were lost.
+	// Otherwise the count is usually 1,
+	// but in a few special cases like lost non-Go samples
+	// there can be larger counts.
+	// Because many samples with the same stack arrive,
+	// we want to deduplicate immediately, which we do
+	// using the b.samples map.
+	for len(data) > 0 {
+		if len(data) < 3 || data[0] > uint64(len(data)) {
+			return fmt.Errorf("truncated profile")
 		}
-		count := data[0]
-		nstk := data[1]
-		if uintptr(len(data)) < 2+nstk {
-			return nil, fmt.Errorf("truncated profile")
+		if data[0] < 3 {
+			return fmt.Errorf("malformed profile")
 		}
-		stk := data[2 : 2+nstk]
-		data = data[2+nstk:]
+		count := data[2]
+		stk := data[3:data[0]]
+		data = data[data[0]:]

-		if count == 0 && nstk == 1 && stk[0] == 0 {
-			// end of data marker
-			break
+		if count == 0 && len(stk) == 1 {
+			// overflow record
+			count = uint64(stk[0])
+			stk = []uint64{
+				uint64(funcPC(lostProfileEvent)),
+			}
 		}

 		sloc := make([]*profile.Location, len(stk))
+		skey := sampleKey{}
 		for i, addr := range stk {
 			addr := uint64(addr)
 			// Addresses from stack traces point to the next instruction after
@@ -67,40 +142,57 @@ func translateCPUProfile(b []byte, startTime time.Time) (*profile.Profile, error
 			if i > 0 {
 				addr--
 			}
-			loc := locs[addr]
+			loc := b.locs[addr]
 			if loc == nil {
 				loc = &profile.Location{
 					ID:      uint64(len(p.Location) + 1),
 					Address: addr,
 				}
-				locs[addr] = loc
+				b.locs[addr] = loc
 				p.Location = append(p.Location, loc)
 			}
 			sloc[i] = loc
+			if skey.i == len(skey.loc) {
+				skey = sampleKey{next: skey}
+			}
+			skey.loc[skey.i] = loc
+			skey.i++
 		}
-		p.Sample = append(p.Sample, &profile.Sample{
-			Value:    []int64{int64(count), int64(count) * int64(p.Period)},
-			Location: sloc,
-		})
+		s := b.samples[skey]
+		if s == nil {
+			s = &profile.Sample{
+				Value:    []int64{0, 0},
+				Location: sloc,
+			}
+			b.samples[skey] = s
+			p.Sample = append(p.Sample, s)
+		}
+		s.Value[0] += int64(count)
+		s.Value[1] += int64(count) * int64(p.Period)
 	}
+	return nil
+}

+// build completes and returns the constructed profile.
+func (b *profileBuilder) build() *profile.Profile {
+	b.p.DurationNanos = time.Since(b.start).Nanoseconds()
 	if runtime.GOOS == "linux" {
-		if err := addMappings(p); err != nil {
-			return nil, err
-		}
+		addMappings(b.p)
 	}
-	symbolize(p)
-	return p, nil
+	symbolize(b.p)
+	return b.p
 }

-func addMappings(p *profile.Profile) error {
+// addMappings adds information from /proc/self/maps
+// to the profile if possible.
+func addMappings(p *profile.Profile) {
 	// Parse memory map from /proc/self/maps
 	f, err := os.Open("/proc/self/maps")
 	if err != nil {
-		return err
+		return
 	}
-	defer f.Close()
-	return p.ParseMemoryMap(f)
+	p.ParseMemoryMap(f)
+	f.Close()
 }

 type function interface {

--- a/src/runtime/pprof/proto_test.go
+++ b/src/runtime/pprof/proto_test.go
@@ -6,80 +6,50 @@ package pprof

 import (
 	"bytes"
-	"fmt"
+	"encoding/json"
 	"internal/pprof/profile"
 	"io/ioutil"
 	"reflect"
 	"runtime"
 	"testing"
-	"time"
-	"unsafe"
 )

-// Helper function to initialize empty cpu profile with sampling period provided.
-func createEmptyProfileWithPeriod(t *testing.T, periodMs uint64) bytes.Buffer {
-	// Mock the sample header produced by cpu profiler. Write a sample
-	// period of 2000 microseconds, followed by no samples.
-	buf := new(bytes.Buffer)
-	// Profile header is as follows:
-	// The first, third and fifth words are 0. The second word is 3.
-	// The fourth word is the period.
-	// EOD marker:
-	// The sixth word -- count is initialized to 0 above.
-	// The code below sets the seventh word -- nstk to 1
-	// The eighth word -- addr is initialized to 0 above.
-	words := []int{0, 3, 0, int(periodMs), 0, 0, 1, 0}
-	n := int(unsafe.Sizeof(0)) * len(words)
-	data := ((*[1 << 29]byte)(unsafe.Pointer(&words[0])))[:n:n]
-	if _, err := buf.Write(data); err != nil {
-		t.Fatalf("createEmptyProfileWithPeriod failed: %v", err)
-	}
-	return *buf
+// translateCPUProfile parses binary CPU profiling stack trace data,
+// in the []uint64 record format accepted by addCPUData, into a profile struct.
+// This is only used for testing. Real conversions stream the
+// data into the profileBuilder as it becomes available.
+func translateCPUProfile(data []uint64) (*profile.Profile, error) {
+	b := newProfileBuilder()
+	if err := b.addCPUData(data); err != nil {
+		return nil, err
+	}
+	return b.build(), nil
 }

-// Helper function to initialize cpu profile with two sample values.
-func createProfileWithTwoSamples(t *testing.T, periodMs uintptr, count1 uintptr, count2 uintptr,
-	address1 uintptr, address2 uintptr) bytes.Buffer {
-	// Mock the sample header produced by cpu profiler. Write a sample
-	// period of 2000 microseconds, followed by no samples.
-	buf := new(bytes.Buffer)
-	words := []uintptr{0, 3, 0, uintptr(periodMs), 0, uintptr(count1), 2,
-		uintptr(address1), uintptr(address1 + 2),
-		uintptr(count2), 2, uintptr(address2), uintptr(address2 + 2),
-		0, 1, 0}
-	for _, n := range words {
-		var err error
-		switch unsafe.Sizeof(int(0)) {
-		case 8:
-			_, err = buf.Write((*[8]byte)(unsafe.Pointer(&n))[:8:8])
-		case 4:
-			_, err = buf.Write((*[4]byte)(unsafe.Pointer(&n))[:4:4])
-		}
-		if err != nil {
-			t.Fatalf("createProfileWithTwoSamples failed: %v", err)
-		}
-	}
-	return *buf
+// fmtJSON returns a pretty-printed JSON form for x.
+// It works reasonably well for printing protocol-buffer
+// data structures like profile.Profile.
+func fmtJSON(x interface{}) string {
+	js, _ := json.MarshalIndent(x, "", "\t")
+	return string(js)
 }

-// Tests translateCPUProfile parses correct sampling period in an otherwise empty cpu profile.
-func TestTranlateCPUProfileSamplingPeriod(t *testing.T) {
+func TestConvertCPUProfileEmpty(t *testing.T) {
 	// A test server with mock cpu profile data.
 	var buf bytes.Buffer

-	startTime := time.Now()
-	b := createEmptyProfileWithPeriod(t, 2000)
-	p, err := translateCPUProfile(b.Bytes(), startTime)
+	b := []uint64{3, 0, 2000} // empty profile with 2000 microsecond sample period
+	p, err := translateCPUProfile(b)
 	if err != nil {
-		t.Fatalf("translate failed: %v", err)
+		t.Fatalf("translateCPUProfile: %v", err)
 	}
 	if err := p.Write(&buf); err != nil {
-		t.Fatalf("write failed: %v", err)
+		t.Fatalf("writing profile: %v", err)
 	}

 	p, err = profile.Parse(&buf)
 	if err != nil {
-		t.Fatalf("Could not parse Profile profile: %v", err)
+		t.Fatalf("profile.Parse: %v", err)
 	}

 	// Expected PeriodType and SampleType.
@@ -94,79 +64,89 @@ func TestTranlateCPUProfileSamplingPeriod(t *testing.T) {
 	}
 }

-func getSampleAsString(sample []*profile.Sample) string {
-	var str string
-	for _, x := range sample {
-		for _, y := range x.Location {
-			if y.Mapping != nil {
-				str += fmt.Sprintf("Mapping:%v\n", *y.Mapping)
-			}
-			str += fmt.Sprintf("Location:%v\n", y)
+func f1() { f1() }
+func f2() { f2() }
+
+// testPCs returns two PCs and two corresponding memory mappings
+// to use in test profiles.
+func testPCs(t *testing.T) (addr1, addr2 uint64, map1, map2 *profile.Mapping) {
+	if runtime.GOOS == "linux" {
+		// Figure out two addresses from /proc/self/maps.
+		mmap, err := ioutil.ReadFile("/proc/self/maps")
+		if err != nil {
+			t.Fatal(err)
 		}
-		str += fmt.Sprintf("Sample:%v\n", *x)
-	}
-	return str
+		mprof := &profile.Profile{}
+		if err = mprof.ParseMemoryMap(bytes.NewReader(mmap)); err != nil {
+			t.Fatalf("parsing /proc/self/maps: %v", err)
+		}
+		if len(mprof.Mapping) < 2 {
+			// It is possible for a binary to only have 1 executable
+			// region of memory.
+			t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
+		}
+		addr1 = mprof.Mapping[0].Start
+		map1 = mprof.Mapping[0]
+		addr2 = mprof.Mapping[1].Start
+		map2 = mprof.Mapping[1]
+	} else {
+		addr1 = uint64(funcPC(f1))
+		addr2 = uint64(funcPC(f2))
+	}
+	return
 }

-// Tests translateCPUProfile parses a cpu profile with sample values present.
-func TestTranslateCPUProfileWithSamples(t *testing.T) {
-	if runtime.GOOS != "linux" {
-		t.Skip("test requires a system with /proc/self/maps")
-	}
-	// Figure out two addresses from /proc/self/maps.
-	mmap, err := ioutil.ReadFile("/proc/self/maps")
-	if err != nil {
-		t.Fatal("Cannot read /proc/self/maps")
-	}
-	rd := bytes.NewReader(mmap)
-	mprof := &profile.Profile{}
-	if err = mprof.ParseMemoryMap(rd); err != nil {
-		t.Fatalf("Cannot parse /proc/self/maps")
-	}
-	if len(mprof.Mapping) < 2 {
-		// It is possible for a binary to only have 1 executable
-		// region of memory.
-		t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
-	}
-	address1 := mprof.Mapping[0].Start
-	address2 := mprof.Mapping[1].Start
-	// A test server with mock cpu profile data.
-
-	startTime := time.Now()
-	b := createProfileWithTwoSamples(t, 2000, 20, 40, uintptr(address1), uintptr(address2))
-	p, err := translateCPUProfile(b.Bytes(), startTime)
+func TestConvertCPUProfile(t *testing.T) {
+	addr1, addr2, map1, map2 := testPCs(t)

+	b := []uint64{
+		3, 0, 2000, // period = 2000 microseconds
+		5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
+		5, 0, 40, uint64(addr2), uint64(addr2 + 2), // 40 samples in addr2
+		5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
+	}
+	p, err := translateCPUProfile(b)
 	if err != nil {
-		t.Fatalf("Could not parse Profile profile: %v", err)
+		t.Fatalf("translating profile: %v", err)
 	}
-	// Expected PeriodType, SampleType and Sample.
-	expectedPeriodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
-	expectedSampleType := []*profile.ValueType{
+	period := int64(2000 * 1000)
+	periodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
+	sampleType := []*profile.ValueType{
 		{Type: "samples", Unit: "count"},
 		{Type: "cpu", Unit: "nanoseconds"},
 	}
-	expectedSample := []*profile.Sample{
+	samples := []*profile.Sample{
 		{Value: []int64{20, 20 * 2000 * 1000}, Location: []*profile.Location{
-			{ID: 1, Mapping: mprof.Mapping[0], Address: address1},
-			{ID: 2, Mapping: mprof.Mapping[0], Address: address1 + 1},
+			{ID: 1, Mapping: map1, Address: addr1},
+			{ID: 2, Mapping: map1, Address: addr1 + 1},
 		}},
 		{Value: []int64{40, 40 * 2000 * 1000}, Location: []*profile.Location{
-			{ID: 3, Mapping: mprof.Mapping[1], Address: address2},
-			{ID: 4, Mapping: mprof.Mapping[1], Address: address2 + 1},
+			{ID: 3, Mapping: map2, Address: addr2},
+			{ID: 4, Mapping: map2, Address: addr2 + 1},
 		}},
 	}
-	if p.Period != 2000*1000 {
-		t.Fatalf("Sampling periods do not match")
+	checkProfile(t, p, period, periodType, sampleType, samples)
+}
+
+func checkProfile(t *testing.T, p *profile.Profile, period int64, periodType *profile.ValueType, sampleType []*profile.ValueType, samples []*profile.Sample) {
+	if p.Period != period {
+		t.Fatalf("p.Period = %d, want %d", p.Period, period)
+	}
+	if !reflect.DeepEqual(p.PeriodType, periodType) {
+		t.Fatalf("p.PeriodType = %v\nwant = %v", fmtJSON(p.PeriodType), fmtJSON(periodType))
 	}
-	if !reflect.DeepEqual(p.PeriodType, expectedPeriodType) {
-		t.Fatalf("Period types do not match")
+	if !reflect.DeepEqual(p.SampleType, sampleType) {
+		t.Fatalf("p.SampleType = %v\nwant = %v", fmtJSON(p.SampleType), fmtJSON(sampleType))
 	}
-	if !reflect.DeepEqual(p.SampleType, expectedSampleType) {
-		t.Fatalf("Sample types do not match")
+	// Clear line info since it is not in the expected samples.
+	// If we used f1 and f2 above, then the samples will have line info.
+	for _, s := range p.Sample {
+		for _, l := range s.Location {
+			l.Line = nil
+		}
 	}
-	if !reflect.DeepEqual(p.Sample, expectedSample) {
-		t.Fatalf("Samples do not match: Expected: %v, Got:%v", getSampleAsString(expectedSample),
-			getSampleAsString(p.Sample))
+	if !reflect.DeepEqual(p.Sample, samples) {
+		t.Fatalf("p.Sample = %v\nwant = %v", fmtJSON(p.Sample), fmtJSON(samples))
 	}
 }

@@ -179,7 +159,7 @@ type fakeFunc struct {
 func (f *fakeFunc) Name() string {
 	return f.name
 }
-func (f *fakeFunc) FileLine(_ uintptr) (string, int) {
+func (f *fakeFunc) FileLine(uintptr) (string, int) {
 	return f.file, f.lineno
 }


--- a/src/runtime/pprof/protomem_test.go
+++ b/src/runtime/pprof/protomem_test.go
@@ -7,98 +7,54 @@ package pprof
 import (
 	"bytes"
 	"internal/pprof/profile"
-	"io/ioutil"
-	"reflect"
 	"runtime"
 	"testing"
 	"time"
 )

-// TestSampledHeapAllocProfile tests encoding of a memory profile from
-// runtime.MemProfileRecord data.
-func TestSampledHeapAllocProfile(t *testing.T) {
-	if runtime.GOOS != "linux" {
-		t.Skip("Test requires a system with /proc/self/maps")
-	}
-
-	// Figure out two addresses from /proc/self/maps.
-	mmap, err := ioutil.ReadFile("/proc/self/maps")
-	if err != nil {
-		t.Fatal("Cannot read /proc/self/maps")
-	}
-	rd := bytes.NewReader(mmap)
-	mprof := &profile.Profile{}
-	if err = mprof.ParseMemoryMap(rd); err != nil {
-		t.Fatalf("Cannot parse /proc/self/maps")
-	}
-	if len(mprof.Mapping) < 2 {
-		// It is possible for a binary to only have 1 executable
-		// region of memory.
-		t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
-	}
-	address1 := mprof.Mapping[0].Start
-	address2 := mprof.Mapping[1].Start
+func TestConvertMemProfile(t *testing.T) {
+	addr1, addr2, map1, map2 := testPCs(t)

 	var buf bytes.Buffer
+	a1, a2 := uintptr(addr1), uintptr(addr2)
+	rate := int64(512 * 1024)
+	rec := []runtime.MemProfileRecord{
+		{AllocBytes: 4096, FreeBytes: 1024, AllocObjects: 4, FreeObjects: 1, Stack0: [32]uintptr{a1, a2}},
+		{AllocBytes: 512 * 1024, FreeBytes: 0, AllocObjects: 1, FreeObjects: 0, Stack0: [32]uintptr{a2 + 1, a2 + 2}},
+		{AllocBytes: 512 * 1024, FreeBytes: 512 * 1024, AllocObjects: 1, FreeObjects: 1, Stack0: [32]uintptr{a1 + 1, a1 + 2, a2 + 3}},
+	}

-	rec, rate := testMemRecords(address1, address2)
 	p := encodeMemProfile(rec, rate, time.Now())
 	if err := p.Write(&buf); err != nil {
-		t.Fatalf("Failed to write profile: %v", err)
+		t.Fatalf("writing profile: %v", err)
 	}

-	p, err = profile.Parse(&buf)
+	p, err := profile.Parse(&buf)
 	if err != nil {
-		t.Fatalf("Could not parse Profile profile: %v", err)
+		t.Fatalf("profile.Parse: %v", err)
 	}

-	// Expected PeriodType, SampleType and Sample.
-	expectedPeriodType := &profile.ValueType{Type: "space", Unit: "bytes"}
-	expectedSampleType := []*profile.ValueType{
+	periodType := &profile.ValueType{Type: "space", Unit: "bytes"}
+	sampleType := []*profile.ValueType{
 		{Type: "alloc_objects", Unit: "count"},
 		{Type: "alloc_space", Unit: "bytes"},
 		{Type: "inuse_objects", Unit: "count"},
 		{Type: "inuse_space", Unit: "bytes"},
 	}
-	// Expected samples, with values unsampled according to the profiling rate.
-	expectedSample := []*profile.Sample{
+	samples := []*profile.Sample{
 		{Value: []int64{2050, 2099200, 1537, 1574400}, Location: []*profile.Location{
-			{ID: 1, Mapping: mprof.Mapping[0], Address: address1},
-			{ID: 2, Mapping: mprof.Mapping[1], Address: address2},
+			{ID: 1, Mapping: map1, Address: addr1},
+			{ID: 2, Mapping: map2, Address: addr2},
 		}},
 		{Value: []int64{1, 829411, 1, 829411}, Location: []*profile.Location{
-			{ID: 3, Mapping: mprof.Mapping[1], Address: address2 + 1},
-			{ID: 4, Mapping: mprof.Mapping[1], Address: address2 + 2},
+			{ID: 3, Mapping: map2, Address: addr2 + 1},
+			{ID: 4, Mapping: map2, Address: addr2 + 2},
 		}},
 		{Value: []int64{1, 829411, 0, 0}, Location: []*profile.Location{
-			{ID: 5, Mapping: mprof.Mapping[0], Address: address1 + 1},
-			{ID: 6, Mapping: mprof.Mapping[0], Address: address1 + 2},
-			{ID: 7, Mapping: mprof.Mapping[1], Address: address2 + 3},
+			{ID: 5, Mapping: map1, Address: addr1 + 1},
+			{ID: 6, Mapping: map1, Address: addr1 + 2},
+			{ID: 7, Mapping: map2, Address: addr2 + 3},
 		}},
 	}
-
-	if p.Period != 512*1024 {
-		t.Fatalf("Sampling periods do not match")
-	}
-	if !reflect.DeepEqual(p.PeriodType, expectedPeriodType) {
-		t.Fatalf("Period types do not match")
-	}
-	if !reflect.DeepEqual(p.SampleType, expectedSampleType) {
-		t.Fatalf("Sample types do not match")
-	}
-	if !reflect.DeepEqual(p.Sample, expectedSample) {
-		t.Fatalf("Samples do not match: Expected: %v, Got:%v", getSampleAsString(expectedSample),
-			getSampleAsString(p.Sample))
-	}
-}
-
-func testMemRecords(a1, a2 uint64) ([]runtime.MemProfileRecord, int64) {
-	addr1, addr2 := uintptr(a1), uintptr(a2)
-	rate := int64(512 * 1024)
-	rec := []runtime.MemProfileRecord{
-		{AllocBytes: 4096, FreeBytes: 1024, AllocObjects: 4, FreeObjects: 1, Stack0: [32]uintptr{addr1, addr2}},
-		{AllocBytes: 512 * 1024, FreeBytes: 0, AllocObjects: 1, FreeObjects: 0, Stack0: [32]uintptr{addr2 + 1, addr2 + 2}},
-		{AllocBytes: 512 * 1024, FreeBytes: 512 * 1024, AllocObjects: 1, FreeObjects: 1, Stack0: [32]uintptr{addr1 + 1, addr1 + 2, addr2 + 3}},
-	}
-	return rec, rate
+	checkProfile(t, p, rate, periodType, sampleType, samples)
 }
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -3137,13 +3137,14 @@ func mcount() int32 {
 }

 var prof struct {
-	lock uint32
-	hz   int32
+	signalLock uint32
+	hz         int32
 }

-func _System()       { _System() }
-func _ExternalCode() { _ExternalCode() }
-func _GC()           { _GC() }
+func _System()           { _System() }
+func _ExternalCode()     { _ExternalCode() }
+func _LostExternalCode() { _LostExternalCode() }
+func _GC()               { _GC() }

 // Called if we receive a SIGPROF signal.
 // Called by the signal handler, may run during STW.
@@ -3279,14 +3280,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
 	}

 	if prof.hz != 0 {
-		// Simple cas-lock to coordinate with setcpuprofilerate.
-		for !atomic.Cas(&prof.lock, 0, 1) {
-			osyield()
-		}
-		if prof.hz != 0 {
-			cpuprof.add(stk[:n])
-		}
-		atomic.Store(&prof.lock, 0)
+		cpuprof.add(gp, stk[:n])
 	}
 	getg().m.mallocing--
 }
@@ -3309,15 +3303,7 @@ func sigprofNonGo() {
 		for n < len(sigprofCallers) && sigprofCallers[n] != 0 {
 			n++
 		}
-
-		// Simple cas-lock to coordinate with setcpuprofilerate.
-		for !atomic.Cas(&prof.lock, 0, 1) {
-			osyield()
-		}
-		if prof.hz != 0 {
-			cpuprof.addNonGo(sigprofCallers[:n])
-		}
-		atomic.Store(&prof.lock, 0)
+		cpuprof.addNonGo(sigprofCallers[:n])
 	}

 	atomic.Store(&sigprofCallersUse, 0)
@@ -3330,19 +3316,11 @@ func sigprofNonGo() {
 //go:nowritebarrierrec
 func sigprofNonGoPC(pc uintptr) {
 	if prof.hz != 0 {
-		pc := []uintptr{
+		stk := []uintptr{
 			pc,
 			funcPC(_ExternalCode) + sys.PCQuantum,
 		}
-
-		// Simple cas-lock to coordinate with setcpuprofilerate.
-		for !atomic.Cas(&prof.lock, 0, 1) {
-			osyield()
-		}
-		if prof.hz != 0 {
-			cpuprof.addNonGo(pc)
-		}
-		atomic.Store(&prof.lock, 0)
+		cpuprof.addNonGo(stk)
 	}
 }

@@ -3370,8 +3348,9 @@ func setsSP(pc uintptr) bool {
 	return false
 }

-// Arrange to call fn with a traceback hz times a second.
-func setcpuprofilerate_m(hz int32) {
+// setcpuprofilerate sets the CPU profiling rate to hz times per second.
+// If hz <= 0, setcpuprofilerate turns off CPU profiling.
+func setcpuprofilerate(hz int32) {
 	// Force sane arguments.
 	if hz < 0 {
 		hz = 0
@@ -3387,14 +3366,14 @@ func setcpuprofilerate_m(hz int32) {
 	// it would deadlock.
 	setThreadCPUProfiler(0)

-	for !atomic.Cas(&prof.lock, 0, 1) {
+	for !atomic.Cas(&prof.signalLock, 0, 1) {
 		osyield()
 	}
 	if prof.hz != hz {
 		setProcessCPUProfiler(hz)
 		prof.hz = hz
 	}
-	atomic.Store(&prof.lock, 0)
+	atomic.Store(&prof.signalLock, 0)

 	lock(&sched.lock)
 	sched.profilehz = hz