runtime: scheduler, cgo reorganization

* Change use of m->g0 stack (aka scheduler stack). * Provide runtime.mcall(f) to invoke f() on m->g0 stack. * Replace scheduler loop entry with runtime.mcall(schedule). Runtime.mcall eliminates the need for fake scheduler states that exist just to run a bit of code on the m->g0 stack (Grecovery, Gstackalloc). The elimination of the scheduler as a loop that stops and starts using gosave and gogo fixes a bad interaction with the way cgo uses the m->g0 stack. Cgo runs external (gcc-compiled) C functions on that stack, and then when calling back into Go, it sets m->g0->sched.sp below the added call frames, so that other uses of m->g0's stack will not interfere with those frames. Unfortunately, gogo (longjmp) back to the scheduler loop at this point would end up running scheduler with the lower sp, which no longer points at a valid stack frame for a call to scheduler. If scheduler then wrote any function call arguments or local variables to where it expected the stack frame to be, it would overwrite other data on the stack. I realized this possibility while debugging a problem with calling complex Go code in a Go -> C -> Go cgo callback. This wasn't the bug I was looking for, it turns out, but I believe it is a real bug nonetheless. Switching to runtime.mcall, which only adds new frames to the stack and never jumps into functions running in existing ones, fixes this bug. * Move cgo-related code out of proc.c into cgocall.c. * Add very large comment describing cgo call sequences. * Simplify, regularize cgo function implementations and names. * Add test suite as misc/cgo/test. Now the Go -> C path calls cgocall, which calls asmcgocall, and the C -> Go path calls cgocallback, which calls cgocallbackg. The shuffling, which affects mainly the callback case, moves most of the callback implementation to cgocallback running on the m->curg stack (not the m->g0 scheduler stack) and only while accounted for with $GOMAXPROCS (between calls to exitsyscall and entersyscall). 
The previous callback code did not block in startcgocallback's approximation to exitsyscall, so if, say, the garbage collector were running, it would still barge in and start doing things like call malloc. Similarly endcgocallback's approximation of entersyscall did not call matchmg to kick off new OS threads when necessary, which caused the bug in issue 1560. Fixes #1560. R=iant CC=golang-dev https://golang.org/cl/4253054

runtime: scheduler, cgo reorganization
* Change use of m->g0 stack (aka scheduler stack). * Provide runtime.mcall(f) to invoke f() on m->g0 stack. * Replace scheduler loop entry with runtime.mcall(schedule). Runtime.mcall eliminates the need for fake scheduler states that exist just to run a bit of code on the m->g0 stack (Grecovery, Gstackalloc). The elimination of the scheduler as a loop that stops and starts using gosave and gogo fixes a bad interaction with the way cgo uses the m->g0 stack. Cgo runs external (gcc-compiled) C functions on that stack, and then when calling back into Go, it sets m->g0->sched.sp below the added call frames, so that other uses of m->g0's stack will not interfere with those frames. Unfortunately, gogo (longjmp) back to the scheduler loop at this point would end up running scheduler with the lower sp, which no longer points at a valid stack frame for a call to scheduler. If scheduler then wrote any function call arguments or local variables to where it expected the stack frame to be, it would overwrite other data on the stack. I realized this possibility while debugging a problem with calling complex Go code in a Go -> C -> Go cgo callback. This wasn't the bug I was looking for, it turns out, but I believe it is a real bug nonetheless. Switching to runtime.mcall, which only adds new frames to the stack and never jumps into functions running in existing ones, fixes this bug. * Move cgo-related code out of proc.c into cgocall.c. * Add very large comment describing cgo call sequences. * Simplify, regularize cgo function implementations and names. * Add test suite as misc/cgo/test. Now the Go -> C path calls cgocall, which calls asmcgocall, and the C -> Go path calls cgocallback, which calls cgocallbackg. The shuffling, which affects mainly the callback case, moves most of the callback implementation to cgocallback running on the m->curg stack (not the m->g0 scheduler stack) and only while accounted for with $GOMAXPROCS (between calls to exitsyscall and entersyscall). 
The previous callback code did not block in startcgocallback's approximation to exitsyscall, so if, say, the garbage collector were running, it would still barge in and start doing things like call malloc. Similarly endcgocallback's approximation of entersyscall did not call matchmg to kick off new OS threads when necessary, which caused the bug in issue 1560. Fixes #1560. R=iant CC=golang-dev https://golang.org/cl/4253054
f9ca3b5d · Russ Cox · 6d6f3381 · f9ca3b5d · f9ca3b5d · f9ca3b5d
Commit f9ca3b5d authored Mar 07, 2011 by Russ Cox
21 changed files
--- a/misc/cgo/stdio/Makefile
+++ b/misc/cgo/stdio/Makefile
@@ -6,10 +6,7 @@ include ../../../src/Make.inc

 TARG=stdio
 CGOFILES=\
-	align.go\
 	file.go\
-	test.go\
-	test1.go\

 CLEANFILES+=hello fib chain run.out


--- a/misc/cgo/stdio/hello.go
+++ b/misc/cgo/stdio/hello.go
@@ -4,26 +4,8 @@

 package main

-import (
-	"os"
-	"stdio"
-)
+import "stdio"

 func main() {
 	stdio.Stdout.WriteString(stdio.Greeting + "\n")
-
-	l := stdio.Atol("123")
-	if l != 123 {
-		println("Atol 123: ", l)
-		panic("bad atol")
-	}
-
-	n, err := stdio.Strtol("asdf", 123)
-	if n != 0 || err != os.EINVAL {
-		println("Strtol: ", n, err)
-		panic("bad atoi2")
-	}
-
-	stdio.TestAlign()
-	stdio.TestEnum()
 }
--- a/misc/cgo/test/Makefile
+++ b/misc/cgo/test/Makefile
+# Copyright 2011 The Go Authors.  All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include ../../../src/Make.inc
+
+TARG=runtime/cgotest
+
+CGOFILES=\
+	align.go\
+	basic.go\
+	callback.go\
+	issue1222.go\
+	issue1328.go\
+	issue1560.go\
+
+CGO_OFILES=\
+	callback_c.o\
+
+OFILES=\
+	runtime.$O\
+
+include ../../../src/Make.pkg
--- a/misc/cgo/stdio/align.go
+++ b/misc/cgo/stdio/align.go
-package stdio
+package cgotest

 /*
 #include <stdio.h>
@@ -55,24 +55,18 @@ void cTest(SDL_KeyboardEvent *event) {
 import "C"

 import (
-	"fmt"
-	"syscall"
+	"testing"
 )

-func TestAlign() {
-	if syscall.ARCH == "amd64" {
-		// alignment is known to be broken on amd64.
-		// http://code.google.com/p/go/issues/detail?id=609
-		return
-	}
+func TestAlign(t *testing.T) {
 	var evt C.SDL_KeyboardEvent
 	C.makeEvent(&evt)
 	if C.same(&evt, evt.typ, evt.which, evt.state, evt.keysym.scancode, evt.keysym.sym, evt.keysym.mod, evt.keysym.unicode) == 0 {
-		fmt.Println("*** bad alignment")
+		t.Error("*** bad alignment")
 		C.cTest(&evt)
-		fmt.Printf("Go: %#x %#x %#x %#x %#x %#x %#x\n",
+		t.Errorf("Go: %#x %#x %#x %#x %#x %#x %#x\n",
 			evt.typ, evt.which, evt.state, evt.keysym.scancode,
 			evt.keysym.sym, evt.keysym.mod, evt.keysym.unicode)
-		fmt.Println(evt)
+		t.Error(evt)
 	}
 }
--- a/misc/cgo/stdio/test.go
+++ b/misc/cgo/stdio/test.go
@@ -2,9 +2,9 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// This file contains test cases for cgo.
+// Basic test cases for cgo.

-package stdio
+package cgotest

 /*
 #include <stdio.h>
@@ -52,6 +52,7 @@ struct ibv_context {
 import "C"
 import (
 	"os"
+	"testing"
 	"unsafe"
 )

@@ -89,38 +90,35 @@ func Atol(s string) int {
 	return int(n)
 }

-func TestConst() {
+func TestConst(t *testing.T) {
 	C.myConstFunc(nil, 0, nil)
 }

-func TestEnum() {
+func TestEnum(t *testing.T) {
 	if C.Enum1 != 1 || C.Enum2 != 2 {
-		println("bad enum", C.Enum1, C.Enum2)
+		t.Error("bad enum", C.Enum1, C.Enum2)
 	}
 }

-func TestAtol() {
+func TestAtol(t *testing.T) {
 	l := Atol("123")
 	if l != 123 {
-		println("Atol 123: ", l)
-		panic("bad atol")
+		t.Error("Atol 123: ", l)
 	}
 }

-func TestErrno() {
+func TestErrno(t *testing.T) {
 	n, err := Strtol("asdf", 123)
 	if n != 0 || err != os.EINVAL {
-		println("Strtol: ", n, err)
-		panic("bad strtol")
+		t.Error("Strtol: ", n, err)
 	}
 }

-func TestMultipleAssign() {
-	p := C.CString("123")
+func TestMultipleAssign(t *testing.T) {
+	p := C.CString("234")
 	n, m := C.strtol(p, nil, 345), C.strtol(p, nil, 10)
 	if n != 0 || m != 234 {
-		println("Strtol x2: ", n, m)
-		panic("bad strtol x2")
+		t.Fatal("Strtol x2: ", n, m)
 	}
 	C.free(unsafe.Pointer(p))
 }
@@ -134,11 +132,3 @@ var (
 type Context struct {
 	ctx *C.struct_ibv_context
 }
-
-func Test() {
-	TestAlign()
-	TestAtol()
-	TestEnum()
-	TestErrno()
-	TestConst()
-}
--- a/misc/cgo/test/callback.go
+++ b/misc/cgo/test/callback.go
+package cgotest
+
+/*
+void callback(void *f);
+void callGoFoo(void) {
+	extern void goFoo(void);
+	goFoo();
+}
+*/
+import "C"
+
+import (
+	"runtime"
+	"testing"
+	"unsafe"
+)
+
+// nestedCall calls into C, back into Go, and finally to f.
+func nestedCall(f func()) {
+	// NOTE: Depends on representation of f.
+	// callback(x) calls goCallback(x)
+	C.callback(*(*unsafe.Pointer)(unsafe.Pointer(&f)))
+}
+
+//export goCallback
+func goCallback(p unsafe.Pointer) {
+	(*(*func())(unsafe.Pointer(&p)))()
+}
+
+func TestCallback(t *testing.T) {
+	var x = false
+	nestedCall(func(){x = true})
+	if !x {
+		t.Fatal("nestedCall did not call func")
+	}
+}
+
+func TestCallbackGC(t *testing.T) {
+	nestedCall(runtime.GC)
+}
+
+func lockedOSThread() bool  // in runtime.c
+
+func TestCallbackPanic(t *testing.T) {
+	// Make sure panic during callback unwinds properly.
+	if lockedOSThread() {
+		t.Fatal("locked OS thread on entry to TestCallbackPanic")
+	}
+	defer func() {
+		s := recover()
+		if s == nil {
+			t.Fatal("did not panic")
+		}
+		if s.(string) != "callback panic" {
+			t.Fatal("wrong panic:", s)
+		}
+		if lockedOSThread() {
+			t.Fatal("locked OS thread on exit from TestCallbackPanic")
+		}
+	}()
+	nestedCall(func(){panic("callback panic")})
+	panic("nestedCall returned")
+}
+
+func TestCallbackPanicLoop(t *testing.T) {
+	// Make sure we don't blow out m->g0 stack.
+	for i := 0; i < 100000; i++ {
+		TestCallbackPanic(t)
+	}
+}
+
+func TestCallbackPanicLocked(t *testing.T) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if !lockedOSThread() {
+		t.Fatal("runtime.LockOSThread didn't")
+	}
+	defer func() {
+		s := recover()
+		if s == nil {
+			t.Fatal("did not panic")
+		}
+		if s.(string) != "callback panic" {
+			t.Fatal("wrong panic:", s)
+		}
+		if !lockedOSThread() {
+			t.Fatal("lost lock on OS thread after panic")
+		}
+	}()
+	nestedCall(func(){panic("callback panic")})
+	panic("nestedCall returned")
+}
+
+// Callback with zero arguments used to make the stack misaligned,
+// which broke the garbage collector and other things.
+func TestZeroArgCallback(t *testing.T) {
+	defer func() {
+		s := recover()
+		if s != nil {
+			t.Fatal("panic during callback:", s)
+		}
+	}()
+	C.callGoFoo()
+}
+
+//export goFoo
+func goFoo() {
+	x := 1
+	for i := 0; i < 10000; i++ {
+		// variadic call mallocs + writes to memory
+		variadic(x, x, x)
+		if x != 1 {
+			panic("bad x")
+		}
+	}
+}
+
+func variadic(x ...interface{}) {}
+
+func TestBlocking(t *testing.T) {
+	c := make(chan int)
+	go func() {
+		for i := 0; i < 10; i++ {
+			c <- <-c
+		}
+	}()
+	nestedCall(func(){
+		for i := 0; i < 10; i++ {
+			c <- i
+			if j := <-c; j != i {
+				t.Errorf("out of sync %d != %d", j, i)
+			}
+		}
+	})
+}
--- a/misc/cgo/test/callback_c.c
+++ b/misc/cgo/test/callback_c.c
+// Copyright 2011 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <sys/types.h>
+#include "_cgo_export.h"
+
+void
+callback(void *f)
+{
+	goCallback(f);
+}
--- a/misc/cgo/test/cgo_test.go
+++ b/misc/cgo/test/cgo_test.go
+package cgotest
+
+// dummy file so gotest thinks there are tests.
+// the actual tests are in the main go files, next
+// to the code they test.
+
--- a/misc/cgo/stdio/test1.go
+++ b/misc/cgo/stdio/test1.go
@@ -4,7 +4,7 @@

 // This file contains test cases for cgo.

-package stdio
+package cgotest

 /*
 // issue 1222

--- a/misc/cgo/test/issue1328.go
+++ b/misc/cgo/test/issue1328.go
+// Copyright 2011 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgotest
+
+import "testing"
+
+// extern void BackIntoGo(void);
+// void IntoC() { BackIntoGo(); }
+import "C"
+
+//export BackIntoGo
+func BackIntoGo() {
+	x := 1
+
+	for i := 0; i < 10000; i++ {
+		xvariadic(x)
+		if x != 1 {
+			panic("x is not 1?")
+		}
+	}
+}
+
+func xvariadic(x ...interface{}) {
+}
+
+func Test1328(t *testing.T) {
+	C.IntoC()
+}
--- a/misc/cgo/test/issue1560.go
+++ b/misc/cgo/test/issue1560.go
+// Copyright 2011 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgotest
+
+/*
+#include <unistd.h>
+
+extern void BackgroundSleep(int);
+void twoSleep(int n) {
+	BackgroundSleep(n);
+	sleep(n);
+}
+*/
+import "C"
+
+import (
+	"testing"
+	"time"
+)
+
+var sleepDone = make(chan bool)
+
+func parallelSleep(n int) {
+	C.twoSleep(C.int(n))
+	<-sleepDone
+}
+
+//export BackgroundSleep
+func BackgroundSleep(n int){
+	go func(){
+		C.sleep(C.uint(n))
+		sleepDone <- true
+	}()
+}
+
+func TestParallelSleep(t *testing.T) {
+	dt := -time.Nanoseconds()
+	parallelSleep(1)
+	dt += time.Nanoseconds()
+	// bug used to run sleeps in serial, producing a 2-second delay.
+	if dt >= 1.3e9 {
+		t.Fatalf("parallel 1-second sleeps slept for %f seconds", float64(dt)/1e9)
+	}
+}
--- a/misc/cgo/test/runtime.c
+++ b/misc/cgo/test/runtime.c
+// Copyright 2011 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Expose some runtime functions for testing.
+
+typedef char bool;
+
+bool runtime·lockedOSThread(void);
+
+static void
+FLUSH(void*)
+{
+}
+
+void
+·lockedOSThread(bool b)
+{
+	b = runtime·lockedOSThread();
+	FLUSH(&b);
+}
--- a/src/clean.bash
+++ b/src/clean.bash
@@ -21,6 +21,7 @@ fi
 rm -f "$GOROOT"/lib/*.a
 for i in lib9 libbio libmach cmd pkg \
 	../misc/cgo/gmp ../misc/cgo/stdio \
+	../misc/cgo/life ../misc/cgo/test \
 	../test/bench ../test/garbage
 do
 	gomake -C "$GOROOT/src/$i" clean

--- a/src/pkg/runtime/386/asm.s
+++ b/src/pkg/runtime/386/asm.s
@@ -105,7 +105,7 @@ TEXT runtime·breakpoint(SB),7,$0
 *  go-routine
 */

-// uintptr gosave(Gobuf*)
+// void gosave(Gobuf*)
 // save state in Gobuf; setjmp
 TEXT runtime·gosave(SB), 7, $0
 	MOVL	4(SP), AX		// gobuf
@@ -116,7 +116,6 @@ TEXT runtime·gosave(SB), 7, $0
 	get_tls(CX)
 	MOVL	g(CX), BX
 	MOVL	BX, gobuf_g(AX)
-	MOVL	$0, AX			// return 0
 	RET

 // void gogo(Gobuf*, uintptr)
@@ -148,6 +147,35 @@ TEXT runtime·gogocall(SB), 7, $0
 	JMP	AX
 	POPL	BX	// not reached

+// void mcall(void (*fn)(G*))
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return.  It should gogo(&g->gobuf)
+// to keep running g.
+TEXT runtime·mcall(SB), 7, $0
+	MOVL	fn+0(FP), DI
+	
+	get_tls(CX)
+	MOVL	g(CX), AX	// save state in g->gobuf
+	MOVL	0(SP), BX	// caller's PC
+	MOVL	BX, (g_sched+gobuf_pc)(AX)
+	LEAL	4(SP), BX	// caller's SP
+	MOVL	BX, (g_sched+gobuf_sp)(AX)
+	MOVL	AX, (g_sched+gobuf_g)(AX)
+
+	// switch to m->g0 & its stack, call fn
+	MOVL	m(CX), BX
+	MOVL	m_g0(BX), SI
+	CMPL	SI, AX	// if g == m->g0 call badmcall
+	JNE	2(PC)
+	CALL	runtime·badmcall(SB)
+	MOVL	SI, g(CX)	// g = m->g0
+	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->gobuf.sp
+	PUSHL	AX
+	CALL	DI
+	POPL	AX
+	CALL	runtime·badmcall2(SB)
+	RET
+
 /*
 * support for morestack
 */
@@ -183,10 +211,10 @@ TEXT runtime·morestack(SB),7,$0
 	MOVL	0(SP), AX
 	MOVL	AX, m_morepc(BX)

-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVL	m_g0(BX), BP
 	MOVL	BP, g(CX)
-	MOVL	(m_sched+gobuf_sp)(BX), AX
+	MOVL	(g_sched+gobuf_sp)(BP), AX
 	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
 	MOVL	AX, SP
 	CALL	runtime·newstack(SB)
@@ -226,11 +254,11 @@ TEXT reflect·call(SB), 7, $0
 	MOVL	CX, m_moreargsize(BX)	// f's argument size
 	MOVL	$1, m_moreframesize(BX)	// f's frame size

-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVL	m_g0(BX), BP
 	get_tls(CX)
 	MOVL	BP, g(CX)
-	MOVL	(m_sched+gobuf_sp)(BX), SP
+	MOVL	(g_sched+gobuf_sp)(BP), SP
 	CALL	runtime·newstack(SB)
 	MOVL	$0, 0x1103	// crash if newstack returns
 	RET
@@ -243,10 +271,10 @@ TEXT runtime·lessstack(SB), 7, $0
 	MOVL	m(CX), BX
 	MOVL	AX, m_cret(BX)

-	// Call oldstack on m's scheduling stack.
-	MOVL	m_g0(BX), DX
-	MOVL	DX, g(CX)
-	MOVL	(m_sched+gobuf_sp)(BX), SP
+	// Call oldstack on m->g0's stack.
+	MOVL	m_g0(BX), BP
+	MOVL	BP, g(CX)
+	MOVL	(g_sched+gobuf_sp)(BP), SP
 	CALL	runtime·oldstack(SB)
 	MOVL	$0, 0x1004	// crash if oldstack returns
 	RET
@@ -302,6 +330,133 @@ TEXT runtime·jmpdefer(SB), 7, $0
 	SUBL	$5, (SP)	// return to CALL again
 	JMP	AX	// but first run the deferred function

+// Dummy function to use in saved gobuf.PC,
+// to match SP pointing at a return address.
+// The gobuf.PC is unused by the contortions here
+// but setting it to return will make the traceback code work.
+TEXT return<>(SB),7,$0
+	RET
+
+// asmcgocall(void(*fn)(void*), void *arg)
+// Call fn(arg) on the scheduler stack,
+// aligned appropriately for the gcc ABI.
+// See cgocall.c for more details.
+TEXT runtime·asmcgocall(SB),7,$0
+	MOVL	fn+0(FP), AX
+	MOVL	arg+4(FP), BX
+	MOVL	SP, DX
+
+	// Figure out if we need to switch to m->g0 stack.
+	// We get called to create new OS threads too, and those
+	// come in on the m->g0 stack already.
+	get_tls(CX)
+	MOVL	m(CX), BP
+	MOVL	m_g0(BP), SI
+	MOVL	g(CX), DI
+	CMPL	SI, DI
+	JEQ	6(PC)
+	MOVL	SP, (g_sched+gobuf_sp)(DI)
+	MOVL	$return<>(SB), (g_sched+gobuf_pc)(DI)
+	MOVL	DI, (g_sched+gobuf_g)(DI)
+	MOVL	SI, g(CX)
+	MOVL	(g_sched+gobuf_sp)(SI), SP
+
+	// Now on a scheduling stack (a pthread-created stack).
+	SUBL	$32, SP
+	ANDL	$~15, SP	// alignment, perhaps unnecessary
+	MOVL	DI, 8(SP)	// save g
+	MOVL	DX, 4(SP)	// save SP
+	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
+	CALL	AX
+
+	// Restore registers, g, stack pointer.
+	get_tls(CX)
+	MOVL	8(SP), DI
+	MOVL	DI, g(CX)
+	MOVL	4(SP), SP
+	RET
+
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+// See cgocall.c for more details.
+TEXT runtime·cgocallback(SB),7,$12
+	MOVL	fn+0(FP), AX
+	MOVL	frame+4(FP), BX
+	MOVL	framesize+8(FP), DX
+
+	// Save current m->g0->sched.sp on stack and then set it to SP.
+	get_tls(CX)
+	MOVL	m(CX), BP
+	MOVL	m_g0(BP), SI
+	PUSHL	(g_sched+gobuf_sp)(SI)
+	MOVL	SP, (g_sched+gobuf_sp)(SI)
+
+	// Switch to m->curg stack and call runtime.cgocallbackg
+	// with the three arguments.  Because we are taking over
+	// the execution of m->curg but *not* resuming what had
+	// been running, we need to save that information (m->curg->gobuf)
+	// so that we can restore it when we're done. 
+	// We can restore m->curg->gobuf.sp easily, because calling
+	// runtime.cgocallbackg leaves SP unchanged upon return.
+	// To save m->curg->gobuf.pc, we push it onto the stack.
+	// This has the added benefit that it looks to the traceback
+	// routine like cgocallback is going to return to that
+	// PC (because we defined cgocallback to have
+	// a frame size of 12, the same amount that we use below),
+	// so that the traceback will seamlessly trace back into
+	// the earlier calls.
+	MOVL	m_curg(BP), SI
+	MOVL	SI, g(CX)
+	MOVL	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
+
+	// Push gobuf.pc
+	MOVL	(g_sched+gobuf_pc)(SI), BP
+	SUBL	$4, DI
+	MOVL	BP, 0(DI)
+
+	// Push arguments to cgocallbackg.
+	// Frame size here must match the frame size above
+	// to trick traceback routines into doing the right thing.
+	SUBL	$12, DI
+	MOVL	AX, 0(DI)
+	MOVL	BX, 4(DI)
+	MOVL	DX, 8(DI)
+	
+	// Switch stack and make the call.
+	MOVL	DI, SP
+	CALL	runtime·cgocallbackg(SB)
+
+	// Restore g->gobuf (== m->curg->gobuf) from saved values.
+	get_tls(CX)
+	MOVL	g(CX), SI
+	MOVL	12(SP), BP
+	MOVL	BP, (g_sched+gobuf_pc)(SI)
+	LEAL	(12+4)(SP), DI
+	MOVL	DI, (g_sched+gobuf_sp)(SI)
+
+	// Switch back to m->g0's stack and restore m->g0->sched.sp.
+	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
+	// so we do not have to restore it.)
+	MOVL	m(CX), BP
+	MOVL	m_g0(BP), SI
+	MOVL	SI, g(CX)
+	MOVL	(g_sched+gobuf_sp)(SI), SP
+	POPL	(g_sched+gobuf_sp)(SI)
+
+	// Done!
+	RET
+
+// check that SP is in range [g->stackbase, g->stackguard)
+TEXT runtime·stackcheck(SB), 7, $0
+	get_tls(CX)
+	MOVL	g(CX), AX
+	CMPL	g_stackbase(AX), SP
+	JHI	2(PC)
+	INT	$3
+	CMPL	SP, g_stackguard(AX)
+	JHI	2(PC)
+	INT	$3
+	RET
+
 TEXT runtime·memclr(SB),7,$0
 	MOVL	4(SP), DI		// arg 1 addr
 	MOVL	8(SP), CX		// arg 2 count
@@ -345,82 +500,4 @@ TEXT runtime·emptyfunc(SB),0,$0
 TEXT runtime·abort(SB),7,$0
 	INT $0x3

-// runcgo(void(*fn)(void*), void *arg)
-// Call fn(arg) on the scheduler stack,
-// aligned appropriately for the gcc ABI.
-TEXT runtime·runcgo(SB),7,$16
-	MOVL	fn+0(FP), AX
-	MOVL	arg+4(FP), BX
-	MOVL	SP, CX
-
-	// Figure out if we need to switch to m->g0 stack.
-	get_tls(DI)
-	MOVL	m(DI), DX
-	MOVL	m_g0(DX), SI
-	CMPL	g(DI), SI
-	JEQ	2(PC)
-	MOVL	(m_sched+gobuf_sp)(DX), SP
-
-	// Now on a scheduling stack (a pthread-created stack).
-	SUBL	$16, SP
-	ANDL	$~15, SP	// alignment for gcc ABI
-	MOVL	g(DI), BP
-	MOVL	BP, 8(SP)
-	MOVL	SI, g(DI)
-	MOVL	CX, 4(SP)
-	MOVL	BX, 0(SP)
-	CALL	AX
-	
-	// Back; switch to original g and stack, re-establish
-	// "DF is clear" invariant.
-	CLD
-	get_tls(DI)
-	MOVL	8(SP), SI
-	MOVL	SI, g(DI)
-	MOVL	4(SP), SP
-	RET
-
-// runcgocallback(G *g1, void* sp, void (*fn)(void))
-// Switch to g1 and sp, call fn, switch back.  fn's arguments are on
-// the new stack.
-TEXT runtime·runcgocallback(SB),7,$32
-	MOVL	g1+0(FP), DX
-	MOVL	sp+4(FP), AX
-	MOVL	fn+8(FP), BX
-
-	// We are running on m's scheduler stack.  Save current SP
-	// into m->sched.sp so that a recursive call to runcgo doesn't
-	// clobber our stack, and also so that we can restore
-	// the SP when the call finishes.  Reusing m->sched.sp
-	// for this purpose depends on the fact that there is only
-	// one possible gosave of m->sched.
-	get_tls(CX)
-	MOVL	DX, g(CX)
-	MOVL	m(CX), CX
-	MOVL	SP, (m_sched+gobuf_sp)(CX)
-
-	// Set new SP, call fn
-	MOVL	AX, SP
-	CALL	BX
-
-	// Restore old g and SP, return
-	get_tls(CX)
-	MOVL	m(CX), DX
-	MOVL	m_g0(DX), BX
-	MOVL	BX, g(CX)
-	MOVL	(m_sched+gobuf_sp)(DX), SP
-	RET
-
-// check that SP is in range [g->stackbase, g->stackguard)
-TEXT runtime·stackcheck(SB), 7, $0
-	get_tls(CX)
-	MOVL	g(CX), AX
-	CMPL	g_stackbase(AX), SP
-	JHI	2(PC)
-	INT	$3
-	CMPL	SP, g_stackguard(AX)
-	JHI	2(PC)
-	INT	$3
-	RET
-
 GLOBL runtime·tls0(SB), $32
--- a/src/pkg/runtime/amd64/asm.s
+++ b/src/pkg/runtime/amd64/asm.s
@@ -89,7 +89,7 @@ TEXT runtime·breakpoint(SB),7,$0
 *  go-routine
 */

-// uintptr gosave(Gobuf*)
+// void gosave(Gobuf*)
 // save state in Gobuf; setjmp
 TEXT runtime·gosave(SB), 7, $0
 	MOVQ	8(SP), AX		// gobuf
@@ -100,7 +100,6 @@ TEXT runtime·gosave(SB), 7, $0
 	get_tls(CX)
 	MOVQ	g(CX), BX
 	MOVQ	BX, gobuf_g(AX)
-	MOVL	$0, AX			// return 0
 	RET

 // void gogo(Gobuf*, uintptr)
@@ -132,6 +131,35 @@ TEXT runtime·gogocall(SB), 7, $0
 	JMP	AX
 	POPQ	BX	// not reached

+// void mcall(void (*fn)(G*))
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return.  It should gogo(&g->gobuf)
+// to keep running g.
+TEXT runtime·mcall(SB), 7, $0
+	MOVQ	fn+0(FP), DI
+	
+	get_tls(CX)
+	MOVQ	g(CX), AX	// save state in g->gobuf
+	MOVQ	0(SP), BX	// caller's PC
+	MOVQ	BX, (g_sched+gobuf_pc)(AX)
+	LEAQ	8(SP), BX	// caller's SP
+	MOVQ	BX, (g_sched+gobuf_sp)(AX)
+	MOVQ	AX, (g_sched+gobuf_g)(AX)
+
+	// switch to m->g0 & its stack, call fn
+	MOVQ	m(CX), BX
+	MOVQ	m_g0(BX), SI
+	CMPQ	SI, AX	// if g == m->g0 call badmcall
+	JNE	2(PC)
+	CALL	runtime·badmcall(SB)
+	MOVQ	SI, g(CX)	// g = m->g0
+	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->gobuf.sp
+	PUSHQ	AX
+	CALL	DI
+	POPQ	AX
+	CALL	runtime·badmcall2(SB)
+	RET
+
 /*
 * support for morestack
 */
@@ -160,10 +188,10 @@ TEXT runtime·morestack(SB),7,$0
 	MOVQ	0(SP), AX
 	MOVQ	AX, m_morepc(BX)

-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVQ	m_g0(BX), BP
 	MOVQ	BP, g(CX)
-	MOVQ	(m_sched+gobuf_sp)(BX), SP
+	MOVQ	(g_sched+gobuf_sp)(BP), SP
 	CALL	runtime·newstack(SB)
 	MOVQ	$0, 0x1003	// crash if newstack returns
 	RET
@@ -201,11 +229,11 @@ TEXT reflect·call(SB), 7, $0
 	MOVL	CX, m_moreargsize(BX)	// f's argument size
 	MOVL	$1, m_moreframesize(BX)	// f's frame size

-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVQ	m_g0(BX), BP
 	get_tls(CX)
 	MOVQ	BP, g(CX)
-	MOVQ	(m_sched+gobuf_sp)(BX), SP
+	MOVQ	(g_sched+gobuf_sp)(BP), SP
 	CALL	runtime·newstack(SB)
 	MOVQ	$0, 0x1103	// crash if newstack returns
 	RET
@@ -217,10 +245,10 @@ TEXT runtime·lessstack(SB), 7, $0
 	MOVQ	m(CX), BX
 	MOVQ	AX, m_cret(BX)

-	// Call oldstack on m's scheduling stack.
-	MOVQ	m_g0(BX), DX
-	MOVQ	DX, g(CX)
-	MOVQ	(m_sched+gobuf_sp)(BX), SP
+	// Call oldstack on m->g0's stack.
+	MOVQ	m_g0(BX), BP
+	MOVQ	BP, g(CX)
+	MOVQ	(g_sched+gobuf_sp)(BP), SP
 	CALL	runtime·oldstack(SB)
 	MOVQ	$0, 0x1004	// crash if oldstack returns
 	RET
@@ -336,7 +364,6 @@ TEXT runtime·casp(SB), 7, $0
 	MOVL	$1, AX
 	RET

-
 // void jmpdefer(fn, sp);
 // called from deferreturn.
 // 1. pop the caller
@@ -349,68 +376,119 @@ TEXT runtime·jmpdefer(SB), 7, $0
 	SUBQ	$5, (SP)	// return to CALL again
 	JMP	AX	// but first run the deferred function

-// runcgo(void(*fn)(void*), void *arg)
+// Dummy function to use in saved gobuf.PC,
+// to match SP pointing at a return address.
+// The gobuf.PC is unused by the contortions here
+// but setting it to return will make the traceback code work.
+TEXT return<>(SB),7,$0
+	RET
+
+// asmcgocall(void(*fn)(void*), void *arg)
 // Call fn(arg) on the scheduler stack,
 // aligned appropriately for the gcc ABI.
-TEXT runtime·runcgo(SB),7,$32
-	MOVQ	fn+0(FP), R12
-	MOVQ	arg+8(FP), R13
-	MOVQ	SP, CX
+// See cgocall.c for more details.
+TEXT runtime·asmcgocall(SB),7,$0
+	MOVQ	fn+0(FP), AX
+	MOVQ	arg+8(FP), BX
+	MOVQ	SP, DX

 	// Figure out if we need to switch to m->g0 stack.
-	get_tls(DI)
-	MOVQ	m(DI), DX
-	MOVQ	m_g0(DX), SI
-	CMPQ	g(DI), SI
-	JEQ	2(PC)
-	MOVQ	(m_sched+gobuf_sp)(DX), SP
+	// We get called to create new OS threads too, and those
+	// come in on the m->g0 stack already.
+	get_tls(CX)
+	MOVQ	m(CX), BP
+	MOVQ	m_g0(BP), SI
+	MOVQ	g(CX), DI
+	CMPQ	SI, DI
+	JEQ	6(PC)
+	MOVQ	SP, (g_sched+gobuf_sp)(DI)
+	MOVQ	$return<>(SB), (g_sched+gobuf_pc)(DI)
+	MOVQ	DI, (g_sched+gobuf_g)(DI)
+	MOVQ	SI, g(CX)
+	MOVQ	(g_sched+gobuf_sp)(SI), SP

 	// Now on a scheduling stack (a pthread-created stack).
 	SUBQ	$32, SP
 	ANDQ	$~15, SP	// alignment for gcc ABI
-	MOVQ	g(DI), BP
-	MOVQ	BP, 16(SP)
-	MOVQ	SI, g(DI)
-	MOVQ	CX, 8(SP)
-	MOVQ	R13, DI		// DI = first argument in AMD64 ABI
-	CALL	R12
+	MOVQ	DI, 16(SP)	// save g
+	MOVQ	DX, 8(SP)	// save SP
+	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
+	CALL	AX

 	// Restore registers, g, stack pointer.
-	get_tls(DI)
-	MOVQ	16(SP), SI
-	MOVQ	SI, g(DI)
+	get_tls(CX)
+	MOVQ	16(SP), DI
+	MOVQ	DI, g(CX)
 	MOVQ	8(SP), SP
 	RET

-// runcgocallback(G *g1, void* sp, void (*fn)(void))
-// Switch to g1 and sp, call fn, switch back.  fn's arguments are on
-// the new stack.
-TEXT runtime·runcgocallback(SB),7,$48
-	MOVQ	g1+0(FP), DX
-	MOVQ	sp+8(FP), AX
-	MOVQ	fp+16(FP), BX
-
-	// We are running on m's scheduler stack.  Save current SP
-	// into m->sched.sp so that a recursive call to runcgo doesn't
-	// clobber our stack, and also so that we can restore
-	// the SP when the call finishes.  Reusing m->sched.sp
-	// for this purpose depends on the fact that there is only
-	// one possible gosave of m->sched.
-	get_tls(CX)
-	MOVQ	DX, g(CX)
-	MOVQ	m(CX), CX
-	MOVQ	SP, (m_sched+gobuf_sp)(CX)
-
-	// Set new SP, call fn
-	MOVQ	AX, SP
-	CALL	BX
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+// See cgocall.c for more details.
+TEXT runtime·cgocallback(SB),7,$24
+	MOVQ	fn+0(FP), AX
+	MOVQ	frame+8(FP), BX
+	MOVQ	framesize+16(FP), DX

-	// Restore old g and SP, return
+	// Save current m->g0->sched.sp on stack and then set it to SP.
 	get_tls(CX)
-	MOVQ	m(CX), DX
-	MOVQ	m_g0(DX), BX
-	MOVQ	BX, g(CX)
-	MOVQ	(m_sched+gobuf_sp)(DX), SP
+	MOVQ	m(CX), BP
+	MOVQ	m_g0(BP), SI
+	PUSHQ	(g_sched+gobuf_sp)(SI)
+	MOVQ	SP, (g_sched+gobuf_sp)(SI)
+
+	// Switch to m->curg stack and call runtime.cgocallbackg
+	// with the three arguments.  Because we are taking over
+	// the execution of m->curg but *not* resuming what had
+	// been running, we need to save that information (m->curg->gobuf)
+	// so that we can restore it when we're done. 
+	// We can restore m->curg->gobuf.sp easily, because calling
+	// runtime.cgocallbackg leaves SP unchanged upon return.
+	// To save m->curg->gobuf.pc, we push it onto the stack.
+	// This has the added benefit that it looks to the traceback
+	// routine like cgocallback is going to return to that
+	// PC (because we defined cgocallback to have
+	// a frame size of 24, the same amount that we use below),
+	// so that the traceback will seamlessly trace back into
+	// the earlier calls.
+	MOVQ	m_curg(BP), SI
+	MOVQ	SI, g(CX)
+	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
+
+	// Push gobuf.pc
+	MOVQ	(g_sched+gobuf_pc)(SI), BP
+	SUBQ	$8, DI
+	MOVQ	BP, 0(DI)
+
+	// Push arguments to cgocallbackg.
+	// Frame size here must match the frame size above
+	// to trick traceback routines into doing the right thing.
+	SUBQ	$24, DI
+	MOVQ	AX, 0(DI)
+	MOVQ	BX, 8(DI)
+	MOVQ	DX, 16(DI)
+	
+	// Switch stack and make the call.
+	MOVQ	DI, SP
+	CALL	runtime·cgocallbackg(SB)
+
+	// Restore g->gobuf (== m->curg->gobuf) from saved values.
+	get_tls(CX)
+	MOVQ	g(CX), SI
+	MOVQ	24(SP), BP
+	MOVQ	BP, (g_sched+gobuf_pc)(SI)
+	LEAQ	(24+8)(SP), DI
+	MOVQ	DI, (g_sched+gobuf_sp)(SI)
+
+	// Switch back to m->g0's stack and restore m->g0->sched.sp.
+	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
+	// so we do not have to restore it.)
+	MOVQ	m(CX), BP
+	MOVQ	m_g0(BP), SI
+	MOVQ	SI, g(CX)
+	MOVQ	(g_sched+gobuf_sp)(SI), SP
+	POPQ	(g_sched+gobuf_sp)(SI)
+
+	// Done!
 	RET

 // check that SP is in range [g->stackbase, g->stackguard)

--- a/src/pkg/runtime/arm/asm.s
+++ b/src/pkg/runtime/arm/asm.s
@@ -93,14 +93,13 @@ TEXT runtime·breakpoint(SB),7,$0
 *  go-routine
 */

-// uintptr gosave(Gobuf*)
+// void gosave(Gobuf*)
 // save state in Gobuf; setjmp
 TEXT runtime·gosave(SB), 7, $-4
 	MOVW	0(FP), R0		// gobuf
 	MOVW	SP, gobuf_sp(R0)
 	MOVW	LR, gobuf_pc(R0)
 	MOVW	g, gobuf_g(R0)
-	MOVW	$0, R0			// return 0
 	RET

 // void gogo(Gobuf*, uintptr)
@@ -127,6 +126,30 @@ TEXT runtime·gogocall(SB), 7, $-4
 	MOVW	gobuf_pc(R0), LR
 	MOVW	R1, PC

+// void mcall(void (*fn)(G*))
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return.  It should gogo(&g->gobuf)
+// to keep running g.
+TEXT runtime·mcall(SB), 7, $-4
+	MOVW	fn+0(FP), R0
+
+	// Save caller state in g->gobuf.
+	MOVW	SP, (g_sched+gobuf_sp)(g)
+	MOVW	LR, (g_sched+gobuf_pc)(g)
+	MOVW	g, (g_sched+gobuf_g)(g)
+
+	// Switch to m->g0 & its stack, call fn.
+	MOVW	g, R1
+	MOVW	m_g0(m), g
+	CMP	g, R1
+	BL.EQ	runtime·badmcall(SB)
+	MOVW	(g_sched+gobuf_sp)(g), SP
+	SUB	$8, SP
+	MOVW	R1, 4(SP)
+	BL	(R0)
+	BL	runtime·badmcall2(SB)
+	RET
+
 /*
 * support for morestack
 */
@@ -159,9 +182,9 @@ TEXT runtime·morestack(SB),7,$-4
 	// Set m->morepc to f's PC.
 	MOVW	LR, m_morepc(m)

-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVW	m_g0(m), g
-	MOVW	(m_sched+gobuf_sp)(m), SP
+	MOVW	(g_sched+gobuf_sp)(g), SP
 	B	runtime·newstack(SB)

 // Called from reflection library.  Mimics morestack,
@@ -192,9 +215,9 @@ TEXT reflect·call(SB), 7, $-4
 	MOVW	$1, R3
 	MOVW	R3, m_moreframesize(m)		// f's frame size

-	// Call newstack on m's scheduling stack.
+	// Call newstack on m->g0's stack.
 	MOVW	m_g0(m), g
-	MOVW	(m_sched+gobuf_sp)(m), SP
+	MOVW	(g_sched+gobuf_sp)(g), SP
 	B	runtime·newstack(SB)

 // Return point when leaving stack.
@@ -203,9 +226,9 @@ TEXT runtime·lessstack(SB), 7, $-4
 	// Save return value in m->cret
 	MOVW	R0, m_cret(m)

-	// Call oldstack on m's scheduling stack.
+	// Call oldstack on m->g0's stack.
 	MOVW	m_g0(m), g
-	MOVW	(m_sched+gobuf_sp)(m), SP
+	MOVW	(g_sched+gobuf_sp)(g), SP
 	B	runtime·oldstack(SB)

 // void jmpdefer(fn, sp);
@@ -221,6 +244,12 @@ TEXT runtime·jmpdefer(SB), 7, $0
 	MOVW	$-4(SP), SP	// SP is 4 below argp, due to saved LR
 	B		(R0)

+TEXT	runtime·asmcgocall(SB),7,$0
+	B	runtime·cgounimpl(SB)
+
+TEXT	runtime·cgocallback(SB),7,$0
+	B	runtime·cgounimpl(SB)
+
 TEXT runtime·memclr(SB),7,$20
 	MOVW	0(FP), R0
 	MOVW	$0, R1		// c = 0
@@ -248,22 +277,6 @@ TEXT runtime·getcallersp(SB),7,$-4
 	MOVW	$-4(R0), R0
 	RET

-// runcgo(void(*fn)(void*), void *arg)
-// Just call fn(arg), but first align the stack
-// appropriately for the gcc ABI.
-// TODO(kaib): figure out the arm-gcc ABI
-TEXT runtime·runcgo(SB),7,$16
-	BL	runtime·abort(SB)
-//	MOVL	fn+0(FP), AX
-//	MOVL	arg+4(FP), BX
-//	MOVL	SP, CX
-//	ANDL	$~15, SP	// alignment for gcc ABI
-//	MOVL	CX, 4(SP)
-//	MOVL	BX, 0(SP)
-//	CALL	AX
-//	MOVL	4(SP), SP
-//	RET
-
 TEXT runtime·emptyfunc(SB),0,$0
 	RET

@@ -271,10 +284,6 @@ TEXT runtime·abort(SB),7,$-4
 	MOVW	$0, R0
 	MOVW	(R0), R1

-TEXT runtime·runcgocallback(SB),7,$0
-	MOVW	$0, R0
-	MOVW	(R0), R1
-
 // bool armcas(int32 *val, int32 old, int32 new)
 // Atomically:
 //	if(*val == old){

--- a/src/pkg/runtime/cgocall.c
+++ b/src/pkg/runtime/cgocall.c
@@ -3,18 +3,97 @@
 // license that can be found in the LICENSE file.

 #include "runtime.h"
+#include "arch.h"
 #include "stack.h"
 #include "cgocall.h"

+// Cgo call and callback support.
+//
+// To call into the C function f from Go, the cgo-generated code calls
+// runtime.cgocall(_cgo_Cfunc_f, frame), where _cgo_Cfunc_f is a
+// gcc-compiled function written by cgo.
+//
+// runtime.cgocall (below) locks g to m, calls entersyscall
+// so as not to block other goroutines or the garbage collector,
+// and then calls runtime.asmcgocall(_cgo_Cfunc_f, frame). 
+//
+// runtime.asmcgocall (in $GOARCH/asm.s) switches to the m->g0 stack
+// (assumed to be an operating system-allocated stack, so safe to run
+// gcc-compiled code on) and calls _cgo_Cfunc_f(frame).
+//
+// _cgo_Cfunc_f invokes the actual C function f with arguments
+// taken from the frame structure, records the results in the frame,
+// and returns to runtime.asmcgocall.
+//
+// After it regains control, runtime.asmcgocall switches back to the
+// original g (m->curg)'s stack and returns to runtime.cgocall.
+//
+// After it regains control, runtime.cgocall calls exitsyscall, which blocks
+// until this m can run Go code without violating the $GOMAXPROCS limit,
+// and then unlocks g from m.
+//
+// The above description skipped over the possibility of the gcc-compiled
+// function f calling back into Go.  If that happens, we continue down
+// the rabbit hole during the execution of f.
+//
+// To make it possible for gcc-compiled C code to call a Go function p.GoF,
+// cgo writes a gcc-compiled function named GoF (not p.GoF, since gcc doesn't
+// know about packages).  The gcc-compiled C function f calls GoF.
+//
+// GoF calls crosscall2(_cgoexp_GoF, frame, framesize).  Crosscall2
+// (in cgo/$GOARCH.S, a gcc-compiled assembly file) is a two-argument
+// adapter from the gcc function call ABI to the 6c function call ABI.
+// It is called from gcc to call 6c functions.  In this case it calls
+// _cgoexp_GoF(frame, framesize), still running on m->g0's stack
+// and outside the $GOMAXPROCS limit.  Thus, this code cannot yet
+// call arbitrary Go code directly and must be careful not to allocate
+// memory or use up m->g0's stack.
+//
+// _cgoexp_GoF calls runtime.cgocallback(p.GoF, frame, framesize).
+// (The reason for having _cgoexp_GoF instead of writing a crosscall3
+// to make this call directly is that _cgoexp_GoF, because it is compiled
+// with 6c instead of gcc, can refer to dotted names like
+// runtime.cgocallback and p.GoF.)
+//
+// runtime.cgocallback (in $GOARCH/asm.s) switches from m->g0's
+// stack to the original g (m->curg)'s stack, on which it calls
+// runtime.cgocallbackg(p.GoF, frame, framesize).
+// As part of the stack switch, runtime.cgocallback saves the current
+// SP as m->g0->sched.sp, so that any use of m->g0's stack during the
+// execution of the callback will be done below the existing stack frames.
+// Before overwriting m->g0->sched.sp, it pushes the old value on the
+// m->g0 stack, so that it can be restored later.
+//
+// runtime.cgocallbackg (below) is now running on a real goroutine
+// stack (not an m->g0 stack).  First it calls runtime.exitsyscall, which will
+// block until the $GOMAXPROCS limit allows running this goroutine.
+// Once exitsyscall has returned, it is safe to do things like call the memory
+// allocator or invoke the Go callback function p.GoF.  runtime.cgocallbackg
+// first defers a function to unwind m->g0->sched.sp, so that if p.GoF
+// panics, m->g0->sched.sp will be restored to its old value: the m->g0 stack
+// and the m->curg stack will be unwound in lock step.
+// Then it calls p.GoF.  Finally it pops but does not execute the deferred
+// function, calls runtime.entersyscall, and returns to runtime.cgocallback.
+//
+// After it regains control, runtime.cgocallback switches back to
+// m->g0's stack (the pointer is still in m->g0->sched.sp), restores the old
+// m->g0->sched.sp value from the stack, and returns to _cgoexp_GoF.
+//
+// _cgoexp_GoF immediately returns to crosscall2, which restores the
+// callee-save registers for gcc and returns to GoF, which returns to f.
+
 void *initcgo;	/* filled in by dynamic linker when Cgo is available */
 int64 ncgocall;
-void runtime·entersyscall(void);
-void runtime·exitsyscall(void);
+
+static void unlockm(void);
+static void unwindm(void);
+
+// Call from Go to C.

 void
 runtime·cgocall(void (*fn)(void*), void *arg)
 {
-	G *oldlock;
+	Defer *d;

 	if(!runtime·iscgo)
 		runtime·throw("cgocall unavailable");
@@ -28,61 +107,49 @@ runtime·cgocall(void (*fn)(void*), void *arg)
 	 * Lock g to m to ensure we stay on the same stack if we do a
 	 * cgo callback.
 	 */
-	oldlock = m->lockedg;
-	m->lockedg = g;
-	g->lockedm = m;
+	d = nil;
+	if(m->lockedg == nil) {
+		m->lockedg = g;
+		g->lockedm = m;
+
+		// Add entry to defer stack in case of panic.
+		d = runtime·malloc(sizeof(*d));
+		d->fn = (byte*)unlockm;
+		d->siz = 0;
+		d->link = g->defer;
+		d->argp = (void*)-1;  // unused because unlockm never recovers
+		g->defer = d;
+	}

 	/*
 	 * Announce we are entering a system call
 	 * so that the scheduler knows to create another
 	 * M to run goroutines while we are in the
 	 * foreign code.
+	 *
+	 * The call to asmcgocall is guaranteed not to
+	 * split the stack and does not allocate memory,
+	 * so it is safe to call while "in a system call", outside
+	 * the $GOMAXPROCS accounting.
 	 */
 	runtime·entersyscall();
-	runtime·runcgo(fn, arg);
+	runtime·asmcgocall(fn, arg);
 	runtime·exitsyscall();

-	m->lockedg = oldlock;
-	if(oldlock == nil)
-		g->lockedm = nil;
-
-	return;
+	if(d != nil) {
+		if(g->defer != d || d->fn != (byte*)unlockm)
+			runtime·throw("runtime: bad defer entry in cgocallback");
+		g->defer = d->link;
+		runtime·free(d);
+		unlockm();
+	}
 }

-// When a C function calls back into Go, the wrapper function will
-// call this.  This switches to a Go stack, copies the arguments
-// (arg/argsize) on to the stack, calls the function, copies the
-// arguments back where they came from, and finally returns to the old
-// stack.
-void
-runtime·cgocallback(void (*fn)(void), void *arg, int32 argsize)
+static void
+unlockm(void)
 {
-	Gobuf oldsched, oldg1sched;
-	G *g1;
-	void *sp;
-
-	if(g != m->g0)
-		runtime·throw("bad g in cgocallback");
-
-	g1 = m->curg;
-	oldsched = m->sched;
-	oldg1sched = g1->sched;
-
-	runtime·startcgocallback(g1);
-
-	sp = g1->sched.sp - argsize;
-	if(sp < g1->stackguard - StackGuard - StackSystem + 8) // +8 for return address
-		runtime·throw("g stack overflow in cgocallback");
-	runtime·mcpy(sp, arg, argsize);
-
-	runtime·runcgocallback(g1, sp, fn);
-
-	runtime·mcpy(arg, sp, argsize);
-
-	runtime·endcgocallback(g1);
-
-	m->sched = oldsched;
-	g1->sched = oldg1sched;
+	m->lockedg = nil;
+	g->lockedm = nil;
 }

 void
@@ -92,6 +159,8 @@ runtime·Cgocalls(int64 ret)
 	FLUSH(&ret);
 }

+// Helper functions for cgo code.
+
 void (*_cgo_malloc)(void*);
 void (*_cgo_free)(void*);

@@ -115,3 +184,63 @@ runtime·cfree(void *p)
 	runtime·cgocall(_cgo_free, p);
 }

+// Call from C back to Go.
+
+void
+runtime·cgocallbackg(void (*fn)(void), void *arg, uintptr argsize)
+{
+	Defer *d;
+
+	if(g != m->curg)
+		runtime·throw("runtime: bad g in cgocallback");
+
+	runtime·exitsyscall();	// coming out of cgo call
+
+	// Add entry to defer stack in case of panic.
+	d = runtime·malloc(sizeof(*d));
+	d->fn = (byte*)unwindm;
+	d->siz = 0;
+	d->link = g->defer;
+	d->argp = (void*)-1;  // unused because unwindm never recovers
+	g->defer = d;
+
+	// Invoke callback.
+	reflect·call((byte*)fn, arg, argsize);
+
+	// Pop defer.
+	// Do not unwind m->g0->sched.sp.
+	// Our caller, cgocallback, will do that.
+	if(g->defer != d || d->fn != (byte*)unwindm)
+		runtime·throw("runtime: bad defer entry in cgocallback");
+	g->defer = d->link;
+	runtime·free(d);
+
+	runtime·entersyscall();	// going back to cgo call
+}
+
+static void
+unwindm(void)
+{
+	// Restore sp saved by cgocallback during
+	// unwind of g's stack (see comment at top of file).
+	switch(thechar){
+	default:
+		runtime·throw("runtime: unwindm not implemented");
+	case '8':
+	case '6':
+		m->g0->sched.sp = *(void**)m->g0->sched.sp;
+		break;
+	}
+}
+
+void
+runtime·badcgocallback(void)	// called from assembly
+{
+	runtime·throw("runtime: misaligned stack in cgocallback");
+}
+
+void
+runtime·cgounimpl(void)	// called from (incomplete) assembly
+{
+	runtime·throw("runtime: cgo not implemented");
+}
--- a/src/pkg/runtime/cgocall.h
+++ b/src/pkg/runtime/cgocall.h
@@ -7,6 +7,6 @@
 */

 void runtime·cgocall(void (*fn)(void*), void*);
-void runtime·cgocallback(void (*fn)(void), void*, int32);
+void runtime·cgocallback(void (*fn)(void), void*, uintptr);
 void *runtime·cmalloc(uintptr);
 void runtime·cfree(void*);
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -379,8 +379,6 @@ mark(void)
 		case Gdead:
 			break;
 		case Grunning:
-		case Grecovery:
-		case Gstackalloc:
 			if(gp != g)
 				runtime·throw("mark - world not stopped");
 			scanstack(gp);

--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -103,8 +103,6 @@ enum
 	Gwaiting,
 	Gmoribund,
 	Gdead,
-	Grecovery,
-	Gstackalloc,
 };
 enum
 {
@@ -219,7 +217,6 @@ struct	M
 	uint64	procid;		// for debuggers, but offset not hard-coded
 	G*	gsignal;	// signal-handling G
 	uint32	tls[8];		// thread-local storage (for 386 extern register)
-	Gobuf	sched;	// scheduling stack
 	G*	curg;		// current running goroutine
 	int32	id;
 	int32	mallocing;
@@ -385,7 +382,7 @@ int32	runtime·charntorune(int32*, uint8*, int32);

 void	runtime·gogo(Gobuf*, uintptr);
 void	runtime·gogocall(Gobuf*, void(*)(void));
-uintptr	runtime·gosave(Gobuf*);
+void	runtime·gosave(Gobuf*);
 void	runtime·lessstack(void);
 void	runtime·goargs(void);
 void	runtime·goenvs(void);
@@ -442,17 +439,15 @@ void	runtime·walkfintab(void (*fn)(void*));
 void	runtime·runpanic(Panic*);
 void*	runtime·getcallersp(void*);
 int32	runtime·mcount(void);
+void	runtime·mcall(void(*)(G*));

 void	runtime·exit(int32);
 void	runtime·breakpoint(void);
 void	runtime·gosched(void);
 void	runtime·goexit(void);
-void	runtime·runcgo(void (*fn)(void*), void*);
-void	runtime·runcgocallback(G*, void*, void (*fn)());
+void	runtime·asmcgocall(void (*fn)(void*), void*);
 void	runtime·entersyscall(void);
 void	runtime·exitsyscall(void);
-void	runtime·startcgocallback(G*);
-void	runtime·endcgocallback(G*);
 G*	runtime·newproc1(byte*, byte*, int32, int32, void*);
 void	runtime·siginit(void);
 bool	runtime·sigsend(int32 sig);