runtime: improve Linux mutex

The implementation is hybrid active/passive spin/blocking mutex. The design minimizes amount of context switches and futex calls. The idea is that all critical sections in runtime are intentially small, so pure blocking mutex behaves badly causing a lot of context switches, thread parking/unparking and kernel calls. Note that some synthetic benchmarks become somewhat slower, that's due to increased contention on other data structures, it should not affect programs that do any real work. On 2 x Intel E5620, 8 HT cores, 2.4GHz benchmark old ns/op new ns/op delta BenchmarkSelectContended 521.00 503.00 -3.45% BenchmarkSelectContended-2 661.00 320.00 -51.59% BenchmarkSelectContended-4 1139.00 629.00 -44.78% BenchmarkSelectContended-8 2870.00 878.00 -69.41% BenchmarkSelectContended-16 5276.00 818.00 -84.50% BenchmarkChanContended 112.00 103.00 -8.04% BenchmarkChanContended-2 631.00 174.00 -72.42% BenchmarkChanContended-4 682.00 272.00 -60.12% BenchmarkChanContended-8 1601.00 520.00 -67.52% BenchmarkChanContended-16 3100.00 372.00 -88.00% BenchmarkChanSync 253.00 239.00 -5.53% BenchmarkChanSync-2 5030.00 4648.00 -7.59% BenchmarkChanSync-4 4826.00 4694.00 -2.74% BenchmarkChanSync-8 4778.00 4713.00 -1.36% BenchmarkChanSync-16 5289.00 4710.00 -10.95% BenchmarkChanProdCons0 273.00 254.00 -6.96% BenchmarkChanProdCons0-2 599.00 400.00 -33.22% BenchmarkChanProdCons0-4 1168.00 659.00 -43.58% BenchmarkChanProdCons0-8 2831.00 1057.00 -62.66% BenchmarkChanProdCons0-16 4197.00 1037.00 -75.29% BenchmarkChanProdCons10 150.00 140.00 -6.67% BenchmarkChanProdCons10-2 607.00 268.00 -55.85% BenchmarkChanProdCons10-4 1137.00 404.00 -64.47% BenchmarkChanProdCons10-8 2115.00 828.00 -60.85% BenchmarkChanProdCons10-16 4283.00 855.00 -80.04% BenchmarkChanProdCons100 117.00 110.00 -5.98% BenchmarkChanProdCons100-2 558.00 218.00 -60.93% BenchmarkChanProdCons100-4 722.00 287.00 -60.25% BenchmarkChanProdCons100-8 1840.00 431.00 -76.58% BenchmarkChanProdCons100-16 3394.00 448.00 -86.80% BenchmarkChanProdConsWork0 2014.00 1996.00 -0.89% BenchmarkChanProdConsWork0-2 1207.00 1127.00 -6.63% BenchmarkChanProdConsWork0-4 1913.00 611.00 -68.06% BenchmarkChanProdConsWork0-8 3016.00 949.00 -68.53% BenchmarkChanProdConsWork0-16 4320.00 1154.00 -73.29% BenchmarkChanProdConsWork10 1906.00 1897.00 -0.47% BenchmarkChanProdConsWork10-2 1123.00 1033.00 -8.01% BenchmarkChanProdConsWork10-4 1076.00 571.00 -46.93% BenchmarkChanProdConsWork10-8 2748.00 1096.00 -60.12% BenchmarkChanProdConsWork10-16 4600.00 1105.00 -75.98% BenchmarkChanProdConsWork100 1884.00 1852.00 -1.70% BenchmarkChanProdConsWork100-2 1235.00 1146.00 -7.21% BenchmarkChanProdConsWork100-4 1217.00 619.00 -49.14% BenchmarkChanProdConsWork100-8 1534.00 509.00 -66.82% BenchmarkChanProdConsWork100-16 4126.00 918.00 -77.75% BenchmarkSyscall 34.40 33.30 -3.20% BenchmarkSyscall-2 160.00 121.00 -24.38% BenchmarkSyscall-4 131.00 136.00 +3.82% BenchmarkSyscall-8 139.00 131.00 -5.76% BenchmarkSyscall-16 161.00 168.00 +4.35% BenchmarkSyscallWork 950.00 950.00 +0.00% BenchmarkSyscallWork-2 481.00 480.00 -0.21% BenchmarkSyscallWork-4 268.00 270.00 +0.75% BenchmarkSyscallWork-8 156.00 169.00 +8.33% BenchmarkSyscallWork-16 188.00 184.00 -2.13% BenchmarkSemaSyntNonblock 36.40 35.60 -2.20% BenchmarkSemaSyntNonblock-2 81.40 45.10 -44.59% BenchmarkSemaSyntNonblock-4 126.00 108.00 -14.29% BenchmarkSemaSyntNonblock-8 112.00 112.00 +0.00% BenchmarkSemaSyntNonblock-16 110.00 112.00 +1.82% BenchmarkSemaSyntBlock 35.30 35.30 +0.00% BenchmarkSemaSyntBlock-2 118.00 124.00 +5.08% BenchmarkSemaSyntBlock-4 105.00 108.00 +2.86% BenchmarkSemaSyntBlock-8 101.00 111.00 +9.90% BenchmarkSemaSyntBlock-16 112.00 118.00 +5.36% BenchmarkSemaWorkNonblock 810.00 811.00 +0.12% BenchmarkSemaWorkNonblock-2 476.00 414.00 -13.03% BenchmarkSemaWorkNonblock-4 238.00 228.00 -4.20% BenchmarkSemaWorkNonblock-8 140.00 126.00 -10.00% BenchmarkSemaWorkNonblock-16 117.00 116.00 -0.85% BenchmarkSemaWorkBlock 810.00 811.00 +0.12% BenchmarkSemaWorkBlock-2 454.00 466.00 +2.64% BenchmarkSemaWorkBlock-4 243.00 241.00 -0.82% BenchmarkSemaWorkBlock-8 145.00 137.00 -5.52% BenchmarkSemaWorkBlock-16 132.00 123.00 -6.82% BenchmarkContendedSemaphore 123.00 102.00 -17.07% BenchmarkContendedSemaphore-2 34.80 34.90 +0.29% BenchmarkContendedSemaphore-4 34.70 34.80 +0.29% BenchmarkContendedSemaphore-8 34.70 34.70 +0.00% BenchmarkContendedSemaphore-16 34.80 34.70 -0.29% BenchmarkMutex 26.80 26.00 -2.99% BenchmarkMutex-2 108.00 45.20 -58.15% BenchmarkMutex-4 103.00 127.00 +23.30% BenchmarkMutex-8 109.00 147.00 +34.86% BenchmarkMutex-16 102.00 152.00 +49.02% BenchmarkMutexSlack 27.00 26.90 -0.37% BenchmarkMutexSlack-2 149.00 165.00 +10.74% BenchmarkMutexSlack-4 121.00 209.00 +72.73% BenchmarkMutexSlack-8 101.00 158.00 +56.44% BenchmarkMutexSlack-16 97.00 129.00 +32.99% BenchmarkMutexWork 792.00 794.00 +0.25% BenchmarkMutexWork-2 407.00 409.00 +0.49% BenchmarkMutexWork-4 220.00 209.00 -5.00% BenchmarkMutexWork-8 267.00 160.00 -40.07% BenchmarkMutexWork-16 315.00 300.00 -4.76% BenchmarkMutexWorkSlack 792.00 793.00 +0.13% BenchmarkMutexWorkSlack-2 406.00 404.00 -0.49% BenchmarkMutexWorkSlack-4 225.00 212.00 -5.78% BenchmarkMutexWorkSlack-8 268.00 136.00 -49.25% BenchmarkMutexWorkSlack-16 300.00 300.00 +0.00% BenchmarkRWMutexWrite100 27.10 27.00 -0.37% BenchmarkRWMutexWrite100-2 33.10 40.80 +23.26% BenchmarkRWMutexWrite100-4 113.00 88.10 -22.04% BenchmarkRWMutexWrite100-8 119.00 95.30 -19.92% BenchmarkRWMutexWrite100-16 148.00 109.00 -26.35% BenchmarkRWMutexWrite10 29.60 29.40 -0.68% BenchmarkRWMutexWrite10-2 111.00 61.40 -44.68% BenchmarkRWMutexWrite10-4 270.00 208.00 -22.96% BenchmarkRWMutexWrite10-8 204.00 185.00 -9.31% BenchmarkRWMutexWrite10-16 261.00 190.00 -27.20% BenchmarkRWMutexWorkWrite100 1040.00 1036.00 -0.38% BenchmarkRWMutexWorkWrite100-2 593.00 580.00 -2.19% BenchmarkRWMutexWorkWrite100-4 470.00 365.00 -22.34% BenchmarkRWMutexWorkWrite100-8 468.00 289.00 -38.25% BenchmarkRWMutexWorkWrite100-16 604.00 374.00 -38.08% BenchmarkRWMutexWorkWrite10 951.00 951.00 +0.00% BenchmarkRWMutexWorkWrite10-2 1001.00 928.00 -7.29% BenchmarkRWMutexWorkWrite10-4 1555.00 1006.00 -35.31% BenchmarkRWMutexWorkWrite10-8 2085.00 1171.00 -43.84% BenchmarkRWMutexWorkWrite10-16 2082.00 1614.00 -22.48% R=rsc, iant, msolo, fw, iant CC=golang-dev https://golang.org/cl/4711045

runtime: improve Linux mutex
The implementation is hybrid active/passive spin/blocking mutex. The design minimizes amount of context switches and futex calls. The idea is that all critical sections in runtime are intentially small, so pure blocking mutex behaves badly causing a lot of context switches, thread parking/unparking and kernel calls. Note that some synthetic benchmarks become somewhat slower, that's due to increased contention on other data structures, it should not affect programs that do any real work. On 2 x Intel E5620, 8 HT cores, 2.4GHz benchmark old ns/op new ns/op delta BenchmarkSelectContended 521.00 503.00 -3.45% BenchmarkSelectContended-2 661.00 320.00 -51.59% BenchmarkSelectContended-4 1139.00 629.00 -44.78% BenchmarkSelectContended-8 2870.00 878.00 -69.41% BenchmarkSelectContended-16 5276.00 818.00 -84.50% BenchmarkChanContended 112.00 103.00 -8.04% BenchmarkChanContended-2 631.00 174.00 -72.42% BenchmarkChanContended-4 682.00 272.00 -60.12% BenchmarkChanContended-8 1601.00 520.00 -67.52% BenchmarkChanContended-16 3100.00 372.00 -88.00% BenchmarkChanSync 253.00 239.00 -5.53% BenchmarkChanSync-2 5030.00 4648.00 -7.59% BenchmarkChanSync-4 4826.00 4694.00 -2.74% BenchmarkChanSync-8 4778.00 4713.00 -1.36% BenchmarkChanSync-16 5289.00 4710.00 -10.95% BenchmarkChanProdCons0 273.00 254.00 -6.96% BenchmarkChanProdCons0-2 599.00 400.00 -33.22% BenchmarkChanProdCons0-4 1168.00 659.00 -43.58% BenchmarkChanProdCons0-8 2831.00 1057.00 -62.66% BenchmarkChanProdCons0-16 4197.00 1037.00 -75.29% BenchmarkChanProdCons10 150.00 140.00 -6.67% BenchmarkChanProdCons10-2 607.00 268.00 -55.85% BenchmarkChanProdCons10-4 1137.00 404.00 -64.47% BenchmarkChanProdCons10-8 2115.00 828.00 -60.85% BenchmarkChanProdCons10-16 4283.00 855.00 -80.04% BenchmarkChanProdCons100 117.00 110.00 -5.98% BenchmarkChanProdCons100-2 558.00 218.00 -60.93% BenchmarkChanProdCons100-4 722.00 287.00 -60.25% BenchmarkChanProdCons100-8 1840.00 431.00 -76.58% BenchmarkChanProdCons100-16 3394.00 448.00 -86.80% BenchmarkChanProdConsWork0 2014.00 1996.00 -0.89% BenchmarkChanProdConsWork0-2 1207.00 1127.00 -6.63% BenchmarkChanProdConsWork0-4 1913.00 611.00 -68.06% BenchmarkChanProdConsWork0-8 3016.00 949.00 -68.53% BenchmarkChanProdConsWork0-16 4320.00 1154.00 -73.29% BenchmarkChanProdConsWork10 1906.00 1897.00 -0.47% BenchmarkChanProdConsWork10-2 1123.00 1033.00 -8.01% BenchmarkChanProdConsWork10-4 1076.00 571.00 -46.93% BenchmarkChanProdConsWork10-8 2748.00 1096.00 -60.12% BenchmarkChanProdConsWork10-16 4600.00 1105.00 -75.98% BenchmarkChanProdConsWork100 1884.00 1852.00 -1.70% BenchmarkChanProdConsWork100-2 1235.00 1146.00 -7.21% BenchmarkChanProdConsWork100-4 1217.00 619.00 -49.14% BenchmarkChanProdConsWork100-8 1534.00 509.00 -66.82% BenchmarkChanProdConsWork100-16 4126.00 918.00 -77.75% BenchmarkSyscall 34.40 33.30 -3.20% BenchmarkSyscall-2 160.00 121.00 -24.38% BenchmarkSyscall-4 131.00 136.00 +3.82% BenchmarkSyscall-8 139.00 131.00 -5.76% BenchmarkSyscall-16 161.00 168.00 +4.35% BenchmarkSyscallWork 950.00 950.00 +0.00% BenchmarkSyscallWork-2 481.00 480.00 -0.21% BenchmarkSyscallWork-4 268.00 270.00 +0.75% BenchmarkSyscallWork-8 156.00 169.00 +8.33% BenchmarkSyscallWork-16 188.00 184.00 -2.13% BenchmarkSemaSyntNonblock 36.40 35.60 -2.20% BenchmarkSemaSyntNonblock-2 81.40 45.10 -44.59% BenchmarkSemaSyntNonblock-4 126.00 108.00 -14.29% BenchmarkSemaSyntNonblock-8 112.00 112.00 +0.00% BenchmarkSemaSyntNonblock-16 110.00 112.00 +1.82% BenchmarkSemaSyntBlock 35.30 35.30 +0.00% BenchmarkSemaSyntBlock-2 118.00 124.00 +5.08% BenchmarkSemaSyntBlock-4 105.00 108.00 +2.86% BenchmarkSemaSyntBlock-8 101.00 111.00 +9.90% BenchmarkSemaSyntBlock-16 112.00 118.00 +5.36% BenchmarkSemaWorkNonblock 810.00 811.00 +0.12% BenchmarkSemaWorkNonblock-2 476.00 414.00 -13.03% BenchmarkSemaWorkNonblock-4 238.00 228.00 -4.20% BenchmarkSemaWorkNonblock-8 140.00 126.00 -10.00% BenchmarkSemaWorkNonblock-16 117.00 116.00 -0.85% BenchmarkSemaWorkBlock 810.00 811.00 +0.12% BenchmarkSemaWorkBlock-2 454.00 466.00 +2.64% BenchmarkSemaWorkBlock-4 243.00 241.00 -0.82% BenchmarkSemaWorkBlock-8 145.00 137.00 -5.52% BenchmarkSemaWorkBlock-16 132.00 123.00 -6.82% BenchmarkContendedSemaphore 123.00 102.00 -17.07% BenchmarkContendedSemaphore-2 34.80 34.90 +0.29% BenchmarkContendedSemaphore-4 34.70 34.80 +0.29% BenchmarkContendedSemaphore-8 34.70 34.70 +0.00% BenchmarkContendedSemaphore-16 34.80 34.70 -0.29% BenchmarkMutex 26.80 26.00 -2.99% BenchmarkMutex-2 108.00 45.20 -58.15% BenchmarkMutex-4 103.00 127.00 +23.30% BenchmarkMutex-8 109.00 147.00 +34.86% BenchmarkMutex-16 102.00 152.00 +49.02% BenchmarkMutexSlack 27.00 26.90 -0.37% BenchmarkMutexSlack-2 149.00 165.00 +10.74% BenchmarkMutexSlack-4 121.00 209.00 +72.73% BenchmarkMutexSlack-8 101.00 158.00 +56.44% BenchmarkMutexSlack-16 97.00 129.00 +32.99% BenchmarkMutexWork 792.00 794.00 +0.25% BenchmarkMutexWork-2 407.00 409.00 +0.49% BenchmarkMutexWork-4 220.00 209.00 -5.00% BenchmarkMutexWork-8 267.00 160.00 -40.07% BenchmarkMutexWork-16 315.00 300.00 -4.76% BenchmarkMutexWorkSlack 792.00 793.00 +0.13% BenchmarkMutexWorkSlack-2 406.00 404.00 -0.49% BenchmarkMutexWorkSlack-4 225.00 212.00 -5.78% BenchmarkMutexWorkSlack-8 268.00 136.00 -49.25% BenchmarkMutexWorkSlack-16 300.00 300.00 +0.00% BenchmarkRWMutexWrite100 27.10 27.00 -0.37% BenchmarkRWMutexWrite100-2 33.10 40.80 +23.26% BenchmarkRWMutexWrite100-4 113.00 88.10 -22.04% BenchmarkRWMutexWrite100-8 119.00 95.30 -19.92% BenchmarkRWMutexWrite100-16 148.00 109.00 -26.35% BenchmarkRWMutexWrite10 29.60 29.40 -0.68% BenchmarkRWMutexWrite10-2 111.00 61.40 -44.68% BenchmarkRWMutexWrite10-4 270.00 208.00 -22.96% BenchmarkRWMutexWrite10-8 204.00 185.00 -9.31% BenchmarkRWMutexWrite10-16 261.00 190.00 -27.20% BenchmarkRWMutexWorkWrite100 1040.00 1036.00 -0.38% BenchmarkRWMutexWorkWrite100-2 593.00 580.00 -2.19% BenchmarkRWMutexWorkWrite100-4 470.00 365.00 -22.34% BenchmarkRWMutexWorkWrite100-8 468.00 289.00 -38.25% BenchmarkRWMutexWorkWrite100-16 604.00 374.00 -38.08% BenchmarkRWMutexWorkWrite10 951.00 951.00 +0.00% BenchmarkRWMutexWorkWrite10-2 1001.00 928.00 -7.29% BenchmarkRWMutexWorkWrite10-4 1555.00 1006.00 -35.31% BenchmarkRWMutexWorkWrite10-8 2085.00 1171.00 -43.84% BenchmarkRWMutexWorkWrite10-16 2082.00 1614.00 -22.48% R=rsc, iant, msolo, fw, iant CC=golang-dev https://golang.org/cl/4711045
4e5086b9 · Dmitriy Vyukov · Russ Cox · bed7e3ed · 4e5086b9 · 4e5086b9
Commit 4e5086b9 authored Jul 29, 2011 by Dmitriy Vyukov Committed by Russ Cox Jul 29, 2011
18 changed files
--- a/src/cmd/6a/lex.c
+++ b/src/cmd/6a/lex.c
@@ -527,6 +527,7 @@ struct
 	"OUTSB",	LTYPE0,	AOUTSB,
 	"OUTSL",	LTYPE0,	AOUTSL,
 	"OUTSW",	LTYPE0,	AOUTSW,
+	"PAUSE",	LTYPEN,	APAUSE,
 	"POPAL",	LTYPE0,	APOPAL,
 	"POPAW",	LTYPE0,	APOPAW,
 	"POPFL",	LTYPE0,	APOPFL,

--- a/src/cmd/6l/6.out.h
+++ b/src/cmd/6l/6.out.h
@@ -190,6 +190,7 @@ enum	as
 	AOUTSB,
 	AOUTSL,
 	AOUTSW,
+	APAUSE,
 	APOPAL,
 	APOPAW,
 	APOPFL,

--- a/src/cmd/6l/optab.c
+++ b/src/cmd/6l/optab.c
@@ -919,6 +919,7 @@ Optab optab[] =
 	{ APADDW,	ymm,	Py, 0xfd,Pe,0xfd },
 	{ APAND,	ymm,	Py, 0xdb,Pe,0xdb },
 	{ APANDN,	ymm,	Py, 0xdf,Pe,0xdf },
+	{ APAUSE,	ynone,	Px, 0xf3,0x90 },
 	{ APAVGB,	ymm,	Py, 0xe0,Pe,0xe0 },
 	{ APAVGW,	ymm,	Py, 0xe3,Pe,0xe3 },
 	{ APCMPEQB,	ymm,	Py, 0x74,Pe,0x74 },

--- a/src/cmd/8a/lex.c
+++ b/src/cmd/8a/lex.c
@@ -421,6 +421,7 @@ struct
 	"OUTSB",	LTYPE0,	AOUTSB,
 	"OUTSL",	LTYPE0,	AOUTSL,
 	"OUTSW",	LTYPE0,	AOUTSW,
+	"PAUSE",	LTYPEN,	APAUSE,
 	"POPAL",	LTYPE0,	APOPAL,
 	"POPAW",	LTYPE0,	APOPAW,
 	"POPFL",	LTYPE0,	APOPFL,

--- a/src/cmd/8l/8.out.h
+++ b/src/cmd/8l/8.out.h
@@ -180,6 +180,7 @@ enum	as
 	AOUTSB,
 	AOUTSL,
 	AOUTSW,
+	APAUSE,
 	APOPAL,
 	APOPAW,
 	APOPFL,

--- a/src/cmd/8l/optab.c
+++ b/src/cmd/8l/optab.c
@@ -495,6 +495,7 @@ Optab optab[] =
 	{ AOUTSB,	ynone,	Pb, 0x6e },
 	{ AOUTSL,	ynone,	Px, 0x6f },
 	{ AOUTSW,	ynone,	Pe, 0x6f },
+	{ APAUSE,	ynone,	Px, 0xf3,0x90 },
 	{ APOPAL,	ynone,	Px, 0x61 },
 	{ APOPAW,	ynone,	Pe, 0x61 },
 	{ APOPFL,	ynone,	Px, 0x9d },

--- a/src/pkg/runtime/386/asm.s
+++ b/src/pkg/runtime/386/asm.s
@@ -334,6 +334,20 @@ TEXT runtime·xadd(SB), 7, $0
 	ADDL	CX, AX
 	RET

+TEXT runtime·xchg(SB), 7, $0
+	MOVL	4(SP), BX
+	MOVL	8(SP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+TEXT runtime·procyield(SB),7,$0
+	MOVL	4(SP), AX
+again:
+	PAUSE
+	SUBL	$1, AX
+	JNZ	again
+	RET
+
 TEXT runtime·atomicstorep(SB), 7, $0
 	MOVL	4(SP), BX
 	MOVL	8(SP), AX

--- a/src/pkg/runtime/amd64/asm.s
+++ b/src/pkg/runtime/amd64/asm.s
@@ -378,6 +378,20 @@ TEXT runtime·xadd(SB), 7, $0
 	ADDL	CX, AX
 	RET

+TEXT runtime·xchg(SB), 7, $0
+	MOVQ	8(SP), BX
+	MOVL	16(SP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+TEXT runtime·procyield(SB),7,$0
+	MOVL	8(SP), AX
+again:
+	PAUSE
+	SUBL	$1, AX
+	JNZ	again
+	RET
+
 TEXT runtime·atomicstorep(SB), 7, $0
 	MOVQ	8(SP), BX
 	MOVQ	16(SP), AX

--- a/src/pkg/runtime/arm/atomic.c
+++ b/src/pkg/runtime/arm/atomic.c
@@ -19,6 +19,29 @@ runtime·xadd(uint32 volatile *val, int32 delta)
 	}
 }

+#pragma textflag 7
+uint32
+runtime·xchg(uint32 volatile* addr, uint32 v)
+{
+	uint32 old;
+
+	for(;;) {
+		old = *addr;
+		if(runtime·cas(addr, old, v))
+			return old;
+	}
+}
+
+#pragma textflag 7
+void
+runtime·procyield(uint32 cnt)
+{
+	uint32 volatile i;
+
+	for(i = 0; i < cnt; i++) {
+	}
+}
+
 #pragma textflag 7
 uint32
 runtime·atomicload(uint32 volatile* addr)

--- a/src/pkg/runtime/linux/386/defs.h
+++ b/src/pkg/runtime/linux/386/defs.h
@@ -61,6 +61,8 @@ enum {
 	ITIMER_REAL = 0,
 	ITIMER_VIRTUAL = 0x1,
 	ITIMER_PROF = 0x2,
+	O_RDONLY = 0,
+	O_CLOEXEC = 02000000,
 };

 // Types

--- a/src/pkg/runtime/linux/386/sys.s
+++ b/src/pkg/runtime/linux/386/sys.s
@@ -22,9 +22,31 @@ TEXT runtime·exit1(SB),7,$0
 	INT $3	// not reached
 	RET

+TEXT runtime·open(SB),7,$0
+	MOVL	$5, AX		// syscall - open
+	MOVL	4(SP), BX
+	MOVL	8(SP), CX
+	MOVL	12(SP), DX
+	INT	$0x80
+	RET
+
+TEXT runtime·close(SB),7,$0
+	MOVL	$6, AX		// syscall - close
+	MOVL	4(SP), BX
+	INT	$0x80
+	RET
+
 TEXT runtime·write(SB),7,$0
 	MOVL	$4, AX		// syscall - write
-	MOVL	4(SP),  BX
+	MOVL	4(SP), BX
+	MOVL	8(SP), CX
+	MOVL	12(SP), DX
+	INT	$0x80
+	RET
+
+TEXT runtime·read(SB),7,$0
+	MOVL	$3, AX		// syscall - read
+	MOVL	4(SP), BX
 	MOVL	8(SP), CX
 	MOVL	12(SP), DX
 	INT	$0x80
@@ -315,3 +337,8 @@ TEXT runtime·setldt(SB),7,$32
 	MOVW	AX, GS

 	RET
+
+TEXT runtime·osyield(SB),7,$0
+	MOVL	$158, AX
+	INT	$0x80
+	RET
--- a/src/pkg/runtime/linux/amd64/defs.h
+++ b/src/pkg/runtime/linux/amd64/defs.h
@@ -61,6 +61,8 @@ enum {
 	ITIMER_REAL = 0,
 	ITIMER_VIRTUAL = 0x1,
 	ITIMER_PROF = 0x2,
+	O_RDONLY = 0,
+	O_CLOEXEC = 02000000,
 };

 // Types

--- a/src/pkg/runtime/linux/amd64/sys.s
+++ b/src/pkg/runtime/linux/amd64/sys.s
@@ -28,6 +28,12 @@ TEXT runtime·open(SB),7,$0-16
 	SYSCALL
 	RET

+TEXT runtime·close(SB),7,$0-16
+	MOVL	8(SP), DI
+	MOVL	$3, AX			// syscall entry
+	SYSCALL
+	RET
+
 TEXT runtime·write(SB),7,$0-24
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
@@ -36,6 +42,14 @@ TEXT runtime·write(SB),7,$0-24
 	SYSCALL
 	RET

+TEXT runtime·read(SB),7,$0-24
+	MOVL	8(SP), DI
+	MOVQ	16(SP), SI
+	MOVL	24(SP), DX
+	MOVL	$0, AX			// syscall entry
+	SYSCALL
+	RET
+
 TEXT runtime·raisesigpipe(SB),7,$12
 	MOVL	$186, AX	// syscall - gettid
 	SYSCALL
@@ -232,3 +246,7 @@ TEXT runtime·settls(SB),7,$32
 	CALL	runtime·notok(SB)
 	RET

+TEXT runtime·osyield(SB),7,$0
+	MOVL	$24, AX
+	SYSCALL
+	RET
--- a/src/pkg/runtime/linux/arm/defs.h
+++ b/src/pkg/runtime/linux/arm/defs.h
@@ -61,6 +61,8 @@ enum {
 	ITIMER_REAL = 0,
 	ITIMER_PROF = 0x2,
 	ITIMER_VIRTUAL = 0x1,
+	O_RDONLY = 0,
+	O_CLOEXEC = 02000000,
 };

 // Types

--- a/src/pkg/runtime/linux/arm/sys.s
+++ b/src/pkg/runtime/linux/arm/sys.s
@@ -15,7 +15,10 @@
 #define SYS_BASE 0x0

 #define SYS_exit (SYS_BASE + 1)
+#define SYS_read (SYS_BASE + 3)
 #define SYS_write (SYS_BASE + 4)
+#define SYS_open (SYS_BASE + 5)
+#define SYS_close (SYS_BASE + 6)
 #define SYS_gettimeofday (SYS_BASE + 78)
 #define SYS_clone (SYS_BASE + 120)
 #define SYS_rt_sigreturn (SYS_BASE + 173)
@@ -29,10 +32,25 @@
 #define SYS_mincore (SYS_BASE + 219)
 #define SYS_gettid (SYS_BASE + 224)
 #define SYS_tkill (SYS_BASE + 238)
+#define SYS_sched_yield (SYS_BASE + 158)

 #define ARM_BASE (SYS_BASE + 0x0f0000)
 #define SYS_ARM_cacheflush (ARM_BASE + 2)

+TEXT runtime·open(SB),7,$0
+	MOVW	0(FP), R0
+	MOVW	4(FP), R1
+	MOVW	8(FP), R2
+	MOVW	$SYS_open, R7
+	SWI	$0
+	RET
+
+TEXT runtime·close(SB),7,$0
+	MOVW	0(FP), R0
+	MOVW	$SYS_close, R7
+	SWI	$0
+	RET
+
 TEXT runtime·write(SB),7,$0
 	MOVW	0(FP), R0
 	MOVW	4(FP), R1
@@ -41,6 +59,14 @@ TEXT runtime·write(SB),7,$0
 	SWI	$0
 	RET

+TEXT runtime·read(SB),7,$0
+	MOVW	0(FP), R0
+	MOVW	4(FP), R1
+	MOVW	8(FP), R2
+	MOVW	$SYS_read, R7
+	SWI	$0
+	RET
+
 TEXT runtime·exit(SB),7,$-4
 	MOVW	0(FP), R0
 	MOVW	$SYS_exit_group, R7
@@ -287,3 +313,7 @@ cascheck:
 TEXT runtime·casp(SB),7,$0
 	B	runtime·cas(SB)

+TEXT runtime·osyield(SB),7,$0
+	MOVW	$SYS_sched_yield, R7
+	SWI	$0
+	RET
--- a/src/pkg/runtime/linux/thread.c
+++ b/src/pkg/runtime/linux/thread.c
@@ -8,6 +8,11 @@
 #include "stack.h"

 extern SigTab runtime·sigtab[];
+static int32 proccount;
+
+int32 runtime·open(uint8*, int32, int32);
+int32 runtime·close(int32);
+int32 runtime·read(int32, void*, int32);

 // Linux futex.
 //
@@ -15,11 +20,19 @@ extern SigTab runtime·sigtab[];
 //	futexwakeup(uint32 *addr)
 //
 // Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
-// Futexwakeup wakes up one thread sleeping on addr.
+// Futexwakeup wakes up threads sleeping on addr.
 // Futexsleep is allowed to wake up spuriously.

 enum
 {
+	MUTEX_UNLOCKED = 0,
+	MUTEX_LOCKED = 1,
+	MUTEX_SLEEPING = 2,
+
+	ACTIVE_SPIN = 4,
+	ACTIVE_SPIN_CNT = 30,
+	PASSIVE_SPIN = 1,
+
 	FUTEX_WAIT = 0,
 	FUTEX_WAKE = 1,

@@ -52,13 +65,13 @@ futexsleep(uint32 *addr, uint32 val)
 	runtime·futex(addr, FUTEX_WAIT, val, &longtime, nil, 0);
 }

-// If any procs are sleeping on addr, wake up at least one.
+// If any procs are sleeping on addr, wake up at most cnt.
 static void
-futexwakeup(uint32 *addr)
+futexwakeup(uint32 *addr, uint32 cnt)
 {
 	int64 ret;

-	ret = runtime·futex(addr, FUTEX_WAKE, 1, nil, nil, 0);
+	ret = runtime·futex(addr, FUTEX_WAKE, cnt, nil, nil, 0);

 	if(ret >= 0)
 		return;
@@ -66,70 +79,96 @@ futexwakeup(uint32 *addr)
 	// I don't know that futex wakeup can return
 	// EAGAIN or EINTR, but if it does, it would be
 	// safe to loop and call futex again.
-
-	runtime·prints("futexwakeup addr=");
-	runtime·printpointer(addr);
-	runtime·prints(" returned ");
-	runtime·printint(ret);
-	runtime·prints("\n");
+	runtime·printf("futexwakeup addr=%p returned %D\n", addr, ret);
 	*(int32*)0x1006 = 0x1006;
 }

+static int32
+getproccount(void)
+{
+	int32 fd, rd, cnt, cpustrlen;
+	byte *cpustr, *pos, *bufpos;
+	byte buf[256];
+
+	fd = runtime·open((byte*)"/proc/stat", O_RDONLY|O_CLOEXEC, 0);
+	if(fd == -1)
+		return 1;
+	cnt = 0;
+	bufpos = buf;
+	cpustr = (byte*)"\ncpu";
+	cpustrlen = runtime·findnull(cpustr);
+	for(;;) {
+		rd = runtime·read(fd, bufpos, sizeof(buf)-cpustrlen);
+		if(rd == -1)
+			break;
+		bufpos[rd] = 0;
+		for(pos=buf; pos=runtime·strstr(pos, cpustr); cnt++, pos++) {
+		}
+		if(rd < cpustrlen)
+			break;
+		runtime·memmove(buf, bufpos+rd-cpustrlen+1, cpustrlen-1);
+		bufpos = buf+cpustrlen-1;
+	}
+	runtime·close(fd);
+	return cnt ? cnt : 1;
+}

-// Lock and unlock.
-//
-// The lock state is a single 32-bit word that holds
-// a 31-bit count of threads waiting for the lock
-// and a single bit (the low bit) saying whether the lock is held.
-// The uncontended case runs entirely in user space.
-// When contention is detected, we defer to the kernel (futex).
-//
-// A reminder: compare-and-swap runtime·cas(addr, old, new) does
-//	if(*addr == old) { *addr = new; return 1; }
-//	else return 0;
-// but atomically.
-
+// Possible lock states are MUTEX_UNLOCKED, MUTEX_LOCKED and MUTEX_SLEEPING.
+// MUTEX_SLEEPING means that there is presumably at least one sleeping thread.
+// Note that there can be spinning threads during all states - they do not
+// affect mutex's state.
 static void
 futexlock(Lock *l)
 {
-	uint32 v;
+	uint32 i, v, wait, spin;

-again:
-	v = l->key;
-	if((v&1) == 0){
-		if(runtime·cas(&l->key, v, v|1)){
-			// Lock wasn't held; we grabbed it.
-			return;
+	// Speculative grab for lock.
+	v = runtime·xchg(&l->key, MUTEX_LOCKED);
+	if(v == MUTEX_UNLOCKED)
+		return;
+
+	// wait is either MUTEX_LOCKED or MUTEX_SLEEPING
+	// depending on whether there is a thread sleeping
+	// on this mutex.  If we ever change l->key from
+	// MUTEX_SLEEPING to some other value, we must be
+	// careful to change it back to MUTEX_SLEEPING before
+	// returning, to ensure that the sleeping thread gets
+	// its wakeup call.
+	wait = v;
+
+	if(proccount == 0)
+		proccount = getproccount();
+
+	// On uniprocessor's, no point spinning.
+	// On multiprocessors, spin for ACTIVE_SPIN attempts.
+	spin = 0;
+	if(proccount > 1)
+		spin = ACTIVE_SPIN;
+
+	for(;;) {
+		// Try for lock, spinning.
+		for(i = 0; i < spin; i++) {
+			while(l->key == MUTEX_UNLOCKED)
+				if(runtime·cas(&l->key, MUTEX_UNLOCKED, wait))
+						return;
+			runtime·procyield(ACTIVE_SPIN_CNT);
 		}
-		goto again;
-	}

-	// Lock was held; try to add ourselves to the waiter count.
-	if(!runtime·cas(&l->key, v, v+2))
-		goto again;
-
-	// We're accounted for, now sleep in the kernel.
-	//
-	// We avoid the obvious lock/unlock race because
-	// the kernel won't put us to sleep if l->key has
-	// changed underfoot and is no longer v+2.
-	//
-	// We only really care that (v&1) == 1 (the lock is held),
-	// and in fact there is a futex variant that could
-	// accommodate that check, but let's not get carried away.)
-	futexsleep(&l->key, v+2);
-
-	// We're awake: remove ourselves from the count.
-	for(;;){
-		v = l->key;
-		if(v < 2)
-			runtime·throw("bad lock key");
-		if(runtime·cas(&l->key, v, v-2))
-			break;
-	}
+		// Try for lock, rescheduling.
+		for(i=0; i < PASSIVE_SPIN; i++) {
+			while(l->key == MUTEX_UNLOCKED)
+				if(runtime·cas(&l->key, MUTEX_UNLOCKED, wait))
+					return;
+			runtime·osyield();
+		}

-	// Try for the lock again.
-	goto again;
+		// Sleep.
+		v = runtime·xchg(&l->key, MUTEX_SLEEPING);
+		if(v == MUTEX_UNLOCKED)
+			return;
+		wait = MUTEX_SLEEPING;
+		futexsleep(&l->key, MUTEX_SLEEPING);
+	}
 }

 static void
@@ -137,34 +176,26 @@ futexunlock(Lock *l)
 {
 	uint32 v;

-	// Atomically get value and clear lock bit.
-again:
-	v = l->key;
-	if((v&1) == 0)
+	v = runtime·xchg(&l->key, MUTEX_UNLOCKED);
+	if(v == MUTEX_UNLOCKED)
 		runtime·throw("unlock of unlocked lock");
-	if(!runtime·cas(&l->key, v, v&~1))
-		goto again;
-
-	// If there were waiters, wake one.
-	if(v & ~1)
-		futexwakeup(&l->key);
+	if(v == MUTEX_SLEEPING)
+		futexwakeup(&l->key, 1);
 }

 void
 runtime·lock(Lock *l)
 {
-	if(m->locks < 0)
-		runtime·throw("lock count");
-	m->locks++;
+	if(m->locks++ < 0)
+		runtime·throw("runtime·lock: lock count");
 	futexlock(l);
 }

 void
 runtime·unlock(Lock *l)
 {
-	m->locks--;
-	if(m->locks < 0)
-		runtime·throw("lock count");
+	if(--m->locks < 0)
+		runtime·throw("runtime·unlock: lock count");
 	futexunlock(l);
 }

@@ -175,35 +206,24 @@ runtime·destroylock(Lock*)


 // One-time notifications.
-//
-// Since the lock/unlock implementation already
-// takes care of sleeping in the kernel, we just reuse it.
-// (But it's a weird use, so it gets its own interface.)
-//
-// We use a lock to represent the event:
-// unlocked == event has happened.
-// Thus the lock starts out locked, and to wait for the
-// event you try to lock the lock.  To signal the event,
-// you unlock the lock.
-
 void
 runtime·noteclear(Note *n)
 {
-	n->lock.key = 0;	// memset(n, 0, sizeof *n)
-	futexlock(&n->lock);
+	n->state = 0;
 }

 void
 runtime·notewakeup(Note *n)
 {
-	futexunlock(&n->lock);
+	runtime·xchg(&n->state, 1);
+	futexwakeup(&n->state, 1<<30);
 }

 void
 runtime·notesleep(Note *n)
 {
-	futexlock(&n->lock);
-	futexunlock(&n->lock);	// Let other sleepers find out too.
+	while(runtime·atomicload(&n->state) == 0)
+		futexsleep(&n->state, 0);
 }



--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -131,7 +131,10 @@ struct	Usema
 union	Note
 {
 	struct {	// Linux
-		Lock	lock;
+		uint32	state;
+	};
+	struct {	// Windows
+		Lock lock;
 	};
 	struct {	// OS X
 		int32	wakeup;
@@ -382,6 +385,7 @@ extern	bool	runtime·iscgo;
 * common functions and data
 */
 int32	runtime·strcmp(byte*, byte*);
+byte*	runtime·strstr(byte*, byte*);
 int32	runtime·findnull(byte*);
 int32	runtime·findnullw(uint16*);
 void	runtime·dump(byte*, int32);
@@ -427,6 +431,7 @@ bool	runtime·casp(void**, void*, void*);
 // Don't confuse with XADD x86 instruction,
 // this one is actually 'addx', that is, add-and-fetch.
 uint32	runtime·xadd(uint32 volatile*, int32);
+uint32	runtime·xchg(uint32 volatile*, uint32);
 uint32	runtime·atomicload(uint32 volatile*);
 void*	runtime·atomicloadp(void* volatile*);
 void	runtime·atomicstorep(void* volatile*, void*);
@@ -596,6 +601,8 @@ void	runtime·semacquire(uint32*);
 void	runtime·semrelease(uint32*);
 String	runtime·signame(int32 sig);
 int32	runtime·gomaxprocsfunc(int32 n);
+void	runtime·procyield(uint32);
+void	runtime·osyield(void);

 void	runtime·mapassign(Hmap*, byte*, byte*);
 void	runtime·mapaccess(Hmap*, byte*, byte*, bool*);

--- a/src/pkg/runtime/string.goc
+++ b/src/pkg/runtime/string.goc
@@ -203,6 +203,28 @@ runtime·strcmp(byte *s1, byte *s2)
 	}
 }

+byte*
+runtime·strstr(byte *s1, byte *s2)
+{
+	byte *sp1, *sp2;
+
+	if(*s2 == 0)
+		return s1;
+	for(; *s1; s1++) {
+		if(*s1 != *s2)
+			continue;
+		sp1 = s1;
+		sp2 = s2;
+		for(;;) {
+			if(*sp2 == 0)
+				return s1;
+			if(*sp1++ != *sp2++)
+				break;
+		}
+	}
+	return nil;
+}
+
 func slicestring(si String, lindex int32, hindex int32) (so String) {
 	int32 l;