Commit 14f929af authored by Cherry Zhang

runtime/internal/atomic: improve ARM atomics

This is a follow-up of CL 93637. There, when we redirected sync/atomic
to runtime/internal/atomic, a few good implementations of ARM atomics
were lost. This CL brings most of them back, with some improvements.

- Change atomic Store to a plain store with memory barrier, as we
  already changed atomic Load to a plain load with memory barrier.

- Use native 64-bit atomics on ARMv7 and jump to the Go
  implementations on older machines, dropping the kernel helper.
  In particular, Load64 now just does loads instead of using Cas
  on the address being loaded from, so it also works on read-only
  memory (since we have already fixed 32-bit Load). The goarm
  dispatch is sketched below.
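
For illustration (not part of this CL's code), the goarm dispatch the
new assembly performs has roughly the following shape. This is a Go
sketch with stand-in names (goarm, armStore64, and goStore64 mirror
the runtime symbols in the diff; the real selection happens in
assembly):

	var goarm = 7 // stand-in for runtime·goarm, set at link time

	func armStore64(addr *uint64, v uint64) { *addr = v } // stub for the ARMv7 asm body
	func goStore64(addr *uint64, v uint64)  { *addr = v } // stub for the lock-based Go fallback

	// Store64 picks the native LDREXD/STREXD path on ARMv7 and
	// falls back to the lock-simulated Go version on older chips.
	func Store64(addr *uint64, v uint64) {
		if goarm >= 7 {
			armStore64(addr, v)
			return
		}
		goStore64(addr, v)
	}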

Change-Id: I725cd65cf945ae5200db81a35be3f251c9f7af14
Reviewed-on: https://go-review.googlesource.com/111315
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
parent 150b7286
src/runtime/internal/atomic/asm_arm.s
@@ -27,8 +27,8 @@ casl:
CMP R0, R2
BNE casfail
MOVB runtime·goarm(SB), R11
CMP $7, R11
MOVB runtime·goarm(SB), R8
CMP $7, R8
BLT 2(PC)
DMB MB_ISHST
@@ -37,8 +37,7 @@ casl:
BNE casl
MOVW $1, R0
MOVB runtime·goarm(SB), R11
CMP $7, R11
CMP $7, R8
BLT 2(PC)
DMB MB_ISH
@@ -49,12 +48,17 @@ casfail:
MOVB R0, ret+12(FP)
RET
// stubs
TEXT runtime∕internal∕atomic·Loadp(SB),NOSPLIT|NOFRAME,$0-8
B runtime∕internal∕atomic·Load(SB)
TEXT runtime∕internal∕atomic·Casuintptr(SB),NOSPLIT,$0-13
B runtime∕internal∕atomic·Cas(SB)
TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0-13
B runtime∕internal∕atomic·Cas(SB)
TEXT runtime∕internal∕atomic·Loaduintptr(SB),NOSPLIT,$0-8
B runtime∕internal∕atomic·Load(SB)
@@ -64,6 +68,9 @@ TEXT runtime∕internal∕atomic·Loaduint(SB),NOSPLIT,$0-8
TEXT runtime∕internal∕atomic·Storeuintptr(SB),NOSPLIT,$0-8
B runtime∕internal∕atomic·Store(SB)
TEXT runtime∕internal∕atomic·StorepNoWB(SB),NOSPLIT,$0-8
B runtime∕internal∕atomic·Store(SB)
TEXT runtime∕internal∕atomic·Xadduintptr(SB),NOSPLIT,$0-12
B runtime∕internal∕atomic·Xadd(SB)
@@ -72,3 +79,162 @@ TEXT runtime∕internal∕atomic·Loadint64(SB),NOSPLIT,$0-12
TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20
B runtime∕internal∕atomic·Xadd64(SB)
// 64-bit atomics
// The native ARM implementations use LDREXD/STREXD, which are
// available on ARMv6k or later. We use them only on ARMv7.
// On older ARM, we use Go implementations which simulate 64-bit
// atomics with locks.
TEXT armCas64<>(SB),NOSPLIT,$0-21
MOVW addr+0(FP), R1
// make unaligned atomic access panic
AND.S $7, R1, R2
BEQ 2(PC)
MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
MOVW old_lo+4(FP), R2
MOVW old_hi+8(FP), R3
MOVW new_lo+12(FP), R4
MOVW new_hi+16(FP), R5
cas64loop:
LDREXD (R1), R6 // loads R6 and R7
CMP R2, R6
BNE cas64fail
CMP R3, R7
BNE cas64fail
DMB MB_ISHST
STREXD R4, (R1), R0 // stores R4 and R5
CMP $0, R0
BNE cas64loop
MOVW $1, R0
DMB MB_ISH
MOVBU R0, swapped+20(FP)
RET
cas64fail:
MOVW $0, R0
MOVBU R0, swapped+20(FP)
RET
TEXT armXadd64<>(SB),NOSPLIT,$0-20
MOVW addr+0(FP), R1
// make unaligned atomic access panic
AND.S $7, R1, R2
BEQ 2(PC)
MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
MOVW delta_lo+4(FP), R2
MOVW delta_hi+8(FP), R3
add64loop:
LDREXD (R1), R4 // loads R4 and R5
ADD.S R2, R4
ADC R3, R5
DMB MB_ISHST
STREXD R4, (R1), R0 // stores R4 and R5
CMP $0, R0
BNE add64loop
DMB MB_ISH
MOVW R4, new_lo+12(FP)
MOVW R5, new_hi+16(FP)
RET
TEXT armXchg64<>(SB),NOSPLIT,$0-20
MOVW addr+0(FP), R1
// make unaligned atomic access panic
AND.S $7, R1, R2
BEQ 2(PC)
MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
MOVW new_lo+4(FP), R2
MOVW new_hi+8(FP), R3
swap64loop:
LDREXD (R1), R4 // loads R4 and R5
DMB MB_ISHST
STREXD R2, (R1), R0 // stores R2 and R3
CMP $0, R0
BNE swap64loop
DMB MB_ISH
MOVW R4, old_lo+12(FP)
MOVW R5, old_hi+16(FP)
RET
TEXT armLoad64<>(SB),NOSPLIT,$0-12
MOVW addr+0(FP), R1
// make unaligned atomic access panic
AND.S $7, R1, R2
BEQ 2(PC)
MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
LDREXD (R1), R2 // loads R2 and R3
DMB MB_ISH
MOVW R2, val_lo+4(FP)
MOVW R3, val_hi+8(FP)
RET
TEXT armStore64<>(SB),NOSPLIT,$0-12
MOVW addr+0(FP), R1
// make unaligned atomic access panic
AND.S $7, R1, R2
BEQ 2(PC)
MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
MOVW val_lo+4(FP), R2
MOVW val_hi+8(FP), R3
store64loop:
LDREXD (R1), R4 // loads R4 and R5
DMB MB_ISHST
STREXD R2, (R1), R0 // stores R2 and R3
CMP $0, R0
BNE store64loop
DMB MB_ISH
RET
TEXT ·Cas64(SB),NOSPLIT,$0-21
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armCas64<>(SB)
JMP ·goCas64(SB)
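// Dispatch note: BLT 2(PC) skips the following instruction when
// goarm < 7, so the JMP to the native ARMv7 body is bypassed and
// control falls through to the JMP to the Go fallback. The same
// pattern is used in each wrapper below.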
TEXT ·Xadd64(SB),NOSPLIT,$0-20
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armXadd64<>(SB)
JMP ·goXadd64(SB)
TEXT ·Xchg64(SB),NOSPLIT,$0-20
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armXchg64<>(SB)
JMP ·goXchg64(SB)
TEXT ·Load64(SB),NOSPLIT,$0-12
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armLoad64<>(SB)
JMP ·goLoad64(SB)
TEXT ·Store64(SB),NOSPLIT,$0-12
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armStore64<>(SB)
JMP ·goStore64(SB)
src/runtime/internal/atomic/atomic_arm.go
@@ -68,28 +68,14 @@ func Xchguintptr(addr *uintptr, v uintptr) uintptr {
return uintptr(Xchg((*uint32)(unsafe.Pointer(addr)), uint32(v)))
}
//go:nosplit
func StorepNoWB(addr unsafe.Pointer, v unsafe.Pointer) {
for {
old := *(*unsafe.Pointer)(addr)
if Casp1((*unsafe.Pointer)(addr), old, v) {
return
}
}
}
// Not noescape -- it installs a pointer to addr.
func StorepNoWB(addr unsafe.Pointer, v unsafe.Pointer)
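// (Marking it //go:noescape instead would assert that v does not
// escape, which would be wrong here: the store makes v reachable
// through *addr.)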
//go:nosplit
func Store(addr *uint32, v uint32) {
for {
old := *addr
if Cas(addr, old, v) {
return
}
}
}
//go:noescape
func Store(addr *uint32, v uint32)
//go:nosplit
func Cas64(addr *uint64, old, new uint64) bool {
func goCas64(addr *uint64, old, new uint64) bool {
if uintptr(unsafe.Pointer(addr))&7 != 0 {
*(*int)(nil) = 0 // crash on unaligned uint64
}
@@ -105,7 +91,7 @@ func Cas64(addr *uint64, old, new uint64) bool {
}
//go:nosplit
func Xadd64(addr *uint64, delta int64) uint64 {
func goXadd64(addr *uint64, delta int64) uint64 {
if uintptr(unsafe.Pointer(addr))&7 != 0 {
*(*int)(nil) = 0 // crash on unaligned uint64
}
@@ -119,7 +105,7 @@ func Xadd64(addr *uint64, delta int64) uint64 {
}
//go:nosplit
func Xchg64(addr *uint64, v uint64) uint64 {
func goXchg64(addr *uint64, v uint64) uint64 {
if uintptr(unsafe.Pointer(addr))&7 != 0 {
*(*int)(nil) = 0 // crash on unaligned uint64
}
@@ -133,7 +119,7 @@ func Xchg64(addr *uint64, v uint64) uint64 {
}
//go:nosplit
func Load64(addr *uint64) uint64 {
func goLoad64(addr *uint64) uint64 {
if uintptr(unsafe.Pointer(addr))&7 != 0 {
*(*int)(nil) = 0 // crash on unaligned uint64
}
@@ -146,7 +132,7 @@ func Load64(addr *uint64) uint64 {
}
//go:nosplit
func Store64(addr *uint64, v uint64) {
func goStore64(addr *uint64, v uint64) {
if uintptr(unsafe.Pointer(addr))&7 != 0 {
*(*int)(nil) = 0 // crash on unaligned uint64
}
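
The bodies of these go* fallbacks are collapsed in this view; per the
comment in the assembly above, they simulate the 64-bit operation
under a lock. As a rough, self-contained illustration of that shape
(using sync.Mutex where the runtime uses its own lock; not the
runtime's actual code):

	package atomic64

	import "sync"

	// mu serializes all lock-simulated 64-bit operations.
	var mu sync.Mutex

	// cas64 performs the compare-and-swap entirely under the lock;
	// the alignment check is assumed already done by the caller.
	func cas64(addr *uint64, old, new uint64) bool {
		mu.Lock()
		defer mu.Unlock()
		if *addr != old {
			return false
		}
		*addr = new
		return true
	}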
@@ -194,3 +180,18 @@ func Load(addr *uint32) uint32
//go:noescape
func Loadp(addr unsafe.Pointer) unsafe.Pointer
//go:noescape
func Cas64(addr *uint64, old, new uint64) bool
//go:noescape
func Xadd64(addr *uint64, delta int64) uint64
//go:noescape
func Xchg64(addr *uint64, v uint64) uint64
//go:noescape
func Load64(addr *uint64) uint64
//go:noescape
func Store64(addr *uint64, v uint64)
src/runtime/internal/atomic/sys_linux_arm.s
@@ -55,9 +55,6 @@ check:
MOVB R0, ret+12(FP)
RET
TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
B runtime∕internal∕atomic·Cas(SB)
// As for cas, memory barriers are complicated on ARM, but the kernel
// provides a user helper. ARMv5 does not support SMP and has no
// memory barrier instruction at all. ARMv6 added SMP support and has
@@ -70,7 +67,7 @@ TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
TEXT memory_barrier<>(SB),NOSPLIT|NOFRAME,$0
MOVW $0xffff0fa0, R15 // R15 is hardware PC.
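// The 0xffff0fa0 above is the Linux kuser memory-barrier helper:
// the kernel maps a barrier routine at this fixed address into every
// process, for user code on CPUs that predate the DMB instruction.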
TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT,$0-8
TEXT ·Load(SB),NOSPLIT,$0-8
MOVW addr+0(FP), R0
MOVW (R0), R1
@@ -78,10 +75,32 @@ TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT,$0-8
CMP $7, R11
BGE native_barrier
BL memory_barrier<>(SB)
B prolog
B end
native_barrier:
DMB MB_ISH
prolog:
end:
MOVW R1, ret+4(FP)
RET
TEXT ·Store(SB),NOSPLIT,$0-8
MOVW addr+0(FP), R1
MOVW v+4(FP), R2
MOVB runtime·goarm(SB), R8
CMP $7, R8
BGE native_barrier
BL memory_barrier<>(SB)
B store
native_barrier:
DMB MB_ISH
store:
MOVW R2, (R1)
CMP $7, R8
BGE native_barrier2
BL memory_barrier<>(SB)
RET
native_barrier2:
DMB MB_ISH
RET
src/runtime/internal/atomic/sys_nonlinux_arm.s
@@ -17,10 +17,11 @@
TEXT ·Cas(SB),NOSPLIT,$0
JMP ·armcas(SB)
TEXT ·Casp1(SB),NOSPLIT,$0
JMP ·Cas(SB)
// Non-linux OSes support only single processor machines before ARMv7.
// So we don't need memory barriers if goarm < 7. And we fail loud at
// startup (runtime.checkgoarm) if it is a multi-processor but goarm < 7.
TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT|NOFRAME,$0-8
TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-8
MOVW addr+0(FP), R0
MOVW (R0), R1
@@ -31,3 +32,19 @@ TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT|NOFRAME,$0-8
MOVW R1, ret+4(FP)
RET
TEXT ·Store(SB),NOSPLIT,$0-8
MOVW addr+0(FP), R1
MOVW v+4(FP), R2
MOVB runtime·goarm(SB), R8
CMP $7, R8
BLT 2(PC)
DMB MB_ISH
MOVW R2, (R1)
CMP $7, R8
BLT 2(PC)
DMB MB_ISH
RET