Commit 14f929af authored by Cherry Zhang

runtime/internal/atomic: improve ARM atomics

This is a follow-up to CL 93637. There, when we redirected sync/atomic
to runtime/internal/atomic, a few good implementations of ARM atomics
were lost. This CL brings most of them back, with some improvements.

- Change atomic Store to a plain store with a memory barrier, as we
  already changed atomic Load to a plain load with a memory barrier.

- Use native 64-bit atomics on ARMv7 and fall back to Go
  implementations on older machines, dropping the kernel helper. In
  particular, Load64 now just does loads instead of using Cas on the
  address being loaded from, so it also works on read-only memory
  (as we already fixed for 32-bit Load).

Change-Id: I725cd65cf945ae5200db81a35be3f251c9f7af14
Reviewed-on: https://go-review.googlesource.com/111315
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
parent 150b7286
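Editor's note (not part of the commit): the diff below keeps the native LDREXD/STREXD path for ARMv7 and routes older CPUs to Go fallbacks (goCas64 and friends) that emulate 64-bit atomics under a lock. As a rough illustration of what that fallback does, here is a minimal plain-Go sketch. The single sync.Mutex and the cas64 name are stand-ins chosen for this sketch; the real runtime uses per-address runtime locks and crashes (rather than panicking) on misaligned addresses.

```go
package main

import (
	"fmt"
	"sync"
	"unsafe"
)

// One mutex guards every emulated 64-bit operation in this sketch.
// The real runtime shards locks by address instead of using sync.Mutex.
var lock64 sync.Mutex

// cas64 emulates a 64-bit compare-and-swap under a lock, mirroring the
// shape of the goCas64 fallback taken when goarm < 7.
func cas64(addr *uint64, old, new uint64) bool {
	if uintptr(unsafe.Pointer(addr))&7 != 0 {
		panic("unaligned 64-bit atomic operation") // the runtime crashes here instead
	}
	lock64.Lock()
	defer lock64.Unlock()
	if *addr != old {
		return false
	}
	*addr = new
	return true
}

func main() {
	x := new(uint64) // heap-allocated, so 8-byte aligned
	*x = 1
	fmt.Println(cas64(x, 1, 2), *x) // true 2
	fmt.Println(cas64(x, 1, 3), *x) // false 2
}
```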
@@ -27,8 +27,8 @@ casl:
 	CMP	R0, R2
 	BNE	casfail
-	MOVB	runtime·goarm(SB), R11
-	CMP	$7, R11
+	MOVB	runtime·goarm(SB), R8
+	CMP	$7, R8
 	BLT	2(PC)
 	DMB	MB_ISHST
@@ -37,8 +37,7 @@ casl:
 	BNE	casl
 	MOVW	$1, R0
-	MOVB	runtime·goarm(SB), R11
-	CMP	$7, R11
+	CMP	$7, R8
 	BLT	2(PC)
 	DMB	MB_ISH
@@ -49,12 +48,17 @@ casfail:
 	MOVB	R0, ret+12(FP)
 	RET

+// stubs
 TEXT runtime∕internal∕atomic·Loadp(SB),NOSPLIT|NOFRAME,$0-8
 	B	runtime∕internal∕atomic·Load(SB)

 TEXT runtime∕internal∕atomic·Casuintptr(SB),NOSPLIT,$0-13
 	B	runtime∕internal∕atomic·Cas(SB)

+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0-13
+	B	runtime∕internal∕atomic·Cas(SB)
+
 TEXT runtime∕internal∕atomic·Loaduintptr(SB),NOSPLIT,$0-8
 	B	runtime∕internal∕atomic·Load(SB)
@@ -64,6 +68,9 @@ TEXT runtime∕internal∕atomic·Loaduint(SB),NOSPLIT,$0-8
 TEXT runtime∕internal∕atomic·Storeuintptr(SB),NOSPLIT,$0-8
 	B	runtime∕internal∕atomic·Store(SB)

+TEXT runtime∕internal∕atomic·StorepNoWB(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Store(SB)
+
 TEXT runtime∕internal∕atomic·Xadduintptr(SB),NOSPLIT,$0-12
 	B	runtime∕internal∕atomic·Xadd(SB)
@@ -72,3 +79,162 @@ TEXT runtime∕internal∕atomic·Loadint64(SB),NOSPLIT,$0-12
 TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20
 	B	runtime∕internal∕atomic·Xadd64(SB)
+
+// 64-bit atomics
+// The native ARM implementations use LDREXD/STREXD, which are
+// available on ARMv6k or later. We use them only on ARMv7.
+// On older ARM, we use Go implementations which simulate 64-bit
+// atomics with locks.
+
+TEXT armCas64<>(SB),NOSPLIT,$0-21
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ	2(PC)
+	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	MOVW	old_lo+4(FP), R2
+	MOVW	old_hi+8(FP), R3
+	MOVW	new_lo+12(FP), R4
+	MOVW	new_hi+16(FP), R5
+cas64loop:
+	LDREXD	(R1), R6	// loads R6 and R7
+	CMP	R2, R6
+	BNE	cas64fail
+	CMP	R3, R7
+	BNE	cas64fail
+	DMB	MB_ISHST
+	STREXD	R4, (R1), R0	// stores R4 and R5
+	CMP	$0, R0
+	BNE	cas64loop
+	MOVW	$1, R0
+	DMB	MB_ISH
+	MOVBU	R0, swapped+20(FP)
+	RET
+cas64fail:
+	MOVW	$0, R0
+	MOVBU	R0, swapped+20(FP)
+	RET
+
+TEXT armXadd64<>(SB),NOSPLIT,$0-20
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ	2(PC)
+	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	MOVW	delta_lo+4(FP), R2
+	MOVW	delta_hi+8(FP), R3
+add64loop:
+	LDREXD	(R1), R4	// loads R4 and R5
+	ADD.S	R2, R4
+	ADC	R3, R5
+	DMB	MB_ISHST
+	STREXD	R4, (R1), R0	// stores R4 and R5
+	CMP	$0, R0
+	BNE	add64loop
+	DMB	MB_ISH
+	MOVW	R4, new_lo+12(FP)
+	MOVW	R5, new_hi+16(FP)
+	RET
+
+TEXT armXchg64<>(SB),NOSPLIT,$0-20
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ	2(PC)
+	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	MOVW	new_lo+4(FP), R2
+	MOVW	new_hi+8(FP), R3
+swap64loop:
+	LDREXD	(R1), R4	// loads R4 and R5
+	DMB	MB_ISHST
+	STREXD	R2, (R1), R0	// stores R2 and R3
+	CMP	$0, R0
+	BNE	swap64loop
+	DMB	MB_ISH
+	MOVW	R4, old_lo+12(FP)
+	MOVW	R5, old_hi+16(FP)
+	RET
+
+TEXT armLoad64<>(SB),NOSPLIT,$0-12
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ	2(PC)
+	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	LDREXD	(R1), R2	// loads R2 and R3
+	DMB	MB_ISH
+	MOVW	R2, val_lo+4(FP)
+	MOVW	R3, val_hi+8(FP)
+	RET
+
+TEXT armStore64<>(SB),NOSPLIT,$0-12
+	MOVW	addr+0(FP), R1
+	// make unaligned atomic access panic
+	AND.S	$7, R1, R2
+	BEQ	2(PC)
+	MOVW	R2, (R2)	// crash. AND.S above left only low 3 bits in R2.
+	MOVW	val_lo+4(FP), R2
+	MOVW	val_hi+8(FP), R3
+store64loop:
+	LDREXD	(R1), R4	// loads R4 and R5
+	DMB	MB_ISHST
+	STREXD	R2, (R1), R0	// stores R2 and R3
+	CMP	$0, R0
+	BNE	store64loop
+	DMB	MB_ISH
+	RET
+
+TEXT ·Cas64(SB),NOSPLIT,$0-21
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)
+	JMP	armCas64<>(SB)
+	JMP	·goCas64(SB)
+
+TEXT ·Xadd64(SB),NOSPLIT,$0-20
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)
+	JMP	armXadd64<>(SB)
+	JMP	·goXadd64(SB)
+
+TEXT ·Xchg64(SB),NOSPLIT,$0-20
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)
+	JMP	armXchg64<>(SB)
+	JMP	·goXchg64(SB)
+
+TEXT ·Load64(SB),NOSPLIT,$0-12
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)
+	JMP	armLoad64<>(SB)
+	JMP	·goLoad64(SB)
+
+TEXT ·Store64(SB),NOSPLIT,$0-12
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)
+	JMP	armStore64<>(SB)
+	JMP	·goStore64(SB)
@@ -68,28 +68,14 @@ func Xchguintptr(addr *uintptr, v uintptr) uintptr {
 	return uintptr(Xchg((*uint32)(unsafe.Pointer(addr)), uint32(v)))
 }

-//go:nosplit
-func StorepNoWB(addr unsafe.Pointer, v unsafe.Pointer) {
-	for {
-		old := *(*unsafe.Pointer)(addr)
-		if Casp1((*unsafe.Pointer)(addr), old, v) {
-			return
-		}
-	}
-}
+// Not noescape -- it installs a pointer to addr.
+func StorepNoWB(addr unsafe.Pointer, v unsafe.Pointer)

-//go:nosplit
-func Store(addr *uint32, v uint32) {
-	for {
-		old := *addr
-		if Cas(addr, old, v) {
-			return
-		}
-	}
-}
+//go:noescape
+func Store(addr *uint32, v uint32)

 //go:nosplit
-func Cas64(addr *uint64, old, new uint64) bool {
+func goCas64(addr *uint64, old, new uint64) bool {
 	if uintptr(unsafe.Pointer(addr))&7 != 0 {
 		*(*int)(nil) = 0 // crash on unaligned uint64
 	}
@@ -105,7 +91,7 @@ func Cas64(addr *uint64, old, new uint64) bool {
 }

 //go:nosplit
-func Xadd64(addr *uint64, delta int64) uint64 {
+func goXadd64(addr *uint64, delta int64) uint64 {
 	if uintptr(unsafe.Pointer(addr))&7 != 0 {
 		*(*int)(nil) = 0 // crash on unaligned uint64
 	}
@@ -119,7 +105,7 @@ func Xadd64(addr *uint64, delta int64) uint64 {
 }

 //go:nosplit
-func Xchg64(addr *uint64, v uint64) uint64 {
+func goXchg64(addr *uint64, v uint64) uint64 {
 	if uintptr(unsafe.Pointer(addr))&7 != 0 {
 		*(*int)(nil) = 0 // crash on unaligned uint64
 	}
@@ -133,7 +119,7 @@ func Xchg64(addr *uint64, v uint64) uint64 {
 }

 //go:nosplit
-func Load64(addr *uint64) uint64 {
+func goLoad64(addr *uint64) uint64 {
 	if uintptr(unsafe.Pointer(addr))&7 != 0 {
 		*(*int)(nil) = 0 // crash on unaligned uint64
 	}
@@ -146,7 +132,7 @@ func Load64(addr *uint64) uint64 {
 }

 //go:nosplit
-func Store64(addr *uint64, v uint64) {
+func goStore64(addr *uint64, v uint64) {
 	if uintptr(unsafe.Pointer(addr))&7 != 0 {
 		*(*int)(nil) = 0 // crash on unaligned uint64
 	}
@@ -194,3 +180,18 @@ func Load(addr *uint32) uint32
 //go:noescape
 func Loadp(addr unsafe.Pointer) unsafe.Pointer
+
+//go:noescape
+func Cas64(addr *uint64, old, new uint64) bool
+
+//go:noescape
+func Xadd64(addr *uint64, delta int64) uint64
+
+//go:noescape
+func Xchg64(addr *uint64, v uint64) uint64
+
+//go:noescape
+func Load64(addr *uint64) uint64
+
+//go:noescape
+func Store64(addr *uint64, v uint64)
@@ -55,9 +55,6 @@ check:
 	MOVB	R0, ret+12(FP)
 	RET

-TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
-	B	runtime∕internal∕atomic·Cas(SB)
-
 // As for cas, memory barriers are complicated on ARM, but the kernel
 // provides a user helper. ARMv5 does not support SMP and has no
 // memory barrier instruction at all. ARMv6 added SMP support and has
@@ -70,7 +67,7 @@ TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
 TEXT memory_barrier<>(SB),NOSPLIT|NOFRAME,$0
 	MOVW	$0xffff0fa0, R15 // R15 is hardware PC.

-TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT,$0-8
+TEXT ·Load(SB),NOSPLIT,$0-8
 	MOVW	addr+0(FP), R0
 	MOVW	(R0), R1
@@ -78,10 +75,32 @@ TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT,$0-8
 	CMP	$7, R11
 	BGE	native_barrier
 	BL	memory_barrier<>(SB)
-	B	prolog
+	B	end
 native_barrier:
 	DMB	MB_ISH
-prolog:
+end:
 	MOVW	R1, ret+4(FP)
 	RET
+
+TEXT ·Store(SB),NOSPLIT,$0-8
+	MOVW	addr+0(FP), R1
+	MOVW	v+4(FP), R2
+
+	MOVB	runtime·goarm(SB), R8
+	CMP	$7, R8
+	BGE	native_barrier
+	BL	memory_barrier<>(SB)
+	B	store
+native_barrier:
+	DMB	MB_ISH
+store:
+	MOVW	R2, (R1)
+
+	CMP	$7, R8
+	BGE	native_barrier2
+	BL	memory_barrier<>(SB)
+	RET
+native_barrier2:
+	DMB	MB_ISH
+	RET
@@ -17,10 +17,11 @@
 TEXT ·Cas(SB),NOSPLIT,$0
 	JMP	·armcas(SB)

-TEXT ·Casp1(SB),NOSPLIT,$0
-	JMP	·Cas(SB)
+// Non-linux OSes support only single processor machines before ARMv7.
+// So we don't need memory barriers if goarm < 7. And we fail loud at
+// startup (runtime.checkgoarm) if it is a multi-processor but goarm < 7.

-TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT|NOFRAME,$0-8
+TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-8
 	MOVW	addr+0(FP), R0
 	MOVW	(R0), R1
@@ -31,3 +32,19 @@ TEXT runtime∕internal∕atomic·Load(SB),NOSPLIT|NOFRAME,$0-8
 	MOVW	R1, ret+4(FP)
 	RET
+
+TEXT ·Store(SB),NOSPLIT,$0-8
+	MOVW	addr+0(FP), R1
+	MOVW	v+4(FP), R2
+
+	MOVB	runtime·goarm(SB), R8
+	CMP	$7, R8
+	BLT	2(PC)
+	DMB	MB_ISH
+
+	MOVW	R2, (R1)
+
+	CMP	$7, R8
+	BLT	2(PC)
+	DMB	MB_ISH
+	RET
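Usage-level aside (not part of the commit): the `AND.S $7` checks in the new assembly deliberately fault on operands that are not 8-byte aligned, which is the documented requirement for 64-bit atomics on 32-bit ARM. The sync/atomic package guarantees that the first word of an allocated struct, array, or slice is 64-bit aligned, so a layout like the sketch below keeps a 64-bit counter safe on ARM; the `stats` type and field names are illustrative only.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// stats keeps the uint64 first: the first word of an allocated struct
// is 64-bit aligned, so hits may be used atomically even on 32-bit ARM.
type stats struct {
	hits  uint64
	other uint32
}

func main() {
	s := new(stats)
	atomic.AddUint64(&s.hits, 1)
	fmt.Println(atomic.LoadUint64(&s.hits))
}
```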