Commit 6633bb2a authored by Carlos Eduardo Seo's avatar Carlos Eduardo Seo Committed by Lynn Boger

cmd/compile/internal/ppc64, runtime internal/atomic, sync/atomic: implement…

cmd/compile/internal/ppc64, runtime internal/atomic, sync/atomic: implement faster atomics for ppc64x

This change implements faster atomics for ppc64x based on the ISA 2.07B,
Appendix B.2 recommendations, replacing SYNC/ISYNC by LWSYNC in some
cases.

Updates #21348

name                                           old time/op new time/op    delta
Cond1-16                                           955ns     856ns      -10.33%
Cond2-16                                          2.38µs    2.03µs      -14.59%
Cond4-16                                          5.90µs    5.44µs       -7.88%
Cond8-16                                          12.1µs    11.1µs       -8.42%
Cond16-16                                         27.0µs    25.1µs       -7.04%
Cond32-16                                         59.1µs    55.5µs       -6.14%
LoadMostlyHits/*sync_test.DeepCopyMap-16          22.1ns    24.1ns       +9.02%
LoadMostlyHits/*sync_test.RWMutexMap-16            252ns     249ns       -1.20%
LoadMostlyHits/*sync.Map-16                       16.2ns    16.3ns         ~
LoadMostlyMisses/*sync_test.DeepCopyMap-16        22.3ns    22.6ns         ~
LoadMostlyMisses/*sync_test.RWMutexMap-16          249ns     247ns       -0.51%
LoadMostlyMisses/*sync.Map-16                     12.7ns    12.7ns         ~
LoadOrStoreBalanced/*sync_test.RWMutexMap-16      1.27µs    1.17µs       -7.54%
LoadOrStoreBalanced/*sync.Map-16                  1.12µs    1.10µs       -2.35%
LoadOrStoreUnique/*sync_test.RWMutexMap-16        1.75µs    1.68µs       -3.84%
LoadOrStoreUnique/*sync.Map-16                    2.07µs    1.97µs       -5.13%
LoadOrStoreCollision/*sync_test.DeepCopyMap-16    15.8ns    15.9ns         ~
LoadOrStoreCollision/*sync_test.RWMutexMap-16      496ns     424ns      -14.48%
LoadOrStoreCollision/*sync.Map-16                 6.07ns    6.07ns         ~
Range/*sync_test.DeepCopyMap-16                   1.65µs    1.64µs         ~
Range/*sync_test.RWMutexMap-16                     278µs     288µs       +3.75%
Range/*sync.Map-16                                2.00µs    2.01µs         ~
AdversarialAlloc/*sync_test.DeepCopyMap-16        3.45µs    3.44µs         ~
AdversarialAlloc/*sync_test.RWMutexMap-16          226ns     227ns         ~
AdversarialAlloc/*sync.Map-16                     1.09µs    1.07µs       -2.36%
AdversarialDelete/*sync_test.DeepCopyMap-16        553ns     550ns       -0.57%
AdversarialDelete/*sync_test.RWMutexMap-16         273ns     274ns         ~
AdversarialDelete/*sync.Map-16                     247ns     249ns         ~
UncontendedSemaphore-16                           79.0ns    65.5ns      -17.11%
ContendedSemaphore-16                              112ns      97ns      -13.77%
MutexUncontended-16                               3.34ns    2.51ns      -24.69%
Mutex-16                                           266ns     191ns      -28.26%
MutexSlack-16                                      226ns     159ns      -29.55%
MutexWork-16                                       377ns     338ns      -10.14%
MutexWorkSlack-16                                  335ns     308ns       -8.20%
MutexNoSpin-16                                     196ns     184ns       -5.91%
MutexSpin-16                                       710ns     666ns       -6.21%
Once-16                                           1.29ns    1.29ns         ~
Pool-16                                           8.64ns    8.71ns         ~
PoolOverflow-16                                   1.60µs    1.44µs      -10.25%
SemaUncontended-16                                5.39ns    4.42ns      -17.96%
SemaSyntNonblock-16                                539ns     483ns      -10.42%
SemaSyntBlock-16                                   413ns     354ns      -14.20%
SemaWorkNonblock-16                                305ns     258ns      -15.36%
SemaWorkBlock-16                                   266ns     229ns      -14.06%
RWMutexUncontended-16                             12.9ns     9.7ns      -24.80%
RWMutexWrite100-16                                 203ns     147ns      -27.47%
RWMutexWrite10-16                                  177ns     119ns      -32.74%
RWMutexWorkWrite100-16                             435ns     403ns       -7.39%
RWMutexWorkWrite10-16                              642ns     611ns       -4.79%
WaitGroupUncontended-16                           4.67ns    3.70ns      -20.92%
WaitGroupAddDone-16                                402ns     355ns      -11.54%
WaitGroupAddDoneWork-16                            208ns     250ns      +20.09%
WaitGroupWait-16                                  1.21ns    1.21ns         ~
WaitGroupWaitWork-16                              5.91ns    5.87ns       -0.81%
WaitGroupActuallyWait-16                          92.2ns    85.8ns       -6.91%

Updates #21348

Change-Id: Ibb9b271d11b308264103829e176c6d9fe8f867d3
Reviewed-on: https://go-review.googlesource.com/95175
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarLynn Boger <laboger@linux.vnet.ibm.com>
parent a3d83269
......@@ -154,16 +154,17 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpPPC64LoweredAtomicAnd8,
ssa.OpPPC64LoweredAtomicOr8:
// SYNC
// LWSYNC
// LBAR (Rarg0), Rtmp
// AND/OR Rarg1, Rtmp
// STBCCC Rtmp, (Rarg0)
// BNE -3(PC)
// ISYNC
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
psync := s.Prog(ppc64.ASYNC)
psync.To.Type = obj.TYPE_NONE
// LWSYNC - Assuming shared data not write-through-required nor
// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
plwsync := s.Prog(ppc64.ALWSYNC)
plwsync.To.Type = obj.TYPE_NONE
p := s.Prog(ppc64.ALBAR)
p.From.Type = obj.TYPE_MEM
p.From.Reg = r0
......@@ -183,17 +184,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p3 := s.Prog(ppc64.ABNE)
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
pisync := s.Prog(ppc64.AISYNC)
pisync.To.Type = obj.TYPE_NONE
case ssa.OpPPC64LoweredAtomicAdd32,
ssa.OpPPC64LoweredAtomicAdd64:
// SYNC
// LWSYNC
// LDAR/LWAR (Rarg0), Rout
// ADD Rarg1, Rout
// STDCCC/STWCCC Rout, (Rarg0)
// BNE -3(PC)
// ISYNC
// MOVW Rout,Rout (if Add32)
ld := ppc64.ALDAR
st := ppc64.ASTDCCC
......@@ -204,9 +202,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
out := v.Reg0()
// SYNC
psync := s.Prog(ppc64.ASYNC)
psync.To.Type = obj.TYPE_NONE
// LWSYNC - Assuming shared data not write-through-required nor
// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
plwsync := s.Prog(ppc64.ALWSYNC)
plwsync.To.Type = obj.TYPE_NONE
// LDAR or LWAR
p := s.Prog(ld)
p.From.Type = obj.TYPE_MEM
......@@ -229,9 +228,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p4 := s.Prog(ppc64.ABNE)
p4.To.Type = obj.TYPE_BRANCH
gc.Patch(p4, p)
// ISYNC
pisync := s.Prog(ppc64.AISYNC)
pisync.To.Type = obj.TYPE_NONE
// Ensure a 32 bit result
if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
......@@ -244,7 +240,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpPPC64LoweredAtomicExchange32,
ssa.OpPPC64LoweredAtomicExchange64:
// SYNC
// LWSYNC
// LDAR/LWAR (Rarg0), Rout
// STDCCC/STWCCC Rout, (Rarg0)
// BNE -2(PC)
......@@ -258,9 +254,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
out := v.Reg0()
// SYNC
psync := s.Prog(ppc64.ASYNC)
psync.To.Type = obj.TYPE_NONE
// LWSYNC - Assuming shared data not write-through-required nor
// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
plwsync := s.Prog(ppc64.ALWSYNC)
plwsync.To.Type = obj.TYPE_NONE
// LDAR or LWAR
p := s.Prog(ld)
p.From.Type = obj.TYPE_MEM
......@@ -342,14 +339,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpPPC64LoweredAtomicCas64,
ssa.OpPPC64LoweredAtomicCas32:
// SYNC
// LWSYNC
// loop:
// LDAR (Rarg0), Rtmp
// CMP Rarg1, Rtmp
// BNE fail
// STDCCC Rarg2, (Rarg0)
// BNE loop
// ISYNC
// LWSYNC
// MOVD $1, Rout
// BR end
// fail:
......@@ -367,9 +364,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
r1 := v.Args[1].Reg()
r2 := v.Args[2].Reg()
out := v.Reg0()
// SYNC
psync := s.Prog(ppc64.ASYNC)
psync.To.Type = obj.TYPE_NONE
// LWSYNC - Assuming shared data not write-through-required nor
// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
plwsync1 := s.Prog(ppc64.ALWSYNC)
plwsync1.To.Type = obj.TYPE_NONE
// LDAR or LWAR
p := s.Prog(ld)
p.From.Type = obj.TYPE_MEM
......@@ -395,9 +393,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p4 := s.Prog(ppc64.ABNE)
p4.To.Type = obj.TYPE_BRANCH
gc.Patch(p4, p)
// ISYNC
pisync := s.Prog(ppc64.AISYNC)
pisync.To.Type = obj.TYPE_NONE
// LWSYNC - Assuming shared data not write-through-required nor
// caching-inhibited. See Appendix B.2.1.1 in the ISA 2.07b.
plwsync2 := s.Prog(ppc64.ALWSYNC)
plwsync2.To.Type = obj.TYPE_NONE
// return true
p5 := s.Prog(ppc64.AMOVD)
p5.From.Type = obj.TYPE_CONST
......
......@@ -17,7 +17,7 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
MOVD ptr+0(FP), R3
MOVWZ old+8(FP), R4
MOVWZ new+12(FP), R5
SYNC
LWSYNC
cas_again:
LWAR (R3), R6
CMPW R6, R4
......@@ -25,7 +25,7 @@ cas_again:
STWCCC R5, (R3)
BNE cas_again
MOVD $1, R3
ISYNC
LWSYNC
MOVB R3, ret+16(FP)
RET
cas_fail:
......@@ -44,7 +44,7 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
MOVD ptr+0(FP), R3
MOVD old+8(FP), R4
MOVD new+16(FP), R5
SYNC
LWSYNC
cas64_again:
LDAR (R3), R6
CMP R6, R4
......@@ -52,7 +52,7 @@ cas64_again:
STDCCC R5, (R3)
BNE cas64_again
MOVD $1, R3
ISYNC
LWSYNC
MOVB R3, ret+24(FP)
RET
cas64_fail:
......@@ -97,31 +97,29 @@ TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
MOVD ptr+0(FP), R4
MOVW delta+8(FP), R5
SYNC
LWSYNC
LWAR (R4), R3
ADD R5, R3
STWCCC R3, (R4)
BNE -3(PC)
ISYNC
MOVW R3, ret+16(FP)
RET
TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
MOVD ptr+0(FP), R4
MOVD delta+8(FP), R5
SYNC
LWSYNC
LDAR (R4), R3
ADD R5, R3
STDCCC R3, (R4)
BNE -3(PC)
ISYNC
MOVD R3, ret+16(FP)
RET
TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
MOVD ptr+0(FP), R4
MOVW new+8(FP), R5
SYNC
LWSYNC
LWAR (R4), R3
STWCCC R5, (R4)
BNE -2(PC)
......@@ -132,7 +130,7 @@ TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
MOVD ptr+0(FP), R4
MOVD new+8(FP), R5
SYNC
LWSYNC
LDAR (R4), R3
STDCCC R5, (R4)
BNE -2(PC)
......@@ -165,24 +163,22 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
SYNC
LWSYNC
again:
LBAR (R3), R6
OR R4, R6
STBCCC R6, (R3)
BNE again
ISYNC
RET
// void runtime∕internal∕atomic·And8(byte volatile*, byte);
TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
SYNC
LWSYNC
again:
LBAR (R3),R6
AND R4,R6
STBCCC R6,(R3)
BNE again
ISYNC
RET
......@@ -12,7 +12,7 @@ TEXT ·SwapInt32(SB),NOSPLIT,$0-20
TEXT ·SwapUint32(SB),NOSPLIT,$0-20
MOVD addr+0(FP), R3
MOVW new+8(FP), R4
SYNC
LWSYNC
LWAR (R3), R5
STWCCC R4, (R3)
BNE -2(PC)
......@@ -26,7 +26,7 @@ TEXT ·SwapInt64(SB),NOSPLIT,$0-24
TEXT ·SwapUint64(SB),NOSPLIT,$0-24
MOVD addr+0(FP), R3
MOVD new+8(FP), R4
SYNC
LWSYNC
LDAR (R3), R5
STDCCC R4, (R3)
BNE -2(PC)
......@@ -44,13 +44,13 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
MOVD addr+0(FP), R3
MOVW old+8(FP), R4
MOVW new+12(FP), R5
SYNC
LWSYNC
LWAR (R3), R6
CMPW R6, R4
BNE 7(PC)
STWCCC R5, (R3)
BNE -4(PC)
ISYNC
LWSYNC
MOVD $1, R3
MOVB R3, swapped+16(FP)
RET
......@@ -67,13 +67,13 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
MOVD addr+0(FP), R3
MOVD old+8(FP), R4
MOVD new+16(FP), R5
SYNC
LWSYNC
LDAR (R3), R6
CMP R6, R4
BNE 7(PC)
STDCCC R5, (R3)
BNE -4(PC)
ISYNC
LWSYNC
MOVD $1, R3
MOVB R3, swapped+24(FP)
RET
......@@ -86,12 +86,11 @@ TEXT ·AddInt32(SB),NOSPLIT,$0-20
TEXT ·AddUint32(SB),NOSPLIT,$0-20
MOVD addr+0(FP), R3
MOVW delta+8(FP), R4
SYNC
LWSYNC
LWAR (R3), R5
ADD R4, R5
STWCCC R5, (R3)
BNE -3(PC)
ISYNC
MOVW R5, new+16(FP)
RET
......@@ -104,12 +103,11 @@ TEXT ·AddInt64(SB),NOSPLIT,$0-24
TEXT ·AddUint64(SB),NOSPLIT,$0-24
MOVD addr+0(FP), R3
MOVD delta+8(FP), R4
SYNC
LWSYNC
LDAR (R3), R5
ADD R4, R5
STDCCC R5, (R3)
BNE -3(PC)
ISYNC
MOVD R5, new+16(FP)
RET
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment