Commit 3107c91e authored by Lynn Boger, committed by Michael Munday

runtime: memclr perf improvements on ppc64x

This updates runtime/memclr_ppc64x.s to improve performance,
by unrolling loops for larger clears.

Fixes #17348

benchmark                    old MB/s     new MB/s     speedup
BenchmarkMemclr/5-80         199.71       406.63       2.04x
BenchmarkMemclr/16-80        693.66       1817.41      2.62x
BenchmarkMemclr/64-80        2309.35      5793.34      2.51x
BenchmarkMemclr/256-80       5428.18      14765.81     2.72x
BenchmarkMemclr/4096-80      8611.65      27191.94     3.16x
BenchmarkMemclr/65536-80     8736.69      28604.23     3.27x
BenchmarkMemclr/1M-80        9304.94      27600.09     2.97x
BenchmarkMemclr/4M-80        8705.66      27589.64     3.17x
BenchmarkMemclr/8M-80        8575.74      23631.04     2.76x
BenchmarkMemclr/16M-80       8443.10      19240.68     2.28x
BenchmarkMemclr/64M-80       8390.40      9493.04      1.13x
BenchmarkGoMemclr/5-80       263.05       630.37       2.40x
BenchmarkGoMemclr/16-80      904.33       1148.49      1.27x
BenchmarkGoMemclr/64-80      2830.20      8756.70      3.09x
BenchmarkGoMemclr/256-80     6064.59      20299.46     3.35x

Change-Id: Ic76c9183c8b4129ba3df512ca8b0fe6bd424e088
Reviewed-on: https://go-review.googlesource.com/30373
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
parent ce645534
......@@ -7,25 +7,56 @@
#include "textflag.h"
// void runtime·memclr(void*, uintptr)
// Pre-change body of memclr as it appears in this diff listing.
// Zeroes n bytes starting at ptr: first in 8-byte units, then the
// remaining (n & 7) bytes one at a time.
// Every store uses R0 as its source and nothing here writes R0, so the
// code relies on R0 already being zero on entry
// (presumably the Go ppc64 register convention; TODO confirm).
// NOTE(review): no instruction follows the done: label within these lines;
// the terminating instruction is not visible in this fragment.
TEXT runtime·memclr(SB),NOSPLIT|NOFRAME,$0-16
MOVD ptr+0(FP), R3 // R3 = ptr, the address to start clearing at
MOVD n+8(FP), R4 // R4 = n, the byte count
SRADCC $3, R4, R6 // R6 = n >> 3, the number of 8-byte words to zero; CC form sets the flags tested by BEQ
BEQ bytes // no full 8-byte words: skip to the byte loop
SUB $8, R3 // pre-bias R3 by -8 so the first store at 8(R3) lands on ptr
MOVD R6, CTR // CTR = word count for the loop below
MOVDU R0, 8(R3) // store 8 bytes from R0 at R3+8; update form also advances R3 by 8
BC 25, 0, -1(PC) // bdnz+ $-4: decrement CTR, loop back to the store while CTR != 0
ADD $8, R3 // undo the bias: R3 now points just past the last word written
bytes:
ANDCC $7, R4, R7 // R7 = n & 7, the number of bytes to zero
BEQ done // no trailing bytes
SUB $1, R3 // pre-bias R3 by -1 for the byte store at 1(R3)
MOVD R7, CTR // CTR = trailing byte count
MOVBU R0, 1(R3) // store one byte from R0 at R3+1; update form also advances R3 by 1
BC 25, 0, -1(PC) // bdnz+ $-4: decrement CTR, loop back to the store while CTR != 0
done:
// Post-change body of memclr: zeroes n bytes starting at ptr.
//
// Work is done in up to three phases:
//   1. zero32: 32 bytes per iteration (four doubleword stores), used when
//      n >> 5 is non-zero;
//   2. zero8: one 8-byte doubleword per iteration;
//   3. zerotailloop: the final n & 7 bytes, one byte per iteration.
//
// Register roles in this block:
//   R3  = address of the next location to clear (advanced after each store group)
//   R4  = n, the byte count (never modified)
//   R5  = n & 7, the trailing byte count
//   R6  = doubleword count (n >> 3, later the leftover after the 32-byte loop)
//   R7  = number of 32-byte chunks (R6 >> 2)
//   CTR = iteration counter for whichever loop is running
// Every store uses R0 as its source and nothing here writes R0, so the code
// relies on R0 already being zero on entry
// (presumably the Go ppc64 register convention; TODO confirm).
TEXT runtime·memclr(SB), NOSPLIT|NOFRAME, $0-16
MOVD ptr+0(FP), R3 // R3 = ptr
MOVD n+8(FP), R4 // R4 = n
// Determine if there are doublewords to clear
check:
ANDCC $7, R4, R5 // R5: leftover bytes to clear
SRAD $3, R4, R6 // R6: double words to clear
CMP R6, $0, CR1 // CR1[EQ] set if no double words
BC 12, 6, nozerolarge // only single bytes (branch taken when CR1[EQ] is set)
MOVD R6, CTR // R6 = number of double words
SRADCC $2, R6, R7 // 32 byte chunks? R7 = R6 >> 2; CC form sets the flags tested by BNE
BNE zero32setup // at least one 32-byte chunk: use the unrolled loop
// Clear double words
// Entered with CTR = number of doublewords left (non-zero on both entry paths).
zero8:
MOVD R0, 0(R3) // double word
ADD $8, R3 // advance past the doubleword just cleared
BC 16, 0, zero8 // dec ctr, br zero8 if ctr not 0
BR nozerolarge // handle remainder
// Prepare to clear 32 bytes at a time.
zero32setup:
DCBTST (R3) // prepare data cache (cache hint for the block at R3 ahead of the stores)
MOVD R7, CTR // number of 32 byte chunks
zero32:
MOVD R0, 0(R3) // clear 4 double words
MOVD R0, 8(R3)
MOVD R0, 16(R3)
MOVD R0, 24(R3)
ADD $32, R3 // advance past the 32 bytes just cleared
BC 16, 0, zero32 // dec ctr, br zero32 if ctr not 0
// NOTE(review): rotate-left by 61 is a rotate-right by 3; with the $3 mask
// operand this presumably yields (n >> 3) & 3. Confirm the mask operand
// semantics against the Go ppc64 assembler.
RLDCLCC $61, R4, $3, R6 // remaining doublewords; CC form sets the flags tested by BEQ
BEQ nozerolarge // no leftover doublewords: go check for tail bytes
MOVD R6, CTR // set up the CTR for doublewords
BR zero8 // clear the leftover doublewords one at a time
nozerolarge:
CMP R5, $0 // any remaining bytes
BC 4, 1, LR // ble lr: return directly when R5 <= 0, nothing left to clear
zerotail:
MOVD R5, CTR // set up to clear tail bytes
zerotailloop:
MOVB R0, 0(R3) // clear single bytes
ADD $1, R3 // advance past the byte just cleared
BC 16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment