Commit aa9bcea3 authored by Lynn Boger's avatar Lynn Boger

runtime: improve performance of memclr, memmove on ppc64x

This improves the asm implementations for memmove and memclr on
ppc64x through use of vsx loads and stores when size is >= 32 bytes.
For memclr, dcbz is used when the size is >= 512 and aligned to 128.

Memclr/64       13.3ns ± 0%     10.7ns ± 0%   -19.55%  (p=0.000 n=8+7)
Memclr/96       14.9ns ± 0%     11.4ns ± 0%   -23.49%  (p=0.000 n=8+8)
Memclr/128      16.3ns ± 0%     12.3ns ± 0%   -24.54%  (p=0.000 n=8+8)
Memclr/160      17.3ns ± 0%     13.0ns ± 0%   -24.86%  (p=0.000 n=8+8)
Memclr/256      20.0ns ± 0%     15.3ns ± 0%   -23.62%  (p=0.000 n=8+8)
Memclr/512      34.2ns ± 0%     10.2ns ± 0%   -70.20%  (p=0.000 n=8+8)
Memclr/4096      178ns ± 0%       23ns ± 0%   -87.13%  (p=0.000 n=8+8)
Memclr/65536    2.67µs ± 0%     0.30µs ± 0%   -88.89%  (p=0.000 n=7+8)
Memclr/1M       43.2µs ± 0%     10.0µs ± 0%   -76.85%  (p=0.000 n=8+8)
Memclr/4M        173µs ± 0%       40µs ± 0%   -76.88%  (p=0.000 n=8+8)
Memclr/8M        349µs ± 0%       82µs ± 0%   -76.58%  (p=0.000 n=8+8)
Memclr/16M       701µs ± 7%      672µs ± 0%    -4.05%  (p=0.040 n=8+7)
Memclr/64M      2.70ms ± 0%     2.67ms ± 0%    -0.96%  (p=0.000 n=8+7)

Memmove/32      6.59ns ± 0%    5.84ns ± 0%  -11.34%  (p=0.029 n=4+4)
Memmove/64      7.91ns ± 0%    6.97ns ± 0%  -11.92%  (p=0.029 n=4+4)
Memmove/128     10.5ns ± 0%     8.8ns ± 0%  -16.24%  (p=0.029 n=4+4)
Memmove/256     21.0ns ± 0%    12.9ns ± 0%  -38.57%  (p=0.029 n=4+4)
Memmove/512     28.4ns ± 0%    26.2ns ± 0%   -7.75%  (p=0.029 n=4+4)
Memmove/1024    48.2ns ± 1%    39.4ns ± 0%  -18.26%  (p=0.029 n=4+4)
Memmove/2048    85.4ns ± 0%    69.0ns ± 0%  -19.20%  (p=0.029 n=4+4)
Memmove/4096     159ns ± 0%     128ns ± 0%  -19.50%  (p=0.029 n=4+4)

Change-Id: I8c1adf88790845bf31444a15249456006eb5bf8b
Reviewed-on: https://go-review.googlesource.com/c/141217
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarMichael Munday <mike.munday@ibm.com>
parent e1978a2d
......@@ -14,34 +14,68 @@ TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT|NOFRAME, $0-16
// Determine if there are doublewords to clear
check:
ANDCC $7, R4, R5 // R5: leftover bytes to clear
SRAD $3, R4, R6 // R6: double words to clear
SRD $3, R4, R6 // R6: double words to clear
CMP R6, $0, CR1 // CR1[EQ] set if no double words
BC 12, 6, nozerolarge // only single bytes
MOVD R6, CTR // R6 = number of double words
SRADCC $2, R6, R7 // 32 byte chunks?
BNE zero32setup
BC 12, 6, nozerolarge // only single bytes
CMP R4, $512
BLT under512 // special case for < 512
ANDCC $127, R3, R8 // check for 128 alignment of address
BEQ zero512setup
ANDCC $7, R3, R15
BEQ zero512xsetup // at least 8 byte aligned
// zero bytes up to 8 byte alignment
ANDCC $1, R3, R15 // check for byte alignment
BEQ byte2
MOVB R0, 0(R3) // zero 1 byte
ADD $1, R3 // bump ptr by 1
ADD $-1, R4
byte2:
ANDCC $2, R3, R15 // check for 2 byte alignment
BEQ byte4
MOVH R0, 0(R3) // zero 2 bytes
ADD $2, R3 // bump ptr by 2
ADD $-2, R4
byte4:
ANDCC $4, R3, R15 // check for 4 byte alignment
BEQ zero512xsetup
MOVW R0, 0(R3) // zero 4 bytes
ADD $4, R3 // bump ptr by 4
ADD $-4, R4
BR zero512xsetup // ptr should now be 8 byte aligned
under512:
MOVD R6, CTR // R6 = number of double words
SRDCC $2, R6, R7 // 32 byte chunks?
BNE zero32setup
// Clear double words
zero8:
MOVD R0, 0(R3) // double word
ADD $8, R3
ADD $-8, R4
BC 16, 0, zero8 // dec ctr, br zero8 if ctr not 0
BR nozerolarge // handle remainder
BR nozerolarge // handle leftovers
// Prepare to clear 32 bytes at a time.
zero32setup:
DCBTST (R3) // prepare data cache
MOVD R7, CTR // number of 32 byte chunks
DCBTST (R3) // prepare data cache
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
MOVD R7, CTR // number of 32 byte chunks
MOVD $16, R8
zero32:
MOVD R0, 0(R3) // clear 4 double words
MOVD R0, 8(R3)
MOVD R0, 16(R3)
MOVD R0, 24(R3)
STXVD2X VS32, (R3+R0) // store 16 bytes
STXVD2X VS32, (R3+R8)
ADD $32, R3
ADD $-32, R4
BC 16, 0, zero32 // dec ctr, br zero32 if ctr not 0
RLDCLCC $61, R4, $3, R6 // remaining doublewords
BEQ nozerolarge
......@@ -49,8 +83,8 @@ zero32:
BR zero8
nozerolarge:
CMP R5, $0 // any remaining bytes
BC 4, 1, LR // ble lr
ANDCC $7, R4, R5 // any remaining bytes
BC 4, 1, LR // ble lr
zerotail:
MOVD R5, CTR // set up to clear tail bytes
......@@ -60,3 +94,70 @@ zerotailloop:
ADD $1, R3
BC 16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
RET
zero512xsetup: // 512 chunk with extra needed
ANDCC $8, R3, R11 // 8 byte alignment?
BEQ zero512setup16
MOVD R0, 0(R3) // clear 8 bytes
ADD $8, R3 // update ptr to next 8
ADD $-8, R4 // dec count by 8
zero512setup16:
ANDCC $127, R3, R14 // < 128 byte alignment
BEQ zero512setup // handle 128 byte alignment
MOVD $128, R15
SUB R14, R15, R14 // find increment to 128 alignment
SRD $4, R14, R15 // number of 16 byte chunks
zero512presetup:
MOVD R15, CTR // loop counter of 16 bytes
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
zero512preloop: // clear up to 128 alignment
STXVD2X VS32, (R3+R0) // clear 16 bytes
ADD $16, R3 // update ptr
ADD $-16, R4 // dec count
BC 16, 0, zero512preloop
zero512setup: // setup for dcbz loop
CMP R4, $512 // check if at least 512
BLT remain
SRD $9, R4, R8 // loop count for 512 chunks
MOVD R8, CTR // set up counter
MOVD $128, R9 // index regs for 128 bytes
MOVD $256, R10
MOVD $384, R11
zero512:
DCBZ (R3+R0) // clear first chunk
DCBZ (R3+R9) // clear second chunk
DCBZ (R3+R10) // clear third chunk
DCBZ (R3+R11) // clear fourth chunk
ADD $512, R3
ADD $-512, R4
BC 16, 0, zero512
remain:
CMP R4, $128 // check if 128 byte chunks left
BLT smaller
DCBZ (R3+R0) // clear 128
ADD $128, R3
ADD $-128, R4
BR remain
smaller:
ANDCC $127, R4, R7 // find leftovers
BEQ done
CMP R7, $64 // more than 64, do 32 at a time
BLT zero8setup // less than 64, do 8 at a time
SRD $5, R7, R7 // set up counter for 32
BR zero32setup
zero8setup:
SRDCC $3, R7, R7 // less than 8 bytes
BEQ nozerolarge
MOVD R7, CTR
BR zero8
done:
RET
......@@ -16,7 +16,7 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
// copy so a more efficient move can be done
check:
ANDCC $7, R5, R7 // R7: bytes to copy
SRAD $3, R5, R6 // R6: double words to copy
SRD $3, R5, R6 // R6: double words to copy
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
// Determine overlap by subtracting dest - src and comparing against the
......@@ -31,9 +31,9 @@ check:
// Copying forward if no overlap.
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
MOVD R6,CTR // R6 = number of double words
SRADCC $2,R6,R8 // 32 byte chunks?
SRDCC $2,R6,R8 // 32 byte chunks?
BNE forward32setup //
MOVD R6,CTR // R6 = number of double words
// Move double words
......@@ -51,17 +51,14 @@ forward32setup:
DCBTST (R3) // prepare data cache
DCBT (R4)
MOVD R8, CTR // double work count
MOVD $16, R8
forward32:
MOVD 0(R4), R8 // load 4 double words
MOVD 8(R4), R9
MOVD 16(R4), R14
MOVD 24(R4), R15
ADD $32,R4
MOVD R8, 0(R3) // store those 4
MOVD R9, 8(R3)
MOVD R14,16(R3)
MOVD R15,24(R3)
LXVD2X (R4+R0), VS32 // load 16 bytes
LXVD2X (R4+R8), VS33
ADD $32, R4
STXVD2X VS32, (R3+R0) // store 16 bytes
STXVD2X VS33, (R3+R8)
ADD $32,R3 // bump up for next set
BC 16, 0, forward32 // continue
RLDCLCC $61,R5,$3,R6 // remaining doublewords
......@@ -71,7 +68,7 @@ forward32:
noforwardlarge:
CMP R7,$0 // any remaining bytes
BC 4, 1, LR
BC 4, 1, LR // ble lr
forwardtail:
MOVD R7, CTR // move tail bytes
......@@ -101,19 +98,39 @@ backwardtailloop:
SUB $1,R4
MOVBZ R8, -1(R3)
SUB $1,R3
BC 16, 0, backwardtailloop
BC 16, 0, backwardtailloop // bndz
nobackwardtail:
CMP R6,$0
BC 4, 5, LR
BC 4, 5, LR // ble CR1 lr
backwardlarge:
MOVD R6, CTR
SUB R3, R4, R9 // Use vsx if moving
CMP R9, $32 // at least 32 byte chunks
BLT backwardlargeloop // and distance >= 32
SRDCC $2,R6,R8 // 32 byte chunks
BNE backward32setup
backwardlargeloop:
MOVD -8(R4), R8
SUB $8,R4
MOVD R8, -8(R3)
SUB $8,R3
BC 16, 0, backwardlargeloop //
BC 16, 0, backwardlargeloop // bndz
RET
backward32setup:
MOVD R8, CTR // set up loop ctr
MOVD $16, R8 // 32 bytes at at time
backward32loop:
SUB $32, R4
SUB $32, R3
LXVD2X (R4+R0), VS32 // load 16 bytes
LXVD2X (R4+R8), VS33
STXVD2X VS32, (R3+R0) // store 16 bytes
STXVD2X VS33, (R3+R8)
BC 16, 0, backward32loop // bndz
BC 4, 5, LR // ble CR1 lr
MOVD R6, CTR
BR backwardlargeloop
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment