Commit c4807d4c authored by Lynn Boger's avatar Lynn Boger Committed by Brad Fitzpatrick

runtime: improve memmove performance ppc64,ppc64le

This change improves the performance of memmove
on ppc64 & ppc64le mainly for moves >=32 bytes.
In addition, the test to detect backward moves
 was enhanced to avoid backward moves if source
and dest were in different types of storage, since
backward moves might not always be efficient.

Fixes #14507

The following shows some of the improvements from the test
in the runtime package:

BenchmarkMemmove32                   4229.56      4717.13      1.12x
BenchmarkMemmove64                   6156.03      7810.42      1.27x
BenchmarkMemmove128                  7521.69      12468.54     1.66x
BenchmarkMemmove256                  6729.90      18260.33     2.71x
BenchmarkMemmove512                  8521.59      18033.81     2.12x
BenchmarkMemmove1024                 9760.92      25762.61     2.64x
BenchmarkMemmove2048                 10241.00     29584.94     2.89x
BenchmarkMemmove4096                 10399.37     31882.31     3.07x

BenchmarkMemmoveUnalignedDst16       1943.69      2258.33      1.16x
BenchmarkMemmoveUnalignedDst32       3885.08      3965.81      1.02x
BenchmarkMemmoveUnalignedDst64       5121.63      6965.54      1.36x
BenchmarkMemmoveUnalignedDst128      7212.34      11372.68     1.58x
BenchmarkMemmoveUnalignedDst256      6564.52      16913.59     2.58x
BenchmarkMemmoveUnalignedDst512      8364.35      17782.57     2.13x
BenchmarkMemmoveUnalignedDst1024     9539.87      24914.72     2.61x
BenchmarkMemmoveUnalignedDst2048     9199.23      21235.11     2.31x
BenchmarkMemmoveUnalignedDst4096     10077.39     25231.99     2.50x

BenchmarkMemmoveUnalignedSrc32       3249.83      3742.52      1.15x
BenchmarkMemmoveUnalignedSrc64       5562.35      6627.96      1.19x
BenchmarkMemmoveUnalignedSrc128      6023.98      10200.84     1.69x
BenchmarkMemmoveUnalignedSrc256      6921.83      15258.43     2.20x
BenchmarkMemmoveUnalignedSrc512      8593.13      16541.97     1.93x
BenchmarkMemmoveUnalignedSrc1024     9730.95      22927.84     2.36x
BenchmarkMemmoveUnalignedSrc2048     9793.28      21537.73     2.20x
BenchmarkMemmoveUnalignedSrc4096     10132.96     26295.06     2.60x

Change-Id: I73af59970d4c97c728deabb9708b31ec7e01bdf2
Reviewed-on: https://go-review.googlesource.com/21990Reviewed-by: 's avatarBill O'Farrell <billotosyr@gmail.com>
Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
parent 66afbf10
...@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24 ...@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
MOVD to+0(FP), R3 MOVD to+0(FP), R3
MOVD from+8(FP), R4 MOVD from+8(FP), R4
MOVD n+16(FP), R5 MOVD n+16(FP), R5
CMP R5, $0
BNE check
RET
// Determine if there are doublewords to
// copy so a more efficient move can be done
check: check:
ANDCC $7, R5, R7 // R7 is the number of bytes to copy and CR0[EQ] is set if there are none. ANDCC $7, R5, R7 // R7: bytes to copy
SRAD $3, R5, R6 // R6 is the number of words to copy SRAD $3, R5, R6 // R6: double words to copy
CMP R6, $0, CR1 // CR1[EQ] is set if there are no words to copy. CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
CMP R3, R4, CR2
BC 12, 9, backward // I think you should be able to write this as "BGT CR2, backward"
// Copying forward proceeds by copying R6 words then copying R7 bytes. // Determine overlap by subtracting dest - src and comparing against the
// R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment // length. The catches the cases where src and dest are in different types
// load/store, R3 and R4 point before the bytes that are to be copied. // of storage such as stack and static to avoid doing backward move when not
// necessary.
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge" SUB R4, R3, R8 // dest - src
CMPU R8, R5, CR2 // < len?
MOVD R6, CTR BC 12, 8, backward // BLT CR2 backward
SUB $8, R3 // Copying forward if no overlap.
SUB $8, R4
forwardlargeloop: BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
MOVDU 8(R4), R8 MOVD R6,CTR // R6 = number of double words
MOVDU R8, 8(R3) SRADCC $2,R6,R8 // 32 byte chunks?
BC 16, 0, forwardlargeloop // "BDNZ" BNE forward32setup //
ADD $8, R3 // Move double words
ADD $8, R4
forward8:
MOVD 0(R4), R8 // double word
ADD $8,R4
MOVD R8, 0(R3) //
ADD $8,R3
BC 16, 0, forward8
BR noforwardlarge // handle remainder
// Prepare for moves of 32 bytes at a time.
forward32setup:
DCBTST (R3) // prepare data cache
DCBT (R4)
MOVD R8, CTR // double work count
forward32:
MOVD 0(R4), R8 // load 4 double words
MOVD 8(R4), R9
MOVD 16(R4), R14
MOVD 24(R4), R15
ADD $32,R4
MOVD R8, 0(R3) // store those 4
MOVD R9, 8(R3)
MOVD R14,16(R3)
MOVD R15,24(R3)
ADD $32,R3 // bump up for next set
BC 16, 0, forward32 // continue
RLDCLCC $61,R5,$3,R6 // remaining doublewords
BEQ noforwardlarge
MOVD R6,CTR // set up the CTR
BR forward8
noforwardlarge: noforwardlarge:
BNE forwardtail // Tests the bit set by ANDCC above CMP R7,$0 // any remaining bytes
RET BC 4, 1, LR
forwardtail: forwardtail:
SUB $1, R3 MOVD R7, CTR // move tail bytes
SUB $1, R4
MOVD R7, CTR
forwardtailloop: forwardtailloop:
MOVBZU 1(R4), R8 MOVBZ 0(R4), R8 // move single bytes
MOVBZU R8, 1(R3) ADD $1,R4
MOVBZ R8, 0(R3)
ADD $1,R3
BC 16, 0, forwardtailloop BC 16, 0, forwardtailloop
RET RET
backward: backward:
// Copying backwards proceeds by copying R7 bytes then copying R6 words. // Copying backwards proceeds by copying R7 bytes then copying R6 double words.
// R3 and R4 are advanced to the end of the destination/source buffers // R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy. // respectively and moved back as we copy.
ADD R5, R4, R4 ADD R5, R4, R4 // end of source
ADD R3, R5, R3 ADD R3, R5, R3 // end of dest
BEQ nobackwardtail BEQ nobackwardtail // earlier condition
MOVD R7, CTR MOVD R7, CTR // bytes to move
backwardtailloop: backwardtailloop:
MOVBZU -1(R4), R8 MOVBZ -1(R4), R8 // point to last byte
MOVBZU R8, -1(R3) SUB $1,R4
MOVBZ R8, -1(R3)
SUB $1,R3
BC 16, 0, backwardtailloop BC 16, 0, backwardtailloop
nobackwardtail: nobackwardtail:
BC 4, 6, backwardlarge // "BNE CR1" CMP R6,$0
RET BC 4, 5, LR
backwardlarge: backwardlarge:
MOVD R6, CTR MOVD R6, CTR
backwardlargeloop: backwardlargeloop:
MOVDU -8(R4), R8 MOVD -8(R4), R8
MOVDU R8, -8(R3) SUB $8,R4
BC 16, 0, backwardlargeloop // "BDNZ" MOVD R8, -8(R3)
SUB $8,R3
BC 16, 0, backwardlargeloop //
RET RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment