Commit 6994731e authored by Lynn Boger

internal/bytealg: improve asm for memequal on ppc64x

This includes two changes to the memequal function.

Previously the asm implementation on ppc64x for Equal called the internal
function memequal using a BL, whereas the other asm implementations for
bytes functions on ppc64x used BR. The BR is preferred because the BL
causes the calling function to stack a frame. This changes Equal so it
uses BR and is consistent with the others.

This also uses VSX instructions where possible to improve performance
of the compares for sizes of 32 bytes or more.

Here are results from the sizes affected:

Equal/32             8.40ns ± 0%     7.66ns ± 0%    -8.81%  (p=0.029 n=4+4)
Equal/4K              193ns ± 0%      144ns ± 0%   -25.39%  (p=0.029 n=4+4)
Equal/4M              346µs ± 0%      277µs ± 0%   -20.08%  (p=0.029 n=4+4)
Equal/64M            7.66ms ± 1%     7.27ms ± 0%    -5.10%  (p=0.029 n=4+4)

Change-Id: Ib6ee2cdc3e5d146e2705e3338858b8e965d25420
Reviewed-on: https://go-review.googlesource.com/c/143060
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
parent 5c472132
...@@ -7,17 +7,15 @@ ...@@ -7,17 +7,15 @@
#include "go_asm.h" #include "go_asm.h"
#include "textflag.h" #include "textflag.h"
TEXT ·Equal(SB),NOSPLIT,$0-49 TEXT ·Equal(SB),NOSPLIT|NOFRAME,$0-49
MOVD a_len+8(FP), R4 MOVD a_len+8(FP), R4
MOVD b_len+32(FP), R5 MOVD b_len+32(FP), R5
CMP R5, R4 // unequal lengths are not equal CMP R5, R4 // unequal lengths are not equal
BNE noteq BNE noteq
MOVD a_base+0(FP), R3 MOVD a_base+0(FP), R3
MOVD b_base+24(FP), R4 MOVD b_base+24(FP), R4
BL memeqbody<>(SB) MOVD $ret+48(FP), R10
BR memeqbody<>(SB)
MOVBZ R9,ret+48(FP)
RET
noteq: noteq:
MOVBZ $0,ret+48(FP) MOVBZ $0,ret+48(FP)
...@@ -28,7 +26,7 @@ equal: ...@@ -28,7 +26,7 @@ equal:
MOVBZ R3,ret+48(FP) MOVBZ R3,ret+48(FP)
RET RET
TEXT bytes·Equal(SB),NOSPLIT,$0-49 TEXT bytes·Equal(SB),NOSPLIT|NOFRAME,$0-49
FUNCDATA $0, ·Equal·args_stackmap(SB) FUNCDATA $0, ·Equal·args_stackmap(SB)
MOVD a_len+8(FP), R4 MOVD a_len+8(FP), R4
MOVD b_len+32(FP), R5 MOVD b_len+32(FP), R5
...@@ -36,10 +34,8 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49 ...@@ -36,10 +34,8 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49
BNE noteq BNE noteq
MOVD a_base+0(FP), R3 MOVD a_base+0(FP), R3
MOVD b_base+24(FP), R4 MOVD b_base+24(FP), R4
BL memeqbody<>(SB) MOVD $ret+48(FP), R10
BR memeqbody<>(SB)
MOVBZ R9,ret+48(FP)
RET
noteq: noteq:
MOVBZ $0,ret+48(FP) MOVBZ $0,ret+48(FP)
...@@ -51,25 +47,23 @@ equal: ...@@ -51,25 +47,23 @@ equal:
RET RET
// memequal(a, b unsafe.Pointer, size uintptr) bool // memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-25 TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
MOVD a+0(FP), R3 MOVD a+0(FP), R3
MOVD b+8(FP), R4 MOVD b+8(FP), R4
MOVD size+16(FP), R5 MOVD size+16(FP), R5
MOVD $ret+24(FP), R10
BL memeqbody<>(SB) BR memeqbody<>(SB)
MOVB R9, ret+24(FP)
RET
// memequal_varlen(a, b unsafe.Pointer) bool // memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17 TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
MOVD a+0(FP), R3 MOVD a+0(FP), R3
MOVD b+8(FP), R4 MOVD b+8(FP), R4
CMP R3, R4 CMP R3, R4
BEQ eq BEQ eq
MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure
BL memeqbody<>(SB) MOVD $ret+16(FP), R10
MOVB R9, ret+16(FP) BR memeqbody<>(SB)
RET
eq: eq:
MOVD $1, R3 MOVD $1, R3
MOVB R3, ret+16(FP) MOVB R3, ret+16(FP)
...@@ -79,7 +73,7 @@ eq: ...@@ -79,7 +73,7 @@ eq:
// R3 = s1 // R3 = s1
// R4 = s2 // R4 = s2
// R5 = len // R5 = len
// R9 = return value // R10 = addr of return value (byte)
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R5,CTR MOVD R5,CTR
CMP R5,$8 // only optimize >=8 CMP R5,$8 // only optimize >=8
...@@ -92,26 +86,19 @@ TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 ...@@ -92,26 +86,19 @@ TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
setup32a: // 8 byte aligned, >= 32 bytes setup32a: // 8 byte aligned, >= 32 bytes
SRADCC $5,R5,R6 // number of 32 byte chunks to compare SRADCC $5,R5,R6 // number of 32 byte chunks to compare
MOVD R6,CTR MOVD R6,CTR
MOVD $16,R14 // index for VSX loads and stores
loop32a: loop32a:
MOVD 0(R3),R6 // doublewords to compare LXVD2X (R3+R0), VS32 // VS32 = V0
MOVD 0(R4),R7 LXVD2X (R4+R0), VS33 // VS33 = V1
MOVD 8(R3),R8 // VCMPEQUBCC V0, V1, V2 // compare, setting CR6
MOVD 8(R4),R9 BGE CR6, noteq
CMP R6,R7 // bytes batch? LXVD2X (R3+R14), VS32
BNE noteq LXVD2X (R4+R14), VS33
MOVD 16(R3),R6 VCMPEQUBCC V0, V1, V2
MOVD 16(R4),R7 BGE CR6, noteq
CMP R8,R9 // bytes match?
MOVD 24(R3),R8
MOVD 24(R4),R9
BNE noteq
CMP R6,R7 // bytes match?
BNE noteq
ADD $32,R3 // bump up to next 32 ADD $32,R3 // bump up to next 32
ADD $32,R4 ADD $32,R4
CMP R8,R9 // bytes match? BC 16, 0, loop32a // br ctr and cr
BC 8,2,loop32a // br ctr and cr
BNE noteq
ANDCC $24,R5,R6 // Any 8 byte chunks? ANDCC $24,R5,R6 // Any 8 byte chunks?
BEQ leftover // and result is 0 BEQ leftover // and result is 0
setup8a: setup8a:
...@@ -145,9 +132,10 @@ simple: ...@@ -145,9 +132,10 @@ simple:
BNE noteq BNE noteq
BR equal BR equal
noteq: noteq:
MOVD $0, R9 MOVB $0, (R10)
RET RET
equal: equal:
MOVD $1, R9 MOVD $1, R3
MOVB R3, (R10)
RET RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment