Commit 8d881b81 authored by Russ Cox

cmd/asm: correct, complete newly added AVX instructions

Use the standard names, for discoverability.
Use the standard register arguments, for correctness.
Implement all possible arguments, for completeness.
Enable the corresponding tests now that everything is standard.
Update the uses in package runtime.

Fixes #14068.

Change-Id: I8e1af9a41e7d02d98c2a82af3d4cdb3e9204824f
Reviewed-on: https://go-review.googlesource.com/18852
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
parent 7f620a57
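For reference (not part of the commit), a minimal sketch of how the renamed instructions read in Go assembler syntax after this change: unaligned 256-bit loads with VMOVDQU into Y registers, a byte-wise compare with VPCMPEQB, and the result mask collected with VPMOVMSKB. The function name eq32 and its frame layout are hypothetical, and real code must first check for AVX2 support, as the runtime code below does.

#include "textflag.h"

// func eq32(a, b *byte) bool — hypothetical 32-byte comparison helper
TEXT ·eq32(SB), NOSPLIT, $0-17
	MOVQ a+0(FP), SI
	MOVQ b+8(FP), DI
	VMOVDQU (SI), Y0          // 32-byte unaligned load (previously written MOVHDU ..., X0)
	VMOVDQU (DI), Y1
	VPCMPEQB Y1, Y0, Y2       // per-byte equality: 0xFF where the bytes match
	VPMOVMSKB Y2, AX          // gather the 32 mask bits into AX
	VZEROUPPER                // clear upper AVX state before returning
	XORL $0xffffffff, AX      // AX becomes 0 only if all 32 bytes matched
	SETEQ ret+16(FP)
	RET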
@@ -551,6 +551,7 @@ const (
 	AFXRSTOR64
 	AFXSAVE
 	AFXSAVE64
+	ALDDQU
 	ALDMXCSR
 	AMASKMOVOU
 	AMASKMOVQ
@@ -751,9 +752,9 @@ const (
 	APCLMULQDQ
 	AVZEROUPPER
-	AMOVHDU
-	AMOVNTHD
-	AMOVHDA
+	AVMOVDQU
+	AVMOVNTDQ
+	AVMOVDQA
 	AVPCMPEQB
 	AVPXOR
 	AVPMOVMSKB
@@ -500,6 +500,7 @@ var Anames = []string{
 	"FXRSTOR64",
 	"FXSAVE",
 	"FXSAVE64",
+	"LDDQU",
 	"LDMXCSR",
 	"MASKMOVOU",
 	"MASKMOVQ",
@@ -692,9 +693,9 @@ var Anames = []string{
 	"PSHUFD",
 	"PCLMULQDQ",
 	"VZEROUPPER",
-	"MOVHDU",
-	"MOVNTHD",
-	"MOVHDA",
+	"VMOVDQU",
+	"VMOVNTDQ",
+	"VMOVDQA",
 	"VPCMPEQB",
 	"VPXOR",
 	"VPMOVMSKB",
(One file's diff is collapsed and not shown.)
@@ -1350,14 +1350,14 @@ hugeloop:
 hugeloop_avx2:
 	CMPQ BX, $64
 	JB bigloop_avx2
-	MOVHDU (SI), X0
-	MOVHDU (DI), X1
-	MOVHDU 32(SI), X2
-	MOVHDU 32(DI), X3
-	VPCMPEQB X1, X0, X4
-	VPCMPEQB X2, X3, X5
-	VPAND X4, X5, X6
-	VPMOVMSKB X6, DX
+	VMOVDQU (SI), Y0
+	VMOVDQU (DI), Y1
+	VMOVDQU 32(SI), Y2
+	VMOVDQU 32(DI), Y3
+	VPCMPEQB Y1, Y0, Y4
+	VPCMPEQB Y2, Y3, Y5
+	VPAND Y4, Y5, Y6
+	VPMOVMSKB Y6, DX
 	ADDQ $64, SI
 	ADDQ $64, DI
 	SUBQ $64, BX
@@ -1614,16 +1614,16 @@ big_loop:
 	// Compare 64-bytes per loop iteration.
 	// Loop is unrolled and uses AVX2.
 big_loop_avx2:
-	MOVHDU (SI), X2
-	MOVHDU (DI), X3
-	MOVHDU 32(SI), X4
-	MOVHDU 32(DI), X5
-	VPCMPEQB X2, X3, X0
-	VPMOVMSKB X0, AX
+	VMOVDQU (SI), Y2
+	VMOVDQU (DI), Y3
+	VMOVDQU 32(SI), Y4
+	VMOVDQU 32(DI), Y5
+	VPCMPEQB Y2, Y3, Y0
+	VPMOVMSKB Y0, AX
 	XORL $0xffffffff, AX
 	JNE diff32_avx2
-	VPCMPEQB X4, X5, X6
-	VPMOVMSKB X6, AX
+	VPCMPEQB Y4, Y5, Y6
+	VPMOVMSKB Y6, AX
 	XORL $0xffffffff, AX
 	JNE diff64_avx2
@@ -1908,26 +1908,26 @@ avx2:
 	JNE no_avx2
 	MOVD AX, X0
 	LEAQ -32(SI)(BX*1), R11
-	VPBROADCASTB X0, X1
+	VPBROADCASTB X0, Y1
 avx2_loop:
-	MOVHDU (DI), X2
-	VPCMPEQB X1, X2, X3
-	VPTEST X3, X3
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
 	JNZ avx2success
 	ADDQ $32, DI
 	CMPQ DI, R11
 	JLT avx2_loop
 	MOVQ R11, DI
-	MOVHDU (DI), X2
-	VPCMPEQB X1, X2, X3
-	VPTEST X3, X3
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
 	JNZ avx2success
 	VZEROUPPER
 	MOVQ $-1, (R8)
 	RET
 avx2success:
-	VPMOVMSKB X3, DX
+	VPMOVMSKB Y3, DX
 	BSFL DX, DX
 	SUBQ SI, DI
 	ADDQ DI, DX
@@ -65,40 +65,40 @@ loop:
 	JMP tail
 loop_preheader_avx2:
-	VPXOR X0, X0, X0
+	VPXOR Y0, Y0, Y0
 	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
 	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
 	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
 	CMPQ BX, $0x2000000
 	JAE loop_preheader_avx2_huge
 loop_avx2:
-	MOVHDU X0, 0(DI)
-	MOVHDU X0, 32(DI)
-	MOVHDU X0, 64(DI)
-	MOVHDU X0, 96(DI)
+	VMOVDQU Y0, 0(DI)
+	VMOVDQU Y0, 32(DI)
+	VMOVDQU Y0, 64(DI)
+	VMOVDQU Y0, 96(DI)
 	SUBQ $128, BX
 	ADDQ $128, DI
 	CMPQ BX, $128
 	JAE loop_avx2
-	MOVHDU X0, -32(DI)(BX*1)
-	MOVHDU X0, -64(DI)(BX*1)
-	MOVHDU X0, -96(DI)(BX*1)
-	MOVHDU X0, -128(DI)(BX*1)
+	VMOVDQU Y0, -32(DI)(BX*1)
+	VMOVDQU Y0, -64(DI)(BX*1)
+	VMOVDQU Y0, -96(DI)(BX*1)
+	VMOVDQU Y0, -128(DI)(BX*1)
 	VZEROUPPER
 	RET
 loop_preheader_avx2_huge:
 	// Align to 32 byte boundary
-	MOVHDU X0, 0(DI)
+	VMOVDQU Y0, 0(DI)
 	MOVQ DI, SI
 	ADDQ $32, DI
 	ANDQ $~31, DI
 	SUBQ DI, SI
 	ADDQ SI, BX
 loop_avx2_huge:
-	MOVNTHD X0, 0(DI)
-	MOVNTHD X0, 32(DI)
-	MOVNTHD X0, 64(DI)
-	MOVNTHD X0, 96(DI)
+	VMOVNTDQ Y0, 0(DI)
+	VMOVNTDQ Y0, 32(DI)
+	VMOVNTDQ Y0, 64(DI)
+	VMOVNTDQ Y0, 96(DI)
 	SUBQ $128, BX
 	ADDQ $128, DI
 	CMPQ BX, $128
@@ -108,10 +108,10 @@ loop_avx2_huge:
 	// should be used in conjunction with MOVNTDQ instructions..."
 	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
 	SFENCE
-	MOVHDU X0, -32(DI)(BX*1)
-	MOVHDU X0, -64(DI)(BX*1)
-	MOVHDU X0, -96(DI)(BX*1)
-	MOVHDU X0, -128(DI)(BX*1)
+	VMOVDQU Y0, -32(DI)(BX*1)
+	VMOVDQU Y0, -64(DI)(BX*1)
+	VMOVDQU Y0, -96(DI)(BX*1)
+	VMOVDQU Y0, -128(DI)(BX*1)
 	VZEROUPPER
 	RET