runtime: optimize indexbytebody on amd64

Use avx2 to compare 32 bytes per iteration. Results (haswell): name old time/op new time/op delta IndexByte32-6 15.5ns ± 0% 14.7ns ± 5% -4.87% (p=0.000 n=16+20) IndexByte4K-6 360ns ± 0% 183ns ± 0% -49.17% (p=0.000 n=19+20) IndexByte4M-6 384µs ± 0% 256µs ± 1% -33.41% (p=0.000 n=20+20) IndexByte64M-6 6.20ms ± 0% 4.18ms ± 1% -32.52% (p=0.000 n=19+20) IndexBytePortable32-6 73.4ns ± 5% 75.8ns ± 3% +3.35% (p=0.000 n=20+19) IndexBytePortable4K-6 5.15µs ± 0% 5.15µs ± 0% ~ (all samples are equal) IndexBytePortable4M-6 5.26ms ± 0% 5.25ms ± 0% -0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 84.1ms ± 0% 84.1ms ± 0% -0.08% (p=0.012 n=18+20) Index32-6 352ns ± 0% 352ns ± 0% ~ (all samples are equal) Index4K-6 53.8µs ± 0% 53.8µs ± 0% -0.03% (p=0.000 n=16+18) Index4M-6 55.4ms ± 0% 55.4ms ± 0% ~ (p=0.149 n=20+19) Index64M-6 886ms ± 0% 886ms ± 0% ~ (p=0.108 n=20+20) IndexEasy32-6 80.3ns ± 0% 80.1ns ± 0% -0.21% (p=0.000 n=20+20) IndexEasy4K-6 426ns ± 0% 215ns ± 0% -49.53% (p=0.000 n=20+20) IndexEasy4M-6 388µs ± 0% 262µs ± 1% -32.42% (p=0.000 n=18+20) IndexEasy64M-6 6.20ms ± 0% 4.19ms ± 1% -32.47% (p=0.000 n=18+20) name old speed new speed delta IndexByte32-6 2.06GB/s ± 1% 2.17GB/s ± 5% +5.19% (p=0.000 n=18+20) IndexByte4K-6 11.4GB/s ± 0% 22.3GB/s ± 0% +96.45% (p=0.000 n=17+20) IndexByte4M-6 10.9GB/s ± 0% 16.4GB/s ± 1% +50.17% (p=0.000 n=20+20) IndexByte64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.19% (p=0.000 n=19+20) IndexBytePortable32-6 436MB/s ± 5% 422MB/s ± 3% -3.27% (p=0.000 n=20+19) IndexBytePortable4K-6 795MB/s ± 0% 795MB/s ± 0% ~ (p=0.940 n=17+18) IndexBytePortable4M-6 798MB/s ± 0% 799MB/s ± 0% +0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 798MB/s ± 0% 798MB/s ± 0% +0.08% (p=0.011 n=18+20) Index32-6 90.9MB/s ± 0% 90.9MB/s ± 0% -0.00% (p=0.025 n=20+20) Index4K-6 76.1MB/s ± 0% 76.1MB/s ± 0% +0.03% (p=0.000 n=14+15) Index4M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.076 n=20+19) Index64M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.456 n=20+17) IndexEasy32-6 399MB/s ± 0% 399MB/s ± 0% +0.20% (p=0.000 n=20+19) IndexEasy4K-6 9.60GB/s ± 0% 19.02GB/s ± 0% +98.19% (p=0.000 n=20+20) IndexEasy4M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +47.98% (p=0.000 n=18+20) IndexEasy64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.08% (p=0.000 n=18+20) Change-Id: I46075921dde9f3580a89544c0b3a2d8c9181ebc4 Reviewed-on: https://go-review.googlesource.com/16484Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Klaus Post <klauspost@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org>

runtime: optimize indexbytebody on amd64
Use avx2 to compare 32 bytes per iteration. Results (haswell): name old time/op new time/op delta IndexByte32-6 15.5ns ± 0% 14.7ns ± 5% -4.87% (p=0.000 n=16+20) IndexByte4K-6 360ns ± 0% 183ns ± 0% -49.17% (p=0.000 n=19+20) IndexByte4M-6 384µs ± 0% 256µs ± 1% -33.41% (p=0.000 n=20+20) IndexByte64M-6 6.20ms ± 0% 4.18ms ± 1% -32.52% (p=0.000 n=19+20) IndexBytePortable32-6 73.4ns ± 5% 75.8ns ± 3% +3.35% (p=0.000 n=20+19) IndexBytePortable4K-6 5.15µs ± 0% 5.15µs ± 0% ~ (all samples are equal) IndexBytePortable4M-6 5.26ms ± 0% 5.25ms ± 0% -0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 84.1ms ± 0% 84.1ms ± 0% -0.08% (p=0.012 n=18+20) Index32-6 352ns ± 0% 352ns ± 0% ~ (all samples are equal) Index4K-6 53.8µs ± 0% 53.8µs ± 0% -0.03% (p=0.000 n=16+18) Index4M-6 55.4ms ± 0% 55.4ms ± 0% ~ (p=0.149 n=20+19) Index64M-6 886ms ± 0% 886ms ± 0% ~ (p=0.108 n=20+20) IndexEasy32-6 80.3ns ± 0% 80.1ns ± 0% -0.21% (p=0.000 n=20+20) IndexEasy4K-6 426ns ± 0% 215ns ± 0% -49.53% (p=0.000 n=20+20) IndexEasy4M-6 388µs ± 0% 262µs ± 1% -32.42% (p=0.000 n=18+20) IndexEasy64M-6 6.20ms ± 0% 4.19ms ± 1% -32.47% (p=0.000 n=18+20) name old speed new speed delta IndexByte32-6 2.06GB/s ± 1% 2.17GB/s ± 5% +5.19% (p=0.000 n=18+20) IndexByte4K-6 11.4GB/s ± 0% 22.3GB/s ± 0% +96.45% (p=0.000 n=17+20) IndexByte4M-6 10.9GB/s ± 0% 16.4GB/s ± 1% +50.17% (p=0.000 n=20+20) IndexByte64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.19% (p=0.000 n=19+20) IndexBytePortable32-6 436MB/s ± 5% 422MB/s ± 3% -3.27% (p=0.000 n=20+19) IndexBytePortable4K-6 795MB/s ± 0% 795MB/s ± 0% ~ (p=0.940 n=17+18) IndexBytePortable4M-6 798MB/s ± 0% 799MB/s ± 0% +0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 798MB/s ± 0% 798MB/s ± 0% +0.08% (p=0.011 n=18+20) Index32-6 90.9MB/s ± 0% 90.9MB/s ± 0% -0.00% (p=0.025 n=20+20) Index4K-6 76.1MB/s ± 0% 76.1MB/s ± 0% +0.03% (p=0.000 n=14+15) Index4M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.076 n=20+19) Index64M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.456 n=20+17) IndexEasy32-6 399MB/s ± 0% 399MB/s ± 0% +0.20% (p=0.000 n=20+19) IndexEasy4K-6 9.60GB/s ± 0% 19.02GB/s ± 0% +98.19% (p=0.000 n=20+20) IndexEasy4M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +47.98% (p=0.000 n=18+20) IndexEasy64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.08% (p=0.000 n=18+20) Change-Id: I46075921dde9f3580a89544c0b3a2d8c9181ebc4 Reviewed-on: https://go-review.googlesource.com/16484Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Klaus Post <klauspost@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
321a4072 · Ilya Tocar · Keith Randall · ffb20631 · 321a4072 · 321a4072
Commit 321a4072 authored Oct 29, 2015 by Ilya Tocar Committed by Keith Randall Nov 06, 2015
Showing with 59 additions and 8 deletions

a.out.go src/cmd/internal/obj/x86/a.out.go +2 -0

anames.go src/cmd/internal/obj/x86/anames.go +2 -0

asm6.go src/cmd/internal/obj/x86/asm6.go +20 -8

asm_amd64.s src/runtime/asm_amd64.s +35 -0

No files found.
--- a/src/cmd/internal/obj/x86/a.out.go
+++ b/src/cmd/internal/obj/x86/a.out.go
@@ -749,6 +749,8 @@ const (
 	AVPCMPEQB
 	AVPMOVMSKB
 	AVPAND
+	AVPTEST
+	AVPBROADCASTB

 	// from 386
 	AJCXZW

--- a/src/cmd/internal/obj/x86/anames.go
+++ b/src/cmd/internal/obj/x86/anames.go
@@ -690,6 +690,8 @@ var Anames = []string{
 	"VPCMPEQB",
 	"VPMOVMSKB",
 	"VPAND",
+	"VPTEST",
+	"VPBROADCASTB",
 	"JCXZW",
 	"FCMOVCC",
 	"FCMOVCS",

--- a/src/cmd/internal/obj/x86/asm6.go
+++ b/src/cmd/internal/obj/x86/asm6.go
@@ -219,8 +219,9 @@ const (
 	Pf2   = 0xf2 /* xmm escape 1: f2 0f */
 	Pf3   = 0xf3 /* xmm escape 2: f3 0f */
 	Pq3   = 0x67 /* xmm escape 3: 66 48 0f */
-	Pvex1 = 0xc5 /* 66 escape, vex encoding */
-	Pvex2 = 0xc6 /* f3 escape, vex encoding */
+	Pvex1 = 0xc5 /* 66.0f escape, vex encoding */
+	Pvex2 = 0xc6 /* f3.0f escape, vex encoding */
+	Pvex3 = 0xc7 /* 66.0f38 escape, vex encoding */
 	Pw    = 0x48 /* Rex.w */
 	Pw8   = 0x90 // symbolic; exact value doesn't matter
 	Py    = 0x80 /* defaults to 64-bit mode */
@@ -631,6 +632,11 @@ var yxr_ml_vex = []ytab{
 	{Yxr, Ynone, Yml, Zr_m_xm_vex, 1},
 }

+var yml_xr_vex = []ytab{
+	{Yml, Ynone, Yxr, Zm_r_xm_vex, 1},
+	{Yxr, Ynone, Yxr, Zm_r_xm_vex, 1},
+}
+
 var yxm_xm_xm = []ytab{
 	{Yxr, Yxr, Yxr, Zr_r_r_vex, 1},
 	{Yxm, Yxr, Yxr, Zr_r_r_vex, 1},
@@ -1510,6 +1516,8 @@ var optab =
 	{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
 	{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
 	{AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}},
+	{AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}},
+	{AVPTEST, yml_xr_vex, Pvex3, [23]uint8{0x17, 0x17}},
 	{obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
 	{obj.ATYPE, nil, 0, [23]uint8{}},
 	{obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
@@ -2965,13 +2973,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr
 	rexX := regrex[from.Index]
 	var prefBit uint8
 	// This will go into VEX.PP field.
-	if pref == Pvex1 {
+	if pref == Pvex1 || pref == Pvex3 {
 		prefBit = 1
 	} else if pref == Pvex2 {
 		prefBit = 2
-	} // TODO add Pvex0,Pvex3
+	} // TODO add Pvex0

-	if rexX == 0 && rexB == 0 { // 2-byte vex prefix
+	if rexX == 0 && rexB == 0 && pref != Pvex3 { // 2-byte vex prefix
 		// In 2-byte case, first byte is always C5
 		ctxt.Andptr[0] = 0xc5
 		ctxt.Andptr = ctxt.Andptr[1:]
@@ -2998,9 +3006,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr
 		ctxt.Andptr[0] = 0xc4
 		ctxt.Andptr = ctxt.Andptr[1:]

-		// Encode VEX.mmmmm with prefix value, for now assume 0F 38,
-		// which encodes as 1.
-		ctxt.Andptr[0] = 0x1 // TODO handle different prefix
+		// Encode VEX.mmmmm with prefix value, assume 0F,
+		// which encodes as 1, unless 0F38 was specified with pvex3.
+		ctxt.Andptr[0] = 0x1 // TODO handle 0F3A
+		if pref == Pvex3 {
+			ctxt.Andptr[0] = 0x2
+		}
+
 		// REX.[RXB] are inverted and encoded in 3 upper bits
 		if rexR == 0 {
 			ctxt.Andptr[0] |= 0x80

--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1940,6 +1940,9 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0
 	CMPQ BX, $16
 	JLT small

+	CMPQ BX, $32
+	JA avx2
+no_avx2:
 	// round up to first 16-byte boundary
 	TESTQ $15, SI
 	JZ aligned
@@ -2003,6 +2006,38 @@ small:
 	MOVQ $-1, (R8)
 	RET

+avx2:
+	CMPB   runtime·support_avx2(SB), $1
+	JNE no_avx2
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, X1
+avx2_loop:
+	MOVHDU (DI), X2
+	VPCMPEQB X1, X2, X3
+	VPTEST X3, X3
+	JNZ avx2success
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLT avx2_loop
+	MOVQ R11, DI
+	MOVHDU (DI), X2
+	VPCMPEQB X1, X2, X3
+	VPTEST X3, X3
+	JNZ avx2success
+	VZEROUPPER
+	MOVQ $-1, (R8)
+	RET
+
+avx2success:
+	VPMOVMSKB X3, DX
+	BSFL DX, DX
+	SUBQ SI, DI
+	ADDQ DI, DX
+	MOVQ DX, (R8)
+	VZEROUPPER
+	RET
+
 // we've found the chunk containing the byte
 // now just figure out which specific byte it is
 ssesuccess: