cmd/internal/obj/x86: add AVX2 instructions needed for sha1/sha512/sha256 acceleration

This means: VPSHUFB, VPSHUFD, VPERM2F128, VPALIGNR, VPADDQ, VPADDD, VPSRLDQ, VPSLLDQ, VPSRLQ, VPSLLQ, VPSRLD, VPSLLD, VPOR, VPBLENDD, VINSERTI128, VPERM2I128, RORXL, RORXQ. Change-Id: Ief27190ee6acfa86b109262af5d999bc101e923d Reviewed-on: https://go-review.googlesource.com/22606 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Russ Cox <rsc@golang.org>

cmd/internal/obj/x86: add AVX2 instructions needed for sha1/sha512/sha256 acceleration
This means: VPSHUFB, VPSHUFD, VPERM2F128, VPALIGNR, VPADDQ, VPADDD, VPSRLDQ, VPSLLDQ, VPSRLQ, VPSLLQ, VPSRLD, VPSLLD, VPOR, VPBLENDD, VINSERTI128, VPERM2I128, RORXL, RORXQ. Change-Id: Ief27190ee6acfa86b109262af5d999bc101e923d Reviewed-on: https://go-review.googlesource.com/22606 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Russ Cox <rsc@golang.org>
009c002c · Ilya Tocar · Russ Cox · 2e32efc4 · 009c002c · 009c002c
Commit 009c002c authored Apr 29, 2016 by Ilya Tocar Committed by Russ Cox May 06, 2016
7 changed files
--- a/src/cmd/asm/internal/arch/amd64.go
+++ b/src/cmd/asm/internal/arch/amd64.go
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file encapsulates some of the odd characteristics of the
+// AMD64 instruction set, to minimize its interaction
+// with the core of the assembler.
+
+package arch
+
+import (
+	"cmd/internal/obj"
+	"cmd/internal/obj/x86"
+)
+
+// IsAMD4OP reports whether the op (as defined by an x86.A* constant) is
+// a 4-operand instruction.
+func IsAMD4OP(op obj.As) bool {
+	switch op {
+	case x86.AVPERM2F128,
+		x86.AVPALIGNR,
+		x86.AVPERM2I128,
+		x86.AVINSERTI128,
+		x86.AVPBLENDD:
+		return true
+	}
+	return false
+}
--- a/src/cmd/asm/internal/asm/asm.go
+++ b/src/cmd/asm/internal/asm/asm.go
@@ -568,6 +568,15 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.From = a[0]
 			prog.Reg = p.getRegister(prog, op, &a[1])
 			prog.To = a[2]
+		case sys.AMD64:
+			// Catch missing operand here, because we store immediate as part of From3, and can't distinguish
+			// missing operand from legal value 0 in obj/x86/asm6.
+			if arch.IsAMD4OP(op) {
+				p.errorf("4 operands required, but only 3 are provided for %s instruction", obj.Aconv(op))
+			}
+			prog.From = a[0]
+			prog.From3 = newAddr(a[1])
+			prog.To = a[2]
 		case sys.ARM64:
 			// ARM64 instructions with one input and two outputs.
 			if arch.IsARM64STLXR(op) {
@@ -583,7 +592,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.From = a[0]
 			prog.Reg = p.getRegister(prog, op, &a[1])
 			prog.To = a[2]
-		case sys.AMD64, sys.I386:
+		case sys.I386:
 			prog.From = a[0]
 			prog.From3 = newAddr(a[1])
 			prog.To = a[2]
@@ -640,6 +649,23 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
 			prog.Reg = r1
 			break
 		}
+		if p.arch.Family == sys.AMD64 {
+			// 4-operand instructions have the form  ymm1, ymm2, ymm3/m256, imm8.
+			// From3 is always just a register, so we store imm8 in its Offset field
+			// to avoid increasing the size of Prog.
+			prog.From = a[1]
+			prog.From3 = newAddr(a[2])
+			if a[0].Type != obj.TYPE_CONST {
+				p.errorf("first operand must be an immediate in %s instruction", obj.Aconv(op))
+			}
+			if prog.From3.Type != obj.TYPE_REG {
+				p.errorf("third operand must be a register in %s instruction", obj.Aconv(op))
+			}
+			prog.From3.Offset = int64(p.getImmediate(prog, op, &a[0]))
+			prog.To = a[3]
+			prog.RegTo2 = -1
+			break
+		}
 		if p.arch.Family == sys.ARM64 {
 			prog.From = a[0]
 			prog.Reg = p.getRegister(prog, op, &a[1])

--- a/src/cmd/asm/internal/asm/testdata/amd64enc.s
+++ b/src/cmd/asm/internal/asm/testdata/amd64enc.s
--- a/src/cmd/internal/obj/util.go
+++ b/src/cmd/internal/obj/util.go
@@ -140,6 +140,11 @@ func (p *Prog) String() string {

 	fmt.Fprintf(&buf, "%.5d (%v)\t%v%s", p.Pc, p.Line(), Aconv(p.As), sc)
 	sep := "\t"
+	quadOpAmd64 := p.RegTo2 == -1
+	if quadOpAmd64 {
+		fmt.Fprintf(&buf, "%s$%d", sep, p.From3.Offset)
+		sep = ", "
+	}
 	if p.From.Type != TYPE_NONE {
 		fmt.Fprintf(&buf, "%s%v", sep, Dconv(p, &p.From))
 		sep = ", "
@@ -153,6 +158,8 @@ func (p *Prog) String() string {
 		if p.From3.Type == TYPE_CONST && (p.As == ATEXT || p.As == AGLOBL) {
 			// Special case - omit $.
 			fmt.Fprintf(&buf, "%s%d", sep, p.From3.Offset)
+		} else if quadOpAmd64 {
+			fmt.Fprintf(&buf, "%s%v", sep, Rconv(int(p.From3.Reg)))
 		} else {
 			fmt.Fprintf(&buf, "%s%v", sep, Dconv(p, p.From3))
 		}
@@ -161,7 +168,7 @@ func (p *Prog) String() string {
 	if p.To.Type != TYPE_NONE {
 		fmt.Fprintf(&buf, "%s%v", sep, Dconv(p, &p.To))
 	}
-	if p.RegTo2 != REG_NONE {
+	if p.RegTo2 != REG_NONE && !quadOpAmd64 {
 		fmt.Fprintf(&buf, "%s%v", sep, Rconv(int(p.RegTo2)))
 	}
 	return buf.String()

--- a/src/cmd/internal/obj/x86/a.out.go
+++ b/src/cmd/internal/obj/x86/a.out.go
@@ -785,6 +785,24 @@ const (
 	AVPAND
 	AVPTEST
 	AVPBROADCASTB
+	AVPSHUFB
+	AVPSHUFD
+	AVPERM2F128
+	AVPALIGNR
+	AVPADDQ
+	AVPADDD
+	AVPSRLDQ
+	AVPSLLDQ
+	AVPSRLQ
+	AVPSLLQ
+	AVPSRLD
+	AVPSLLD
+	AVPOR
+	AVPBLENDD
+	AVINSERTI128
+	AVPERM2I128
+	ARORXL
+	ARORXQ

 	// from 386
 	AJCXZW

--- a/src/cmd/internal/obj/x86/anames.go
+++ b/src/cmd/internal/obj/x86/anames.go
@@ -720,6 +720,24 @@ var Anames = []string{
 	"VPAND",
 	"VPTEST",
 	"VPBROADCASTB",
+	"VPSHUFB",
+	"VPSHUFD",
+	"VPERM2F128",
+	"VPALIGNR",
+	"VPADDQ",
+	"VPADDD",
+	"VPSRLDQ",
+	"VPSLLDQ",
+	"VPSRLQ",
+	"VPSLLQ",
+	"VPSRLD",
+	"VPSLLD",
+	"VPOR",
+	"VPBLENDD",
+	"VINSERTI128",
+	"VPERM2I128",
+	"RORXL",
+	"RORXQ",
 	"JCXZW",
 	"FCMOVCC",
 	"FCMOVCS",

--- a/src/cmd/internal/obj/x86/asm6.go
+++ b/src/cmd/internal/obj/x86/asm6.go
@@ -208,6 +208,9 @@ const (
 	Zvex_rm_v_r
 	Zvex_r_v_rm
 	Zvex_v_rm_r
+	Zvex_i_rm_r
+	Zvex_i_r_v
+	Zvex_i_rm_v_r
 	Zmax
 )

@@ -847,6 +850,35 @@ var yvex_xy3 = []ytab{
 	{Yym, Yyr, Yyr, Zvex_rm_v_r, 2},
 }

+var yvex_ri3 = []ytab{
+	{Yi8, Ymb, Yrl, Zvex_i_rm_r, 2},
+}
+
+var yvex_xyi3 = []ytab{
+	{Yi8, Yxm, Yxr, Zvex_i_rm_r, 2},
+	{Yi8, Yym, Yyr, Zvex_i_rm_r, 2},
+}
+
+var yvex_yyi4 = []ytab{ //TODO don't hide 4 op, some version have xmm version
+	{Yym, Yyr, Yyr, Zvex_i_rm_v_r, 2},
+}
+
+var yvex_xyi4 = []ytab{
+	{Yxm, Yyr, Yyr, Zvex_i_rm_v_r, 2},
+}
+
+var yvex_shift = []ytab{
+	{Yi8, Yxr, Yxr, Zvex_i_r_v, 3},
+	{Yi8, Yyr, Yyr, Zvex_i_r_v, 3},
+	{Yxm, Yxr, Yxr, Zvex_rm_v_r, 2},
+	{Yxm, Yyr, Yyr, Zvex_rm_v_r, 2},
+}
+
+var yvex_shift_dq = []ytab{
+	{Yi8, Yxr, Yxr, Zvex_i_r_v, 3},
+	{Yi8, Yyr, Yyr, Zvex_i_r_v, 3},
+}
+
 var yvex_r3 = []ytab{
 	{Yml, Yrl, Yrl, Zvex_rm_v_r, 2},
 	{Yml, Yrl, Yrl, Zvex_rm_v_r, 2},
@@ -1679,6 +1711,24 @@ var optab =
 	{AVPAND, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xDB, VEX_256_66_0F_WIG, 0xDB}},
 	{AVPBROADCASTB, yvex_vpbroadcast, Pvex, [23]uint8{VEX_128_66_0F38_W0, 0x78, VEX_256_66_0F38_W0, 0x78}},
 	{AVPTEST, yvex_xy2, Pvex, [23]uint8{VEX_128_66_0F38_WIG, 0x17, VEX_256_66_0F38_WIG, 0x17}},
+	{AVPSHUFB, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F38_WIG, 0x00, VEX_256_66_0F38_WIG, 0x00}},
+	{AVPSHUFD, yvex_xyi3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x70, VEX_256_66_0F_WIG, 0x70}},
+	{AVPOR, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xeb, VEX_256_66_0F_WIG, 0xeb}},
+	{AVPADDQ, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xd4, VEX_256_66_0F_WIG, 0xd4}},
+	{AVPADDD, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xfe, VEX_256_66_0F_WIG, 0xfe}},
+	{AVPSLLD, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x72, 0xf0, VEX_256_66_0F_WIG, 0x72, 0xf0, VEX_128_66_0F_WIG, 0xf2, VEX_256_66_0F_WIG, 0xf2}},
+	{AVPSLLQ, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xf0, VEX_256_66_0F_WIG, 0x73, 0xf0, VEX_128_66_0F_WIG, 0xf3, VEX_256_66_0F_WIG, 0xf3}},
+	{AVPSRLD, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x72, 0xd0, VEX_256_66_0F_WIG, 0x72, 0xd0, VEX_128_66_0F_WIG, 0xd2, VEX_256_66_0F_WIG, 0xd2}},
+	{AVPSRLQ, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xd0, VEX_256_66_0F_WIG, 0x73, 0xd0, VEX_128_66_0F_WIG, 0xd3, VEX_256_66_0F_WIG, 0xd3}},
+	{AVPSRLDQ, yvex_shift_dq, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xd8, VEX_256_66_0F_WIG, 0x73, 0xd8}},
+	{AVPSLLDQ, yvex_shift_dq, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xf8, VEX_256_66_0F_WIG, 0x73, 0xf8}},
+	{AVPERM2F128, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_W0, 0x06}},
+	{AVPALIGNR, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x0f}},
+	{AVPBLENDD, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x02}},
+	{AVINSERTI128, yvex_xyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x38}},
+	{AVPERM2I128, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x46}},
+	{ARORXL, yvex_ri3, Pvex, [23]uint8{VEX_LZ_F2_0F3A_W0, 0xf0}},
+	{ARORXQ, yvex_ri3, Pvex, [23]uint8{VEX_LZ_F2_0F3A_W1, 0xf0}},

 	{AXACQUIRE, ynone, Px, [23]uint8{0xf2}},
 	{AXRELEASE, ynone, Px, [23]uint8{0xf3}},
@@ -3189,9 +3239,16 @@ var bpduff2 = []byte{
 // https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
 func asmvex(ctxt *obj.Link, rm, v, r *obj.Addr, vex, opcode uint8) {
 	ctxt.Vexflag = 1
-	rexR := regrex[r.Reg] & Rxr
-	rexB := regrex[rm.Reg] & Rxb
-	rexX := regrex[rm.Index] & Rxx
+	rexR := 0
+	if r != nil {
+		rexR = regrex[r.Reg] & Rxr
+	}
+	rexB := 0
+	rexX := 0
+	if rm != nil {
+		rexB = regrex[rm.Reg] & Rxb
+		rexX = regrex[rm.Index] & Rxx
+	}
 	vexM := (vex >> 3) & 0xF
 	vexWLP := vex & 0x87
 	vexV := byte(0)
@@ -3477,6 +3534,27 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
 				asmvex(ctxt, &p.From, p.From3, &p.To, o.op[z], o.op[z+1])
 				asmand(ctxt, p, &p.From, &p.To)

+			case Zvex_i_r_v:
+				asmvex(ctxt, p.From3, &p.To, nil, o.op[z], o.op[z+1])
+				regnum := byte(0x7)
+				if p.From3.Reg >= REG_X0 && p.From3.Reg <= REG_X15 {
+					regnum &= byte(p.From3.Reg - REG_X0)
+				} else {
+					regnum &= byte(p.From3.Reg - REG_Y0)
+				}
+				ctxt.AsmBuf.Put1(byte(o.op[z+2]) | regnum)
+				ctxt.AsmBuf.Put1(byte(p.From.Offset))
+
+			case Zvex_i_rm_v_r:
+				asmvex(ctxt, &p.From, p.From3, &p.To, o.op[z], o.op[z+1])
+				asmand(ctxt, p, &p.From, &p.To)
+				ctxt.AsmBuf.Put1(byte(p.From3.Offset))
+
+			case Zvex_i_rm_r:
+				asmvex(ctxt, p.From3, nil, &p.To, o.op[z], o.op[z+1])
+				asmand(ctxt, p, p.From3, &p.To)
+				ctxt.AsmBuf.Put1(byte(p.From.Offset))
+
 			case Zvex_v_rm_r:
 				asmvex(ctxt, p.From3, &p.From, &p.To, o.op[z], o.op[z+1])
 				asmand(ctxt, p, p.From3, &p.To)