[dev.ssa] cmd/compile: add more on ARM64 SSA

Support the following: - Shifts. ARM64 machine instructions only use lowest 6 bits of the shift (i.e. mod 64). Use conditional selection instruction to ensure Go semantics. - Zero/Move. Alignment is ensured. - Hmul, Avg64u, Sqrt. - reserve R18 (platform register in ARM64 ABI) and R29 (frame pointer in ARM64 ABI). Everything compiles, all.bash passed (with non-SSA test disabled). Change-Id: Ia8ed58dae5cbc001946f0b889357b258655078b1 Reviewed-on: https://go-review.googlesource.com/25290 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>

[dev.ssa] cmd/compile: add more on ARM64 SSA
Support the following:
- Shifts. ARM64 machine instructions only use lowest 6 bits of the shift (i.e. mod 64). Use conditional selection instruction to ensure Go semantics.
- Zero/Move. Alignment is ensured.
- Hmul, Avg64u, Sqrt.
- reserve R18 (platform register in ARM64 ABI) and R29 (frame pointer in ARM64 ABI).

Everything compiles, all.bash passed (with non-SSA test disabled).

Change-Id: Ia8ed58dae5cbc001946f0b889357b258655078b1
Reviewed-on: https://go-review.googlesource.com/25290
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
83208504 · Cherry Zhang · 2d16e431 · 83208504 · 83208504 · 83208504
Commit 83208504 authored Jul 22, 2016 by Cherry Zhang
10 changed files
--- a/src/cmd/compile/internal/arm64/prog.go
+++ b/src/cmd/compile/internal/arm64/prog.go
@@ -44,24 +44,35 @@ var progtable = [arm64.ALAST & obj.AMask]obj.ProgInfo{
 	// Integer
 	arm64.AADD & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASUB & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.ANEG & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ANEG & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite},
 	arm64.AAND & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AORR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AEOR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AMVN & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite},
 	arm64.AMUL & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AMULW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASMULL & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AUMULL & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.ASMULH & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
-	arm64.AUMULH & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ASMULH & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUMULH & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ASDIV & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AUDIV & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.ASDIVW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUDIVW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AREM & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUREM & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AREMW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
+	arm64.AUREMW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ALSL & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ALSR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AASR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.ACMP & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead},
+	arm64.ACMPW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RegRead},
 	arm64.AADC & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.UseCarry},
 	arm64.AROR & obj.AMask:   {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
 	arm64.AADDS & obj.AMask:  {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.SetCarry},
+	arm64.ACSET & obj.AMask:  {Flags: gc.SizeQ | gc.RightWrite},
+	arm64.ACSEL & obj.AMask:  {Flags: gc.SizeQ | gc.RegRead | gc.RightWrite},

 	// Floating point.
 	arm64.AFADDD & obj.AMask:  {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},

--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -32,7 +32,7 @@ var ssaRegToReg = []int16{
 	arm64.REG_R15,
 	arm64.REG_R16,
 	arm64.REG_R17,
-	arm64.REG_R18,
+	arm64.REG_R18, // platform register, not used
 	arm64.REG_R19,
 	arm64.REG_R20,
 	arm64.REG_R21,
@@ -42,8 +42,8 @@ var ssaRegToReg = []int16{
 	arm64.REG_R25,
 	arm64.REG_R26,
 	// R27 = REGTMP not used in regalloc
-	arm64.REGG, // R28
-	arm64.REG_R29,
+	arm64.REGG,    // R28
+	arm64.REG_R29, // frame pointer, not used
 	// R30 = REGLINK not used in regalloc
 	arm64.REGSP, // R31

@@ -229,6 +229,11 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpARM64XOR,
 		ssa.OpARM64BIC,
 		ssa.OpARM64MUL,
+		ssa.OpARM64MULW,
+		ssa.OpARM64MULH,
+		ssa.OpARM64UMULH,
+		ssa.OpARM64MULL,
+		ssa.OpARM64UMULL,
 		ssa.OpARM64DIV,
 		ssa.OpARM64UDIV,
 		ssa.OpARM64DIVW,
@@ -237,6 +242,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpARM64UMOD,
 		ssa.OpARM64MODW,
 		ssa.OpARM64UMODW,
+		ssa.OpARM64SLL,
+		ssa.OpARM64SRL,
+		ssa.OpARM64SRA,
 		ssa.OpARM64FADDS,
 		ssa.OpARM64FADDD,
 		ssa.OpARM64FSUBS,
@@ -259,7 +267,12 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpARM64ANDconst,
 		ssa.OpARM64ORconst,
 		ssa.OpARM64XORconst,
-		ssa.OpARM64BICconst:
+		ssa.OpARM64BICconst,
+		ssa.OpARM64SLLconst,
+		ssa.OpARM64SRLconst,
+		ssa.OpARM64SRAconst,
+		ssa.OpARM64RORconst,
+		ssa.OpARM64RORWconst:
 		p := gc.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_CONST
 		p.From.Offset = v.AuxInt
@@ -415,6 +428,107 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.From.Reg = gc.SSARegNum(v.Args[0])
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64CSELULT:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG // assembler encodes conditional bits in Reg
+		p.From.Reg = arm64.COND_LO
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.From3 = &obj.Addr{Type: obj.TYPE_REG, Reg: gc.SSARegNum(v.Args[1])}
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpARM64DUFFZERO:
+		// runtime.duffzero expects start address - 8 in R16
+		p := gc.Prog(arm64.ASUB)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 8
+		p.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REG_R16
+		p = gc.Prog(obj.ADUFFZERO)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
+		p.To.Offset = v.AuxInt
+	case ssa.OpARM64LoweredZero:
+		// MOVD.P	ZR, 8(R16)
+		// CMP	Rarg1, R16
+		// BLE	-2(PC)
+		// arg1 is the address of the last element to zero
+		// auxint is alignment
+		var sz int64
+		var mov obj.As
+		switch {
+		case v.AuxInt%8 == 0:
+			sz = 8
+			mov = arm64.AMOVD
+		case v.AuxInt%4 == 0:
+			sz = 4
+			mov = arm64.AMOVW
+		case v.AuxInt%2 == 0:
+			sz = 2
+			mov = arm64.AMOVH
+		default:
+			sz = 1
+			mov = arm64.AMOVB
+		}
+		p := gc.Prog(mov)
+		p.Scond = arm64.C_XPOST
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = arm64.REGZERO
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = arm64.REG_R16
+		p.To.Offset = sz
+		p2 := gc.Prog(arm64.ACMP)
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = gc.SSARegNum(v.Args[1])
+		p2.Reg = arm64.REG_R16
+		p3 := gc.Prog(arm64.ABLE)
+		p3.To.Type = obj.TYPE_BRANCH
+		gc.Patch(p3, p)
+	case ssa.OpARM64LoweredMove:
+		// MOVD.P	8(R16), Rtmp
+		// MOVD.P	Rtmp, 8(R17)
+		// CMP	Rarg2, R16
+		// BLE	-3(PC)
+		// arg2 is the address of the last element of src
+		// auxint is alignment
+		var sz int64
+		var mov obj.As
+		switch {
+		case v.AuxInt%8 == 0:
+			sz = 8
+			mov = arm64.AMOVD
+		case v.AuxInt%4 == 0:
+			sz = 4
+			mov = arm64.AMOVW
+		case v.AuxInt%2 == 0:
+			sz = 2
+			mov = arm64.AMOVH
+		default:
+			sz = 1
+			mov = arm64.AMOVB
+		}
+		p := gc.Prog(mov)
+		p.Scond = arm64.C_XPOST
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = arm64.REG_R16
+		p.From.Offset = sz
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = arm64.REGTMP
+		p2 := gc.Prog(mov)
+		p2.Scond = arm64.C_XPOST
+		p2.From.Type = obj.TYPE_REG
+		p2.From.Reg = arm64.REGTMP
+		p2.To.Type = obj.TYPE_MEM
+		p2.To.Reg = arm64.REG_R17
+		p2.To.Offset = sz
+		p3 := gc.Prog(arm64.ACMP)
+		p3.From.Type = obj.TYPE_REG
+		p3.From.Reg = gc.SSARegNum(v.Args[2])
+		p3.Reg = arm64.REG_R16
+		p4 := gc.Prog(arm64.ABLE)
+		p4.To.Type = obj.TYPE_BRANCH
+		gc.Patch(p4, p)
 	case ssa.OpARM64CALLstatic:
 		if v.Aux.(*gc.Sym) == gc.Deferreturn.Sym {
 			// Deferred calls will appear to be returning to
@@ -507,7 +621,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpARM64GreaterEqualU:
 		// generate boolean values using CSET
 		p := gc.Prog(arm64.ACSET)
-		p.From.Type = obj.TYPE_REG
+		p.From.Type = obj.TYPE_REG // assembler encodes conditional bits in Reg
 		p.From.Reg = condBits[v.Op]
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = gc.SSARegNum(v)

--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
--- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -312,8 +312,8 @@ var genericOps = []opData{
 	// Memory operations
 	{name: "Load", argLength: 2},                            // Load from arg0.  arg1=memory
 	{name: "Store", argLength: 3, typ: "Mem", aux: "Int64"}, // Store arg1 to arg0.  arg2=memory, auxint=size.  Returns memory.
-	{name: "Move", argLength: 3, aux: "Int64"},              // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size.  Returns memory.
-	{name: "Zero", argLength: 2, aux: "Int64"},              // arg0=destptr, arg1=mem, auxint=size. Returns memory.
+	{name: "Move", argLength: 3, typ: "Mem", aux: "Int64"},  // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size.  Returns memory.
+	{name: "Zero", argLength: 2, typ: "Mem", aux: "Int64"},  // arg0=destptr, arg1=mem, auxint=size. Returns memory.

 	// Function calls. Arguments to the call have already been written to the stack.
 	// Return values appear on the stack. The method receiver, if any, is treated

--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -486,6 +486,8 @@ func (s *regAllocState) init(f *Func) {
 			s.allocatable &^= 1 << 15 // R15
 		case "arm":
 			s.allocatable &^= 1 << 9 // R9
+		case "arm64":
+			// nothing to do?
 		default:
 			s.f.Config.fe.Unimplementedf(0, "arch %s not implemented", s.f.Config.arch)
 		}

--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -304,6 +304,19 @@ func duffAMD64(size int64) (int64, int64) {
 	return off, adj
 }

+// moveSize returns the widest move size in bytes (8, 4, 2 or 1) that divides align; 8 is chosen only when c.IntSize is 8.
+func moveSize(align int64, c *Config) int64 {
+	switch {
+	case align%8 == 0 && c.IntSize == 8: // 8-byte moves additionally require c.IntSize == 8; otherwise fall through to 4
+		return 8
+	case align%4 == 0:
+		return 4
+	case align%2 == 0:
+		return 2
+	}
+	return 1 // align is odd: only single-byte moves are possible
+}
+
 // mergePoint finds a block among a's blocks which dominates b and is itself
 // dominated by all of a's blocks. Returns nil if it can't find one.
 // Might return nil even if one does exist.

--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
--- a/src/cmd/compile/internal/ssa/schedule.go
+++ b/src/cmd/compile/internal/ssa/schedule.go
@@ -84,7 +84,7 @@ func schedule(f *Func) {
 		// Compute score. Larger numbers are scheduled closer to the end of the block.
 		for _, v := range b.Values {
 			switch {
-			case v.Op == OpAMD64LoweredGetClosurePtr || v.Op == OpPPC64LoweredGetClosurePtr || v.Op == OpARMLoweredGetClosurePtr || v.Op == Op386LoweredGetClosurePtr:
+			case v.Op == OpAMD64LoweredGetClosurePtr || v.Op == OpPPC64LoweredGetClosurePtr || v.Op == OpARMLoweredGetClosurePtr || v.Op == OpARM64LoweredGetClosurePtr || v.Op == Op386LoweredGetClosurePtr:
 				// We also score GetLoweredClosurePtr as early as possible to ensure that the
 				// context register is not stomped. GetLoweredClosurePtr should only appear
 				// in the entry block where there are no phi functions, so there is no