Commit 8756d925 authored by Cherry Zhang

[dev.ssa] cmd/compile: decompose 64-bit integer on ARM

Introduce dec64 rules to (generically) decompose 64-bit integer on
32-bit architectures. 64-bit integer is composed/decomposed with
Int64Make/Hi/Lo ops, as for complex types.

The idea of dealing with Add64 is the following:

(Add64 (Int64Make xh xl) (Int64Make yh yl))
->
(Int64Make
	(Add32withcarry xh yh (Select0 (Add32carry xl yl)))
	(Select1 (Add32carry xl yl)))

where Add32carry returns a tuple (flags,uint32). Select0 and Select1
read the first and the second component of the tuple, respectively.
The two Add32carry will be CSE'd.

Similarly for multiplication, Mul32uhilo returns a tuple (hi, lo).

Also add support of KeepAlive, to fix build after merge.

Tests addressed_ssa.go, array_ssa.go, break_ssa.go, chan_ssa.go,
cmp_ssa.go, ctl_ssa.go, map_ssa.go, and string_ssa.go in
cmd/compile/internal/gc/testdata passed.

Progress on SSA for ARM. Still not complete.

Updates #15365.

Change-Id: I7867c76785a456312de5d8398a6b3f7ca5a4f7ec
Reviewed-on: https://go-review.googlesource.com/23213
Reviewed-by: Keith Randall <khr@golang.org>
parent 31e13c83
......@@ -149,7 +149,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Name = obj.NAME_AUTO
}
case ssa.OpARMADD,
ssa.OpARMADC,
ssa.OpARMSUB,
ssa.OpARMSBC,
ssa.OpARMRSB,
ssa.OpARMAND,
ssa.OpARMOR,
......@@ -165,6 +167,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpARMADDS,
ssa.OpARMSUBS:
r := gc.SSARegNum(v)
r1 := gc.SSARegNum(v.Args[0])
r2 := gc.SSARegNum(v.Args[1])
p := gc.Prog(v.Op.Asm())
p.Scond = arm.C_SBIT
p.From.Type = obj.TYPE_REG
p.From.Reg = r2
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpARMSLL,
ssa.OpARMSRL:
// ARM shift instructions uses only the low-order byte of the shift amount
......@@ -273,6 +287,23 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REGREG
p.To.Reg = gc.SSARegNum(v)
p.To.Offset = arm.REGTMP // throw away low 32-bit into tmp register
case ssa.OpARMMULLU:
// 32-bit multiplication, results 64-bit, low 32-bit in reg(v), high 32-bit in R0
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = gc.SSARegNum(v.Args[0])
p.Reg = gc.SSARegNum(v.Args[1])
p.To.Type = obj.TYPE_REGREG
p.To.Reg = arm.REG_R0 // high 32-bit
p.To.Offset = int64(gc.SSARegNum(v)) // low 32-bit
case ssa.OpARMMULA:
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = gc.SSARegNum(v.Args[0])
p.Reg = gc.SSARegNum(v.Args[1])
p.To.Type = obj.TYPE_REGREG2
p.To.Reg = gc.SSARegNum(v) // result
p.To.Offset = int64(gc.SSARegNum(v.Args[2])) // addend
case ssa.OpARMMOVWconst:
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
......@@ -458,6 +489,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
gc.Gvarkill(v.Aux.(*gc.Node))
case ssa.OpVarLive:
gc.Gvarlive(v.Aux.(*gc.Node))
case ssa.OpKeepAlive:
if !v.Args[0].Type.IsPtrShaped() {
v.Fatalf("keeping non-pointer alive %v", v.Args[0])
}
n, off := gc.AutoVar(v.Args[0])
if n == nil {
v.Fatalf("KeepLive with non-spilled value %s %s", v, v.Args[0])
}
if off != 0 {
v.Fatalf("KeepLive with non-zero offset spill location %s:%d", n, off)
}
gc.Gvarlive(n)
case ssa.OpARMEqual,
ssa.OpARMNotEqual,
ssa.OpARMLessThan,
......@@ -481,6 +524,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v)
case ssa.OpARMCarry,
ssa.OpARMLoweredSelect0,
ssa.OpARMLoweredSelect1:
// nothing to do
default:
v.Unimplementedf("genValue not implemented: %s", v.LongString())
}
......
......@@ -4334,6 +4334,25 @@ func (e *ssaExport) SplitComplex(name ssa.LocalSlot) (ssa.LocalSlot, ssa.LocalSl
return ssa.LocalSlot{N: n, Type: t, Off: name.Off}, ssa.LocalSlot{N: n, Type: t, Off: name.Off + s}
}
// SplitInt64 returns the two LocalSlots (hi, lo) holding the 32-bit halves
// of the 64-bit integer slot name, for use on 32-bit architectures.
// The hi half keeps the signedness of the original type; the lo half is
// always uint32.
func (e *ssaExport) SplitInt64(name ssa.LocalSlot) (ssa.LocalSlot, ssa.LocalSlot) {
	n := name.N.(*Node)
	// Pick the type of the hi word: int32 for a signed int64, uint32 otherwise.
	var t *Type
	if name.Type.IsSigned() {
		t = Types[TINT32]
	} else {
		t = Types[TUINT32]
	}
	if n.Class == PAUTO && !n.Addrtaken {
		// Split this int64 up into two separate variables.
		h := e.namedAuto(n.Sym.Name+".hi", t)
		l := e.namedAuto(n.Sym.Name+".lo", Types[TUINT32])
		// Each half is its own variable, so both offsets are 0.
		return ssa.LocalSlot{N: h, Type: t, Off: 0}, ssa.LocalSlot{N: l, Type: Types[TUINT32], Off: 0}
	}
	// Return the two parts of the larger variable.
	// Assuming little endian (we don't support big endian 32-bit architecture yet):
	// lo lives at name.Off, hi at name.Off+4.
	return ssa.LocalSlot{N: n, Type: t, Off: name.Off + 4}, ssa.LocalSlot{N: n, Type: Types[TUINT32], Off: name.Off}
}
func (e *ssaExport) SplitStruct(name ssa.LocalSlot, i int) ssa.LocalSlot {
n := name.N.(*Node)
st := name.Type
......
......@@ -1207,6 +1207,7 @@ func (t *Type) ChanDir() ChanDir {
// IsMemory reports whether t is a memory pseudo-type; never true for a gc *Type.
func (t *Type) IsMemory() bool { return false }

// IsFlags reports whether t is a flags pseudo-type; never true for a gc *Type.
func (t *Type) IsFlags() bool { return false }

// IsVoid reports whether t is a void pseudo-type; never true for a gc *Type.
func (t *Type) IsVoid() bool { return false }

// IsTuple reports whether t is a tuple pseudo-type; never true for a gc *Type.
func (t *Type) IsTuple() bool { return false }
// IsUntyped reports whether t is an untyped type.
func (t *Type) IsUntyped() bool {
......
......@@ -107,6 +107,7 @@ type Frontend interface {
SplitSlice(LocalSlot) (LocalSlot, LocalSlot, LocalSlot)
SplitComplex(LocalSlot) (LocalSlot, LocalSlot)
SplitStruct(LocalSlot, int) LocalSlot
SplitInt64(LocalSlot) (LocalSlot, LocalSlot) // returns (hi, lo)
// Line returns a string describing the given line number.
Line(int32) string
......
......@@ -25,6 +25,22 @@ func decomposeBuiltIn(f *Func) {
for _, name := range f.Names {
t := name.Type
switch {
case t.IsInteger() && t.Size() == 8 && f.Config.IntSize == 4:
var elemType Type
if t.IsSigned() {
elemType = f.Config.fe.TypeInt32()
} else {
elemType = f.Config.fe.TypeUInt32()
}
hiName, loName := f.Config.fe.SplitInt64(name)
newNames = append(newNames, hiName, loName)
for _, v := range f.NamedValues[name] {
hi := v.Block.NewValue1(v.Line, OpInt64Hi, elemType, v)
lo := v.Block.NewValue1(v.Line, OpInt64Lo, f.Config.fe.TypeUInt32(), v)
f.NamedValues[hiName] = append(f.NamedValues[hiName], hi)
f.NamedValues[loName] = append(f.NamedValues[loName], lo)
}
delete(f.NamedValues, name)
case t.IsComplex():
var elemType Type
if t.Size() == 16 {
......@@ -88,8 +104,9 @@ func decomposeBuiltIn(f *Func) {
}
func decomposeBuiltInPhi(v *Value) {
// TODO: decompose 64-bit ops on 32-bit archs?
switch {
case v.Type.IsInteger() && v.Type.Size() == 8 && v.Block.Func.Config.IntSize == 4:
decomposeInt64Phi(v)
case v.Type.IsComplex():
decomposeComplexPhi(v)
case v.Type.IsString():
......@@ -138,6 +155,26 @@ func decomposeSlicePhi(v *Value) {
v.AddArg(cap)
}
// decomposeInt64Phi rewrites a phi of 64-bit integer type into two 32-bit
// phis — one for the high word, one for the low word — recombined with an
// Int64Make. Used on 32-bit architectures where 64-bit values are handled
// as (hi, lo) pairs.
func decomposeInt64Phi(v *Value) {
	fe := v.Block.Func.Config.fe
	// The hi word keeps the signedness of the original type; lo is uint32.
	var partType Type
	if v.Type.IsSigned() {
		partType = fe.TypeInt32()
	} else {
		partType = fe.TypeUInt32()
	}
	hi := v.Block.NewValue0(v.Line, OpPhi, partType)
	lo := v.Block.NewValue0(v.Line, OpPhi, fe.TypeUInt32())
	// Build the component phis while v.Args is still intact;
	// v.reset below clears v's argument list.
	for _, a := range v.Args {
		hi.AddArg(a.Block.NewValue1(v.Line, OpInt64Hi, partType, a))
		lo.AddArg(a.Block.NewValue1(v.Line, OpInt64Lo, fe.TypeUInt32(), a))
	}
	v.reset(OpInt64Make)
	v.AddArg(hi)
	v.AddArg(lo)
}
func decomposeComplexPhi(v *Value) {
fe := v.Block.Func.Config.fe
var partType Type
......
......@@ -49,6 +49,12 @@ func (d DummyFrontend) SplitComplex(s LocalSlot) (LocalSlot, LocalSlot) {
}
return LocalSlot{s.N, d.TypeFloat32(), s.Off}, LocalSlot{s.N, d.TypeFloat32(), s.Off + 4}
}
// SplitInt64 returns the slots holding the hi and lo 32-bit halves of the
// 64-bit slot s, assuming little-endian layout: lo at s.Off, hi at s.Off+4.
// The hi half keeps the signedness of the original type.
func (d DummyFrontend) SplitInt64(s LocalSlot) (LocalSlot, LocalSlot) {
	hiType := d.TypeUInt32()
	if s.Type.IsSigned() {
		hiType = d.TypeInt32()
	}
	hi := LocalSlot{s.N, hiType, s.Off + 4}
	lo := LocalSlot{s.N, d.TypeUInt32(), s.Off}
	return hi, lo
}
// SplitStruct returns the slot for field i of the struct stored in slot s.
func (d DummyFrontend) SplitStruct(s LocalSlot, i int) LocalSlot {
	fieldType := s.Type.FieldType(i)
	fieldOff := s.Off + s.Type.FieldOff(i)
	return LocalSlot{s.N, fieldType, fieldOff}
}
......
......@@ -95,9 +95,18 @@ func flagalloc(f *Func) {
continue
}
// Recalculate a
var c1 *Value
if a.Op == OpARMCarry {
// Pseudo-op does not generate flags, its arg actually does
//TODO: generalize this condition?
c1 = a.Args[0].copyInto(b)
}
c := a.copyInto(b)
// Update v.
v.SetArg(i, c)
if c1 != nil {
c.SetArg(0, c1)
}
// Remember the most-recently computed flag value.
flag = a
}
......
......@@ -7,11 +7,17 @@
(Add16 x y) -> (ADD x y)
(Add8 x y) -> (ADD x y)
(Add32carry x y) -> (ADDS x y)
(Add32withcarry x y c) -> (ADC x y c)
(SubPtr x y) -> (SUB x y)
(Sub32 x y) -> (SUB x y)
(Sub16 x y) -> (SUB x y)
(Sub8 x y) -> (SUB x y)
(Sub32carry x y) -> (SUBS x y)
(Sub32withcarry x y c) -> (SBC x y c)
(Mul32 x y) -> (MUL x y)
(Mul16 x y) -> (MUL x y)
(Mul8 x y) -> (MUL x y)
......@@ -23,6 +29,8 @@
(Hmul8 x y) -> (SRAconst (MUL <config.fe.TypeInt16()> (SignExt8to32 x) (SignExt8to32 y)) [8])
(Hmul8u x y) -> (SRLconst (MUL <config.fe.TypeUInt16()> (ZeroExt8to32 x) (ZeroExt8to32 y)) [8])
(Mul32uhilo x y) -> (MULLU x y)
(And32 x y) -> (AND x y)
(And16 x y) -> (AND x y)
(And8 x y) -> (AND x y)
......@@ -135,6 +143,8 @@
(SignExt8to32 x) -> (MOVBreg x)
(SignExt16to32 x) -> (MOVHreg x)
(Signmask x) -> (SRAconst x [31])
// comparisons
(Eq8 x y) -> (Equal (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
(Eq16 x y) -> (Equal (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
......@@ -258,6 +268,11 @@
(IsInBounds idx len) -> (LessThanU (CMP idx len))
(IsSliceInBounds idx len) -> (LessEqualU (CMP idx len))
// pseudo-ops
(Select0 <t> x) && t.IsFlags() -> (Carry x)
(Select0 <t> x) && !t.IsFlags() -> (LoweredSelect0 x)
(Select1 x) -> (LoweredSelect1 x)
// Absorb pseudo-ops into blocks.
(If (Equal cc) yes no) -> (EQ cc yes no)
(If (NotEqual cc) yes no) -> (NE cc yes no)
......@@ -306,3 +321,6 @@
(MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
(MOVWstore [off1] {sym1} (ADDconst [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) ->
(MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
(ADD (MUL x y) a) -> (MULA x y a)
(ADD a (MUL x y)) -> (MULA x y a)
......@@ -84,6 +84,8 @@ func init() {
gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
gp21cf = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}, clobbers: flags} // cf: clobbers flags
gp2flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{flags}}
gp2flags1 = regInfo{inputs: []regMask{gp, gp, flags}, outputs: []regMask{gp}}
gp31 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
gpload = regInfo{inputs: []regMask{gpspsb}, outputs: []regMask{gp}}
gpstore = regInfo{inputs: []regMask{gpspsb, gp}, outputs: []regMask{}}
readflags = regInfo{inputs: []regMask{flags}, outputs: []regMask{gp}}
......@@ -100,6 +102,14 @@ func init() {
{name: "HMUL", argLength: 2, reg: gp21, asm: "MULL", commutative: true}, // (arg0 * arg1) >> 32, signed
{name: "HMULU", argLength: 2, reg: gp21, asm: "MULLU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
{name: "ADDS", argLength: 2, reg: gp21cf, asm: "ADD", commutative: true}, // arg0 + arg1, set carry flag
{name: "ADC", argLength: 3, reg: gp2flags1, asm: "ADC", commutative: true}, // arg0 + arg1 + carry, arg2=flags
{name: "SUBS", argLength: 2, reg: gp21cf, asm: "SUB"}, // arg0 - arg1, set carry flag
{name: "SBC", argLength: 3, reg: gp2flags1, asm: "SBC"}, // arg0 - arg1 - carry, arg2=flags
{name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp &^ buildReg("R0")}, clobbers: buildReg("R0")}, asm: "MULLU", commutative: true}, // arg0 * arg1, results 64-bit, high 32-bit in R0
{name: "MULA", argLength: 3, reg: gp31, asm: "MULA"}, // arg0 * arg1 + arg2
{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt
{name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1
......@@ -166,6 +176,10 @@ func init() {
{name: "GreaterThanU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>y false otherwise.
{name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>=y false otherwise.
{name: "Carry", argLength: 1, reg: regInfo{inputs: []regMask{}, outputs: []regMask{flags}}, typ: "Flags"}, // flags of a (Flags,UInt32)
{name: "LoweredSelect0", argLength: 1, reg: regInfo{inputs: []regMask{}, outputs: []regMask{buildReg("R0")}}}, // the first component of a tuple, implicitly in R0, arg0=tuple
{name: "LoweredSelect1", argLength: 1, reg: gp11, resultInArg0: true}, // the second component of a tuple, arg0=tuple
// duffzero
// arg0 = address of memory to zero (in R1, changed as side effect)
// arg1 = value to store (always zero)
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file contains rules to decompose [u]int64 types on 32-bit
// architectures. These rules work together with the decomposeBuiltIn
// pass which handles phis of these types.
(Int64Hi (Int64Make hi _)) -> hi
(Int64Lo (Int64Make _ lo)) -> lo
// Assuming little endian (we don't support big endian 32-bit architecture yet)
(Load <t> ptr mem) && is64BitInt(t) && t.IsSigned() ->
(Int64Make
(Load <config.fe.TypeInt32()> (OffPtr <config.fe.TypeInt32().PtrTo()> [4] ptr) mem)
(Load <config.fe.TypeUInt32()> ptr mem))
(Load <t> ptr mem) && is64BitInt(t) && !t.IsSigned() ->
(Int64Make
(Load <config.fe.TypeUInt32()> (OffPtr <config.fe.TypeUInt32().PtrTo()> [4] ptr) mem)
(Load <config.fe.TypeUInt32()> ptr mem))
(Store [8] dst (Int64Make hi lo) mem) ->
(Store [4]
(OffPtr <hi.Type.PtrTo()> [4] dst)
hi
(Store [4] dst lo mem))
(Arg {n} [off]) && is64BitInt(v.Type) && v.Type.IsSigned() ->
(Int64Make
(Arg <config.fe.TypeInt32()> {n} [off+4])
(Arg <config.fe.TypeUInt32()> {n} [off]))
(Arg {n} [off]) && is64BitInt(v.Type) && !v.Type.IsSigned() ->
(Int64Make
(Arg <config.fe.TypeUInt32()> {n} [off+4])
(Arg <config.fe.TypeUInt32()> {n} [off]))
(Add64 x y) ->
(Int64Make
(Add32withcarry <config.fe.TypeInt32()>
(Int64Hi x)
(Int64Hi y)
(Select0 <TypeFlags> (Add32carry (Int64Lo x) (Int64Lo y))))
(Select1 <config.fe.TypeUInt32()> (Add32carry (Int64Lo x) (Int64Lo y))))
(Sub64 x y) ->
(Int64Make
(Sub32withcarry <config.fe.TypeInt32()>
(Int64Hi x)
(Int64Hi y)
(Select0 <TypeFlags> (Sub32carry (Int64Lo x) (Int64Lo y))))
(Select1 <config.fe.TypeUInt32()> (Sub32carry (Int64Lo x) (Int64Lo y))))
(Mul64 x y) ->
(Int64Make
(Add32 <config.fe.TypeUInt32()>
(Mul32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Hi y))
(Add32 <config.fe.TypeUInt32()>
(Mul32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Lo y))
(Select0 <config.fe.TypeUInt32()> (Mul32uhilo (Int64Lo x) (Int64Lo y)))))
(Select1 <config.fe.TypeUInt32()> (Mul32uhilo (Int64Lo x) (Int64Lo y))))
(And64 x y) ->
(Int64Make
(And32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Hi y))
(And32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Lo y)))
(Or64 x y) ->
(Int64Make
(Or32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Hi y))
(Or32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Lo y)))
(Xor64 x y) ->
(Int64Make
(Xor32 <config.fe.TypeUInt32()> (Int64Hi x) (Int64Hi y))
(Xor32 <config.fe.TypeUInt32()> (Int64Lo x) (Int64Lo y)))
(Neg64 <t> x) -> (Sub64 (Const64 <t> [0]) x)
(Com64 x) ->
(Int64Make
(Com32 <config.fe.TypeUInt32()> (Int64Hi x))
(Com32 <config.fe.TypeUInt32()> (Int64Lo x)))
(SignExt32to64 x) -> (Int64Make (Signmask x) x)
(SignExt16to64 x) -> (SignExt32to64 (SignExt16to32 x))
(SignExt8to64 x) -> (SignExt32to64 (SignExt8to32 x))
(ZeroExt32to64 x) -> (Int64Make (Const32 <config.fe.TypeUInt32()> [0]) x)
(ZeroExt16to64 x) -> (ZeroExt32to64 (ZeroExt16to32 x))
(ZeroExt8to64 x) -> (ZeroExt32to64 (ZeroExt8to32 x))
(Trunc64to32 (Int64Make _ lo)) -> lo
(Trunc64to16 (Int64Make _ lo)) -> (Trunc32to16 lo)
(Trunc64to8 (Int64Make _ lo)) -> (Trunc32to8 lo)
(Lsh32x64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
(Rsh32x64 x (Int64Make (Const32 [c]) _)) && c != 0 -> (Signmask x)
(Rsh32Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
(Lsh16x64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
(Rsh16x64 x (Int64Make (Const32 [c]) _)) && c != 0 -> (Signmask (SignExt16to32 x))
(Rsh16Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
(Lsh8x64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
(Rsh8x64 x (Int64Make (Const32 [c]) _)) && c != 0 -> (Signmask (SignExt8to32 x))
(Rsh8Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 -> (Const32 [0])
(Lsh32x64 x (Int64Make (Const32 [0]) lo)) -> (Lsh32x32 x lo)
(Rsh32x64 x (Int64Make (Const32 [0]) lo)) -> (Rsh32x32 x lo)
(Rsh32Ux64 x (Int64Make (Const32 [0]) lo)) -> (Rsh32Ux32 x lo)
(Lsh16x64 x (Int64Make (Const32 [0]) lo)) -> (Lsh16x32 x lo)
(Rsh16x64 x (Int64Make (Const32 [0]) lo)) -> (Rsh16x32 x lo)
(Rsh16Ux64 x (Int64Make (Const32 [0]) lo)) -> (Rsh16Ux32 x lo)
(Lsh8x64 x (Int64Make (Const32 [0]) lo)) -> (Lsh8x32 x lo)
(Rsh8x64 x (Int64Make (Const32 [0]) lo)) -> (Rsh8x32 x lo)
(Rsh8Ux64 x (Int64Make (Const32 [0]) lo)) -> (Rsh8Ux32 x lo)
(Const64 <t> [c]) && t.IsSigned() ->
(Int64Make (Const32 <config.fe.TypeInt32()> [c>>32]) (Const32 <config.fe.TypeUInt32()> [c&0xffffffff]))
(Const64 <t> [c]) && !t.IsSigned() ->
(Int64Make (Const32 <config.fe.TypeUInt32()> [c>>32]) (Const32 <config.fe.TypeUInt32()> [c&0xffffffff]))
(Eq64 x y) ->
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Eq32 (Int64Lo x) (Int64Lo y)))
(Neq64 x y) ->
(OrB
(Neq32 (Int64Hi x) (Int64Hi y))
(Neq32 (Int64Lo x) (Int64Lo y)))
(Less64U x y) ->
(OrB
(Less32U (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Less32U (Int64Lo x) (Int64Lo y))))
(Leq64U x y) ->
(OrB
(Less32U (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Leq32U (Int64Lo x) (Int64Lo y))))
(Greater64U x y) ->
(OrB
(Greater32U (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Greater32U (Int64Lo x) (Int64Lo y))))
(Geq64U x y) ->
(OrB
(Greater32U (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Geq32U (Int64Lo x) (Int64Lo y))))
(Less64 x y) ->
(OrB
(Less32 (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Less32U (Int64Lo x) (Int64Lo y))))
(Leq64 x y) ->
(OrB
(Less32 (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Leq32U (Int64Lo x) (Int64Lo y))))
(Greater64 x y) ->
(OrB
(Greater32 (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Greater32U (Int64Lo x) (Int64Lo y))))
(Geq64 x y) ->
(OrB
(Greater32 (Int64Hi x) (Int64Hi y))
(AndB
(Eq32 (Int64Hi x) (Int64Hi y))
(Geq32U (Int64Lo x) (Int64Lo y))))
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// dec64Ops and dec64Blocks are intentionally empty: the dec64 "architecture"
// contributes only rewrite rules (dec64.rules), no new ops or block kinds.
var dec64Ops = []opData{}

var dec64Blocks = []blockData{}

func init() {
	// Register dec64 as a generic rule set; its rules decompose 64-bit
	// integer operations on 32-bit architectures.
	archs = append(archs, arch{
		name:    "dec64",
		ops:     dec64Ops,
		blocks:  dec64Blocks,
		generic: true,
	})
}
......@@ -173,76 +173,76 @@ var genericOps = []opData{
{name: "Lrot64", argLength: 1, aux: "Int64"},
// 2-input comparisons
{name: "Eq8", argLength: 2, commutative: true}, // arg0 == arg1
{name: "Eq16", argLength: 2, commutative: true},
{name: "Eq32", argLength: 2, commutative: true},
{name: "Eq64", argLength: 2, commutative: true},
{name: "EqPtr", argLength: 2, commutative: true},
{name: "EqInter", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "EqSlice", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "Eq32F", argLength: 2},
{name: "Eq64F", argLength: 2},
{name: "Neq8", argLength: 2, commutative: true}, // arg0 != arg1
{name: "Neq16", argLength: 2, commutative: true},
{name: "Neq32", argLength: 2, commutative: true},
{name: "Neq64", argLength: 2, commutative: true},
{name: "NeqPtr", argLength: 2, commutative: true},
{name: "NeqInter", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "NeqSlice", argLength: 2}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "Neq32F", argLength: 2},
{name: "Eq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1
{name: "Eq16", argLength: 2, commutative: true, typ: "Bool"},
{name: "Eq32", argLength: 2, commutative: true, typ: "Bool"},
{name: "Eq64", argLength: 2, commutative: true, typ: "Bool"},
{name: "EqPtr", argLength: 2, commutative: true, typ: "Bool"},
{name: "EqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "EqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "Eq32F", argLength: 2, typ: "Bool"},
{name: "Eq64F", argLength: 2, typ: "Bool"},
{name: "Neq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1
{name: "Neq16", argLength: 2, commutative: true, typ: "Bool"},
{name: "Neq32", argLength: 2, commutative: true, typ: "Bool"},
{name: "Neq64", argLength: 2, commutative: true, typ: "Bool"},
{name: "NeqPtr", argLength: 2, commutative: true, typ: "Bool"},
{name: "NeqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "NeqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
{name: "Neq32F", argLength: 2, typ: "Bool"},
{name: "Neq64F", argLength: 2},
{name: "Less8", argLength: 2}, // arg0 < arg1, signed
{name: "Less8U", argLength: 2}, // arg0 < arg1, unsigned
{name: "Less16", argLength: 2},
{name: "Less16U", argLength: 2},
{name: "Less32", argLength: 2},
{name: "Less32U", argLength: 2},
{name: "Less64", argLength: 2},
{name: "Less64U", argLength: 2},
{name: "Less32F", argLength: 2},
{name: "Less64F", argLength: 2},
{name: "Leq8", argLength: 2}, // arg0 <= arg1, signed
{name: "Leq8U", argLength: 2}, // arg0 <= arg1, unsigned
{name: "Leq16", argLength: 2},
{name: "Leq16U", argLength: 2},
{name: "Leq32", argLength: 2},
{name: "Leq32U", argLength: 2},
{name: "Leq64", argLength: 2},
{name: "Leq64U", argLength: 2},
{name: "Leq32F", argLength: 2},
{name: "Leq64F", argLength: 2},
{name: "Greater8", argLength: 2}, // arg0 > arg1, signed
{name: "Greater8U", argLength: 2}, // arg0 > arg1, unsigned
{name: "Greater16", argLength: 2},
{name: "Greater16U", argLength: 2},
{name: "Greater32", argLength: 2},
{name: "Greater32U", argLength: 2},
{name: "Greater64", argLength: 2},
{name: "Greater64U", argLength: 2},
{name: "Greater32F", argLength: 2},
{name: "Greater64F", argLength: 2},
{name: "Geq8", argLength: 2}, // arg0 <= arg1, signed
{name: "Geq8U", argLength: 2}, // arg0 <= arg1, unsigned
{name: "Geq16", argLength: 2},
{name: "Geq16U", argLength: 2},
{name: "Geq32", argLength: 2},
{name: "Geq32U", argLength: 2},
{name: "Geq64", argLength: 2},
{name: "Geq64U", argLength: 2},
{name: "Geq32F", argLength: 2},
{name: "Geq64F", argLength: 2},
{name: "Less8", argLength: 2, typ: "Bool"}, // arg0 < arg1, signed
{name: "Less8U", argLength: 2, typ: "Bool"}, // arg0 < arg1, unsigned
{name: "Less16", argLength: 2, typ: "Bool"},
{name: "Less16U", argLength: 2, typ: "Bool"},
{name: "Less32", argLength: 2, typ: "Bool"},
{name: "Less32U", argLength: 2, typ: "Bool"},
{name: "Less64", argLength: 2, typ: "Bool"},
{name: "Less64U", argLength: 2, typ: "Bool"},
{name: "Less32F", argLength: 2, typ: "Bool"},
{name: "Less64F", argLength: 2, typ: "Bool"},
{name: "Leq8", argLength: 2, typ: "Bool"}, // arg0 <= arg1, signed
{name: "Leq8U", argLength: 2, typ: "Bool"}, // arg0 <= arg1, unsigned
{name: "Leq16", argLength: 2, typ: "Bool"},
{name: "Leq16U", argLength: 2, typ: "Bool"},
{name: "Leq32", argLength: 2, typ: "Bool"},
{name: "Leq32U", argLength: 2, typ: "Bool"},
{name: "Leq64", argLength: 2, typ: "Bool"},
{name: "Leq64U", argLength: 2, typ: "Bool"},
{name: "Leq32F", argLength: 2, typ: "Bool"},
{name: "Leq64F", argLength: 2, typ: "Bool"},
{name: "Greater8", argLength: 2, typ: "Bool"}, // arg0 > arg1, signed
{name: "Greater8U", argLength: 2, typ: "Bool"}, // arg0 > arg1, unsigned
{name: "Greater16", argLength: 2, typ: "Bool"},
{name: "Greater16U", argLength: 2, typ: "Bool"},
{name: "Greater32", argLength: 2, typ: "Bool"},
{name: "Greater32U", argLength: 2, typ: "Bool"},
{name: "Greater64", argLength: 2, typ: "Bool"},
{name: "Greater64U", argLength: 2, typ: "Bool"},
{name: "Greater32F", argLength: 2, typ: "Bool"},
{name: "Greater64F", argLength: 2, typ: "Bool"},
{name: "Geq8", argLength: 2, typ: "Bool"}, // arg0 <= arg1, signed
{name: "Geq8U", argLength: 2, typ: "Bool"}, // arg0 <= arg1, unsigned
{name: "Geq16", argLength: 2, typ: "Bool"},
{name: "Geq16U", argLength: 2, typ: "Bool"},
{name: "Geq32", argLength: 2, typ: "Bool"},
{name: "Geq32U", argLength: 2, typ: "Bool"},
{name: "Geq64", argLength: 2, typ: "Bool"},
{name: "Geq64U", argLength: 2, typ: "Bool"},
{name: "Geq32F", argLength: 2, typ: "Bool"},
{name: "Geq64F", argLength: 2, typ: "Bool"},
// boolean ops
{name: "AndB", argLength: 2}, // arg0 && arg1 (not shortcircuited)
{name: "OrB", argLength: 2}, // arg0 || arg1 (not shortcircuited)
{name: "EqB", argLength: 2}, // arg0 == arg1
{name: "NeqB", argLength: 2}, // arg0 != arg1
{name: "Not", argLength: 1}, // !arg0, boolean
{name: "AndB", argLength: 2, typ: "Bool"}, // arg0 && arg1 (not shortcircuited)
{name: "OrB", argLength: 2, typ: "Bool"}, // arg0 || arg1 (not shortcircuited)
{name: "EqB", argLength: 2, typ: "Bool"}, // arg0 == arg1
{name: "NeqB", argLength: 2, typ: "Bool"}, // arg0 != arg1
{name: "Not", argLength: 1, typ: "Bool"}, // !arg0, boolean
// 1-input ops
{name: "Neg8", argLength: 1}, // -arg0
......@@ -416,6 +416,25 @@ var genericOps = []opData{
{name: "VarKill", argLength: 1, aux: "Sym"}, // aux is a *gc.Node of a variable that is known to be dead. arg0=mem, returns mem
{name: "VarLive", argLength: 1, aux: "Sym"}, // aux is a *gc.Node of a variable that must be kept live. arg0=mem, returns mem
{name: "KeepAlive", argLength: 2, typ: "Mem"}, // arg[0] is a value that must be kept alive until this mark. arg[1]=mem, returns mem
// Ops for breaking 64-bit operations on 32-bit architectures
{name: "Int64Make", argLength: 2, typ: "UInt64"}, // arg0=hi, arg1=lo
{name: "Int64Hi", argLength: 1, typ: "UInt32"}, // high 32-bit of arg0
{name: "Int64Lo", argLength: 1, typ: "UInt32"}, // low 32-bit of arg0
{name: "Add32carry", argLength: 2, commutative: true, typ: "(Flags,UInt32)"}, // arg0 + arg1, returns (carry, value)
{name: "Add32withcarry", argLength: 3, commutative: true}, // arg0 + arg1 + arg2, arg2=carry (0 or 1)
{name: "Sub32carry", argLength: 2, typ: "(Flags,UInt32)"}, // arg0 - arg1, returns (carry, value)
{name: "Sub32withcarry", argLength: 3}, // arg0 - arg1 - arg2, arg2=carry (0 or 1)
{name: "Mul32uhilo", argLength: 2, typ: "(UInt32,UInt32)"}, // arg0 * arg1, returns (hi, lo)
{name: "Signmask", argLength: 1, typ: "Int32"}, // 0 if arg0 >= 0, -1 if arg0 < 0
// pseudo-ops for breaking Tuple
{name: "Select0", argLength: 1}, // the first component of a tuple
{name: "Select1", argLength: 1}, // the second component of a tuple
}
// kind control successors implicit exit
......
......@@ -689,6 +689,13 @@ func blockName(name string, arch arch) string {
// typeName returns the string to use to generate a type.
func typeName(typ string) string {
if typ[0] == '(' {
ts := strings.Split(typ[1:len(typ)-1], ",")
if len(ts) != 2 {
panic("Tuple expect 2 arguments")
}
return "MakeTuple(" + typeName(ts[0]) + ", " + typeName(ts[1]) + ")"
}
switch typ {
case "Flags", "Mem", "Void", "Int128":
return "Type" + typ
......
......@@ -332,6 +332,12 @@ const (
OpARMMUL
OpARMHMUL
OpARMHMULU
OpARMADDS
OpARMADC
OpARMSUBS
OpARMSBC
OpARMMULLU
OpARMMULA
OpARMAND
OpARMANDconst
OpARMOR
......@@ -384,6 +390,9 @@ const (
OpARMLessEqualU
OpARMGreaterThanU
OpARMGreaterEqualU
OpARMCarry
OpARMLoweredSelect0
OpARMLoweredSelect1
OpARMDUFFZERO
OpARMDUFFCOPY
OpARMLoweredZero
......@@ -675,6 +684,17 @@ const (
OpVarKill
OpVarLive
OpKeepAlive
OpInt64Make
OpInt64Hi
OpInt64Lo
OpAdd32carry
OpAdd32withcarry
OpSub32carry
OpSub32withcarry
OpMul32uhilo
OpSignmask
OpSelect0
OpSelect1
)
var opcodeTable = [...]opInfo{
......@@ -3985,6 +4005,99 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "ADDS",
argLen: 2,
commutative: true,
asm: arm.AADD,
reg: regInfo{
inputs: []inputInfo{
{0, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
{1, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
clobbers: 65536, // FLAGS
outputs: []regMask{
5119, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
},
},
{
name: "ADC",
argLen: 3,
commutative: true,
asm: arm.AADC,
reg: regInfo{
inputs: []inputInfo{
{2, 65536}, // FLAGS
{0, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
{1, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
outputs: []regMask{
5119, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
},
},
{
name: "SUBS",
argLen: 2,
asm: arm.ASUB,
reg: regInfo{
inputs: []inputInfo{
{0, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
{1, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
clobbers: 65536, // FLAGS
outputs: []regMask{
5119, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
},
},
{
name: "SBC",
argLen: 3,
asm: arm.ASBC,
reg: regInfo{
inputs: []inputInfo{
{2, 65536}, // FLAGS
{0, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
{1, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
outputs: []regMask{
5119, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
},
},
{
name: "MULLU",
argLen: 2,
commutative: true,
asm: arm.AMULLU,
reg: regInfo{
inputs: []inputInfo{
{0, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
{1, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
clobbers: 1, // R0
outputs: []regMask{
5118, // R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
},
},
{
name: "MULA",
argLen: 3,
asm: arm.AMULA,
reg: regInfo{
inputs: []inputInfo{
{0, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
{1, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
{2, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
outputs: []regMask{
5119, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
},
},
{
name: "AND",
argLen: 2,
......@@ -4661,6 +4774,37 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "Carry",
argLen: 1,
reg: regInfo{
outputs: []regMask{
65536, // FLAGS
},
},
},
{
name: "LoweredSelect0",
argLen: 1,
reg: regInfo{
outputs: []regMask{
1, // R0
},
},
},
{
name: "LoweredSelect1",
argLen: 1,
resultInArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 5119}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
outputs: []regMask{
5119, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12
},
},
},
{
name: "DUFFZERO",
auxType: auxInt64,
......@@ -6200,6 +6344,63 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "Int64Make",
argLen: 2,
generic: true,
},
{
name: "Int64Hi",
argLen: 1,
generic: true,
},
{
name: "Int64Lo",
argLen: 1,
generic: true,
},
{
name: "Add32carry",
argLen: 2,
commutative: true,
generic: true,
},
{
name: "Add32withcarry",
argLen: 3,
commutative: true,
generic: true,
},
{
name: "Sub32carry",
argLen: 2,
generic: true,
},
{
name: "Sub32withcarry",
argLen: 3,
generic: true,
},
{
name: "Mul32uhilo",
argLen: 2,
generic: true,
},
{
name: "Signmask",
argLen: 1,
generic: true,
},
{
name: "Select0",
argLen: 1,
generic: true,
},
{
name: "Select1",
argLen: 1,
generic: true,
},
}
func (o Op) Asm() obj.As { return opcodeTable[o].asm }
......
......@@ -11,4 +11,7 @@ func opt(f *Func) {
// dec runs the decomposition rewrite rules: the generic dec rules always,
// plus the dec64 rules on 32-bit targets (IntSize == 4), which break
// 64-bit integer operations into 32-bit pieces.
func dec(f *Func) {
	applyRewrite(f, rewriteBlockdec, rewriteValuedec)
	if f.Config.IntSize == 4 {
		applyRewrite(f, rewriteBlockdec64, rewriteValuedec64)
	}
}
......@@ -14,6 +14,10 @@ func rewriteValueARM(v *Value, config *Config) bool {
return rewriteValueARM_OpAdd16(v, config)
case OpAdd32:
return rewriteValueARM_OpAdd32(v, config)
case OpAdd32carry:
return rewriteValueARM_OpAdd32carry(v, config)
case OpAdd32withcarry:
return rewriteValueARM_OpAdd32withcarry(v, config)
case OpAdd8:
return rewriteValueARM_OpAdd8(v, config)
case OpAddPtr:
......@@ -176,6 +180,8 @@ func rewriteValueARM(v *Value, config *Config) bool {
return rewriteValueARM_OpMul16(v, config)
case OpMul32:
return rewriteValueARM_OpMul32(v, config)
case OpMul32uhilo:
return rewriteValueARM_OpMul32uhilo(v, config)
case OpMul8:
return rewriteValueARM_OpMul8(v, config)
case OpNeg16:
......@@ -256,12 +262,18 @@ func rewriteValueARM(v *Value, config *Config) bool {
return rewriteValueARM_OpRsh8x64(v, config)
case OpRsh8x8:
return rewriteValueARM_OpRsh8x8(v, config)
case OpSelect0:
return rewriteValueARM_OpSelect0(v, config)
case OpSelect1:
return rewriteValueARM_OpSelect1(v, config)
case OpSignExt16to32:
return rewriteValueARM_OpSignExt16to32(v, config)
case OpSignExt8to16:
return rewriteValueARM_OpSignExt8to16(v, config)
case OpSignExt8to32:
return rewriteValueARM_OpSignExt8to32(v, config)
case OpSignmask:
return rewriteValueARM_OpSignmask(v, config)
case OpStaticCall:
return rewriteValueARM_OpStaticCall(v, config)
case OpStore:
......@@ -270,6 +282,10 @@ func rewriteValueARM(v *Value, config *Config) bool {
return rewriteValueARM_OpSub16(v, config)
case OpSub32:
return rewriteValueARM_OpSub32(v, config)
case OpSub32carry:
return rewriteValueARM_OpSub32carry(v, config)
case OpSub32withcarry:
return rewriteValueARM_OpSub32withcarry(v, config)
case OpSub8:
return rewriteValueARM_OpSub8(v, config)
case OpSubPtr:
......@@ -330,6 +346,40 @@ func rewriteValueARM_OpARMADD(v *Value, config *Config) bool {
v.AddArg(x)
return true
}
// match: (ADD (MUL x y) a)
// cond:
// result: (MULA x y a)
for {
v_0 := v.Args[0]
if v_0.Op != OpARMMUL {
break
}
x := v_0.Args[0]
y := v_0.Args[1]
a := v.Args[1]
v.reset(OpARMMULA)
v.AddArg(x)
v.AddArg(y)
v.AddArg(a)
return true
}
// match: (ADD a (MUL x y))
// cond:
// result: (MULA x y a)
for {
a := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARMMUL {
break
}
x := v_1.Args[0]
y := v_1.Args[1]
v.reset(OpARMMULA)
v.AddArg(x)
v.AddArg(y)
v.AddArg(a)
return true
}
return false
}
func rewriteValueARM_OpAdd16(v *Value, config *Config) bool {
......@@ -362,6 +412,38 @@ func rewriteValueARM_OpAdd32(v *Value, config *Config) bool {
return true
}
}
// rewriteValueARM_OpAdd32carry lowers the generic Add32carry op to the
// flag-setting ARM add, which yields the (flags, uint32) tuple whose
// carry half feeds Add32withcarry.
func rewriteValueARM_OpAdd32carry(v *Value, config *Config) bool {
	// match: (Add32carry x y)
	// result: (ADDS x y)
	lhs, rhs := v.Args[0], v.Args[1]
	v.reset(OpARMADDS)
	v.AddArg(lhs)
	v.AddArg(rhs)
	return true
}
// rewriteValueARM_OpAdd32withcarry lowers the generic Add32withcarry op
// (32-bit add with a carry-in operand) to the ARM ADC instruction.
func rewriteValueARM_OpAdd32withcarry(v *Value, config *Config) bool {
	// match: (Add32withcarry x y c)
	// result: (ADC x y c)
	addend0, addend1, carry := v.Args[0], v.Args[1], v.Args[2]
	v.reset(OpARMADC)
	v.AddArg(addend0)
	v.AddArg(addend1)
	v.AddArg(carry)
	return true
}
func rewriteValueARM_OpAdd8(v *Value, config *Config) bool {
b := v.Block
_ = b
......@@ -2156,6 +2238,21 @@ func rewriteValueARM_OpMul32(v *Value, config *Config) bool {
return true
}
}
// rewriteValueARM_OpMul32uhilo lowers the generic Mul32uhilo op
// (unsigned 32x32 multiply producing a (hi, lo) tuple) to MULLU.
func rewriteValueARM_OpMul32uhilo(v *Value, config *Config) bool {
	// match: (Mul32uhilo x y)
	// result: (MULLU x y)
	multiplicand, multiplier := v.Args[0], v.Args[1]
	v.reset(OpARMMULLU)
	v.AddArg(multiplicand)
	v.AddArg(multiplier)
	return true
}
func rewriteValueARM_OpMul8(v *Value, config *Config) bool {
b := v.Block
_ = b
......@@ -2968,6 +3065,50 @@ func rewriteValueARM_OpRsh8x8(v *Value, config *Config) bool {
return true
}
}
// rewriteValueARM_OpSelect0 lowers Select0, the read of a tuple's first
// component. When the result type is the flags register the read becomes
// (Carry x); every other type becomes (LoweredSelect0 x). The two cases
// are exhaustive, so the rewrite always fires.
func rewriteValueARM_OpSelect0(v *Value, config *Config) bool {
	// match: (Select0 <t> x)
	// cond: t.IsFlags()  -> (Carry x)
	// cond: !t.IsFlags() -> (LoweredSelect0 x)
	tupleSrc := v.Args[0] // read the arg before reset clears it
	if v.Type.IsFlags() {
		v.reset(OpARMCarry)
	} else {
		v.reset(OpARMLoweredSelect0)
	}
	v.AddArg(tupleSrc)
	return true
}
// rewriteValueARM_OpSelect1 lowers Select1, the read of a tuple's second
// component, to the ARM pseudo-op LoweredSelect1.
func rewriteValueARM_OpSelect1(v *Value, config *Config) bool {
	// match: (Select1 x)
	// result: (LoweredSelect1 x)
	tupleSrc := v.Args[0]
	v.reset(OpARMLoweredSelect1)
	v.AddArg(tupleSrc)
	return true
}
func rewriteValueARM_OpSignExt16to32(v *Value, config *Config) bool {
b := v.Block
_ = b
......@@ -3007,6 +3148,20 @@ func rewriteValueARM_OpSignExt8to32(v *Value, config *Config) bool {
return true
}
}
// rewriteValueARM_OpSignmask lowers Signmask to (SRAconst x [31]): an
// arithmetic right shift by 31 replicates the sign bit across the word,
// producing 0 for non-negative inputs and -1 for negative ones.
func rewriteValueARM_OpSignmask(v *Value, config *Config) bool {
	// match: (Signmask x)
	// result: (SRAconst x [31])
	operand := v.Args[0] // save before reset clears the args
	v.reset(OpARMSRAconst)
	v.AuxInt = 31
	v.AddArg(operand)
	return true
}
func rewriteValueARM_OpStaticCall(v *Value, config *Config) bool {
b := v.Block
_ = b
......@@ -3107,6 +3262,38 @@ func rewriteValueARM_OpSub32(v *Value, config *Config) bool {
return true
}
}
// rewriteValueARM_OpSub32carry lowers the generic Sub32carry op to the
// flag-setting ARM subtract, producing the (flags, uint32) tuple whose
// borrow half feeds Sub32withcarry.
func rewriteValueARM_OpSub32carry(v *Value, config *Config) bool {
	// match: (Sub32carry x y)
	// result: (SUBS x y)
	minuend, subtrahend := v.Args[0], v.Args[1]
	v.reset(OpARMSUBS)
	v.AddArg(minuend)
	v.AddArg(subtrahend)
	return true
}
// rewriteValueARM_OpSub32withcarry lowers the generic Sub32withcarry op
// (32-bit subtract with a borrow-in operand) to the ARM SBC instruction.
func rewriteValueARM_OpSub32withcarry(v *Value, config *Config) bool {
	// match: (Sub32withcarry x y c)
	// result: (SBC x y c)
	minuend, subtrahend, borrow := v.Args[0], v.Args[1], v.Args[2]
	v.reset(OpARMSBC)
	v.AddArg(minuend)
	v.AddArg(subtrahend)
	v.AddArg(borrow)
	return true
}
func rewriteValueARM_OpSub8(v *Value, config *Config) bool {
b := v.Block
_ = b
......
This diff is collapsed.
......@@ -8,6 +8,7 @@ import "container/heap"
const (
ScorePhi = iota // towards top of block
ScoreReadTuple
ScoreVarDef
ScoreMemory
ScoreDefault
......@@ -44,6 +45,21 @@ func (h ValHeap) Less(i, j int) bool {
if c := sx - sy; c != 0 {
return c > 0 // higher score comes later.
}
if sx == ScoreReadTuple {
// both are tuple-reading ops
// if they read same tuple, flag-reading op comes earlier
if x.Args[0] == y.Args[0] {
if x.Op == OpARMCarry || x.Op == OpARMLoweredSelect0 { //TODO: abstract this condition?
return false
} else {
return true
}
}
// if they read different tuples, order them as
// tuple-generating order to avoid interleaving
x = x.Args[0]
y = y.Args[0]
}
if x.Line != y.Line { // Favor in-order line stepping
return x.Line > y.Line
}
......@@ -103,7 +119,14 @@ func schedule(f *Func) {
// reduce register pressure. It also helps make sure
// VARDEF ops are scheduled before the corresponding LEA.
score[v.ID] = ScoreMemory
case v.Type.IsFlags():
case v.Op == OpARMCarry || v.Op == OpARMLoweredSelect0 || v.Op == OpARMLoweredSelect1:
// Schedule the pseudo-op of reading part of a tuple
// immediately after the tuple-generating op, since
// this value is already live. This also removes its
// false dependency on the other part of the tuple.
// Also ensures tuple is never spilled.
score[v.ID] = ScoreReadTuple
case v.Type.IsFlags() || v.Type.IsTuple():
// Schedule flag register generation as late as possible.
// This makes sure that we only have one live flags
// value at a time.
......
......@@ -27,6 +27,7 @@ type Type interface {
IsMemory() bool // special ssa-package-only types
IsFlags() bool
IsVoid() bool
IsTuple() bool
ElemType() Type // given []T or *T or [n]T, return T
PtrTo() Type // given T, return *T
......@@ -69,6 +70,7 @@ func (t *CompilerType) IsInterface() bool { return false }
// The classification queries on CompilerType are answered directly from
// its boolean fields; a CompilerType is never a tuple.
func (t *CompilerType) IsMemory() bool { return t.Memory }
func (t *CompilerType) IsFlags() bool { return t.Flags }
func (t *CompilerType) IsVoid() bool { return t.Void }
func (t *CompilerType) IsTuple() bool { return false }
// Both the long and short string forms are just the type's name.
func (t *CompilerType) String() string { return t.Name }
func (t *CompilerType) SimpleString() string { return t.Name }
// CompilerType is not an aggregate, so it has no element type.
func (t *CompilerType) ElemType() Type { panic("not implemented") }
......@@ -79,6 +81,38 @@ func (t *CompilerType) FieldOff(i int) int64 { panic("not implemented") }
func (t *CompilerType) FieldName(i int) string { panic("not implemented") }
func (t *CompilerType) NumElem() int64 { panic("not implemented") }
// TupleType describes the result of an op that produces two values at
// once — e.g. Add32carry -> (flags, uint32), Mul32uhilo -> (hi, lo).
// The components are read back individually via Select0 and Select1.
type TupleType struct {
	first Type
	second Type
}
// A tuple never materializes as an ordinary Go value, so size,
// alignment, and the type-structure queries are unimplemented.
func (t *TupleType) Size() int64 { panic("not implemented") }
func (t *TupleType) Alignment() int64 { panic("not implemented") }
// A tuple is none of the scalar/aggregate kinds below.
func (t *TupleType) IsBoolean() bool { return false }
func (t *TupleType) IsInteger() bool { return false }
func (t *TupleType) IsSigned() bool { return false }
func (t *TupleType) IsFloat() bool { return false }
func (t *TupleType) IsComplex() bool { return false }
func (t *TupleType) IsPtrShaped() bool { return false }
func (t *TupleType) IsString() bool { return false }
func (t *TupleType) IsSlice() bool { return false }
func (t *TupleType) IsArray() bool { return false }
func (t *TupleType) IsStruct() bool { return false }
func (t *TupleType) IsInterface() bool { return false }
func (t *TupleType) IsMemory() bool { return false }
func (t *TupleType) IsFlags() bool { return false }
func (t *TupleType) IsVoid() bool { return false }
// IsTuple is the one query a TupleType answers affirmatively.
func (t *TupleType) IsTuple() bool { return true }
// String renders the component types separated by a comma.
func (t *TupleType) String() string { return t.first.String() + "," + t.second.String() }
func (t *TupleType) SimpleString() string { return "Tuple" }
func (t *TupleType) ElemType() Type { panic("not implemented") }
func (t *TupleType) PtrTo() Type { panic("not implemented") }
func (t *TupleType) NumFields() int { panic("not implemented") }
func (t *TupleType) FieldType(i int) Type { panic("not implemented") }
func (t *TupleType) FieldOff(i int) int64 { panic("not implemented") }
func (t *TupleType) FieldName(i int) string { panic("not implemented") }
func (t *TupleType) NumElem() int64 { panic("not implemented") }
// Cmp is a comparison between values a and b.
// -1 if a < b
// 0 if a == b
......@@ -116,6 +150,25 @@ func (t *CompilerType) Compare(u Type) Cmp {
return CMPlt
}
// Compare orders t relative to u in the global type ordering: a
// TupleType sorts after every CompilerType, before every other non-tuple
// type, and tuples among themselves compare lexicographically by
// component.
func (t *TupleType) Compare(u Type) Cmp {
	// ssa.TupleType is greater than ssa.CompilerType
	if _, isCompiler := u.(*CompilerType); isCompiler {
		return CMPgt
	}
	// ssa.TupleType is smaller than any other type
	other, isTuple := u.(*TupleType)
	if !isTuple {
		return CMPlt
	}
	// Identical tuple objects are trivially equal.
	if t == other {
		return CMPeq
	}
	// Lexicographic order: the first component decides unless equal.
	if c := t.first.Compare(other.first); c != CMPeq {
		return c
	}
	return t.second.Compare(other.second)
}
var (
TypeInvalid = &CompilerType{Name: "invalid"}
TypeMem = &CompilerType{Name: "mem", Memory: true}
......@@ -123,3 +176,7 @@ var (
TypeVoid = &CompilerType{Name: "void", Void: true}
TypeInt128 = &CompilerType{Name: "int128", size: 16, Int128: true}
)
// MakeTuple returns the tuple type whose first and second components
// are t0 and t1, respectively.
func MakeTuple(t0, t1 Type) *TupleType {
	return &TupleType{t0, t1}
}
......@@ -39,6 +39,7 @@ func (t *TypeImpl) IsStruct() bool { return t.struct_ }
// Classification queries for this Type implementation: answers come from
// the matching boolean field, or are constant for kinds it never
// represents (memory, flags, tuple, void).
func (t *TypeImpl) IsInterface() bool { return t.inter }
func (t *TypeImpl) IsMemory() bool { return false }
func (t *TypeImpl) IsFlags() bool { return false }
func (t *TypeImpl) IsTuple() bool { return false }
func (t *TypeImpl) IsVoid() bool { return false }
// Both string forms are just the stored name.
func (t *TypeImpl) String() string { return t.Name }
func (t *TypeImpl) SimpleString() string { return t.Name }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment