Commit dd1d9b36 authored by David Chase

[dev.ssa] cmd/compile: PPC64, add cmp->bool, some shifts, hmul

Includes hmul (all widths)
compare for boolean result and simplifications
shift operations plus changes/additions for implementation
(ORN, ADDME, ADDC)

Also fixed a backwards-operand CMP.

Change-Id: Id723c4e25125c38e0d9ab9ec9448176b75f4cdb4
Reviewed-on: https://go-review.googlesource.com/25410
Run-TryBot: David Chase <drchase@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
parent d2286ea2
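
A rough Go snippet (not part of the CL) showing the kinds of source constructs the new lowerings target; the comments are one reading of the rules below, and whether every line compiles end-to-end depends on the rest of the dev.ssa PPC64 backend at this point:

	func example(x, y uint64) (bool, uint64, uint64) {
		b := x < y  // compare for boolean result: (LessThan (CMPU x y)), materialized with MOVW + conditional branch
		s := x >> y // Rsh64Ux64: SRD plus the ORN/MaskIfNotCarry/ADDIforC shift-amount clamp
		d := x / 3  // division by a constant strength-reduces via the new high-multiply ops (MULHDU et al.)
		return b, s, d
	}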
@@ -42,10 +42,14 @@ var progtable = [ppc64.ALAST & obj.AMask]obj.ProgInfo{
// Integer
ppc64.AADD & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AADDC & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.ASUB & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AADDME & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.ANEG & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AAND & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AANDN & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AOR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AORN & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AXOR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AMULLD & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AMULLW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RegRead | gc.RightWrite},
@@ -102,6 +102,14 @@ var condBits = map[ssa.Op]uint8{
ssa.OpPPC64GreaterThan: ppc64.C_COND_GT,
ssa.OpPPC64LessEqual: ppc64.C_COND_GT,
}
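// condOps maps a comparison pseudo-op to the conditional branch used when
// materializing its boolean result (see the Equal/NotEqual/... case in ssaGenValue).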
var condOps = map[ssa.Op]obj.As{
ssa.OpPPC64Equal: ppc64.ABEQ,
ssa.OpPPC64NotEqual: ppc64.ABNE,
ssa.OpPPC64LessThan: ppc64.ABLT,
ssa.OpPPC64GreaterEqual: ppc64.ABGE,
ssa.OpPPC64GreaterThan: ppc64.ABGT,
ssa.OpPPC64LessEqual: ppc64.ABLE,
}
// Is the condition bit set? 1=yes 0=no
var condBitSet = map[ssa.Op]uint8{
@@ -198,7 +206,13 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
} else {
p.To.Name = obj.NAME_AUTO
}
case ssa.OpPPC64ADD, ssa.OpPPC64FADD, ssa.OpPPC64FADDS, ssa.OpPPC64SUB, ssa.OpPPC64FSUB, ssa.OpPPC64FSUBS, ssa.OpPPC64MULLD, ssa.OpPPC64MULLW, ssa.OpPPC64FMUL, ssa.OpPPC64FMULS, ssa.OpPPC64FDIV, ssa.OpPPC64FDIVS, ssa.OpPPC64AND, ssa.OpPPC64OR, ssa.OpPPC64XOR:
case ssa.OpPPC64ADD, ssa.OpPPC64FADD, ssa.OpPPC64FADDS, ssa.OpPPC64SUB, ssa.OpPPC64FSUB, ssa.OpPPC64FSUBS,
ssa.OpPPC64MULLD, ssa.OpPPC64MULLW,
ssa.OpPPC64SRAD, ssa.OpPPC64SRAW, ssa.OpPPC64SRD, ssa.OpPPC64SRW, ssa.OpPPC64SLD, ssa.OpPPC64SLW,
ssa.OpPPC64MULHD, ssa.OpPPC64MULHW, ssa.OpPPC64MULHDU, ssa.OpPPC64MULHWU,
ssa.OpPPC64FMUL, ssa.OpPPC64FMULS, ssa.OpPPC64FDIV, ssa.OpPPC64FDIVS,
ssa.OpPPC64AND, ssa.OpPPC64OR, ssa.OpPPC64ANDN, ssa.OpPPC64ORN, ssa.OpPPC64XOR:
r := gc.SSARegNum(v)
r1 := gc.SSARegNum(v.Args[0])
r2 := gc.SSARegNum(v.Args[1])
@@ -208,6 +222,24 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpPPC64MaskIfNotCarry:
r := gc.SSARegNum(v)
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REGZERO
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpPPC64ADDIforC:
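// ADDIforC is emitted only for its carry-out (CA): the integer result is
// written to REGTMP and ignored, and MaskIfNotCarry (ADDME of zero) later
// consumes the carry to build an all-ones or all-zeros mask.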
r1 := gc.SSARegNum(v.Args[0])
p := gc.Prog(v.Op.Asm())
p.Reg = r1
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP // Ignored; this is for the carry effect.
case ssa.OpPPC64NEG:
r := gc.SSARegNum(v)
p := gc.Prog(v.Op.Asm())
@@ -216,7 +248,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
}
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpPPC64ADDconst, ssa.OpPPC64ANDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst:
case ssa.OpPPC64ADDconst, ssa.OpPPC64ANDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst,
ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst, ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst:
p := gc.Prog(v.Op.Asm())
p.Reg = gc.SSARegNum(v.Args[0])
if v.Aux != nil {
@@ -275,7 +309,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v.Args[1])
case ssa.OpPPC64CMPconst:
case ssa.OpPPC64CMPconst, ssa.OpPPC64CMPUconst, ssa.OpPPC64CMPWconst, ssa.OpPPC64CMPWUconst:
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = gc.SSARegNum(v.Args[0])
@@ -328,6 +362,48 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = gc.SSARegNum(v.Args[0])
gc.AddAux(&p.To, v)
case ssa.OpPPC64Equal,
ssa.OpPPC64NotEqual,
ssa.OpPPC64LessThan,
ssa.OpPPC64LessEqual,
ssa.OpPPC64GreaterThan,
ssa.OpPPC64GreaterEqual:
// On Power7 or later, can use isel instruction:
// for a < b, a > b, a = b:
// rt := 1
// isel rt,rt,r0,cond
// for a >= b, a <= b, a != b:
// rt := 1
// isel rt,0,rt,!cond
// However, PPCbe support is for older machines than that,
// and isel (which looks a lot like fsel) isn't recognized
// yet by the Go assembler. So for now, use the old instruction
// sequence, which we'll need anyway.
// TODO: add support for isel on PPCle and use it.
// generate boolean values
// use conditional move
p := gc.Prog(ppc64.AMOVW)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v)
pb := gc.Prog(condOps[v.Op])
pb.To.Type = obj.TYPE_BRANCH
p = gc.Prog(ppc64.AMOVW)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v)
p = gc.Prog(obj.ANOP)
gc.Patch(pb, p)
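// In effect, the case above emits the following sequence (register and label
// names are illustrative):
//	MOVW	$1, Rres
//	BEQ	done		// condOps[v.Op]; BEQ shown for ssa.OpPPC64Equal
//	MOVW	$0, Rres
// done:
//	NOP			// branch target patched in by gc.Patch(pb, p)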
case ssa.OpPPC64LoweredZero:
// Similar to how this is done on ARM,
// except that PPC MOVDU x,off(y) is *(y+off) = x; y=y+off
@@ -341,7 +417,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
//
// ADD -8,R3,R3
// MOVDU R0, 8(R3)
// CMP Rarg1, R3
// CMP R3, Rarg1
// BL -2(PC)
// arg1 is the address of the last element to zero
// auxint is alignment
@@ -375,11 +451,13 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Reg = gc.SSARegNum(v.Args[0])
p.To.Offset = sz
p2 := gc.Prog(ppc64.ACMP)
p2 := gc.Prog(ppc64.ACMPU)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = gc.SSARegNum(v.Args[1])
p2.To.Reg = ppc64.REG_R3
p2.From.Reg = gc.SSARegNum(v.Args[0])
p2.To.Reg = gc.SSARegNum(v.Args[1])
p2.To.Type = obj.TYPE_REG
p3 := gc.Prog(ppc64.ABLT)
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
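// The loop now compares the advancing store pointer (v.Args[0]) against the
// address of the last element to zero (v.Args[1]) with CMPU and branches back
// while it is below -- this is the backwards-operand CMP mentioned in the
// commit message.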
@@ -396,7 +474,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// ADD -8,R4,R4
// MOVDU 8(R4), Rtmp
// MOVDU Rtmp, 8(R3)
// CMP Rarg2, R4
// CMP R4, Rarg2
// BL -3(PC)
// arg2 is the address of the last element of src
// auxint is alignment
@@ -527,13 +605,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
}
gc.Gvarlive(n)
case ssa.OpPPC64Equal,
ssa.OpPPC64NotEqual,
ssa.OpPPC64LessThan,
ssa.OpPPC64LessEqual,
ssa.OpPPC64GreaterThan,
ssa.OpPPC64GreaterEqual:
v.Fatalf("pseudo-op made it to output: %s", v.LongString())
case ssa.OpPhi:
// just check to make sure regalloc and stackalloc did it right
if v.Type.IsMemory() {
@@ -566,10 +637,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// v.Fatalf("wrong store after nilcheck v=%s w=%s", v, w)
// }
// switch w.Op {
// case ssa.OpARMMOVBload, ssa.OpARMMOVBUload, ssa.OpARMMOVHload, ssa.OpARMMOVHUload,
// ssa.OpARMMOVWload, ssa.OpARMMOVFload, ssa.OpARMMOVDload,
// ssa.OpARMMOVBstore, ssa.OpARMMOVHstore, ssa.OpARMMOVWstore,
// ssa.OpARMMOVFstore, ssa.OpARMMOVDstore:
// case ssa.OpPPC64MOVBload, ssa.OpPPC64MOVBUload, ssa.OpPPC64MOVHload, ssa.OpPPC64MOVHUload,
// ssa.OpPPC64MOVWload, ssa.OpPPC64MOVFload, ssa.OpPPC64MOVDload,
// ssa.OpPPC64MOVBstore, ssa.OpPPC64MOVHstore, ssa.OpPPC64MOVWstore,
// ssa.OpPPC64MOVFstore, ssa.OpPPC64MOVDstore:
// // arg0 is ptr, auxint is offset
// if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
// if gc.Debug_checknil != 0 && int(v.Line) > 1 {
@@ -577,7 +648,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// }
// return
// }
// case ssa.OpARMDUFFZERO, ssa.OpARMLoweredZero, ssa.OpARMLoweredZeroU:
// case ssa.OpPPC64DUFFZERO, ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroU:
// // arg0 is ptr
// if w.Args[0] == v.Args[0] {
// if gc.Debug_checknil != 0 && int(v.Line) > 1 {
@@ -585,7 +656,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// }
// return
// }
// case ssa.OpARMDUFFCOPY, ssa.OpARMLoweredMove, ssa.OpARMLoweredMoveU:
// case ssa.OpPPC64DUFFCOPY, ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveU:
// // arg0 is dst ptr, arg1 is src ptr
// if w.Args[0] == v.Args[0] || w.Args[1] == v.Args[0] {
// if gc.Debug_checknil != 0 && int(v.Line) > 1 {
@@ -616,6 +687,11 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
gc.Warnl(v.Line, "generated nil check")
}
case ssa.OpPPC64InvertFlags:
v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
case ssa.OpPPC64FlagEQ, ssa.OpPPC64FlagLT, ssa.OpPPC64FlagGT:
v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
default:
v.Unimplementedf("genValue not implemented: %s", v.LongString())
}
@@ -181,6 +181,7 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
c.registers = registersPPC64[:]
c.gpRegMask = gpRegMaskPPC64
c.fpRegMask = fpRegMaskPPC64
c.flagRegMask = flagRegMaskPPC64
c.FPReg = framepointerRegPPC64
c.noDuffDevice = true // TODO: Resolve PPC64 DuffDevice (has zero, but not copy)
c.hasGReg = true
@@ -5,30 +5,54 @@
// Lowering arithmetic
(Add64 x y) -> (ADD x y)
(AddPtr x y) -> (ADD x y)
(Add32 x y) -> (ADD (SignExt32to64 x) (SignExt32to64 y))
(Add16 x y) -> (ADD (SignExt16to64 x) (SignExt16to64 y))
(Add8 x y) -> (ADD (SignExt8to64 x) (SignExt8to64 y))
(Add32 x y) -> (ADD x y)
(Add16 x y) -> (ADD x y)
(Add8 x y) -> (ADD x y)
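// The narrow Add rules (and the Sub/Mul/And/Or/Xor/Neg rules below) no longer
// pre-extend their operands: only the low-order bits of a narrow result are
// meaningful, and explicit SignExt/ZeroExt ops appear wherever a widened value
// is actually observed.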
(Add64F x y) -> (FADD x y)
(Add32F x y) -> (FADDS x y)
(Sub64 x y) -> (SUB x y)
(SubPtr x y) -> (SUB x y)
(Sub32 x y) -> (SUB x y)
(Sub16 x y) -> (SUB (SignExt16to64 x) (SignExt16to64 y))
(Sub8 x y) -> (SUB (SignExt8to64 x) (SignExt8to64 y))
(Sub16 x y) -> (SUB x y)
(Sub8 x y) -> (SUB x y)
(Sub32F x y) -> (FSUBS x y)
(Sub64F x y) -> (FSUB x y)
(Mul64 x y) -> (MULLD x y)
(Mul32 x y) -> (MULLW x y)
(Mul16 x y) -> (MULLW (SignExt16to32 x) (SignExt16to32 y))
(Mul8 x y) -> (MULLW (SignExt8to32 x) (SignExt8to32 y))
(Mul16 x y) -> (MULLW x y)
(Mul8 x y) -> (MULLW x y)
(Hmul64 x y) -> (MULHD x y)
(Hmul64u x y) -> (MULHDU x y)
(Hmul32 x y) -> (MULHW x y)
(Hmul32u x y) -> (MULHWU x y)
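// There is no 16- or 8-bit high-multiply instruction, so these widen the
// operands, multiply in 32 bits, and take the high half of the product by
// shifting right by the operand width.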
(Hmul16 x y) -> (SRAWconst (MULLW <config.fe.TypeInt32()> (SignExt16to32 x) (SignExt16to32 y)) [16])
(Hmul16u x y) -> (SRWconst (MULLW <config.fe.TypeUInt32()> (ZeroExt16to32 x) (ZeroExt16to32 y)) [16])
(Hmul8 x y) -> (SRAWconst (MULLW <config.fe.TypeInt16()> (SignExt8to32 x) (SignExt8to32 y)) [8])
(Hmul8u x y) -> (SRWconst (MULLW <config.fe.TypeUInt16()> (ZeroExt8to32 x) (ZeroExt8to32 y)) [8])
(Mul32F x y) -> (FMULS x y)
(Mul64F x y) -> (FMUL x y)
(Div32F x y) -> (FDIVS x y)
(Div64F x y) -> (FDIV x y)
(Rsh64x64 x y) -> (SRAD x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDIforC [-64] y))))
(Rsh64Ux64 x y) -> (SRD x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDIforC [-64] y))))
(Lsh64x64 x y) -> (SLD x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry (ADDIforC [-64] y))))
(Rsh32x32 x y) -> (SRAW x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry <config.fe.TypeInt64()> (ADDIforC [-32] (ZeroExt32to64 y)))))
(Rsh32Ux32 x y) -> (SRW x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry <config.fe.TypeInt64()> (ADDIforC [-32] (ZeroExt32to64 y)))))
(Lsh32x32 x y) -> (SLW x (ORN y <config.fe.TypeInt64()> (MaskIfNotCarry <config.fe.TypeInt64()> (ADDIforC [-32] (ZeroExt32to64 y)))))
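// What the ORN/MaskIfNotCarry/ADDIforC sequence above computes, as a sketch
// (names are illustrative, not real ops; the 32-bit rules use -32 instead):
//   carry := y >= 64            // ADDIforC [-64] y: carry out of y - 64
//   mask  := carry ? 0 : ^0     // MaskIfNotCarry = CA - 1 (ADDME of zero)
//   amt   := y | ^mask          // ORN: y itself when y < 64, all ones otherwise
// Any amount >= 64 shifts everything out, so SRD/SLD yield 0 and SRAD yields
// all sign bits, matching Go's required semantics for oversized shift counts.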
// Potentially useful optimizing rewrites.
// (ADDIforC [k] c), k < 0 && (c < 0 || k+c >= 0) -> CarrySet
// (ADDIforC [k] c), k < 0 && (c >= 0 && k+c < 0) -> CarryClear
// (MaskIfNotCarry CarrySet) -> 0
// (MaskIfNotCarry CarryClear) -> -1
// Lowering constants
(Const8 [val]) -> (MOVWconst [val])
(Const16 [val]) -> (MOVWconst [val])
@@ -44,24 +68,24 @@
(OffPtr [off] ptr) -> (ADD (MOVDconst <config.Frontend().TypeInt64()> [off]) ptr)
(And64 x y) -> (AND x y)
(And32 x y) -> (AND (ZeroExt32to64 x) (ZeroExt32to64 y)) // Or? (AND (ZeroExt32to64 x) (ZeroExt32to64 y))
(And16 x y) -> (AND (ZeroExt16to64 x) (ZeroExt16to64 y))
(And8 x y) -> (AND (ZeroExt8to64 x) (ZeroExt8to64 y))
(And32 x y) -> (AND x y)
(And16 x y) -> (AND x y)
(And8 x y) -> (AND x y)
(Or64 x y) -> (OR x y)
(Or32 x y) -> (OR (ZeroExt32to64 x) (ZeroExt32to64 y))
(Or16 x y) -> (OR (ZeroExt16to64 x) (ZeroExt16to64 y))
(Or8 x y) -> (OR (ZeroExt8to64 x) (ZeroExt8to64 y))
(Or32 x y) -> (OR x y)
(Or16 x y) -> (OR x y)
(Or8 x y) -> (OR x y)
(Xor64 x y) -> (XOR x y)
(Xor32 x y) -> (XOR (ZeroExt32to64 x) (ZeroExt32to64 y))
(Xor16 x y) -> (XOR (ZeroExt16to64 x) (ZeroExt16to64 y))
(Xor8 x y) -> (XOR (ZeroExt8to64 x) (ZeroExt8to64 y))
(Xor32 x y) -> (XOR x y)
(Xor16 x y) -> (XOR x y)
(Xor8 x y) -> (XOR x y)
(Neg64 x) -> (NEG x)
(Neg32 x) -> (NEG (ZeroExt32to64 x))
(Neg16 x) -> (NEG (ZeroExt16to64 x))
(Neg8 x) -> (NEG (ZeroExt8to64 x))
(Neg32 x) -> (NEG x)
(Neg16 x) -> (NEG x)
(Neg8 x) -> (NEG x)
// Lowering comparisons
(Eq8 x y) -> (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
@@ -132,15 +156,105 @@
(If (GreaterThan cc) yes no) -> (GT cc yes no)
(If (GreaterEqual cc) yes no) -> (GE cc yes no)
(If cond yes no) -> (NE (CMPconst [0] cond) yes no)
(If cond yes no) -> (NE (CMPWconst [0] cond) yes no)
// Absorb boolean tests into block
(NE (CMPconst [0] (Equal cc)) yes no) -> (EQ cc yes no)
(NE (CMPconst [0] (NotEqual cc)) yes no) -> (NE cc yes no)
(NE (CMPconst [0] (LessThan cc)) yes no) -> (LT cc yes no)
(NE (CMPconst [0] (LessEqual cc)) yes no) -> (LE cc yes no)
(NE (CMPconst [0] (GreaterThan cc)) yes no) -> (GT cc yes no)
(NE (CMPconst [0] (GreaterEqual cc)) yes no) -> (GE cc yes no)
(NE (CMPWconst [0] (Equal cc)) yes no) -> (EQ cc yes no)
(NE (CMPWconst [0] (NotEqual cc)) yes no) -> (NE cc yes no)
(NE (CMPWconst [0] (LessThan cc)) yes no) -> (LT cc yes no)
(NE (CMPWconst [0] (LessEqual cc)) yes no) -> (LE cc yes no)
(NE (CMPWconst [0] (GreaterThan cc)) yes no) -> (GT cc yes no)
(NE (CMPWconst [0] (GreaterEqual cc)) yes no) -> (GE cc yes no)
// absorb flag constants into branches
(EQ (FlagEQ) yes no) -> (First nil yes no)
(EQ (FlagLT) yes no) -> (First nil no yes)
(EQ (FlagGT) yes no) -> (First nil no yes)
(NE (FlagEQ) yes no) -> (First nil no yes)
(NE (FlagLT) yes no) -> (First nil yes no)
(NE (FlagGT) yes no) -> (First nil yes no)
(LT (FlagEQ) yes no) -> (First nil no yes)
(LT (FlagLT) yes no) -> (First nil yes no)
(LT (FlagGT) yes no) -> (First nil no yes)
(LE (FlagEQ) yes no) -> (First nil yes no)
(LE (FlagLT) yes no) -> (First nil yes no)
(LE (FlagGT) yes no) -> (First nil no yes)
(GT (FlagEQ) yes no) -> (First nil no yes)
(GT (FlagLT) yes no) -> (First nil no yes)
(GT (FlagGT) yes no) -> (First nil yes no)
(GE (FlagEQ) yes no) -> (First nil yes no)
(GE (FlagLT) yes no) -> (First nil no yes)
(GE (FlagGT) yes no) -> (First nil yes no)
// absorb InvertFlags into branches
(LT (InvertFlags cmp) yes no) -> (GT cmp yes no)
(GT (InvertFlags cmp) yes no) -> (LT cmp yes no)
(LE (InvertFlags cmp) yes no) -> (GE cmp yes no)
(GE (InvertFlags cmp) yes no) -> (LE cmp yes no)
(EQ (InvertFlags cmp) yes no) -> (EQ cmp yes no)
(NE (InvertFlags cmp) yes no) -> (NE cmp yes no)
// constant comparisons
(CMPWconst (MOVWconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
(CMPWconst (MOVWconst [x]) [y]) && int32(x)<int32(y) -> (FlagLT)
(CMPWconst (MOVWconst [x]) [y]) && int32(x)>int32(y) -> (FlagGT)
(CMPconst (MOVDconst [x]) [y]) && int64(x)==int64(y) -> (FlagEQ)
(CMPconst (MOVDconst [x]) [y]) && int64(x)<int64(y) -> (FlagLT)
(CMPconst (MOVDconst [x]) [y]) && int64(x)>int64(y) -> (FlagGT)
(CMPWUconst (MOVWconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
(CMPWUconst (MOVWconst [x]) [y]) && uint32(x)<uint32(y) -> (FlagLT)
(CMPWUconst (MOVWconst [x]) [y]) && uint32(x)>uint32(y) -> (FlagGT)
(CMPUconst (MOVDconst [x]) [y]) && int64(x)==int64(y) -> (FlagEQ)
(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) -> (FlagLT)
(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) -> (FlagGT)
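// Worked example: (CMPWconst (MOVWconst [7]) [10]) folds to (FlagLT), after
// which the Flag* absorption rules elsewhere in this file turn the dependent
// branch or boolean into a constant.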
// other known comparisons
//(CMPconst (MOVBUreg _) [c]) && 0xff < c -> (FlagLT)
//(CMPconst (MOVHUreg _) [c]) && 0xffff < c -> (FlagLT)
//(CMPconst (ANDconst _ [m]) [n]) && 0 <= int32(m) && int32(m) < int32(n) -> (FlagLT)
//(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint32(32-c)) <= uint32(n) -> (FlagLT)
// absorb flag constants into boolean values
(Equal (FlagEQ)) -> (MOVWconst [1])
(Equal (FlagLT)) -> (MOVWconst [0])
(Equal (FlagGT)) -> (MOVWconst [0])
(NotEqual (FlagEQ)) -> (MOVWconst [0])
(NotEqual (FlagLT)) -> (MOVWconst [1])
(NotEqual (FlagGT)) -> (MOVWconst [1])
(LessThan (FlagEQ)) -> (MOVWconst [0])
(LessThan (FlagLT)) -> (MOVWconst [1])
(LessThan (FlagGT)) -> (MOVWconst [0])
(LessEqual (FlagEQ)) -> (MOVWconst [1])
(LessEqual (FlagLT)) -> (MOVWconst [1])
(LessEqual (FlagGT)) -> (MOVWconst [0])
(GreaterThan (FlagEQ)) -> (MOVWconst [0])
(GreaterThan (FlagLT)) -> (MOVWconst [0])
(GreaterThan (FlagGT)) -> (MOVWconst [1])
(GreaterEqual (FlagEQ)) -> (MOVWconst [1])
(GreaterEqual (FlagLT)) -> (MOVWconst [0])
(GreaterEqual (FlagGT)) -> (MOVWconst [1])
// absorb InvertFlags into boolean values
(Equal (InvertFlags x)) -> (Equal x)
(NotEqual (InvertFlags x)) -> (NotEqual x)
(LessThan (InvertFlags x)) -> (GreaterThan x)
(GreaterThan (InvertFlags x)) -> (LessThan x)
(LessEqual (InvertFlags x)) -> (GreaterEqual x)
(GreaterEqual (InvertFlags x)) -> (LessEqual x)
// Lowering loads
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVDload ptr mem)
@@ -148,19 +148,47 @@ func init() {
{name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0-arg1
{name: "FSUB", argLength: 2, reg: fp21, asm: "FSUB"}, // arg0-arg1
{name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0-arg1
{name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", commutative: true}, // arg0*arg1
{name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", commutative: true}, // arg0*arg1
{name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", commutative: true}, // arg0*arg1 (signed 64-bit)
{name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", commutative: true}, // arg0*arg1 (signed 32-bit)
{name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true}, // (arg0 * arg1) >> 64, signed
{name: "MULHW", argLength: 2, reg: gp21, asm: "MULHW", commutative: true}, // (arg0 * arg1) >> 32, signed
{name: "MULHDU", argLength: 2, reg: gp21, asm: "MULHDU", commutative: true}, // (arg0 * arg1) >> 64, unsigned
{name: "MULHWU", argLength: 2, reg: gp21, asm: "MULHWU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
{name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true}, // arg0*arg1
{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0*arg1
{name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1
{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1
{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0&arg1
{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0&arg1 ??
{name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0|arg1
{name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int32"}, // arg0|arg1 ??
{name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0^arg1
{name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int32"}, // arg0|arg1 ??
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // ^arg0
{name: "SRAD", argLength: 2, reg: gp21, asm: "SRAD"}, // arg0 >>a arg1, 64 bits (all sign if arg1 & 64 != 0)
{name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >>a arg1, 32 bits (all sign if arg1 & 32 != 0)
{name: "SRD", argLength: 2, reg: gp21, asm: "SRD"}, // arg0 >> arg1, 64 bits (0 if arg1 & 64 != 0)
{name: "SRW", argLength: 2, reg: gp21, asm: "SRW"}, // arg0 >> arg1, 32 bits (0 if arg1 & 32 != 0)
{name: "SLD", argLength: 2, reg: gp21, asm: "SLD"}, // arg0 << arg1, 64 bits (0 if arg1 & 64 != 0)
{name: "SLW", argLength: 2, reg: gp21, asm: "SLW"}, // arg0 << arg1, 32 bits (0 if arg1 & 32 != 0)
{name: "ADDIforC", argLength: 1, reg: regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{cr}, clobbers: tmp}, aux: "Int16", asm: "ADDC", typ: "Flags"}, // _, carry := arg0 + aux
{name: "MaskIfNotCarry", argLength: 1, reg: crgp, asm: "ADDME", typ: "Int64"}, // carry - 1 (if carry then 0 else -1)
{name: "SRADconst", argLength: 1, reg: gp11, asm: "SRAD", aux: "Int64"}, // arg0 >>a aux, 64 bits
{name: "SRAWconst", argLength: 1, reg: gp11, asm: "SRAW", aux: "Int64"}, // arg0 >>a aux, 32 bits
{name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "Int64"}, // arg0 >> aux, 64 bits
{name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "Int64"}, // arg0 >> aux, 32 bits
{name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "Int64"}, // arg0 << aux, 64 bits
{name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "Int64"}, // arg0 << aux, 32 bits
{name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1
{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1
{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0&arg1
{name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // arg0&^arg1
{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0&arg1 ??
{name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0|arg1
{name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0|^arg1
{name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0|arg1 ??
{name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0^arg1
{name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64"}, // arg0|arg1 ??
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // sign extend int8 to int64
{name: "MOVBZreg", argLength: 1, reg: gp11, asm: "MOVBZ"}, // zero extend uint8 to uint64
@@ -202,15 +230,18 @@ func init() {
{name: "CMPU", argLength: 2, reg: gp2cr, asm: "CMPU", typ: "Flags"}, // arg0 compare to arg1
{name: "CMPW", argLength: 2, reg: gp2cr, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
{name: "CMPWU", argLength: 2, reg: gp2cr, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1
{name: "CMPconst", argLength: 1, reg: gp1cr, asm: "CMP", aux: "Int32", typ: "Flags"},
{name: "CMPconst", argLength: 1, reg: gp1cr, asm: "CMP", aux: "Int64", typ: "Flags"},
{name: "CMPUconst", argLength: 1, reg: gp1cr, asm: "CMPU", aux: "Int64", typ: "Flags"},
{name: "CMPWconst", argLength: 1, reg: gp1cr, asm: "CMPW", aux: "Int32", typ: "Flags"},
{name: "CMPWUconst", argLength: 1, reg: gp1cr, asm: "CMPWU", aux: "Int32", typ: "Flags"},
// pseudo-ops
{name: "Equal", argLength: 1, reg: crgp}, // bool, true flags encode x==y false otherwise.
{name: "NotEqual", argLength: 1, reg: crgp}, // bool, true flags encode x!=y false otherwise.
{name: "LessThan", argLength: 1, reg: crgp}, // bool, true flags encode signed x<y false otherwise.
{name: "LessEqual", argLength: 1, reg: crgp}, // bool, true flags encode signed x<=y false otherwise.
{name: "GreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode signed x>y false otherwise.
{name: "GreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode signed x>=y false otherwise.
{name: "LessThan", argLength: 1, reg: crgp}, // bool, true flags encode x<y false otherwise.
{name: "LessEqual", argLength: 1, reg: crgp}, // bool, true flags encode x<=y false otherwise.
{name: "GreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode x>y false otherwise.
{name: "GreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode x>=y false otherwise.
// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
// and sorts it to the very beginning of the block to prevent other
@@ -271,6 +302,27 @@ func init() {
},
typ: "Mem",
},
// (InvertFlags (CMP a b)) == (CMP b a)
// So if we want (LessThan (CMP a b)) but we can't do that because a is a constant,
// then we do (LessThan (InvertFlags (CMP b a))) instead.
// Rewrites will convert this to (GreaterThan (CMP b a)).
// InvertFlags is a pseudo-op which can't appear in assembly output.
{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
// Constant flag values. For any comparison, there are 3 possible
// outcomes: either the three from the signed total order (<,==,>)
// or the three from the unsigned total order, depending on which
// comparison operation was used (CMP or CMPU -- PPC is different from
// the other architectures, which have a single comparison producing
// both signed and unsigned comparison results.)
// These ops are for temporary use by rewrite rules. They
// cannot appear in the generated assembly.
{name: "FlagEQ"}, // equal
{name: "FlagLT"}, // signed < or unsigned <
{name: "FlagGT"}, // signed > or unsigned >
}
blocks := []blockData{
@@ -295,6 +347,7 @@ func init() {
regnames: regNamesPPC64,
gpregmask: gp,
fpregmask: fp,
flagmask: cr,
framepointerreg: int8(num["SP"]),
})
}
This diff is collapsed.
@@ -336,11 +336,11 @@ func (s *regAllocState) assignReg(r register, v *Value, c *Value) {
// allocReg chooses a register from the set of registers in mask.
// If there is no unused register, a Value will be kicked out of
// a register to make room.
func (s *regAllocState) allocReg(mask regMask) register {
func (s *regAllocState) allocReg(mask regMask, v *Value) register {
mask &= s.allocatable
mask &^= s.nospill
if mask == 0 {
s.f.Fatalf("no register available")
s.f.Fatalf("no register available for %s", v)
}
// Pick an unused register if one is available.
@@ -401,7 +401,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, line
}
// Allocate a register.
r := s.allocReg(mask)
r := s.allocReg(mask, v)
// Allocate v to the new register.
var c *Value
@@ -1220,7 +1220,7 @@ func (s *regAllocState) regalloc(f *Func) {
if mask&^desired.avoid != 0 {
mask &^= desired.avoid
}
r := s.allocReg(mask)
r := s.allocReg(mask, v)
outRegs[out.idx] = r
used |= regMask(1) << r
}