cmd/compile: use ISEL, cleanup use of zero & extensions

Abandoned earlier efforts to expose zero register, but left it in numbering to decrease squirrelyness of register allocator. ISELrelOp used in code generation of bool := x relOp y. Some patterns added to better elide zero case and some sign extension. Updates: #17109 Change-Id: Ida7839f0023ca8f0ffddc0545f0ac269e65b05d9 Reviewed-on: https://go-review.googlesource.com/29380 Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>

cmd/compile: use ISEL, cleanup use of zero & extensions
Abandoned earlier efforts to expose zero register, but left it in numbering to decrease squirrelyness of register allocator. ISELrelOp used in code generation of bool := x relOp y. Some patterns added to better elide zero case and some sign extension. Updates: #17109 Change-Id: Ida7839f0023ca8f0ffddc0545f0ac269e65b05d9 Reviewed-on: https://go-review.googlesource.com/29380 Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
cddddbc6 · David Chase · dcbbd319 · cddddbc6 · cddddbc6 · cddddbc6
Commit cddddbc6 authored Sep 16, 2016 by David Chase
12 changed files
--- a/src/cmd/compile/internal/gc/go.go
+++ b/src/cmd/compile/internal/gc/go.go
@@ -336,6 +336,9 @@ const (

 	// Instruction updates whichever of from/to is type D_OREG. (ppc64)
 	PostInc = 1 << 29
+
+	// Optional 3rd input operand, only ever read.
+	From3Read = 1 << 30
 )

 type Arch struct {

--- a/src/cmd/compile/internal/gc/plive.go
+++ b/src/cmd/compile/internal/gc/plive.go
@@ -624,6 +624,20 @@ func progeffects(prog *obj.Prog, vars []*Node, uevar bvec, varkill bvec, avarini
 		}
 	}

+	if info.Flags&From3Read != 0 {
+		from := prog.From3
+		if from.Node != nil && from.Sym != nil {
+			n := from.Node.(*Node)
+			if pos := liveIndex(n, vars); pos >= 0 {
+				if n.Addrtaken {
+					bvset(avarinit, pos)
+				} else {
+					bvset(uevar, pos)
+				}
+			}
+		}
+	}
+
 	if info.Flags&(RightRead|RightWrite|RightAddr) != 0 {
 		to := &prog.To
 		if to.Node != nil && to.Sym != nil {

--- a/src/cmd/compile/internal/ppc64/prog.go
+++ b/src/cmd/compile/internal/ppc64/prog.go
@@ -100,6 +100,8 @@ var progtable = [ppc64.ALAST & obj.AMask]gc.ProgInfo{
 	ppc64.AMOVHZ & obj.AMask: {Flags: gc.SizeW | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},
 	ppc64.AMOVW & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},

+	ppc64.AISEL & obj.AMask: {Flags: gc.SizeQ | gc.RegRead | gc.From3Read | gc.RightWrite},
+
 	// there is no AMOVWU.
 	ppc64.AMOVWZU & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv | gc.PostInc},
 	ppc64.AMOVWZ & obj.AMask:  {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite | gc.Move | gc.Conv},

--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -26,6 +26,28 @@ var condOps = map[ssa.Op]obj.As{
 	ssa.OpPPC64FGreaterEqual: ppc64.ABGT, // 2 branches for FCMP >=, second is BEQ
 }

+// iselOp encodes mapping of comparison operations onto ISEL operands
+type iselOp struct {
+	cond        int64
+	valueIfCond int // if cond is true, the value to return (0 or 1)
+}
+
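+// iselRegs holds the two input registers to ISEL when materializing a boolean: index 0 is R0 (reads as zero), index 1 is REGTMP, which is loaded with 1 just before the ISEL is emitted.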
+var iselRegs = [2]int16{ppc64.REG_R0, ppc64.REGTMP}
+
+var iselOps = map[ssa.Op]iselOp{
+	ssa.OpPPC64Equal:         iselOp{cond: ppc64.C_COND_EQ, valueIfCond: 1},
+	ssa.OpPPC64NotEqual:      iselOp{cond: ppc64.C_COND_EQ, valueIfCond: 0},
+	ssa.OpPPC64LessThan:      iselOp{cond: ppc64.C_COND_LT, valueIfCond: 1},
+	ssa.OpPPC64GreaterEqual:  iselOp{cond: ppc64.C_COND_LT, valueIfCond: 0},
+	ssa.OpPPC64GreaterThan:   iselOp{cond: ppc64.C_COND_GT, valueIfCond: 1},
+	ssa.OpPPC64LessEqual:     iselOp{cond: ppc64.C_COND_GT, valueIfCond: 0},
+	ssa.OpPPC64FLessThan:     iselOp{cond: ppc64.C_COND_LT, valueIfCond: 1},
+	ssa.OpPPC64FGreaterThan:  iselOp{cond: ppc64.C_COND_GT, valueIfCond: 1},
+	ssa.OpPPC64FLessEqual:    iselOp{cond: ppc64.C_COND_LT, valueIfCond: 1}, // 2 comparisons, 2nd is EQ
+	ssa.OpPPC64FGreaterEqual: iselOp{cond: ppc64.C_COND_GT, valueIfCond: 1}, // 2 comparisons, 2nd is EQ
+}
+
 // markMoves marks any MOVXconst ops that need to avoid clobbering flags.
 func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
 	//	flive := b.FlagsLiveAtEnd
@@ -34,7 +56,7 @@ func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
 	//	}
 	//	for i := len(b.Values) - 1; i >= 0; i-- {
 	//		v := b.Values[i]
-	//		if flive && (v.Op == ssa.OpPPC64MOVWconst || v.Op == ssa.OpPPC64MOVDconst) {
+	//		if flive && v.Op == ssa.OpPPC64MOVDconst {
 	//			// The "mark" is any non-nil Aux value.
 	//			v.Aux = v
 	//		}
@@ -120,6 +142,17 @@ func scratchFpMem(s *gc.SSAGenState, a *obj.Addr) {
 	a.Reg = ppc64.REGSP
 }

+func ssaGenISEL(v *ssa.Value, cr int64, r1, r2 int16) {
+	r := v.Reg()
+	p := gc.Prog(ppc64.AISEL)
+	p.To.Type = obj.TYPE_REG
+	p.To.Reg = r
+	p.Reg = r1
+	p.From3 = &obj.Addr{Type: obj.TYPE_REG, Reg: r2}
+	p.From.Type = obj.TYPE_CONST
+	p.From.Offset = cr
+}
+
 func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 	s.SetLineno(v.Line)
 	switch v.Op {
@@ -382,7 +415,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			v.Fatalf("bad reg %s for symbol type %T, want %s", reg, v.Aux, wantreg)
 		}

-	case ssa.OpPPC64MOVDconst, ssa.OpPPC64MOVWconst:
+	case ssa.OpPPC64MOVDconst:
 		p := gc.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_CONST
 		p.From.Offset = v.AuxInt
@@ -418,7 +451,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Reg = v.Reg()
 		p.To.Type = obj.TYPE_REG

-	case ssa.OpPPC64MOVDload, ssa.OpPPC64MOVWload, ssa.OpPPC64MOVBload, ssa.OpPPC64MOVHload, ssa.OpPPC64MOVWZload, ssa.OpPPC64MOVBZload, ssa.OpPPC64MOVHZload:
+	case ssa.OpPPC64MOVDload, ssa.OpPPC64MOVWload, ssa.OpPPC64MOVHload, ssa.OpPPC64MOVWZload, ssa.OpPPC64MOVBZload, ssa.OpPPC64MOVHZload:
 		p := gc.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = v.Args[0].Reg()
@@ -465,25 +498,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpPPC64GreaterThan,
 		ssa.OpPPC64FGreaterThan,
 		ssa.OpPPC64GreaterEqual:
+
 		// On Power7 or later, can use isel instruction:
 		// for a < b, a > b, a = b:
-		//   rt := 1
-		//   isel rt,rt,r0,cond
+		//   rtmp := 1
+		//   isel rt,rtmp,r0,cond // rt is target in ppc asm

 		// for  a >= b, a <= b, a != b:
-		//   rt := 1
-		//   isel rt,0,rt,!cond
-
-		// However, PPCbe support is for older machines than that,
-		// and isel (which looks a lot like fsel) isn't recognized
-		// yet by the Go assembler.  So for now, use the old instruction
-		// sequence, which we'll need anyway.
-		// TODO: add support for isel on PPCle and use it.
-
-		// generate boolean values
-		// use conditional move
+		//   rtmp := 1
+		//   isel rt,0,rtmp,!cond // rt is target in ppc asm

-		p := gc.Prog(ppc64.AMOVW)
+		if v.Block.Func.Config.OldArch {
+			p := gc.Prog(ppc64.AMOVD)
 			p.From.Type = obj.TYPE_CONST
 			p.From.Offset = 1
 			p.To.Type = obj.TYPE_REG
@@ -492,7 +518,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			pb := gc.Prog(condOps[v.Op])
 			pb.To.Type = obj.TYPE_BRANCH

-		p = gc.Prog(ppc64.AMOVW)
+			p = gc.Prog(ppc64.AMOVD)
 			p.From.Type = obj.TYPE_CONST
 			p.From.Offset = 0
 			p.To.Type = obj.TYPE_REG
@@ -500,10 +526,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {

 			p = gc.Prog(obj.ANOP)
 			gc.Patch(pb, p)
+			break
+		}
+		// Modern PPC uses ISEL
+		p := gc.Prog(ppc64.AMOVD)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = iselRegs[1]
+		iop := iselOps[v.Op]
+		ssaGenISEL(v, iop.cond, iselRegs[iop.valueIfCond], iselRegs[1-iop.valueIfCond])

 	case ssa.OpPPC64FLessEqual, // These include a second branch for EQ -- dealing with NaN prevents REL= to !REL conversion
 		ssa.OpPPC64FGreaterEqual:

+		if v.Block.Func.Config.OldArch {
 			p := gc.Prog(ppc64.AMOVW)
 			p.From.Type = obj.TYPE_CONST
 			p.From.Offset = 1
@@ -524,6 +561,17 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p = gc.Prog(obj.ANOP)
 			gc.Patch(pb0, p)
 			gc.Patch(pb1, p)
+			break
+		}
+		// Modern PPC uses ISEL
+		p := gc.Prog(ppc64.AMOVD)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = iselRegs[1]
+		iop := iselOps[v.Op]
+		ssaGenISEL(v, iop.cond, iselRegs[iop.valueIfCond], iselRegs[1-iop.valueIfCond])
+		ssaGenISEL(v, ppc64.C_COND_EQ, iselRegs[1], v.Reg())

 	case ssa.OpPPC64LoweredZero:
 		// Similar to how this is done on ARM,

--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@@ -32,8 +32,9 @@ type Config struct {
 	noDuffDevice    bool                       // Don't use Duff's device
 	nacl            bool                       // GOOS=nacl
 	use387          bool                       // GO386=387
+	OldArch         bool                       // True for older versions of architecture, e.g. true for PPC64BE, false for PPC64LE
 	NeedsFpScratch  bool                       // No direct move between GP and FP register sets
-	DebugTest       bool                       // as a debugging aid for binary search using GOSSAHASH, make buggy new code conditional on this
+	DebugTest       bool                       // default true unless $GOSSAHASH != ""; as a debugging aid, make new code conditional on this and use GOSSAHASH to binary search for failing cases
 	sparsePhiCutoff uint64                     // Sparse phi location algorithm used above this #blocks*#variables score
 	curFunc         *Func

@@ -180,7 +181,10 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
 		c.FPReg = framepointerRegARM64
 		c.hasGReg = true
 		c.noDuffDevice = obj.GOOS == "darwin" // darwin linker cannot handle BR26 reloc with non-zero addend
-	case "ppc64le", "ppc64":
+	case "ppc64":
+		c.OldArch = true
+		fallthrough
+	case "ppc64le":
 		c.IntSize = 8
 		c.PtrSize = 8
 		c.lowerBlock = rewriteBlockPPC64

--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -149,14 +149,14 @@
 // (MaskIfNotCarry CarrySet) -> -1

 // Lowering constants
-(Const8   [val]) -> (MOVWconst [val])
-(Const16  [val]) -> (MOVWconst [val])
-(Const32  [val]) -> (MOVWconst [val])
+(Const8   [val]) -> (MOVDconst [val])
+(Const16  [val]) -> (MOVDconst [val])
+(Const32  [val]) -> (MOVDconst [val])
 (Const64  [val]) -> (MOVDconst [val])
 (Const32F [val]) -> (FMOVSconst [val])
 (Const64F [val]) -> (FMOVDconst [val])
 (ConstNil) -> (MOVDconst [0])
-(ConstBool [b]) -> (MOVWconst [b])
+(ConstBool [b]) -> (MOVDconst [b])

 (Addr {sym} base) -> (MOVDaddr {sym} base)
 // (Addr {sym} base) -> (ADDconst {sym} base)
@@ -326,23 +326,18 @@
 (EQ (InvertFlags cmp) yes no) -> (EQ cmp yes no)
 (NE (InvertFlags cmp) yes no) -> (NE cmp yes no)

-// (FLT (InvertFlags cmp) yes no) -> (FGT cmp yes no)
-// (FGT (InvertFlags cmp) yes no) -> (FLT cmp yes no)
-// (FLE (InvertFlags cmp) yes no) -> (FGE cmp yes no)
-// (FGE (InvertFlags cmp) yes no) -> (FLE cmp yes no)
-
 // constant comparisons
-(CMPWconst (MOVWconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
-(CMPWconst (MOVWconst [x]) [y]) && int32(x)<int32(y)  -> (FlagLT)
-(CMPWconst (MOVWconst [x]) [y]) && int32(x)>int32(y)  -> (FlagGT)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y)  -> (FlagLT)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y)  -> (FlagGT)

 (CMPconst (MOVDconst [x]) [y]) && int64(x)==int64(y) -> (FlagEQ)
 (CMPconst (MOVDconst [x]) [y]) && int64(x)<int64(y)  -> (FlagLT)
 (CMPconst (MOVDconst [x]) [y]) && int64(x)>int64(y)  -> (FlagGT)

-(CMPWUconst (MOVWconst [x]) [y]) && int32(x)==int32(y)  -> (FlagEQ)
-(CMPWUconst (MOVWconst [x]) [y]) && uint32(x)<uint32(y) -> (FlagLT)
-(CMPWUconst (MOVWconst [x]) [y]) && uint32(x)>uint32(y) -> (FlagGT)
+(CMPWUconst (MOVDconst [x]) [y]) && int32(x)==int32(y)  -> (FlagEQ)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) -> (FlagLT)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) -> (FlagGT)

 (CMPUconst (MOVDconst [x]) [y]) && int64(x)==int64(y)  -> (FlagEQ)
 (CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) -> (FlagLT)
@@ -355,29 +350,29 @@
 //(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint32(32-c)) <= uint32(n) -> (FlagLT)

 // absorb flag constants into boolean values
-(Equal (FlagEQ)) -> (MOVWconst [1])
-(Equal (FlagLT)) -> (MOVWconst [0])
-(Equal (FlagGT)) -> (MOVWconst [0])
+(Equal (FlagEQ)) -> (MOVDconst [1])
+(Equal (FlagLT)) -> (MOVDconst [0])
+(Equal (FlagGT)) -> (MOVDconst [0])

-(NotEqual (FlagEQ)) -> (MOVWconst [0])
-(NotEqual (FlagLT)) -> (MOVWconst [1])
-(NotEqual (FlagGT)) -> (MOVWconst [1])
+(NotEqual (FlagEQ)) -> (MOVDconst [0])
+(NotEqual (FlagLT)) -> (MOVDconst [1])
+(NotEqual (FlagGT)) -> (MOVDconst [1])

-(LessThan (FlagEQ)) -> (MOVWconst [0])
-(LessThan (FlagLT)) -> (MOVWconst [1])
-(LessThan (FlagGT)) -> (MOVWconst [0])
+(LessThan (FlagEQ)) -> (MOVDconst [0])
+(LessThan (FlagLT)) -> (MOVDconst [1])
+(LessThan (FlagGT)) -> (MOVDconst [0])

-(LessEqual (FlagEQ)) -> (MOVWconst [1])
-(LessEqual (FlagLT)) -> (MOVWconst [1])
-(LessEqual (FlagGT)) -> (MOVWconst [0])
+(LessEqual (FlagEQ)) -> (MOVDconst [1])
+(LessEqual (FlagLT)) -> (MOVDconst [1])
+(LessEqual (FlagGT)) -> (MOVDconst [0])

-(GreaterThan (FlagEQ)) -> (MOVWconst [0])
-(GreaterThan (FlagLT)) -> (MOVWconst [0])
-(GreaterThan (FlagGT)) -> (MOVWconst [1])
+(GreaterThan (FlagEQ)) -> (MOVDconst [0])
+(GreaterThan (FlagLT)) -> (MOVDconst [0])
+(GreaterThan (FlagGT)) -> (MOVDconst [1])

-(GreaterEqual (FlagEQ)) -> (MOVWconst [1])
-(GreaterEqual (FlagLT)) -> (MOVWconst [0])
-(GreaterEqual (FlagGT)) -> (MOVWconst [1])
+(GreaterEqual (FlagEQ)) -> (MOVDconst [1])
+(GreaterEqual (FlagLT)) -> (MOVDconst [0])
+(GreaterEqual (FlagGT)) -> (MOVDconst [1])

 // absorb InvertFlags into boolean values
 (Equal (InvertFlags x)) -> (Equal x)
@@ -387,19 +382,14 @@
 (LessEqual (InvertFlags x)) -> (GreaterEqual x)
 (GreaterEqual (InvertFlags x)) -> (LessEqual x)

-// (FLessThan (InvertFlags x)) -> (FGreaterThan x)
-// (FGreaterThan (InvertFlags x)) -> (FLessThan x)
-// (FLessEqual (InvertFlags x)) -> (FGreaterEqual x)
-// (FGreaterEqual (InvertFlags x)) -> (FLessEqual x)
-
-
 // Lowering loads
 (Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVDload ptr mem)
 (Load <t> ptr mem) && is32BitInt(t) && isSigned(t) -> (MOVWload ptr mem)
 (Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) -> (MOVWZload ptr mem)
 (Load <t> ptr mem) && is16BitInt(t) && isSigned(t) -> (MOVHload ptr mem)
 (Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) -> (MOVHZload ptr mem)
-(Load <t> ptr mem) && (t.IsBoolean() || (is8BitInt(t) && isSigned(t))) -> (MOVBload ptr mem)
+(Load <t> ptr mem) && t.IsBoolean() -> (MOVBZload ptr mem)
+(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) -> (MOVBreg (MOVBZload ptr mem)) // PPC has no signed-byte load.
 (Load <t> ptr mem) && is8BitInt(t) && !isSigned(t) -> (MOVBZload ptr mem)

 (Load <t> ptr mem) && is32BitFloat(t) -> (FMOVSload ptr mem)
@@ -533,6 +523,15 @@
 (ADD (MOVDconst [c]) x) && is32Bit(c) -> (ADDconst [c] x)
 (ADD x (MOVDconst [c])) && is32Bit(c) -> (ADDconst [c] x)
 (ADDconst [c] (ADDconst [d] x)) && is32Bit(c+d) -> (ADDconst [c+d] x)
+(ADDconst [0] x) -> x
+(ANDconst [-1] x) -> x
+(ANDconst [0] _) -> (MOVDconst [0])
+(XORconst [0] x) -> x
+
+(XOR (MOVDconst [0]) x) -> x
+(XOR x (MOVDconst [0])) -> x
+(ADD (MOVDconst [0]) x) -> x
+(ADD x (MOVDconst [0])) -> x

 // Fold offsets for stores.
 (MOVDstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(off1+off2) -> (MOVDstore [off1+off2] {sym} x val mem)
@@ -557,8 +556,6 @@
 (FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) ->
        (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)

-(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) ->
-        (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
 (MOVBZload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) ->
        (MOVBZload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
 (MOVHload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) ->
@@ -585,7 +582,6 @@
 (MOVWZload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) -> (MOVWZload [off1+off2] {sym} x mem)
 (MOVHload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) -> (MOVHload [off1+off2] {sym} x mem)
 (MOVHZload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) -> (MOVHZload [off1+off2] {sym} x mem)
-(MOVBload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) -> (MOVBload [off1+off2] {sym} x mem)
 (MOVBZload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(off1+off2) -> (MOVBZload [off1+off2] {sym} x mem)

 // Store of zero -> storezero
@@ -630,6 +626,16 @@
 // Note that MOV??reg returns a 64-bit int, x is not necessarily that wide
 // This may interact with other patterns in the future. (Compare with arm64)
 (MOVBZreg x:(MOVBZload _ _))  -> x
-(MOVBreg x:(MOVBload _ _))  -> x
 (MOVHZreg x:(MOVHZload _ _))  -> x
 (MOVHreg x:(MOVHload _ _))  -> x
+
+(MOVBZreg (MOVDconst [c]))  -> (MOVDconst [int64(uint8(c))])
+(MOVBreg (MOVDconst [c]))  -> (MOVDconst [int64(int8(c))])
+(MOVHZreg (MOVDconst [c]))  -> (MOVDconst [int64(uint16(c))])
+(MOVHreg (MOVDconst [c]))  -> (MOVDconst [int64(int16(c))])
+
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBZreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHZreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
+
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -17,7 +17,7 @@ import "strings"
 //    register (R31).

 var regNamesPPC64 = []string{
-	// "R0", // REGZERO
+	"R0", // REGZERO, not used, but simplifies counting in regalloc
 	"SP", // REGSP
 	"SB", // REGSB
 	"R3",
@@ -233,7 +233,6 @@ func init() {
 		{name: "MOVHZreg", argLength: 1, reg: gp11, asm: "MOVHZ", typ: "Int64"},                                          // zero extend uint16 to uint64
 		{name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW", typ: "Int64"},                                            // sign extend int32 to int64
 		{name: "MOVWZreg", argLength: 1, reg: gp11, asm: "MOVWZ", typ: "Int64"},                                          // zero extend uint32 to uint64
-		{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", typ: "Int8", faultOnNilArg0: true},     // sign extend int8 to int64
 		{name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true},  // zero extend uint8 to uint64
 		{name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true},    // sign extend int16 to int64
 		{name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true}, // zero extend uint16 to uint64
@@ -258,7 +257,6 @@ func init() {
 		{name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{sp | sb}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB

 		{name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", rematerializeable: true},     //
-		{name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", rematerializeable: true},     // 32 low bits of auxint
 		{name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", rematerializeable: true}, //
 		{name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float32", asm: "FMOVS", rematerializeable: true}, //
 		{name: "FCMPU", argLength: 2, reg: fp2cr, asm: "FCMPU", typ: "Flags"},

--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -485,7 +485,7 @@ func (s *regAllocState) init(f *Func) {
 	if s.f.Config.ctxt.Flag_shared {
 		switch s.f.Config.arch {
 		case "ppc64le": // R2 already reserved.
-			s.allocatable &^= 1 << 11 // R12 -- R0 is skipped in PPC64Ops.go
+			s.allocatable &^= 1 << 12 // R12
 		}
 	}
 	if s.f.Config.ctxt.Flag_dynlink {
@@ -495,7 +495,7 @@ func (s *regAllocState) init(f *Func) {
 		case "arm":
 			s.allocatable &^= 1 << 9 // R9
 		case "ppc64le": // R2 already reserved.
-			s.allocatable &^= 1 << 11 // R12 -- R0 is skipped in PPC64Ops.go
+			s.allocatable &^= 1 << 12 // R12
 		case "arm64":
 			// nothing to do?
 		case "386":
@@ -813,7 +813,9 @@ func (s *regAllocState) regalloc(f *Func) {
 					continue
 				}
 				a := v.Args[idx]
-				m := s.values[a.ID].regs &^ phiUsed
+				// Some instructions target not-allocatable registers.
+				// They're not suitable for further (phi-function) allocation.
+				m := s.values[a.ID].regs &^ phiUsed & s.allocatable
 				if m != 0 {
 					r := pickReg(m)
 					s.freeReg(r)
@@ -1942,7 +1944,7 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, line int32
 	var x *Value
 	if c == nil {
 		if !e.s.values[vid].rematerializeable {
-			e.s.f.Fatalf("can't find source for %s->%s: v%d\n", e.p, e.b, vid)
+			e.s.f.Fatalf("can't find source for %s->%s: %s\n", e.p, e.b, v.LongString())
 		}
 		if dstReg {
 			x = v.copyInto(e.p)

--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -356,6 +356,15 @@ func clobber(v *Value) bool {
 	return true
 }

+// noteRule is an easy way to track if a rule is matched when writing
+// new ones.  Make the rule of interest also conditional on
+//     noteRule("note to self: rule of interest matched")
+// and that message will print when the rule matches.
+func noteRule(s string) bool {
+	println(s)
+	return true
+}
+
 // logRule logs the use of the rule s. This will only be enabled if
 // rewrite rules were generated with the -log option, see gen/rulegen.go.
 func logRule(s string) {

--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
--- a/src/cmd/internal/obj/ppc64/a.out.go
+++ b/src/cmd/internal/obj/ppc64/a.out.go
@@ -219,7 +219,7 @@ const (
 	C_COND_LT = iota // 0 result is negative
 	C_COND_GT        // 1 result is positive
 	C_COND_EQ        // 2 result is zero
-	C_COND_SO        // 3 summary overflow
+	C_COND_SO        // 3 summary overflow or FP compare w/ NaN
 )

 const (
@@ -300,8 +300,8 @@ const (
 	ABLE // not GT = L/E/U
 	ABLT
 	ABNE // not EQ = L/G/U
-	ABVC // apparently Unordered-clear
-	ABVS // apparently Unordered-set
+	ABVC // Unordered-clear
+	ABVS // Unordered-set
 	ACMP
 	ACMPU
 	ACNTLZW