cmd/asm, cmd/internal/obj/ppc64: avoid unnecessary load zeros

When the instructions add, and, or, xor, and movd have constant operands, in some cases the assembler generates more instructions than necessary. This adds more opcode/operand combinations to the optab and improves the code generation for the cases where the size and sign of the constant allow the use of 1 instruction instead of 2. Example of previous code: oris r3, r0, 0; ori r3, r3, 65533. Now: ori r3, r0, 65533. This does not significantly reduce the overall binary size because the improvement depends on the constant value. Some procedures show a 1-2% reduction in size. This improvement could also be significant in cases where the extra instructions occur in a critical loop. Testcase ppc64enc.s was added to cmd/asm/internal/asm/testdata with the variations affected by this change. Updates #23845 Change-Id: I7fdf2320c95815d99f2755ba77d0c6921cd7fad7 Reviewed-on: https://go-review.googlesource.com/95135 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>

cmd/asm, cmd/internal/obj/ppc64: avoid unnecessary load zeros
When the instructions add, and, or, xor, and movd have constant operands, in some cases the assembler generates more instructions than necessary. This adds more opcode/operand combinations to the optab and improves the code generation for the cases where the size and sign of the constant allow the use of 1 instruction instead of 2. Example of previous code: oris r3, r0, 0; ori r3, r3, 65533. Now: ori r3, r0, 65533. This does not significantly reduce the overall binary size because the improvement depends on the constant value. Some procedures show a 1-2% reduction in size. This improvement could also be significant in cases where the extra instructions occur in a critical loop. Testcase ppc64enc.s was added to cmd/asm/internal/asm/testdata with the variations affected by this change. Updates #23845 Change-Id: I7fdf2320c95815d99f2755ba77d0c6921cd7fad7 Reviewed-on: https://go-review.googlesource.com/95135 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
5b14c7b3 · Lynn Boger · 0add9a4d · 5b14c7b3 · 5b14c7b3 · 5b14c7b3
Commit 5b14c7b3 authored Feb 19, 2018 by Lynn Boger
Showing with 170 additions and 25 deletions

endtoend_test.go src/cmd/asm/internal/asm/endtoend_test.go +4 -0

ppc64enc.s src/cmd/asm/internal/asm/testdata/ppc64enc.s +87 -0

asm9.go src/cmd/internal/obj/ppc64/asm9.go +79 -25

No files found.
--- a/src/cmd/asm/internal/asm/endtoend_test.go
+++ b/src/cmd/asm/internal/asm/endtoend_test.go
@@ -411,6 +411,10 @@ func TestPPC64EndToEnd(t *testing.T) {
 	testEndToEnd(t, "ppc64", "ppc64")
 }

+func TestPPC64Encoder(t *testing.T) {
+	testEndToEnd(t, "ppc64", "ppc64enc")
+}
+
 func TestS390XEndToEnd(t *testing.T) {
 	testEndToEnd(t, "s390x", "s390x")
 }
--- a/src/cmd/asm/internal/asm/testdata/ppc64enc.s
+++ b/src/cmd/asm/internal/asm/testdata/ppc64enc.s
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Initial set of opcode combinations based on
+// improvements to processing of constant
+// operands.
+
+// Full set will be added at a later date.
+
+#include "../../../../../runtime/textflag.h"
+
+TEXT asmtest(SB),DUPOK|NOSPLIT,$0
+	// move constants
+	MOVD $1, R3                     // 38600001
+	MOVD $-1, R4                    // 3880ffff
+	MOVD $65535, R5                 // 6005ffff
+	MOVD $65536, R6                 // 64060001
+	MOVD $-32767, R5                // 38a08001
+	MOVD $-32768, R6                // 38c08000
+	MOVD $1234567, R5               // 6405001260a5d687
+
+	// add constants
+	ADD $1, R3                      // 38630001
+	ADD $1, R3, R4                  // 38830001
+	ADD $-1, R4                     // 3884ffff
+	ADD $-1, R4, R5                 // 38a4ffff
+	ADD $65535, R5                  // 601fffff7cbf2a14
+	ADD $65535, R5, R6              // 601fffff7cdf2a14
+	ADD $65536, R6                  // 3cc60001
+	ADD $65536, R6, R7              // 3ce60001
+	ADD $-32767, R5                 // 38a58001
+	ADD $-32767, R5, R4             // 38858001
+	ADD $-32768, R6                 // 38c68000
+	ADD $-32768, R6, R5             // 38a68000
+	ADD $1234567, R5                // 641f001263ffd6877cbf2a14
+	ADD $1234567, R5, R6            // 641f001263ffd6877cdf2a14
+
+	// and constants
+	ANDCC $1, R3                    // 70630001
+	ANDCC $1, R3, R4                // 70640001
+	ANDCC $-1, R4                   // 3be0ffff7fe42039
+	ANDCC $-1, R4, R5               // 3be0ffff7fe52039
+	ANDCC $65535, R5                // 70a5ffff
+	ANDCC $65535, R5, R6            // 70a6ffff
+	ANDCC $65536, R6                // 74c60001
+	ANDCC $65536, R6, R7            // 74c70001
+	ANDCC $-32767, R5               // 3be080017fe52839
+	ANDCC $-32767, R5, R4           // 3be080017fe42839
+	ANDCC $-32768, R6               // 3be080007fe63039
+	ANDCC $-32768, R5, R6           // 3be080007fe62839
+	ANDCC $1234567, R5              // 641f001263ffd6877fe52839
+	ANDCC $1234567, R5, R6          // 641f001263ffd6877fe62839
+
+	// or constants
+	OR $1, R3                       // 60630001
+	OR $1, R3, R4                   // 60640001
+	OR $-1, R4                      // 3be0ffff7fe42378
+	OR $-1, R4, R5                  // 3be0ffff7fe52378
+	OR $65535, R5                   // 60a5ffff
+	OR $65535, R5, R6               // 60a6ffff
+	OR $65536, R6                   // 64c60001
+	OR $65536, R6, R7               // 64c70001
+	OR $-32767, R5                  // 3be080017fe52b78
+	OR $-32767, R5, R6              // 3be080017fe62b78
+	OR $-32768, R6                  // 3be080007fe63378
+	OR $-32768, R6, R7              // 3be080007fe73378
+	OR $1234567, R5                 // 641f001263ffd6877fe52b78
+	OR $1234567, R5, R3             // 641f001263ffd6877fe32b78
+
+	// xor constants
+	XOR $1, R3                      // 68630001
+	XOR $1, R3, R4                  // 68640001
+	XOR $-1, R4                     // 3be0ffff7fe42278
+	XOR $-1, R4, R5                 // 3be0ffff7fe52278
+	XOR $65535, R5                  // 68a5ffff
+	XOR $65535, R5, R6              // 68a6ffff
+	XOR $65536, R6                  // 6cc60001
+	XOR $65536, R6, R7              // 6cc70001
+	XOR $-32767, R5                 // 3be080017fe52a78
+	XOR $-32767, R5, R6             // 3be080017fe62a78
+	XOR $-32768, R6                 // 3be080007fe63278
+	XOR $-32768, R6, R7             // 3be080007fe73278
+	XOR $1234567, R5                // 641f001263ffd6877fe52a78
+	XOR $1234567, R5, R3            // 641f001263ffd6877fe32a78
+
+	RET
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -71,6 +71,14 @@ type Optab struct {
 	param int16
 }

+// This optab contains a list of opcodes with the operand
+// combinations that are implemented. Not all opcodes are in this
+// table, but are added later in buildop by calling opset for those
+// opcodes which allow the same operand combinations as an opcode
+// already in the table.
+//
+// The type field in the Optab identifies the case in asmout where
+// the instruction word is assembled.
 var optab = []Optab{
 	{obj.ATEXT, C_LEXT, C_NONE, C_NONE, C_TEXTSIZE, 0, 0, 0},
 	{obj.ATEXT, C_LEXT, C_NONE, C_LCON, C_TEXTSIZE, 0, 0, 0},
@@ -84,14 +92,18 @@ var optab = []Optab{
 	{AMOVWZ, C_REG, C_NONE, C_NONE, C_REG, 13, 4, 0},
 	{AADD, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
 	{AADD, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
+	{AADD, C_SCON, C_REG, C_NONE, C_REG, 4, 4, 0},
+	{AADD, C_SCON, C_NONE, C_NONE, C_REG, 4, 4, 0},
 	{AADD, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
 	{AADD, C_ADDCON, C_NONE, C_NONE, C_REG, 4, 4, 0},
 	{AADD, C_UCON, C_REG, C_NONE, C_REG, 20, 4, 0},
 	{AADD, C_UCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
-	{AADDIS, C_ADDCON, C_REG, C_NONE, C_REG, 20, 4, 0},
-	{AADDIS, C_ADDCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
+	{AADD, C_ANDCON, C_REG, C_NONE, C_REG, 22, 8, 0},
+	{AADD, C_ANDCON, C_NONE, C_NONE, C_REG, 22, 8, 0},
 	{AADD, C_LCON, C_REG, C_NONE, C_REG, 22, 12, 0},
 	{AADD, C_LCON, C_NONE, C_NONE, C_REG, 22, 12, 0},
+	{AADDIS, C_ADDCON, C_REG, C_NONE, C_REG, 20, 4, 0},
+	{AADDIS, C_ADDCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
 	{AADDC, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
 	{AADDC, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
 	{AADDC, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
@@ -106,10 +118,12 @@ var optab = []Optab{
 	{AANDCC, C_ANDCON, C_REG, C_NONE, C_REG, 58, 4, 0},
 	{AANDCC, C_UCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
 	{AANDCC, C_UCON, C_REG, C_NONE, C_REG, 59, 4, 0},
-	{AANDISCC, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
-	{AANDISCC, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
+	{AANDCC, C_ADDCON, C_NONE, C_NONE, C_REG, 23, 8, 0},
+	{AANDCC, C_ADDCON, C_REG, C_NONE, C_REG, 23, 8, 0},
 	{AANDCC, C_LCON, C_NONE, C_NONE, C_REG, 23, 12, 0},
 	{AANDCC, C_LCON, C_REG, C_NONE, C_REG, 23, 12, 0},
+	{AANDISCC, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
+	{AANDISCC, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
 	{AMULLW, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
 	{AMULLW, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
 	{AMULLW, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
@@ -128,10 +142,12 @@ var optab = []Optab{
 	{AOR, C_ANDCON, C_REG, C_NONE, C_REG, 58, 4, 0},
 	{AOR, C_UCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
 	{AOR, C_UCON, C_REG, C_NONE, C_REG, 59, 4, 0},
-	{AORIS, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
-	{AORIS, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
+	{AOR, C_ADDCON, C_NONE, C_NONE, C_REG, 23, 8, 0},
+	{AOR, C_ADDCON, C_REG, C_NONE, C_REG, 23, 8, 0},
 	{AOR, C_LCON, C_NONE, C_NONE, C_REG, 23, 12, 0},
 	{AOR, C_LCON, C_REG, C_NONE, C_REG, 23, 12, 0},
+	{AORIS, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
+	{AORIS, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
 	{ADIVW, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0}, /* op r1[,r2],r3 */
 	{ADIVW, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
 	{ASUB, C_REG, C_REG, C_NONE, C_REG, 10, 4, 0}, /* op r2[,r1],r3 */
@@ -277,16 +293,19 @@ var optab = []Optab{
 	{AMOVD, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
 	{AMOVD, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
 	{AMOVD, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
+	{AMOVD, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
 	{AMOVW, C_SECON, C_NONE, C_NONE, C_REG, 3, 4, REGSB}, /* TO DO: check */
 	{AMOVW, C_SACON, C_NONE, C_NONE, C_REG, 3, 4, REGSP},
 	{AMOVW, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
 	{AMOVW, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
 	{AMOVW, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
+	{AMOVW, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
 	{AMOVWZ, C_SECON, C_NONE, C_NONE, C_REG, 3, 4, REGSB}, /* TO DO: check */
 	{AMOVWZ, C_SACON, C_NONE, C_NONE, C_REG, 3, 4, REGSP},
 	{AMOVWZ, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
 	{AMOVWZ, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
 	{AMOVWZ, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
+	{AMOVWZ, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},

 	/* load unsigned/long constants (TO DO: check) */
 	{AMOVD, C_UCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
@@ -1048,13 +1067,25 @@ func (x ocmp) Swap(i, j int) {
 	x[i], x[j] = x[j], x[i]
 }

+// Used when sorting the optab. Sorting is
+// done in a way so that the best choice of
+// opcode/operand combination is considered first.
 func (x ocmp) Less(i, j int) bool {
 	p1 := &x[i]
 	p2 := &x[j]
 	n := int(p1.as) - int(p2.as)
+	// Order by opcode first; the remaining
+	// comparisons only apply to the same opcode.
+	if n != 0 {
+		return n < 0
+	}
+	// Consider those that generate fewer
+	// instructions first.
+	n = int(p1.size) - int(p2.size)
 	if n != 0 {
 		return n < 0
 	}
+	// operand order should match
+	// better choices first
 	n = int(p1.a1) - int(p2.a1)
 	if n != 0 {
 		return n < 0
@@ -1073,10 +1104,15 @@ func (x ocmp) Less(i, j int) bool {
 	}
 	return false
 }
+
+// Add an entry to the opcode table for
+// a new opcode a with the same operand combinations
+// as opcode b0.
 func opset(a, b0 obj.As) {
 	oprange[a&obj.AMask] = oprange[b0]
 }

+// Build the opcode table
 func buildop(ctxt *obj.Link) {
 	if oprange[AANDN&obj.AMask] != nil {
 		// Already initialized; stop now.
@@ -2256,7 +2292,7 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		}
 		o1 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), uint32(r), uint32(p.From.Reg))

-	case 3: /* mov $soreg/addcon/ucon, r ==> addis/addi $i,reg',r */
+	case 3: /* mov $soreg/addcon/andcon/ucon, r ==> addis/oris/addi/ori $i,reg',r */
 		d := c.vregoff(&p.From)

 		v := int32(d)
@@ -2272,6 +2308,8 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			if d&0xffff != 0 {
 				log.Fatalf("invalid handling of %v", p)
 			}
+			// For UCON operands the value is right shifted 16, using ADDIS if the
+			// value should be signed, ORIS if unsigned.
 			v >>= 16
 			if r == REGZERO && isuint32(uint64(d)) {
 				o1 = LOP_IRR(OP_ORIS, uint32(p.To.Reg), REGZERO, uint32(v))
@@ -2279,8 +2317,16 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			}

 			a = OP_ADDIS
-		} else {
-			if int64(int16(d)) != d {
+		} else if int64(int16(d)) != d {
+			// Operand is 16 bit value with sign bit set
+			if o.a1 == C_ANDCON {
+				// Needs unsigned 16 bit so use ORI
+				if r == 0 || r == REGZERO {
+					o1 = LOP_IRR(uint32(OP_ORI), uint32(p.To.Reg), uint32(0), uint32(v))
+					break
+				}
+				// With ADDCON, needs signed 16 bit value, fall through to use ADDI
+			} else if o.a1 != C_ADDCON {
 				log.Fatalf("invalid handling of %v", p)
 			}
 		}
@@ -2632,8 +2678,6 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			o1, o2 = c.symbolAccess(p.From.Sym, d, p.To.Reg, OP_ADDI)
 		}

-	//if(dlm) reloc(&p->from, p->pc, 0);
-
 	case 20: /* add $ucon,,r | addis $addcon,r,r */
 		v := c.regoff(&p.From)

@@ -2650,43 +2694,53 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			o1 = AOP_IRR(c.opirr(AADDIS), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
 		}

-	case 22: /* add $lcon,r1,r2 ==> cau+or+add */ /* could do add/sub more efficiently */
+	case 22: /* add $lcon/$andcon,r1,r2 ==> oris+ori+add/ori+add */
 		if p.To.Reg == REGTMP || p.Reg == REGTMP {
 			c.ctxt.Diag("can't synthesize large constant\n%v", p)
 		}
 		d := c.vregoff(&p.From)
-		o1 = loadu32(REGTMP, d)
-		o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
 		r := int(p.Reg)
 		if r == 0 {
 			r = int(p.To.Reg)
 		}
-		o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
 		if p.From.Sym != nil {
 			c.ctxt.Diag("%v is not supported", p)
 		}
+		// If operand is ANDCON, generate 2 instructions using
+		// ORI for unsigned value; with LCON 3 instructions.
+		if o.size == 8 {
+			o1 = LOP_IRR(OP_ORI, REGTMP, REGZERO, uint32(int32(d)))
+			o2 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		} else {
+			o1 = loadu32(REGTMP, d)
+			o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
+			o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		}

-	//if(dlm) reloc(&p->from, p->pc, 0);
-
-	case 23: /* and $lcon,r1,r2 ==> cau+or+and */ /* masks could be done using rlnm etc. */
+	case 23: /* and $lcon/$addcon,r1,r2 ==> oris+ori+and/addi+and */
 		if p.To.Reg == REGTMP || p.Reg == REGTMP {
 			c.ctxt.Diag("can't synthesize large constant\n%v", p)
 		}
 		d := c.vregoff(&p.From)
-		o1 = loadu32(REGTMP, d)
-		o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
 		r := int(p.Reg)
 		if r == 0 {
 			r = int(p.To.Reg)
 		}
-		o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+
+		// With ADDCON operand, generate 2 instructions using ADDI for signed value,
+		// with LCON operand generate 3 instructions.
+		if o.size == 8 {
+			o1 = LOP_IRR(OP_ADDI, REGZERO, REGTMP, uint32(int32(d)))
+			o2 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		} else {
+			o1 = loadu32(REGTMP, d)
+			o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
+			o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
+		}
 		if p.From.Sym != nil {
 			c.ctxt.Diag("%v is not supported", p)
 		}

-		//if(dlm) reloc(&p->from, p->pc, 0);
-
-		/*24*/
 	case 25:
 		/* sld[.] $sh,rS,rA -> rldicr[.] $sh,rS,mask(0,63-sh),rA; srd[.] -> rldicl */
 		v := c.regoff(&p.From)
@@ -3090,7 +3144,7 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		case AXOR:
 			o1 = LOP_IRR(c.opirr(AXORIS), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
 		case AANDCC:
-			o1 = LOP_IRR(c.opirr(AANDCC), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
+			o1 = LOP_IRR(c.opirr(AANDISCC), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
 		default:
 			o1 = LOP_IRR(c.opirr(p.As), uint32(p.To.Reg), uint32(r), uint32(v))
 		}