Commit 5b14c7b3 authored by Lynn Boger's avatar Lynn Boger

cmd/asm, cmd/internal/obj/ppc64: avoid unnecessary load zeros

When instructions add, and, or, xor, and movd have
constant operands in some cases more instructions are
generated than necessary by the assembler.

This adds more opcode/operand combinations to the optab
and improves the code generation for the cases where the
size and sign of the constant allows the use of 1
instructions instead of 2.

Example of previous code:
	oris r3, r0, 0
	ori  r3, r3, 65533

now:
	ori r3, r0, 65533

This does not significantly reduce the overall binary size
because the improvement depends on the constant value.
Some procedures show a 1-2% reduction in size. This improvement
could also be significant in cases where the extra instructions
occur in a critical loop.

Testcase ppc64enc.s was added to cmd/asm/internal/asm/testdata
with the variations affected by this change.

Updates #23845

Change-Id: I7fdf2320c95815d99f2755ba77d0c6921cd7fad7
Reviewed-on: https://go-review.googlesource.com/95135
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarDavid Chase <drchase@google.com>
parent 0add9a4d
......@@ -411,6 +411,10 @@ func TestPPC64EndToEnd(t *testing.T) {
testEndToEnd(t, "ppc64", "ppc64")
}
func TestPPC64Encoder(t *testing.T) {
testEndToEnd(t, "ppc64", "ppc64enc")
}
func TestS390XEndToEnd(t *testing.T) {
testEndToEnd(t, "s390x", "s390x")
}
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Initial set of opcode combinations based on
// improvements to processing of constant
// operands.
// Full set will be added at a later date.
#include "../../../../../runtime/textflag.h"
TEXT asmtest(SB),DUPOK|NOSPLIT,$0
// move constants
MOVD $1, R3 // 38600001
MOVD $-1, R4 // 3880ffff
MOVD $65535, R5 // 6005ffff
MOVD $65536, R6 // 64060001
MOVD $-32767, R5 // 38a08001
MOVD $-32768, R6 // 38c08000
MOVD $1234567, R5 // 6405001260a5d687
// add constants
ADD $1, R3 // 38630001
ADD $1, R3, R4 // 38830001
ADD $-1, R4 // 3884ffff
ADD $-1, R4, R5 // 38a4ffff
ADD $65535, R5 // 601fffff7cbf2a14
ADD $65535, R5, R6 // 601fffff7cdf2a14
ADD $65536, R6 // 3cc60001
ADD $65536, R6, R7 // 3ce60001
ADD $-32767, R5 // 38a58001
ADD $-32767, R5, R4 // 38858001
ADD $-32768, R6 // 38c68000
ADD $-32768, R6, R5 // 38a68000
ADD $1234567, R5 // 641f001263ffd6877cbf2a14
ADD $1234567, R5, R6 // 641f001263ffd6877cdf2a14
// and constants
ANDCC $1, R3 // 70630001
ANDCC $1, R3, R4 // 70640001
ANDCC $-1, R4 // 3be0ffff7fe42039
ANDCC $-1, R4, R5 // 3be0ffff7fe52039
ANDCC $65535, R5 // 70a5ffff
ANDCC $65535, R5, R6 // 70a6ffff
ANDCC $65536, R6 // 74c60001
ANDCC $65536, R6, R7 // 74c70001
ANDCC $-32767, R5 // 3be080017fe52839
ANDCC $-32767, R5, R4 // 3be080017fe42839
ANDCC $-32768, R6 // 3be080007fe63039
ANDCC $-32768, R5, R6 // 3be080007fe62839
ANDCC $1234567, R5 // 641f001263ffd6877fe52839
ANDCC $1234567, R5, R6 // 641f001263ffd6877fe62839
// or constants
OR $1, R3 // 60630001
OR $1, R3, R4 // 60640001
OR $-1, R4 // 3be0ffff7fe42378
OR $-1, R4, R5 // 3be0ffff7fe52378
OR $65535, R5 // 60a5ffff
OR $65535, R5, R6 // 60a6ffff
OR $65536, R6 // 64c60001
OR $65536, R6, R7 // 64c70001
OR $-32767, R5 // 3be080017fe52b78
OR $-32767, R5, R6 // 3be080017fe62b78
OR $-32768, R6 // 3be080007fe63378
OR $-32768, R6, R7 // 3be080007fe73378
OR $1234567, R5 // 641f001263ffd6877fe52b78
OR $1234567, R5, R3 // 641f001263ffd6877fe32b78
// or constants
XOR $1, R3 // 68630001
XOR $1, R3, R4 // 68640001
XOR $-1, R4 // 3be0ffff7fe42278
XOR $-1, R4, R5 // 3be0ffff7fe52278
XOR $65535, R5 // 68a5ffff
XOR $65535, R5, R6 // 68a6ffff
XOR $65536, R6 // 6cc60001
XOR $65536, R6, R7 // 6cc70001
XOR $-32767, R5 // 3be080017fe52a78
XOR $-32767, R5, R6 // 3be080017fe62a78
XOR $-32768, R6 // 3be080007fe63278
XOR $-32768, R6, R7 // 3be080007fe73278
XOR $1234567, R5 // 641f001263ffd6877fe52a78
XOR $1234567, R5, R3 // 641f001263ffd6877fe32a78
RET
......@@ -71,6 +71,14 @@ type Optab struct {
param int16
}
// This optab contains a list of opcodes with the operand
// combinations that are implemented. Not all opcodes are in this
// table, but are added later in buildop by calling opset for those
// opcodes which allow the same operand combinations as an opcode
// already in the table.
//
// The type field in the Optabl identifies the case in asmout where
// the instruction word is assembled.
var optab = []Optab{
{obj.ATEXT, C_LEXT, C_NONE, C_NONE, C_TEXTSIZE, 0, 0, 0},
{obj.ATEXT, C_LEXT, C_NONE, C_LCON, C_TEXTSIZE, 0, 0, 0},
......@@ -84,14 +92,18 @@ var optab = []Optab{
{AMOVWZ, C_REG, C_NONE, C_NONE, C_REG, 13, 4, 0},
{AADD, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
{AADD, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
{AADD, C_SCON, C_REG, C_NONE, C_REG, 4, 4, 0},
{AADD, C_SCON, C_NONE, C_NONE, C_REG, 4, 4, 0},
{AADD, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
{AADD, C_ADDCON, C_NONE, C_NONE, C_REG, 4, 4, 0},
{AADD, C_UCON, C_REG, C_NONE, C_REG, 20, 4, 0},
{AADD, C_UCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
{AADDIS, C_ADDCON, C_REG, C_NONE, C_REG, 20, 4, 0},
{AADDIS, C_ADDCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
{AADD, C_ANDCON, C_REG, C_NONE, C_REG, 22, 8, 0},
{AADD, C_ANDCON, C_NONE, C_NONE, C_REG, 22, 8, 0},
{AADD, C_LCON, C_REG, C_NONE, C_REG, 22, 12, 0},
{AADD, C_LCON, C_NONE, C_NONE, C_REG, 22, 12, 0},
{AADDIS, C_ADDCON, C_REG, C_NONE, C_REG, 20, 4, 0},
{AADDIS, C_ADDCON, C_NONE, C_NONE, C_REG, 20, 4, 0},
{AADDC, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
{AADDC, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
{AADDC, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
......@@ -106,10 +118,12 @@ var optab = []Optab{
{AANDCC, C_ANDCON, C_REG, C_NONE, C_REG, 58, 4, 0},
{AANDCC, C_UCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
{AANDCC, C_UCON, C_REG, C_NONE, C_REG, 59, 4, 0},
{AANDISCC, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
{AANDISCC, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
{AANDCC, C_ADDCON, C_NONE, C_NONE, C_REG, 23, 8, 0},
{AANDCC, C_ADDCON, C_REG, C_NONE, C_REG, 23, 8, 0},
{AANDCC, C_LCON, C_NONE, C_NONE, C_REG, 23, 12, 0},
{AANDCC, C_LCON, C_REG, C_NONE, C_REG, 23, 12, 0},
{AANDISCC, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
{AANDISCC, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
{AMULLW, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0},
{AMULLW, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
{AMULLW, C_ADDCON, C_REG, C_NONE, C_REG, 4, 4, 0},
......@@ -128,10 +142,12 @@ var optab = []Optab{
{AOR, C_ANDCON, C_REG, C_NONE, C_REG, 58, 4, 0},
{AOR, C_UCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
{AOR, C_UCON, C_REG, C_NONE, C_REG, 59, 4, 0},
{AORIS, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
{AORIS, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
{AOR, C_ADDCON, C_NONE, C_NONE, C_REG, 23, 8, 0},
{AOR, C_ADDCON, C_REG, C_NONE, C_REG, 23, 8, 0},
{AOR, C_LCON, C_NONE, C_NONE, C_REG, 23, 12, 0},
{AOR, C_LCON, C_REG, C_NONE, C_REG, 23, 12, 0},
{AORIS, C_ANDCON, C_NONE, C_NONE, C_REG, 59, 4, 0},
{AORIS, C_ANDCON, C_REG, C_NONE, C_REG, 59, 4, 0},
{ADIVW, C_REG, C_REG, C_NONE, C_REG, 2, 4, 0}, /* op r1[,r2],r3 */
{ADIVW, C_REG, C_NONE, C_NONE, C_REG, 2, 4, 0},
{ASUB, C_REG, C_REG, C_NONE, C_REG, 10, 4, 0}, /* op r2[,r1],r3 */
......@@ -277,16 +293,19 @@ var optab = []Optab{
{AMOVD, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
{AMOVD, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
{AMOVD, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
{AMOVD, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
{AMOVW, C_SECON, C_NONE, C_NONE, C_REG, 3, 4, REGSB}, /* TO DO: check */
{AMOVW, C_SACON, C_NONE, C_NONE, C_REG, 3, 4, REGSP},
{AMOVW, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
{AMOVW, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
{AMOVW, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
{AMOVW, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
{AMOVWZ, C_SECON, C_NONE, C_NONE, C_REG, 3, 4, REGSB}, /* TO DO: check */
{AMOVWZ, C_SACON, C_NONE, C_NONE, C_REG, 3, 4, REGSP},
{AMOVWZ, C_LECON, C_NONE, C_NONE, C_REG, 26, 8, REGSB},
{AMOVWZ, C_LACON, C_NONE, C_NONE, C_REG, 26, 8, REGSP},
{AMOVWZ, C_ADDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
{AMOVWZ, C_ANDCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
/* load unsigned/long constants (TO DO: check) */
{AMOVD, C_UCON, C_NONE, C_NONE, C_REG, 3, 4, REGZERO},
......@@ -1048,13 +1067,25 @@ func (x ocmp) Swap(i, j int) {
x[i], x[j] = x[j], x[i]
}
// Used when sorting the optab. Sorting is
// done in a way so that the best choice of
// opcode/operand combination is considered first.
func (x ocmp) Less(i, j int) bool {
p1 := &x[i]
p2 := &x[j]
n := int(p1.as) - int(p2.as)
// same opcode
if n != 0 {
return n < 0
}
// Consider those that generate fewer
// instructions first.
n = int(p1.size) - int(p2.size)
if n != 0 {
return n < 0
}
// operand order should match
// better choices first
n = int(p1.a1) - int(p2.a1)
if n != 0 {
return n < 0
......@@ -1073,10 +1104,15 @@ func (x ocmp) Less(i, j int) bool {
}
return false
}
// Add an entry to the opcode table for
// a new opcode b0 with the same operand combinations
// as opcode a.
func opset(a, b0 obj.As) {
oprange[a&obj.AMask] = oprange[b0]
}
// Build the opcode table
func buildop(ctxt *obj.Link) {
if oprange[AANDN&obj.AMask] != nil {
// Already initialized; stop now.
......@@ -2256,7 +2292,7 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
o1 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), uint32(r), uint32(p.From.Reg))
case 3: /* mov $soreg/addcon/ucon, r ==> addis/addi $i,reg',r */
case 3: /* mov $soreg/addcon/andcon/ucon, r ==> addis/oris/addi/ori $i,reg',r */
d := c.vregoff(&p.From)
v := int32(d)
......@@ -2272,6 +2308,8 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
if d&0xffff != 0 {
log.Fatalf("invalid handling of %v", p)
}
// For UCON operands the value is right shifted 16, using ADDIS if the
// value should be signed, ORIS if unsigned.
v >>= 16
if r == REGZERO && isuint32(uint64(d)) {
o1 = LOP_IRR(OP_ORIS, uint32(p.To.Reg), REGZERO, uint32(v))
......@@ -2279,8 +2317,16 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
a = OP_ADDIS
} else {
if int64(int16(d)) != d {
} else if int64(int16(d)) != d {
// Operand is 16 bit value with sign bit set
if o.a1 == C_ANDCON {
// Needs unsigned 16 bit so use ORI
if r == 0 || r == REGZERO {
o1 = LOP_IRR(uint32(OP_ORI), uint32(p.To.Reg), uint32(0), uint32(v))
break
}
// With ADDCON, needs signed 16 bit value, fall through to use ADDI
} else if o.a1 != C_ADDCON {
log.Fatalf("invalid handling of %v", p)
}
}
......@@ -2632,8 +2678,6 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1, o2 = c.symbolAccess(p.From.Sym, d, p.To.Reg, OP_ADDI)
}
//if(dlm) reloc(&p->from, p->pc, 0);
case 20: /* add $ucon,,r | addis $addcon,r,r */
v := c.regoff(&p.From)
......@@ -2650,43 +2694,53 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 = AOP_IRR(c.opirr(AADDIS), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
}
case 22: /* add $lcon,r1,r2 ==> cau+or+add */ /* could do add/sub more efficiently */
case 22: /* add $lcon/$andcon,r1,r2 ==> oris+ori+add/ori+add */
if p.To.Reg == REGTMP || p.Reg == REGTMP {
c.ctxt.Diag("can't synthesize large constant\n%v", p)
}
d := c.vregoff(&p.From)
o1 = loadu32(REGTMP, d)
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
r := int(p.Reg)
if r == 0 {
r = int(p.To.Reg)
}
o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
if p.From.Sym != nil {
c.ctxt.Diag("%v is not supported", p)
}
// If operand is ANDCON, generate 2 instructions using
// ORI for unsigned value; with LCON 3 instructions.
if o.size == 8 {
o1 = LOP_IRR(OP_ORI, REGTMP, REGZERO, uint32(int32(d)))
o2 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
} else {
o1 = loadu32(REGTMP, d)
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
}
//if(dlm) reloc(&p->from, p->pc, 0);
case 23: /* and $lcon,r1,r2 ==> cau+or+and */ /* masks could be done using rlnm etc. */
case 23: /* and $lcon/$addcon,r1,r2 ==> oris+ori+and/addi+and */
if p.To.Reg == REGTMP || p.Reg == REGTMP {
c.ctxt.Diag("can't synthesize large constant\n%v", p)
}
d := c.vregoff(&p.From)
o1 = loadu32(REGTMP, d)
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
r := int(p.Reg)
if r == 0 {
r = int(p.To.Reg)
}
o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
// With ADDCON operand, generate 2 instructions using ADDI for signed value,
// with LCON operand generate 3 instructions.
if o.size == 8 {
o1 = LOP_IRR(OP_ADDI, REGZERO, REGTMP, uint32(int32(d)))
o2 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
} else {
o1 = loadu32(REGTMP, d)
o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d)))
o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r))
}
if p.From.Sym != nil {
c.ctxt.Diag("%v is not supported", p)
}
//if(dlm) reloc(&p->from, p->pc, 0);
/*24*/
case 25:
/* sld[.] $sh,rS,rA -> rldicr[.] $sh,rS,mask(0,63-sh),rA; srd[.] -> rldicl */
v := c.regoff(&p.From)
......@@ -3090,7 +3144,7 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
case AXOR:
o1 = LOP_IRR(c.opirr(AXORIS), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
case AANDCC:
o1 = LOP_IRR(c.opirr(AANDCC), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
o1 = LOP_IRR(c.opirr(AANDISCC), uint32(p.To.Reg), uint32(r), uint32(v)>>16)
default:
o1 = LOP_IRR(c.opirr(p.As), uint32(p.To.Reg), uint32(r), uint32(v))
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment