Commit f876fb9b authored by Cherry Zhang's avatar Cherry Zhang

cmd/compile: move value around before kick it out of register

When allocating registers, before kicking out the existing value,
copy it to a spare register if there is one. So later use of this
value can be found in register instead of reload from spill. This
is very helpful for instructions of which the input and/or output
can only be in specific registers, e.g. DIV on x86, MUL/DIV on
MIPS. May also be helpful in general.

For "go build -a cmd/go" on AMD64, reduce "spilled value remains"
by 1% (not including args, which almost certainly remain).

For the code in issue #16061 on AMD64:
MaxRem-12   111µs ± 1%    94µs ± 0%  -15.38%  (p=0.008 n=5+5)

Go1 benchmark on AMD64:
BinaryTree17-12              2.32s ± 2%     2.30s ± 1%    ~     (p=0.421 n=5+5)
Fannkuch11-12                2.52s ± 0%     2.44s ± 0%  -3.44%  (p=0.008 n=5+5)
FmtFprintfEmpty-12          39.9ns ± 3%    39.8ns ± 0%    ~     (p=0.635 n=5+4)
FmtFprintfString-12          114ns ± 1%     113ns ± 1%    ~     (p=0.905 n=5+5)
FmtFprintfInt-12             102ns ± 6%      98ns ± 1%    ~     (p=0.087 n=5+5)
FmtFprintfIntInt-12          146ns ± 5%     147ns ± 1%    ~     (p=0.238 n=5+5)
FmtFprintfPrefixedInt-12     155ns ± 2%     151ns ± 1%  -2.58%  (p=0.008 n=5+5)
FmtFprintfFloat-12           231ns ± 1%     232ns ± 1%    ~     (p=0.286 n=5+5)
FmtManyArgs-12               657ns ± 1%     649ns ± 0%  -1.31%  (p=0.008 n=5+5)
GobDecode-12                6.35ms ± 0%    6.29ms ± 1%    ~     (p=0.056 n=5+5)
GobEncode-12                5.38ms ± 1%    5.45ms ± 1%    ~     (p=0.056 n=5+5)
Gzip-12                      209ms ± 0%     209ms ± 1%    ~     (p=0.690 n=5+5)
Gunzip-12                   31.2ms ± 1%    31.1ms ± 1%    ~     (p=0.548 n=5+5)
HTTPClientServer-12          123µs ± 4%     130µs ± 8%    ~     (p=0.151 n=5+5)
JSONEncode-12               14.0ms ± 1%    14.0ms ± 1%    ~     (p=0.421 n=5+5)
JSONDecode-12               41.2ms ± 1%    41.1ms ± 2%    ~     (p=0.421 n=5+5)
Mandelbrot200-12            3.96ms ± 1%    3.98ms ± 0%    ~     (p=0.421 n=5+5)
GoParse-12                  2.88ms ± 1%    2.88ms ± 1%    ~     (p=0.841 n=5+5)
RegexpMatchEasy0_32-12      68.0ns ± 3%    66.6ns ± 1%  -2.00%  (p=0.024 n=5+5)
RegexpMatchEasy0_1K-12       728ns ± 8%     682ns ± 1%  -6.26%  (p=0.008 n=5+5)
RegexpMatchEasy1_32-12      66.8ns ± 2%    66.0ns ± 1%    ~     (p=0.302 n=5+5)
RegexpMatchEasy1_1K-12       291ns ± 2%     288ns ± 1%    ~     (p=0.111 n=5+5)
RegexpMatchMedium_32-12      103ns ± 2%     100ns ± 0%  -2.53%  (p=0.016 n=5+4)
RegexpMatchMedium_1K-12     31.9µs ± 1%    31.3µs ± 0%  -1.75%  (p=0.008 n=5+5)
RegexpMatchHard_32-12       1.59µs ± 2%    1.59µs ± 1%    ~     (p=0.548 n=5+5)
RegexpMatchHard_1K-12       48.3µs ± 2%    47.7µs ± 1%    ~     (p=0.222 n=5+5)
Revcomp-12                   340ms ± 1%     338ms ± 1%    ~     (p=0.421 n=5+5)
Template-12                 46.3ms ± 1%    46.5ms ± 1%    ~     (p=0.690 n=5+5)
TimeParse-12                 252ns ± 1%     247ns ± 0%  -1.91%  (p=0.000 n=5+4)
TimeFormat-12                277ns ± 1%     267ns ± 0%  -3.82%  (p=0.008 n=5+5)
[Geo mean]                  48.8µs         48.3µs       -0.93%

It has very little effect on binary size and compiler speed.
compilebench:
Template       230ms ±10%      231ms ± 8%    ~             (p=0.546 n=9+9)
Unicode        123ms ± 6%      124ms ± 9%    ~           (p=0.481 n=10+10)
GoTypes        742ms ± 6%      755ms ± 3%    ~           (p=0.123 n=10+10)
Compiler       3.10s ± 3%      3.08s ± 1%    ~           (p=0.631 n=10+10)

Fixes #16061.

Change-Id: Id99cdc7a182ee10a704fa0f04e8e0d0809b2ac56
Reviewed-on: https://go-review.googlesource.com/29732
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarDavid Chase <drchase@google.com>
parent 9d4b40f5
......@@ -242,6 +242,9 @@ type regAllocState struct {
// mask of registers currently in use
used regMask
// mask of registers used in the current instruction
tmpused regMask
// current block we're working on
curBlock *Block
......@@ -259,6 +262,10 @@ type regAllocState struct {
// spillLive[blockid] is the set of live spills at the end of each block
spillLive [][]ID
// a set of copies we generated to move things around, and
// whether it is used in shuffle. Unused copies will be deleted.
copies map[*Value]bool
loopnest *loopnest
}
......@@ -377,6 +384,21 @@ func (s *regAllocState) allocReg(mask regMask, v *Value) register {
if maxuse == -1 {
s.f.Fatalf("couldn't find register to spill")
}
// Try to move it around before kicking out, if there is a free register.
// We generate a Copy and record it. It will be deleted if never used.
v2 := s.regs[r].v
m := s.compatRegs(v2.Type) &^ s.used &^ s.tmpused &^ (regMask(1) << r)
if countRegs(s.values[v2.ID].regs) == 1 && m != 0 {
r2 := pickReg(m)
c := s.curBlock.NewValue1(v2.Line, OpCopy, v2.Type, s.regs[r].c)
s.copies[c] = false
if s.f.pass.debug > regDebug {
fmt.Printf("copy %s to %s : %s\n", v2, c, s.registers[r2].Name())
}
s.setOrig(c, v2)
s.assignReg(r2, v2, c)
}
s.freeReg(r)
return r
}
......@@ -526,6 +548,7 @@ func (s *regAllocState) init(f *Func) {
s.regs = make([]regState, s.numRegs)
s.values = make([]valState, f.NumValues())
s.orig = make([]*Value, f.NumValues())
s.copies = make(map[*Value]bool)
for _, b := range f.Blocks {
for _, v := range b.Values {
if !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && !v.Type.IsTuple() {
......@@ -627,6 +650,9 @@ func (s *regAllocState) setState(regs []endReg) {
// compatRegs returns the set of registers which can store a type t.
func (s *regAllocState) compatRegs(t Type) regMask {
var m regMask
if t.IsTuple() || t.IsFlags() {
return 0
}
if t.IsFloat() || t == TypeInt128 {
m = s.f.Config.fpRegMask
} else {
......@@ -1163,7 +1189,8 @@ func (s *regAllocState) regalloc(f *Func) {
for _, r := range dinfo[idx].in[0] {
if r != noRegister && m>>r&1 != 0 {
m = regMask(1) << r
s.allocValToReg(v.Args[0], m, true, v.Line)
c := s.allocValToReg(v.Args[0], m, true, v.Line)
s.copies[c] = false
// Note: no update to args[0] so the instruction will
// use the original copy.
goto ok
......@@ -1173,7 +1200,8 @@ func (s *regAllocState) regalloc(f *Func) {
for _, r := range dinfo[idx].in[1] {
if r != noRegister && m>>r&1 != 0 {
m = regMask(1) << r
s.allocValToReg(v.Args[1], m, true, v.Line)
c := s.allocValToReg(v.Args[1], m, true, v.Line)
s.copies[c] = false
args[0], args[1] = args[1], args[0]
goto ok
}
......@@ -1184,21 +1212,24 @@ func (s *regAllocState) regalloc(f *Func) {
m &^= desired.avoid
}
// Save input 0 to a new register so we can clobber it.
s.allocValToReg(v.Args[0], m, true, v.Line)
ok:
c := s.allocValToReg(v.Args[0], m, true, v.Line)
s.copies[c] = false
}
ok:
// Now that all args are in regs, we're ready to issue the value itself.
// Before we pick a register for the output value, allow input registers
// to be deallocated. We do this here so that the output can use the
// same register as a dying input.
if !opcodeTable[v.Op].resultNotInArgs {
s.tmpused = s.nospill
s.nospill = 0
s.advanceUses(v) // frees any registers holding args that are no longer live
}
// Dump any registers which will be clobbered
s.freeRegs(regspec.clobbers)
s.tmpused |= regspec.clobbers
// Pick registers for outputs.
{
......@@ -1250,6 +1281,7 @@ func (s *regAllocState) regalloc(f *Func) {
r := s.allocReg(mask, v)
outRegs[out.idx] = r
used |= regMask(1) << r
s.tmpused |= regMask(1) << r
}
// Record register choices
if v.Type.IsTuple() {
......@@ -1274,6 +1306,7 @@ func (s *regAllocState) regalloc(f *Func) {
s.nospill = 0
s.advanceUses(v) // frees any registers holding args that are no longer live
}
s.tmpused = 0
// Issue the Value itself.
for i, a := range args {
......@@ -1314,6 +1347,10 @@ func (s *regAllocState) regalloc(f *Func) {
// type-compatible register. If this turns out not to be true,
// we'll need to introduce a regspec for a block's control value.
b.Control = s.allocValToReg(v, s.compatRegs(v.Type), false, b.Line)
if b.Control != v {
v.Uses--
b.Control.Uses++
}
// Remove this use from the uses list.
vi := &s.values[v.ID]
u := vi.uses
......@@ -1512,7 +1549,7 @@ func (s *regAllocState) regalloc(f *Func) {
for i := range s.values {
vi := s.values[i]
if vi.spillUsed {
if s.f.pass.debug > logSpills {
if s.f.pass.debug > logSpills && vi.spill.Op != OpArg {
s.f.Config.Warnl(vi.spill.Line, "spilled value at %v remains", vi.spill)
}
continue
......@@ -1670,6 +1707,29 @@ sinking:
}
}
// Erase any copies we never used
for c, used := range s.copies {
if !used && c.Uses == 0 {
if s.f.pass.debug > regDebug {
fmt.Printf("delete copied value %s\n", c.LongString())
}
c.Args[0].Uses--
f.freeValue(c)
}
}
for _, b := range f.Blocks {
i := 0
for _, v := range b.Values {
if v.Op == OpInvalid {
continue
}
b.Values[i] = v
i++
}
b.Values = b.Values[:i]
}
if f.pass.stats > 0 {
f.LogStat("spills_info",
nSpills, "spills", nSpillsInner, "inner_spills_remaining", nSpillsSunk, "inner_spills_sunk", nSpillsSunkUnused, "inner_spills_unused", nSpillsNotSunkLateUse, "inner_spills_shuffled", nSpillsChanged, "inner_spills_changed")
......@@ -1904,6 +1964,10 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, line int32
// Note: if splice==nil then c will appear dead. This is
// non-SSA formed code, so be careful after this pass not to run
// deadcode elimination.
if _, ok := e.s.copies[occupant.c]; ok {
// The copy at occupant.c was used to avoid spill.
e.s.copies[occupant.c] = true
}
return true
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment