Commit 886e9e60 authored by David Chase, committed by Keith Randall

cmd/compile: put spills in better places

Previously we always issued a spill right after the op
that was being spilled.  This CL pushes spills farther away
from the generator, hopefully pushing them into unlikely branches.
For example:

  x = ...
  if unlikely {
    call ...
  }
  ... use x ...

Used to compile to

  x = ...
  spill x
  if unlikely {
    call ...
    restore x
  }

It now compiles to

  x = ...
  if unlikely {
    spill x
    call ...
    restore x
  }

This is particularly useful for code that appends, as the only
call is an unlikely call to growslice.  It also helps with the
spills needed around write barrier calls.
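
A minimal illustration (an example of mine, not code from the CL): in
the loop below, append's fast path is call-free; the only call is the
unlikely slow path that grows the backing array (growslice).  Spills of
loop-live values can therefore land on that path instead of executing
on every iteration:

  func collect(n int) []int {
      s := make([]int, 0, 8)
      for i := 0; i < n; i++ {
          // The only call in this loop body is the unlikely
          // growslice path inside append, so spill/restore
          // pairs can now sit on that path alone.
          s = append(s, i)
      }
      return s
  }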

The basic algorithm is to walk down the dominator tree, following a
path where each block still dominates all of the restores.  We're
looking for a block that:
 1) dominates all restores
 2) has the value being spilled in a register
 3) has a loop depth no deeper than the value being spilled

The walking-down code is iterative.  I was forced to limit it to
searching 100 blocks so it doesn't become O(n^2).  Maybe one day
we'll find a better way.
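
Concretely, the search loop is roughly the following (a sketch only;
the helpers are stand-ins of mine, not the CL's code — the real
version is in placeSpills in the diff below):

  best := v.Block // issuing the spill right after v always works
  bestDepth := loopDepth(best)
  b := best
  for i := 0; i < 100; i++ {
      // Step to the child of b in the dominator tree that still
      // dominates all restores; the test uses the sdom entry/exit
      // numbering: entry(c) <= restoreMin && exit(c) >= restoreMax.
      c := childDominatingAllRestores(b)
      if c == nil {
          break // no child dominates all restores; stop
      }
      b = c
      if loopDepth(b) > bestDepth {
          continue // don't push the spill into a deeper loop
      }
      if inRegisterAtStart(v, b) {
          best, bestDepth = b, loopDepth(b) // found a better spot
      }
  }
  // The spill goes right after v when best == v.Block, and at the
  // start of best (after any phis) otherwise.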

I had to delete most of David's code which pushed spills out of loops.
I suspect this CL subsumes most of the cases that his code handled.

Generally positive performance improvements, but hard to tell for sure
with all the noise.  (compilebench times are unchanged.)

name                      old time/op    new time/op    delta
BinaryTree17-12              2.91s ±15%     2.80s ±12%    ~     (p=0.063 n=10+10)
Fannkuch11-12                3.47s ± 0%     3.30s ± 4%  -4.91%   (p=0.000 n=9+10)
FmtFprintfEmpty-12          48.0ns ± 1%    47.4ns ± 1%  -1.32%    (p=0.002 n=9+9)
FmtFprintfString-12         85.6ns ±11%    79.4ns ± 3%  -7.27%  (p=0.005 n=10+10)
FmtFprintfInt-12            91.8ns ±10%    85.9ns ± 4%    ~      (p=0.203 n=10+9)
FmtFprintfIntInt-12          135ns ±13%     127ns ± 1%  -5.72%   (p=0.025 n=10+9)
FmtFprintfPrefixedInt-12     167ns ± 1%     168ns ± 2%    ~      (p=0.580 n=9+10)
FmtFprintfFloat-12           249ns ±11%     230ns ± 1%  -7.32%  (p=0.000 n=10+10)
FmtManyArgs-12               504ns ± 7%     506ns ± 1%    ~       (p=0.198 n=9+9)
GobDecode-12                6.95ms ± 1%    7.04ms ± 1%  +1.37%  (p=0.001 n=10+10)
GobEncode-12                6.32ms ±13%    6.04ms ± 1%    ~     (p=0.063 n=10+10)
Gzip-12                      233ms ± 1%     235ms ± 0%  +1.01%   (p=0.000 n=10+9)
Gunzip-12                   40.1ms ± 1%    39.6ms ± 0%  -1.12%   (p=0.000 n=10+8)
HTTPClientServer-12          227µs ± 9%     221µs ± 5%    ~       (p=0.114 n=9+8)
JSONEncode-12               16.1ms ± 2%    15.8ms ± 1%  -2.09%    (p=0.002 n=9+8)
JSONDecode-12               61.8ms ±11%    57.9ms ± 1%  -6.30%   (p=0.000 n=10+9)
Mandelbrot200-12            4.30ms ± 3%    4.28ms ± 1%    ~      (p=0.203 n=10+8)
GoParse-12                  3.18ms ± 2%    3.18ms ± 2%    ~     (p=0.579 n=10+10)
RegexpMatchEasy0_32-12      76.7ns ± 1%    77.5ns ± 1%  +0.92%    (p=0.002 n=9+8)
RegexpMatchEasy0_1K-12       239ns ± 3%     239ns ± 1%    ~     (p=0.204 n=10+10)
RegexpMatchEasy1_32-12      71.4ns ± 1%    70.6ns ± 0%  -1.15%   (p=0.000 n=10+9)
RegexpMatchEasy1_1K-12       383ns ± 2%     390ns ±10%    ~       (p=0.181 n=8+9)
RegexpMatchMedium_32-12      114ns ± 0%     113ns ± 1%  -0.88%    (p=0.000 n=9+8)
RegexpMatchMedium_1K-12     36.3µs ± 1%    36.8µs ± 1%  +1.59%   (p=0.000 n=10+8)
RegexpMatchHard_32-12       1.90µs ± 1%    1.90µs ± 1%    ~     (p=0.341 n=10+10)
RegexpMatchHard_1K-12       59.4µs ±11%    57.8µs ± 1%    ~      (p=0.968 n=10+9)
Revcomp-12                   461ms ± 1%     462ms ± 1%    ~       (p=1.000 n=9+9)
Template-12                 67.5ms ± 1%    66.3ms ± 1%  -1.77%   (p=0.000 n=10+8)
TimeParse-12                 314ns ± 3%     309ns ± 0%  -1.56%    (p=0.000 n=9+8)
TimeFormat-12                340ns ± 2%     331ns ± 1%  -2.79%  (p=0.000 n=10+10)

The go binary is 0.2% larger.  Not really sure why the size
would change.

Change-Id: Ia5116e53a3aeb025ef350ffc51c14ae5cc17871c
Reviewed-on: https://go-review.googlesource.com/34822
Reviewed-by: David Chase <drchase@google.com>
parent 710f4d3e
@@ -296,7 +296,8 @@ func dominatorsSimple(f *Func) []*Block {
// intersect finds the closest dominator of both b and c.
// It requires a postorder numbering of all the blocks.
func intersect(b, c *Block, postnum []int, idom []*Block) *Block {
// TODO: This loop is O(n^2). See BenchmarkNilCheckDeep*.
// TODO: This loop is O(n^2). It used to be used in nilcheck,
// see BenchmarkNilCheckDeep*.
for b != c {
if postnum[b.ID] < postnum[c.ID] {
b = idom[b.ID]
@@ -104,6 +104,31 @@ func (f *Func) newValue(op Op, t Type, b *Block, pos src.XPos) *Value {
return v
}
// newValueNoBlock allocates a new Value with the given fields.
// The returned value is not placed in any block. Once the caller
// decides on a block b, it must set v.Block and append
// the returned value to b.Values.
func (f *Func) newValueNoBlock(op Op, t Type, pos src.XPos) *Value {
var v *Value
if f.freeValues != nil {
v = f.freeValues
f.freeValues = v.argstorage[0]
v.argstorage[0] = nil
} else {
ID := f.vid.get()
if int(ID) < len(f.Config.values) {
v = &f.Config.values[ID]
} else {
v = &Value{ID: ID}
}
}
v.Op = op
v.Type = t
v.Block = nil // caller must fix this.
v.Pos = pos
return v
}
// logPassStat writes a string key and int value as a warning in a
// tab-separated format easily handled by spreadsheets or awk.
// file names, lines, and function names are included to provide enough (?)
@@ -26,28 +26,48 @@
// Spilling
//
// For every value, we generate a spill immediately after the value itself.
// x = Op y z : AX
// x2 = StoreReg x
// While AX still holds x, any uses of x will use that value. When AX is needed
// for another value, we simply reuse AX. Spill code has already been generated
// so there is no code generated at "spill" time. When x is referenced
// subsequently, we issue a load to restore x to a register using x2 as
// its argument:
// x3 = Restore x2 : CX
// x3 can then be used wherever x is referenced again.
// If the spill (x2) is never used, it will be removed at the end of regalloc.
// During the normal course of the allocator, we might throw a still-live
// value out of all registers. When that value is subsequently used, we must
// load it from a slot on the stack. We must also issue an instruction to
// initialize that stack location with a copy of v.
//
// pre-regalloc:
// (1) v = Op ...
// (2) x = Op ...
// (3) ... = Op v ...
//
// post-regalloc:
// (1) v = Op ... : AX // computes v, store result in AX
// s = StoreReg v // spill v to a stack slot
// (2) x = Op ... : AX // some other op uses AX
// c = LoadReg s : CX // restore v from stack slot
// (3) ... = Op c ... // use the restored value
//
// Allocation occurs normally until we reach (3) and we realize we have
// a use of v and it isn't in any register. At that point, we allocate
// a spill (a StoreReg) for v. We can't determine the correct place for
// the spill at this point, so we allocate the spill as blockless initially.
// The restore is then generated to load v back into a register so it can
// be used. Subsequent uses of v will use the restored value c instead.
//
// What remains is the question of where to schedule the spill.
// During allocation, we keep track of the dominator of all restores of v.
// The spill of v must dominate that block. The spill must also be issued at
// a point where v is still in a register.
//
// To find the right place, start at b, the block which dominates all restores.
// - If b is v.Block, then issue the spill right after v.
// It is known to be in a register at that point, and dominates any restores.
// - Otherwise, if v is in a register at the start of b,
// put the spill of v at the start of b.
// - Otherwise, set b = immediate dominator of b, and repeat.
//
// Phi values are special, as always. We define two kinds of phis, those
// where the merge happens in a register (a "register" phi) and those where
// the merge happens in a stack location (a "stack" phi).
//
// A register phi must have the phi and all of its inputs allocated to the
// same register. Register phis are spilled similarly to regular ops:
// b1: y = ... : AX b2: z = ... : AX
// goto b3 goto b3
// b3: x = phi(y, z) : AX
// x2 = StoreReg x
// same register. Register phis are spilled similarly to regular ops.
//
// A stack phi must have the phi and all of its inputs allocated to the same
// stack location. Stack phis start out life already spilled - each phi
@@ -91,18 +111,6 @@
// will have no use (so don't run deadcode after regalloc!).
// TODO: maybe we should introduce these extra phis?
// Additional not-quite-SSA output occurs when spills are sunk out
// of loops to the targets of exit edges from the loop. Before sinking,
// there is one spill site (one StoreReg) targeting stack slot X, after
// sinking there may be multiple spill sites targeting stack slot X,
// with no phi functions at any join points reachable by the multiple
// spill sites. In addition, uses of the spill from copies of the original
// will not name the copy in their reference; instead they will name
// the original, though both will have the same spill location. The
// first sunk spill will be the original, but moved, to an exit block,
// thus ensuring that there is a definition somewhere corresponding to
// the original spill's uses.
package ssa
import (
@@ -186,14 +194,15 @@ type use struct {
next *use // linked list of uses of a value in nondecreasing dist order
}
// A valState records the register allocation state for a (pre-regalloc) value.
type valState struct {
regs regMask // the set of registers holding a Value (usually just one)
uses *use // list of uses in this block
spill *Value // spilled copy of the Value
spillUsed bool
spillUsedShuffle bool // true if used in shuffling, after ordinary uses
needReg bool // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags()
rematerializeable bool // cached value of v.rematerializeable()
spill *Value // spilled copy of the Value (if any)
restoreMin int32 // minimum of all restores' blocks' sdom.entry
restoreMax int32 // maximum of all restores' blocks' sdom.exit
needReg bool // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags()
rematerializeable bool // cached value of v.rematerializeable()
}
type regState struct {
@@ -205,6 +214,7 @@ type regState struct {
type regAllocState struct {
f *Func
sdom SparseTree
registers []Register
numRegs register
SPReg register
@@ -271,15 +281,6 @@ type regAllocState struct {
loopnest *loopnest
}
type spillToSink struct {
spill *Value // Spill instruction to move (a StoreReg)
dests int32 // Bitmask indicating exit blocks from loop in which spill/val is defined. 1<<i set means val is live into loop.exitBlocks[i]
}
func (sts *spillToSink) spilledValue() *Value {
return sts.spill.Args[0]
}
type endReg struct {
r register
v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
@@ -288,7 +289,8 @@ type endReg struct {
type startReg struct {
r register
vid ID // pre-regalloc value needed in this register
v *Value // pre-regalloc value needed in this register
c *Value // cached version of the value
pos src.XPos // source position of use of this register
}
@@ -405,6 +407,28 @@ func (s *regAllocState) allocReg(mask regMask, v *Value) register {
return r
}
// makeSpill returns a Value which represents the spilled value of v.
// b is the block in which the spill is used.
func (s *regAllocState) makeSpill(v *Value, b *Block) *Value {
vi := &s.values[v.ID]
if vi.spill != nil {
// Final block not known - keep track of subtree where restores reside.
vi.restoreMin = min32(vi.restoreMin, s.sdom[b.ID].entry)
vi.restoreMax = max32(vi.restoreMax, s.sdom[b.ID].exit)
return vi.spill
}
// Make a spill for v. We don't know where we want
// to put it yet, so we leave it blockless for now.
spill := s.f.newValueNoBlock(OpStoreReg, v.Type, v.Pos)
// We also don't know what the spill's arg will be.
// Leave it argless for now.
s.setOrig(spill, v)
vi.spill = spill
vi.restoreMin = s.sdom[b.ID].entry
vi.restoreMax = s.sdom[b.ID].exit
return spill
}
// allocValToReg allocates v to a register selected from regMask and
// returns the register copy of v. Any previous user is kicked out and spilled
// (if necessary). Load code is added at the current pc. If nospill is set the
@@ -442,17 +466,12 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos
// Rematerialize instead of loading from the spill location.
c = v.copyInto(s.curBlock)
} else {
switch {
// Load v from its spill location.
case vi.spill != nil:
if s.f.pass.debug > logSpills {
s.f.Config.Warnl(vi.spill.Pos, "load spill for %v from %v", v, vi.spill)
}
c = s.curBlock.NewValue1(pos, OpLoadReg, v.Type, vi.spill)
vi.spillUsed = true
default:
s.f.Fatalf("attempt to load unspilled value %v", v.LongString())
spill := s.makeSpill(v, s.curBlock)
if s.f.pass.debug > logSpills {
s.f.Config.Warnl(vi.spill.Pos, "load spill for %v from %v", v, spill)
}
c = s.curBlock.NewValue1(pos, OpLoadReg, v.Type, spill)
}
s.setOrig(c, v)
s.assignReg(r, v, c)
@@ -615,6 +634,7 @@ func (s *regAllocState) init(f *Func) {
s.endRegs = make([][]endReg, f.NumBlocks())
s.startRegs = make([][]startReg, f.NumBlocks())
s.spillLive = make([][]ID, f.NumBlocks())
s.sdom = f.sdom()
}
// Adds a use record for id at distance dist from the start of the block.
@@ -688,22 +708,6 @@ func (s *regAllocState) compatRegs(t Type) regMask {
return m & s.allocatable
}
// loopForBlock returns the loop containing block b,
// provided that the loop is "interesting" for purposes
// of improving register allocation (= is inner, and does
// not contain a call)
func (s *regAllocState) loopForBlock(b *Block) *loop {
loop := s.loopnest.b2l[b.ID]
// Minor for-the-time-being optimization: nothing happens
// unless a loop is both inner and call-free, therefore
// don't bother with other loops.
if loop != nil && (loop.containsCall || !loop.isInner) {
loop = nil
}
return loop
}
func (s *regAllocState) regalloc(f *Func) {
liveSet := f.newSparseSet(f.NumValues())
defer f.retSparseSet(liveSet)
@@ -712,14 +716,6 @@ func (s *regAllocState) regalloc(f *Func) {
var phiRegs []register
var args []*Value
// statistics
var nSpills int // # of spills remaining
var nSpillsInner int // # of spills remaining in inner loops
var nSpillsSunk int // # of sunk spills remaining
var nSpillsChanged int // # of sunk spills lost because of register use change
var nSpillsSunkUnused int // # of spills not sunk because they were removed completely
var nSpillsNotSunkLateUse int // # of spills not sunk because of very late use (in shuffle)
// Data structure used for computing desired registers.
var desired desiredState
@@ -734,24 +730,8 @@ func (s *regAllocState) regalloc(f *Func) {
f.Fatalf("entry block must be first")
}
// Get loop nest so that spills in inner loops can be
// tracked. When the last block of a loop is processed,
// attempt to move spills out of the loop.
s.loopnest.findExits()
// Spills are moved from one block's slice of values to another's.
// This confuses register allocation if it occurs before it is
// complete, so candidates are recorded, then rechecked and
// moved after all allocation (register and stack) is complete.
// Because movement is only within a stack slot's lifetime, it
// is safe to do this.
var toSink []spillToSink
// Will be used to figure out live inputs to exit blocks of inner loops.
entryCandidates := newSparseMap(f.NumValues())
for _, b := range f.Blocks {
s.curBlock = b
loop := s.loopForBlock(b)
// Initialize liveSet and uses fields for this block.
// Walk backwards through the block doing liveness analysis.
@@ -945,22 +925,11 @@ func (s *regAllocState) regalloc(f *Func) {
if r == noRegister {
// stack-based phi
// Spills will be inserted in all the predecessors below.
s.values[v.ID].spill = v // v starts life spilled
s.values[v.ID].spillUsed = true // use is guaranteed
s.values[v.ID].spill = v // v starts life spilled
continue
}
// register-based phi
s.assignReg(r, v, v)
// Spill the phi in case we need to restore it later.
spill := b.NewValue1(v.Pos, OpStoreReg, v.Type, v)
s.setOrig(spill, v)
s.values[v.ID].spill = spill
s.values[v.ID].spillUsed = false
if loop != nil {
loop.spills = append(loop.spills, v)
nSpillsInner++
}
nSpills++
}
// Save the starting state for use by merge edges.
@@ -975,14 +944,14 @@ func (s *regAllocState) regalloc(f *Func) {
// specially during merge edge processing.
continue
}
regList = append(regList, startReg{r, v.ID, s.values[v.ID].uses.pos})
regList = append(regList, startReg{r, v, s.regs[r].c, s.values[v.ID].uses.pos})
}
s.startRegs[b.ID] = regList
if s.f.pass.debug > regDebug {
fmt.Printf("after phis\n")
for _, x := range s.startRegs[b.ID] {
fmt.Printf(" %s: v%d\n", s.registers[x.r].Name(), x.vid)
fmt.Printf(" %s: v%d\n", s.registers[x.r].Name(), x.v.ID)
}
}
}
@@ -1005,7 +974,7 @@ func (s *regAllocState) regalloc(f *Func) {
succ := e.b
// TODO: prioritize likely successor?
for _, x := range s.startRegs[succ.ID] {
desired.add(x.vid, x.r)
desired.add(x.v.ID, x.r)
}
// Process phi ops in succ.
pidx := e.i
@@ -1100,7 +1069,6 @@ func (s *regAllocState) regalloc(f *Func) {
// any register here. We just set up the spill pointer to
// point at itself and any later user will restore it to use it.
s.values[v.ID].spill = v
s.values[v.ID].spillUsed = true // use is guaranteed
b.Values = append(b.Values, v)
s.advanceUses(v)
continue
@@ -1109,7 +1077,7 @@ func (s *regAllocState) regalloc(f *Func) {
// Make sure the argument to v is still live here.
s.advanceUses(v)
vi := &s.values[v.Args[0].ID]
if vi.spillUsed {
if vi.spill != nil {
// Use the spill location.
v.SetArg(0, vi.spill)
} else {
@@ -1373,28 +1341,7 @@ func (s *regAllocState) regalloc(f *Func) {
}
b.Values = append(b.Values, v)
// Issue a spill for this value. We issue spills unconditionally,
// then at the end of regalloc delete the ones we never use.
// TODO: schedule the spill at a point that dominates all restores.
// The restore may be off in an unlikely branch somewhere and it
// would be better to have the spill in that unlikely branch as well.
// v := ...
// if unlikely {
// f()
// }
// It would be good to have both spill and restore inside the IF.
issueSpill:
if s.values[v.ID].needReg {
spill := b.NewValue1(v.Pos, OpStoreReg, v.Type, v)
s.setOrig(spill, v)
s.values[v.ID].spill = spill
s.values[v.ID].spillUsed = false
if loop != nil {
loop.spills = append(loop.spills, v)
nSpillsInner++
}
nSpills++
}
}
// Load control value into reg.
@@ -1504,90 +1451,23 @@ func (s *regAllocState) regalloc(f *Func) {
}
// If a value is live at the end of the block and
// isn't in a register, remember that its spill location
// is live. We need to remember this information so that
// isn't in a register, generate a use for the spill location.
// We need to remember this information so that
// the liveness analysis in stackalloc is correct.
for _, e := range s.live[b.ID] {
if s.values[e.ID].regs != 0 {
vi := &s.values[e.ID]
if vi.regs != 0 {
// in a register, we'll use that source for the merge.
continue
}
spill := s.values[e.ID].spill
if spill == nil {
// rematerializeable values will have spill==nil.
if vi.rematerializeable {
// we'll rematerialize during the merge.
continue
}
//fmt.Printf("live-at-end spill for %s at %s\n", s.orig[e.ID], b)
spill := s.makeSpill(s.orig[e.ID], b)
s.spillLive[b.ID] = append(s.spillLive[b.ID], spill.ID)
s.values[e.ID].spillUsed = true
}
// Keep track of values that are spilled in the loop, but whose spill
// is not used in the loop. It may be possible to move ("sink") the
// spill out of the loop into one or more exit blocks.
if loop != nil {
loop.scratch++ // increment count of blocks in this loop that have been processed
if loop.scratch == loop.nBlocks { // just processed last block of loop, if it is an inner loop.
// This check is redundant with code at the top of the loop.
// This is definitive; the one at the top of the loop is an optimization.
if loop.isInner && // Common case, easier, most likely to be profitable
!loop.containsCall && // Calls force spills, also lead to puzzling spill info.
len(loop.exits) <= 32 { // Almost no inner loops have more than 32 exits,
// and this allows use of a bitvector and a sparseMap.
// TODO: exit calculation is messed up for non-inner loops
// because of multilevel exits that are not part of the "exit"
// count.
// Compute the set of spill-movement candidates live at entry to exit blocks.
// isLoopSpillCandidate filters for
// (1) defined in appropriate loop
// (2) needs a register
// (3) spill not already used (in the loop)
// Condition (3) === "in a register at all loop exits"
entryCandidates.clear()
for whichExit, ss := range loop.exits {
// Start with live at end.
for _, li := range s.live[ss.ID] {
if s.isLoopSpillCandidate(loop, s.orig[li.ID]) {
// s.live contains original IDs, use s.orig above to map back to *Value
entryCandidates.setBit(li.ID, uint(whichExit))
}
}
// Control can also be live.
if ss.Control != nil && s.orig[ss.Control.ID] != nil && s.isLoopSpillCandidate(loop, s.orig[ss.Control.ID]) {
entryCandidates.setBit(s.orig[ss.Control.ID].ID, uint(whichExit))
}
// Walk backwards, filling in locally live values, removing those defined.
for i := len(ss.Values) - 1; i >= 0; i-- {
v := ss.Values[i]
vorig := s.orig[v.ID]
if vorig != nil {
entryCandidates.remove(vorig.ID) // Cannot be an issue, only keeps the sets smaller.
}
for _, a := range v.Args {
aorig := s.orig[a.ID]
if aorig != nil && s.isLoopSpillCandidate(loop, aorig) {
entryCandidates.setBit(aorig.ID, uint(whichExit))
}
}
}
}
for _, e := range loop.spills {
whichblocks := entryCandidates.get(e.ID)
oldSpill := s.values[e.ID].spill
if whichblocks != 0 && whichblocks != -1 { // -1 = not in map.
toSink = append(toSink, spillToSink{spill: oldSpill, dests: whichblocks})
}
}
} // loop is inner etc
loop.scratch = 0 // Don't leave a mess, just in case.
loop.spills = nil
} // if scratch == nBlocks
} // if loop is not nil
}
// Clear any final uses.
// All that is left should be the pseudo-uses added for values which
@@ -1606,52 +1486,8 @@ func (s *regAllocState) regalloc(f *Func) {
}
}
// Erase any spills we never used
for i := range s.values {
vi := s.values[i]
if vi.spillUsed {
if s.f.pass.debug > logSpills && vi.spill.Op != OpArg {
s.f.Config.Warnl(vi.spill.Pos, "spilled value at %v remains", vi.spill)
}
continue
}
spill := vi.spill
if spill == nil {
// Constants, SP, SB, ...
continue
}
loop := s.loopForBlock(spill.Block)
if loop != nil {
nSpillsInner--
}
spill.Args[0].Uses--
f.freeValue(spill)
nSpills--
}
for _, b := range f.Blocks {
i := 0
for _, v := range b.Values {
if v.Op == OpInvalid {
continue
}
b.Values[i] = v
i++
}
b.Values = b.Values[:i]
// TODO: zero b.Values[i:], recycle Values
// Not important now because this is the last phase that manipulates Values
}
// Must clear these out before any potential recycling, though that's
// not currently implemented.
for i, ts := range toSink {
vsp := ts.spill
if vsp.Op == OpInvalid { // This spill was completely eliminated
toSink[i].spill = nil
}
}
// Decide where the spills we generated will go.
s.placeSpills()
// Anything that didn't get a register gets a stack location here.
// (StoreReg, stack-based phis, inputs, ...)
@@ -1660,114 +1496,6 @@ func (s *regAllocState) regalloc(f *Func) {
// Fix up all merge edges.
s.shuffle(stacklive)
// Insert moved spills (that have not been marked invalid above)
// at start of appropriate block and remove the originals from their
// location within loops. Notice that this can break SSA form;
// if a spill is sunk to multiple exits, there will be no phi for that
// spill at a join point downstream of those two exits, though the
// two spills will target the same stack slot. Notice also that this
// takes place after stack allocation, so the stack allocator does
// not need to process these malformed flow graphs.
sinking:
for _, ts := range toSink {
vsp := ts.spill
if vsp == nil { // This spill was completely eliminated
nSpillsSunkUnused++
continue sinking
}
e := ts.spilledValue()
if s.values[e.ID].spillUsedShuffle {
nSpillsNotSunkLateUse++
continue sinking
}
// move spills to a better (outside of loop) block.
// This would be costly if it occurred very often, but it doesn't.
b := vsp.Block
loop := s.loopnest.b2l[b.ID]
dests := ts.dests
// Pre-check to be sure that spilled value is still in expected register on all exits where live.
check_val_still_in_reg:
for i := uint(0); i < 32 && dests != 0; i++ {
if dests&(1<<i) == 0 {
continue
}
dests ^= 1 << i
d := loop.exits[i]
if len(d.Preds) > 1 {
panic("Should be impossible given critical edges removed")
}
p := d.Preds[0].b // block in loop exiting to d.
endregs := s.endRegs[p.ID]
for _, regrec := range endregs {
if regrec.v == e && regrec.r != noRegister && regrec.c == e { // TODO: regrec.c != e implies different spill possible.
continue check_val_still_in_reg
}
}
// If here, the register assignment was lost down at least one exit and it can't be sunk
if s.f.pass.debug > moveSpills {
s.f.Config.Warnl(e.Pos, "lost register assignment for spill %v in %v at exit %v to %v",
vsp, b, p, d)
}
nSpillsChanged++
continue sinking
}
nSpillsSunk++
nSpillsInner--
// don't update nSpills, since spill is only moved, and if it is duplicated, the spills-on-a-path is not increased.
dests = ts.dests
// remove vsp from b.Values
i := 0
for _, w := range b.Values {
if vsp == w {
continue
}
b.Values[i] = w
i++
}
b.Values = b.Values[:i]
first := true
for i := uint(0); i < 32 && dests != 0; i++ {
if dests&(1<<i) == 0 {
continue
}
dests ^= 1 << i
d := loop.exits[i]
vspnew := vsp // reuse original for first sunk spill, saves tracking down and renaming uses
if !first { // any sunk spills after first must make a copy
vspnew = d.NewValue1(e.Pos, OpStoreReg, e.Type, e)
f.setHome(vspnew, f.getHome(vsp.ID)) // copy stack home
if s.f.pass.debug > moveSpills {
s.f.Config.Warnl(e.Pos, "copied spill %v in %v for %v to %v in %v",
vsp, b, e, vspnew, d)
}
} else {
first = false
vspnew.Block = d
d.Values = append(d.Values, vspnew)
if s.f.pass.debug > moveSpills {
s.f.Config.Warnl(e.Pos, "moved spill %v in %v for %v to %v in %v",
vsp, b, e, vspnew, d)
}
}
// shuffle vspnew to the beginning of its block
copy(d.Values[1:], d.Values[0:len(d.Values)-1])
d.Values[0] = vspnew
}
}
// Erase any copies we never used.
// Also, an unused copy might be the only use of another copy,
// so continue erasing until we reach a fixed point.
@@ -1800,38 +1528,143 @@ sinking:
}
b.Values = b.Values[:i]
}
}
func (s *regAllocState) placeSpills() {
f := s.f
if f.pass.stats > 0 {
f.LogStat("spills_info",
nSpills, "spills", nSpillsInner, "inner_spills_remaining", nSpillsSunk, "inner_spills_sunk", nSpillsSunkUnused, "inner_spills_unused", nSpillsNotSunkLateUse, "inner_spills_shuffled", nSpillsChanged, "inner_spills_changed")
// Precompute some useful info.
phiRegs := make([]regMask, f.NumBlocks())
for _, b := range f.Blocks {
var m regMask
for _, v := range b.Values {
if v.Op != OpPhi {
break
}
if r, ok := f.getHome(v.ID).(*Register); ok {
m |= regMask(1) << uint(r.num)
}
}
phiRegs[b.ID] = m
}
}
// isLoopSpillCandidate indicates whether the spill for v satisfies preliminary
// spill-sinking conditions just after the last block of loop has been processed.
// In particular:
// v needs a register.
// v's spill is not (YET) used.
// v's definition is within loop.
// The spill may be used in the future, either by an outright use
// in the code, or by shuffling code inserted after stack allocation.
// Outright uses cause sinking; shuffling (within the loop) inhibits it.
func (s *regAllocState) isLoopSpillCandidate(loop *loop, v *Value) bool {
return s.values[v.ID].needReg && !s.values[v.ID].spillUsed && s.loopnest.b2l[v.Block.ID] == loop
}
// Start maps block IDs to the list of spills
// that go at the start of the block (but after any phis).
start := map[ID][]*Value{}
// After maps value IDs to the list of spills
// that go immediately after that value ID.
after := map[ID][]*Value{}
for i := range s.values {
vi := s.values[i]
spill := vi.spill
if spill == nil {
continue
}
if spill.Block != nil {
// Some spills are already fully set up,
// like OpArgs and stack-based phis.
continue
}
v := s.orig[i]
// Walk down the dominator tree looking for a good place to
// put the spill of v. At the start "best" is the best place
// we have found so far.
// TODO: find a way to make this O(1) without arbitrary cutoffs.
best := v.Block
bestArg := v
var bestDepth int16
if l := s.loopnest.b2l[best.ID]; l != nil {
bestDepth = l.depth
}
b := best
const maxSpillSearch = 100
for i := 0; i < maxSpillSearch; i++ {
// Find the child of b in the dominator tree which
// dominates all restores.
p := b
b = nil
for c := s.sdom.Child(p); c != nil && i < maxSpillSearch; c, i = s.sdom.Sibling(c), i+1 {
if s.sdom[c.ID].entry <= vi.restoreMin && s.sdom[c.ID].exit >= vi.restoreMax {
// c also dominates all restores. Walk down into c.
b = c
break
}
}
if b == nil {
// Ran out of blocks which dominate all restores.
break
}
var depth int16
if l := s.loopnest.b2l[b.ID]; l != nil {
depth = l.depth
}
if depth > bestDepth {
// Don't push the spill into a deeper loop.
continue
}
// lateSpillUse notes a late (after stack allocation) use of the spill of value with ID vid.
// This will inhibit spill sinking.
func (s *regAllocState) lateSpillUse(vid ID) {
// TODO investigate why this is necessary.
// It appears that an outside-the-loop use of
// an otherwise sinkable spill makes the spill
// a candidate for shuffling, when it would not
// otherwise have been the case (spillUsed was not
// true when isLoopSpillCandidate was called, yet
// it was shuffled). Such shuffling cuts the amount
// of spill sinking by more than half (in make.bash)
s.values[vid].spillUsedShuffle = true
// If v is in a register at the start of b, we can
// place the spill here (after the phis).
if len(b.Preds) == 1 {
for _, e := range s.endRegs[b.Preds[0].b.ID] {
if e.v == v {
// Found a better spot for the spill.
best = b
bestArg = e.c
bestDepth = depth
break
}
}
} else {
for _, e := range s.startRegs[b.ID] {
if e.v == v {
// Found a better spot for the spill.
best = b
bestArg = e.c
bestDepth = depth
break
}
}
}
}
// Put the spill in the best block we found.
spill.Block = best
spill.AddArg(bestArg)
if best == v.Block && v.Op != OpPhi {
// Place immediately after v.
after[v.ID] = append(after[v.ID], spill)
} else {
// Place at the start of best block.
start[best.ID] = append(start[best.ID], spill)
}
}
// Insert spill instructions into the block schedules.
var oldSched []*Value
for _, b := range f.Blocks {
nphi := 0
for _, v := range b.Values {
if v.Op != OpPhi {
break
}
nphi++
}
oldSched = append(oldSched[:0], b.Values[nphi:]...)
b.Values = b.Values[:nphi]
for _, v := range start[b.ID] {
b.Values = append(b.Values, v)
}
for _, v := range oldSched {
b.Values = append(b.Values, v)
for _, w := range after[v.ID] {
b.Values = append(b.Values, w)
}
}
}
}
// shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
@@ -1919,13 +1752,24 @@ func (e *edgeState) setup(idx int, srcReg []endReg, dstReg []startReg, stacklive
for _, spillID := range stacklive {
v := e.s.orig[spillID]
spill := e.s.values[v.ID].spill
if !e.s.sdom.isAncestorEq(spill.Block, e.p) {
// Spills were placed that only dominate the uses found
// during the first regalloc pass. The edge fixup code
// can't use a spill location if the spill doesn't dominate
// the edge.
// We are guaranteed that if the spill doesn't dominate this edge,
// then the value is available in a register (because we called
// makeSpill for every value not in a register at the start
// of an edge).
continue
}
e.set(e.s.f.getHome(spillID), v.ID, spill, false, src.NoXPos) // the source's position doesn't matter
}
// Figure out all the destinations we need.
dsts := e.destinations[:0]
for _, x := range dstReg {
dsts = append(dsts, dstRecord{&e.s.registers[x.r], x.vid, nil, x.pos})
dsts = append(dsts, dstRecord{&e.s.registers[x.r], x.v.ID, nil, x.pos})
}
// Phis need their args to end up in a specific location.
for _, v := range e.b.Values {
@@ -2010,7 +1854,6 @@ func (e *edgeState) process() {
if _, isReg := loc.(*Register); isReg {
c = e.p.NewValue1(d.pos, OpCopy, c.Type, c)
} else {
e.s.lateSpillUse(vid)
c = e.p.NewValue1(d.pos, OpLoadReg, c.Type, c)
}
e.set(r, vid, c, false, d.pos)
@@ -2028,9 +1871,6 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XP
(*splice).Uses--
*splice = occupant.c
occupant.c.Uses++
if occupant.c.Op == OpStoreReg {
e.s.lateSpillUse(vid)
}
}
// Note: if splice==nil then c will appear dead. This is
// non-SSA formed code, so be careful after this pass not to run
@@ -2106,7 +1946,6 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XP
}
} else {
if dstReg {
e.s.lateSpillUse(vid)
x = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
} else {
// mem->mem. Use temp register.
@@ -2124,7 +1963,6 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XP
e.erase(loc)
r := e.findRegFor(c.Type)
e.s.lateSpillUse(vid)
t := e.p.NewValue1(pos, OpLoadReg, c.Type, c)
e.set(r, vid, t, false, pos)
x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, t)
@@ -2630,3 +2468,16 @@ func (d *desiredState) merge(x *desiredState) {
d.addList(e.ID, e.regs)
}
}
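// min32 returns the smaller of x and y.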
func min32(x, y int32) int32 {
if x < y {
return x
}
return y
}
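// max32 returns the larger of x and y.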
func max32(x, y int32) int32 {
if x > y {
return x
}
return y
}