Commit 5cadc91b authored by Keith Randall, committed by Keith Randall

cmd/compile: intrinsics for math/bits.OnesCount

Popcount instructions on amd64 are not guaranteed to be
present, so we must guard their call.  Rewrite rules can't
generate control flow at the moment, so the intrinsifier
needs to generate that code.

name           old time/op  new time/op  delta
OnesCount-8    2.47ns ± 5%  1.04ns ± 2%  -57.70%  (p=0.000 n=10+10)
OnesCount16-8  1.05ns ± 1%  0.78ns ± 0%  -25.56%    (p=0.000 n=9+8)
OnesCount32-8  1.63ns ± 5%  1.04ns ± 2%  -35.96%  (p=0.000 n=10+10)
OnesCount64-8  2.45ns ± 0%  1.04ns ± 1%  -57.55%   (p=0.000 n=6+10)

Update #18616

Change-Id: I4aff2cc9aa93787898d7b22055fe272a7cf95673
Reviewed-on: https://go-review.googlesource.com/38320
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
parent 59f6549d
......@@ -767,6 +767,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
if v.Args[0].Reg() != v.Reg() {
// POPCNT on Intel has a false dependency on the destination register.
// Zero the destination to break the dependency.
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
......
......@@ -699,6 +699,34 @@ var linuxAMD64Tests = []*asmTest{
`,
[]string{"\tBSRQ\t"},
},
{
`
func pop1(x uint64) int {
return bits.OnesCount64(x)
}`,
[]string{"\tPOPCNTQ\t", "support_popcnt"},
},
{
`
func pop2(x uint32) int {
return bits.OnesCount32(x)
}`,
[]string{"\tPOPCNTL\t", "support_popcnt"},
},
{
`
func pop3(x uint16) int {
return bits.OnesCount16(x)
}`,
[]string{"\tPOPCNTL\t", "support_popcnt"},
},
{
`
func pop4(x uint) int {
return bits.OnesCount(x)
}`,
[]string{"\tPOPCNTQ\t", "support_popcnt"},
},
// see issue 19595.
// We want to merge load+op in f58, but not in f59.
{
......
......@@ -142,6 +142,7 @@ var runtimeDecls = [...]struct {
{"racewriterange", funcTag, 111},
{"msanread", funcTag, 111},
{"msanwrite", funcTag, 111},
{"support_popcnt", varTag, 11},
}
func runtimeTypes() []*Type {
......
......@@ -187,3 +187,6 @@ func racewriterange(addr, size uintptr)
// memory sanitizer
func msanread(addr, size uintptr)
func msanwrite(addr, size uintptr)
// architecture variants
var support_popcnt bool
......@@ -2823,6 +2823,54 @@ func init() {
return s.newValue1(ssa.OpBitRev64, Types[TINT], args[0])
},
sys.ARM64)
// makeOnesCount builds the intrinsic generator used for the math/bits
// OnesCount* functions. op64 is the SSA op emitted when s.config.IntSize
// is not 4; op32 is the op emitted when it is 4.
//
// The popcount op may not be used unconditionally, so the returned function
// emits a load of the runtime variable support_popcnt and branches on it:
// one arm uses the op directly, the other calls the pure Go implementation,
// and both arms store their result in s.vars[n] so it can be read back in
// the join block.
makeOnesCount := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
// Load the bool runtime variable support_popcnt.
aux := s.lookupSymbol(n, &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: Linksym(syslook("support_popcnt").Sym)})
addr := s.entryNewValue1A(ssa.OpAddr, Types[TBOOL].PtrTo(), aux, s.sb)
v := s.newValue2(ssa.OpLoad, Types[TBOOL], addr, s.mem())
// Finish the current block as an If block controlled by the loaded flag.
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
// Create the two arms and the join block, and attach the arms to the branch
// (bTrue first, then bFalse).
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
// Pick the op matching the target's int size.
op := op64
if s.config.IntSize == 4 {
op = op32
}
s.vars[n] = s.newValue1(op, Types[TINT], args[0])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
// The value returned by s.call is used as the address operand of the load
// that fetches the call's int result.
a := s.call(n, callNormal)
s.vars[n] = s.newValue2(ssa.OpLoad, Types[TINT], a, s.mem())
s.endBlock().AddEdgeTo(bEnd)
// Merge results: read back the value stored under n by whichever arm ran.
s.startBlock(bEnd)
return s.variable(n, Types[TINT])
}
}
// Register the OnesCount intrinsics for AMD64. The fixed-width variants pass
// the same op for both makeOnesCount arguments; only plain OnesCount passes
// distinct 64-bit and 32-bit ops.
addF("math/bits", "OnesCount64",
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount64),
sys.AMD64)
addF("math/bits", "OnesCount32",
makeOnesCount(ssa.OpPopCount32, ssa.OpPopCount32),
sys.AMD64)
addF("math/bits", "OnesCount16",
makeOnesCount(ssa.OpPopCount16, ssa.OpPopCount16),
sys.AMD64)
// Note: no OnesCount8, the Go implementation is faster - just a table load.
addF("math/bits", "OnesCount",
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32),
sys.AMD64)
/******** sync/atomic ********/
......
......@@ -106,6 +106,11 @@
(Bswap64 x) -> (BSWAPQ x)
(Bswap32 x) -> (BSWAPL x)
(PopCount64 x) -> (POPCNTQ x)
(PopCount32 x) -> (POPCNTL x)
(PopCount16 x) -> (POPCNTL (MOVWQZX <types.UInt32> x))
(PopCount8 x) -> (POPCNTL (MOVBQZX <types.UInt32> x))
(Sqrt x) -> (SQRTSD x)
// Lowering extension
......
......@@ -323,6 +323,11 @@ func init() {
{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
// POPCNT instructions aren't guaranteed to be on the target platform (they are SSE4).
// Any use must be preceded by a successful check of runtime.support_popcnt.
{name: "POPCNTQ", argLength: 1, reg: gp11, asm: "POPCNTQ", clobberFlags: true}, // count number of set bits in arg0
{name: "POPCNTL", argLength: 1, reg: gp11, asm: "POPCNTL", clobberFlags: true}, // count number of set bits in arg0
{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
......
......@@ -250,6 +250,11 @@ var genericOps = []opData{
{name: "BitRev32", argLength: 1}, // Reverse the bits in arg[0]
{name: "BitRev64", argLength: 1}, // Reverse the bits in arg[0]
{name: "PopCount8", argLength: 1}, // Count bits in arg[0]
{name: "PopCount16", argLength: 1}, // Count bits in arg[0]
{name: "PopCount32", argLength: 1}, // Count bits in arg[0]
{name: "PopCount64", argLength: 1}, // Count bits in arg[0]
{name: "Sqrt", argLength: 1}, // sqrt(arg0), float64 only
// Data movement, max argument length for Phi is indefinite so just pick
......
......@@ -538,6 +538,8 @@ const (
OpAMD64CMOVLEQ
OpAMD64BSWAPQ
OpAMD64BSWAPL
OpAMD64POPCNTQ
OpAMD64POPCNTL
OpAMD64SQRTSD
OpAMD64SBBQcarrymask
OpAMD64SBBLcarrymask
......@@ -1778,6 +1780,10 @@ const (
OpBitRev16
OpBitRev32
OpBitRev64
OpPopCount8
OpPopCount16
OpPopCount32
OpPopCount64
OpSqrt
OpPhi
OpCopy
......@@ -6368,6 +6374,34 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "POPCNTQ",
argLen: 1,
clobberFlags: true,
asm: x86.APOPCNTQ,
reg: regInfo{
inputs: []inputInfo{
{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
},
outputs: []outputInfo{
{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
},
},
},
{
name: "POPCNTL",
argLen: 1,
clobberFlags: true,
asm: x86.APOPCNTL,
reg: regInfo{
inputs: []inputInfo{
{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
},
outputs: []outputInfo{
{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
},
},
},
{
name: "SQRTSD",
argLen: 1,
......@@ -21680,6 +21714,26 @@ var opcodeTable = [...]opInfo{
argLen: 1,
generic: true,
},
{
name: "PopCount8",
argLen: 1,
generic: true,
},
{
name: "PopCount16",
argLen: 1,
generic: true,
},
{
name: "PopCount32",
argLen: 1,
generic: true,
},
{
name: "PopCount64",
argLen: 1,
generic: true,
},
{
name: "Sqrt",
argLen: 1,
......
......@@ -686,6 +686,14 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpOr8(v)
case OpOrB:
return rewriteValueAMD64_OpOrB(v)
case OpPopCount16:
return rewriteValueAMD64_OpPopCount16(v)
case OpPopCount32:
return rewriteValueAMD64_OpPopCount32(v)
case OpPopCount64:
return rewriteValueAMD64_OpPopCount64(v)
case OpPopCount8:
return rewriteValueAMD64_OpPopCount8(v)
case OpRound32F:
return rewriteValueAMD64_OpRound32F(v)
case OpRound64F:
......@@ -33467,6 +33475,62 @@ func rewriteValueAMD64_OpOrB(v *Value) bool {
return true
}
}
// rewriteValueAMD64_OpPopCount16 rewrites a generic PopCount16 value in place
// into an AMD64 POPCNTL whose single argument is a new MOVWQZX value (typed
// UInt32) wrapping the original operand. The rule has no condition, so the
// function always reports true.
func rewriteValueAMD64_OpPopCount16(v *Value) bool {
	// match: (PopCount16 x)
	// cond:
	// result: (POPCNTL (MOVWQZX <types.UInt32> x))
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before reset clears v's arguments.
	operand := v.Args[0]
	v.reset(OpAMD64POPCNTL)

	widened := blk.NewValue0(v.Pos, OpAMD64MOVWQZX, typs.UInt32)
	widened.AddArg(operand)
	v.AddArg(widened)
	return true
}
// rewriteValueAMD64_OpPopCount32 rewrites a generic PopCount32 value in place
// into an AMD64 POPCNTL applied to the same operand. The rule has no
// condition, so the function always reports true.
func rewriteValueAMD64_OpPopCount32(v *Value) bool {
	// match: (PopCount32 x)
	// cond:
	// result: (POPCNTL x)

	// Capture the operand before reset clears v's arguments.
	operand := v.Args[0]
	v.reset(OpAMD64POPCNTL)
	v.AddArg(operand)
	return true
}
// rewriteValueAMD64_OpPopCount64 rewrites a generic PopCount64 value in place
// into an AMD64 POPCNTQ applied to the same operand. The rule has no
// condition, so the function always reports true.
func rewriteValueAMD64_OpPopCount64(v *Value) bool {
	// match: (PopCount64 x)
	// cond:
	// result: (POPCNTQ x)

	// Capture the operand before reset clears v's arguments.
	operand := v.Args[0]
	v.reset(OpAMD64POPCNTQ)
	v.AddArg(operand)
	return true
}
// rewriteValueAMD64_OpPopCount8 rewrites a generic PopCount8 value in place
// into an AMD64 POPCNTL whose single argument is a new MOVBQZX value (typed
// UInt32) wrapping the original operand. The rule has no condition, so the
// function always reports true.
func rewriteValueAMD64_OpPopCount8(v *Value) bool {
	// match: (PopCount8 x)
	// cond:
	// result: (POPCNTL (MOVBQZX <types.UInt32> x))
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before reset clears v's arguments.
	operand := v.Args[0]
	v.reset(OpAMD64POPCNTL)

	widened := blk.NewValue0(v.Pos, OpAMD64MOVBQZX, typs.UInt32)
	widened.AddArg(operand)
	v.AddArg(widened)
	return true
}
func rewriteValueAMD64_OpRound32F(v *Value) bool {
// match: (Round32F x)
// cond:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment