Commit 3bc34385 authored by Ben Shi's avatar Ben Shi

cmd/compile: introduce more read-modify-write operations for amd64

Add suport of read-modify-write for AND/SUB/AND/OR/XOR on amd64.

1. The total size of pkg/linux_amd64 decreases about 4KB, excluding
cmd/compile.

2. The go1 benchmark shows a little improvement, excluding noise.

name                     old time/op    new time/op    delta
BinaryTree17-4              2.63s ± 3%     2.65s ± 4%   +1.01%  (p=0.037 n=35+35)
Fannkuch11-4                2.33s ± 2%     2.39s ± 2%   +2.49%  (p=0.000 n=35+35)
FmtFprintfEmpty-4          45.4ns ± 5%    40.8ns ± 6%  -10.09%  (p=0.000 n=35+35)
FmtFprintfString-4         73.3ns ± 4%    70.9ns ± 3%   -3.23%  (p=0.000 n=30+35)
FmtFprintfInt-4            79.9ns ± 4%    79.5ns ± 3%     ~     (p=0.736 n=34+35)
FmtFprintfIntInt-4          126ns ± 4%     125ns ± 4%     ~     (p=0.083 n=35+35)
FmtFprintfPrefixedInt-4     152ns ± 6%     152ns ± 3%     ~     (p=0.855 n=34+35)
FmtFprintfFloat-4           215ns ± 4%     213ns ± 4%     ~     (p=0.066 n=35+35)
FmtManyArgs-4               522ns ± 3%     506ns ± 3%   -3.15%  (p=0.000 n=35+35)
GobDecode-4                6.45ms ± 8%    6.51ms ± 7%   +0.96%  (p=0.026 n=35+35)
GobEncode-4                6.10ms ± 6%    6.02ms ± 8%     ~     (p=0.160 n=35+35)
Gzip-4                      228ms ± 3%     221ms ± 3%   -2.92%  (p=0.000 n=35+35)
Gunzip-4                   37.5ms ± 4%    37.2ms ± 3%   -0.78%  (p=0.036 n=35+35)
HTTPClientServer-4         58.7µs ± 2%    59.2µs ± 1%   +0.80%  (p=0.000 n=33+33)
JSONEncode-4               12.0ms ± 3%    12.2ms ± 3%   +1.84%  (p=0.008 n=35+35)
JSONDecode-4               57.0ms ± 4%    56.6ms ± 3%     ~     (p=0.320 n=35+35)
Mandelbrot200-4            3.82ms ± 3%    3.79ms ± 3%     ~     (p=0.074 n=35+35)
GoParse-4                  3.21ms ± 5%    3.24ms ± 4%     ~     (p=0.119 n=35+35)
RegexpMatchEasy0_32-4      76.3ns ± 4%    75.4ns ± 4%   -1.14%  (p=0.014 n=34+33)
RegexpMatchEasy0_1K-4       251ns ± 4%     254ns ± 3%   +1.28%  (p=0.016 n=35+35)
RegexpMatchEasy1_32-4      69.6ns ± 3%    70.1ns ± 3%   +0.82%  (p=0.005 n=35+35)
RegexpMatchEasy1_1K-4       367ns ± 4%     376ns ± 4%   +2.47%  (p=0.000 n=35+35)
RegexpMatchMedium_32-4      108ns ± 5%     104ns ± 4%   -3.18%  (p=0.000 n=35+35)
RegexpMatchMedium_1K-4     33.8µs ± 3%    32.7µs ± 3%   -3.27%  (p=0.000 n=35+35)
RegexpMatchHard_32-4       1.55µs ± 3%    1.52µs ± 3%   -1.64%  (p=0.000 n=35+35)
RegexpMatchHard_1K-4       46.6µs ± 3%    46.6µs ± 4%     ~     (p=0.149 n=35+35)
Revcomp-4                   416ms ± 7%     412ms ± 6%   -0.95%  (p=0.033 n=33+35)
Template-4                 64.3ms ± 3%    62.4ms ± 7%   -2.94%  (p=0.000 n=35+35)
TimeParse-4                 320ns ± 2%     322ns ± 3%     ~     (p=0.589 n=35+35)
TimeFormat-4                300ns ± 3%     300ns ± 3%     ~     (p=0.597 n=35+35)
[Geo mean]                 47.4µs         47.0µs        -0.86%

name                     old speed      new speed      delta
GobDecode-4               119MB/s ± 7%   118MB/s ± 7%   -0.96%  (p=0.027 n=35+35)
GobEncode-4               126MB/s ± 7%   127MB/s ± 6%     ~     (p=0.157 n=34+34)
Gzip-4                   85.3MB/s ± 3%  87.9MB/s ± 3%   +3.02%  (p=0.000 n=35+35)
Gunzip-4                  518MB/s ± 4%   522MB/s ± 3%   +0.79%  (p=0.037 n=35+35)
JSONEncode-4              162MB/s ± 3%   159MB/s ± 3%   -1.81%  (p=0.009 n=35+35)
JSONDecode-4             34.1MB/s ± 4%  34.3MB/s ± 3%     ~     (p=0.318 n=35+35)
GoParse-4                18.0MB/s ± 5%  17.9MB/s ± 4%     ~     (p=0.117 n=35+35)
RegexpMatchEasy0_32-4     419MB/s ± 3%   425MB/s ± 4%   +1.46%  (p=0.003 n=32+33)
RegexpMatchEasy0_1K-4    4.07GB/s ± 4%  4.02GB/s ± 3%   -1.28%  (p=0.014 n=35+35)
RegexpMatchEasy1_32-4     460MB/s ± 3%   456MB/s ± 4%   -0.82%  (p=0.004 n=35+35)
RegexpMatchEasy1_1K-4    2.79GB/s ± 4%  2.72GB/s ± 4%   -2.39%  (p=0.000 n=35+35)
RegexpMatchMedium_32-4   9.23MB/s ± 4%  9.53MB/s ± 4%   +3.16%  (p=0.000 n=35+35)
RegexpMatchMedium_1K-4   30.3MB/s ± 3%  31.3MB/s ± 3%   +3.38%  (p=0.000 n=35+35)
RegexpMatchHard_32-4     20.7MB/s ± 3%  21.0MB/s ± 3%   +1.67%  (p=0.000 n=35+35)
RegexpMatchHard_1K-4     22.0MB/s ± 3%  21.9MB/s ± 4%     ~     (p=0.277 n=35+33)
Revcomp-4                 612MB/s ± 7%   618MB/s ± 6%   +0.96%  (p=0.034 n=33+35)
Template-4               30.2MB/s ± 3%  31.1MB/s ± 6%   +3.05%  (p=0.000 n=35+35)
[Geo mean]                123MB/s        124MB/s        +0.64%

Change-Id: Ia025da272e07d0069413824bfff3471b106d6280
Reviewed-on: https://go-review.googlesource.com/121535
Run-TryBot: Ben Shi <powerman1st@163.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarIlya Tocar <ilya.tocar@intel.com>
Reviewed-by: 's avatarKeith Randall <khr@golang.org>
parent aacc891d
......@@ -699,7 +699,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore:
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
......
......@@ -1045,6 +1045,10 @@
((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).add(off2)] {sym} base mem)
((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd(off2) ->
((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).add(off2)] {sym} base mem)
((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(off1+off2) ->
((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {sym} base val mem)
((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(off1+off2) ->
((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem)
// Fold constants into stores.
(MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validValAndOff(c,off) ->
......@@ -1091,6 +1095,12 @@
((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
&& ValAndOff(valoff1).canAdd(off2) && canMergeSym(sym1, sym2) ->
((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).add(off2)] {mergeSym(sym1,sym2)} base mem)
((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
// generating indexed loads and stores
(MOV(B|W|L|Q|SS|SD)load [off1] {sym1} (LEAQ1 [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
......@@ -2276,6 +2286,12 @@
((ADD|SUB|AND|OR|XOR)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|AND|OR|XOR)Lload x [off] {sym} ptr mem)
((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) -> ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y) && clobber(l) ->
((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
(MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) -> ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y) && clobber(l) ->
((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
// Merge ADDQconst and LEAQ into atomic loads.
(MOVQatomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(off1+off2) ->
......
......@@ -346,6 +346,18 @@ func init() {
{name: "XORQload", argLength: 3, reg: gp21load, asm: "XORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
// direct binary-op on memory (read-modify-write)
{name: "ADDQmodify", argLength: 3, reg: gpstore, asm: "ADDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
{name: "SUBQmodify", argLength: 3, reg: gpstore, asm: "SUBQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
{name: "ANDQmodify", argLength: 3, reg: gpstore, asm: "ANDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem
{name: "ORQmodify", argLength: 3, reg: gpstore, asm: "ORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
{name: "XORQmodify", argLength: 3, reg: gpstore, asm: "XORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
{name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
{name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
{name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem
{name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
{name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
// unary ops
{name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true, clobberFlags: true}, // -arg0
{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
......
......@@ -600,6 +600,16 @@ const (
OpAMD64ORLload
OpAMD64XORQload
OpAMD64XORLload
OpAMD64ADDQmodify
OpAMD64SUBQmodify
OpAMD64ANDQmodify
OpAMD64ORQmodify
OpAMD64XORQmodify
OpAMD64ADDLmodify
OpAMD64SUBLmodify
OpAMD64ANDLmodify
OpAMD64ORLmodify
OpAMD64XORLmodify
OpAMD64NEGQ
OpAMD64NEGL
OpAMD64NOTQ
......@@ -7661,6 +7671,156 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "ADDQmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AADDQ,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "SUBQmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.ASUBQ,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "ANDQmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AANDQ,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "ORQmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AORQ,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "XORQmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AXORQ,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "ADDLmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AADDL,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "SUBLmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.ASUBL,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "ANDLmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AANDL,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "ORLmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AORL,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "XORLmodify",
auxType: auxSymOff,
argLen: 3,
clobberFlags: true,
faultOnNilArg0: true,
symEffect: SymRead | SymWrite,
asm: x86.AXORL,
reg: regInfo{
inputs: []inputInfo{
{1, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
},
},
},
{
name: "NEGQ",
argLen: 1,
......
......@@ -16,8 +16,10 @@ package codegen
func SubMem(arr []int, b int) int {
// 386:`SUBL\s[A-Z]+,\s8\([A-Z]+\)`
// amd64:`SUBQ\s[A-Z]+,\s16\([A-Z]+\)`
arr[2] -= b
// 386:`SUBL\s[A-Z]+,\s12\([A-Z]+\)`
// amd64:`SUBQ\s[A-Z]+,\s24\([A-Z]+\)`
arr[3] -= b
// 386:`DECL\s16\([A-Z]+\)`
arr[4]--
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment