Commit bb1fd3b5 authored by Lynn Boger

cmd/compile: add rules to improve consecutive byte loads and stores on ppc64le

This adds new rules to recognize consecutive byte loads and
stores and lowers them to loads and stores such as lhz, lwz, ld,
sth, stw, std. This change only covers the little endian cases
on little endian machines, such as is found in encoding/binary
UintXX or PutUintXX for little endian. Big endian will be done
later.

Updates were also made to binary_test.go to allow the benchmark
for Uint and PutUint to actually use those functions because
the way they were written, those functions were being
optimized out.

Testcases were also added to cmd/compile/internal/gc/asm_test.go.

Updates #22496

The following improvement can be found in golang.org/x/crypto

poly1305:

Benchmark64-16              142           114           -19.72%
Benchmark1K-16              1717          1424          -17.06%
Benchmark64Unaligned-16     142           113           -20.42%
Benchmark1KUnaligned-16     1721          1428          -17.02%

chacha20poly1305:

BenchmarkChacha20Poly1305Open_64-16     1012       885   -12.55%
BenchmarkChacha20Poly1305Seal_64-16     971        836   -13.90%
BenchmarkChacha20Poly1305Open_1350-16   11113      9539  -14.16%
BenchmarkChacha20Poly1305Seal_1350-16   11013      9392  -14.72%
BenchmarkChacha20Poly1305Open_8K-16     61074      53431 -12.51%
BenchmarkChacha20Poly1305Seal_8K-16     61214      54806 -10.47%

Other improvements of around 10% found in crypto/tls.

Results after updating encoding/binary/binary_test.go:

BenchmarkLittleEndianPutUint64-16     1.87      0.93      -50.27%
BenchmarkLittleEndianPutUint32-16     1.19      0.93      -21.85%
BenchmarkLittleEndianPutUint16-16     1.16      1.03      -11.21%

Change-Id: I7bbe2fbcbd11362d58662fecd907a0c07e6ca2fb
Reviewed-on: https://go-review.googlesource.com/74410
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <mike.munday@ibm.com>
parent f99d14e0
......@@ -265,7 +265,7 @@ var allAsmTests = []*asmTests{
{
arch: "ppc64le",
os: "linux",
imports: []string{"math", "math/bits"},
imports: []string{"encoding/binary", "math", "math/bits"},
tests: linuxPPC64LETests,
},
{
......@@ -2359,6 +2359,61 @@ var linuxPPC64LETests = []*asmTest{
pos: []string{"\tFABS\t"},
},
{
fn: `
func f14(b []byte) uint16 {
return binary.LittleEndian.Uint16(b)
}
`,
pos: []string{"\tMOVHZ\t"},
},
{
fn: `
func f15(b []byte) uint32 {
return binary.LittleEndian.Uint32(b)
}
`,
pos: []string{"\tMOVWZ\t"},
},
{
fn: `
func f16(b []byte) uint64 {
return binary.LittleEndian.Uint64(b)
}
`,
pos: []string{"\tMOVD\t"},
neg: []string{"MOVBZ", "MOVHZ", "MOVWZ"},
},
{
fn: `
func f17(b []byte, v uint16) {
binary.LittleEndian.PutUint16(b, v)
}
`,
pos: []string{"\tMOVH\t"},
},
{
fn: `
func f18(b []byte, v uint32) {
binary.LittleEndian.PutUint32(b, v)
}
`,
pos: []string{"\tMOVW\t"},
},
{
fn: `
func f19(b []byte, v uint64) {
binary.LittleEndian.PutUint64(b, v)
}
`,
pos: []string{"\tMOVD\t"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
// check that stack store is optimized away
fn: `
......
......@@ -945,3 +945,117 @@
(FSUB (FMUL x y) z) -> (FMSUB x y z)
(FADDS (FMULS x y) z) -> (FMADDS x y z)
(FSUBS (FMULS x y) z) -> (FMSUBS x y z)
// The following rules match the statements found in the encoding/binary functions UintXX (load)
// and PutUintXX (store) and convert them from multiple single byte loads or stores to
// the single largest possible load or store. For now only little endian loads and stores on
// little endian machines are implemented. Longer rules make use of the match with shorter rules
// where possible.
// TODO implement big endian loads and stores for little endian machines (using byte reverse
// loads and stores).
// b[0] | b[1]<<8 -> load 16-bit Little endian
(OR <t> x0:(MOVBZload [i0] {s} p mem)
o1:(SLWconst x1:(MOVBZload [i1] {s} p mem) [8]))
&& !config.BigEndian
&& i1 == i0+1
&& x0.Uses ==1 && x1.Uses == 1
&& o1.Uses == 1
&& mergePoint(b, x0, x1) != nil
&& clobber(x0) && clobber(x1) && clobber(o1)
-> @mergePoint(b,x0,x1) (MOVHZload <t> {s} [i0] p mem)
// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 -> load 32-bit Little endian
(OR <t> s1:(SLWconst x2:(MOVBZload [i3] {s} p mem) [24])
o0:(OR <t> s0:(SLWconst x1:(MOVBZload [i2] {s} p mem) [16]) x0:(MOVHZload [i0] {s} p mem)))
&& !config.BigEndian
&& i2 == i0+2
&& i3 == i0+3
&& x0.Uses ==1 && x1.Uses == 1 && x2.Uses == 1
&& o0.Uses == 1
&& s0.Uses == 1 && s1.Uses == 1
&& mergePoint(b, x0, x1, x2) != nil
&& clobber(x0) && clobber(x1) && clobber(x2)
&& clobber(s0) && clobber(s1)
&& clobber(o0)
-> @mergePoint(b,x0,x1,x2) (MOVWZload <t> {s} [i0] p mem)
// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4]<<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 -> load 64-bit Little endian
// Can't build on shorter rules because they use SLW instead of SLD
// Offset must be a multiple of 4 for MOVDload
(OR <t> s6:(SLDconst x7:(MOVBZload [i7] {s} p mem) [56])
o5:(OR <t> s5:(SLDconst x6:(MOVBZload [i6] {s} p mem) [48])
o4:(OR <t> s4:(SLDconst x5:(MOVBZload [i5] {s} p mem) [40])
o3:(OR <t> s3:(SLDconst x4:(MOVBZload [i4] {s} p mem) [32])
o2:(OR <t> s2:(SLDconst x3:(MOVBZload [i3] {s} p mem) [24])
o1:(OR <t> s1:(SLDconst x2:(MOVBZload [i2] {s} p mem) [16])
o0:(OR <t> s0:(SLDconst x1:(MOVBZload [i1] {s} p mem) [8]) x0:(MOVBZload [i0] {s} p mem))))))))
&& !config.BigEndian
&& i0%4 == 0
&& i1 == i0+1
&& i2 == i0+2
&& i3 == i0+3
&& i4 == i0+4
&& i5 == i0+5
&& i6 == i0+6
&& i7 == i0+7
&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses ==1 && x7.Uses == 1
&& o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1
&& s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1
&& mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7) != nil
&& clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7)
&& clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3) && clobber(s4) && clobber(s5) && clobber (s6)
&& clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5)
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDload <t> {s} [i0] p mem)
// 2 byte store Little endian as in:
// b[0] = byte(v)
// b[1] = byte(v >> 8)
(MOVBstore [i1] {s} p (SRWconst (MOVHZreg w) [8])
x0:(MOVBstore [i0] {s} p w mem))
&& !config.BigEndian
&& x0.Uses == 1
&& i1 == i0+1
&& clobber(x0)
-> (MOVHstore [i0] {s} p w mem)
// 4 byte store Little endian as in:
// b[0] = byte(v)
// b[1] = byte(v >> 8)
// b[2] = byte(v >> 16)
// b[3] = byte(v >> 24)
(MOVBstore [i3] {s} p (SRWconst w [24])
x0:(MOVBstore [i2] {s} p (SRWconst w [16])
x1:(MOVBstore [i1] {s} p (SRWconst w [8])
x2:(MOVBstore [i0] {s} p w mem))))
&& !config.BigEndian
&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
&& i1 == i0+1 && i2 == i0+2 && i3 == i0+3
&& clobber(x0) && clobber(x1) && clobber(x2)
-> (MOVWstore [i0] {s} p w mem)
// 8 byte store Little endian as in:
// b[0] = byte(v)
// b[1] = byte(v >> 8)
// b[2] = byte(v >> 16)
// b[3] = byte(v >> 24)
// b[4] = byte(v >> 32)
// b[5] = byte(v >> 40)
// b[6] = byte(v >> 48)
// b[7] = byte(v >> 56)
// Offset must be a multiple of 4 for MOVDstore
// Can't build on previous rules for 2 or 4 bytes because they use SRW not SRD
(MOVBstore [i7] {s} p (SRDconst w [56])
x0:(MOVBstore [i6] {s} p (SRDconst w [48])
x1:(MOVBstore [i5] {s} p (SRDconst w [40])
x2:(MOVBstore [i4] {s} p (SRDconst w [32])
x3:(MOVBstore [i3] {s} p (SRDconst w [24])
x4:(MOVBstore [i2] {s} p (SRDconst w [16])
x5:(MOVBstore [i1] {s} p (SRDconst w [8])
x6:(MOVBstore [i0] {s} p w mem))))))))
&& !config.BigEndian
&& i0%4 == 0
&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1
&& i1 == i0+1 && i2 == i0+2 && i3 == i0+3 && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7
&& clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6)
-> (MOVDstore [i0] {s} p w mem)
......@@ -109,6 +109,7 @@ var little = []byte{
var src = []byte{1, 2, 3, 4, 5, 6, 7, 8}
var res = []int32{0x01020304, 0x05060708}
var putbuf = []byte{0, 0, 0, 0, 0, 0, 0, 0}
func checkResult(t *testing.T, dir string, order ByteOrder, err error, have, want interface{}) {
if err != nil {
......@@ -502,25 +503,42 @@ func BenchmarkWriteSlice1000Int32s(b *testing.B) {
}
func BenchmarkPutUint16(b *testing.B) {
buf := [2]byte{}
b.SetBytes(2)
for i := 0; i < b.N; i++ {
BigEndian.PutUint16(buf[:], uint16(i))
BigEndian.PutUint16(putbuf[:], uint16(i))
}
}
func BenchmarkPutUint32(b *testing.B) {
buf := [4]byte{}
b.SetBytes(4)
for i := 0; i < b.N; i++ {
BigEndian.PutUint32(buf[:], uint32(i))
BigEndian.PutUint32(putbuf[:], uint32(i))
}
}
func BenchmarkPutUint64(b *testing.B) {
buf := [8]byte{}
b.SetBytes(8)
for i := 0; i < b.N; i++ {
BigEndian.PutUint64(buf[:], uint64(i))
BigEndian.PutUint64(putbuf[:], uint64(i))
}
}
// BenchmarkLittleEndianPutUint16 times LittleEndian.PutUint16 storing
// uint16(i) into the package-level putbuf on every iteration. The destination
// is a package-level slice rather than a function-local array; per the commit
// description, the local-buffer form allowed the call to be optimized out.
func BenchmarkLittleEndianPutUint16(b *testing.B) {
// Record 2 bytes processed per iteration so throughput is reported.
b.SetBytes(2)
for i := 0; i < b.N; i++ {
LittleEndian.PutUint16(putbuf[:], uint16(i))
}
}
// BenchmarkLittleEndianPutUint32 times LittleEndian.PutUint32 storing
// uint32(i) into the package-level putbuf on every iteration. The destination
// is a package-level slice rather than a function-local array; per the commit
// description, the local-buffer form allowed the call to be optimized out.
func BenchmarkLittleEndianPutUint32(b *testing.B) {
// Record 4 bytes processed per iteration so throughput is reported.
b.SetBytes(4)
for i := 0; i < b.N; i++ {
LittleEndian.PutUint32(putbuf[:], uint32(i))
}
}
// BenchmarkLittleEndianPutUint64 times LittleEndian.PutUint64 storing
// uint64(i) into the package-level putbuf on every iteration. The destination
// is a package-level slice rather than a function-local array; per the commit
// description, the local-buffer form allowed the call to be optimized out.
func BenchmarkLittleEndianPutUint64(b *testing.B) {
// Record 8 bytes processed per iteration so throughput is reported.
b.SetBytes(8)
for i := 0; i < b.N; i++ {
LittleEndian.PutUint64(putbuf[:], uint64(i))
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment