cmd/gc: alias more variables during register allocation

This is joint work with Daniel Morsing. In order for the register allocator to alias two variables, they must have the same width, stack offset, and etype. Code generation was altering a variable's etype in a few places. This prevented the variable from being moved to a register, which in turn prevented peephole optimization. This failure to alias was very common, with almost 23,000 instances just running make.bash. This phenomenon was not visible in the register allocation debug output because the variables that failed to alias had the same name. The debugging-only change to bits.c fixes this by printing the variable number with its name. This CL fixes the source of all etype mismatches for 6g, all but one case for 8g, and depressingly few cases for 5g. (I believe that extending CL 6819083 to 5g is a prerequisite.) Fixing the remaining cases in 8g and 5g is work for the future. The etype mismatch fixes are: * [gc] Slicing changed the type of the base pointer into a uintptr in order to perform arithmetic on it. Instead, support addition directly on pointers. * [*g] OSPTR was giving type uintptr to slice base pointers; undo that. This arose, for example, while compiling copy(dst, src). * [8g] 64 bit float conversion was assigning int64 type during codegen, overwriting the existing uint64 type. Note that some etype mismatches are appropriate, such as a struct with a single field or an array with a single element. With these fixes, the number of registerizations that occur while running make.bash for 6g increases ~10%. Hello world binary size shrinks ~1.5%. Running all benchmarks in the standard library show performance improvements ranging from nominal to substantive (>10%); a full comparison using 6g on my laptop is available at https://gist.github.com/josharian/8f9b5beb46667c272064. The microbenchmarks must be taken with a grain of salt; see issue 7920. The few benchmarks that show real regressions are likely due to issue 7920. I manually examined the generated code for the top few regressions and none had any assembly output changes. The few benchmarks that show extraordinary improvements are likely also due to issue 7920. Performance results from 8g appear similar to 6g. 5g shows no performance improvements. This is not surprising, given the discussion above. Update #7316 LGTM=rsc R=rsc, daniel.morsing, bradfitz CC=dave, golang-codereviews https://golang.org/cl/91850043

cmd/gc: alias more variables during register allocation
This is joint work with Daniel Morsing. In order for the register allocator to alias two variables, they must have the same width, stack offset, and etype. Code generation was altering a variable's etype in a few places. This prevented the variable from being moved to a register, which in turn prevented peephole optimization. This failure to alias was very common, with almost 23,000 instances just running make.bash. This phenomenon was not visible in the register allocation debug output because the variables that failed to alias had the same name. The debugging-only change to bits.c fixes this by printing the variable number with its name. This CL fixes the source of all etype mismatches for 6g, all but one case for 8g, and depressingly few cases for 5g. (I believe that extending CL 6819083 to 5g is a prerequisite.) Fixing the remaining cases in 8g and 5g is work for the future. The etype mismatch fixes are: * [gc] Slicing changed the type of the base pointer into a uintptr in order to perform arithmetic on it. Instead, support addition directly on pointers. * [*g] OSPTR was giving type uintptr to slice base pointers; undo that. This arose, for example, while compiling copy(dst, src). * [8g] 64 bit float conversion was assigning int64 type during codegen, overwriting the existing uint64 type. Note that some etype mismatches are appropriate, such as a struct with a single field or an array with a single element. With these fixes, the number of registerizations that occur while running make.bash for 6g increases ~10%. Hello world binary size shrinks ~1.5%. Running all benchmarks in the standard library show performance improvements ranging from nominal to substantive (>10%); a full comparison using 6g on my laptop is available at https://gist.github.com/josharian/8f9b5beb46667c272064. The microbenchmarks must be taken with a grain of salt; see issue 7920. The few benchmarks that show real regressions are likely due to issue 7920. I manually examined the generated code for the top few regressions and none had any assembly output changes. The few benchmarks that show extraordinary improvements are likely also due to issue 7920. Performance results from 8g appear similar to 6g. 5g shows no performance improvements. This is not surprising, given the discussion above. Update #7316 LGTM=rsc R=rsc, daniel.morsing, bradfitz CC=dave, golang-codereviews https://golang.org/cl/91850043
03c0f3fe · Josh Bleecher Snyder · Russ Cox · 2497c430 · 03c0f3fe · 03c0f3fe
Commit 03c0f3fe authored May 12, 2014 by Josh Bleecher Snyder Committed by Russ Cox May 12, 2014
11 changed files
--- a/src/cmd/5g/cgen.c
+++ b/src/cmd/5g/cgen.c
@@ -254,6 +254,7 @@ cgen(Node *n, Node *res)
 	case OOR:
 	case OXOR:
 	case OADD:
+	case OADDPTR:
 	case OMUL:
 		a = optoas(n->op, nl->type);
 		goto sbop;

--- a/src/cmd/5g/gsubr.c
+++ b/src/cmd/5g/gsubr.c
@@ -1366,7 +1366,7 @@ naddr(Node *n, Addr *a, int canemitcode)
 		naddr(n->left, a, canemitcode);
 		if(a->type == D_CONST && a->offset == 0)
 			break;	// ptr(nil)
-		a->etype = simtype[TUINTPTR];
+		a->etype = simtype[tptr];
 		a->offset += Array_array;
 		a->width = widthptr;
 		break;
@@ -1592,6 +1592,7 @@ optoas(int op, Type *t)
 	case CASE(OADD, TINT32):
 	case CASE(OADD, TUINT32):
 	case CASE(OADD, TPTR32):
+	case CASE(OADDPTR, TPTR32):
 		a = AADD;
 		break;


--- a/src/cmd/6g/cgen.c
+++ b/src/cmd/6g/cgen.c
@@ -247,6 +247,7 @@ cgen(Node *n, Node *res)
 	case OOR:
 	case OXOR:
 	case OADD:
+	case OADDPTR:
 	case OMUL:
 		a = optoas(n->op, nl->type);
 		if(a == AIMULB) {

--- a/src/cmd/6g/gsubr.c
+++ b/src/cmd/6g/gsubr.c
@@ -1300,7 +1300,7 @@ naddr(Node *n, Addr *a, int canemitcode)
 		naddr(n->left, a, canemitcode);
 		if(a->type == D_CONST && a->offset == 0)
 			break;	// ptr(nil)
-		a->etype = simtype[TUINTPTR];
+		a->etype = simtype[tptr];
 		a->offset += Array_array;
 		a->width = widthptr;
 		break;
@@ -1533,12 +1533,14 @@ optoas(int op, Type *t)
 	case CASE(OADD, TINT32):
 	case CASE(OADD, TUINT32):
 	case CASE(OADD, TPTR32):
+	case CASE(OADDPTR, TPTR32):
 		a = AADDL;
 		break;

 	case CASE(OADD, TINT64):
 	case CASE(OADD, TUINT64):
 	case CASE(OADD, TPTR64):
+	case CASE(OADDPTR, TPTR64):
 		a = AADDQ;
 		break;


--- a/src/cmd/8g/cgen.c
+++ b/src/cmd/8g/cgen.c
@@ -242,6 +242,7 @@ cgen(Node *n, Node *res)
 	case OOR:
 	case OXOR:
 	case OADD:
+	case OADDPTR:
 	case OMUL:
 		a = optoas(n->op, nl->type);
 		if(a == AIMULB) {

--- a/src/cmd/8g/gsubr.c
+++ b/src/cmd/8g/gsubr.c
@@ -432,6 +432,7 @@ optoas(int op, Type *t)
 	case CASE(OADD, TINT32):
 	case CASE(OADD, TUINT32):
 	case CASE(OADD, TPTR32):
+	case CASE(OADDPTR, TPTR32):
 		a = AADDL;
 		break;

@@ -1687,7 +1688,6 @@ floatmove(Node *f, Node *t)
 		gins(ACMPL, &thi, ncon(0));
 		p1 = gbranch(AJLT, T, 0);
 		// native
-		t1.type = types[TINT64];
 		nodreg(&r1, types[tt], D_F0);
 		gins(AFMOVV, &t1, &r1);
 		if(tt == TFLOAT32)
@@ -2327,7 +2327,7 @@ naddr(Node *n, Addr *a, int canemitcode)
 		naddr(n->left, a, canemitcode);
 		if(a->type == D_CONST && a->offset == 0)
 			break;	// ptr(nil)
-		a->etype = simtype[TUINTPTR];
+		a->etype = simtype[tptr];
 		a->offset += Array_array;
 		a->width = widthptr;
 		break;

--- a/src/cmd/gc/bits.c
+++ b/src/cmd/gc/bits.c
@@ -153,7 +153,7 @@ Qconv(Fmt *fp)
 		if(var[i].node == N || var[i].node->sym == S)
 			fmtprint(fp, "$%d", i);
 		else {
-			fmtprint(fp, "%s", var[i].node->sym->name);
+			fmtprint(fp, "%s(%d)", var[i].node->sym->name, i);
 			if(var[i].offset != 0)
 				fmtprint(fp, "%+lld", (vlong)var[i].offset);
 		}

--- a/src/cmd/gc/gen.c
+++ b/src/cmd/gc/gen.c
@@ -829,7 +829,6 @@ cgen_slice(Node *n, Node *res)
 		src = *n->left;
 	if(n->op == OSLICE || n->op == OSLICE3 || n->op == OSLICESTR)
 		src.xoffset += Array_array;
-	src.type = types[TUINTPTR];

 	if(n->op == OSLICEARR || n->op == OSLICE3ARR) {
 		if(!isptr[n->left->type->etype])
@@ -842,9 +841,11 @@ cgen_slice(Node *n, Node *res)
 			cgen(add, base);
 		}
 	} else if(offs == N) {
+		src.type = types[tptr];
 		cgen(&src, base);
 	} else {
-		add = nod(OADD, &src, offs);
+		src.type = types[tptr];
+		add = nod(OADDPTR, &src, offs);
 		typecheck(&add, Erv);
 		cgen(add, base);
 	}
@@ -855,7 +856,7 @@ cgen_slice(Node *n, Node *res)
 	// dst.array = src.array  [ + lo *width ]
 	dst = *res;
 	dst.xoffset += Array_array;
-	dst.type = types[TUINTPTR];
+	dst.type = types[tptr];
 	
 	cgen(base, &dst);


--- a/src/cmd/gc/go.h
+++ b/src/cmd/gc/go.h
@@ -445,6 +445,7 @@ enum
 	OSUB,	// x - y
 	OOR,	// x | y
 	OXOR,	// x ^ y
+	OADDPTR,	// ptr + uintptr, inserted by compiler only, used to avoid unsafe type changes during codegen
 	OADDSTR,	// s + "foo"
 	OADDR,	// &x
 	OANDAND,	// b0 && b1

--- a/src/cmd/gc/typecheck.c
+++ b/src/cmd/gc/typecheck.c
@@ -535,6 +535,19 @@ reswitch:
 		op = n->etype;
 		goto arith;

+	case OADDPTR:
+		ok |= Erv;
+		l = typecheck(&n->left, Erv);
+		r = typecheck(&n->right, Erv);
+		if(l->type == T || r->type == T)
+			goto error;
+		if(l->type->etype != tptr)
+			fatal("bad OADDPTR left type %E for %N", l->type->etype, n->left);
+		if(r->type->etype != TUINTPTR)
+			fatal("bad OADDPTR right type %E for %N", r->type->etype, n->right);
+		n->type = types[tptr];
+		goto ret;
+
 	case OADD:
 	case OAND:
 	case OANDAND:

--- a/test/fixedbugs/issue7316.go
+++ b/test/fixedbugs/issue7316.go
+// runoutput
+
+// Copyright 2014 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Issue 7316
+// This test exercises all types of numeric conversions, which was one
+// of the sources of etype mismatch during register allocation in 8g.
+
+package main
+
+import "fmt"
+
+const tpl = `
+func init() {
+	var i %s
+	j := %s(i)
+	_ = %s(j)
+}
+`
+
+func main() {
+	fmt.Println("package main")
+	ntypes := []string{
+		"byte", "rune", "uintptr",
+		"float32", "float64",
+		"int", "int8", "int16", "int32", "int64",
+		"uint", "uint8", "uint16", "uint32", "uint64",
+	}
+	for i, from := range ntypes {
+		for _, to := range ntypes[i:] {
+			fmt.Printf(tpl, from, to, from)
+		}
+	}
+	fmt.Println("func main() {}")
+}