cmd/internal/gc: use hardware instruction for math.Sqrt (amd64/arm)

I first prototyped this change in Sept 2011, and I discarded it because it made no difference in the obvious benchmark loop. It still makes no difference in the obvious benchmark loop, but in a less obvious one, doing some extra computation around the calls to Sqrt, not making the call does have a significant effect. benchmark old ns/op new ns/op delta BenchmarkSqrt 4.56 4.57 +0.22% BenchmarkSqrtIndirect 4.56 4.56 +0.00% BenchmarkSqrtGo 69.4 69.4 +0.00% BenchmarkSqrtPrime 4417 3647 -17.43% This is a warmup for using hardware expansions for some calls to 1-line assembly routines in the runtime (for example getg). Change-Id: Ie66be23f8c09d0f7dc4ddd7ca8a93cfce28f55a4 Reviewed-on: https://go-review.googlesource.com/8356Reviewed-by: Rob Pike <r@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org>

cmd/internal/gc: use hardware instruction for math.Sqrt (amd64/arm)
I first prototyped this change in Sept 2011, and I discarded it because it made no difference in the obvious benchmark loop. It still makes no difference in the obvious benchmark loop, but in a less obvious one, doing some extra computation around the calls to Sqrt, not making the call does have a significant effect. benchmark old ns/op new ns/op delta BenchmarkSqrt 4.56 4.57 +0.22% BenchmarkSqrtIndirect 4.56 4.56 +0.00% BenchmarkSqrtGo 69.4 69.4 +0.00% BenchmarkSqrtPrime 4417 3647 -17.43% This is a warmup for using hardware expansions for some calls to 1-line assembly routines in the runtime (for example getg). Change-Id: Ie66be23f8c09d0f7dc4ddd7ca8a93cfce28f55a4 Reviewed-on: https://go-review.googlesource.com/8356Reviewed-by: Rob Pike <r@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org>
92dba0d2 · Russ Cox · 90c0fefe · 92dba0d2 · 92dba0d2 · 92dba0d2
Commit 92dba0d2 authored Apr 01, 2015 by Russ Cox
11 changed files
--- a/src/cmd/5g/gsubr.go
+++ b/src/cmd/5g/gsubr.go
@@ -1055,6 +1055,9 @@ func optoas(op int, t *gc.Type) int {

 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = arm.ADIVD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = arm.ASQRTD
 	}

 	return a

--- a/src/cmd/5g/peep.go
+++ b/src/cmd/5g/peep.go
@@ -1101,6 +1101,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {
 		return 0

 	case obj.ANOP, /* read,, write */
+		arm.ASQRTD,
 		arm.AMOVW,
 		arm.AMOVF,
 		arm.AMOVD,

--- a/src/cmd/5g/prog.go
+++ b/src/cmd/5g/prog.go
@@ -70,16 +70,17 @@ var progtable = [arm.ALAST]obj.ProgInfo{
 	arm.ATST:    {gc.SizeL | gc.LeftRead | gc.RightRead, 0, 0, 0},

 	// Floating point.
-	arm.AADDD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AADDF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ACMPD: {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ACMPF: {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ADIVD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ADIVF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ACMPD:  {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ACMPF:  {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ADIVD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ADIVF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASQRTD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},

 	// Conversions.
 	arm.AMOVWD: {gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv, 0, 0, 0},

--- a/src/cmd/6g/gsubr.go
+++ b/src/cmd/6g/gsubr.go
@@ -1131,6 +1131,9 @@ func optoas(op int, t *gc.Type) int {

 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = x86.ADIVSD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = x86.ASQRTSD
 	}

 	return a

--- a/src/cmd/6g/prog.go
+++ b/src/cmd/6g/prog.go
@@ -204,6 +204,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
 	x86.ASHRL:     {gc.SizeL | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRQ:     {gc.SizeQ | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRW:     {gc.SizeW | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
+	x86.ASQRTSD:   {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
 	x86.ASTOSB:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSL:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSQ:    {gc.OK, AX | DI, DI, 0},

--- a/src/cmd/internal/gc/cgen.go
+++ b/src/cmd/internal/gc/cgen.go
@@ -409,6 +409,15 @@ func Cgen(n *Node, res *Node) {
 		cgen_norm(n, &n1, res)
 		return

+	case OSQRT:
+		var n1 Node
+		Regalloc(&n1, nl.Type, res)
+		Cgen(n.Left, &n1)
+		Thearch.Gins(Thearch.Optoas(OSQRT, nl.Type), &n1, &n1)
+		Thearch.Gmove(&n1, res)
+		Regfree(&n1)
+		return
+
 		// symmetric binary
 	case OAND,
 		OOR,

--- a/src/cmd/internal/gc/gen.go
+++ b/src/cmd/internal/gc/gen.go
@@ -1002,6 +1002,9 @@ func gen(n *Node) {
 	case ORETURN, ORETJMP:
 		cgen_ret(n)

+	case OSQRT:
+		cgen_discard(n.Left)
+
 	case OCHECKNIL:
 		Cgen_checknil(n.Left)


--- a/src/cmd/internal/gc/syntax.go
+++ b/src/cmd/internal/gc/syntax.go
@@ -293,7 +293,7 @@ const (
 	OREGISTER // a register, such as AX.
 	OINDREG   // offset plus indirect of a register, such as 8(SP).

-	// 386/amd64-specific opcodes
+	// arch-specific opcodes
 	OCMP    // compare: ACMP.
 	ODEC    // decrement: ADEC.
 	OINC    // increment: AINC.
@@ -303,6 +303,7 @@ const (
 	ORROTC  // right rotate-carry: ARCR.
 	ORETJMP // return to other function
 	OPS     // compare parity set (for x86 NaN check)
+	OSQRT   // sqrt(float64), on systems that have hw support

 	OEND
 )

--- a/src/cmd/internal/gc/walk.go
+++ b/src/cmd/internal/gc/walk.go
@@ -622,6 +622,16 @@ func walkexpr(np **Node, init **NodeList) {
 		walkexpr(&n.Left, init)
 		walkexprlist(n.List, init)

+		if n.Left.Op == ONAME && n.Left.Sym.Name == "Sqrt" && n.Left.Sym.Pkg.Path == "math" {
+			switch Thearch.Thechar {
+			case '5', '6':
+				n.Op = OSQRT
+				n.Left = n.List.N
+				n.List = nil
+				goto ret
+			}
+		}
+
 		ll := ascompatte(int(n.Op), n, n.Isddd, getinarg(t), n.List, 0, init)
 		n.List = reorder1(ll)
 		goto ret

--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@@ -2977,15 +2977,56 @@ func BenchmarkSinh(b *testing.B) {
 	}
 }

+var Global float64
+
 func BenchmarkSqrt(b *testing.B) {
+	x, y := 0.0, 10.0
+	for i := 0; i < b.N; i++ {
+		x += Sqrt(y)
+	}
+	Global = x
+}
+
+func BenchmarkSqrtIndirect(b *testing.B) {
+	x, y := 0.0, 10.0
+	f := Sqrt
 	for i := 0; i < b.N; i++ {
-		Sqrt(10)
+		x += f(y)
 	}
+	Global = x
 }

 func BenchmarkSqrtGo(b *testing.B) {
+	x, y := 0.0, 10.0
 	for i := 0; i < b.N; i++ {
-		SqrtGo(10)
+		x += SqrtGo(y)
+	}
+	Global = x
+}
+
+func isPrime(i int) bool {
+	// Yes, this is a dumb way to write this code,
+	// but calling Sqrt repeatedly in this way demonstrates
+	// the benefit of using a direct SQRT instruction on systems
+	// that have one, whereas the obvious loop seems not to
+	// demonstrate such a benefit.
+	for j := 2; float64(j) <= Sqrt(float64(i)); j++ {
+		if i%j == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func BenchmarkSqrtPrime(b *testing.B) {
+	any := false
+	for i := 0; i < b.N; i++ {
+		if isPrime(100003) {
+			any = true
+		}
+	}
+	if any {
+		Global = 1
 	}
 }


--- a/src/math/sqrt.go
+++ b/src/math/sqrt.go
@@ -91,6 +91,11 @@ package math
 //	Sqrt(NaN) = NaN
 func Sqrt(x float64) float64

+// Note: Sqrt is implemented in assembly on some systems.
+// Others have assembly stubs that jump to func sqrt below.
+// On systems where Sqrt is a single instruction, the compiler
+// may turn a direct call into a direct use of that instruction instead.
+
 func sqrt(x float64) float64 {
 	// special cases
 	switch {