math: optimize Exp and Exp2 on arm64

This CL implements Exp and Exp2 with arm64 assembly. By inlining Ldexp and using fused instructions(fmadd, fmsub, fnmsub), this CL helps to improve the performance of functions Exp, Exp2, Sinh, Cosh and Tanh. Benchmarks: name old time/op new time/op delta Cosh-8 138ns ± 0% 96ns ± 0% -30.72% (p=0.008 n=5+5) Exp-8 105ns ± 0% 58ns ± 0% -45.24% (p=0.000 n=5+4) Exp2-8 100ns ± 0% 57ns ± 0% -43.21% (p=0.008 n=5+5) Sinh-8 139ns ± 0% 102ns ± 0% -26.62% (p=0.008 n=5+5) Tanh-8 134ns ± 0% 100ns ± 0% -25.67% (p=0.008 n=5+5) Change-Id: I7483a3333062a1d3525cedf3de56db78d79031c6 Reviewed-on: https://go-review.googlesource.com/86615 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>

math: optimize Exp and Exp2 on arm64
This CL implements Exp and Exp2 with arm64 assembly. By inlining Ldexp and using fused instructions(fmadd, fmsub, fnmsub), this CL helps to improve the performance of functions Exp, Exp2, Sinh, Cosh and Tanh. Benchmarks: name old time/op new time/op delta Cosh-8 138ns ± 0% 96ns ± 0% -30.72% (p=0.008 n=5+5) Exp-8 105ns ± 0% 58ns ± 0% -45.24% (p=0.000 n=5+4) Exp2-8 100ns ± 0% 57ns ± 0% -43.21% (p=0.008 n=5+5) Sinh-8 139ns ± 0% 102ns ± 0% -26.62% (p=0.008 n=5+5) Tanh-8 134ns ± 0% 100ns ± 0% -25.67% (p=0.008 n=5+5) Change-Id: I7483a3333062a1d3525cedf3de56db78d79031c6 Reviewed-on: https://go-review.googlesource.com/86615 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
711a373c · erifan01 · Cherry Zhang · 0a5be12f · 711a373c · 711a373c
Commit 711a373c authored Dec 19, 2017 by erifan01 Committed by Cherry Zhang Mar 27, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 192 additions and 7 deletions

all_test.go src/math/all_test.go +10 -1

exp_arm64.s src/math/exp_arm64.s +182 -0

stubs_arm64.s src/math/stubs_arm64.s +0 -6

No files found.
--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@@ -1012,6 +1012,8 @@ var vfexpSC = []float64{
 	1.48852223e+09,
 	1.4885222e+09,
 	1,
+	// near zero
+	3.725290298461915e-09,
 }
 var expSC = []float64{
 	0,
@@ -1023,6 +1025,7 @@ var expSC = []float64{
 	Inf(1),
 	Inf(1),
 	2.718281828459045,
+	1.0000000037252903,
 }

 var vfexp2SC = []float64{
@@ -1033,6 +1036,10 @@ var vfexp2SC = []float64{
 	NaN(),
 	// smallest float64 that overflows Exp2(x)
 	1024,
+	// near underflow
+	-1.07399999999999e+03,
+	// near zero
+	3.725290298461915e-09,
 }
 var exp2SC = []float64{
 	0,
@@ -1041,6 +1048,8 @@ var exp2SC = []float64{
 	Inf(1),
 	NaN(),
 	Inf(1),
+	5e-324,
+	1.0000000025821745,
 }

 var vfexpm1SC = []float64{
@@ -2316,7 +2325,7 @@ func testExp2(t *testing.T, Exp2 func(float64) float64, name string) {
 	}
 	for i := 0; i < len(vfexp2SC); i++ {
 		if f := Exp2(vfexp2SC[i]); !alike(exp2SC[i], f) {
-			t.Errorf("%s(%g) = %g, want %g", name, vfexpSC[i], f, expSC[i])
+			t.Errorf("%s(%g) = %g, want %g", name, vfexp2SC[i], f, exp2SC[i])
 		}
 	}
 	for n := -1074; n < 1024; n++ {

--- a/src/math/exp_arm64.s
+++ b/src/math/exp_arm64.s
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#define	Ln2Hi	6.93147180369123816490e-01
+#define	Ln2Lo	1.90821492927058770002e-10
+#define	Log2e	1.44269504088896338700e+00
+#define	Overflow	7.09782712893383973096e+02
+#define	Underflow	-7.45133219101941108420e+02
+#define	Overflow2	1.0239999999999999e+03
+#define	Underflow2	-1.0740e+03
+#define	NearZero	0x3e30000000000000	// 2**-28
+#define	PosInf	0x7ff0000000000000
+#define	FracMask	0x000fffffffffffff
+#define	C1	0x3cb0000000000000	// 2**-52
+#define	P1	1.66666666666666657415e-01	// 0x3FC55555; 0x55555555
+#define	P2	-2.77777777770155933842e-03	// 0xBF66C16C; 0x16BEBD93
+#define	P3	6.61375632143793436117e-05	// 0x3F11566A; 0xAF25DE2C
+#define	P4	-1.65339022054652515390e-06	// 0xBEBBBD41; 0xC5D26BF1
+#define	P5	4.13813679705723846039e-08	// 0x3E663769; 0x72BEA4D0
+
+// Exp returns e**x, the base-e exponential of x.
+// This is an assembly implementation of the method used for function Exp in file exp.go.
+//
+// func Exp(x float64) float64
+TEXT ·Exp(SB),$0-16
+	FMOVD	x+0(FP), F0	// F0 = x
+	FCMPD	F0, F0
+	BNE	isNaN		// x = NaN, return NaN
+	FMOVD	$Overflow, F1
+	FCMPD	F1, F0
+	BGT	overflow	// x > Overflow, return PosInf
+	FMOVD	$Underflow, F1
+	FCMPD	F1, F0
+	BLT	underflow	// x < Underflow, return 0
+	MOVD	$NearZero, R0
+	FMOVD	R0, F2
+	FABSD	F0, F3
+	FMOVD	$1.0, F1	// F1 = 1.0
+	FCMPD	F2, F3
+	BLT	nearzero	// fabs(x) < NearZero, return 1 + x
+	// argument reduction, x = k*ln2 + r,  |r| <= 0.5*ln2
+	// computed as r = hi - lo for extra precision.
+	FMOVD	$Log2e, F2
+	FMOVD	$0.5, F3
+	FNMSUBD	F0, F3, F2, F4	// Log2e*x - 0.5
+	FMADDD	F0, F3, F2, F3	// Log2e*x + 0.5
+	FCMPD	$0.0, F0
+	FCSELD	LT, F4, F3, F3	// F3 = k
+	FCVTZSD	F3, R1		// R1 = int(k)
+	SCVTFD	R1, F3		// F3 = float64(int(k))
+	FMOVD	$Ln2Hi, F4	// F4 = Ln2Hi
+	FMOVD	$Ln2Lo, F5	// F5 = Ln2Lo
+	FMSUBD	F3, F0, F4, F4	// F4 = hi = x - float64(int(k))*Ln2Hi
+	FMULD	F3, F5		// F5 = lo = float64(int(k)) * Ln2Lo
+	FSUBD	F5, F4, F6	// F6 = r = hi - lo
+	FMULD	F6, F6, F7	// F7 = t = r * r
+	// compute y
+	FMOVD	$P5, F8		// F8 = P5
+	FMOVD	$P4, F9		// F9 = P4
+	FMADDD	F7, F9, F8, F13	// P4+t*P5
+	FMOVD	$P3, F10	// F10 = P3
+	FMADDD	F7, F10, F13, F13	// P3+t*(P4+t*P5)
+	FMOVD	$P2, F11	// F11 = P2
+	FMADDD	F7, F11, F13, F13	// P2+t*(P3+t*(P4+t*P5))
+	FMOVD	$P1, F12	// F12 = P1
+	FMADDD	F7, F12, F13, F13	// P1+t*(P2+t*(P3+t*(P4+t*P5)))
+	FMSUBD	F7, F6, F13, F13	// F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
+	FMOVD	$2.0, F14
+	FSUBD	F13, F14
+	FMULD	F6, F13, F15
+	FDIVD	F14, F15	// F15 = (r*c)/(2-c)
+	FSUBD	F15, F5, F15	// lo-(r*c)/(2-c)
+	FSUBD	F4, F15, F15	// (lo-(r*c)/(2-c))-hi
+	FSUBD	F15, F1, F16	// F16 = y = 1-((lo-(r*c)/(2-c))-hi)
+	// inline Ldexp(y, k), benefit:
+	// 1, no parameter pass overhead.
+	// 2, skip unnecessary checks for Inf/NaN/Zero
+	FMOVD	F16, R0
+	AND	$FracMask, R0, R2	// fraction
+	LSR	$52, R0, R5	// exponent
+	ADD	R1, R5		// R1 = int(k)
+	CMP	$1, R5
+	BGE	normal
+	ADD	$52, R5		// denormal
+	MOVD	$C1, R8
+	FMOVD	R8, F1		// m = 2**-52
+normal:
+	ORR	R5<<52, R2, R0
+	FMOVD	R0, F0
+	FMULD	F1, F0		// return m * x
+	FMOVD	F0, ret+8(FP)
+	RET
+nearzero:
+	FADDD	F1, F0
+isNaN:
+	FMOVD	F0, ret+8(FP)
+	RET
+underflow:
+	MOVD	ZR, ret+8(FP)
+	RET
+overflow:
+	MOVD	$PosInf, R0
+	MOVD	R0, ret+8(FP)
+	RET
+
+
+// Exp2 returns 2**x, the base-2 exponential of x.
+// This is an assembly implementation of the method used for function Exp2 in file exp.go.
+//
+// func Exp2(x float64) float64
+TEXT ·Exp2(SB),$0-16
+	FMOVD	x+0(FP), F0	// F0 = x
+	FCMPD	F0, F0
+	BNE	isNaN		// x = NaN, return NaN
+	FMOVD	$Overflow2, F1
+	FCMPD	F1, F0
+	BGT	overflow	// x > Overflow, return PosInf
+	FMOVD	$Underflow2, F1
+	FCMPD	F1, F0
+	BLT	underflow	// x < Underflow, return 0
+	// argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2
+	// computed as r = hi - lo for extra precision.
+	FMOVD	$0.5, F2
+	FSUBD	F2, F0, F3	// x + 0.5
+	FADDD	F2, F0, F4	// x - 0.5
+	FCMPD	$0.0, F0
+	FCSELD	LT, F3, F4, F3	// F3 = k
+	FCVTZSD	F3, R1		// R1 = int(k)
+	SCVTFD	R1, F3		// F3 = float64(int(k))
+	FSUBD	F3, F0, F3	// t = x - float64(int(k))
+	FMOVD	$Ln2Hi, F4	// F4 = Ln2Hi
+	FMOVD	$Ln2Lo, F5	// F5 = Ln2Lo
+	FMULD	F3, F4		// F4 = hi = t * Ln2Hi
+	FNMULD	F3, F5		// F5 = lo = -t * Ln2Lo
+	FSUBD	F5, F4, F6	// F6 = r = hi - lo
+	FMULD	F6, F6, F7	// F7 = t = r * r
+	// compute y
+	FMOVD	$P5, F8		// F8 = P5
+	FMOVD	$P4, F9		// F9 = P4
+	FMADDD	F7, F9, F8, F13	// P4+t*P5
+	FMOVD	$P3, F10	// F10 = P3
+	FMADDD	F7, F10, F13, F13	// P3+t*(P4+t*P5)
+	FMOVD	$P2, F11	// F11 = P2
+	FMADDD	F7, F11, F13, F13	// P2+t*(P3+t*(P4+t*P5))
+	FMOVD	$P1, F12	// F12 = P1
+	FMADDD	F7, F12, F13, F13	// P1+t*(P2+t*(P3+t*(P4+t*P5)))
+	FMSUBD	F7, F6, F13, F13	// F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
+	FMOVD	$2.0, F14
+	FSUBD	F13, F14
+	FMULD	F6, F13, F15
+	FDIVD	F14, F15	// F15 = (r*c)/(2-c)
+	FMOVD	$1.0, F1	// F1 = 1.0
+	FSUBD	F15, F5, F15	// lo-(r*c)/(2-c)
+	FSUBD	F4, F15, F15	// (lo-(r*c)/(2-c))-hi
+	FSUBD	F15, F1, F16	// F16 = y = 1-((lo-(r*c)/(2-c))-hi)
+	// inline Ldexp(y, k), benefit:
+	// 1, no parameter pass overhead.
+	// 2, skip unnecessary checks for Inf/NaN/Zero
+	FMOVD	F16, R0
+	AND	$FracMask, R0, R2	// fraction
+	LSR	$52, R0, R5	// exponent
+	ADD	R1, R5		// R1 = int(k)
+	CMP	$1, R5
+	BGE	normal
+	ADD	$52, R5		// denormal
+	MOVD	$C1, R8
+	FMOVD	R8, F1		// m = 2**-52
+normal:
+	ORR	R5<<52, R2, R0
+	FMOVD	R0, F0
+	FMULD	F1, F0		// return m * x
+isNaN:
+	FMOVD	F0, ret+8(FP)
+	RET
+underflow:
+	MOVD	ZR, ret+8(FP)
+	RET
+overflow:
+	MOVD	$PosInf, R0
+	MOVD	R0, ret+8(FP)
+	RET
--- a/src/math/stubs_arm64.s
+++ b/src/math/stubs_arm64.s
@@ -27,9 +27,6 @@ TEXT ·Atan(SB),NOSPLIT,$0
 TEXT ·Atanh(SB),NOSPLIT,$0
 	B ·atanh(SB)

-TEXT ·Exp2(SB),NOSPLIT,$0
-	B ·exp2(SB)
-
 TEXT ·Erf(SB),NOSPLIT,$0
 	B ·erf(SB)

@@ -45,9 +42,6 @@ TEXT ·Cosh(SB),NOSPLIT,$0
 TEXT ·Expm1(SB),NOSPLIT,$0
 	B ·expm1(SB)

-TEXT ·Exp(SB),NOSPLIT,$0
-	B ·exp(SB)
-
 TEXT ·Frexp(SB),NOSPLIT,$0
 	B ·frexp(SB)