math/big: fixed Float.Float64, implemented Float.Float32

- fix bounds checks for exponent range of denormalized numbers - use correct rounding precision for denormalized numbers - added extra tests Change-Id: I6be56399afd0d9a603300a2e44b5539e08d6f592 Reviewed-on: https://go-review.googlesource.com/8096Reviewed-by: Alan Donovan <adonovan@google.com>

math/big: fixed Float.Float64, implemented Float.Float32
- fix bounds checks for exponent range of denormalized numbers - use correct rounding precision for denormalized numbers - added extra tests Change-Id: I6be56399afd0d9a603300a2e44b5539e08d6f592 Reviewed-on: https://go-review.googlesource.com/8096Reviewed-by: Alan Donovan <adonovan@google.com>
8d267b9b · Robert Griesemer · f8fd5502 · 8d267b9b · 8d267b9b
Commit 8d267b9b authored Mar 24, 2015 by Robert Griesemer
Hide whitespace changes
Inline Side-by-side

Showing with 259 additions and 46 deletions

float.go src/math/big/float.go +163 -35

float_test.go src/math/big/float_test.go +96 -11

No files found.
--- a/src/math/big/float.go
+++ b/src/math/big/float.go
@@ -750,6 +750,11 @@ func (z *Float) Copy(x *Float) *Float {
 	return z
 }

+func high32(x nat) uint32 {
+	// TODO(gri) This can be done more efficiently on 32bit platforms.
+	return uint32(high64(x) >> 32)
+}
+
 func high64(x nat) uint64 {
 	i := len(x)
 	if i == 0 {
@@ -872,15 +877,16 @@ func (x *Float) Int64() (int64, Accuracy) {
 	panic("unreachable")
 }

-// Float64 returns the float64 value nearest to x by rounding ToNearestEven
-// with 53 bits of precision.
-// If x is too small to be represented by a float64
-// (|x| < math.SmallestNonzeroFloat64), the result is (0, Below) or
-// (-0, Above), respectively, depending on the sign of x.
-// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
+// TODO(gri) Float32 and Float64 are very similar internally but for the
+// floatxx parameters and some conversions. Should factor out shared code.
+
+// Float32 returns the float32 value nearest to x. If x is too small to be
+// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
+// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
+// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
 // the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
 // The result is (NaN, Undef) for NaNs.
-func (x *Float) Float64() (float64, Accuracy) {
+func (x *Float) Float32() (float32, Accuracy) {
 	if debugFloat {
 		x.validate()
 	}
@@ -888,61 +894,183 @@ func (x *Float) Float64() (float64, Accuracy) {
 	switch x.form {
 	case finite:
 		// 0 < |x| < +Inf
+
+		const (
+			fbits = 32                //        float size
+			mbits = 23                //        mantissa size (excluding implicit msb)
+			ebits = fbits - mbits - 1 //     8  exponent size
+			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
+			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
+			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
+			emax  = bias              //   127  largest unbiased exponent (normal)
+		)
+
+		// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
+		// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
+		// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
+		// corresponding Float exponent.
+		// (see also implementation of math.Ldexp for similar code)
+
+		if x.exp < dmin+1 {
+			// underflow
+			if x.neg {
+				var z float32
+				return -z, Above
+			}
+			return 0.0, Below
+		}
+		// x.exp >= dmin+1
+
 		var r Float
-		r.prec = 53
+		r.prec = mbits + 1 // +1 for implicit msb
+		if x.exp < emin+1 {
+			// denormal number - round to fewer bits
+			r.prec = uint32(x.exp - dmin)
+		}
 		r.Set(x)

-		// Rounding via Set may have caused r to overflow
-		// to ±Inf (rounding never causes underflows to 0).
+		// Rounding may have caused r to overflow to ±Inf
+		// (rounding never causes underflows to 0).
 		if r.form == inf {
-			r.exp = 10000 // cause overflow below
+			r.exp = emax + 2 // cause overflow below
 		}

-		// see also implementation of math.Ldexp
+		if r.exp > emax+1 {
+			// overflow
+			if x.neg {
+				return float32(math.Inf(-1)), Below
+			}
+			return float32(math.Inf(+1)), Above
+		}
+		// dmin+1 <= r.exp <= emax+1
+
+		var s uint32
+		if r.neg {
+			s = 1 << (fbits - 1)
+		}
+
+		m := high32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
+
+		// Rounding may have caused a denormal number to
+		// become normal. Check again.
+		c := float32(1.0)
+		if r.exp < emin+1 {
+			// denormal number
+			r.exp += mbits
+			c = 1.0 / (1 << mbits) // 2**-mbits
+		}
+		// emin+1 <= r.exp <= emax+1
+		e := uint32(r.exp-emin) << mbits
+
+		return c * math.Float32frombits(s|e|m), r.acc

-		e := int64(r.exp) + 1022
-		if e <= -52 {
+	case zero:
+		if x.neg {
+			var z float32
+			return -z, Exact
+		}
+		return 0.0, Exact
+
+	case inf:
+		if x.neg {
+			return float32(math.Inf(-1)), Exact
+		}
+		return float32(math.Inf(+1)), Exact
+
+	case nan:
+		return float32(math.NaN()), Undef
+	}
+
+	panic("unreachable")
+}
+
+// Float64 returns the float64 value nearest to x. If x is too small to be
+// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
+// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
+// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
+// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
+// The result is (NaN, Undef) for NaNs.
+func (x *Float) Float64() (float64, Accuracy) {
+	if debugFloat {
+		x.validate()
+	}
+
+	switch x.form {
+	case finite:
+		// 0 < |x| < +Inf
+
+		const (
+			fbits = 64                //        float size
+			mbits = 52                //        mantissa size (excluding implicit msb)
+			ebits = fbits - mbits - 1 //    11  exponent size
+			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
+			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
+			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
+			emax  = bias              //  1023  largest unbiased exponent (normal)
+		)
+
+		// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
+		// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
+		// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
+		// corresponding Float exponent.
+		// (see also implementation of math.Ldexp for similar code)
+
+		if x.exp < dmin+1 {
 			// underflow
 			if x.neg {
-				z := 0.0
+				var z float64
 				return -z, Above
 			}
 			return 0.0, Below
 		}
-		// e > -52
+		// x.exp >= dmin+1
+
+		var r Float
+		r.prec = mbits + 1 // +1 for implicit msb
+		if x.exp < emin+1 {
+			// denormal number - round to fewer bits
+			r.prec = uint32(x.exp - dmin)
+		}
+		r.Set(x)
+
+		// Rounding may have caused r to overflow to ±Inf
+		// (rounding never causes underflows to 0).
+		if r.form == inf {
+			r.exp = emax + 2 // cause overflow below
+		}

-		if e >= 2047 {
+		if r.exp > emax+1 {
 			// overflow
 			if x.neg {
 				return math.Inf(-1), Below
 			}
 			return math.Inf(+1), Above
 		}
-		// -52 < e < 2047
-
-		denormal := false
-		if e < 0 {
-			denormal = true
-			e += 52
-		}
-		// 0 < e < 2047
+		// dmin+1 <= r.exp <= emax+1

-		s := uint64(0)
+		var s uint64
 		if r.neg {
-			s = 1 << 63
+			s = 1 << (fbits - 1)
 		}
-		m := high64(r.mant) >> 11 & (1<<52 - 1) // cut off msb (implicit 1 bit)
-		z := math.Float64frombits(s | uint64(e)<<52 | m)
-		if denormal {
-			// adjust for denormal
-			// TODO(gri) does this change accuracy?
-			z /= 1 << 52
+
+		m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
+
+		// Rounding may have caused a denormal number to
+		// become normal. Check again.
+		c := 1.0
+		if r.exp < emin+1 {
+			// denormal number
+			r.exp += mbits
+			c = 1.0 / (1 << mbits) // 2**-mbits
 		}
-		return z, r.acc
+		// emin+1 <= r.exp <= emax+1
+		e := uint64(r.exp-emin) << mbits
+
+		return c * math.Float64frombits(s|e|m), r.acc

 	case zero:
 		if x.neg {
-			z := 0.0
+			var z float64
 			return -z, Exact
 		}
 		return 0.0, Exact

--- a/src/math/big/float_test.go
+++ b/src/math/big/float_test.go
@@ -537,14 +537,14 @@ func TestFloatRound(t *testing.T) {

 // TestFloatRound24 tests that rounding a float64 to 24 bits
 // matches IEEE-754 rounding to nearest when converting a
-// float64 to a float32.
+// float64 to a float32 (excluding denormal numbers).
 func TestFloatRound24(t *testing.T) {
 	const x0 = 1<<26 - 0x10 // 11...110000 (26 bits)
 	for d := 0; d <= 0x10; d++ {
 		x := float64(x0 + d)
 		f := new(Float).SetPrec(24).SetFloat64(x)
-		got, _ := f.Float64()
-		want := float64(float32(x))
+		got, _ := f.Float32()
+		want := float32(x)
 		if got != want {
 			t.Errorf("Round(%g, 24) = %g; want %g", x, got, want)
 		}
@@ -837,7 +837,70 @@ func TestFloatInt64(t *testing.T) {
 	}
 }

+func TestFloatFloat32(t *testing.T) {
+	for _, test := range []struct {
+		x   string
+		out float32
+		acc Accuracy
+	}{
+		{"-Inf", float32(math.Inf(-1)), Exact},
+		{"-0x1.ffffff0p2147483646", float32(-math.Inf(+1)), Below}, // overflow in rounding
+		{"-1e10000", float32(math.Inf(-1)), Below},                 // overflow
+		{"-0x1p128", float32(math.Inf(-1)), Below},                 // overflow
+		{"-0x1.ffffff0p127", float32(-math.Inf(+1)), Below},        // overflow
+		{"-0x1.fffffe8p127", -math.MaxFloat32, Above},
+		{"-0x1.fffffe0p127", -math.MaxFloat32, Exact},
+		{"-12345.000000000000000000001", -12345, Above},
+		{"-12345.0", -12345, Exact},
+		{"-1.000000000000000000001", -1, Above},
+		{"-1", -1, Exact},
+		{"-0x0.000002p-126", -math.SmallestNonzeroFloat32, Exact},
+		{"-0x0.000002p-127", -0, Above}, // underflow
+		{"-1e-1000", -0, Above},         // underflow
+		{"0", 0, Exact},
+		{"1e-1000", 0, Below},         // underflow
+		{"0x0.000002p-127", 0, Below}, // underflow
+		{"0x0.000002p-126", math.SmallestNonzeroFloat32, Exact},
+		{"1", 1, Exact},
+		{"1.000000000000000000001", 1, Below},
+		{"12345.0", 12345, Exact},
+		{"12345.000000000000000000001", 12345, Below},
+		{"0x1.fffffe0p127", math.MaxFloat32, Exact},
+		{"0x1.fffffe8p127", math.MaxFloat32, Below},
+		{"0x1.ffffff0p127", float32(math.Inf(+1)), Above},        // overflow
+		{"0x1p128", float32(math.Inf(+1)), Above},                // overflow
+		{"1e10000", float32(math.Inf(+1)), Above},                // overflow
+		{"0x1.ffffff0p2147483646", float32(math.Inf(+1)), Above}, // overflow in rounding
+		{"+Inf", float32(math.Inf(+1)), Exact},
+	} {
+		// conversion should match strconv where syntax is agreeable
+		if f, err := strconv.ParseFloat(test.x, 32); err == nil && float32(f) != test.out {
+			t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out)
+		}
+
+		x := makeFloat(test.x)
+		out, acc := x.Float32()
+		if out != test.out || acc != test.acc {
+			t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), test.acc)
+		}
+
+		// test that x.SetFloat64(float64(f)).Float32() == f
+		var x2 Float
+		out2, acc2 := x2.SetFloat64(float64(out)).Float32()
+		if out2 != out || acc2 != Exact {
+			t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out)
+		}
+	}
+
+	// test NaN
+	x := makeFloat("NaN")
+	if out, acc := x.Float32(); out == out || acc != Undef {
+		t.Errorf("NaN: got %g (%s); want NaN (Undef)", out, acc)
+	}
+}
+
 func TestFloatFloat64(t *testing.T) {
+	const smallestNormalFloat64 = 2.2250738585072014e-308 // 1p-1022
 	for _, test := range []struct {
 		x   string
 		out float64
@@ -849,7 +912,7 @@ func TestFloatFloat64(t *testing.T) {
 		{"-0x1p1024", math.Inf(-1), Below},                       // overflow
 		{"-0x1.fffffffffffff8p1023", -math.Inf(+1), Below},       // overflow
 		{"-0x1.fffffffffffff4p1023", -math.MaxFloat64, Above},
-		{"-0x1.fffffffffffffp1023", -math.MaxFloat64, Exact},
+		{"-0x1.fffffffffffff0p1023", -math.MaxFloat64, Exact},
 		{"-12345.000000000000000000001", -12345, Above},
 		{"-12345.0", -12345, Exact},
 		{"-1.000000000000000000001", -1, Above},
@@ -865,18 +928,39 @@ func TestFloatFloat64(t *testing.T) {
 		{"1.000000000000000000001", 1, Below},
 		{"12345.0", 12345, Exact},
 		{"12345.000000000000000000001", 12345, Below},
-		{"0x1.fffffffffffffp1023", math.MaxFloat64, Exact},
+		{"0x1.fffffffffffff0p1023", math.MaxFloat64, Exact},
 		{"0x1.fffffffffffff4p1023", math.MaxFloat64, Below},
 		{"0x1.fffffffffffff8p1023", math.Inf(+1), Above},       // overflow
 		{"0x1p1024", math.Inf(+1), Above},                      // overflow
 		{"1e10000", math.Inf(+1), Above},                       // overflow
 		{"0x1.fffffffffffff8p2147483646", math.Inf(+1), Above}, // overflow in rounding
 		{"+Inf", math.Inf(+1), Exact},
+
+		// selected denormalized values that were handled incorrectly in the past
+		{"0x.fffffffffffffp-1022", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact},
+		{"4503599627370495p-1074", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact},
+
+		// http://www.exploringbinary.com/php-hangs-on-numeric-value-2-2250738585072011e-308/
+		{"2.2250738585072011e-308", 2.225073858507201e-308, Below},
+		// http://www.exploringbinary.com/java-hangs-when-converting-2-2250738585072012e-308/
+		{"2.2250738585072012e-308", 2.2250738585072014e-308, Above},
 	} {
+		// conversion should match strconv where syntax is agreeable
+		if f, err := strconv.ParseFloat(test.x, 64); err == nil && f != test.out {
+			t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out)
+		}
+
 		x := makeFloat(test.x)
 		out, acc := x.Float64()
 		if out != test.out || acc != test.acc {
-			t.Errorf("%s: got %g (%s); want %g (%s)", test.x, out, acc, test.out, test.acc)
+			t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), test.acc)
+		}
+
+		// test that x.SetFloat64(f).Float64() == f
+		var x2 Float
+		out2, acc2 := x2.SetFloat64(out).Float64()
+		if out2 != out || acc2 != Exact {
+			t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out)
 		}
 	}

@@ -1108,7 +1192,8 @@ func TestFloatAdd(t *testing.T) {
 }

 // TestFloatAdd32 tests that Float.Add/Sub of numbers with
-// 24bit mantissa behaves like float32 addition/subtraction.
+// 24bit mantissa behaves like float32 addition/subtraction
+// (excluding denormal numbers).
 func TestFloatAdd32(t *testing.T) {
 	// chose base such that we cross the mantissa precision limit
 	const base = 1<<26 - 0x10 // 11...110000 (26 bits)
@@ -1124,15 +1209,15 @@ func TestFloatAdd32(t *testing.T) {
 			z := new(Float).SetPrec(24)

 			z.Add(x, y)
-			got, acc := z.Float64()
-			want := float64(float32(y0) + float32(x0))
+			got, acc := z.Float32()
+			want := float32(y0) + float32(x0)
 			if got != want || acc != Exact {
 				t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want)
 			}

 			z.Sub(z, y)
-			got, acc = z.Float64()
-			want = float64(float32(want) - float32(y0))
+			got, acc = z.Float32()
+			want = float32(want) - float32(y0)
 			if got != want || acc != Exact {
 				t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want)
 			}