math/big: fix Float.Float64 conversion for denormal corner cases

- This change uses the same code as for Float32 and fixes the case of a number that gets rounded up to the smallest denormal. - Enabled correspoding test case. Change-Id: I8aac874a566cd727863a82717854f603fbdc26c6 Reviewed-on: https://go-review.googlesource.com/10352Reviewed-by: Alan Donovan <adonovan@google.com>

math/big: fix Float.Float64 conversion for denormal corner cases
- This change uses the same code as for Float32 and fixes the case of a number that gets rounded up to the smallest denormal. - Enabled correspoding test case. Change-Id: I8aac874a566cd727863a82717854f603fbdc26c6 Reviewed-on: https://go-review.googlesource.com/10352Reviewed-by: Alan Donovan <adonovan@google.com>
5ffbca48 · Robert Griesemer · 8de8925c · 5ffbca48 · 5ffbca48
Commit 5ffbca48 authored May 22, 2015 by Robert Griesemer
Hide whitespace changes
Inline Side-by-side

Showing with 40 additions and 39 deletions

float.go src/math/big/float.go +38 -36

float_test.go src/math/big/float_test.go +2 -3

No files found.
--- a/src/math/big/float.go
+++ b/src/math/big/float.go
@@ -959,10 +959,6 @@ func (x *Float) Float32() (float32, Accuracy) {
 	panic("unreachable")
 }

-// TODO(gri) Use same algorithm for Float64 as for Float32. The Float64 code
-// is incorrect for some corner cases (numbers that get rounded up to smallest
-// denormal - test cases are missing).
-
 // Float64 returns the float64 value nearest to x. If x is too small to be
 // represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
 // is (0, Below) or (-0, Above), respectively, depending on the sign of x.
@@ -987,64 +983,70 @@ func (x *Float) Float64() (float64, Accuracy) {
 			emax  = bias              //  1023  largest unbiased exponent (normal)
 		)

-		// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
-		// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
-		// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
-		// corresponding Float exponent.
-		// (see also implementation of math.Ldexp for similar code)
+		// Float mantissa m is 0.5 <= m < 1.0; compute exponent for floatxx mantissa.
+		e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0
+		p := mbits + 1 // precision of normal float

-		if x.exp < dmin+1 {
-			// underflow
-			if x.neg {
-				var z float64
-				return -z, Above
+		// If the exponent is too small, we may have a denormal number
+		// in which case we have fewer mantissa bits available: reduce
+		// precision accordingly.
+		if e < emin {
+			p -= emin - int(e)
+			// Make sure we have at least 1 bit so that we don't
+			// lose numbers rounded up to the smallest denormal.
+			if p < 1 {
+				p = 1
 			}
-			return 0.0, Below
 		}
-		// x.exp >= dmin+1

+		// round
 		var r Float
-		r.prec = mbits + 1 // +1 for implicit msb
-		if x.exp < emin+1 {
-			// denormal number - round to fewer bits
-			r.prec = uint32(x.exp - dmin)
-		}
+		r.prec = uint32(p)
 		r.Set(x)
+		e = r.exp - 1

 		// Rounding may have caused r to overflow to ±Inf
 		// (rounding never causes underflows to 0).
 		if r.form == inf {
-			r.exp = emax + 2 // cause overflow below
+			e = emax + 1 // cause overflow below
 		}

-		if r.exp > emax+1 {
+		// If the exponent is too large, overflow to ±Inf.
+		if e > emax {
 			// overflow
 			if x.neg {
 				return math.Inf(-1), Below
 			}
 			return math.Inf(+1), Above
 		}
-		// dmin+1 <= r.exp <= emax+1

-		var s uint64
-		if r.neg {
-			s = 1 << (fbits - 1)
+		// Determine sign, biased exponent, and mantissa.
+		var sign, bexp, mant uint64
+		if x.neg {
+			sign = 1 << (fbits - 1)
 		}

-		m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
-
 		// Rounding may have caused a denormal number to
 		// become normal. Check again.
-		c := 1.0
-		if r.exp < emin+1 {
+		if e < emin {
 			// denormal number
-			r.exp += mbits
-			c = 1.0 / (1 << mbits) // 2**-mbits
+			if e < dmin {
+				// underflow to ±0
+				if x.neg {
+					var z float64
+					return -z, Above
+				}
+				return 0.0, Below
+			}
+			// bexp = 0
+			mant = high64(r.mant) >> (fbits - r.prec)
+		} else {
+			// normal number: emin <= e <= emax
+			bexp = uint64(e+bias) << mbits
+			mant = high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
 		}
-		// emin+1 <= r.exp <= emax+1
-		e := uint64(r.exp-emin) << mbits

-		return c * math.Float64frombits(s|e|m), r.acc
+		return math.Float64frombits(sign | bexp | mant), r.acc

 	case zero:
 		if x.neg {

--- a/src/math/big/float_test.go
+++ b/src/math/big/float_test.go
@@ -922,9 +922,8 @@ func TestFloatFloat64(t *testing.T) {
 		{"0x0.00000000000008p-1022", 0, Below},

 		// denormals
-		// TODO(gri) enable once Float64 is fixed
-		// {"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal
-		{"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal
+		{"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal
+		{"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact},  // smallest denormal
 		{"0x.8p-1073", math.SmallestNonzeroFloat64, Exact},
 		{"1p-1074", math.SmallestNonzeroFloat64, Exact},
 		{"0x.fffffffffffffp-1022", math.Float64frombits(0x000fffffffffffff), Exact}, // largest denormal