Commit 5ffbca48 authored by Robert Griesemer's avatar Robert Griesemer

math/big: fix Float.Float64 conversion for denormal corner cases

- This change uses the same code as for Float32 and fixes the case
  of a number that gets rounded up to the smallest denormal.

- Enabled correspoding test case.

Change-Id: I8aac874a566cd727863a82717854f603fbdc26c6
Reviewed-on: https://go-review.googlesource.com/10352Reviewed-by: 's avatarAlan Donovan <adonovan@google.com>
parent 8de8925c
...@@ -959,10 +959,6 @@ func (x *Float) Float32() (float32, Accuracy) { ...@@ -959,10 +959,6 @@ func (x *Float) Float32() (float32, Accuracy) {
panic("unreachable") panic("unreachable")
} }
// TODO(gri) Use same algorithm for Float64 as for Float32. The Float64 code
// is incorrect for some corner cases (numbers that get rounded up to smallest
// denormal - test cases are missing).
// Float64 returns the float64 value nearest to x. If x is too small to be // Float64 returns the float64 value nearest to x. If x is too small to be
// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result // represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
// is (0, Below) or (-0, Above), respectively, depending on the sign of x. // is (0, Below) or (-0, Above), respectively, depending on the sign of x.
...@@ -987,64 +983,70 @@ func (x *Float) Float64() (float64, Accuracy) { ...@@ -987,64 +983,70 @@ func (x *Float) Float64() (float64, Accuracy) {
emax = bias // 1023 largest unbiased exponent (normal) emax = bias // 1023 largest unbiased exponent (normal)
) )
// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0. // Float mantissa m is 0.5 <= m < 1.0; compute exponent for floatxx mantissa.
// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0. e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0
// For a given mantissa m, we need to add 1 to a floatxx exponent to get the p := mbits + 1 // precision of normal float
// corresponding Float exponent.
// (see also implementation of math.Ldexp for similar code)
if x.exp < dmin+1 { // If the exponent is too small, we may have a denormal number
// underflow // in which case we have fewer mantissa bits available: reduce
if x.neg { // precision accordingly.
var z float64 if e < emin {
return -z, Above p -= emin - int(e)
// Make sure we have at least 1 bit so that we don't
// lose numbers rounded up to the smallest denormal.
if p < 1 {
p = 1
} }
return 0.0, Below
} }
// x.exp >= dmin+1
// round
var r Float var r Float
r.prec = mbits + 1 // +1 for implicit msb r.prec = uint32(p)
if x.exp < emin+1 {
// denormal number - round to fewer bits
r.prec = uint32(x.exp - dmin)
}
r.Set(x) r.Set(x)
e = r.exp - 1
// Rounding may have caused r to overflow to ±Inf // Rounding may have caused r to overflow to ±Inf
// (rounding never causes underflows to 0). // (rounding never causes underflows to 0).
if r.form == inf { if r.form == inf {
r.exp = emax + 2 // cause overflow below e = emax + 1 // cause overflow below
} }
if r.exp > emax+1 { // If the exponent is too large, overflow to ±Inf.
if e > emax {
// overflow // overflow
if x.neg { if x.neg {
return math.Inf(-1), Below return math.Inf(-1), Below
} }
return math.Inf(+1), Above return math.Inf(+1), Above
} }
// dmin+1 <= r.exp <= emax+1
var s uint64 // Determine sign, biased exponent, and mantissa.
if r.neg { var sign, bexp, mant uint64
s = 1 << (fbits - 1) if x.neg {
sign = 1 << (fbits - 1)
} }
m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
// Rounding may have caused a denormal number to // Rounding may have caused a denormal number to
// become normal. Check again. // become normal. Check again.
c := 1.0 if e < emin {
if r.exp < emin+1 {
// denormal number // denormal number
r.exp += mbits if e < dmin {
c = 1.0 / (1 << mbits) // 2**-mbits // underflow to ±0
if x.neg {
var z float64
return -z, Above
}
return 0.0, Below
}
// bexp = 0
mant = high64(r.mant) >> (fbits - r.prec)
} else {
// normal number: emin <= e <= emax
bexp = uint64(e+bias) << mbits
mant = high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
} }
// emin+1 <= r.exp <= emax+1
e := uint64(r.exp-emin) << mbits
return c * math.Float64frombits(s|e|m), r.acc return math.Float64frombits(sign | bexp | mant), r.acc
case zero: case zero:
if x.neg { if x.neg {
......
...@@ -922,9 +922,8 @@ func TestFloatFloat64(t *testing.T) { ...@@ -922,9 +922,8 @@ func TestFloatFloat64(t *testing.T) {
{"0x0.00000000000008p-1022", 0, Below}, {"0x0.00000000000008p-1022", 0, Below},
// denormals // denormals
// TODO(gri) enable once Float64 is fixed {"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal
// {"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal {"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal
{"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal
{"0x.8p-1073", math.SmallestNonzeroFloat64, Exact}, {"0x.8p-1073", math.SmallestNonzeroFloat64, Exact},
{"1p-1074", math.SmallestNonzeroFloat64, Exact}, {"1p-1074", math.SmallestNonzeroFloat64, Exact},
{"0x.fffffffffffffp-1022", math.Float64frombits(0x000fffffffffffff), Exact}, // largest denormal {"0x.fffffffffffffp-1022", math.Float64frombits(0x000fffffffffffff), Exact}, // largest denormal
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment