Message ID | 20180124131315.30567-13-alex.bennee@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | re-factor softfloat and add fp16 functions | expand |
On 01/24/2018 10:13 AM, Alex Bennée wrote: > We can now add float16_add/sub and use the common decompose and > canonicalize functions to have a single implementation for > float16/32/64 add and sub functions. > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org> > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > > --- > > v3 > - renames for FloatParts > - shorten internal names > - fix comment style > - use FloatFmt values for for extract/deposit in raw_pack/unpack > - move #include bitops from previous patch > - add is_nan helper > - avoid temp variables where we can > --- > fpu/softfloat.c | 884 +++++++++++++++++++++++++----------------------- > include/fpu/softfloat.h | 4 + > 2 files changed, 461 insertions(+), 427 deletions(-) > > diff --git a/fpu/softfloat.c b/fpu/softfloat.c > index 568d555595..b179790889 100644 > --- a/fpu/softfloat.c > +++ b/fpu/softfloat.c > @@ -83,6 +83,7 @@ this code that are retained. > * target-dependent and needs the TARGET_* macros. > */ > #include "qemu/osdep.h" > +#include "qemu/bitops.h" > #include "fpu/softfloat.h" > > /* We only need stdlib for abort() */ > @@ -270,6 +271,462 @@ static const FloatFmt float64_params = { > FLOAT_PARAMS(11, 52) > }; > > +/* Unpack a float to parts, but do not canonicalize. */ > +static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) > +{ > + const int sign_pos = fmt.frac_size + fmt.exp_size; > + > + return (FloatParts) { > + .cls = float_class_unclassified, > + .sign = extract64(raw, sign_pos, 1), > + .exp = extract64(raw, fmt.frac_size, fmt.exp_size), > + .frac = extract64(raw, 0, fmt.frac_size), > + }; > +} > + > +static inline FloatParts float16_unpack_raw(float16 f) > +{ > + return unpack_raw(float16_params, f); > +} > + > +static inline FloatParts float32_unpack_raw(float32 f) > +{ > + return unpack_raw(float32_params, f); > +} > + > +static inline FloatParts float64_unpack_raw(float64 f) > +{ > + return unpack_raw(float64_params, f); > +} > + > +/* Pack a float from parts, but do not canonicalize. */ > +static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) > +{ > + const int sign_pos = fmt.frac_size + fmt.exp_size; > + uint64_t ret = p.frac; saving 1 line: uint64_t ret = deposit64(p.frac, sign_pos, 1, p.sign); and then eventually: return deposit64(ret, fmt.frac_size, fmt.exp_size, p.exp); > + > + ret = deposit64(ret, fmt.frac_size, fmt.exp_size, p.exp); > + ret = deposit64(ret, sign_pos, 1, p.sign); > + return ret; > +} > + > +static inline float16 float16_pack_raw(FloatParts p) > +{ > + return make_float16(pack_raw(float16_params, p)); > +} > + > +static inline float32 float32_pack_raw(FloatParts p) > +{ > + return make_float32(pack_raw(float32_params, p)); > +} > + > +static inline float64 float64_pack_raw(FloatParts p) > +{ > + return make_float64(pack_raw(float64_params, p)); > +} > + > +/* Canonicalize EXP and FRAC, setting CLS. */ > +static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, > + float_status *status) > +{ > + if (part.exp == parm->exp_max) { > + if (part.frac == 0) { > + part.cls = float_class_inf; > + } else { > +#ifdef NO_SIGNALING_NANS > + part.cls = float_class_qnan; > +#else > + int64_t msb = part.frac << (parm->frac_shift + 2); > + if ((msb < 0) == status->snan_bit_is_one) { > + part.cls = float_class_snan; > + } else { > + part.cls = float_class_qnan; > + } > +#endif > + } > + } else if (part.exp == 0) { > + if (likely(part.frac == 0)) { > + part.cls = float_class_zero; > + } else if (status->flush_inputs_to_zero) { > + float_raise(float_flag_input_denormal, status); > + part.cls = float_class_zero; > + part.frac = 0; > + } else { > + int shift = clz64(part.frac) - 1; > + part.cls = float_class_normal; > + part.exp = parm->frac_shift - parm->exp_bias - shift + 1; > + part.frac <<= shift; > + } > + } else { > + part.cls = float_class_normal; > + part.exp -= parm->exp_bias; > + part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); > + } > + return part; > +} > + > +/* Round and uncanonicalize a floating-point number by parts. There > + * are FRAC_SHIFT bits that may require rounding at the bottom of the > + * fraction; these bits will be removed. The exponent will be biased > + * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. > + */ > + > +static FloatParts round_canonical(FloatParts p, float_status *s, > + const FloatFmt *parm) > +{ > + const uint64_t frac_lsbm1 = parm->frac_lsbm1; > + const uint64_t round_mask = parm->round_mask; > + const uint64_t roundeven_mask = parm->roundeven_mask; > + const int exp_max = parm->exp_max; > + const int frac_shift = parm->frac_shift; > + uint64_t frac, inc; > + int exp, flags = 0; > + bool overflow_norm; > + > + frac = p.frac; > + exp = p.exp; > + > + switch (p.cls) { > + case float_class_normal: > + switch (s->float_rounding_mode) { > + case float_round_nearest_even: > + overflow_norm = false; > + inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); > + break; > + case float_round_ties_away: > + overflow_norm = false; > + inc = frac_lsbm1; > + break; > + case float_round_to_zero: > + overflow_norm = true; > + inc = 0; > + break; > + case float_round_up: > + inc = p.sign ? 0 : round_mask; > + overflow_norm = p.sign; > + break; > + case float_round_down: > + inc = p.sign ? round_mask : 0; > + overflow_norm = !p.sign; > + break; > + default: > + g_assert_not_reached(); > + } > + > + exp += parm->exp_bias; > + if (likely(exp > 0)) { > + if (frac & round_mask) { > + flags |= float_flag_inexact; > + frac += inc; > + if (frac & DECOMPOSED_OVERFLOW_BIT) { > + frac >>= 1; > + exp++; > + } > + } > + frac >>= frac_shift; > + > + if (unlikely(exp >= exp_max)) { > + flags |= float_flag_overflow | float_flag_inexact; > + if (overflow_norm) { > + exp = exp_max - 1; > + frac = -1; > + } else { > + p.cls = float_class_inf; > + goto do_inf; > + } > + } > + } else if (s->flush_to_zero) { > + flags |= float_flag_output_denormal; > + p.cls = float_class_zero; > + goto do_zero; > + } else { > + bool is_tiny = (s->float_detect_tininess > + == float_tininess_before_rounding) > + || (exp < 0) > + || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); > + > + shift64RightJamming(frac, 1 - exp, &frac); > + if (frac & round_mask) { > + /* Need to recompute round-to-even. */ > + if (s->float_rounding_mode == float_round_nearest_even) { > + inc = ((frac & roundeven_mask) != frac_lsbm1 > + ? frac_lsbm1 : 0); > + } > + flags |= float_flag_inexact; > + frac += inc; > + } > + > + exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); > + frac >>= frac_shift; > + > + if (is_tiny && (flags & float_flag_inexact)) { > + flags |= float_flag_underflow; > + } > + if (exp == 0 && frac == 0) { > + p.cls = float_class_zero; > + } > + } > + break; > + > + case float_class_zero: > + do_zero: > + exp = 0; > + frac = 0; > + break; > + > + case float_class_inf: > + do_inf: > + exp = exp_max; > + frac = 0; > + break; > + > + case float_class_qnan: > + case float_class_snan: > + exp = exp_max; > + break; > + > + default: > + g_assert_not_reached(); > + } > + > + float_raise(flags, s); > + p.exp = exp; > + p.frac = frac; > + return p; > +} > + > +static FloatParts float16_unpack_canonical(float16 f, float_status *s) > +{ > + return canonicalize(float16_unpack_raw(f), &float16_params, s); > +} > + > +static float16 float16_round_pack_canonical(FloatParts p, float_status *s) > +{ > + switch (p.cls) { > + case float_class_dnan: > + return float16_default_nan(s); > + case float_class_msnan: > + return float16_maybe_silence_nan(float16_pack_raw(p), s); > + default: > + p = round_canonical(p, s, &float16_params); > + return float16_pack_raw(p); > + } > +} > + > +static FloatParts float32_unpack_canonical(float32 f, float_status *s) > +{ > + return canonicalize(float32_unpack_raw(f), &float32_params, s); > +} > + > +static float32 float32_round_pack_canonical(FloatParts p, float_status *s) > +{ > + switch (p.cls) { > + case float_class_dnan: > + return float32_default_nan(s); > + case float_class_msnan: > + return float32_maybe_silence_nan(float32_pack_raw(p), s); > + default: > + p = round_canonical(p, s, &float32_params); > + return float32_pack_raw(p); > + } > +} > + > +static FloatParts float64_unpack_canonical(float64 f, float_status *s) > +{ > + return canonicalize(float64_unpack_raw(f), &float64_params, s); > +} > + > +static float64 float64_round_pack_canonical(FloatParts p, float_status *s) > +{ > + switch (p.cls) { > + case float_class_dnan: > + return float64_default_nan(s); > + case float_class_msnan: > + return float64_maybe_silence_nan(float64_pack_raw(p), s); > + default: > + p = round_canonical(p, s, &float64_params); > + return float64_pack_raw(p); > + } > +} > + > +static FloatParts pick_nan_parts(FloatParts a, FloatParts b, float_status *s) > +{ > + if (a.cls == float_class_snan || b.cls == float_class_snan) { > + s->float_exception_flags |= float_flag_invalid; > + } > + > + if (s->default_nan_mode) { > + a.cls = float_class_dnan; > + } else { > + if (pickNaN(a.cls == float_class_qnan, > + a.cls == float_class_snan, > + b.cls == float_class_qnan, > + b.cls == float_class_snan, > + a.frac > b.frac > + || (a.frac == b.frac && a.sign < b.sign))) { > + a = b; > + } > + a.cls = float_class_msnan; > + } > + return a; > +} > + > +/* Simple helper for checking if FloatClass is any NaN */ > + > +static bool is_nan(FloatClass c) > +{ > + return c >= float_class_qnan; > +} > + > +/* > + * Returns the result of adding or subtracting the values of the > + * floating-point values `a' and `b'. The operation is performed > + * according to the IEC/IEEE Standard for Binary Floating-Point > + * Arithmetic. > + */ > + > +static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, > + float_status *s) > +{ > + bool a_sign = a.sign; > + bool b_sign = b.sign ^ subtract; > + > + if (a_sign != b_sign) { > + /* Subtraction */ > + > + if (a.cls == float_class_normal && b.cls == float_class_normal) { > + if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { > + shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); > + a.frac = a.frac - b.frac; > + } else { > + shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); > + a.frac = b.frac - a.frac; > + a.exp = b.exp; > + a_sign ^= 1; > + } > + > + if (a.frac == 0) { > + a.cls = float_class_zero; > + a.sign = s->float_rounding_mode == float_round_down; > + } else { > + int shift = clz64(a.frac) - 1; > + a.frac = a.frac << shift; > + a.exp = a.exp - shift; > + a.sign = a_sign; > + } > + return a; > + } > + if (is_nan(a.cls) || is_nan(b.cls)) { > + return pick_nan_parts(a, b, s); > + } > + if (a.cls == float_class_inf) { > + if (b.cls == float_class_inf) { > + float_raise(float_flag_invalid, s); > + a.cls = float_class_dnan; > + } > + return a; > + } > + if (a.cls == float_class_zero && b.cls == float_class_zero) { > + a.sign = s->float_rounding_mode == float_round_down; > + return a; > + } > + if (a.cls == float_class_zero || b.cls == float_class_inf) { > + b.sign = a_sign ^ 1; > + return b; > + } > + if (b.cls == float_class_zero) { > + return a; > + } > + } else { > + /* Addition */ > + if (a.cls == float_class_normal && b.cls == float_class_normal) { > + if (a.exp > b.exp) { > + shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); > + } else if (a.exp < b.exp) { > + shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); > + a.exp = b.exp; > + } > + a.frac += b.frac; > + if (a.frac & DECOMPOSED_OVERFLOW_BIT) { > + a.frac >>= 1; > + a.exp += 1; > + } > + return a; > + } > + if (is_nan(a.cls) || is_nan(b.cls)) { > + return pick_nan_parts(a, b, s); > + } > + if (a.cls == float_class_inf || b.cls == float_class_zero) { > + return a; > + } > + if (b.cls == float_class_inf || a.cls == float_class_zero) { > + b.sign = b_sign; > + return b; > + } > + } > + g_assert_not_reached(); > +} > + > +/* > + * Returns the result of adding or subtracting the floating-point > + * values `a' and `b'. The operation is performed according to the > + * IEC/IEEE Standard for Binary Floating-Point Arithmetic. > + */ > + > +float16 float16_add(float16 a, float16 b, float_status *status) > +{ > + FloatParts pa = float16_unpack_canonical(a, status); > + FloatParts pb = float16_unpack_canonical(b, status); > + FloatParts pr = addsub_floats(pa, pb, false, status); > + > + return float16_round_pack_canonical(pr, status); > +} > + > +float32 float32_add(float32 a, float32 b, float_status *status) > +{ > + FloatParts pa = float32_unpack_canonical(a, status); > + FloatParts pb = float32_unpack_canonical(b, status); > + FloatParts pr = addsub_floats(pa, pb, false, status); > + > + return float32_round_pack_canonical(pr, status); > +} > + > +float64 float64_add(float64 a, float64 b, float_status *status) > +{ > + FloatParts pa = float64_unpack_canonical(a, status); > + FloatParts pb = float64_unpack_canonical(b, status); > + FloatParts pr = addsub_floats(pa, pb, false, status); > + > + return float64_round_pack_canonical(pr, status); > +} > + > +float16 float16_sub(float16 a, float16 b, float_status *status) > +{ > + FloatParts pa = float16_unpack_canonical(a, status); > + FloatParts pb = float16_unpack_canonical(b, status); > + FloatParts pr = addsub_floats(pa, pb, true, status); > + > + return float16_round_pack_canonical(pr, status); > +} > + > +float32 float32_sub(float32 a, float32 b, float_status *status) > +{ > + FloatParts pa = float32_unpack_canonical(a, status); > + FloatParts pb = float32_unpack_canonical(b, status); > + FloatParts pr = addsub_floats(pa, pb, true, status); > + > + return float32_round_pack_canonical(pr, status); > +} > + > +float64 float64_sub(float64 a, float64 b, float_status *status) > +{ > + FloatParts pa = float64_unpack_canonical(a, status); > + FloatParts pb = float64_unpack_canonical(b, status); > + FloatParts pr = addsub_floats(pa, pb, true, status); > + > + return float64_round_pack_canonical(pr, status); > +} > + > /*---------------------------------------------------------------------------- > | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 > | and 7, and returns the properly rounded 32-bit integer corresponding to the > @@ -2081,220 +2538,6 @@ float32 float32_round_to_int(float32 a, float_status *status) > > } > > -/*---------------------------------------------------------------------------- > -| Returns the result of adding the absolute values of the single-precision > -| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated > -| before being returned. `zSign' is ignored if the result is a NaN. > -| The addition is performed according to the IEC/IEEE Standard for Binary > -| Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -static float32 addFloat32Sigs(float32 a, float32 b, flag zSign, > - float_status *status) > -{ > - int aExp, bExp, zExp; > - uint32_t aSig, bSig, zSig; > - int expDiff; > - > - aSig = extractFloat32Frac( a ); > - aExp = extractFloat32Exp( a ); > - bSig = extractFloat32Frac( b ); > - bExp = extractFloat32Exp( b ); > - expDiff = aExp - bExp; > - aSig <<= 6; > - bSig <<= 6; > - if ( 0 < expDiff ) { > - if ( aExp == 0xFF ) { > - if (aSig) { > - return propagateFloat32NaN(a, b, status); > - } > - return a; > - } > - if ( bExp == 0 ) { > - --expDiff; > - } > - else { > - bSig |= 0x20000000; > - } > - shift32RightJamming( bSig, expDiff, &bSig ); > - zExp = aExp; > - } > - else if ( expDiff < 0 ) { > - if ( bExp == 0xFF ) { > - if (bSig) { > - return propagateFloat32NaN(a, b, status); > - } > - return packFloat32( zSign, 0xFF, 0 ); > - } > - if ( aExp == 0 ) { > - ++expDiff; > - } > - else { > - aSig |= 0x20000000; > - } > - shift32RightJamming( aSig, - expDiff, &aSig ); > - zExp = bExp; > - } > - else { > - if ( aExp == 0xFF ) { > - if (aSig | bSig) { > - return propagateFloat32NaN(a, b, status); > - } > - return a; > - } > - if ( aExp == 0 ) { > - if (status->flush_to_zero) { > - if (aSig | bSig) { > - float_raise(float_flag_output_denormal, status); > - } > - return packFloat32(zSign, 0, 0); > - } > - return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); > - } > - zSig = 0x40000000 + aSig + bSig; > - zExp = aExp; > - goto roundAndPack; > - } > - aSig |= 0x20000000; > - zSig = ( aSig + bSig )<<1; > - --zExp; > - if ( (int32_t) zSig < 0 ) { > - zSig = aSig + bSig; > - ++zExp; > - } > - roundAndPack: > - return roundAndPackFloat32(zSign, zExp, zSig, status); > - > -} > - > -/*---------------------------------------------------------------------------- > -| Returns the result of subtracting the absolute values of the single- > -| precision floating-point values `a' and `b'. If `zSign' is 1, the > -| difference is negated before being returned. `zSign' is ignored if the > -| result is a NaN. The subtraction is performed according to the IEC/IEEE > -| Standard for Binary Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -static float32 subFloat32Sigs(float32 a, float32 b, flag zSign, > - float_status *status) > -{ > - int aExp, bExp, zExp; > - uint32_t aSig, bSig, zSig; > - int expDiff; > - > - aSig = extractFloat32Frac( a ); > - aExp = extractFloat32Exp( a ); > - bSig = extractFloat32Frac( b ); > - bExp = extractFloat32Exp( b ); > - expDiff = aExp - bExp; > - aSig <<= 7; > - bSig <<= 7; > - if ( 0 < expDiff ) goto aExpBigger; > - if ( expDiff < 0 ) goto bExpBigger; > - if ( aExp == 0xFF ) { > - if (aSig | bSig) { > - return propagateFloat32NaN(a, b, status); > - } > - float_raise(float_flag_invalid, status); > - return float32_default_nan(status); > - } > - if ( aExp == 0 ) { > - aExp = 1; > - bExp = 1; > - } > - if ( bSig < aSig ) goto aBigger; > - if ( aSig < bSig ) goto bBigger; > - return packFloat32(status->float_rounding_mode == float_round_down, 0, 0); > - bExpBigger: > - if ( bExp == 0xFF ) { > - if (bSig) { > - return propagateFloat32NaN(a, b, status); > - } > - return packFloat32( zSign ^ 1, 0xFF, 0 ); > - } > - if ( aExp == 0 ) { > - ++expDiff; > - } > - else { > - aSig |= 0x40000000; > - } > - shift32RightJamming( aSig, - expDiff, &aSig ); > - bSig |= 0x40000000; > - bBigger: > - zSig = bSig - aSig; > - zExp = bExp; > - zSign ^= 1; > - goto normalizeRoundAndPack; > - aExpBigger: > - if ( aExp == 0xFF ) { > - if (aSig) { > - return propagateFloat32NaN(a, b, status); > - } > - return a; > - } > - if ( bExp == 0 ) { > - --expDiff; > - } > - else { > - bSig |= 0x40000000; > - } > - shift32RightJamming( bSig, expDiff, &bSig ); > - aSig |= 0x40000000; > - aBigger: > - zSig = aSig - bSig; > - zExp = aExp; > - normalizeRoundAndPack: > - --zExp; > - return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); > - > -} > - > -/*---------------------------------------------------------------------------- > -| Returns the result of adding the single-precision floating-point values `a' > -| and `b'. The operation is performed according to the IEC/IEEE Standard for > -| Binary Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -float32 float32_add(float32 a, float32 b, float_status *status) > -{ > - flag aSign, bSign; > - a = float32_squash_input_denormal(a, status); > - b = float32_squash_input_denormal(b, status); > - > - aSign = extractFloat32Sign( a ); > - bSign = extractFloat32Sign( b ); > - if ( aSign == bSign ) { > - return addFloat32Sigs(a, b, aSign, status); > - } > - else { > - return subFloat32Sigs(a, b, aSign, status); > - } > - > -} > - > -/*---------------------------------------------------------------------------- > -| Returns the result of subtracting the single-precision floating-point values > -| `a' and `b'. The operation is performed according to the IEC/IEEE Standard > -| for Binary Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -float32 float32_sub(float32 a, float32 b, float_status *status) > -{ > - flag aSign, bSign; > - a = float32_squash_input_denormal(a, status); > - b = float32_squash_input_denormal(b, status); > - > - aSign = extractFloat32Sign( a ); > - bSign = extractFloat32Sign( b ); > - if ( aSign == bSign ) { > - return subFloat32Sigs(a, b, aSign, status); > - } > - else { > - return addFloat32Sigs(a, b, aSign, status); > - } > - > -} > - > /*---------------------------------------------------------------------------- > | Returns the result of multiplying the single-precision floating-point values > | `a' and `b'. The operation is performed according to the IEC/IEEE Standard > @@ -3891,219 +4134,6 @@ float64 float64_trunc_to_int(float64 a, float_status *status) > return res; > } > > -/*---------------------------------------------------------------------------- > -| Returns the result of adding the absolute values of the double-precision > -| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated > -| before being returned. `zSign' is ignored if the result is a NaN. > -| The addition is performed according to the IEC/IEEE Standard for Binary > -| Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, > - float_status *status) > -{ > - int aExp, bExp, zExp; > - uint64_t aSig, bSig, zSig; > - int expDiff; > - > - aSig = extractFloat64Frac( a ); > - aExp = extractFloat64Exp( a ); > - bSig = extractFloat64Frac( b ); > - bExp = extractFloat64Exp( b ); > - expDiff = aExp - bExp; > - aSig <<= 9; > - bSig <<= 9; > - if ( 0 < expDiff ) { > - if ( aExp == 0x7FF ) { > - if (aSig) { > - return propagateFloat64NaN(a, b, status); > - } > - return a; > - } > - if ( bExp == 0 ) { > - --expDiff; > - } > - else { > - bSig |= LIT64( 0x2000000000000000 ); > - } > - shift64RightJamming( bSig, expDiff, &bSig ); > - zExp = aExp; > - } > - else if ( expDiff < 0 ) { > - if ( bExp == 0x7FF ) { > - if (bSig) { > - return propagateFloat64NaN(a, b, status); > - } > - return packFloat64( zSign, 0x7FF, 0 ); > - } > - if ( aExp == 0 ) { > - ++expDiff; > - } > - else { > - aSig |= LIT64( 0x2000000000000000 ); > - } > - shift64RightJamming( aSig, - expDiff, &aSig ); > - zExp = bExp; > - } > - else { > - if ( aExp == 0x7FF ) { > - if (aSig | bSig) { > - return propagateFloat64NaN(a, b, status); > - } > - return a; > - } > - if ( aExp == 0 ) { > - if (status->flush_to_zero) { > - if (aSig | bSig) { > - float_raise(float_flag_output_denormal, status); > - } > - return packFloat64(zSign, 0, 0); > - } > - return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); > - } > - zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; > - zExp = aExp; > - goto roundAndPack; > - } > - aSig |= LIT64( 0x2000000000000000 ); > - zSig = ( aSig + bSig )<<1; > - --zExp; > - if ( (int64_t) zSig < 0 ) { > - zSig = aSig + bSig; > - ++zExp; > - } > - roundAndPack: > - return roundAndPackFloat64(zSign, zExp, zSig, status); > - > -} > - > -/*---------------------------------------------------------------------------- > -| Returns the result of subtracting the absolute values of the double- > -| precision floating-point values `a' and `b'. If `zSign' is 1, the > -| difference is negated before being returned. `zSign' is ignored if the > -| result is a NaN. The subtraction is performed according to the IEC/IEEE > -| Standard for Binary Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -static float64 subFloat64Sigs(float64 a, float64 b, flag zSign, > - float_status *status) > -{ > - int aExp, bExp, zExp; > - uint64_t aSig, bSig, zSig; > - int expDiff; > - > - aSig = extractFloat64Frac( a ); > - aExp = extractFloat64Exp( a ); > - bSig = extractFloat64Frac( b ); > - bExp = extractFloat64Exp( b ); > - expDiff = aExp - bExp; > - aSig <<= 10; > - bSig <<= 10; > - if ( 0 < expDiff ) goto aExpBigger; > - if ( expDiff < 0 ) goto bExpBigger; > - if ( aExp == 0x7FF ) { > - if (aSig | bSig) { > - return propagateFloat64NaN(a, b, status); > - } > - float_raise(float_flag_invalid, status); > - return float64_default_nan(status); > - } > - if ( aExp == 0 ) { > - aExp = 1; > - bExp = 1; > - } > - if ( bSig < aSig ) goto aBigger; > - if ( aSig < bSig ) goto bBigger; > - return packFloat64(status->float_rounding_mode == float_round_down, 0, 0); > - bExpBigger: > - if ( bExp == 0x7FF ) { > - if (bSig) { > - return propagateFloat64NaN(a, b, status); > - } > - return packFloat64( zSign ^ 1, 0x7FF, 0 ); > - } > - if ( aExp == 0 ) { > - ++expDiff; > - } > - else { > - aSig |= LIT64( 0x4000000000000000 ); > - } > - shift64RightJamming( aSig, - expDiff, &aSig ); > - bSig |= LIT64( 0x4000000000000000 ); > - bBigger: > - zSig = bSig - aSig; > - zExp = bExp; > - zSign ^= 1; > - goto normalizeRoundAndPack; > - aExpBigger: > - if ( aExp == 0x7FF ) { > - if (aSig) { > - return propagateFloat64NaN(a, b, status); > - } > - return a; > - } > - if ( bExp == 0 ) { > - --expDiff; > - } > - else { > - bSig |= LIT64( 0x4000000000000000 ); > - } > - shift64RightJamming( bSig, expDiff, &bSig ); > - aSig |= LIT64( 0x4000000000000000 ); > - aBigger: > - zSig = aSig - bSig; > - zExp = aExp; > - normalizeRoundAndPack: > - --zExp; > - return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); > - > -} > - > -/*---------------------------------------------------------------------------- > -| Returns the result of adding the double-precision floating-point values `a' > -| and `b'. The operation is performed according to the IEC/IEEE Standard for > -| Binary Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -float64 float64_add(float64 a, float64 b, float_status *status) > -{ > - flag aSign, bSign; > - a = float64_squash_input_denormal(a, status); > - b = float64_squash_input_denormal(b, status); > - > - aSign = extractFloat64Sign( a ); > - bSign = extractFloat64Sign( b ); > - if ( aSign == bSign ) { > - return addFloat64Sigs(a, b, aSign, status); > - } > - else { > - return subFloat64Sigs(a, b, aSign, status); > - } > - > -} > - > -/*---------------------------------------------------------------------------- > -| Returns the result of subtracting the double-precision floating-point values > -| `a' and `b'. The operation is performed according to the IEC/IEEE Standard > -| for Binary Floating-Point Arithmetic. > -*----------------------------------------------------------------------------*/ > - > -float64 float64_sub(float64 a, float64 b, float_status *status) > -{ > - flag aSign, bSign; > - a = float64_squash_input_denormal(a, status); > - b = float64_squash_input_denormal(b, status); > - > - aSign = extractFloat64Sign( a ); > - bSign = extractFloat64Sign( b ); > - if ( aSign == bSign ) { > - return subFloat64Sigs(a, b, aSign, status); > - } > - else { > - return addFloat64Sigs(a, b, aSign, status); > - } > - > -} > > /*---------------------------------------------------------------------------- > | Returns the result of multiplying the double-precision floating-point values > diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h > index 23824a3000..693ece0974 100644 > --- a/include/fpu/softfloat.h > +++ b/include/fpu/softfloat.h > @@ -236,6 +236,10 @@ float64 float16_to_float64(float16 a, flag ieee, float_status *status); > /*---------------------------------------------------------------------------- > | Software half-precision operations. > *----------------------------------------------------------------------------*/ > + > +float16 float16_add(float16, float16, float_status *status); > +float16 float16_sub(float16, float16, float_status *status); > + > int float16_is_quiet_nan(float16, float_status *status); > int float16_is_signaling_nan(float16, float_status *status); > float16 float16_maybe_silence_nan(float16, float_status *status); > (note for myself, since I need more time for those 3) all bug round_canonical() pick_nan_parts() addsub_floats(): Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
> (note for myself, since I need more time for those 3) > all bug round_canonical() pick_nan_parts() addsub_floats(): haha I check this mail back to copy the function names and read I wrote "bug", lapsus/typo? :) I obviously meant "all _BUT_ ... :" > Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 568d555595..b179790889 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -83,6 +83,7 @@ this code that are retained. * target-dependent and needs the TARGET_* macros. */ #include "qemu/osdep.h" +#include "qemu/bitops.h" #include "fpu/softfloat.h" /* We only need stdlib for abort() */ @@ -270,6 +271,462 @@ static const FloatFmt float64_params = { FLOAT_PARAMS(11, 52) }; +/* Unpack a float to parts, but do not canonicalize. */ +static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) +{ + const int sign_pos = fmt.frac_size + fmt.exp_size; + + return (FloatParts) { + .cls = float_class_unclassified, + .sign = extract64(raw, sign_pos, 1), + .exp = extract64(raw, fmt.frac_size, fmt.exp_size), + .frac = extract64(raw, 0, fmt.frac_size), + }; +} + +static inline FloatParts float16_unpack_raw(float16 f) +{ + return unpack_raw(float16_params, f); +} + +static inline FloatParts float32_unpack_raw(float32 f) +{ + return unpack_raw(float32_params, f); +} + +static inline FloatParts float64_unpack_raw(float64 f) +{ + return unpack_raw(float64_params, f); +} + +/* Pack a float from parts, but do not canonicalize. */ +static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) +{ + const int sign_pos = fmt.frac_size + fmt.exp_size; + uint64_t ret = p.frac; + + ret = deposit64(ret, fmt.frac_size, fmt.exp_size, p.exp); + ret = deposit64(ret, sign_pos, 1, p.sign); + return ret; +} + +static inline float16 float16_pack_raw(FloatParts p) +{ + return make_float16(pack_raw(float16_params, p)); +} + +static inline float32 float32_pack_raw(FloatParts p) +{ + return make_float32(pack_raw(float32_params, p)); +} + +static inline float64 float64_pack_raw(FloatParts p) +{ + return make_float64(pack_raw(float64_params, p)); +} + +/* Canonicalize EXP and FRAC, setting CLS. */ +static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, + float_status *status) +{ + if (part.exp == parm->exp_max) { + if (part.frac == 0) { + part.cls = float_class_inf; + } else { +#ifdef NO_SIGNALING_NANS + part.cls = float_class_qnan; +#else + int64_t msb = part.frac << (parm->frac_shift + 2); + if ((msb < 0) == status->snan_bit_is_one) { + part.cls = float_class_snan; + } else { + part.cls = float_class_qnan; + } +#endif + } + } else if (part.exp == 0) { + if (likely(part.frac == 0)) { + part.cls = float_class_zero; + } else if (status->flush_inputs_to_zero) { + float_raise(float_flag_input_denormal, status); + part.cls = float_class_zero; + part.frac = 0; + } else { + int shift = clz64(part.frac) - 1; + part.cls = float_class_normal; + part.exp = parm->frac_shift - parm->exp_bias - shift + 1; + part.frac <<= shift; + } + } else { + part.cls = float_class_normal; + part.exp -= parm->exp_bias; + part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); + } + return part; +} + +/* Round and uncanonicalize a floating-point number by parts. There + * are FRAC_SHIFT bits that may require rounding at the bottom of the + * fraction; these bits will be removed. The exponent will be biased + * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. + */ + +static FloatParts round_canonical(FloatParts p, float_status *s, + const FloatFmt *parm) +{ + const uint64_t frac_lsbm1 = parm->frac_lsbm1; + const uint64_t round_mask = parm->round_mask; + const uint64_t roundeven_mask = parm->roundeven_mask; + const int exp_max = parm->exp_max; + const int frac_shift = parm->frac_shift; + uint64_t frac, inc; + int exp, flags = 0; + bool overflow_norm; + + frac = p.frac; + exp = p.exp; + + switch (p.cls) { + case float_class_normal: + switch (s->float_rounding_mode) { + case float_round_nearest_even: + overflow_norm = false; + inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); + break; + case float_round_ties_away: + overflow_norm = false; + inc = frac_lsbm1; + break; + case float_round_to_zero: + overflow_norm = true; + inc = 0; + break; + case float_round_up: + inc = p.sign ? 0 : round_mask; + overflow_norm = p.sign; + break; + case float_round_down: + inc = p.sign ? round_mask : 0; + overflow_norm = !p.sign; + break; + default: + g_assert_not_reached(); + } + + exp += parm->exp_bias; + if (likely(exp > 0)) { + if (frac & round_mask) { + flags |= float_flag_inexact; + frac += inc; + if (frac & DECOMPOSED_OVERFLOW_BIT) { + frac >>= 1; + exp++; + } + } + frac >>= frac_shift; + + if (unlikely(exp >= exp_max)) { + flags |= float_flag_overflow | float_flag_inexact; + if (overflow_norm) { + exp = exp_max - 1; + frac = -1; + } else { + p.cls = float_class_inf; + goto do_inf; + } + } + } else if (s->flush_to_zero) { + flags |= float_flag_output_denormal; + p.cls = float_class_zero; + goto do_zero; + } else { + bool is_tiny = (s->float_detect_tininess + == float_tininess_before_rounding) + || (exp < 0) + || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); + + shift64RightJamming(frac, 1 - exp, &frac); + if (frac & round_mask) { + /* Need to recompute round-to-even. */ + if (s->float_rounding_mode == float_round_nearest_even) { + inc = ((frac & roundeven_mask) != frac_lsbm1 + ? frac_lsbm1 : 0); + } + flags |= float_flag_inexact; + frac += inc; + } + + exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); + frac >>= frac_shift; + + if (is_tiny && (flags & float_flag_inexact)) { + flags |= float_flag_underflow; + } + if (exp == 0 && frac == 0) { + p.cls = float_class_zero; + } + } + break; + + case float_class_zero: + do_zero: + exp = 0; + frac = 0; + break; + + case float_class_inf: + do_inf: + exp = exp_max; + frac = 0; + break; + + case float_class_qnan: + case float_class_snan: + exp = exp_max; + break; + + default: + g_assert_not_reached(); + } + + float_raise(flags, s); + p.exp = exp; + p.frac = frac; + return p; +} + +static FloatParts float16_unpack_canonical(float16 f, float_status *s) +{ + return canonicalize(float16_unpack_raw(f), &float16_params, s); +} + +static float16 float16_round_pack_canonical(FloatParts p, float_status *s) +{ + switch (p.cls) { + case float_class_dnan: + return float16_default_nan(s); + case float_class_msnan: + return float16_maybe_silence_nan(float16_pack_raw(p), s); + default: + p = round_canonical(p, s, &float16_params); + return float16_pack_raw(p); + } +} + +static FloatParts float32_unpack_canonical(float32 f, float_status *s) +{ + return canonicalize(float32_unpack_raw(f), &float32_params, s); +} + +static float32 float32_round_pack_canonical(FloatParts p, float_status *s) +{ + switch (p.cls) { + case float_class_dnan: + return float32_default_nan(s); + case float_class_msnan: + return float32_maybe_silence_nan(float32_pack_raw(p), s); + default: + p = round_canonical(p, s, &float32_params); + return float32_pack_raw(p); + } +} + +static FloatParts float64_unpack_canonical(float64 f, float_status *s) +{ + return canonicalize(float64_unpack_raw(f), &float64_params, s); +} + +static float64 float64_round_pack_canonical(FloatParts p, float_status *s) +{ + switch (p.cls) { + case float_class_dnan: + return float64_default_nan(s); + case float_class_msnan: + return float64_maybe_silence_nan(float64_pack_raw(p), s); + default: + p = round_canonical(p, s, &float64_params); + return float64_pack_raw(p); + } +} + +static FloatParts pick_nan_parts(FloatParts a, FloatParts b, float_status *s) +{ + if (a.cls == float_class_snan || b.cls == float_class_snan) { + s->float_exception_flags |= float_flag_invalid; + } + + if (s->default_nan_mode) { + a.cls = float_class_dnan; + } else { + if (pickNaN(a.cls == float_class_qnan, + a.cls == float_class_snan, + b.cls == float_class_qnan, + b.cls == float_class_snan, + a.frac > b.frac + || (a.frac == b.frac && a.sign < b.sign))) { + a = b; + } + a.cls = float_class_msnan; + } + return a; +} + +/* Simple helper for checking if FloatClass is any NaN */ + +static bool is_nan(FloatClass c) +{ + return c >= float_class_qnan; +} + +/* + * Returns the result of adding or subtracting the values of the + * floating-point values `a' and `b'. The operation is performed + * according to the IEC/IEEE Standard for Binary Floating-Point + * Arithmetic. + */ + +static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, + float_status *s) +{ + bool a_sign = a.sign; + bool b_sign = b.sign ^ subtract; + + if (a_sign != b_sign) { + /* Subtraction */ + + if (a.cls == float_class_normal && b.cls == float_class_normal) { + if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { + shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); + a.frac = a.frac - b.frac; + } else { + shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); + a.frac = b.frac - a.frac; + a.exp = b.exp; + a_sign ^= 1; + } + + if (a.frac == 0) { + a.cls = float_class_zero; + a.sign = s->float_rounding_mode == float_round_down; + } else { + int shift = clz64(a.frac) - 1; + a.frac = a.frac << shift; + a.exp = a.exp - shift; + a.sign = a_sign; + } + return a; + } + if (is_nan(a.cls) || is_nan(b.cls)) { + return pick_nan_parts(a, b, s); + } + if (a.cls == float_class_inf) { + if (b.cls == float_class_inf) { + float_raise(float_flag_invalid, s); + a.cls = float_class_dnan; + } + return a; + } + if (a.cls == float_class_zero && b.cls == float_class_zero) { + a.sign = s->float_rounding_mode == float_round_down; + return a; + } + if (a.cls == float_class_zero || b.cls == float_class_inf) { + b.sign = a_sign ^ 1; + return b; + } + if (b.cls == float_class_zero) { + return a; + } + } else { + /* Addition */ + if (a.cls == float_class_normal && b.cls == float_class_normal) { + if (a.exp > b.exp) { + shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); + } else if (a.exp < b.exp) { + shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); + a.exp = b.exp; + } + a.frac += b.frac; + if (a.frac & DECOMPOSED_OVERFLOW_BIT) { + a.frac >>= 1; + a.exp += 1; + } + return a; + } + if (is_nan(a.cls) || is_nan(b.cls)) { + return pick_nan_parts(a, b, s); + } + if (a.cls == float_class_inf || b.cls == float_class_zero) { + return a; + } + if (b.cls == float_class_inf || a.cls == float_class_zero) { + b.sign = b_sign; + return b; + } + } + g_assert_not_reached(); +} + +/* + * Returns the result of adding or subtracting the floating-point + * values `a' and `b'. The operation is performed according to the + * IEC/IEEE Standard for Binary Floating-Point Arithmetic. + */ + +float16 float16_add(float16 a, float16 b, float_status *status) +{ + FloatParts pa = float16_unpack_canonical(a, status); + FloatParts pb = float16_unpack_canonical(b, status); + FloatParts pr = addsub_floats(pa, pb, false, status); + + return float16_round_pack_canonical(pr, status); +} + +float32 float32_add(float32 a, float32 b, float_status *status) +{ + FloatParts pa = float32_unpack_canonical(a, status); + FloatParts pb = float32_unpack_canonical(b, status); + FloatParts pr = addsub_floats(pa, pb, false, status); + + return float32_round_pack_canonical(pr, status); +} + +float64 float64_add(float64 a, float64 b, float_status *status) +{ + FloatParts pa = float64_unpack_canonical(a, status); + FloatParts pb = float64_unpack_canonical(b, status); + FloatParts pr = addsub_floats(pa, pb, false, status); + + return float64_round_pack_canonical(pr, status); +} + +float16 float16_sub(float16 a, float16 b, float_status *status) +{ + FloatParts pa = float16_unpack_canonical(a, status); + FloatParts pb = float16_unpack_canonical(b, status); + FloatParts pr = addsub_floats(pa, pb, true, status); + + return float16_round_pack_canonical(pr, status); +} + +float32 float32_sub(float32 a, float32 b, float_status *status) +{ + FloatParts pa = float32_unpack_canonical(a, status); + FloatParts pb = float32_unpack_canonical(b, status); + FloatParts pr = addsub_floats(pa, pb, true, status); + + return float32_round_pack_canonical(pr, status); +} + +float64 float64_sub(float64 a, float64 b, float_status *status) +{ + FloatParts pa = float64_unpack_canonical(a, status); + FloatParts pb = float64_unpack_canonical(b, status); + FloatParts pr = addsub_floats(pa, pb, true, status); + + return float64_round_pack_canonical(pr, status); +} + /*---------------------------------------------------------------------------- | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 | and 7, and returns the properly rounded 32-bit integer corresponding to the @@ -2081,220 +2538,6 @@ float32 float32_round_to_int(float32 a, float_status *status) } -/*---------------------------------------------------------------------------- -| Returns the result of adding the absolute values of the single-precision -| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated -| before being returned. `zSign' is ignored if the result is a NaN. -| The addition is performed according to the IEC/IEEE Standard for Binary -| Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -static float32 addFloat32Sigs(float32 a, float32 b, flag zSign, - float_status *status) -{ - int aExp, bExp, zExp; - uint32_t aSig, bSig, zSig; - int expDiff; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - bSig = extractFloat32Frac( b ); - bExp = extractFloat32Exp( b ); - expDiff = aExp - bExp; - aSig <<= 6; - bSig <<= 6; - if ( 0 < expDiff ) { - if ( aExp == 0xFF ) { - if (aSig) { - return propagateFloat32NaN(a, b, status); - } - return a; - } - if ( bExp == 0 ) { - --expDiff; - } - else { - bSig |= 0x20000000; - } - shift32RightJamming( bSig, expDiff, &bSig ); - zExp = aExp; - } - else if ( expDiff < 0 ) { - if ( bExp == 0xFF ) { - if (bSig) { - return propagateFloat32NaN(a, b, status); - } - return packFloat32( zSign, 0xFF, 0 ); - } - if ( aExp == 0 ) { - ++expDiff; - } - else { - aSig |= 0x20000000; - } - shift32RightJamming( aSig, - expDiff, &aSig ); - zExp = bExp; - } - else { - if ( aExp == 0xFF ) { - if (aSig | bSig) { - return propagateFloat32NaN(a, b, status); - } - return a; - } - if ( aExp == 0 ) { - if (status->flush_to_zero) { - if (aSig | bSig) { - float_raise(float_flag_output_denormal, status); - } - return packFloat32(zSign, 0, 0); - } - return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); - } - zSig = 0x40000000 + aSig + bSig; - zExp = aExp; - goto roundAndPack; - } - aSig |= 0x20000000; - zSig = ( aSig + bSig )<<1; - --zExp; - if ( (int32_t) zSig < 0 ) { - zSig = aSig + bSig; - ++zExp; - } - roundAndPack: - return roundAndPackFloat32(zSign, zExp, zSig, status); - -} - -/*---------------------------------------------------------------------------- -| Returns the result of subtracting the absolute values of the single- -| precision floating-point values `a' and `b'. If `zSign' is 1, the -| difference is negated before being returned. `zSign' is ignored if the -| result is a NaN. The subtraction is performed according to the IEC/IEEE -| Standard for Binary Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -static float32 subFloat32Sigs(float32 a, float32 b, flag zSign, - float_status *status) -{ - int aExp, bExp, zExp; - uint32_t aSig, bSig, zSig; - int expDiff; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - bSig = extractFloat32Frac( b ); - bExp = extractFloat32Exp( b ); - expDiff = aExp - bExp; - aSig <<= 7; - bSig <<= 7; - if ( 0 < expDiff ) goto aExpBigger; - if ( expDiff < 0 ) goto bExpBigger; - if ( aExp == 0xFF ) { - if (aSig | bSig) { - return propagateFloat32NaN(a, b, status); - } - float_raise(float_flag_invalid, status); - return float32_default_nan(status); - } - if ( aExp == 0 ) { - aExp = 1; - bExp = 1; - } - if ( bSig < aSig ) goto aBigger; - if ( aSig < bSig ) goto bBigger; - return packFloat32(status->float_rounding_mode == float_round_down, 0, 0); - bExpBigger: - if ( bExp == 0xFF ) { - if (bSig) { - return propagateFloat32NaN(a, b, status); - } - return packFloat32( zSign ^ 1, 0xFF, 0 ); - } - if ( aExp == 0 ) { - ++expDiff; - } - else { - aSig |= 0x40000000; - } - shift32RightJamming( aSig, - expDiff, &aSig ); - bSig |= 0x40000000; - bBigger: - zSig = bSig - aSig; - zExp = bExp; - zSign ^= 1; - goto normalizeRoundAndPack; - aExpBigger: - if ( aExp == 0xFF ) { - if (aSig) { - return propagateFloat32NaN(a, b, status); - } - return a; - } - if ( bExp == 0 ) { - --expDiff; - } - else { - bSig |= 0x40000000; - } - shift32RightJamming( bSig, expDiff, &bSig ); - aSig |= 0x40000000; - aBigger: - zSig = aSig - bSig; - zExp = aExp; - normalizeRoundAndPack: - --zExp; - return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); - -} - -/*---------------------------------------------------------------------------- -| Returns the result of adding the single-precision floating-point values `a' -| and `b'. The operation is performed according to the IEC/IEEE Standard for -| Binary Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -float32 float32_add(float32 a, float32 b, float_status *status) -{ - flag aSign, bSign; - a = float32_squash_input_denormal(a, status); - b = float32_squash_input_denormal(b, status); - - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign == bSign ) { - return addFloat32Sigs(a, b, aSign, status); - } - else { - return subFloat32Sigs(a, b, aSign, status); - } - -} - -/*---------------------------------------------------------------------------- -| Returns the result of subtracting the single-precision floating-point values -| `a' and `b'. The operation is performed according to the IEC/IEEE Standard -| for Binary Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -float32 float32_sub(float32 a, float32 b, float_status *status) -{ - flag aSign, bSign; - a = float32_squash_input_denormal(a, status); - b = float32_squash_input_denormal(b, status); - - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign == bSign ) { - return subFloat32Sigs(a, b, aSign, status); - } - else { - return addFloat32Sigs(a, b, aSign, status); - } - -} - /*---------------------------------------------------------------------------- | Returns the result of multiplying the single-precision floating-point values | `a' and `b'. The operation is performed according to the IEC/IEEE Standard @@ -3891,219 +4134,6 @@ float64 float64_trunc_to_int(float64 a, float_status *status) return res; } -/*---------------------------------------------------------------------------- -| Returns the result of adding the absolute values of the double-precision -| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated -| before being returned. `zSign' is ignored if the result is a NaN. -| The addition is performed according to the IEC/IEEE Standard for Binary -| Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, - float_status *status) -{ - int aExp, bExp, zExp; - uint64_t aSig, bSig, zSig; - int expDiff; - - aSig = extractFloat64Frac( a ); - aExp = extractFloat64Exp( a ); - bSig = extractFloat64Frac( b ); - bExp = extractFloat64Exp( b ); - expDiff = aExp - bExp; - aSig <<= 9; - bSig <<= 9; - if ( 0 < expDiff ) { - if ( aExp == 0x7FF ) { - if (aSig) { - return propagateFloat64NaN(a, b, status); - } - return a; - } - if ( bExp == 0 ) { - --expDiff; - } - else { - bSig |= LIT64( 0x2000000000000000 ); - } - shift64RightJamming( bSig, expDiff, &bSig ); - zExp = aExp; - } - else if ( expDiff < 0 ) { - if ( bExp == 0x7FF ) { - if (bSig) { - return propagateFloat64NaN(a, b, status); - } - return packFloat64( zSign, 0x7FF, 0 ); - } - if ( aExp == 0 ) { - ++expDiff; - } - else { - aSig |= LIT64( 0x2000000000000000 ); - } - shift64RightJamming( aSig, - expDiff, &aSig ); - zExp = bExp; - } - else { - if ( aExp == 0x7FF ) { - if (aSig | bSig) { - return propagateFloat64NaN(a, b, status); - } - return a; - } - if ( aExp == 0 ) { - if (status->flush_to_zero) { - if (aSig | bSig) { - float_raise(float_flag_output_denormal, status); - } - return packFloat64(zSign, 0, 0); - } - return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); - } - zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; - zExp = aExp; - goto roundAndPack; - } - aSig |= LIT64( 0x2000000000000000 ); - zSig = ( aSig + bSig )<<1; - --zExp; - if ( (int64_t) zSig < 0 ) { - zSig = aSig + bSig; - ++zExp; - } - roundAndPack: - return roundAndPackFloat64(zSign, zExp, zSig, status); - -} - -/*---------------------------------------------------------------------------- -| Returns the result of subtracting the absolute values of the double- -| precision floating-point values `a' and `b'. If `zSign' is 1, the -| difference is negated before being returned. `zSign' is ignored if the -| result is a NaN. The subtraction is performed according to the IEC/IEEE -| Standard for Binary Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -static float64 subFloat64Sigs(float64 a, float64 b, flag zSign, - float_status *status) -{ - int aExp, bExp, zExp; - uint64_t aSig, bSig, zSig; - int expDiff; - - aSig = extractFloat64Frac( a ); - aExp = extractFloat64Exp( a ); - bSig = extractFloat64Frac( b ); - bExp = extractFloat64Exp( b ); - expDiff = aExp - bExp; - aSig <<= 10; - bSig <<= 10; - if ( 0 < expDiff ) goto aExpBigger; - if ( expDiff < 0 ) goto bExpBigger; - if ( aExp == 0x7FF ) { - if (aSig | bSig) { - return propagateFloat64NaN(a, b, status); - } - float_raise(float_flag_invalid, status); - return float64_default_nan(status); - } - if ( aExp == 0 ) { - aExp = 1; - bExp = 1; - } - if ( bSig < aSig ) goto aBigger; - if ( aSig < bSig ) goto bBigger; - return packFloat64(status->float_rounding_mode == float_round_down, 0, 0); - bExpBigger: - if ( bExp == 0x7FF ) { - if (bSig) { - return propagateFloat64NaN(a, b, status); - } - return packFloat64( zSign ^ 1, 0x7FF, 0 ); - } - if ( aExp == 0 ) { - ++expDiff; - } - else { - aSig |= LIT64( 0x4000000000000000 ); - } - shift64RightJamming( aSig, - expDiff, &aSig ); - bSig |= LIT64( 0x4000000000000000 ); - bBigger: - zSig = bSig - aSig; - zExp = bExp; - zSign ^= 1; - goto normalizeRoundAndPack; - aExpBigger: - if ( aExp == 0x7FF ) { - if (aSig) { - return propagateFloat64NaN(a, b, status); - } - return a; - } - if ( bExp == 0 ) { - --expDiff; - } - else { - bSig |= LIT64( 0x4000000000000000 ); - } - shift64RightJamming( bSig, expDiff, &bSig ); - aSig |= LIT64( 0x4000000000000000 ); - aBigger: - zSig = aSig - bSig; - zExp = aExp; - normalizeRoundAndPack: - --zExp; - return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); - -} - -/*---------------------------------------------------------------------------- -| Returns the result of adding the double-precision floating-point values `a' -| and `b'. The operation is performed according to the IEC/IEEE Standard for -| Binary Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -float64 float64_add(float64 a, float64 b, float_status *status) -{ - flag aSign, bSign; - a = float64_squash_input_denormal(a, status); - b = float64_squash_input_denormal(b, status); - - aSign = extractFloat64Sign( a ); - bSign = extractFloat64Sign( b ); - if ( aSign == bSign ) { - return addFloat64Sigs(a, b, aSign, status); - } - else { - return subFloat64Sigs(a, b, aSign, status); - } - -} - -/*---------------------------------------------------------------------------- -| Returns the result of subtracting the double-precision floating-point values -| `a' and `b'. The operation is performed according to the IEC/IEEE Standard -| for Binary Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -float64 float64_sub(float64 a, float64 b, float_status *status) -{ - flag aSign, bSign; - a = float64_squash_input_denormal(a, status); - b = float64_squash_input_denormal(b, status); - - aSign = extractFloat64Sign( a ); - bSign = extractFloat64Sign( b ); - if ( aSign == bSign ) { - return subFloat64Sigs(a, b, aSign, status); - } - else { - return addFloat64Sigs(a, b, aSign, status); - } - -} /*---------------------------------------------------------------------------- | Returns the result of multiplying the double-precision floating-point values diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h index 23824a3000..693ece0974 100644 --- a/include/fpu/softfloat.h +++ b/include/fpu/softfloat.h @@ -236,6 +236,10 @@ float64 float16_to_float64(float16 a, flag ieee, float_status *status); /*---------------------------------------------------------------------------- | Software half-precision operations. *----------------------------------------------------------------------------*/ + +float16 float16_add(float16, float16, float_status *status); +float16 float16_sub(float16, float16, float_status *status); + int float16_is_quiet_nan(float16, float_status *status); int float16_is_signaling_nan(float16, float_status *status); float16 float16_maybe_silence_nan(float16, float_status *status);