diff mbox series

[RFC] fpu: add compile time check for old glibc/libm and fma

Message ID 20181220111008.24954-1-alex.bennee@linaro.org
State New
Headers show
Series [RFC] fpu: add compile time check for old glibc/libm and fma | expand

Commit Message

Alex Bennée Dec. 20, 2018, 11:10 a.m. UTC
Some versions of glibc have been reported to have problems with
fused-multiply-accumulate operations. If the underlying fma
implementation does a two step operation it will instroduce subtle
rounding errors. Newer versions of the library seem to deal with this
better and modern hardware has fused operations which the library can
use.

Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

Cc: Emilio G. Cota <cota@braap.org>
---
 fpu/softfloat.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

-- 
2.17.1

Comments

Aleksandar Markovic Dec. 20, 2018, 1:10 p.m. UTC | #1
On Dec 20, 2018 12:11 PM, "Alex Bennée" <alex.bennee@linaro.org> wrote:
>

> Some versions of glibc have been reported to have problems with

> fused-multiply-accumulate operations. If the underlying fma

> implementation does a two step operation it will instroduce subtle

> rounding errors. Newer versions of the library seem to deal with this

> better and modern hardware has fused operations which the library can

> use.

>

> Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Emilio G. Cota <cota@braap.org>

> ---

>  fpu/softfloat.c | 25 +++++++++++++++++++++++++

>  1 file changed, 25 insertions(+)

>


Acked-by: Aleksandar Markovic <amarkovic@wavecomp.com>


> diff --git a/fpu/softfloat.c b/fpu/softfloat.c

> index 59eac97d10..9c2dbd04b5 100644

> --- a/fpu/softfloat.c

> +++ b/fpu/softfloat.c

> @@ -203,6 +203,25 @@ GEN_INPUT_FLUSH3(float64_input_flush3, float64)

>  # define QEMU_HARDFLOAT_3F64_USE_FP 0

>  #endif

>

> +/*

> + * Choose whether to accelerate fused multiply-accumulate operations

> + * with hard float functions. Some versions of glibc's maths library

> + * have been reported to be broken on x86 without FMA instructions.

> + */

> +#if defined(__x86_64__)

> +/* this was actually reported as glibc-2.12-1.149.el6_6.5.x86_64 was

> + * broken but glibc 2.12-1.209 works but out of caution lets disable

> + * for all older glibcs.

> + */

> +#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 12)

> +#define QEMU_HARDFLOAT_USE_FMA 0

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +

>  /*

>   * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over

>   * float{32,64}_is_infinity when !USE_FP.

> @@ -1551,6 +1570,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc,

int flags, float_status *s)
>      ub.s = xb;

>      uc.s = xc;

>

> +    if (!QEMU_HARDFLOAT_USE_FMA) {

> +        goto soft;

> +    }

>      if (unlikely(!can_use_fpu(s))) {

>          goto soft;

>      }

> @@ -1612,6 +1634,9 @@ float64_muladd(float64 xa, float64 xb, float64 xc,

int flags, float_status *s)
>      ub.s = xb;

>      uc.s = xc;

>

> +    if (!QEMU_HARDFLOAT_USE_FMA) {

> +        goto soft;

> +    }

>      if (unlikely(!can_use_fpu(s))) {

>          goto soft;

>      }

> --

> 2.17.1

>

>
Laurent Desnogues Dec. 21, 2018, 2:14 p.m. UTC | #2
Hi,

On Thu, Dec 20, 2018 at 12:10 PM Alex Bennée <alex.bennee@linaro.org> wrote:
>

> Some versions of glibc have been reported to have problems with

> fused-multiply-accumulate operations. If the underlying fma

> implementation does a two step operation it will instroduce subtle

> rounding errors. Newer versions of the library seem to deal with this

> better and modern hardware has fused operations which the library can

> use.


Thanks for taking care of this issue.  The fix was tested on our
machines and it works as expected.

So:

Tested-by: Laurent Desnogues <laurent.desnogues@gmail.com>


> Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Emilio G. Cota <cota@braap.org>

> ---

>  fpu/softfloat.c | 25 +++++++++++++++++++++++++

>  1 file changed, 25 insertions(+)

>

> diff --git a/fpu/softfloat.c b/fpu/softfloat.c

> index 59eac97d10..9c2dbd04b5 100644

> --- a/fpu/softfloat.c

> +++ b/fpu/softfloat.c

> @@ -203,6 +203,25 @@ GEN_INPUT_FLUSH3(float64_input_flush3, float64)

>  # define QEMU_HARDFLOAT_3F64_USE_FP 0

>  #endif

>

> +/*

> + * Choose whether to accelerate fused multiply-accumulate operations

> + * with hard float functions. Some versions of glibc's maths library

> + * have been reported to be broken on x86 without FMA instructions.

> + */

> +#if defined(__x86_64__)

> +/* this was actually reported as glibc-2.12-1.149.el6_6.5.x86_64 was

> + * broken but glibc 2.12-1.209 works but out of caution lets disable

> + * for all older glibcs.

> + */

> +#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 12)

> +#define QEMU_HARDFLOAT_USE_FMA 0

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +

>  /*

>   * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over

>   * float{32,64}_is_infinity when !USE_FP.

> @@ -1551,6 +1570,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)

>      ub.s = xb;

>      uc.s = xc;

>

> +    if (!QEMU_HARDFLOAT_USE_FMA) {

> +        goto soft;

> +    }

>      if (unlikely(!can_use_fpu(s))) {

>          goto soft;

>      }

> @@ -1612,6 +1634,9 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)

>      ub.s = xb;

>      uc.s = xc;

>

> +    if (!QEMU_HARDFLOAT_USE_FMA) {

> +        goto soft;

> +    }

>      if (unlikely(!can_use_fpu(s))) {

>          goto soft;

>      }

> --

> 2.17.1

>
Emilio Cota Dec. 21, 2018, 7:30 p.m. UTC | #3
On Thu, Dec 20, 2018 at 11:10:08 +0000, Alex Bennée wrote:
(snip)
> +#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 12)

> +#define QEMU_HARDFLOAT_USE_FMA 0

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +

>  /*

>   * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over

>   * float{32,64}_is_infinity when !USE_FP.

> @@ -1551,6 +1570,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)

>      ub.s = xb;

>      uc.s = xc;

>  

> +    if (!QEMU_HARDFLOAT_USE_FMA) {

> +        goto soft;

> +    }


I don't think this should be a compile-time check; if the QEMU binary
is run on a system with a newer, fixed glibc (or any other libc), then
we'll have disabled fma hardfloat unnecessarily.

What do you think about the following?

Laurent: if you want to test the below, you can pull it from
   https://github.com/cota/qemu/tree/fma-fix

Thanks,

		Emilio
---
commit ddeec29a2c33550c5d018aeea05d45a23579ae1b
Author: Emilio G. Cota <cota@braap.org>
Date:   Fri Dec 21 14:08:57 2018 -0500

    softfloat: enforce softfloat if the host's FMA is broken
    
    The added branch is marked as unlikely and therefore its impact
    on performance (measured with fp-bench) is within the noise range
    when measured on an Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz.
    
    Laurent Desnogues <laurent.desnogues@gmail.com>
    Signed-off-by: Emilio G. Cota <cota@braap.org>


diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 59eac97d10..8b3670ca9d 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -1542,6 +1542,8 @@ soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
     return float64_round_pack_canonical(pr, status);
 }
 
+static bool host_fma_is_broken;
+
 float32 QEMU_FLATTEN
 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
 {
@@ -1562,6 +1564,11 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
         goto soft;
     }
+
+    if (unlikely(host_fma_is_broken)) {
+        goto soft;
+    }
+
     /*
      * When (a || b) == 0, there's no need to check for under/over flow,
      * since we know the addend is (normal || 0) and the product is 0.
@@ -1623,6 +1630,11 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
         goto soft;
     }
+
+    if (unlikely(host_fma_is_broken)) {
+        goto soft;
+    }
+
     /*
      * When (a || b) == 0, there's no need to check for under/over flow,
      * since we know the addend is (normal || 0) and the product is 0.
@@ -7974,3 +7986,25 @@ float128 float128_scalbn(float128 a, int n, float_status *status)
                                          , status);
 
 }
+
+static void __attribute__((constructor)) softfloat_init(void)
+{
+    union_float64 ua, ub, uc, ur;
+
+    if (QEMU_NO_HARDFLOAT) {
+        return;
+    }
+
+    /*
+     * Test that the host's FMA is not obviously broken. For example,
+     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
+     *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
+     */
+    ua.s = 0x0020000000000001;
+    ub.s = 0x3ca0000000000000;
+    uc.s = 0x0020000000000000;
+    ur.h = fma(ua.h, ub.h, uc.h);
+    if (ur.s != 0x0020000000000001) {
+        host_fma_is_broken = true;
+    }
+}
Richard Henderson Dec. 21, 2018, 10:01 p.m. UTC | #4
On 12/21/18 11:30 AM, Emilio G. Cota wrote:
> +    ua.s = 0x0020000000000001;

> +    ub.s = 0x3ca0000000000000;

> +    uc.s = 0x0020000000000000;

> +    ur.h = fma(ua.h, ub.h, uc.h);

> +    if (ur.s != 0x0020000000000001) {


Forgot your ull's, but otherwise ok.

In email to Alex, I did wonder if we should check for fma hardware (at least on
x86).  Without a hardware insn, the libm routine is probably no faster than
softmmu.


r~
Aleksandar Markovic Dec. 22, 2018, 6:40 a.m. UTC | #5
On Dec 20, 2018 12:11 PM, "Alex Bennée" <alex.bennee@linaro.org> wrote:
>

> Some versions of glibc have been reported to have problems with

> fused-multiply-accumulate operations. If the underlying fma

> implementation does a two step operation it will instroduce subtle

> rounding errors. Newer versions of the library seem to deal with this

> better and modern hardware has fused operations which the library can

> use.

>

> Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Emilio G. Cota <cota@braap.org>

> ---

>  fpu/softfloat.c | 25 +++++++++++++++++++++++++

>  1 file changed, 25 insertions(+)

>


What about using gnu_get_libc_version() at runtime?

http://man7.org/linux/man-pages/man3/gnu_get_libc_version.3.html

> diff --git a/fpu/softfloat.c b/fpu/softfloat.c

> index 59eac97d10..9c2dbd04b5 100644

> --- a/fpu/softfloat.c

> +++ b/fpu/softfloat.c

> @@ -203,6 +203,25 @@ GEN_INPUT_FLUSH3(float64_input_flush3, float64)

>  # define QEMU_HARDFLOAT_3F64_USE_FP 0

>  #endif

>

> +/*

> + * Choose whether to accelerate fused multiply-accumulate operations

> + * with hard float functions. Some versions of glibc's maths library

> + * have been reported to be broken on x86 without FMA instructions.

> + */

> +#if defined(__x86_64__)

> +/* this was actually reported as glibc-2.12-1.149.el6_6.5.x86_64 was

> + * broken but glibc 2.12-1.209 works but out of caution lets disable

> + * for all older glibcs.

> + */

> +#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 12)

> +#define QEMU_HARDFLOAT_USE_FMA 0

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +#else

> +#define QEMU_HARDFLOAT_USE_FMA 1

> +#endif

> +

>  /*

>   * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over

>   * float{32,64}_is_infinity when !USE_FP.

> @@ -1551,6 +1570,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc,

int flags, float_status *s)
>      ub.s = xb;

>      uc.s = xc;

>

> +    if (!QEMU_HARDFLOAT_USE_FMA) {

> +        goto soft;

> +    }

>      if (unlikely(!can_use_fpu(s))) {

>          goto soft;

>      }

> @@ -1612,6 +1634,9 @@ float64_muladd(float64 xa, float64 xb, float64 xc,

int flags, float_status *s)
>      ub.s = xb;

>      uc.s = xc;

>

> +    if (!QEMU_HARDFLOAT_USE_FMA) {

> +        goto soft;

> +    }

>      if (unlikely(!can_use_fpu(s))) {

>          goto soft;

>      }

> --

> 2.17.1

>

>
Alex Bennée Jan. 7, 2019, 11:50 a.m. UTC | #6
Richard Henderson <richard.henderson@linaro.org> writes:

> On 12/21/18 11:30 AM, Emilio G. Cota wrote:

>> +    ua.s = 0x0020000000000001;

>> +    ub.s = 0x3ca0000000000000;

>> +    uc.s = 0x0020000000000000;

>> +    ur.h = fma(ua.h, ub.h, uc.h);

>> +    if (ur.s != 0x0020000000000001) {

>

> Forgot your ull's, but otherwise ok.

>

> In email to Alex, I did wonder if we should check for fma hardware (at least on

> x86).  Without a hardware insn, the libm routine is probably no faster than

> softmmu.


My only worry is we could end up doing a bunch of these processor
id/capability type checks. Is a correct but slow libm FMA going to be
much slower than our FMA? Will users even notice?

--
Alex Bennée
Alex Bennée Jan. 7, 2019, 12:42 p.m. UTC | #7
Emilio G. Cota <cota@braap.org> writes:

> On Thu, Dec 20, 2018 at 11:10:08 +0000, Alex Bennée wrote:

> (snip)

>> +#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 12)

>> +#define QEMU_HARDFLOAT_USE_FMA 0

>> +#else

>> +#define QEMU_HARDFLOAT_USE_FMA 1

>> +#endif

>> +#else

>> +#define QEMU_HARDFLOAT_USE_FMA 1

>> +#endif

>> +

>>  /*

>>   * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over

>>   * float{32,64}_is_infinity when !USE_FP.

>> @@ -1551,6 +1570,9 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)

>>      ub.s = xb;

>>      uc.s = xc;

>>

>> +    if (!QEMU_HARDFLOAT_USE_FMA) {

>> +        goto soft;

>> +    }

>

> I don't think this should be a compile-time check; if the QEMU binary

> is run on a system with a newer, fixed glibc (or any other libc), then

> we'll have disabled fma hardfloat unnecessarily.

>

> What do you think about the following?

>

> Laurent: if you want to test the below, you can pull it from

>    https://github.com/cota/qemu/tree/fma-fix

>

> Thanks,

>

> 		Emilio

> ---

> commit ddeec29a2c33550c5d018aeea05d45a23579ae1b

> Author: Emilio G. Cota <cota@braap.org>

> Date:   Fri Dec 21 14:08:57 2018 -0500

>

>     softfloat: enforce softfloat if the host's FMA is broken

>

>     The added branch is marked as unlikely and therefore its impact

>     on performance (measured with fp-bench) is within the noise range

>     when measured on an Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz.

>

>     Laurent Desnogues <laurent.desnogues@gmail.com>

>     Signed-off-by: Emilio G. Cota <cota@braap.org>


I've applied b8cc3928cee7b1d91bf39c86bec4801b9dc612e1 from your tree to
fpu/next although the numbers look a bit odd. I see:

Before:
  143.83 MFlops
  89.34 MFlops

After:
  150.20 MFlops
  85.73 MFlops

On my i7-4770 where as your commit seems to show a big jump in
performance which is odd as this is preventing a bug not enabling FMA.

> +

> +static void __attribute__((constructor)) softfloat_init(void)

> +{

> +    union_float64 ua, ub, uc, ur;

> +

> +    if (QEMU_NO_HARDFLOAT) {

> +        return;

> +    }

> +

> +    /*

> +     * Test that the host's FMA is not obviously broken. For example,

> +     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see

> +     *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304

> +     */

> +    ua.s = 0x0020000000000001;

> +    ub.s = 0x3ca0000000000000;

> +    uc.s = 0x0020000000000000;

> +    ur.h = fma(ua.h, ub.h, uc.h);

> +    if (ur.s != 0x0020000000000001) {

> +        host_fma_is_broken = true;

> +    }

> +}


I'm fine with the cpuid stuff at the bottom of softfloat for now. We can
move it later if the other micro-architectures want to get in on the
detecting features game.

--
Alex Bennée
diff mbox series

Patch

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 59eac97d10..9c2dbd04b5 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -203,6 +203,25 @@  GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 #endif
 
+/*
+ * Choose whether to accelerate fused multiply-accumulate operations
+ * with hard float functions. Some versions of glibc's maths library
+ * have been reported to be broken on x86 without FMA instructions.
+ */
+#if defined(__x86_64__)
+/* this was actually reported as glibc-2.12-1.149.el6_6.5.x86_64 was
+ * broken but glibc 2.12-1.209 works but out of caution lets disable
+ * for all older glibcs.
+ */
+#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 12)
+#define QEMU_HARDFLOAT_USE_FMA 0
+#else
+#define QEMU_HARDFLOAT_USE_FMA 1
+#endif
+#else
+#define QEMU_HARDFLOAT_USE_FMA 1
+#endif
+
 /*
  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
  * float{32,64}_is_infinity when !USE_FP.
@@ -1551,6 +1570,9 @@  float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     ub.s = xb;
     uc.s = xc;
 
+    if (!QEMU_HARDFLOAT_USE_FMA) {
+        goto soft;
+    }
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }
@@ -1612,6 +1634,9 @@  float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
     ub.s = xb;
     uc.s = xc;
 
+    if (!QEMU_HARDFLOAT_USE_FMA) {
+        goto soft;
+    }
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }