diff mbox series

[v2,4/7] softfloat: fallback to __int128 maths for s390x and others

Message ID 20190116202349.29272-5-alex.bennee@linaro.org
State New
Headers show
Series current fpu/next queue | expand

Commit Message

Alex Bennée Jan. 16, 2019, 8:23 p.m. UTC
Apparently some versions of clang can't handle inline assembly with
__int128 parameters, especially on s390. Instead of hand-coding the
s390 divide, provide a generic fallback for anything that provides
__int128-capable maths.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

Cc: Thomas Huth <thuth@redhat.com>
---
 include/fpu/softfloat-macros.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

-- 
2.17.1

Comments

Richard Henderson Jan. 16, 2019, 10:17 p.m. UTC | #1
On 1/17/19 7:23 AM, Alex Bennée wrote:
> Apparently some versions of clang can't handle inline assembly with

> __int128 parameters, especially on s390. Instead of hand-coding the

> s390 divide provide a generic fallback for anything that provides

> __int128 capable maths.

> 

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Thomas Huth <thuth@redhat.com>

> ---

>  include/fpu/softfloat-macros.h | 10 ++++------

>  1 file changed, 4 insertions(+), 6 deletions(-)

> 

> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h

> index b1d772e6d4..1a43609eef 100644

> --- a/include/fpu/softfloat-macros.h

> +++ b/include/fpu/softfloat-macros.h

> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,

>      uint64_t q;

>      asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));

>      return q;

> -#elif defined(__s390x__)

> -    /* Need to use a TImode type to get an even register pair for DLGR.  */

> -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;

> -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));

> -    *r = n >> 64;

> -    return n;

>  #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)

>      /* From Power ISA 2.06, programming note for divdeu.  */

>      uint64_t q1, q2, Q, r1, r2, R;

> @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,

>      }

>      *r = R;

>      return Q;

> +#elif defined(CONFIG_INT128)

> +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;

> +    *r = n % d;

> +    return n / d;

>  #else


I thought that we'd shown that, at least at present, no compiler is taking
advantage of hardware insns for this; instead they promote this to a full
128-bit divide.  And further that the version using 64-bit arithmetic was
competitive with the hardware insn.

I'd rather not include this hunk for now.


r~
Thomas Huth Jan. 17, 2019, 6:08 a.m. UTC | #2
On 2019-01-16 21:23, Alex Bennée wrote:
> Apparently some versions of clang can't handle inline assembly with

> __int128 parameters, especially on s390. Instead of hand-coding the

> s390 divide provide a generic fallback for anything that provides

> __int128 capable maths.

> 

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Thomas Huth <thuth@redhat.com>

> ---

>  include/fpu/softfloat-macros.h | 10 ++++------

>  1 file changed, 4 insertions(+), 6 deletions(-)

> 

> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h

> index b1d772e6d4..1a43609eef 100644

> --- a/include/fpu/softfloat-macros.h

> +++ b/include/fpu/softfloat-macros.h

> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,

>      uint64_t q;

>      asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));

>      return q;

> -#elif defined(__s390x__)

> -    /* Need to use a TImode type to get an even register pair for DLGR.  */

> -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;

> -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));

> -    *r = n >> 64;

> -    return n;

>  #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)

>      /* From Power ISA 2.06, programming note for divdeu.  */

>      uint64_t q1, q2, Q, r1, r2, R;

> @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,

>      }

>      *r = R;

>      return Q;

> +#elif defined(CONFIG_INT128)

> +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;

> +    *r = n % d;

> +    return n / d;

>  #else

>      uint64_t d0, d1, q0, q1, r1, r0, m;


No, please don't. Use my !defined(__clang__) patch instead, please.

 Thomas
Alex Bennée Jan. 17, 2019, 7:48 a.m. UTC | #3
Richard Henderson <richard.henderson@linaro.org> writes:

> On 1/17/19 7:23 AM, Alex Bennée wrote:

>> Apparently some versions of clang can't handle inline assembly with

>> __int128 parameters, especially on s390. Instead of hand-coding the

>> s390 divide provide a generic fallback for anything that provides

>> __int128 capable maths.

>>

>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>> Cc: Thomas Huth <thuth@redhat.com>

>> ---

>>  include/fpu/softfloat-macros.h | 10 ++++------

>>  1 file changed, 4 insertions(+), 6 deletions(-)

>>

>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h

>> index b1d772e6d4..1a43609eef 100644

>> --- a/include/fpu/softfloat-macros.h

>> +++ b/include/fpu/softfloat-macros.h

>> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,

>>      uint64_t q;

>>      asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));

>>      return q;

>> -#elif defined(__s390x__)

>> -    /* Need to use a TImode type to get an even register pair for DLGR.  */

>> -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;

>> -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));

>> -    *r = n >> 64;

>> -    return n;

>>  #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)

>>      /* From Power ISA 2.06, programming note for divdeu.  */

>>      uint64_t q1, q2, Q, r1, r2, R;

>> @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,

>>      }

>>      *r = R;

>>      return Q;

>> +#elif defined(CONFIG_INT128)

>> +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;

>> +    *r = n % d;

>> +    return n / d;

>>  #else

>

> I thought that we'd shown that, at least at present, no compiler is taking

> advantage of hardware insns for this, and is promoting this to a full 128-bit

> divide.  And further that the version using 64-bit arithmetic was competitive

> with the hardware insn.


Yeah it seems so. While Thomas' numbers weren't convincing, the
CONFIG_INT128 fallback did trigger on my SynQuacer and knocked off about
2 MFlops of its admittedly slow performance. Amusingly of course it's
faster under translation because of the hardware fallback:

07:44:44 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ./fp-bench -o div -p double
13.28 MFlops
07:44:49 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ./fp-bench -o div -p double -t host
498.20 MFlops
07:44:53 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ../../aarch64-linux-user/qemu-aarch64  ./fp-bench -o div -p double -t host
52.71 MFlops

I'll drop this and use Thomas' #elif defined(__s390x__) &&
!defined(__clang__) version in the pull-request.

--
Alex Bennée
diff mbox series

Patch

diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index b1d772e6d4..1a43609eef 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -641,12 +641,6 @@  static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
     uint64_t q;
     asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
     return q;
-#elif defined(__s390x__)
-    /* Need to use a TImode type to get an even register pair for DLGR.  */
-    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
-    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
-    *r = n >> 64;
-    return n;
 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
     /* From Power ISA 2.06, programming note for divdeu.  */
     uint64_t q1, q2, Q, r1, r2, R;
@@ -663,6 +657,10 @@  static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
     }
     *r = R;
     return Q;
+#elif defined(CONFIG_INT128)
+    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+    *r = n % d;
+    return n / d;
 #else
     uint64_t d0, d1, q0, q1, r1, r0, m;