diff mbox series

[RFC,for,2.11,12/23] target/arm/translate-a64.c: add FP16 FAGCT to AdvSIMD 3 Same

Message ID 20170720150426.12393-13-alex.bennee@linaro.org
State New
Headers show
Series Implementing FP16 for ARMv8.2 using SoftFloat2a and 3c | expand

Commit Message

Alex Bennée July 20, 2017, 3:04 p.m. UTC
This adds the first of the softfloat3c helpers for handling FP16
operations. To interwork with the existing SoftFloat2a code we need to
copy the exceptions conditions across before each operation and
restore them afterwards.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
 target/arm/Makefile.objs    |  1 +
 target/arm/advsimd_helper.c | 85 +++++++++++++++++++++++++++++++++++++++++++++
 target/arm/helper-a64.h     |  3 ++
 target/arm/translate-a64.c  |  3 ++
 4 files changed, 92 insertions(+)
 create mode 100644 target/arm/advsimd_helper.c

-- 
2.13.0

Comments

Richard Henderson July 20, 2017, 7:35 p.m. UTC | #1
On 07/20/2017 05:04 AM, Alex Bennée wrote:
> +static softfloat_flags softfloat_mapping_table[] = {

> +    { float_flag_inexact  , softfloat_flag_inexact },

> +    { float_flag_underflow, softfloat_flag_underflow },

> +    { float_flag_overflow , softfloat_flag_overflow },

> +    { float_flag_invalid  , softfloat_flag_invalid },

> +};

> +/* FIXME: 2a has no infinite flag */

> +

> +static uint8_t sync_softfloat_flags_from_2a(float_status *flags2a)

> +{

> +    uint8_t flags = flags2a->float_exception_flags;

> +    int i;

> +    if (flags) {

> +        for (i = 0; i < ARRAY_SIZE(softfloat_mapping_table); i++) {

> +            struct softfloat_flags *map = &softfloat_mapping_table[i];

> +            if (flags & map->float2a_flag) {

> +                softfloat_raiseFlags(map->float3c_flag);

> +            }

> +        }

> +    } else {

> +        softfloat_exceptionFlags = 0;

> +    }

> +

> +    return softfloat_exceptionFlags;

> +}

> +


For conversions like this IMO it's better to make the compiler do the work. 
C.f. target/alpha/fpu_helper.c and CONVERT_BIT.

BTW, I think these TLS variables that softfloat3a are using are going to be a 
real problem.  It's one thing to do it temporarily like you are here, 
coordinating between 2a and 3c, but later, when the front end is fully 
converted?  That's just nonsense.

Is it going to be worthwhile to significantly hack up the incoming sources?

If so, then we might do something like this:  Given a 3c function foo,

   T foo_st(T, T, float3a_status *) { ... }
   T foo(T x, T y) { return foo_st(x, y, &tls_status); }

And of course pack all of the relevant state into a struct then

   #define softfloat_exceptionFlags tls_status.exceptionflags

etc, instead of having individual tls variables.  This feels like something 
that we could push back upstream, assuming that upstream is active.


r~
Alex Bennée July 21, 2017, 9:56 a.m. UTC | #2
Richard Henderson <rth@twiddle.net> writes:

> On 07/20/2017 05:04 AM, Alex Bennée wrote:

>> +static softfloat_flags softfloat_mapping_table[] = {

>> +    { float_flag_inexact  , softfloat_flag_inexact },

>> +    { float_flag_underflow, softfloat_flag_underflow },

>> +    { float_flag_overflow , softfloat_flag_overflow },

>> +    { float_flag_invalid  , softfloat_flag_invalid },

>> +};

>> +/* FIXME: 2a has no infinite flag */

>> +

>> +static uint8_t sync_softfloat_flags_from_2a(float_status *flags2a)

>> +{

>> +    uint8_t flags = flags2a->float_exception_flags;

>> +    int i;

>> +    if (flags) {

>> +        for (i = 0; i < ARRAY_SIZE(softfloat_mapping_table); i++) {

>> +            struct softfloat_flags *map = &softfloat_mapping_table[i];

>> +            if (flags & map->float2a_flag) {

>> +                softfloat_raiseFlags(map->float3c_flag);

>> +            }

>> +        }

>> +    } else {

>> +        softfloat_exceptionFlags = 0;

>> +    }

>> +

>> +    return softfloat_exceptionFlags;

>> +}

>> +

>

> For conversions like this IMO it's better to make the compiler do the

> work. C.f. target/alpha/fpu_helper.c and CONVERT_BIT.


Cool, I'll look at that.

> BTW, I think these TLS variables that softfloat3a are using are going

> to be a real problem.  It's one thing to do it temporarily like you

> are here, coordinating between 2a and 3c, but later, when the front

> end is fully converted?  That's just nonsense.


Wouldn't the other option to be to drop float_status out of the guests
CPUEnv and grab it from the TLS variable instead? Of course all guests
would need to be MTTCG enabled for that to work.

> Is it going to be worthwhile to significantly hack up the incoming sources?

>

> If so, then we might do something like this:  Given a 3c function foo,

>

>   T foo_st(T, T, float3a_status *) { ... }

>   T foo(T x, T y) { return foo_st(x, y, &tls_status); }

>

> And of course pack all of the relevant state into a struct then

>

>   #define softfloat_exceptionFlags tls_status.exceptionflags

>

> etc, instead of having individual tls variables.  This feels like

> something that we could push back upstream, assuming that upstream is

> active.


Peter has found what might be an upstream source repository so we can
certainly look at that.

--
Alex Bennée
Aurelien Jarno July 21, 2017, 1:21 p.m. UTC | #3
On 2017-07-21 10:56, Alex Bennée wrote:
> 

> Richard Henderson <rth@twiddle.net> writes:

> 

> > On 07/20/2017 05:04 AM, Alex Bennée wrote:

> >> +static softfloat_flags softfloat_mapping_table[] = {

> >> +    { float_flag_inexact  , softfloat_flag_inexact },

> >> +    { float_flag_underflow, softfloat_flag_underflow },

> >> +    { float_flag_overflow , softfloat_flag_overflow },

> >> +    { float_flag_invalid  , softfloat_flag_invalid },

> >> +};

> >> +/* FIXME: 2a has no infinite flag */

> >> +

> >> +static uint8_t sync_softfloat_flags_from_2a(float_status *flags2a)

> >> +{

> >> +    uint8_t flags = flags2a->float_exception_flags;

> >> +    int i;

> >> +    if (flags) {

> >> +        for (i = 0; i < ARRAY_SIZE(softfloat_mapping_table); i++) {

> >> +            struct softfloat_flags *map = &softfloat_mapping_table[i];

> >> +            if (flags & map->float2a_flag) {

> >> +                softfloat_raiseFlags(map->float3c_flag);

> >> +            }

> >> +        }

> >> +    } else {

> >> +        softfloat_exceptionFlags = 0;

> >> +    }

> >> +

> >> +    return softfloat_exceptionFlags;

> >> +}

> >> +

> >

> > For conversions like this IMO it's better to make the compiler do the

> > work. C.f. target/alpha/fpu_helper.c and CONVERT_BIT.

> 

> Cool, I'll look at that.

> 

> > BTW, I think these TLS variables that softfloat3a are using are going

> > to be a real problem.  It's one thing to do it temporarily like you

> > are here, coordinating between 2a and 3c, but later, when the front

> > end is fully converted?  That's just nonsense.

> 

> Wouldn't the other option to be to drop float_status out of the guests

> CPUEnv and grab it from the TLS variable instead? Of course all guests

> would need to be MTTCG enabled for that to work.


As said in another email, some architectures actually use more than one
float_status. We therefore need to implement a solution like the one
proposed by Richard.

Aurelien

-- 
Aurelien Jarno                          GPG: 4096R/1DDD8C9B
aurelien@aurel32.net                 http://www.aurel32.net
Alex Bennée July 21, 2017, 1:50 p.m. UTC | #4
Aurelien Jarno <aurelien@aurel32.net> writes:

> On 2017-07-21 10:56, Alex Bennée wrote:

>>

>> Richard Henderson <rth@twiddle.net> writes:

>>

>> > On 07/20/2017 05:04 AM, Alex Bennée wrote:

>> >> +static softfloat_flags softfloat_mapping_table[] = {

>> >> +    { float_flag_inexact  , softfloat_flag_inexact },

>> >> +    { float_flag_underflow, softfloat_flag_underflow },

>> >> +    { float_flag_overflow , softfloat_flag_overflow },

>> >> +    { float_flag_invalid  , softfloat_flag_invalid },

>> >> +};

>> >> +/* FIXME: 2a has no infinite flag */

>> >> +

>> >> +static uint8_t sync_softfloat_flags_from_2a(float_status *flags2a)

>> >> +{

>> >> +    uint8_t flags = flags2a->float_exception_flags;

>> >> +    int i;

>> >> +    if (flags) {

>> >> +        for (i = 0; i < ARRAY_SIZE(softfloat_mapping_table); i++) {

>> >> +            struct softfloat_flags *map = &softfloat_mapping_table[i];

>> >> +            if (flags & map->float2a_flag) {

>> >> +                softfloat_raiseFlags(map->float3c_flag);

>> >> +            }

>> >> +        }

>> >> +    } else {

>> >> +        softfloat_exceptionFlags = 0;

>> >> +    }

>> >> +

>> >> +    return softfloat_exceptionFlags;

>> >> +}

>> >> +

>> >

>> > For conversions like this IMO it's better to make the compiler do the

>> > work. C.f. target/alpha/fpu_helper.c and CONVERT_BIT.

>>

>> Cool, I'll look at that.

>>

>> > BTW, I think these TLS variables that softfloat3a are using are going

>> > to be a real problem.  It's one thing to do it temporarily like you

>> > are here, coordinating between 2a and 3c, but later, when the front

>> > end is fully converted?  That's just nonsense.

>>

>> Wouldn't the other option to be to drop float_status out of the guests

>> CPUEnv and grab it from the TLS variable instead? Of course all guests

>> would need to be MTTCG enabled for that to work.

>

> As said in another email, some architectures actually use more than one

> float_status. We therefore need to implement a solution like the one

> proposed by Richard.


Ahh you mean more than one float_status for a given vCPU context?

--
Alex Bennée
Peter Maydell July 21, 2017, 1:58 p.m. UTC | #5
On 21 July 2017 at 14:50, Alex Bennée <alex.bennee@linaro.org> wrote:
> Aurelien Jarno <aurelien@aurel32.net> writes:

>> As said in another email, some architectures actually use more than one

>> float_status. We therefore need to implement a solution like the one

>> proposed by Richard.

>

> Ahh you mean more than one float_status for a given vCPU context?


Yep. ARM's cpu state struct has
        float_status fp_status;
        float_status standard_fp_status;

which we use to handle (1) operations which use the state
controlled by the FPSCR value and (2) operations which
ignore the FPSCR and use the "Standard FPSCR Value" (generally
Neon ops). More info in the comment in cpu.h...

thanks
-- PMM
Aurelien Jarno July 21, 2017, 5:43 p.m. UTC | #6
On 2017-07-21 14:58, Peter Maydell wrote:
> On 21 July 2017 at 14:50, Alex Bennée <alex.bennee@linaro.org> wrote:

> > Aurelien Jarno <aurelien@aurel32.net> writes:

> >> As said in another email, some architectures actually use more than one

> >> float_status. We therefore need to implement a solution like the one

> >> proposed by Richard.

> >

> > Ahh you mean more than one float_status for a given vCPU context?

> 

> Yep. ARM's cpu state struct has

>         float_status fp_status;

>         float_status standard_fp_status;

> 

> which we use to handle (1) operations which use the state

> controlled by the FPSCR value and (2) operations which

> ignore the FPSCR and use the "Standard FPSCR Value" (generally

> Neon ops). More info in the comment in cpu.h...


Indeed. This is also the case on:
- i386: one float_status for the x87 FPU, one for MMX and one for SSE.
- mips: one for the FPU and one for the MSA FP (SIMD operations).
- ppc: one for the FPU instructions and one for the VEC instructions.

Aurelien

-- 
Aurelien Jarno                          GPG: 4096R/1DDD8C9B
aurelien@aurel32.net                 http://www.aurel32.net
Alex Bennée July 28, 2017, 1 p.m. UTC | #7
Richard Henderson <rth@twiddle.net> writes:

> On 07/20/2017 05:04 AM, Alex Bennée wrote:

>> +static softfloat_flags softfloat_mapping_table[] = {

>> +    { float_flag_inexact  , softfloat_flag_inexact },

>> +    { float_flag_underflow, softfloat_flag_underflow },

>> +    { float_flag_overflow , softfloat_flag_overflow },

>> +    { float_flag_invalid  , softfloat_flag_invalid },

>> +};

>> +/* FIXME: 2a has no infinite flag */

>> +

>> +static uint8_t sync_softfloat_flags_from_2a(float_status *flags2a)

>> +{

>> +    uint8_t flags = flags2a->float_exception_flags;

>> +    int i;

>> +    if (flags) {

>> +        for (i = 0; i < ARRAY_SIZE(softfloat_mapping_table); i++) {

>> +            struct softfloat_flags *map = &softfloat_mapping_table[i];

>> +            if (flags & map->float2a_flag) {

>> +                softfloat_raiseFlags(map->float3c_flag);

>> +            }

>> +        }

>> +    } else {

>> +        softfloat_exceptionFlags = 0;

>> +    }

>> +

>> +    return softfloat_exceptionFlags;

>> +}

>> +

>

> For conversions like this IMO it's better to make the compiler do the

> work. C.f. target/alpha/fpu_helper.c and CONVERT_BIT.

>

> BTW, I think these TLS variables that softfloat3a are using are going

> to be a real problem.  It's one thing to do it temporarily like you

> are here, coordinating between 2a and 3c, but later, when the front

> end is fully converted?  That's just nonsense.

>

> Is it going to be worthwhile to significantly hack up the incoming sources?

>

> If so, then we might do something like this:  Given a 3c function foo,

>

>   T foo_st(T, T, float3a_status *) { ... }

>   T foo(T x, T y) { return foo_st(x, y, &tls_status); }

>

> And of course pack all of the relevant state into a struct then

>

>   #define softfloat_exceptionFlags tls_status.exceptionflags

>

> etc, instead of having individual tls variables.  This feels like

> something that we could push back upstream, assuming that upstream is

> active.


Another option which might be less invasive to the API (although
arguably a bit side-effecty) would be to make:

THREAD_LOCAL uint_fast8_t *softfloat_exceptionFlags;

And add a helper to de-reference softfloat_exceptionFlags when setting
them so that all the:

  softfloat_exceptionFlags |= softfloat_flag_inexact;

Become:

  softfloat_raiseException(softfloat_flag_inexect);

With something like:

void softfloat_raiseException(uint_fast8_t new)
{
        assert(softfloat_exceptionFlags);
        *softfloat_exceptionFlags |= new;
}

So new helpers could just do:

uint32_t HELPER(advsimd_foo)(uint32_t a, uint32_t b, void *fpstp)
{
    uint_fast8_t *old_fpst = softfloat_exceptionFlags;
    softfloat_exceptionFlags = (uint_fast8t *) fpstp;

    /* Do stuff */

    softfloat_exceptionFlags = old_fpst;
}

Unless there is code that directly fiddles the fpst bits in TCG I think
that should work well enough as any vCPU can only be doing one sort of
float at a time.

Anyway I guess this is all moot until we get an answer as to there being
an active upstream to push stuff back up to. I reached out yesterday
(https://github.com/ucb-bar/berkeley-softfloat-3/issues/5) so we'll see
if we get any feedback. Otherwise I'm minded to stick with 2a and just
implement the half-precision stuff there.


--
Alex Bennée
Richard Henderson July 28, 2017, 4:07 p.m. UTC | #8
On 07/28/2017 06:00 AM, Alex Bennée wrote:
>      uint_fast8_t *old_fpst = softfloat_exceptionFlags;

>      softfloat_exceptionFlags = (uint_fast8t *) fpstp;


Better, but not best.  We still need to collect all of the different variables 
that are currently members of float_status, of which this is but one.

For QEMU purposes, I wonder if resetting this to NULL when leaving the 
softfloat helper would be best.


r~
diff mbox series

Patch

diff --git a/target/arm/Makefile.objs b/target/arm/Makefile.objs
index 847fb52ee0..d4e81b13f0 100644
--- a/target/arm/Makefile.objs
+++ b/target/arm/Makefile.objs
@@ -6,6 +6,7 @@  obj-$(call land,$(CONFIG_KVM),$(TARGET_AARCH64)) += kvm64.o
 obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o
 obj-y += translate.o op_helper.o helper.o cpu.o
 obj-y += neon_helper.o iwmmxt_helper.o
+obj-$(TARGET_AARCH64) += advsimd_helper.o
 obj-y += gdbstub.o
 obj-$(TARGET_AARCH64) += cpu64.o translate-a64.o helper-a64.o gdbstub64.o
 obj-y += crypto_helper.o
diff --git a/target/arm/advsimd_helper.c b/target/arm/advsimd_helper.c
new file mode 100644
index 0000000000..ec875a83bb
--- /dev/null
+++ b/target/arm/advsimd_helper.c
@@ -0,0 +1,85 @@ 
+/*
+ * ARM AdvancedSIMD helper functions
+ *
+ * Copyright (c) 2017 Linaro.
+ * Author: Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This code is licensed under the GNU GPL v2.
+ *
+ * This code is specifically for AdvancedSIMD helpers rather than
+ * shared NEON helpers which are re-purposed for ARMv8. In practice
+ * these are helpers for newer features not found in older ARMs,
+ * currently half-precision float support.
+ *
+ * As such these helpers use the SoftFloat3c code. As we currently use
+ * both SoftFloat cores we copy the SoftFloat2a status bits to 3c
+ * before the operation and copy back at the end.
+ */
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "exec/helper-proto.h"
+
+#include "fpu/softfloat3c/platform.h"
+#include "fpu/softfloat3c/softfloat.h"
+#include "fpu/softfloat3c/softfloat-internals.h"
+
+typedef struct softfloat_flags {
+    uint8_t float2a_flag;
+    uint8_t float3c_flag;
+} softfloat_flags;
+
+static softfloat_flags softfloat_mapping_table[] = {
+    { float_flag_inexact  , softfloat_flag_inexact },
+    { float_flag_underflow, softfloat_flag_underflow },
+    { float_flag_overflow , softfloat_flag_overflow },
+    { float_flag_invalid  , softfloat_flag_invalid },
+};
+/* FIXME: 2a has no infinite flag */
+
+static uint8_t sync_softfloat_flags_from_2a(float_status *flags2a)
+{
+    uint8_t flags = flags2a->float_exception_flags;
+    int i;
+    if (flags) {
+        for (i = 0; i < ARRAY_SIZE(softfloat_mapping_table); i++) {
+            struct softfloat_flags *map = &softfloat_mapping_table[i];
+            if (flags & map->float2a_flag) {
+                softfloat_raiseFlags(map->float3c_flag);
+            }
+        }
+    } else {
+        softfloat_exceptionFlags = 0;
+    }
+
+    return softfloat_exceptionFlags;
+}
+
+static void sync_softfloat_flags_to_2a(float_status *flags2a)
+{
+    int i;
+    if (softfloat_exceptionFlags) {
+        for (i = 0; i < ARRAY_SIZE(softfloat_mapping_table); i++) {
+            struct softfloat_flags *map = &softfloat_mapping_table[i];
+            if (softfloat_exceptionFlags & map->float3c_flag) {
+                float_raise(map->float2a_flag, flags2a);
+            }
+        }
+    } else {
+        flags2a->float_exception_flags = 0;
+    }
+}
+
+
+uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+    union ui16_f16 uA = { .ui = (a & 0x7fff) };
+    union ui16_f16 uB = { .ui = (b & 0x7fff) };
+    uint32_t r;
+
+    sync_softfloat_flags_from_2a(fpstp);
+    r = -f16_lt(uB.f, uA.f);
+    sync_softfloat_flags_to_2a(fpstp);
+    return r;
+}
diff --git a/target/arm/helper-a64.h b/target/arm/helper-a64.h
index 6f9eaba533..48b400ca8e 100644
--- a/target/arm/helper-a64.h
+++ b/target/arm/helper-a64.h
@@ -44,3 +44,6 @@  DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
 DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
 DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
 DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
+
+/* helper_advsimd.c */
+DEF_HELPER_3(advsimd_acgt_f16, i32, i32, i32, ptr)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index c766829ff9..06da8408f6 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -9764,6 +9764,9 @@  static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
         read_vec_element_i32(s, tcg_op2, rm, pass, MO_16);
 
         switch (fpopcode) {
+        case 0x35: /* FACGT */
+            gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst);
+            break;
         default:
             fprintf(stderr,"%s: insn %#04x fpop %#2x\n", __func__, insn, fpopcode);
             unsupported_encoding(s, insn);