diff mbox series

[23/36] target/arm: Convert Neon 64-bit element 3-reg-same insns

Message ID 20200430181003.21682-24-peter.maydell@linaro.org
State Superseded
Headers show
Series target/arm: Convert Neon to decodetree (part 1) | expand

Commit Message

Peter Maydell April 30, 2020, 6:09 p.m. UTC
Convert the 64-bit element insns in the 3-reg-same group
to decodetree. This covers VQSHL, VRSHL and VQRSHL where
size==0b11.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

---
 target/arm/translate-neon.inc.c | 62 +++++++++++++++++++++++++++++++++
 target/arm/translate.c          | 38 ++------------------
 target/arm/neon-dp.decode       | 11 ++++++
 3 files changed, 75 insertions(+), 36 deletions(-)

-- 
2.20.1

Comments

Richard Henderson April 30, 2020, 8:54 p.m. UTC | #1
On 4/30/20 11:09 AM, Peter Maydell wrote:
> +

> +    rn = tcg_temp_new_i64();

> +    rm = tcg_temp_new_i64();

> +    rd = tcg_temp_new_i64();

> +

> +    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {

> +        neon_load_reg64(rn, a->vn + pass);

> +        neon_load_reg64(rm, a->vm + pass);

> +        fn(rd, rm, rn);

> +        neon_store_reg64(rd, a->vd + pass);

> +    }

> +

> +    tcg_temp_free_i64(rn);

> +    tcg_temp_free_i64(rm);

> +    tcg_temp_free_i64(rd);

> +

> +    return true;

> +}

> +

> +#define DO_3SAME_64(INSN, FUNC)                                         \

> +    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \

> +    {                                                                   \

> +        return do_3same_64(s, a, FUNC);                                 \

> +    }


You can morph this into the gvec interface like so:

#define DO_3SAME_64(INSN, FUNC) \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,
                                uint32_t rn_ofs, uint32_t rm_ofs,
                                uint32_t oprsz, uint32_t maxsz)
    {
        static const GVecGen3 op = { .fni8 = FUNC };
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs,
                       oprsz, maxsz, &op);
    }
    DO_3SAME(INSN, gen_##INSN##_3s)

The .fni8 function tells gvec that we have a helper that processes the
operation in 8 byte chunks.  It will handle the pass loop for you.

There's also a .fni4 member, for those neon helpers that operate on 4-byte
quantities, fwiw.


r~
Peter Maydell May 1, 2020, 3:36 p.m. UTC | #2
On Thu, 30 Apr 2020 at 21:54, Richard Henderson
<richard.henderson@linaro.org> wrote:
> You can morph this into the gvec interface like so:

>

> #define DO_3SAME_64(INSN, FUNC) \

>     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,

>                                 uint32_t rn_ofs, uint32_t rm_ofs,

>                                 uint32_t oprsz, uint32_t maxsz)

>     {

>         static const GVecGen3 op = { .fni8 = FUNC };

>         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs,

>                        oprsz, maxsz, &op);

>     }

>     DO_3SAME(INSN, gen_##INSN##_3s)

>

> The .fni8 function tells gvec that we have a helper that processes the

> operation in 8 byte chunks.  It will handle the pass loop for you.

>

> There's also a .fni4 member, for those neon helpers that operate on 4-byte

> quantities, fwiw.


Is there a version of this that works on functions that need
to be passed the cpu_env, or do I have to create a trampoline
function that just calls the real helper function passing it
the extra argument ?

thanks
-- PMM
Richard Henderson May 1, 2020, 3:50 p.m. UTC | #3
On 5/1/20 8:36 AM, Peter Maydell wrote:
> On Thu, 30 Apr 2020 at 21:54, Richard Henderson

> <richard.henderson@linaro.org> wrote:

>> You can morph this into the gvec interface like so:

>>

>> #define DO_3SAME_64(INSN, FUNC) \

>>     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,

>>                                 uint32_t rn_ofs, uint32_t rm_ofs,

>>                                 uint32_t oprsz, uint32_t maxsz)

>>     {

>>         static const GVecGen3 op = { .fni8 = FUNC };

>>         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs,

>>                        oprsz, maxsz, &op);

>>     }

>>     DO_3SAME(INSN, gen_##INSN##_3s)

>>

>> The .fni8 function tells gvec that we have a helper that processes the

>> operation in 8 byte chunks.  It will handle the pass loop for you.

>>

>> There's also a .fni4 member, for those neon helpers that operate on 4-byte

>> quantities, fwiw.

> 

> Is there a version of this that works on functions that need

> to be passed the cpu_env, or do I have to create a trampoline

> function that just calls the real helper function passing it

> the extra argument ?


A trampoline is required.

The original intention of the hook is to expand some inline tcg ops.  That it
can be used to call a helper is a happy accident.  For a helper that needs env,
ideally we would use tcg_gen_gvec_ptr and handle the vector with one call.


r~
Peter Maydell May 1, 2020, 3:54 p.m. UTC | #4
On Thu, 30 Apr 2020 at 21:54, Richard Henderson
<richard.henderson@linaro.org> wrote:
>

> On 4/30/20 11:09 AM, Peter Maydell wrote:

> > +

> > +    rn = tcg_temp_new_i64();

> > +    rm = tcg_temp_new_i64();

> > +    rd = tcg_temp_new_i64();

> > +

> > +    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {

> > +        neon_load_reg64(rn, a->vn + pass);

> > +        neon_load_reg64(rm, a->vm + pass);

> > +        fn(rd, rm, rn);

> > +        neon_store_reg64(rd, a->vd + pass);

> > +    }

> > +

> > +    tcg_temp_free_i64(rn);

> > +    tcg_temp_free_i64(rm);

> > +    tcg_temp_free_i64(rd);

> > +

> > +    return true;

> > +}

> > +

> > +#define DO_3SAME_64(INSN, FUNC)                                         \

> > +    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \

> > +    {                                                                   \

> > +        return do_3same_64(s, a, FUNC);                                 \

> > +    }

>

> You can morph this into the gvec interface like so:

>

> #define DO_3SAME_64(INSN, FUNC) \

>     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,

>                                 uint32_t rn_ofs, uint32_t rm_ofs,

>                                 uint32_t oprsz, uint32_t maxsz)

>     {

>         static const GVecGen3 op = { .fni8 = FUNC };

>         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs,

>                        oprsz, maxsz, &op);

>     }

>     DO_3SAME(INSN, gen_##INSN##_3s)

>

> The .fni8 function tells gvec that we have a helper that processes the

> operation in 8 byte chunks.  It will handle the pass loop for you.


This doesn't quite work, because these are shift ops and
so the operands are passed to the helper in the order
rd, rm, rn. Reshuffling the order of arguments to
tcg_gen_gvec_3() fixes this, though.

I guess I should call the macro DO_3SAME_SHIFT64, I hadn't
noticed it was shift specific because the only thing we do
with it is shifts.

thanks
-- PMM
Peter Maydell May 1, 2020, 3:57 p.m. UTC | #5
On Fri, 1 May 2020 at 16:50, Richard Henderson
<richard.henderson@linaro.org> wrote:
> The original intention of the hook is to expand some inline tcg ops.  That it

> can be used to call a helper is a happy accident.  For a helper that needs env,

> ideally we would use tcg_gen_gvec_ptr and handle the vector with one call.


The inconsistency where half the helpers nede to be passed cpu_env
and the other half don't is really irritating for writing code
that calls them. Lots of ought-to-be-common code ends up needing
two versions :-(

-- PMM
Richard Henderson May 1, 2020, 4:12 p.m. UTC | #6
On 5/1/20 8:57 AM, Peter Maydell wrote:
> On Fri, 1 May 2020 at 16:50, Richard Henderson

> <richard.henderson@linaro.org> wrote:

>> The original intention of the hook is to expand some inline tcg ops.  That it

>> can be used to call a helper is a happy accident.  For a helper that needs env,

>> ideally we would use tcg_gen_gvec_ptr and handle the vector with one call.

> 

> The inconsistency where half the helpers nede to be passed cpu_env

> and the other half don't is really irritating for writing code

> that calls them. Lots of ought-to-be-common code ends up needing

> two versions :-(


Yep.  Lots of room for additional cleanup here.


r~
Richard Henderson May 1, 2020, 4:13 p.m. UTC | #7
On 5/1/20 8:54 AM, Peter Maydell wrote:
> On Thu, 30 Apr 2020 at 21:54, Richard Henderson

> <richard.henderson@linaro.org> wrote:

>>

>> On 4/30/20 11:09 AM, Peter Maydell wrote:

>>> +

>>> +    rn = tcg_temp_new_i64();

>>> +    rm = tcg_temp_new_i64();

>>> +    rd = tcg_temp_new_i64();

>>> +

>>> +    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {

>>> +        neon_load_reg64(rn, a->vn + pass);

>>> +        neon_load_reg64(rm, a->vm + pass);

>>> +        fn(rd, rm, rn);

>>> +        neon_store_reg64(rd, a->vd + pass);

>>> +    }

>>> +

>>> +    tcg_temp_free_i64(rn);

>>> +    tcg_temp_free_i64(rm);

>>> +    tcg_temp_free_i64(rd);

>>> +

>>> +    return true;

>>> +}

>>> +

>>> +#define DO_3SAME_64(INSN, FUNC)                                         \

>>> +    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \

>>> +    {                                                                   \

>>> +        return do_3same_64(s, a, FUNC);                                 \

>>> +    }

>>

>> You can morph this into the gvec interface like so:

>>

>> #define DO_3SAME_64(INSN, FUNC) \

>>     static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,

>>                                 uint32_t rn_ofs, uint32_t rm_ofs,

>>                                 uint32_t oprsz, uint32_t maxsz)

>>     {

>>         static const GVecGen3 op = { .fni8 = FUNC };

>>         tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs,

>>                        oprsz, maxsz, &op);

>>     }

>>     DO_3SAME(INSN, gen_##INSN##_3s)

>>

>> The .fni8 function tells gvec that we have a helper that processes the

>> operation in 8 byte chunks.  It will handle the pass loop for you.

> 

> This doesn't quite work, because these are shift ops and

> so the operands are passed to the helper in the order

> rd, rm, rn. Reshuffling the order of arguments to

> tcg_gen_gvec_3() fixes this, though.

> 

> I guess I should call the macro DO_3SAME_SHIFT64, I hadn't

> noticed it was shift specific because the only thing we do

> with it is shifts.


See my reply to patch 26.  I think we should swap these operands during decode.


r~
diff mbox series

Patch

diff --git a/target/arm/translate-neon.inc.c b/target/arm/translate-neon.inc.c
index 161313ad879..bc5afb368e3 100644
--- a/target/arm/translate-neon.inc.c
+++ b/target/arm/translate-neon.inc.c
@@ -908,3 +908,65 @@  static bool trans_SHA256SU1_3s(DisasContext *s, arg_SHA256SU1_3s *a)
 
     return true;
 }
+
+static bool do_3same_64(DisasContext *s, arg_3same *a, NeonGenTwo64OpFn *fn)
+{
+    /* Handle 3-reg-same operations to be performed 64 bits at a time */
+    TCGv_i64 rn, rm, rd;
+    int pass;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vn | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if ((a->vn | a->vm | a->vd) & a->q) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    rn = tcg_temp_new_i64();
+    rm = tcg_temp_new_i64();
+    rd = tcg_temp_new_i64();
+
+    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
+        neon_load_reg64(rn, a->vn + pass);
+        neon_load_reg64(rm, a->vm + pass);
+        fn(rd, rm, rn);
+        neon_store_reg64(rd, a->vd + pass);
+    }
+
+    tcg_temp_free_i64(rn);
+    tcg_temp_free_i64(rm);
+    tcg_temp_free_i64(rd);
+
+    return true;
+}
+
+#define DO_3SAME_64(INSN, FUNC)                                         \
+    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
+    {                                                                   \
+        return do_3same_64(s, a, FUNC);                                 \
+    }
+
+#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
+    static void gen_##INSN##_3s(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)     \
+    {                                                                   \
+        FUNC(d, cpu_env, n, m);                                         \
+    }                                                                   \
+    DO_3SAME_64(INSN, gen_##INSN##_3s)
+
+DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
+DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
+DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
+DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
+DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
+DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 160638e2a7c..fb64eb3a800 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4791,42 +4791,8 @@  static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
         }
 
         if (size == 3) {
-            /* 64-bit element instructions. */
-            for (pass = 0; pass < (q ? 2 : 1); pass++) {
-                neon_load_reg64(cpu_V0, rn + pass);
-                neon_load_reg64(cpu_V1, rm + pass);
-                switch (op) {
-                case NEON_3R_VQSHL:
-                    if (u) {
-                        gen_helper_neon_qshl_u64(cpu_V0, cpu_env,
-                                                 cpu_V1, cpu_V0);
-                    } else {
-                        gen_helper_neon_qshl_s64(cpu_V0, cpu_env,
-                                                 cpu_V1, cpu_V0);
-                    }
-                    break;
-                case NEON_3R_VRSHL:
-                    if (u) {
-                        gen_helper_neon_rshl_u64(cpu_V0, cpu_V1, cpu_V0);
-                    } else {
-                        gen_helper_neon_rshl_s64(cpu_V0, cpu_V1, cpu_V0);
-                    }
-                    break;
-                case NEON_3R_VQRSHL:
-                    if (u) {
-                        gen_helper_neon_qrshl_u64(cpu_V0, cpu_env,
-                                                  cpu_V1, cpu_V0);
-                    } else {
-                        gen_helper_neon_qrshl_s64(cpu_V0, cpu_env,
-                                                  cpu_V1, cpu_V0);
-                    }
-                    break;
-                default:
-                    abort();
-                }
-                neon_store_reg64(cpu_V0, rd + pass);
-            }
-            return 0;
+            /* 64-bit element instructions: handled by decodetree */
+            return 1;
         }
         pairwise = 0;
         switch (op) {
diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode
index f22606b2bd5..a4932e550ed 100644
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@@ -68,6 +68,17 @@  VCGE_U_3s        1111 001 1 0 . .. .... .... 0011 . . . 1 .... @3same
 VSHL_S_3s        1111 001 0 0 . .. .... .... 0100 . . . 0 .... @3same
 VSHL_U_3s        1111 001 1 0 . .. .... .... 0100 . . . 0 .... @3same
 
+# Insns operating on 64-bit elements (size!=0b11 handled elsewhere)
+@3same_64        .... ... . . . 11 .... .... .... . q:1 . . .... \
+                 &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=3
+
+VQSHL_S64_3s     1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_64
+VQSHL_U64_3s     1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_64
+VRSHL_S64_3s     1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_64
+VRSHL_U64_3s     1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_64
+VQRSHL_S64_3s    1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_64
+VQRSHL_U64_3s    1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_64
+
 VMAX_S_3s        1111 001 0 0 . .. .... .... 0110 . . . 0 .... @3same
 VMAX_U_3s        1111 001 1 0 . .. .... .... 0110 . . . 0 .... @3same
 VMIN_S_3s        1111 001 0 0 . .. .... .... 0110 . . . 1 .... @3same