[20/55] target/arm: Implement MVE VDUP

Message ID 20210607165821.9892-21-peter.maydell@linaro.org
State Superseded
Series target/arm: First slice of MVE implementation

Commit Message

Peter Maydell June 7, 2021, 4:57 p.m. UTC
Implement the MVE VDUP insn, which duplicates a value from
a general-purpose register into every lane of a vector
register (subject to predication).
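
(An illustrative example, not part of the original commit message: with
all VPR predicate bits set, "VDUP.16 q0, r2" with r2 = 0x00001234 writes
0x1234 into each of the eight 16-bit lanes of q0; lanes whose predicate
bits are clear keep their previous contents.)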

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

---
 target/arm/helper-mve.h    |  4 ++++
 target/arm/mve.decode      | 10 +++++++++
 target/arm/mve_helper.c    | 18 ++++++++++++++++
 target/arm/translate-mve.c | 43 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+)

-- 
2.20.1

Comments

Richard Henderson June 8, 2021, 11:17 p.m. UTC | #1
On 6/7/21 9:57 AM, Peter Maydell wrote:
> +#define DO_VDUP(OP, ESIZE, TYPE, H)                                     \
> +    void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t val)     \
> +    {                                                                   \
> +        TYPE *d = vd;                                                   \
> +        uint16_t mask = mve_element_mask(env);                          \
> +        unsigned e;                                                     \
> +        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
> +            uint64_t bytemask = mask_to_bytemask##ESIZE(mask);          \
> +            d[H(e)] &= ~bytemask;                                       \
> +            d[H(e)] |= (val & bytemask);                                \
> +        }                                                               \
> +        mve_advance_vpt(env);                                           \
> +    }
> +
> +DO_VDUP(vdupb, 1, uint8_t, H1)
> +DO_VDUP(vduph, 2, uint16_t, H2)
> +DO_VDUP(vdupw, 4, uint32_t, H4)

Hmm.  I think the masking should be done at either uint32_t or uint64_t.  Doing 
it byte-by-byte is wasteful.

You could either do the replication in tcg (I can export gen_dup_i32 from
tcg-op-gvec.c) and have one helper, or do the replication here, e.g.

static void do_vdup(CPUARMState *env, void *vd, uint64_t val);

void HELPER(mve_vdupb)(CPUARMState *env, void *vd, uint32_t val)
{
    do_vdup(env, vd, dup_const(MO_8, val));
}
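
(Filling in Richard's sketch, under the assumption that the
mask_to_bytemask8() helper from mve_helper.c is reused; illustrative
only, not the final code:)

static void do_vdup(CPUARMState *env, void *vd, uint64_t val)
{
    /* Apply the duplicated value to all predicated bytes, 64 bits at a time. */
    uint64_t *d = vd;
    uint16_t mask = mve_element_mask(env);
    unsigned e;

    for (e = 0; e < 16 / 8; e++, mask >>= 8) {
        uint64_t bytemask = mask_to_bytemask8(mask);
        d[e] = (d[e] & ~bytemask) | (val & bytemask);
    }
    mve_advance_vpt(env);
}

void HELPER(mve_vdupb)(CPUARMState *env, void *vd, uint32_t val)
{
    /* dup_const(MO_8, x) replicates the low byte of x across 64 bits. */
    do_vdup(env, vd, dup_const(MO_8, val));
}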


r~
Peter Maydell June 9, 2021, 10:06 a.m. UTC | #2
On Wed, 9 Jun 2021 at 00:17, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 6/7/21 9:57 AM, Peter Maydell wrote:
> > +#define DO_VDUP(OP, ESIZE, TYPE, H)                                     \
> > +    void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t val)     \
> > +    {                                                                   \
> > +        TYPE *d = vd;                                                   \
> > +        uint16_t mask = mve_element_mask(env);                          \
> > +        unsigned e;                                                     \
> > +        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
> > +            uint64_t bytemask = mask_to_bytemask##ESIZE(mask);          \
> > +            d[H(e)] &= ~bytemask;                                       \
> > +            d[H(e)] |= (val & bytemask);                                \
> > +        }                                                               \
> > +        mve_advance_vpt(env);                                           \
> > +    }
> > +
> > +DO_VDUP(vdupb, 1, uint8_t, H1)
> > +DO_VDUP(vduph, 2, uint16_t, H2)
> > +DO_VDUP(vdupw, 4, uint32_t, H4)
>
> Hmm.  I think the masking should be done at either uint32_t or uint64_t.
> Doing it byte-by-byte is wasteful.


Mmm. I think some of this structure is a holdover from an initial
misinterpretation of the spec: that all these ops looked at the predicate
bit for the LS byte of the element to see whether the entire element was
acted upon, in which case you do need to work element-by-element at the
right size. (This is actually true for some operations, but mostly the
predicate bits do bytewise masking and can give you a partial chunk of a
result element, as here.)
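
(Editorial illustration of that bytewise behaviour, not from the original
mail: for a 32-bit VDUP element whose four predicate mask bits are 0b0011,
only the two least significant bytes of that lane receive the duplicated
value; the two most significant bytes keep their old contents.)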

-- PMM
Richard Henderson June 9, 2021, 5:16 p.m. UTC | #3
On 6/9/21 3:06 AM, Peter Maydell wrote:
> Mmm. I think some of this structure is a holdover from an initial
> misinterpretation of the spec: that all these ops looked at the predicate
> bit for the LS byte of the element to see whether the entire element was
> acted upon, in which case you do need to work element-by-element at the
> right size. (This is actually true for some operations, but mostly the
> predicate bits do bytewise masking and can give you a partial chunk of a
> result element, as here.)


Even if the operation did look at specific predicate bits, that would
simply put it in line with SVE, which is quite happy with expand_pred_[bhsd].


r~
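
(Editorial aside, for readers unfamiliar with the SVE helpers: QEMU's
expand_pred_b() in target/arm/sve_helper.c fans each predicate bit out to
a full 0xff byte of a 64-bit mask, via a lookup table. A loop-based
equivalent, purely illustrative:)

static uint64_t expand_pred_b_sketch(uint8_t byte)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        if (byte & (1u << i)) {
            r |= 0xffull << (i * 8);   /* predicate bit i -> mask byte i */
        }
    }
    return r;
}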
Patch

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index 733a54d2e3c..ece9c481367 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -33,6 +33,10 @@  DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32)
 
+DEF_HELPER_FLAGS_3(mve_vdupb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vduph, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vdupw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
 DEF_HELPER_FLAGS_3(mve_vclsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
 DEF_HELPER_FLAGS_3(mve_vclsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
 DEF_HELPER_FLAGS_3(mve_vclsw, TCG_CALL_NO_WG, void, env, ptr, ptr)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index 82cc0abcb82..09849917f5a 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -21,6 +21,7 @@ 
 
 %qd 22:1 13:3
 %qm 5:1 1:3
+%qn 7:1 17:3
 
 &vldr_vstr rn qd imm p a w size l u
 &1op qd qm size
@@ -82,3 +83,12 @@  VABS             1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op
 VABS_fp          1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op
 VNEG             1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op
 VNEG_fp          1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op
+
+&vdup qd rt size
+# Qd is in the fields usually named Qn
+@vdup            .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup
+
+# B and E bits encode size, which we decode here to the usual size values
+VDUP             1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0
+VDUP             1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1
+VDUP             1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index b14826c05a7..a5ed4e01e33 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -229,6 +229,24 @@  static uint64_t mask_to_bytemask8(uint16_t mask)
         ((uint64_t)mask_to_bytemask4(mask >> 4) << 32);
 }
 
+#define DO_VDUP(OP, ESIZE, TYPE, H)                                     \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t val)     \
+    {                                                                   \
+        TYPE *d = vd;                                                   \
+        uint16_t mask = mve_element_mask(env);                          \
+        unsigned e;                                                     \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
+            uint64_t bytemask = mask_to_bytemask##ESIZE(mask);          \
+            d[H(e)] &= ~bytemask;                                       \
+            d[H(e)] |= (val & bytemask);                                \
+        }                                                               \
+        mve_advance_vpt(env);                                           \
+    }
+
+DO_VDUP(vdupb, 1, uint8_t, H1)
+DO_VDUP(vduph, 2, uint16_t, H2)
+DO_VDUP(vdupw, 4, uint32_t, H4)
+
 #define DO_1OP(OP, ESIZE, TYPE, H, FN)                                  \
     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
     {                                                                   \
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index 086cac9f0cd..b4fc4054fe1 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -169,6 +169,49 @@  DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h)
 DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w)
 DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w)
 
+static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
+{
+    TCGv_ptr qd;
+    TCGv_i32 rt;
+
+    if (!dc_isar_feature(aa32_mve, s)) {
+        return false;
+    }
+    if (a->qd > 7) {
+        return false;
+    }
+    if (a->rt == 13 || a->rt == 15) {
+        /* UNPREDICTABLE; we choose to UNDEF */
+        return false;
+    }
+    if (!mve_eci_check(s)) {
+        return true;
+    }
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    qd = mve_qreg_ptr(a->qd);
+    rt = load_reg(s, a->rt);
+    switch (a->size) {
+    case 0:
+        gen_helper_mve_vdupb(cpu_env, qd, rt);
+        break;
+    case 1:
+        gen_helper_mve_vduph(cpu_env, qd, rt);
+        break;
+    case 2:
+        gen_helper_mve_vdupw(cpu_env, qd, rt);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    tcg_temp_free_ptr(qd);
+    tcg_temp_free_i32(rt);
+    mve_update_eci(s);
+    return true;
+}
+
 static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn)
 {
     TCGv_ptr qd, qm;
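
(Editorial note on the trans_VDUP pattern above: in QEMU's decodetree
convention, returning false means the insn is treated as UNDEF, while
returning true after mve_eci_check() or vfp_access_check() fails means
the insn has been fully handled, because those checks emit any required
exception code themselves.)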