
[v4,19/45] target/arm: Implement SME MOVA

Message ID 20220628042117.368549-20-richard.henderson@linaro.org
State New
Series target/arm: Scalable Matrix Extension

Commit Message

Richard Henderson June 28, 2022, 4:20 a.m. UTC
We can reuse the SVE functions for implementing moves to/from
horizontal tile slices, but we need new ones for moves to/from
vertical tile slices.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper-sme.h    |  12 ++++
 target/arm/helper-sve.h    |   2 +
 target/arm/translate-a64.h |   8 +++
 target/arm/translate.h     |   5 ++
 target/arm/sme.decode      |  15 +++++
 target/arm/sme_helper.c    | 112 +++++++++++++++++++++++++++++++-
 target/arm/sve_helper.c    |  12 ++++
 target/arm/translate-sme.c | 130 +++++++++++++++++++++++++++++++++++++
 8 files changed, 295 insertions(+), 1 deletion(-)

Comments

Peter Maydell July 1, 2022, 4:19 p.m. UTC | #1
On Tue, 28 Jun 2022 at 05:40, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> We can reuse the SVE functions for implementing moves to/from
> horizontal tile slices, but we need new ones for moves to/from
> vertical tile slices.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---

> diff --git a/target/arm/sme_helper.c b/target/arm/sme_helper.c
> index eef2df73e1..95159862de 100644
> --- a/target/arm/sme_helper.c
> +++ b/target/arm/sme_helper.c
> @@ -19,8 +19,10 @@
>
>  #include "qemu/osdep.h"
>  #include "cpu.h"
> -#include "internals.h"

Did you mean to delete this #include line?

> +#include "tcg/tcg-gvec-desc.h"
>  #include "exec/helper-proto.h"
> +#include "qemu/int128.h"
> +#include "vec_internal.h"
>
>  /* ResetSVEState */
>  void arm_reset_sve_state(CPUARMState *env)
> @@ -84,3 +86,111 @@ void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
>          }
>      }
>  }
> +
> +/*
> + * Move Zreg vector to ZArray column.
> + */
> +#define DO_MOVA_C(NAME, TYPE, H)                                        \
> +void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
> +{                                                                       \
> +    int i, oprsz = simd_oprsz(desc);                                    \
> +    for (i = 0; i < oprsz; ) {                                          \
> +        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
> +        do {                                                            \
> +            if (pg & 1) {                                               \
> +                *(TYPE *)za = *(TYPE *)(vn + H(i));                     \
> +            }                                                           \
> +            za += sizeof(ARMVectorReg) * sizeof(TYPE);                  \
> +            i += sizeof(TYPE);                                          \
> +            pg >>= sizeof(TYPE);                                        \
> +        } while (i & 15);                                               \
> +    }                                                                   \
> +}
> +
> +DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
> +DO_MOVA_C(sme_mova_cz_h, uint16_t, H2)
> +DO_MOVA_C(sme_mova_cz_s, uint32_t, H4)
> +
> +void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
> +{
> +    int i, oprsz = simd_oprsz(desc) / 8;
> +    uint8_t *pg = vg;
> +    uint64_t *n = vn;
> +    uint64_t *a = za;
> +
> +    for (i = 0; i < oprsz; i++) {
> +        if (pg[H1_2(i)] & 1) {

The documentation of the H macros says
"The H1_<N> macros are used when performing byte arithmetic and then
 casting the final pointer to a type of size N."
but we're not casting anything to a 2-byte type, so what's happening here?
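
For reference, the macros are defined roughly like this in
target/arm/vec_internal.h (quoting from memory, so double-check):

    #if HOST_BIG_ENDIAN
    #define H1(x)   ((x) ^ 7)
    #define H1_2(x) ((x) ^ 6)
    #define H1_4(x) ((x) ^ 4)
    #define H2(x)   ((x) ^ 3)
    #define H4(x)   ((x) ^ 1)
    #else
    #define H1(x)   (x)
    #define H1_2(x) (x)
    #define H1_4(x) (x)
    #define H2(x)   (x)
    #define H4(x)   (x)
    #endif

so on a big-endian host H1_2 adjusts the index as if for a 16-bit
access, which is not what this byte-sized predicate read wants.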

> +            a[i * sizeof(ARMVectorReg)] = n[i];
> +        }
> +    }
> +}
> +
> +void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
> +{
> +    int i, oprsz = simd_oprsz(desc) / 16;
> +    uint16_t *pg = vg;
> +    Int128 *n = vn;
> +    Int128 *a = za;
> +
> +    for (i = 0; i < oprsz; i++) {
> +        if (pg[H2(i)] & 1) {
> +            a[i * sizeof(ARMVectorReg)] = n[i];

Is it really OK to do this with an Int128 store? That is
in host-order, but the two halves of a 128-bit quantity
in the ZA array are in architectural order. I suppose the
source also will have them in the architectural order, but
it does look odd, especially uncommented.

> +        }
> +    }
> +}
> +
> +#undef DO_MOVA_C

> diff --git a/target/arm/translate-sme.c b/target/arm/translate-sme.c
> index 971504559b..8e6881086b 100644
> --- a/target/arm/translate-sme.c
> +++ b/target/arm/translate-sme.c
> @@ -35,6 +35,77 @@
>  #include "decode-sme.c.inc"
>
>
> +static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
> +                                int tile_index, bool vertical)
> +{
> +    int tile = tile_index >> (4 - esz);
> +    int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
> +    int pos, len, offset;
> +    TCGv_i32 t_index;
> +    TCGv_ptr addr;
> +
> +    /* Resolve tile.size[index] to an untyped ZA slice index. */
> +    t_index = tcg_temp_new_i32();
> +    tcg_gen_trunc_tl_i32(t_index, cpu_reg(s, rs));
> +    tcg_gen_addi_i32(t_index, t_index, index);

This code isn't doing what the comment says; it's just calculating
the (not-yet-taken-MOD-dim) slice index, which does depend on the type.

> +
> +    len = ctz32(streaming_vec_reg_size(s)) - esz;

What is this the length of?

> +
> +    if (vertical) {

I ended up reviewing this function from bottom to top because to
me the horizontal case seemed simpler to understand first.
(As a result the review comments might read a bit out-of-order.)
Dunno if that justifies flipping the condition around, though.

> +        /*
> +         * Compute the offset of the index within the tile:

"byte offset" ?

> +         *     (index % (svl / size)) * size
> +         *   = (index % (svl >> esz)) << esz
> +         * Perform the power-of-two modulo via extraction of the low @len bits.
> +         * Perform the multiply by shifting left by @pos bits.
> +         * These two operations are performed simultaneously via deposit.
> +         */
> +        pos = esz;
> +        tcg_gen_deposit_z_i32(t_index, t_index, pos, len);

As an aside, this appears to be "deposit into a zero value", but
I had to go digging in the source to find that out, because we
don't document the semantics of this either in the header file
where this function is declared, or in a doc comment on the
implementation of it, or in tcg/README, or in docs/devel.
We really desperately need to document the interface the TCG
core code provides to frontends, because it's getting further
and further away from "it's functions to emit the IR ops
described in tcg/README, with obvious looking function names"...
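
For what it's worth, my reading of the tcg sources is that
"deposit into a zero value" amounts to this C sketch: mask to the
low @len bits, then shift left by @pos:

    static uint32_t deposit_z_i32(uint32_t arg, unsigned pos, unsigned len)
    {
        uint32_t mask = (len == 32) ? UINT32_MAX : (1u << len) - 1;
        return (arg & mask) << pos;
    }

which is indeed the simultaneous modulo-and-multiply that the
comment above describes.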

> +
> +        /* The tile slice offset within env->zarray is the column offset. */
> +        offset = tile;

I don't understand why we can just add the tile index
(which is going to be an integer 0, 1, 2..) to a byte offset.
In the horizontal case we add tile * sizeof(ARMVectorReg),
which makes more sense to me.

> +
> +        /* Include the offset of zarray to make this relative to env. */
> +        offset += offsetof(CPUARMState, zarray);
> +        tcg_gen_addi_i32(t_index, t_index, offset);
> +
> +        /*
> +         * For big-endian, adjust the column slice offset within
> +         * the uint64_t host words that make up env->zarray.
> +         * This must wait until index and offset are combined.

Why? Neither the byte-offset of the start of the tile nor
the byte offset of zarray in CPUARMState ought to be non-8-aligned.

> +         */
> +        if (HOST_BIG_ENDIAN && esz < MO_64) {
> +            tcg_gen_xori_i32(t_index, t_index, 8 - (1 << esz));
> +        }
> +    } else {
> +        /*
> +         * Compute the offset of the index within the tile:

"byte offset", right ?

Also, this is at the same time doing the "take index MOD dim" that
the pseudocode does as a preceding step, so we should mention that.

> +         *     (index % (svl / size)) * (size * sizeof(row))
> +         *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
> +         */
> +        pos = esz + ctz32(sizeof(ARMVectorReg));
> +        tcg_gen_deposit_z_i32(t_index, t_index, pos, len);
> +
> +        /* The tile slice offset within env->zarray is the row offset. */
> +        offset = tile * sizeof(ARMVectorReg);
> +
> +        /* Include the offset of zarray to make this relative to env. */
> +        offset += offsetof(CPUARMState, zarray);
> +        tcg_gen_addi_i32(t_index, t_index, offset);
> +
> +        /* Row slices need no endian adjustment. */
> +    }
> +
> +    /* Add the offset to env to produce the final pointer. */
> +    addr = tcg_temp_new_ptr();
> +    tcg_gen_ext_i32_ptr(addr, t_index);
> +    tcg_temp_free_i32(t_index);
> +    tcg_gen_add_ptr(addr, addr, cpu_env);
> +
> +    return addr;
> +}

-- PMM
Richard Henderson July 4, 2022, 9:08 a.m. UTC | #2
On 7/1/22 21:49, Peter Maydell wrote:
> On Tue, 28 Jun 2022 at 05:40, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> We can reuse the SVE functions for implementing moves to/from
>> horizontal tile slices, but we need new ones for moves to/from
>> vertical tile slices.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
> 
>> diff --git a/target/arm/sme_helper.c b/target/arm/sme_helper.c
>> index eef2df73e1..95159862de 100644
>> --- a/target/arm/sme_helper.c
>> +++ b/target/arm/sme_helper.c
>> @@ -19,8 +19,10 @@
>>
>>   #include "qemu/osdep.h"
>>   #include "cpu.h"
>> -#include "internals.h"
> 
> Did you mean to delete this #include line?

I meant to go back and remove it from whence it came, but kept forgetting.

>> +void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
>> +{
>> +    int i, oprsz = simd_oprsz(desc) / 8;
>> +    uint8_t *pg = vg;
>> +    uint64_t *n = vn;
>> +    uint64_t *a = za;
>> +
>> +    for (i = 0; i < oprsz; i++) {
>> +        if (pg[H1_2(i)] & 1) {
> 
> The documentation of the H macros says
> "The H1_<N> macros are used when performing byte arithmetic and then
>   casting the final pointer to a type of size N."
> but we're not casting anything to a 2-byte type, so what's happening here?

Yep, error.
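
I believe the fix is plain H1 -- the predicate byte governing
64-bit element i is simply byte i:

    if (pg[H1(i)] & 1) {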

>> +void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
>> +{
>> +    int i, oprsz = simd_oprsz(desc) / 16;
>> +    uint16_t *pg = vg;
>> +    Int128 *n = vn;
>> +    Int128 *a = za;
>> +
>> +    for (i = 0; i < oprsz; i++) {
>> +        if (pg[H2(i)] & 1) {
>> +            a[i * sizeof(ARMVectorReg)] = n[i];
> 
> Is it really OK to do this with an Int128 store? That is
> in host-order, but the two halves of a 128-bit quantity
> in the ZA array are in architectural order. I suppose the
> source also will have them in the architectural order, but
> it does look odd, especially uncommented.

Would memcpy be better for you?

>> +    /* Resolve tile.size[index] to an untyped ZA slice index. */
>> +    t_index = tcg_temp_new_i32();
>> +    tcg_gen_trunc_tl_i32(t_index, cpu_reg(s, rs));
>> +    tcg_gen_addi_i32(t_index, t_index, index);
> 
> This code isn't doing what the comment says; it's just calculating
> the (not-yet-taken-MOD-dim) slice index, which does depend on the type.

I guess the comment applies to a larger section than just these two lines.

> 
>> +
>> +    len = ctz32(streaming_vec_reg_size(s)) - esz;
> 
> What is this the length of?

The length of the extract, aka the mod.
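
Concretely, with a 512-bit SVL:

    /* svl = streaming_vec_reg_size(s) = 64 bytes.  For esz = MO_32
     * the tile has 64 >> 2 = 16 columns, len = ctz32(64) - 2 = 4,
     * and extracting the low 4 bits is exactly "index % 16".  */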

>> +        /* The tile slice offset within env->zarray is the column offset. */
>> +        offset = tile;
> 
> I don't understand why we can just add the tile index
> (which is going to be an integer 0, 1, 2..) to a byte offset.
> In the horizontal case we add tile * sizeof(ARMVectorReg),
> which makes more sense to me.

Hmm.  I think you're right this should be tile * column width, or

   offset = tile << esz;

I wish I could compare vs FVP...
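
By my arithmetic that would put the vertical slices of ZA0.S..ZA3.S
at starting byte columns 0, 4, 8, 12 within each row:

   offset = tile << esz;    /* 0 << 2 .. 3 << 2 for MO_32 */

but, as I say, unverified against FVP.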

>> +        /*
>> +         * For big-endian, adjust the column slice offset within
>> +         * the uint64_t host words that make up env->zarray.
>> +         * This must wait until index and offset are combined.
> 
> Why? Neither the byte-offset of the start of the tile nor
> the byte offset of zarray in CPUARMState ought to be non-8-aligned.

Columns will not be 8-aligned.  On page 38 of 0616A.a, see the illustration of ZA0V.B[22], 
which is 6 mod 8.
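
To spell out the arithmetic for esz = MO_8, where the xor is with
8 - 1 = 7:

    /* Column byte 22 sits in host uint64_t word 2 (bytes 16..23),
     * at architectural byte 6 of that word.  A big-endian host
     * stores that at host byte 7 - 6 = 1, i.e. 16 + 1 = 17, and
     * indeed 22 ^ 7 == 17.  */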

I'll try and improve the commentary.


r~
Peter Maydell July 4, 2022, 9:31 a.m. UTC | #3
On Mon, 4 Jul 2022 at 10:08, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 7/1/22 21:49, Peter Maydell wrote:
> > On Tue, 28 Jun 2022 at 05:40, Richard Henderson
> > <richard.henderson@linaro.org> wrote:
> >>
> >> We can reuse the SVE functions for implementing moves to/from
> >> horizontal tile slices, but we need new ones for moves to/from
> >> vertical tile slices.
> >>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>


> >> +void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
> >> +{
> >> +    int i, oprsz = simd_oprsz(desc) / 16;
> >> +    uint16_t *pg = vg;
> >> +    Int128 *n = vn;
> >> +    Int128 *a = za;
> >> +
> >> +    for (i = 0; i < oprsz; i++) {
> >> +        if (pg[H2(i)] & 1) {
> >> +            a[i * sizeof(ARMVectorReg)] = n[i];
> >
> > Is it really OK to do this with an Int128 store? That is
> > in host-order, but the two halves of a 128-bit quantity
> > in the ZA array are in architectural order. I suppose the
> > source also will have them in the architectural order, but
> > it does look odd, especially uncommented.
>
> Would memcpy be better for you?

I guess that means we end up doing it all as byte-pointer
arithmetic, which might look worse. I think it will be fine
with a comment noting that the two halves of the Int128 might
be swapped, but that this is OK because we are only copying.
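
Something along these lines, say:

    /*
     * The two halves of the Int128 may be swapped with respect to
     * the architectural order on a big-endian host, but source and
     * destination use the same in-memory representation, so a
     * host-order copy is still correct.
     */
    a[i * sizeof(ARMVectorReg)] = n[i];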

> >> +    /* Resolve tile.size[index] to an untyped ZA slice index. */
> >> +    t_index = tcg_temp_new_i32();
> >> +    tcg_gen_trunc_tl_i32(t_index, cpu_reg(s, rs));
> >> +    tcg_gen_addi_i32(t_index, t_index, index);
> >
> > This code isn't doing what the comment says; it's just calculating
> > the (not-yet-taken-MOD-dim) slice index, which does depend on the type.
>
> I guess the comment applies to a larger section than just these two lines.
>
> >
> >> +
> >> +    len = ctz32(streaming_vec_reg_size(s)) - esz;
> >
> > What is this the length of?
>
> The length of the extract, aka the mod.
>
> >> +        /* The tile slice offset within env->zarray is the column offset. */
> >> +        offset = tile;
> >
> > I don't understand why we can just add the tile index
> > (which is going to be an integer 0, 1, 2..) to a byte offset.
> > In the horizontal case we add tile * sizeof(ARMVectorReg),
> > which makes more sense to me.
>
> Hmm.  I think you're right this should be tile * column width, or
>
>    offset = tile << esz;
>
> I wish I could compare vs FVP...

> >> +        /*
> >> +         * For big-endian, adjust the column slice offset within
> >> +         * the uint64_t host words that make up env->zarray.
> >> +         * This must wait until index and offset are combined.
> >
> > Why? Neither the byte-offset of the start of the tile nor
> > the byte offset of zarray in CPUARMState ought to be non-8-aligned.
>
> Columns will not be 8-aligned.  On page 38 of 0616A.a, see the illustration of ZA0V.B[22],
> which is 6 mod 8.

Yes, but the column slice number isn't part of offset, it's
in index, so (contra the comment) you could do the xor before
the "add offset to index" if you wanted (ie it doesn't matter
which order we do these things).

thanks
-- PMM
Richard Henderson July 4, 2022, 9:43 a.m. UTC | #4
On 7/4/22 15:01, Peter Maydell wrote:
>> Columns will not be 8-aligned.  On page 38 of 0616A.a, see the illustration of ZA0V.B[22],
>> which is 6 mod 8.
> 
> Yes, but the column slice number isn't part of offset, it's
> in index, so (contra the comment) you could do the xor before
> the "add offset to index" if you wanted (ie it doesn't matter
> which order we do these things).

Oh, whoops, yes.


r~

Patch

diff --git a/target/arm/helper-sme.h b/target/arm/helper-sme.h
index c4ee1f09e4..154bc73d2e 100644
--- a/target/arm/helper-sme.h
+++ b/target/arm/helper-sme.h
@@ -21,3 +21,15 @@  DEF_HELPER_FLAGS_2(set_pstate_sm, TCG_CALL_NO_RWG, void, env, i32)
 DEF_HELPER_FLAGS_2(set_pstate_za, TCG_CALL_NO_RWG, void, env, i32)
 
 DEF_HELPER_FLAGS_3(sme_zero, TCG_CALL_NO_RWG, void, env, i32, i32)
+
+/* Move to/from vertical array slices, i.e. columns, so 'c'.  */
+DEF_HELPER_FLAGS_4(sme_mova_cz_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_zc_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_cz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_zc_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_cz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_zc_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_cz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_zc_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_cz_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme_mova_zc_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index dc629f851a..ab0333400f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -325,6 +325,8 @@  DEF_HELPER_FLAGS_5(sve_sel_zpzz_s, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_sel_zpzz_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sel_zpzz_q, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_5(sve2_addp_zpzz_b, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h
index 099d3d11d6..2a7fe6e9e7 100644
--- a/target/arm/translate-a64.h
+++ b/target/arm/translate-a64.h
@@ -178,6 +178,14 @@  static inline int pred_gvec_reg_size(DisasContext *s)
     return size_for_gvec(pred_full_reg_size(s));
 }
 
+/* Return a newly allocated pointer to the predicate register.  */
+static inline TCGv_ptr pred_full_reg_ptr(DisasContext *s, int regno)
+{
+    TCGv_ptr ret = tcg_temp_new_ptr();
+    tcg_gen_addi_ptr(ret, cpu_env, pred_full_reg_offset(s, regno));
+    return ret;
+}
+
 bool disas_sve(DisasContext *, uint32_t);
 bool disas_sme(DisasContext *, uint32_t);
 
diff --git a/target/arm/translate.h b/target/arm/translate.h
index e2e619dab2..af5d4a7086 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -156,6 +156,11 @@  static inline int plus_2(DisasContext *s, int x)
     return x + 2;
 }
 
+static inline int plus_12(DisasContext *s, int x)
+{
+    return x + 12;
+}
+
 static inline int times_2(DisasContext *s, int x)
 {
     return x * 2;
diff --git a/target/arm/sme.decode b/target/arm/sme.decode
index 6e4483fdce..241b4895b7 100644
--- a/target/arm/sme.decode
+++ b/target/arm/sme.decode
@@ -22,3 +22,18 @@ 
 ### SME Misc
 
 ZERO            11000000 00 001 00000000000 imm:8
+
+### SME Move into/from Array
+
+%mova_rs        13:2 !function=plus_12
+&mova           esz rs pg zr za_imm v:bool to_vec:bool
+
+MOVA            11000000 esz:2 00000 0 v:1 .. pg:3 zr:5 0 za_imm:4  \
+                &mova to_vec=0 rs=%mova_rs
+MOVA            11000000 11    00000 1 v:1 .. pg:3 zr:5 0 za_imm:4  \
+                &mova to_vec=0 rs=%mova_rs esz=4
+
+MOVA            11000000 esz:2 00001 0 v:1 .. pg:3 0 za_imm:4 zr:5  \
+                &mova to_vec=1 rs=%mova_rs
+MOVA            11000000 11    00001 1 v:1 .. pg:3 0 za_imm:4 zr:5  \
+                &mova to_vec=1 rs=%mova_rs esz=4
diff --git a/target/arm/sme_helper.c b/target/arm/sme_helper.c
index eef2df73e1..95159862de 100644
--- a/target/arm/sme_helper.c
+++ b/target/arm/sme_helper.c
@@ -19,8 +19,10 @@ 
 
 #include "qemu/osdep.h"
 #include "cpu.h"
-#include "internals.h"
+#include "tcg/tcg-gvec-desc.h"
 #include "exec/helper-proto.h"
+#include "qemu/int128.h"
+#include "vec_internal.h"
 
 /* ResetSVEState */
 void arm_reset_sve_state(CPUARMState *env)
@@ -84,3 +86,111 @@  void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
         }
     }
 }
+
+/*
+ * Move Zreg vector to ZArray column.
+ */
+#define DO_MOVA_C(NAME, TYPE, H)                                        \
+void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
+{                                                                       \
+    int i, oprsz = simd_oprsz(desc);                                    \
+    for (i = 0; i < oprsz; ) {                                          \
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
+        do {                                                            \
+            if (pg & 1) {                                               \
+                *(TYPE *)za = *(TYPE *)(vn + H(i));                     \
+            }                                                           \
+            za += sizeof(ARMVectorReg) * sizeof(TYPE);                  \
+            i += sizeof(TYPE);                                          \
+            pg >>= sizeof(TYPE);                                        \
+        } while (i & 15);                                               \
+    }                                                                   \
+}
+
+DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
+DO_MOVA_C(sme_mova_cz_h, uint16_t, H2)
+DO_MOVA_C(sme_mova_cz_s, uint32_t, H4)
+
+void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
+{
+    int i, oprsz = simd_oprsz(desc) / 8;
+    uint8_t *pg = vg;
+    uint64_t *n = vn;
+    uint64_t *a = za;
+
+    for (i = 0; i < oprsz; i++) {
+        if (pg[H1_2(i)] & 1) {
+            a[i * sizeof(ARMVectorReg)] = n[i];
+        }
+    }
+}
+
+void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
+{
+    int i, oprsz = simd_oprsz(desc) / 16;
+    uint16_t *pg = vg;
+    Int128 *n = vn;
+    Int128 *a = za;
+
+    for (i = 0; i < oprsz; i++) {
+        if (pg[H2(i)] & 1) {
+            a[i * sizeof(ARMVectorReg)] = n[i];
+        }
+    }
+}
+
+#undef DO_MOVA_C
+
+/*
+ * Move ZArray column to Zreg vector.
+ */
+#define DO_MOVA_Z(NAME, TYPE, H)                                        \
+void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
+{                                                                       \
+    int i, oprsz = simd_oprsz(desc);                                    \
+    for (i = 0; i < oprsz; ) {                                          \
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
+        do {                                                            \
+            if (pg & 1) {                                               \
+                *(TYPE *)(vd + H(i)) = *(TYPE *)za;                     \
+            }                                                           \
+            za += sizeof(ARMVectorReg) * sizeof(TYPE);                  \
+            i += sizeof(TYPE);                                          \
+            pg >>= sizeof(TYPE);                                        \
+        } while (i & 15);                                               \
+    }                                                                   \
+}
+
+DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
+DO_MOVA_Z(sme_mova_zc_h, uint16_t, H2)
+DO_MOVA_Z(sme_mova_zc_s, uint32_t, H4)
+
+void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
+{
+    int i, oprsz = simd_oprsz(desc) / 8;
+    uint8_t *pg = vg;
+    uint64_t *d = vd;
+    uint64_t *a = za;
+
+    for (i = 0; i < oprsz; i++) {
+        if (pg[H1_2(i)] & 1) {
+            d[i] = a[i * sizeof(ARMVectorReg)];
+        }
+    }
+}
+
+void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
+{
+    int i, oprsz = simd_oprsz(desc) / 16;
+    uint16_t *pg = vg;
+    Int128 *d = vd;
+    Int128 *a = za;
+
+    for (i = 0; i < oprsz; i++) {
+        if (pg[H2(i)] & 1) {
+            d[i] = a[i * sizeof(ARMVectorReg)];
+        }
+    }
+}
+
+#undef DO_MOVA_Z
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 1654c0bbf9..9a26f253e0 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3565,6 +3565,18 @@  void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
     }
 }
 
+void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
+                            void *vg, uint32_t desc)
+{
+    intptr_t i, opr_sz = simd_oprsz(desc) / 16;
+    Int128 *d = vd, *n = vn, *m = vm;
+    uint16_t *pg = vg;
+
+    for (i = 0; i < opr_sz; i += 1) {
+        d[i] = (pg[H2(i)] & 1 ? n : m)[i];
+    }
+}
+
 /* Two operand comparison controlled by a predicate.
  * ??? It is very tempting to want to be able to expand this inline
  * with x86 instructions, e.g.
diff --git a/target/arm/translate-sme.c b/target/arm/translate-sme.c
index 971504559b..8e6881086b 100644
--- a/target/arm/translate-sme.c
+++ b/target/arm/translate-sme.c
@@ -35,6 +35,77 @@ 
 #include "decode-sme.c.inc"
 
 
+static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
+                                int tile_index, bool vertical)
+{
+    int tile = tile_index >> (4 - esz);
+    int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
+    int pos, len, offset;
+    TCGv_i32 t_index;
+    TCGv_ptr addr;
+
+    /* Resolve tile.size[index] to an untyped ZA slice index. */
+    t_index = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(t_index, cpu_reg(s, rs));
+    tcg_gen_addi_i32(t_index, t_index, index);
+
+    len = ctz32(streaming_vec_reg_size(s)) - esz;
+
+    if (vertical) {
+        /*
+         * Compute the offset of the index within the tile:
+         *     (index % (svl / size)) * size
+         *   = (index % (svl >> esz)) << esz
+         * Perform the power-of-two modulo via extraction of the low @len bits.
+         * Perform the multiply by shifting left by @pos bits.
+         * These two operations are performed simultaneously via deposit.
+         */
+        pos = esz;
+        tcg_gen_deposit_z_i32(t_index, t_index, pos, len);
+
+        /* The tile slice offset within env->zarray is the column offset. */
+        offset = tile;
+
+        /* Include the offset of zarray to make this relative to env. */
+        offset += offsetof(CPUARMState, zarray);
+        tcg_gen_addi_i32(t_index, t_index, offset);
+
+        /*
+         * For big-endian, adjust the column slice offset within
+         * the uint64_t host words that make up env->zarray.
+         * This must wait until index and offset are combined.
+         */
+        if (HOST_BIG_ENDIAN && esz < MO_64) {
+            tcg_gen_xori_i32(t_index, t_index, 8 - (1 << esz));
+        }
+    } else {
+        /*
+         * Compute the offset of the index within the tile:
+         *     (index % (svl / size)) * (size * sizeof(row))
+         *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
+         */
+        pos = esz + ctz32(sizeof(ARMVectorReg));
+        tcg_gen_deposit_z_i32(t_index, t_index, pos, len);
+
+        /* The tile slice offset within env->zarray is the row offset. */
+        offset = tile * sizeof(ARMVectorReg);
+
+        /* Include the offset of zarray to make this relative to env. */
+        offset += offsetof(CPUARMState, zarray);
+        tcg_gen_addi_i32(t_index, t_index, offset);
+
+        /* Row slices need no endian adjustment. */
+    }
+
+    /* Add the offset to env to produce the final pointer. */
+    addr = tcg_temp_new_ptr();
+    tcg_gen_ext_i32_ptr(addr, t_index);
+    tcg_temp_free_i32(t_index);
+    tcg_gen_add_ptr(addr, addr, cpu_env);
+
+    return addr;
+}
+
 static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
 {
     if (!dc_isar_feature(aa64_sme, s)) {
@@ -46,3 +117,62 @@  static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
     }
     return true;
 }
+
+static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
+{
+    static gen_helper_gvec_4 * const h_fns[5] = {
+        gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
+        gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
+        gen_helper_sve_sel_zpzz_q
+    };
+    static gen_helper_gvec_3 * const cz_fns[5] = {
+        gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
+        gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
+        gen_helper_sme_mova_cz_q,
+    };
+    static gen_helper_gvec_3 * const zc_fns[5] = {
+        gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
+        gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
+        gen_helper_sme_mova_zc_q,
+    };
+
+    TCGv_ptr t_za, t_zr, t_pg;
+    TCGv_i32 t_desc;
+    int svl;
+
+    if (!dc_isar_feature(aa64_sme, s)) {
+        return false;
+    }
+    if (!sme_smza_enabled_check(s)) {
+        return true;
+    }
+
+    t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
+    t_zr = vec_full_reg_ptr(s, a->zr);
+    t_pg = pred_full_reg_ptr(s, a->pg);
+
+    svl = streaming_vec_reg_size(s);
+    t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));
+
+    if (a->v) {
+        /* Vertical slice -- use sme mova helpers. */
+        if (a->to_vec) {
+            zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
+        } else {
+            cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
+        }
+    } else {
+        /* Horizontal slice -- reuse sve sel helpers. */
+        if (a->to_vec) {
+            h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
+        } else {
+            h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
+        }
+    }
+
+    tcg_temp_free_ptr(t_za);
+    tcg_temp_free_ptr(t_zr);
+    tcg_temp_free_ptr(t_pg);
+
+    return true;
+}