@@ -180,3 +180,14 @@ DEF_HELPER_FLAGS_6(sme2_fdot_idx_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_6(sme2_fvdot_idx_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_4(sme2_svdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_suvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_usvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_svdot_idx_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uvdot_idx_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_svdot_idx_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uvdot_idx_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -1386,3 +1386,45 @@ DEF_IMOP_16x2_32(umopa2_s, uint16_t, uint16_t)
DEF_IMOPH(sme2, smopa2, s)
DEF_IMOPH(sme2, umopa2, s)
+
+#define DO_VDOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD, HN) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t svl = simd_oprsz(desc); \
+ intptr_t elements = svl / sizeof(TYPED); \
+ intptr_t eltperseg = 16 / sizeof(TYPED); \
+ intptr_t vstride = (svl / 4) * sizeof(ARMVectorReg); \
+ intptr_t zstride = sizeof(ARMVectorReg) / sizeof(TYPEN); \
+ intptr_t nreg = sizeof(TYPED) / sizeof(TYPEN); \
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2); \
+ TYPEN *n = vn; \
+ TYPEM *m = vm; \
+ for (intptr_t r = 0; r < nreg; r++) { \
+ TYPED *d = vd + r * vstride; \
+ for (intptr_t seg = 0; seg < elements; seg += eltperseg) { \
+ intptr_t s = seg + idx; \
+ for (intptr_t e = seg; e < seg + eltperseg; e++) { \
+ TYPED sum = d[HD(e)]; \
+ for (intptr_t i = 0; i < nreg; i++) { \
+ TYPED nn = n[i * zstride + HN(nreg * e + r)]; \
+ TYPED mm = m[HN(nreg * s + i)]; \
+ sum += nn * mm; \
+ } \
+ d[HD(e)] = sum; \
+ } \
+ } \
+ } \
+}
+
+DO_VDOT_IDX(sme2_svdot_idx_4b, int32_t, int8_t, int8_t, H4, H1)
+DO_VDOT_IDX(sme2_uvdot_idx_4b, uint32_t, uint8_t, uint8_t, H4, H1)
+DO_VDOT_IDX(sme2_suvdot_idx_4b, int32_t, int8_t, uint8_t, H4, H1)
+DO_VDOT_IDX(sme2_usvdot_idx_4b, int32_t, uint8_t, int8_t, H4, H1)
+
+DO_VDOT_IDX(sme2_svdot_idx_4h, int64_t, int16_t, int16_t, H8, H2)
+DO_VDOT_IDX(sme2_uvdot_idx_4h, uint64_t, uint16_t, uint16_t, H8, H2)
+
+DO_VDOT_IDX(sme2_svdot_idx_2h, int32_t, int16_t, int16_t, H4, H2)
+DO_VDOT_IDX(sme2_uvdot_idx_2h, uint32_t, uint16_t, uint16_t, H4, H2)
+
+#undef DO_VDOT_IDX
@@ -980,3 +980,26 @@ TRANS_FEAT(SDOT_nx_4b, aa64_sme, do_dot_nx, a, gen_helper_gvec_sdot_idx_4b)
TRANS_FEAT(UDOT_nx_4b, aa64_sme, do_dot_nx, a, gen_helper_gvec_udot_idx_4b)
TRANS_FEAT(SDOT_nx_4h, aa64_sme2_i16i64, do_dot_nx, a, gen_helper_gvec_sdot_idx_4h)
TRANS_FEAT(UDOT_nx_4h, aa64_sme2_i16i64, do_dot_nx, a, gen_helper_gvec_udot_idx_4h)
+
+static bool do_vdot_nx(DisasContext *s, arg_azx_n *a, gen_helper_gvec_3 *fn)
+{
+ if (sme_sm_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ fn(get_zarray(s, a->rv, a->off, a->n),
+ vec_full_reg_ptr(s, a->zn),
+ vec_full_reg_ptr(s, a->zm),
+ tcg_constant_i32(simd_desc(svl, svl, a->idx)));
+ }
+ return true;
+}
+
+TRANS_FEAT(SVDOT_nx_2h, aa64_sme, do_vdot_nx, a, gen_helper_sme2_svdot_idx_2h)
+TRANS_FEAT(SVDOT_nx_4b, aa64_sme, do_vdot_nx, a, gen_helper_sme2_svdot_idx_4b)
+TRANS_FEAT(SVDOT_nx_4h, aa64_sme, do_vdot_nx, a, gen_helper_sme2_svdot_idx_4h)
+
+TRANS_FEAT(UVDOT_nx_2h, aa64_sme, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_2h)
+TRANS_FEAT(UVDOT_nx_4b, aa64_sme, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_4b)
+TRANS_FEAT(UVDOT_nx_4h, aa64_sme, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_4h)
+
+TRANS_FEAT(SUVDOT_nx_4b, aa64_sme, do_vdot_nx, a, gen_helper_sme2_suvdot_idx_4b)
+TRANS_FEAT(USVDOT_nx_4b, aa64_sme, do_vdot_nx, a, gen_helper_sme2_usvdot_idx_4b)
@@ -438,3 +438,14 @@ USDOT_nx 11000001 0101 .... 1 .. 1 .. ...01 01 ... @azx_4x1_i2_o3
SUDOT_nx 11000001 0101 .... 0 .. 1 .. ....1 11 ... @azx_2x1_i2_o3
SUDOT_nx 11000001 0101 .... 1 .. 1 .. ...01 11 ... @azx_4x1_i2_o3
+
+SVDOT_nx_2h 11000001 0101 .... 0 .. 0 .. ....1 00 ... @azx_2x1_i2_o3
+SVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 00 ... @azx_4x1_i2_o3
+SVDOT_nx_4h 11000001 1101 .... 1 .. 01 . ...00 01 ... @azx_4x1_i1_o3
+
+UVDOT_nx_2h 11000001 0101 .... 0 .. 0 .. ....1 10 ... @azx_2x1_i2_o3
+UVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 10 ... @azx_4x1_i2_o3
+UVDOT_nx_4h 11000001 1101 .... 1 .. 01 . ...00 11 ... @azx_4x1_i1_o3
+
+SUVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 11 ... @azx_4x1_i2_o3
+USVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 01 ... @azx_4x1_i2_o3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/helper-sme.h | 11 +++++++++ target/arm/tcg/sme_helper.c | 42 ++++++++++++++++++++++++++++++++++ target/arm/tcg/translate-sme.c | 23 +++++++++++++++++++ target/arm/tcg/sme.decode | 11 +++++++++ 4 files changed, 87 insertions(+)