@@ -751,3 +751,96 @@ TRANS_FEAT(ADD_azz_nn_s, aa64_sme2, do_azz_nn, a, MO_32, tcg_gen_gvec_add_var)
TRANS_FEAT(SUB_azz_nn_s, aa64_sme2, do_azz_nn, a, MO_32, tcg_gen_gvec_sub_var)
TRANS_FEAT(ADD_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_add_var)
TRANS_FEAT(SUB_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_sub_var)
+
+/*
+ * Expand array multi-vector single (n1), array multi-vector (nn),
+ * and array multi-vector indexed (nx), for floating-point accumulate.
+ * multi: true for nn, false for n1.
+ * fpst: >= 0 selects an fpstatus_ptr() flavour for the ptr argument;
+ *       < 0 (FPST_ENV) passes tcg_env instead.
+ * data: value passed through simd_data, including any index.
+ */
+#define FPST_ENV -1
+
+static bool do_azz_acc_fp(DisasContext *s, int nreg, int nsel,
+                          int rv, int off, int zn, int zm,
+                          int data, int shsel, bool multi, int fpst,
+                          gen_helper_gvec_4_ptr *fn)
+{
+    if (sme_sm_enabled_check(s)) {
+        int svl = streaming_vec_reg_size(s);
+        int vstride = svl / nreg;
+        TCGv_ptr t_za = get_zarray(s, rv, off, nreg);
+        TCGv_ptr t, ptr;
+
+        if (fpst >= 0) {
+            ptr = fpstatus_ptr(fpst);
+        } else {
+            ptr = tcg_env;
+        }
+        t = tcg_temp_new_ptr();
+
+        for (int r = 0; r < nreg; ++r) {
+            TCGv_ptr t_zn = vec_full_reg_ptr(s, zn);
+            TCGv_ptr t_zm = vec_full_reg_ptr(s, zm);
+
+            for (int i = 0; i < nsel; ++i) {
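+                /*
+                 * Each ZA row occupies one ARMVectorReg slot, so o_za
+                 * below is a row offset: the nreg register groups are
+                 * vstride rows apart and the nsel passes within a group
+                 * target adjacent rows.
+                 */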
+                int o_za = (r * vstride + i) * sizeof(ARMVectorReg);
+                int desc = simd_desc(svl, svl, data | (i << shsel));
+
+                tcg_gen_addi_ptr(t, t_za, o_za);
+                fn(t, t_zn, t_zm, t, ptr, tcg_constant_i32(desc));
+            }
+
+            /*
+             * For multiple-and-single vectors, Zn may wrap.
+             * For multiple vectors, both Zn and Zm are aligned.
+             */
+            zn = (zn + 1) % 32;
+            zm += multi;
+        }
+    }
+    return true;
+}
+
+static bool do_fmlal(DisasContext *s, arg_azz_n *a, bool sub, bool multi)
+{
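+    /*
+     * Two passes per register group, one for each ZA row written:
+     * bit 0 of data carries the subtract flag, and shsel = 1 shifts
+     * the pass index into bit 1 of simd_data.
+     */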
+    return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+                         (1 << 2) | sub, 1,
+                         multi, FPST_ENV, gen_helper_sve2_fmlal_zzzw_s);
+}
+
+TRANS_FEAT(FMLAL_n1, aa64_sme2, do_fmlal, a, false, false)
+TRANS_FEAT(FMLSL_n1, aa64_sme2, do_fmlal, a, true, false)
+TRANS_FEAT(FMLAL_nn, aa64_sme2, do_fmlal, a, false, true)
+TRANS_FEAT(FMLSL_nn, aa64_sme2, do_fmlal, a, true, true)
+
+static bool do_fmlal_nx(DisasContext *s, arg_azx_n *a, bool sub)
+{
+    return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+                         (a->idx << 3) | (1 << 2) | sub, 1,
+                         false, FPST_ENV, gen_helper_sve2_fmlal_zzxw_s);
+}
+
+TRANS_FEAT(FMLAL_nx, aa64_sme2, do_fmlal_nx, a, false)
+TRANS_FEAT(FMLSL_nx, aa64_sme2, do_fmlal_nx, a, true)
+
+static bool do_bfmlal(DisasContext *s, arg_azz_n *a, bool sub, bool multi)
+{
+    return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm, sub, 1,
+                         multi, FPST_ZA, gen_helper_gvec_bfmlal);
+}
+
+TRANS_FEAT(BFMLAL_n1, aa64_sme2, do_bfmlal, a, false, false)
+TRANS_FEAT(BFMLSL_n1, aa64_sme2, do_bfmlal, a, true, false)
+TRANS_FEAT(BFMLAL_nn, aa64_sme2, do_bfmlal, a, false, true)
+TRANS_FEAT(BFMLSL_nn, aa64_sme2, do_bfmlal, a, true, true)
+
+static bool do_bfmlal_nx(DisasContext *s, arg_azx_n *a, bool sub)
+{
+    return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+                         (a->idx << 2) | sub, 1,
+                         false, FPST_ZA, gen_helper_gvec_bfmlal_idx);
+}
+
+TRANS_FEAT(BFMLAL_nx, aa64_sme2, do_bfmlal_nx, a, false)
+TRANS_FEAT(BFMLSL_nx, aa64_sme2, do_bfmlal_nx, a, true)
@@ -261,6 +261,30 @@ SUB_azz_n1_s 11000001 0011 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=4
SUB_azz_n1_d 11000001 0110 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=2
SUB_azz_n1_d 11000001 0111 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=4
+%off3_x2 0:3 !function=times_2
+%off2_x2 0:2 !function=times_2
+
+@azz_nx1_o3x2 ........ ... . zm:4 . .. ... zn:5 .. ... \
+                &azz_n off=%off3_x2 rv=%mova_rv
+@azz_nx1_o2x2 ........ ... . zm:4 . .. ... zn:5 ... .. \
+                &azz_n off=%off2_x2 rv=%mova_rv
+
+FMLAL_n1 11000001 001 0 .... 0 .. 011 ..... 00 ... @azz_nx1_o3x2 n=1
+FMLAL_n1 11000001 001 0 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=2
+FMLAL_n1 11000001 001 1 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=4
+
+FMLSL_n1 11000001 001 0 .... 0 .. 011 ..... 01 ... @azz_nx1_o3x2 n=1
+FMLSL_n1 11000001 001 0 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=2
+FMLSL_n1 11000001 001 1 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=4
+
+BFMLAL_n1 11000001 001 0 .... 0 .. 011 ..... 10 ... @azz_nx1_o3x2 n=1
+BFMLAL_n1 11000001 001 0 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=2
+BFMLAL_n1 11000001 001 1 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=4
+
+BFMLSL_n1 11000001 001 0 .... 0 .. 011 ..... 11 ... @azz_nx1_o3x2 n=1
+BFMLSL_n1 11000001 001 0 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=2
+BFMLSL_n1 11000001 001 1 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=4
+
### SME2 Multi-vector Multiple Array Vectors
%zn_ax2 6:4 !function=times_2
@@ -280,3 +304,50 @@ SUB_azz_nn_s 11000001 101 ....0 0 .. 110 ....0 11 ... @azz_2x2_o3
SUB_azz_nn_s 11000001 101 ...01 0 .. 110 ...00 11 ... @azz_4x4_o3
SUB_azz_nn_d 11000001 111 ....0 0 .. 110 ....0 11 ... @azz_2x2_o3
SUB_azz_nn_d 11000001 111 ...01 0 .. 110 ...00 11 ... @azz_4x4_o3
+
+@azz_2x2_o2x2 ........ ... ..... . .. ... ..... ... .. \
+                &azz_n n=2 rv=%mova_rv zn=%zn_ax2 zm=%zm_ax2 off=%off2_x2
+@azz_4x4_o2x2 ........ ... ..... . .. ... ..... ... .. \
+                &azz_n n=4 rv=%mova_rv zn=%zn_ax4 zm=%zm_ax4 off=%off2_x2
+
+FMLAL_nn 11000001 101 ....0 0 .. 010 ....0 000 .. @azz_2x2_o2x2
+FMLAL_nn 11000001 101 ...01 0 .. 010 ...00 000 .. @azz_4x4_o2x2
+
+FMLSL_nn 11000001 101 ....0 0 .. 010 ....0 010 .. @azz_2x2_o2x2
+FMLSL_nn 11000001 101 ...01 0 .. 010 ...00 010 .. @azz_4x4_o2x2
+
+BFMLAL_nn 11000001 101 ....0 0 .. 010 ....0 100 .. @azz_2x2_o2x2
+BFMLAL_nn 11000001 101 ...01 0 .. 010 ...00 100 .. @azz_4x4_o2x2
+
+BFMLSL_nn 11000001 101 ....0 0 .. 010 ....0 110 .. @azz_2x2_o2x2
+BFMLSL_nn 11000001 101 ...01 0 .. 010 ...00 110 .. @azz_4x4_o2x2
+
+### SME2 Multi-vector Indexed
+
+&azx_n n off rv zn zm idx
+
+%idx3_15_10 15:1 10:2
+%idx2_10_2 10:2 2:1
+
+@azx_1x1_o3x2 ........ .... zm:4 . .. . .. zn:5 .. ... \
+                &azx_n n=1 rv=%mova_rv off=%off3_x2 idx=%idx3_15_10
+@azx_2x1_o2x2 ........ .... zm:4 . .. . .. ..... .. ... \
+                &azx_n n=2 rv=%mova_rv off=%off2_x2 zn=%zn_ax2 idx=%idx2_10_2
+@azx_4x1_o2x2 ........ .... zm:4 . .. . .. ..... .. ... \
+                &azx_n n=4 rv=%mova_rv off=%off2_x2 zn=%zn_ax4 idx=%idx2_10_2
+
+FMLAL_nx 11000001 1000 .... . .. 1 .. ..... 00 ... @azx_1x1_o3x2
+FMLAL_nx 11000001 1001 .... 0 .. 1 .. ....0 00 ... @azx_2x1_o2x2
+FMLAL_nx 11000001 1001 .... 1 .. 1 .. ...00 00 ... @azx_4x1_o2x2
+
+FMLSL_nx 11000001 1000 .... . .. 1 .. ..... 01 ... @azx_1x1_o3x2
+FMLSL_nx 11000001 1001 .... 0 .. 1 .. ....0 01 ... @azx_2x1_o2x2
+FMLSL_nx 11000001 1001 .... 1 .. 1 .. ...00 01 ... @azx_4x1_o2x2
+
+BFMLAL_nx 11000001 1000 .... . .. 1 .. ..... 10 ... @azx_1x1_o3x2
+BFMLAL_nx 11000001 1001 .... 0 .. 1 .. ....0 10 ... @azx_2x1_o2x2
+BFMLAL_nx 11000001 1001 .... 1 .. 1 .. ...00 10 ... @azx_4x1_o2x2
+
+BFMLSL_nx 11000001 1000 .... . .. 1 .. ..... 11 ... @azx_1x1_o3x2
+BFMLSL_nx 11000001 1001 .... 0 .. 1 .. ....0 11 ... @azx_2x1_o2x2
+BFMLSL_nx 11000001 1001 .... 1 .. 1 .. ...00 11 ... @azx_4x1_o2x2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/translate-sme.c | 93 ++++++++++++++++++++++++++++++++++
 target/arm/tcg/sme.decode      | 71 ++++++++++++++++++++++++++
 2 files changed, 164 insertions(+)