@@ -173,3 +173,8 @@ DEF_HELPER_FLAGS_5(gvec_fmaxnum_b16, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fminnum_b16, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_6(sme2_fdot_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_6(sme2_fdot_idx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, env, i32)
@@ -1122,6 +1122,50 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
}
}
+void HELPER(sme2_fdot_h)(void *vd, void *vn, void *vm, void *va,
+ CPUARMState *env, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_maxsz(desc);
+ bool za = extract32(desc, SIMD_DATA_SHIFT, 1);
+ float_status *fpst_std = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
+ float_status *fpst_f16 = &env->vfp.fp_status[za ? FPST_ZA_F16 : FPST_A64_F16];
+ float_status fpst_odd = *fpst_std;
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ set_float_rounding_mode(float_round_to_odd, &fpst_odd);
+
+ for (i = 0; i < oprsz / sizeof(float32); ++i) {
+ d[H4(i)] = f16_dotadd(a[H4(i)], n[H4(i)], m[H4(i)],
+ fpst_f16, fpst_std, &fpst_odd);
+ }
+}
+
+void HELPER(sme2_fdot_idx_h)(void *vd, void *vn, void *vm, void *va,
+ CPUARMState *env, uint32_t desc)
+{
+ intptr_t i, j, oprsz = simd_maxsz(desc);
+ intptr_t elements = oprsz / sizeof(float32);
+ intptr_t eltspersegment = MIN(4, elements);
+ int idx = extract32(desc, SIMD_DATA_SHIFT, 2);
+ bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ float_status *fpst_std = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
+ float_status *fpst_f16 = &env->vfp.fp_status[za ? FPST_ZA_F16 : FPST_A64_F16];
+ float_status fpst_odd = *fpst_std;
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = (uint32_t *)vm + H4(idx);
+
+ set_float_rounding_mode(float_round_to_odd, &fpst_odd);
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t mm = m[i];
+ for (j = 0; j < eltspersegment; ++j) {
+ d[H4(i + j)] = f16_dotadd(a[H4(i + j)], n[H4(i + j)], mm,
+ fpst_f16, fpst_std, &fpst_odd);
+ }
+ }
+}
+
void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
void *vpn, void *vpm, CPUARMState *env, uint32_t desc)
{
@@ -844,3 +844,21 @@ static bool do_bfmlal_nx(DisasContext *s, arg_azx_n *a, bool sub)
TRANS_FEAT(BFMLAL_nx, aa64_sme2, do_bfmlal_nx, a, false)
TRANS_FEAT(BFMLSL_nx, aa64_sme2, do_bfmlal_nx, a, true)
+
+static bool do_fdot(DisasContext *s, arg_azz_n *a, bool multi)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, 1, 0,
+ multi, FPST_ENV, gen_helper_sme2_fdot_h);
+}
+
+TRANS_FEAT(FDOT_n1, aa64_sme2, do_fdot, a, false)
+TRANS_FEAT(FDOT_nn, aa64_sme2, do_fdot, a, true)
+
+static bool do_fdot_nx(DisasContext *s, arg_azx_n *a)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ a->idx | 2, 0, false, FPST_ENV,
+ gen_helper_sme2_fdot_idx_h);
+}
+
+TRANS_FEAT(FDOT_nx, aa64_sme2, do_fdot_nx, a)
@@ -7183,6 +7183,11 @@ TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
gen_helper_gvec_ummla_b, a, 0)
+TRANS_FEAT(FDOT_zzzz, aa64_sme2_or_sve2p1, gen_gvec_env_arg_zzzz,
+ gen_helper_sme2_fdot_h, a, 0)
+TRANS_FEAT(FDOT_zzxz, aa64_sme2_or_sve2p1, gen_gvec_env_arg_zzxz,
+ gen_helper_sme2_fdot_idx_h, a)
+
TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_env_arg_zzzz,
gen_helper_gvec_bfdot, a, 0)
TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_env_arg_zzxz,
@@ -285,6 +285,9 @@ BFMLSL_n1 11000001 001 0 .... 0 .. 011 ..... 11 ... @azz_nx1_o3x2 n=1
BFMLSL_n1 11000001 001 0 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=2
BFMLSL_n1 11000001 001 1 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=4
+FDOT_n1 11000001 001 0 .... 0 .. 100 ..... 00 ... @azz_nx1_o3 n=2
+FDOT_n1 11000001 001 1 .... 0 .. 100 ..... 00 ... @azz_nx1_o3 n=4
+
### SME2 Multi-vector Multiple Array Vectors
%zn_ax2 6:4 !function=times_2
@@ -322,6 +325,9 @@ BFMLAL_nn 11000001 101 ...01 0 .. 010 ...00 100 .. @azz_4x4_o2x2
BFMLSL_nn 11000001 101 ....0 0 .. 010 ....0 110 .. @azz_2x2_o2x2
BFMLSL_nn 11000001 101 ...01 0 .. 010 ...00 110 .. @azz_4x4_o2x2
+FDOT_nn 11000001 101 ....0 0 .. 100 ....0 00 ... @azz_2x2_o3
+FDOT_nn 11000001 101 ...01 0 .. 100 ...00 00 ... @azz_4x4_o3
+
### SME2 Multi-vector Indexed
&azx_n n off rv zn zm idx
@@ -351,3 +357,11 @@ BFMLAL_nx 11000001 1001 .... 1 .. 1 .. ...00 10 ... @azx_4x1_o2x2
BFMLSL_nx 11000001 1000 .... . .. 1 .. ..... 11 ... @azx_1x1_o3x2
BFMLSL_nx 11000001 1001 .... 0 .. 1 .. ....0 11 ... @azx_2x1_o2x2
BFMLSL_nx 11000001 1001 .... 1 .. 1 .. ...00 11 ... @azx_4x1_o2x2
+
+@azx_2x1_i2_o3 ........ .... zm:4 . .. . idx:2 .... ... off:3 \
+ &azx_n n=2 rv=%mova_rv zn=%zn_ax2
+@azx_4x1_i2_o3 ........ .... zm:4 . .. . idx:2 .... ... off:3 \
+ &azx_n n=2 rv=%mova_rv zn=%zn_ax4
+
+FDOT_nx 11000001 0101 .... 0 .. 1 .. ....0 01 ... @azx_2x1_i2_o3
+FDOT_nx 11000001 0101 .... 1 .. 1 .. ...00 01 ... @azx_4x1_i2_o3
@@ -1662,7 +1662,9 @@ FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0
BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
-### SVE2 floating-point bfloat16 dot-product
+### SVE2 floating-point dot-product
+
+FDOT_zzzz 01100100 00 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
### SVE2 floating-point multiply-add long (indexed)
@@ -1673,7 +1675,9 @@ FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
-### SVE2 floating-point bfloat16 dot-product (indexed)
+### SVE2 floating-point dot-product (indexed)
+
+FDOT_zzxz 01100100 00 1 ..... 010000 ..... ..... @rrxr_2 esz=2
BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2
### SVE broadcast predicate element
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/helper-sme.h | 5 ++++ target/arm/tcg/sme_helper.c | 44 ++++++++++++++++++++++++++++++++++ target/arm/tcg/translate-sme.c | 18 ++++++++++++++ target/arm/tcg/translate-sve.c | 5 ++++ target/arm/tcg/sme.decode | 14 +++++++++++ target/arm/tcg/sve.decode | 8 +++++-- 6 files changed, 92 insertions(+), 2 deletions(-)