diff --git a/target/arm/tcg/helper.h b/target/arm/tcg/helper.h
--- a/target/arm/tcg/helper.h
+++ b/target/arm/tcg/helper.h
@@ -1106,8 +1106,16 @@ DEF_HELPER_FLAGS_6(gvec_bfmmla, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_6(gvec_bfmlal, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlsl, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlsl_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_sclamp_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -7375,6 +7375,36 @@ static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel)
TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false)
TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true)
+static bool do_BFMLSL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel)
+{
+ if (s->fpcr_ah) {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_ah_bfmlsl,
+ a->rd, a->rn, a->rm, a->ra, sel, FPST_AH);
+ } else {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlsl,
+ a->rd, a->rn, a->rm, a->ra, sel, FPST_A64);
+ }
+}
+
+TRANS_FEAT(BFMLSLB_zzzw, aa64_sme2_or_sve2p1, do_BFMLSL_zzzw, a, false)
+TRANS_FEAT(BFMLSLT_zzzw, aa64_sme2_or_sve2p1, do_BFMLSL_zzzw, a, true)
+
+static bool do_BFMLSL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel)
+{
+ if (s->fpcr_ah) {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_ah_bfmlsl_idx,
+ a->rd, a->rn, a->rm, a->ra,
+ (a->index << 1) | sel, FPST_AH);
+ } else {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlsl_idx,
+ a->rd, a->rn, a->rm, a->ra,
+ (a->index << 1) | sel, FPST_A64);
+ }
+}
+
+TRANS_FEAT(BFMLSLB_zzxw, aa64_sme2_or_sve2p1, do_BFMLSL_zzxw, a, false)
+TRANS_FEAT(BFMLSLT_zzxw, aa64_sme2_or_sve2p1, do_BFMLSL_zzxw, a, true)
+
static bool trans_PSEL(DisasContext *s, arg_psel *a)
{
int vl = vec_full_reg_size(s);
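
The indexed forms fold both the element index and the bottom/top selector
into the gvec descriptor's data field: the translator passes
(a->index << 1) | sel, and the helpers recover the two values with
extract32().  A minimal standalone sketch of that packing (the function
names here are illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Pack the way do_BFMLSL_zzxw() above does: index in bits [3:1], sel in bit 0. */
    static uint32_t pack_idx_data(int index, int sel)
    {
        return ((uint32_t)index << 1) | (sel & 1);
    }

    /* Unpack the way the *_idx helpers do with extract32() on the descriptor data. */
    static void unpack_idx_data(uint32_t data, int *index, int *sel)
    {
        *sel = data & 1;          /* bottom (0) or top (1) bfloat16 element */
        *index = (data >> 1) & 7; /* 3-bit index within a 128-bit segment */
    }

    int main(void)
    {
        int index, sel;
        unpack_idx_data(pack_idx_data(5, 1), &index, &sel);
        assert(index == 5 && sel == 1);
        return 0;
    }
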
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -3272,44 +3272,76 @@ void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
- float_status *stat, uint32_t desc)
+static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
+ float_status *stat, uint32_t desc, int negx, int negf)
{
intptr_t i, opr_sz = simd_oprsz(desc);
intptr_t sel = simd_data(desc);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
for (i = 0; i < opr_sz / 4; ++i) {
- float32 nn = n[H2(i * 2 + sel)] << 16;
+ float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16;
float32 mm = m[H2(i * 2 + sel)] << 16;
- d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
+ d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
- void *va, float_status *stat, uint32_t desc)
+void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0);
+}
+
+void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0);
+}
+
+void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
+}
+
+static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
+ float_status *stat, uint32_t desc, int negx, int negf)
{
intptr_t i, j, opr_sz = simd_oprsz(desc);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
intptr_t elements = opr_sz / 4;
intptr_t eltspersegment = MIN(16 / 4, elements);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
for (i = 0; i < elements; i += eltspersegment) {
float32 m_idx = m[H2(2 * i + index)] << 16;
for (j = i; j < i + eltspersegment; j++) {
- float32 n_j = n[H2(2 * j + sel)] << 16;
- d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
+ float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16;
+ d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat);
}
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0);
+}
+
+void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0);
+}
+
+void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
+}
+
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
{ \
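
Both sets of helpers share one loop body; only the way the subtraction is
produced differs.  In the default case, negx = 0x8000 flips the sign bit of
the bfloat16 multiplicand before it is widened.  With FPCR.AH set, the
operand is left untouched and float_muladd_negate_product is passed instead,
so the negation happens inside the fused multiply-add and an input NaN is
not sign-flipped on the way through.  A standalone illustration of the
widen-and-flip trick (assuming the host float is IEEE-754 binary32; the
names below are made up for the example):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Widen a bfloat16 to float, optionally flipping its sign via negx. */
    static float bf16_widen(uint16_t bf, uint16_t negx)
    {
        uint32_t bits = (uint32_t)(negx ^ bf) << 16;  /* bfloat16 is the top half of a float32 */
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main(void)
    {
        uint16_t one = 0x3f80;                       /* bfloat16 encoding of 1.0 */
        printf("%g %g\n", bf16_widen(one, 0),        /* prints 1 */
               bf16_widen(one, 0x8000));             /* prints -1 */
        return 0;
    }
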
diff --git a/target/arm/tcg/sve.decode b/target/arm/tcg/sve.decode
--- a/target/arm/tcg/sve.decode
+++ b/target/arm/tcg/sve.decode
@@ -1720,6 +1720,7 @@ FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0
FLOGB 01100101 00 011 esz:2 0101 pg:3 rn:5 rd:5 &rpr_esz
### SVE2 floating-point multiply-add long (vectors)
+
FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0
@@ -1727,6 +1728,8 @@ FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0
BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
+BFMLSLB_zzzw 01100100 11 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0
+BFMLSLT_zzzw 01100100 11 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0
### SVE2 floating-point dot-product
@@ -1734,12 +1737,16 @@ FDOT_zzzz 01100100 00 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
### SVE2 floating-point multiply-add long (indexed)
+
FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2
FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
+
BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
+BFMLSLB_zzxw 01100100 11 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2
+BFMLSLT_zzxw 01100100 11 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
### SVE2 floating-point dot-product (indexed)
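
In the decode patterns above, the new BFMLSL{B,T} encodings differ from the
existing BFMLAL{B,T} ones only in bit 13 (add vs. subtract), while bit 10
still selects the bottom or top element of each bfloat16 pair.  A standalone
sketch that pulls those fields out of a 32-bit instruction word (the sample
word is built by hand for illustration and the helper name is not from the
patch):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t field(uint32_t insn, int pos, int len)
    {
        return (insn >> pos) & ((1u << len) - 1);
    }

    int main(void)
    {
        uint32_t insn = 0x64e1a443;   /* BFMLSLT_zzzw with rd=3, rn=2, rm=1 */

        printf("sub=%u sel=%u rm=%u rn=%u rd=%u\n",
               field(insn, 13, 1),    /* 1: BFMLSL rather than BFMLAL */
               field(insn, 10, 1),    /* 1: top ("T") elements */
               field(insn, 16, 5),    /* rm */
               field(insn, 5, 5),     /* rn */
               field(insn, 0, 5));    /* rd (and ra, for the zzzw forms) */
        return 0;
    }
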
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/helper.h        |  8 +++++
 target/arm/tcg/translate-sve.c | 30 ++++++++++++++++++
 target/arm/tcg/vec_helper.c    | 56 ++++++++++++++++++++++++++--------
 target/arm/tcg/sve.decode      |  7 +++++
 4 files changed, 89 insertions(+), 12 deletions(-)