@@ -293,3 +293,18 @@ DEF_HELPER_FLAGS_3(sme2_sqrshrun_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_sqrshrn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_uqrshrn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_sqrshrun_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_sclamp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) /* signed integer clamp helpers: (vd, vn, vm, desc) */
+DEF_HELPER_FLAGS_4(sme2_sclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_sclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_sclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_uclamp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) /* unsigned integer clamp helpers: (vd, vn, vm, desc) */
+DEF_HELPER_FLAGS_4(sme2_uclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sme2_fclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) /* floating-point clamp helpers: (vd, vn, vm, fpst, desc) */
+DEF_HELPER_FLAGS_5(sme2_fclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sme2_fclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sme2_bfclamp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) /* bfloat16 variant, same argument list */
@@ -1949,3 +1949,55 @@ UZP4(sme2_uzp4_d, uint64_t, )
UZP4(sme2_uzp4_q, Int128, )
#undef UZP4
+
+#define ICLAMP(NAME, TYPE, H) /* integer clamp helper: each dest element d becomes MIN(MAX(d, n), m) */ \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ size_t stride = sizeof(ARMVectorReg) / sizeof(TYPE); /* TYPE elements per ARMVectorReg */ \
+ size_t elements = simd_oprsz(desc) / sizeof(TYPE); /* elements covered by the operation size */ \
+ size_t nreg = simd_data(desc); /* number of destination registers, taken from the descriptor data */ \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (size_t e = 0; e < elements; e++) { \
+ TYPE nn = n[H(e)], mm = m[H(e)]; /* bounds are read once per element and shared by all registers */ \
+ for (size_t r = 0; r < nreg; r++) { \
+ TYPE *dd = &d[r * stride + H(e)]; /* element e at byte offset r * sizeof(ARMVectorReg) from vd */ \
+ *dd = MIN(MAX(*dd, nn), mm); /* raise to nn first, then cap at mm (mm wins if nn > mm) */ \
+ } \
+ } \
+}
+
+ICLAMP(sme2_sclamp_b, int8_t, H1) /* signed clamp helpers, one per element width */
+ICLAMP(sme2_sclamp_h, int16_t, H2)
+ICLAMP(sme2_sclamp_s, int32_t, H4)
+ICLAMP(sme2_sclamp_d, int64_t, H8)
+
+ICLAMP(sme2_uclamp_b, uint8_t, H1) /* unsigned clamp helpers: same macro with unsigned element types */
+ICLAMP(sme2_uclamp_h, uint16_t, H2)
+ICLAMP(sme2_uclamp_s, uint32_t, H4)
+ICLAMP(sme2_uclamp_d, uint64_t, H8)
+
+#undef ICLAMP
+
+#define FCLAMP(NAME, TYPE, H) /* floating-point clamp helper: each dest element d becomes minnum(maxnum(n, d), m) */ \
+void HELPER(NAME)(void *vd, void *vn, void *vm, \
+ float_status *fpst, uint32_t desc) \
+{ \
+ size_t stride = sizeof(ARMVectorReg) / sizeof(TYPE); /* TYPE elements per ARMVectorReg */ \
+ size_t elements = simd_oprsz(desc) / sizeof(TYPE); /* elements covered by the operation size */ \
+ size_t nreg = simd_data(desc); /* number of destination registers, taken from the descriptor data */ \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (size_t e = 0; e < elements; e++) { \
+ TYPE nn = n[H(e)], mm = m[H(e)]; /* bounds are read once per element and shared by all registers */ \
+ for (size_t r = 0; r < nreg; r++) { \
+ TYPE *dd = &d[r * stride + H(e)]; /* element e at byte offset r * sizeof(ARMVectorReg) from vd */ \
+ *dd = TYPE##_minnum(TYPE##_maxnum(nn, *dd, fpst), mm, fpst); /* Zn before Zd as in the Arm pseudocode: operand order selects the propagated NaN */ \
+ } \
+ } \
+}
+
+FCLAMP(sme2_fclamp_h, float16, H2) /* TYPE is pasted into the call names, e.g. float16_minnum / float16_maxnum */
+FCLAMP(sme2_fclamp_s, float32, H4)
+FCLAMP(sme2_fclamp_d, float64, H8)
+FCLAMP(sme2_bfclamp, bfloat16, H2) /* bfloat16 uses the same index macro as float16 */
+
+#undef FCLAMP
@@ -1464,3 +1464,59 @@ static gen_helper_gvec_3 * const uzp2_fns[] = {
};
TRANS_FEAT(UZP_2, aa64_sme2, do_zipuzp_2, a, uzp2_fns)
+static bool trans_FCLAMP(DisasContext *s, arg_zzz_en *a) /* expand FCLAMP; returns false when the required ISA feature is absent */
+{
+ static gen_helper_gvec_3_ptr * const fn[] = { /* indexed by a->esz */
+ gen_helper_sme2_bfclamp, /* first slot: BFloat16 helper (reached via esz == MO_8, assuming MO_8 is 0) */
+ gen_helper_sme2_fclamp_h,
+ gen_helper_sme2_fclamp_s,
+ gen_helper_sme2_fclamp_d,
+ };
+
+ /* This insn uses MO_8 to encode BFloat16. */
+ if (!(a->esz == MO_8
+ ? dc_isar_feature(aa64_sme2_b16b16, s)
+ : dc_isar_feature(aa64_sme2, s))) { /* the BFloat16 form is gated on sme2_b16b16, every other size on sme2 */
+ return false;
+ }
+
+ if (sme_sm_enabled_check(s)) { /* the helper call is emitted only when this check passes */
+ int svl = streaming_vec_reg_size(s); /* used as both size arguments of the gvec call */
+ TCGv_ptr fpst = fpstatus_ptr(a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); /* F16 status for MO_16 only; BFloat16 takes FPST_A64 */
+
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ vec_full_reg_offset(s, a->zm),
+ fpst, svl, svl, a->n, fn[a->esz]); /* a->n goes in the data slot; the helpers read simd_data(desc) as the register count */
+ }
+ return true;
+}
+
+static bool do_clamp(DisasContext *s, arg_zzz_en *a, /* common expander used for SCLAMP and UCLAMP */
+ gen_helper_gvec_3 * const fn[4]) /* fn: helper table indexed by a->esz */
+{
+ if (sme_sm_enabled_check(s)) { /* the helper call is emitted only when this check passes */
+ int svl = streaming_vec_reg_size(s); /* used as both size arguments of the gvec call */
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ vec_full_reg_offset(s, a->zm),
+ svl, svl, a->n, fn[a->esz]); /* a->n goes in the data slot; the helpers read simd_data(desc) as the register count */
+ }
+ return true; /* true on both paths, whether or not code was emitted */
+}
+
+static gen_helper_gvec_3 * const sclamp_fns[] = { /* signed clamp helpers in b, h, s, d order; do_clamp indexes this by a->esz */
+ gen_helper_sme2_sclamp_b,
+ gen_helper_sme2_sclamp_h,
+ gen_helper_sme2_sclamp_s,
+ gen_helper_sme2_sclamp_d,
+};
+TRANS_FEAT(SCLAMP, aa64_sme2, do_clamp, a, sclamp_fns)
+
+static gen_helper_gvec_3 * const uclamp_fns[] = { /* unsigned clamp helpers in b, h, s, d order; do_clamp indexes this by a->esz */
+ gen_helper_sme2_uclamp_b,
+ gen_helper_sme2_uclamp_h,
+ gen_helper_sme2_uclamp_s,
+ gen_helper_sme2_uclamp_d,
+};
+TRANS_FEAT(UCLAMP, aa64_sme2, do_clamp, a, uclamp_fns)
@@ -855,3 +855,20 @@ UZP_2 11000001 esz:2 1 zm:5 110100 zn:5 .... 1 \
&zzz_e zd=%zd_ax2
UZP_2 11000001 00 1 zm:5 110101 zn:5 .... 1 \
&zzz_e zd=%zd_ax2 esz=4
+
+&zzz_en zd zn zm esz n # n is handed to the helpers as the destination register count
+
+FCLAMP 11000001 esz:2 1 zm:5 110000 zn:5 .... 0 \
+ &zzz_en zd=%zd_ax2 n=2 # form with n=2 and zd taken from %zd_ax2
+FCLAMP 11000001 esz:2 1 zm:5 110010 zn:5 ...0 0 \
+ &zzz_en zd=%zd_ax4 n=4 # form with n=4 and zd taken from %zd_ax4
+
+SCLAMP 11000001 esz:2 1 zm:5 110001 zn:5 .... 0 \
+ &zzz_en zd=%zd_ax2 n=2 # form with n=2 and zd taken from %zd_ax2
+SCLAMP 11000001 esz:2 1 zm:5 110011 zn:5 ...0 0 \
+ &zzz_en zd=%zd_ax4 n=4 # form with n=4 and zd taken from %zd_ax4
+
+UCLAMP 11000001 esz:2 1 zm:5 110001 zn:5 .... 1 \
+ &zzz_en zd=%zd_ax2 n=2 # differs from the SCLAMP n=2 pattern only in the final bit
+UCLAMP 11000001 esz:2 1 zm:5 110011 zn:5 ...0 1 \
+ &zzz_en zd=%zd_ax4 n=4 # differs from the SCLAMP n=4 pattern only in the final bit
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/helper-sme.h | 15 +++++++++ target/arm/tcg/sme_helper.c | 52 +++++++++++++++++++++++++++++++ target/arm/tcg/translate-sme.c | 56 ++++++++++++++++++++++++++++++++++ target/arm/tcg/sme.decode | 17 +++++++++++ 4 files changed, 140 insertions(+)