Message ID | 20180621015359.12018-3-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | target/arm SVE patches | expand |
On 21 June 2018 at 02:53, Richard Henderson <richard.henderson@linaro.org> wrote: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/arm/helper-sve.h | 40 ++++++++++ > target/arm/sve_helper.c | 156 +++++++++++++++++++++++++++++++++++++ > target/arm/translate-sve.c | 69 ++++++++++++++++ > target/arm/sve.decode | 6 ++ > 4 files changed, 271 insertions(+) > > diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h > index fcc9ba5f50..7338abbbcf 100644 > --- a/target/arm/helper-sve.h > +++ b/target/arm/helper-sve.h > @@ -754,3 +754,43 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > > DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldff1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldff1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) 
> + > +DEF_HELPER_FLAGS_4(sve_ldff1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c > index 4e6ad282f9..6e1b539ce3 100644 > --- a/target/arm/sve_helper.c > +++ b/target/arm/sve_helper.c > @@ -2963,3 +2963,159 @@ DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) > #undef DO_LD2 > #undef DO_LD3 > #undef DO_LD4 > + > +/* > + * Load contiguous data, first-fault and no-fault. > + */ > + > +#ifdef CONFIG_USER_ONLY > + > +/* Fault on byte I. All bits in FFR from I are cleared. The vector > + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE > + * option, which leaves subsequent data unchanged. 
> + */ > +static void __attribute__((cold)) attribute cold was first introduced in GCC 4.3. As of commit fa54abb8c29 I think we still support gcc 4.1, so we need to hide this behind a QEMU_COLD or something I think, eg #ifndef __has_attribute #define __has_attribute(x) 0 /* compatibility with older gcc */ #endif #if __has_attribute(cold) || QEMU_GNUC_PREREQ(4, 3) #define QEMU_COLD __attribute__((cold)) #else #define QEMU_COLD #endif (gcc added __has_attribute in gcc 5, which is nice.) > +record_fault(CPUARMState *env, intptr_t i, intptr_t oprsz) > +{ > + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; > + if (i & 63) { > + ffr[i / 64] &= MAKE_64BIT_MASK(0, (i & 63) - 1); Should this really have a - 1 here? (i & 63) will be anything between 1 and 63, so I would have expected the set of masks to be anything from "1 bit set" to "63 bits set", not "0 bits set" to "62 bits set". > + i = ROUND_UP(i, 64); > + } > + for (; i < oprsz; i += 64) { > + ffr[i / 64] = 0; > + } > +} > + Otherwise Reviewed-by: Peter Maydell <peter.maydell@linaro.org> thanks -- PMM
On 06/22/2018 09:04 AM, Peter Maydell wrote: > On 21 June 2018 at 02:53, Richard Henderson > <richard.henderson@linaro.org> wrote: >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> >> --- >> target/arm/helper-sve.h | 40 ++++++++++ >> target/arm/sve_helper.c | 156 +++++++++++++++++++++++++++++++++++++ >> target/arm/translate-sve.c | 69 ++++++++++++++++ >> target/arm/sve.decode | 6 ++ >> 4 files changed, 271 insertions(+) >> >> diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h >> index fcc9ba5f50..7338abbbcf 100644 >> --- a/target/arm/helper-sve.h >> +++ b/target/arm/helper-sve.h >> @@ -754,3 +754,43 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> >> DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldff1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldff1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, 
i32) >> +DEF_HELPER_FLAGS_4(sve_ldff1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldff1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldnf1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldnf1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> +DEF_HELPER_FLAGS_4(sve_ldnf1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> + >> +DEF_HELPER_FLAGS_4(sve_ldnf1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) >> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c >> index 4e6ad282f9..6e1b539ce3 100644 >> --- a/target/arm/sve_helper.c >> +++ b/target/arm/sve_helper.c >> @@ -2963,3 +2963,159 @@ DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) >> #undef DO_LD2 >> #undef DO_LD3 >> #undef DO_LD4 >> + >> +/* >> + * Load contiguous data, first-fault and no-fault. >> + */ >> + >> +#ifdef CONFIG_USER_ONLY >> + >> +/* Fault on byte I. All bits in FFR from I are cleared. 
The vector
>> + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
>> + * option, which leaves subsequent data unchanged.
>> + */
>> +static void __attribute__((cold))
>
> attribute cold was first introduced in GCC 4.3. As of
> commit fa54abb8c29 I think we still support gcc 4.1,
> so we need to hide this behind a QEMU_COLD or something I think, eg
>
> #ifndef __has_attribute
> #define __has_attribute(x) 0 /* compatibility with older gcc */
> #endif
>
> #if __has_attribute(cold) || QEMU_GNUC_PREREQ(4, 3)
> #define QEMU_COLD __attribute__((cold))
> #else
> #define QEMU_COLD
> #endif
>
> (gcc added __has_attribute in gcc 5, which is nice.)

Ah, good archaeology. But I think I'll just drop this. I put it in there as a hint that it won't be called, but the x86_64 code generation for putting this into the .text.unlikely section is really ugly.

>
>> +record_fault(CPUARMState *env, intptr_t i, intptr_t oprsz)
>> +{
>> + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
>> + if (i & 63) {
>> + ffr[i / 64] &= MAKE_64BIT_MASK(0, (i & 63) - 1);
>
> Should this really have a - 1 here? (i & 63) will
> be anything between 1 and 63, so I would have expected
> the set of masks to be anything from "1 bit set" to
> "63 bits set", not "0 bits set" to "62 bits set".

We want to zero bits I to OPRSZ-1, which means retaining bits 0 to I-1. But you're right that for e.g. I==65 this will produce ~0ULL >> 64. I'll re-work this.

r~
Richard Henderson <richard.henderson@linaro.org> writes: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/arm/helper-sve.h | 40 ++++++++++ > target/arm/sve_helper.c | 156 +++++++++++++++++++++++++++++++++++++ > target/arm/translate-sve.c | 69 ++++++++++++++++ > target/arm/sve.decode | 6 ++ > 4 files changed, 271 insertions(+) > > diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h > index fcc9ba5f50..7338abbbcf 100644 > --- a/target/arm/helper-sve.h > +++ b/target/arm/helper-sve.h > @@ -754,3 +754,43 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > > DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldff1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldff1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldff1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > 
+DEF_HELPER_FLAGS_4(sve_ldff1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > +DEF_HELPER_FLAGS_4(sve_ldnf1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > + > +DEF_HELPER_FLAGS_4(sve_ldnf1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) > diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c > index 4e6ad282f9..6e1b539ce3 100644 > --- a/target/arm/sve_helper.c > +++ b/target/arm/sve_helper.c > @@ -2963,3 +2963,159 @@ DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) > #undef DO_LD2 > #undef DO_LD3 > #undef DO_LD4 > + > +/* > + * Load contiguous data, first-fault and no-fault. > + */ > + > +#ifdef CONFIG_USER_ONLY > + > +/* Fault on byte I. All bits in FFR from I are cleared. The vector > + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE > + * option, which leaves subsequent data unchanged. 
> + */ > +static void __attribute__((cold)) > +record_fault(CPUARMState *env, intptr_t i, intptr_t oprsz) > +{ > + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; > + if (i & 63) { > + ffr[i / 64] &= MAKE_64BIT_MASK(0, (i & 63) - 1); > + i = ROUND_UP(i, 64); > + } > + for (; i < oprsz; i += 64) { > + ffr[i / 64] = 0; > + } > +} > + > +/* Hold the mmap lock during the operation so that there is no race > + * between page_check_range and the load operation. We expect the > + * usual case to have no faults at all, so we check the whole range > + * first and if successful defer to the normal load operation. > + * > + * TODO: Change mmap_lock to a rwlock so that multiple readers > + * can run simultaneously. This will probably help other uses > + * within QEMU as well. > + */ > +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \ > +static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \ > + target_ulong addr, intptr_t oprsz, \ > + bool first, uintptr_t ra) \ > +{ \ > + intptr_t i = 0; \ > + do { \ > + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ > + do { \ > + TYPEM m = 0; \ > + if (pg & 1) { \ > + if (!first && \ > + page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \ > + record_fault(env, i, oprsz); \ > + return; \ > + } \ > + m = FN(env, addr, ra); \ > + first = false; \ > + } \ > + *(TYPEE *)(vd + H(i)) = m; \ > + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ > + addr += sizeof(TYPEM); \ > + } while (i & 15); \ > + } while (i < oprsz); \ > +} > \ So I noticed that the disassembly of these two functions is mostly parameter pushing and popping. Is there a case to be made to use the __flatten__ approach and see how the compiler unrolls it all? 
> +void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \ > + target_ulong addr, uint32_t desc) \ > +{ \ > + intptr_t oprsz = simd_oprsz(desc); \ > + unsigned rd = simd_data(desc); \ > + void *vd = &env->vfp.zregs[rd]; \ > + mmap_lock(); \ > + if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \ > + do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \ > + } else { \ > + do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \ > + } \ > + mmap_unlock(); \ > +} > + > +/* No-fault loads are like first-fault loads without the > + * first faulting special case. > + */ > +#define DO_LDNF1(PART) \ > +void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \ > + target_ulong addr, uint32_t desc) \ > +{ \ > + intptr_t oprsz = simd_oprsz(desc); \ > + unsigned rd = simd_data(desc); \ > + void *vd = &env->vfp.zregs[rd]; \ > + mmap_lock(); \ > + if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \ > + do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \ > + } else { \ > + do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \ > + } \ > + mmap_unlock(); \ > +} > + > +#else > + > +/* TODO: System mode is not yet supported. > + * This would probably use tlb_vaddr_to_host. 
> + */ > +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \ > +void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \ > + target_ulong addr, uint32_t desc) \ > +{ \ > + g_assert_not_reached(); \ > +} > + > +#define DO_LDNF1(PART) \ > +void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \ > + target_ulong addr, uint32_t desc) \ > +{ \ > + g_assert_not_reached(); \ > +} > + > +#endif > + > +DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1) > +DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2) > +DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2) > +DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4) > +DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4) > +DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, ) > +DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, ) > + > +DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2) > +DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4) > +DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int8_t, H1_4) > +DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, ) > +DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, ) > + > +DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4) > +DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, ) > +DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, ) > + > +DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) > + > +#undef DO_LDFF1 > + > +DO_LDNF1(bb_r) > +DO_LDNF1(bhu_r) > +DO_LDNF1(bhs_r) > +DO_LDNF1(bsu_r) > +DO_LDNF1(bss_r) > +DO_LDNF1(bdu_r) > +DO_LDNF1(bds_r) > + > +DO_LDNF1(hh_r) > +DO_LDNF1(hsu_r) > +DO_LDNF1(hss_r) > +DO_LDNF1(hdu_r) > +DO_LDNF1(hds_r) > + > +DO_LDNF1(ss_r) > +DO_LDNF1(sdu_r) > +DO_LDNF1(sds_r) > + > +DO_LDNF1(dd_r) > + > +#undef DO_LDNF1 > diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c > index 3543daff48..09f77b5405 100644 > --- a/target/arm/translate-sve.c > +++ b/target/arm/translate-sve.c > @@ -3647,3 +3647,72 @@ static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a, uint32_t 
insn) > } > return true; > } > + > +static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) > +{ > + static gen_helper_gvec_mem * const fns[16] = { > + gen_helper_sve_ldff1bb_r, > + gen_helper_sve_ldff1bhu_r, > + gen_helper_sve_ldff1bsu_r, > + gen_helper_sve_ldff1bdu_r, > + > + gen_helper_sve_ldff1sds_r, > + gen_helper_sve_ldff1hh_r, > + gen_helper_sve_ldff1hsu_r, > + gen_helper_sve_ldff1hdu_r, > + > + gen_helper_sve_ldff1hds_r, > + gen_helper_sve_ldff1hss_r, > + gen_helper_sve_ldff1ss_r, > + gen_helper_sve_ldff1sdu_r, > + > + gen_helper_sve_ldff1bds_r, > + gen_helper_sve_ldff1bss_r, > + gen_helper_sve_ldff1bhs_r, > + gen_helper_sve_ldff1dd_r, > + }; > + > + if (sve_access_check(s)) { > + TCGv_i64 addr = new_tmp_a64(s); > + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); > + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); > + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); > + } > + return true; > +} > + > +static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) > +{ > + static gen_helper_gvec_mem * const fns[16] = { > + gen_helper_sve_ldnf1bb_r, > + gen_helper_sve_ldnf1bhu_r, > + gen_helper_sve_ldnf1bsu_r, > + gen_helper_sve_ldnf1bdu_r, > + > + gen_helper_sve_ldnf1sds_r, > + gen_helper_sve_ldnf1hh_r, > + gen_helper_sve_ldnf1hsu_r, > + gen_helper_sve_ldnf1hdu_r, > + > + gen_helper_sve_ldnf1hds_r, > + gen_helper_sve_ldnf1hss_r, > + gen_helper_sve_ldnf1ss_r, > + gen_helper_sve_ldnf1sdu_r, > + > + gen_helper_sve_ldnf1bds_r, > + gen_helper_sve_ldnf1bss_r, > + gen_helper_sve_ldnf1bhs_r, > + gen_helper_sve_ldnf1dd_r, > + }; > + > + if (sve_access_check(s)) { > + int vsz = vec_full_reg_size(s); > + int elements = vsz >> dtype_esz[a->dtype]; > + int off = (a->imm * elements) << dtype_msz(a->dtype); > + TCGv_i64 addr = new_tmp_a64(s); > + > + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off); > + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); > + } > + return true; > +} > diff --git 
a/target/arm/sve.decode b/target/arm/sve.decode > index cfb12da639..afbed57de1 100644 > --- a/target/arm/sve.decode > +++ b/target/arm/sve.decode > @@ -685,9 +685,15 @@ LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9 > # SVE contiguous load (scalar plus scalar) > LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0 > > +# SVE contiguous first-fault load (scalar plus scalar) > +LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0 > + > # SVE contiguous load (scalar plus immediate) > LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0 > > +# SVE contiguous non-fault load (scalar plus immediate) > +LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0 > + > # SVE contiguous non-temporal load (scalar plus scalar) > # LDNT1B, LDNT1H, LDNT1W, LDNT1D > # SVE load multiple structures (scalar plus scalar) -- Alex Bennée
On 06/26/2018 05:52 AM, Alex Bennée wrote:
>> +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
>> +static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
>> + target_ulong addr, intptr_t oprsz, \
>> + bool first, uintptr_t ra) \
>> +{ \
>> + intptr_t i = 0; \
>> + do { \
>> + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
>> + do { \
>> + TYPEM m = 0; \
>> + if (pg & 1) { \
>> + if (!first && \
>> + page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
>> + record_fault(env, i, oprsz); \
>> + return; \
>> + } \
>> + m = FN(env, addr, ra); \
>> + first = false; \
>> + } \
>> + *(TYPEE *)(vd + H(i)) = m; \
>> + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
>> + addr += sizeof(TYPEM); \
>> + } while (i & 15); \
>> + } while (i < oprsz); \
>> +}
>> \
> So I noticed that the disassembly of these two functions is mostly
> parameter pushing and popping. Is there a case to be made to use the
> __flatten__ approach and see how the compiler unrolls it all?

Em... for the most part the functions being called are not inlinable, being defined in accel/tcg/.

r~
Richard Henderson <richard.henderson@linaro.org> writes:

> On 06/26/2018 05:52 AM, Alex Bennée wrote:
>>> +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
>>> +static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
>>> + target_ulong addr, intptr_t oprsz, \
>>> + bool first, uintptr_t ra) \
>>> +{ \
>>> + intptr_t i = 0; \
>>> + do { \
>>> + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
>>> + do { \
>>> + TYPEM m = 0; \
>>> + if (pg & 1) { \
>>> + if (!first && \
>>> + page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
>>> + record_fault(env, i, oprsz); \
>>> + return; \
>>> + } \
>>> + m = FN(env, addr, ra); \
>>> + first = false; \
>>> + } \
>>> + *(TYPEE *)(vd + H(i)) = m; \
>>> + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
>>> + addr += sizeof(TYPEM); \
>>> + } while (i & 15); \
>>> + } while (i < oprsz); \
>>> +}
>>> \
>> So I noticed that the disassembly of these two functions is mostly
>> parameter pushing and popping. Is there a case to be made to use the
>> __flatten__ approach and see how the compiler unrolls it all?
>
> Em... for the most part the functions being called are not inlinable,
> being defined in accel/tcg/.

*sigh* I guess. It's a shame because the numbers get more disappointing:

12:13:48 [alex@zen:~/l/q/q/aarch64-linux-user] review/rth-sve-v5(+26/-1) + ./qemu-aarch64 ./tests/simd-memcpy libc intreg intpair simdreg simdpair sve
libc, 248298053, 4228 kb/s
intreg, 646085220, 1623 kb/s
intpair, 369350825, 2841 kb/s
simdreg, 1422096252, 737 kb/s
simdpair, 1369635566, 765 kb/s
sve, 2646179942, 396 kb/s

and the above example doesn't have the cost of page_check_range. I guess this isn't something that could be improved until other architectures had a similar predicated load solution we could use in generated code. Helpers are always going to suck here :-/

Anyway my boy-racer disappointments aside:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

--
Alex Bennée
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h index fcc9ba5f50..7338abbbcf 100644 --- a/target/arm/helper-sve.h +++ b/target/arm/helper-sve.h @@ -754,3 +754,43 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, 
void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index 4e6ad282f9..6e1b539ce3 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -2963,3 +2963,159 @@ DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) #undef DO_LD2 #undef DO_LD3 #undef DO_LD4 + +/* + * Load contiguous data, first-fault and no-fault. + */ + +#ifdef CONFIG_USER_ONLY + +/* Fault on byte I. All bits in FFR from I are cleared. The vector + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE + * option, which leaves subsequent data unchanged. + */ +static void __attribute__((cold)) +record_fault(CPUARMState *env, intptr_t i, intptr_t oprsz) +{ + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; + if (i & 63) { + ffr[i / 64] &= MAKE_64BIT_MASK(0, (i & 63) - 1); + i = ROUND_UP(i, 64); + } + for (; i < oprsz; i += 64) { + ffr[i / 64] = 0; + } +} + +/* Hold the mmap lock during the operation so that there is no race + * between page_check_range and the load operation. 
We expect the + * usual case to have no faults at all, so we check the whole range + * first and if successful defer to the normal load operation. + * + * TODO: Change mmap_lock to a rwlock so that multiple readers + * can run simultaneously. This will probably help other uses + * within QEMU as well. + */ +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \ +static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \ + target_ulong addr, intptr_t oprsz, \ + bool first, uintptr_t ra) \ +{ \ + intptr_t i = 0; \ + do { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPEM m = 0; \ + if (pg & 1) { \ + if (!first && \ + page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \ + record_fault(env, i, oprsz); \ + return; \ + } \ + m = FN(env, addr, ra); \ + first = false; \ + } \ + *(TYPEE *)(vd + H(i)) = m; \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += sizeof(TYPEM); \ + } while (i & 15); \ + } while (i < oprsz); \ +} \ +void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + unsigned rd = simd_data(desc); \ + void *vd = &env->vfp.zregs[rd]; \ + mmap_lock(); \ + if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \ + do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \ + } else { \ + do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \ + } \ + mmap_unlock(); \ +} + +/* No-fault loads are like first-fault loads without the + * first faulting special case. 
+ */ +#define DO_LDNF1(PART) \ +void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + unsigned rd = simd_data(desc); \ + void *vd = &env->vfp.zregs[rd]; \ + mmap_lock(); \ + if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \ + do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \ + } else { \ + do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \ + } \ + mmap_unlock(); \ +} + +#else + +/* TODO: System mode is not yet supported. + * This would probably use tlb_vaddr_to_host. + */ +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \ +void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + g_assert_not_reached(); \ +} + +#define DO_LDNF1(PART) \ +void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + g_assert_not_reached(); \ +} + +#endif + +DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1) +DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2) +DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2) +DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4) +DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4) +DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, ) +DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, ) + +DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2) +DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4) +DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int8_t, H1_4) +DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, ) +DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, ) + +DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4) +DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, ) +DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, ) + +DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) + +#undef DO_LDFF1 + +DO_LDNF1(bb_r) +DO_LDNF1(bhu_r) +DO_LDNF1(bhs_r) +DO_LDNF1(bsu_r) +DO_LDNF1(bss_r) 
+DO_LDNF1(bdu_r) +DO_LDNF1(bds_r) + +DO_LDNF1(hh_r) +DO_LDNF1(hsu_r) +DO_LDNF1(hss_r) +DO_LDNF1(hdu_r) +DO_LDNF1(hds_r) + +DO_LDNF1(ss_r) +DO_LDNF1(sdu_r) +DO_LDNF1(sds_r) + +DO_LDNF1(dd_r) + +#undef DO_LDNF1 diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index 3543daff48..09f77b5405 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -3647,3 +3647,72 @@ static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) } return true; } + +static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) +{ + static gen_helper_gvec_mem * const fns[16] = { + gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_r, + gen_helper_sve_ldff1hh_r, + gen_helper_sve_ldff1hsu_r, + gen_helper_sve_ldff1hdu_r, + + gen_helper_sve_ldff1hds_r, + gen_helper_sve_ldff1hss_r, + gen_helper_sve_ldff1ss_r, + gen_helper_sve_ldff1sdu_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_r, + }; + + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); + } + return true; +} + +static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + static gen_helper_gvec_mem * const fns[16] = { + gen_helper_sve_ldnf1bb_r, + gen_helper_sve_ldnf1bhu_r, + gen_helper_sve_ldnf1bsu_r, + gen_helper_sve_ldnf1bdu_r, + + gen_helper_sve_ldnf1sds_r, + gen_helper_sve_ldnf1hh_r, + gen_helper_sve_ldnf1hsu_r, + gen_helper_sve_ldnf1hdu_r, + + gen_helper_sve_ldnf1hds_r, + gen_helper_sve_ldnf1hss_r, + gen_helper_sve_ldnf1ss_r, + gen_helper_sve_ldnf1sdu_r, + + gen_helper_sve_ldnf1bds_r, + gen_helper_sve_ldnf1bss_r, + gen_helper_sve_ldnf1bhs_r, + gen_helper_sve_ldnf1dd_r, + }; + + if (sve_access_check(s)) { + 
int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + int off = (a->imm * elements) << dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off); + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); + } + return true; +} diff --git a/target/arm/sve.decode b/target/arm/sve.decode index cfb12da639..afbed57de1 100644 --- a/target/arm/sve.decode +++ b/target/arm/sve.decode @@ -685,9 +685,15 @@ LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9 # SVE contiguous load (scalar plus scalar) LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0 +# SVE contiguous first-fault load (scalar plus scalar) +LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0 + # SVE contiguous load (scalar plus immediate) LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0 +# SVE contiguous non-fault load (scalar plus immediate) +LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0 + # SVE contiguous non-temporal load (scalar plus scalar) # LDNT1B, LDNT1H, LDNT1W, LDNT1D # SVE load multiple structures (scalar plus scalar)
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/helper-sve.h | 40 ++++++++++ target/arm/sve_helper.c | 156 +++++++++++++++++++++++++++++++++++++ target/arm/translate-sve.c | 69 ++++++++++++++++ target/arm/sve.decode | 6 ++ 4 files changed, 271 insertions(+) -- 2.17.1