Message ID | 20180809034033.10579-4-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | target/arm: sve linux-user patches | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > The pseudocode for this operation is an increment + compare loop, > so comparing <= the maximum integer produces an all-true predicate. > > Rather than bound in both the inline code and the helper, pass the > helper the number of predicate bits to set instead of the number > of predicate elements to set. > > Cc: qemu-stable@nongnu.org (3.0.1) > Tested-by: Laurent Desnogues <laurent.desnogues@gmail.com> > Reviewed-by: Laurent Desnogues <laurent.desnogues@gmail.com> > Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com> > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> > --- > target/arm/sve_helper.c | 5 ---- > target/arm/translate-sve.c | 49 +++++++++++++++++++++++++------------- > 2 files changed, 32 insertions(+), 22 deletions(-) > > diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c > index 9bd0694d55..87594a8adb 100644 > --- a/target/arm/sve_helper.c > +++ b/target/arm/sve_helper.c > @@ -2846,11 +2846,6 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc) > return flags; > } > > - /* Scale from predicate element count to bits. */ > - count <<= esz; > - /* Bound to the bits in the predicate. */ > - count = MIN(count, oprsz * 8); > - > /* Set all of the requested bits. */ > for (i = 0; i < count / 64; ++i) { > d->p[i] = esz_mask; > diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c > index 9dd4c38bab..89efc80ee7 100644 > --- a/target/arm/translate-sve.c > +++ b/target/arm/translate-sve.c > @@ -3173,19 +3173,19 @@ static bool trans_CTERM(DisasContext *s, arg_CTERM *a, uint32_t insn) > > static bool trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn) > { > - if (!sve_access_check(s)) { > - return true; > - } > - > - TCGv_i64 op0 = read_cpu_reg(s, a->rn, 1); > - TCGv_i64 op1 = read_cpu_reg(s, a->rm, 1); > - TCGv_i64 t0 = tcg_temp_new_i64(); > - TCGv_i64 t1 = tcg_temp_new_i64(); > + TCGv_i64 op0, op1, t0, t1, tmax; > TCGv_i32 t2, t3; > TCGv_ptr ptr; > unsigned desc, vsz = vec_full_reg_size(s); > TCGCond cond; > > + if (!sve_access_check(s)) { > + return true; > + } > + > + op0 = read_cpu_reg(s, a->rn, 1); > + op1 = read_cpu_reg(s, a->rm, 1); > + > if (!a->sf) { > if (a->u) { > tcg_gen_ext32u_i64(op0, op0); > @@ -3198,32 +3198,47 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn) > > /* For the helper, compress the different conditions into a computation > * of how many iterations for which the condition is true. > - * > - * This is slightly complicated by 0 <= UINT64_MAX, which is nominally > - * 2**64 iterations, overflowing to 0. Of course, predicate registers > - * aren't that large, so any value >= predicate size is sufficient. > */ > + t0 = tcg_temp_new_i64(); > + t1 = tcg_temp_new_i64(); > tcg_gen_sub_i64(t0, op1, op0); > > - /* t0 = MIN(op1 - op0, vsz). */ > - tcg_gen_movi_i64(t1, vsz); > - tcg_gen_umin_i64(t0, t0, t1); > + tmax = tcg_const_i64(vsz >> a->esz); > if (a->eq) { > /* Equality means one more iteration. */ > tcg_gen_addi_i64(t0, t0, 1); > + > + /* If op1 is max (un)signed integer (and the only time the addition > + * above could overflow), then we produce an all-true predicate by > + * setting the count to the vector length. This is because the > + * pseudocode is described as an increment + compare loop, and the > + * max integer would always compare true. > + */ > + tcg_gen_movi_i64(t1, (a->sf > + ? (a->u ? UINT64_MAX : INT64_MAX) > + : (a->u ? UINT32_MAX : INT32_MAX))); > + tcg_gen_movcond_i64(TCG_COND_EQ, t0, op1, t1, tmax, t0); > } > > - /* t0 = (condition true ? t0 : 0). */ > + /* Bound to the maximum. */ > + tcg_gen_umin_i64(t0, t0, tmax); > + tcg_temp_free_i64(tmax); > + > + /* Set the count to zero if the condition is false. */ > cond = (a->u > ? (a->eq ? TCG_COND_LEU : TCG_COND_LTU) > : (a->eq ? TCG_COND_LE : TCG_COND_LT)); > tcg_gen_movi_i64(t1, 0); > tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); > + tcg_temp_free_i64(t1); > > + /* Since we're bounded, pass as a 32-bit type. */ > t2 = tcg_temp_new_i32(); > tcg_gen_extrl_i64_i32(t2, t0); > tcg_temp_free_i64(t0); > - tcg_temp_free_i64(t1); > + > + /* Scale elements to bits. */ > + tcg_gen_shli_i32(t2, t2, a->esz); > > desc = (vsz / 8) - 2; > desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); -- Alex Bennée
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index 9bd0694d55..87594a8adb 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -2846,11 +2846,6 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc) return flags; } - /* Scale from predicate element count to bits. */ - count <<= esz; - /* Bound to the bits in the predicate. */ - count = MIN(count, oprsz * 8); - /* Set all of the requested bits. */ for (i = 0; i < count / 64; ++i) { d->p[i] = esz_mask; diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index 9dd4c38bab..89efc80ee7 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -3173,19 +3173,19 @@ static bool trans_CTERM(DisasContext *s, arg_CTERM *a, uint32_t insn) static bool trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn) { - if (!sve_access_check(s)) { - return true; - } - - TCGv_i64 op0 = read_cpu_reg(s, a->rn, 1); - TCGv_i64 op1 = read_cpu_reg(s, a->rm, 1); - TCGv_i64 t0 = tcg_temp_new_i64(); - TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 op0, op1, t0, t1, tmax; TCGv_i32 t2, t3; TCGv_ptr ptr; unsigned desc, vsz = vec_full_reg_size(s); TCGCond cond; + if (!sve_access_check(s)) { + return true; + } + + op0 = read_cpu_reg(s, a->rn, 1); + op1 = read_cpu_reg(s, a->rm, 1); + if (!a->sf) { if (a->u) { tcg_gen_ext32u_i64(op0, op0); @@ -3198,32 +3198,47 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn) /* For the helper, compress the different conditions into a computation * of how many iterations for which the condition is true. - * - * This is slightly complicated by 0 <= UINT64_MAX, which is nominally - * 2**64 iterations, overflowing to 0. Of course, predicate registers - * aren't that large, so any value >= predicate size is sufficient. */ + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); tcg_gen_sub_i64(t0, op1, op0); - /* t0 = MIN(op1 - op0, vsz). */ - tcg_gen_movi_i64(t1, vsz); - tcg_gen_umin_i64(t0, t0, t1); + tmax = tcg_const_i64(vsz >> a->esz); if (a->eq) { /* Equality means one more iteration. */ tcg_gen_addi_i64(t0, t0, 1); + + /* If op1 is max (un)signed integer (and the only time the addition + * above could overflow), then we produce an all-true predicate by + * setting the count to the vector length. This is because the + * pseudocode is described as an increment + compare loop, and the + * max integer would always compare true. + */ + tcg_gen_movi_i64(t1, (a->sf + ? (a->u ? UINT64_MAX : INT64_MAX) + : (a->u ? UINT32_MAX : INT32_MAX))); + tcg_gen_movcond_i64(TCG_COND_EQ, t0, op1, t1, tmax, t0); } - /* t0 = (condition true ? t0 : 0). */ + /* Bound to the maximum. */ + tcg_gen_umin_i64(t0, t0, tmax); + tcg_temp_free_i64(tmax); + + /* Set the count to zero if the condition is false. */ cond = (a->u ? (a->eq ? TCG_COND_LEU : TCG_COND_LTU) : (a->eq ? TCG_COND_LE : TCG_COND_LT)); tcg_gen_movi_i64(t1, 0); tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); + tcg_temp_free_i64(t1); + /* Since we're bounded, pass as a 32-bit type. */ t2 = tcg_temp_new_i32(); tcg_gen_extrl_i64_i32(t2, t0); tcg_temp_free_i64(t0); - tcg_temp_free_i64(t1); + + /* Scale elements to bits. */ + tcg_gen_shli_i32(t2, t2, a->esz); desc = (vsz / 8) - 2; desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz);