diff mbox series

[07/10] target/arm: Implement v8.1M low-overhead-loop instructions

Message ID 20201012153746.9996-8-peter.maydell@linaro.org
State Accepted
Headers show
Series target/arm: Various v8.1M minor features | expand

Commit Message

Peter Maydell Oct. 12, 2020, 3:37 p.m. UTC
v8.1M's "low-overhead-loop" extension has three instructions
for looping:
 * DLS (start of a do-loop)
 * WLS (start of a while-loop)
 * LE (end of a loop)

The loop-start instructions are both simple operations to start a
loop whose iteration count (if any) is in LR.  The loop-end
instruction handles "decrement iteration count and jump back to loop
start"; it also caches the information about the branch back to the
start of the loop to improve performance of the branch on subsequent
iterations.

As with the branch-future instructions, the architecture permits an
implementation to discard the LO_BRANCH_INFO cache at any time, and
QEMU takes the IMPDEF option to never set it in the first place
(equivalent to discarding it immediately), because for us a "real"
implementation would be unnecessary complexity.

(This implementation only provides the simple looping constructs; the
vector extension MVE (Helium) adds some extra variants to handle
looping across vectors.  We'll add those later when we implement
MVE.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/t32.decode  |  8 +++++
 target/arm/translate.c | 74 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

Comments

Peter Maydell Oct. 12, 2020, 7:56 p.m. UTC | #1
On Mon, 12 Oct 2020 at 16:37, Peter Maydell <peter.maydell@linaro.org> wrote:
>

> v8.1M's "low-overhead-loop" extension has three instructions

> for looping:

>  * DLS (start of a do-loop)

>  * WLS (start of a while-loop)

>  * LE (end of a loop)

>

> +static bool trans_WLS(DisasContext *s, arg_WLS *a)

> +{

> +    /* M-profile low-overhead while-loop start */

> +    TCGv_i32 tmp;

> +    TCGLabel *nextlabel;

> +

> +    if (!dc_isar_feature(aa32_lob, s)) {

> +        return false;

> +    }

> +    if (a->rn == 13 || a->rn == 15) {

> +        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */

> +        return false;

> +    }

> +

> +    nextlabel = gen_new_label();

> +    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_R[a->rn], 0, nextlabel);

> +    gen_jmp(s, read_pc(s) + a->imm);

> +

> +    gen_set_label(nextlabel);

> +    tmp = load_reg(s, a->rn);

> +    store_reg(s, 14, tmp);

> +    gen_jmp(s, s->base.pc_next);

> +    return true;

> +}


This turns out not to work, because gen_jmp() always generates
a goto-tb for tb exit 0, and we hit the assert() that exit 0
was not used twice. Here's a fixup to fold into this patch:

--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -2490,17 +2490,23 @@ static void gen_goto_tb(DisasContext *s, int
n, target_ulong dest)
     s->base.is_jmp = DISAS_NORETURN;
 }

-static inline void gen_jmp (DisasContext *s, uint32_t dest)
+/* Jump, specifying which TB number to use if we gen_goto_tb() */
+static inline void gen_jmp_tb(DisasContext *s, uint32_t dest, int tbno)
 {
     if (unlikely(is_singlestepping(s))) {
         /* An indirect jump so that we still trigger the debug exception.  */
         gen_set_pc_im(s, dest);
         s->base.is_jmp = DISAS_JUMP;
     } else {
-        gen_goto_tb(s, 0, dest);
+        gen_goto_tb(s, tbno, dest);
     }
 }

+static inline void gen_jmp(DisasContext *s, uint32_t dest)
+{
+    gen_jmp_tb(s, dest, 0);
+}
+
 static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y)
 {
     if (x)
@@ -8023,7 +8029,16 @@ static bool trans_WLS(DisasContext *s, arg_WLS *a)
         /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
         return false;
     }
-
+    if (s->condexec_mask) {
+        /*
+         * WLS in an IT block is CONSTRAINED UNPREDICTABLE;
+         * we choose to UNDEF, because otherwise our use of
+         * gen_goto_tb(1) would clash with the use of TB exit 1
+         * in the dc->condjmp condition-failed codepath in
+         * arm_tr_tb_stop() and we'd get an assertion.
+         */
+        return false;
+    }
     nextlabel = gen_new_label();
     tcg_gen_brcondi_i32(TCG_COND_NE, cpu_R[a->rn], 0, nextlabel);
     gen_jmp(s, read_pc(s) + a->imm);
@@ -8031,7 +8046,7 @@ static bool trans_WLS(DisasContext *s, arg_WLS *a)
     gen_set_label(nextlabel);
     tmp = load_reg(s, a->rn);
     store_reg(s, 14, tmp);
-    gen_jmp(s, s->base.pc_next);
+    gen_jmp_tb(s, s->base.pc_next, 1);
     return true;
 }

thanks
-- PMM
Richard Henderson Oct. 13, 2020, 5:10 p.m. UTC | #2
On 10/12/20 12:56 PM, Peter Maydell wrote:
> On Mon, 12 Oct 2020 at 16:37, Peter Maydell <peter.maydell@linaro.org> wrote:

>>

>> v8.1M's "low-overhead-loop" extension has three instructions

>> for looping:

>>  * DLS (start of a do-loop)

>>  * WLS (start of a while-loop)

>>  * LE (end of a loop)

>>

>> +static bool trans_WLS(DisasContext *s, arg_WLS *a)

>> +{

>> +    /* M-profile low-overhead while-loop start */

>> +    TCGv_i32 tmp;

>> +    TCGLabel *nextlabel;

>> +

>> +    if (!dc_isar_feature(aa32_lob, s)) {

>> +        return false;

>> +    }

>> +    if (a->rn == 13 || a->rn == 15) {

>> +        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */

>> +        return false;

>> +    }

>> +

>> +    nextlabel = gen_new_label();

>> +    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_R[a->rn], 0, nextlabel);

>> +    gen_jmp(s, read_pc(s) + a->imm);

>> +

>> +    gen_set_label(nextlabel);

>> +    tmp = load_reg(s, a->rn);

>> +    store_reg(s, 14, tmp);

>> +    gen_jmp(s, s->base.pc_next);

>> +    return true;

>> +}

> 

> This turns out not to work, because gen_jmp() always generates

> a goto-tb for tb exit 0, and we hit the assert() that exit 0

> was not used twice. Here's a fixup to fold into this patch:


Indeed.  I was going to suggest that here you should use arm_gen_condlabel()
like you did for LE.  Which I think would be still cleaner than your fixup patch.


r~
Peter Maydell Oct. 13, 2020, 5:12 p.m. UTC | #3
On Tue, 13 Oct 2020 at 18:10, Richard Henderson
<richard.henderson@linaro.org> wrote:
>

> On 10/12/20 12:56 PM, Peter Maydell wrote:

> > On Mon, 12 Oct 2020 at 16:37, Peter Maydell <peter.maydell@linaro.org> wrote:

> > This turns out not to work, because gen_jmp() always generates

> > a goto-tb for tb exit 0, and we hit the assert() that exit 0

> > was not used twice. Here's a fixup to fold into this patch:

>

> Indeed.  I was going to suggest that here you should use arm_gen_condlabel()

> like you did for LE.  Which I think would be still cleaner than your fixup patch.


I thought about that but it doesn't really fit, because
the condlabel is for "go to the next instruction
without having done anything". Here we need to do something
on that codepath (unlike LE).

thanks
-- PMM
Richard Henderson Oct. 13, 2020, 5:30 p.m. UTC | #4
On 10/13/20 10:12 AM, Peter Maydell wrote:
> On Tue, 13 Oct 2020 at 18:10, Richard Henderson

> <richard.henderson@linaro.org> wrote:

>>

>> On 10/12/20 12:56 PM, Peter Maydell wrote:

>>> On Mon, 12 Oct 2020 at 16:37, Peter Maydell <peter.maydell@linaro.org> wrote:

>>> This turns out not to work, because gen_jmp() always generates

>>> a goto-tb for tb exit 0, and we hit the assert() that exit 0

>>> was not used twice. Here's a fixup to fold into this patch:

>>

>> Indeed.  I was going to suggest that here you should use arm_gen_condlabel()

>> like you did for LE.  Which I think would be still cleaner than your fixup patch.

> 

> I thought about that but it doesn't really fit, because

> the condlabel is for "go to the next instruction

> without having done anything". Here we need to do something

> on that codepath (unlike LE).


Ah, right.

Well, the only further comment is that, in the followup, only WLS gains the IT
block check.  While I understand that's required to avoid an abort in QEMU for
this case, all three of the insns have that case as CONSTRAINED UNPREDICTABLE.
 It might be worthwhile checking for IT in all of them, just to continue our
normal "unpredictable raises sigill, when easy" choice.


r~
Peter Maydell Oct. 13, 2020, 8:24 p.m. UTC | #5
On Tue, 13 Oct 2020 at 18:30, Richard Henderson
<richard.henderson@linaro.org> wrote:
> Well, the only further comment is that, in the followup, only WLS gains the IT

> block check.  While I understand that's required to avoid an abort in QEMU for

> this case, all three of the insns have that case as CONSTRAINED UNPREDICTABLE.

>  It might be worthwhile checking for IT in all of them, just to continue our

> normal "unpredictable raises sigill, when easy" choice.


Maybe, but there are a lot of instructions that are
unpredictable-in-an-IT-block (CPSID, CRC32B, HVC...)
and our general approach seems to have been "don't check unless
it would cause an actual problem". The only place I can find
where we do check for this case is in trans_B_cond_thumb(),
which we do for the same reason as here.

thanks
-- PMM
Richard Henderson Oct. 13, 2020, 10:31 p.m. UTC | #6
On 10/12/20 8:37 AM, Peter Maydell wrote:
> +    nextlabel = gen_new_label();

> +    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_R[a->rn], 0, nextlabel);

> +    gen_jmp(s, read_pc(s) + a->imm);

> +

> +    gen_set_label(nextlabel);

> +    tmp = load_reg(s, a->rn);

> +    store_reg(s, 14, tmp);

> +    gen_jmp(s, s->base.pc_next);

> +    return true;


Oh, fwiw, with the tcg optimization patches just posted, this branch is better
inverted.  That way the load of rn can be reused on the non-taken branch path.

Maybe sometime I'll try to propagate the data to the taken path, but that
automatically requires extra memory allocation, so it'll be difficult to do
that without a tcg slowdown.


r~
diff mbox series

Patch

diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index 3015731a8d0..8152739b52b 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -659,4 +659,12 @@  BL               1111 0. .......... 11.1 ............         @branch24
     BF           1111 0 boff:4 10 ----- 1110 - ---------- 1    # BF
     BF           1111 0 boff:4 11 ----- 1110 0 0000000000 1    # BFX, BFLX
   ]
+  [
+    # LE and WLS immediate
+    %lob_imm 1:10 11:1 !function=times_2
+
+    DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001
+    WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm
+    LE           1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
+  ]
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 9e72d719c6f..742c219c071 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -7953,6 +7953,80 @@  static bool trans_BF(DisasContext *s, arg_BF *a)
     return true;
 }
 
+static bool trans_DLS(DisasContext *s, arg_DLS *a)
+{
+    /* M-profile low-overhead loop start */
+    TCGv_i32 tmp;
+
+    if (!dc_isar_feature(aa32_lob, s)) {
+        return false;
+    }
+    if (a->rn == 13 || a->rn == 15) {
+        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+        return false;
+    }
+
+    /* Not a while loop, no tail predication: just set LR to the count */
+    tmp = load_reg(s, a->rn);
+    store_reg(s, 14, tmp);
+    return true;
+}
+
+static bool trans_WLS(DisasContext *s, arg_WLS *a)
+{
+    /* M-profile low-overhead while-loop start */
+    TCGv_i32 tmp;
+    TCGLabel *nextlabel;
+
+    if (!dc_isar_feature(aa32_lob, s)) {
+        return false;
+    }
+    if (a->rn == 13 || a->rn == 15) {
+        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+        return false;
+    }
+
+    nextlabel = gen_new_label();
+    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_R[a->rn], 0, nextlabel);
+    gen_jmp(s, read_pc(s) + a->imm);
+
+    gen_set_label(nextlabel);
+    tmp = load_reg(s, a->rn);
+    store_reg(s, 14, tmp);
+    gen_jmp(s, s->base.pc_next);
+    return true;
+}
+
+static bool trans_LE(DisasContext *s, arg_LE *a)
+{
+    /*
+     * M-profile low-overhead loop end. The architecture permits an
+     * implementation to discard the LO_BRANCH_INFO cache at any time,
+     * and we take the IMPDEF option to never set it in the first place
+     * (equivalent to always discarding it immediately), because for QEMU
+     * a "real" implementation would be complicated and wouldn't execute
+     * any faster.
+     */
+    TCGv_i32 tmp;
+
+    if (!dc_isar_feature(aa32_lob, s)) {
+        return false;
+    }
+
+    if (!a->f) {
+        /* Not loop-forever. If LR <= 1 this is the last loop: do nothing. */
+        arm_gen_condlabel(s);
+        tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, s->condlabel);
+        /* Decrement LR */
+        tmp = load_reg(s, 14);
+        tcg_gen_addi_i32(tmp, tmp, -1);
+        store_reg(s, 14, tmp);
+    }
+    /* Jump back to the loop start */
+    gen_jmp(s, read_pc(s) - a->imm);
+    return true;
+}
+
 static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half)
 {
     TCGv_i32 addr, tmp;