diff mbox series

[v2,20/22] tcg/arm: Implement direct branch for goto_tb

Message ID 20230109014248.2894281-21-richard.henderson@linaro.org
State Superseded
Headers show
Series tcg: exit_tb tidy, goto_tb reorg | expand

Commit Message

Richard Henderson Jan. 9, 2023, 1:42 a.m. UTC
Now that tcg can handle direct and indirect goto_tb
simultaneously, we can optimistically leave space for
a direct branch and fall back to loading the pointer
from the TB for an indirect branch.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target.c.inc | 52 ++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 15 deletions(-)

Comments

Alex Bennée Jan. 17, 2023, 6:33 p.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> Now that tcg can handle direct and indirect goto_tb
> simultaneously, we can optimistically leave space for
> a direct branch and fall back to loading the pointer
> from the TB for an indirect branch.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/arm/tcg-target.c.inc | 52 ++++++++++++++++++++++++++++------------
>  1 file changed, 37 insertions(+), 15 deletions(-)
>
> diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
> index e1e1c2620d..794ed8c3a2 100644
> --- a/tcg/arm/tcg-target.c.inc
> +++ b/tcg/arm/tcg-target.c.inc
> @@ -135,6 +135,8 @@ typedef enum {
>      ARITH_BIC = 0xe << 21,
>      ARITH_MVN = 0xf << 21,
>  
> +    INSN_B         = 0x0a000000,
> +
>      INSN_CLZ       = 0x016f0f10,
>      INSN_RBIT      = 0x06ff0f30,
>  
> @@ -546,7 +548,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>  
>  static void tcg_out_b_imm(TCGContext *s, ARMCond cond, int32_t offset)
>  {
> -    tcg_out32(s, (cond << 28) | 0x0a000000 |
> +    tcg_out32(s, (cond << 28) | INSN_B |
>                      (((offset - 8) >> 2) & 0x00ffffff));

deposit32?

>  }
>  
> @@ -1941,32 +1943,52 @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
>  
>  static void tcg_out_goto_tb(TCGContext *s, int which)
>  {
> -    /* Indirect jump method */
> -    intptr_t ptr, dif, dil;
> -    TCGReg base = TCG_REG_PC;
> +    uintptr_t i_addr;
> +    intptr_t i_disp;
>  
> -    ptr = get_jmp_target_addr(s, which);
> -    dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
> -    dil = sextract32(dif, 0, 12);
> -    if (dif != dil) {
> +    /* Direct branch will be patched by tb_target_set_jmp_target. */
> +    set_jmp_insn_offset(s, which);
> +    tcg_out32(s, INSN_NOP);
> +
> +    /* When branch is out of range, fall through to indirect. */
> +    i_addr = get_jmp_target_addr(s, which);
> +    i_disp = tcg_pcrel_diff(s, (void *)i_addr) - 8;
> +    tcg_debug_assert(i_disp < 0);
> +    if (i_disp >= -0xfff) {
> +        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, i_disp);
> +    } else {
>          /*
>           * The TB is close, but outside the 12 bits addressable by
>           * the load.  We can extend this to 20 bits with a sub of a
> -         * shifted immediate from pc.  In the vastly unlikely event
> -         * the code requires more than 1MB, we'll use 2 insns and
> -         * be no worse off.
> +         * shifted immediate from pc.
>           */
> -        base = TCG_REG_R0;
> -        tcg_out_movi32(s, COND_AL, base, ptr - dil);
> +        int h = -i_disp;
> +        int l = h & 0xfff;
> +
> +        h = encode_imm_nofail(h - l);
> +        tcg_out_dat_imm(s, COND_AL, ARITH_SUB, TCG_REG_R0, TCG_REG_PC, h);
> +        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, l);
>      }
> -    tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
>      set_jmp_reset_offset(s, which);
>  }
>  
>  void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>                                uintptr_t jmp_rx, uintptr_t jmp_rw)
>  {
> -    /* Always indirect, nothing to do */
> +    uintptr_t addr = tb->jmp_target_addr[n];
> +    ptrdiff_t offset = addr - (jmp_rx + 8);
> +    tcg_insn_unit insn;
> +
> +    /* Either directly branch, or fall through to indirect branch. */
> +    if (offset == sextract64(offset, 0, 26)) {
> +        /* B <addr> */
> +        insn = (COND_AL << 28) | INSN_B | ((offset >> 2) & 0x00ffffff);

deposit32

> +    } else {
> +        insn = INSN_NOP;
> +    }
> +
> +    qatomic_set((uint32_t *)jmp_rw, insn);
> +    flush_idcache_range(jmp_rx, jmp_rw, 4);
>  }
>  
>  static void tcg_out_op(TCGContext *s, TCGOpcode opc,

Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
diff mbox series

Patch

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index e1e1c2620d..794ed8c3a2 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -135,6 +135,8 @@  typedef enum {
     ARITH_BIC = 0xe << 21,
     ARITH_MVN = 0xf << 21,
 
+    INSN_B         = 0x0a000000,
+
     INSN_CLZ       = 0x016f0f10,
     INSN_RBIT      = 0x06ff0f30,
 
@@ -546,7 +548,7 @@  static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 
 static void tcg_out_b_imm(TCGContext *s, ARMCond cond, int32_t offset)
 {
-    tcg_out32(s, (cond << 28) | 0x0a000000 |
+    tcg_out32(s, (cond << 28) | INSN_B |
                     (((offset - 8) >> 2) & 0x00ffffff));
 }
 
@@ -1941,32 +1943,52 @@  static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
 
 static void tcg_out_goto_tb(TCGContext *s, int which)
 {
-    /* Indirect jump method */
-    intptr_t ptr, dif, dil;
-    TCGReg base = TCG_REG_PC;
+    uintptr_t i_addr;
+    intptr_t i_disp;
 
-    ptr = get_jmp_target_addr(s, which);
-    dif = tcg_pcrel_diff(s, (void *)ptr) - 8;
-    dil = sextract32(dif, 0, 12);
-    if (dif != dil) {
+    /* Direct branch will be patched by tb_target_set_jmp_target. */
+    set_jmp_insn_offset(s, which);
+    tcg_out32(s, INSN_NOP);
+
+    /* When branch is out of range, fall through to indirect. */
+    i_addr = get_jmp_target_addr(s, which);
+    i_disp = tcg_pcrel_diff(s, (void *)i_addr) - 8;
+    tcg_debug_assert(i_disp < 0);
+    if (i_disp >= -0xfff) {
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, i_disp);
+    } else {
         /*
          * The TB is close, but outside the 12 bits addressable by
          * the load.  We can extend this to 20 bits with a sub of a
-         * shifted immediate from pc.  In the vastly unlikely event
-         * the code requires more than 1MB, we'll use 2 insns and
-         * be no worse off.
+         * shifted immediate from pc.
          */
-        base = TCG_REG_R0;
-        tcg_out_movi32(s, COND_AL, base, ptr - dil);
+        int h = -i_disp;
+        int l = h & 0xfff;
+
+        h = encode_imm_nofail(h - l);
+        tcg_out_dat_imm(s, COND_AL, ARITH_SUB, TCG_REG_R0, TCG_REG_PC, h);
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, l);
     }
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
     set_jmp_reset_offset(s, which);
 }
 
 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
                               uintptr_t jmp_rx, uintptr_t jmp_rw)
 {
-    /* Always indirect, nothing to do */
+    uintptr_t addr = tb->jmp_target_addr[n];
+    ptrdiff_t offset = addr - (jmp_rx + 8);
+    tcg_insn_unit insn;
+
+    /* Either directly branch, or fall through to indirect branch. */
+    if (offset == sextract64(offset, 0, 26)) {
+        /* B <addr> */
+        insn = (COND_AL << 28) | INSN_B | ((offset >> 2) & 0x00ffffff);
+    } else {
+        insn = INSN_NOP;
+    }
+
+    qatomic_set((uint32_t *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,