diff mbox series

[v2,16/22] tcg/aarch64: Reorg goto_tb implementation

Message ID 20230109014248.2894281-17-richard.henderson@linaro.org
State Superseded
Headers show
Series tcg: exit_tb tidy, goto_tb reorg | expand

Commit Message

Richard Henderson Jan. 9, 2023, 1:42 a.m. UTC
The old implementation replaces two insns, swapping between

	b	<dest>
	nop
	br	x30
and
	adrp	x30, <dest>
	addi	x30, x30, lo12:<dest>
	br	x30

There is a race condition in which a thread could be stopped at
the PC of the second insn, and when restarted does not see the
complete address computation and branches to nowhere.

The new implemetation replaces only one insn, swapping between

	b	<dest>
	br	tmp
and
	ldr	tmp, <jmp_addr>
	br	tmp

Reported-by: hev <r@hev.cc>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.h     |  3 +-
 tcg/aarch64/tcg-target.c.inc | 64 +++++++++++++++---------------------
 2 files changed, 28 insertions(+), 39 deletions(-)

Comments

Alex Bennée Jan. 17, 2023, 6:26 p.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> The old implementation replaces two insns, swapping between
>
> 	b	<dest>
> 	nop
> 	br	x30
> and
> 	adrp	x30, <dest>
> 	addi	x30, x30, lo12:<dest>
> 	br	x30
>
> There is a race condition in which a thread could be stopped at
> the PC of the second insn, and when restarted does not see the
> complete address computation and branches to nowhere.
>
> The new implemetation replaces only one insn, swapping between
>
> 	b	<dest>
> 	br	tmp
> and
> 	ldr	tmp, <jmp_addr>
> 	br	tmp
>
> Reported-by: hev <r@hev.cc>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/aarch64/tcg-target.h     |  3 +-
>  tcg/aarch64/tcg-target.c.inc | 64 +++++++++++++++---------------------
>  2 files changed, 28 insertions(+), 39 deletions(-)
>
> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
> index 6067446b03..0ba2298ea6 100644
> --- a/tcg/aarch64/tcg-target.h
> +++ b/tcg/aarch64/tcg-target.h
> @@ -15,7 +15,8 @@
>  
>  #define TCG_TARGET_INSN_UNIT_SIZE  4
>  #define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
> -#define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
> +#define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
> +#undef TCG_TARGET_STACK_GROWSUP
>  
>  typedef enum {
>      TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
> diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
> index 0b65f2cac1..1d0ebf01a5 100644
> --- a/tcg/aarch64/tcg-target.c.inc
> +++ b/tcg/aarch64/tcg-target.c.inc
> @@ -1353,33 +1353,6 @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
>      tcg_out_call_int(s, target);
>  }
>  
> -void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> -                              uintptr_t jmp_rx, uintptr_t jmp_rw)
> -{
> -    uintptr_t addr = tb->jmp_target_addr[n];
> -    tcg_insn_unit i1, i2;
> -    TCGType rt = TCG_TYPE_I64;
> -    TCGReg  rd = TCG_REG_TMP;
> -    uint64_t pair;
> -
> -    ptrdiff_t offset = addr - jmp_rx;
> -
> -    if (offset == sextract64(offset, 0, 26)) {
> -        i1 = I3206_B | ((offset >> 2) & 0x3ffffff);
> -        i2 = NOP;
> -    } else {
> -        offset = (addr >> 12) - (jmp_rx >> 12);
> -
> -        /* patch ADRP */
> -        i1 = I3406_ADRP | (offset & 3) << 29 | (offset & 0x1ffffc) << (5 - 2) | rd;
> -        /* patch ADDI */
> -        i2 = I3401_ADDI | rt << 31 | (addr & 0xfff) << 10 | rd << 5 | rd;
> -    }
> -    pair = (uint64_t)i2 << 32 | i1;
> -    qatomic_set((uint64_t *)jmp_rw, pair);
> -    flush_idcache_range(jmp_rx, jmp_rw, 8);
> -}
> -
>  static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
>  {
>      if (!l->has_value) {
> @@ -1902,23 +1875,38 @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
>  static void tcg_out_goto_tb(TCGContext *s, int which)
>  {
>      /*
> -     * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
> -     * write can be used to patch the target address.
> +     * Direct branch, or indirect address load, will be patched
> +     * by tb_target_set_jmp_target.  Assert indirect load offset
> +     * in range early, regardless of direct branch distance.
>       */
> -    if ((uintptr_t)s->code_ptr & 7) {
> -        tcg_out32(s, NOP);
> -    }
> +    intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
> +    tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
> +
>      set_jmp_insn_offset(s, which);
> -    /*
> -     * actual branch destination will be patched by
> -     * tb_target_set_jmp_target later
> -     */
> -    tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
> -    tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
> +    tcg_out32(s, I3206_B);
>      tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
>      set_jmp_reset_offset(s, which);
>  }
>  
> +void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> +                              uintptr_t jmp_rx, uintptr_t jmp_rw)
> +{
> +    uintptr_t d_addr = tb->jmp_target_addr[n];
> +    uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
> +    ptrdiff_t d_offset = d_addr - jmp_rx;
> +    ptrdiff_t i_offset = i_addr - jmp_rx;
> +    tcg_insn_unit insn;
> +
> +    /* Either directly branch, or indirect branch load. */
> +    if (d_offset == sextract64(d_offset, 0, 26)) {
> +        insn = I3206_B | ((d_offset >> 2) & 0x3ffffff);
> +    } else {
> +        insn = I3305_LDR | TCG_REG_TMP | (((i_offset >> 2) & 0x7ffff) << 5);
> +    }

Could we use deposits to build our instructions here? Also the mask
doesn't match the 24 bits you have left, bug from old code?

Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
diff mbox series

Patch

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 6067446b03..0ba2298ea6 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -15,7 +15,8 @@ 
 
 #define TCG_TARGET_INSN_UNIT_SIZE  4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
-#define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
+#define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
+#undef TCG_TARGET_STACK_GROWSUP
 
 typedef enum {
     TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 0b65f2cac1..1d0ebf01a5 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1353,33 +1353,6 @@  static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
     tcg_out_call_int(s, target);
 }
 
-void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
-                              uintptr_t jmp_rx, uintptr_t jmp_rw)
-{
-    uintptr_t addr = tb->jmp_target_addr[n];
-    tcg_insn_unit i1, i2;
-    TCGType rt = TCG_TYPE_I64;
-    TCGReg  rd = TCG_REG_TMP;
-    uint64_t pair;
-
-    ptrdiff_t offset = addr - jmp_rx;
-
-    if (offset == sextract64(offset, 0, 26)) {
-        i1 = I3206_B | ((offset >> 2) & 0x3ffffff);
-        i2 = NOP;
-    } else {
-        offset = (addr >> 12) - (jmp_rx >> 12);
-
-        /* patch ADRP */
-        i1 = I3406_ADRP | (offset & 3) << 29 | (offset & 0x1ffffc) << (5 - 2) | rd;
-        /* patch ADDI */
-        i2 = I3401_ADDI | rt << 31 | (addr & 0xfff) << 10 | rd << 5 | rd;
-    }
-    pair = (uint64_t)i2 << 32 | i1;
-    qatomic_set((uint64_t *)jmp_rw, pair);
-    flush_idcache_range(jmp_rx, jmp_rw, 8);
-}
-
 static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
 {
     if (!l->has_value) {
@@ -1902,23 +1875,38 @@  static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
 static void tcg_out_goto_tb(TCGContext *s, int which)
 {
     /*
-     * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
-     * write can be used to patch the target address.
+     * Direct branch, or indirect address load, will be patched
+     * by tb_target_set_jmp_target.  Assert indirect load offset
+     * in range early, regardless of direct branch distance.
      */
-    if ((uintptr_t)s->code_ptr & 7) {
-        tcg_out32(s, NOP);
-    }
+    intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
+    tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
+
     set_jmp_insn_offset(s, which);
-    /*
-     * actual branch destination will be patched by
-     * tb_target_set_jmp_target later
-     */
-    tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
-    tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
+    tcg_out32(s, I3206_B);
     tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
     set_jmp_reset_offset(s, which);
 }
 
+void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
+                              uintptr_t jmp_rx, uintptr_t jmp_rw)
+{
+    uintptr_t d_addr = tb->jmp_target_addr[n];
+    uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
+    ptrdiff_t d_offset = d_addr - jmp_rx;
+    ptrdiff_t i_offset = i_addr - jmp_rx;
+    tcg_insn_unit insn;
+
+    /* Either directly branch, or indirect branch load. */
+    if (d_offset == sextract64(d_offset, 0, 26)) {
+        insn = I3206_B | ((d_offset >> 2) & 0x3ffffff);
+    } else {
+        insn = I3305_LDR | TCG_REG_TMP | (((i_offset >> 2) & 0x7ffff) << 5);
+    }
+    qatomic_set((uint32_t *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
+}
+
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                        const TCGArg args[TCG_MAX_OP_ARGS],
                        const int const_args[TCG_MAX_OP_ARGS])