diff mbox series

[PULL,15/20] include/hw/core: Create struct CPUJumpCache

Message ID 20221004195241.46491-16-richard.henderson@linaro.org
State New
Headers show
Series [PULL,01/20] cpu: cache CPUClass in CPUState for hot code paths | expand

Commit Message

Richard Henderson Oct. 4, 2022, 7:52 p.m. UTC
Wrap the bare TranslationBlock pointer into a structure.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tb-hash.h       |  1 +
 accel/tcg/tb-jmp-cache.h  | 24 ++++++++++++++++++++++++
 include/exec/cpu-common.h |  1 +
 include/hw/core/cpu.h     | 15 +--------------
 include/qemu/typedefs.h   |  1 +
 accel/stubs/tcg-stub.c    |  4 ++++
 accel/tcg/cpu-exec.c      | 10 +++++++---
 accel/tcg/cputlb.c        |  9 +++++----
 accel/tcg/translate-all.c | 28 +++++++++++++++++++++++++---
 hw/core/cpu-common.c      |  3 +--
 plugins/core.c            |  2 +-
 trace/control-target.c    |  2 +-
 12 files changed, 72 insertions(+), 28 deletions(-)
 create mode 100644 accel/tcg/tb-jmp-cache.h

Comments

Ilya Leoshkevich Oct. 27, 2022, 2:18 p.m. UTC | #1
On Tue, Oct 04, 2022 at 12:52:36PM -0700, Richard Henderson wrote:
> Wrap the bare TranslationBlock pointer into a structure.
> 
> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  accel/tcg/tb-hash.h       |  1 +
>  accel/tcg/tb-jmp-cache.h  | 24 ++++++++++++++++++++++++
>  include/exec/cpu-common.h |  1 +
>  include/hw/core/cpu.h     | 15 +--------------
>  include/qemu/typedefs.h   |  1 +
>  accel/stubs/tcg-stub.c    |  4 ++++
>  accel/tcg/cpu-exec.c      | 10 +++++++---
>  accel/tcg/cputlb.c        |  9 +++++----
>  accel/tcg/translate-all.c | 28 +++++++++++++++++++++++++---
>  hw/core/cpu-common.c      |  3 +--
>  plugins/core.c            |  2 +-
>  trace/control-target.c    |  2 +-
>  12 files changed, 72 insertions(+), 28 deletions(-)
>  create mode 100644 accel/tcg/tb-jmp-cache.h

Hi,

After this patch, I get:

    qemu-s390x: qemu/include/qemu/rcu.h:102: rcu_read_unlock: Assertion `p_rcu_reader->depth != 0' failed.

in one of the wasmtime tests (host=x86_64, guest=s390x).
GDB shows that the root cause is actually this:

    Thread 181 "wasi_tokio::pat" received signal SIGSEGV, Segmentation fault.
    [Switching to Thread 0x7ffff6c54640 (LWP 168352)]
    0x0000555555626736 in do_tb_phys_invalidate (tb=tb@entry=0x7fffea4b8500 <code_gen_buffer+38503635>, rm_from_page_list=rm_from_page_list@entry=true) at qemu/accel/tcg/translate-all.c:1192
    1192	        if (qatomic_read(&jc->array[h].tb) == tb) {
    (gdb) bt
    #0  0x0000555555626736 in do_tb_phys_invalidate (tb=tb@entry=0x7fffea4b8500 <code_gen_buffer+38503635>, rm_from_page_list=rm_from_page_list@entry=true) at qemu/accel/tcg/translate-all.c:1192
    #1  0x0000555555626b98 in tb_phys_invalidate__locked (tb=0x7fffea4b8500 <code_gen_buffer+38503635>) at qemu/accel/tcg/translate-all.c:1211
    #2  tb_invalidate_phys_page_range__locked (p=<optimized out>, start=start@entry=836716683264, end=end@entry=836716687360, retaddr=0, pages=0x0) at qemu/accel/tcg/translate-all.c:1678
    #3  0x0000555555626dfb in tb_invalidate_phys_range (start=836716683264, start@entry=836716584960, end=end@entry=836716982272) at qemu/accel/tcg/translate-all.c:1753
    #4  0x0000555555639e43 in target_munmap (start=start@entry=836716584960, len=len@entry=397312) at qemu/linux-user/mmap.c:769

Let me know if you need more information, I can try to extract a
minimal reproducer.

Best regards,
Ilya
Ilya Leoshkevich Oct. 27, 2022, 2:44 p.m. UTC | #2
On Thu, Oct 27, 2022 at 04:18:56PM +0200, Ilya Leoshkevich wrote:
> On Tue, Oct 04, 2022 at 12:52:36PM -0700, Richard Henderson wrote:
> > Wrap the bare TranslationBlock pointer into a structure.
> > 
> > Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> > Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
> > Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> > ---
> >  accel/tcg/tb-hash.h       |  1 +
> >  accel/tcg/tb-jmp-cache.h  | 24 ++++++++++++++++++++++++
> >  include/exec/cpu-common.h |  1 +
> >  include/hw/core/cpu.h     | 15 +--------------
> >  include/qemu/typedefs.h   |  1 +
> >  accel/stubs/tcg-stub.c    |  4 ++++
> >  accel/tcg/cpu-exec.c      | 10 +++++++---
> >  accel/tcg/cputlb.c        |  9 +++++----
> >  accel/tcg/translate-all.c | 28 +++++++++++++++++++++++++---
> >  hw/core/cpu-common.c      |  3 +--
> >  plugins/core.c            |  2 +-
> >  trace/control-target.c    |  2 +-
> >  12 files changed, 72 insertions(+), 28 deletions(-)
> >  create mode 100644 accel/tcg/tb-jmp-cache.h
> 
> Hi,
> 
> After this patch, I get:
> 
>     qemu-s390x: qemu/include/qemu/rcu.h:102: rcu_read_unlock: Assertion `p_rcu_reader->depth != 0' failed.
> 
> in one of the wasmtime tests (host=x86_64, guest=s390x).
> GDB shows that the root cause is actually this:
> 
>     Thread 181 "wasi_tokio::pat" received signal SIGSEGV, Segmentation fault.
>     [Switching to Thread 0x7ffff6c54640 (LWP 168352)]
>     0x0000555555626736 in do_tb_phys_invalidate (tb=tb@entry=0x7fffea4b8500 <code_gen_buffer+38503635>, rm_from_page_list=rm_from_page_list@entry=true) at qemu/accel/tcg/translate-all.c:1192
>     1192	        if (qatomic_read(&jc->array[h].tb) == tb) {
>     (gdb) bt
>     #0  0x0000555555626736 in do_tb_phys_invalidate (tb=tb@entry=0x7fffea4b8500 <code_gen_buffer+38503635>, rm_from_page_list=rm_from_page_list@entry=true) at qemu/accel/tcg/translate-all.c:1192
>     #1  0x0000555555626b98 in tb_phys_invalidate__locked (tb=0x7fffea4b8500 <code_gen_buffer+38503635>) at qemu/accel/tcg/translate-all.c:1211
>     #2  tb_invalidate_phys_page_range__locked (p=<optimized out>, start=start@entry=836716683264, end=end@entry=836716687360, retaddr=0, pages=0x0) at qemu/accel/tcg/translate-all.c:1678
>     #3  0x0000555555626dfb in tb_invalidate_phys_range (start=836716683264, start@entry=836716584960, end=end@entry=836716982272) at qemu/accel/tcg/translate-all.c:1753
>     #4  0x0000555555639e43 in target_munmap (start=start@entry=836716584960, len=len@entry=397312) at qemu/linux-user/mmap.c:769
> 
> Let me know if you need more information, I can try to extract a
> minimal reproducer.
> 
> Best regards,
> Ilya

Putting CPUJumpCache inside CPUState made problem go away:

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 18ca701b443..3ea528566c3 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -32,6 +32,7 @@
 #include "qemu/thread.h"
 #include "qemu/plugin.h"
 #include "qom/object.h"
+#include "accel/tcg/tb-jmp-cache.h"
 
 typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size,
                                      void *opaque);
@@ -366,7 +367,7 @@ struct CPUState {
     CPUArchState *env_ptr;
     IcountDecr *icount_decr_ptr;
 
-    CPUJumpCache *tb_jmp_cache;
+    CPUJumpCache tb_jmp_cache;
 
     struct GDBRegisterState *gdb_regs;
     int gdb_num_regs;
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 2d7e610ee24..47165fc03e3 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -253,7 +253,7 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
     tcg_debug_assert(!(cflags & CF_INVALID));
 
     hash = tb_jmp_cache_hash_func(pc);
-    tb = qatomic_rcu_read(&cpu->tb_jmp_cache->array[hash].tb);
+    tb = qatomic_rcu_read(&cpu->tb_jmp_cache.array[hash].tb);
 
     if (likely(tb &&
                tb->pc == pc &&
@@ -267,7 +267,7 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
     if (tb == NULL) {
         return NULL;
     }
-    qatomic_set(&cpu->tb_jmp_cache->array[hash].tb, tb);
+    qatomic_set(&cpu->tb_jmp_cache.array[hash].tb, tb);
     return tb;
 }
 
@@ -998,7 +998,7 @@ int cpu_exec(CPUState *cpu)
                  * for the fast lookup
                  */
                 h = tb_jmp_cache_hash_func(pc);
-                qatomic_set(&cpu->tb_jmp_cache->array[h].tb, tb);
+                qatomic_set(&cpu->tb_jmp_cache.array[h].tb, tb);
             }
 
 #ifndef CONFIG_USER_ONLY
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 63ecc152366..fffd9cb15f8 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1188,7 +1188,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
     /* remove the TB from the hash list */
     h = tb_jmp_cache_hash_func(tb->pc);
     CPU_FOREACH(cpu) {
-        CPUJumpCache *jc = cpu->tb_jmp_cache;
+        CPUJumpCache *jc = &cpu->tb_jmp_cache;
         if (qatomic_read(&jc->array[h].tb) == tb) {
             qatomic_set(&jc->array[h].tb, NULL);
         }
@@ -2445,23 +2445,12 @@ int page_unprotect(target_ulong address, uintptr_t pc)
 }
 #endif /* CONFIG_USER_ONLY */
 
-/*
- * Called by generic code at e.g. cpu reset after cpu creation,
- * therefore we must be prepared to allocate the jump cache.
- */
 void tcg_flush_jmp_cache(CPUState *cpu)
 {
-    CPUJumpCache *jc = cpu->tb_jmp_cache;
+    CPUJumpCache *jc = &cpu->tb_jmp_cache;
 
-    if (likely(jc)) {
-        for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
-            qatomic_set(&jc->array[i].tb, NULL);
-        }
-    } else {
-        /* This should happen once during realize, and thus never race. */
-        jc = g_new0(CPUJumpCache, 1);
-        jc = qatomic_xchg(&cpu->tb_jmp_cache, jc);
-        assert(jc == NULL);
+    for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
+        qatomic_set(&jc->array[i].tb, NULL);
     }
 }
 
So there must be a race in tcg_flush_jmp_cache() after all?

Best regards,
Ilya
Richard Henderson Oct. 27, 2022, 8:46 p.m. UTC | #3
On 10/28/22 00:18, Ilya Leoshkevich wrote:
> in one of the wasmtime tests (host=x86_64, guest=s390x).
> GDB shows that the root cause is actually this:
> 
>      Thread 181 "wasi_tokio::pat" received signal SIGSEGV, Segmentation fault.
>      [Switching to Thread 0x7ffff6c54640 (LWP 168352)]
>      0x0000555555626736 in do_tb_phys_invalidate (tb=tb@entry=0x7fffea4b8500 <code_gen_buffer+38503635>, rm_from_page_list=rm_from_page_list@entry=true) at qemu/accel/tcg/translate-all.c:1192
>      1192	        if (qatomic_read(&jc->array[h].tb) == tb) {
>      (gdb) bt
>      #0  0x0000555555626736 in do_tb_phys_invalidate (tb=tb@entry=0x7fffea4b8500 <code_gen_buffer+38503635>, rm_from_page_list=rm_from_page_list@entry=true) at qemu/accel/tcg/translate-all.c:1192
>      #1  0x0000555555626b98 in tb_phys_invalidate__locked (tb=0x7fffea4b8500 <code_gen_buffer+38503635>) at qemu/accel/tcg/translate-all.c:1211
>      #2  tb_invalidate_phys_page_range__locked (p=<optimized out>, start=start@entry=836716683264, end=end@entry=836716687360, retaddr=0, pages=0x0) at qemu/accel/tcg/translate-all.c:1678
>      #3  0x0000555555626dfb in tb_invalidate_phys_range (start=836716683264, start@entry=836716584960, end=end@entry=836716982272) at qemu/accel/tcg/translate-all.c:1753
>      #4  0x0000555555639e43 in target_munmap (start=start@entry=836716584960, len=len@entry=397312) at qemu/linux-user/mmap.c:769
> 
> Let me know if you need more information, I can try to extract a
> minimal reproducer.

A reproducer would be helpful.


r~
Richard Henderson Oct. 27, 2022, 9:02 p.m. UTC | #4
On 10/28/22 00:44, Ilya Leoshkevich wrote:
> Putting CPUJumpCache inside CPUState made problem go away:
> 
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index 18ca701b443..3ea528566c3 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -32,6 +32,7 @@
>   #include "qemu/thread.h"
>   #include "qemu/plugin.h"
>   #include "qom/object.h"
> +#include "accel/tcg/tb-jmp-cache.h"
>   
>   typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size,
>                                        void *opaque);
> @@ -366,7 +367,7 @@ struct CPUState {
>       CPUArchState *env_ptr;
>       IcountDecr *icount_decr_ptr;
>   
> -    CPUJumpCache *tb_jmp_cache;
> +    CPUJumpCache tb_jmp_cache;

Yes, well.  That structure is quite large (128kB?) and I had been hoping to (1) save that 
extra memory for e.g. KVM and (2) hide the tcg-specific stuff from core.

But clearly something went wrong during some threadedness with your test case.


>   void tcg_flush_jmp_cache(CPUState *cpu)
>   {
> -    CPUJumpCache *jc = cpu->tb_jmp_cache;
>   
> -    if (likely(jc)) {
> -        for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
> -            qatomic_set(&jc->array[i].tb, NULL);
> -        }
> -    } else {
> -        /* This should happen once during realize, and thus never race. */
> -        jc = g_new0(CPUJumpCache, 1);
> -        jc = qatomic_xchg(&cpu->tb_jmp_cache, jc);
> -        assert(jc == NULL);
>       }
>   }
>   
> So there must be a race in tcg_flush_jmp_cache() after all?

If there had been a race here, we would abort with the assert.
It must be something else...


r~
diff mbox series

Patch

diff --git a/accel/tcg/tb-hash.h b/accel/tcg/tb-hash.h
index 0a273d9605..83dc610e4c 100644
--- a/accel/tcg/tb-hash.h
+++ b/accel/tcg/tb-hash.h
@@ -23,6 +23,7 @@ 
 #include "exec/cpu-defs.h"
 #include "exec/exec-all.h"
 #include "qemu/xxhash.h"
+#include "tb-jmp-cache.h"
 
 #ifdef CONFIG_SOFTMMU
 
diff --git a/accel/tcg/tb-jmp-cache.h b/accel/tcg/tb-jmp-cache.h
new file mode 100644
index 0000000000..2d8fbb1bfe
--- /dev/null
+++ b/accel/tcg/tb-jmp-cache.h
@@ -0,0 +1,24 @@ 
+/*
+ * The per-CPU TranslationBlock jump cache.
+ *
+ *  Copyright (c) 2003 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef ACCEL_TCG_TB_JMP_CACHE_H
+#define ACCEL_TCG_TB_JMP_CACHE_H
+
+#define TB_JMP_CACHE_BITS 12
+#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
+
+/*
+ * Accessed in parallel; all accesses to 'tb' must be atomic.
+ */
+struct CPUJumpCache {
+    struct {
+        TranslationBlock *tb;
+    } array[TB_JMP_CACHE_SIZE];
+};
+
+#endif /* ACCEL_TCG_TB_JMP_CACHE_H */
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index d909429427..c493510ee9 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -38,6 +38,7 @@  void cpu_list_unlock(void);
 unsigned int cpu_list_generation_id_get(void);
 
 void tcg_flush_softmmu_tlb(CPUState *cs);
+void tcg_flush_jmp_cache(CPUState *cs);
 
 void tcg_iommu_init_notifier_list(CPUState *cpu);
 void tcg_iommu_free_notifier_list(CPUState *cpu);
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 009dc0d336..18ca701b44 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -236,9 +236,6 @@  struct kvm_run;
 struct hax_vcpu_state;
 struct hvf_vcpu_state;
 
-#define TB_JMP_CACHE_BITS 12
-#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
-
 /* work queue */
 
 /* The union type allows passing of 64 bit target pointers on 32 bit
@@ -369,8 +366,7 @@  struct CPUState {
     CPUArchState *env_ptr;
     IcountDecr *icount_decr_ptr;
 
-    /* Accessed in parallel; all accesses must be atomic */
-    TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
+    CPUJumpCache *tb_jmp_cache;
 
     struct GDBRegisterState *gdb_regs;
     int gdb_num_regs;
@@ -456,15 +452,6 @@  extern CPUTailQ cpus;
 
 extern __thread CPUState *current_cpu;
 
-static inline void cpu_tb_jmp_cache_clear(CPUState *cpu)
-{
-    unsigned int i;
-
-    for (i = 0; i < TB_JMP_CACHE_SIZE; i++) {
-        qatomic_set(&cpu->tb_jmp_cache[i], NULL);
-    }
-}
-
 /**
  * qemu_tcg_mttcg_enabled:
  * Check whether we are running MultiThread TCG or not.
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index a4aee238c7..5f95169827 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -41,6 +41,7 @@  typedef struct CoMutex CoMutex;
 typedef struct ConfidentialGuestSupport ConfidentialGuestSupport;
 typedef struct CPUAddressSpace CPUAddressSpace;
 typedef struct CPUArchState CPUArchState;
+typedef struct CPUJumpCache CPUJumpCache;
 typedef struct CPUState CPUState;
 typedef struct CPUTLBEntryFull CPUTLBEntryFull;
 typedef struct DeviceListener DeviceListener;
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
index 6ce8a34228..c1b05767c0 100644
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -21,6 +21,10 @@  void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
 {
 }
 
+void tcg_flush_jmp_cache(CPUState *cpu)
+{
+}
+
 int probe_access_flags(CPUArchState *env, target_ulong addr,
                        MMUAccessType access_type, int mmu_idx,
                        bool nonfault, void **phost, uintptr_t retaddr)
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index dd58a144a8..2d7e610ee2 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -42,6 +42,7 @@ 
 #include "sysemu/replay.h"
 #include "sysemu/tcg.h"
 #include "exec/helper-proto.h"
+#include "tb-jmp-cache.h"
 #include "tb-hash.h"
 #include "tb-context.h"
 #include "internal.h"
@@ -252,7 +253,7 @@  static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
     tcg_debug_assert(!(cflags & CF_INVALID));
 
     hash = tb_jmp_cache_hash_func(pc);
-    tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
+    tb = qatomic_rcu_read(&cpu->tb_jmp_cache->array[hash].tb);
 
     if (likely(tb &&
                tb->pc == pc &&
@@ -266,7 +267,7 @@  static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
     if (tb == NULL) {
         return NULL;
     }
-    qatomic_set(&cpu->tb_jmp_cache[hash], tb);
+    qatomic_set(&cpu->tb_jmp_cache->array[hash].tb, tb);
     return tb;
 }
 
@@ -987,6 +988,8 @@  int cpu_exec(CPUState *cpu)
 
             tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
             if (tb == NULL) {
+                uint32_t h;
+
                 mmap_lock();
                 tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
                 mmap_unlock();
@@ -994,7 +997,8 @@  int cpu_exec(CPUState *cpu)
                  * We add the TB in the virtual pc hash table
                  * for the fast lookup
                  */
-                qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
+                h = tb_jmp_cache_hash_func(pc);
+                qatomic_set(&cpu->tb_jmp_cache->array[h].tb, tb);
             }
 
 #ifndef CONFIG_USER_ONLY
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index c7909fb619..6f1c00682b 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -100,10 +100,11 @@  static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
 
 static void tb_jmp_cache_clear_page(CPUState *cpu, target_ulong page_addr)
 {
-    unsigned int i, i0 = tb_jmp_cache_hash_page(page_addr);
+    int i, i0 = tb_jmp_cache_hash_page(page_addr);
+    CPUJumpCache *jc = cpu->tb_jmp_cache;
 
     for (i = 0; i < TB_JMP_PAGE_SIZE; i++) {
-        qatomic_set(&cpu->tb_jmp_cache[i0 + i], NULL);
+        qatomic_set(&jc->array[i0 + i].tb, NULL);
     }
 }
 
@@ -356,7 +357,7 @@  static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 
     qemu_spin_unlock(&env_tlb(env)->c.lock);
 
-    cpu_tb_jmp_cache_clear(cpu);
+    tcg_flush_jmp_cache(cpu);
 
     if (to_clean == ALL_MMUIDX_BITS) {
         qatomic_set(&env_tlb(env)->c.full_flush_count,
@@ -785,7 +786,7 @@  static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
      * longer to clear each entry individually than it will to clear it all.
      */
     if (d.len >= (TARGET_PAGE_SIZE * TB_JMP_CACHE_SIZE)) {
-        cpu_tb_jmp_cache_clear(cpu);
+        tcg_flush_jmp_cache(cpu);
         return;
     }
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 3a63113c41..63ecc15236 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -58,6 +58,7 @@ 
 #include "sysemu/tcg.h"
 #include "qapi/error.h"
 #include "hw/core/tcg-cpu-ops.h"
+#include "tb-jmp-cache.h"
 #include "tb-hash.h"
 #include "tb-context.h"
 #include "internal.h"
@@ -967,7 +968,7 @@  static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
     }
 
     CPU_FOREACH(cpu) {
-        cpu_tb_jmp_cache_clear(cpu);
+        tcg_flush_jmp_cache(cpu);
     }
 
     qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
@@ -1187,8 +1188,9 @@  static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
     /* remove the TB from the hash list */
     h = tb_jmp_cache_hash_func(tb->pc);
     CPU_FOREACH(cpu) {
-        if (qatomic_read(&cpu->tb_jmp_cache[h]) == tb) {
-            qatomic_set(&cpu->tb_jmp_cache[h], NULL);
+        CPUJumpCache *jc = cpu->tb_jmp_cache;
+        if (qatomic_read(&jc->array[h].tb) == tb) {
+            qatomic_set(&jc->array[h].tb, NULL);
         }
     }
 
@@ -2443,6 +2445,26 @@  int page_unprotect(target_ulong address, uintptr_t pc)
 }
 #endif /* CONFIG_USER_ONLY */
 
+/*
+ * Called by generic code at e.g. cpu reset after cpu creation,
+ * therefore we must be prepared to allocate the jump cache.
+ */
+void tcg_flush_jmp_cache(CPUState *cpu)
+{
+    CPUJumpCache *jc = cpu->tb_jmp_cache;
+
+    if (likely(jc)) {
+        for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
+            qatomic_set(&jc->array[i].tb, NULL);
+        }
+    } else {
+        /* This should happen once during realize, and thus never race. */
+        jc = g_new0(CPUJumpCache, 1);
+        jc = qatomic_xchg(&cpu->tb_jmp_cache, jc);
+        assert(jc == NULL);
+    }
+}
+
 /* This is a wrapper for common code that can not use CONFIG_SOFTMMU */
 void tcg_flush_softmmu_tlb(CPUState *cs)
 {
diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
index 9e3241b430..f9fdd46b9d 100644
--- a/hw/core/cpu-common.c
+++ b/hw/core/cpu-common.c
@@ -137,8 +137,7 @@  static void cpu_common_reset(DeviceState *dev)
     cpu->cflags_next_tb = -1;
 
     if (tcg_enabled()) {
-        cpu_tb_jmp_cache_clear(cpu);
-
+        tcg_flush_jmp_cache(cpu);
         tcg_flush_softmmu_tlb(cpu);
     }
 }
diff --git a/plugins/core.c b/plugins/core.c
index 792262da08..c3ae284994 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -56,7 +56,7 @@  struct qemu_plugin_ctx *plugin_id_to_ctx_locked(qemu_plugin_id_t id)
 static void plugin_cpu_update__async(CPUState *cpu, run_on_cpu_data data)
 {
     bitmap_copy(cpu->plugin_mask, &data.host_ulong, QEMU_PLUGIN_EV_MAX);
-    cpu_tb_jmp_cache_clear(cpu);
+    tcg_flush_jmp_cache(cpu);
 }
 
 static void plugin_cpu_update__locked(gpointer k, gpointer v, gpointer udata)
diff --git a/trace/control-target.c b/trace/control-target.c
index 8418673c18..232c97a4a1 100644
--- a/trace/control-target.c
+++ b/trace/control-target.c
@@ -65,7 +65,7 @@  static void trace_event_synchronize_vcpu_state_dynamic(
 {
     bitmap_copy(vcpu->trace_dstate, vcpu->trace_dstate_delayed,
                 CPU_TRACE_DSTATE_MAX_EVENTS);
-    cpu_tb_jmp_cache_clear(vcpu);
+    tcg_flush_jmp_cache(vcpu);
 }
 
 void trace_event_set_vcpu_state_dynamic(CPUState *vcpu,