diff mbox

[v5,28/33] cputlb: make tlb_flush_by_mmuidx safe for MTTCG

Message ID 20161027151030.20863-29-alex.bennee@linaro.org
State New
Headers show

Commit Message

Alex Bennée Oct. 27, 2016, 3:10 p.m. UTC
These flushes allow a per-mmuidx granularity to the TLB flushing and are
currently only used by the ARM model. As it is possible to hammer the
other vCPU threads with flushes (and build up long queues of identical
flushes) we extend the mechanism used for the global tlb_flush and set a
bitmap describing all the pending flushes. The updates are done
atomically to avoid corruption of the bitmap but repeating a flush is
certainly not a problem.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>


---
v5
  - fix tlb_flush_page_by_mmuidx to defer all checks to async work
  - convert to run_on_cpu_data
  - additional tlb_debugs

You can't be checking a cross cpu env-> variable

WARNING: ThreadSanitizer: data race (pid=1962)
  Read of size 8 at 0x7dd00005e998 by thread T2:
    #0 tlb_flush_page_by_mmuidx /home/alex/lsrc/qemu/qemu.git/cputlb.c:285 (qemu-system-aarch64+0x0000004a1732)
    #1 tlbi_aa64_vae1is_write /home/alex/lsrc/qemu/qemu.git/target-arm/helper.c:3023 (qemu-system-aarch64+0x000000672a98)
    #2 helper_set_cp_reg64 /home/alex/lsrc/qemu/qemu.git/target-arm/op_helper.c:744 (qemu-system-aarch64+0x000000668699)
    #3 <null> <null> (0x000040029eb5)
    #4 cpu_loop_exec_tb /home/alex/lsrc/qemu/qemu.git/cpu-exec.c:558 (qemu-system-aarch64+0x000000430d00)
    #5 cpu_exec /home/alex/lsrc/qemu/qemu.git/cpu-exec.c:646 (qemu-system-aarch64+0x0000004310e5)
    #6 tcg_cpu_exec /home/alex/lsrc/qemu/qemu.git/cpus.c:1156 (qemu-system-aarch64+0x000000474d6f)
    #7 qemu_tcg_cpu_thread_fn /home/alex/lsrc/qemu/qemu.git/cpus.c:1345 (qemu-system-aarch64+0x000000475641)
    #8 <null> <null> (libtsan.so.0+0x0000000230d9)

  Previous write of size 8 at 0x7dd00005e998 by thread T4:
    #0 tlb_add_large_page /home/alex/lsrc/qemu/qemu.git/cputlb.c:459 (qemu-system-aarch64+0x0000004a1ebf)
    #1 tlb_set_page_with_attrs /home/alex/lsrc/qemu/qemu.git/cputlb.c:487 (qemu-system-aarch64+0x0000004a2002)
    #2 arm_tlb_fill /home/alex/lsrc/qemu/qemu.git/target-arm/helper.c:8116 (qemu-system-aarch64+0x0000006849de)
    #3 tlb_fill /home/alex/lsrc/qemu/qemu.git/target-arm/op_helper.c:127 (qemu-system-aarch64+0x000000666b4c)
    #4 helper_le_ldul_mmu /home/alex/lsrc/qemu/qemu.git/softmmu_template.h:127 (qemu-system-aarch64+0x0000004a4bba)
    #5 <null> <null> (0x000040017833)
    #6 cpu_loop_exec_tb /home/alex/lsrc/qemu/qemu.git/cpu-exec.c:558 (qemu-system-aarch64+0x000000430d00)
    #7 cpu_exec /home/alex/lsrc/qemu/qemu.git/cpu-exec.c:646 (qemu-system-aarch64+0x0000004310e5)
    #8 tcg_cpu_exec /home/alex/lsrc/qemu/qemu.git/cpus.c:1156 (qemu-system-aarch64+0x000000474d6f)
    #9 qemu_tcg_cpu_thread_fn /home/alex/lsrc/qemu/qemu.git/cpus.c:1345 (qemu-system-aarch64+0x000000475641)
    #10 <null> <null> (libtsan.so.0+0x0000000230d9)

  Location is heap block of size 125904 at 0x7dd000040000 allocated by main thread:
    #0 malloc <null> (libtsan.so.0+0x0000000254a3)
    #1 g_malloc <null> (libglib-2.0.so.0+0x00000004f728)
    #2 object_new qom/object.c:488 (qemu-system-aarch64+0x000000b157c3)
    #3 machvirt_init /home/alex/lsrc/qemu/qemu.git/hw/arm/virt.c:1289 (qemu-system-aarch64+0x0000005d733e)
    #4 main /home/alex/lsrc/qemu/qemu.git/vl.c:4573 (qemu-system-aarch64+0x00000070f2eb)

  Thread T2 'CPU 0/TCG' (tid=1965, running) created by main thread at:
    #0 pthread_create <null> (libtsan.so.0+0x000000027577)
    #1 qemu_thread_create util/qemu-thread-posix.c:471 (qemu-system-aarch64+0x000000c710a6)
    #2 qemu_tcg_init_vcpu /home/alex/lsrc/qemu/qemu.git/cpus.c:1528 (qemu-system-aarch64+0x000000475f09)
    #3 qemu_init_vcpu /home/alex/lsrc/qemu/qemu.git/cpus.c:1605 (qemu-system-aarch64+0x00000047645e)
    #4 arm_cpu_realizefn /home/alex/lsrc/qemu/qemu.git/target-arm/cpu.c:708 (qemu-system-aarch64+0x00000068de38)
    #5 device_set_realized hw/core/qdev.c:918 (qemu-system-aarch64+0x00000080b429)
    #6 property_set_bool qom/object.c:1854 (qemu-system-aarch64+0x000000b19cb9)
    #7 object_property_set qom/object.c:1088 (qemu-system-aarch64+0x000000b177b5)
    #8 object_property_set_qobject qom/qom-qobject.c:27 (qemu-system-aarch64+0x000000b1b77a)
    #9 object_property_set_bool qom/object.c:1157 (qemu-system-aarch64+0x000000b17ac4)
    #10 machvirt_init /home/alex/lsrc/qemu/qemu.git/hw/arm/virt.c:1332 (qemu-system-aarch64+0x0000005d7576)
    #11 main /home/alex/lsrc/qemu/qemu.git/vl.c:4573 (qemu-system-aarch64+0x00000070f2eb)

  Thread T4 'CPU 2/TCG' (tid=1967, running) created by main thread at:
    #0 pthread_create <null> (libtsan.so.0+0x000000027577)
    #1 qemu_thread_create util/qemu-thread-posix.c:471 (qemu-system-aarch64+0x000000c710a6)
    #2 qemu_tcg_init_vcpu /home/alex/lsrc/qemu/qemu.git/cpus.c:1528 (qemu-system-aarch64+0x000000475f09)
    #3 qemu_init_vcpu /home/alex/lsrc/qemu/qemu.git/cpus.c:1605 (qemu-system-aarch64+0x00000047645e)
    #4 arm_cpu_realizefn /home/alex/lsrc/qemu/qemu.git/target-arm/cpu.c:708 (qemu-system-aarch64+0x00000068de38)
    #5 device_set_realized hw/core/qdev.c:918 (qemu-system-aarch64+0x00000080b429)
    #6 property_set_bool qom/object.c:1854 (qemu-system-aarch64+0x000000b19cb9)
    #7 object_property_set qom/object.c:1088 (qemu-system-aarch64+0x000000b177b5)
    #8 object_property_set_qobject qom/qom-qobject.c:27 (qemu-system-aarch64+0x000000b1b77a)
    #9 object_property_set_bool qom/object.c:1157 (qemu-system-aarch64+0x000000b17ac4)
    #10 machvirt_init /home/alex/lsrc/qemu/qemu.git/hw/arm/virt.c:1332 (qemu-system-aarch64+0x0000005d7576)
    #11 main /home/alex/lsrc/qemu/qemu.git/vl.c:4573 (qemu-system-aarch64+0x00000070f2eb)

SUMMARY: ThreadSanitizer: data race /home/alex/lsrc/qemu/qemu.git/cputlb.c:285 tlb_flush_page_by_mmuidx

debug for mmu_idx

mmu_idx debug
---
 cputlb.c          | 169 +++++++++++++++++++++++++++++++++++++++++-------------
 include/qom/cpu.h |  13 +++--
 2 files changed, 137 insertions(+), 45 deletions(-)

-- 
2.10.1

Comments

Pranith Kumar Nov. 1, 2016, 5:20 a.m. UTC | #1
Hi Alex,

Alex Bennée writes:

> These flushes allow a per-mmuidx granularity to the TLB flushing and are

> currently only used by the ARM model. As it is possible to hammer the

> other vCPU threads with flushes (and build up long queues of identical

> flushes) we extend mechanism used for the global tlb_flush and set a

> bitmap describing all the pending flushes. The updates are done

> atomically to avoid corruption of the bitmap but repeating a flush is

> certainly not a problem.

>

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>


<snip>

>

>  static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)

> @@ -233,16 +288,50 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)

>      }

>  }

>  

> -void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)

> +/* As we are going to hijack the bottom bits of the page address for a

> + * mmuidx bit mask we need to fail to build if we can't do that

> + */

> +QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS);

> +


FYI, this is causing a build error on my system with gcc 6.2.

  CC      aarch64-softmmu/cputlb.o
In file included from /home/pranith/devops/code/qemu/include/qemu/osdep.h:36:0,
                 from /home/pranith/devops/code/qemu/cputlb.c:20:
/home/pranith/devops/code/qemu/include/exec/cpu-all.h:196:26: error: braced-group within expression allowed only inside a function
 #define TARGET_PAGE_BITS ({ assert(target_page_bits_decided); \
                          ^
/home/pranith/devops/code/qemu/include/qemu/compiler.h:89:54: note: in definition of macro ‘QEMU_BUILD_BUG_ON’
     typedef char glue(qemu_build_bug_on__,__LINE__)[(x)?-1:1] __attribute__((unused));
                                                      ^
/home/pranith/devops/code/qemu/cputlb.c:293:34: note: in expansion of macro ‘TARGET_PAGE_BITS’
 QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS);
                                  ^~~~~~~~~~~~~~~~
/home/pranith/devops/code/qemu/rules.mak:60: recipe for target 'cputlb.o' failed
make[1]: *** [cputlb.o] Error 1
Makefile:202: recipe for target 'subdir-aarch64-softmmu' failed
make: *** [subdir-aarch64-softmmu] Error 2

Thanks,
--
Pranith
Alex Bennée Nov. 1, 2016, 7:45 a.m. UTC | #2
Pranith Kumar <bobby.prani@gmail.com> writes:

> Hi Alex,

>

> Alex Bennée writes:

>

>> These flushes allow a per-mmuidx granularity to the TLB flushing and are

>> currently only used by the ARM model. As it is possible to hammer the

>> other vCPU threads with flushes (and build up long queues of identical

>> flushes) we extend mechanism used for the global tlb_flush and set a

>> bitmap describing all the pending flushes. The updates are done

>> atomically to avoid corruption of the bitmap but repeating a flush is

>> certainly not a problem.

>>

>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>

> <snip>

>

>>

>>  static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)

>> @@ -233,16 +288,50 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)

>>      }

>>  }

>>

>> -void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)

>> +/* As we are going to hijack the bottom bits of the page address for a

>> + * mmuidx bit mask we need to fail to build if we can't do that

>> + */

>> +QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS);

>> +

>

> FYI, this is causing a build error on my system with gcc 6.2.

>

>   CC      aarch64-softmmu/cputlb.o

> In file included from /home/pranith/devops/code/qemu/include/qemu/osdep.h:36:0,

>                  from /home/pranith/devops/code/qemu/cputlb.c:20:

> /home/pranith/devops/code/qemu/include/exec/cpu-all.h:196:26: error: braced-group within expression allowed only inside a function

>  #define TARGET_PAGE_BITS ({ assert(target_page_bits_decided); \

>                           ^

> /home/pranith/devops/code/qemu/include/qemu/compiler.h:89:54: note: in definition of macro ‘QEMU_BUILD_BUG_ON’

>      typedef char glue(qemu_build_bug_on__,__LINE__)[(x)?-1:1] __attribute__((unused));

>                                                       ^

> /home/pranith/devops/code/qemu/cputlb.c:293:34: note: in expansion of macro ‘TARGET_PAGE_BITS’

>  QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS);

>                                   ^~~~~~~~~~~~~~~~

> /home/pranith/devops/code/qemu/rules.mak:60: recipe for target 'cputlb.o' failed

> make[1]: *** [cputlb.o] Error 1

> Makefile:202: recipe for target 'subdir-aarch64-softmmu' failed

> make: *** [subdir-aarch64-softmmu] Error 2


Odd. I'll look into it. What was your configure string and host architecture?

>

> Thanks,



--
Alex Bennée
Peter Maydell Nov. 1, 2016, 8:03 a.m. UTC | #3
On 1 November 2016 at 07:45, Alex Bennée <alex.bennee@linaro.org> wrote:
>

> Pranith Kumar <bobby.prani@gmail.com> writes:

>> FYI, this is causing a build error on my system with gcc 6.2.

>>

>>   CC      aarch64-softmmu/cputlb.o

>> In file included from /home/pranith/devops/code/qemu/include/qemu/osdep.h:36:0,

>>                  from /home/pranith/devops/code/qemu/cputlb.c:20:

>> /home/pranith/devops/code/qemu/include/exec/cpu-all.h:196:26: error: braced-group within expression allowed only inside a function

>>  #define TARGET_PAGE_BITS ({ assert(target_page_bits_decided); \

>>                           ^

>> /home/pranith/devops/code/qemu/include/qemu/compiler.h:89:54: note: in definition of macro ‘QEMU_BUILD_BUG_ON’

>>      typedef char glue(qemu_build_bug_on__,__LINE__)[(x)?-1:1] __attribute__((unused));

>>                                                       ^

>> /home/pranith/devops/code/qemu/cputlb.c:293:34: note: in expansion of macro ‘TARGET_PAGE_BITS’

>>  QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS);

>>                                   ^~~~~~~~~~~~~~~~

>> /home/pranith/devops/code/qemu/rules.mak:60: recipe for target 'cputlb.o' failed

>> make[1]: *** [cputlb.o] Error 1

>> Makefile:202: recipe for target 'subdir-aarch64-softmmu' failed

>> make: *** [subdir-aarch64-softmmu] Error 2

>

> Odd. I'll look into it. What was you configure string and host architecture?


Looks like a clash between the variable-page-size patchset
and your stuff. Now that TARGET_PAGE_BITS isn't necessarily
constant, you can't use it in a compile-time assert like that.

thanks
-- PMM
Pranith Kumar Nov. 1, 2016, 1:22 p.m. UTC | #4
On Tue, Nov 1, 2016 at 3:45 AM, Alex Bennée <alex.bennee@linaro.org> wrote:

>

> Odd. I'll look into it. What was you configure string and host architecture?

>


It's a plain configure string, nothing special:

$ ../configure --target-list=aarch64-softmmu

But I did rebase your patches on master; maybe something new in the
tree tripped this?

-- 
Pranith
Alex Bennée Nov. 1, 2016, 4:53 p.m. UTC | #5
Pranith Kumar <bobby.prani@gmail.com> writes:

> On Tue, Nov 1, 2016 at 3:45 AM, Alex Bennée <alex.bennee@linaro.org> wrote:

>

>>

>> Odd. I'll look into it. What was you configure string and host architecture?

>>

>

> It's a plain configure string, nothing special:

>

> $ ../configure --target-list=aarch64-softmmu

>

> But I did rebase your patches on master, May be something new in the

> tree tripped this?


Yeah, I'll look for a fix with dynamic page sizes when I re-base.

--
Alex Bennée
diff mbox

Patch

diff --git a/cputlb.c b/cputlb.c
index 981cb42..602cbb3 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -81,6 +81,22 @@  static inline run_on_cpu_data host_int(int hint)
     return d;
 }
 
+static inline run_on_cpu_data host_unsigned(unsigned hun)
+{
+    run_on_cpu_data d = { .host_unsigned = hun };
+    return d;
+}
+
+static inline run_on_cpu_data host_ulong(unsigned long hlong)
+{
+    run_on_cpu_data d = { .host_unsigned_long = hlong };
+    return d;
+}
+
+/* We currently can't handle more than 16 bits in the MMUIDX bitmask.
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
+#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
 
 /* statistics */
 int tlb_flush_count;
@@ -105,7 +121,7 @@  static void tlb_flush_nocheck(CPUState *cpu, int flush_global)
 
     tb_unlock();
 
-    atomic_mb_set(&cpu->pending_tlb_flush, false);
+    atomic_mb_set(&cpu->pending_tlb_flush, 0);
 }
 
 static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
@@ -128,7 +144,8 @@  static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
 void tlb_flush(CPUState *cpu, int flush_global)
 {
     if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        if (atomic_bool_cmpxchg(&cpu->pending_tlb_flush, false, true)) {
+        if (atomic_mb_read(&cpu->pending_tlb_flush) != ALL_MMUIDX_BITS) {
+            atomic_mb_set(&cpu->pending_tlb_flush, ALL_MMUIDX_BITS);
             async_run_on_cpu(cpu, tlb_flush_global_async_work,
                              host_int(flush_global));
         }
@@ -137,39 +154,77 @@  void tlb_flush(CPUState *cpu, int flush_global)
     }
 }
 
-static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
+static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
+    unsigned long mmu_idx_bitmask = data.host_unsigned_long;
+    int mmu_idx;
 
     assert_cpu_is_self(cpu);
-    tlb_debug("start\n");
 
     tb_lock();
 
-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+    tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
 
-        if (mmu_idx < 0) {
-            break;
-        }
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
 
-        tlb_debug("%d\n", mmu_idx);
+        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
+            tlb_debug("%d\n", mmu_idx);
 
-        memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
-        memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+            memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+        }
     }
 
     memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
 
+    tlb_debug("done\n");
+
     tb_unlock();
 }
 
+/* Helper function to slurp va_args list into a bitmap
+ */
+static inline unsigned long make_mmu_index_bitmap(va_list args)
+{
+    unsigned long bitmap = 0;
+    int mmu_index = va_arg(args, int);
+
+    /* An empty va_list would be a bad call */
+    g_assert(mmu_index > 0);
+
+    do {
+        set_bit(mmu_index, &bitmap);
+        mmu_index = va_arg(args, int);
+    } while (mmu_index >= 0);
+
+    return bitmap;
+}
+
 void tlb_flush_by_mmuidx(CPUState *cpu, ...)
 {
     va_list argp;
+    unsigned long mmu_idx_bitmap;
+
     va_start(argp, cpu);
-    v_tlb_flush_by_mmuidx(cpu, argp);
+    mmu_idx_bitmap = make_mmu_index_bitmap(argp);
     va_end(argp);
+
+    tlb_debug("mmu_idx: 0x%04lx\n", mmu_idx_bitmap);
+
+    if (!qemu_cpu_is_self(cpu)) {
+        uint16_t pending_flushes =
+            mmu_idx_bitmap & ~atomic_mb_read(&cpu->pending_tlb_flush);
+        if (pending_flushes) {
+            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", pending_flushes);
+
+            atomic_or(&cpu->pending_tlb_flush, pending_flushes);
+            async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
+                             host_int(pending_flushes));
+        }
+    } else {
+        tlb_flush_by_mmuidx_async_work(cpu, host_ulong(mmu_idx_bitmap));
+    }
 }
 
 static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
@@ -233,16 +288,50 @@  void tlb_flush_page(CPUState *cpu, target_ulong addr)
     }
 }
 
-void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
+/* As we are going to hijack the bottom bits of the page address for a
+ * mmuidx bit mask we need to fail to build if we can't do that
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS);
+
+static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
+                                                run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
-    int i, k;
-    va_list argp;
-
-    va_start(argp, addr);
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+    int page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    int mmu_idx;
+    int i;
 
     assert_cpu_is_self(cpu);
-    tlb_debug("addr "TARGET_FMT_lx"\n", addr);
+
+    tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx%" PRIxPTR "\n",
+              page, addr, mmu_idx_bitmap);
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
+            tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr);
+
+            /* check whether there are vltb entries that need to be flushed */
+            for (i = 0; i < CPU_VTLB_SIZE; i++) {
+                tlb_flush_entry(&env->tlb_v_table[mmu_idx][i], addr);
+            }
+        }
+    }
+
+    tb_flush_jmp_cache(cpu, addr);
+}
+
+static void tlb_check_page_and_flush_by_mmuidx_async_work(CPUState *cpu,
+                                                          run_on_cpu_data data)
+{
+    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+
+    tlb_debug("addr:"TARGET_FMT_lx" mmu_idx: %04lx\n", addr, mmu_idx_bitmap);
 
     /* Check if we need to flush due to large pages.  */
     if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
@@ -250,33 +339,35 @@  void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
                   TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
                   env->tlb_flush_addr, env->tlb_flush_mask);
 
-        v_tlb_flush_by_mmuidx(cpu, argp);
-        va_end(argp);
-        return;
+        tlb_flush_by_mmuidx_async_work(cpu, host_ulong(mmu_idx_bitmap));
+    } else {
+        tlb_flush_page_by_mmuidx_async_work(cpu, data);
     }
+}
 
-    addr &= TARGET_PAGE_MASK;
-    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-
-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
+{
+    unsigned long mmu_idx_bitmap;
+    target_ulong addr_and_mmu_idx;
+    va_list argp;
 
-        if (mmu_idx < 0) {
-            break;
-        }
+    va_start(argp, addr);
+    mmu_idx_bitmap = make_mmu_index_bitmap(argp);
+    va_end(argp);
 
-        tlb_debug("idx %d\n", mmu_idx);
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%lx\n", addr, mmu_idx_bitmap);
 
-        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= mmu_idx_bitmap;
 
-        /* check whether there are vltb entries that need to be flushed */
-        for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
-        }
+    if (!qemu_cpu_is_self(cpu)) {
+        async_run_on_cpu(cpu, tlb_check_page_and_flush_by_mmuidx_async_work,
+                         target_ptr(addr_and_mmu_idx));
+    } else {
+        tlb_check_page_and_flush_by_mmuidx_async_work(
+            cpu, target_ptr(addr_and_mmu_idx));
     }
-    va_end(argp);
-
-    tb_flush_jmp_cache(cpu, addr);
 }
 
 void tlb_flush_page_all(target_ulong addr)
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 1fe5b99..4faf795 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -238,6 +238,7 @@  struct kvm_run;
 typedef union {
     int       host_int;
     unsigned  host_unsigned;
+    unsigned long host_unsigned_long;
     uintptr_t host_ptr;
     void      *void_ptr;  /* for (run_on_cpu_data) NULL casts */
     vaddr     target_ptr;
@@ -391,17 +392,17 @@  struct CPUState {
      */
     bool throttle_thread_scheduled;
 
+    /* The pending_tlb_flush flag is set and cleared atomically to
+     * avoid potential races. The aim of the flag is to avoid
+     * unnecessary flushes.
+     */
+    uint16_t pending_tlb_flush;
+
     /* Note that this is accessed at the start of every TB via a negative
        offset from AREG0.  Leave this field at the end so as to make the
        (absolute value) offset as small as possible.  This reduces code
        size, especially for hosts without large memory offsets.  */
     uint32_t tcg_exit_req;
-
-    /* The pending_tlb_flush flag is set and cleared atomically to
-     * avoid potential races. The aim of the flag is to avoid
-     * unnecessary flushes.
-     */
-    bool pending_tlb_flush;
 };
 
 QTAILQ_HEAD(CPUTailQ, CPUState);