diff mbox

[v6,13/19] cputlb: atomically update tlb fields used by tlb_reset_dirty

Message ID 20161109145748.27282-14-alex.bennee@linaro.org
State New
Headers show

Commit Message

Alex Bennée Nov. 9, 2016, 2:57 p.m. UTC
The main use case for tlb_reset_dirty is to set the TLB_NOTDIRTY flags
in TLB entries to force the slow-path on writes. This is used to mark
page ranges containing code which has been translated so it can be
invalidated if written to. To do this safely we need to ensure the TLB
entries in question for all vCPUs are updated before we attempt to run
the code, otherwise a race could be introduced.

To achieve this we atomically set the flag in tlb_reset_dirty_range and
take care when setting it when the TLB entry is filled.

The helper function is made static as it isn't used outside of cputlb.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>


---
v6
  - use TARGET_PAGE_BITS_MIN
  - use run_on_cpu helpers
---
 cputlb.c              | 250 +++++++++++++++++++++++++++++++++++++-------------
 include/exec/cputlb.h |   2 -
 include/qom/cpu.h     |  12 +--
 3 files changed, 194 insertions(+), 70 deletions(-)

-- 
2.10.1
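A simplified sketch of the scheme the patch below implements (mirroring the
tlb_reset_dirty_range hunk; illustrative only, not a drop-in function):

static void tlb_set_notdirty_if_unchanged(CPUTLBEntry *tlb_entry,
                                          uintptr_t start, uintptr_t length)
{
    /* Snapshot the entry; pairs with the atomic_mb_set done on refill. */
    uintptr_t orig = atomic_mb_read(&tlb_entry->addr_write);

    if ((orig & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
        uintptr_t addr = (orig & TARGET_PAGE_MASK)
                         + atomic_read(&tlb_entry->addend);
        if ((addr - start) < length) {
            /* If the owning vCPU refilled the entry in the meantime the
             * cmpxchg fails and the update is simply skipped. */
            atomic_cmpxchg(&tlb_entry->addr_write, orig,
                           orig | TLB_NOTDIRTY);
        }
    }
}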

Comments

Pranith Kumar Nov. 9, 2016, 7:36 p.m. UTC | #1
Hi Alex,

This patch is causing some build errors on a 32-bit box:

In file included from /home/pranith/qemu/include/exec/exec-all.h:44:0,
                 from /home/pranith/qemu/cputlb.c:23:
/home/pranith/qemu/cputlb.c: In function ‘tlb_flush_page_by_mmuidx_async_work’:
/home/pranith/qemu/cputlb.c:54:36: error: format ‘%x’ expects argument of type ‘unsigned int’, but argument 5 has type ‘long unsigned int’ [-Werror=format=]
         qemu_log_mask(CPU_LOG_MMU, "%s: " fmt, __func__, \
                                    ^
/home/pranith/qemu/include/qemu/log.h:94:22: note: in definition of macro ‘qemu_log_mask’
             qemu_log(FMT, ## __VA_ARGS__);              \
                      ^~~
/home/pranith/qemu/cputlb.c:286:5: note: in expansion of macro ‘tlb_debug’
     tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx%" PRIxPTR "\n",
     ^~~~~~~~~
/home/pranith/qemu/cputlb.c:57:25: error: format ‘%x’ expects argument of type ‘unsigned int’, but argument 6 has type ‘long unsigned int’ [-Werror=format=]
         fprintf(stderr, "%s: " fmt, __func__, ## __VA_ARGS__); \
                         ^
/home/pranith/qemu/cputlb.c:286:5: note: in expansion of macro ‘tlb_debug’
     tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx%" PRIxPTR "\n",
     ^~~~~~~~~
cc1: all warnings being treated as errors

Thanks,

Alex Bennée writes:

> The main use case for tlb_reset_dirty is to set the TLB_NOTDIRTY flags
> in TLB entries to force the slow-path on writes. This is used to mark
> page ranges containing code which has been translated so it can be
> invalidated if written to. To do this safely we need to ensure the TLB
> entries in question for all vCPUs are updated before we attempt to run
> the code, otherwise a race could be introduced.
>
> To achieve this we atomically set the flag in tlb_reset_dirty_range and
> take care when setting it when the TLB entry is filled.
>
> The helper function is made static as it isn't used outside of cputlb.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>
> ---
> v6
>   - use TARGET_PAGE_BITS_MIN
>   - use run_on_cpu helpers
> ---
>  cputlb.c              | 250 +++++++++++++++++++++++++++++++++++++-------------
>  include/exec/cputlb.h |   2 -
>  include/qom/cpu.h     |  12 +--
>  3 files changed, 194 insertions(+), 70 deletions(-)
>

<snip>

-- 
Pranith
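The format warning is because mmu_idx_bitmap is an unsigned long while
PRIxPTR expands to an unsigned int conversion on this 32-bit host. One
possible fix (a sketch, not necessarily what was merged) is to match the
specifier to the argument:

/* Sketch: "%lx" matches the unsigned long bitmap; alternatively cast the
 * argument to uintptr_t so PRIxPTR matches on 32-bit hosts as well. */
tlb_debug("page:%d addr:" TARGET_FMT_lx " mmu_idx:0x%lx\n",
          page, addr, mmu_idx_bitmap);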
Alex Bennée Nov. 10, 2016, 4:14 p.m. UTC | #2
Pranith Kumar <bobby.prani@gmail.com> writes:

> Hi Alex,
>
> This patch is causing some build errors on a 32-bit box:
>
> In file included from /home/pranith/qemu/include/exec/exec-all.h:44:0,
>                  from /home/pranith/qemu/cputlb.c:23:
> /home/pranith/qemu/cputlb.c: In function ‘tlb_flush_page_by_mmuidx_async_work’:
> /home/pranith/qemu/cputlb.c:54:36: error: format ‘%x’ expects argument of type ‘unsigned int’, but argument 5 has type ‘long unsigned int’ [-Werror=format=]
>          qemu_log_mask(CPU_LOG_MMU, "%s: " fmt, __func__, \
>                                     ^
> /home/pranith/qemu/include/qemu/log.h:94:22: note: in definition of macro ‘qemu_log_mask’
>              qemu_log(FMT, ## __VA_ARGS__);              \
>                       ^~~
> /home/pranith/qemu/cputlb.c:286:5: note: in expansion of macro ‘tlb_debug’
>      tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx%" PRIxPTR "\n",
>      ^~~~~~~~~
> /home/pranith/qemu/cputlb.c:57:25: error: format ‘%x’ expects argument of type ‘unsigned int’, but argument 6 has type ‘long unsigned int’ [-Werror=format=]
>          fprintf(stderr, "%s: " fmt, __func__, ## __VA_ARGS__); \
>                          ^
> /home/pranith/qemu/cputlb.c:286:5: note: in expansion of macro ‘tlb_debug’
>      tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx%" PRIxPTR "\n",
>      ^~~~~~~~~
> cc1: all warnings being treated as errors
>
> Thanks,
>

<snip>
>> +/*
>> + * Dirty write flag handling
>> + *
>> + * When the TCG code writes to a location it looks up the address in
>> + * the TLB and uses that data to compute the final address. If any of
>> + * the lower bits of the address are set then the slow path is forced.
>> + * There are a number of reasons to do this but for normal RAM the
>> + * most usual is detecting writes to code regions which may invalidate
>> + * generated code.
>> + *
>> + * Because we want other vCPUs to respond to changes straight away we
>> + * update the te->addr_write field atomically. If the TLB entry has
>> + * been changed by the vCPU in the mean time we skip the update.
>> + */
>> +
>> +static void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
>>                             uintptr_t length)
>>  {
>> -    uintptr_t addr;
>> +    /* paired with atomic_mb_set in tlb_set_page_with_attrs */
>> +    uintptr_t orig_addr = atomic_mb_read(&tlb_entry->addr_write);
>> +    uintptr_t addr = orig_addr;
>>
>> -    if (tlb_is_dirty_ram(tlb_entry)) {
>> -        addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + tlb_entry->addend;
>> +    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
>> +        addr &= TARGET_PAGE_MASK;
>> +        addr += atomic_read(&tlb_entry->addend);
>>          if ((addr - start) < length) {
>> -            tlb_entry->addr_write |= TLB_NOTDIRTY;
>> +            uintptr_t notdirty_addr = orig_addr | TLB_NOTDIRTY;
>> +            atomic_cmpxchg(&tlb_entry->addr_write, orig_addr, notdirty_addr);
>>          }
>>      }
>>  }

Even worse than that we trip up the atomic.h QEMU_BUILD_BUG_ON with the
atomic_cmpxchg. Now I believe we can use atomic_cmpxchg__nocheck without
too much issue on x86 but we'll need to #ifdef it on detection of wide
atomics.

--
Alex Bennée
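For context, the checked atomic_cmpxchg in qemu/atomic.h refuses operands
wider than a host pointer, which is exactly what a 64-bit target_ulong
addr_write hits on a 32-bit host. Roughly (a sketch of the distinction,
not the literal header):

/* Sketch only: the checked macro layers a width assertion on top of the
 * unchecked compare-and-swap. */
#define atomic_cmpxchg(ptr, old, new)                              \
    ({                                                             \
        QEMU_BUILD_BUG_ON(sizeof(*(ptr)) > sizeof(void *));        \
        atomic_cmpxchg__nocheck(ptr, old, new);                    \
    })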
Richard Henderson Nov. 10, 2016, 5:23 p.m. UTC | #3
On 11/09/2016 03:57 PM, Alex Bennée wrote:
> +/* We currently can't handle more than 16 bits in the MMUIDX bitmask.
> + */
> +QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);

We already assert <= 12 in exec/cpu_ldst.h.  Although really any such assert 
belongs in exec/cpu-defs.h, where we define CPU_TLB_BITS et al.

That said, what's the technical restriction here?


r~
Richard Henderson Nov. 10, 2016, 5:27 p.m. UTC | #4
On 11/10/2016 05:14 PM, Alex Bennée wrote:
> Even worse than that we trip up the atomic.h QEMU_BUILD_BUG_ON with the
> atomic_cmpxchg. Now I believe we can use atomic_cmpxchg__nocheck without
> too much issue on x86 but we'll need to #ifdef it on detection of wide
> atomics.

You've already got CONFIG_ATOMIC64.  And what's the fallback?

We ought not be enabling mttcg for 32-bit host and 64-bit guest at all.  But 
that doesn't help much here, where we're otherwise guest width agnostic.


r~
Alex Bennée Nov. 10, 2016, 6 p.m. UTC | #5
Richard Henderson <rth@twiddle.net> writes:

> On 11/10/2016 05:14 PM, Alex Bennée wrote:
>> Even worse than that we trip up the atomic.h QEMU_BUILD_BUG_ON with the
>> atomic_cmpxchg. Now I believe we can use atomic_cmpxchg__nocheck without
>> too much issue on x86 but we'll need to #ifdef it on detection of wide
>> atomics.
>
> You've already got CONFIG_ATOMIC64.  And what's the fallback?

I'm going to re-factor cputlb a bit so all the TLB reads and writes can
be done in helper functions so I don't scatter stuff around too much. I
was thinking something like:

#ifdef CONFIG_ATOMIC64
  .. as usual ..
#else
  assert(!parallel_cpus)
  .. non atomic update ..
#endif

> We ought not be enabling mttcg for 32-bit host and 64-bit guest at all.  But
> that doesn't help much here, where we're otherwise guest width
> agnostic.

Hmm well the most common case (any guest on x86) should work. Currently
the default mttcg code in cpus.c works when:

  #if defined(CONFIG_MTTCG_TARGET) && defined(CONFIG_MTTCG_HOST)

I should probably expand that to default to false in the case of (sizeof
target_ulong > sizeof void *) when we don't have CONFIG_ATOMIC64.

Then if the user does force mttcg on they will quickly get an assert
although maybe we want to report that in a nicer way?

>
> r~

--
Alex Bennée
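A slightly more concrete version of the sketch above (hypothetical helper
name; assumes CONFIG_ATOMIC64 and the existing parallel_cpus flag):

/* Hypothetical helper fleshing out the #ifdef sketch above. */
static inline void tlb_write_notdirty(CPUTLBEntry *tlb_entry,
                                      target_ulong orig)
{
#ifdef CONFIG_ATOMIC64
    /* Wide host atomics available: the unchecked cmpxchg is usable even
     * when target_ulong is wider than a host pointer. */
    atomic_cmpxchg__nocheck(&tlb_entry->addr_write, orig,
                            orig | TLB_NOTDIRTY);
#else
    /* No wide atomics: only safe while vCPUs are not running in parallel. */
    assert(!parallel_cpus);
    tlb_entry->addr_write = orig | TLB_NOTDIRTY;
#endif
}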
Alex Bennée Nov. 10, 2016, 6:07 p.m. UTC | #6
Richard Henderson <rth@twiddle.net> writes:

> On 11/09/2016 03:57 PM, Alex Bennée wrote:
>> +/* We currently can't handle more than 16 bits in the MMUIDX bitmask.
>> + */
>> +QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
>
> We already assert <= 12 in exec/cpu_ldst.h.  Although really any such assert
> belongs in exec/cpu-defs.h, where we define CPU_TLB_BITS et al.
>
> That said, what's the technical restriction here?

Really we just need to ensure that we don't run out of bits to convert
the MMUIDX var args into the bottom bits of a page-aligned address. We
already have:

  QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN);

So I guess I can drop the other one.

--
Alex Bennée
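In other words the mmuidx bitmap rides in the page-offset bits of an
otherwise page-aligned address, which is why it must fit below
TARGET_PAGE_BITS_MIN. Illustratively (mirroring the packing the patch
already does; hypothetical helper names):

/* Pack the bitmap into the low bits of a page-aligned address... */
static target_ulong pack_addr_and_idx(target_ulong addr,
                                      unsigned long mmu_idx_bitmap)
{
    return (addr & TARGET_PAGE_MASK) | mmu_idx_bitmap;
}

/* ...and unpack it again on the target vCPU. */
static unsigned long unpack_idx(target_ulong addr_and_mmuidx)
{
    return addr_and_mmuidx & ALL_MMUIDX_BITS;
}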
Richard Henderson Nov. 10, 2016, 6:32 p.m. UTC | #7
On 11/10/2016 07:00 PM, Alex Bennée wrote:
> I should probably expand that to default to false in the case of (sizeof
> target_ulong > sizeof void *) when we don't have CONFIG_ATOMIC64.
>
> Then if the user does force mttcg on they will quickly get an assert
> although maybe we want to report that in a nicer way?

While forcing mttcg is good for testing, small hosts will definitely fail, so 
there's no point in even trying.  We should report it in a nicer way.

We shouldn't be checking sizeof(void*), but checking TCG_TARGET_REG_BITS.  That 
says how wide the host registers actually are, as opposed to the memory model 
in effect -- think x86_64 in x32 mode and the like.

If the host register size is smaller than the guest register size, we should 
force disable mttcg, regardless of CONFIG_ATOMIC64, because e.g. normal 64 bit 
loads and stores won't be atomic.


r~
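A sketch of the default Richard suggests (hypothetical function name; the
CONFIG_MTTCG_TARGET/CONFIG_MTTCG_HOST guards are the ones Alex quoted above):

/* Only allow MTTCG by default when host registers are at least as wide
 * as guest registers, so plain target_ulong loads/stores stay atomic. */
static bool default_mttcg_enabled(void)
{
#if defined(CONFIG_MTTCG_TARGET) && defined(CONFIG_MTTCG_HOST)
    return TCG_TARGET_REG_BITS >= TARGET_LONG_BITS;
#else
    return false;
#endif
}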
diff mbox

Patch

diff --git a/cputlb.c b/cputlb.c
index cd1ff71..ae94b7f 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -68,6 +68,11 @@ 
  * target_ulong even on 32 bit builds */
 QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
 
+/* We currently can't handle more than 16 bits in the MMUIDX bitmask.
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
+#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
+
 /* statistics */
 int tlb_flush_count;
 
@@ -98,7 +103,7 @@  static void tlb_flush_nocheck(CPUState *cpu, int flush_global)
 
     tb_unlock();
 
-    atomic_mb_set(&cpu->pending_tlb_flush, false);
+    atomic_mb_set(&cpu->pending_tlb_flush, 0);
 }
 
 static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
@@ -121,7 +126,8 @@  static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
 void tlb_flush(CPUState *cpu, int flush_global)
 {
     if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        if (atomic_cmpxchg(&cpu->pending_tlb_flush, false, true) == true) {
+        if (atomic_mb_read(&cpu->pending_tlb_flush) != ALL_MMUIDX_BITS) {
+            atomic_mb_set(&cpu->pending_tlb_flush, ALL_MMUIDX_BITS);
             async_run_on_cpu(cpu, tlb_flush_global_async_work,
                              RUN_ON_CPU_HOST_INT(flush_global));
         }
@@ -130,39 +136,78 @@  void tlb_flush(CPUState *cpu, int flush_global)
     }
 }
 
-static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, va_list argp)
+static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
+    unsigned long mmu_idx_bitmask = data.host_ulong;
+    int mmu_idx;
 
     assert_cpu_is_self(cpu);
-    tlb_debug("start\n");
 
     tb_lock();
 
-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+    tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
 
-        if (mmu_idx < 0) {
-            break;
-        }
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
 
-        tlb_debug("%d\n", mmu_idx);
+        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
+            tlb_debug("%d\n", mmu_idx);
 
-        memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
-        memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+            memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+        }
     }
 
     memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
 
+    tlb_debug("done\n");
+
     tb_unlock();
 }
 
+/* Helper function to slurp va_args list into a bitmap
+ */
+static inline unsigned long make_mmu_index_bitmap(va_list args)
+{
+    unsigned long bitmap = 0;
+    int mmu_index = va_arg(args, int);
+
+    /* An empty va_list would be a bad call */
+    g_assert(mmu_index > 0);
+
+    do {
+        set_bit(mmu_index, &bitmap);
+        mmu_index = va_arg(args, int);
+    } while (mmu_index >= 0);
+
+    return bitmap;
+}
+
 void tlb_flush_by_mmuidx(CPUState *cpu, ...)
 {
     va_list argp;
+    unsigned long mmu_idx_bitmap;
+
     va_start(argp, cpu);
-    v_tlb_flush_by_mmuidx(cpu, argp);
+    mmu_idx_bitmap = make_mmu_index_bitmap(argp);
     va_end(argp);
+
+    tlb_debug("mmu_idx: 0x%04lx\n", mmu_idx_bitmap);
+
+    if (!qemu_cpu_is_self(cpu)) {
+        uint16_t pending_flushes =
+            mmu_idx_bitmap & ~atomic_mb_read(&cpu->pending_tlb_flush);
+        if (pending_flushes) {
+            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", pending_flushes);
+
+            atomic_or(&cpu->pending_tlb_flush, pending_flushes);
+            async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
+                             RUN_ON_CPU_HOST_INT(pending_flushes));
+        }
+    } else {
+        tlb_flush_by_mmuidx_async_work(cpu,
+                                       RUN_ON_CPU_HOST_ULONG(mmu_idx_bitmap));
+    }
 }
 
 static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
@@ -227,16 +272,50 @@  void tlb_flush_page(CPUState *cpu, target_ulong addr)
     }
 }
 
-void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
+/* As we are going to hijack the bottom bits of the page address for a
+ * mmuidx bit mask we need to fail to build if we can't do that
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN);
+
+static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
+                                                run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
-    int i, k;
-    va_list argp;
-
-    va_start(argp, addr);
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+    int page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    int mmu_idx;
+    int i;
 
     assert_cpu_is_self(cpu);
-    tlb_debug("addr "TARGET_FMT_lx"\n", addr);
+
+    tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx%" PRIxPTR "\n",
+              page, addr, mmu_idx_bitmap);
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
+            tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr);
+
+            /* check whether there are vltb entries that need to be flushed */
+            for (i = 0; i < CPU_VTLB_SIZE; i++) {
+                tlb_flush_entry(&env->tlb_v_table[mmu_idx][i], addr);
+            }
+        }
+    }
+
+    tb_flush_jmp_cache(cpu, addr);
+}
+
+static void tlb_check_page_and_flush_by_mmuidx_async_work(CPUState *cpu,
+                                                          run_on_cpu_data data)
+{
+    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+
+    tlb_debug("addr:"TARGET_FMT_lx" mmu_idx: %04lx\n", addr, mmu_idx_bitmap);
 
     /* Check if we need to flush due to large pages.  */
     if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
@@ -244,33 +323,35 @@  void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
                   TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
                   env->tlb_flush_addr, env->tlb_flush_mask);
 
-        v_tlb_flush_by_mmuidx(cpu, argp);
-        va_end(argp);
-        return;
+        tlb_flush_by_mmuidx_async_work(cpu, RUN_ON_CPU_HOST_ULONG(mmu_idx_bitmap));
+    } else {
+        tlb_flush_page_by_mmuidx_async_work(cpu, data);
     }
+}
 
-    addr &= TARGET_PAGE_MASK;
-    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-
-    for (;;) {
-        int mmu_idx = va_arg(argp, int);
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
+{
+    unsigned long mmu_idx_bitmap;
+    target_ulong addr_and_mmu_idx;
+    va_list argp;
 
-        if (mmu_idx < 0) {
-            break;
-        }
+    va_start(argp, addr);
+    mmu_idx_bitmap = make_mmu_index_bitmap(argp);
+    va_end(argp);
 
-        tlb_debug("idx %d\n", mmu_idx);
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%lx\n", addr, mmu_idx_bitmap);
 
-        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= mmu_idx_bitmap;
 
-        /* check whether there are vltb entries that need to be flushed */
-        for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
-        }
+    if (!qemu_cpu_is_self(cpu)) {
+        async_run_on_cpu(cpu, tlb_check_page_and_flush_by_mmuidx_async_work,
+                         RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+    } else {
+        tlb_check_page_and_flush_by_mmuidx_async_work(
+            cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
     }
-    va_end(argp);
-
-    tb_flush_jmp_cache(cpu, addr);
 }
 
 void tlb_flush_page_all(target_ulong addr)
@@ -298,32 +379,50 @@  void tlb_unprotect_code(ram_addr_t ram_addr)
     cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_CODE);
 }
 
-static bool tlb_is_dirty_ram(CPUTLBEntry *tlbe)
-{
-    return (tlbe->addr_write & (TLB_INVALID_MASK|TLB_MMIO|TLB_NOTDIRTY)) == 0;
-}
 
-void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
+/*
+ * Dirty write flag handling
+ *
+ * When the TCG code writes to a location it looks up the address in
+ * the TLB and uses that data to compute the final address. If any of
+ * the lower bits of the address are set then the slow path is forced.
+ * There are a number of reasons to do this but for normal RAM the
+ * most usual is detecting writes to code regions which may invalidate
+ * generated code.
+ *
+ * Because we want other vCPUs to respond to changes straight away we
+ * update the te->addr_write field atomically. If the TLB entry has
+ * been changed by the vCPU in the mean time we skip the update.
+ */
+
+static void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
                            uintptr_t length)
 {
-    uintptr_t addr;
+    /* paired with atomic_mb_set in tlb_set_page_with_attrs */
+    uintptr_t orig_addr = atomic_mb_read(&tlb_entry->addr_write);
+    uintptr_t addr = orig_addr;
 
-    if (tlb_is_dirty_ram(tlb_entry)) {
-        addr = (tlb_entry->addr_write & TARGET_PAGE_MASK) + tlb_entry->addend;
+    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
+        addr &= TARGET_PAGE_MASK;
+        addr += atomic_read(&tlb_entry->addend);
         if ((addr - start) < length) {
-            tlb_entry->addr_write |= TLB_NOTDIRTY;
+            uintptr_t notdirty_addr = orig_addr | TLB_NOTDIRTY;
+            atomic_cmpxchg(&tlb_entry->addr_write, orig_addr, notdirty_addr);
         }
     }
 }
 
+/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
+ * the target vCPU). As such care needs to be taken that we don't
+ * dangerously race with another vCPU update. The only thing actually
+ * updated is the target TLB entry ->addr_write flags.
+ */
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
 {
     CPUArchState *env;
 
     int mmu_idx;
 
-    assert_cpu_is_self(cpu);
-
     env = cpu->env_ptr;
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         unsigned int i;
@@ -409,9 +508,9 @@  void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     MemoryRegionSection *section;
     unsigned int index;
     target_ulong address;
-    target_ulong code_address;
+    target_ulong code_address, write_address;
     uintptr_t addend;
-    CPUTLBEntry *te;
+    CPUTLBEntry *te, *tv;
     hwaddr iotlb, xlat, sz;
     unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
     int asidx = cpu_asidx_from_attrs(cpu, attrs);
@@ -446,15 +545,21 @@  void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
 
     index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     te = &env->tlb_table[mmu_idx][index];
-
     /* do not discard the translation in te, evict it into a victim tlb */
-    env->tlb_v_table[mmu_idx][vidx] = *te;
+    tv = &env->tlb_v_table[mmu_idx][vidx];
+
+    /* addr_write can race with tlb_reset_dirty_range_all */
+    tv->addr_read = te->addr_read;
+    atomic_set(&tv->addr_write, atomic_read(&te->addr_write));
+    tv->addr_code = te->addr_code;
+    atomic_set(&tv->addend, atomic_read(&te->addend));
+
     env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
 
     /* refill the tlb */
     env->iotlb[mmu_idx][index].addr = iotlb - vaddr;
     env->iotlb[mmu_idx][index].attrs = attrs;
-    te->addend = addend - vaddr;
+    atomic_set(&te->addend, addend - vaddr);
     if (prot & PAGE_READ) {
         te->addr_read = address;
     } else {
@@ -466,21 +571,24 @@  void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     } else {
         te->addr_code = -1;
     }
+
+    write_address = -1;
     if (prot & PAGE_WRITE) {
         if ((memory_region_is_ram(section->mr) && section->readonly)
             || memory_region_is_romd(section->mr)) {
             /* Write access calls the I/O callback.  */
-            te->addr_write = address | TLB_MMIO;
+            write_address = address | TLB_MMIO;
         } else if (memory_region_is_ram(section->mr)
                    && cpu_physical_memory_is_clean(
                         memory_region_get_ram_addr(section->mr) + xlat)) {
-            te->addr_write = address | TLB_NOTDIRTY;
+            write_address = address | TLB_NOTDIRTY;
         } else {
-            te->addr_write = address;
+            write_address = address;
         }
-    } else {
-        te->addr_write = -1;
     }
+
+    /* Pairs with flag setting in tlb_reset_dirty_range */
+    atomic_mb_set(&te->addr_write, write_address);
 }
 
 /* Add a new TLB entry, but without specifying the memory
@@ -643,10 +751,28 @@  static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
         if (cmp == page) {
             /* Found entry in victim tlb, swap tlb and iotlb.  */
             CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index];
+
+            /* tmptlb = *tlb; */
+            /* addr_write can race with tlb_reset_dirty_range_all */
+            tmptlb.addr_read = tlb->addr_read;
+            tmptlb.addr_write = atomic_read(&tlb->addr_write);
+            tmptlb.addr_code = tlb->addr_code;
+            tmptlb.addend = atomic_read(&tlb->addend);
+
+            /* *tlb = *vtlb; */
+            tlb->addr_read = vtlb->addr_read;
+            atomic_set(&tlb->addr_write, atomic_read(&vtlb->addr_write));
+            tlb->addr_code = vtlb->addr_code;
+            atomic_set(&tlb->addend, atomic_read(&vtlb->addend));
+
+            /* *vtlb = tmptlb; */
+            vtlb->addr_read = tmptlb.addr_read;
+            atomic_set(&vtlb->addr_write, tmptlb.addr_write);
+            vtlb->addr_code = tmptlb.addr_code;
+            atomic_set(&vtlb->addend, tmptlb.addend);
+
             CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index];
             CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx];
-
-            tmptlb = *tlb; *tlb = *vtlb; *vtlb = tmptlb;
             tmpio = *io; *io = *vio; *vio = tmpio;
             return true;
         }
diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h
index d454c00..3f94178 100644
--- a/include/exec/cputlb.h
+++ b/include/exec/cputlb.h
@@ -23,8 +23,6 @@ 
 /* cputlb.c */
 void tlb_protect_code(ram_addr_t ram_addr);
 void tlb_unprotect_code(ram_addr_t ram_addr);
-void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
-                           uintptr_t length);
 extern int tlb_flush_count;
 
 #endif
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 880ba42..d945221 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -388,17 +388,17 @@  struct CPUState {
      */
     bool throttle_thread_scheduled;
 
+    /* The pending_tlb_flush flag is set and cleared atomically to
+     * avoid potential races. The aim of the flag is to avoid
+     * unnecessary flushes.
+     */
+    uint16_t pending_tlb_flush;
+
     /* Note that this is accessed at the start of every TB via a negative
        offset from AREG0.  Leave this field at the end so as to make the
        (absolute value) offset as small as possible.  This reduces code
        size, especially for hosts without large memory offsets.  */
     uint32_t tcg_exit_req;
-
-    /* The pending_tlb_flush flag is set and cleared atomically to
-     * avoid potential races. The aim of the flag is to avoid
-     * unnecessary flushes.
-     */
-    bool pending_tlb_flush;
 };
 
 QTAILQ_HEAD(CPUTailQ, CPUState);