
[Xen-devel,RFC,v3,5/6] xen/arm: Add log_dirty support for ARM

Message ID 1399583908-21755-6-git-send-email-w1.huang@samsung.com
State New

Commit Message

Wei Huang May 8, 2014, 9:18 p.m. UTC
This patch implements log_dirty for ARM guest VMs. This feature
is provided via two basic building blocks: the dirty bitmap and the
VLPT (virtual-linear page table).

1. VLPT provides fast access to the 3rd-level PTEs of the guest P2M.
When the mapping for the VLPT is created, the page table walk
becomes the following:
   xen's 1st PTE --> xen's 2nd PTE --> guest p2m's 2nd PTE -->
   guest p2m's 3rd PTE

With VLPT, Xen can immediately locate the 3rd-level PTE of the guest
P2M and modify its attributes during dirty page tracking. The following
link shows the performance comparison for handling a dirty-page
between VLPT and typical page table walking.
http://lists.xen.org/archives/html/xen-devel/2013-08/msg01503.html

For more info about VLPT, please see
http://www.technovelty.org/linux/virtual-linear-page-table.html.
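
In code this boils down to the vlpt_get_3lvl_pte() helper added by this
patch: once the guest p2m's 2nd-level tables are slotted into Xen's
2nd-level table, the leaf entry for a guest physical address becomes a
single array lookup in the linear window:

    /* access leaf (3rd-level) PTE of a given guest physical address */
    static inline lpae_t *vlpt_get_3lvl_pte(paddr_t addr)
    {
        lpae_t *table = (lpae_t *)VIRT_LIN_P2M_START;

        return &table[addr >> PAGE_SHIFT];
    }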

2. Dirty bitmap
The dirty bitmap is used to mark the pages which are dirty during
migration. The info is used by Xen tools, via DOMCTL_SHADOW_OP_*,
to figure out which guest pages need to be resent.

Signed-off-by: Jaeyong Yoo <jaeyong.yoo@samsung.com>
Signed-off-by: Evgeny Fedotov <e.fedotov@samsung.com>
Signed-off-by: Wei Huang <w1.huang@samsung.com>
---
 xen/arch/arm/domain.c           |    6 +
 xen/arch/arm/domctl.c           |   31 +++-
 xen/arch/arm/mm.c               |  298 ++++++++++++++++++++++++++++++++++++++-
 xen/arch/arm/p2m.c              |  204 +++++++++++++++++++++++++++
 xen/arch/arm/traps.c            |    9 ++
 xen/include/asm-arm/config.h    |   12 +-
 xen/include/asm-arm/domain.h    |   19 +++
 xen/include/asm-arm/mm.h        |   23 +++
 xen/include/asm-arm/p2m.h       |    8 +-
 xen/include/asm-arm/processor.h |    2 +
 10 files changed, 599 insertions(+), 13 deletions(-)

Comments

Andrew Cooper May 8, 2014, 11:46 p.m. UTC | #1
On 08/05/2014 22:18, Wei Huang wrote:
> This patch implements log_dirty for ARM guest VMs. This feature
> is provided via two basic blocks: dirty_bit_map and VLPT
> (virtual-linear page table)
>
> 1. VLPT provides fast accessing of 3rd PTE of guest P2M.
> When creating a mapping for VLPT, the page table mapping
> becomes the following:
>    xen's 1st PTE --> xen's 2nd PTE --> guest p2m's 2nd PTE -->
>    guest p2m's 3rd PTE
>
> With VLPT, xen can immediately locate the 3rd PTE of guest P2M
> and modify PTE attirbute during dirty page tracking. The following
> link shows the performance comparison for handling a dirty-page
> between VLPT and typical page table walking.
> http://lists.xen.org/archives/html/xen-devel/2013-08/msg01503.html
>
> For more info about VLPT, please see
> http://www.technovelty.org/linux/virtual-linear-page-table.html.
>
> 2. Dirty bitmap
> The dirty bitmap is used to mark the pages which are dirty during
> migration. The info is used by Xen tools, via DOMCTL_SHADOW_OP_*,
> to figure out which guest pages need to be resent.
>
> Signed-off-by: Jaeyong Yoo <jaeyong.yoo@samsung.com>
> Signed-off-by: Evgeny Fedotov <e.fedotov@samsung.com>
> Signed-off-by: Wei Huang <w1.huang@samsung.com>
> ---
>  xen/arch/arm/domain.c           |    6 +
>  xen/arch/arm/domctl.c           |   31 +++-
>  xen/arch/arm/mm.c               |  298 ++++++++++++++++++++++++++++++++++++++-
>  xen/arch/arm/p2m.c              |  204 +++++++++++++++++++++++++++
>  xen/arch/arm/traps.c            |    9 ++
>  xen/include/asm-arm/config.h    |   12 +-
>  xen/include/asm-arm/domain.h    |   19 +++
>  xen/include/asm-arm/mm.h        |   23 +++
>  xen/include/asm-arm/p2m.h       |    8 +-
>  xen/include/asm-arm/processor.h |    2 +
>  10 files changed, 599 insertions(+), 13 deletions(-)
>
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 40f1c3a..2eb5ce0 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -208,6 +208,9 @@ static void ctxt_switch_to(struct vcpu *n)
>  
>      isb();
>  
> +    /* Dirty-page tracing */
> +    log_dirty_restore(n->domain);
> +
>      /* This is could trigger an hardware interrupt from the virtual
>       * timer. The interrupt needs to be injected into the guest. */
>      WRITE_SYSREG32(n->arch.cntkctl, CNTKCTL_EL1);
> @@ -504,6 +507,9 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
>      /* Default the virtual ID to match the physical */
>      d->arch.vpidr = boot_cpu_data.midr.bits;
>  
> +    /* Init log dirty support */
> +    log_dirty_init(d);
> +
>      clear_page(d->shared_info);
>      share_xen_page_with_guest(
>          virt_to_page(d->shared_info), d, XENSHARE_writable);
> diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
> index 45974e7..f1c34da 100644
> --- a/xen/arch/arm/domctl.c
> +++ b/xen/arch/arm/domctl.c
> @@ -10,30 +10,53 @@
>  #include <xen/errno.h>
>  #include <xen/sched.h>
>  #include <xen/hypercall.h>
> +#include <xen/guest_access.h>
>  #include <public/domctl.h>
>  
>  long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
>                      XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>  {
> +    long ret = 0;

'rc' is the more common name.

> +    bool_t copyback = 0;
> +
>      switch ( domctl->cmd )
>      {
> +    case XEN_DOMCTL_shadow_op:
> +    {
> +        ret = -EINVAL;
> +        copyback = 1;
> +
> +        if ( (d == current->domain) )   /* no domain_pause() */
> +            break;
> +
> +        domain_pause(d);
> +        ret = dirty_mode_op(d, &domctl->u.shadow_op);
> +        domain_unpause(d);
> +    }
> +    break;
> +
>      case XEN_DOMCTL_cacheflush:
>      {
>          unsigned long s = domctl->u.cacheflush.start_pfn;
>          unsigned long e = s + domctl->u.cacheflush.nr_pfns;
>  
>          if ( domctl->u.cacheflush.nr_pfns > (1U<<MAX_ORDER) )
> -            return -EINVAL;
> +            ret = -EINVAL;

This breaks the error handling.  The prevailing style would be:

rc = -EINVAL;
if ( something bad )
    goto out;
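
Applied to the cacheflush case being modified here, a minimal sketch of
that structure (assuming a single 'out' label at the bottom of
arch_do_domctl, and the variable renamed rc as suggested above) would be:

    case XEN_DOMCTL_cacheflush:
    {
        unsigned long s = domctl->u.cacheflush.start_pfn;
        unsigned long e = s + domctl->u.cacheflush.nr_pfns;

        rc = -EINVAL;
        if ( domctl->u.cacheflush.nr_pfns > (1U << MAX_ORDER) )
            goto out;
        if ( e < s )
            goto out;

        rc = p2m_cache_flush(d, s, e);
        break;
    }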

>  
>          if ( e < s )
> -            return -EINVAL;
> +            ret = -EINVAL;
>  
> -        return p2m_cache_flush(d, s, e);
> +        ret = p2m_cache_flush(d, s, e);
>      }
>  
>      default:
> -        return subarch_do_domctl(domctl, d, u_domctl);
> +        ret = subarch_do_domctl(domctl, d, u_domctl);
>      }
> +
> +    if ( copyback && __copy_to_guest(u_domctl, domctl, 1) )
> +        ret = -EFAULT;
> +
> +    return ret;
>  }
>  
>  void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
> index eac228c..81c0691 100644
> --- a/xen/arch/arm/mm.c
> +++ b/xen/arch/arm/mm.c
> @@ -865,7 +865,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e)
>      create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
>  }
>  
> -enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
>  static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
>  {
>      lpae_t pte;
> @@ -945,11 +944,6 @@ int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
>      return 0;
>  }
>  
> -unsigned long domain_get_maximum_gpfn(struct domain *d)
> -{
> -    return -ENOSYS;
> -}
> -
>  void share_xen_page_with_guest(struct page_info *page,
>                            struct domain *d, int readonly)
>  {
> @@ -1235,6 +1229,298 @@ int is_iomem_page(unsigned long mfn)
>          return 1;
>      return 0;
>  }
> +
> +
> +/* Return start and end addr of guest RAM. Note this function only reports 
> + * regular RAM. It does not cover other areas such as foreign mapped
> + * pages or MMIO space. */
> +void domain_get_ram_range(struct domain *d, paddr_t *start, paddr_t *end)

const struct domain *d;

> +{
> +    if ( start )
> +        *start = GUEST_RAM_BASE;
> +
> +    if ( end )
> +        *end = GUEST_RAM_BASE + ((paddr_t) d->max_pages << PAGE_SHIFT);
> +}
> +
> +/* Return the maximum GPFN of guest VM. It covers all guest memory types. */
> +unsigned long domain_get_maximum_gpfn(struct domain *d)
> +{
> +    struct p2m_domain *p2m = &d->arch.p2m;
> +
> +    return p2m->max_mapped_gfn;

This can be reduced to a single statement.

> +}
> +
> +/************************************/
> +/*    Dirty Page Tracking Support   */
> +/************************************/
> +/* Mark the bitmap for a corresponding page as dirty */
> +static inline void bitmap_mark_dirty(struct domain *d, paddr_t addr)
> +{
> +    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;

Useless cast

> +    int bit_index = PFN_DOWN(addr - ram_base);

This is liable to truncation, and should absolutely be unsigned.

> +    int page_index = bit_index >> (PAGE_SHIFT + 3);
> +    int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);

As should all of these.

> +
> +    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);
> +}
> +
> +/* Allocate dirty bitmap resource */
> +static int bitmap_init(struct domain *d)

This function name is far too generic.

> +{
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +    int nr_bytes;
> +    int nr_pages;
> +    int i;

Truncation and unsigned issues.  I will stop commenting on them now, but
most of this patch needs fixing.

> +
> +    domain_get_ram_range(d, &gma_start, &gma_end);
> +
> +    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
> +    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
> +
> +    BUG_ON(nr_pages > MAX_DIRTY_BITMAP_PAGES);

This looks like it should be an init() failure, or BUILD_BUG_ON().
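
A sketch of the former, since nr_pages here depends on the guest's
max_pages and so is only known at runtime (error value illustrative):

    if ( nr_pages > MAX_DIRTY_BITMAP_PAGES )
        return -EINVAL;    /* guest RAM too large for the dirty bitmap */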

> +
> +    for ( i = 0; i < nr_pages; ++i )
> +    {
> +        struct page_info *page;
> +        page = alloc_domheap_page(NULL, 0);

Those two lines can be combined, and need a blank line following them.

> +        if ( page == NULL )
> +            goto cleanup_on_failure;
> +
> +        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));

__map_domain_page_global(page) is your friend, and it can fail so needs
checking.

How many pages is this?  Global domheap mappings are scarce.

This function would become substantially more trivial if Xen had a
zalloc_domheap_pages() helper.  There is quite a bit of other code which
could take advantage.
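
Such a helper doesn't exist in the tree today; a rough sketch of what it
could look like (name and exact shape purely illustrative):

    /* Hypothetical: allocate 2^order domheap pages and clear them */
    static struct page_info *zalloc_domheap_pages(struct domain *d,
                                                  unsigned int order,
                                                  unsigned int memflags)
    {
        struct page_info *pg = alloc_domheap_pages(d, order, memflags);
        unsigned int i;

        if ( pg == NULL )
            return NULL;

        for ( i = 0; i < (1u << order); i++ )
        {
            void *p = __map_domain_page(pg + i);

            clear_page(p);
            unmap_domain_page(p);
        }

        return pg;
    }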

> +        clear_page(d->arch.dirty.bitmap[i]);
> +    }
> +
> +    d->arch.dirty.bitmap_pages = nr_pages;
> +    return 0;
> +
> +cleanup_on_failure:
> +    nr_pages = i;
> +    for ( i = 0; i < nr_pages; ++i )
> +    {
> +        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
> +    }
> +
> +    return -ENOMEM;
> +}
> +
> +/* Cleanup dirty bitmap resource */
> +static void bitmap_cleanup(struct domain *d)
> +{
> +    int i;
> +
> +    for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +    {
> +        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
> +    }
> +}
> +
> +/* Flush VLPT area */
> +static void vlpt_flush(struct domain *d)
> +{
> +    int flush_size;
> +    flush_size = (d->arch.dirty.second_lvl_end - 
> +                  d->arch.dirty.second_lvl_start) << SECOND_SHIFT;
> +
> +    /* flushing the 3rd level mapping */
> +    flush_xen_data_tlb_range_va(d->arch.dirty.second_lvl_start << SECOND_SHIFT,
> +                                flush_size);
> +}
> +
> +/* Set up a page table for VLPT mapping */
> +static int vlpt_init(struct domain *d)
> +{
> +    uint64_t required, avail = VIRT_LIN_P2M_END - VIRT_LIN_P2M_START;
> +    int xen_second_linear_base;
> +    int gp2m_start_index, gp2m_end_index;
> +    struct p2m_domain *p2m = &d->arch.p2m;
> +    struct page_info *second_lvl_page;
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +    lpae_t *first[2];
> +    int i;
> +
> +    /* Check if reserved space is enough to cover guest physical address space.
> +     * Note that each LPAE page table entry is 64-bit (8 bytes). So we only
> +     * shift left with LPAE_SHIFT instead of PAGE_SHIFT. */
> +    domain_get_ram_range(d, &gma_start, &gma_end);
> +    required = (gma_end - gma_start) >> LPAE_SHIFT;
> +    if ( required > avail )
> +    {
> +        dprintk(XENLOG_ERR, "Available VLPT is small for domU guest (avail: "
> +                "%#llx, required: %#llx)\n", (unsigned long long)avail,

PRIx64 please, and lose the casts.

> +                (unsigned long long)required);
> +        return -ENOMEM;
> +    }
> +
> +    /* Caulculate the base of 2nd linear table base for VIRT_LIN_P2M_START */
> +    xen_second_linear_base = second_linear_offset(VIRT_LIN_P2M_START);
> +
> +    gp2m_start_index = gma_start >> FIRST_SHIFT;
> +    gp2m_end_index = (gma_end >> FIRST_SHIFT) + 1;
> +
> +    if ( xen_second_linear_base + gp2m_end_index >= LPAE_ENTRIES * 2 )
> +    {
> +        dprintk(XENLOG_ERR, "xen second page is small for VLPT for domU");
> +        return -ENOMEM;
> +    }
> +
> +    /* Two pages are allocated to backup the related PTE content of guest 
> +     * VM's 1st-level table. */
> +    second_lvl_page = alloc_domheap_pages(NULL, 1, 0);
> +    if ( second_lvl_page == NULL )
> +        return -ENOMEM;
> +    d->arch.dirty.second_lvl[0] = map_domain_page_global(
> +        page_to_mfn(second_lvl_page) );
> +    d->arch.dirty.second_lvl[1] = map_domain_page_global(
> +        page_to_mfn(second_lvl_page+1) );
> +
> +    /* 1st level P2M of guest VM is 2 consecutive pages */
> +    first[0] = __map_domain_page(p2m->first_level);
> +    first[1] = __map_domain_page(p2m->first_level+1);

spaces around binary operators.

> +
> +    for ( i = gp2m_start_index; i < gp2m_end_index; ++i )
> +    {
> +        int k = i % LPAE_ENTRIES;
> +        int l = i / LPAE_ENTRIES;
> +        int k2 = (xen_second_linear_base + i) % LPAE_ENTRIES;
> +        int l2 = (xen_second_linear_base + i) / LPAE_ENTRIES;
> +
> +        /* Update 2nd-level PTE of Xen linear table. With this, Xen linear 
> +         * page table layout becomes: 1st Xen linear ==> 2nd Xen linear ==> 
> +         * 2nd guest P2M (i.e. 3rd Xen linear) ==> 3rd guest P2M (i.e. Xen 
> +         * linear content) for VIRT_LIN_P2M_START address space. */
> +        write_pte(&xen_second[xen_second_linear_base+i], first[l][k]);
> +
> +        /* We copy the mapping into domain's structure as a reference
> +         * in case of the context switch (used in vlpt_restore function ) */
> +        d->arch.dirty.second_lvl[l2][k2] = first[l][k];
> +    }
> +    unmap_domain_page(first[0]);
> +    unmap_domain_page(first[1]);
> +
> +    /* storing the start and end index */
> +    d->arch.dirty.second_lvl_start = xen_second_linear_base + gp2m_start_index;
> +    d->arch.dirty.second_lvl_end = xen_second_linear_base + gp2m_end_index;
> +
> +    vlpt_flush(d);
> +
> +    return 0;
> +}
> +
> +static void vlpt_cleanup(struct domain *d)
> +{
> +    /* First level p2m is 2 consecutive pages */
> +    unmap_domain_page_global(d->arch.dirty.second_lvl[0]);
> +    unmap_domain_page_global(d->arch.dirty.second_lvl[1]);
> +}
> +
> +/* Returns zero if addr is not valid or dirty mode is not set */
> +int handle_page_fault(struct domain *d, paddr_t addr)
> +{
> +    lpae_t *vlp2m_pte = 0;
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +
> +    if ( !d->arch.dirty.mode )
> +        return 0;
> +
> +    domain_get_ram_range(d, &gma_start, &gma_end);
> +
> +    /* Ensure that addr is inside guest's RAM */
> +    if ( addr < gma_start || addr > gma_end )
> +        return 0;
> +
> +    vlp2m_pte = vlpt_get_3lvl_pte(addr);
> +    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
> +         vlp2m_pte->p2m.type == p2m_ram_logdirty )
> +    {
> +        lpae_t pte = *vlp2m_pte;
> +        pte.p2m.write = 1;
> +        write_pte(vlp2m_pte, pte);
> +        flush_tlb_local();
> +
> +        /* only necessary to lock between get-dirty bitmap and mark dirty
> +         * bitmap. If get-dirty bitmap happens immediately before this
> +         * lock, the corresponding dirty-page would be marked at the next
> +         * round of get-dirty bitmap */
> +        spin_lock(&d->arch.dirty.lock);
> +        bitmap_mark_dirty(d, addr);
> +        spin_unlock(&d->arch.dirty.lock);
> +    }
> +
> +    return 1;
> +}
> +
> +/* Restore the xen page table for vlpt mapping for domain */
> +void log_dirty_restore(struct domain *d)
> +{
> +    int i;
> +
> +    /* Nothing to do as log dirty mode is off */
> +    if ( !(d->arch.dirty.mode) )

superfluous brackets.

> +        return;
> +
> +    dsb(sy);
> +
> +    for ( i = d->arch.dirty.second_lvl_start; i < d->arch.dirty.second_lvl_end;
> +          ++i )
> +    {
> +        int k = i % LPAE_ENTRIES;
> +        int l = i / LPAE_ENTRIES;
> +
> +        if ( xen_second[i].bits != d->arch.dirty.second_lvl[l][k].bits )
> +        {
> +            write_pte(&xen_second[i], d->arch.dirty.second_lvl[l][k]);
> +            flush_xen_data_tlb_range_va(i << SECOND_SHIFT, 1 << SECOND_SHIFT);
> +        }
> +    }
> +
> +    dsb(sy);
> +    isb();
> +}
> +
> +/* Turn on log dirty */
> +int log_dirty_on(struct domain *d)
> +{
> +    if ( vlpt_init(d) || bitmap_init(d) )
> +        return -EINVAL;

This hides -ENOMEM from each of the init functions.

I am a fan of

return vlpt_init(d) ?: bitmap_init(d);

As an easy way of chaining a set of functions together if they succeed. 
Ian on the other hand isn't so I doubt you could get away with it.

> +
> +    return 0;
> +}
> +
> +/* Turn off log dirty */
> +void log_dirty_off(struct domain *d)
> +{
> +    bitmap_cleanup(d);
> +    vlpt_cleanup(d);
> +}
> +
> +/* Initialize log dirty fields */
> +int log_dirty_init(struct domain *d)
> +{
> +    d->arch.dirty.count = 0;
> +    d->arch.dirty.mode = 0;
> +    spin_lock_init(&d->arch.dirty.lock);
> +
> +    d->arch.dirty.second_lvl_start = 0;
> +    d->arch.dirty.second_lvl_end = 0;
> +    d->arch.dirty.second_lvl[0] = NULL;
> +    d->arch.dirty.second_lvl[1] = NULL;
> +
> +    memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));
> +    d->arch.dirty.bitmap_pages = 0;
> +
> +    return 0;
> +}
> +
> +/* Log dirty tear down */
> +void log_dirty_teardown(struct domain *d)
> +{
> +    return;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index 603c097..0808cc9 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -6,6 +6,8 @@
>  #include <xen/bitops.h>
>  #include <asm/flushtlb.h>
>  #include <asm/gic.h>
> +#include <xen/guest_access.h>
> +#include <xen/pfn.h>
>  #include <asm/event.h>
>  #include <asm/hardirq.h>
>  #include <asm/page.h>
> @@ -208,6 +210,7 @@ static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
>          break;
>  
>      case p2m_ram_ro:
> +    case p2m_ram_logdirty:
>          e.p2m.xn = 0;
>          e.p2m.write = 0;
>          break;
> @@ -261,6 +264,10 @@ static int p2m_create_table(struct domain *d,
>  
>      pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);
>  
> +    /* mark the write bit (page table's case, ro bit) as 0
> +     * so, it is writable in case of vlpt access */
> +    pte.pt.ro = 0;
> +
>      write_pte(entry, pte);
>  
>      return 0;
> @@ -696,6 +703,203 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
>      return p >> PAGE_SHIFT;
>  }
>  
> +/* Change types across all p2m entries in a domain */
> +void p2m_change_entry_type_global(struct domain *d, enum mg nt)
> +{
> +    struct p2m_domain *p2m = &d->arch.p2m;
> +    paddr_t ram_base;
> +    int i1, i2, i3;
> +    int first_index, second_index, third_index;
> +    lpae_t *first = __map_domain_page(p2m->first_level);
> +    lpae_t pte, *second = NULL, *third = NULL;
> +
> +    domain_get_ram_range(d, &ram_base, NULL);
> +
> +    first_index = first_table_offset((uint64_t)ram_base);
> +    second_index = second_table_offset((uint64_t)ram_base);
> +    third_index = third_table_offset((uint64_t)ram_base);
> +
> +    BUG_ON(!first);
> +
> +    spin_lock(&p2m->lock);
> +
> +    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
> +    {
> +        lpae_walk_t first_pte = first[i1].walk;
> +        if ( !first_pte.valid || !first_pte.table )
> +            goto out;
> +
> +        second = map_domain_page(first_pte.base);
> +        BUG_ON(!second);

map_domain_page() can't fail.

> +
> +        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
> +        {
> +            lpae_walk_t second_pte = second[i2].walk;
> +
> +            if ( !second_pte.valid || !second_pte.table )
> +                goto out;
> +
> +            third = map_domain_page(second_pte.base);
> +            BUG_ON(!third);
> +
> +            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
> +            {
> +                lpae_walk_t third_pte = third[i3].walk;
> +
> +                if ( !third_pte.valid )
> +                    goto out;
> +
> +                pte = third[i3];
> +
> +                if ( nt == mg_ro )
> +                {
> +                    if ( pte.p2m.write == 1 )
> +                    {
> +                        pte.p2m.write = 0;
> +                        pte.p2m.type = p2m_ram_logdirty;
> +                    }
> +                    else
> +                    {
> +                        /* reuse avail bit as an indicator of 'actual' 
> +                         * read-only */
> +                        pte.p2m.type = p2m_ram_rw;
> +                    }
> +                }
> +                else if ( nt == mg_rw )
> +                {
> +                    if ( pte.p2m.write == 0 && 
> +                         pte.p2m.type == p2m_ram_logdirty )
> +                    {
> +                        pte.p2m.write = p2m_ram_rw;
> +                    }
> +                }
> +                write_pte(&third[i3], pte);
> +            }
> +            unmap_domain_page(third);
> +
> +            third = NULL;
> +            third_index = 0;
> +        }
> +        unmap_domain_page(second);
> +
> +        second = NULL;
> +        second_index = 0;
> +        third_index = 0;
> +    }
> +
> +out:
> +    flush_tlb_all_local();
> +    if ( third ) unmap_domain_page(third);
> +    if ( second ) unmap_domain_page(second);
> +    if ( first ) unmap_domain_page(first);
> +
> +    spin_unlock(&p2m->lock);
> +}
> +
> +/* Read a domain's log-dirty bitmap and stats. If the operation is a CLEAN, 
> + * clear the bitmap and stats. */
> +int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
> +{
> +    int peek = 1;
> +    int i;
> +    int bitmap_size;
> +    paddr_t gma_start, gma_end;
> +
> +    /* this hypercall is called from domain 0, and we don't know which guest's
> +     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
> +    log_dirty_restore(d);
> +
> +    domain_get_ram_range(d, &gma_start, &gma_end);
> +    bitmap_size = (gma_end - gma_start) / 8;
> +
> +    if ( guest_handle_is_null(sc->dirty_bitmap) )
> +    {
> +        peek = 0;
> +    }
> +    else
> +    {
> +        spin_lock(&d->arch.dirty.lock);
> +
> +        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +        {
> +            int j = 0;
> +            uint8_t *bitmap;
> +
> +            copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
> +                                 d->arch.dirty.bitmap[i],
> +                                 bitmap_size < PAGE_SIZE ? bitmap_size :
> +                                                           PAGE_SIZE);
> +            bitmap_size -= PAGE_SIZE;
> +
> +            /* set p2m page table read-only */
> +            bitmap = d->arch.dirty.bitmap[i];
> +            while ((j = find_next_bit((const long unsigned int *)bitmap,
> +                                      PAGE_SIZE*8, j)) < PAGE_SIZE*8)
> +            {
> +                lpae_t *vlpt;
> +                paddr_t addr = gma_start + (i << (2*PAGE_SHIFT+3)) +
> +                    (j << PAGE_SHIFT);
> +                vlpt = vlpt_get_3lvl_pte(addr);
> +                vlpt->p2m.write = 0;
> +                j++;
> +            }
> +        }
> +
> +        if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
> +        {
> +            for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +            {
> +                clear_page(d->arch.dirty.bitmap[i]);
> +            }
> +        }
> +
> +        spin_unlock(&d->arch.dirty.lock);
> +        flush_tlb_local();
> +    }
> +
> +    sc->stats.dirty_count = d->arch.dirty.count;
> +
> +    return 0;
> +}
> +
> +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
> +{
> +    long ret = 0;
> +    switch (sc->op)
> +    {
> +        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
> +        case XEN_DOMCTL_SHADOW_OP_OFF:
> +        {
> +            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
> +
> +            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
> +            p2m_change_entry_type_global(d, nt);
> +
> +            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
> +            {
> +                log_dirty_off(d);
> +            }
> +            else
> +            {
> +                if ( (ret = log_dirty_on(d)) )
> +                    return ret;
> +            }
> +        }
> +        break;
> +
> +        case XEN_DOMCTL_SHADOW_OP_CLEAN:
> +        case XEN_DOMCTL_SHADOW_OP_PEEK:
> +        {
> +            ret = log_dirty_op(d, sc);
> +        }
> +        break;
> +
> +        default:
> +            return -ENOSYS;
> +    }
> +    return ret;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index df4d375..b652565 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -1556,6 +1556,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
>      struct hsr_dabt dabt = hsr.dabt;
>      int rc;
>      mmio_info_t info;
> +    int page_fault = ( dabt.write && ((dabt.dfsc & FSC_MASK) == 
> +                                      (FSC_FLT_PERM|FSC_3RD_LEVEL)) );

This looks like a bool_t to me.

~Andrew

>  
>      if ( !check_conditional_instr(regs, hsr) )
>      {
> @@ -1577,6 +1579,13 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
>      if ( rc == -EFAULT )
>          goto bad_data_abort;
>  
> +    /* domU page fault handling for guest live migration. Note that 
> +     * dabt.valid can be 0 here */
> +    if ( page_fault && handle_page_fault(current->domain, info.gpa) )
> +    {
> +        /* Do not modify PC as guest needs to repeat memory operation */
> +        return;
> +    }
>      /* XXX: Decode the instruction if ISS is not valid */
>      if ( !dabt.valid )
>          goto bad_data_abort;
> diff --git a/xen/include/asm-arm/config.h b/xen/include/asm-arm/config.h
> index ef291ff..f18fae4 100644
> --- a/xen/include/asm-arm/config.h
> +++ b/xen/include/asm-arm/config.h
> @@ -87,6 +87,7 @@
>   *   0  -   8M   <COMMON>
>   *
>   *  32M - 128M   Frametable: 24 bytes per page for 16GB of RAM
> + * 128M - 256M   Virtual-linear mapping to P2M table
>   * 256M -   1G   VMAP: ioremap and early_ioremap use this virtual address
>   *                    space
>   *
> @@ -124,13 +125,15 @@
>  #define CONFIG_SEPARATE_XENHEAP 1
>  
>  #define FRAMETABLE_VIRT_START  _AT(vaddr_t,0x02000000)
> -#define VMAP_VIRT_START  _AT(vaddr_t,0x10000000)
> +#define VIRT_LIN_P2M_START     _AT(vaddr_t,0x08000000)
> +#define VMAP_VIRT_START        _AT(vaddr_t,0x10000000)
> +#define VIRT_LIN_P2M_END       VMAP_VIRT_START
>  #define XENHEAP_VIRT_START     _AT(vaddr_t,0x40000000)
>  #define XENHEAP_VIRT_END       _AT(vaddr_t,0x7fffffff)
>  #define DOMHEAP_VIRT_START     _AT(vaddr_t,0x80000000)
>  #define DOMHEAP_VIRT_END       _AT(vaddr_t,0xffffffff)
>  
> -#define VMAP_VIRT_END    XENHEAP_VIRT_START
> +#define VMAP_VIRT_END          XENHEAP_VIRT_START
>  
>  #define DOMHEAP_ENTRIES        1024  /* 1024 2MB mapping slots */
>  
> @@ -157,6 +160,11 @@
>  
>  #define HYPERVISOR_VIRT_END    DIRECTMAP_VIRT_END
>  
> +/* Definition for VIRT_LIN_P2M_START and VIRT_LIN_P2M_END (64-bit)
> + * TODO: Needs evaluation. */
> +#define VIRT_LIN_P2M_START     _AT(vaddr_t, 0x08000000)
> +#define VIRT_LIN_P2M_END       VMAP_VIRT_START
> +
>  #endif
>  
>  /* Fixmap slots */
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index aabeb51..ac82643 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -162,6 +162,25 @@ struct arch_domain
>      } vuart;
>  
>      unsigned int evtchn_irq;
> +
> +    /* dirty page tracing */
> +    struct {
> +        spinlock_t lock;
> +        volatile int mode;               /* 1 if dirty pages tracing enabled */
> +        volatile unsigned int count;     /* dirty pages counter */
> +
> +        /* vlpt context switch */
> +        volatile int second_lvl_start; /* start idx of virt linear space 2nd */
> +        volatile int second_lvl_end;   /* end idx of virt linear space 2nd */
> +        lpae_t *second_lvl[2];         /* copy of guest P2M 1st-lvl content */
> +
> +        /* bitmap to track dirty pages */
> +#define MAX_DIRTY_BITMAP_PAGES 64
> +        /* Because each bit represents a dirty page, the total supported guest 
> +         * memory is (64 entries x 4KB/entry x 8bits/byte x 4KB) = 8GB. */
> +        uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES]; /* dirty bitmap */
> +        int bitmap_pages;                        /* # of dirty bitmap pages */
> +    } dirty;
>  }  __cacheline_aligned;
>  
>  struct arch_vcpu
> diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
> index b8d4e7d..ab19025 100644
> --- a/xen/include/asm-arm/mm.h
> +++ b/xen/include/asm-arm/mm.h
> @@ -4,6 +4,7 @@
>  #include <xen/config.h>
>  #include <xen/kernel.h>
>  #include <asm/page.h>
> +#include <asm/config.h>
>  #include <public/xen.h>
>  
>  /* Align Xen to a 2 MiB boundary. */
> @@ -320,6 +321,7 @@ int donate_page(
>  #define domain_clamp_alloc_bitsize(d, b) (b)
>  
>  unsigned long domain_get_maximum_gpfn(struct domain *d);
> +void domain_get_ram_range(struct domain *d, paddr_t *start, paddr_t *end);
>  
>  extern struct domain *dom_xen, *dom_io, *dom_cow;
>  
> @@ -341,6 +343,27 @@ static inline void put_page_and_type(struct page_info *page)
>      put_page(page);
>  }
>  
> +enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
> +
> +/************************************/
> +/*    Log-dirty support functions   */
> +/************************************/
> +int log_dirty_on(struct domain *d);
> +void log_dirty_off(struct domain *d);
> +int log_dirty_init(struct domain *d);
> +void log_dirty_teardown(struct domain *d);
> +void log_dirty_restore(struct domain *d);
> +int handle_page_fault(struct domain *d, paddr_t addr);
> +/* access leaf PTE of a given guest address (GPA) */
> +static inline lpae_t * vlpt_get_3lvl_pte(paddr_t addr)
> +{
> +    lpae_t *table = (lpae_t *)VIRT_LIN_P2M_START;
> +
> +    /* Since we slotted the guest's first p2m page table to xen's
> +     * second page table, one shift is enough for calculating the
> +     * index of guest p2m table entry */
> +    return &table[addr >> PAGE_SHIFT];
> +}
>  #endif /*  __ARCH_ARM_MM__ */
>  /*
>   * Local variables:
> diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
> index bd71abe..0cecbe7 100644
> --- a/xen/include/asm-arm/p2m.h
> +++ b/xen/include/asm-arm/p2m.h
> @@ -2,6 +2,7 @@
>  #define _XEN_P2M_H
>  
>  #include <xen/mm.h>
> +#include <public/domctl.h>
>  
>  struct domain;
>  
> @@ -41,6 +42,7 @@ typedef enum {
>      p2m_invalid = 0,    /* Nothing mapped here */
>      p2m_ram_rw,         /* Normal read/write guest RAM */
>      p2m_ram_ro,         /* Read-only; writes are silently dropped */
> +    p2m_ram_logdirty,   /* Read-only: special mode for log dirty */
>      p2m_mmio_direct,    /* Read/write mapping of genuine MMIO area */
>      p2m_map_foreign,    /* Ram pages from foreign domain */
>      p2m_grant_map_rw,   /* Read/write grant mapping */
> @@ -49,7 +51,8 @@ typedef enum {
>  } p2m_type_t;
>  
>  #define p2m_is_foreign(_t)  ((_t) == p2m_map_foreign)
> -#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro)
> +#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro ||  \
> +                             (_t) == p2m_ram_logdirty)
>  
>  /* Initialise vmid allocator */
>  void p2m_vmid_allocator_init(void);
> @@ -178,6 +181,9 @@ static inline int get_page_and_type(struct page_info *page,
>      return rc;
>  }
>  
> +void p2m_change_entry_type_global(struct domain *d, enum mg nt);
> +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
> +
>  #endif /* _XEN_P2M_H */
>  
>  /*
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index 750864a..0bf3d67 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -407,6 +407,8 @@ union hsr {
>  #define FSC_CPR        (0x3a) /* Coprocossor Abort */
>  
>  #define FSC_LL_MASK    (_AC(0x03,U)<<0)
> +#define FSC_MASK       (0x3f) /* Fault status mask */
> +#define FSC_3RD_LEVEL  (0x03) /* Third level fault */
>  
>  /* Time counter hypervisor control register */
>  #define CNTHCTL_PA      (1u<<0)  /* Kernel/user access to physical counter */
Ian Campbell May 14, 2014, 11:51 a.m. UTC | #2
On Fri, 2014-05-09 at 00:46 +0100, Andrew Cooper wrote:
> > +/* Allocate dirty bitmap resource */
> > +static int bitmap_init(struct domain *d)
> 
> This function name is far too generic.

It might be OK if it were in p2m.c rather than mm.c, or even better
would be to create logdirty.c.

> > +        if ( page == NULL )
> > +            goto cleanup_on_failure;
> > +
> > +        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));
> 
> __map_domain_page_global(page) is your friend, and it can fail so needs
> checking.
> 
> How many pages is this?  Global domheap mappings are scarce.

It's not so scarce on ARM32 as on x86, IIRC we have 2GB of domheap
space. Not to say we should be profligate with it, but it is less of a
concern than on x86 I think.

> > +int log_dirty_on(struct domain *d)
> > +{
> > +    if ( vlpt_init(d) || bitmap_init(d) )
> > +        return -EINVAL;
> 
> This hides -ENOMEM from each of the init functions.
> 
> I am a fan of
> 
> return vlpt_init(d) ?: bitmap_init(d);
> 
> As an easy way of chaining a set of functions together if they succeed. 
> Ian on the other hand isn't so I doubt you could get away with it.

:-)

I'd prefer:
	rc = foo_init(d)
	if (rc)
		return rc

I'd also be happy enough with if ((rc = foo_init(d))
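
Applied to log_dirty_on() that would look something like (sketch only;
note it also unwinds vlpt_init() if bitmap_init() fails, which the
original does not):

    int log_dirty_on(struct domain *d)
    {
        int rc;

        rc = vlpt_init(d);
        if ( rc )
            return rc;

        rc = bitmap_init(d);
        if ( rc )
        {
            vlpt_cleanup(d);
            return rc;
        }

        return 0;
    }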

Ian.
Ian Campbell May 14, 2014, 1:18 p.m. UTC | #3
On Thu, 2014-05-08 at 16:18 -0500, Wei Huang wrote:
> This patch implements log_dirty for ARM guest VMs. This feature
> is provided via two basic blocks: dirty_bit_map and VLPT
> (virtual-linear page table)

> 1. VLPT provides fast accessing of 3rd PTE of guest P2M.
> When creating a mapping for VLPT, the page table mapping
> becomes the following:
>    xen's 1st PTE --> xen's 2nd PTE --> guest p2m's 2nd PTE -->
>    guest p2m's 3rd PTE

I think "xen's 2nd PTE" here literally means the xen_second[] page
table? As discussed with Jaeyong this is shared between all PCPUs, which
means that only a single domain can be migrated at any one time.

> 
> With VLPT, xen can immediately locate the 3rd PTE of guest P2M
> and modify PTE attirbute during dirty page tracking. The following

"attribute"

> link shows the performance comparison for handling a dirty-page
> between VLPT and typical page table walking.
> http://lists.xen.org/archives/html/xen-devel/2013-08/msg01503.html

Can you inline the results here please.


> +/* Mark the bitmap for a corresponding page as dirty */
> +static inline void bitmap_mark_dirty(struct domain *d, paddr_t addr)
> +{
> +    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;
> +    int bit_index = PFN_DOWN(addr - ram_base);
> +    int page_index = bit_index >> (PAGE_SHIFT + 3);
> +    int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);

I queried this magic +3 on Jaeyong's v5 posting of this functionality
too.

> +
> +    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);
> +}
> +
> +/* Allocate dirty bitmap resource */
> +static int bitmap_init(struct domain *d)
> +{
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +    int nr_bytes;
> +    int nr_pages;
> +    int i;
> +
> +    domain_get_ram_range(d, &gma_start, &gma_end);
> +
> +    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
> +    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;

DIV_ROUNDUP?
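
i.e., assuming the usual DIV_ROUND_UP() macro (or equivalent):

    nr_bytes = DIV_ROUND_UP(PFN_DOWN(gma_end - gma_start), 8);
    nr_pages = DIV_ROUND_UP(nr_bytes, PAGE_SIZE);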

> +
> +    BUG_ON(nr_pages > MAX_DIRTY_BITMAP_PAGES);
> +
> +    for ( i = 0; i < nr_pages; ++i )
> +    {
> +        struct page_info *page;
> +        page = alloc_domheap_page(NULL, 0);
> +        if ( page == NULL )
> +            goto cleanup_on_failure;
> +
> +        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));
> +        clear_page(d->arch.dirty.bitmap[i]);
> +    }
> +
> +    d->arch.dirty.bitmap_pages = nr_pages;
> +    return 0;
> +
> +cleanup_on_failure:

This would more normally be called out or err.

> +    nr_pages = i;
> +    for ( i = 0; i < nr_pages; ++i )

I think people normally do this with a while counting backwards.
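
i.e. something like:

    cleanup_on_failure:
        while ( i-- )
            unmap_domain_page_global(d->arch.dirty.bitmap[i]);

        return -ENOMEM;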

> +    {
> +        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
> +    }

Coding Style doesn't need {}'s around single-line statements like this.

> +
> +    return -ENOMEM;
> +}
> +
> +/* Cleanup dirty bitmap resource */
> +static void bitmap_cleanup(struct domain *d)
> +{
> +    int i;
> +
> +    for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +    {
> +        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
> +    }
> +}
> +
> +/* Flush VLPT area */
> +static void vlpt_flush(struct domain *d)
> +{
> +    int flush_size;
> +    flush_size = (d->arch.dirty.second_lvl_end - 
> +                  d->arch.dirty.second_lvl_start) << SECOND_SHIFT;
> +
> +    /* flushing the 3rd level mapping */
> +    flush_xen_data_tlb_range_va(d->arch.dirty.second_lvl_start << SECOND_SHIFT,
> +                                flush_size);
> +}
> +
> +/* Set up a page table for VLPT mapping */
> +static int vlpt_init(struct domain *d)
> +{
> +    uint64_t required, avail = VIRT_LIN_P2M_END - VIRT_LIN_P2M_START;
> +    int xen_second_linear_base;
> +    int gp2m_start_index, gp2m_end_index;
> +    struct p2m_domain *p2m = &d->arch.p2m;
> +    struct page_info *second_lvl_page;
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +    lpae_t *first[2];
> +    int i;
> +
> +    /* Check if reserved space is enough to cover guest physical address space.
> +     * Note that each LPAE page table entry is 64-bit (8 bytes). So we only
> +     * shift left with LPAE_SHIFT instead of PAGE_SHIFT. */
> +    domain_get_ram_range(d, &gma_start, &gma_end);
> +    required = (gma_end - gma_start) >> LPAE_SHIFT;
> +    if ( required > avail )
> +    {
> +        dprintk(XENLOG_ERR, "Available VLPT is small for domU guest (avail: "

"...is too small...".

What is the size limit here? We can probably accept a reasonable limit
for 32-bit guests, but for 64-bit guests we might need to consider other
options. I have patches which enable up to 1TB guests for 64-bit, and I
would expect that to grow again sooner rather than later.

This will need doing differently for arm64 anyway since there are no
per-pcpu page tables there. But there is plenty of address space so
there can probably be a linear area per pcpu, which depending on the
size of the VLPT might do, otherwise we might need to switch to per-pcpu
page tables.

> +                "%#llx, required: %#llx)\n", (unsigned long long)avail,
> +                (unsigned long long)required);
> +        return -ENOMEM;
> +    }
> +
> +    /* Caulculate the base of 2nd linear table base for VIRT_LIN_P2M_START */
> +    xen_second_linear_base = second_linear_offset(VIRT_LIN_P2M_START);
> +
> +    gp2m_start_index = gma_start >> FIRST_SHIFT;
> +    gp2m_end_index = (gma_end >> FIRST_SHIFT) + 1;
> +
> +    if ( xen_second_linear_base + gp2m_end_index >= LPAE_ENTRIES * 2 )
> +    {
> +        dprintk(XENLOG_ERR, "xen second page is small for VLPT for domU");
> +        return -ENOMEM;
> +    }
> +
> +    /* Two pages are allocated to backup the related PTE content of guest 
> +     * VM's 1st-level table. */

By "backup" do you mean context switch?

> +    second_lvl_page = alloc_domheap_pages(NULL, 1, 0);

> +    if ( second_lvl_page == NULL )
> +        return -ENOMEM;
> +    d->arch.dirty.second_lvl[0] = map_domain_page_global(
> +        page_to_mfn(second_lvl_page) );
> +    d->arch.dirty.second_lvl[1] = map_domain_page_global(
> +        page_to_mfn(second_lvl_page+1) );
> +
> +    /* 1st level P2M of guest VM is 2 consecutive pages */

Note that 4 level P2M with no concatenated level zero is on the cards
for arm64 soon.

> +    first[0] = __map_domain_page(p2m->first_level);
> +    first[1] = __map_domain_page(p2m->first_level+1);
> +
> +    for ( i = gp2m_start_index; i < gp2m_end_index; ++i )
> +    {
> +        int k = i % LPAE_ENTRIES;
> +        int l = i / LPAE_ENTRIES;
> +        int k2 = (xen_second_linear_base + i) % LPAE_ENTRIES;
> +        int l2 = (xen_second_linear_base + i) / LPAE_ENTRIES;
> +
> +        /* Update 2nd-level PTE of Xen linear table. With this, Xen linear 
> +         * page table layout becomes: 1st Xen linear ==> 2nd Xen linear ==> 
> +         * 2nd guest P2M (i.e. 3rd Xen linear) ==> 3rd guest P2M (i.e. Xen 
> +         * linear content) for VIRT_LIN_P2M_START address space. */
> +        write_pte(&xen_second[xen_second_linear_base+i], first[l][k]);

This has two barriers for each write, but you only actually need one
before the loop and one after, i.e. dsb() + copy_page() + dsb()
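
Roughly (sketch only; plain stores inside the loop, a single barrier
pair around it):

    dsb(sy);

    for ( i = gp2m_start_index; i < gp2m_end_index; ++i )
    {
        int k = i % LPAE_ENTRIES;
        int l = i / LPAE_ENTRIES;
        int k2 = (xen_second_linear_base + i) % LPAE_ENTRIES;
        int l2 = (xen_second_linear_base + i) / LPAE_ENTRIES;

        /* plain stores; the single dsb below publishes the lot */
        xen_second[xen_second_linear_base + i] = first[l][k];
        d->arch.dirty.second_lvl[l2][k2] = first[l][k];
    }

    dsb(sy);
    isb();
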
> +
> +        /* We copy the mapping into domain's structure as a reference
> +         * in case of the context switch (used in vlpt_restore function ) */

This is to avoid having to map the p2m pages on context switch I think.
But in order to do that you have to create a permanent mapping of two
other fresh pages to create a "cache". Why not just map the 2 actual p2m
pages instead?

Having done that I wonder how much of this loop can then be shared with
the context switcher?

> +        d->arch.dirty.second_lvl[l2][k2] = first[l][k];
> +    }
> +    unmap_domain_page(first[0]);
> +    unmap_domain_page(first[1]);
> +
> +    /* storing the start and end index */
> +    d->arch.dirty.second_lvl_start = xen_second_linear_base + gp2m_start_index;
> +    d->arch.dirty.second_lvl_end = xen_second_linear_base + gp2m_end_index;
> +
> +    vlpt_flush(d);
> +
> +    return 0;
> +}
> +
> +static void vlpt_cleanup(struct domain *d)
> +{
> +    /* First level p2m is 2 consecutive pages */
> +    unmap_domain_page_global(d->arch.dirty.second_lvl[0]);
> +    unmap_domain_page_global(d->arch.dirty.second_lvl[1]);
> +}
> +
> +/* Returns zero if addr is not valid or dirty mode is not set */
> +int handle_page_fault(struct domain *d, paddr_t addr)
> +{
> +    lpae_t *vlp2m_pte = 0;
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +
> +    if ( !d->arch.dirty.mode )
> +        return 0;
> +
> +    domain_get_ram_range(d, &gma_start, &gma_end);

Couldn't this just be d->arch.foo_start/end?

> +
> +    /* Ensure that addr is inside guest's RAM */
> +    if ( addr < gma_start || addr > gma_end )
> +        return 0;
> +
> +    vlp2m_pte = vlpt_get_3lvl_pte(addr);
> +    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
> +         vlp2m_pte->p2m.type == p2m_ram_logdirty )
> +    {
> +        lpae_t pte = *vlp2m_pte;
> +        pte.p2m.write = 1;
> +        write_pte(vlp2m_pte, pte);

Should you not be changing the type back to p2m_ram_rw?

> +        flush_tlb_local();

What about other CPUs?

> +
> +        /* only necessary to lock between get-dirty bitmap and mark dirty
> +         * bitmap. If get-dirty bitmap happens immediately before this
> +         * lock, the corresponding dirty-page would be marked at the next
> +         * round of get-dirty bitmap */
> +        spin_lock(&d->arch.dirty.lock);

At some point I think I suggested that you might be able to use an
atomic bitop rather than a lock, did that turn out to be impossible?
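
For reference, Xen's set_bit()/clear_bit() are already atomic on ARM, so
the fault path could in principle mark the bit without taking the lock,
e.g.:

    /* sketch: atomic mark, no spinlock needed on this path */
    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);

Whether that is sufficient for the CLEAN case in log_dirty_op() (where
the bitmap is cleared after the copy) would need checking.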

> +        bitmap_mark_dirty(d, addr);
> +        spin_unlock(&d->arch.dirty.lock);
> +    }
> +
> +    return 1;
> +}
> +
> +/* Restore the xen page table for vlpt mapping for domain */
> +void log_dirty_restore(struct domain *d)
> +{
> +    int i;
> +
> +    /* Nothing to do as log dirty mode is off */
> +    if ( !(d->arch.dirty.mode) )
> +        return;
> +
> +    dsb(sy);
> +
> +    for ( i = d->arch.dirty.second_lvl_start; i < d->arch.dirty.second_lvl_end;
> +          ++i )
> +    {
> +        int k = i % LPAE_ENTRIES;
> +        int l = i / LPAE_ENTRIES;
> +
> +        if ( xen_second[i].bits != d->arch.dirty.second_lvl[l][k].bits )
> +        {
> +            write_pte(&xen_second[i], d->arch.dirty.second_lvl[l][k]);
> +            flush_xen_data_tlb_range_va(i << SECOND_SHIFT, 1 << SECOND_SHIFT);
> +        }
> +    }
> +
> +    dsb(sy);
> +    isb();

You have barriers here, and in write_pte and in
flush_xen_data_tlb_range, which is somewhat overkill. I think you can
just have a single set before and after.

> +/* Initialize log dirty fields */
> +int log_dirty_init(struct domain *d)
> +{
> +    d->arch.dirty.count = 0;

I have a feeling that all of these are zeroed in struct domain when it
is allocated.

> +    d->arch.dirty.mode = 0;
> +    spin_lock_init(&d->arch.dirty.lock);
> +
> +    d->arch.dirty.second_lvl_start = 0;
> +    d->arch.dirty.second_lvl_end = 0;
> +    d->arch.dirty.second_lvl[0] = NULL;
> +    d->arch.dirty.second_lvl[1] = NULL;
> +
> +    memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));
> +    d->arch.dirty.bitmap_pages = 0;
> +
> +    return 0;
> +}
> +
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index 603c097..0808cc9 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -6,6 +6,8 @@
>  #include <xen/bitops.h>
>  #include <asm/flushtlb.h>
>  #include <asm/gic.h>
> +#include <xen/guest_access.h>
> +#include <xen/pfn.h>
>  #include <asm/event.h>
>  #include <asm/hardirq.h>
>  #include <asm/page.h>
> @@ -208,6 +210,7 @@ static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
>          break;
>  
>      case p2m_ram_ro:
> +    case p2m_ram_logdirty:
>          e.p2m.xn = 0;
>          e.p2m.write = 0;
>          break;
> @@ -261,6 +264,10 @@ static int p2m_create_table(struct domain *d,
>  
>      pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);
>  
> +    /* mark the write bit (page table's case, ro bit) as 0
> +     * so, it is writable in case of vlpt access */

"writeable"

> +    pte.pt.ro = 0;
> +
>      write_pte(entry, pte);
>  
>      return 0;
> @@ -696,6 +703,203 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
>      return p >> PAGE_SHIFT;
>  }
>  
> +/* Change types across all p2m entries in a domain */
> +void p2m_change_entry_type_global(struct domain *d, enum mg nt)

Please try and use apply_p2m_changes for this.

mg_* are host (i.e. Xen) mapping types. You should be using p2m_type_t
here.

This should all just fall out nicely if you use apply_p2m_changes I
think.

> +                if ( nt == mg_ro )
> +                {
> +                    if ( pte.p2m.write == 1 )
> +                    {
> +                        pte.p2m.write = 0;
> +                        pte.p2m.type = p2m_ram_logdirty;

If the new type is mg_ro then shouldn't this be p2m_ram_ro? There's no
need to do logdirty tracking on ro pages.

> +                    }
> +                    else
> +                    {
> +                        /* reuse avail bit as an indicator of 'actual' 

The avail bit?

> +                         * read-only */
> +                        pte.p2m.type = p2m_ram_rw;

The new type is mg_ro -- so surely here we *do* need logdirty, iff
logdirty is currently enabled.

> +                    }
> +                }
> +                else if ( nt == mg_rw )
> +                {
> +                    if ( pte.p2m.write == 0 && 
> +                         pte.p2m.type == p2m_ram_logdirty )
> +                    {
> +                        pte.p2m.write = p2m_ram_rw;

This also seems wrong to me.

Surely the logic here ought to be something like:

    switch (nt)
    {
    case p2m_ram_ro:
        pte.p2m.write = 0;
        pte.p2m.type = nt;
        break;
    case p2m_ram_rw:
        if ( logdirty_is_enable(d) )
        {
            pte.p2m.write = 0;
            pte.p2m.type = p2m_ram_logdirty;
        }
        else
        {
            pte.p2m.write = 1;
            pte.p2m.type = p2m_ram_rw;
        }
        break;
    /* Other types, perhaps via default... */
    }
?

> +                    }
> +                }
> +                write_pte(&third[i3], pte);
> +            }
> +            unmap_domain_page(third);
> +
> +            third = NULL;
> +            third_index = 0;
> +        }
> +        unmap_domain_page(second);
> +
> +        second = NULL;
> +        second_index = 0;
> +        third_index = 0;
> +    }
> +
> +out:
> +    flush_tlb_all_local();
> +    if ( third ) unmap_domain_page(third);
> +    if ( second ) unmap_domain_page(second);
> +    if ( first ) unmap_domain_page(first);
> +
> +    spin_unlock(&p2m->lock);
> +}
> +
> +/* Read a domain's log-dirty bitmap and stats. If the operation is a CLEAN, 
> + * clear the bitmap and stats. */
> +int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
> +{
> +    int peek = 1;
> +    int i;
> +    int bitmap_size;
> +    paddr_t gma_start, gma_end;
> +
> +    /* this hypercall is called from domain 0, and we don't know which guest's
> +     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
> +    log_dirty_restore(d);

You don't seem to clear it again at the end?

> +    domain_get_ram_range(d, &gma_start, &gma_end);
> +    bitmap_size = (gma_end - gma_start) / 8;
> +
> +    if ( guest_handle_is_null(sc->dirty_bitmap) )
> +    {
> +        peek = 0;
> +    }
> +    else
> +    {
> +        spin_lock(&d->arch.dirty.lock);
> +
> +        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +        {
> +            int j = 0;
> +            uint8_t *bitmap;
> +
> +            copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
> +                                 d->arch.dirty.bitmap[i],
> +                                 bitmap_size < PAGE_SIZE ? bitmap_size :
> +                                                           PAGE_SIZE);
> +            bitmap_size -= PAGE_SIZE;
> +
> +            /* set p2m page table read-only */
> +            bitmap = d->arch.dirty.bitmap[i];
> +            while ((j = find_next_bit((const long unsigned int *)bitmap,
> +                                      PAGE_SIZE*8, j)) < PAGE_SIZE*8)

What are these magic 8s?

> +            {
> +                lpae_t *vlpt;
> +                paddr_t addr = gma_start + (i << (2*PAGE_SHIFT+3)) +

Magic 2* and +3; a helper might be nice.

Isn't j effectively a pfn here (or a pfn offset from RAM base)? In
which case all the normal helpers for manipulating pfns and addresses
are available to you.
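
Something along these lines, perhaps (helper name made up; each bitmap
page covers PAGE_SIZE * 8 pfns):

    /* hypothetical helper: guest address covered by bit 'bit' of
     * dirty bitmap page 'page' */
    static inline paddr_t dirty_bit_to_addr(paddr_t ram_base,
                                            unsigned int page,
                                            unsigned int bit)
    {
        unsigned long pfn_offset = (unsigned long)page * PAGE_SIZE * 8 + bit;

        return ram_base + pfn_to_paddr(pfn_offset);
    }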

> +                    (j << PAGE_SHIFT);
> +                vlpt = vlpt_get_3lvl_pte(addr);
> +                vlpt->p2m.write = 0;
> +                j++;
> +            }

No barrier here?

> +        }
> +
> +        if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
> +        {
> +            for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +            {
> +                clear_page(d->arch.dirty.bitmap[i]);
> +            }
> +        }
> +
> +        spin_unlock(&d->arch.dirty.lock);
> +        flush_tlb_local();
> +    }
> +
> +    sc->stats.dirty_count = d->arch.dirty.count;
> +
> +    return 0;
> +}
> +
> @@ -1577,6 +1579,13 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
>      if ( rc == -EFAULT )
>          goto bad_data_abort;
>  
> +    /* domU page fault handling for guest live migration. Note that 
> +     * dabt.valid can be 0 here */
> +    if ( page_fault && handle_page_fault(current->domain, info.gpa) )

handle_page_fault only deals with logdirty faults -- please name it
appropriately.

I'm not sure "page_fault" is a very descriptive name -- it's more
specific than that I think?

> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index aabeb51..ac82643 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -162,6 +162,25 @@ struct arch_domain
>      } vuart;
>  
>      unsigned int evtchn_irq;
> +
> +    /* dirty page tracing */
> +    struct {
> +        spinlock_t lock;
> +        volatile int mode;               /* 1 if dirty pages tracing enabled */
> +        volatile unsigned int count;     /* dirty pages counter */
> +
> +        /* vlpt context switch */
> +        volatile int second_lvl_start; /* start idx of virt linear space 2nd */
> +        volatile int second_lvl_end;   /* end idx of virt linear space 2nd */
> +        lpae_t *second_lvl[2];         /* copy of guest P2M 1st-lvl content */
> +
> +        /* bitmap to track dirty pages */
> +#define MAX_DIRTY_BITMAP_PAGES 64
> +        /* Because each bit represents a dirty page, the total supported guest 
> +         * memory is (64 entries x 4KB/entry x 8bits/byte x 4KB) = 8GB. */

8GB isn't very much, especially not for a 64-bit guest.

I've previously discussed with Jaeyong the possibility of using some
state in each pte to track dirtiness and walking the vlpt to copy them
out into the toolstack's bitmap. I think pte.p2m.type could be used --
any page which is p2m_ram_rw has been dirtied (otherwise it would still
be p2m_ram_logdirty). Only thing I'm not sure about is other types --
e.g. ballooning a page out in the middle of a migrate -- I suppose there
is some explicit "dirtying" of such a page somewhere in decrease
reservation.

BTW x86 seems to use a 4-level bitmap trie here.

> +        uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES]; /* dirty bitmap */
> +        int bitmap_pages;                        /* # of dirty bitmap pages */
> +    } dirty;
>  }  __cacheline_aligned;
>  
>  struct arch_vcpu

Ian.
Julien Grall May 16, 2014, 10:59 a.m. UTC | #4
Hi Wei,

On 05/08/2014 10:18 PM, Wei Huang wrote:
> This patch implements log_dirty for ARM guest VMs. This feature
> is provided via two basic blocks: dirty_bit_map and VLPT
> (virtual-linear page table)
> 
> 1. VLPT provides fast accessing of 3rd PTE of guest P2M.
> When creating a mapping for VLPT, the page table mapping
> becomes the following:
>    xen's 1st PTE --> xen's 2nd PTE --> guest p2m's 2nd PTE -->
>    guest p2m's 3rd PTE
> 
> With VLPT, xen can immediately locate the 3rd PTE of guest P2M
> and modify PTE attirbute during dirty page tracking. The following
> link shows the performance comparison for handling a dirty-page
> between VLPT and typical page table walking.
> http://lists.xen.org/archives/html/xen-devel/2013-08/msg01503.html
> 
> For more info about VLPT, please see
> http://www.technovelty.org/linux/virtual-linear-page-table.html.
> 
> 2. Dirty bitmap
> The dirty bitmap is used to mark the pages which are dirty during
> migration. The info is used by Xen tools, via DOMCTL_SHADOW_OP_*,
> to figure out which guest pages need to be resent.

I think you forgot a case in the log dirty support. You don't handle
dirty pages when Xen itself writes data into guest memory (with
raw_copy_to_guest).

I think we need to handle this case during live migration, otherwise data
may be corrupted.

Regards,

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 40f1c3a..2eb5ce0 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -208,6 +208,9 @@  static void ctxt_switch_to(struct vcpu *n)
 
     isb();
 
+    /* Dirty-page tracing */
+    log_dirty_restore(n->domain);
+
     /* This is could trigger an hardware interrupt from the virtual
      * timer. The interrupt needs to be injected into the guest. */
     WRITE_SYSREG32(n->arch.cntkctl, CNTKCTL_EL1);
@@ -504,6 +507,9 @@  int arch_domain_create(struct domain *d, unsigned int domcr_flags)
     /* Default the virtual ID to match the physical */
     d->arch.vpidr = boot_cpu_data.midr.bits;
 
+    /* Init log dirty support */
+    log_dirty_init(d);
+
     clear_page(d->shared_info);
     share_xen_page_with_guest(
         virt_to_page(d->shared_info), d, XENSHARE_writable);
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 45974e7..f1c34da 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -10,30 +10,53 @@ 
 #include <xen/errno.h>
 #include <xen/sched.h>
 #include <xen/hypercall.h>
+#include <xen/guest_access.h>
 #include <public/domctl.h>
 
 long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long ret = 0;
+    bool_t copyback = 0;
+
     switch ( domctl->cmd )
     {
+    case XEN_DOMCTL_shadow_op:
+    {
+        ret = -EINVAL;
+        copyback = 1;
+
+        if ( (d == current->domain) )   /* no domain_pause() */
+            break;
+
+        domain_pause(d);
+        ret = dirty_mode_op(d, &domctl->u.shadow_op);
+        domain_unpause(d);
+    }
+    break;
+
     case XEN_DOMCTL_cacheflush:
     {
         unsigned long s = domctl->u.cacheflush.start_pfn;
         unsigned long e = s + domctl->u.cacheflush.nr_pfns;
 
         if ( domctl->u.cacheflush.nr_pfns > (1U<<MAX_ORDER) )
-            return -EINVAL;
+            ret = -EINVAL;
 
         if ( e < s )
-            return -EINVAL;
+            ret = -EINVAL;
 
-        return p2m_cache_flush(d, s, e);
+        ret = p2m_cache_flush(d, s, e);
     }
 
     default:
-        return subarch_do_domctl(domctl, d, u_domctl);
+        ret = subarch_do_domctl(domctl, d, u_domctl);
     }
+
+    if ( copyback && __copy_to_guest(u_domctl, domctl, 1) )
+        ret = -EFAULT;
+
+    return ret;
 }
 
 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index eac228c..81c0691 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -865,7 +865,6 @@  void destroy_xen_mappings(unsigned long v, unsigned long e)
     create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
 }
 
-enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
 static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
 {
     lpae_t pte;
@@ -945,11 +944,6 @@  int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
     return 0;
 }
 
-unsigned long domain_get_maximum_gpfn(struct domain *d)
-{
-    return -ENOSYS;
-}
-
 void share_xen_page_with_guest(struct page_info *page,
                           struct domain *d, int readonly)
 {
@@ -1235,6 +1229,298 @@  int is_iomem_page(unsigned long mfn)
         return 1;
     return 0;
 }
+
+
+/* Return start and end addr of guest RAM. Note this function only reports 
+ * regular RAM. It does not cover other areas such as foreign mapped
+ * pages or MMIO space. */
+void domain_get_ram_range(struct domain *d, paddr_t *start, paddr_t *end)
+{
+    if ( start )
+        *start = GUEST_RAM_BASE;
+
+    if ( end )
+        *end = GUEST_RAM_BASE + ((paddr_t) d->max_pages << PAGE_SHIFT);
+}
+
+/* Return the maximum GPFN of guest VM. It covers all guest memory types. */
+unsigned long domain_get_maximum_gpfn(struct domain *d)
+{
+    struct p2m_domain *p2m = &d->arch.p2m;
+
+    return p2m->max_mapped_gfn;
+}
+
+/************************************/
+/*    Dirty Page Tracking Support   */
+/************************************/
+/* Mark the bitmap for a corresponding page as dirty */
+static inline void bitmap_mark_dirty(struct domain *d, paddr_t addr)
+{
+    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;
+    int bit_index = PFN_DOWN(addr - ram_base);
+    int page_index = bit_index >> (PAGE_SHIFT + 3);
+    int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);
+
+    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);
+}
+
+/* Allocate dirty bitmap resource */
+static int bitmap_init(struct domain *d)
+{
+    paddr_t gma_start = 0;
+    paddr_t gma_end = 0;
+    int nr_bytes;
+    int nr_pages;
+    int i;
+
+    domain_get_ram_range(d, &gma_start, &gma_end);
+
+    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
+    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+
+    BUG_ON(nr_pages > MAX_DIRTY_BITMAP_PAGES);
+
+    for ( i = 0; i < nr_pages; ++i )
+    {
+        struct page_info *page;
+        page = alloc_domheap_page(NULL, 0);
+        if ( page == NULL )
+            goto cleanup_on_failure;
+
+        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));
+        clear_page(d->arch.dirty.bitmap[i]);
+    }
+
+    d->arch.dirty.bitmap_pages = nr_pages;
+    return 0;
+
+cleanup_on_failure:
+    nr_pages = i;
+    for ( i = 0; i < nr_pages; ++i )
+    {
+        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
+    }
+
+    return -ENOMEM;
+}
+
+/* Cleanup dirty bitmap resource */
+static void bitmap_cleanup(struct domain *d)
+{
+    int i;
+
+    for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+    {
+        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
+    }
+}
+
+/* Flush VLPT area */
+static void vlpt_flush(struct domain *d)
+{
+    int flush_size;
+    flush_size = (d->arch.dirty.second_lvl_end - 
+                  d->arch.dirty.second_lvl_start) << SECOND_SHIFT;
+
+    /* flushing the 3rd level mapping */
+    flush_xen_data_tlb_range_va(d->arch.dirty.second_lvl_start << SECOND_SHIFT,
+                                flush_size);
+}
+
+/* Set up a page table for VLPT mapping */
+static int vlpt_init(struct domain *d)
+{
+    uint64_t required, avail = VIRT_LIN_P2M_END - VIRT_LIN_P2M_START;
+    int xen_second_linear_base;
+    int gp2m_start_index, gp2m_end_index;
+    struct p2m_domain *p2m = &d->arch.p2m;
+    struct page_info *second_lvl_page;
+    paddr_t gma_start = 0;
+    paddr_t gma_end = 0;
+    lpae_t *first[2];
+    int i;
+
+    /* Check if reserved space is enough to cover guest physical address space.
+     * Note that each LPAE page table entry is 64-bit (8 bytes). So we only
+     * shift left with LPAE_SHIFT instead of PAGE_SHIFT. */
+    domain_get_ram_range(d, &gma_start, &gma_end);
+    required = (gma_end - gma_start) >> LPAE_SHIFT;
+    if ( required > avail )
+    {
+        dprintk(XENLOG_ERR, "Available VLPT is small for domU guest (avail: "
+                "%#llx, required: %#llx)\n", (unsigned long long)avail,
+                (unsigned long long)required);
+        return -ENOMEM;
+    }
+
+    /* Calculate the base of the 2nd linear table for VIRT_LIN_P2M_START */
+    xen_second_linear_base = second_linear_offset(VIRT_LIN_P2M_START);
+
+    gp2m_start_index = gma_start >> FIRST_SHIFT;
+    gp2m_end_index = (gma_end >> FIRST_SHIFT) + 1;
+
+    if ( xen_second_linear_base + gp2m_end_index >= LPAE_ENTRIES * 2 )
+    {
+        dprintk(XENLOG_ERR, "xen second page is small for VLPT for domU");
+        return -ENOMEM;
+    }
+
+    /* Two pages are allocated to backup the related PTE content of guest 
+     * VM's 1st-level table. */
+    second_lvl_page = alloc_domheap_pages(NULL, 1, 0);
+    if ( second_lvl_page == NULL )
+        return -ENOMEM;
+    d->arch.dirty.second_lvl[0] = map_domain_page_global(
+        page_to_mfn(second_lvl_page) );
+    d->arch.dirty.second_lvl[1] = map_domain_page_global(
+        page_to_mfn(second_lvl_page+1) );
+
+    /* 1st level P2M of guest VM is 2 consecutive pages */
+    first[0] = __map_domain_page(p2m->first_level);
+    first[1] = __map_domain_page(p2m->first_level+1);
+
+    for ( i = gp2m_start_index; i < gp2m_end_index; ++i )
+    {
+        int k = i % LPAE_ENTRIES;
+        int l = i / LPAE_ENTRIES;
+        int k2 = (xen_second_linear_base + i) % LPAE_ENTRIES;
+        int l2 = (xen_second_linear_base + i) / LPAE_ENTRIES;
+
+        /* Update 2nd-level PTE of Xen linear table. With this, Xen linear 
+         * page table layout becomes: 1st Xen linear ==> 2nd Xen linear ==> 
+         * 2nd guest P2M (i.e. 3rd Xen linear) ==> 3rd guest P2M (i.e. Xen 
+         * linear content) for VIRT_LIN_P2M_START address space. */
+        write_pte(&xen_second[xen_second_linear_base+i], first[l][k]);
+
+        /* We copy the mapping into domain's structure as a reference
+         * in case of the context switch (used in vlpt_restore function ) */
+        d->arch.dirty.second_lvl[l2][k2] = first[l][k];
+    }
+    unmap_domain_page(first[0]);
+    unmap_domain_page(first[1]);
+
+    /* storing the start and end index */
+    d->arch.dirty.second_lvl_start = xen_second_linear_base + gp2m_start_index;
+    d->arch.dirty.second_lvl_end = xen_second_linear_base + gp2m_end_index;
+
+    vlpt_flush(d);
+
+    return 0;
+}
+
+static void vlpt_cleanup(struct domain *d)
+{
+    /* First level p2m is 2 consecutive pages */
+    unmap_domain_page_global(d->arch.dirty.second_lvl[0]);
+    unmap_domain_page_global(d->arch.dirty.second_lvl[1]);
+}
+
+/* Returns zero if addr is not valid or dirty mode is not set */
+int handle_page_fault(struct domain *d, paddr_t addr)
+{
+    lpae_t *vlp2m_pte = 0;
+    paddr_t gma_start = 0;
+    paddr_t gma_end = 0;
+
+    if ( !d->arch.dirty.mode )
+        return 0;
+
+    domain_get_ram_range(d, &gma_start, &gma_end);
+
+    /* Ensure that addr is inside guest's RAM */
+    if ( addr < gma_start || addr > gma_end )
+        return 0;
+
+    vlp2m_pte = vlpt_get_3lvl_pte(addr);
+    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
+         vlp2m_pte->p2m.type == p2m_ram_logdirty )
+    {
+        lpae_t pte = *vlp2m_pte;
+        pte.p2m.write = 1;
+        write_pte(vlp2m_pte, pte);
+        flush_tlb_local();
+
+        /* only necessary to lock between get-dirty bitmap and mark dirty
+         * bitmap. If get-dirty bitmap happens immediately before this
+         * lock, the corresponding dirty-page would be marked at the next
+         * round of get-dirty bitmap */
+        spin_lock(&d->arch.dirty.lock);
+        bitmap_mark_dirty(d, addr);
+        spin_unlock(&d->arch.dirty.lock);
+    }
+
+    return 1;
+}
+
+/* Restore the xen page table for vlpt mapping for domain */
+void log_dirty_restore(struct domain *d)
+{
+    int i;
+
+    /* Nothing to do as log dirty mode is off */
+    if ( !(d->arch.dirty.mode) )
+        return;
+
+    dsb(sy);
+
+    for ( i = d->arch.dirty.second_lvl_start; i < d->arch.dirty.second_lvl_end;
+          ++i )
+    {
+        int k = i % LPAE_ENTRIES;
+        int l = i / LPAE_ENTRIES;
+
+        if ( xen_second[i].bits != d->arch.dirty.second_lvl[l][k].bits )
+        {
+            write_pte(&xen_second[i], d->arch.dirty.second_lvl[l][k]);
+            flush_xen_data_tlb_range_va(i << SECOND_SHIFT, 1 << SECOND_SHIFT);
+        }
+    }
+
+    dsb(sy);
+    isb();
+}
+
+/* Turn on log dirty */
+int log_dirty_on(struct domain *d)
+{
+    if ( vlpt_init(d) || bitmap_init(d) )
+        return -EINVAL;
+
+    return 0;
+}
+
+/* Turn off log dirty */
+void log_dirty_off(struct domain *d)
+{
+    bitmap_cleanup(d);
+    vlpt_cleanup(d);
+}
+
+/* Initialize log dirty fields */
+int log_dirty_init(struct domain *d)
+{
+    d->arch.dirty.count = 0;
+    d->arch.dirty.mode = 0;
+    spin_lock_init(&d->arch.dirty.lock);
+
+    d->arch.dirty.second_lvl_start = 0;
+    d->arch.dirty.second_lvl_end = 0;
+    d->arch.dirty.second_lvl[0] = NULL;
+    d->arch.dirty.second_lvl[1] = NULL;
+
+    memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));
+    d->arch.dirty.bitmap_pages = 0;
+
+    return 0;
+}
+
+/* Log dirty tear down */
+void log_dirty_teardown(struct domain *d)
+{
+    return;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 603c097..0808cc9 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -6,6 +6,8 @@ 
 #include <xen/bitops.h>
 #include <asm/flushtlb.h>
 #include <asm/gic.h>
+#include <xen/guest_access.h>
+#include <xen/pfn.h>
 #include <asm/event.h>
 #include <asm/hardirq.h>
 #include <asm/page.h>
@@ -208,6 +210,7 @@  static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
         break;
 
     case p2m_ram_ro:
+    case p2m_ram_logdirty:
         e.p2m.xn = 0;
         e.p2m.write = 0;
         break;
@@ -261,6 +264,10 @@  static int p2m_create_table(struct domain *d,
 
     pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);
 
+    /* mark the write bit (page table's case, ro bit) as 0
+     * so, it is writable in case of vlpt access */
+    pte.pt.ro = 0;
+
     write_pte(entry, pte);
 
     return 0;
@@ -696,6 +703,203 @@  unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
     return p >> PAGE_SHIFT;
 }
 
+/* Change types across all p2m entries in a domain */
+void p2m_change_entry_type_global(struct domain *d, enum mg nt)
+{
+    struct p2m_domain *p2m = &d->arch.p2m;
+    paddr_t ram_base;
+    int i1, i2, i3;
+    int first_index, second_index, third_index;
+    lpae_t *first = __map_domain_page(p2m->first_level);
+    lpae_t pte, *second = NULL, *third = NULL;
+
+    domain_get_ram_range(d, &ram_base, NULL);
+
+    first_index = first_table_offset((uint64_t)ram_base);
+    second_index = second_table_offset((uint64_t)ram_base);
+    third_index = third_table_offset((uint64_t)ram_base);
+
+    BUG_ON(!first);
+
+    spin_lock(&p2m->lock);
+
+    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
+    {
+        lpae_walk_t first_pte = first[i1].walk;
+        if ( !first_pte.valid || !first_pte.table )
+            goto out;
+
+        second = map_domain_page(first_pte.base);
+        BUG_ON(!second);
+
+        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
+        {
+            lpae_walk_t second_pte = second[i2].walk;
+
+            if ( !second_pte.valid || !second_pte.table )
+                goto out;
+
+            third = map_domain_page(second_pte.base);
+            BUG_ON(!third);
+
+            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
+            {
+                lpae_walk_t third_pte = third[i3].walk;
+
+                if ( !third_pte.valid )
+                    goto out;
+
+                pte = third[i3];
+
+                if ( nt == mg_ro )
+                {
+                    if ( pte.p2m.write == 1 )
+                    {
+                        pte.p2m.write = 0;
+                        pte.p2m.type = p2m_ram_logdirty;
+                    }
+                    else
+                    {
+                        /* reuse avail bit as an indicator of 'actual' 
+                         * read-only */
+                        pte.p2m.type = p2m_ram_rw;
+                    }
+                }
+                else if ( nt == mg_rw )
+                {
+                    if ( pte.p2m.write == 0 && 
+                         pte.p2m.type == p2m_ram_logdirty )
+                    {
+                        pte.p2m.write = p2m_ram_rw;
+                    }
+                }
+                write_pte(&third[i3], pte);
+            }
+            unmap_domain_page(third);
+
+            third = NULL;
+            third_index = 0;
+        }
+        unmap_domain_page(second);
+
+        second = NULL;
+        second_index = 0;
+        third_index = 0;
+    }
+
+out:
+    flush_tlb_all_local();
+    if ( third ) unmap_domain_page(third);
+    if ( second ) unmap_domain_page(second);
+    if ( first ) unmap_domain_page(first);
+
+    spin_unlock(&p2m->lock);
+}
+
+/* Read a domain's log-dirty bitmap and stats. If the operation is a CLEAN, 
+ * clear the bitmap and stats. */
+int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    int peek = 1;
+    int i;
+    int bitmap_size;
+    paddr_t gma_start, gma_end;
+
+    /* this hypercall is called from domain 0, and we don't know which guest's
+     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
+    log_dirty_restore(d);
+
+    domain_get_ram_range(d, &gma_start, &gma_end);
+    bitmap_size = (gma_end - gma_start) / 8;
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) )
+    {
+        peek = 0;
+    }
+    else
+    {
+        spin_lock(&d->arch.dirty.lock);
+
+        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+        {
+            int j = 0;
+            uint8_t *bitmap;
+
+            copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
+                                 d->arch.dirty.bitmap[i],
+                                 bitmap_size < PAGE_SIZE ? bitmap_size :
+                                                           PAGE_SIZE);
+            bitmap_size -= PAGE_SIZE;
+
+            /* set p2m page table read-only */
+            bitmap = d->arch.dirty.bitmap[i];
+            while ((j = find_next_bit((const long unsigned int *)bitmap,
+                                      PAGE_SIZE*8, j)) < PAGE_SIZE*8)
+            {
+                lpae_t *vlpt;
+                paddr_t addr = gma_start + (i << (2*PAGE_SHIFT+3)) +
+                    (j << PAGE_SHIFT);
+                vlpt = vlpt_get_3lvl_pte(addr);
+                vlpt->p2m.write = 0;
+                j++;
+            }
+        }
+
+        if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
+        {
+            for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+            {
+                clear_page(d->arch.dirty.bitmap[i]);
+            }
+        }
+
+        spin_unlock(&d->arch.dirty.lock);
+        flush_tlb_local();
+    }
+
+    sc->stats.dirty_count = d->arch.dirty.count;
+
+    return 0;
+}
+
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    long ret = 0;
+    switch (sc->op)
+    {
+        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+        case XEN_DOMCTL_SHADOW_OP_OFF:
+        {
+            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
+
+            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
+            p2m_change_entry_type_global(d, nt);
+
+            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
+            {
+                log_dirty_off(d);
+            }
+            else
+            {
+                if ( (ret = log_dirty_on(d)) )
+                    return ret;
+            }
+        }
+        break;
+
+        case XEN_DOMCTL_SHADOW_OP_CLEAN:
+        case XEN_DOMCTL_SHADOW_OP_PEEK:
+        {
+            ret = log_dirty_op(d, sc);
+        }
+        break;
+
+        default:
+            return -ENOSYS;
+    }
+    return ret;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index df4d375..b652565 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1556,6 +1556,8 @@  static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
     struct hsr_dabt dabt = hsr.dabt;
     int rc;
     mmio_info_t info;
+    int page_fault = ( dabt.write && ((dabt.dfsc & FSC_MASK) == 
+                                      (FSC_FLT_PERM|FSC_3RD_LEVEL)) );
 
     if ( !check_conditional_instr(regs, hsr) )
     {
@@ -1577,6 +1579,13 @@  static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
     if ( rc == -EFAULT )
         goto bad_data_abort;
 
+    /* domU page fault handling for guest live migration. Note that 
+     * dabt.valid can be 0 here */
+    if ( page_fault && handle_page_fault(current->domain, info.gpa) )
+    {
+        /* Do not modify PC as guest needs to repeat memory operation */
+        return;
+    }
     /* XXX: Decode the instruction if ISS is not valid */
     if ( !dabt.valid )
         goto bad_data_abort;
diff --git a/xen/include/asm-arm/config.h b/xen/include/asm-arm/config.h
index ef291ff..f18fae4 100644
--- a/xen/include/asm-arm/config.h
+++ b/xen/include/asm-arm/config.h
@@ -87,6 +87,7 @@ 
  *   0  -   8M   <COMMON>
  *
  *  32M - 128M   Frametable: 24 bytes per page for 16GB of RAM
+ * 128M - 256M   Virtual-linear mapping to P2M table
  * 256M -   1G   VMAP: ioremap and early_ioremap use this virtual address
  *                    space
  *
@@ -124,13 +125,15 @@ 
 #define CONFIG_SEPARATE_XENHEAP 1
 
 #define FRAMETABLE_VIRT_START  _AT(vaddr_t,0x02000000)
-#define VMAP_VIRT_START  _AT(vaddr_t,0x10000000)
+#define VIRT_LIN_P2M_START     _AT(vaddr_t,0x08000000)
+#define VMAP_VIRT_START        _AT(vaddr_t,0x10000000)
+#define VIRT_LIN_P2M_END       VMAP_VIRT_START
 #define XENHEAP_VIRT_START     _AT(vaddr_t,0x40000000)
 #define XENHEAP_VIRT_END       _AT(vaddr_t,0x7fffffff)
 #define DOMHEAP_VIRT_START     _AT(vaddr_t,0x80000000)
 #define DOMHEAP_VIRT_END       _AT(vaddr_t,0xffffffff)
 
-#define VMAP_VIRT_END    XENHEAP_VIRT_START
+#define VMAP_VIRT_END          XENHEAP_VIRT_START
 
 #define DOMHEAP_ENTRIES        1024  /* 1024 2MB mapping slots */
 
@@ -157,6 +160,11 @@ 
 
 #define HYPERVISOR_VIRT_END    DIRECTMAP_VIRT_END
 
+/* Definition for VIRT_LIN_P2M_START and VIRT_LIN_P2M_END (64-bit)
+ * TODO: Needs evaluation. */
+#define VIRT_LIN_P2M_START     _AT(vaddr_t, 0x08000000)
+#define VIRT_LIN_P2M_END       VMAP_VIRT_START
+
 #endif
 
 /* Fixmap slots */
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index aabeb51..ac82643 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -162,6 +162,25 @@  struct arch_domain
     } vuart;
 
     unsigned int evtchn_irq;
+
+    /* dirty page tracing */
+    struct {
+        spinlock_t lock;
+        volatile int mode;               /* 1 if dirty pages tracing enabled */
+        volatile unsigned int count;     /* dirty pages counter */
+
+        /* vlpt context switch */
+        volatile int second_lvl_start; /* start idx of virt linear space 2nd */
+        volatile int second_lvl_end;   /* end idx of virt linear space 2nd */
+        lpae_t *second_lvl[2];         /* copy of guest P2M 1st-lvl content */
+
+        /* bitmap to track dirty pages */
+#define MAX_DIRTY_BITMAP_PAGES 64
+        /* Because each bit represents a dirty page, the total supported guest 
+         * memory is (64 entries x 4KB/entry x 8bits/byte x 4KB) = 8GB. */
+        uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES]; /* dirty bitmap */
+        int bitmap_pages;                        /* # of dirty bitmap pages */
+    } dirty;
 }  __cacheline_aligned;
 
 struct arch_vcpu
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index b8d4e7d..ab19025 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -4,6 +4,7 @@ 
 #include <xen/config.h>
 #include <xen/kernel.h>
 #include <asm/page.h>
+#include <asm/config.h>
 #include <public/xen.h>
 
 /* Align Xen to a 2 MiB boundary. */
@@ -320,6 +321,7 @@  int donate_page(
 #define domain_clamp_alloc_bitsize(d, b) (b)
 
 unsigned long domain_get_maximum_gpfn(struct domain *d);
+void domain_get_ram_range(struct domain *d, paddr_t *start, paddr_t *end);
 
 extern struct domain *dom_xen, *dom_io, *dom_cow;
 
@@ -341,6 +343,27 @@  static inline void put_page_and_type(struct page_info *page)
     put_page(page);
 }
 
+enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
+
+/************************************/
+/*    Log-dirty support functions   */
+/************************************/
+int log_dirty_on(struct domain *d);
+void log_dirty_off(struct domain *d);
+int log_dirty_init(struct domain *d);
+void log_dirty_teardown(struct domain *d);
+void log_dirty_restore(struct domain *d);
+int handle_page_fault(struct domain *d, paddr_t addr);
+/* access leaf PTE of a given guest address (GPA) */
+static inline lpae_t * vlpt_get_3lvl_pte(paddr_t addr)
+{
+    lpae_t *table = (lpae_t *)VIRT_LIN_P2M_START;
+
+    /* Since we slotted the guest's first p2m page table to xen's
+     * second page table, one shift is enough for calculating the
+     * index of guest p2m table entry */
+    return &table[addr >> PAGE_SHIFT];
+}
 #endif /*  __ARCH_ARM_MM__ */
 /*
  * Local variables:
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index bd71abe..0cecbe7 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -2,6 +2,7 @@ 
 #define _XEN_P2M_H
 
 #include <xen/mm.h>
+#include <public/domctl.h>
 
 struct domain;
 
@@ -41,6 +42,7 @@  typedef enum {
     p2m_invalid = 0,    /* Nothing mapped here */
     p2m_ram_rw,         /* Normal read/write guest RAM */
     p2m_ram_ro,         /* Read-only; writes are silently dropped */
+    p2m_ram_logdirty,   /* Read-only: special mode for log dirty */
     p2m_mmio_direct,    /* Read/write mapping of genuine MMIO area */
     p2m_map_foreign,    /* Ram pages from foreign domain */
     p2m_grant_map_rw,   /* Read/write grant mapping */
@@ -49,7 +51,8 @@  typedef enum {
 } p2m_type_t;
 
 #define p2m_is_foreign(_t)  ((_t) == p2m_map_foreign)
-#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro)
+#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro ||  \
+                             (_t) == p2m_ram_logdirty)
 
 /* Initialise vmid allocator */
 void p2m_vmid_allocator_init(void);
@@ -178,6 +181,9 @@  static inline int get_page_and_type(struct page_info *page,
     return rc;
 }
 
+void p2m_change_entry_type_global(struct domain *d, enum mg nt);
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
+
 #endif /* _XEN_P2M_H */
 
 /*
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index 750864a..0bf3d67 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -407,6 +407,8 @@  union hsr {
 #define FSC_CPR        (0x3a) /* Coprocossor Abort */
 
 #define FSC_LL_MASK    (_AC(0x03,U)<<0)
+#define FSC_MASK       (0x3f) /* Fault status mask */
+#define FSC_3RD_LEVEL  (0x03) /* Third level fault */
 
 /* Time counter hypervisor control register */
 #define CNTHCTL_PA      (1u<<0)  /* Kernel/user access to physical counter */