diff mbox

[Xen-devel,RFC,v2,5/6] xen/arm: Implement hypercall for dirty page tracing

Message ID 1397595918-30419-6-git-send-email-w1.huang@samsung.com
State New
Headers show

Commit Message

Wei Huang April 15, 2014, 9:05 p.m. UTC
From: Jaeyong Yoo <jaeyong.yoo@samsung.com>

This patch adds a hypercall for shadow operations: enabling/disabling
log-dirty mode, and cleaning/peeking the dirty page bitmap.

The design consists of two parts: detecting dirty pages and saving them. For
detection, we set the guest p2m's leaf PTEs read-only, so that whenever
the guest tries to write to a page, a permission fault occurs and traps
into Xen. The faulting GPA must be saved for the toolstack,
which checks which pages are dirty. For this purpose, Xen temporarily records
the GPAs in a bitmap.

Signed-off-by: Jaeyong Yoo <jaeyong.yoo@samsung.com>
Signed-off-by: Wei Huang <w1.huang@samsung.com>
---
 xen/arch/arm/domain.c           |   14 +++
 xen/arch/arm/domctl.c           |   21 ++++
 xen/arch/arm/mm.c               |  105 ++++++++++++++++++-
 xen/arch/arm/p2m.c              |  211 +++++++++++++++++++++++++++++++++++++++
 xen/arch/arm/traps.c            |   11 ++
 xen/include/asm-arm/domain.h    |    7 ++
 xen/include/asm-arm/mm.h        |   10 ++
 xen/include/asm-arm/p2m.h       |    8 +-
 xen/include/asm-arm/processor.h |    2 +
 9 files changed, 387 insertions(+), 2 deletions(-)

Comments

Andrew Cooper April 15, 2014, 11:38 p.m. UTC | #1
On 15/04/2014 22:05, Wei Huang wrote:
> From: Jaeyong Yoo <jaeyong.yoo@samsung.com>
>
> This patch adds a hypercall for shadow operations: enabling/disabling
> log-dirty mode, and cleaning/peeking the dirty page bitmap.
>
> The design consists of two parts: detecting dirty pages and saving them. For
> detection, we set the guest p2m's leaf PTEs read-only, so that whenever
> the guest tries to write to a page, a permission fault occurs and traps
> into Xen. The faulting GPA must be saved for the toolstack,
> which checks which pages are dirty. For this purpose, Xen temporarily records
> the GPAs in a bitmap.
>
> Signed-off-by: Jaeyong Yoo <jaeyong.yoo@samsung.com>
> Signed-off-by: Wei Huang <w1.huang@samsung.com>
> ---
>  xen/arch/arm/domain.c           |   14 +++
>  xen/arch/arm/domctl.c           |   21 ++++
>  xen/arch/arm/mm.c               |  105 ++++++++++++++++++-
>  xen/arch/arm/p2m.c              |  211 +++++++++++++++++++++++++++++++++++++++
>  xen/arch/arm/traps.c            |   11 ++
>  xen/include/asm-arm/domain.h    |    7 ++
>  xen/include/asm-arm/mm.h        |   10 ++
>  xen/include/asm-arm/p2m.h       |    8 +-
>  xen/include/asm-arm/processor.h |    2 +
>  9 files changed, 387 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 3f04a77..d2531ed 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -207,6 +207,12 @@ static void ctxt_switch_to(struct vcpu *n)
>  
>      isb();
>  
> +    /* Dirty-page tracing
> +     * NB: How do we consider SMP case?
> +     */
> +    if ( n->domain->arch.dirty.mode )
> +        restore_vlpt(n->domain);
> +
>      /* This is could trigger an hardware interrupt from the virtual
>       * timer. The interrupt needs to be injected into the guest. */
>      virt_timer_restore(n);
> @@ -502,11 +508,19 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
>      /* Default the virtual ID to match the physical */
>      d->arch.vpidr = boot_cpu_data.midr.bits;
>  
> +    /* init for dirty-page tracing */
> +    d->arch.dirty.count = 0;
> +    d->arch.dirty.mode = 0;

Redundant initialization to 0.

> +    spin_lock_init(&d->arch.dirty.lock);
> +
>      d->arch.dirty.second_lvl_start = 0;
>      d->arch.dirty.second_lvl_end = 0;
>      d->arch.dirty.second_lvl[0] = NULL;
>      d->arch.dirty.second_lvl[1] = NULL;
>  
> +    memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));
> +    d->arch.dirty.bitmap_pages = 0;
> +
>      clear_page(d->shared_info);
>      share_xen_page_with_guest(
>          virt_to_page(d->shared_info), d, XENSHARE_writable);
> diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
> index 45974e7..e84651f 100644
> --- a/xen/arch/arm/domctl.c
> +++ b/xen/arch/arm/domctl.c
> @@ -11,12 +11,33 @@
>  #include <xen/sched.h>
>  #include <xen/hypercall.h>
>  #include <public/domctl.h>
> +#include <xen/hvm/save.h>
> +#include <xen/guest_access.h>
> +
>  

Spurious whitespace change

>  long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
>                      XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>  {
> +    long ret = 0;
> +
>      switch ( domctl->cmd )
>      {
> +    case XEN_DOMCTL_shadow_op:
> +    {
> +        if ( d == current->domain ) /* no domain_pause() */
> +            return -EINVAL;
> +                                          
> +        domain_pause(d);
> +        ret = dirty_mode_op(d, &domctl->u.shadow_op);
> +        domain_unpause(d);
> +
> +        if ( __copy_to_guest(u_domctl, domctl, 1) )
> +            ret = -EFAULT;
> +
> +        return ret;
> +    }
> +    break;
> +
>      case XEN_DOMCTL_cacheflush:
>      {
>          unsigned long s = domctl->u.cacheflush.start_pfn;
> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
> index a315752..ae852eb 100644
> --- a/xen/arch/arm/mm.c
> +++ b/xen/arch/arm/mm.c
> @@ -981,7 +981,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e)
>      create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
>  }
>  
> -enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
>  static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
>  {
>      lpae_t pte;
> @@ -1370,6 +1369,110 @@ void domain_get_gpfn_range(struct domain *d, paddr_t *start, paddr_t *end)
>          *end = GUEST_RAM_BASE + ((paddr_t) p2m->max_mapped_gfn);
>  }
>  
> +static inline void mark_dirty_bitmap(struct domain *d, paddr_t addr)
> +{
> +    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;
> +    int bit_index = PFN_DOWN(addr - ram_base);
> +    int page_index = bit_index >> (PAGE_SHIFT + 3);
> +    int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);

These must be unsigned quantities, and wider than an int on 64-bit.
The same applies throughout this patch.

> +
> +    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);
> +}
> +
> +/* Routine for dirty-page tracing
> + *
> + * On first write, it page faults, its entry is changed to read-write,
> + * and on retry the write succeeds. For locating p2m of the faulting entry,
> + * we use virtual-linear page table.
> + *
> + * Returns zero if addr is not valid or dirty mode is not set
> + */
> +int handle_page_fault(struct domain *d, paddr_t addr)
> +{
> +
> +    lpae_t *vlp2m_pte = 0;

Pointers should be initialised to NULL.

> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +
> +    if ( !d->arch.dirty.mode )
> +        return 0;
> +
> +    domain_get_gpfn_range(d, &gma_start, &gma_end);
> +    /* Ensure that addr is inside guest's RAM */
> +    if ( addr < gma_start || addr > gma_end )
> +        return 0;
> +
> +    vlp2m_pte = get_vlpt_3lvl_pte(addr);
> +    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
> +         vlp2m_pte->p2m.type == p2m_ram_logdirty )
> +    {
> +        lpae_t pte = *vlp2m_pte;
> +        pte.p2m.write = 1;
> +        write_pte(vlp2m_pte, pte);
> +        flush_tlb_local();
> +
> +        /* only necessary to lock between get-dirty bitmap and mark dirty
> +         * bitmap. If get-dirty bitmap happens immediately before this
> +         * lock, the corresponding dirty-page would be marked at the next
> +         * round of get-dirty bitmap */
> +        spin_lock(&d->arch.dirty.lock);
> +        mark_dirty_bitmap(d, addr);
> +        spin_unlock(&d->arch.dirty.lock);
> +    }
> +
> +    return 1;
> +}
> +
> +int prepare_bitmap(struct domain *d)
> +{
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +    int nr_bytes;
> +    int nr_pages;
> +    int i;
> +
> +    domain_get_gpfn_range(d, &gma_start, &gma_end);
> +
> +    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
> +    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
> +
> +    BUG_ON( nr_pages > MAX_DIRTY_BITMAP_PAGES );
> +
> +    for ( i = 0; i < nr_pages; ++i )
> +    {
> +        struct page_info *page;
> +
> +        page = alloc_domheap_page(NULL, 0);
> +        if ( page == NULL )
> +            goto cleanup_on_failure;
> +
> +        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));
> +        clear_page(d->arch.dirty.bitmap[i]);
> +    }
> +
> +    d->arch.dirty.bitmap_pages = nr_pages;
> +    return 0;
> +
> +cleanup_on_failure:
> +    nr_pages = i;
> +    for ( i = 0; i < nr_pages; ++i )
> +    {

Extraneous braces (here and elsewhere).

> +        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
> +    }
> +
> +    return -ENOMEM;
> +}
> +
> +void cleanup_bitmap(struct domain *d)
> +{
> +    int i;
> +
> +    for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +    {
> +        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
> +    }
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index 403fd89..d57a44a 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -6,6 +6,8 @@
>  #include <xen/bitops.h>
>  #include <asm/flushtlb.h>
>  #include <asm/gic.h>
> +#include <xen/guest_access.h>
> +#include <xen/pfn.h>
>  #include <asm/event.h>
>  #include <asm/hardirq.h>
>  #include <asm/page.h>
> @@ -208,6 +210,7 @@ static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
>          break;
>  
>      case p2m_ram_ro:
> +    case p2m_ram_logdirty:
>          e.p2m.xn = 0;
>          e.p2m.write = 0;
>          break;
> @@ -261,6 +264,10 @@ static int p2m_create_table(struct domain *d,
>  
>      pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);
>  
> +    /* mark the write bit (page table's case, ro bit) as 0. So it is writable 
> +     * in case of vlpt access */
> +    pte.pt.ro = 0;
> +
>      write_pte(entry, pte);
>  
>      return 0;
> @@ -697,6 +704,210 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
>      return p >> PAGE_SHIFT;
>  }
>  
> +/* Change types across all p2m entries in a domain */
> +void p2m_change_entry_type_global(struct domain *d, enum mg nt)
> +{
> +    struct p2m_domain *p2m = &d->arch.p2m;
> +    paddr_t ram_base;
> +    int i1, i2, i3;
> +    int first_index, second_index, third_index;
> +    lpae_t *first = __map_domain_page(p2m->first_level);
> +    lpae_t pte, *second = NULL, *third = NULL;
> +
> +    domain_get_gpfn_range(d, &ram_base, NULL);
> +
> +    first_index = first_table_offset((uint64_t)ram_base);

You should not need to cast a paddr_t to uint64_t at all.

> +    second_index = second_table_offset((uint64_t)ram_base);
> +    third_index = third_table_offset((uint64_t)ram_base);
> +
> +    BUG_ON( !first && "Can't map first level p2m." );

map_domain_page() doesn't fail.  It will BUG() itself if it can't succeed.

> +
> +    spin_lock(&p2m->lock);
> +
> +    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
> +    {
> +        lpae_walk_t first_pte = first[i1].walk;
> +
> +        if ( !first_pte.valid || !first_pte.table )
> +            goto out;
> +
> +        second = map_domain_page(first_pte.base);
> +        BUG_ON( !second && "Can't map second level p2m.");
> +
> +        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
> +        {
> +            lpae_walk_t second_pte = second[i2].walk;
> +
> +            if ( !second_pte.valid || !second_pte.table )
> +                goto out;
> +
> +            third = map_domain_page(second_pte.base);
> +            BUG_ON( !third && "Can't map third level p2m.");
> +
> +            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
> +            {
> +
> +                lpae_walk_t third_pte = third[i3].walk;
> +                if ( !third_pte.valid )
> +                    goto out;
> +
> +                pte = third[i3];
> +                if ( nt == mg_ro )
> +                {
> +                    if ( pte.p2m.write == 1 )
> +                    {
> +                        pte.p2m.write = 0;
> +                        pte.p2m.type = p2m_ram_logdirty;
> +                    }
> +                    else
> +                    {
> +                        /* reuse avail bit as an indicator of 'actual'
> +                         * read-only */
> +                        pte.p2m.type = p2m_ram_rw;
> +                    }
> +                }
> +                else if ( nt == mg_rw )
> +                {
> +                    if ( pte.p2m.write == 0 &&
> +                         pte.p2m.type == p2m_ram_logdirty )
> +                    {
> +                        pte.p2m.write = p2m_ram_rw;
> +                    }
> +                }
> +                write_pte(&third[i3], pte);
> +            }
> +            unmap_domain_page(third);
> +
> +            third = NULL;
> +            third_index = 0;
> +        }
> +        unmap_domain_page(second);
> +
> +        second = NULL;
> +        second_index = 0;
> +        third_index = 0;
> +    }
> +
> +out:
> +    flush_tlb_all_local();
> +    if ( third ) unmap_domain_page(third);

Newlines please: put each unmap_domain_page() call on its own line, below its if ().

> +    if ( second ) unmap_domain_page(second);
> +    if ( first ) unmap_domain_page(first);
> +
> +    spin_unlock(&p2m->lock);
> +}
> +
> +/* Read a domain's log-dirty bitmap and stats.
> + * If the operation is a CLEAN, clear the bitmap and stats. */
> +int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
> +{
> +    int peek = 1;
> +    int i;
> +    int bitmap_size;
> +    paddr_t gma_start, gma_end;
> +
> +    /* this hypercall is called from domain 0, and we don't know which guest's
> +     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
> +    restore_vlpt(d);
> +
> +    domain_get_gpfn_range(d, &gma_start, &gma_end);
> +    bitmap_size = (gma_end - gma_start) / 8;
> +
> +    if ( guest_handle_is_null(sc->dirty_bitmap) )
> +    {
> +        peek = 0;
> +    }
> +    else
> +    {
> +        spin_lock(&d->arch.dirty.lock);
> +        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +        {
> +            int j = 0;
> +            uint8_t *bitmap;
> +            copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
> +                                 d->arch.dirty.bitmap[i],
> +                                 bitmap_size < PAGE_SIZE ? bitmap_size :
> +                                                           PAGE_SIZE);
> +            bitmap_size -= PAGE_SIZE;
> +
> +            /* set p2m page table read-only */
> +            bitmap = d->arch.dirty.bitmap[i];
> +            while ((j = find_next_bit((const long unsigned int *)bitmap,
> +                                      PAGE_SIZE*8, j)) < PAGE_SIZE*8)
> +            {
> +                lpae_t *vlpt;
> +                paddr_t addr = gma_start + (i << (2*PAGE_SHIFT+3)) +
> +                    (j << PAGE_SHIFT);
> +                vlpt = get_vlpt_3lvl_pte(addr);
> +                vlpt->p2m.write = 0;
> +                j++;
> +            }
> +        }
> +
> +        if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
> +        {
> +            for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +            {
> +                clear_page(d->arch.dirty.bitmap[i]);
> +            }
> +        }
> +
> +        spin_unlock(&d->arch.dirty.lock);
> +        flush_tlb_local();
> +    }
> +
> +    sc->stats.dirty_count = d->arch.dirty.count;
> +
> +    return 0;
> +}
> +
> +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
> +{
> +    long ret = 0;
> +    switch (sc->op)
> +    {
> +        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
> +        case XEN_DOMCTL_SHADOW_OP_OFF:
> +        {
> +            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
> +
> +            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
> +            p2m_change_entry_type_global(d, nt);
> +
> +            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
> +            {
> +                cleanup_vlpt(d);
> +                cleanup_bitmap(d);
> +            }
> +            else
> +            {
> +                if ( (ret = prepare_vlpt(d)) )
> +                   return ret;
> +
> +                if ( (ret = prepare_bitmap(d)) )
> +                {
> +                   /* in case of failure, we have to cleanup vlpt */
> +                   cleanup_vlpt(d);
> +                   return ret;
> +                }
> +            }
> +        }
> +        break;
> +
> +        case XEN_DOMCTL_SHADOW_OP_CLEAN:
> +        case XEN_DOMCTL_SHADOW_OP_PEEK:
> +        {
> +            ret = log_dirty_op(d, sc);
> +        }
> +        break;
> +
> +        default:
> +            return -ENOSYS;
> +    }
> +
> +    return ret;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index a7edc4e..cca34e9 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -1491,6 +1491,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
>      struct hsr_dabt dabt = hsr.dabt;
>      int rc;
>      mmio_info_t info;
> +    int page_fault = ( (dabt.dfsc & FSC_MASK) ==
> +                       (FSC_FLT_PERM | FSC_3D_LEVEL) && dabt.write );
>  
>      if ( !check_conditional_instr(regs, hsr) )
>      {
> @@ -1512,6 +1514,15 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
>      if ( rc == -EFAULT )
>          goto bad_data_abort;
>  
> +    /* domU page fault handling for guest live migration. dabt.valid can be 
> +     * 0 here.
> +     */
> +    if ( page_fault && handle_page_fault(current->domain, info.gpa) )
> +    {
> +        /* Do not modify pc after page fault to repeat memory operation */
> +        return;
> +    }
> +
>      /* XXX: Decode the instruction if ISS is not valid */
>      if ( !dabt.valid )
>          goto bad_data_abort;
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 5321bd6..99f9f51 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -163,9 +163,16 @@ struct arch_domain
>  
>      /* dirty-page tracing */
>      struct {
> +#define MAX_DIRTY_BITMAP_PAGES 64        /* support upto 8GB guest memory */
> +        spinlock_t lock;                 /* protect list: head, mvn_head */
> +        volatile int mode;               /* 1 if dirty pages tracing enabled */
> +        volatile unsigned int count;     /* dirty pages counter */
>          volatile int second_lvl_start;   /* for context switch */
>          volatile int second_lvl_end;
>          lpae_t *second_lvl[2];           /* copy of guest p2m's first */
> +        /* dirty bitmap */
> +        uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES];
> +        int bitmap_pages;                /* number of dirty bitmap pages */
>      } dirty;
>  
>      unsigned int evtchn_irq;
> diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
> index 5fd684f..5f9478b 100644
> --- a/xen/include/asm-arm/mm.h
> +++ b/xen/include/asm-arm/mm.h
> @@ -343,10 +343,18 @@ static inline void put_page_and_type(struct page_info *page)
>      put_page(page);
>  }
>  
> +enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
> +
> +/* routine for dirty-page tracing */
> +int handle_page_fault(struct domain *d, paddr_t addr);
> +
>  int prepare_vlpt(struct domain *d);
>  void cleanup_vlpt(struct domain *d);
>  void restore_vlpt(struct domain *d);
>  
> +int prepare_bitmap(struct domain *d);
> +void cleanup_bitmap(struct domain *d);

Too generically named.  Perhaps {prepare,cleanup}_logdirty_bitmap() ?

~Andrew

> +
>  /* calculate the xen's virtual address for accessing the leaf PTE of
>   * a given address (GPA) */
>  static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)
> @@ -359,6 +367,8 @@ static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)
>      return &table[addr >> PAGE_SHIFT];
>  }
>  
> +void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end);
> +
>  #endif /*  __ARCH_ARM_MM__ */
>  /*
>   * Local variables:
> diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
> index bd71abe..0cecbe7 100644
> --- a/xen/include/asm-arm/p2m.h
> +++ b/xen/include/asm-arm/p2m.h
> @@ -2,6 +2,7 @@
>  #define _XEN_P2M_H
>  
>  #include <xen/mm.h>
> +#include <public/domctl.h>
>  
>  struct domain;
>  
> @@ -41,6 +42,7 @@ typedef enum {
>      p2m_invalid = 0,    /* Nothing mapped here */
>      p2m_ram_rw,         /* Normal read/write guest RAM */
>      p2m_ram_ro,         /* Read-only; writes are silently dropped */
> +    p2m_ram_logdirty,   /* Read-only: special mode for log dirty */
>      p2m_mmio_direct,    /* Read/write mapping of genuine MMIO area */
>      p2m_map_foreign,    /* Ram pages from foreign domain */
>      p2m_grant_map_rw,   /* Read/write grant mapping */
> @@ -49,7 +51,8 @@ typedef enum {
>  } p2m_type_t;
>  
>  #define p2m_is_foreign(_t)  ((_t) == p2m_map_foreign)
> -#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro)
> +#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro ||  \
> +                             (_t) == p2m_ram_logdirty)
>  
>  /* Initialise vmid allocator */
>  void p2m_vmid_allocator_init(void);
> @@ -178,6 +181,9 @@ static inline int get_page_and_type(struct page_info *page,
>      return rc;
>  }
>  
> +void p2m_change_entry_type_global(struct domain *d, enum mg nt);
> +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
> +
>  #endif /* _XEN_P2M_H */
>  
>  /*
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index 06e638f..9dc49c3 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -399,6 +399,8 @@ union hsr {
>  #define FSC_CPR        (0x3a) /* Coprocossor Abort */
>  
>  #define FSC_LL_MASK    (_AC(0x03,U)<<0)
> +#define FSC_MASK       (0x3f) /* Fault status mask */
> +#define FSC_3D_LEVEL   (0x03) /* Third level fault*/
>  
>  /* Time counter hypervisor control register */
>  #define CNTHCTL_PA      (1u<<0)  /* Kernel/user access to physical counter */
diff mbox

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 3f04a77..d2531ed 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -207,6 +207,12 @@  static void ctxt_switch_to(struct vcpu *n)
 
     isb();
 
+    /* Dirty-page tracing
+     * NB: How do we consider SMP case?
+     */
+    if ( n->domain->arch.dirty.mode )
+        restore_vlpt(n->domain);
+
     /* This is could trigger an hardware interrupt from the virtual
      * timer. The interrupt needs to be injected into the guest. */
     virt_timer_restore(n);
@@ -502,11 +508,19 @@  int arch_domain_create(struct domain *d, unsigned int domcr_flags)
     /* Default the virtual ID to match the physical */
     d->arch.vpidr = boot_cpu_data.midr.bits;
 
+    /* init for dirty-page tracing */
+    d->arch.dirty.count = 0;
+    d->arch.dirty.mode = 0;
+    spin_lock_init(&d->arch.dirty.lock);
+
     d->arch.dirty.second_lvl_start = 0;
     d->arch.dirty.second_lvl_end = 0;
     d->arch.dirty.second_lvl[0] = NULL;
     d->arch.dirty.second_lvl[1] = NULL;
 
+    memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));
+    d->arch.dirty.bitmap_pages = 0;
+
     clear_page(d->shared_info);
     share_xen_page_with_guest(
         virt_to_page(d->shared_info), d, XENSHARE_writable);
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 45974e7..e84651f 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -11,12 +11,33 @@ 
 #include <xen/sched.h>
 #include <xen/hypercall.h>
 #include <public/domctl.h>
+#include <xen/hvm/save.h>
+#include <xen/guest_access.h>
+
 
 long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long ret = 0;
+
     switch ( domctl->cmd )
     {
+    case XEN_DOMCTL_shadow_op:
+    {
+        if ( d == current->domain ) /* no domain_pause() */
+            return -EINVAL;
+                                          
+        domain_pause(d);
+        ret = dirty_mode_op(d, &domctl->u.shadow_op);
+        domain_unpause(d);
+
+        if ( __copy_to_guest(u_domctl, domctl, 1) )
+            ret = -EFAULT;
+
+        return ret;
+    }
+    break;
+
     case XEN_DOMCTL_cacheflush:
     {
         unsigned long s = domctl->u.cacheflush.start_pfn;
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index a315752..ae852eb 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -981,7 +981,6 @@  void destroy_xen_mappings(unsigned long v, unsigned long e)
     create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
 }
 
-enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
 static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
 {
     lpae_t pte;
@@ -1370,6 +1369,110 @@  void domain_get_gpfn_range(struct domain *d, paddr_t *start, paddr_t *end)
         *end = GUEST_RAM_BASE + ((paddr_t) p2m->max_mapped_gfn);
 }
 
+static inline void mark_dirty_bitmap(struct domain *d, paddr_t addr)
+{
+    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;
+    int bit_index = PFN_DOWN(addr - ram_base);
+    int page_index = bit_index >> (PAGE_SHIFT + 3);
+    int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);
+
+    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);
+}
+
+/* Routine for dirty-page tracing
+ *
+ * On first write, it page faults, its entry is changed to read-write,
+ * and on retry the write succeeds. For locating p2m of the faulting entry,
+ * we use virtual-linear page table.
+ *
+ * Returns zero if addr is not valid or dirty mode is not set
+ */
+int handle_page_fault(struct domain *d, paddr_t addr)
+{
+
+    lpae_t *vlp2m_pte = 0;
+    paddr_t gma_start = 0;
+    paddr_t gma_end = 0;
+
+    if ( !d->arch.dirty.mode )
+        return 0;
+
+    domain_get_gpfn_range(d, &gma_start, &gma_end);
+    /* Ensure that addr is inside guest's RAM */
+    if ( addr < gma_start || addr > gma_end )
+        return 0;
+
+    vlp2m_pte = get_vlpt_3lvl_pte(addr);
+    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
+         vlp2m_pte->p2m.type == p2m_ram_logdirty )
+    {
+        lpae_t pte = *vlp2m_pte;
+        pte.p2m.write = 1;
+        write_pte(vlp2m_pte, pte);
+        flush_tlb_local();
+
+        /* only necessary to lock between get-dirty bitmap and mark dirty
+         * bitmap. If get-dirty bitmap happens immediately before this
+         * lock, the corresponding dirty-page would be marked at the next
+         * round of get-dirty bitmap */
+        spin_lock(&d->arch.dirty.lock);
+        mark_dirty_bitmap(d, addr);
+        spin_unlock(&d->arch.dirty.lock);
+    }
+
+    return 1;
+}
+
+int prepare_bitmap(struct domain *d)
+{
+    paddr_t gma_start = 0;
+    paddr_t gma_end = 0;
+    int nr_bytes;
+    int nr_pages;
+    int i;
+
+    domain_get_gpfn_range(d, &gma_start, &gma_end);
+
+    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
+    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+
+    BUG_ON( nr_pages > MAX_DIRTY_BITMAP_PAGES );
+
+    for ( i = 0; i < nr_pages; ++i )
+    {
+        struct page_info *page;
+
+        page = alloc_domheap_page(NULL, 0);
+        if ( page == NULL )
+            goto cleanup_on_failure;
+
+        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));
+        clear_page(d->arch.dirty.bitmap[i]);
+    }
+
+    d->arch.dirty.bitmap_pages = nr_pages;
+    return 0;
+
+cleanup_on_failure:
+    nr_pages = i;
+    for ( i = 0; i < nr_pages; ++i )
+    {
+        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
+    }
+
+    return -ENOMEM;
+}
+
+void cleanup_bitmap(struct domain *d)
+{
+    int i;
+
+    for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+    {
+        unmap_domain_page_global(d->arch.dirty.bitmap[i]);
+    }
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 403fd89..d57a44a 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -6,6 +6,8 @@ 
 #include <xen/bitops.h>
 #include <asm/flushtlb.h>
 #include <asm/gic.h>
+#include <xen/guest_access.h>
+#include <xen/pfn.h>
 #include <asm/event.h>
 #include <asm/hardirq.h>
 #include <asm/page.h>
@@ -208,6 +210,7 @@  static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
         break;
 
     case p2m_ram_ro:
+    case p2m_ram_logdirty:
         e.p2m.xn = 0;
         e.p2m.write = 0;
         break;
@@ -261,6 +264,10 @@  static int p2m_create_table(struct domain *d,
 
     pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);
 
+    /* mark the write bit (page table's case, ro bit) as 0. So it is writable 
+     * in case of vlpt access */
+    pte.pt.ro = 0;
+
     write_pte(entry, pte);
 
     return 0;
@@ -697,6 +704,210 @@  unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
     return p >> PAGE_SHIFT;
 }
 
+/*
+ * Change types across all p2m entries in a domain.
+ *
+ * Walks the guest p2m, starting at the table offsets of the RAM base
+ * reported by domain_get_gpfn_range(), and rewrites every valid third-level
+ * entry according to @nt:
+ *  - mg_ro: writable entries are write-protected and tagged
+ *           p2m_ram_logdirty; entries that are already read-only are left
+ *           untouched so that they keep their real type.
+ *  - mg_rw: entries tagged p2m_ram_logdirty become writable again and are
+ *           returned to p2m_ram_rw.
+ *  - any other value leaves the entries unchanged.
+ *
+ * The walk ends at the first invalid entry, or at the first first/second
+ * level entry that is not a table.  The p2m lock is held during the walk.
+ *
+ * NOTE(review): every writable leaf in the walked range is tagged
+ * logdirty and later restored as p2m_ram_rw - this presumably relies on the
+ * range holding guest RAM only; confirm.
+ */
+void p2m_change_entry_type_global(struct domain *d, enum mg nt)
+{
+    struct p2m_domain *p2m = &d->arch.p2m;
+    paddr_t ram_base;
+    int i1, i2, i3;
+    int first_index, second_index, third_index;
+    lpae_t *first = __map_domain_page(p2m->first_level);
+    lpae_t pte, *second = NULL, *third = NULL;
+
+    domain_get_gpfn_range(d, &ram_base, NULL);
+
+    first_index = first_table_offset((uint64_t)ram_base);
+    second_index = second_table_offset((uint64_t)ram_base);
+    third_index = third_table_offset((uint64_t)ram_base);
+
+    BUG_ON( !first && "Can't map first level p2m." );
+
+    spin_lock(&p2m->lock);
+
+    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
+    {
+        lpae_walk_t first_pte = first[i1].walk;
+
+        if ( !first_pte.valid || !first_pte.table )
+            goto out;
+
+        second = map_domain_page(first_pte.base);
+        BUG_ON( !second && "Can't map second level p2m.");
+
+        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
+        {
+            lpae_walk_t second_pte = second[i2].walk;
+
+            if ( !second_pte.valid || !second_pte.table )
+                goto out;
+
+            third = map_domain_page(second_pte.base);
+            BUG_ON( !third && "Can't map third level p2m.");
+
+            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
+            {
+                lpae_walk_t third_pte = third[i3].walk;
+
+                if ( !third_pte.valid )
+                    goto out;
+
+                pte = third[i3];
+                if ( nt == mg_ro )
+                {
+                    /* Only writable mappings need tracking; a mapping that
+                     * is already read-only keeps its type untouched. */
+                    if ( pte.p2m.write == 1 )
+                    {
+                        pte.p2m.write = 0;
+                        pte.p2m.type = p2m_ram_logdirty;
+                    }
+                }
+                else if ( nt == mg_rw )
+                {
+                    /* Undo the tagging above: restore both the write
+                     * permission and the original type. */
+                    if ( pte.p2m.type == p2m_ram_logdirty )
+                    {
+                        pte.p2m.write = 1;
+                        pte.p2m.type = p2m_ram_rw;
+                    }
+                }
+                write_pte(&third[i3], pte);
+            }
+            unmap_domain_page(third);
+
+            /* Only the very first table starts at a non-zero offset. */
+            third = NULL;
+            third_index = 0;
+        }
+        unmap_domain_page(second);
+
+        second = NULL;
+        second_index = 0;
+        third_index = 0;
+    }
+
+out:
+    flush_tlb_all_local();
+    /* Release whatever is still mapped when the walk ended early. */
+    if ( third ) unmap_domain_page(third);
+    if ( second ) unmap_domain_page(second);
+    if ( first ) unmap_domain_page(first);
+
+    spin_unlock(&p2m->lock);
+}
+
+/*
+ * Read a domain's log-dirty bitmap and dirty-page counter.
+ *
+ * When sc->dirty_bitmap is a non-null guest handle, the bitmap is copied to
+ * it, every page whose bit is set is write-protected again in the p2m (via
+ * the vlpt) and, if the operation is XEN_DOMCTL_SHADOW_OP_CLEAN, the bitmap
+ * is cleared.  With a null handle only the counter is reported.
+ *
+ * Returns 0 on success, or -EFAULT if the bitmap cannot be copied to the
+ * guest handle; in that case the bitmap is not cleared.
+ *
+ * NOTE(review): d->arch.dirty.count is reported but never reset here, even
+ * for CLEAN - confirm whether that is intended.
+ */
+int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    int i;
+    int rc = 0;
+    unsigned long bytes_left;
+    paddr_t gma_start, gma_end;
+
+    /* this hypercall is called from domain 0, and we don't know which guest's
+     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
+    restore_vlpt(d);
+
+    domain_get_gpfn_range(d, &gma_start, &gma_end);
+    /* One bit per guest page, rounded up to whole bytes (the same sizing
+     * that is used when the bitmap is allocated). */
+    bytes_left = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
+
+    if ( !guest_handle_is_null(sc->dirty_bitmap) )
+    {
+        spin_lock(&d->arch.dirty.lock);
+        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+        {
+            int j = 0;
+            uint8_t *bitmap = d->arch.dirty.bitmap[i];
+            /* Never copy past the end of the bitmap proper. */
+            unsigned long chunk = bytes_left < PAGE_SIZE ? bytes_left
+                                                         : PAGE_SIZE;
+
+            if ( chunk &&
+                 copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
+                                      bitmap, chunk) )
+            {
+                rc = -EFAULT;
+                break;
+            }
+            bytes_left -= chunk;
+
+            /* set p2m page table read-only */
+            while ( (j = find_next_bit((const long unsigned int *)bitmap,
+                                       PAGE_SIZE*8, j)) < PAGE_SIZE*8 )
+            {
+                /* Each bitmap page covers PAGE_SIZE*8 guest pages; widen
+                 * before shifting so the offset cannot overflow an int. */
+                paddr_t addr = gma_start +
+                               ((paddr_t)i << (2*PAGE_SHIFT+3)) +
+                               ((paddr_t)j << PAGE_SHIFT);
+                lpae_t *vlpt = get_vlpt_3lvl_pte(addr);
+
+                vlpt->p2m.write = 0;
+                j++;
+            }
+        }
+
+        /* Keep the dirty bits if the caller did not receive all of them. */
+        if ( rc == 0 && sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
+        {
+            for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+            {
+                clear_page(d->arch.dirty.bitmap[i]);
+            }
+        }
+
+        spin_unlock(&d->arch.dirty.lock);
+        flush_tlb_local();
+    }
+
+    sc->stats.dirty_count = d->arch.dirty.count;
+
+    return rc;
+}
+
+/*
+ * Dispatch a shadow (dirty page tracing) domctl for domain @d.
+ *
+ *  - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY: prepare the vlpt and the dirty
+ *    bitmap, then switch tracing on and write-protect the p2m.  Nothing is
+ *    changed if either preparation step fails.  Returns -EINVAL if tracing
+ *    is already enabled.
+ *  - XEN_DOMCTL_SHADOW_OP_OFF: switch tracing off, make the p2m writable
+ *    again and release the vlpt and the bitmap.
+ *  - XEN_DOMCTL_SHADOW_OP_CLEAN / XEN_DOMCTL_SHADOW_OP_PEEK: handled by
+ *    log_dirty_op().
+ *
+ * Returns 0 on success, the error of the failing step otherwise, and
+ * -ENOSYS for any other operation.
+ */
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    long ret = 0;
+
+    switch ( sc->op )
+    {
+        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+            /* Preparing again would overwrite the live bitmap mappings. */
+            if ( d->arch.dirty.mode )
+                return -EINVAL;
+
+            /* Acquire every resource before touching the p2m, so that a
+             * failure leaves the domain exactly as it was. */
+            if ( (ret = prepare_vlpt(d)) )
+                return ret;
+
+            if ( (ret = prepare_bitmap(d)) )
+            {
+                /* in case of failure, we have to cleanup vlpt */
+                cleanup_vlpt(d);
+                return ret;
+            }
+
+            d->arch.dirty.mode = 1;
+            p2m_change_entry_type_global(d, mg_ro);
+            break;
+
+        case XEN_DOMCTL_SHADOW_OP_OFF:
+            d->arch.dirty.mode = 0;
+            p2m_change_entry_type_global(d, mg_rw);
+            cleanup_vlpt(d);
+            cleanup_bitmap(d);
+            break;
+
+        case XEN_DOMCTL_SHADOW_OP_CLEAN:
+        case XEN_DOMCTL_SHADOW_OP_PEEK:
+            ret = log_dirty_op(d, sc);
+            break;
+
+        default:
+            return -ENOSYS;
+    }
+
+    return ret;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index a7edc4e..cca34e9 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1491,6 +1491,8 @@  static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
     struct hsr_dabt dabt = hsr.dabt;
     int rc;
     mmio_info_t info;
+    int page_fault = ( (dabt.dfsc & FSC_MASK) ==
+                       (FSC_FLT_PERM | FSC_3D_LEVEL) && dabt.write );
 
     if ( !check_conditional_instr(regs, hsr) )
     {
@@ -1512,6 +1514,15 @@  static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
     if ( rc == -EFAULT )
         goto bad_data_abort;
 
+    /* domU page fault handling for guest live migration. dabt.valid can be 
+     * 0 here.
+     */
+    if ( page_fault && handle_page_fault(current->domain, info.gpa) )
+    {
+        /* Do not modify pc after page fault to repeat memory operation */
+        return;
+    }
+
     /* XXX: Decode the instruction if ISS is not valid */
     if ( !dabt.valid )
         goto bad_data_abort;
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 5321bd6..99f9f51 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -163,9 +163,16 @@  struct arch_domain
 
     /* dirty-page tracing */
     struct {
+#define MAX_DIRTY_BITMAP_PAGES 64        /* support up to 8GB guest memory */
+        spinlock_t lock;                 /* protects the dirty bitmap */
+        volatile int mode;               /* 1 if dirty pages tracing enabled */
+        volatile unsigned int count;     /* dirty pages counter */
         volatile int second_lvl_start;   /* for context switch */
         volatile int second_lvl_end;
         lpae_t *second_lvl[2];           /* copy of guest p2m's first */
+        /* dirty bitmap */
+        uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES];
+        int bitmap_pages;                /* number of dirty bitmap pages */
     } dirty;
 
     unsigned int evtchn_irq;
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index 5fd684f..5f9478b 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -343,10 +343,18 @@  static inline void put_page_and_type(struct page_info *page)
     put_page(page);
 }
 
+enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
+
+/* routine for dirty-page tracing */
+int handle_page_fault(struct domain *d, paddr_t addr);
+
 int prepare_vlpt(struct domain *d);
 void cleanup_vlpt(struct domain *d);
 void restore_vlpt(struct domain *d);
 
+int prepare_bitmap(struct domain *d);
+void cleanup_bitmap(struct domain *d);
+
 /* calculate the xen's virtual address for accessing the leaf PTE of
  * a given address (GPA) */
 static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)
@@ -359,6 +367,8 @@  static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)
     return &table[addr >> PAGE_SHIFT];
 }
 
+void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end);
+
 #endif /*  __ARCH_ARM_MM__ */
 /*
  * Local variables:
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index bd71abe..0cecbe7 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -2,6 +2,7 @@ 
 #define _XEN_P2M_H
 
 #include <xen/mm.h>
+#include <public/domctl.h>
 
 struct domain;
 
@@ -41,6 +42,7 @@  typedef enum {
     p2m_invalid = 0,    /* Nothing mapped here */
     p2m_ram_rw,         /* Normal read/write guest RAM */
     p2m_ram_ro,         /* Read-only; writes are silently dropped */
+    p2m_ram_logdirty,   /* Read-only: special mode for log dirty */
     p2m_mmio_direct,    /* Read/write mapping of genuine MMIO area */
     p2m_map_foreign,    /* Ram pages from foreign domain */
     p2m_grant_map_rw,   /* Read/write grant mapping */
@@ -49,7 +51,8 @@  typedef enum {
 } p2m_type_t;
 
 #define p2m_is_foreign(_t)  ((_t) == p2m_map_foreign)
-#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro)
+#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro ||  \
+                             (_t) == p2m_ram_logdirty)
 
 /* Initialise vmid allocator */
 void p2m_vmid_allocator_init(void);
@@ -178,6 +181,9 @@  static inline int get_page_and_type(struct page_info *page,
     return rc;
 }
 
+void p2m_change_entry_type_global(struct domain *d, enum mg nt);
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
+
 #endif /* _XEN_P2M_H */
 
 /*
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index 06e638f..9dc49c3 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -399,6 +399,8 @@  union hsr {
 #define FSC_CPR        (0x3a) /* Coprocossor Abort */
 
 #define FSC_LL_MASK    (_AC(0x03,U)<<0)
+#define FSC_MASK       (0x3f) /* Fault status mask */
+#define FSC_3D_LEVEL   (0x03) /* Third level fault */
 
 /* Time counter hypervisor control register */
 #define CNTHCTL_PA      (1u<<0)  /* Kernel/user access to physical counter */