Message ID | 1397595918-30419-6-git-send-email-w1.huang@samsung.com
State | New
On 15/04/2014 22:05, Wei Huang wrote: > From: Jaeyong Yoo <jaeyong.yoo@samsung.com> > > This patch adds hypercall for shadow operations, including enable/disable > and clean/peek dirty page bitmap. > > The design consists of two parts: dirty page detecting and saving. For > detecting, we setup the guest p2m's leaf PTE read-only and whenever > the guest tries to write something, permission fault happens and traps > into xen. The permission-faulted GPA should be saved for the toolstack, > which checks which pages are dirty. For this purpose, it temporarily saves > the GPAs into bitmap. > > Signed-off-by: Jaeyong Yoo <jaeyong.yoo@samsung.com> > Signed-off-by: Wei Huang <w1.huang@samsung.com> > --- > xen/arch/arm/domain.c | 14 +++ > xen/arch/arm/domctl.c | 21 ++++ > xen/arch/arm/mm.c | 105 ++++++++++++++++++- > xen/arch/arm/p2m.c | 211 +++++++++++++++++++++++++++++++++++++++ > xen/arch/arm/traps.c | 11 ++ > xen/include/asm-arm/domain.h | 7 ++ > xen/include/asm-arm/mm.h | 10 ++ > xen/include/asm-arm/p2m.h | 8 +- > xen/include/asm-arm/processor.h | 2 + > 9 files changed, 387 insertions(+), 2 deletions(-) > > diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c > index 3f04a77..d2531ed 100644 > --- a/xen/arch/arm/domain.c > +++ b/xen/arch/arm/domain.c > @@ -207,6 +207,12 @@ static void ctxt_switch_to(struct vcpu *n) > > isb(); > > + /* Dirty-page tracing > + * NB: How do we consider SMP case? > + */ > + if ( n->domain->arch.dirty.mode ) > + restore_vlpt(n->domain); > + > /* This is could trigger an hardware interrupt from the virtual > * timer. The interrupt needs to be injected into the guest. */ > virt_timer_restore(n); > @@ -502,11 +508,19 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) > /* Default the virtual ID to match the physical */ > d->arch.vpidr = boot_cpu_data.midr.bits; > > + /* init for dirty-page tracing */ > + d->arch.dirty.count = 0; > + d->arch.dirty.mode = 0; Redundant initialization to 0. 
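A minimal sketch of how the trimmed hunk could read, assuming (as the comment implies) that struct domain is handed out zero-filled by its allocator, so count, mode, the bitmap array and bitmap_pages need no explicit clearing:

    /* init for dirty-page tracing: the rest of d->arch.dirty is already
     * zero/NULL thanks to the zeroed domain allocation, so only the lock
     * needs explicit initialisation. */
    spin_lock_init(&d->arch.dirty.lock);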
> + spin_lock_init(&d->arch.dirty.lock); > + > d->arch.dirty.second_lvl_start = 0; > d->arch.dirty.second_lvl_end = 0; > d->arch.dirty.second_lvl[0] = NULL; > d->arch.dirty.second_lvl[1] = NULL; > > + memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap)); > + d->arch.dirty.bitmap_pages = 0; > + > clear_page(d->shared_info); > share_xen_page_with_guest( > virt_to_page(d->shared_info), d, XENSHARE_writable); > diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c > index 45974e7..e84651f 100644 > --- a/xen/arch/arm/domctl.c > +++ b/xen/arch/arm/domctl.c > @@ -11,12 +11,33 @@ > #include <xen/sched.h> > #include <xen/hypercall.h> > #include <public/domctl.h> > +#include <xen/hvm/save.h> > +#include <xen/guest_access.h> > + > Spurious whitespace change > long arch_do_domctl(struct xen_domctl *domctl, struct domain *d, > XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) > { > + long ret = 0; > + > switch ( domctl->cmd ) > { > + case XEN_DOMCTL_shadow_op: > + { > + if ( d == current->domain ) /* no domain_pause() */ > + return -EINVAL; > + > + domain_pause(d); > + ret = dirty_mode_op(d, &domctl->u.shadow_op); > + domain_unpause(d); > + > + if ( __copy_to_guest(u_domctl, domctl, 1) ) > + ret = -EFAULT; > + > + return ret; > + } > + break; > + > case XEN_DOMCTL_cacheflush: > { > unsigned long s = domctl->u.cacheflush.start_pfn; > diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c > index a315752..ae852eb 100644 > --- a/xen/arch/arm/mm.c > +++ b/xen/arch/arm/mm.c > @@ -981,7 +981,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e) > create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0); > } > > -enum mg { mg_clear, mg_ro, mg_rw, mg_rx }; > static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg) > { > lpae_t pte; > @@ -1370,6 +1369,110 @@ void domain_get_gpfn_range(struct domain *d, paddr_t *start, paddr_t *end) > *end = GUEST_RAM_BASE + ((paddr_t) p2m->max_mapped_gfn); > } > > +static inline void mark_dirty_bitmap(struct domain *d, paddr_t addr) > +{ > + paddr_t ram_base = (paddr_t) GUEST_RAM_BASE; > + int bit_index = PFN_DOWN(addr - ram_base); > + int page_index = bit_index >> (PAGE_SHIFT + 3); > + int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1); These must be unsigned quantities, and larger than an int for 64 bit. Same applies throughout this patch. > + > + set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]); > +} > + > +/* Routine for dirty-page tracing > + * > + * On first write, it page faults, its entry is changed to read-write, > + * and on retry the write succeeds. For locating p2m of the faulting entry, > + * we use virtual-linear page table. > + * > + * Returns zero if addr is not valid or dirty mode is not set > + */ > +int handle_page_fault(struct domain *d, paddr_t addr) > +{ > + > + lpae_t *vlp2m_pte = 0; Pointers should be initialised to NULL. > + paddr_t gma_start = 0; > + paddr_t gma_end = 0; > + > + if ( !d->arch.dirty.mode ) > + return 0; > + > + domain_get_gpfn_range(d, &gma_start, &gma_end); > + /* Ensure that addr is inside guest's RAM */ > + if ( addr < gma_start || addr > gma_end ) > + return 0; > + > + vlp2m_pte = get_vlpt_3lvl_pte(addr); > + if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 && > + vlp2m_pte->p2m.type == p2m_ram_logdirty ) > + { > + lpae_t pte = *vlp2m_pte; > + pte.p2m.write = 1; > + write_pte(vlp2m_pte, pte); > + flush_tlb_local(); > + > + /* only necessary to lock between get-dirty bitmap and mark dirty > + * bitmap. 
If get-dirty bitmap happens immediately before this > + * lock, the corresponding dirty-page would be marked at the next > + * round of get-dirty bitmap */ > + spin_lock(&d->arch.dirty.lock); > + mark_dirty_bitmap(d, addr); > + spin_unlock(&d->arch.dirty.lock); > + } > + > + return 1; > +} > + > +int prepare_bitmap(struct domain *d) > +{ > + paddr_t gma_start = 0; > + paddr_t gma_end = 0; > + int nr_bytes; > + int nr_pages; > + int i; > + > + domain_get_gpfn_range(d, &gma_start, &gma_end); > + > + nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8; > + nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE; > + > + BUG_ON( nr_pages > MAX_DIRTY_BITMAP_PAGES ); > + > + for ( i = 0; i < nr_pages; ++i ) > + { > + struct page_info *page; > + > + page = alloc_domheap_page(NULL, 0); > + if ( page == NULL ) > + goto cleanup_on_failure; > + > + d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page)); > + clear_page(d->arch.dirty.bitmap[i]); > + } > + > + d->arch.dirty.bitmap_pages = nr_pages; > + return 0; > + > +cleanup_on_failure: > + nr_pages = i; > + for ( i = 0; i < nr_pages; ++i ) > + { Extraneous braces. (and elsewhere) > + unmap_domain_page_global(d->arch.dirty.bitmap[i]); > + } > + > + return -ENOMEM; > +} > + > +void cleanup_bitmap(struct domain *d) > +{ > + int i; > + > + for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i ) > + { > + unmap_domain_page_global(d->arch.dirty.bitmap[i]); > + } > +} > + > /* > * Local variables: > * mode: C > diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c > index 403fd89..d57a44a 100644 > --- a/xen/arch/arm/p2m.c > +++ b/xen/arch/arm/p2m.c > @@ -6,6 +6,8 @@ > #include <xen/bitops.h> > #include <asm/flushtlb.h> > #include <asm/gic.h> > +#include <xen/guest_access.h> > +#include <xen/pfn.h> > #include <asm/event.h> > #include <asm/hardirq.h> > #include <asm/page.h> > @@ -208,6 +210,7 @@ static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr, > break; > > case p2m_ram_ro: > + case p2m_ram_logdirty: > e.p2m.xn = 0; > e.p2m.write = 0; > break; > @@ -261,6 +264,10 @@ static int p2m_create_table(struct domain *d, > > pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid); > > + /* mark the write bit (page table's case, ro bit) as 0. So it is writable > + * in case of vlpt access */ > + pte.pt.ro = 0; > + > write_pte(entry, pte); > > return 0; > @@ -697,6 +704,210 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn) > return p >> PAGE_SHIFT; > } > > +/* Change types across all p2m entries in a domain */ > +void p2m_change_entry_type_global(struct domain *d, enum mg nt) > +{ > + struct p2m_domain *p2m = &d->arch.p2m; > + paddr_t ram_base; > + int i1, i2, i3; > + int first_index, second_index, third_index; > + lpae_t *first = __map_domain_page(p2m->first_level); > + lpae_t pte, *second = NULL, *third = NULL; > + > + domain_get_gpfn_range(d, &ram_base, NULL); > + > + first_index = first_table_offset((uint64_t)ram_base); You should not need to cast a paddr_t to uint64_t at all. > + second_index = second_table_offset((uint64_t)ram_base); > + third_index = third_table_offset((uint64_t)ram_base); > + > + BUG_ON( !first && "Can't map first level p2m." ); map_domain_page() doesn't fail. It will bug itself if it cant succeed. 
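Folding the last two points together, the prologue of p2m_change_entry_type_global() might look roughly like the sketch below: the paddr_t is fed straight to the *_table_offset() helpers, the map_domain_page() result is not checked, and the indices are unsigned long per the earlier 64-bit remark.

    void p2m_change_entry_type_global(struct domain *d, enum mg nt)
    {
        struct p2m_domain *p2m = &d->arch.p2m;
        paddr_t ram_base;
        unsigned long i1, i2, i3;                  /* unsigned, 64-bit safe */
        unsigned long first_index, second_index, third_index;
        lpae_t pte, *first, *second = NULL, *third = NULL;

        domain_get_gpfn_range(d, &ram_base, NULL);

        /* paddr_t is already a 64-bit type; no uint64_t cast required */
        first_index  = first_table_offset(ram_base);
        second_index = second_table_offset(ram_base);
        third_index  = third_table_offset(ram_base);

        /* __map_domain_page() cannot fail, so there is nothing to BUG_ON() */
        first = __map_domain_page(p2m->first_level);

        spin_lock(&p2m->lock);
        /* ... three-level walk as in the patch ... */
    }

The same sketch-level tidying applies elsewhere: the single-statement loops in prepare_bitmap()/cleanup_bitmap() can drop their braces, and vlp2m_pte in handle_page_fault() would start out as NULL rather than 0.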
> + > + spin_lock(&p2m->lock); > + > + for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 ) > + { > + lpae_walk_t first_pte = first[i1].walk; > + > + if ( !first_pte.valid || !first_pte.table ) > + goto out; > + > + second = map_domain_page(first_pte.base); > + BUG_ON( !second && "Can't map second level p2m."); > + > + for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 ) > + { > + lpae_walk_t second_pte = second[i2].walk; > + > + if ( !second_pte.valid || !second_pte.table ) > + goto out; > + > + third = map_domain_page(second_pte.base); > + BUG_ON( !third && "Can't map third level p2m."); > + > + for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 ) > + { > + > + lpae_walk_t third_pte = third[i3].walk; > + if ( !third_pte.valid ) > + goto out; > + > + pte = third[i3]; > + if ( nt == mg_ro ) > + { > + if ( pte.p2m.write == 1 ) > + { > + pte.p2m.write = 0; > + pte.p2m.type = p2m_ram_logdirty; > + } > + else > + { > + /* reuse avail bit as an indicator of 'actual' > + * read-only */ > + pte.p2m.type = p2m_ram_rw; > + } > + } > + else if ( nt == mg_rw ) > + { > + if ( pte.p2m.write == 0 && > + pte.p2m.type == p2m_ram_logdirty ) > + { > + pte.p2m.write = p2m_ram_rw; > + } > + } > + write_pte(&third[i3], pte); > + } > + unmap_domain_page(third); > + > + third = NULL; > + third_index = 0; > + } > + unmap_domain_page(second); > + > + second = NULL; > + second_index = 0; > + third_index = 0; > + } > + > +out: > + flush_tlb_all_local(); > + if ( third ) unmap_domain_page(third); Newlines please. > + if ( second ) unmap_domain_page(second); > + if ( first ) unmap_domain_page(first); > + > + spin_unlock(&p2m->lock); > +} > + > +/* Read a domain's log-dirty bitmap and stats. > + * If the operation is a CLEAN, clear the bitmap and stats. */ > +int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc) > +{ > + int peek = 1; > + int i; > + int bitmap_size; > + paddr_t gma_start, gma_end; > + > + /* this hypercall is called from domain 0, and we don't know which guest's > + * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */ > + restore_vlpt(d); > + > + domain_get_gpfn_range(d, &gma_start, &gma_end); > + bitmap_size = (gma_end - gma_start) / 8; > + > + if ( guest_handle_is_null(sc->dirty_bitmap) ) > + { > + peek = 0; > + } > + else > + { > + spin_lock(&d->arch.dirty.lock); > + for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i ) > + { > + int j = 0; > + uint8_t *bitmap; > + copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE, > + d->arch.dirty.bitmap[i], > + bitmap_size < PAGE_SIZE ? bitmap_size : > + PAGE_SIZE); > + bitmap_size -= PAGE_SIZE; > + > + /* set p2m page table read-only */ > + bitmap = d->arch.dirty.bitmap[i]; > + while ((j = find_next_bit((const long unsigned int *)bitmap, > + PAGE_SIZE*8, j)) < PAGE_SIZE*8) > + { > + lpae_t *vlpt; > + paddr_t addr = gma_start + (i << (2*PAGE_SHIFT+3)) + > + (j << PAGE_SHIFT); > + vlpt = get_vlpt_3lvl_pte(addr); > + vlpt->p2m.write = 0; > + j++; > + } > + } > + > + if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN ) > + { > + for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i ) > + { > + clear_page(d->arch.dirty.bitmap[i]); > + } > + } > + > + spin_unlock(&d->arch.dirty.lock); > + flush_tlb_local(); > + } > + > + sc->stats.dirty_count = d->arch.dirty.count; > + > + return 0; > +} > + > +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc) > +{ > + long ret = 0; > + switch (sc->op) > + { > + case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY: > + case XEN_DOMCTL_SHADOW_OP_OFF: > + { > + enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 
mg_rw : mg_ro; > + > + d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1; > + p2m_change_entry_type_global(d, nt); > + > + if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF ) > + { > + cleanup_vlpt(d); > + cleanup_bitmap(d); > + } > + else > + { > + if ( (ret = prepare_vlpt(d)) ) > + return ret; > + > + if ( (ret = prepare_bitmap(d)) ) > + { > + /* in case of failure, we have to cleanup vlpt */ > + cleanup_vlpt(d); > + return ret; > + } > + } > + } > + break; > + > + case XEN_DOMCTL_SHADOW_OP_CLEAN: > + case XEN_DOMCTL_SHADOW_OP_PEEK: > + { > + ret = log_dirty_op(d, sc); > + } > + break; > + > + default: > + return -ENOSYS; > + } > + > + return ret; > +} > + > /* > * Local variables: > * mode: C > diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c > index a7edc4e..cca34e9 100644 > --- a/xen/arch/arm/traps.c > +++ b/xen/arch/arm/traps.c > @@ -1491,6 +1491,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs, > struct hsr_dabt dabt = hsr.dabt; > int rc; > mmio_info_t info; > + int page_fault = ( (dabt.dfsc & FSC_MASK) == > + (FSC_FLT_PERM | FSC_3D_LEVEL) && dabt.write ); > > if ( !check_conditional_instr(regs, hsr) ) > { > @@ -1512,6 +1514,15 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs, > if ( rc == -EFAULT ) > goto bad_data_abort; > > + /* domU page fault handling for guest live migration. dabt.valid can be > + * 0 here. > + */ > + if ( page_fault && handle_page_fault(current->domain, info.gpa) ) > + { > + /* Do not modify pc after page fault to repeat memory operation */ > + return; > + } > + > /* XXX: Decode the instruction if ISS is not valid */ > if ( !dabt.valid ) > goto bad_data_abort; > diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h > index 5321bd6..99f9f51 100644 > --- a/xen/include/asm-arm/domain.h > +++ b/xen/include/asm-arm/domain.h > @@ -163,9 +163,16 @@ struct arch_domain > > /* dirty-page tracing */ > struct { > +#define MAX_DIRTY_BITMAP_PAGES 64 /* support upto 8GB guest memory */ > + spinlock_t lock; /* protect list: head, mvn_head */ > + volatile int mode; /* 1 if dirty pages tracing enabled */ > + volatile unsigned int count; /* dirty pages counter */ > volatile int second_lvl_start; /* for context switch */ > volatile int second_lvl_end; > lpae_t *second_lvl[2]; /* copy of guest p2m's first */ > + /* dirty bitmap */ > + uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES]; > + int bitmap_pages; /* number of dirty bitmap pages */ > } dirty; > > unsigned int evtchn_irq; > diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h > index 5fd684f..5f9478b 100644 > --- a/xen/include/asm-arm/mm.h > +++ b/xen/include/asm-arm/mm.h > @@ -343,10 +343,18 @@ static inline void put_page_and_type(struct page_info *page) > put_page(page); > } > > +enum mg { mg_clear, mg_ro, mg_rw, mg_rx }; > + > +/* routine for dirty-page tracing */ > +int handle_page_fault(struct domain *d, paddr_t addr); > + > int prepare_vlpt(struct domain *d); > void cleanup_vlpt(struct domain *d); > void restore_vlpt(struct domain *d); > > +int prepare_bitmap(struct domain *d); > +void cleanup_bitmap(struct domain *d); Too generically named. Perhaps {prepare,cleanup}_logdirty_bitmap() ? 
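As a sketch, that renaming would give, in asm-arm/mm.h (with the mm.c definitions and the p2m.c callers renamed to match):

    /* allocate/free the log-dirty bitmap for a domain */
    int  prepare_logdirty_bitmap(struct domain *d);
    void cleanup_logdirty_bitmap(struct domain *d);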
~Andrew > + > /* calculate the xen's virtual address for accessing the leaf PTE of > * a given address (GPA) */ > static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr) > @@ -359,6 +367,8 @@ static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr) > return &table[addr >> PAGE_SHIFT]; > } > > +void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end); > + > #endif /* __ARCH_ARM_MM__ */ > /* > * Local variables: > diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h > index bd71abe..0cecbe7 100644 > --- a/xen/include/asm-arm/p2m.h > +++ b/xen/include/asm-arm/p2m.h > @@ -2,6 +2,7 @@ > #define _XEN_P2M_H > > #include <xen/mm.h> > +#include <public/domctl.h> > > struct domain; > > @@ -41,6 +42,7 @@ typedef enum { > p2m_invalid = 0, /* Nothing mapped here */ > p2m_ram_rw, /* Normal read/write guest RAM */ > p2m_ram_ro, /* Read-only; writes are silently dropped */ > + p2m_ram_logdirty, /* Read-only: special mode for log dirty */ > p2m_mmio_direct, /* Read/write mapping of genuine MMIO area */ > p2m_map_foreign, /* Ram pages from foreign domain */ > p2m_grant_map_rw, /* Read/write grant mapping */ > @@ -49,7 +51,8 @@ typedef enum { > } p2m_type_t; > > #define p2m_is_foreign(_t) ((_t) == p2m_map_foreign) > -#define p2m_is_ram(_t) ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro) > +#define p2m_is_ram(_t) ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro || \ > + (_t) == p2m_ram_logdirty) > > /* Initialise vmid allocator */ > void p2m_vmid_allocator_init(void); > @@ -178,6 +181,9 @@ static inline int get_page_and_type(struct page_info *page, > return rc; > } > > +void p2m_change_entry_type_global(struct domain *d, enum mg nt); > +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc); > + > #endif /* _XEN_P2M_H */ > > /* > diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h > index 06e638f..9dc49c3 100644 > --- a/xen/include/asm-arm/processor.h > +++ b/xen/include/asm-arm/processor.h > @@ -399,6 +399,8 @@ union hsr { > #define FSC_CPR (0x3a) /* Coprocossor Abort */ > > #define FSC_LL_MASK (_AC(0x03,U)<<0) > +#define FSC_MASK (0x3f) /* Fault status mask */ > +#define FSC_3D_LEVEL (0x03) /* Third level fault*/ > > /* Time counter hypervisor control register */ > #define CNTHCTL_PA (1u<<0) /* Kernel/user access to physical counter */
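For context, the toolstack-side sequence these operations are intended to support (per the commit message) is: enable log-dirty once, peek/clean the bitmap on each migration round while resending the pages whose bits are set, then switch tracing off for the final stop-and-copy. A hypothetical sketch at the domctl level, where issue_domctl() stands in for whatever plumbing the toolstack uses to make the hypercall and the guest-handle/hypercall-buffer handling of the bitmap is glossed over:

    struct xen_domctl dc;
    uint8_t *bitmap;          /* caller-allocated, one bit per guest RAM page */

    memset(&dc, 0, sizeof(dc));
    dc.cmd = XEN_DOMCTL_shadow_op;
    dc.domain = domid;

    /* 1. turn dirty-page tracing on for the domain being migrated */
    dc.u.shadow_op.op = XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY;
    issue_domctl(&dc);                               /* hypothetical wrapper */

    /* 2. each round: read and clear the bitmap, resend the pages whose bit is set */
    dc.u.shadow_op.op = XEN_DOMCTL_SHADOW_OP_CLEAN;
    set_xen_guest_handle(dc.u.shadow_op.dirty_bitmap, bitmap);
    issue_domctl(&dc);
    /* dc.u.shadow_op.stats.dirty_count reports how many pages were dirtied */

    /* 3. after the final round, switch tracing off again */
    dc.u.shadow_op.op = XEN_DOMCTL_SHADOW_OP_OFF;
    issue_domctl(&dc);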