@@ -143,6 +143,13 @@ static int migration_maybe_pause(MigrationState *s,
int new_state);
static void migrate_fd_cancel(MigrationState *s);
+static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
+{
+ uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
+
+ return (a > b) - (a < b);
+}
+
void migration_object_init(void)
{
MachineState *ms = MACHINE(qdev_get_machine());
@@ -165,6 +172,8 @@ void migration_object_init(void)
qemu_event_init(¤t_incoming->main_thread_load_event, false);
qemu_sem_init(¤t_incoming->postcopy_pause_sem_dst, 0);
qemu_sem_init(¤t_incoming->postcopy_pause_sem_fault, 0);
+ qemu_mutex_init(¤t_incoming->page_request_mutex);
+ current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
if (!migration_object_check(current_migration, &err)) {
error_report_err(err);
@@ -240,6 +249,11 @@ void migration_incoming_state_destroy(void)
qemu_event_reset(&mis->main_thread_load_event);
+ if (mis->page_requested) {
+ g_tree_destroy(mis->page_requested);
+ mis->page_requested = NULL;
+ }
+
if (mis->socket_address_list) {
qapi_free_SocketAddressList(mis->socket_address_list);
mis->socket_address_list = NULL;
@@ -354,8 +368,33 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
}
int migrate_send_rp_req_pages(MigrationIncomingState *mis,
- RAMBlock *rb, ram_addr_t start)
+ RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
+ void *aligned = (void *)(uintptr_t)(haddr & (-qemu_ram_pagesize(rb)));
+ bool received;
+
+ WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
+ received = ramblock_recv_bitmap_test_byte_offset(rb, start);
+ if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
+ /*
+ * The page has not been received, and it's not yet in the page
+ * request list. Queue it. Set the value of element to 1, so that
+ * things like g_tree_lookup() will return TRUE (1) when found.
+ */
+ g_tree_insert(mis->page_requested, aligned, (gpointer)1);
+ mis->page_requested_count++;
+ trace_postcopy_page_req_add(aligned, mis->page_requested_count);
+ }
+ }
+
+ /*
+ * If the page is there, skip sending the message. We don't even need the
+ * lock because as long as the page arrived, it'll be there forever.
+ */
+ if (received) {
+ return 0;
+ }
+
return migrate_send_rp_message_req_pages(mis, rb, start);
}
@@ -104,6 +104,23 @@ struct MigrationIncomingState {
/* List of listening socket addresses */
SocketAddressList *socket_address_list;
+
+ /* A tree of pages that we requested to the source VM */
+ GTree *page_requested;
+ /* For debugging purpose only, but would be nice to keep */
+ int page_requested_count;
+ /*
+ * The mutex helps to maintain the requested pages that we sent to the
+ * source, IOW, to guarantee coherent between the page_requests tree and
+ * the per-ramblock receivedmap. Note! This does not guarantee consistency
+ * of the real page copy procedures (using UFFDIO_[ZERO]COPY). E.g., even
+ * if one bit in receivedmap is cleared, UFFDIO_COPY could have happened
+ * for that page already. This is intended so that the mutex won't
+ * serialize and blocked by slow operations like UFFDIO_* ioctls. However
+ * this should be enough to make sure the page_requested tree always
+ * contains valid information.
+ */
+ QemuMutex page_request_mutex;
};
MigrationIncomingState *migration_incoming_get_current(void);
@@ -331,7 +348,7 @@ void migrate_send_rp_shut(MigrationIncomingState *mis,
void migrate_send_rp_pong(MigrationIncomingState *mis,
uint32_t value);
int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb,
- ram_addr_t start);
+ ram_addr_t start, uint64_t haddr);
int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
RAMBlock *rb, ram_addr_t start);
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
@@ -684,7 +684,7 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
qemu_ram_get_idstr(rb), rb_offset);
return postcopy_wake_shared(pcfd, client_addr, rb);
}
- migrate_send_rp_req_pages(mis, rb, aligned_rbo);
+ migrate_send_rp_req_pages(mis, rb, aligned_rbo, client_addr);
return 0;
}
@@ -979,7 +979,8 @@ retry:
* Send the request to the source - we want to request one
* of our host page sizes (which is >= TPS)
*/
- ret = migrate_send_rp_req_pages(mis, rb, rb_offset);
+ ret = migrate_send_rp_req_pages(mis, rb, rb_offset,
+ msg.arg.pagefault.address);
if (ret) {
/* May be network failure, try to wait for recovery */
if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
@@ -1149,10 +1150,20 @@ static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
}
if (!ret) {
+ qemu_mutex_lock(&mis->page_request_mutex);
ramblock_recv_bitmap_set_range(rb, host_addr,
pagesize / qemu_target_page_size());
+ /*
+ * If this page resolves a page fault for a previous recorded faulted
+ * address, take a special note to maintain the requested page list.
+ */
+ if (g_tree_lookup(mis->page_requested, host_addr)) {
+ g_tree_remove(mis->page_requested, host_addr);
+ mis->page_requested_count--;
+ trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
+ }
+ qemu_mutex_unlock(&mis->page_request_mutex);
mark_postcopy_blocktime_end((uintptr_t)host_addr);
-
}
return ret;
}
@@ -162,6 +162,7 @@ postcopy_pause_return_path(void) ""
postcopy_pause_return_path_continued(void) ""
postcopy_pause_continued(void) ""
postcopy_start_set_run(void) ""
+postcopy_page_req_add(void *addr, int count) "new page req %p total %d"
source_return_path_thread_bad_end(void) ""
source_return_path_thread_end(void) ""
source_return_path_thread_entry(void) ""
@@ -272,6 +273,7 @@ postcopy_ram_incoming_cleanup_blocktime(uint64_t total) "total blocktime %" PRIu
postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_offset) "for %s in %s offset 0x%"PRIx64
postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64
postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s"
+postcopy_page_req_del(void *addr, int count) "resolved page req %p total %d"
get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u"