Message ID | 1603449643-12851-1-git-send-email-kwankhede@nvidia.com |
---|---|
Headers | show |
Series | Add migration support for VFIO devices | expand |
On Fri, Oct 23, 2020 at 04:10:33PM +0530, Kirti Wankhede wrote: > Define flags to be used as delimiter in migration stream for VFIO devices. > Added .save_setup and .save_cleanup functions. Map & unmap migration > region from these functions at source during saving or pre-copy phase. > > Set VFIO device state depending on VM's state. During live migration, VM is > running when .save_setup is called, _SAVING | _RUNNING state is set for VFIO > device. During save-restore, VM is paused, _SAVING state is set for VFIO device. > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > Reviewed-by: Neo Jia <cjia@nvidia.com> > --- > hw/vfio/migration.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++ > hw/vfio/trace-events | 2 + > 2 files changed, 104 insertions(+) > > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > index a0f0e79b9b73..94d2bdae5c54 100644 > --- a/hw/vfio/migration.c > +++ b/hw/vfio/migration.c > @@ -8,12 +8,15 @@ > */ > > #include "qemu/osdep.h" > +#include "qemu/main-loop.h" > +#include "qemu/cutils.h" > #include <linux/vfio.h> > > #include "sysemu/runstate.h" > #include "hw/vfio/vfio-common.h" > #include "cpu.h" > #include "migration/migration.h" > +#include "migration/vmstate.h" > #include "migration/qemu-file.h" > #include "migration/register.h" > #include "migration/blocker.h" > @@ -25,6 +28,22 @@ > #include "trace.h" > #include "hw/hw.h" > > +/* > + * Flags to be used as unique delimiters for VFIO devices in the migration > + * stream. These flags are composed as: > + * 0xffffffff => MSB 32-bit all 1s > + * 0xef10 => Magic ID, represents emulated (virtual) function IO > + * 0x0000 => 16-bits reserved for flags > + * > + * The beginning of state information is marked by _DEV_CONFIG_STATE, > + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a > + * certain state information is marked by _END_OF_STATE. 
> + */ > +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) > +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) > +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) > +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) > + > static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, > off_t off, bool iswrite) > { > @@ -129,6 +148,75 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, > return 0; > } > > +static void vfio_migration_cleanup(VFIODevice *vbasedev) > +{ > + VFIOMigration *migration = vbasedev->migration; > + > + if (migration->region.mmaps) { > + vfio_region_unmap(&migration->region); > + } > +} > + > +/* ---------------------------------------------------------------------- */ > + > +static int vfio_save_setup(QEMUFile *f, void *opaque) > +{ > + VFIODevice *vbasedev = opaque; > + VFIOMigration *migration = vbasedev->migration; > + int ret; > + > + trace_vfio_save_setup(vbasedev->name); > + > + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); > + > + if (migration->region.mmaps) { > + /* > + * Calling vfio_region_mmap() from migration thread. Memory API called > + * from this function require locking the iothread when called from > + * outside the main loop thread. > + */ > + qemu_mutex_lock_iothread(); > + ret = vfio_region_mmap(&migration->region); > + qemu_mutex_unlock_iothread(); > + if (ret) { > + error_report("%s: Failed to mmap VFIO migration region: %s", > + vbasedev->name, strerror(-ret)); > + error_report("%s: Falling back to slow path", vbasedev->name); > + } > + } > + > + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK, > + VFIO_DEVICE_STATE_SAVING); > + if (ret) { > + error_report("%s: Failed to set state SAVING", vbasedev->name); > + return ret; > + } > + is it possible to call vfio_update_pending() and vfio_save_buffer() here? 
so that vendor driver has a chance to hook compatibility checking string early in save_setup stage and can avoid to hook the string in both precopy iteration stage and stop and copy stage. But I think it's ok if we agree to add this later. Besides that, Reviewed-by: Yan Zhao <yan.y.zhao@intel.com> > + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); > + > + ret = qemu_file_get_error(f); > + if (ret) { > + return ret; > + } > + > + return 0; > +} > + > +static void vfio_save_cleanup(void *opaque) > +{ > + VFIODevice *vbasedev = opaque; > + > + vfio_migration_cleanup(vbasedev); > + trace_vfio_save_cleanup(vbasedev->name); > +} > + > +static SaveVMHandlers savevm_vfio_handlers = { > + .save_setup = vfio_save_setup, > + .save_cleanup = vfio_save_cleanup, > +}; > + > +/* ---------------------------------------------------------------------- */ > + > static void vfio_vmstate_change(void *opaque, int running, RunState state) > { > VFIODevice *vbasedev = opaque; > @@ -217,6 +305,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, > int ret; > Object *obj; > VFIOMigration *migration; > + char id[256] = ""; > + g_autofree char *path = NULL, *oid = NULL; > > if (!vbasedev->ops->vfio_get_object) { > return -EINVAL; > @@ -247,6 +337,18 @@ static int vfio_migration_init(VFIODevice *vbasedev, > } > > migration->vbasedev = vbasedev; > + > + oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); > + if (oid) { > + path = g_strdup_printf("%s/vfio", oid); > + } else { > + path = g_strdup("vfio"); > + } > + strpadcpy(id, sizeof(id), path, '\0'); > + > + register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers, > + vbasedev); > + > migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, > vbasedev); > migration->migration_state.notify = vfio_migration_state_notifier; > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index 78d7d83b5ef8..f148b5e828c1 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -151,3 +151,5 @@ 
vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" > vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" > vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" > vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" > +vfio_save_setup(const char *name) " (%s)" > +vfio_save_cleanup(const char *name) " (%s)" > -- > 2.7.0 >
hi when I migrating VFs, the PCI_COMMAND is not properly saved. and the target side would meet below bug root@tester:~# [ 189.360671] ++++++++++>> reset starts here: iavf_reset_task !!! [ 199.360798] iavf 0000:00:04.0: Reset never finished (0) [ 199.380504] kernel BUG at drivers/pci/msi.c:352! [ 199.382957] invalid opcode: 0000 [#1] SMP PTI [ 199.384855] CPU: 1 PID: 419 Comm: kworker/1:2 Tainted: G OE 5.0.0-13-generic #14-Ubuntu [ 199.388204] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 199.392401] Workqueue: events iavf_reset_task [iavf] [ 199.393586] RIP: 0010:free_msi_irqs+0x17b/0x1b0 [ 199.394659] Code: 84 e1 fe ff ff 45 31 f6 eb 11 41 83 c6 01 44 39 73 14 0f 86 ce fe ff ff 8b 7b 10 44 01 f7 e8 3c 7a ba ff 48 83 78 70 00 74 e0 <0f> 0b 49 8d b5 b0 00 00 00 e8 07 27 bb ff e9 cf fe ff ff 48 8b 78 [ 199.399056] RSP: 0018:ffffabd1006cfdb8 EFLAGS: 00010282 [ 199.400302] RAX: ffff9e336d8a2800 RBX: ffff9e3333b006c0 RCX: 0000000000000000 [ 199.402000] RDX: 0000000000000000 RSI: 0000000000000019 RDI: ffffffffbaa68100 [ 199.403168] RBP: ffffabd1006cfde8 R08: ffff9e3375000248 R09: ffff9e3375000338 [ 199.404343] R10: 0000000000000000 R11: ffffffffbaa68108 R12: ffff9e3374ef12c0 [ 199.405526] R13: ffff9e3374ef1000 R14: 0000000000000000 R15: ffff9e3371f2d018 [ 199.406702] FS: 0000000000000000(0000) GS:ffff9e3375b00000(0000) knlGS:0000000000000000 [ 199.408027] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 199.408987] CR2: 00000000ffffffff CR3: 0000000033266000 CR4: 00000000000006e0 [ 199.410155] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 199.411321] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 199.412437] Call Trace: [ 199.412750] pci_disable_msix+0xf3/0x120 [ 199.413227] iavf_reset_interrupt_capability.part.40+0x19/0x40 [iavf] [ 199.413998] iavf_reset_task+0x4b3/0x9d0 [iavf] [ 199.414544] process_one_work+0x20f/0x410 [ 199.415026] 
worker_thread+0x34/0x400 [ 199.415486] kthread+0x120/0x140 [ 199.415876] ? process_one_work+0x410/0x410 [ 199.416380] ? __kthread_parkme+0x70/0x70 [ 199.416864] ret_from_fork+0x35/0x40 I fixed it with below patch. commit ad3efa0eeea7edb352294bfce35b904b8d3c759c Author: Yan Zhao <yan.y.zhao@intel.com> Date: Sat Oct 24 19:45:01 2020 +0800 msix fix. Signed-off-by: Yan Zhao <yan.y.zhao@intel.com> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f63f15b553..92f71bf933 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2423,8 +2423,14 @@ const VMStateDescription vmstate_vfio_pci_config = { static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + PCIDevice *pdev = &vdev->pdev; + uint16_t pci_cmd; + + pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); + qemu_put_be16(f, pci_cmd); vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); + } static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) @@ -2432,6 +2438,10 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); PCIDevice *pdev = &vdev->pdev; int ret; + uint16_t pci_cmd; + + pci_cmd = qemu_get_be16(f); + vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2); ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); if (ret) { On Fri, Oct 23, 2020 at 04:10:29PM +0530, Kirti Wankhede wrote: > Added functions to save and restore PCI device specific data, > specifically config space of PCI device. 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > Reviewed-by: Neo Jia <cjia@nvidia.com> > --- > hw/vfio/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++ > include/hw/vfio/vfio-common.h | 2 ++ > 2 files changed, 50 insertions(+) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index bffd5bfe3b78..92cc25a5489f 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -41,6 +41,7 @@ > #include "trace.h" > #include "qapi/error.h" > #include "migration/blocker.h" > +#include "migration/qemu-file.h" > > #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" > > @@ -2401,11 +2402,58 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev) > return OBJECT(vdev); > } > > +static bool vfio_msix_present(void *opaque, int version_id) > +{ > + PCIDevice *pdev = opaque; > + > + return msix_present(pdev); > +} > + > +const VMStateDescription vmstate_vfio_pci_config = { > + .name = "VFIOPCIDevice", > + .version_id = 1, > + .minimum_version_id = 1, > + .fields = (VMStateField[]) { > + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), > + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), > + VMSTATE_END_OF_LIST() > + } > +}; > + > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > +{ > + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > + > + vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); > +} > + > +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > +{ > + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > + PCIDevice *pdev = &vdev->pdev; > + int ret; > + > + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); > + if (ret) { > + return ret; > + } > + > + if (msi_enabled(pdev)) { > + vfio_msi_enable(vdev); > + } else if (msix_enabled(pdev)) { > + vfio_msix_enable(vdev); > + } > + > + return ret; > +} > + > static VFIODeviceOps vfio_pci_ops = { > .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, > .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, > 
.vfio_eoi = vfio_intx_eoi, > .vfio_get_object = vfio_pci_get_object, > + .vfio_save_config = vfio_pci_save_config, > + .vfio_load_config = vfio_pci_load_config, > }; > > int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index fe99c36a693a..ba6169cd926e 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -120,6 +120,8 @@ struct VFIODeviceOps { > int (*vfio_hot_reset_multi)(VFIODevice *vdev); > void (*vfio_eoi)(VFIODevice *vdev); > Object *(*vfio_get_object)(VFIODevice *vdev); > + void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f); > + int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); > }; > > typedef struct VFIOGroup { > -- > 2.7.0 >
On Sat, 24 Oct 2020 19:53:39 +0800 Yan Zhao <yan.y.zhao@intel.com> wrote: > hi > when I migrating VFs, the PCI_COMMAND is not properly saved. and the > target side would meet below bug > root@tester:~# [ 189.360671] ++++++++++>> reset starts here: iavf_reset_task !!! > [ 199.360798] iavf 0000:00:04.0: Reset never finished (0) > [ 199.380504] kernel BUG at drivers/pci/msi.c:352! > [ 199.382957] invalid opcode: 0000 [#1] SMP PTI > [ 199.384855] CPU: 1 PID: 419 Comm: kworker/1:2 Tainted: G OE 5.0.0-13-generic #14-Ubuntu > [ 199.388204] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 > [ 199.392401] Workqueue: events iavf_reset_task [iavf] > [ 199.393586] RIP: 0010:free_msi_irqs+0x17b/0x1b0 > [ 199.394659] Code: 84 e1 fe ff ff 45 31 f6 eb 11 41 83 c6 01 44 39 73 14 0f 86 ce fe ff ff 8b 7b 10 44 01 f7 e8 3c 7a ba ff 48 83 78 70 00 74 e0 <0f> 0b 49 8d b5 b0 00 00 00 e8 07 27 bb ff e9 cf fe ff ff 48 8b 78 > [ 199.399056] RSP: 0018:ffffabd1006cfdb8 EFLAGS: 00010282 > [ 199.400302] RAX: ffff9e336d8a2800 RBX: ffff9e3333b006c0 RCX: 0000000000000000 > [ 199.402000] RDX: 0000000000000000 RSI: 0000000000000019 RDI: ffffffffbaa68100 > [ 199.403168] RBP: ffffabd1006cfde8 R08: ffff9e3375000248 R09: ffff9e3375000338 > [ 199.404343] R10: 0000000000000000 R11: ffffffffbaa68108 R12: ffff9e3374ef12c0 > [ 199.405526] R13: ffff9e3374ef1000 R14: 0000000000000000 R15: ffff9e3371f2d018 > [ 199.406702] FS: 0000000000000000(0000) GS:ffff9e3375b00000(0000) knlGS:0000000000000000 > [ 199.408027] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 199.408987] CR2: 00000000ffffffff CR3: 0000000033266000 CR4: 00000000000006e0 > [ 199.410155] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 199.411321] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > [ 199.412437] Call Trace: > [ 199.412750] pci_disable_msix+0xf3/0x120 > [ 199.413227] iavf_reset_interrupt_capability.part.40+0x19/0x40 [iavf] > 
[ 199.413998] iavf_reset_task+0x4b3/0x9d0 [iavf] > [ 199.414544] process_one_work+0x20f/0x410 > [ 199.415026] worker_thread+0x34/0x400 > [ 199.415486] kthread+0x120/0x140 > [ 199.415876] ? process_one_work+0x410/0x410 > [ 199.416380] ? __kthread_parkme+0x70/0x70 > [ 199.416864] ret_from_fork+0x35/0x40 > > I fixed it with below patch. > > > commit ad3efa0eeea7edb352294bfce35b904b8d3c759c > Author: Yan Zhao <yan.y.zhao@intel.com> > Date: Sat Oct 24 19:45:01 2020 +0800 > > msix fix. > > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index f63f15b553..92f71bf933 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -2423,8 +2423,14 @@ const VMStateDescription vmstate_vfio_pci_config = { > static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > { > VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > + PCIDevice *pdev = &vdev->pdev; > + uint16_t pci_cmd; > + > + pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); > + qemu_put_be16(f, pci_cmd); > > vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); > + > } > > static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > @@ -2432,6 +2438,10 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > PCIDevice *pdev = &vdev->pdev; > int ret; > + uint16_t pci_cmd; > + > + pci_cmd = qemu_get_be16(f); > + vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2); > > ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); > if (ret) { > We need to avoid this sort of ad-hoc stuffing random fields into the config stream. 
The command register is already migrated in vconfig, it only needs to be written through vfio: vfio_pci_write_config(pdev, PCI_COMMAND, pci_get_word(pdev->config, PCI_COMMAND), 2); Thanks, Alex > On Fri, Oct 23, 2020 at 04:10:29PM +0530, Kirti Wankhede wrote: > > Added functions to save and restore PCI device specific data, > > specifically config space of PCI device. > > > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > > Reviewed-by: Neo Jia <cjia@nvidia.com> > > --- > > hw/vfio/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++ > > include/hw/vfio/vfio-common.h | 2 ++ > > 2 files changed, 50 insertions(+) > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > index bffd5bfe3b78..92cc25a5489f 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -41,6 +41,7 @@ > > #include "trace.h" > > #include "qapi/error.h" > > #include "migration/blocker.h" > > +#include "migration/qemu-file.h" > > > > #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" > > > > @@ -2401,11 +2402,58 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev) > > return OBJECT(vdev); > > } > > > > +static bool vfio_msix_present(void *opaque, int version_id) > > +{ > > + PCIDevice *pdev = opaque; > > + > > + return msix_present(pdev); > > +} > > + > > +const VMStateDescription vmstate_vfio_pci_config = { > > + .name = "VFIOPCIDevice", > > + .version_id = 1, > > + .minimum_version_id = 1, > > + .fields = (VMStateField[]) { > > + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), > > + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), > > + VMSTATE_END_OF_LIST() > > + } > > +}; > > + > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > > +{ > > + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > > + > > + vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); > > +} > > + > > +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > > +{ > > + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > 
> + PCIDevice *pdev = &vdev->pdev; > > + int ret; > > + > > + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); > > + if (ret) { > > + return ret; > > + } > > + > > + if (msi_enabled(pdev)) { > > + vfio_msi_enable(vdev); > > + } else if (msix_enabled(pdev)) { > > + vfio_msix_enable(vdev); > > + } > > + > > + return ret; > > +} > > + > > static VFIODeviceOps vfio_pci_ops = { > > .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, > > .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, > > .vfio_eoi = vfio_intx_eoi, > > .vfio_get_object = vfio_pci_get_object, > > + .vfio_save_config = vfio_pci_save_config, > > + .vfio_load_config = vfio_pci_load_config, > > }; > > > > int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > > index fe99c36a693a..ba6169cd926e 100644 > > --- a/include/hw/vfio/vfio-common.h > > +++ b/include/hw/vfio/vfio-common.h > > @@ -120,6 +120,8 @@ struct VFIODeviceOps { > > int (*vfio_hot_reset_multi)(VFIODevice *vdev); > > void (*vfio_eoi)(VFIODevice *vdev); > > Object *(*vfio_get_object)(VFIODevice *vdev); > > + void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f); > > + int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); > > }; > > > > typedef struct VFIOGroup { > > -- > > 2.7.0 > > >
Hi Kirti, On 10/23/20 12:40 PM, Kirti Wankhede wrote: > Hi, > > This Patch set adds migration support for VFIO devices in QEMU. ... > Since there is no device which has hardware support for system memmory > dirty bitmap tracking, right now there is no other API from vendor driver > to VFIO IOMMU module to report dirty pages. In future, when such hardware > support will be implemented, an API will be required in kernel such that > vendor driver could report dirty pages to VFIO module during migration phases. > > Below is the flow of state change for live migration where states in brackets > represent VM state, migration state and VFIO device state as: > (VM state, MIGRATION_STATUS, VFIO_DEVICE_STATE) > > Live migration save path: > QEMU normal running state > (RUNNING, _NONE, _RUNNING) > | > migrate_init spawns migration_thread. > (RUNNING, _SETUP, _RUNNING|_SAVING) > Migration thread then calls each device's .save_setup() > | > (RUNNING, _ACTIVE, _RUNNING|_SAVING) > If device is active, get pending bytes by .save_live_pending() > if pending bytes >= threshold_size, call save_live_iterate() > Data of VFIO device for pre-copy phase is copied. > Iterate till total pending bytes converge and are less than threshold > | > On migration completion, vCPUs stops and calls .save_live_complete_precopy > for each active device. VFIO device is then transitioned in > _SAVING state. > (FINISH_MIGRATE, _DEVICE, _SAVING) > For VFIO device, iterate in .save_live_complete_precopy until > pending data is 0. > (FINISH_MIGRATE, _DEVICE, _STOPPED) > | > (FINISH_MIGRATE, _COMPLETED, _STOPPED) > Migraton thread schedule cleanup bottom half and exit > > Live migration resume path: > Incomming migration calls .load_setup for each device > (RESTORE_VM, _ACTIVE, _STOPPED) > | > For each device, .load_state is called for that device section data > (RESTORE_VM, _ACTIVE, _RESUMING) > | > At the end, called .load_cleanup for each device and vCPUs are started. 
> | > (RUNNING, _NONE, _RUNNING) > > Note that: > - Migration post copy is not supported. Can you commit this ^^^ somewhere in docs/devel/ please? (as a patch on top of this series)
On 10/24/2020 10:26 PM, Philippe Mathieu-Daudé wrote: > Hi Kirti, > > On 10/23/20 12:40 PM, Kirti Wankhede wrote: >> Hi, >> >> This Patch set adds migration support for VFIO devices in QEMU. > ... > >> Since there is no device which has hardware support for system memmory >> dirty bitmap tracking, right now there is no other API from vendor driver >> to VFIO IOMMU module to report dirty pages. In future, when such hardware >> support will be implemented, an API will be required in kernel such that >> vendor driver could report dirty pages to VFIO module during migration >> phases. >> >> Below is the flow of state change for live migration where states in >> brackets >> represent VM state, migration state and VFIO device state as: >> (VM state, MIGRATION_STATUS, VFIO_DEVICE_STATE) >> >> Live migration save path: >> QEMU normal running state >> (RUNNING, _NONE, _RUNNING) >> | >> migrate_init spawns migration_thread. >> (RUNNING, _SETUP, _RUNNING|_SAVING) >> Migration thread then calls each device's .save_setup() >> | >> (RUNNING, _ACTIVE, _RUNNING|_SAVING) >> If device is active, get pending bytes by .save_live_pending() >> if pending bytes >= threshold_size, call save_live_iterate() >> Data of VFIO device for pre-copy phase is copied. >> Iterate till total pending bytes converge and are less than >> threshold >> | >> On migration completion, vCPUs stops and calls >> .save_live_complete_precopy >> for each active device. VFIO device is then transitioned in >> _SAVING state. >> (FINISH_MIGRATE, _DEVICE, _SAVING) >> For VFIO device, iterate in .save_live_complete_precopy until >> pending data is 0. 
>> (FINISH_MIGRATE, _DEVICE, _STOPPED) >> | >> (FINISH_MIGRATE, _COMPLETED, _STOPPED) >> Migraton thread schedule cleanup bottom half and exit >> >> Live migration resume path: >> Incomming migration calls .load_setup for each device >> (RESTORE_VM, _ACTIVE, _STOPPED) >> | >> For each device, .load_state is called for that device section data >> (RESTORE_VM, _ACTIVE, _RESUMING) >> | >> At the end, called .load_cleanup for each device and vCPUs are >> started. >> | >> (RUNNING, _NONE, _RUNNING) >> >> Note that: >> - Migration post copy is not supported. > > Can you commit this ^^^ somewhere in docs/devel/ please? > (as a patch on top of this series) > Philippe, Alex, I'm going to respin this series with the r-bs and the fix suggested by Yan. Should this doc be part of this series, or can we add it later, after 10/27, in case review of the doc needs further iterations? Thanks, Kirti
On 10/24/20 7:48 PM, Kirti Wankhede wrote: > On 10/24/2020 10:26 PM, Philippe Mathieu-Daudé wrote: >> Hi Kirti, >> >> On 10/23/20 12:40 PM, Kirti Wankhede wrote: >>> Hi, >>> >>> This Patch set adds migration support for VFIO devices in QEMU. >> ... >> >>> Since there is no device which has hardware support for system memmory >>> dirty bitmap tracking, right now there is no other API from vendor >>> driver >>> to VFIO IOMMU module to report dirty pages. In future, when such >>> hardware >>> support will be implemented, an API will be required in kernel such that >>> vendor driver could report dirty pages to VFIO module during >>> migration phases. >>> >>> Below is the flow of state change for live migration where states in >>> brackets >>> represent VM state, migration state and VFIO device state as: >>> (VM state, MIGRATION_STATUS, VFIO_DEVICE_STATE) >>> >>> Live migration save path: >>> QEMU normal running state >>> (RUNNING, _NONE, _RUNNING) >>> | >>> migrate_init spawns migration_thread. >>> (RUNNING, _SETUP, _RUNNING|_SAVING) >>> Migration thread then calls each device's .save_setup() >>> | >>> (RUNNING, _ACTIVE, _RUNNING|_SAVING) >>> If device is active, get pending bytes by .save_live_pending() >>> if pending bytes >= threshold_size, call save_live_iterate() >>> Data of VFIO device for pre-copy phase is copied. >>> Iterate till total pending bytes converge and are less than >>> threshold >>> | >>> On migration completion, vCPUs stops and calls >>> .save_live_complete_precopy >>> for each active device. VFIO device is then transitioned in >>> _SAVING state. >>> (FINISH_MIGRATE, _DEVICE, _SAVING) >>> For VFIO device, iterate in .save_live_complete_precopy until >>> pending data is 0. 
>>> (FINISH_MIGRATE, _DEVICE, _STOPPED) >>> | >>> (FINISH_MIGRATE, _COMPLETED, _STOPPED) >>> Migraton thread schedule cleanup bottom half and exit >>> >>> Live migration resume path: >>> Incomming migration calls .load_setup for each device >>> (RESTORE_VM, _ACTIVE, _STOPPED) >>> | >>> For each device, .load_state is called for that device section data >>> (RESTORE_VM, _ACTIVE, _RESUMING) >>> | >>> At the end, called .load_cleanup for each device and vCPUs are >>> started. >>> | >>> (RUNNING, _NONE, _RUNNING) >>> >>> Note that: >>> - Migration post copy is not supported. >> >> Can you commit this ^^^ somewhere in docs/devel/ please? >> (as a patch on top of this series) >> > > Philippe, Alex, > I'm going to respin this series with r-bs and fix suggested by Yan. > Should this doc be part of this series or we can add it later after > 10/27 if again review of this doc would need some iterations? I suppose it is up to the maintainer; no objection on my part. This information seems valuable and I wouldn't like it to be lost. If by 10/27 you mean the "soft freeze", then there is no problem adding documentation patches after that date :) Regards, Phil.
On 10/24/2020 7:46 PM, Alex Williamson wrote: > On Sat, 24 Oct 2020 19:53:39 +0800 > Yan Zhao <yan.y.zhao@intel.com> wrote: > >> hi >> when I migrating VFs, the PCI_COMMAND is not properly saved. and the >> target side would meet below bug >> root@tester:~# [ 189.360671] ++++++++++>> reset starts here: iavf_reset_task !!! >> [ 199.360798] iavf 0000:00:04.0: Reset never finished (0) >> [ 199.380504] kernel BUG at drivers/pci/msi.c:352! >> [ 199.382957] invalid opcode: 0000 [#1] SMP PTI >> [ 199.384855] CPU: 1 PID: 419 Comm: kworker/1:2 Tainted: G OE 5.0.0-13-generic #14-Ubuntu >> [ 199.388204] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 >> [ 199.392401] Workqueue: events iavf_reset_task [iavf] >> [ 199.393586] RIP: 0010:free_msi_irqs+0x17b/0x1b0 >> [ 199.394659] Code: 84 e1 fe ff ff 45 31 f6 eb 11 41 83 c6 01 44 39 73 14 0f 86 ce fe ff ff 8b 7b 10 44 01 f7 e8 3c 7a ba ff 48 83 78 70 00 74 e0 <0f> 0b 49 8d b5 b0 00 00 00 e8 07 27 bb ff e9 cf fe ff ff 48 8b 78 >> [ 199.399056] RSP: 0018:ffffabd1006cfdb8 EFLAGS: 00010282 >> [ 199.400302] RAX: ffff9e336d8a2800 RBX: ffff9e3333b006c0 RCX: 0000000000000000 >> [ 199.402000] RDX: 0000000000000000 RSI: 0000000000000019 RDI: ffffffffbaa68100 >> [ 199.403168] RBP: ffffabd1006cfde8 R08: ffff9e3375000248 R09: ffff9e3375000338 >> [ 199.404343] R10: 0000000000000000 R11: ffffffffbaa68108 R12: ffff9e3374ef12c0 >> [ 199.405526] R13: ffff9e3374ef1000 R14: 0000000000000000 R15: ffff9e3371f2d018 >> [ 199.406702] FS: 0000000000000000(0000) GS:ffff9e3375b00000(0000) knlGS:0000000000000000 >> [ 199.408027] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 >> [ 199.408987] CR2: 00000000ffffffff CR3: 0000000033266000 CR4: 00000000000006e0 >> [ 199.410155] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 >> [ 199.411321] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 >> [ 199.412437] Call Trace: >> [ 199.412750] 
pci_disable_msix+0xf3/0x120 >> [ 199.413227] iavf_reset_interrupt_capability.part.40+0x19/0x40 [iavf] >> [ 199.413998] iavf_reset_task+0x4b3/0x9d0 [iavf] >> [ 199.414544] process_one_work+0x20f/0x410 >> [ 199.415026] worker_thread+0x34/0x400 >> [ 199.415486] kthread+0x120/0x140 >> [ 199.415876] ? process_one_work+0x410/0x410 >> [ 199.416380] ? __kthread_parkme+0x70/0x70 >> [ 199.416864] ret_from_fork+0x35/0x40 >> I verified MSIx with SRIOV VF, and I don't see this issue at my end. >> I fixed it with below patch. >> >> >> commit ad3efa0eeea7edb352294bfce35b904b8d3c759c >> Author: Yan Zhao <yan.y.zhao@intel.com> >> Date: Sat Oct 24 19:45:01 2020 +0800 >> >> msix fix. >> >> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com> >> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index f63f15b553..92f71bf933 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -2423,8 +2423,14 @@ const VMStateDescription vmstate_vfio_pci_config = { >> static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) >> { >> VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >> + PCIDevice *pdev = &vdev->pdev; >> + uint16_t pci_cmd; >> + >> + pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); >> + qemu_put_be16(f, pci_cmd); >> >> vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); >> + >> } >> >> static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) >> @@ -2432,6 +2438,10 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) >> VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >> PCIDevice *pdev = &vdev->pdev; >> int ret; >> + uint16_t pci_cmd; >> + >> + pci_cmd = qemu_get_be16(f); >> + vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2); >> >> ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); >> if (ret) { >> > > > We need to avoid this sort of ad-hoc stuffing random fields into the > config stream. 
The command register is already migrated in vconfig, it > only needs to be written through vfio: > > vfio_pci_write_config(pdev, PCI_COMMAND, > pci_get_word(pdev->config, PCI_COMMAND), 2); > I verified at my end again. pci command value (using pci_default_read_config()) before vmstate_save_state() is 0x507 and at destination after vmstate_load_state() is also 0x507 - with pci_default_read_config() and the cached config space value using pci_get_word() - both are 0x507. VM restores successfully. Yan, can you share pci command values before and after as above? what exactly is missing? Thanks, Kirti > Thanks, > Alex > > >> On Fri, Oct 23, 2020 at 04:10:29PM +0530, Kirti Wankhede wrote: >>> Added functions to save and restore PCI device specific data, >>> specifically config space of PCI device. >>> >>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> >>> Reviewed-by: Neo Jia <cjia@nvidia.com> >>> --- >>> hw/vfio/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++ >>> include/hw/vfio/vfio-common.h | 2 ++ >>> 2 files changed, 50 insertions(+) >>> >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >>> index bffd5bfe3b78..92cc25a5489f 100644 >>> --- a/hw/vfio/pci.c >>> +++ b/hw/vfio/pci.c >>> @@ -41,6 +41,7 @@ >>> #include "trace.h" >>> #include "qapi/error.h" >>> #include "migration/blocker.h" >>> +#include "migration/qemu-file.h" >>> >>> #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" >>> >>> @@ -2401,11 +2402,58 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev) >>> return OBJECT(vdev); >>> } >>> >>> +static bool vfio_msix_present(void *opaque, int version_id) >>> +{ >>> + PCIDevice *pdev = opaque; >>> + >>> + return msix_present(pdev); >>> +} >>> + >>> +const VMStateDescription vmstate_vfio_pci_config = { >>> + .name = "VFIOPCIDevice", >>> + .version_id = 1, >>> + .minimum_version_id = 1, >>> + .fields = (VMStateField[]) { >>> + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), >>> + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), >>> + 
VMSTATE_END_OF_LIST() >>> + } >>> +}; >>> + >>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) >>> +{ >>> + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >>> + >>> + vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); >>> +} >>> + >>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) >>> +{ >>> + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >>> + PCIDevice *pdev = &vdev->pdev; >>> + int ret; >>> + >>> + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); >>> + if (ret) { >>> + return ret; >>> + } >>> + >>> + if (msi_enabled(pdev)) { >>> + vfio_msi_enable(vdev); >>> + } else if (msix_enabled(pdev)) { >>> + vfio_msix_enable(vdev); >>> + } >>> + >>> + return ret; >>> +} >>> + >>> static VFIODeviceOps vfio_pci_ops = { >>> .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, >>> .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, >>> .vfio_eoi = vfio_intx_eoi, >>> .vfio_get_object = vfio_pci_get_object, >>> + .vfio_save_config = vfio_pci_save_config, >>> + .vfio_load_config = vfio_pci_load_config, >>> }; >>> >>> int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) >>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >>> index fe99c36a693a..ba6169cd926e 100644 >>> --- a/include/hw/vfio/vfio-common.h >>> +++ b/include/hw/vfio/vfio-common.h >>> @@ -120,6 +120,8 @@ struct VFIODeviceOps { >>> int (*vfio_hot_reset_multi)(VFIODevice *vdev); >>> void (*vfio_eoi)(VFIODevice *vdev); >>> Object *(*vfio_get_object)(VFIODevice *vdev); >>> + void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f); >>> + int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); >>> }; >>> >>> typedef struct VFIOGroup { >>> -- >>> 2.7.0 >>> >> >
On Sun, 25 Oct 2020 01:18:37 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 10/24/2020 7:46 PM, Alex Williamson wrote: > > On Sat, 24 Oct 2020 19:53:39 +0800 > > Yan Zhao <yan.y.zhao@intel.com> wrote: > > > >> hi > >> when I migrating VFs, the PCI_COMMAND is not properly saved. and the > >> target side would meet below bug > >> root@tester:~# [ 189.360671] ++++++++++>> reset starts here: iavf_reset_task !!! > >> [ 199.360798] iavf 0000:00:04.0: Reset never finished (0) > >> [ 199.380504] kernel BUG at drivers/pci/msi.c:352! > >> [ 199.382957] invalid opcode: 0000 [#1] SMP PTI > >> [ 199.384855] CPU: 1 PID: 419 Comm: kworker/1:2 Tainted: G OE 5.0.0-13-generic #14-Ubuntu > >> [ 199.388204] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 > >> [ 199.392401] Workqueue: events iavf_reset_task [iavf] > >> [ 199.393586] RIP: 0010:free_msi_irqs+0x17b/0x1b0 > >> [ 199.394659] Code: 84 e1 fe ff ff 45 31 f6 eb 11 41 83 c6 01 44 39 73 14 0f 86 ce fe ff ff 8b 7b 10 44 01 f7 e8 3c 7a ba ff 48 83 78 70 00 74 e0 <0f> 0b 49 8d b5 b0 00 00 00 e8 07 27 bb ff e9 cf fe ff ff 48 8b 78 > >> [ 199.399056] RSP: 0018:ffffabd1006cfdb8 EFLAGS: 00010282 > >> [ 199.400302] RAX: ffff9e336d8a2800 RBX: ffff9e3333b006c0 RCX: 0000000000000000 > >> [ 199.402000] RDX: 0000000000000000 RSI: 0000000000000019 RDI: ffffffffbaa68100 > >> [ 199.403168] RBP: ffffabd1006cfde8 R08: ffff9e3375000248 R09: ffff9e3375000338 > >> [ 199.404343] R10: 0000000000000000 R11: ffffffffbaa68108 R12: ffff9e3374ef12c0 > >> [ 199.405526] R13: ffff9e3374ef1000 R14: 0000000000000000 R15: ffff9e3371f2d018 > >> [ 199.406702] FS: 0000000000000000(0000) GS:ffff9e3375b00000(0000) knlGS:0000000000000000 > >> [ 199.408027] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > >> [ 199.408987] CR2: 00000000ffffffff CR3: 0000000033266000 CR4: 00000000000006e0 > >> [ 199.410155] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > >> [ 199.411321] 
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > >> [ 199.412437] Call Trace: > >> [ 199.412750] pci_disable_msix+0xf3/0x120 > >> [ 199.413227] iavf_reset_interrupt_capability.part.40+0x19/0x40 [iavf] > >> [ 199.413998] iavf_reset_task+0x4b3/0x9d0 [iavf] > >> [ 199.414544] process_one_work+0x20f/0x410 > >> [ 199.415026] worker_thread+0x34/0x400 > >> [ 199.415486] kthread+0x120/0x140 > >> [ 199.415876] ? process_one_work+0x410/0x410 > >> [ 199.416380] ? __kthread_parkme+0x70/0x70 > >> [ 199.416864] ret_from_fork+0x35/0x40 > >> > > I verified MSIx with SRIOV VF, and I don't see this issue at my end. > > >> I fixed it with below patch. > >> > >> > >> commit ad3efa0eeea7edb352294bfce35b904b8d3c759c > >> Author: Yan Zhao <yan.y.zhao@intel.com> > >> Date: Sat Oct 24 19:45:01 2020 +0800 > >> > >> msix fix. > >> > >> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com> > >> > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > >> index f63f15b553..92f71bf933 100644 > >> --- a/hw/vfio/pci.c > >> +++ b/hw/vfio/pci.c > >> @@ -2423,8 +2423,14 @@ const VMStateDescription vmstate_vfio_pci_config = { > >> static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > >> { > >> VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > >> + PCIDevice *pdev = &vdev->pdev; > >> + uint16_t pci_cmd; > >> + > >> + pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); > >> + qemu_put_be16(f, pci_cmd); > >> > >> vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); > >> + > >> } > >> > >> static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > >> @@ -2432,6 +2438,10 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > >> VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > >> PCIDevice *pdev = &vdev->pdev; > >> int ret; > >> + uint16_t pci_cmd; > >> + > >> + pci_cmd = qemu_get_be16(f); > >> + vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2); > >> > >> ret = vmstate_load_state(f, 
&vmstate_vfio_pci_config, vdev, 1); > >> if (ret) { > >> > > > > > > We need to avoid this sort of ad-hoc stuffing random fields into the > > config stream. The command register is already migrated in vconfig, it > > only needs to be written through vfio: > > > > vfio_pci_write_config(pdev, PCI_COMMAND, > > pci_get_word(pdev->config, PCI_COMMAND), 2); > > > > I verified at my end again. > pci command value (using pci_default_read_config()) before > vmstate_save_state() is 0x507 and at destination after > vmstate_load_state() is also 0x507 - with pci_default_read_config() and > the cached config space value using pci_get_word() - both are 0x507. > VM restores successfully. > > Yan, can you share pci command values before and after as above? what > exactly is missing? pci_default_read_config() or pci_get_word() only read from virtual config space, something needs to write it through to the device on the target, much like we're doing to enable msi and msix. Thanks, Alex > >> On Fri, Oct 23, 2020 at 04:10:29PM +0530, Kirti Wankhede wrote: > >>> Added functions to save and restore PCI device specific data, > >>> specifically config space of PCI device. 
> >>> > >>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > >>> Reviewed-by: Neo Jia <cjia@nvidia.com> > >>> --- > >>> hw/vfio/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++ > >>> include/hw/vfio/vfio-common.h | 2 ++ > >>> 2 files changed, 50 insertions(+) > >>> > >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > >>> index bffd5bfe3b78..92cc25a5489f 100644 > >>> --- a/hw/vfio/pci.c > >>> +++ b/hw/vfio/pci.c > >>> @@ -41,6 +41,7 @@ > >>> #include "trace.h" > >>> #include "qapi/error.h" > >>> #include "migration/blocker.h" > >>> +#include "migration/qemu-file.h" > >>> > >>> #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" > >>> > >>> @@ -2401,11 +2402,58 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev) > >>> return OBJECT(vdev); > >>> } > >>> > >>> +static bool vfio_msix_present(void *opaque, int version_id) > >>> +{ > >>> + PCIDevice *pdev = opaque; > >>> + > >>> + return msix_present(pdev); > >>> +} > >>> + > >>> +const VMStateDescription vmstate_vfio_pci_config = { > >>> + .name = "VFIOPCIDevice", > >>> + .version_id = 1, > >>> + .minimum_version_id = 1, > >>> + .fields = (VMStateField[]) { > >>> + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), > >>> + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), > >>> + VMSTATE_END_OF_LIST() > >>> + } > >>> +}; > >>> + > >>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > >>> +{ > >>> + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > >>> + > >>> + vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); > >>> +} > >>> + > >>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > >>> +{ > >>> + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > >>> + PCIDevice *pdev = &vdev->pdev; > >>> + int ret; > >>> + > >>> + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); > >>> + if (ret) { > >>> + return ret; > >>> + } > >>> + > >>> + if (msi_enabled(pdev)) { > >>> + vfio_msi_enable(vdev); > >>> + } 
else if (msix_enabled(pdev)) { > >>> + vfio_msix_enable(vdev); > >>> + } > >>> + > >>> + return ret; > >>> +} > >>> + > >>> static VFIODeviceOps vfio_pci_ops = { > >>> .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, > >>> .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, > >>> .vfio_eoi = vfio_intx_eoi, > >>> .vfio_get_object = vfio_pci_get_object, > >>> + .vfio_save_config = vfio_pci_save_config, > >>> + .vfio_load_config = vfio_pci_load_config, > >>> }; > >>> > >>> int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) > >>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > >>> index fe99c36a693a..ba6169cd926e 100644 > >>> --- a/include/hw/vfio/vfio-common.h > >>> +++ b/include/hw/vfio/vfio-common.h > >>> @@ -120,6 +120,8 @@ struct VFIODeviceOps { > >>> int (*vfio_hot_reset_multi)(VFIODevice *vdev); > >>> void (*vfio_eoi)(VFIODevice *vdev); > >>> Object *(*vfio_get_object)(VFIODevice *vdev); > >>> + void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f); > >>> + int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); > >>> }; > >>> > >>> typedef struct VFIOGroup { > >>> -- > >>> 2.7.0 > >>> > >> > > >
On 10/24/2020 4:56 PM, Yan Zhao wrote: > On Fri, Oct 23, 2020 at 04:10:33PM +0530, Kirti Wankhede wrote: >> Define flags to be used as delimiter in migration stream for VFIO devices. >> Added .save_setup and .save_cleanup functions. Map & unmap migration >> region from these functions at source during saving or pre-copy phase. >> >> Set VFIO device state depending on VM's state. During live migration, VM is >> running when .save_setup is called, _SAVING | _RUNNING state is set for VFIO >> device. During save-restore, VM is paused, _SAVING state is set for VFIO device. >> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> >> Reviewed-by: Neo Jia <cjia@nvidia.com> >> --- >> hw/vfio/migration.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++ >> hw/vfio/trace-events | 2 + >> 2 files changed, 104 insertions(+) >> >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c >> index a0f0e79b9b73..94d2bdae5c54 100644 >> --- a/hw/vfio/migration.c >> +++ b/hw/vfio/migration.c >> @@ -8,12 +8,15 @@ >> */ >> >> #include "qemu/osdep.h" >> +#include "qemu/main-loop.h" >> +#include "qemu/cutils.h" >> #include <linux/vfio.h> >> >> #include "sysemu/runstate.h" >> #include "hw/vfio/vfio-common.h" >> #include "cpu.h" >> #include "migration/migration.h" >> +#include "migration/vmstate.h" >> #include "migration/qemu-file.h" >> #include "migration/register.h" >> #include "migration/blocker.h" >> @@ -25,6 +28,22 @@ >> #include "trace.h" >> #include "hw/hw.h" >> >> +/* >> + * Flags to be used as unique delimiters for VFIO devices in the migration >> + * stream. These flags are composed as: >> + * 0xffffffff => MSB 32-bit all 1s >> + * 0xef10 => Magic ID, represents emulated (virtual) function IO >> + * 0x0000 => 16-bits reserved for flags >> + * >> + * The beginning of state information is marked by _DEV_CONFIG_STATE, >> + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a >> + * certain state information is marked by _END_OF_STATE. 
>> + */ >> +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) >> +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) >> +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) >> +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) >> + >> static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, >> off_t off, bool iswrite) >> { >> @@ -129,6 +148,75 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, >> return 0; >> } >> >> +static void vfio_migration_cleanup(VFIODevice *vbasedev) >> +{ >> + VFIOMigration *migration = vbasedev->migration; >> + >> + if (migration->region.mmaps) { >> + vfio_region_unmap(&migration->region); >> + } >> +} >> + >> +/* ---------------------------------------------------------------------- */ >> + >> +static int vfio_save_setup(QEMUFile *f, void *opaque) >> +{ >> + VFIODevice *vbasedev = opaque; >> + VFIOMigration *migration = vbasedev->migration; >> + int ret; >> + >> + trace_vfio_save_setup(vbasedev->name); >> + >> + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); >> + >> + if (migration->region.mmaps) { >> + /* >> + * Calling vfio_region_mmap() from migration thread. Memory API called >> + * from this function require locking the iothread when called from >> + * outside the main loop thread. >> + */ >> + qemu_mutex_lock_iothread(); >> + ret = vfio_region_mmap(&migration->region); >> + qemu_mutex_unlock_iothread(); >> + if (ret) { >> + error_report("%s: Failed to mmap VFIO migration region: %s", >> + vbasedev->name, strerror(-ret)); >> + error_report("%s: Falling back to slow path", vbasedev->name); >> + } >> + } >> + >> + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK, >> + VFIO_DEVICE_STATE_SAVING); >> + if (ret) { >> + error_report("%s: Failed to set state SAVING", vbasedev->name); >> + return ret; >> + } >> + > > is it possible to call vfio_update_pending() and vfio_save_buffer() here? 
> so that vendor driver has a chance to hook compatibility checking string > early in save_setup stage and can avoid to hook the string in both > precopy iteration stage and stop and copy stage. I would say it's not about which stage; the very first string, irrespective of migration stage, should be the version compatibility check. I don't think that is needed in setup. > > But I think it's ok if we agree to add this later. > > Besides that, > Reviewed-by: Yan Zhao <yan.y.zhao@intel.com> > Thanks. Kirti >> + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); >> + >> + ret = qemu_file_get_error(f); >> + if (ret) { >> + return ret; >> + } >> + >> + return 0; >> +} >> + >> +static void vfio_save_cleanup(void *opaque) >> +{ >> + VFIODevice *vbasedev = opaque; >> + >> + vfio_migration_cleanup(vbasedev); >> + trace_vfio_save_cleanup(vbasedev->name); >> +} >> + >> +static SaveVMHandlers savevm_vfio_handlers = { >> + .save_setup = vfio_save_setup, >> + .save_cleanup = vfio_save_cleanup, >> +}; >> + >> +/* ---------------------------------------------------------------------- */ >> + >> static void vfio_vmstate_change(void *opaque, int running, RunState state) >> { >> VFIODevice *vbasedev = opaque; >> @@ -217,6 +305,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, >> int ret; >> Object *obj; >> VFIOMigration *migration; >> + char id[256] = ""; >> + g_autofree char *path = NULL, *oid = NULL; >> >> if (!vbasedev->ops->vfio_get_object) { >> return -EINVAL; >> @@ -247,6 +337,18 @@ static int vfio_migration_init(VFIODevice *vbasedev, >> } >> >> migration->vbasedev = vbasedev; >> + >> + oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); >> + if (oid) { >> + path = g_strdup_printf("%s/vfio", oid); >> + } else { >> + path = g_strdup("vfio"); >> + } >> + strpadcpy(id, sizeof(id), path, '\0'); >> + >> + register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers, >> + vbasedev); >> + >> migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, >> vbasedev); 
>> migration->migration_state.notify = vfio_migration_state_notifier; >> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >> index 78d7d83b5ef8..f148b5e828c1 100644 >> --- a/hw/vfio/trace-events >> +++ b/hw/vfio/trace-events >> @@ -151,3 +151,5 @@ vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" >> vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" >> vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" >> vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" >> +vfio_save_setup(const char *name) " (%s)" >> +vfio_save_cleanup(const char *name) " (%s)" >> -- >> 2.7.0 >>
On Sat, Oct 24, 2020 at 08:16:30AM -0600, Alex Williamson wrote: > On Sat, 24 Oct 2020 19:53:39 +0800 > Yan Zhao <yan.y.zhao@intel.com> wrote: > > > hi > > when I migrating VFs, the PCI_COMMAND is not properly saved. and the > > target side would meet below bug > > root@tester:~# [ 189.360671] ++++++++++>> reset starts here: iavf_reset_task !!! > > [ 199.360798] iavf 0000:00:04.0: Reset never finished (0) > > [ 199.380504] kernel BUG at drivers/pci/msi.c:352! > > [ 199.382957] invalid opcode: 0000 [#1] SMP PTI > > [ 199.384855] CPU: 1 PID: 419 Comm: kworker/1:2 Tainted: G OE 5.0.0-13-generic #14-Ubuntu > > [ 199.388204] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 > > [ 199.392401] Workqueue: events iavf_reset_task [iavf] > > [ 199.393586] RIP: 0010:free_msi_irqs+0x17b/0x1b0 > > [ 199.394659] Code: 84 e1 fe ff ff 45 31 f6 eb 11 41 83 c6 01 44 39 73 14 0f 86 ce fe ff ff 8b 7b 10 44 01 f7 e8 3c 7a ba ff 48 83 78 70 00 74 e0 <0f> 0b 49 8d b5 b0 00 00 00 e8 07 27 bb ff e9 cf fe ff ff 48 8b 78 > > [ 199.399056] RSP: 0018:ffffabd1006cfdb8 EFLAGS: 00010282 > > [ 199.400302] RAX: ffff9e336d8a2800 RBX: ffff9e3333b006c0 RCX: 0000000000000000 > > [ 199.402000] RDX: 0000000000000000 RSI: 0000000000000019 RDI: ffffffffbaa68100 > > [ 199.403168] RBP: ffffabd1006cfde8 R08: ffff9e3375000248 R09: ffff9e3375000338 > > [ 199.404343] R10: 0000000000000000 R11: ffffffffbaa68108 R12: ffff9e3374ef12c0 > > [ 199.405526] R13: ffff9e3374ef1000 R14: 0000000000000000 R15: ffff9e3371f2d018 > > [ 199.406702] FS: 0000000000000000(0000) GS:ffff9e3375b00000(0000) knlGS:0000000000000000 > > [ 199.408027] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > > [ 199.408987] CR2: 00000000ffffffff CR3: 0000000033266000 CR4: 00000000000006e0 > > [ 199.410155] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > > [ 199.411321] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > > [ 199.412437] Call Trace: 
> > [ 199.412750] pci_disable_msix+0xf3/0x120 > > [ 199.413227] iavf_reset_interrupt_capability.part.40+0x19/0x40 [iavf] > > [ 199.413998] iavf_reset_task+0x4b3/0x9d0 [iavf] > > [ 199.414544] process_one_work+0x20f/0x410 > > [ 199.415026] worker_thread+0x34/0x400 > > [ 199.415486] kthread+0x120/0x140 > > [ 199.415876] ? process_one_work+0x410/0x410 > > [ 199.416380] ? __kthread_parkme+0x70/0x70 > > [ 199.416864] ret_from_fork+0x35/0x40 > > > > I fixed it with below patch. > > > > > > commit ad3efa0eeea7edb352294bfce35b904b8d3c759c > > Author: Yan Zhao <yan.y.zhao@intel.com> > > Date: Sat Oct 24 19:45:01 2020 +0800 > > > > msix fix. > > > > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com> > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > index f63f15b553..92f71bf933 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -2423,8 +2423,14 @@ const VMStateDescription vmstate_vfio_pci_config = { > > static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > > { > > VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > > + PCIDevice *pdev = &vdev->pdev; > > + uint16_t pci_cmd; > > + > > + pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); > > + qemu_put_be16(f, pci_cmd); > > > > vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); > > + > > } > > > > static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > > @@ -2432,6 +2438,10 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > > VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > > PCIDevice *pdev = &vdev->pdev; > > int ret; > > + uint16_t pci_cmd; > > + > > + pci_cmd = qemu_get_be16(f); > > + vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2); > > > > ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); > > if (ret) { > > > > > We need to avoid this sort of ad-hoc stuffing random fields into the > config stream. 
The command register is already migrated in vconfig, it > only needs to be written through vfio: > > vfio_pci_write_config(pdev, PCI_COMMAND, > pci_get_word(pdev->config, PCI_COMMAND), 2); > yes, it should work. previously we just rely on qemu to save and load the common fields. Thanks Yan > > > > On Fri, Oct 23, 2020 at 04:10:29PM +0530, Kirti Wankhede wrote: > > > Added functions to save and restore PCI device specific data, > > > specifically config space of PCI device. > > > > > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > > > Reviewed-by: Neo Jia <cjia@nvidia.com> > > > --- > > > hw/vfio/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++ > > > include/hw/vfio/vfio-common.h | 2 ++ > > > 2 files changed, 50 insertions(+) > > > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > > index bffd5bfe3b78..92cc25a5489f 100644 > > > --- a/hw/vfio/pci.c > > > +++ b/hw/vfio/pci.c > > > @@ -41,6 +41,7 @@ > > > #include "trace.h" > > > #include "qapi/error.h" > > > #include "migration/blocker.h" > > > +#include "migration/qemu-file.h" > > > > > > #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" > > > > > > @@ -2401,11 +2402,58 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev) > > > return OBJECT(vdev); > > > } > > > > > > +static bool vfio_msix_present(void *opaque, int version_id) > > > +{ > > > + PCIDevice *pdev = opaque; > > > + > > > + return msix_present(pdev); > > > +} > > > + > > > +const VMStateDescription vmstate_vfio_pci_config = { > > > + .name = "VFIOPCIDevice", > > > + .version_id = 1, > > > + .minimum_version_id = 1, > > > + .fields = (VMStateField[]) { > > > + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), > > > + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), > > > + VMSTATE_END_OF_LIST() > > > + } > > > +}; > > > + > > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > > > +{ > > > + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > > > + > > > + vmstate_save_state(f, 
&vmstate_vfio_pci_config, vdev, NULL); > > > +} > > > + > > > +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > > > +{ > > > + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > > > + PCIDevice *pdev = &vdev->pdev; > > > + int ret; > > > + > > > + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); > > > + if (ret) { > > > + return ret; > > > + } > > > + > > > + if (msi_enabled(pdev)) { > > > + vfio_msi_enable(vdev); > > > + } else if (msix_enabled(pdev)) { > > > + vfio_msix_enable(vdev); > > > + } > > > + > > > + return ret; > > > +} > > > + > > > static VFIODeviceOps vfio_pci_ops = { > > > .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, > > > .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, > > > .vfio_eoi = vfio_intx_eoi, > > > .vfio_get_object = vfio_pci_get_object, > > > + .vfio_save_config = vfio_pci_save_config, > > > + .vfio_load_config = vfio_pci_load_config, > > > }; > > > > > > int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) > > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > > > index fe99c36a693a..ba6169cd926e 100644 > > > --- a/include/hw/vfio/vfio-common.h > > > +++ b/include/hw/vfio/vfio-common.h > > > @@ -120,6 +120,8 @@ struct VFIODeviceOps { > > > int (*vfio_hot_reset_multi)(VFIODevice *vdev); > > > void (*vfio_eoi)(VFIODevice *vdev); > > > Object *(*vfio_get_object)(VFIODevice *vdev); > > > + void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f); > > > + int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); > > > }; > > > > > > typedef struct VFIOGroup { > > > -- > > > 2.7.0 > > > > > >