diff mbox

[v5,06/10] hw/vfio: create common module

Message ID 1407594349-9291-7-git-send-email-eric.auger@linaro.org
State New
Headers show

Commit Message

Auger Eric Aug. 9, 2014, 2:25 p.m. UTC
A new common module is created. It implements all functions
that have no device specificity (PCI, Platform).

This patch consists only of code movement (no functional changes).

Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
Signed-off-by: Eric Auger <eric.auger@linaro.org>

---

v4 -> v5:
- integrate "sPAPR/IOMMU: Fix TCE entry permission"
- VFIOdevice .name dealloc removed from vfio_put_base_device
- add some includes according to vfio inclusion policy

v3 -> v4:
[Eric Auger]
Move done after all PCI modifications, in anticipation of
VFIO Platform needs. The purpose is to ease the whole
review process.

<= v3
First split done by Kim Phillips
---
 hw/vfio/Makefile.objs         |    1 +
 hw/vfio/common.c              |  990 ++++++++++++++++++++++++++++++++++++++
 hw/vfio/pci.c                 | 1070 +----------------------------------------
 include/hw/vfio/vfio-common.h |  151 ++++++
 4 files changed, 1147 insertions(+), 1065 deletions(-)
 create mode 100644 hw/vfio/common.c
 create mode 100644 include/hw/vfio/vfio-common.h

Comments

Alex Williamson Aug. 11, 2014, 7:20 p.m. UTC | #1
On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
> A new common module is created. It implements all functions
> that have no device specificity (PCI, Platform).
> 
> This patch only consists in move (no functional changes)
> 
> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
> Signed-off-by: Eric Auger <eric.auger@linaro.org>
> 
> ---
> 
> v4 -> v5:
> - integrate "sPAPR/IOMMU: Fix TCE entry permission"
> - VFIOdevice .name dealloc removed from vfio_put_base_device
> - add some includes according to vfio inclusion policy
> 
> v3 -> v4:
> [Eric Auger]
> move done after all PCI modifications to anticipate for
> VFIO Platform needs. Purpose is to alleviate the whole
> review process.
> 
> <= v3
> First split done by Kim Phillips
> ---
>  hw/vfio/Makefile.objs         |    1 +
>  hw/vfio/common.c              |  990 ++++++++++++++++++++++++++++++++++++++
>  hw/vfio/pci.c                 | 1070 +----------------------------------------
>  include/hw/vfio/vfio-common.h |  151 ++++++
>  4 files changed, 1147 insertions(+), 1065 deletions(-)
>  create mode 100644 hw/vfio/common.c
>  create mode 100644 include/hw/vfio/vfio-common.h
> 
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index 31c7dab..e31f30e 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,3 +1,4 @@
>  ifeq ($(CONFIG_LINUX), y)
> +obj-$(CONFIG_SOFTMMU) += common.o
>  obj-$(CONFIG_PCI) += pci.o
>  endif
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> new file mode 100644
> index 0000000..297c508
> --- /dev/null
> +++ b/hw/vfio/common.c
> @@ -0,0 +1,990 @@
> +/*
> + * generic functions used by VFIO devices
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + *  Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + *  Adapted for KVM by Qumranet.
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <linux/vfio.h>
> +
> +#include "hw/vfio/vfio-common.h"
> +#include "hw/vfio/vfio.h"
> +#include "exec/address-spaces.h"
> +#include "exec/memory.h"
> +#include "hw/hw.h"
> +#include "qemu/error-report.h"
> +#include "sysemu/kvm.h"
> +
> +QLIST_HEAD(, VFIOGroup)
> +    group_list = QLIST_HEAD_INITIALIZER(group_list);
> +
> +QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
> +    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
> +
> +#ifdef CONFIG_KVM
> +/*
> + * We have a single VFIO pseudo device per KVM VM.  Once created it lives
> + * for the life of the VM.  Closing the file descriptor only drops our
> + * reference to it and the device's reference to kvm.  Therefore once
> + * initialized, this file descriptor is only released on QEMU exit and
> + * we'll re-use it should another vfio device be attached before then.
> + */
> +static int vfio_kvm_device_fd = -1;
> +#endif
> +
> +/*
> + * Common VFIO interrupt disable
> + */
> +void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
> +        .index = index,
> +        .start = 0,
> +        .count = 0,
> +    };
> +
> +    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +}
> +
> +void vfio_unmask_irqindex(VFIODevice *vbasedev, int index)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
> +        .index = index,
> +        .start = 0,
> +        .count = 1,
> +    };
> +
> +    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +}
> +
> +#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */

Can we remove the ifdef here and in the common header now?  I'm hoping
the compiler won't complain once it's no longer static.

...
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 5f218b7..d2ccb3b 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -39,27 +39,12 @@
>  #include "qemu/range.h"
>  #include "sysemu/kvm.h"
>  #include "sysemu/sysemu.h"
> -#include "hw/vfio/vfio.h"
> +#include "hw/vfio/vfio-common.h"
>  
> -/* #define DEBUG_VFIO */
> -#ifdef DEBUG_VFIO
> -#define DPRINTF(fmt, ...) \
> -    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> -#else
> -#define DPRINTF(fmt, ...) \
> -    do { } while (0)
> -#endif
> -
> -/* Extra debugging, trap acceleration paths for more logging */
> -#define VFIO_ALLOW_MMAP 1
> -#define VFIO_ALLOW_KVM_INTX 1
> -#define VFIO_ALLOW_KVM_MSI 1
> -#define VFIO_ALLOW_KVM_MSIX 1
> -
> -enum {
> -    VFIO_DEVICE_TYPE_PCI = 0,
> -    VFIO_DEVICE_TYPE_PLATFORM = 1,
> -};
> +extern const MemoryRegionOps vfio_region_ops;
> +extern const MemoryListener vfio_memory_listener;
> +extern QLIST_HEAD(, VFIOGroup) group_list;
> +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;

This seems odd, why doesn't the common header provide these for us?  We
should also rename group_list to vfio_group_list to be polite to the
rest of the namespace.  Thanks,

Alex
Alex Williamson Aug. 11, 2014, 7:25 p.m. UTC | #2
On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> new file mode 100644
> index 0000000..4684ee5
> --- /dev/null
> +++ b/include/hw/vfio/vfio-common.h
> @@ -0,0 +1,151 @@
> +/*
> + * common header for vfio based device assignment support
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + *  Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + *  Adapted for KVM by Qumranet.
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +#ifndef HW_VFIO_VFIO_COMMON_H
> +#define HW_VFIO_VFIO_COMMON_H
> +
> +#include "qemu-common.h"
> +#include "exec/address-spaces.h"
> +#include "exec/memory.h"
> +#include "qemu/queue.h"
> +#include "qemu/notify.h"
> +
> +/*#define DEBUG_VFIO*/
> +#ifdef DEBUG_VFIO
> +#define DPRINTF(fmt, ...) \
> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif


DPRINTF also needs to be renamed to avoid namespace conflicts.
Thanks,

Alex
Auger Eric Aug. 12, 2014, 5:57 a.m. UTC | #3
On 08/11/2014 09:20 PM, Alex Williamson wrote:
> On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
>> A new common module is created. It implements all functions
>> that have no device specificity (PCI, Platform).
>>
>> This patch only consists in move (no functional changes)
>>
>> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
>> Signed-off-by: Eric Auger <eric.auger@linaro.org>
>>
>> ---
>>
>> v4 -> v5:
>> - integrate "sPAPR/IOMMU: Fix TCE entry permission"
>> - VFIOdevice .name dealloc removed from vfio_put_base_device
>> - add some includes according to vfio inclusion policy
>>
>> v3 -> v4:
>> [Eric Auger]
>> move done after all PCI modifications to anticipate for
>> VFIO Platform needs. Purpose is to alleviate the whole
>> review process.
>>
>> <= v3
>> First split done by Kim Phillips
>> ---
>>  hw/vfio/Makefile.objs         |    1 +
>>  hw/vfio/common.c              |  990 ++++++++++++++++++++++++++++++++++++++
>>  hw/vfio/pci.c                 | 1070 +----------------------------------------
>>  include/hw/vfio/vfio-common.h |  151 ++++++
>>  4 files changed, 1147 insertions(+), 1065 deletions(-)
>>  create mode 100644 hw/vfio/common.c
>>  create mode 100644 include/hw/vfio/vfio-common.h
>>
>> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
>> index 31c7dab..e31f30e 100644
>> --- a/hw/vfio/Makefile.objs
>> +++ b/hw/vfio/Makefile.objs
>> @@ -1,3 +1,4 @@
>>  ifeq ($(CONFIG_LINUX), y)
>> +obj-$(CONFIG_SOFTMMU) += common.o
>>  obj-$(CONFIG_PCI) += pci.o
>>  endif
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> new file mode 100644
>> index 0000000..297c508
>> --- /dev/null
>> +++ b/hw/vfio/common.c
>> @@ -0,0 +1,990 @@
>> +/*
>> + * generic functions used by VFIO devices
>> + *
>> + * Copyright Red Hat, Inc. 2012
>> + *
>> + * Authors:
>> + *  Alex Williamson <alex.williamson@redhat.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Based on qemu-kvm device-assignment:
>> + *  Adapted for KVM by Qumranet.
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>> + */
>> +
>> +#include <sys/ioctl.h>
>> +#include <sys/mman.h>
>> +#include <linux/vfio.h>
>> +
>> +#include "hw/vfio/vfio-common.h"
>> +#include "hw/vfio/vfio.h"
>> +#include "exec/address-spaces.h"
>> +#include "exec/memory.h"
>> +#include "hw/hw.h"
>> +#include "qemu/error-report.h"
>> +#include "sysemu/kvm.h"
>> +
>> +QLIST_HEAD(, VFIOGroup)
>> +    group_list = QLIST_HEAD_INITIALIZER(group_list);
>> +
>> +QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
>> +    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
>> +
>> +#ifdef CONFIG_KVM
>> +/*
>> + * We have a single VFIO pseudo device per KVM VM.  Once created it lives
>> + * for the life of the VM.  Closing the file descriptor only drops our
>> + * reference to it and the device's reference to kvm.  Therefore once
>> + * initialized, this file descriptor is only released on QEMU exit and
>> + * we'll re-use it should another vfio device be attached before then.
>> + */
>> +static int vfio_kvm_device_fd = -1;
>> +#endif
>> +
>> +/*
>> + * Common VFIO interrupt disable
>> + */
>> +void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
>> +{
>> +    struct vfio_irq_set irq_set = {
>> +        .argsz = sizeof(irq_set),
>> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
>> +        .index = index,
>> +        .start = 0,
>> +        .count = 0,
>> +    };
>> +
>> +    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
>> +}
>> +
>> +void vfio_unmask_irqindex(VFIODevice *vbasedev, int index)
>> +{
>> +    struct vfio_irq_set irq_set = {
>> +        .argsz = sizeof(irq_set),
>> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
>> +        .index = index,
>> +        .start = 0,
>> +        .count = 1,
>> +    };
>> +
>> +    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
>> +}
>> +
>> +#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
> 
> Can we remove the ifdef here and in the common header now?  I'm hoping
> the compiler won't complain once it's no longer static.
OK
> 
> ...
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 5f218b7..d2ccb3b 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -39,27 +39,12 @@
>>  #include "qemu/range.h"
>>  #include "sysemu/kvm.h"
>>  #include "sysemu/sysemu.h"
>> -#include "hw/vfio/vfio.h"
>> +#include "hw/vfio/vfio-common.h"
>>  
>> -/* #define DEBUG_VFIO */
>> -#ifdef DEBUG_VFIO
>> -#define DPRINTF(fmt, ...) \
>> -    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
>> -#else
>> -#define DPRINTF(fmt, ...) \
>> -    do { } while (0)
>> -#endif
>> -
>> -/* Extra debugging, trap acceleration paths for more logging */
>> -#define VFIO_ALLOW_MMAP 1
>> -#define VFIO_ALLOW_KVM_INTX 1
>> -#define VFIO_ALLOW_KVM_MSI 1
>> -#define VFIO_ALLOW_KVM_MSIX 1
>> -
>> -enum {
>> -    VFIO_DEVICE_TYPE_PCI = 0,
>> -    VFIO_DEVICE_TYPE_PLATFORM = 1,
>> -};
>> +extern const MemoryRegionOps vfio_region_ops;
>> +extern const MemoryListener vfio_memory_listener;
>> +extern QLIST_HEAD(, VFIOGroup) group_list;
>> +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
> 
> This seems odd, why doesn't the common header provide these for us?  We
> should also rename group_list to vfio_group_list to be polite to the
> rest of the namespace.  Thanks,

OK will rework that

Thanks

Eric
> 
> Alex
>
Auger Eric Aug. 12, 2014, 6:09 a.m. UTC | #4
On 08/11/2014 09:25 PM, Alex Williamson wrote:
> On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> new file mode 100644
>> index 0000000..4684ee5
>> --- /dev/null
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -0,0 +1,151 @@
>> +/*
>> + * common header for vfio based device assignment support
>> + *
>> + * Copyright Red Hat, Inc. 2012
>> + *
>> + * Authors:
>> + *  Alex Williamson <alex.williamson@redhat.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Based on qemu-kvm device-assignment:
>> + *  Adapted for KVM by Qumranet.
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>> + */
>> +#ifndef HW_VFIO_VFIO_COMMON_H
>> +#define HW_VFIO_VFIO_COMMON_H
>> +
>> +#include "qemu-common.h"
>> +#include "exec/address-spaces.h"
>> +#include "exec/memory.h"
>> +#include "qemu/queue.h"
>> +#include "qemu/notify.h"
>> +
>> +/*#define DEBUG_VFIO*/
>> +#ifdef DEBUG_VFIO
>> +#define DPRINTF(fmt, ...) \
>> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
>> +#else
>> +#define DPRINTF(fmt, ...) \
>> +    do { } while (0)
>> +#endif
> 
> 
> DPRINTF also need to be renamed to avoid conflicting namespace issues.
Hi Alex,

OK.

As I am going to touch the traces,
- are you OK if I use the new .name field to simplify format strings?

    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
            vdev->host.bus, vdev->host.slot, vdev->host.function,
            'A' + vdev->intx.pin);
- Also Alex was suggesting to use trace points. What is your position
about that? Also, I am not 100% sure what that consists of: is it trace
events as documented in docs/tracing.txt?

Thanks

Eric



> Thanks,
> 
> Alex
>
Alex Williamson Aug. 13, 2014, 7:59 p.m. UTC | #5
On Tue, 2014-08-12 at 08:09 +0200, Eric Auger wrote:
> On 08/11/2014 09:25 PM, Alex Williamson wrote:
> > On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >> new file mode 100644
> >> index 0000000..4684ee5
> >> --- /dev/null
> >> +++ b/include/hw/vfio/vfio-common.h
> >> @@ -0,0 +1,151 @@
> >> +/*
> >> + * common header for vfio based device assignment support
> >> + *
> >> + * Copyright Red Hat, Inc. 2012
> >> + *
> >> + * Authors:
> >> + *  Alex Williamson <alex.williamson@redhat.com>
> >> + *
> >> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> >> + * the COPYING file in the top-level directory.
> >> + *
> >> + * Based on qemu-kvm device-assignment:
> >> + *  Adapted for KVM by Qumranet.
> >> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> >> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> >> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> >> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> >> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> >> + */
> >> +#ifndef HW_VFIO_VFIO_COMMON_H
> >> +#define HW_VFIO_VFIO_COMMON_H
> >> +
> >> +#include "qemu-common.h"
> >> +#include "exec/address-spaces.h"
> >> +#include "exec/memory.h"
> >> +#include "qemu/queue.h"
> >> +#include "qemu/notify.h"
> >> +
> >> +/*#define DEBUG_VFIO*/
> >> +#ifdef DEBUG_VFIO
> >> +#define DPRINTF(fmt, ...) \
> >> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> >> +#else
> >> +#define DPRINTF(fmt, ...) \
> >> +    do { } while (0)
> >> +#endif
> > 
> > 
> > DPRINTF also need to be renamed to avoid conflicting namespace issues.
> Ji Alex,
> 
> OK.
> 
> As I am going to touch at traces,
> - are you OK if I use the new .name field to simply format strings?

Sure, that's fine.

>     DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
>             vdev->host.bus, vdev->host.slot, vdev->host.function,
>             'A' + vdev->intx.pin);
> - Also Alex was suggesting to use trace points. What is your position
> about that? Also I am not 100% sure of what it consists in? is it trace
> events as documented in docs/tracing.txt

I think it would be a great conversion, but it's not required.  Thanks,

Alex
Joel Schopp Aug. 20, 2014, 7:12 p.m. UTC | #6
> +int vfio_get_device(VFIOGroup *group, const char *name,
> +                       VFIODevice *vbasedev)
> +{
> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> +    int ret;
> +
> +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> +    if (ret < 0) {
Should be:
if(ret) {
instead of:
if (ret < 0) {

The ioctl can, and sometimes does, return positive values in case of
errors.  This should also be fixed in vfio_container_do_ioctl()
Alex Williamson Aug. 20, 2014, 7:41 p.m. UTC | #7
On Wed, 2014-08-20 at 14:12 -0500, Joel Schopp wrote:
> > +int vfio_get_device(VFIOGroup *group, const char *name,
> > +                       VFIODevice *vbasedev)
> > +{
> > +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> > +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> > +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> > +    int ret;
> > +
> > +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> > +    if (ret < 0) {
> Should be:
> if(ret) {
> instead of:
> if (ret < 0) {
> 
> The ioctl can, and sometimes does, return positive values in case of
> errors.  This should also be fixed in vfio_container_do_ioctl()

This particular ioctl usually does return a positive value, the file
descriptor for the device, so I think it's correct as written.
Thanks,

Alex
Joel Schopp Aug. 20, 2014, 8:08 p.m. UTC | #8
On 08/20/2014 02:41 PM, Alex Williamson wrote:
> On Wed, 2014-08-20 at 14:12 -0500, Joel Schopp wrote:
>>> +int vfio_get_device(VFIOGroup *group, const char *name,
>>> +                       VFIODevice *vbasedev)
>>> +{
>>> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
>>> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
>>> +    int ret;
>>> +
>>> +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
>>> +    if (ret < 0) {
>> Should be:
>> if(ret) {
>> instead of:
>> if (ret < 0) {
>>
>> The ioctl can, and sometimes does, return positive values in case of
>> errors.  This should also be fixed in vfio_container_do_ioctl()
> This particular ioctl usually does return a positive value, the file
> descriptor for the the device, so I think it's correct as written.
> Thanks,
Thanks for the catch, I stand corrected.  The kernel I am running
against contains corresponding patches that are spitting out an
erroneous pr_err() on if(ret).  In retrospect it looks like the kernel
patches and not the qemu patches are in the wrong.
Auger Eric Sept. 1, 2014, 4:31 p.m. UTC | #9
On 08/13/2014 09:59 PM, Alex Williamson wrote:
> On Tue, 2014-08-12 at 08:09 +0200, Eric Auger wrote:
>> On 08/11/2014 09:25 PM, Alex Williamson wrote:
>>> On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>> new file mode 100644
>>>> index 0000000..4684ee5
>>>> --- /dev/null
>>>> +++ b/include/hw/vfio/vfio-common.h
>>>> @@ -0,0 +1,151 @@
>>>> +/*
>>>> + * common header for vfio based device assignment support
>>>> + *
>>>> + * Copyright Red Hat, Inc. 2012
>>>> + *
>>>> + * Authors:
>>>> + *  Alex Williamson <alex.williamson@redhat.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>>>> + * the COPYING file in the top-level directory.
>>>> + *
>>>> + * Based on qemu-kvm device-assignment:
>>>> + *  Adapted for KVM by Qumranet.
>>>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>>>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>>>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>>>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>>>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>>>> + */
>>>> +#ifndef HW_VFIO_VFIO_COMMON_H
>>>> +#define HW_VFIO_VFIO_COMMON_H
>>>> +
>>>> +#include "qemu-common.h"
>>>> +#include "exec/address-spaces.h"
>>>> +#include "exec/memory.h"
>>>> +#include "qemu/queue.h"
>>>> +#include "qemu/notify.h"
>>>> +
>>>> +/*#define DEBUG_VFIO*/
>>>> +#ifdef DEBUG_VFIO
>>>> +#define DPRINTF(fmt, ...) \
>>>> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
>>>> +#else
>>>> +#define DPRINTF(fmt, ...) \
>>>> +    do { } while (0)
>>>> +#endif
>>>
>>>
>>> DPRINTF also need to be renamed to avoid conflicting namespace issues.
>> Ji Alex,
>>
>> OK.
>>
>> As I am going to touch at traces,
>> - are you OK if I use the new .name field to simply format strings?
> 
> Sure, that's fine.
> 
>>     DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
>>             vdev->host.bus, vdev->host.slot, vdev->host.function,
>>             'A' + vdev->intx.pin);
>> - Also Alex was suggesting to use trace points. What is your position
>> about that? Also I am not 100% sure of what it consists in? is it trace
>> events as documented in docs/tracing.txt
> 
> I think it would be a great conversion, but it's not required.  Thanks,

Hi Alex,

I am currently progressing on the conversion to trace points (I did it
for platform and common and now do the job for PCI). I wonder whether it
makes sense to convert all DPRINTFs into trace-points or only convert a
subset (state transitions, ...). Would you accept a mixture of DPRINTFs
and trace-points or do you advise to convert everything?

Also the tracing.txt doc says we should use the name of the function as
prefix. That being said it could be interesting to trace all pci* or all
platform* and wildcard seems to work fine to select the trace-events. So
my second question is would you accept using pci_<function_name>_* as a
generic pattern.

Thanks in advance

Best Regards

Eric
> 
> Alex
>
Alexander Graf Sept. 1, 2014, 5:41 p.m. UTC | #10
> Am 01.09.2014 um 18:31 schrieb Eric Auger <eric.auger@linaro.org>:
> 
>> On 08/13/2014 09:59 PM, Alex Williamson wrote:
>>> On Tue, 2014-08-12 at 08:09 +0200, Eric Auger wrote:
>>>> On 08/11/2014 09:25 PM, Alex Williamson wrote:
>>>>> On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
>>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>>> new file mode 100644
>>>>> index 0000000..4684ee5
>>>>> --- /dev/null
>>>>> +++ b/include/hw/vfio/vfio-common.h
>>>>> @@ -0,0 +1,151 @@
>>>>> +/*
>>>>> + * common header for vfio based device assignment support
>>>>> + *
>>>>> + * Copyright Red Hat, Inc. 2012
>>>>> + *
>>>>> + * Authors:
>>>>> + *  Alex Williamson <alex.williamson@redhat.com>
>>>>> + *
>>>>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>>>>> + * the COPYING file in the top-level directory.
>>>>> + *
>>>>> + * Based on qemu-kvm device-assignment:
>>>>> + *  Adapted for KVM by Qumranet.
>>>>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>>>>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>>>>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>>>>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>>>>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>>>>> + */
>>>>> +#ifndef HW_VFIO_VFIO_COMMON_H
>>>>> +#define HW_VFIO_VFIO_COMMON_H
>>>>> +
>>>>> +#include "qemu-common.h"
>>>>> +#include "exec/address-spaces.h"
>>>>> +#include "exec/memory.h"
>>>>> +#include "qemu/queue.h"
>>>>> +#include "qemu/notify.h"
>>>>> +
>>>>> +/*#define DEBUG_VFIO*/
>>>>> +#ifdef DEBUG_VFIO
>>>>> +#define DPRINTF(fmt, ...) \
>>>>> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
>>>>> +#else
>>>>> +#define DPRINTF(fmt, ...) \
>>>>> +    do { } while (0)
>>>>> +#endif
>>>> 
>>>> 
>>>> DPRINTF also need to be renamed to avoid conflicting namespace issues.
>>> Ji Alex,
>>> 
>>> OK.
>>> 
>>> As I am going to touch at traces,
>>> - are you OK if I use the new .name field to simply format strings?
>> 
>> Sure, that's fine.
>> 
>>>    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
>>>            vdev->host.bus, vdev->host.slot, vdev->host.function,
>>>            'A' + vdev->intx.pin);
>>> - Also Alex was suggesting to use trace points. What is your position
>>> about that? Also I am not 100% sure of what it consists in? is it trace
>>> events as documented in docs/tracing.txt
>> 
>> I think it would be a great conversion, but it's not required.  Thanks,
> 
> Hi Alex,
> 
> I am currently progressing on the conversion to trace points (I did it
> for platform and common and now do the job for PCI). I wonder whether it
> makes sense I convert all DPRINTF into trace-points or only convert a
> subset (state transitions, ...). Would you accept a mixture of DPRINTFs
> and trace-points or do you advise to convert everything?

Yeah, it's perfectly good to even just not introduce new dprintfs.

> 
> Also the tracing.txt doc says we should use the name of the function as
> prefix. That being said it could be interesting to trace all pci* or all
> platform* and wildcard seems to work fine to select the trace-events. So
> my second question is would you accept using pci_<function_name>_* as a
> generic pattern.

Not sure - maybe be more explicit and call it vfio_pci_...?


Alex

> 
> Thanks in advance
> 
> Best Regards
> 
> Eric
>> 
>> Alex
>
Auger Eric Sept. 2, 2014, 7:13 a.m. UTC | #11
On 09/01/2014 07:41 PM, Alexander Graf wrote:
> 
> 
>> Am 01.09.2014 um 18:31 schrieb Eric Auger <eric.auger@linaro.org>:
>>
>>> On 08/13/2014 09:59 PM, Alex Williamson wrote:
>>>> On Tue, 2014-08-12 at 08:09 +0200, Eric Auger wrote:
>>>>> On 08/11/2014 09:25 PM, Alex Williamson wrote:
>>>>>> On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
>>>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>>>> new file mode 100644
>>>>>> index 0000000..4684ee5
>>>>>> --- /dev/null
>>>>>> +++ b/include/hw/vfio/vfio-common.h
>>>>>> @@ -0,0 +1,151 @@
>>>>>> +/*
>>>>>> + * common header for vfio based device assignment support
>>>>>> + *
>>>>>> + * Copyright Red Hat, Inc. 2012
>>>>>> + *
>>>>>> + * Authors:
>>>>>> + *  Alex Williamson <alex.williamson@redhat.com>
>>>>>> + *
>>>>>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>>>>>> + * the COPYING file in the top-level directory.
>>>>>> + *
>>>>>> + * Based on qemu-kvm device-assignment:
>>>>>> + *  Adapted for KVM by Qumranet.
>>>>>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>>>>>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>>>>>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>>>>>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>>>>>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
>>>>>> + */
>>>>>> +#ifndef HW_VFIO_VFIO_COMMON_H
>>>>>> +#define HW_VFIO_VFIO_COMMON_H
>>>>>> +
>>>>>> +#include "qemu-common.h"
>>>>>> +#include "exec/address-spaces.h"
>>>>>> +#include "exec/memory.h"
>>>>>> +#include "qemu/queue.h"
>>>>>> +#include "qemu/notify.h"
>>>>>> +
>>>>>> +/*#define DEBUG_VFIO*/
>>>>>> +#ifdef DEBUG_VFIO
>>>>>> +#define DPRINTF(fmt, ...) \
>>>>>> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
>>>>>> +#else
>>>>>> +#define DPRINTF(fmt, ...) \
>>>>>> +    do { } while (0)
>>>>>> +#endif
>>>>>
>>>>>
>>>>> DPRINTF also need to be renamed to avoid conflicting namespace issues.
>>>> Ji Alex,
>>>>
>>>> OK.
>>>>
>>>> As I am going to touch at traces,
>>>> - are you OK if I use the new .name field to simply format strings?
>>>
>>> Sure, that's fine.
>>>
>>>>    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
>>>>            vdev->host.bus, vdev->host.slot, vdev->host.function,
>>>>            'A' + vdev->intx.pin);
>>>> - Also Alex was suggesting to use trace points. What is your position
>>>> about that? Also I am not 100% sure of what it consists in? is it trace
>>>> events as documented in docs/tracing.txt
>>>
>>> I think it would be a great conversion, but it's not required.  Thanks,
>>
>> Hi Alex,
>>
>> I am currently progressing on the conversion to trace points (I did it
>> for platform and common and now do the job for PCI). I wonder whether it
>> makes sense I convert all DPRINTF into trace-points or only convert a
>> subset (state transitions, ...). Would you accept a mixture of DPRINTFs
>> and trace-points or do you advise to convert everything?
> 
> Yeah, it's perfectly good to even just nit introduce new dprintfs.
ok thanks
> 
>>
>> Also the tracing.txt doc says we should use the name of the function as
>> prefix. That being said it could be interesting to trace all pci* or all
>> platform* and wildcard seems to work fine to select the trace-events. So
>> my second question is would you accept using pci_<function_name>_* as a
>> generic pattern.
> 
> Not sure - maybe be more explicit and call it vfio_pci_...?
Well, maybe as a first draft I will follow the tracing.txt guideline and
you will tell me, both Alexes, what you think of the outcome. Anyway, it
is not a big deal to change afterwards ...

Thanks

Eric
> 
> 
> Alex
> 
>>
>> Thanks in advance
>>
>> Best Regards
>>
>> Eric
>>>
>>> Alex
>>
Alex Williamson Sept. 2, 2014, 9:13 p.m. UTC | #12
On Tue, 2014-09-02 at 09:13 +0200, Eric Auger wrote:
> On 09/01/2014 07:41 PM, Alexander Graf wrote:
> > 
> > 
> >> Am 01.09.2014 um 18:31 schrieb Eric Auger <eric.auger@linaro.org>:
> >>
> >>> On 08/13/2014 09:59 PM, Alex Williamson wrote:
> >>>> On Tue, 2014-08-12 at 08:09 +0200, Eric Auger wrote:
> >>>>> On 08/11/2014 09:25 PM, Alex Williamson wrote:
> >>>>>> On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
> >>>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >>>>>> new file mode 100644
> >>>>>> index 0000000..4684ee5
> >>>>>> --- /dev/null
> >>>>>> +++ b/include/hw/vfio/vfio-common.h
> >>>>>> @@ -0,0 +1,151 @@
> >>>>>> +/*
> >>>>>> + * common header for vfio based device assignment support
> >>>>>> + *
> >>>>>> + * Copyright Red Hat, Inc. 2012
> >>>>>> + *
> >>>>>> + * Authors:
> >>>>>> + *  Alex Williamson <alex.williamson@redhat.com>
> >>>>>> + *
> >>>>>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> >>>>>> + * the COPYING file in the top-level directory.
> >>>>>> + *
> >>>>>> + * Based on qemu-kvm device-assignment:
> >>>>>> + *  Adapted for KVM by Qumranet.
> >>>>>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> >>>>>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> >>>>>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> >>>>>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> >>>>>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> >>>>>> + */
> >>>>>> +#ifndef HW_VFIO_VFIO_COMMON_H
> >>>>>> +#define HW_VFIO_VFIO_COMMON_H
> >>>>>> +
> >>>>>> +#include "qemu-common.h"
> >>>>>> +#include "exec/address-spaces.h"
> >>>>>> +#include "exec/memory.h"
> >>>>>> +#include "qemu/queue.h"
> >>>>>> +#include "qemu/notify.h"
> >>>>>> +
> >>>>>> +/*#define DEBUG_VFIO*/
> >>>>>> +#ifdef DEBUG_VFIO
> >>>>>> +#define DPRINTF(fmt, ...) \
> >>>>>> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> >>>>>> +#else
> >>>>>> +#define DPRINTF(fmt, ...) \
> >>>>>> +    do { } while (0)
> >>>>>> +#endif
> >>>>>
> >>>>>
> >>>>> DPRINTF also needs to be renamed to avoid namespace conflicts.
> >>>> Hi Alex,
> >>>>
> >>>> OK.
> >>>>
> >>>> As I am going to touch the traces,
> >>>> - are you OK if I use the new .name field to simplify the format strings?
> >>>
> >>> Sure, that's fine.
> >>>
> >>>>    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
> >>>>            vdev->host.bus, vdev->host.slot, vdev->host.function,
> >>>>            'A' + vdev->intx.pin);
> >>>> - Also, Alex was suggesting using trace points. What is your position
> >>>> on that? I am also not 100% sure what that consists of: is it the trace
> >>>> events documented in docs/tracing.txt?
> >>>
> >>> I think it would be a great conversion, but it's not required.  Thanks,
> >>
> >> Hi Alex,
> >>
> >> I am currently working on the conversion to trace points (I have done it
> >> for platform and common, and am now doing it for PCI). I wonder whether it
> >> makes sense for me to convert all DPRINTFs into trace points or only a
> >> subset (state transitions, ...). Would you accept a mixture of DPRINTFs
> >> and trace points, or do you advise converting everything?
> > 
> > Yeah, it's perfectly good to even just not introduce new dprintfs.
> ok thanks
> > 
> >>
> >> Also, the tracing.txt doc says we should use the name of the function as
> >> a prefix. That being said, it could be interesting to trace all pci* or all
> >> platform*, and wildcards seem to work fine to select the trace events. So
> >> my second question is: would you accept using pci_<function_name>_* as a
> >> generic pattern?
> > 
> > Not sure - maybe be more explicit and call it vfio_pci_...?
> well. maybe as a first draft I will follow the tracing.txt guideline and
> you will tell me, both Alex's, what you think of the outcome. Anyway it
> is not a big deal then to change ...

I haven't touched tracing yet, so I'll defer to you and agraf for now ;)
Thanks,

Alex
diff mbox

Patch

diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index 31c7dab..e31f30e 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -1,3 +1,4 @@ 
 ifeq ($(CONFIG_LINUX), y)
+obj-$(CONFIG_SOFTMMU) += common.o
 obj-$(CONFIG_PCI) += pci.o
 endif
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
new file mode 100644
index 0000000..297c508
--- /dev/null
+++ b/hw/vfio/common.c
@@ -0,0 +1,990 @@ 
+/*
+ * generic functions used by VFIO devices
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <linux/vfio.h>
+
+#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "hw/hw.h"
+#include "qemu/error-report.h"
+#include "sysemu/kvm.h"
+
+QLIST_HEAD(, VFIOGroup)
+    group_list = QLIST_HEAD_INITIALIZER(group_list);
+
+QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
+    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
+
+#ifdef CONFIG_KVM
+/*
+ * We have a single VFIO pseudo device per KVM VM.  Once created it lives
+ * for the life of the VM.  Closing the file descriptor only drops our
+ * reference to it and the device's reference to kvm.  Therefore once
+ * initialized, this file descriptor is only released on QEMU exit and
+ * we'll re-use it should another vfio device be attached before then.
+ */
+static int vfio_kvm_device_fd = -1;
+#endif
+
+/*
+ * Common VFIO interrupt disable
+ */
+void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+        .index = index,
+        .start = 0,
+        .count = 0,
+    };
+
+    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+
+void vfio_unmask_irqindex(VFIODevice *vbasedev, int index)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+        .index = index,
+        .start = 0,
+        .count = 1,
+    };
+
+    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+
+#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
+void vfio_mask_irqindex(VFIODevice *vbasedev, int index)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+        .index = index,
+        .start = 0,
+        .count = 1,
+    };
+
+    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+#endif
+
+/*
+ * IO Port/MMIO - Beware of the endians, VFIO is always little endian
+ */
+void vfio_region_write(void *opaque, hwaddr addr,
+                       uint64_t data, unsigned size)
+{
+    VFIORegion *region = opaque;
+    VFIODevice *vbasedev = region->vbasedev;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+
+    switch (size) {
+    case 1:
+        buf.byte = data;
+        break;
+    case 2:
+        buf.word = data;
+        break;
+    case 4:
+        buf.dword = data;
+        break;
+    default:
+        hw_error("vfio: unsupported write size, %d bytes", size);
+        break;
+    }
+
+    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
+                     ",%d) failed: %m",
+                     __func__, vbasedev->name, region->nr,
+                     addr, data, size);
+    }
+
+    DPRINTF("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
+            ", %d)\n", __func__, vbasedev->name,
+            region->nr, addr, data, size);
+
+    /*
+     * A read or write to a BAR always signals an INTx EOI.  This will
+     * do nothing if not pending (including not in INTx mode).  We assume
+     * that a BAR access is in response to an interrupt and that BAR
+     * accesses will service the interrupt.  Unfortunately, we don't know
+     * which access will service the interrupt, so we're potentially
+     * getting quite a few host interrupts per guest interrupt.
+     */
+    vbasedev->ops->vfio_eoi(vbasedev);
+}
+
+uint64_t vfio_region_read(void *opaque,
+                          hwaddr addr, unsigned size)
+{
+    VFIORegion *region = opaque;
+    VFIODevice *vbasedev = region->vbasedev;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+    uint64_t data = 0;
+
+    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
+                     __func__, vbasedev->name, region->nr,
+                     addr, size);
+        return (uint64_t)-1;
+    }
+
+    switch (size) {
+    case 1:
+        data = buf.byte;
+        break;
+    case 2:
+        data = buf.word;
+        break;
+    case 4:
+        data = buf.dword;
+        break;
+    default:
+        hw_error("vfio: unsupported read size, %d bytes", size);
+        break;
+    }
+
+    DPRINTF("%s(%s:region%d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
+            __func__, vbasedev->name,
+            region->nr, addr, size, data);
+
+    /* Same as write above */
+    vbasedev->ops->vfio_eoi(vbasedev);
+
+    return data;
+}
+
+const MemoryRegionOps vfio_region_ops = {
+    .read = vfio_region_read,
+    .write = vfio_region_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+/*
+ * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
+ */
+static int vfio_dma_unmap(VFIOContainer *container,
+                          hwaddr iova, ram_addr_t size)
+{
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = iova,
+        .size = size,
+    };
+
+    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
+        return -errno;
+    }
+
+    return 0;
+}
+
+static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
+                        ram_addr_t size, void *vaddr, bool readonly)
+{
+    struct vfio_iommu_type1_dma_map map = {
+        .argsz = sizeof(map),
+        .flags = VFIO_DMA_MAP_FLAG_READ,
+        .vaddr = (__u64)(uintptr_t)vaddr,
+        .iova = iova,
+        .size = size,
+    };
+
+    if (!readonly) {
+        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
+    }
+
+    /*
+     * Try the mapping, if it fails with EBUSY, unmap the region and try
+     * again.  This shouldn't be necessary, but we sometimes see it in
+     * the VGA ROM space.
+     */
+    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
+        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
+         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
+        return 0;
+    }
+
+    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
+    return -errno;
+}
+
+static bool vfio_listener_skipped_section(MemoryRegionSection *section)
+{
+    return (!memory_region_is_ram(section->mr) &&
+            !memory_region_is_iommu(section->mr)) ||
+           /*
+            * Sizing an enabled 64-bit BAR can cause spurious mappings to
+            * addresses in the upper part of the 64-bit address space.  These
+            * are never accessed by the CPU and beyond the address width of
+            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
+            */
+           section->offset_within_address_space & (1ULL << 63);
+}
+
+static void vfio_iommu_map_notify(Notifier *n, void *data)
+{
+    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+    VFIOContainer *container = giommu->container;
+    IOMMUTLBEntry *iotlb = data;
+    MemoryRegion *mr;
+    hwaddr xlat;
+    hwaddr len = iotlb->addr_mask + 1;
+    void *vaddr;
+    int ret;
+
+    DPRINTF("iommu map @ %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
+            iotlb->iova, iotlb->iova + iotlb->addr_mask);
+
+    /*
+     * The IOMMU TLB entry we have just covers translation through
+     * this IOMMU to its immediate target.  We need to translate
+     * it the rest of the way through to memory.
+     */
+    mr = address_space_translate(&address_space_memory,
+                                 iotlb->translated_addr,
+                                 &xlat, &len, iotlb->perm & IOMMU_WO);
+    if (!memory_region_is_ram(mr)) {
+        DPRINTF("iommu map to non memory area %"HWADDR_PRIx"\n",
+                xlat);
+        return;
+    }
+    /*
+     * Translation truncates length to the IOMMU page size,
+     * check that it did not truncate too much.
+     */
+    if (len & iotlb->addr_mask) {
+        DPRINTF("iommu has granularity incompatible with target AS\n");
+        return;
+    }
+
+    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
+        vaddr = memory_region_get_ram_ptr(mr) + xlat;
+
+        ret = vfio_dma_map(container, iotlb->iova,
+                           iotlb->addr_mask + 1, vaddr,
+                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
+        if (ret) {
+            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
+                         container, iotlb->iova,
+                         iotlb->addr_mask + 1, vaddr, ret);
+        }
+    } else {
+        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
+        if (ret) {
+            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+                         "0x%"HWADDR_PRIx") = %d (%m)",
+                         container, iotlb->iova,
+                         iotlb->addr_mask + 1, ret);
+        }
+    }
+}
+
+static void vfio_listener_region_add(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.type1.listener);
+    hwaddr iova, end;
+    Int128 llend;
+    void *vaddr;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
+                section->offset_within_address_space,
+                section->offset_within_address_space +
+                int128_get64(int128_sub(section->size, int128_one())));
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
+
+    if (int128_ge(int128_make64(iova), llend)) {
+        return;
+    }
+
+    memory_region_ref(section->mr);
+
+    if (memory_region_is_iommu(section->mr)) {
+        VFIOGuestIOMMU *giommu;
+
+        DPRINTF("region_add [iommu] %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
+                iova, int128_get64(int128_sub(llend, int128_one())));
+        /*
+         * FIXME: We should do some checking to see if the
+         * capabilities of the host VFIO IOMMU are adequate to model
+         * the guest IOMMU
+         *
+         * FIXME: For VFIO iommu types which have KVM acceleration to
+         * avoid bouncing all map/unmaps through qemu this way, this
+         * would be the right place to wire that up (tell the KVM
+         * device emulation the VFIO iommu handles to use).
+         */
+        /*
+         * This assumes that the guest IOMMU is empty of
+         * mappings at this point.
+         *
+         * One way of doing this is:
+         * 1. Avoid sharing IOMMUs between emulated devices or different
+         * IOMMU groups.
+         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
+         * there are some mappings in IOMMU.
+         *
+         * VFIO on SPAPR does that. Other IOMMU models may do that differently,
+         * they must make sure there are no existing mappings or
+         * loop through existing mappings to map them into VFIO.
+         */
+        giommu = g_malloc0(sizeof(*giommu));
+        giommu->iommu = section->mr;
+        giommu->container = container;
+        giommu->n.notify = vfio_iommu_map_notify;
+        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
+        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
+
+        return;
+    }
+
+    /* Here we assume that memory_region_is_ram(section->mr)==true */
+
+    end = int128_get64(llend);
+    vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+
+    DPRINTF("region_add [ram] %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
+            iova, end - 1, vaddr);
+
+    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
+    if (ret) {
+        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
+                     container, iova, end - iova, vaddr, ret);
+
+        /*
+         * On the initfn path, store the first error in the container so we
+         * can gracefully fail.  Runtime, there's not much we can do other
+         * than throw a hardware error.
+         */
+        if (!container->iommu_data.type1.initialized) {
+            if (!container->iommu_data.type1.error) {
+                container->iommu_data.type1.error = ret;
+            }
+        } else {
+            hw_error("vfio: DMA mapping failed, unable to continue");
+        }
+    }
+}
+
+static void vfio_listener_region_del(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.type1.listener);
+    hwaddr iova, end;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
+                section->offset_within_address_space,
+                section->offset_within_address_space +
+                int128_get64(int128_sub(section->size, int128_one())));
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region", __func__);
+        return;
+    }
+
+    if (memory_region_is_iommu(section->mr)) {
+        VFIOGuestIOMMU *giommu;
+
+        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+            if (giommu->iommu == section->mr) {
+                memory_region_unregister_iommu_notifier(&giommu->n);
+                QLIST_REMOVE(giommu, giommu_next);
+                g_free(giommu);
+                break;
+            }
+        }
+
+        /*
+         * FIXME: We assume the one big unmap below is adequate to
+         * remove any individual page mappings in the IOMMU which
+         * might have been copied into VFIO. This works for a page table
+         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
+         * That may not be true for all IOMMU types.
+         */
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + int128_get64(section->size)) &
+          TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
+            iova, end - 1);
+
+    ret = vfio_dma_unmap(container, iova, end - iova);
+    memory_region_unref(section->mr);
+    if (ret) {
+        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+                     "0x%"HWADDR_PRIx") = %d (%m)",
+                     container, iova, end - iova, ret);
+    }
+}
+
+const MemoryListener vfio_memory_listener = {
+    .region_add = vfio_listener_region_add,
+    .region_del = vfio_listener_region_del,
+};
+
+void vfio_listener_release(VFIOContainer *container)
+{
+    memory_listener_unregister(&container->iommu_data.type1.listener);
+}
+
+int vfio_mmap_region(Object *obj, VFIORegion *region,
+                     MemoryRegion *mem, MemoryRegion *submem,
+                     void **map, size_t size, off_t offset,
+                     const char *name)
+{
+    int ret = 0;
+
+    if (VFIO_ALLOW_MMAP && size && region->flags &
+        VFIO_REGION_INFO_FLAG_MMAP) {
+        int prot = 0;
+
+        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
+            prot |= PROT_READ;
+        }
+
+        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
+            prot |= PROT_WRITE;
+        }
+
+        *map = mmap(NULL, size, prot, MAP_SHARED,
+                    region->vbasedev->fd,
+                    region->fd_offset + offset);
+        if (*map == MAP_FAILED) {
+            *map = NULL;
+            ret = -errno;
+            goto empty_region;
+        }
+
+        memory_region_init_ram_ptr(submem, obj, name, size, *map);
+    } else {
+empty_region:
+        /* Create a zero sized sub-region to make cleanup easy. */
+        memory_region_init(submem, obj, name, 0);
+    }
+
+    memory_region_add_subregion(mem, offset, submem);
+
+    return ret;
+}
+
+void vfio_reset_handler(void *opaque)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+
+    QLIST_FOREACH(group, &group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            vbasedev->ops->vfio_compute_needs_reset(vbasedev);
+        }
+    }
+
+    QLIST_FOREACH(group, &group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->needs_reset) {
+                vbasedev->ops->vfio_hot_reset_multi(vbasedev);
+            }
+        }
+    }
+}
+
+static void vfio_kvm_device_add_group(VFIOGroup *group)
+{
+#ifdef CONFIG_KVM
+    struct kvm_device_attr attr = {
+        .group = KVM_DEV_VFIO_GROUP,
+        .attr = KVM_DEV_VFIO_GROUP_ADD,
+        .addr = (uint64_t)(unsigned long)&group->fd,
+    };
+
+    if (!kvm_enabled()) {
+        return;
+    }
+
+    if (vfio_kvm_device_fd < 0) {
+        struct kvm_create_device cd = {
+            .type = KVM_DEV_TYPE_VFIO,
+        };
+
+        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
+            DPRINTF("KVM_CREATE_DEVICE: %m\n");
+            return;
+        }
+
+        vfio_kvm_device_fd = cd.fd;
+    }
+
+    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+        error_report("Failed to add group %d to KVM VFIO device: %m",
+                     group->groupid);
+    }
+#endif
+}
+
+static void vfio_kvm_device_del_group(VFIOGroup *group)
+{
+#ifdef CONFIG_KVM
+    struct kvm_device_attr attr = {
+        .group = KVM_DEV_VFIO_GROUP,
+        .attr = KVM_DEV_VFIO_GROUP_DEL,
+        .addr = (uint64_t)(unsigned long)&group->fd,
+    };
+
+    if (vfio_kvm_device_fd < 0) {
+        return;
+    }
+
+    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+        error_report("Failed to remove group %d from KVM VFIO device: %m",
+                     group->groupid);
+    }
+#endif
+}
+
+static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
+{
+    VFIOAddressSpace *space;
+
+    QLIST_FOREACH(space, &vfio_address_spaces, list) {
+        if (space->as == as) {
+            return space;
+        }
+    }
+
+    /* No suitable VFIOAddressSpace, create a new one */
+    space = g_malloc0(sizeof(*space));
+    space->as = as;
+    QLIST_INIT(&space->containers);
+
+    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
+
+    return space;
+}
+
+static void vfio_put_address_space(VFIOAddressSpace *space)
+{
+    if (QLIST_EMPTY(&space->containers)) {
+        QLIST_REMOVE(space, list);
+        g_free(space);
+    }
+}
+
+static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
+{
+    VFIOContainer *container;
+    int ret, fd;
+    VFIOAddressSpace *space;
+
+    space = vfio_get_address_space(as);
+
+    QLIST_FOREACH(container, &space->containers, next) {
+        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
+            group->container = container;
+            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+            return 0;
+        }
+    }
+
+    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    if (fd < 0) {
+        error_report("vfio: failed to open /dev/vfio/vfio: %m");
+        ret = -errno;
+        goto put_space_exit;
+    }
+
+    ret = ioctl(fd, VFIO_GET_API_VERSION);
+    if (ret != VFIO_API_VERSION) {
+        error_report("vfio: supported vfio version: %d, "
+                     "reported version: %d", VFIO_API_VERSION, ret);
+        ret = -EINVAL;
+        goto close_fd_exit;
+    }
+
+    container = g_malloc0(sizeof(*container));
+    container->space = space;
+    container->fd = fd;
+
+    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
+        if (ret) {
+            error_report("vfio: failed to set group container: %m");
+            ret = -errno;
+            goto free_container_exit;
+        }
+
+        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
+        if (ret) {
+            error_report("vfio: failed to set iommu for container: %m");
+            ret = -errno;
+            goto free_container_exit;
+        }
+
+        container->iommu_data.type1.listener = vfio_memory_listener;
+        container->iommu_data.release = vfio_listener_release;
+
+        memory_listener_register(&container->iommu_data.type1.listener,
+                                 &address_space_memory);
+
+        if (container->iommu_data.type1.error) {
+            ret = container->iommu_data.type1.error;
+            error_report("vfio: memory listener initialization failed for container");
+            goto listener_release_exit;
+        }
+
+        container->iommu_data.type1.initialized = true;
+
+    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
+        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
+        if (ret) {
+            error_report("vfio: failed to set group container: %m");
+            ret = -errno;
+            goto free_container_exit;
+        }
+
+        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
+        if (ret) {
+            error_report("vfio: failed to set iommu for container: %m");
+            ret = -errno;
+            goto free_container_exit;
+        }
+
+        /*
+         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
+         * when container fd is closed so we do not call it explicitly
+         * in this file.
+         */
+        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
+        if (ret) {
+            error_report("vfio: failed to enable container: %m");
+            ret = -errno;
+            goto free_container_exit;
+        }
+
+        container->iommu_data.type1.listener = vfio_memory_listener;
+        container->iommu_data.release = vfio_listener_release;
+
+        memory_listener_register(&container->iommu_data.type1.listener,
+                                 container->space->as);
+
+    } else {
+        error_report("vfio: No available IOMMU models");
+        ret = -EINVAL;
+        goto free_container_exit;
+    }
+
+    QLIST_INIT(&container->group_list);
+    QLIST_INSERT_HEAD(&space->containers, container, next);
+
+    group->container = container;
+    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+
+    return 0;
+
+listener_release_exit:
+    vfio_listener_release(container);
+
+free_container_exit:
+    g_free(container);
+
+close_fd_exit:
+    close(fd);
+
+put_space_exit:
+    vfio_put_address_space(space);
+
+    return ret;
+}
+
+static void vfio_disconnect_container(VFIOGroup *group)
+{
+    VFIOContainer *container = group->container;
+
+    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
+        error_report("vfio: error disconnecting group %d from container",
+                     group->groupid);
+    }
+
+    QLIST_REMOVE(group, container_next);
+    group->container = NULL;
+
+    if (QLIST_EMPTY(&container->group_list)) {
+        VFIOAddressSpace *space = container->space;
+
+        if (container->iommu_data.release) {
+            container->iommu_data.release(container);
+        }
+        QLIST_REMOVE(container, next);
+        DPRINTF("vfio_disconnect_container: close container->fd\n");
+        close(container->fd);
+        g_free(container);
+
+        vfio_put_address_space(space);
+    }
+}
+
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
+{
+    VFIOGroup *group;
+    char path[32];
+    struct vfio_group_status status = { .argsz = sizeof(status) };
+
+    QLIST_FOREACH(group, &group_list, next) {
+        if (group->groupid == groupid) {
+            /* Found it.  Now is it already in the right context? */
+            if (group->container->space->as == as) {
+                return group;
+            } else {
+                error_report("vfio: group %d used in multiple address spaces",
+                             group->groupid);
+                return NULL;
+            }
+        }
+    }
+
+    group = g_malloc0(sizeof(*group));
+
+    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
+    group->fd = qemu_open(path, O_RDWR);
+    if (group->fd < 0) {
+        error_report("vfio: error opening %s: %m", path);
+        goto free_group_exit;
+    }
+
+    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
+        error_report("vfio: error getting group status: %m");
+        goto close_fd_exit;
+    }
+
+    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+        error_report("vfio: error, group %d is not viable, please ensure "
+                     "all devices within the iommu_group are bound to their "
+                     "vfio bus driver.", groupid);
+        goto close_fd_exit;
+    }
+
+    group->groupid = groupid;
+    QLIST_INIT(&group->device_list);
+
+    if (vfio_connect_container(group, as)) {
+        error_report("vfio: failed to setup container for group %d", groupid);
+        goto close_fd_exit;
+    }
+
+    if (QLIST_EMPTY(&group_list)) {
+        qemu_register_reset(vfio_reset_handler, NULL);
+    }
+
+    QLIST_INSERT_HEAD(&group_list, group, next);
+
+    vfio_kvm_device_add_group(group);
+
+    return group;
+
+close_fd_exit:
+    close(group->fd);
+
+free_group_exit:
+    g_free(group);
+
+    return NULL;
+}
+
+void vfio_put_group(VFIOGroup *group)
+{
+    if (!QLIST_EMPTY(&group->device_list)) {
+        return;
+    }
+
+    vfio_kvm_device_del_group(group);
+    vfio_disconnect_container(group);
+    QLIST_REMOVE(group, next);
+    DPRINTF("vfio_put_group: close group->fd\n");
+    close(group->fd);
+    g_free(group);
+
+    if (QLIST_EMPTY(&group_list)) {
+        qemu_unregister_reset(vfio_reset_handler, NULL);
+    }
+}
+
+int vfio_get_device(VFIOGroup *group, const char *name,
+                       VFIODevice *vbasedev)
+{
+    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
+    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+    int ret;
+
+    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+    if (ret < 0) {
+        error_report("vfio: error getting device %s from group %d: %m",
+                     name, group->groupid);
+        error_printf("Verify all devices in group %d are bound to vfio-pci "
+                     "or pci-stub and not already in use\n", group->groupid);
+        return ret;
+    }
+
+    vbasedev->fd = ret;
+    vbasedev->group = group;
+    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
+    if (ret) {
+        error_report("vfio: error getting device info: %m");
+        goto error;
+    }
+
+    vbasedev->num_irqs = dev_info.num_irqs;
+    vbasedev->num_regions = dev_info.num_regions;
+    vbasedev->flags = dev_info.flags;
+
+    DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
+            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
+
+    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+
+    /* call device specific functions */
+    ret = vbasedev->ops->vfio_check_device(vbasedev);
+    if (ret) {
+        error_report("vfio: error when checking device %s\n",
+                     vbasedev->name);
+        goto error;
+    }
+    ret = vbasedev->ops->vfio_populate_regions(vbasedev);
+    if (ret) {
+        error_report("vfio: error when populating regions of device %s\n",
+                     vbasedev->name);
+        goto error;
+    }
+    ret = vbasedev->ops->vfio_populate_interrupts(vbasedev);
+    if (ret) {
+        error_report("vfio: error when populating interrupts of device %s\n",
+                     vbasedev->name);
+        goto error;
+    }
+
+error:
+    if (ret) {
+        vfio_put_base_device(vbasedev);
+    }
+    return ret;
+}
+
+void vfio_put_base_device(VFIODevice *vbasedev)
+{
+    QLIST_REMOVE(vbasedev, next);
+    vbasedev->group = NULL;
+    DPRINTF("vfio_put_base_device: close vdev->fd\n");
+    close(vbasedev->fd);
+}
+
+static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
+                                   int req, void *param)
+{
+    VFIOGroup *group;
+    VFIOContainer *container;
+    int ret = -1;
+
+    group = vfio_get_group(groupid, as);
+    if (!group) {
+        error_report("vfio: group %d not registered", groupid);
+        return ret;
+    }
+
+    container = group->container;
+    if (group->container) {
+        ret = ioctl(container->fd, req, param);
+        if (ret < 0) {
+            error_report("vfio: failed to ioctl container: ret=%d, %s",
+                         ret, strerror(errno));
+        }
+    }
+
+    vfio_put_group(group);
+
+    return ret;
+}
+
+int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
+                         int req, void *param)
+{
+    /* We allow only certain ioctls to the container */
+    switch (req) {
+    case VFIO_CHECK_EXTENSION:
+    case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
+        break;
+    default:
+        /* Return an error on unknown requests */
+        error_report("vfio: unsupported ioctl %X", req);
+        return -1;
+    }
+
+    return vfio_container_do_ioctl(as, groupid, req, param);
+}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5f218b7..d2ccb3b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -39,27 +39,12 @@ 
 #include "qemu/range.h"
 #include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
-#include "hw/vfio/vfio.h"
+#include "hw/vfio/vfio-common.h"
 
-/* #define DEBUG_VFIO */
-#ifdef DEBUG_VFIO
-#define DPRINTF(fmt, ...) \
-    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-/* Extra debugging, trap acceleration paths for more logging */
-#define VFIO_ALLOW_MMAP 1
-#define VFIO_ALLOW_KVM_INTX 1
-#define VFIO_ALLOW_KVM_MSI 1
-#define VFIO_ALLOW_KVM_MSIX 1
-
-enum {
-    VFIO_DEVICE_TYPE_PCI = 0,
-    VFIO_DEVICE_TYPE_PLATFORM = 1,
-};
+extern const MemoryRegionOps vfio_region_ops;
+extern const MemoryListener vfio_memory_listener;
+extern QLIST_HEAD(, VFIOGroup) group_list;
+extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
 
 struct VFIOPCIDevice;
 
@@ -86,17 +71,6 @@  typedef struct VFIOQuirk {
     } data;
 } VFIOQuirk;
 
-typedef struct VFIORegion {
-    struct VFIODevice *vbasedev;
-    off_t fd_offset; /* offset of region within device fd */
-    MemoryRegion mem; /* slow, read/write access */
-    MemoryRegion mmap_mem; /* direct mapped access */
-    void *mmap;
-    size_t size;
-    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
-    uint8_t nr; /* cache the region number for debug */
-} VFIORegion;
-
 typedef struct VFIOBAR {
     VFIORegion region;
     bool ioport;
@@ -152,45 +126,6 @@  enum {
     VFIO_INT_MSIX = 3,
 };
 
-typedef struct VFIOAddressSpace {
-    AddressSpace *as;
-    QLIST_HEAD(, VFIOContainer) containers;
-    QLIST_ENTRY(VFIOAddressSpace) list;
-} VFIOAddressSpace;
-
-static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
-    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
-
-struct VFIOGroup;
-
-typedef struct VFIOType1 {
-    MemoryListener listener;
-    int error;
-    bool initialized;
-} VFIOType1;
-
-typedef struct VFIOContainer {
-    VFIOAddressSpace *space;
-    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
-    struct {
-        /* enable abstraction to support various iommu backends */
-        union {
-            VFIOType1 type1;
-        };
-        void (*release)(struct VFIOContainer *);
-    } iommu_data;
-    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
-    QLIST_HEAD(, VFIOGroup) group_list;
-    QLIST_ENTRY(VFIOContainer) next;
-} VFIOContainer;
-
-typedef struct VFIOGuestIOMMU {
-    VFIOContainer *container;
-    MemoryRegion *iommu;
-    Notifier n;
-    QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
-} VFIOGuestIOMMU;
-
 /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
 typedef struct VFIOMSIXInfo {
     uint8_t table_bar;
@@ -202,31 +137,6 @@  typedef struct VFIOMSIXInfo {
     void *mmap;
 } VFIOMSIXInfo;
 
-typedef struct VFIODeviceOps VFIODeviceOps;
-
-typedef struct VFIODevice {
-    QLIST_ENTRY(VFIODevice) next;
-    struct VFIOGroup *group;
-    char *name;
-    int fd;
-    int type;
-    bool reset_works;
-    bool needs_reset;
-    VFIODeviceOps *ops;
-    unsigned int num_irqs;
-    unsigned int num_regions;
-    unsigned int flags;
-} VFIODevice;
-
-struct VFIODeviceOps {
-    bool (*vfio_compute_needs_reset)(VFIODevice *vdev);
-    int (*vfio_hot_reset_multi)(VFIODevice *vdev);
-    void (*vfio_eoi)(VFIODevice *vdev);
-    int (*vfio_check_device)(VFIODevice *vdev);
-    int (*vfio_populate_regions)(VFIODevice *vdev);
-    int (*vfio_populate_interrupts)(VFIODevice *vdev);
-};
-
 typedef struct VFIOPCIDevice {
     PCIDevice pdev;
     VFIODevice vbasedev;
@@ -258,15 +168,6 @@  typedef struct VFIOPCIDevice {
     bool rom_read_failed;
 } VFIOPCIDevice;
 
-typedef struct VFIOGroup {
-    int fd;
-    int groupid;
-    VFIOContainer *container;
-    QLIST_HEAD(, VFIODevice) device_list;
-    QLIST_ENTRY(VFIOGroup) next;
-    QLIST_ENTRY(VFIOGroup) container_next;
-} VFIOGroup;
-
 typedef struct VFIORomBlacklistEntry {
     uint16_t vendor_id;
     uint16_t device_id;
@@ -292,78 +193,16 @@  static const VFIORomBlacklistEntry romblacklist[] = {
 
 #define MSIX_CAP_LENGTH 12
 
-static QLIST_HEAD(, VFIOGroup)
-    group_list = QLIST_HEAD_INITIALIZER(group_list);
-
-#ifdef CONFIG_KVM
-/*
- * We have a single VFIO pseudo device per KVM VM.  Once created it lives
- * for the life of the VM.  Closing the file descriptor only drops our
- * reference to it and the device's reference to kvm.  Therefore once
- * initialized, this file descriptor is only released on QEMU exit and
- * we'll re-use it should another vfio device be attached before then.
- */
-static int vfio_kvm_device_fd = -1;
-#endif
-
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                   uint32_t val, int len);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
-static void vfio_put_base_device(VFIODevice *vbasedev);
 static int vfio_check_device(VFIODevice *vbasedev);
 static int vfio_populate_regions(VFIODevice *vbasedev);
 static int vfio_populate_interrupts(VFIODevice *vbasedev);
 
 /*
- * Common VFIO interrupt disable
- */
-static void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
-{
-    struct vfio_irq_set irq_set = {
-        .argsz = sizeof(irq_set),
-        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
-        .index = index,
-        .start = 0,
-        .count = 0,
-    };
-
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
-}
-
-/*
- * INTx
- */
-static void vfio_unmask_irqindex(VFIODevice *vbasedev, int index)
-{
-    struct vfio_irq_set irq_set = {
-        .argsz = sizeof(irq_set),
-        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
-        .index = index,
-        .start = 0,
-        .count = 1,
-    };
-
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
-}
-
-#ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
-static void vfio_mask_irqindex(VFIODevice *vbasedev, int index)
-{
-    struct vfio_irq_set irq_set = {
-        .argsz = sizeof(irq_set),
-        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
-        .index = index,
-        .start = 0,
-        .count = 1,
-    };
-
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
-}
-#endif
-
-/*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
  * also be a huge overhead.  We try to get the best of both worlds by
  * waiting until an interrupt to disable mmaps (subsequent transitions
@@ -1115,110 +954,6 @@  static void vfio_update_msi(VFIOPCIDevice *vdev)
     }
 }
 
-/*
- * IO Port/MMIO - Beware of the endians, VFIO is always little endian
- */
-static void vfio_region_write(void *opaque, hwaddr addr,
-                           uint64_t data, unsigned size)
-{
-    VFIORegion *region = opaque;
-    VFIODevice *vbasedev = region->vbasedev;
-    union {
-        uint8_t byte;
-        uint16_t word;
-        uint32_t dword;
-        uint64_t qword;
-    } buf;
-
-    switch (size) {
-    case 1:
-        buf.byte = data;
-        break;
-    case 2:
-        buf.word = data;
-        break;
-    case 4:
-        buf.dword = data;
-        break;
-    default:
-        hw_error("vfio: unsupported write size, %d bytes", size);
-        break;
-    }
-
-    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
-        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
-                     ",%d) failed: %m",
-                     __func__, vbasedev->name, region->nr,
-                     addr, data, size);
-    }
-
-    DPRINTF("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
-            ", %d)\n", __func__, vbasedev->name,
-            region->nr, addr, data, size);
-
-    /*
-     * A read or write to a BAR always signals an INTx EOI.  This will
-     * do nothing if not pending (including not in INTx mode).  We assume
-     * that a BAR access is in response to an interrupt and that BAR
-     * accesses will service the interrupt.  Unfortunately, we don't know
-     * which access will service the interrupt, so we're potentially
-     * getting quite a few host interrupts per guest interrupt.
-     */
-    vbasedev->ops->vfio_eoi(vbasedev);
-
-}
-
-static uint64_t vfio_region_read(void *opaque,
-                              hwaddr addr, unsigned size)
-{
-    VFIORegion *region = opaque;
-    VFIODevice *vbasedev = region->vbasedev;
-    union {
-        uint8_t byte;
-        uint16_t word;
-        uint32_t dword;
-        uint64_t qword;
-    } buf;
-    uint64_t data = 0;
-
-    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
-        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
-                     __func__, vbasedev->name, region->nr,
-                     addr, size);
-        return (uint64_t)-1;
-    }
-
-    switch (size) {
-    case 1:
-        data = buf.byte;
-        break;
-    case 2:
-        data = buf.word;
-        break;
-    case 4:
-        data = buf.dword;
-        break;
-    default:
-        hw_error("vfio: unsupported read size, %d bytes", size);
-        break;
-    }
-
-    DPRINTF("%s(%s:region%d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
-            __func__, vdev->name,
-            region->nr, addr, size, data);
-
-    /* Same as write above */
-    vbasedev->ops->vfio_eoi(vbasedev);
-
-    return data;
-}
-
-static const MemoryRegionOps vfio_region_ops = {
-    .read = vfio_region_read,
-    .write = vfio_region_write,
-    .endianness = DEVICE_NATIVE_ENDIAN,
-};
-
 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
 {
     struct vfio_region_info reg_info = {
@@ -2445,307 +2180,6 @@  static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
 }
 
 /*
- * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
- */
-static int vfio_dma_unmap(VFIOContainer *container,
-                          hwaddr iova, ram_addr_t size)
-{
-    struct vfio_iommu_type1_dma_unmap unmap = {
-        .argsz = sizeof(unmap),
-        .flags = 0,
-        .iova = iova,
-        .size = size,
-    };
-
-    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
-        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
-        return -errno;
-    }
-
-    return 0;
-}
-
-static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
-                        ram_addr_t size, void *vaddr, bool readonly)
-{
-    struct vfio_iommu_type1_dma_map map = {
-        .argsz = sizeof(map),
-        .flags = VFIO_DMA_MAP_FLAG_READ,
-        .vaddr = (__u64)(uintptr_t)vaddr,
-        .iova = iova,
-        .size = size,
-    };
-
-    if (!readonly) {
-        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
-    }
-
-    /*
-     * Try the mapping, if it fails with EBUSY, unmap the region and try
-     * again.  This shouldn't be necessary, but we sometimes see it in
-     * the the VGA ROM space.
-     */
-    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
-        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
-         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
-        return 0;
-    }
-
-    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
-    return -errno;
-}
-
-static bool vfio_listener_skipped_section(MemoryRegionSection *section)
-{
-    return (!memory_region_is_ram(section->mr) &&
-            !memory_region_is_iommu(section->mr)) ||
-           /*
-            * Sizing an enabled 64-bit BAR can cause spurious mappings to
-            * addresses in the upper part of the 64-bit address space.  These
-            * are never accessed by the CPU and beyond the address width of
-            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
-            */
-           section->offset_within_address_space & (1ULL << 63);
-}
-
-static void vfio_iommu_map_notify(Notifier *n, void *data)
-{
-    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
-    VFIOContainer *container = giommu->container;
-    IOMMUTLBEntry *iotlb = data;
-    MemoryRegion *mr;
-    hwaddr xlat;
-    hwaddr len = iotlb->addr_mask + 1;
-    void *vaddr;
-    int ret;
-
-    DPRINTF("iommu map @ %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
-            iotlb->iova, iotlb->iova + iotlb->addr_mask);
-
-    /*
-     * The IOMMU TLB entry we have just covers translation through
-     * this IOMMU to its immediate target.  We need to translate
-     * it the rest of the way through to memory.
-     */
-    mr = address_space_translate(&address_space_memory,
-                                 iotlb->translated_addr,
-                                 &xlat, &len, iotlb->perm & IOMMU_WO);
-    if (!memory_region_is_ram(mr)) {
-        DPRINTF("iommu map to non memory area %"HWADDR_PRIx"\n",
-                xlat);
-        return;
-    }
-    /*
-     * Translation truncates length to the IOMMU page size,
-     * check that it did not truncate too much.
-     */
-    if (len & iotlb->addr_mask) {
-        DPRINTF("iommu has granularity incompatible with target AS\n");
-        return;
-    }
-
-    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
-        vaddr = memory_region_get_ram_ptr(mr) + xlat;
-
-        ret = vfio_dma_map(container, iotlb->iova,
-                           iotlb->addr_mask + 1, vaddr,
-                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
-        if (ret) {
-            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                         container, iotlb->iova,
-                         iotlb->addr_mask + 1, vaddr, ret);
-        }
-    } else {
-        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
-        if (ret) {
-            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx") = %d (%m)",
-                         container, iotlb->iova,
-                         iotlb->addr_mask + 1, ret);
-        }
-    }
-}
-
-static void vfio_listener_region_add(MemoryListener *listener,
-                                     MemoryRegionSection *section)
-{
-    VFIOContainer *container = container_of(listener, VFIOContainer,
-                                            iommu_data.type1.listener);
-    hwaddr iova, end;
-    Int128 llend;
-    void *vaddr;
-    int ret;
-
-    if (vfio_listener_skipped_section(section)) {
-        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
-                section->offset_within_address_space,
-                section->offset_within_address_space +
-                int128_get64(int128_sub(section->size, int128_one())));
-        return;
-    }
-
-    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
-                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
-        error_report("%s received unaligned region", __func__);
-        return;
-    }
-
-    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
-    llend = int128_make64(section->offset_within_address_space);
-    llend = int128_add(llend, section->size);
-    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
-
-    if (int128_ge(int128_make64(iova), llend)) {
-        return;
-    }
-
-    memory_region_ref(section->mr);
-
-    if (memory_region_is_iommu(section->mr)) {
-        VFIOGuestIOMMU *giommu;
-
-        DPRINTF("region_add [iommu] %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
-                iova, int128_get64(int128_sub(llend, int128_one())));
-        /*
-         * FIXME: We should do some checking to see if the
-         * capabilities of the host VFIO IOMMU are adequate to model
-         * the guest IOMMU
-         *
-         * FIXME: For VFIO iommu types which have KVM acceleration to
-         * avoid bouncing all map/unmaps through qemu this way, this
-         * would be the right place to wire that up (tell the KVM
-         * device emulation the VFIO iommu handles to use).
-         */
-        /*
-         * This assumes that the guest IOMMU is empty of
-         * mappings at this point.
-         *
-         * One way of doing this is:
-         * 1. Avoid sharing IOMMUs between emulated devices or different
-         * IOMMU groups.
-         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
-         * there are some mappings in IOMMU.
-         *
-         * VFIO on SPAPR does that. Other IOMMU models may do that different,
-         * they must make sure there are no existing mappings or
-         * loop through existing mappings to map them into VFIO.
-         */
-        giommu = g_malloc0(sizeof(*giommu));
-        giommu->iommu = section->mr;
-        giommu->container = container;
-        giommu->n.notify = vfio_iommu_map_notify;
-        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
-        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
-
-        return;
-    }
-
-    /* Here we assume that memory_region_is_ram(section->mr)==true */
-
-    end = int128_get64(llend);
-    vaddr = memory_region_get_ram_ptr(section->mr) +
-            section->offset_within_region +
-            (iova - section->offset_within_address_space);
-
-    DPRINTF("region_add [ram] %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
-            iova, end - 1, vaddr);
-
-    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
-    if (ret) {
-        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                     container, iova, end - iova, vaddr, ret);
-
-        /*
-         * On the initfn path, store the first error in the container so we
-         * can gracefully fail.  Runtime, there's not much we can do other
-         * than throw a hardware error.
-         */
-        if (!container->iommu_data.type1.initialized) {
-            if (!container->iommu_data.type1.error) {
-                container->iommu_data.type1.error = ret;
-            }
-        } else {
-            hw_error("vfio: DMA mapping failed, unable to continue");
-        }
-    }
-}
-
-static void vfio_listener_region_del(MemoryListener *listener,
-                                     MemoryRegionSection *section)
-{
-    VFIOContainer *container = container_of(listener, VFIOContainer,
-                                            iommu_data.type1.listener);
-    hwaddr iova, end;
-    int ret;
-
-    if (vfio_listener_skipped_section(section)) {
-        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
-                section->offset_within_address_space,
-                section->offset_within_address_space +
-                int128_get64(int128_sub(section->size, int128_one())));
-        return;
-    }
-
-    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
-                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
-        error_report("%s received unaligned region", __func__);
-        return;
-    }
-
-    if (memory_region_is_iommu(section->mr)) {
-        VFIOGuestIOMMU *giommu;
-
-        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
-            if (giommu->iommu == section->mr) {
-                memory_region_unregister_iommu_notifier(&giommu->n);
-                QLIST_REMOVE(giommu, giommu_next);
-                g_free(giommu);
-                break;
-            }
-        }
-
-        /*
-         * FIXME: We assume the one big unmap below is adequate to
-         * remove any individual page mappings in the IOMMU which
-         * might have been copied into VFIO. This works for a page table
-         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
-         * That may not be true for all IOMMU types.
-         */
-    }
-
-    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
-    end = (section->offset_within_address_space + int128_get64(section->size)) &
-          TARGET_PAGE_MASK;
-
-    if (iova >= end) {
-        return;
-    }
-
-    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
-            iova, end - 1);
-
-    ret = vfio_dma_unmap(container, iova, end - iova);
-    memory_region_unref(section->mr);
-    if (ret) {
-        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
-                     "0x%"HWADDR_PRIx") = %d (%m)",
-                     container, iova, end - iova, ret);
-    }
-}
-
-static MemoryListener vfio_memory_listener = {
-    .region_add = vfio_listener_region_add,
-    .region_del = vfio_listener_region_del,
-};
-
-static void vfio_listener_release(VFIOContainer *container)
-{
-    memory_listener_unregister(&container->iommu_data.type1.listener);
-}
-
-/*
  * Interrupt setup
  */
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
@@ -2925,46 +2359,6 @@  static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr)
     memory_region_destroy(&bar->region.mem);
 }
 
-static int vfio_mmap_region(Object *obj, VFIORegion *region,
-                         MemoryRegion *mem, MemoryRegion *submem,
-                         void **map, size_t size, off_t offset,
-                         const char *name)
-{
-    int ret = 0;
-    VFIODevice *vbasedev = region->vbasedev;
-
-    if (VFIO_ALLOW_MMAP && size && region->flags &
-        VFIO_REGION_INFO_FLAG_MMAP) {
-        int prot = 0;
-
-        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
-            prot |= PROT_READ;
-        }
-
-        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
-            prot |= PROT_WRITE;
-        }
-
-        *map = mmap(NULL, size, prot, MAP_SHARED,
-                    vbasedev->fd, region->fd_offset + offset);
-        if (*map == MAP_FAILED) {
-            *map = NULL;
-            ret = -errno;
-            goto empty_region;
-        }
-
-        memory_region_init_ram_ptr(submem, obj, name, size, *map);
-    } else {
-empty_region:
-        /* Create a zero sized sub-region to make cleanup easy. */
-        memory_region_init(submem, obj, name, 0);
-    }
-
-    memory_region_add_subregion(mem, offset, submem);
-
-    return ret;
-}
-
 static void vfio_map_bar(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
@@ -3623,345 +3017,6 @@  static VFIODeviceOps vfio_pci_ops = {
     .vfio_populate_interrupts = vfio_populate_interrupts,
 };
 
-static void vfio_reset_handler(void *opaque)
-{
-    VFIOGroup *group;
-    VFIODevice *vbasedev;
-
-    QLIST_FOREACH(group, &group_list, next) {
-        QLIST_FOREACH(vbasedev, &group->device_list, next) {
-            vbasedev->ops->vfio_compute_needs_reset(vbasedev);
-        }
-    }
-
-    QLIST_FOREACH(group, &group_list, next) {
-        QLIST_FOREACH(vbasedev, &group->device_list, next) {
-            if (vbasedev->needs_reset) {
-                vbasedev->ops->vfio_hot_reset_multi(vbasedev);
-            }
-        }
-    }
-}
-
-static void vfio_kvm_device_add_group(VFIOGroup *group)
-{
-#ifdef CONFIG_KVM
-    struct kvm_device_attr attr = {
-        .group = KVM_DEV_VFIO_GROUP,
-        .attr = KVM_DEV_VFIO_GROUP_ADD,
-        .addr = (uint64_t)(unsigned long)&group->fd,
-    };
-
-    if (!kvm_enabled()) {
-        return;
-    }
-
-    if (vfio_kvm_device_fd < 0) {
-        struct kvm_create_device cd = {
-            .type = KVM_DEV_TYPE_VFIO,
-        };
-
-        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
-            DPRINTF("KVM_CREATE_DEVICE: %m\n");
-            return;
-        }
-
-        vfio_kvm_device_fd = cd.fd;
-    }
-
-    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
-        error_report("Failed to add group %d to KVM VFIO device: %m",
-                     group->groupid);
-    }
-#endif
-}
-
-static void vfio_kvm_device_del_group(VFIOGroup *group)
-{
-#ifdef CONFIG_KVM
-    struct kvm_device_attr attr = {
-        .group = KVM_DEV_VFIO_GROUP,
-        .attr = KVM_DEV_VFIO_GROUP_DEL,
-        .addr = (uint64_t)(unsigned long)&group->fd,
-    };
-
-    if (vfio_kvm_device_fd < 0) {
-        return;
-    }
-
-    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
-        error_report("Failed to remove group %d from KVM VFIO device: %m",
-                     group->groupid);
-    }
-#endif
-}
-
-static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
-{
-    VFIOAddressSpace *space;
-
-    QLIST_FOREACH(space, &vfio_address_spaces, list) {
-        if (space->as == as) {
-            return space;
-        }
-    }
-
-    /* No suitable VFIOAddressSpace, create a new one */
-    space = g_malloc0(sizeof(*space));
-    space->as = as;
-    QLIST_INIT(&space->containers);
-
-    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
-
-    return space;
-}
-
-static void vfio_put_address_space(VFIOAddressSpace *space)
-{
-    if (QLIST_EMPTY(&space->containers)) {
-        QLIST_REMOVE(space, list);
-        g_free(space);
-    }
-}
-
-static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
-{
-    VFIOContainer *container;
-    int ret, fd;
-    VFIOAddressSpace *space;
-
-    space = vfio_get_address_space(as);
-
-    QLIST_FOREACH(container, &space->containers, next) {
-        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
-            group->container = container;
-            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
-            return 0;
-        }
-    }
-
-    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
-    if (fd < 0) {
-        error_report("vfio: failed to open /dev/vfio/vfio: %m");
-        ret = -errno;
-        goto put_space_exit;
-    }
-
-    ret = ioctl(fd, VFIO_GET_API_VERSION);
-    if (ret != VFIO_API_VERSION) {
-        error_report("vfio: supported vfio version: %d, "
-                     "reported version: %d", VFIO_API_VERSION, ret);
-        ret = -EINVAL;
-        goto close_fd_exit;
-    }
-
-    container = g_malloc0(sizeof(*container));
-    container->space = space;
-    container->fd = fd;
-
-    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
-        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
-        if (ret) {
-            error_report("vfio: failed to set group container: %m");
-            ret = -errno;
-            goto free_container_exit;
-        }
-
-        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
-        if (ret) {
-            error_report("vfio: failed to set iommu for container: %m");
-            ret = -errno;
-            goto free_container_exit;
-        }
-
-        container->iommu_data.type1.listener = vfio_memory_listener;
-        container->iommu_data.release = vfio_listener_release;
-
-        memory_listener_register(&container->iommu_data.type1.listener,
-                                 &address_space_memory);
-
-        if (container->iommu_data.type1.error) {
-            ret = container->iommu_data.type1.error;
-            error_report("vfio: memory listener initialization failed for container");
-            goto listener_release_exit;
-        }
-
-        container->iommu_data.type1.initialized = true;
-
-    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
-        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
-        if (ret) {
-            error_report("vfio: failed to set group container: %m");
-            ret = -errno;
-            goto free_container_exit;
-        }
-
-        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
-        if (ret) {
-            error_report("vfio: failed to set iommu for container: %m");
-            ret = -errno;
-            goto free_container_exit;
-        }
-
-        /*
-         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
-         * when container fd is closed so we do not call it explicitly
-         * in this file.
-         */
-        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
-        if (ret) {
-            error_report("vfio: failed to enable container: %m");
-            ret = -errno;
-            goto free_container_exit;
-        }
-
-        container->iommu_data.type1.listener = vfio_memory_listener;
-        container->iommu_data.release = vfio_listener_release;
-
-        memory_listener_register(&container->iommu_data.type1.listener,
-                                 container->space->as);
-
-    } else {
-        error_report("vfio: No available IOMMU models");
-        ret = -EINVAL;
-        goto free_container_exit;
-    }
-
-    QLIST_INIT(&container->group_list);
-    QLIST_INSERT_HEAD(&space->containers, container, next);
-
-    group->container = container;
-    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
-
-    return 0;
-
-listener_release_exit:
-    vfio_listener_release(container);
-
-free_container_exit:
-    g_free(container);
-
-close_fd_exit:
-    close(fd);
-
-put_space_exit:
-    vfio_put_address_space(space);
-
-    return ret;
-}
-
-static void vfio_disconnect_container(VFIOGroup *group)
-{
-    VFIOContainer *container = group->container;
-
-    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
-        error_report("vfio: error disconnecting group %d from container",
-                     group->groupid);
-    }
-
-    QLIST_REMOVE(group, container_next);
-    group->container = NULL;
-
-    if (QLIST_EMPTY(&container->group_list)) {
-        VFIOAddressSpace *space = container->space;
-
-        if (container->iommu_data.release) {
-            container->iommu_data.release(container);
-        }
-        QLIST_REMOVE(container, next);
-        DPRINTF("vfio_disconnect_container: close container->fd\n");
-        close(container->fd);
-        g_free(container);
-
-        vfio_put_address_space(space);
-    }
-}
-
-static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
-{
-    VFIOGroup *group;
-    char path[32];
-    struct vfio_group_status status = { .argsz = sizeof(status) };
-
-    QLIST_FOREACH(group, &group_list, next) {
-        if (group->groupid == groupid) {
-            /* Found it.  Now is it already in the right context? */
-            if (group->container->space->as == as) {
-                return group;
-            } else {
-                error_report("vfio: group %d used in multiple address spaces",
-                             group->groupid);
-                return NULL;
-            }
-        }
-    }
-
-    group = g_malloc0(sizeof(*group));
-
-    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
-    group->fd = qemu_open(path, O_RDWR);
-    if (group->fd < 0) {
-        error_report("vfio: error opening %s: %m", path);
-        goto free_group_exit;
-    }
-
-    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
-        error_report("vfio: error getting group status: %m");
-        goto close_fd_exit;
-    }
-
-    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
-        error_report("vfio: error, group %d is not viable, please ensure "
-                     "all devices within the iommu_group are bound to their "
-                     "vfio bus driver.", groupid);
-        goto close_fd_exit;
-    }
-
-    group->groupid = groupid;
-    QLIST_INIT(&group->device_list);
-
-    if (vfio_connect_container(group, as)) {
-        error_report("vfio: failed to setup container for group %d", groupid);
-        goto close_fd_exit;
-    }
-
-    if (QLIST_EMPTY(&group_list)) {
-        qemu_register_reset(vfio_reset_handler, NULL);
-    }
-
-    QLIST_INSERT_HEAD(&group_list, group, next);
-
-    vfio_kvm_device_add_group(group);
-
-    return group;
-
-close_fd_exit:
-    close(group->fd);
-
-free_group_exit:
-    g_free(group);
-
-    return NULL;
-}
-
-static void vfio_put_group(VFIOGroup *group)
-{
-    if (!QLIST_EMPTY(&group->device_list)) {
-        return;
-    }
-
-    vfio_kvm_device_del_group(group);
-    vfio_disconnect_container(group);
-    QLIST_REMOVE(group, next);
-    DPRINTF("vfio_put_group: close group->fd\n");
-    close(group->fd);
-    g_free(group);
-
-    if (QLIST_EMPTY(&group_list)) {
-        qemu_unregister_reset(vfio_reset_handler, NULL);
-    }
-}
-
 static int vfio_check_device(VFIODevice *vbasedev)
 {
     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
@@ -4095,77 +3150,6 @@  error:
     return ret;
 }
 
-static int vfio_get_device(VFIOGroup *group, const char *name,
-                           VFIODevice *vbasedev)
-{
-    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
-    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
-    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
-    int ret;
-
-    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
-    if (ret < 0) {
-        error_report("vfio: error getting device %s from group %d: %m",
-                     name, group->groupid);
-        error_printf("Verify all devices in group %d are bound to vfio-pci "
-                     "or pci-stub and not already in use\n", group->groupid);
-        return ret;
-    }
-
-    vbasedev->fd = ret;
-    vbasedev->group = group;
-    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
-
-    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
-    if (ret) {
-        error_report("vfio: error getting device info: %m");
-        goto error;
-    }
-
-    vbasedev->num_irqs = dev_info.num_irqs;
-    vbasedev->num_regions = dev_info.num_regions;
-    vbasedev->flags = dev_info.flags;
-
-    DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
-            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
-
-    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
-
-    /* call device specific functions */
-    ret = vbasedev->ops->vfio_check_device(vbasedev);
-    if (ret) {
-        error_report("vfio: error when checking device %s\n",
-                     vbasedev->name);
-        goto error;
-    }
-    ret = vbasedev->ops->vfio_populate_regions(vbasedev);
-    if (ret) {
-        error_report("vfio: error when populating regions of device %s\n",
-                     vbasedev->name);
-        goto error;
-    }
-    ret = vbasedev->ops->vfio_populate_interrupts(vbasedev);
-    if (ret) {
-        error_report("vfio: error when populating interrupts of device %s\n",
-                     vbasedev->name);
-        goto error;
-    }
-
-error:
-    if (ret) {
-        vfio_put_base_device(vbasedev);
-    }
-    return ret;
-}
-
-void vfio_put_base_device(VFIODevice *vbasedev)
-{
-    QLIST_REMOVE(vbasedev, next);
-    vbasedev->group = NULL;
-    DPRINTF("vfio_put_base_device: close vdev->fd\n");
-    close(vbasedev->fd);
-}
-
 static void vfio_put_device(VFIOPCIDevice *vdev)
 {
     g_free(vdev->vbasedev.name);
@@ -4543,47 +3527,3 @@  static void register_vfio_pci_dev_type(void)
 }
 
 type_init(register_vfio_pci_dev_type)
-
-static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
-                                   int req, void *param)
-{
-    VFIOGroup *group;
-    VFIOContainer *container;
-    int ret = -1;
-
-    group = vfio_get_group(groupid, as);
-    if (!group) {
-        error_report("vfio: group %d not registered", groupid);
-        return ret;
-    }
-
-    container = group->container;
-    if (group->container) {
-        ret = ioctl(container->fd, req, param);
-        if (ret < 0) {
-            error_report("vfio: failed to ioctl container: ret=%d, %s",
-                         ret, strerror(errno));
-        }
-    }
-
-    vfio_put_group(group);
-
-    return ret;
-}
-
-int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
-                         int req, void *param)
-{
-    /* We allow only certain ioctls to the container */
-    switch (req) {
-    case VFIO_CHECK_EXTENSION:
-    case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
-        break;
-    default:
-        /* Return an error on unknown requests */
-        error_report("vfio: unsupported ioctl %X", req);
-        return -1;
-    }
-
-    return vfio_container_do_ioctl(as, groupid, req, param);
-}
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
new file mode 100644
index 0000000..4684ee5
--- /dev/null
+++ b/include/hw/vfio/vfio-common.h
@@ -0,0 +1,151 @@ 
+/*
+ * common header for vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+#ifndef HW_VFIO_VFIO_COMMON_H
+#define HW_VFIO_VFIO_COMMON_H
+
+#include "qemu-common.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "qemu/queue.h"
+#include "qemu/notify.h"
+
+/*#define DEBUG_VFIO*/
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+/* Extra debugging, trap acceleration paths for more logging */
+#define VFIO_ALLOW_MMAP 1
+#define VFIO_ALLOW_KVM_INTX 1
+#define VFIO_ALLOW_KVM_MSI 1
+#define VFIO_ALLOW_KVM_MSIX 1
+
+enum {
+    VFIO_DEVICE_TYPE_PCI = 0,
+    VFIO_DEVICE_TYPE_PLATFORM = 1,
+};
+
+typedef struct VFIORegion {
+    struct VFIODevice *vbasedev;
+    off_t fd_offset; /* offset of region within device fd */
+    MemoryRegion mem; /* slow, read/write access */
+    MemoryRegion mmap_mem; /* direct mapped access */
+    void *mmap;
+    size_t size;
+    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
+    uint8_t nr; /* cache the region number for debug */
+} VFIORegion;
+
+typedef struct VFIOAddressSpace {
+    AddressSpace *as;
+    QLIST_HEAD(, VFIOContainer) containers;
+    QLIST_ENTRY(VFIOAddressSpace) list;
+} VFIOAddressSpace;
+
+struct VFIOGroup;
+
+typedef struct VFIOType1 {
+    MemoryListener listener;
+    int error;
+    bool initialized;
+} VFIOType1;
+
+typedef struct VFIOContainer {
+    VFIOAddressSpace *space;
+    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
+    struct {
+        /* enable abstraction to support various iommu backends */
+        union {
+            VFIOType1 type1;
+        };
+        void (*release)(struct VFIOContainer *);
+    } iommu_data;
+    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
+    QLIST_HEAD(, VFIOGroup) group_list;
+    QLIST_ENTRY(VFIOContainer) next;
+} VFIOContainer;
+
+typedef struct VFIOGuestIOMMU {
+    VFIOContainer *container;
+    MemoryRegion *iommu;
+    Notifier n;
+    QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
+} VFIOGuestIOMMU;
+
+typedef struct VFIODeviceOps VFIODeviceOps;
+
+typedef struct VFIODevice { /* common (non bus-specific) device state */
+    QLIST_ENTRY(VFIODevice) next; /* link in group->device_list */
+    struct VFIOGroup *group; /* owning group, set by vfio_get_device() */
+    char *name; /* not freed by vfio_put_base_device() */
+    int fd; /* device fd from VFIO_GROUP_GET_DEVICE_FD */
+    int type; /* presumably a VFIO_DEVICE_TYPE_* value - TODO confirm */
+    bool reset_works; /* VFIO_DEVICE_FLAGS_RESET is set in flags */
+    bool needs_reset;
+    VFIODeviceOps *ops; /* device specific callbacks */
+    unsigned int num_irqs; /* from VFIO_DEVICE_GET_INFO */
+    unsigned int num_regions; /* from VFIO_DEVICE_GET_INFO */
+    unsigned int flags; /* from VFIO_DEVICE_GET_INFO */
+} VFIODevice;
+
+struct VFIODeviceOps { /* device specific hooks, reached via vbasedev->ops */
+    bool (*vfio_compute_needs_reset)(VFIODevice *vdev);
+    int (*vfio_hot_reset_multi)(VFIODevice *vdev);
+    void (*vfio_eoi)(VFIODevice *vdev);
+    int (*vfio_check_device)(VFIODevice *vdev); /* nonzero on error */
+    int (*vfio_populate_regions)(VFIODevice *vdev); /* nonzero on error */
+    int (*vfio_populate_interrupts)(VFIODevice *vdev); /* nonzero on error */
+};
+
+typedef struct VFIOGroup {
+    int fd; /* open fd on /dev/vfio/<groupid> */
+    int groupid;
+    VFIOContainer *container;
+    QLIST_HEAD(, VFIODevice) device_list; /* filled by vfio_get_device() */
+    QLIST_ENTRY(VFIOGroup) next; /* link in group_list */
+    QLIST_ENTRY(VFIOGroup) container_next; /* container->group_list? confirm */
+} VFIOGroup;
+
+void vfio_put_base_device(VFIODevice *vbasedev);
+void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
+void vfio_unmask_irqindex(VFIODevice *vbasedev, int index);
+#ifdef CONFIG_KVM
+void vfio_mask_irqindex(VFIODevice *vbasedev, int index);
+#endif
+void vfio_region_write(void *opaque, hwaddr addr,
+                           uint64_t data, unsigned size);
+uint64_t vfio_region_read(void *opaque,
+                          hwaddr addr, unsigned size);
+void vfio_listener_release(VFIOContainer *container);
+int vfio_mmap_region(Object *vdev, VFIORegion *region,
+                     MemoryRegion *mem, MemoryRegion *submem,
+                     void **map, size_t size, off_t offset,
+                     const char *name);
+void vfio_reset_handler(void *opaque);
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as);
+void vfio_put_group(VFIOGroup *group);
+int vfio_get_device(VFIOGroup *group, const char *name,
+                    VFIODevice *vbasedev);
+
+#endif /* !HW_VFIO_VFIO_COMMON_H */