diff mbox

[RFC,2/2] hw/misc/vfio: add vfio-platform support

Message ID 1393382272-29021-3-git-send-email-kim.phillips@linaro.org
State New
Headers show

Commit Message

kim-phillips Feb. 26, 2014, 2:37 a.m. UTC
We basically add support for the SysBusDevice type in addition to the
existing PCIDevice support.  This involves taking common code from the
existing vfio_initfn(), and putting it under a new vfio_find_get_group(),
that both vfio_initfn() and the new vfio_platform_realize() call.
Since realize() returns void, unlike PCIDevice's initfn(),
error codes are moved into the error message text with %m.
Some exceptions to the PCI path are added for platform devices,
mostly in the form of early returns, since less setup is needed.
I chose to reuse VFIODevice's config_size to determine whether
the device was a PCI device or a platform device, which might
need to change.

Currently only MMIO access is supported at this time.  It works
because of qemu's stage 1 translation, but needs to be in stage 2
in case other entities than the guest OS want to access it.
A KVM patch to address this is in the works.

The perceived path for future QEMU development is:

- support allocating a variable number of regions
  - VFIODevice's bars[] become dynamically allocated *regions
  - VFIOBAR's device fd replaced with parent VFIODevice ptr,
    to facilitate BAR r/w ops calling vfio_eoi()
- add support for interrupts
- verify and test platform dev unmap path
- test existing PCI path for any regressions
- add support for creating platform devices on the qemu command line
  - currently device address specification is hardcoded for test
    development on Calxeda Midway's fff51000.ethernet device
- reset is not supported and registration of reset functions is
  bypassed for platform devices.
  - there is no standard means of resetting a platform device,
    unsure if it suffices to be handled at device--VFIO binding time

Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
---
 hw/misc/vfio.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 145 insertions(+), 35 deletions(-)

Comments

Alex Williamson Feb. 28, 2014, 6:03 p.m. UTC | #1
On Tue, 2014-02-25 at 20:37 -0600, Kim Phillips wrote:
> We basically add support for the SysBusDevice type in addition to the
> existing PCIDevice support.  This involves taking common code from the
> existing vfio_initfn(), and putting it under a new vfio_find_get_group(),
> that both vfio_initfn() and the new vfio_platform_realize() call.
> Since realize() returns void, unlike PCIDevice's initfn(),
> error codes are moved into the error message text with %m.
> Some exceptions to the PCI path are added for platform devices,
> mostly in the form of early returns, since less setup is needed.
> I chose to reuse VFIODevice's config_size to determine whether
> the device was a PCI device or a platform device, which might
> need to change.
> 
> Currently only MMIO access is supported at this time.  It works
> because of qemu's stage 1 translation, but needs to be in stage 2
> in case other entities than the guest OS want to access it.
> A KVM patch to address this is in the works.
> 
> The perceived path for future QEMU development is:
> 
> - support allocating a variable number of regions
>   - VFIODevice's bars[] become dynamically allocated *regions
>   - VFIOBAR's device fd replaced with parent VFIODevice ptr,
>     to facilitate BAR r/w ops calling vfio_eoi()
> - add support for interrupts
> - verify and test platform dev unmap path
> - test existing PCI path for any regressions
> - add support for creating platform devices on the qemu command line
>   - currently device address specification is hardcoded for test
>     development on Calxeda Midway's fff51000.ethernet device
> - reset is not supported and registration of reset functions is
>   bypassed for platform devices.
>   - there is no standard means of resetting a platform device,
>     unsure if it suffices to be handled at device--VFIO binding time


On one hand, I had assumed we'd create a hw/misc/vfio/ directory and
perhaps split things into pci, common, vga, and platform.  We already
need the pci vs vga split as there's lots of irrelevant and complicated
vga quirks that most people don't want to see.  On the other hand, I'm
surprised how much of the PCI code you can re-use.  Still, I think it
would be cleaner to create a VFIOPCIDevice and a VFIOPlatformDevice.

> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
> ---
>  hw/misc/vfio.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 145 insertions(+), 35 deletions(-)
> 
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index 8db182f..eed24db 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -32,6 +32,7 @@
>  #include "hw/pci/msi.h"
>  #include "hw/pci/msix.h"
>  #include "hw/pci/pci.h"
> +#include "hw/sysbus.h"
>  #include "qemu-common.h"
>  #include "qemu/error-report.h"
>  #include "qemu/event_notifier.h"
> @@ -166,7 +167,10 @@ typedef struct VFIOMSIXInfo {
>  } VFIOMSIXInfo;
>  
>  typedef struct VFIODevice {
> -    PCIDevice pdev;
> +    union {
> +        PCIDevice pdev;
> +        SysBusDevice sbdev;
> +    };
>      int fd;
>      VFIOINTx intx;
>      unsigned int config_size;
> @@ -180,6 +184,8 @@ typedef struct VFIODevice {
>      VFIOMSIXInfo *msix;
>      int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
>      int interrupt; /* Current interrupt type */
> +    char *name;  /* platform device name, e.g., fff51000.ethernet */
> +    int nr_regions; /* platform devices' number of regions */
>      VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
>      VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
>      PCIHostDeviceAddress host;
> @@ -2497,8 +2503,6 @@ empty_region:
>          memory_region_init(submem, OBJECT(vdev), name, 0);
>      }
>  
> -    memory_region_add_subregion(mem, offset, submem);
> -

The "mem" parameter to this function is never used if we do this, but
I'm not sure I buy into this change anyway, see below.

>      return ret;
>  }
>  
> @@ -2552,6 +2556,7 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
>                        &bar->mmap_mem, &bar->mmap, size, 0, name)) {
>          error_report("%s unsupported. Performance may be slow", name);
>      }
> +    memory_region_add_subregion(&bar->mem, 0, &bar->mmap_mem);
>  
>      if (vdev->msix && vdev->msix->table_bar == nr) {
>          unsigned start;
> @@ -2566,15 +2571,38 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
>                            &vdev->msix->mmap, size, start, name)) {
>              error_report("%s unsupported. Performance may be slow", name);
>          }
> +        memory_region_add_subregion(&bar->mem, start, &vdev->msix->mmap_mem);
>      }
>  
>      vfio_bar_quirk_setup(vdev, nr);
>  }
>  
> +static void vfio_map_region(VFIODevice *vdev, int nr)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    unsigned size = bar->size;
> +    char name[64];
> +
> +    snprintf(name, sizeof(name), "VFIO %s region %d mmap", vdev->name, nr);
> +
> +    if (vfio_mmap_bar(vdev, bar, &bar->mem,
> +                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
> +        error_report("error mmapping %s: %m", name);
> +    }
> +}
> +
>  static void vfio_map_bars(VFIODevice *vdev)
>  {
>      int i;
>  
> +    if (!vdev->config_size) {
> +        /* platform device */
> +        for (i = 0; i < vdev->nr_regions; i++) {
> +            vfio_map_region(vdev, i);
> +        }
> +        return;
> +    }

I don't understand this, vfio_map_region() calls vfio_mmap_bar(), but
vfio_mmap_bar() has been gutted so that it only initializes the mmap
subregion, but never adds it.  vfio_map_bar() does both the
initialization of the slow, read/write, memory region as well as adding
the mmap sub-region.  I don't see where platform devices get a memory
region inserted anywhere into the guest address space.

> +
>      for (i = 0; i < PCI_ROM_SLOT; i++) {
>          vfio_map_bar(vdev, i);
>      }
> @@ -3144,7 +3172,8 @@ static void vfio_pci_reset_handler(void *opaque)
>  
>      QLIST_FOREACH(group, &group_list, next) {
>          QLIST_FOREACH(vdev, &group->device_list, next) {
> -            if (vdev->needs_reset) {
> +            /* HACK: restrict reset to PCI devices (have config_size) for now */
> +            if (vdev->config_size && vdev->needs_reset) {
>                  vfio_pci_hot_reset_multi(vdev);
>              }
>          }
> @@ -3418,25 +3447,18 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
>      DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
>              dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
>  
> -    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
> -        error_report("vfio: Um, this isn't a PCI device");
> -        goto error;
> -    }
> -
> +    vdev->nr_regions = dev_info.num_regions;
>      vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
>  
> -    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
> +    if (dev_info.num_regions > PCI_NUM_REGIONS) ||
> +        ((dev_info.flags & VFIO_DEVICE_FLAGS_PCI) &&
> +         (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1)) {

Now we start to have platform devices error out if they have more
regions than PCI knows about...  That doesn't make much sense.

>          error_report("vfio: unexpected number of io regions %u",
>                       dev_info.num_regions);
>          goto error;
>      }
>  
> -    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
> -        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
> -        goto error;
> -    }
> -
> -    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
> +    for (i = 0; i < dev_info.num_regions; i++) {
>          reg_info.index = i;

This would break PCI since there are region indexes beyond the BARs.
>  
>          ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> @@ -3458,6 +3480,15 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
>          QLIST_INIT(&vdev->bars[i].quirks);
>      }
>  
> +    /* following is for PCI devices, at least for now */
> +    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI))
> +        return 0;
> +
> +    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
> +        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
> +        goto error;
> +    }
> +
>      reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
>  
>      ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> @@ -3659,32 +3690,25 @@ static void vfio_unregister_err_notifier(VFIODevice *vdev)
>      event_notifier_cleanup(&vdev->err_notifier);
>  }
>  
> -static int vfio_initfn(PCIDevice *pdev)
> +static VFIOGroup *vfio_find_get_group(char *path)
>  {
> -    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
>      VFIOGroup *group;
> -    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> +    char iommu_group_path[PATH_MAX], *group_name;
>      ssize_t len;
>      struct stat st;
>      int groupid;
> -    int ret;
>  
> -    /* Check that the host device exists */
> -    snprintf(path, sizeof(path),
> -             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> -             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> -             vdev->host.function);
>      if (stat(path, &st) < 0) {
> -        error_report("vfio: error: no such host device: %s", path);
> -        return -errno;
> +        error_report("vfio: error: no such host device: %s: %m", path);
> +        return NULL;
>      }
>  
>      strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
>  
>      len = readlink(path, iommu_group_path, PATH_MAX);
>      if (len <= 0) {
> -        error_report("vfio: error no iommu_group for device");
> -        return -errno;
> +        error_report("vfio: error no iommu_group for device: %m");
> +        return NULL;
>      }
>  
>      iommu_group_path[len] = 0;
> @@ -3692,18 +3716,37 @@ static int vfio_initfn(PCIDevice *pdev)
>  
>      if (sscanf(group_name, "%d", &groupid) != 1) {
>          error_report("vfio: error reading %s: %m", path);
> -        return -errno;
> +        return NULL;
>      }
>  
> -    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
> -            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
> -
>      group = vfio_get_group(groupid);
>      if (!group) {
> -        error_report("vfio: failed to get group %d", groupid);
> -        return -ENOENT;
> +        error_report("vfio: failed to get group %d: %m", groupid);
> +        return NULL;
>      }
>  
> +    return group;
> +}
> +
> +static int vfio_initfn(PCIDevice *pdev)
> +{
> +    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    VFIOGroup *group;
> +    char path[PATH_MAX];
> +    int ret;
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path),
> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +             vdev->host.function);
> +
> +    group = vfio_find_get_group(path);
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function,
> +            group->groupid);
> +
>      snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
>              vdev->host.domain, vdev->host.bus, vdev->host.slot,
>              vdev->host.function);
> @@ -3916,3 +3959,70 @@ static void register_vfio_pci_dev_type(void)
>  }
>  
>  type_init(register_vfio_pci_dev_type)
> +
> +
> +/*
> + * VFIO platform devices
> + */
> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
> +{
> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev);
> +    VFIOGroup *group;
> +    char path[PATH_MAX];
> +    int ret;
> +
> +    vdev->name = malloc(PATH_MAX);
> +    strcpy(vdev->name, "fff51000.ethernet");
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path),
> +             "/sys/bus/platform/devices/%s/", vdev->name);
> +
> +    group = vfio_find_get_group(path);
> +    if (!group) {
> +        error_report("vfio: failed to get group for device %s", path);
> +        return;
> +    }
> +
> +    strcpy(path, vdev->name);
> +    ret = vfio_get_device(group, path, vdev);
> +    if (ret) {
> +        error_report("vfio: failed to get device %s", path);
> +        vfio_put_group(group);
> +        return;
> +    }
> +
> +    vfio_map_bars(vdev);
> +
> +    sysbus_init_mmio(sbdev, &vdev->bars[0].mmap_mem);
> +}
> +
> +static const VMStateDescription vfio_platform_vmstate = {
> +    .name = "vfio-platform",
> +    .unmigratable = 1,
> +};
> +
> +static void vfio_platform_dev_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->realize = vfio_platform_realize;
> +    dc->vmsd = &vfio_platform_vmstate;
> +    dc->desc = "VFIO-based platform device assignment";
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
> +static const TypeInfo vfio_platform_dev_info = {
> +    .name = "vfio-platform",
> +    .parent = TYPE_SYS_BUS_DEVICE,
> +    .instance_size = sizeof(VFIODevice),
> +    .class_init = vfio_platform_dev_class_init,
> +};
> +
> +static void register_vfio_platform_dev_type(void)
> +{
> +    type_register_static(&vfio_platform_dev_info);
> +}
> +
> +type_init(register_vfio_platform_dev_type)

This all looks reasonable, but I suspect it would be cleaner if
vfio_find_get_group() was in a common file along with basic mmap and
read/write access functions.  Thanks,

Alex
kim-phillips March 5, 2014, 12:24 a.m. UTC | #2
On Fri, 28 Feb 2014 11:03:18 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Tue, 2014-02-25 at 20:37 -0600, Kim Phillips wrote:
> > - support allocating a variable number of regions
> >   - VFIODevice's bars[] become dynamically allocated *regions
> >   - VFIOBAR's device fd replaced with parent VFIODevice ptr,
> >     to facilitate BAR r/w ops calling vfio_eoi()
> 
> On one hand, I had assumed we'd create a hw/misc/vfio/ directory and
> perhaps split things into pci, common, vga, and platform.  We already
> need the pci vs vga split as there's lots of irrelevant and complicated
> vga quirks that most people don't want to see.  On the other hand, I'm
> surprised how much of the PCI code you can re-use.  Still, I think it
> would be cleaner to create a VFIOPCIDevice and a VFIOPlatformDevice.

sounds good, had started down that path but reverted it because I too
realized how much code was indeed being reused.  I'm ok with
VFIOPCIDevice and a separate VFIOPlatformDevice for now, thanks.

> >  static void vfio_map_bars(VFIODevice *vdev)
> >  {
> >      int i;
> >  
> > +    if (!vdev->config_size) {
> > +        /* platform device */
> > +        for (i = 0; i < vdev->nr_regions; i++) {
> > +            vfio_map_region(vdev, i);
> > +        }
> > +        return;
> > +    }
> 
> I don't understand this, vfio_map_region() calls vfio_mmap_bar(), but
> vfio_mmap_bar() has been gutted so that it only initializes the mmap
> subregion, but never adds it.  vfio_map_bar() does both the
> initialization of the slow, read/write, memory region as well as adding
> the mmap sub-region.  I don't see where platform devices get a memory
> region inserted anywhere into the guest address space.

it's being done by the board code via the first RFC/patch ("hw/arm/virt:
add a Calxeda XGMAC device"), and vfio_mmap_bar()'s call to
memory_region_init_ram_ptr().

Yeah, the slow, r/w ops region path isn't used, and I'm not sure how to
make it work without addressing the problems the first RFC presents.

> > -    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
> > +    if (dev_info.num_regions > PCI_NUM_REGIONS) ||
> > +        ((dev_info.flags & VFIO_DEVICE_FLAGS_PCI) &&
> > +         (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1)) {
> 
> Now we start to have platform devices error out if they have more
> regions than PCI knows about...  That doesn't make much sense.

right, I should have made it more clear that the multiple-regions code
is sketchy in the "support allocating a variable number of regions"
blurb in the commit text of this RFC.

> > +static void register_vfio_platform_dev_type(void)
> > +{
> > +    type_register_static(&vfio_platform_dev_info);
> > +}
> > +
> > +type_init(register_vfio_platform_dev_type)
> 
> This all looks reasonable, but I suspect it would be cleaner if
> vfio_find_get_group() was in a common file along with basic mmap and
> read/write access functions.  Thanks,

so rename existing hw/misc/vfio.c to its original name vfio-pci.c, and
load all common functions back into a vfio.c?

Kim
Alex Williamson March 5, 2014, 1:23 a.m. UTC | #3
On Tue, 2014-03-04 at 18:24 -0600, Kim Phillips wrote:
> On Fri, 28 Feb 2014 11:03:18 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Tue, 2014-02-25 at 20:37 -0600, Kim Phillips wrote:
> > > - support allocating a variable number of regions
> > >   - VFIODevice's bars[] become dynamically allocated *regions
> > >   - VFIOBAR's device fd replaced with parent VFIODevice ptr,
> > >     to facilitate BAR r/w ops calling vfio_eoi()
> > 
> > On one hand, I had assumed we'd create a hw/misc/vfio/ directory and
> > perhaps split things into pci, common, vga, and platform.  We already
> > need the pci vs vga split as there's lots of irrelevant and complicated
> > vga quirks that most people don't want to see.  On the other hand, I'm
> > surprised how much of the PCI code you can re-use.  Still, I think it
> > would be cleaner to create a VFIOPCIDevice and a VFIOPlatformDevice.
> 
> sounds good, had started down that path but reverted it because I too
> realized how much code was indeed being reused.  I'm ok with
> VFIOPCIDevice and a separate VFIOPlatformDevice for now, thanks.
> 
> > >  static void vfio_map_bars(VFIODevice *vdev)
> > >  {
> > >      int i;
> > >  
> > > +    if (!vdev->config_size) {
> > > +        /* platform device */
> > > +        for (i = 0; i < vdev->nr_regions; i++) {
> > > +            vfio_map_region(vdev, i);
> > > +        }
> > > +        return;
> > > +    }
> > 
> > I don't understand this, vfio_map_region() calls vfio_mmap_bar(), but
> > vfio_mmap_bar() has been gutted so that it only initializes the mmap
> > subregion, but never adds it.  vfio_map_bar() does both the
> > initialization of the slow, read/write, memory region as well as adding
> > the mmap sub-region.  I don't see where platform devices get a memory
> > region inserted anywhere into the guest address space.
> 
> it's being done by the board code via the first RFC/patch ("hw/arm/virt:
> add a Calxeda XGMAC device"), and vfio_mmap_bar()'s call to
> memory_region_init_ram_ptr().
> 
> Yeah, the slow, r/w ops region path isn't used, and I'm not sure how to
> make it work without addressing the problems the first RFC presents.
> 
> > > -    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
> > > +    if (dev_info.num_regions > PCI_NUM_REGIONS) ||
> > > +        ((dev_info.flags & VFIO_DEVICE_FLAGS_PCI) &&
> > > +         (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1)) {
> > 
> > Now we start to have platform devices error out if they have more
> > regions than PCI knows about...  That doesn't make much sense.
> 
> right, I should have made it more clear that the multiple-regions code
> is sketchy in the "support allocating a variable number of regions"
> blurb in the commit text of this RFC.
> 
> > > +static void register_vfio_platform_dev_type(void)
> > > +{
> > > +    type_register_static(&vfio_platform_dev_info);
> > > +}
> > > +
> > > +type_init(register_vfio_platform_dev_type)
> > 
> > This all looks reasonable, but I suspect it would be cleaner if
> > vfio_find_get_group() was in a common file along with basic mmap and
> > read/write access functions.  Thanks,
> 
> so rename existing hw/misc/vfio.c to its original name vfio-pci.c, and
> load all common functions back into a vfio.c?

I think hw/misc/vfio/{pci.c,common.c,platform.c,etc}  Thanks,

Alex
Auger Eric March 5, 2014, 7:56 a.m. UTC | #4
Hi Kim,

1) I agree with Alex on the fact I would prefer to have the qemu-side
platform device code separated from PCI device one, both using a separate
generic helper code. Indeed calling functions referencing BAR in the
platform device does not look natural to me;  although I understand the
code reuse rationale. This is also how the kernel side code is laid out.

2) about nr_regions. What is the number of nr_regions do you expect for
platform_devices? bars[] possible overshoot as already mentionned if
nr_regions > 6.

3) I understand you build nr_regions MemoryRegion (each mmaped on the hpa).
However I understand you only attach a single one to the SysBusDevice in
vfio_platform_realize:
sysbus_init_mmio(sbdev, &vdev->bars[0].mmap_mem);  this MemRegion is
assigned a gpa in virt.c? is it the correct understanding? What is the mem
hierarchy that is targetted. In PCI there are subregions. In this device,
there would be a single layer or hierarchy?

4)  vfio_unmap_bars should destroy the platform device mmaped regions
(vdev->mmap_mem and munmap the correspond hva (currently on slow regions
are munmapped and destroyed)

Best Regards

Eric


On 26 February 2014 03:37, Kim Phillips <kim.phillips@linaro.org> wrote:

> We basically add support for the SysBusDevice type in addition to the
> existing PCIDevice support.  This involves taking common code from the
> existing vfio_initfn(), and putting it under a new vfio_find_get_group(),
> that both vfio_initfn() and the new vfio_platform_realize() call.
> Since realize() returns void, unlike PCIDevice's initfn(),
> error codes are moved into the error message text with %m.
> Some exceptions to the PCI path are added for platform devices,
> mostly in the form of early returns, since less setup is needed.
> I chose to reuse VFIODevice's config_size to determine whether
> the device was a PCI device or a platform device, which might
> need to change.
>
> Currently only MMIO access is supported at this time.  It works
> because of qemu's stage 1 translation, but needs to be in stage 2
> in case other entities than the guest OS want to access it.
> A KVM patch to address this is in the works.
>
> The perceived path for future QEMU development is:
>
> - support allocating a variable number of regions
>   - VFIODevice's bars[] become dynamically allocated *regions
>   - VFIOBAR's device fd replaced with parent VFIODevice ptr,
>     to facilitate BAR r/w ops calling vfio_eoi()
> - add support for interrupts
> - verify and test platform dev unmap path
> - test existing PCI path for any regressions
> - add support for creating platform devices on the qemu command line
>   - currently device address specification is hardcoded for test
>     development on Calxeda Midway's fff51000.ethernet device
> - reset is not supported and registration of reset functions is
>   bypassed for platform devices.
>   - there is no standard means of resetting a platform device,
>     unsure if it suffices to be handled at device--VFIO binding time
>
> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
> ---
>  hw/misc/vfio.c | 180
> ++++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 145 insertions(+), 35 deletions(-)
>
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index 8db182f..eed24db 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -32,6 +32,7 @@
>  #include "hw/pci/msi.h"
>  #include "hw/pci/msix.h"
>  #include "hw/pci/pci.h"
> +#include "hw/sysbus.h"
>  #include "qemu-common.h"
>  #include "qemu/error-report.h"
>  #include "qemu/event_notifier.h"
> @@ -166,7 +167,10 @@ typedef struct VFIOMSIXInfo {
>  } VFIOMSIXInfo;
>
>  typedef struct VFIODevice {
> -    PCIDevice pdev;
> +    union {
> +        PCIDevice pdev;
> +        SysBusDevice sbdev;
> +    };
>      int fd;
>      VFIOINTx intx;
>      unsigned int config_size;
> @@ -180,6 +184,8 @@ typedef struct VFIODevice {
>      VFIOMSIXInfo *msix;
>      int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
>      int interrupt; /* Current interrupt type */
> +    char *name;  /* platform device name, e.g., fff51000.ethernet */
> +    int nr_regions; /* platform devices' number of regions */
>      VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
>      VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
>      PCIHostDeviceAddress host;
> @@ -2497,8 +2503,6 @@ empty_region:
>          memory_region_init(submem, OBJECT(vdev), name, 0);
>      }
>
> -    memory_region_add_subregion(mem, offset, submem);
> -
>      return ret;
>  }
>
> @@ -2552,6 +2556,7 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
>                        &bar->mmap_mem, &bar->mmap, size, 0, name)) {
>          error_report("%s unsupported. Performance may be slow", name);
>      }
> +    memory_region_add_subregion(&bar->mem, 0, &bar->mmap_mem);
>
>      if (vdev->msix && vdev->msix->table_bar == nr) {
>          unsigned start;
> @@ -2566,15 +2571,38 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
>                            &vdev->msix->mmap, size, start, name)) {
>              error_report("%s unsupported. Performance may be slow", name);
>          }
> +        memory_region_add_subregion(&bar->mem, start,
> &vdev->msix->mmap_mem);
>      }
>
>      vfio_bar_quirk_setup(vdev, nr);
>  }
>
> +static void vfio_map_region(VFIODevice *vdev, int nr)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    unsigned size = bar->size;
> +    char name[64];
> +
> +    snprintf(name, sizeof(name), "VFIO %s region %d mmap", vdev->name,
> nr);
> +
> +    if (vfio_mmap_bar(vdev, bar, &bar->mem,
> +                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
> +        error_report("error mmapping %s: %m", name);
> +    }
> +}
> +
>  static void vfio_map_bars(VFIODevice *vdev)
>  {
>      int i;
>
> +    if (!vdev->config_size) {
> +        /* platform device */
> +        for (i = 0; i < vdev->nr_regions; i++) {
> +            vfio_map_region(vdev, i);
> +        }
> +        return;
> +    }
> +
>      for (i = 0; i < PCI_ROM_SLOT; i++) {
>          vfio_map_bar(vdev, i);
>      }
> @@ -3144,7 +3172,8 @@ static void vfio_pci_reset_handler(void *opaque)
>
>      QLIST_FOREACH(group, &group_list, next) {
>          QLIST_FOREACH(vdev, &group->device_list, next) {
> -            if (vdev->needs_reset) {
> +            /* HACK: restrict reset to PCI devices (have config_size) for
> now */
> +            if (vdev->config_size && vdev->needs_reset) {
>                  vfio_pci_hot_reset_multi(vdev);
>              }
>          }
> @@ -3418,25 +3447,18 @@ static int vfio_get_device(VFIOGroup *group, const
> char *name, VFIODevice *vdev)
>      DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
>              dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
>
> -    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
> -        error_report("vfio: Um, this isn't a PCI device");
> -        goto error;
> -    }
> -
> +    vdev->nr_regions = dev_info.num_regions;
>      vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
>
> -    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
> +    if (dev_info.num_regions > PCI_NUM_REGIONS) ||
> +        ((dev_info.flags & VFIO_DEVICE_FLAGS_PCI) &&
> +         (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1)) {
>          error_report("vfio: unexpected number of io regions %u",
>                       dev_info.num_regions);
>          goto error;
>      }
>
> -    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
> -        error_report("vfio: unexpected number of irqs %u",
> dev_info.num_irqs);
> -        goto error;
> -    }
> -
> -    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX;
> i++) {
> +    for (i = 0; i < dev_info.num_regions; i++) {
>          reg_info.index = i;
>
>          ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> @@ -3458,6 +3480,15 @@ static int vfio_get_device(VFIOGroup *group, const
> char *name, VFIODevice *vdev)
>          QLIST_INIT(&vdev->bars[i].quirks);
>      }
>
> +    /* following is for PCI devices, at least for now */
> +    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI))
> +        return 0;
> +
> +    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
> +        error_report("vfio: unexpected number of irqs %u",
> dev_info.num_irqs);
> +        goto error;
> +    }
> +
>      reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
>
>      ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> @@ -3659,32 +3690,25 @@ static void
> vfio_unregister_err_notifier(VFIODevice *vdev)
>      event_notifier_cleanup(&vdev->err_notifier);
>  }
>
> -static int vfio_initfn(PCIDevice *pdev)
> +static VFIOGroup *vfio_find_get_group(char *path)
>  {
> -    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
>      VFIOGroup *group;
> -    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> +    char iommu_group_path[PATH_MAX], *group_name;
>      ssize_t len;
>      struct stat st;
>      int groupid;
> -    int ret;
>
> -    /* Check that the host device exists */
> -    snprintf(path, sizeof(path),
> -             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> -             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> -             vdev->host.function);
>      if (stat(path, &st) < 0) {
> -        error_report("vfio: error: no such host device: %s", path);
> -        return -errno;
> +        error_report("vfio: error: no such host device: %s: %m", path);
> +        return NULL;
>      }
>
>      strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
>
>      len = readlink(path, iommu_group_path, PATH_MAX);
>      if (len <= 0) {
> -        error_report("vfio: error no iommu_group for device");
> -        return -errno;
> +        error_report("vfio: error no iommu_group for device: %m");
> +        return NULL;
>      }
>
>      iommu_group_path[len] = 0;
> @@ -3692,18 +3716,37 @@ static int vfio_initfn(PCIDevice *pdev)
>
>      if (sscanf(group_name, "%d", &groupid) != 1) {
>          error_report("vfio: error reading %s: %m", path);
> -        return -errno;
> +        return NULL;
>      }
>
> -    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__,
> vdev->host.domain,
> -            vdev->host.bus, vdev->host.slot, vdev->host.function,
> groupid);
> -
>      group = vfio_get_group(groupid);
>      if (!group) {
> -        error_report("vfio: failed to get group %d", groupid);
> -        return -ENOENT;
> +        error_report("vfio: failed to get group %d: %m", groupid);
> +        return NULL;
>      }
>
> +    return group;
> +}
> +
> +static int vfio_initfn(PCIDevice *pdev)
> +{
> +    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    VFIOGroup *group;
> +    char path[PATH_MAX];
> +    int ret;
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path),
> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +             vdev->host.function);
> +
> +    group = vfio_find_get_group(path);
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__,
> vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function,
> +            group->groupid);
> +
>      snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
>              vdev->host.domain, vdev->host.bus, vdev->host.slot,
>              vdev->host.function);
> @@ -3916,3 +3959,70 @@ static void register_vfio_pci_dev_type(void)
>  }
>
>  type_init(register_vfio_pci_dev_type)
> +
> +
> +/*
> + * VFIO platform devices
> + */
> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
> +{
> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev);
> +    VFIOGroup *group;
> +    char path[PATH_MAX];
> +    int ret;
> +
> +    vdev->name = malloc(PATH_MAX);
> +    strcpy(vdev->name, "fff51000.ethernet");
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path),
> +             "/sys/bus/platform/devices/%s/", vdev->name);
> +
> +    group = vfio_find_get_group(path);
> +    if (!group) {
> +        error_report("vfio: failed to get group for device %s", path);
> +        return;
> +    }
> +
> +    strcpy(path, vdev->name);
> +    ret = vfio_get_device(group, path, vdev);
> +    if (ret) {
> +        error_report("vfio: failed to get device %s", path);
> +        vfio_put_group(group);
> +        return;
> +    }
> +
> +    vfio_map_bars(vdev);
> +
> +    sysbus_init_mmio(sbdev, &vdev->bars[0].mmap_mem);
> +}
> +
> +static const VMStateDescription vfio_platform_vmstate = {
> +    .name = "vfio-platform",
> +    .unmigratable = 1,
> +};
> +
> +static void vfio_platform_dev_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->realize = vfio_platform_realize;
> +    dc->vmsd = &vfio_platform_vmstate;
> +    dc->desc = "VFIO-based platform device assignment";
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
> +static const TypeInfo vfio_platform_dev_info = {
> +    .name = "vfio-platform",
> +    .parent = TYPE_SYS_BUS_DEVICE,
> +    .instance_size = sizeof(VFIODevice),
> +    .class_init = vfio_platform_dev_class_init,
> +};
> +
> +static void register_vfio_platform_dev_type(void)
> +{
> +    type_register_static(&vfio_platform_dev_info);
> +}
> +
> +type_init(register_vfio_platform_dev_type)
> --
> 1.9.0
>
>
Andreas Färber March 5, 2014, 11:28 a.m. UTC | #5
Am 05.03.2014 02:23, schrieb Alex Williamson:
> On Tue, 2014-03-04 at 18:24 -0600, Kim Phillips wrote:
>> On Fri, 28 Feb 2014 11:03:18 -0700
>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>
>>> This all looks reasonable, but I suspect it would be cleaner if
>>> vfio_find_get_group() was in a common file along with basic mmap and
>>> read/write access functions.  Thanks,
>>
>> so rename existing hw/misc/vfio.c to its original name vfio-pci.c, and
>> load all common functions back into a vfio.c?
> 
> I think hw/misc/vfio/{pci.c,common.c,platform.c,etc}  Thanks,

What about hw/vfio/ then?

Andreas
Paolo Bonzini March 5, 2014, 11:30 a.m. UTC | #6
Il 05/03/2014 12:28, Andreas Färber ha scritto:
> Am 05.03.2014 02:23, schrieb Alex Williamson:
>> On Tue, 2014-03-04 at 18:24 -0600, Kim Phillips wrote:
>>> On Fri, 28 Feb 2014 11:03:18 -0700
>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>
>>>> This all looks reasonable, but I suspect it would be cleaner if
>>>> vfio_find_get_group() was in a common file along with basic mmap and
>>>> read/write access functions.  Thanks,
>>>
>>> so rename existing hw/misc/vfio.c to its original name vfio-pci.c, and
>>> load all common functions back into a vfio.c?
>>
>> I think hw/misc/vfio/{pci.c,common.c,platform.c,etc}  Thanks,
>
> What about hw/vfio/ then?

Agreed.

Paolo
diff mbox

Patch

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 8db182f..eed24db 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -32,6 +32,7 @@ 
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "hw/pci/pci.h"
+#include "hw/sysbus.h"
 #include "qemu-common.h"
 #include "qemu/error-report.h"
 #include "qemu/event_notifier.h"
@@ -166,7 +167,10 @@  typedef struct VFIOMSIXInfo {
 } VFIOMSIXInfo;
 
 typedef struct VFIODevice {
-    PCIDevice pdev;
+    union {
+        PCIDevice pdev;
+        SysBusDevice sbdev;
+    };
     int fd;
     VFIOINTx intx;
     unsigned int config_size;
@@ -180,6 +184,8 @@  typedef struct VFIODevice {
     VFIOMSIXInfo *msix;
     int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
     int interrupt; /* Current interrupt type */
+    char *name;  /* platform device name, e.g., fff51000.ethernet */
+    int nr_regions; /* platform devices' number of regions */
     VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
     VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
     PCIHostDeviceAddress host;
@@ -2497,8 +2503,6 @@  empty_region:
         memory_region_init(submem, OBJECT(vdev), name, 0);
     }
 
-    memory_region_add_subregion(mem, offset, submem);
-
     return ret;
 }
 
@@ -2552,6 +2556,7 @@  static void vfio_map_bar(VFIODevice *vdev, int nr)
                       &bar->mmap_mem, &bar->mmap, size, 0, name)) {
         error_report("%s unsupported. Performance may be slow", name);
     }
+    memory_region_add_subregion(&bar->mem, 0, &bar->mmap_mem);
 
     if (vdev->msix && vdev->msix->table_bar == nr) {
         unsigned start;
@@ -2566,15 +2571,38 @@  static void vfio_map_bar(VFIODevice *vdev, int nr)
                           &vdev->msix->mmap, size, start, name)) {
             error_report("%s unsupported. Performance may be slow", name);
         }
+        memory_region_add_subregion(&bar->mem, start, &vdev->msix->mmap_mem);
     }
 
     vfio_bar_quirk_setup(vdev, nr);
 }
 
+static void vfio_map_region(VFIODevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    unsigned size = bar->size;
+    char name[64];
+
+    snprintf(name, sizeof(name), "VFIO %s region %d mmap", vdev->name, nr);
+
+    if (vfio_mmap_bar(vdev, bar, &bar->mem,
+                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
+        error_report("error mmapping %s: %m", name);
+    }
+}
+
 static void vfio_map_bars(VFIODevice *vdev)
 {
     int i;
 
+    if (!vdev->config_size) {
+        /* platform device */
+        for (i = 0; i < vdev->nr_regions; i++) {
+            vfio_map_region(vdev, i);
+        }
+        return;
+    }
+
     for (i = 0; i < PCI_ROM_SLOT; i++) {
         vfio_map_bar(vdev, i);
     }
@@ -3144,7 +3172,8 @@  static void vfio_pci_reset_handler(void *opaque)
 
     QLIST_FOREACH(group, &group_list, next) {
         QLIST_FOREACH(vdev, &group->device_list, next) {
-            if (vdev->needs_reset) {
+            /* HACK: restrict reset to PCI devices (have config_size) for now */
+            if (vdev->config_size && vdev->needs_reset) {
                 vfio_pci_hot_reset_multi(vdev);
             }
         }
@@ -3418,25 +3447,18 @@  static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
     DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
             dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
 
-    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
-        error_report("vfio: Um, this isn't a PCI device");
-        goto error;
-    }
-
+    vdev->nr_regions = dev_info.num_regions;
     vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
 
-    if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
+    if (dev_info.num_regions > PCI_NUM_REGIONS) ||
+        ((dev_info.flags & VFIO_DEVICE_FLAGS_PCI) &&
+         (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1)) {
         error_report("vfio: unexpected number of io regions %u",
                      dev_info.num_regions);
         goto error;
     }
 
-    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
-        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
-        goto error;
-    }
-
-    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
+    for (i = 0; i < dev_info.num_regions; i++) {
         reg_info.index = i;
 
         ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
@@ -3458,6 +3480,15 @@  static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
         QLIST_INIT(&vdev->bars[i].quirks);
     }
 
+    /* following is for PCI devices, at least for now */
+    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI))
+        return 0;
+
+    if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
+        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
+        goto error;
+    }
+
     reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
 
     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
@@ -3659,32 +3690,25 @@  static void vfio_unregister_err_notifier(VFIODevice *vdev)
     event_notifier_cleanup(&vdev->err_notifier);
 }
 
-static int vfio_initfn(PCIDevice *pdev)
+static VFIOGroup *vfio_find_get_group(char *path)
 {
-    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
     VFIOGroup *group;
-    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
+    char iommu_group_path[PATH_MAX], *group_name;
     ssize_t len;
     struct stat st;
     int groupid;
-    int ret;
 
-    /* Check that the host device exists */
-    snprintf(path, sizeof(path),
-             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
-             vdev->host.domain, vdev->host.bus, vdev->host.slot,
-             vdev->host.function);
     if (stat(path, &st) < 0) {
-        error_report("vfio: error: no such host device: %s", path);
-        return -errno;
+        error_report("vfio: error: no such host device: %s: %m", path);
+        return NULL;
     }
 
     strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
 
     len = readlink(path, iommu_group_path, PATH_MAX);
     if (len <= 0) {
-        error_report("vfio: error no iommu_group for device");
-        return -errno;
+        error_report("vfio: error no iommu_group for device: %m");
+        return NULL;
     }
 
     iommu_group_path[len] = 0;
@@ -3692,18 +3716,37 @@  static int vfio_initfn(PCIDevice *pdev)
 
     if (sscanf(group_name, "%d", &groupid) != 1) {
         error_report("vfio: error reading %s: %m", path);
-        return -errno;
+        return NULL;
     }
 
-    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
-            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
-
     group = vfio_get_group(groupid);
     if (!group) {
-        error_report("vfio: failed to get group %d", groupid);
-        return -ENOENT;
+        error_report("vfio: failed to get group %d: %m", groupid);
+        return NULL;
     }
 
+    return group;
+}
+
+static int vfio_initfn(PCIDevice *pdev)
+{
+    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOGroup *group;
+    char path[PATH_MAX];
+    int ret;
+
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function);
+
+    group = vfio_find_get_group(path);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function,
+            group->groupid);
+
     snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);
@@ -3916,3 +3959,70 @@  static void register_vfio_pci_dev_type(void)
 }
 
 type_init(register_vfio_pci_dev_type)
+
+
+/*
+ * VFIO platform devices
+ */
+static void vfio_platform_realize(DeviceState *dev, Error **errp)
+{
+    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev);
+    VFIOGroup *group;
+    char path[PATH_MAX];
+    int ret;
+
+    vdev->name = malloc(PATH_MAX);
+    strcpy(vdev->name, "fff51000.ethernet");
+
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path),
+             "/sys/bus/platform/devices/%s/", vdev->name);
+
+    group = vfio_find_get_group(path);
+    if (!group) {
+        error_report("vfio: failed to get group for device %s", path);
+        return;
+    }
+
+    strcpy(path, vdev->name);
+    ret = vfio_get_device(group, path, vdev);
+    if (ret) {
+        error_report("vfio: failed to get device %s", path);
+        vfio_put_group(group);
+        return;
+    }
+
+    vfio_map_bars(vdev);
+
+    sysbus_init_mmio(sbdev, &vdev->bars[0].mmap_mem);
+}
+
+static const VMStateDescription vfio_platform_vmstate = {
+    .name = "vfio-platform",
+    .unmigratable = 1,
+};
+
+static void vfio_platform_dev_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = vfio_platform_realize;
+    dc->vmsd = &vfio_platform_vmstate;
+    dc->desc = "VFIO-based platform device assignment";
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+}
+
+static const TypeInfo vfio_platform_dev_info = {
+    .name = "vfio-platform",
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(VFIODevice),
+    .class_init = vfio_platform_dev_class_init,
+};
+
+static void register_vfio_platform_dev_type(void)
+{
+    type_register_static(&vfio_platform_dev_info);
+}
+
+type_init(register_vfio_platform_dev_type)