diff mbox

[v5,07/10] hw/vfio/platform: add vfio-platform support

Message ID 1407594349-9291-8-git-send-email-eric.auger@linaro.org
State New
Headers show

Commit Message

Auger Eric Aug. 9, 2014, 2:25 p.m. UTC
Minimal VFIO platform implementation supporting
- register space user mapping,
- IRQ assignment based on eventfds handled on qemu side.

irqfd kernel acceleration comes in a subsequent patch.

Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
Signed-off-by: Eric Auger <eric.auger@linaro.org>

---

v4 -> v5:
- vfio-platform.h included first
- cleanup error handling in *populate*, vfio_get_device,
  vfio_enable_intp
- vfio_put_device not called anymore
- add some includes to follow vfio policy

v3 -> v4:
[Eric Auger]
- merge of "vfio: Add initial IRQ support in platform device"
  to get a fully functional patch, although performance is limited.
- removal of unrealize function since I currently understand
  it is only used with device hot-plug feature.

v2 -> v3:
[Eric Auger]
- further factorization between PCI and platform (VFIORegion,
  VFIODevice). same level of functionality.

<= v2:
[Kim Phillips]
- Initial Creation of the device supporting register space mapping
---
 hw/vfio/Makefile.objs           |   1 +
 hw/vfio/platform.c              | 517 ++++++++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-platform.h |  77 ++++++
 3 files changed, 595 insertions(+)
 create mode 100644 hw/vfio/platform.c
 create mode 100644 include/hw/vfio/vfio-platform.h

Comments

Alexander Graf Aug. 11, 2014, 9:36 a.m. UTC | #1
On 09.08.14 16:25, Eric Auger wrote:
> Minimal VFIO platform implementation supporting
> - register space user mapping,
> - IRQ assignment based on eventfds handled on qemu side.
>
> irqfd kernel acceleration comes in a subsequent patch.
>
> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
> Signed-off-by: Eric Auger <eric.auger@linaro.org>
>
> ---
>
> v4 -> v5:
> - vfio-platform.h included first
> - cleanup error handling in *populate*, vfio_get_device,
>    vfio_enable_intp
> - vfio_put_device not called anymore
> - add some includes to follow vfio policy
>
> v3 -> v4:
> [Eric Auger]
> - merge of "vfio: Add initial IRQ support in platform device"
>    to get a fully functional patch, although performance is limited.
> - removal of unrealize function since I currently understand
>    it is only used with device hot-plug feature.
>
> v2 -> v3:
> [Eric Auger]
> - further factorization between PCI and platform (VFIORegion,
>    VFIODevice). same level of functionality.
>
> <= v2:
> [Kim Phillips]
> - Initial Creation of the device supporting register space mapping
> ---
>   hw/vfio/Makefile.objs           |   1 +
>   hw/vfio/platform.c              | 517 ++++++++++++++++++++++++++++++++++++++++
>   include/hw/vfio/vfio-platform.h |  77 ++++++
>   3 files changed, 595 insertions(+)
>   create mode 100644 hw/vfio/platform.c
>   create mode 100644 include/hw/vfio/vfio-platform.h
>
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index e31f30e..c5c76fe 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,4 +1,5 @@
>   ifeq ($(CONFIG_LINUX), y)
>   obj-$(CONFIG_SOFTMMU) += common.o
>   obj-$(CONFIG_PCI) += pci.o
> +obj-$(CONFIG_SOFTMMU) += platform.o
>   endif
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> new file mode 100644
> index 0000000..f1a1b55
> --- /dev/null
> +++ b/hw/vfio/platform.c
> @@ -0,0 +1,517 @@
> +/*
> + * vfio based device assignment support - platform devices
> + *
> + * Copyright Linaro Limited, 2014
> + *
> + * Authors:
> + *  Kim Phillips <kim.phillips@linaro.org>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on vfio based PCI device assignment support:
> + *  Copyright Red Hat, Inc. 2012
> + */
> +
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +
> +#include "hw/vfio/vfio-platform.h"
> +#include "qemu/error-report.h"
> +#include "qemu/range.h"
> +#include "sysemu/sysemu.h"
> +#include "exec/memory.h"
> +#include "qemu/queue.h"
> +#include "hw/sysbus.h"
> +
> +extern const MemoryRegionOps vfio_region_ops;
> +extern const MemoryListener vfio_memory_listener;
> +extern QLIST_HEAD(, VFIOGroup) group_list;
> +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
> +void vfio_put_device(VFIOPlatformDevice *vdev);
> +
> +/*
> + * It is mandatory to pass a VFIOPlatformDevice since VFIODevice
> + * is not a QOM Object and cannot be passed to memory region functions
> +*/
> +static void vfio_map_region(VFIOPlatformDevice *vdev, int nr)
> +{
> +    VFIORegion *region = vdev->regions[nr];
> +    unsigned size = region->size;
> +    char name[64];
> +
> +    if (!size) {
> +        return;
> +    }
> +
> +    snprintf(name, sizeof(name), "VFIO %s region %d",
> +             vdev->vbasedev.name, nr);
> +
> +    /* A "slow" read/write mapping underlies all regions */
> +    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
> +                          region, name, size);
> +
> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
> +
> +    if (vfio_mmap_region(OBJECT(vdev), region, &region->mem,
> +                         &region->mmap_mem, &region->mmap, size, 0, name)) {
> +        error_report("%s unsupported. Performance may be slow", name);
> +    }
> +}
> +
> +static void print_regions(VFIOPlatformDevice *vdev)
> +{
> +    int i;
> +
> +    DPRINTF("Device \"%s\" counts %d region(s):\n",
> +             vdev->vbasedev.name, vdev->vbasedev.num_regions);
> +
> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
> +        DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, "
> +                "fd= %d, offset = 0x%lx\n",
> +                vdev->regions[i]->nr,
> +                (unsigned long)vdev->regions[i]->flags,
> +                (unsigned long)vdev->regions[i]->size,
> +                vdev->regions[i]->vbasedev->fd,
> +                (unsigned long)vdev->regions[i]->fd_offset);
> +    }
> +}
> +
> +static int vfio_populate_regions(VFIODevice *vbasedev)
> +{
> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> +    int i, ret = 0;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +
> +    vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions);
> +
> +    for (i = 0; i < vbasedev->num_regions; i++) {
> +        vdev->regions[i] = g_malloc0(sizeof(VFIORegion));
> +        reg_info.index = i;
> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> +        if (ret) {
> +            error_report("vfio: Error getting region %d info: %m", i);
> +            goto error;
> +        }
> +
> +        vdev->regions[i]->flags = reg_info.flags;
> +        vdev->regions[i]->size = reg_info.size;
> +        vdev->regions[i]->fd_offset = reg_info.offset;
> +        vdev->regions[i]->nr = i;
> +        vdev->regions[i]->vbasedev = vbasedev;
> +    }
> +    print_regions(vdev);
> +error:
> +    return ret;
> +}
> +
> +/* not implemented yet */
> +static int vfio_platform_check_device(VFIODevice *vdev)
> +{
> +    return 0;
> +}
> +
> +/* not implemented yet */
> +static bool vfio_platform_compute_needs_reset(VFIODevice *vdev)
> +{
> +return false;
> +}
> +
> +/* not implemented yet */
> +static int vfio_platform_hot_reset_multi(VFIODevice *vdev)
> +{
> +return 0;
> +}
> +
> +/*
> + * eoi function is called on the first access to any MMIO region
> + * after an IRQ was triggered. It is assumed this access corresponds
> + * to the IRQ status register reset.
> + * With such a mechanism, a single IRQ can be handled at a time since
> + * there is no way to know which IRQ was completed by the guest.
> + * (we would need additional details about the IRQ status register mask)
> + */
> +static void vfio_platform_eoi(VFIODevice *vbasedev)
> +{
> +    VFIOINTp *intp;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +
> +    QLIST_FOREACH(intp, &vdev->intp_list, next) {
> +        if (intp->state == VFIO_IRQ_ACTIVE) {
> +            DPRINTF("EOI IRQ #%d fd=%d\n",
> +                    intp->pin, event_notifier_get_fd(&intp->interrupt));
> +            intp->state = VFIO_IRQ_INACTIVE;
> +
> +            /* deassert the virtual IRQ and unmask physical one */
> +            qemu_set_irq(intp->qemuirq, 0);
> +            vfio_unmask_irqindex(vbasedev, intp->pin);
> +
> +            /* a single IRQ can be active at a time */
> +            break;
> +        }
> +    }
> +
> +    /* in case there are pending IRQs, handle them one at a time */
> +    if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
> +        intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
> +        vfio_intp_interrupt(intp);
> +        QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
> +    }
> +}
> +
> +/*
> + * enable/disable the fast path mode
> + * fast path = MMIO region is mmaped (no KVM TRAP)
> + * slow path = MMIO region is trapped and region callbacks are called
> + * slow path enables to trap the IRQ status register guest reset
> +*/
> +
> +static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
> +{
> +    VFIORegion *region;
> +    int i;
> +
> +    DPRINTF("fast path = %d\n", enabled);
> +
> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
> +        region = vdev->regions[i];
> +
> +        /* register space is unmapped to trap EOI */
> +        memory_region_set_enabled(&region->mmap_mem, enabled);
> +    }
> +}
> +
> +/*
> + * Checks whether the IRQ is still pending. In the negative
> + * the fast path mode (where reg space is mmaped) can be restored.
> + * if the IRQ is still pending, we must keep on trapping IRQ status
> + * register reset with mmap disabled (slow path).
> + * the function is called on mmap_timer event.
> + * by construction a single fd is handled at a time. See EOI comment
> + * for additional details.
> + */
> +static void vfio_intp_mmap_enable(void *opaque)
> +{
> +    VFIOINTp *tmp;
> +    VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
> +
> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
> +            DPRINTF("IRQ #%d still active, stay in slow path\n",
> +                    tmp->pin);
> +            timer_mod(vdev->mmap_timer,
> +                      qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> +                          vdev->mmap_timeout);
> +            return;
> +        }
> +    }
> +    DPRINTF("no active IRQ, restore fast path\n");
> +    vfio_mmap_set_enabled(vdev, true);
> +}
> +
> +/*
> + * The fd handler
> + */
> +void vfio_intp_interrupt(void *opaque)
> +{
> +    int ret;
> +    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
> +    VFIOPlatformDevice *vdev = intp->vdev;
> +    bool one_active_irq = false;
> +
> +    /*
> +     * first check whether there is a pending IRQ
> +     * in the positive the new IRQ cannot be handled until the
> +     * active one is not completed.
> +     * by construction the same IRQ as the pending one cannot hit
> +     * since the physical IRQ was disabled by the VFIO driver
> +     */
> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
> +            one_active_irq = true;
> +            break;
> +        }
> +    }
> +    if (one_active_irq) {
> +        /*
> +         * the new IRQ gets a pending status and is pushed in
> +         * the pending queue
> +         */
> +        intp->state = VFIO_IRQ_PENDING;
> +        QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
> +                             intp, pqnext);
> +        return;
> +    }
> +
> +    /* no active IRQ, the new IRQ can be forwarded to the guest */
> +    DPRINTF("Handle IRQ #%d (fd = %d)\n",
> +            intp->pin, event_notifier_get_fd(&intp->interrupt));
> +
> +    ret = event_notifier_test_and_clear(&intp->interrupt);
> +    if (!ret) {
> +        DPRINTF("Error when clearing fd=%d\n",
> +                event_notifier_get_fd(&intp->interrupt));
> +    }
> +
> +    intp->state = VFIO_IRQ_ACTIVE;
> +
> +    /* sets slow path */
> +    vfio_mmap_set_enabled(vdev, false);
> +
> +    /* trigger the virtual IRQ */
> +    qemu_set_irq(intp->qemuirq, 1);
> +
> +    /* schedule the mmap timer which will restore mmap path after EOI*/
> +    if (vdev->mmap_timeout) {
> +        timer_mod(vdev->mmap_timer,
> +                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> +                      vdev->mmap_timeout);
> +    }
> +}
> +
> +static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
> +{
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret, argsz;
> +    int device = vbasedev->fd;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
> +    VFIOINTp *intp;
> +
> +    /* allocate and populate a new VFIOINTp structure put in a queue list */
> +    intp = g_malloc0(sizeof(*intp));
> +    intp->vdev = vdev;
> +    intp->pin = index;
> +    intp->state = VFIO_IRQ_INACTIVE;
> +    sysbus_init_irq(sbdev, &intp->qemuirq);
> +
> +    ret = event_notifier_init(&intp->interrupt, 0);
> +    if (ret) {
> +        g_free(intp);
> +        error_report("vfio: Error: event_notifier_init failed ");
> +        return ret;
> +    }
> +
> +    /* build the irq_set to be passed to the vfio kernel driver */
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = index;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&intp->interrupt);
> +
> +    DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index);
> +
> +    qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp);
> +
> +    /*
> +     * pass the index/fd binding to the kernel driver so that it
> +     * triggers this fd on HW IRQ
> +     */
> +    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
> +    g_free(irq_set);
> +    if (ret) {
> +        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
> +        event_notifier_cleanup(&intp->interrupt);
> +        return -errno;
> +    }
> +
> +    /* store the new intp in qlist */
> +    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
> +    return 0;
> +}
> +
> +static int vfio_populate_interrupts(VFIODevice *vbasedev)
> +{
> +    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
> +    int i, ret;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +
> +    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
> +                                    vfio_intp_mmap_enable, vdev);
> +
> +    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
> +
> +    for (i = 0; i < vbasedev->num_irqs; i++) {
> +        irq.index = i;
> +
> +        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
> +
> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
> +        if (ret) {
> +            /* This can fail for an old kernel or legacy PCI dev */
> +            error_printf("vfio: error getting device %s irq info",
> +                         vbasedev->name);
> +        } else {
> +            DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
> +                    irq.index, irq.count, irq.flags);
> +
> +            ret = vfio_enable_intp(vbasedev, irq.index);
> +            if (ret) {
> +                error_report("vfio: Error setting IRQ %d up", i);
> +                return ret;
> +            }
> +        }
> +    }
> +    return 0;
> +}
> +
> +static VFIODeviceOps vfio_platform_ops = {
> +    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
> +    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
> +    .vfio_eoi = vfio_platform_eoi,
> +    .vfio_check_device = vfio_platform_check_device,
> +    .vfio_populate_regions = vfio_populate_regions,
> +    .vfio_populate_interrupts = vfio_populate_interrupts,
> +};
> +
> +static int vfio_base_device_init(VFIODevice *vbasedev)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev_iter;
> +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> +    ssize_t len;
> +    struct stat st;
> +    int groupid;
> +    int ret;
> +
> +    /* name must be set prior to the call */
> +    if (!vbasedev->name) {
> +        return -EINVAL;
> +    }
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
> +             vbasedev->name);
> +
> +    if (stat(path, &st) < 0) {
> +        error_report("vfio: error: no such host device: %s", path);
> +        return -errno;
> +    }
> +
> +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> +    len = readlink(path, iommu_group_path, sizeof(path));
> +    if (len <= 0 || len >= sizeof(path)) {
> +        error_report("vfio: error no iommu_group for device");
> +        return len < 0 ? -errno : ENAMETOOLONG;
> +    }
> +
> +    iommu_group_path[len] = 0;
> +    group_name = basename(iommu_group_path);
> +
> +    if (sscanf(group_name, "%d", &groupid) != 1) {
> +        error_report("vfio: error reading %s: %m", path);
> +        return -errno;
> +    }
> +
> +    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
> +
> +    group = vfio_get_group(groupid, &address_space_memory);
> +    if (!group) {
> +        error_report("vfio: failed to get group %d", groupid);
> +        return -ENOENT;
> +    }
> +
> +    snprintf(path, sizeof(path), "%s", vbasedev->name);
> +
> +    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> +        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
> +            error_report("vfio: error: device %s is already attached", path);
> +            vfio_put_group(group);
> +            return -EBUSY;
> +        }
> +    }
> +    ret = vfio_get_device(group, path, vbasedev);
> +    if (ret) {
> +        error_report("vfio: failed to get device %s", path);
> +        vfio_put_group(group);
> +    }
> + return ret;
> +}
> +
> +void vfio_put_device(VFIOPlatformDevice *vdev)
> +{
> +    unsigned int i;
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +
> +    for (i = 0; i < vbasedev->num_regions; i++) {
> +            g_free(vdev->regions[i]);
> +    }
> +    g_free(vdev->regions);
> +    g_free(vdev->vbasedev.name);
> +    vfio_put_base_device(&vdev->vbasedev);
> +}
> +
> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
> +{
> +    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    int i, ret;
> +
> +    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
> +    vbasedev->ops = &vfio_platform_ops;
> +
> +    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
> +
> +    ret = vfio_base_device_init(vbasedev);
> +    if (ret) {
> +        return;
> +    }
> +
> +    for (i = 0; i < vbasedev->num_regions; i++) {
> +        vfio_map_region(vdev, i);
> +        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
> +    }
> +}
> +
> +static const VMStateDescription vfio_platform_vmstate = {
> +    .name = TYPE_VFIO_PLATFORM,
> +    .unmigratable = 1,
> +};
> +
> +static Property vfio_platform_dev_properties[] = {
> +    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),
> +    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
> +    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
> +                       mmap_timeout, 1100),
> +    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void vfio_platform_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->realize = vfio_platform_realize;
> +    dc->props = vfio_platform_dev_properties;
> +    dc->vmsd = &vfio_platform_vmstate;
> +    dc->desc = "VFIO-based platform device assignment";
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
> +static const TypeInfo vfio_platform_dev_info = {
> +    .name = TYPE_VFIO_PLATFORM,
> +    .parent = TYPE_SYS_BUS_DEVICE,
> +    .instance_size = sizeof(VFIOPlatformDevice),
> +    .class_init = vfio_platform_class_init,
> +    .class_size = sizeof(VFIOPlatformDeviceClass),

This should be an abstract class. People must never instantiate a 
generic "vfio-platform" device. Only "vfio-xgmac", "vfio-etsec", etc 
devices should be exposed to the user.


Alex
Alex Williamson Aug. 11, 2014, 8:13 p.m. UTC | #2
On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
> Minimal VFIO platform implementation supporting
> - register space user mapping,
> - IRQ assignment based on eventfds handled on qemu side.
> 
> irqfd kernel acceleration comes in a subsequent patch.
> 
> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
> Signed-off-by: Eric Auger <eric.auger@linaro.org>
> 
> ---
> 
> v4 -> v5:
> - vfio-platform.h included first
> - cleanup error handling in *populate*, vfio_get_device,
>   vfio_enable_intp
> - vfio_put_device not called anymore
> - add some includes to follow vfio policy
> 
> v3 -> v4:
> [Eric Auger]
> - merge of "vfio: Add initial IRQ support in platform device"
>   to get a fully functional patch, although performance is limited.
> - removal of unrealize function since I currently understand
>   it is only used with device hot-plug feature.
> 
> v2 -> v3:
> [Eric Auger]
> - further factorization between PCI and platform (VFIORegion,
>   VFIODevice). same level of functionality.
> 
> <= v2:
> [Kim Phillips]
> - Initial Creation of the device supporting register space mapping
> ---
>  hw/vfio/Makefile.objs           |   1 +
>  hw/vfio/platform.c              | 517 ++++++++++++++++++++++++++++++++++++++++
>  include/hw/vfio/vfio-platform.h |  77 ++++++
>  3 files changed, 595 insertions(+)
>  create mode 100644 hw/vfio/platform.c
>  create mode 100644 include/hw/vfio/vfio-platform.h
> 
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index e31f30e..c5c76fe 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,4 +1,5 @@
>  ifeq ($(CONFIG_LINUX), y)
>  obj-$(CONFIG_SOFTMMU) += common.o
>  obj-$(CONFIG_PCI) += pci.o
> +obj-$(CONFIG_SOFTMMU) += platform.o
>  endif
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> new file mode 100644
> index 0000000..f1a1b55
> --- /dev/null
> +++ b/hw/vfio/platform.c
> @@ -0,0 +1,517 @@
> +/*
> + * vfio based device assignment support - platform devices
> + *
> + * Copyright Linaro Limited, 2014
> + *
> + * Authors:
> + *  Kim Phillips <kim.phillips@linaro.org>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on vfio based PCI device assignment support:
> + *  Copyright Red Hat, Inc. 2012
> + */
> +
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +
> +#include "hw/vfio/vfio-platform.h"
> +#include "qemu/error-report.h"
> +#include "qemu/range.h"
> +#include "sysemu/sysemu.h"
> +#include "exec/memory.h"
> +#include "qemu/queue.h"
> +#include "hw/sysbus.h"
> +
> +extern const MemoryRegionOps vfio_region_ops;
> +extern const MemoryListener vfio_memory_listener;
> +extern QLIST_HEAD(, VFIOGroup) group_list;
> +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
> +void vfio_put_device(VFIOPlatformDevice *vdev);
> +
> +/*
> + * It is mandatory to pass a VFIOPlatformDevice since VFIODevice
> + * is not a QOM Object and cannot be passed to memory region functions
> +*/
> +static void vfio_map_region(VFIOPlatformDevice *vdev, int nr)
> +{
> +    VFIORegion *region = vdev->regions[nr];
> +    unsigned size = region->size;
> +    char name[64];
> +
> +    if (!size) {
> +        return;
> +    }
> +
> +    snprintf(name, sizeof(name), "VFIO %s region %d",
> +             vdev->vbasedev.name, nr);
> +
> +    /* A "slow" read/write mapping underlies all regions */
> +    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
> +                          region, name, size);
> +
> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
> +
> +    if (vfio_mmap_region(OBJECT(vdev), region, &region->mem,
> +                         &region->mmap_mem, &region->mmap, size, 0, name)) {
> +        error_report("%s unsupported. Performance may be slow", name);
> +    }
> +}
> +
> +static void print_regions(VFIOPlatformDevice *vdev)
> +{
> +    int i;
> +
> +    DPRINTF("Device \"%s\" counts %d region(s):\n",
> +             vdev->vbasedev.name, vdev->vbasedev.num_regions);
> +
> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
> +        DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, "
> +                "fd= %d, offset = 0x%lx\n",
> +                vdev->regions[i]->nr,
> +                (unsigned long)vdev->regions[i]->flags,
> +                (unsigned long)vdev->regions[i]->size,
> +                vdev->regions[i]->vbasedev->fd,
> +                (unsigned long)vdev->regions[i]->fd_offset);
> +    }
> +}
> +
> +static int vfio_populate_regions(VFIODevice *vbasedev)
> +{
> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> +    int i, ret = 0;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +
> +    vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions);
> +
> +    for (i = 0; i < vbasedev->num_regions; i++) {
> +        vdev->regions[i] = g_malloc0(sizeof(VFIORegion));
> +        reg_info.index = i;
> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> +        if (ret) {
> +            error_report("vfio: Error getting region %d info: %m", i);
> +            goto error;
> +        }
> +
> +        vdev->regions[i]->flags = reg_info.flags;
> +        vdev->regions[i]->size = reg_info.size;
> +        vdev->regions[i]->fd_offset = reg_info.offset;
> +        vdev->regions[i]->nr = i;
> +        vdev->regions[i]->vbasedev = vbasedev;
> +    }
> +    print_regions(vdev);
> +error:
> +    return ret;
> +}
> +
> +/* not implemented yet */
> +static int vfio_platform_check_device(VFIODevice *vdev)
> +{
> +    return 0;
> +}
> +
> +/* not implemented yet */
> +static bool vfio_platform_compute_needs_reset(VFIODevice *vdev)
> +{
> +return false;
> +}
> +
> +/* not implemented yet */
> +static int vfio_platform_hot_reset_multi(VFIODevice *vdev)
> +{
> +return 0;
> +}
> +
> +/*
> + * eoi function is called on the first access to any MMIO region
> + * after an IRQ was triggered. It is assumed this access corresponds
> + * to the IRQ status register reset.
> + * With such a mechanism, a single IRQ can be handled at a time since
> + * there is no way to know which IRQ was completed by the guest.
> + * (we would need additional details about the IRQ status register mask)
> + */
> +static void vfio_platform_eoi(VFIODevice *vbasedev)
> +{
> +    VFIOINTp *intp;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +
> +    QLIST_FOREACH(intp, &vdev->intp_list, next) {
> +        if (intp->state == VFIO_IRQ_ACTIVE) {
> +            DPRINTF("EOI IRQ #%d fd=%d\n",
> +                    intp->pin, event_notifier_get_fd(&intp->interrupt));
> +            intp->state = VFIO_IRQ_INACTIVE;
> +
> +            /* deassert the virtual IRQ and unmask physical one */
> +            qemu_set_irq(intp->qemuirq, 0);
> +            vfio_unmask_irqindex(vbasedev, intp->pin);
> +
> +            /* a single IRQ can be active at a time */
> +            break;
> +        }
> +    }
> +
> +    /* in case there are pending IRQs, handle them one at a time */
> +    if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
> +        intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
> +        vfio_intp_interrupt(intp);
> +        QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
> +    }
> +}
> +
> +/*
> + * enable/disable the fast path mode
> + * fast path = MMIO region is mmaped (no KVM TRAP)
> + * slow path = MMIO region is trapped and region callbacks are called
> + * slow path enables to trap the IRQ status register guest reset
> +*/
> +
> +static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
> +{
> +    VFIORegion *region;
> +    int i;
> +
> +    DPRINTF("fast path = %d\n", enabled);
> +
> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
> +        region = vdev->regions[i];
> +
> +        /* register space is unmapped to trap EOI */
> +        memory_region_set_enabled(&region->mmap_mem, enabled);
> +    }
> +}
> +
> +/*
> + * Checks whether the IRQ is still pending. In the negative
> + * the fast path mode (where reg space is mmaped) can be restored.
> + * if the IRQ is still pending, we must keep on trapping IRQ status
> + * register reset with mmap disabled (slow path).
> + * the function is called on mmap_timer event.
> + * by construction a single fd is handled at a time. See EOI comment
> + * for additional details.
> + */
> +static void vfio_intp_mmap_enable(void *opaque)
> +{
> +    VFIOINTp *tmp;
> +    VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
> +
> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
> +            DPRINTF("IRQ #%d still active, stay in slow path\n",
> +                    tmp->pin);
> +            timer_mod(vdev->mmap_timer,
> +                      qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> +                          vdev->mmap_timeout);
> +            return;
> +        }
> +    }
> +    DPRINTF("no active IRQ, restore fast path\n");
> +    vfio_mmap_set_enabled(vdev, true);
> +}
> +
> +/*
> + * The fd handler
> + */
> +void vfio_intp_interrupt(void *opaque)
> +{
> +    int ret;
> +    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
> +    VFIOPlatformDevice *vdev = intp->vdev;
> +    bool one_active_irq = false;
> +
> +    /*
> +     * first check whether there is a pending IRQ
> +     * in the positive the new IRQ cannot be handled until the
> +     * active one is not completed.
> +     * by construction the same IRQ as the pending one cannot hit
> +     * since the physical IRQ was disabled by the VFIO driver
> +     */
> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
> +            one_active_irq = true;
> +            break;
> +        }
> +    }
> +    if (one_active_irq) {
> +        /*
> +         * the new IRQ gets a pending status and is pushed in
> +         * the pending queue
> +         */
> +        intp->state = VFIO_IRQ_PENDING;
> +        QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
> +                             intp, pqnext);
> +        return;
> +    }
> +
> +    /* no active IRQ, the new IRQ can be forwarded to the guest */
> +    DPRINTF("Handle IRQ #%d (fd = %d)\n",
> +            intp->pin, event_notifier_get_fd(&intp->interrupt));
> +
> +    ret = event_notifier_test_and_clear(&intp->interrupt);
> +    if (!ret) {
> +        DPRINTF("Error when clearing fd=%d\n",
> +                event_notifier_get_fd(&intp->interrupt));
> +    }
> +
> +    intp->state = VFIO_IRQ_ACTIVE;
> +
> +    /* sets slow path */
> +    vfio_mmap_set_enabled(vdev, false);
> +
> +    /* trigger the virtual IRQ */
> +    qemu_set_irq(intp->qemuirq, 1);
> +
> +    /* schedule the mmap timer which will restore mmap path after EOI*/
> +    if (vdev->mmap_timeout) {
> +        timer_mod(vdev->mmap_timer,
> +                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> +                      vdev->mmap_timeout);
> +    }
> +}
> +
> +static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
> +{
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret, argsz;
> +    int device = vbasedev->fd;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
> +    VFIOINTp *intp;
> +
> +    /* allocate and populate a new VFIOINTp structure put in a queue list */
> +    intp = g_malloc0(sizeof(*intp));
> +    intp->vdev = vdev;
> +    intp->pin = index;
> +    intp->state = VFIO_IRQ_INACTIVE;
> +    sysbus_init_irq(sbdev, &intp->qemuirq);
> +
> +    ret = event_notifier_init(&intp->interrupt, 0);
> +    if (ret) {
> +        g_free(intp);
> +        error_report("vfio: Error: event_notifier_init failed ");
> +        return ret;
> +    }
> +
> +    /* build the irq_set to be passed to the vfio kernel driver */
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = index;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&intp->interrupt);
> +
> +    DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index);
> +
> +    qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp);
> +
> +    /*
> +     * pass the index/fd binding to the kernel driver so that it
> +     * triggers this fd on HW IRQ
> +     */
> +    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
> +    g_free(irq_set);
> +    if (ret) {
> +        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
> +        event_notifier_cleanup(&intp->interrupt);
> +        return -errno;
> +    }
> +
> +    /* store the new intp in qlist */
> +    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
> +    return 0;
> +}
> +
> +static int vfio_populate_interrupts(VFIODevice *vbasedev)
> +{
> +    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
> +    int i, ret;
> +    VFIOPlatformDevice *vdev =
> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> +
> +    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
> +                                    vfio_intp_mmap_enable, vdev);
> +
> +    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
> +
> +    for (i = 0; i < vbasedev->num_irqs; i++) {
> +        irq.index = i;
> +
> +        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
> +
> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
> +        if (ret) {
> +            /* This can fail for an old kernel or legacy PCI dev */
> +            error_printf("vfio: error getting device %s irq info",
> +                         vbasedev->name);

Strange comment for a platform device.  On PCI this comment only applied
to the virtual error IRQ since it may or may not be supported per
device.  For PCI, the number of IRQs and regions is really more of a
highest index, so it can be sparsely populated.  We know about the error
IRQ, so probe for it, but it may not be present.  Likewise, we know
about the VGA region, but it may not be supported by this device and
will return error on the info call.

> +        } else {
> +            DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
> +                    irq.index, irq.count, irq.flags);
> +
> +            ret = vfio_enable_intp(vbasedev, irq.index);
> +            if (ret) {
> +                error_report("vfio: Error setting IRQ %d up", i);
> +                return ret;
> +            }
> +        }
> +    }
> +    return 0;
> +}
> +
> +static VFIODeviceOps vfio_platform_ops = {
> +    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
> +    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
> +    .vfio_eoi = vfio_platform_eoi,
> +    .vfio_check_device = vfio_platform_check_device,
> +    .vfio_populate_regions = vfio_populate_regions,
> +    .vfio_populate_interrupts = vfio_populate_interrupts,
> +};
> +
> +static int vfio_base_device_init(VFIODevice *vbasedev)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev_iter;
> +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> +    ssize_t len;
> +    struct stat st;
> +    int groupid;
> +    int ret;
> +
> +    /* name must be set prior to the call */
> +    if (!vbasedev->name) {
> +        return -EINVAL;
> +    }
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
> +             vbasedev->name);
> +
> +    if (stat(path, &st) < 0) {
> +        error_report("vfio: error: no such host device: %s", path);
> +        return -errno;
> +    }
> +
> +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> +    len = readlink(path, iommu_group_path, sizeof(path));
> +    if (len <= 0 || len >= sizeof(path)) {
> +        error_report("vfio: error no iommu_group for device");
> +        return len < 0 ? -errno : ENAMETOOLONG;
> +    }
> +
> +    iommu_group_path[len] = 0;
> +    group_name = basename(iommu_group_path);
> +
> +    if (sscanf(group_name, "%d", &groupid) != 1) {
> +        error_report("vfio: error reading %s: %m", path);
> +        return -errno;
> +    }
> +
> +    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
> +
> +    group = vfio_get_group(groupid, &address_space_memory);
> +    if (!group) {
> +        error_report("vfio: failed to get group %d", groupid);
> +        return -ENOENT;
> +    }
> +
> +    snprintf(path, sizeof(path), "%s", vbasedev->name);
> +
> +    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> +        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
> +            error_report("vfio: error: device %s is already attached", path);
> +            vfio_put_group(group);
> +            return -EBUSY;
> +        }
> +    }
> +    ret = vfio_get_device(group, path, vbasedev);
> +    if (ret) {
> +        error_report("vfio: failed to get device %s", path);
> +        vfio_put_group(group);
> +    }
> + return ret;
> +}
> +
> +void vfio_put_device(VFIOPlatformDevice *vdev)
> +{
> +    unsigned int i;
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +
> +    for (i = 0; i < vbasedev->num_regions; i++) {
> +            g_free(vdev->regions[i]);
> +    }
> +    g_free(vdev->regions);
> +    g_free(vdev->vbasedev.name);
> +    vfio_put_base_device(&vdev->vbasedev);
> +}
> +
> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
> +{
> +    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    int i, ret;
> +
> +    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
> +    vbasedev->ops = &vfio_platform_ops;
> +
> +    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
> +
> +    ret = vfio_base_device_init(vbasedev);
> +    if (ret) {
> +        return;
> +    }
> +
> +    for (i = 0; i < vbasedev->num_regions; i++) {
> +        vfio_map_region(vdev, i);
> +        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
> +    }
> +}
> +
> +static const VMStateDescription vfio_platform_vmstate = {
> +    .name = TYPE_VFIO_PLATFORM,
> +    .unmigratable = 1,
> +};
> +
> +static Property vfio_platform_dev_properties[] = {
> +    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),

Hmm, is this really a good name for this option?  "host" would give you
some consistency with vfio-pci.

> +    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
> +    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
> +                       mmap_timeout, 1100),
> +    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),

Should some of these be x- options or do you plan to support them long
term and support users twiddling them?

> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void vfio_platform_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->realize = vfio_platform_realize;
> +    dc->props = vfio_platform_dev_properties;
> +    dc->vmsd = &vfio_platform_vmstate;
> +    dc->desc = "VFIO-based platform device assignment";
> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
> +static const TypeInfo vfio_platform_dev_info = {
> +    .name = TYPE_VFIO_PLATFORM,
> +    .parent = TYPE_SYS_BUS_DEVICE,
> +    .instance_size = sizeof(VFIOPlatformDevice),
> +    .class_init = vfio_platform_class_init,
> +    .class_size = sizeof(VFIOPlatformDeviceClass),
> +};
> +
> +static void register_vfio_platform_dev_type(void)
> +{
> +    type_register_static(&vfio_platform_dev_info);
> +}
> +
> +type_init(register_vfio_platform_dev_type)
> diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h
> new file mode 100644
> index 0000000..1ee072a
> --- /dev/null
> +++ b/include/hw/vfio/vfio-platform.h
> @@ -0,0 +1,77 @@
> +/*
> + * vfio based device assignment support - platform devices
> + *
> + * Copyright Linaro Limited, 2014
> + *
> + * Authors:
> + *  Kim Phillips <kim.phillips@linaro.org>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on vfio based PCI device assignment support:
> + *  Copyright Red Hat, Inc. 2012
> + */
> +
> +#ifndef HW_VFIO_VFIO_PLATFORM_H
> +#define HW_VFIO_VFIO_PLATFORM_H
> +
> +#include "hw/sysbus.h"
> +#include "hw/vfio/vfio-common.h"
> +#include "qemu/event_notifier.h"
> +#include "qemu/queue.h"
> +#include "hw/irq.h"
> +
> +#define TYPE_VFIO_PLATFORM "vfio-platform"
> +
> +enum {
> +    VFIO_IRQ_INACTIVE = 0,
> +    VFIO_IRQ_PENDING = 1,
> +    VFIO_IRQ_ACTIVE = 2,
> +    /* VFIO_IRQ_ACTIVE_AND_PENDING cannot happen with VFIO */
> +};
> +
> +typedef struct VFIOINTp {
> +    QLIST_ENTRY(VFIOINTp) next; /* entry for IRQ list */
> +    QSIMPLEQ_ENTRY(VFIOINTp) pqnext; /* entry for pending IRQ queue */
> +    EventNotifier interrupt; /* eventfd triggered on interrupt */
> +    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
> +    qemu_irq qemuirq;
> +    struct VFIOPlatformDevice *vdev; /* back pointer to device */
> +    int state; /* inactive, pending, active */
> +    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
> +    uint8_t pin; /* index */
> +    uint8_t virtualID; /* virtual IRQ */
> +} VFIOINTp;
> +
> +typedef struct VFIOPlatformDevice {
> +    SysBusDevice sbdev;
> +    VFIODevice vbasedev; /* not a QOM object */
> +    VFIORegion **regions;
> +    QLIST_HEAD(, VFIOINTp) intp_list; /* list of IRQ */
> +    /* queue of pending IRQ */
> +    QSIMPLEQ_HEAD(pending_intp_queue, VFIOINTp) pending_intp_queue;
> +    char *compat; /* compatibility string */
> +    bool irqfd_allowed;
> +    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
> +    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
> +} VFIOPlatformDevice;
> +
> +
> +typedef struct VFIOPlatformDeviceClass {
> +    /*< private >*/
> +    SysBusDeviceClass parent_class;
> +    /*< public >*/
> +} VFIOPlatformDeviceClass;
> +
> +#define VFIO_PLATFORM_DEVICE(obj) \
> +     OBJECT_CHECK(VFIOPlatformDevice, (obj), TYPE_VFIO_PLATFORM)
> +#define VFIO_PLATFORM_DEVICE_CLASS(klass) \
> +     OBJECT_CLASS_CHECK(VFIOPlatformDeviceClass, (klass), TYPE_VFIO_PLATFORM)
> +#define VFIO_PLATFORM_DEVICE_GET_CLASS(obj) \
> +     OBJECT_GET_CLASS(VFIOPlatformDeviceClass, (obj), TYPE_VFIO_PLATFORM)
> +
> +void vfio_intp_interrupt(void *opaque);
> +void vfio_setup_irqfd(SysBusDevice *dev, int index, int virq);

This was never defined.  Thanks,

Alex
Auger Eric Aug. 12, 2014, 5:51 a.m. UTC | #3
On 08/11/2014 10:13 PM, Alex Williamson wrote:
> On Sat, 2014-08-09 at 15:25 +0100, Eric Auger wrote:
>> Minimal VFIO platform implementation supporting
>> - register space user mapping,
>> - IRQ assignment based on eventfds handled on qemu side.
>>
>> irqfd kernel acceleration comes in a subsequent patch.
>>
>> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
>> Signed-off-by: Eric Auger <eric.auger@linaro.org>
>>
>> ---
>>
>> v4 -> v5:
>> - vfio-plaform.h included first
>> - cleanup error handling in *populate*, vfio_get_device,
>>   vfio_enable_intp
>> - vfio_put_device not called anymore
>> - add some includes to follow vfio policy
>>
>> v3 -> v4:
>> [Eric Auger]
>> - merge of "vfio: Add initial IRQ support in platform device"
>>   to get a full functional patch although perfs are limited.
>> - removal of unrealize function since I currently understand
>>   it is only used with device hot-plug feature.
>>
>> v2 -> v3:
>> [Eric Auger]
>> - further factorization between PCI and platform (VFIORegion,
>>   VFIODevice). same level of functionality.
>>
>> <= v2:
>> [Kim Philipps]
>> - Initial Creation of the device supporting register space mapping
>> ---
>>  hw/vfio/Makefile.objs           |   1 +
>>  hw/vfio/platform.c              | 517 ++++++++++++++++++++++++++++++++++++++++
>>  include/hw/vfio/vfio-platform.h |  77 ++++++
>>  3 files changed, 595 insertions(+)
>>  create mode 100644 hw/vfio/platform.c
>>  create mode 100644 include/hw/vfio/vfio-platform.h
>>
>> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
>> index e31f30e..c5c76fe 100644
>> --- a/hw/vfio/Makefile.objs
>> +++ b/hw/vfio/Makefile.objs
>> @@ -1,4 +1,5 @@
>>  ifeq ($(CONFIG_LINUX), y)
>>  obj-$(CONFIG_SOFTMMU) += common.o
>>  obj-$(CONFIG_PCI) += pci.o
>> +obj-$(CONFIG_SOFTMMU) += platform.o
>>  endif
>> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
>> new file mode 100644
>> index 0000000..f1a1b55
>> --- /dev/null
>> +++ b/hw/vfio/platform.c
>> @@ -0,0 +1,517 @@
>> +/*
>> + * vfio based device assignment support - platform devices
>> + *
>> + * Copyright Linaro Limited, 2014
>> + *
>> + * Authors:
>> + *  Kim Phillips <kim.phillips@linaro.org>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Based on vfio based PCI device assignment support:
>> + *  Copyright Red Hat, Inc. 2012
>> + */
>> +
>> +#include <linux/vfio.h>
>> +#include <sys/ioctl.h>
>> +
>> +#include "hw/vfio/vfio-platform.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/range.h"
>> +#include "sysemu/sysemu.h"
>> +#include "exec/memory.h"
>> +#include "qemu/queue.h"
>> +#include "hw/sysbus.h"
>> +
>> +extern const MemoryRegionOps vfio_region_ops;
>> +extern const MemoryListener vfio_memory_listener;
>> +extern QLIST_HEAD(, VFIOGroup) group_list;
>> +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
>> +void vfio_put_device(VFIOPlatformDevice *vdev);
>> +
>> +/*
>> + * It is mandatory to pass a VFIOPlatformDevice since VFIODevice
>> + * is not a QOM Object and cannot be passed to memory region functions
>> +*/
>> +static void vfio_map_region(VFIOPlatformDevice *vdev, int nr)
>> +{
>> +    VFIORegion *region = vdev->regions[nr];
>> +    unsigned size = region->size;
>> +    char name[64];
>> +
>> +    if (!size) {
>> +        return;
>> +    }
>> +
>> +    snprintf(name, sizeof(name), "VFIO %s region %d",
>> +             vdev->vbasedev.name, nr);
>> +
>> +    /* A "slow" read/write mapping underlies all regions */
>> +    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
>> +                          region, name, size);
>> +
>> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
>> +
>> +    if (vfio_mmap_region(OBJECT(vdev), region, &region->mem,
>> +                         &region->mmap_mem, &region->mmap, size, 0, name)) {
>> +        error_report("%s unsupported. Performance may be slow", name);
>> +    }
>> +}
>> +
>> +static void print_regions(VFIOPlatformDevice *vdev)
>> +{
>> +    int i;
>> +
>> +    DPRINTF("Device \"%s\" counts %d region(s):\n",
>> +             vdev->vbasedev.name, vdev->vbasedev.num_regions);
>> +
>> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
>> +        DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, "
>> +                "fd= %d, offset = 0x%lx\n",
>> +                vdev->regions[i]->nr,
>> +                (unsigned long)vdev->regions[i]->flags,
>> +                (unsigned long)vdev->regions[i]->size,
>> +                vdev->regions[i]->vbasedev->fd,
>> +                (unsigned long)vdev->regions[i]->fd_offset);
>> +    }
>> +}
>> +
>> +static int vfio_populate_regions(VFIODevice *vbasedev)
>> +{
>> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
>> +    int i, ret = 0;
>> +    VFIOPlatformDevice *vdev =
>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>> +
>> +    vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions);
>> +
>> +    for (i = 0; i < vbasedev->num_regions; i++) {
>> +        vdev->regions[i] = g_malloc0(sizeof(VFIORegion));
>> +        reg_info.index = i;
>> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
>> +        if (ret) {
>> +            error_report("vfio: Error getting region %d info: %m", i);
>> +            goto error;
>> +        }
>> +
>> +        vdev->regions[i]->flags = reg_info.flags;
>> +        vdev->regions[i]->size = reg_info.size;
>> +        vdev->regions[i]->fd_offset = reg_info.offset;
>> +        vdev->regions[i]->nr = i;
>> +        vdev->regions[i]->vbasedev = vbasedev;
>> +    }
>> +    print_regions(vdev);
>> +error:
>> +    return ret;
>> +}
>> +
>> +/* not implemented yet */
>> +static int vfio_platform_check_device(VFIODevice *vdev)
>> +{
>> +    return 0;
>> +}
>> +
>> +/* not implemented yet */
>> +static bool vfio_platform_compute_needs_reset(VFIODevice *vdev)
>> +{
>> +return false;
>> +}
>> +
>> +/* not implemented yet */
>> +static int vfio_platform_hot_reset_multi(VFIODevice *vdev)
>> +{
>> +return 0;
>> +}
>> +
>> +/*
>> + * eoi function is called on the first access to any MMIO region
>> + * after an IRQ was triggered. It is assumed this access corresponds
>> + * to the IRQ status register reset.
>> + * With such a mechanism, a single IRQ can be handled at a time since
>> + * there is no way to know which IRQ was completed by the guest.
>> + * (we would need additional details about the IRQ status register mask)
>> + */
>> +static void vfio_platform_eoi(VFIODevice *vbasedev)
>> +{
>> +    VFIOINTp *intp;
>> +    VFIOPlatformDevice *vdev =
>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>> +
>> +    QLIST_FOREACH(intp, &vdev->intp_list, next) {
>> +        if (intp->state == VFIO_IRQ_ACTIVE) {
>> +            DPRINTF("EOI IRQ #%d fd=%d\n",
>> +                    intp->pin, event_notifier_get_fd(&intp->interrupt));
>> +            intp->state = VFIO_IRQ_INACTIVE;
>> +
>> +            /* deassert the virtual IRQ and unmask physical one */
>> +            qemu_set_irq(intp->qemuirq, 0);
>> +            vfio_unmask_irqindex(vbasedev, intp->pin);
>> +
>> +            /* a single IRQ can be active at a time */
>> +            break;
>> +        }
>> +    }
>> +
>> +    /* in case there are pending IRQs, handle them one at a time */
>> +    if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
>> +        intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
>> +        vfio_intp_interrupt(intp);
>> +        QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
>> +    }
>> +}
>> +
>> +/*
>> + * enable/disable the fast path mode
>> + * fast path = MMIO region is mmaped (no KVM TRAP)
>> + * slow path = MMIO region is trapped and region callbacks are called
>> + * slow path enables to trap the IRQ status register guest reset
>> +*/
>> +
>> +static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
>> +{
>> +    VFIORegion *region;
>> +    int i;
>> +
>> +    DPRINTF("fast path = %d\n", enabled);
>> +
>> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
>> +        region = vdev->regions[i];
>> +
>> +        /* register space is unmapped to trap EOI */
>> +        memory_region_set_enabled(&region->mmap_mem, enabled);
>> +    }
>> +}
>> +
>> +/*
>> + * Checks whether the IRQ is still pending. In the negative
>> + * the fast path mode (where reg space is mmaped) can be restored.
>> + * if the IRQ is still pending, we must keep on trapping IRQ status
>> + * register reset with mmap disabled (slow path).
>> + * the function is called on mmap_timer event.
>> + * by construction a single fd is handled at a time. See EOI comment
>> + * for additional details.
>> + */
>> +static void vfio_intp_mmap_enable(void *opaque)
>> +{
>> +    VFIOINTp *tmp;
>> +    VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
>> +
>> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
>> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
>> +            DPRINTF("IRQ #%d still active, stay in slow path\n",
>> +                    tmp->pin);
>> +            timer_mod(vdev->mmap_timer,
>> +                      qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
>> +                          vdev->mmap_timeout);
>> +            return;
>> +        }
>> +    }
>> +    DPRINTF("no active IRQ, restore fast path\n");
>> +    vfio_mmap_set_enabled(vdev, true);
>> +}
>> +
>> +/*
>> + * The fd handler
>> + */
>> +void vfio_intp_interrupt(void *opaque)
>> +{
>> +    int ret;
>> +    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
>> +    VFIOPlatformDevice *vdev = intp->vdev;
>> +    bool one_active_irq = false;
>> +
>> +    /*
>> +     * first check whether there is a pending IRQ
>> +     * in the positive the new IRQ cannot be handled until the
>> +     * active one is not completed.
>> +     * by construction the same IRQ as the pending one cannot hit
>> +     * since the physical IRQ was disabled by the VFIO driver
>> +     */
>> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
>> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
>> +            one_active_irq = true;
>> +            break;
>> +        }
>> +    }
>> +    if (one_active_irq) {
>> +        /*
>> +         * the new IRQ gets a pending status and is pushed in
>> +         * the pending queue
>> +         */
>> +        intp->state = VFIO_IRQ_PENDING;
>> +        QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
>> +                             intp, pqnext);
>> +        return;
>> +    }
>> +
>> +    /* no active IRQ, the new IRQ can be forwarded to the guest */
>> +    DPRINTF("Handle IRQ #%d (fd = %d)\n",
>> +            intp->pin, event_notifier_get_fd(&intp->interrupt));
>> +
>> +    ret = event_notifier_test_and_clear(&intp->interrupt);
>> +    if (!ret) {
>> +        DPRINTF("Error when clearing fd=%d\n",
>> +                event_notifier_get_fd(&intp->interrupt));
>> +    }
>> +
>> +    intp->state = VFIO_IRQ_ACTIVE;
>> +
>> +    /* sets slow path */
>> +    vfio_mmap_set_enabled(vdev, false);
>> +
>> +    /* trigger the virtual IRQ */
>> +    qemu_set_irq(intp->qemuirq, 1);
>> +
>> +    /* schedule the mmap timer which will restore mmap path after EOI*/
>> +    if (vdev->mmap_timeout) {
>> +        timer_mod(vdev->mmap_timer,
>> +                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
>> +                      vdev->mmap_timeout);
>> +    }
>> +}
>> +
>> +static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
>> +{
>> +    struct vfio_irq_set *irq_set;
>> +    int32_t *pfd;
>> +    int ret, argsz;
>> +    int device = vbasedev->fd;
>> +    VFIOPlatformDevice *vdev =
>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
>> +    VFIOINTp *intp;
>> +
>> +    /* allocate and populate a new VFIOINTp structure put in a queue list */
>> +    intp = g_malloc0(sizeof(*intp));
>> +    intp->vdev = vdev;
>> +    intp->pin = index;
>> +    intp->state = VFIO_IRQ_INACTIVE;
>> +    sysbus_init_irq(sbdev, &intp->qemuirq);
>> +
>> +    ret = event_notifier_init(&intp->interrupt, 0);
>> +    if (ret) {
>> +        g_free(intp);
>> +        error_report("vfio: Error: event_notifier_init failed ");
>> +        return ret;
>> +    }
>> +
>> +    /* build the irq_set to be passed to the vfio kernel driver */
>> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
>> +
>> +    irq_set = g_malloc0(argsz);
>> +    irq_set->argsz = argsz;
>> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
>> +    irq_set->index = index;
>> +    irq_set->start = 0;
>> +    irq_set->count = 1;
>> +    pfd = (int32_t *)&irq_set->data;
>> +
>> +    *pfd = event_notifier_get_fd(&intp->interrupt);
>> +
>> +    DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index);
>> +
>> +    qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp);
>> +
>> +    /*
>> +     * pass the index/fd binding to the kernel driver so that it
>> +     * triggers this fd on HW IRQ
>> +     */
>> +    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
>> +    g_free(irq_set);
>> +    if (ret) {
>> +        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
>> +        qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
>> +        event_notifier_cleanup(&intp->interrupt);
>> +        return -errno;
>> +    }
>> +
>> +    /* store the new intp in qlist */
>> +    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
>> +    return 0;
>> +}
>> +
>> +static int vfio_populate_interrupts(VFIODevice *vbasedev)
>> +{
>> +    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
>> +    int i, ret;
>> +    VFIOPlatformDevice *vdev =
>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>> +
>> +    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
>> +                                    vfio_intp_mmap_enable, vdev);
>> +
>> +    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
>> +
>> +    for (i = 0; i < vbasedev->num_irqs; i++) {
>> +        irq.index = i;
>> +
>> +        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
>> +
>> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
>> +        if (ret) {
>> +            /* This can fail for an old kernel or legacy PCI dev */
>> +            error_printf("vfio: error getting device %s irq info",
>> +                         vbasedev->name);
> 
> Strange comment for a platform device.  On PCI this comment only applied
> to the virtual error IRQ since it may or may not be supported per
> device.  For PCI, the number of IRQs and regions is really more of a
> highest index, so it can be sparsely populated.  We know about the error
> IRQ, so probe for it, but it may not be present.  Likewise, we know
> about the VGA region, but it may not be supported by this device and
> will return error on the info call.

Hi Alex,

Thanks for explaining the history behind that comment. I will treat a
failure of the VFIO_DEVICE_GET_IRQ_INFO call as an error then.
> 
>> +        } else {
>> +            DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
>> +                    irq.index, irq.count, irq.flags);
>> +
>> +            ret = vfio_enable_intp(vbasedev, irq.index);
>> +            if (ret) {
>> +                error_report("vfio: Error setting IRQ %d up", i);
>> +                return ret;
>> +            }
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>> +static VFIODeviceOps vfio_platform_ops = {
>> +    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
>> +    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
>> +    .vfio_eoi = vfio_platform_eoi,
>> +    .vfio_check_device = vfio_platform_check_device,
>> +    .vfio_populate_regions = vfio_populate_regions,
>> +    .vfio_populate_interrupts = vfio_populate_interrupts,
>> +};
>> +
>> +static int vfio_base_device_init(VFIODevice *vbasedev)
>> +{
>> +    VFIOGroup *group;
>> +    VFIODevice *vbasedev_iter;
>> +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
>> +    ssize_t len;
>> +    struct stat st;
>> +    int groupid;
>> +    int ret;
>> +
>> +    /* name must be set prior to the call */
>> +    if (!vbasedev->name) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    /* Check that the host device exists */
>> +    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
>> +             vbasedev->name);
>> +
>> +    if (stat(path, &st) < 0) {
>> +        error_report("vfio: error: no such host device: %s", path);
>> +        return -errno;
>> +    }
>> +
>> +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
>> +    len = readlink(path, iommu_group_path, sizeof(path));
>> +    if (len <= 0 || len >= sizeof(path)) {
>> +        error_report("vfio: error no iommu_group for device");
>> +        return len < 0 ? -errno : ENAMETOOLONG;
>> +    }
>> +
>> +    iommu_group_path[len] = 0;
>> +    group_name = basename(iommu_group_path);
>> +
>> +    if (sscanf(group_name, "%d", &groupid) != 1) {
>> +        error_report("vfio: error reading %s: %m", path);
>> +        return -errno;
>> +    }
>> +
>> +    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
>> +
>> +    group = vfio_get_group(groupid, &address_space_memory);
>> +    if (!group) {
>> +        error_report("vfio: failed to get group %d", groupid);
>> +        return -ENOENT;
>> +    }
>> +
>> +    snprintf(path, sizeof(path), "%s", vbasedev->name);
>> +
>> +    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>> +        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
>> +            error_report("vfio: error: device %s is already attached", path);
>> +            vfio_put_group(group);
>> +            return -EBUSY;
>> +        }
>> +    }
>> +    ret = vfio_get_device(group, path, vbasedev);
>> +    if (ret) {
>> +        error_report("vfio: failed to get device %s", path);
>> +        vfio_put_group(group);
>> +    }
>> + return ret;
>> +}
>> +
>> +void vfio_put_device(VFIOPlatformDevice *vdev)
>> +{
>> +    unsigned int i;
>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>> +
>> +    for (i = 0; i < vbasedev->num_regions; i++) {
>> +            g_free(vdev->regions[i]);
>> +    }
>> +    g_free(vdev->regions);
>> +    g_free(vdev->vbasedev.name);
>> +    vfio_put_base_device(&vdev->vbasedev);
>> +}
>> +
>> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
>> +{
>> +    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
>> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>> +    int i, ret;
>> +
>> +    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
>> +    vbasedev->ops = &vfio_platform_ops;
>> +
>> +    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
>> +
>> +    ret = vfio_base_device_init(vbasedev);
>> +    if (ret) {
>> +        return;
>> +    }
>> +
>> +    for (i = 0; i < vbasedev->num_regions; i++) {
>> +        vfio_map_region(vdev, i);
>> +        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
>> +    }
>> +}
>> +
>> +static const VMStateDescription vfio_platform_vmstate = {
>> +    .name = TYPE_VFIO_PLATFORM,
>> +    .unmigratable = 1,
>> +};
>> +
>> +static Property vfio_platform_dev_properties[] = {
>> +    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),
> 
> Hmm, is this really a good name for this option?  "host" would give you
> some consistency with vfio-pci.
ok
> 
>> +    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
>> +    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
>> +                       mmap_timeout, 1100),
>> +    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),
> 
> Should some of these be x- options or do you plan to support them long
> term and support users twiddling them?
- compat should disappear if we turn the vfio-platform class into an
abstract class
- irqfd is currently here for testing
- mmap-timeout-ms will stay
> 
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +
>> +static void vfio_platform_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +
>> +    dc->realize = vfio_platform_realize;
>> +    dc->props = vfio_platform_dev_properties;
>> +    dc->vmsd = &vfio_platform_vmstate;
>> +    dc->desc = "VFIO-based platform device assignment";
>> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>> +}
>> +
>> +static const TypeInfo vfio_platform_dev_info = {
>> +    .name = TYPE_VFIO_PLATFORM,
>> +    .parent = TYPE_SYS_BUS_DEVICE,
>> +    .instance_size = sizeof(VFIOPlatformDevice),
>> +    .class_init = vfio_platform_class_init,
>> +    .class_size = sizeof(VFIOPlatformDeviceClass),
>> +};
>> +
>> +static void register_vfio_platform_dev_type(void)
>> +{
>> +    type_register_static(&vfio_platform_dev_info);
>> +}
>> +
>> +type_init(register_vfio_platform_dev_type)
>> diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h
>> new file mode 100644
>> index 0000000..1ee072a
>> --- /dev/null
>> +++ b/include/hw/vfio/vfio-platform.h
>> @@ -0,0 +1,77 @@
>> +/*
>> + * vfio based device assignment support - platform devices
>> + *
>> + * Copyright Linaro Limited, 2014
>> + *
>> + * Authors:
>> + *  Kim Phillips <kim.phillips@linaro.org>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Based on vfio based PCI device assignment support:
>> + *  Copyright Red Hat, Inc. 2012
>> + */
>> +
>> +#ifndef HW_VFIO_VFIO_PLATFORM_H
>> +#define HW_VFIO_VFIO_PLATFORM_H
>> +
>> +#include "hw/sysbus.h"
>> +#include "hw/vfio/vfio-common.h"
>> +#include "qemu/event_notifier.h"
>> +#include "qemu/queue.h"
>> +#include "hw/irq.h"
>> +
>> +#define TYPE_VFIO_PLATFORM "vfio-platform"
>> +
>> +enum {
>> +    VFIO_IRQ_INACTIVE = 0,
>> +    VFIO_IRQ_PENDING = 1,
>> +    VFIO_IRQ_ACTIVE = 2,
>> +    /* VFIO_IRQ_ACTIVE_AND_PENDING cannot happen with VFIO */
>> +};
>> +
>> +typedef struct VFIOINTp {
>> +    QLIST_ENTRY(VFIOINTp) next; /* entry for IRQ list */
>> +    QSIMPLEQ_ENTRY(VFIOINTp) pqnext; /* entry for pending IRQ queue */
>> +    EventNotifier interrupt; /* eventfd triggered on interrupt */
>> +    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
>> +    qemu_irq qemuirq;
>> +    struct VFIOPlatformDevice *vdev; /* back pointer to device */
>> +    int state; /* inactive, pending, active */
>> +    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
>> +    uint8_t pin; /* index */
>> +    uint8_t virtualID; /* virtual IRQ */
>> +} VFIOINTp;
>> +
>> +typedef struct VFIOPlatformDevice {
>> +    SysBusDevice sbdev;
>> +    VFIODevice vbasedev; /* not a QOM object */
>> +    VFIORegion **regions;
>> +    QLIST_HEAD(, VFIOINTp) intp_list; /* list of IRQ */
>> +    /* queue of pending IRQ */
>> +    QSIMPLEQ_HEAD(pending_intp_queue, VFIOINTp) pending_intp_queue;
>> +    char *compat; /* compatibility string */
>> +    bool irqfd_allowed;
>> +    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
>> +    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
>> +} VFIOPlatformDevice;
>> +
>> +
>> +typedef struct VFIOPlatformDeviceClass {
>> +    /*< private >*/
>> +    SysBusDeviceClass parent_class;
>> +    /*< public >*/
>> +} VFIOPlatformDeviceClass;
>> +
>> +#define VFIO_PLATFORM_DEVICE(obj) \
>> +     OBJECT_CHECK(VFIOPlatformDevice, (obj), TYPE_VFIO_PLATFORM)
>> +#define VFIO_PLATFORM_DEVICE_CLASS(klass) \
>> +     OBJECT_CLASS_CHECK(VFIOPlatformDeviceClass, (klass), TYPE_VFIO_PLATFORM)
>> +#define VFIO_PLATFORM_DEVICE_GET_CLASS(obj) \
>> +     OBJECT_GET_CLASS(VFIOPlatformDeviceClass, (obj), TYPE_VFIO_PLATFORM)
>> +
>> +void vfio_intp_interrupt(void *opaque);
>> +void vfio_setup_irqfd(SysBusDevice *dev, int index, int virq);
Indeed, this belongs in the irqfd patch file!

Thanks

Best Regards

Eric
> 
> This was never defined.  Thanks,
> 
> Alex
>
Bharat.Bhushan@freescale.com Aug. 12, 2014, 7:59 a.m. UTC | #4
> -----Original Message-----
> From: Alexander Graf [mailto:agraf@suse.de]
> Sent: Monday, August 11, 2014 3:06 PM
> To: Eric Auger; eric.auger@st.com; christoffer.dall@linaro.org; qemu-
> devel@nongnu.org; Phillips Kim-R1AAHA; a.rigo@virtualopensystems.com
> Cc: will.deacon@arm.com; kvmarm@lists.cs.columbia.edu;
> alex.williamson@redhat.com; Bhushan Bharat-R65777; peter.maydell@linaro.org;
> Yoder Stuart-B08248; a.motakis@virtualopensystems.com; patches@linaro.org;
> joel.schopp@amd.com; Kim Phillips
> Subject: Re: [PATCH v5 07/10] hw/vfio/platform: add vfio-platform support
> 
> 
> On 09.08.14 16:25, Eric Auger wrote:
> > Minimal VFIO platform implementation supporting
> > - register space user mapping,
> > - IRQ assignment based on eventfds handled on qemu side.
> >
> > irqfd kernel acceleration comes in a subsequent patch.
> >
> > Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
> > Signed-off-by: Eric Auger <eric.auger@linaro.org>
> >
> > ---
> >
> > v4 -> v5:
> > - vfio-plaform.h included first
> > - cleanup error handling in *populate*, vfio_get_device,
> >    vfio_enable_intp
> > - vfio_put_device not called anymore
> > - add some includes to follow vfio policy
> >
> > v3 -> v4:
> > [Eric Auger]
> > - merge of "vfio: Add initial IRQ support in platform device"
> >    to get a full functional patch although perfs are limited.
> > - removal of unrealize function since I currently understand
> >    it is only used with device hot-plug feature.
> >
> > v2 -> v3:
> > [Eric Auger]
> > - further factorization between PCI and platform (VFIORegion,
> >    VFIODevice). same level of functionality.
> >
> > <= v2:
> > [Kim Philipps]
> > - Initial Creation of the device supporting register space mapping
> > ---
> >   hw/vfio/Makefile.objs           |   1 +
> >   hw/vfio/platform.c              | 517
> ++++++++++++++++++++++++++++++++++++++++
> >   include/hw/vfio/vfio-platform.h |  77 ++++++
> >   3 files changed, 595 insertions(+)
> >   create mode 100644 hw/vfio/platform.c
> >   create mode 100644 include/hw/vfio/vfio-platform.h
> >
> > diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> > index e31f30e..c5c76fe 100644
> > --- a/hw/vfio/Makefile.objs
> > +++ b/hw/vfio/Makefile.objs
> > @@ -1,4 +1,5 @@
> >   ifeq ($(CONFIG_LINUX), y)
> >   obj-$(CONFIG_SOFTMMU) += common.o
> >   obj-$(CONFIG_PCI) += pci.o
> > +obj-$(CONFIG_SOFTMMU) += platform.o
> >   endif
> > diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> > new file mode 100644
> > index 0000000..f1a1b55
> > --- /dev/null
> > +++ b/hw/vfio/platform.c
> > @@ -0,0 +1,517 @@
> > +/*
> > + * vfio based device assignment support - platform devices
> > + *
> > + * Copyright Linaro Limited, 2014
> > + *
> > + * Authors:
> > + *  Kim Phillips <kim.phillips@linaro.org>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Based on vfio based PCI device assignment support:
> > + *  Copyright Red Hat, Inc. 2012
> > + */
> > +
> > +#include <linux/vfio.h>
> > +#include <sys/ioctl.h>
> > +
> > +#include "hw/vfio/vfio-platform.h"
> > +#include "qemu/error-report.h"
> > +#include "qemu/range.h"
> > +#include "sysemu/sysemu.h"
> > +#include "exec/memory.h"
> > +#include "qemu/queue.h"
> > +#include "hw/sysbus.h"
> > +
> > +extern const MemoryRegionOps vfio_region_ops;
> > +extern const MemoryListener vfio_memory_listener;
> > +extern QLIST_HEAD(, VFIOGroup) group_list;
> > +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
> > +void vfio_put_device(VFIOPlatformDevice *vdev);
> > +
> > +/*
> > + * It is mandatory to pass a VFIOPlatformDevice since VFIODevice
> > + * is not a QOM Object and cannot be passed to memory region functions
> > +*/
> > +static void vfio_map_region(VFIOPlatformDevice *vdev, int nr)
> > +{
> > +    VFIORegion *region = vdev->regions[nr];
> > +    unsigned size = region->size;
> > +    char name[64];
> > +
> > +    if (!size) {
> > +        return;
> > +    }
> > +
> > +    snprintf(name, sizeof(name), "VFIO %s region %d",
> > +             vdev->vbasedev.name, nr);
> > +
> > +    /* A "slow" read/write mapping underlies all regions */
> > +    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
> > +                          region, name, size);
> > +
> > +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
> > +
> > +    if (vfio_mmap_region(OBJECT(vdev), region, &region->mem,
> > +                         &region->mmap_mem, &region->mmap, size, 0, name)) {
> > +        error_report("%s unsupported. Performance may be slow", name);
> > +    }
> > +}
> > +
> > +static void print_regions(VFIOPlatformDevice *vdev)
> > +{
> > +    int i;
> > +
> > +    DPRINTF("Device \"%s\" counts %d region(s):\n",
> > +             vdev->vbasedev.name, vdev->vbasedev.num_regions);
> > +
> > +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
> > +        DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, "
> > +                "fd= %d, offset = 0x%lx\n",
> > +                vdev->regions[i]->nr,
> > +                (unsigned long)vdev->regions[i]->flags,
> > +                (unsigned long)vdev->regions[i]->size,
> > +                vdev->regions[i]->vbasedev->fd,
> > +                (unsigned long)vdev->regions[i]->fd_offset);
> > +    }
> > +}
> > +
> > +static int vfio_populate_regions(VFIODevice *vbasedev)
> > +{
> > +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> > +    int i, ret = 0;
> > +    VFIOPlatformDevice *vdev =
> > +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> > +
> > +    vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions);
> > +
> > +    for (i = 0; i < vbasedev->num_regions; i++) {
> > +        vdev->regions[i] = g_malloc0(sizeof(VFIORegion));
> > +        reg_info.index = i;
> > +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> > +        if (ret) {
> > +            error_report("vfio: Error getting region %d info: %m", i);
> > +            goto error;
> > +        }
> > +
> > +        vdev->regions[i]->flags = reg_info.flags;
> > +        vdev->regions[i]->size = reg_info.size;
> > +        vdev->regions[i]->fd_offset = reg_info.offset;
> > +        vdev->regions[i]->nr = i;
> > +        vdev->regions[i]->vbasedev = vbasedev;
> > +    }
> > +    print_regions(vdev);
> > +error:
> > +    return ret;
> > +}
> > +
> > +/* not implemented yet */
> > +static int vfio_platform_check_device(VFIODevice *vdev)
> > +{
> > +    return 0;
> > +}
> > +
> > +/* not implemented yet */
> > +static bool vfio_platform_compute_needs_reset(VFIODevice *vdev)
> > +{
> > +return false;
> > +}
> > +
> > +/* not implemented yet */
> > +static int vfio_platform_hot_reset_multi(VFIODevice *vdev)
> > +{
> > +return 0;
> > +}
> > +
> > +/*
> > + * eoi function is called on the first access to any MMIO region
> > + * after an IRQ was triggered. It is assumed this access corresponds
> > + * to the IRQ status register reset.
> > + * With such a mechanism, a single IRQ can be handled at a time since
> > + * there is no way to know which IRQ was completed by the guest.
> > + * (we would need additional details about the IRQ status register mask)
> > + */
> > +static void vfio_platform_eoi(VFIODevice *vbasedev)
> > +{
> > +    VFIOINTp *intp;
> > +    VFIOPlatformDevice *vdev =
> > +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> > +
> > +    QLIST_FOREACH(intp, &vdev->intp_list, next) {
> > +        if (intp->state == VFIO_IRQ_ACTIVE) {
> > +            DPRINTF("EOI IRQ #%d fd=%d\n",
> > +                    intp->pin, event_notifier_get_fd(&intp->interrupt));
> > +            intp->state = VFIO_IRQ_INACTIVE;
> > +
> > +            /* deassert the virtual IRQ and unmask physical one */
> > +            qemu_set_irq(intp->qemuirq, 0);
> > +            vfio_unmask_irqindex(vbasedev, intp->pin);
> > +
> > +            /* a single IRQ can be active at a time */
> > +            break;
> > +        }
> > +    }
> > +
> > +    /* in case there are pending IRQs, handle them one at a time */
> > +    if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
> > +        intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
> > +        vfio_intp_interrupt(intp);

We are calling vfio_intp_interrupt() with the physical interrupt enabled, while a comment in vfio_intp_interrupt() says the physical interrupt is disabled by VFIO.

> > +        QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
> > +    }
> > +}
> > +
> > +/*
> > + * enable/disable the fast path mode
> > + * fast path = MMIO region is mmaped (no KVM TRAP)
> > + * slow path = MMIO region is trapped and region callbacks are called
> > + * slow path enables to trap the IRQ status register guest reset
> > +*/
> > +
> > +static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
> > +{
> > +    VFIORegion *region;
> > +    int i;
> > +
> > +    DPRINTF("fast path = %d\n", enabled);
> > +
> > +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
> > +        region = vdev->regions[i];
> > +
> > +        /* register space is unmapped to trap EOI */
> > +        memory_region_set_enabled(&region->mmap_mem, enabled);
> > +    }
> > +}
> > +
> > +/*
> > + * Checks whether the IRQ is still pending. In the negative
> > + * the fast path mode (where reg space is mmaped) can be restored.
> > + * if the IRQ is still pending, we must keep on trapping IRQ status
> > + * register reset with mmap disabled (slow path).
> > + * the function is called on mmap_timer event.
> > + * by construction a single fd is handled at a time. See EOI comment
> > + * for additional details.
> > + */
> > +static void vfio_intp_mmap_enable(void *opaque)
> > +{
> > +    VFIOINTp *tmp;
> > +    VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
> > +
> > +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
> > +        if (tmp->state == VFIO_IRQ_ACTIVE) {
> > +            DPRINTF("IRQ #%d still active, stay in slow path\n",
> > +                    tmp->pin);
> > +            timer_mod(vdev->mmap_timer,
> > +                      qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > +                          vdev->mmap_timeout);
> > +            return;
> > +        }
> > +    }
> > +    DPRINTF("no active IRQ, restore fast path\n");
> > +    vfio_mmap_set_enabled(vdev, true);
> > +}
> > +
> > +/*
> > + * The fd handler
> > + */
> > +void vfio_intp_interrupt(void *opaque)
> > +{
> > +    int ret;
> > +    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
> > +    VFIOPlatformDevice *vdev = intp->vdev;
> > +    bool one_active_irq = false;
> > +
> > +    /*
> > +     * first check whether there is a pending IRQ
> > +     * in the positive the new IRQ cannot be handled until the
> > +     * active one is not completed.
> > +     * by construction the same IRQ as the pending one cannot hit
> > +     * since the physical IRQ was disabled by the VFIO driver
> > +     */

Here we assume the physical interrupt is disabled.

> > +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
> > +        if (tmp->state == VFIO_IRQ_ACTIVE) {
> > +            one_active_irq = true;
> > +            break;
> > +        }
> > +    }
> > +    if (one_active_irq) {
> > +        /*
> > +         * the new IRQ gets a pending status and is pushed in
> > +         * the pending queue
> > +         */
> > +        intp->state = VFIO_IRQ_PENDING;
> > +        QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
> > +                             intp, pqnext);
> > +        return;
> > +    }
> > +
> > +    /* no active IRQ, the new IRQ can be forwarded to the guest */
> > +    DPRINTF("Handle IRQ #%d (fd = %d)\n",
> > +            intp->pin, event_notifier_get_fd(&intp->interrupt));
> > +
> > +    ret = event_notifier_test_and_clear(&intp->interrupt);
> > +    if (!ret) {
> > +        DPRINTF("Error when clearing fd=%d\n",
> > +                event_notifier_get_fd(&intp->interrupt));
> > +    }
> > +
> > +    intp->state = VFIO_IRQ_ACTIVE;
> > +
> > +    /* sets slow path */
> > +    vfio_mmap_set_enabled(vdev, false);
> > +
> > +    /* trigger the virtual IRQ */
> > +    qemu_set_irq(intp->qemuirq, 1);
> > +
> > +    /* schedule the mmap timer which will restore mmap path after EOI*/
> > +    if (vdev->mmap_timeout) {
> > +        timer_mod(vdev->mmap_timer,
> > +                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
> > +                      vdev->mmap_timeout);
> > +    }
> > +}
> > +
> > +static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
> > +{
> > +    struct vfio_irq_set *irq_set;
> > +    int32_t *pfd;
> > +    int ret, argsz;
> > +    int device = vbasedev->fd;
> > +    VFIOPlatformDevice *vdev =
> > +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> > +    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
> > +    VFIOINTp *intp;
> > +
> > +    /* allocate and populate a new VFIOINTp structure put in a queue list */
> > +    intp = g_malloc0(sizeof(*intp));
> > +    intp->vdev = vdev;
> > +    intp->pin = index;
> > +    intp->state = VFIO_IRQ_INACTIVE;
> > +    sysbus_init_irq(sbdev, &intp->qemuirq);
> > +
> > +    ret = event_notifier_init(&intp->interrupt, 0);
> > +    if (ret) {
> > +        g_free(intp);
> > +        error_report("vfio: Error: event_notifier_init failed ");
> > +        return ret;
> > +    }
> > +
> > +    /* build the irq_set to be passed to the vfio kernel driver */
> > +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> > +
> > +    irq_set = g_malloc0(argsz);
> > +    irq_set->argsz = argsz;
> > +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
> > +    irq_set->index = index;
> > +    irq_set->start = 0;
> > +    irq_set->count = 1;
> > +    pfd = (int32_t *)&irq_set->data;
> > +
> > +    *pfd = event_notifier_get_fd(&intp->interrupt);
> > +
> > +    DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index);
> > +
> > +    qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp);
> > +
> > +    /*
> > +     * pass the index/fd binding to the kernel driver so that it
> > +     * triggers this fd on HW IRQ
> > +     */
> > +    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
> > +    g_free(irq_set);
> > +    if (ret) {
> > +        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
> > +        qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
> > +        event_notifier_cleanup(&intp->interrupt);
> > +        return -errno;
> > +    }
> > +
> > +    /* store the new intp in qlist */
> > +    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
> > +    return 0;
> > +}
> > +
> > +static int vfio_populate_interrupts(VFIODevice *vbasedev)
> > +{
> > +    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
> > +    int i, ret;
> > +    VFIOPlatformDevice *vdev =
> > +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
> > +
> > +    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
> > +                                    vfio_intp_mmap_enable, vdev);
> > +
> > +    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
> > +
> > +    for (i = 0; i < vbasedev->num_irqs; i++) {
> > +        irq.index = i;
> > +
> > +        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
> > +
> > +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
> > +        if (ret) {
> > +            /* This can fail for an old kernel or legacy PCI dev */
> > +            error_printf("vfio: error getting device %s irq info",
> > +                         vbasedev->name);
> > +        } else {
> > +            DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
> > +                    irq.index, irq.count, irq.flags);
> > +
> > +            ret = vfio_enable_intp(vbasedev, irq.index);
> > +            if (ret) {
> > +                error_report("vfio: Error setting IRQ %d up", i);
> > +                return ret;
> > +            }
> > +        }
> > +    }
> > +    return 0;
> > +}
> > +
> > +static VFIODeviceOps vfio_platform_ops = {
> > +    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
> > +    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
> > +    .vfio_eoi = vfio_platform_eoi,
> > +    .vfio_check_device = vfio_platform_check_device,
> > +    .vfio_populate_regions = vfio_populate_regions,
> > +    .vfio_populate_interrupts = vfio_populate_interrupts,
> > +};
> > +
> > +static int vfio_base_device_init(VFIODevice *vbasedev)
> > +{
> > +    VFIOGroup *group;
> > +    VFIODevice *vbasedev_iter;
> > +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> > +    ssize_t len;
> > +    struct stat st;
> > +    int groupid;
> > +    int ret;
> > +
> > +    /* name must be set prior to the call */
> > +    if (!vbasedev->name) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    /* Check that the host device exists */
> > +    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
> > +             vbasedev->name);
> > +
> > +    if (stat(path, &st) < 0) {
> > +        error_report("vfio: error: no such host device: %s", path);
> > +        return -errno;
> > +    }
> > +
> > +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> > +    len = readlink(path, iommu_group_path, sizeof(path));
> > +    if (len <= 0 || len >= sizeof(path)) {
> > +        error_report("vfio: error no iommu_group for device");
> > +        return len < 0 ? -errno : ENAMETOOLONG;
> > +    }
> > +
> > +    iommu_group_path[len] = 0;
> > +    group_name = basename(iommu_group_path);
> > +
> > +    if (sscanf(group_name, "%d", &groupid) != 1) {
> > +        error_report("vfio: error reading %s: %m", path);
> > +        return -errno;
> > +    }
> > +
> > +    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
> > +
> > +    group = vfio_get_group(groupid, &address_space_memory);
> > +    if (!group) {
> > +        error_report("vfio: failed to get group %d", groupid);
> > +        return -ENOENT;
> > +    }
> > +
> > +    snprintf(path, sizeof(path), "%s", vbasedev->name);
> > +
> > +    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> > +        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
> > +            error_report("vfio: error: device %s is already attached", path);
> > +            vfio_put_group(group);
> > +            return -EBUSY;
> > +        }
> > +    }
> > +    ret = vfio_get_device(group, path, vbasedev);
> > +    if (ret) {
> > +        error_report("vfio: failed to get device %s", path);
> > +        vfio_put_group(group);
> > +    }
> > + return ret;
> > +}
> > +
> > +void vfio_put_device(VFIOPlatformDevice *vdev)
> > +{
> > +    unsigned int i;
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +
> > +    for (i = 0; i < vbasedev->num_regions; i++) {
> > +            g_free(vdev->regions[i]);
> > +    }
> > +    g_free(vdev->regions);
> > +    g_free(vdev->vbasedev.name);
> > +    vfio_put_base_device(&vdev->vbasedev);
> > +}
> > +
> > +static void vfio_platform_realize(DeviceState *dev, Error **errp)
> > +{
> > +    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
> > +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    int i, ret;
> > +
> > +    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
> > +    vbasedev->ops = &vfio_platform_ops;
> > +
> > +    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
> > +
> > +    ret = vfio_base_device_init(vbasedev);
> > +    if (ret) {
> > +        return;
> > +    }
> > +
> > +    for (i = 0; i < vbasedev->num_regions; i++) {
> > +        vfio_map_region(vdev, i);
> > +        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
> > +    }
> > +}
> > +
> > +static const VMStateDescription vfio_platform_vmstate = {
> > +    .name = TYPE_VFIO_PLATFORM,
> > +    .unmigratable = 1,
> > +};
> > +
> > +static Property vfio_platform_dev_properties[] = {
> > +    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),
> > +    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
> > +    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
> > +                       mmap_timeout, 1100),
> > +    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +static void vfio_platform_class_init(ObjectClass *klass, void *data)
> > +{
> > +    DeviceClass *dc = DEVICE_CLASS(klass);
> > +
> > +    dc->realize = vfio_platform_realize;
> > +    dc->props = vfio_platform_dev_properties;
> > +    dc->vmsd = &vfio_platform_vmstate;
> > +    dc->desc = "VFIO-based platform device assignment";
> > +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> > +}
> > +
> > +static const TypeInfo vfio_platform_dev_info = {
> > +    .name = TYPE_VFIO_PLATFORM,
> > +    .parent = TYPE_SYS_BUS_DEVICE,
> > +    .instance_size = sizeof(VFIOPlatformDevice),
> > +    .class_init = vfio_platform_class_init,
> > +    .class_size = sizeof(VFIOPlatformDeviceClass),
> 
> This should be an abstract class. People must never instantiate a
> generic "vfio-platform" device. Only "vfio-xgmac", "vfio-etsec", etc
> devices should be exposed to the user.
> 
> 
> Alex
Auger Eric Aug. 12, 2014, 4:34 p.m. UTC | #5
On 08/12/2014 09:59 AM, Bharat.Bhushan@freescale.com wrote:
> 
> 
>> -----Original Message-----
>> From: Alexander Graf [mailto:agraf@suse.de]
>> Sent: Monday, August 11, 2014 3:06 PM
>> To: Eric Auger; eric.auger@st.com; christoffer.dall@linaro.org; qemu-
>> devel@nongnu.org; Phillips Kim-R1AAHA; a.rigo@virtualopensystems.com
>> Cc: will.deacon@arm.com; kvmarm@lists.cs.columbia.edu;
>> alex.williamson@redhat.com; Bhushan Bharat-R65777; peter.maydell@linaro.org;
>> Yoder Stuart-B08248; a.motakis@virtualopensystems.com; patches@linaro.org;
>> joel.schopp@amd.com; Kim Phillips
>> Subject: Re: [PATCH v5 07/10] hw/vfio/platform: add vfio-platform support
>>
>>
>> On 09.08.14 16:25, Eric Auger wrote:
>>> Minimal VFIO platform implementation supporting
>>> - register space user mapping,
>>> - IRQ assignment based on eventfds handled on qemu side.
>>>
>>> irqfd kernel acceleration comes in a subsequent patch.
>>>
>>> Signed-off-by: Kim Phillips <kim.phillips@linaro.org>
>>> Signed-off-by: Eric Auger <eric.auger@linaro.org>
>>>
>>> ---
>>>
>>> v4 -> v5:
>>> - vfio-plaform.h included first
>>> - cleanup error handling in *populate*, vfio_get_device,
>>>    vfio_enable_intp
>>> - vfio_put_device not called anymore
>>> - add some includes to follow vfio policy
>>>
>>> v3 -> v4:
>>> [Eric Auger]
>>> - merge of "vfio: Add initial IRQ support in platform device"
>>>    to get a full functional patch although perfs are limited.
>>> - removal of unrealize function since I currently understand
>>>    it is only used with device hot-plug feature.
>>>
>>> v2 -> v3:
>>> [Eric Auger]
>>> - further factorization between PCI and platform (VFIORegion,
>>>    VFIODevice). same level of functionality.
>>>
>>> <= v2:
>>> [Kim Philipps]
>>> - Initial Creation of the device supporting register space mapping
>>> ---
>>>   hw/vfio/Makefile.objs           |   1 +
>>>   hw/vfio/platform.c              | 517
>> ++++++++++++++++++++++++++++++++++++++++
>>>   include/hw/vfio/vfio-platform.h |  77 ++++++
>>>   3 files changed, 595 insertions(+)
>>>   create mode 100644 hw/vfio/platform.c
>>>   create mode 100644 include/hw/vfio/vfio-platform.h
>>>
>>> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
>>> index e31f30e..c5c76fe 100644
>>> --- a/hw/vfio/Makefile.objs
>>> +++ b/hw/vfio/Makefile.objs
>>> @@ -1,4 +1,5 @@
>>>   ifeq ($(CONFIG_LINUX), y)
>>>   obj-$(CONFIG_SOFTMMU) += common.o
>>>   obj-$(CONFIG_PCI) += pci.o
>>> +obj-$(CONFIG_SOFTMMU) += platform.o
>>>   endif
>>> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
>>> new file mode 100644
>>> index 0000000..f1a1b55
>>> --- /dev/null
>>> +++ b/hw/vfio/platform.c
>>> @@ -0,0 +1,517 @@
>>> +/*
>>> + * vfio based device assignment support - platform devices
>>> + *
>>> + * Copyright Linaro Limited, 2014
>>> + *
>>> + * Authors:
>>> + *  Kim Phillips <kim.phillips@linaro.org>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>>> + * the COPYING file in the top-level directory.
>>> + *
>>> + * Based on vfio based PCI device assignment support:
>>> + *  Copyright Red Hat, Inc. 2012
>>> + */
>>> +
>>> +#include <linux/vfio.h>
>>> +#include <sys/ioctl.h>
>>> +
>>> +#include "hw/vfio/vfio-platform.h"
>>> +#include "qemu/error-report.h"
>>> +#include "qemu/range.h"
>>> +#include "sysemu/sysemu.h"
>>> +#include "exec/memory.h"
>>> +#include "qemu/queue.h"
>>> +#include "hw/sysbus.h"
>>> +
>>> +extern const MemoryRegionOps vfio_region_ops;
>>> +extern const MemoryListener vfio_memory_listener;
>>> +extern QLIST_HEAD(, VFIOGroup) group_list;
>>> +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
>>> +void vfio_put_device(VFIOPlatformDevice *vdev);
>>> +
>>> +/*
>>> + * It is mandatory to pass a VFIOPlatformDevice since VFIODevice
>>> + * is not a QOM Object and cannot be passed to memory region functions
>>> +*/
>>> +static void vfio_map_region(VFIOPlatformDevice *vdev, int nr)
>>> +{
>>> +    VFIORegion *region = vdev->regions[nr];
>>> +    unsigned size = region->size;
>>> +    char name[64];
>>> +
>>> +    if (!size) {
>>> +        return;
>>> +    }
>>> +
>>> +    snprintf(name, sizeof(name), "VFIO %s region %d",
>>> +             vdev->vbasedev.name, nr);
>>> +
>>> +    /* A "slow" read/write mapping underlies all regions */
>>> +    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
>>> +                          region, name, size);
>>> +
>>> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
>>> +
>>> +    if (vfio_mmap_region(OBJECT(vdev), region, &region->mem,
>>> +                         &region->mmap_mem, &region->mmap, size, 0, name)) {
>>> +        error_report("%s unsupported. Performance may be slow", name);
>>> +    }
>>> +}
>>> +
>>> +static void print_regions(VFIOPlatformDevice *vdev)
>>> +{
>>> +    int i;
>>> +
>>> +    DPRINTF("Device \"%s\" counts %d region(s):\n",
>>> +             vdev->vbasedev.name, vdev->vbasedev.num_regions);
>>> +
>>> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
>>> +        DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, "
>>> +                "fd= %d, offset = 0x%lx\n",
>>> +                vdev->regions[i]->nr,
>>> +                (unsigned long)vdev->regions[i]->flags,
>>> +                (unsigned long)vdev->regions[i]->size,
>>> +                vdev->regions[i]->vbasedev->fd,
>>> +                (unsigned long)vdev->regions[i]->fd_offset);
>>> +    }
>>> +}
>>> +
>>> +static int vfio_populate_regions(VFIODevice *vbasedev)
>>> +{
>>> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
>>> +    int i, ret = 0;
>>> +    VFIOPlatformDevice *vdev =
>>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>>> +
>>> +    vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions);
>>> +
>>> +    for (i = 0; i < vbasedev->num_regions; i++) {
>>> +        vdev->regions[i] = g_malloc0(sizeof(VFIORegion));
>>> +        reg_info.index = i;
>>> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
>>> +        if (ret) {
>>> +            error_report("vfio: Error getting region %d info: %m", i);
>>> +            goto error;
>>> +        }
>>> +
>>> +        vdev->regions[i]->flags = reg_info.flags;
>>> +        vdev->regions[i]->size = reg_info.size;
>>> +        vdev->regions[i]->fd_offset = reg_info.offset;
>>> +        vdev->regions[i]->nr = i;
>>> +        vdev->regions[i]->vbasedev = vbasedev;
>>> +    }
>>> +    print_regions(vdev);
>>> +error:
>>> +    return ret;
>>> +}
>>> +
>>> +/* not implemented yet */
>>> +static int vfio_platform_check_device(VFIODevice *vdev)
>>> +{
>>> +    return 0;
>>> +}
>>> +
>>> +/* not implemented yet */
>>> +static bool vfio_platform_compute_needs_reset(VFIODevice *vdev)
>>> +{
>>> +return false;
>>> +}
>>> +
>>> +/* not implemented yet */
>>> +static int vfio_platform_hot_reset_multi(VFIODevice *vdev)
>>> +{
>>> +return 0;
>>> +}
>>> +
>>> +/*
>>> + * eoi function is called on the first access to any MMIO region
>>> + * after an IRQ was triggered. It is assumed this access corresponds
>>> + * to the IRQ status register reset.
>>> + * With such a mechanism, a single IRQ can be handled at a time since
>>> + * there is no way to know which IRQ was completed by the guest.
>>> + * (we would need additional details about the IRQ status register mask)
>>> + */
>>> +static void vfio_platform_eoi(VFIODevice *vbasedev)
>>> +{
>>> +    VFIOINTp *intp;
>>> +    VFIOPlatformDevice *vdev =
>>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>>> +
>>> +    QLIST_FOREACH(intp, &vdev->intp_list, next) {
>>> +        if (intp->state == VFIO_IRQ_ACTIVE) {
>>> +            DPRINTF("EOI IRQ #%d fd=%d\n",
>>> +                    intp->pin, event_notifier_get_fd(&intp->interrupt));
>>> +            intp->state = VFIO_IRQ_INACTIVE;
>>> +
>>> +            /* deassert the virtual IRQ and unmask physical one */
>>> +            qemu_set_irq(intp->qemuirq, 0);
>>> +            vfio_unmask_irqindex(vbasedev, intp->pin);
>>> +
>>> +            /* a single IRQ can be active at a time */
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    /* in case there are pending IRQs, handle them one at a time */
>>> +    if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
>>> +        intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
>>> +        vfio_intp_interrupt(intp);
> 
> We are calling vfio_intp_interrupt() with physical interrupt enabled, while there is a comment in vfio_intp_interrupt() which says physical interrupt is disabled by VFIO.
Hi Bharat,

What I wanted to say is that vfio_intp_interrupt cannot be called several
times, on the same IRQ, from the eventfd handler while this IRQ is
pending/active (because VFIO driver unmasked that IRQ). I also call
vfio_intp_interrupt for handling pending IRQs - those that hit while the
virtual IRQ was active -, from the MMIO handler. Nevertheless, after a more
careful review, I foresee 2 problems:
- a lock is needed in vfio_intp_interrupt. It can be called from the eventfd
handler and from the MMIO handler. I am not sure about the threading
model but I guess both can run concurrently, with the risk that several IRQs
become active at the same time, which is wrong (no way to detect which one
completes).
- I should not handle a new IRQ before the pending ones have been handled.

I do not know whether multiple IRQ handling without irqfd support makes
much sense. However I need to fix that + clarify my comment indeed.

Thanks

Best Regards

Eric
> 
>>> +        QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
>>> +    }
>>> +}
>>> +
>>> +/*
>>> + * enable/disable the fast path mode
>>> + * fast path = MMIO region is mmaped (no KVM TRAP)
>>> + * slow path = MMIO region is trapped and region callbacks are called
>>> + * slow path enables to trap the IRQ status register guest reset
>>> +*/
>>> +
>>> +static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
>>> +{
>>> +    VFIORegion *region;
>>> +    int i;
>>> +
>>> +    DPRINTF("fast path = %d\n", enabled);
>>> +
>>> +    for (i = 0; i < vdev->vbasedev.num_regions; i++) {
>>> +        region = vdev->regions[i];
>>> +
>>> +        /* register space is unmapped to trap EOI */
>>> +        memory_region_set_enabled(&region->mmap_mem, enabled);
>>> +    }
>>> +}
>>> +
>>> +/*
>>> + * Checks whether the IRQ is still pending. In the negative
>>> + * the fast path mode (where reg space is mmaped) can be restored.
>>> + * if the IRQ is still pending, we must keep on trapping IRQ status
>>> + * register reset with mmap disabled (slow path).
>>> + * the function is called on mmap_timer event.
>>> + * by construction a single fd is handled at a time. See EOI comment
>>> + * for additional details.
>>> + */
>>> +static void vfio_intp_mmap_enable(void *opaque)
>>> +{
>>> +    VFIOINTp *tmp;
>>> +    VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
>>> +
>>> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
>>> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
>>> +            DPRINTF("IRQ #%d still active, stay in slow path\n",
>>> +                    tmp->pin);
>>> +            timer_mod(vdev->mmap_timer,
>>> +                      qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
>>> +                          vdev->mmap_timeout);
>>> +            return;
>>> +        }
>>> +    }
>>> +    DPRINTF("no active IRQ, restore fast path\n");
>>> +    vfio_mmap_set_enabled(vdev, true);
>>> +}
>>> +
>>> +/*
>>> + * The fd handler
>>> + */
>>> +void vfio_intp_interrupt(void *opaque)
>>> +{
>>> +    int ret;
>>> +    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
>>> +    VFIOPlatformDevice *vdev = intp->vdev;
>>> +    bool one_active_irq = false;
>>> +
>>> +    /*
>>> +     * first check whether there is a pending IRQ
>>> +     * in the positive the new IRQ cannot be handled until the
>>> +     * active one is not completed.
>>> +     * by construction the same IRQ as the pending one cannot hit
>>> +     * since the physical IRQ was disabled by the VFIO driver
>>> +     */
> 
> Here we assume physical interrupt disabled.
> 
>>> +    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
>>> +        if (tmp->state == VFIO_IRQ_ACTIVE) {
>>> +            one_active_irq = true;
>>> +            break;
>>> +        }
>>> +    }
>>> +    if (one_active_irq) {
>>> +        /*
>>> +         * the new IRQ gets a pending status and is pushed in
>>> +         * the pending queue
>>> +         */
>>> +        intp->state = VFIO_IRQ_PENDING;
>>> +        QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
>>> +                             intp, pqnext);
>>> +        return;
>>> +    }
>>> +
>>> +    /* no active IRQ, the new IRQ can be forwarded to the guest */
>>> +    DPRINTF("Handle IRQ #%d (fd = %d)\n",
>>> +            intp->pin, event_notifier_get_fd(&intp->interrupt));
>>> +
>>> +    ret = event_notifier_test_and_clear(&intp->interrupt);
>>> +    if (!ret) {
>>> +        DPRINTF("Error when clearing fd=%d\n",
>>> +                event_notifier_get_fd(&intp->interrupt));
>>> +    }
>>> +
>>> +    intp->state = VFIO_IRQ_ACTIVE;
>>> +
>>> +    /* sets slow path */
>>> +    vfio_mmap_set_enabled(vdev, false);
>>> +
>>> +    /* trigger the virtual IRQ */
>>> +    qemu_set_irq(intp->qemuirq, 1);
>>> +
>>> +    /* schedule the mmap timer which will restore mmap path after EOI*/
>>> +    if (vdev->mmap_timeout) {
>>> +        timer_mod(vdev->mmap_timer,
>>> +                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
>>> +                      vdev->mmap_timeout);
>>> +    }
>>> +}
>>> +
>>> +static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
>>> +{
>>> +    struct vfio_irq_set *irq_set;
>>> +    int32_t *pfd;
>>> +    int ret, argsz;
>>> +    int device = vbasedev->fd;
>>> +    VFIOPlatformDevice *vdev =
>>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>>> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
>>> +    VFIOINTp *intp;
>>> +
>>> +    /* allocate and populate a new VFIOINTp structure put in a queue list */
>>> +    intp = g_malloc0(sizeof(*intp));
>>> +    intp->vdev = vdev;
>>> +    intp->pin = index;
>>> +    intp->state = VFIO_IRQ_INACTIVE;
>>> +    sysbus_init_irq(sbdev, &intp->qemuirq);
>>> +
>>> +    ret = event_notifier_init(&intp->interrupt, 0);
>>> +    if (ret) {
>>> +        g_free(intp);
>>> +        error_report("vfio: Error: event_notifier_init failed ");
>>> +        return ret;
>>> +    }
>>> +
>>> +    /* build the irq_set to be passed to the vfio kernel driver */
>>> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
>>> +
>>> +    irq_set = g_malloc0(argsz);
>>> +    irq_set->argsz = argsz;
>>> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
>>> +    irq_set->index = index;
>>> +    irq_set->start = 0;
>>> +    irq_set->count = 1;
>>> +    pfd = (int32_t *)&irq_set->data;
>>> +
>>> +    *pfd = event_notifier_get_fd(&intp->interrupt);
>>> +
>>> +    DPRINTF("register fd=%d/irq index=%d to kernel\n", *pfd, index);
>>> +
>>> +    qemu_set_fd_handler(*pfd, vfio_intp_interrupt, NULL, intp);
>>> +
>>> +    /*
>>> +     * pass the index/fd binding to the kernel driver so that it
>>> +     * triggers this fd on HW IRQ
>>> +     */
>>> +    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
>>> +    g_free(irq_set);
>>> +    if (ret) {
>>> +        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
>>> +        qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
>>> +        event_notifier_cleanup(&intp->interrupt);
>>> +        return -errno;
>>> +    }
>>> +
>>> +    /* store the new intp in qlist */
>>> +    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
>>> +    return 0;
>>> +}
>>> +
>>> +static int vfio_populate_interrupts(VFIODevice *vbasedev)
>>> +{
>>> +    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
>>> +    int i, ret;
>>> +    VFIOPlatformDevice *vdev =
>>> +        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
>>> +
>>> +    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
>>> +                                    vfio_intp_mmap_enable, vdev);
>>> +
>>> +    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
>>> +
>>> +    for (i = 0; i < vbasedev->num_irqs; i++) {
>>> +        irq.index = i;
>>> +
>>> +        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
>>> +
>>> +        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
>>> +        if (ret) {
>>> +            /* This can fail for an old kernel or legacy PCI dev */
>>> +            error_printf("vfio: error getting device %s irq info",
>>> +                         vbasedev->name);
>>> +        } else {
>>> +            DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
>>> +                    irq.index, irq.count, irq.flags);
>>> +
>>> +            ret = vfio_enable_intp(vbasedev, irq.index);
>>> +            if (ret) {
>>> +                error_report("vfio: Error setting IRQ %d up", i);
>>> +                return ret;
>>> +            }
>>> +        }
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +static VFIODeviceOps vfio_platform_ops = {
>>> +    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
>>> +    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
>>> +    .vfio_eoi = vfio_platform_eoi,
>>> +    .vfio_check_device = vfio_platform_check_device,
>>> +    .vfio_populate_regions = vfio_populate_regions,
>>> +    .vfio_populate_interrupts = vfio_populate_interrupts,
>>> +};
>>> +
>>> +static int vfio_base_device_init(VFIODevice *vbasedev)
>>> +{
>>> +    VFIOGroup *group;
>>> +    VFIODevice *vbasedev_iter;
>>> +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
>>> +    ssize_t len;
>>> +    struct stat st;
>>> +    int groupid;
>>> +    int ret;
>>> +
>>> +    /* name must be set prior to the call */
>>> +    if (!vbasedev->name) {
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    /* Check that the host device exists */
>>> +    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
>>> +             vbasedev->name);
>>> +
>>> +    if (stat(path, &st) < 0) {
>>> +        error_report("vfio: error: no such host device: %s", path);
>>> +        return -errno;
>>> +    }
>>> +
>>> +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
>>> +    len = readlink(path, iommu_group_path, sizeof(path));
>>> +    if (len <= 0 || len >= sizeof(path)) {
>>> +        error_report("vfio: error no iommu_group for device");
>>> +        return len < 0 ? -errno : ENAMETOOLONG;
>>> +    }
>>> +
>>> +    iommu_group_path[len] = 0;
>>> +    group_name = basename(iommu_group_path);
>>> +
>>> +    if (sscanf(group_name, "%d", &groupid) != 1) {
>>> +        error_report("vfio: error reading %s: %m", path);
>>> +        return -errno;
>>> +    }
>>> +
>>> +    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
>>> +
>>> +    group = vfio_get_group(groupid, &address_space_memory);
>>> +    if (!group) {
>>> +        error_report("vfio: failed to get group %d", groupid);
>>> +        return -ENOENT;
>>> +    }
>>> +
>>> +    snprintf(path, sizeof(path), "%s", vbasedev->name);
>>> +
>>> +    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>>> +        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
>>> +            error_report("vfio: error: device %s is already attached", path);
>>> +            vfio_put_group(group);
>>> +            return -EBUSY;
>>> +        }
>>> +    }
>>> +    ret = vfio_get_device(group, path, vbasedev);
>>> +    if (ret) {
>>> +        error_report("vfio: failed to get device %s", path);
>>> +        vfio_put_group(group);
>>> +    }
>>> + return ret;
>>> +}
>>> +
>>> +void vfio_put_device(VFIOPlatformDevice *vdev)
>>> +{
>>> +    unsigned int i;
>>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>> +
>>> +    for (i = 0; i < vbasedev->num_regions; i++) {
>>> +            g_free(vdev->regions[i]);
>>> +    }
>>> +    g_free(vdev->regions);
>>> +    g_free(vdev->vbasedev.name);
>>> +    vfio_put_base_device(&vdev->vbasedev);
>>> +}
>>> +
>>> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
>>> +{
>>> +    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
>>> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
>>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>> +    int i, ret;
>>> +
>>> +    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
>>> +    vbasedev->ops = &vfio_platform_ops;
>>> +
>>> +    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
>>> +
>>> +    ret = vfio_base_device_init(vbasedev);
>>> +    if (ret) {
>>> +        return;
>>> +    }
>>> +
>>> +    for (i = 0; i < vbasedev->num_regions; i++) {
>>> +        vfio_map_region(vdev, i);
>>> +        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
>>> +    }
>>> +}
>>> +
>>> +static const VMStateDescription vfio_platform_vmstate = {
>>> +    .name = TYPE_VFIO_PLATFORM,
>>> +    .unmigratable = 1,
>>> +};
>>> +
>>> +static Property vfio_platform_dev_properties[] = {
>>> +    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),
>>> +    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
>>> +    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
>>> +                       mmap_timeout, 1100),
>>> +    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),
>>> +    DEFINE_PROP_END_OF_LIST(),
>>> +};
>>> +
>>> +static void vfio_platform_class_init(ObjectClass *klass, void *data)
>>> +{
>>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>>> +
>>> +    dc->realize = vfio_platform_realize;
>>> +    dc->props = vfio_platform_dev_properties;
>>> +    dc->vmsd = &vfio_platform_vmstate;
>>> +    dc->desc = "VFIO-based platform device assignment";
>>> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>>> +}
>>> +
>>> +static const TypeInfo vfio_platform_dev_info = {
>>> +    .name = TYPE_VFIO_PLATFORM,
>>> +    .parent = TYPE_SYS_BUS_DEVICE,
>>> +    .instance_size = sizeof(VFIOPlatformDevice),
>>> +    .class_init = vfio_platform_class_init,
>>> +    .class_size = sizeof(VFIOPlatformDeviceClass),
>>
>> This should be an abstract class. People must never instantiate a
>> generic "vfio-platform" device. Only "vfio-xgmac", "vfio-etsec", etc
>> devices should be exposed to the user.
>>
>>
>> Alex
>
diff mbox

Patch

diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index e31f30e..c5c76fe 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -1,4 +1,5 @@ 
 ifeq ($(CONFIG_LINUX), y)
 obj-$(CONFIG_SOFTMMU) += common.o
 obj-$(CONFIG_PCI) += pci.o
+obj-$(CONFIG_SOFTMMU) += platform.o
 endif
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
new file mode 100644
index 0000000..f1a1b55
--- /dev/null
+++ b/hw/vfio/platform.c
@@ -0,0 +1,517 @@ 
+/*
+ * vfio based device assignment support - platform devices
+ *
+ * Copyright Linaro Limited, 2014
+ *
+ * Authors:
+ *  Kim Phillips <kim.phillips@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on vfio based PCI device assignment support:
+ *  Copyright Red Hat, Inc. 2012
+ */
+
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+
+#include "hw/vfio/vfio-platform.h"
+#include "qemu/error-report.h"
+#include "qemu/range.h"
+#include "sysemu/sysemu.h"
+#include "exec/memory.h"
+#include "qemu/queue.h"
+#include "hw/sysbus.h"
+#include "qapi/error.h"
+
+extern const MemoryRegionOps vfio_region_ops;
+extern const MemoryListener vfio_memory_listener;
+extern QLIST_HEAD(, VFIOGroup) group_list;
+extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
+void vfio_put_device(VFIOPlatformDevice *vdev);
+
+/*
+ * Build the memory regions that expose device region @nr to the guest.
+ * A VFIOPlatformDevice is required here: VFIODevice is not a QOM object,
+ * so it cannot be handed to the memory region functions as an owner.
+ */
+static void vfio_map_region(VFIOPlatformDevice *vdev, int nr)
+{
+    char label[64];
+    VFIORegion *r = vdev->regions[nr];
+    unsigned len = r->size;
+
+    /* an empty region has nothing to map */
+    if (len == 0) {
+        return;
+    }
+
+    snprintf(label, sizeof(label), "VFIO %s region %d",
+             vdev->vbasedev.name, nr);
+
+    /* A "slow" read/write mapping underlies all regions */
+    memory_region_init_io(&r->mem, OBJECT(vdev), &vfio_region_ops,
+                          r, label, len);
+
+    /* the same buffer, suffixed, names the mmap attempt */
+    strncat(label, " mmap", sizeof(label) - strlen(label) - 1);
+
+    if (vfio_mmap_region(OBJECT(vdev), r, &r->mem, &r->mmap_mem,
+                         &r->mmap, len, 0, label) != 0) {
+        error_report("%s unsupported. Performance may be slow", label);
+    }
+}
+
+/*
+ * Debug helper: trace, through DPRINTF, the number of regions of @vdev
+ * and, for each of them, its index, flags, size, device fd and offset.
+ */
+static void print_regions(VFIOPlatformDevice *vdev)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    int idx;
+
+    DPRINTF("Device \"%s\" counts %d region(s):\n",
+             vbasedev->name, vbasedev->num_regions);
+
+    for (idx = 0; idx < vbasedev->num_regions; idx++) {
+        DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, "
+                "fd= %d, offset = 0x%lx\n",
+                vdev->regions[idx]->nr,
+                (unsigned long)vdev->regions[idx]->flags,
+                (unsigned long)vdev->regions[idx]->size,
+                vdev->regions[idx]->vbasedev->fd,
+                (unsigned long)vdev->regions[idx]->fd_offset);
+    }
+}
+
+/*
+ * Allocate the region table of the platform device embedding @vbasedev
+ * and fill one VFIORegion per region index with the flags, size and
+ * offset reported by VFIO_DEVICE_GET_REGION_INFO.
+ *
+ * Returns 0 on success, or the non-zero ioctl() result for the first
+ * region whose info cannot be retrieved (entries allocated so far are
+ * left in place).
+ */
+static int vfio_populate_regions(VFIODevice *vbasedev)
+{
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+    struct vfio_region_info info = { .argsz = sizeof(info) };
+    int idx;
+
+    vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions);
+
+    for (idx = 0; idx < vbasedev->num_regions; idx++) {
+        VFIORegion *r = g_malloc0(sizeof(VFIORegion));
+        int ret;
+
+        vdev->regions[idx] = r;
+        info.index = idx;
+
+        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &info);
+        if (ret) {
+            error_report("vfio: Error getting region %d info: %m", idx);
+            return ret;
+        }
+
+        r->flags = info.flags;
+        r->size = info.size;
+        r->fd_offset = info.offset;
+        r->nr = idx;
+        r->vbasedev = vbasedev;
+    }
+
+    print_regions(vdev);
+    return 0;
+}
+
+/* Placeholder (not implemented yet): accepts every device, returns 0. */
+static int vfio_platform_check_device(VFIODevice *vdev)
+{
+    int status = 0;
+
+    return status;
+}
+
+/* Placeholder (not implemented yet): never reports that a reset is needed. */
+static bool vfio_platform_compute_needs_reset(VFIODevice *vdev)
+{
+    bool needs_reset = false;
+
+    return needs_reset;
+}
+
+/* Placeholder (not implemented yet): does nothing and returns 0. */
+static int vfio_platform_hot_reset_multi(VFIODevice *vdev)
+{
+    int status = 0;
+
+    return status;
+}
+
+/*
+ * End-of-interrupt callback (installed as the .vfio_eoi device op).
+ *
+ * The eoi function is called on the first access to any MMIO region
+ * after an IRQ was triggered. It is assumed that this access corresponds
+ * to the reset of the IRQ status register.
+ * With such a mechanism, only a single IRQ can be handled at a time since
+ * there is no way to know which IRQ was completed by the guest
+ * (we would need additional details about the IRQ status register mask).
+ *
+ * The function completes the IRQ found in the active state, if any, then
+ * replays the IRQ at the head of the pending queue, if any.
+ */
+static void vfio_platform_eoi(VFIODevice *vbasedev)
+{
+    VFIOINTp *intp;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+
+    /* look for the (single) active IRQ and complete it */
+    QLIST_FOREACH(intp, &vdev->intp_list, next) {
+        if (intp->state == VFIO_IRQ_ACTIVE) {
+            DPRINTF("EOI IRQ #%d fd=%d\n",
+                    intp->pin, event_notifier_get_fd(&intp->interrupt));
+            intp->state = VFIO_IRQ_INACTIVE;
+
+            /* deassert the virtual IRQ and unmask physical one */
+            qemu_set_irq(intp->qemuirq, 0);
+            vfio_unmask_irqindex(vbasedev, intp->pin);
+
+            /* a single IRQ can be active at a time */
+            break;
+        }
+    }
+
+    /* in case there are pending IRQs, handle them one at a time */
+    if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
+        intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
+        /*
+         * the handler is re-run while intp is still linked at the head of
+         * the queue; the entry is only unlinked once the handler returns
+         */
+        vfio_intp_interrupt(intp);
+        QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
+    }
+}
+
+/*
+ * Switch every region of @vdev between fast path and slow path.
+ * fast path (@enabled true)  = MMIO region is mmaped (no KVM TRAP)
+ * slow path (@enabled false) = MMIO region is trapped and the region
+ *                              callbacks are called
+ * The slow path makes it possible to trap the guest reset of the IRQ
+ * status register.
+ */
+static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
+{
+    int idx;
+
+    DPRINTF("fast path = %d\n", enabled);
+
+    for (idx = 0; idx < vdev->vbasedev.num_regions; idx++) {
+        /* register space is unmapped to trap EOI */
+        memory_region_set_enabled(&vdev->regions[idx]->mmap_mem, enabled);
+    }
+}
+
+/*
+ * mmap_timer callback; @opaque is the VFIOPlatformDevice.
+ * While an IRQ is still in the active state the device must stay in slow
+ * path (mmap disabled) so that the IRQ status register reset keeps being
+ * trapped: the timer is re-armed and nothing else happens. Once no IRQ is
+ * active, the fast path (mmaped register space) is restored.
+ * By construction a single fd is handled at a time. See EOI comment
+ * for additional details.
+ */
+static void vfio_intp_mmap_enable(void *opaque)
+{
+    VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
+    VFIOINTp *intp;
+
+    QLIST_FOREACH(intp, &vdev->intp_list, next) {
+        if (intp->state != VFIO_IRQ_ACTIVE) {
+            continue;
+        }
+
+        DPRINTF("IRQ #%d still active, stay in slow path\n",
+                intp->pin);
+        timer_mod(vdev->mmap_timer,
+                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                      vdev->mmap_timeout);
+        return;
+    }
+
+    DPRINTF("no active IRQ, restore fast path\n");
+    vfio_mmap_set_enabled(vdev, true);
+}
+
+/*
+ * The fd handler: @opaque is the VFIOINTp whose eventfd was signalled.
+ * It is also called from the EOI path to replay a pending IRQ.
+ *
+ * If another IRQ of the device is currently active, the new IRQ is only
+ * recorded as pending; otherwise it becomes the active IRQ, the device is
+ * switched to slow path, the virtual IRQ is raised and the mmap timer is
+ * armed (when a timeout is configured).
+ */
+void vfio_intp_interrupt(void *opaque)
+{
+    int ret;
+    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
+    VFIOPlatformDevice *vdev = intp->vdev;
+    bool one_active_irq = false;
+
+    /*
+     * first check whether there is an active IRQ
+     * in the positive the new IRQ cannot be handled until the
+     * active one is completed.
+     * by construction the same IRQ as the active one cannot hit
+     * since the physical IRQ was disabled by the VFIO driver
+     */
+    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
+        if (tmp->state == VFIO_IRQ_ACTIVE) {
+            one_active_irq = true;
+            break;
+        }
+    }
+    if (one_active_irq) {
+        /*
+         * Consume the event now: as long as the eventfd counter is not
+         * read the fd stays readable and this handler would be invoked
+         * again and again for the same, already queued, IRQ.
+         */
+        event_notifier_test_and_clear(&intp->interrupt);
+
+        /*
+         * the new IRQ gets a pending status and is pushed in
+         * the pending queue; an entry must never be linked twice
+         */
+        if (intp->state != VFIO_IRQ_PENDING) {
+            intp->state = VFIO_IRQ_PENDING;
+            QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
+                                 intp, pqnext);
+        }
+        return;
+    }
+
+    /* no active IRQ, the new IRQ can be forwarded to the guest */
+    DPRINTF("Handle IRQ #%d (fd = %d)\n",
+            intp->pin, event_notifier_get_fd(&intp->interrupt));
+
+    /*
+     * nothing is left to clear when a pending IRQ is replayed, since its
+     * event was consumed when it was queued; this is only traced
+     */
+    ret = event_notifier_test_and_clear(&intp->interrupt);
+    if (!ret) {
+        DPRINTF("Error when clearing fd=%d\n",
+                event_notifier_get_fd(&intp->interrupt));
+    }
+
+    intp->state = VFIO_IRQ_ACTIVE;
+
+    /* sets slow path */
+    vfio_mmap_set_enabled(vdev, false);
+
+    /* trigger the virtual IRQ */
+    qemu_set_irq(intp->qemuirq, 1);
+
+    /* schedule the mmap timer which will restore mmap path after EOI */
+    if (vdev->mmap_timeout) {
+        timer_mod(vdev->mmap_timer,
+                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                      vdev->mmap_timeout);
+    }
+}
+
+/*
+ * Set up the forwarding of the physical IRQ @index of @vbasedev.
+ *
+ * A VFIOINTp is allocated, its eventfd is created, watched by
+ * vfio_intp_interrupt() and registered to the VFIO driver as the trigger
+ * of IRQ @index. On success the sysbus IRQ line is created and the
+ * VFIOINTp is added to the device IRQ list.
+ *
+ * Returns 0 on success, a non-zero error code otherwise; on failure every
+ * resource acquired here is released.
+ */
+static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
+{
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret, argsz, fd;
+    int device = vbasedev->fd;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
+    VFIOINTp *intp;
+
+    /* allocate and populate a new VFIOINTp structure put in a queue list */
+    intp = g_malloc0(sizeof(*intp));
+    intp->vdev = vdev;
+    intp->pin = index;
+    intp->state = VFIO_IRQ_INACTIVE;
+
+    ret = event_notifier_init(&intp->interrupt, 0);
+    if (ret) {
+        error_report("vfio: Error: event_notifier_init failed ");
+        g_free(intp);
+        return ret;
+    }
+
+    /* keep our own copy: irq_set is freed before the fd is last used */
+    fd = event_notifier_get_fd(&intp->interrupt);
+
+    /* build the irq_set to be passed to the vfio kernel driver */
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = index;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = fd;
+
+    DPRINTF("register fd=%d/irq index=%d to kernel\n", fd, index);
+
+    qemu_set_fd_handler(fd, vfio_intp_interrupt, NULL, intp);
+
+    /*
+     * pass the index/fd binding to the kernel driver so that it
+     * triggers this fd on HW IRQ; errno is captured before any other
+     * call gets a chance to overwrite it
+     */
+    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set) ? -errno : 0;
+    g_free(irq_set);
+    if (ret) {
+        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %s",
+                     strerror(-ret));
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&intp->interrupt);
+        g_free(intp);
+        return ret;
+    }
+
+    /*
+     * the sysbus IRQ is only created once nothing can fail anymore since
+     * sysbus is handed a pointer into intp, which must then stay allocated
+     */
+    sysbus_init_irq(sbdev, &intp->qemuirq);
+
+    /* store the new intp in qlist */
+    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
+    return 0;
+}
+
+/*
+ * Create the mmap timer and the pending IRQ queue of the platform device
+ * embedding @vbasedev, then enable the forwarding of each of its IRQs.
+ *
+ * An IRQ index whose info cannot be retrieved is reported and skipped.
+ * Returns 0 on success, or the error returned by vfio_enable_intp() for
+ * the first IRQ that cannot be set up.
+ */
+static int vfio_populate_interrupts(VFIODevice *vbasedev)
+{
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+    struct vfio_irq_info info = { .argsz = sizeof(info) };
+    int idx, ret;
+
+    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
+
+    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                    vfio_intp_mmap_enable, vdev);
+
+    for (idx = 0; idx < vbasedev->num_irqs; idx++) {
+        info.index = idx;
+
+        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
+
+        if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &info)) {
+            /* This can fail for an old kernel or legacy PCI dev */
+            error_printf("vfio: error getting device %s irq info",
+                         vbasedev->name);
+            continue;
+        }
+
+        DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
+                info.index, info.count, info.flags);
+
+        ret = vfio_enable_intp(vbasedev, info.index);
+        if (ret) {
+            error_report("vfio: Error setting IRQ %d up", idx);
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Platform implementation of the VFIODevice callbacks; installed on the
+ * base device by vfio_platform_realize(). The reset related and
+ * check_device entries are placeholders for now.
+ */
+static VFIODeviceOps vfio_platform_ops = {
+    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
+    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
+    .vfio_eoi = vfio_platform_eoi,
+    .vfio_check_device = vfio_platform_check_device,
+    .vfio_populate_regions = vfio_populate_regions,
+    .vfio_populate_interrupts = vfio_populate_interrupts,
+};
+
+/*
+ * Locate the host platform device named vbasedev->name in sysfs, resolve
+ * its IOMMU group, get that VFIO group and attach the device to it.
+ *
+ * vbasedev->name must be set prior to the call.
+ *
+ * Returns 0 on success and a negative errno value on failure, except for
+ * a vfio_get_device() failure whose return value is passed through. The
+ * group reference is dropped when the device cannot be attached.
+ */
+static int vfio_base_device_init(VFIODevice *vbasedev)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev_iter;
+    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
+    ssize_t len;
+    struct stat st;
+    int groupid;
+    int ret;
+
+    /* name must be set prior to the call */
+    if (!vbasedev->name) {
+        return -EINVAL;
+    }
+
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
+             vbasedev->name);
+
+    if (stat(path, &st) < 0) {
+        /* save errno before the reporting code can overwrite it */
+        ret = -errno;
+        error_report("vfio: error: no such host device: %s", path);
+        return ret;
+    }
+
+    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
+    len = readlink(path, iommu_group_path, sizeof(iommu_group_path));
+    if (len <= 0 || (size_t)len >= sizeof(iommu_group_path)) {
+        /* a full buffer means the link target was truncated */
+        ret = len < 0 ? -errno : -ENAMETOOLONG;
+        error_report("vfio: error no iommu_group for device");
+        return ret;
+    }
+
+    /* readlink() does not terminate the string */
+    iommu_group_path[len] = 0;
+    group_name = basename(iommu_group_path);
+
+    if (sscanf(group_name, "%d", &groupid) != 1) {
+        /*
+         * a matching failure does not set errno: returning -errno here
+         * could report success with groupid left uninitialized
+         */
+        error_report("vfio: error reading %s: invalid group \"%s\"",
+                     path, group_name);
+        return -EINVAL;
+    }
+
+    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
+
+    group = vfio_get_group(groupid, &address_space_memory);
+    if (!group) {
+        error_report("vfio: failed to get group %d", groupid);
+        return -ENOENT;
+    }
+
+    snprintf(path, sizeof(path), "%s", vbasedev->name);
+
+    /* a host device can only be attached once */
+    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
+            error_report("vfio: error: device %s is already attached", path);
+            vfio_put_group(group);
+            return -EBUSY;
+        }
+    }
+    ret = vfio_get_device(group, path, vbasedev);
+    if (ret) {
+        error_report("vfio: failed to get device %s", path);
+        vfio_put_group(group);
+    }
+    return ret;
+}
+
+/*
+ * Free the per-region descriptors of @vdev, its region table and the
+ * device name, then pass the embedded base device to
+ * vfio_put_base_device().
+ */
+void vfio_put_device(VFIOPlatformDevice *vdev)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    unsigned int idx;
+
+    for (idx = 0; idx < vbasedev->num_regions; idx++) {
+        g_free(vdev->regions[idx]);
+    }
+
+    g_free(vdev->regions);
+    g_free(vbasedev->name);
+    vfio_put_base_device(vbasedev);
+}
+
+/*
+ * Realize callback of the vfio-platform device.
+ *
+ * Tags the embedded VFIODevice as a platform device, installs the
+ * platform ops, initializes the base device and then maps each region
+ * and registers it as a sysbus MMIO region.
+ *
+ * On failure of the base device initialization an error is set in @errp
+ * so that the caller sees the realize failure.
+ */
+static void vfio_platform_realize(DeviceState *dev, Error **errp)
+{
+    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
+    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    int i, ret;
+
+    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
+    vbasedev->ops = &vfio_platform_ops;
+
+    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
+
+    ret = vfio_base_device_init(vbasedev);
+    if (ret) {
+        /* the name is NULL when the vfio_device property was not set */
+        error_setg(errp, "vfio: failed to initialize device %s (error %d)",
+                   vbasedev->name ? vbasedev->name : "(vfio_device not set)",
+                   ret);
+        return;
+    }
+
+    for (i = 0; i < vbasedev->num_regions; i++) {
+        vfio_map_region(vdev, i);
+        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
+    }
+}
+
+/* Migration description: the device is flagged as unmigratable. */
+static const VMStateDescription vfio_platform_vmstate = {
+    .name = TYPE_VFIO_PLATFORM,
+    .unmigratable = 1,
+};
+
+/*
+ * User-settable properties:
+ * - vfio_device: stored in vbasedev.name, which vfio_base_device_init()
+ *   looks up under /sys/bus/platform/devices/
+ * - compat: compatibility string
+ * - mmap-timeout-ms: value of mmap_timeout, defaults to 1100
+ * - irqfd: value of irqfd_allowed, defaults to true
+ */
+static Property vfio_platform_dev_properties[] = {
+    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),
+    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
+    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
+                       mmap_timeout, 1100),
+    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Class initializer: fills the DeviceClass of the vfio-platform type
+ * with its description, category, properties, migration description and
+ * realize callback.
+ */
+static void vfio_platform_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+
+    dev_class->desc = "VFIO-based platform device assignment";
+    set_bit(DEVICE_CATEGORY_MISC, dev_class->categories);
+
+    dev_class->props = vfio_platform_dev_properties;
+    dev_class->vmsd = &vfio_platform_vmstate;
+    dev_class->realize = vfio_platform_realize;
+}
+
+/*
+ * QOM type description of the vfio-platform device, a sysbus device.
+ * NOTE(review): the type is not marked abstract, so it can be
+ * instantiated as is - confirm this is intended.
+ */
+static const TypeInfo vfio_platform_dev_info = {
+    .name = TYPE_VFIO_PLATFORM,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(VFIOPlatformDevice),
+    .class_init = vfio_platform_class_init,
+    .class_size = sizeof(VFIOPlatformDeviceClass),
+};
+
+/* Register the vfio-platform type; hooked in through type_init() below. */
+static void register_vfio_platform_dev_type(void)
+{
+    type_register_static(&vfio_platform_dev_info);
+}
+
+type_init(register_vfio_platform_dev_type)
diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h
new file mode 100644
index 0000000..1ee072a
--- /dev/null
+++ b/include/hw/vfio/vfio-platform.h
@@ -0,0 +1,77 @@ 
+/*
+ * vfio based device assignment support - platform devices
+ *
+ * Copyright Linaro Limited, 2014
+ *
+ * Authors:
+ *  Kim Phillips <kim.phillips@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on vfio based PCI device assignment support:
+ *  Copyright Red Hat, Inc. 2012
+ */
+
+#ifndef HW_VFIO_VFIO_PLATFORM_H
+#define HW_VFIO_VFIO_PLATFORM_H
+
+#include "hw/sysbus.h"
+#include "hw/vfio/vfio-common.h"
+#include "qemu/event_notifier.h"
+#include "qemu/queue.h"
+#include "hw/irq.h"
+
+#define TYPE_VFIO_PLATFORM "vfio-platform"
+
+/*
+ * IRQ states; they match the "inactive, pending, active" values documented
+ * on the VFIOINTp state field.
+ */
+enum {
+    VFIO_IRQ_INACTIVE = 0,
+    VFIO_IRQ_PENDING = 1,
+    VFIO_IRQ_ACTIVE = 2,
+    /* VFIO_IRQ_ACTIVE_AND_PENDING cannot happen with VFIO */
+};
+
+/*
+ * Per-interrupt state of an assigned platform device.  An instance can be
+ * linked both on the device's IRQ list (next) and on its pending IRQ queue
+ * (pqnext), and points back to its owning VFIOPlatformDevice.
+ */
+typedef struct VFIOINTp {
+    QLIST_ENTRY(VFIOINTp) next; /* entry for IRQ list */
+    QSIMPLEQ_ENTRY(VFIOINTp) pqnext; /* entry for pending IRQ queue */
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
+    qemu_irq qemuirq; /* presumably the IRQ line raised in QEMU -- confirm */
+    struct VFIOPlatformDevice *vdev; /* back pointer to device */
+    int state; /* inactive, pending, active */
+    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
+    uint8_t pin; /* index */
+    /*
+     * NOTE(review): only 8 bits wide, whereas vfio_setup_irqfd() takes an
+     * int virq -- confirm that larger virtual IRQ numbers cannot be
+     * truncated when stored here.
+     */
+    uint8_t virtualID; /* virtual IRQ */
+} VFIOINTp;
+
+/*
+ * Instance state of the vfio-platform device (parent type
+ * TYPE_SYS_BUS_DEVICE).
+ */
+typedef struct VFIOPlatformDevice {
+    SysBusDevice sbdev; /* parent object */
+    VFIODevice vbasedev; /* not a QOM object */
+    /* region pointers; realize indexes these up to vbasedev.num_regions */
+    VFIORegion **regions;
+    QLIST_HEAD(, VFIOINTp) intp_list; /* list of IRQs */
+    /* queue of pending IRQs */
+    QSIMPLEQ_HEAD(pending_intp_queue, VFIOINTp) pending_intp_queue;
+    char *compat; /* compatibility string */
+    bool irqfd_allowed; /* set through the "irqfd" property */
+    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
+    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
+} VFIOPlatformDevice;
+
+
+/*
+ * Class structure of the vfio-platform type; it adds no members beyond the
+ * SysBusDeviceClass parent.
+ */
+typedef struct VFIOPlatformDeviceClass {
+    /*< private >*/
+    SysBusDeviceClass parent_class;
+    /*< public >*/
+} VFIOPlatformDeviceClass;
+
+/*
+ * QOM cast helpers for TYPE_VFIO_PLATFORM, built on OBJECT_CHECK,
+ * OBJECT_CLASS_CHECK and OBJECT_GET_CLASS: object to instance, class to
+ * VFIOPlatformDeviceClass, and object to its VFIOPlatformDeviceClass.
+ */
+#define VFIO_PLATFORM_DEVICE(obj) \
+     OBJECT_CHECK(VFIOPlatformDevice, (obj), TYPE_VFIO_PLATFORM)
+#define VFIO_PLATFORM_DEVICE_CLASS(klass) \
+     OBJECT_CLASS_CHECK(VFIOPlatformDeviceClass, (klass), TYPE_VFIO_PLATFORM)
+#define VFIO_PLATFORM_DEVICE_GET_CLASS(obj) \
+     OBJECT_GET_CLASS(VFIOPlatformDeviceClass, (obj), TYPE_VFIO_PLATFORM)
+
+/*
+ * Entry points exported by the platform device; the implementations are
+ * not visible from this header.
+ *
+ * NOTE(review): vfio_intp_interrupt() is presumably the handler run when a
+ * VFIOINTp interrupt eventfd fires, with @opaque being that VFIOINTp --
+ * confirm against hw/vfio/platform.c.
+ * NOTE(review): vfio_setup_irqfd() presumably sets up irqfd handling of
+ * IRQ @index of @dev for virtual IRQ @virq -- confirm against the
+ * implementation.
+ */
+void vfio_intp_interrupt(void *opaque);
+void vfio_setup_irqfd(SysBusDevice *dev, int index, int virq);
+
+#endif