vhost: Ignore vrings in dirty log when using a vIOMMU

Message ID	160105498386.68108.2145229309875282336.stgit@bahia.lan
State	New
Headers	show Return-Path: <SRS0=+pSw=DC=nongnu.org=qemu-devel-bounces+qemu-devel=archiver.kernel.org@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org CF7AC2086A Subject: [PATCH] vhost: Ignore vrings in dirty log when using a vIOMMU From: Greg Kurz <groug@kaod.org> To: "Michael S. Tsirkin" <mst@redhat.com> Date: Fri, 25 Sep 2020 19:29:43 +0200 Message-ID: <160105498386.68108.2145229309875282336.stgit@bahia.lan> User-Agent: StGit/0.21 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: quoted-printable Received-SPF: softfail client-ip=207.211.30.44; envelope-from=groug@kaod.org; helo=us-smtp-delivery-44.mimecast.com Precedence: list Cc: Laurent Vivier <laurent@vivier.eu>, qemu-devel@nongnu.org, David Gibson <david@gibson.dropbear.id.au> Errors-To: qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org Sender: "Qemu-devel" <qemu-devel-bounces+qemu-devel=archiver.kernel.org@nongnu.org>
Series	vhost: Ignore vrings in dirty log when using a vIOMMU \| expand vhost: Ignore vrings in dirty log when using a vIOMMU

Greg Kurz Sept. 25, 2020, 5:29 p.m. UTC

When a vIOMMU is present, any address comming from the guest is an IO
virtual address, including those of the vrings. The backend's accesses
to the vrings happen through vIOMMU translation : the backend hence
only logs the final guest physical address, not the IO virtual one.
It thus doesn't make sense to make room for the vring addresses in the
dirty log in this case.

This fixes a crash of the source when migrating a guest using in-kernel
vhost-net and iommu_platform=on on POWER, because DMA regions are put
at very high addresses and the resulting log size is likely to cause
g_malloc0() to abort.

BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349
Signed-off-by: Greg Kurz <groug@kaod.org>
---
 hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

Greg Kurz Sept. 25, 2020, 5:41 p.m. UTC | #1

Cc'ing Jason since this was detected using vhost-net.

On Fri, 25 Sep 2020 19:29:43 +0200
Greg Kurz <groug@kaod.org> wrote:

> When a vIOMMU is present, any address comming from the guest is an IO

> virtual address, including those of the vrings. The backend's accesses

> to the vrings happen through vIOMMU translation : the backend hence

> only logs the final guest physical address, not the IO virtual one.

> It thus doesn't make sense to make room for the vring addresses in the

> dirty log in this case.

> 

> This fixes a crash of the source when migrating a guest using in-kernel

> vhost-net and iommu_platform=on on POWER, because DMA regions are put

> at very high addresses and the resulting log size is likely to cause

> g_malloc0() to abort.

> 

> BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349

> Signed-off-by: Greg Kurz <groug@kaod.org>

> ---

>  hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------

>  1 file changed, 24 insertions(+), 14 deletions(-)

> 

> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> index 1a1384e7a642..0b83d6b8e65e 100644

> --- a/hw/virtio/vhost.c

> +++ b/hw/virtio/vhost.c

> @@ -106,6 +106,20 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,

>      }

>  }

>  

> +static int vhost_dev_has_iommu(struct vhost_dev *dev)

> +{

> +    VirtIODevice *vdev = dev->vdev;

> +

> +    /*

> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> +     * incremental memory mapping API via IOTLB API. For platform that

> +     * does not have IOMMU, there's no need to enable this feature

> +     * which may cause unnecessary IOTLB miss/update trnasactions.

> +     */

> +    return vdev->dma_as != &address_space_memory &&

> +           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> +}

> +

>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

>                                     MemoryRegionSection *section,

>                                     hwaddr first,

> @@ -130,6 +144,11 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

>                                range_get_last(reg->guest_phys_addr,

>                                               reg->memory_size));

>      }

> +

> +    if (vhost_dev_has_iommu(dev)) {

> +        return 0;

> +    }

> +

>      for (i = 0; i < dev->nvqs; ++i) {

>          struct vhost_virtqueue *vq = dev->vqs + i;

>  

> @@ -172,6 +191,11 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)

>                                         reg->memory_size);

>          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

>      }

> +

> +    if (vhost_dev_has_iommu(dev)) {

> +        return log_size;

> +    }

> +

>      for (i = 0; i < dev->nvqs; ++i) {

>          struct vhost_virtqueue *vq = dev->vqs + i;

>  

> @@ -287,20 +311,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

>      dev->log_size = size;

>  }

>  

> -static int vhost_dev_has_iommu(struct vhost_dev *dev)

> -{

> -    VirtIODevice *vdev = dev->vdev;

> -

> -    /*

> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> -     * incremental memory mapping API via IOTLB API. For platform that

> -     * does not have IOMMU, there's no need to enable this feature

> -     * which may cause unnecessary IOTLB miss/update trnasactions.

> -     */

> -    return vdev->dma_as != &address_space_memory &&

> -           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> -}

> -

>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,

>                                hwaddr *plen, bool is_write)

>  {

> 

> 

>

David Gibson Sept. 28, 2020, 6:23 a.m. UTC | #2

On Fri, Sep 25, 2020 at 07:29:43PM +0200, Greg Kurz wrote:
> When a vIOMMU is present, any address comming from the guest is an IO

> virtual address, including those of the vrings. The backend's accesses

> to the vrings happen through vIOMMU translation : the backend hence

> only logs the final guest physical address, not the IO virtual one.

> It thus doesn't make sense to make room for the vring addresses in the

> dirty log in this case.

> 

> This fixes a crash of the source when migrating a guest using in-kernel

> vhost-net and iommu_platform=on on POWER, because DMA regions are put

> at very high addresses and the resulting log size is likely to cause

> g_malloc0() to abort.

> 

> BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349

> Signed-off-by: Greg Kurz <groug@kaod.org>


I'm a little confused as to what's going on here.  Obviously
allocating dirty bitmaps in IOVA space doesn't make much sense.
But.. in all cases isn't the ring ending up in guest memory, whether
translated or not.  So why do specific addresses of the ring make a
difference in *any* case.

> ---

>  hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------

>  1 file changed, 24 insertions(+), 14 deletions(-)

> 

> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> index 1a1384e7a642..0b83d6b8e65e 100644

> --- a/hw/virtio/vhost.c

> +++ b/hw/virtio/vhost.c

> @@ -106,6 +106,20 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,

>      }

>  }

>  

> +static int vhost_dev_has_iommu(struct vhost_dev *dev)

> +{

> +    VirtIODevice *vdev = dev->vdev;

> +

> +    /*

> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> +     * incremental memory mapping API via IOTLB API. For platform that

> +     * does not have IOMMU, there's no need to enable this feature

> +     * which may cause unnecessary IOTLB miss/update trnasactions.

> +     */

> +    return vdev->dma_as != &address_space_memory &&

> +           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> +}

> +

>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

>                                     MemoryRegionSection *section,

>                                     hwaddr first,

> @@ -130,6 +144,11 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

>                                range_get_last(reg->guest_phys_addr,

>                                               reg->memory_size));

>      }

> +

> +    if (vhost_dev_has_iommu(dev)) {

> +        return 0;

> +    }

> +

>      for (i = 0; i < dev->nvqs; ++i) {

>          struct vhost_virtqueue *vq = dev->vqs + i;

>  

> @@ -172,6 +191,11 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)

>                                         reg->memory_size);

>          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

>      }

> +

> +    if (vhost_dev_has_iommu(dev)) {

> +        return log_size;

> +    }

> +

>      for (i = 0; i < dev->nvqs; ++i) {

>          struct vhost_virtqueue *vq = dev->vqs + i;

>  

> @@ -287,20 +311,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

>      dev->log_size = size;

>  }

>  

> -static int vhost_dev_has_iommu(struct vhost_dev *dev)

> -{

> -    VirtIODevice *vdev = dev->vdev;

> -

> -    /*

> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> -     * incremental memory mapping API via IOTLB API. For platform that

> -     * does not have IOMMU, there's no need to enable this feature

> -     * which may cause unnecessary IOTLB miss/update trnasactions.

> -     */

> -    return vdev->dma_as != &address_space_memory &&

> -           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> -}

> -

>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,

>                                hwaddr *plen, bool is_write)

>  {

> 

> 


-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

Greg Kurz Sept. 28, 2020, 7:37 a.m. UTC | #3

On Mon, 28 Sep 2020 16:23:43 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Fri, Sep 25, 2020 at 07:29:43PM +0200, Greg Kurz wrote:

> > When a vIOMMU is present, any address comming from the guest is an IO

> > virtual address, including those of the vrings. The backend's accesses

> > to the vrings happen through vIOMMU translation : the backend hence

> > only logs the final guest physical address, not the IO virtual one.

> > It thus doesn't make sense to make room for the vring addresses in the

> > dirty log in this case.

> > 

> > This fixes a crash of the source when migrating a guest using in-kernel

> > vhost-net and iommu_platform=on on POWER, because DMA regions are put

> > at very high addresses and the resulting log size is likely to cause

> > g_malloc0() to abort.

> > 

> > BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349

> > Signed-off-by: Greg Kurz <groug@kaod.org>

> 

> I'm a little confused as to what's going on here.  Obviously

> allocating dirty bitmaps in IOVA space doesn't make much sense.

> But.. in all cases isn't the ring ending up in guest memory, whether

> translated or not.  So why do specific addresses of the ring make a

> difference in *any* case.

> 


I admit I'm a bit surprised as well... I can't think of a scenario
where the address of the used ring would be higher than the guest
memory... Maybe MST can shed some light here ?

> > ---

> >  hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------

> >  1 file changed, 24 insertions(+), 14 deletions(-)

> > 

> > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> > index 1a1384e7a642..0b83d6b8e65e 100644

> > --- a/hw/virtio/vhost.c

> > +++ b/hw/virtio/vhost.c

> > @@ -106,6 +106,20 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,

> >      }

> >  }

> >  

> > +static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > +{

> > +    VirtIODevice *vdev = dev->vdev;

> > +

> > +    /*

> > +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > +     * incremental memory mapping API via IOTLB API. For platform that

> > +     * does not have IOMMU, there's no need to enable this feature

> > +     * which may cause unnecessary IOTLB miss/update trnasactions.

> > +     */

> > +    return vdev->dma_as != &address_space_memory &&

> > +           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > +}

> > +

> >  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> >                                     MemoryRegionSection *section,

> >                                     hwaddr first,

> > @@ -130,6 +144,11 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> >                                range_get_last(reg->guest_phys_addr,

> >                                               reg->memory_size));

> >      }

> > +

> > +    if (vhost_dev_has_iommu(dev)) {

> > +        return 0;

> > +    }

> > +

> >      for (i = 0; i < dev->nvqs; ++i) {

> >          struct vhost_virtqueue *vq = dev->vqs + i;

> >  

> > @@ -172,6 +191,11 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)

> >                                         reg->memory_size);

> >          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

> >      }

> > +

> > +    if (vhost_dev_has_iommu(dev)) {

> > +        return log_size;

> > +    }

> > +

> >      for (i = 0; i < dev->nvqs; ++i) {

> >          struct vhost_virtqueue *vq = dev->vqs + i;

> >  

> > @@ -287,20 +311,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

> >      dev->log_size = size;

> >  }

> >  

> > -static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > -{

> > -    VirtIODevice *vdev = dev->vdev;

> > -

> > -    /*

> > -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > -     * incremental memory mapping API via IOTLB API. For platform that

> > -     * does not have IOMMU, there's no need to enable this feature

> > -     * which may cause unnecessary IOTLB miss/update trnasactions.

> > -     */

> > -    return vdev->dma_as != &address_space_memory &&

> > -           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > -}

> > -

> >  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,

> >                                hwaddr *plen, bool is_write)

> >  {

> > 

> > 

>

Michael S. Tsirkin Oct. 5, 2020, 2:18 p.m. UTC | #4

On Mon, Sep 28, 2020 at 09:37:18AM +0200, Greg Kurz wrote:
> On Mon, 28 Sep 2020 16:23:43 +1000

> David Gibson <david@gibson.dropbear.id.au> wrote:

> 

> > On Fri, Sep 25, 2020 at 07:29:43PM +0200, Greg Kurz wrote:

> > > When a vIOMMU is present, any address comming from the guest is an IO

> > > virtual address, including those of the vrings. The backend's accesses

> > > to the vrings happen through vIOMMU translation : the backend hence

> > > only logs the final guest physical address, not the IO virtual one.

> > > It thus doesn't make sense to make room for the vring addresses in the

> > > dirty log in this case.

> > > 

> > > This fixes a crash of the source when migrating a guest using in-kernel

> > > vhost-net and iommu_platform=on on POWER, because DMA regions are put

> > > at very high addresses and the resulting log size is likely to cause

> > > g_malloc0() to abort.

> > > 

> > > BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349

> > > Signed-off-by: Greg Kurz <groug@kaod.org>

> > 

> > I'm a little confused as to what's going on here.  Obviously

> > allocating dirty bitmaps in IOVA space doesn't make much sense.

> > But.. in all cases isn't the ring ending up in guest memory, whether

> > translated or not.  So why do specific addresses of the ring make a

> > difference in *any* case.

> > 

> 

> I admit I'm a bit surprised as well... I can't think of a scenario

> where the address of the used ring would be higher than the guest

> memory... Maybe MST can shed some light here ?


So the original idea was that vring itself is specified in
terms of HVA as opposed to rest of stuff which is specified
in terms of GPA.
This way we wanted to support e.g. migration by vhost writing into
qemu address space, qemu copying data to guest memory.




> > > ---

> > >  hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------

> > >  1 file changed, 24 insertions(+), 14 deletions(-)

> > > 

> > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> > > index 1a1384e7a642..0b83d6b8e65e 100644

> > > --- a/hw/virtio/vhost.c

> > > +++ b/hw/virtio/vhost.c

> > > @@ -106,6 +106,20 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,

> > >      }

> > >  }

> > >  

> > > +static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > +{

> > > +    VirtIODevice *vdev = dev->vdev;

> > > +

> > > +    /*

> > > +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > +     * incremental memory mapping API via IOTLB API. For platform that

> > > +     * does not have IOMMU, there's no need to enable this feature

> > > +     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > +     */

> > > +    return vdev->dma_as != &address_space_memory &&

> > > +           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > +}

> > > +

> > >  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > >                                     MemoryRegionSection *section,

> > >                                     hwaddr first,

> > > @@ -130,6 +144,11 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > >                                range_get_last(reg->guest_phys_addr,

> > >                                               reg->memory_size));

> > >      }

> > > +

> > > +    if (vhost_dev_has_iommu(dev)) {

> > > +        return 0;

> > > +    }

> > > +

> > >      for (i = 0; i < dev->nvqs; ++i) {

> > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > >  

> > > @@ -172,6 +191,11 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)

> > >                                         reg->memory_size);

> > >          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

> > >      }

> > > +

> > > +    if (vhost_dev_has_iommu(dev)) {

> > > +        return log_size;

> > > +    }

> > > +

> > >      for (i = 0; i < dev->nvqs; ++i) {

> > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > >  

> > > @@ -287,20 +311,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

> > >      dev->log_size = size;

> > >  }

> > >  

> > > -static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > -{

> > > -    VirtIODevice *vdev = dev->vdev;

> > > -

> > > -    /*

> > > -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > -     * incremental memory mapping API via IOTLB API. For platform that

> > > -     * does not have IOMMU, there's no need to enable this feature

> > > -     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > -     */

> > > -    return vdev->dma_as != &address_space_memory &&

> > > -           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > -}

> > > -

> > >  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,

> > >                                hwaddr *plen, bool is_write)

> > >  {

> > > 

> > > 

> > 

>

Greg Kurz Oct. 6, 2020, 9:58 a.m. UTC | #5

On Mon, 5 Oct 2020 10:18:03 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Sep 28, 2020 at 09:37:18AM +0200, Greg Kurz wrote:

> > On Mon, 28 Sep 2020 16:23:43 +1000

> > David Gibson <david@gibson.dropbear.id.au> wrote:

> > 

> > > On Fri, Sep 25, 2020 at 07:29:43PM +0200, Greg Kurz wrote:

> > > > When a vIOMMU is present, any address comming from the guest is an IO

> > > > virtual address, including those of the vrings. The backend's accesses

> > > > to the vrings happen through vIOMMU translation : the backend hence

> > > > only logs the final guest physical address, not the IO virtual one.

> > > > It thus doesn't make sense to make room for the vring addresses in the

> > > > dirty log in this case.

> > > > 

> > > > This fixes a crash of the source when migrating a guest using in-kernel

> > > > vhost-net and iommu_platform=on on POWER, because DMA regions are put

> > > > at very high addresses and the resulting log size is likely to cause

> > > > g_malloc0() to abort.

> > > > 

> > > > BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349

> > > > Signed-off-by: Greg Kurz <groug@kaod.org>

> > > 

> > > I'm a little confused as to what's going on here.  Obviously

> > > allocating dirty bitmaps in IOVA space doesn't make much sense.

> > > But.. in all cases isn't the ring ending up in guest memory, whether

> > > translated or not.  So why do specific addresses of the ring make a

> > > difference in *any* case.

> > > 

> > 

> > I admit I'm a bit surprised as well... I can't think of a scenario

> > where the address of the used ring would be higher than the guest

> > memory... Maybe MST can shed some light here ?

> 

> So the original idea was that vring itself is specified in

> terms of HVA as opposed to rest of stuff which is specified

> in terms of GPA.


The vring itself is indeed described in term of HVA (vq->used) but
when it comes to the dirty log, QEMU passes the GPA of the used
structure to the vhost backend:

From vhost_virtqueue_set_addr():

    addr.log_guest_addr = vq->used_phys;
                              ^^ GPA ^^
    addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);

And the sizing of the bitmap computed in vhost_get_log_size() is
also based on this GPA:

    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (!vq->used_phys && !vq->used_size) {
            continue;
        }

        uint64_t last = vq->used_phys + vq->used_size - 1;
                            ^^ GPA ^^
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }

With platform_iommu=off, I couldn't find a case where this second
loop in vhost_get_log_size() increases the log size actually... and
when platform_iommu=on, vq->used_phys is a GIOVA that my other patches
you've already merged in the kernel explicitly ignore when it comes to
the dirty log... So it really seems that this loop is useless for the
general case. If it is there to address some corner case, I guess it
deserves a comment at the very least.

Cheers,

--
Greg

> This way we wanted to support e.g. migration by vhost writing into

> qemu address space, qemu copying data to guest memory.

> 

> 

> 

> 

> > > > ---

> > > >  hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------

> > > >  1 file changed, 24 insertions(+), 14 deletions(-)

> > > > 

> > > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> > > > index 1a1384e7a642..0b83d6b8e65e 100644

> > > > --- a/hw/virtio/vhost.c

> > > > +++ b/hw/virtio/vhost.c

> > > > @@ -106,6 +106,20 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,

> > > >      }

> > > >  }

> > > >  

> > > > +static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > > +{

> > > > +    VirtIODevice *vdev = dev->vdev;

> > > > +

> > > > +    /*

> > > > +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > > +     * incremental memory mapping API via IOTLB API. For platform that

> > > > +     * does not have IOMMU, there's no need to enable this feature

> > > > +     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > > +     */

> > > > +    return vdev->dma_as != &address_space_memory &&

> > > > +           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > > +}

> > > > +

> > > >  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > > >                                     MemoryRegionSection *section,

> > > >                                     hwaddr first,

> > > > @@ -130,6 +144,11 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > > >                                range_get_last(reg->guest_phys_addr,

> > > >                                               reg->memory_size));

> > > >      }

> > > > +

> > > > +    if (vhost_dev_has_iommu(dev)) {

> > > > +        return 0;

> > > > +    }

> > > > +

> > > >      for (i = 0; i < dev->nvqs; ++i) {

> > > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > > >  

> > > > @@ -172,6 +191,11 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)

> > > >                                         reg->memory_size);

> > > >          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

> > > >      }

> > > > +

> > > > +    if (vhost_dev_has_iommu(dev)) {

> > > > +        return log_size;

> > > > +    }

> > > > +

> > > >      for (i = 0; i < dev->nvqs; ++i) {

> > > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > > >  

> > > > @@ -287,20 +311,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

> > > >      dev->log_size = size;

> > > >  }

> > > >  

> > > > -static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > > -{

> > > > -    VirtIODevice *vdev = dev->vdev;

> > > > -

> > > > -    /*

> > > > -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > > -     * incremental memory mapping API via IOTLB API. For platform that

> > > > -     * does not have IOMMU, there's no need to enable this feature

> > > > -     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > > -     */

> > > > -    return vdev->dma_as != &address_space_memory &&

> > > > -           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > > -}

> > > > -

> > > >  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,

> > > >                                hwaddr *plen, bool is_write)

> > > >  {

> > > > 

> > > > 

> > > 

> > 

> 

>

Michael S. Tsirkin Oct. 6, 2020, 10:49 a.m. UTC | #6

On Tue, Oct 06, 2020 at 11:58:50AM +0200, Greg Kurz wrote:
> On Mon, 5 Oct 2020 10:18:03 -0400

> "Michael S. Tsirkin" <mst@redhat.com> wrote:

> 

> > On Mon, Sep 28, 2020 at 09:37:18AM +0200, Greg Kurz wrote:

> > > On Mon, 28 Sep 2020 16:23:43 +1000

> > > David Gibson <david@gibson.dropbear.id.au> wrote:

> > > 

> > > > On Fri, Sep 25, 2020 at 07:29:43PM +0200, Greg Kurz wrote:

> > > > > When a vIOMMU is present, any address comming from the guest is an IO

> > > > > virtual address, including those of the vrings. The backend's accesses

> > > > > to the vrings happen through vIOMMU translation : the backend hence

> > > > > only logs the final guest physical address, not the IO virtual one.

> > > > > It thus doesn't make sense to make room for the vring addresses in the

> > > > > dirty log in this case.

> > > > > 

> > > > > This fixes a crash of the source when migrating a guest using in-kernel

> > > > > vhost-net and iommu_platform=on on POWER, because DMA regions are put

> > > > > at very high addresses and the resulting log size is likely to cause

> > > > > g_malloc0() to abort.

> > > > > 

> > > > > BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349

> > > > > Signed-off-by: Greg Kurz <groug@kaod.org>

> > > > 

> > > > I'm a little confused as to what's going on here.  Obviously

> > > > allocating dirty bitmaps in IOVA space doesn't make much sense.

> > > > But.. in all cases isn't the ring ending up in guest memory, whether

> > > > translated or not.  So why do specific addresses of the ring make a

> > > > difference in *any* case.

> > > > 

> > > 

> > > I admit I'm a bit surprised as well... I can't think of a scenario

> > > where the address of the used ring would be higher than the guest

> > > memory... Maybe MST can shed some light here ?

> > 

> > So the original idea was that vring itself is specified in

> > terms of HVA as opposed to rest of stuff which is specified

> > in terms of GPA.

> 

> The vring itself is indeed described in term of HVA (vq->used) but

> when it comes to the dirty log, QEMU passes the GPA of the used

> structure to the vhost backend:

> 

> >From vhost_virtqueue_set_addr():

> 

>     addr.log_guest_addr = vq->used_phys;

>                               ^^ GPA ^^

>     addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;

>     r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);


Right. The point being we either use the log bitmap or the ring aliasing
trick to track memory changes, not both. If we used the ring aliasing
trick then presumably VHOST_VRING_F_LOG would be clear and then
log_guest_addr is unused.

> And the sizing of the bitmap computed in vhost_get_log_size() is

> also based on this GPA:

> 

>     for (i = 0; i < dev->nvqs; ++i) {

>         struct vhost_virtqueue *vq = dev->vqs + i;

> 

>         if (!vq->used_phys && !vq->used_size) {

>             continue;

>         }

> 

>         uint64_t last = vq->used_phys + vq->used_size - 1;

>                             ^^ GPA ^^

>         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

>     }

>

> With platform_iommu=off, I couldn't find a case where this second

> loop in vhost_get_log_size() increases the log size actually... and

> when platform_iommu=on, vq->used_phys is a GIOVA that my other patches

> you've already merged in the kernel explicitly ignore when it comes to

> the dirty log... So it really seems that this loop is useless for the

> general case. If it is there to address some corner case, I guess it

> deserves a comment at the very least.

> 

> Cheers,

> 

> --

> Greg

> 

> > This way we wanted to support e.g. migration by vhost writing into

> > qemu address space, qemu copying data to guest memory.

> > 

> > 

> > 

> > 

> > > > > ---

> > > > >  hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------

> > > > >  1 file changed, 24 insertions(+), 14 deletions(-)

> > > > > 

> > > > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> > > > > index 1a1384e7a642..0b83d6b8e65e 100644

> > > > > --- a/hw/virtio/vhost.c

> > > > > +++ b/hw/virtio/vhost.c

> > > > > @@ -106,6 +106,20 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,

> > > > >      }

> > > > >  }

> > > > >  

> > > > > +static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > > > +{

> > > > > +    VirtIODevice *vdev = dev->vdev;

> > > > > +

> > > > > +    /*

> > > > > +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > > > +     * incremental memory mapping API via IOTLB API. For platform that

> > > > > +     * does not have IOMMU, there's no need to enable this feature

> > > > > +     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > > > +     */

> > > > > +    return vdev->dma_as != &address_space_memory &&

> > > > > +           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > > > +}

> > > > > +

> > > > >  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > > > >                                     MemoryRegionSection *section,

> > > > >                                     hwaddr first,

> > > > > @@ -130,6 +144,11 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > > > >                                range_get_last(reg->guest_phys_addr,

> > > > >                                               reg->memory_size));

> > > > >      }

> > > > > +

> > > > > +    if (vhost_dev_has_iommu(dev)) {

> > > > > +        return 0;

> > > > > +    }

> > > > > +

> > > > >      for (i = 0; i < dev->nvqs; ++i) {

> > > > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > > > >  

> > > > > @@ -172,6 +191,11 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)

> > > > >                                         reg->memory_size);

> > > > >          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

> > > > >      }

> > > > > +

> > > > > +    if (vhost_dev_has_iommu(dev)) {

> > > > > +        return log_size;

> > > > > +    }

> > > > > +

> > > > >      for (i = 0; i < dev->nvqs; ++i) {

> > > > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > > > >  

> > > > > @@ -287,20 +311,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

> > > > >      dev->log_size = size;

> > > > >  }

> > > > >  

> > > > > -static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > > > -{

> > > > > -    VirtIODevice *vdev = dev->vdev;

> > > > > -

> > > > > -    /*

> > > > > -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > > > -     * incremental memory mapping API via IOTLB API. For platform that

> > > > > -     * does not have IOMMU, there's no need to enable this feature

> > > > > -     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > > > -     */

> > > > > -    return vdev->dma_as != &address_space_memory &&

> > > > > -           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > > > -}

> > > > > -

> > > > >  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,

> > > > >                                hwaddr *plen, bool is_write)

> > > > >  {

> > > > > 

> > > > > 

> > > > 

> > > 

> > 

> >

Greg Kurz Oct. 6, 2020, 3:30 p.m. UTC | #7

On Tue, 6 Oct 2020 06:49:28 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Oct 06, 2020 at 11:58:50AM +0200, Greg Kurz wrote:

> > On Mon, 5 Oct 2020 10:18:03 -0400

> > "Michael S. Tsirkin" <mst@redhat.com> wrote:

> > 

> > > On Mon, Sep 28, 2020 at 09:37:18AM +0200, Greg Kurz wrote:

> > > > On Mon, 28 Sep 2020 16:23:43 +1000

> > > > David Gibson <david@gibson.dropbear.id.au> wrote:

> > > > 

> > > > > On Fri, Sep 25, 2020 at 07:29:43PM +0200, Greg Kurz wrote:

> > > > > > When a vIOMMU is present, any address comming from the guest is an IO

> > > > > > virtual address, including those of the vrings. The backend's accesses

> > > > > > to the vrings happen through vIOMMU translation : the backend hence

> > > > > > only logs the final guest physical address, not the IO virtual one.

> > > > > > It thus doesn't make sense to make room for the vring addresses in the

> > > > > > dirty log in this case.

> > > > > > 

> > > > > > This fixes a crash of the source when migrating a guest using in-kernel

> > > > > > vhost-net and iommu_platform=on on POWER, because DMA regions are put

> > > > > > at very high addresses and the resulting log size is likely to cause

> > > > > > g_malloc0() to abort.

> > > > > > 

> > > > > > BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1879349

> > > > > > Signed-off-by: Greg Kurz <groug@kaod.org>

> > > > > 

> > > > > I'm a little confused as to what's going on here.  Obviously

> > > > > allocating dirty bitmaps in IOVA space doesn't make much sense.

> > > > > But.. in all cases isn't the ring ending up in guest memory, whether

> > > > > translated or not.  So why do specific addresses of the ring make a

> > > > > difference in *any* case.

> > > > > 

> > > > 

> > > > I admit I'm a bit surprised as well... I can't think of a scenario

> > > > where the address of the used ring would be higher than the guest

> > > > memory... Maybe MST can shed some light here ?

> > > 

> > > So the original idea was that vring itself is specified in

> > > terms of HVA as opposed to rest of stuff which is specified

> > > in terms of GPA.

> > 

> > The vring itself is indeed described in term of HVA (vq->used) but

> > when it comes to the dirty log, QEMU passes the GPA of the used

> > structure to the vhost backend:

> > 

> > >From vhost_virtqueue_set_addr():

> > 

> >     addr.log_guest_addr = vq->used_phys;

> >                               ^^ GPA ^^

> >     addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;

> >     r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);

> 

> Right. The point being we either use the log bitmap or the ring aliasing


Not sure what the "ring aliasing trick" is referring too...

> trick to track memory changes, not both. If we used the ring aliasing

> trick then presumably VHOST_VRING_F_LOG would be clear and then

> log_guest_addr is unused.

> 


Ok, so IIUC this means that vhost_get_log_size() never needs to
special case vq->used_phys (ie. log_guest_addr) since it is a
GPA (ie. <= guest RAM size and thus already covered by the first
loop), right ?

> > And the sizing of the bitmap computed in vhost_get_log_size() is

> > also based on this GPA:

> > 

> >     for (i = 0; i < dev->nvqs; ++i) {

> >         struct vhost_virtqueue *vq = dev->vqs + i;

> > 

> >         if (!vq->used_phys && !vq->used_size) {

> >             continue;

> >         }

> > 

> >         uint64_t last = vq->used_phys + vq->used_size - 1;

> >                             ^^ GPA ^^

> >         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

> >     }

> >

> > With platform_iommu=off, I couldn't find a case where this second

> > loop in vhost_get_log_size() increases the log size actually... and

> > when platform_iommu=on, vq->used_phys is a GIOVA that my other patches

> > you've already merged in the kernel explicitly ignore when it comes to

> > the dirty log... So it really seems that this loop is useless for the

> > general case. If it is there to address some corner case, I guess it

> > deserves a comment at the very least.

> > 

> > Cheers,

> > 

> > --

> > Greg

> > 

> > > This way we wanted to support e.g. migration by vhost writing into

> > > qemu address space, qemu copying data to guest memory.

> > > 

> > > 

> > > 

> > > 

> > > > > > ---

> > > > > >  hw/virtio/vhost.c |   38 ++++++++++++++++++++++++--------------

> > > > > >  1 file changed, 24 insertions(+), 14 deletions(-)

> > > > > > 

> > > > > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> > > > > > index 1a1384e7a642..0b83d6b8e65e 100644

> > > > > > --- a/hw/virtio/vhost.c

> > > > > > +++ b/hw/virtio/vhost.c

> > > > > > @@ -106,6 +106,20 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,

> > > > > >      }

> > > > > >  }

> > > > > >  

> > > > > > +static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > > > > +{

> > > > > > +    VirtIODevice *vdev = dev->vdev;

> > > > > > +

> > > > > > +    /*

> > > > > > +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > > > > +     * incremental memory mapping API via IOTLB API. For platform that

> > > > > > +     * does not have IOMMU, there's no need to enable this feature

> > > > > > +     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > > > > +     */

> > > > > > +    return vdev->dma_as != &address_space_memory &&

> > > > > > +           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > > > > +}

> > > > > > +

> > > > > >  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > > > > >                                     MemoryRegionSection *section,

> > > > > >                                     hwaddr first,

> > > > > > @@ -130,6 +144,11 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,

> > > > > >                                range_get_last(reg->guest_phys_addr,

> > > > > >                                               reg->memory_size));

> > > > > >      }

> > > > > > +

> > > > > > +    if (vhost_dev_has_iommu(dev)) {

> > > > > > +        return 0;

> > > > > > +    }

> > > > > > +

> > > > > >      for (i = 0; i < dev->nvqs; ++i) {

> > > > > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > > > > >  

> > > > > > @@ -172,6 +191,11 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)

> > > > > >                                         reg->memory_size);

> > > > > >          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);

> > > > > >      }

> > > > > > +

> > > > > > +    if (vhost_dev_has_iommu(dev)) {

> > > > > > +        return log_size;

> > > > > > +    }

> > > > > > +

> > > > > >      for (i = 0; i < dev->nvqs; ++i) {

> > > > > >          struct vhost_virtqueue *vq = dev->vqs + i;

> > > > > >  

> > > > > > @@ -287,20 +311,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)

> > > > > >      dev->log_size = size;

> > > > > >  }

> > > > > >  

> > > > > > -static int vhost_dev_has_iommu(struct vhost_dev *dev)

> > > > > > -{

> > > > > > -    VirtIODevice *vdev = dev->vdev;

> > > > > > -

> > > > > > -    /*

> > > > > > -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support

> > > > > > -     * incremental memory mapping API via IOTLB API. For platform that

> > > > > > -     * does not have IOMMU, there's no need to enable this feature

> > > > > > -     * which may cause unnecessary IOTLB miss/update trnasactions.

> > > > > > -     */

> > > > > > -    return vdev->dma_as != &address_space_memory &&

> > > > > > -           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);

> > > > > > -}

> > > > > > -

> > > > > >  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,

> > > > > >                                hwaddr *plen, bool is_write)

> > > > > >  {

> > > > > > 

> > > > > > 

> > > > > 

> > > > 

> > > 

> > > 

>

vhost: Ignore vrings in dirty log when using a vIOMMU

Commit Message

Comments

Patch