diff mbox series

[v2] hw/virtio/vhost: re-factor vhost-section and allow DIRTY_MEMORY_CODE

Message ID 20200604134022.10564-1-alex.bennee@linaro.org
State Superseded
Headers show
Series [v2] hw/virtio/vhost: re-factor vhost-section and allow DIRTY_MEMORY_CODE | expand

Commit Message

Alex Bennée June 4, 2020, 1:40 p.m. UTC
The purpose of vhost_section is to identify RAM regions that need to
be made available to a vhost client. However when running under TCG
all RAM sections have DIRTY_MEMORY_CODE set which leads to problems
down the line.

Re-factor the code so:

  - steps are clearer to follow
  - reason for rejection is recorded in the trace point
  - we allow DIRTY_MEMORY_CODE when TCG is enabled

We expand the comment to explain that kernel based vhost has specific
support for migration tracking.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>

---
v2
  - drop enum, add trace_vhost_reject_section
  - return false at any fail point
  - unconditionally add DIRTY_MEMORY_CODE to handled cases
  - slightly re-word the explanatory comment and commit message
---
 hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------
 hw/virtio/trace-events |  3 ++-
 2 files changed, 41 insertions(+), 17 deletions(-)

-- 
2.20.1

Comments

Fabiano Rosas June 4, 2020, 3:32 p.m. UTC | #1
Alex Bennée <alex.bennee@linaro.org> writes:

> The purpose of vhost_section is to identify RAM regions that need to

> be made available to a vhost client. However when running under TCG

> all RAM sections have DIRTY_MEMORY_CODE set which leads to problems

> down the line.

>

> Re-factor the code so:

>

>   - steps are clearer to follow

>   - reason for rejection is recorded in the trace point

>   - we allow DIRTY_MEMORY_CODE when TCG is enabled

>

> We expand the comment to explain that kernel based vhost has specific

> support for migration tracking.

>


Hi, I tested this with virtio-fs in x86_64 and ppc64le. Both TCG and KVM
(just in case).

With this patch the call to vhost_set_mem_table succeeds and the device
is set up properly:

# mount -t virtiofs myfs /mnt
# ls /mnt      <--- previously this would hang in TCG
a  b

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Michael S. Tsirkin <mst@redhat.com>

> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

> Cc: Stefan Hajnoczi <stefanha@redhat.com>

>


Tested-by: Fabiano Rosas <farosas@linux.ibm.com>


> ---

> v2

>   - drop enum, add trace_vhost_reject_section

>   - return false at any fail point

>   - unconditionally add DIRTY_MEMORY_CODE to handled cases

>   - slightly re-word the explanatory comment and commit message

> ---

>  hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------

>  hw/virtio/trace-events |  3 ++-

>  2 files changed, 41 insertions(+), 17 deletions(-)

>

> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> index aff98a0ede5..120c0cc747b 100644

> --- a/hw/virtio/vhost.c

> +++ b/hw/virtio/vhost.c

> @@ -27,6 +27,7 @@

>  #include "migration/blocker.h"

>  #include "migration/qemu-file-types.h"

>  #include "sysemu/dma.h"

> +#include "sysemu/tcg.h"

>  #include "trace.h"

>  

>  /* enabled until disconnected backend stabilizes */

> @@ -403,26 +404,48 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,

>      return r;

>  }

>  

> +/*

> + * vhost_section: identify sections needed for vhost access

> + *

> + * We only care about RAM sections here (where virtqueue can live). If

> + * we find one we still allow the backend to potentially filter it out

> + * of our list.

> + */

>  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)

>  {

> -    bool result;

> -    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &

> -                     ~(1 << DIRTY_MEMORY_MIGRATION);

> -    result = memory_region_is_ram(section->mr) &&

> -        !memory_region_is_rom(section->mr);

> -

> -    /* Vhost doesn't handle any block which is doing dirty-tracking other

> -     * than migration; this typically fires on VGA areas.

> -     */

> -    result &= !log_dirty;

> +    MemoryRegion *mr = section->mr;

> +

> +    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {

> +        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);

> +        uint8_t handled_dirty;

> +

> +        /*

> +         * Kernel based vhost doesn't handle any block which is doing

> +         * dirty-tracking other than migration for which it has

> +         * specific logging support. However for TCG the kernel never

> +         * gets involved anyway so we can also ignore it's

> +         * self-modiying code detection flags.

> +         */

> +        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);

> +        handled_dirty |= (1 << DIRTY_MEMORY_CODE);

>  

> -    if (result && dev->vhost_ops->vhost_backend_mem_section_filter) {

> -        result &=

> -            dev->vhost_ops->vhost_backend_mem_section_filter(dev, section);

> -    }

> +        if (dirty_mask & ~handled_dirty) {

> +            trace_vhost_reject_section(mr->name, 1);

> +            return false;

> +        }

> +

> +        if (dev->vhost_ops->vhost_backend_mem_section_filter &&

> +            !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {

> +            trace_vhost_reject_section(mr->name, 2);

> +            return false;

> +        }

>  

> -    trace_vhost_section(section->mr->name, result);

> -    return result;

> +        trace_vhost_section(mr->name);

> +        return true;

> +    } else {

> +        trace_vhost_reject_section(mr->name, 3);

> +        return false;

> +    }

>  }

>  

>  static void vhost_begin(MemoryListener *listener)

> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events

> index 84ecb85d445..22427126b97 100644

> --- a/hw/virtio/trace-events

> +++ b/hw/virtio/trace-events

> @@ -5,7 +5,8 @@ vhost_commit(bool started, bool changed) "Started: %d Changed: %d"

>  vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64

>  vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64

>  vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64

> -vhost_section(const char *name, int r) "%s:%d"

> +vhost_section(const char *name) "%s"

> +vhost_reject_section(const char *name, int d) "%s:%d"

>  vhost_iotlb_miss(void *dev, int step) "%p step %d"

>  

>  # vhost-user.c
Michael S. Tsirkin June 4, 2020, 4:51 p.m. UTC | #2
On Thu, Jun 04, 2020 at 02:40:22PM +0100, Alex Bennée wrote:
> The purpose of vhost_section is to identify RAM regions that need to

> be made available to a vhost client. However when running under TCG

> all RAM sections have DIRTY_MEMORY_CODE set which leads to problems

> down the line.

> 

> Re-factor the code so:

> 

>   - steps are clearer to follow

>   - reason for rejection is recorded in the trace point

>   - we allow DIRTY_MEMORY_CODE when TCG is enabled

> 

> We expand the comment to explain that kernel based vhost has specific

> support for migration tracking.

> 

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Michael S. Tsirkin <mst@redhat.com>

> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

> Cc: Stefan Hajnoczi <stefanha@redhat.com>

> 

> ---

> v2

>   - drop enum, add trace_vhost_reject_section

>   - return false at any fail point

>   - unconditionally add DIRTY_MEMORY_CODE to handled cases

>   - slightly re-word the explanatory comment and commit message

> ---

>  hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------

>  hw/virtio/trace-events |  3 ++-

>  2 files changed, 41 insertions(+), 17 deletions(-)

> 

> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> index aff98a0ede5..120c0cc747b 100644

> --- a/hw/virtio/vhost.c

> +++ b/hw/virtio/vhost.c

> @@ -27,6 +27,7 @@

>  #include "migration/blocker.h"

>  #include "migration/qemu-file-types.h"

>  #include "sysemu/dma.h"

> +#include "sysemu/tcg.h"

>  #include "trace.h"

>  

>  /* enabled until disconnected backend stabilizes */

> @@ -403,26 +404,48 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,

>      return r;

>  }

>  

> +/*

> + * vhost_section: identify sections needed for vhost access

> + *

> + * We only care about RAM sections here (where virtqueue can live). If

> + * we find one we still allow the backend to potentially filter it out

> + * of our list.

> + */

>  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)

>  {

> -    bool result;

> -    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &

> -                     ~(1 << DIRTY_MEMORY_MIGRATION);

> -    result = memory_region_is_ram(section->mr) &&

> -        !memory_region_is_rom(section->mr);

> -

> -    /* Vhost doesn't handle any block which is doing dirty-tracking other

> -     * than migration; this typically fires on VGA areas.

> -     */

> -    result &= !log_dirty;

> +    MemoryRegion *mr = section->mr;

> +

> +    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {

> +        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);

> +        uint8_t handled_dirty;

> +

> +        /*

> +         * Kernel based vhost doesn't handle any block which is doing

> +         * dirty-tracking other than migration for which it has

> +         * specific logging support. However for TCG the kernel never

> +         * gets involved anyway so we can also ignore it's

> +         * self-modiying code detection flags.

> +         */

> +        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);

> +        handled_dirty |= (1 << DIRTY_MEMORY_CODE);


I'd just rewrite it in a single statement:

         handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
                         (1 << DIRTY_MEMORY_CODE);


>  

> -    if (result && dev->vhost_ops->vhost_backend_mem_section_filter) {

> -        result &=

> -            dev->vhost_ops->vhost_backend_mem_section_filter(dev, section);

> -    }

> +        if (dirty_mask & ~handled_dirty) {

> +            trace_vhost_reject_section(mr->name, 1);

> +            return false;

> +        }

> +

> +        if (dev->vhost_ops->vhost_backend_mem_section_filter &&

> +            !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {

> +            trace_vhost_reject_section(mr->name, 2);

> +            return false;

> +        }

>  

> -    trace_vhost_section(section->mr->name, result);

> -    return result;

> +        trace_vhost_section(mr->name);

> +        return true;

> +    } else {

> +        trace_vhost_reject_section(mr->name, 3);

> +        return false;

> +    }

>  }

>  

>  static void vhost_begin(MemoryListener *listener)

> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events

> index 84ecb85d445..22427126b97 100644

> --- a/hw/virtio/trace-events

> +++ b/hw/virtio/trace-events

> @@ -5,7 +5,8 @@ vhost_commit(bool started, bool changed) "Started: %d Changed: %d"

>  vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64

>  vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64

>  vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64

> -vhost_section(const char *name, int r) "%s:%d"

> +vhost_section(const char *name) "%s"

> +vhost_reject_section(const char *name, int d) "%s:%d"

>  vhost_iotlb_miss(void *dev, int step) "%p step %d"


Looks good otherwise, thanks!


>  # vhost-user.c

> -- 

> 2.20.1
Stefan Hajnoczi June 5, 2020, 9:03 a.m. UTC | #3
On Thu, Jun 04, 2020 at 02:40:22PM +0100, Alex Bennée wrote:
> The purpose of vhost_section is to identify RAM regions that need to

> be made available to a vhost client. However when running under TCG

> all RAM sections have DIRTY_MEMORY_CODE set which leads to problems

> down the line.

> 

> Re-factor the code so:

> 

>   - steps are clearer to follow

>   - reason for rejection is recorded in the trace point

>   - we allow DIRTY_MEMORY_CODE when TCG is enabled

> 

> We expand the comment to explain that kernel based vhost has specific

> support for migration tracking.

> 

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Cc: Michael S. Tsirkin <mst@redhat.com>

> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

> Cc: Stefan Hajnoczi <stefanha@redhat.com>

> 

> ---

> v2

>   - drop enum, add trace_vhost_reject_section

>   - return false at any fail point

>   - unconditionally add DIRTY_MEMORY_CODE to handled cases

>   - slightly re-word the explanatory comment and commit message

> ---

>  hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------

>  hw/virtio/trace-events |  3 ++-

>  2 files changed, 41 insertions(+), 17 deletions(-)

> 

> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> index aff98a0ede5..120c0cc747b 100644

> --- a/hw/virtio/vhost.c

> +++ b/hw/virtio/vhost.c

> @@ -27,6 +27,7 @@

>  #include "migration/blocker.h"

>  #include "migration/qemu-file-types.h"

>  #include "sysemu/dma.h"

> +#include "sysemu/tcg.h"

>  #include "trace.h"

>  

>  /* enabled until disconnected backend stabilizes */

> @@ -403,26 +404,48 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,

>      return r;

>  }

>  

> +/*

> + * vhost_section: identify sections needed for vhost access

> + *

> + * We only care about RAM sections here (where virtqueue can live). If


It's not just the virtqueue. Arbitrary guest RAM buffers can be placed
into the virtqueue so we need to pass all guest RAM to the vhost device
backend.

> + * we find one we still allow the backend to potentially filter it out

> + * of our list.

> + */

>  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)

>  {

> -    bool result;

> -    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &

> -                     ~(1 << DIRTY_MEMORY_MIGRATION);

> -    result = memory_region_is_ram(section->mr) &&

> -        !memory_region_is_rom(section->mr);

> -

> -    /* Vhost doesn't handle any block which is doing dirty-tracking other

> -     * than migration; this typically fires on VGA areas.

> -     */

> -    result &= !log_dirty;

> +    MemoryRegion *mr = section->mr;

> +

> +    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {

> +        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);

> +        uint8_t handled_dirty;

> +

> +        /*

> +         * Kernel based vhost doesn't handle any block which is doing

> +         * dirty-tracking other than migration for which it has

> +         * specific logging support. However for TCG the kernel never

> +         * gets involved anyway so we can also ignore it's

> +         * self-modiying code detection flags.

> +         */

> +        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);

> +        handled_dirty |= (1 << DIRTY_MEMORY_CODE);


Wait, how is vhost going to support TCG self-modifying code detection?

It seems like this change will allow vhost devices to run, but now QEMU
will miss out on self-modifying code. Do we already enable vhost dirty
memory logging for DIRTY_MEMORY_CODE memory somewhere?

Or is there some cross-architectural reason why we can be sure that
allowing the vhost backend to DMA to guest RAM without marking pages
dirty is safe? For example, maybe the CPU needs to explicitly flush the
icache after DMA because this was a DMA operation not a regular
self-modifying code memory store? But is this true across all
architectures?

Stefan
Alex Bennée June 5, 2020, 10:19 a.m. UTC | #4
Stefan Hajnoczi <stefanha@redhat.com> writes:

> On Thu, Jun 04, 2020 at 02:40:22PM +0100, Alex Bennée wrote:

>> The purpose of vhost_section is to identify RAM regions that need to

>> be made available to a vhost client. However when running under TCG

>> all RAM sections have DIRTY_MEMORY_CODE set which leads to problems

>> down the line.

>> 

>> Re-factor the code so:

>> 

>>   - steps are clearer to follow

>>   - reason for rejection is recorded in the trace point

>>   - we allow DIRTY_MEMORY_CODE when TCG is enabled

>> 

>> We expand the comment to explain that kernel based vhost has specific

>> support for migration tracking.

>> 

>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>> Cc: Michael S. Tsirkin <mst@redhat.com>

>> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

>> Cc: Stefan Hajnoczi <stefanha@redhat.com>

>> 

>> ---

>> v2

>>   - drop enum, add trace_vhost_reject_section

>>   - return false at any fail point

>>   - unconditionally add DIRTY_MEMORY_CODE to handled cases

>>   - slightly re-word the explanatory comment and commit message

>> ---

>>  hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------

>>  hw/virtio/trace-events |  3 ++-

>>  2 files changed, 41 insertions(+), 17 deletions(-)

>> 

>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

>> index aff98a0ede5..120c0cc747b 100644

>> --- a/hw/virtio/vhost.c

>> +++ b/hw/virtio/vhost.c

>> @@ -27,6 +27,7 @@

>>  #include "migration/blocker.h"

>>  #include "migration/qemu-file-types.h"

>>  #include "sysemu/dma.h"

>> +#include "sysemu/tcg.h"

>>  #include "trace.h"

>>  

>>  /* enabled until disconnected backend stabilizes */

>> @@ -403,26 +404,48 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,

>>      return r;

>>  }

>>  

>> +/*

>> + * vhost_section: identify sections needed for vhost access

>> + *

>> + * We only care about RAM sections here (where virtqueue can live). If

>

> It's not just the virtqueue. Arbitrary guest RAM buffers can be placed

> into the virtqueue so we need to pass all guest RAM to the vhost device

> backend.

>

>> + * we find one we still allow the backend to potentially filter it out

>> + * of our list.

>> + */

>>  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)

>>  {

>> -    bool result;

>> -    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &

>> -                     ~(1 << DIRTY_MEMORY_MIGRATION);

>> -    result = memory_region_is_ram(section->mr) &&

>> -        !memory_region_is_rom(section->mr);

>> -

>> -    /* Vhost doesn't handle any block which is doing dirty-tracking other

>> -     * than migration; this typically fires on VGA areas.

>> -     */

>> -    result &= !log_dirty;

>> +    MemoryRegion *mr = section->mr;

>> +

>> +    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {

>> +        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);

>> +        uint8_t handled_dirty;

>> +

>> +        /*

>> +         * Kernel based vhost doesn't handle any block which is doing

>> +         * dirty-tracking other than migration for which it has

>> +         * specific logging support. However for TCG the kernel never

>> +         * gets involved anyway so we can also ignore it's

>> +         * self-modiying code detection flags.

>> +         */

>> +        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);

>> +        handled_dirty |= (1 << DIRTY_MEMORY_CODE);

>

> Wait, how is vhost going to support TCG self-modifying code detection?

>

> It seems like this change will allow vhost devices to run, but now QEMU

> will miss out on self-modifying code. Do we already enable vhost dirty

> memory logging for DIRTY_MEMORY_CODE memory somehwere?


Well any guest code running will still trigger the SMC detection. It's
true we currently don't have a mechanism if the vhost-user client
updates an executable page.

> Or is there some cross-architectural reason why we can be sure that

> allowing the vhost backend to DMA to guest RAM without marking pages

> dirty is safe?


Hmm, for devices just updating guest queues we should be fine because
the guest won't look until signalled. If we map executable pages in
virtiofsd will the guest kernel still go through it's mprotect setup
once the pages are DMA'ed?

> For example, maybe the CPU needs to explicitly flush the

> icache after DMA because this was a DMA operation not a regular

> self-modifying code memory store?


We don't care about cache ops in TCG, only page protections and writes to
pages that already have code that has been translated in them.

> But is this true across all

> architectures?

>

> Stefan



-- 
Alex Bennée
Stefan Hajnoczi June 9, 2020, 11:10 a.m. UTC | #5
On Fri, Jun 05, 2020 at 11:19:30AM +0100, Alex Bennée wrote:
> 

> Stefan Hajnoczi <stefanha@redhat.com> writes:

> 

> > On Thu, Jun 04, 2020 at 02:40:22PM +0100, Alex Bennée wrote:

> >> The purpose of vhost_section is to identify RAM regions that need to

> >> be made available to a vhost client. However when running under TCG

> >> all RAM sections have DIRTY_MEMORY_CODE set which leads to problems

> >> down the line.

> >> 

> >> Re-factor the code so:

> >> 

> >>   - steps are clearer to follow

> >>   - reason for rejection is recorded in the trace point

> >>   - we allow DIRTY_MEMORY_CODE when TCG is enabled

> >> 

> >> We expand the comment to explain that kernel based vhost has specific

> >> support for migration tracking.

> >> 

> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> >> Cc: Michael S. Tsirkin <mst@redhat.com>

> >> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

> >> Cc: Stefan Hajnoczi <stefanha@redhat.com>

> >> 

> >> ---

> >> v2

> >>   - drop enum, add trace_vhost_reject_section

> >>   - return false at any fail point

> >>   - unconditionally add DIRTY_MEMORY_CODE to handled cases

> >>   - slightly re-word the explanatory comment and commit message

> >> ---

> >>  hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------

> >>  hw/virtio/trace-events |  3 ++-

> >>  2 files changed, 41 insertions(+), 17 deletions(-)

> >> 

> >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> >> index aff98a0ede5..120c0cc747b 100644

> >> --- a/hw/virtio/vhost.c

> >> +++ b/hw/virtio/vhost.c

> >> @@ -27,6 +27,7 @@

> >>  #include "migration/blocker.h"

> >>  #include "migration/qemu-file-types.h"

> >>  #include "sysemu/dma.h"

> >> +#include "sysemu/tcg.h"

> >>  #include "trace.h"

> >>  

> >>  /* enabled until disconnected backend stabilizes */

> >> @@ -403,26 +404,48 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,

> >>      return r;

> >>  }

> >>  

> >> +/*

> >> + * vhost_section: identify sections needed for vhost access

> >> + *

> >> + * We only care about RAM sections here (where virtqueue can live). If

> >

> > It's not just the virtqueue. Arbitrary guest RAM buffers can be placed

> > into the virtqueue so we need to pass all guest RAM to the vhost device

> > backend.

> >

> >> + * we find one we still allow the backend to potentially filter it out

> >> + * of our list.

> >> + */

> >>  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)

> >>  {

> >> -    bool result;

> >> -    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &

> >> -                     ~(1 << DIRTY_MEMORY_MIGRATION);

> >> -    result = memory_region_is_ram(section->mr) &&

> >> -        !memory_region_is_rom(section->mr);

> >> -

> >> -    /* Vhost doesn't handle any block which is doing dirty-tracking other

> >> -     * than migration; this typically fires on VGA areas.

> >> -     */

> >> -    result &= !log_dirty;

> >> +    MemoryRegion *mr = section->mr;

> >> +

> >> +    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {

> >> +        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);

> >> +        uint8_t handled_dirty;

> >> +

> >> +        /*

> >> +         * Kernel based vhost doesn't handle any block which is doing

> >> +         * dirty-tracking other than migration for which it has

> >> +         * specific logging support. However for TCG the kernel never

> >> +         * gets involved anyway so we can also ignore it's

> >> +         * self-modiying code detection flags.

> >> +         */

> >> +        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);

> >> +        handled_dirty |= (1 << DIRTY_MEMORY_CODE);

> >

> > Wait, how is vhost going to support TCG self-modifying code detection?

> >

> > It seems like this change will allow vhost devices to run, but now QEMU

> > will miss out on self-modifying code. Do we already enable vhost dirty

> > memory logging for DIRTY_MEMORY_CODE memory somehwere?

> 

> Well any guest code running will still trigger the SMC detection. It's

> true we currently don't have a mechanism if the vhost-user client

> updates an executable page.


Seems like a problem. If it didn't matter we could get rid of
DIRTY_MEMORY_CODE entirely.

If an exception is being made here because I/O devices aren't expected
to trigger SMC in real-world guests, please document it.

Stefan
Alex Bennée June 9, 2020, 11:18 a.m. UTC | #6
Stefan Hajnoczi <stefanha@redhat.com> writes:

> On Fri, Jun 05, 2020 at 11:19:30AM +0100, Alex Bennée wrote:

>> 

>> Stefan Hajnoczi <stefanha@redhat.com> writes:

>> 

>> > On Thu, Jun 04, 2020 at 02:40:22PM +0100, Alex Bennée wrote:

>> >> The purpose of vhost_section is to identify RAM regions that need to

>> >> be made available to a vhost client. However when running under TCG

>> >> all RAM sections have DIRTY_MEMORY_CODE set which leads to problems

>> >> down the line.

>> >> 

>> >> Re-factor the code so:

>> >> 

>> >>   - steps are clearer to follow

>> >>   - reason for rejection is recorded in the trace point

>> >>   - we allow DIRTY_MEMORY_CODE when TCG is enabled

>> >> 

>> >> We expand the comment to explain that kernel based vhost has specific

>> >> support for migration tracking.

>> >> 

>> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>> >> Cc: Michael S. Tsirkin <mst@redhat.com>

>> >> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

>> >> Cc: Stefan Hajnoczi <stefanha@redhat.com>

>> >> 

>> >> ---

>> >> v2

>> >>   - drop enum, add trace_vhost_reject_section

>> >>   - return false at any fail point

>> >>   - unconditionally add DIRTY_MEMORY_CODE to handled cases

>> >>   - slightly re-word the explanatory comment and commit message

>> >> ---

>> >>  hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------

>> >>  hw/virtio/trace-events |  3 ++-

>> >>  2 files changed, 41 insertions(+), 17 deletions(-)

>> >> 

>> >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

>> >> index aff98a0ede5..120c0cc747b 100644

>> >> --- a/hw/virtio/vhost.c

>> >> +++ b/hw/virtio/vhost.c

>> >> @@ -27,6 +27,7 @@

>> >>  #include "migration/blocker.h"

>> >>  #include "migration/qemu-file-types.h"

>> >>  #include "sysemu/dma.h"

>> >> +#include "sysemu/tcg.h"

>> >>  #include "trace.h"

>> >>  

>> >>  /* enabled until disconnected backend stabilizes */

>> >> @@ -403,26 +404,48 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,

>> >>      return r;

>> >>  }

>> >>  

>> >> +/*

>> >> + * vhost_section: identify sections needed for vhost access

>> >> + *

>> >> + * We only care about RAM sections here (where virtqueue can live). If

>> >

>> > It's not just the virtqueue. Arbitrary guest RAM buffers can be placed

>> > into the virtqueue so we need to pass all guest RAM to the vhost device

>> > backend.

>> >

>> >> + * we find one we still allow the backend to potentially filter it out

>> >> + * of our list.

>> >> + */

>> >>  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)

>> >>  {

>> >> -    bool result;

>> >> -    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &

>> >> -                     ~(1 << DIRTY_MEMORY_MIGRATION);

>> >> -    result = memory_region_is_ram(section->mr) &&

>> >> -        !memory_region_is_rom(section->mr);

>> >> -

>> >> -    /* Vhost doesn't handle any block which is doing dirty-tracking other

>> >> -     * than migration; this typically fires on VGA areas.

>> >> -     */

>> >> -    result &= !log_dirty;

>> >> +    MemoryRegion *mr = section->mr;

>> >> +

>> >> +    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {

>> >> +        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);

>> >> +        uint8_t handled_dirty;

>> >> +

>> >> +        /*

>> >> +         * Kernel based vhost doesn't handle any block which is doing

>> >> +         * dirty-tracking other than migration for which it has

>> >> +         * specific logging support. However for TCG the kernel never

>> >> +         * gets involved anyway so we can also ignore it's

>> >> +         * self-modiying code detection flags.

>> >> +         */

>> >> +        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);

>> >> +        handled_dirty |= (1 << DIRTY_MEMORY_CODE);

>> >

>> > Wait, how is vhost going to support TCG self-modifying code detection?

>> >

>> > It seems like this change will allow vhost devices to run, but now QEMU

>> > will miss out on self-modifying code. Do we already enable vhost dirty

>> > memory logging for DIRTY_MEMORY_CODE memory somehwere?

>> 

>> Well any guest code running will still trigger the SMC detection. It's

>> true we currently don't have a mechanism if the vhost-user client

>> updates an executable page.

>

> Seems like a problem. If it didn't matter we could get rid of

> DIRTY_MEMORY_CODE entirely.

>

> If an exception is being made here because I/O devices aren't expected

> to trigger SMC in real-world guests, please document it.


In the comment here or somewhere in the docs?

-- 
Alex Bennée
Stefan Hajnoczi June 17, 2020, 11:38 a.m. UTC | #7
On Tue, Jun 09, 2020 at 12:18:07PM +0100, Alex Bennée wrote:
> 

> Stefan Hajnoczi <stefanha@redhat.com> writes:

> 

> > On Fri, Jun 05, 2020 at 11:19:30AM +0100, Alex Bennée wrote:

> >> 

> >> Stefan Hajnoczi <stefanha@redhat.com> writes:

> >> 

> >> > On Thu, Jun 04, 2020 at 02:40:22PM +0100, Alex Bennée wrote:

> >> >> The purpose of vhost_section is to identify RAM regions that need to

> >> >> be made available to a vhost client. However when running under TCG

> >> >> all RAM sections have DIRTY_MEMORY_CODE set which leads to problems

> >> >> down the line.

> >> >> 

> >> >> Re-factor the code so:

> >> >> 

> >> >>   - steps are clearer to follow

> >> >>   - reason for rejection is recorded in the trace point

> >> >>   - we allow DIRTY_MEMORY_CODE when TCG is enabled

> >> >> 

> >> >> We expand the comment to explain that kernel based vhost has specific

> >> >> support for migration tracking.

> >> >> 

> >> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> >> >> Cc: Michael S. Tsirkin <mst@redhat.com>

> >> >> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>

> >> >> Cc: Stefan Hajnoczi <stefanha@redhat.com>

> >> >> 

> >> >> ---

> >> >> v2

> >> >>   - drop enum, add trace_vhost_reject_section

> >> >>   - return false at any fail point

> >> >>   - unconditionally add DIRTY_MEMORY_CODE to handled cases

> >> >>   - slightly re-word the explanatory comment and commit message

> >> >> ---

> >> >>  hw/virtio/vhost.c      | 55 ++++++++++++++++++++++++++++++------------

> >> >>  hw/virtio/trace-events |  3 ++-

> >> >>  2 files changed, 41 insertions(+), 17 deletions(-)

> >> >> 

> >> >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c

> >> >> index aff98a0ede5..120c0cc747b 100644

> >> >> --- a/hw/virtio/vhost.c

> >> >> +++ b/hw/virtio/vhost.c

> >> >> @@ -27,6 +27,7 @@

> >> >>  #include "migration/blocker.h"

> >> >>  #include "migration/qemu-file-types.h"

> >> >>  #include "sysemu/dma.h"

> >> >> +#include "sysemu/tcg.h"

> >> >>  #include "trace.h"

> >> >>  

> >> >>  /* enabled until disconnected backend stabilizes */

> >> >> @@ -403,26 +404,48 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,

> >> >>      return r;

> >> >>  }

> >> >>  

> >> >> +/*

> >> >> + * vhost_section: identify sections needed for vhost access

> >> >> + *

> >> >> + * We only care about RAM sections here (where virtqueue can live). If

> >> >

> >> > It's not just the virtqueue. Arbitrary guest RAM buffers can be placed

> >> > into the virtqueue so we need to pass all guest RAM to the vhost device

> >> > backend.

> >> >

> >> >> + * we find one we still allow the backend to potentially filter it out

> >> >> + * of our list.

> >> >> + */

> >> >>  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)

> >> >>  {

> >> >> -    bool result;

> >> >> -    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &

> >> >> -                     ~(1 << DIRTY_MEMORY_MIGRATION);

> >> >> -    result = memory_region_is_ram(section->mr) &&

> >> >> -        !memory_region_is_rom(section->mr);

> >> >> -

> >> >> -    /* Vhost doesn't handle any block which is doing dirty-tracking other

> >> >> -     * than migration; this typically fires on VGA areas.

> >> >> -     */

> >> >> -    result &= !log_dirty;

> >> >> +    MemoryRegion *mr = section->mr;

> >> >> +

> >> >> +    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {

> >> >> +        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);

> >> >> +        uint8_t handled_dirty;

> >> >> +

> >> >> +        /*

> >> >> +         * Kernel based vhost doesn't handle any block which is doing

> >> >> +         * dirty-tracking other than migration for which it has

> >> >> +         * specific logging support. However for TCG the kernel never

> >> >> +         * gets involved anyway so we can also ignore it's

> >> >> +         * self-modiying code detection flags.

> >> >> +         */

> >> >> +        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);

> >> >> +        handled_dirty |= (1 << DIRTY_MEMORY_CODE);

> >> >

> >> > Wait, how is vhost going to support TCG self-modifying code detection?

> >> >

> >> > It seems like this change will allow vhost devices to run, but now QEMU

> >> > will miss out on self-modifying code. Do we already enable vhost dirty

> >> > memory logging for DIRTY_MEMORY_CODE memory somehwere?

> >> 

> >> Well any guest code running will still trigger the SMC detection. It's

> >> true we currently don't have a mechanism if the vhost-user client

> >> updates an executable page.

> >

> > Seems like a problem. If it didn't matter we could get rid of

> > DIRTY_MEMORY_CODE entirely.

> >

> > If an exception is being made here because I/O devices aren't expected

> > to trigger SMC in real-world guests, please document it.

> 

> In the comment here or somewhere in the docs?


If it's a user-visible limitation (e.g. DMA from vhost-user devices
bypasses TCG SMC checks and could result in behavior that differs from
built-in virtio device models), then the docs would be a good place.

Stefan
diff mbox series

Patch

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index aff98a0ede5..120c0cc747b 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -27,6 +27,7 @@ 
 #include "migration/blocker.h"
 #include "migration/qemu-file-types.h"
 #include "sysemu/dma.h"
+#include "sysemu/tcg.h"
 #include "trace.h"
 
 /* enabled until disconnected backend stabilizes */
@@ -403,26 +404,48 @@  static int vhost_verify_ring_mappings(struct vhost_dev *dev,
     return r;
 }
 
+/*
+ * vhost_section: identify sections needed for vhost access
+ *
+ * We only care about RAM sections here (where virtqueue can live). If
+ * we find one we still allow the backend to potentially filter it out
+ * of our list.
+ */
 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
 {
-    bool result;
-    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &
-                     ~(1 << DIRTY_MEMORY_MIGRATION);
-    result = memory_region_is_ram(section->mr) &&
-        !memory_region_is_rom(section->mr);
-
-    /* Vhost doesn't handle any block which is doing dirty-tracking other
-     * than migration; this typically fires on VGA areas.
-     */
-    result &= !log_dirty;
+    MemoryRegion *mr = section->mr;
+
+    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
+        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
+        uint8_t handled_dirty;
+
+        /*
+         * Kernel based vhost doesn't handle any block which is doing
+         * dirty-tracking other than migration for which it has
+         * specific logging support. However for TCG the kernel never
+         * gets involved anyway so we can also ignore its
+         * self-modifying code detection flags.
+         */
+        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION);
+        handled_dirty |= (1 << DIRTY_MEMORY_CODE);
 
-    if (result && dev->vhost_ops->vhost_backend_mem_section_filter) {
-        result &=
-            dev->vhost_ops->vhost_backend_mem_section_filter(dev, section);
-    }
+        if (dirty_mask & ~handled_dirty) {
+            trace_vhost_reject_section(mr->name, 1);
+            return false;
+        }
+
+        if (dev->vhost_ops->vhost_backend_mem_section_filter &&
+            !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
+            trace_vhost_reject_section(mr->name, 2);
+            return false;
+        }
 
-    trace_vhost_section(section->mr->name, result);
-    return result;
+        trace_vhost_section(mr->name);
+        return true;
+    } else {
+        trace_vhost_reject_section(mr->name, 3);
+        return false;
+    }
 }
 
 static void vhost_begin(MemoryListener *listener)
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 84ecb85d445..22427126b97 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -5,7 +5,8 @@  vhost_commit(bool started, bool changed) "Started: %d Changed: %d"
 vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
 vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64
 vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
-vhost_section(const char *name, int r) "%s:%d"
+vhost_section(const char *name) "%s"
+vhost_reject_section(const char *name, int d) "%s:%d"
 vhost_iotlb_miss(void *dev, int step) "%p step %d"
 
 # vhost-user.c