diff mbox series

[v1,5/9] hw/virtio: introduce virtio_device_should_start

Message ID 20221108092308.1717426-6-alex.bennee@linaro.org
State New
Headers show
Series test and doc updates | expand

Commit Message

Alex Bennée Nov. 8, 2022, 9:23 a.m. UTC
The previous fix to virtio_device_started revealed a problem in its
use by both the core and the device code. The core code should be able
to handle the device "starting" while the VM isn't running to handle
the restoration of migration state. To solve this dual use introduce a
new helper for use by the vhost-user backends who all use it to feed a
should_start variable.

We can also pick up a change to vhost_user_blk_set_status while we are
at it, since it follows the same pattern.

Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
---
 include/hw/virtio/virtio.h   | 18 ++++++++++++++++++
 hw/block/vhost-user-blk.c    |  6 +-----
 hw/virtio/vhost-user-fs.c    |  2 +-
 hw/virtio/vhost-user-gpio.c  |  2 +-
 hw/virtio/vhost-user-i2c.c   |  2 +-
 hw/virtio/vhost-user-rng.c   |  2 +-
 hw/virtio/vhost-user-vsock.c |  2 +-
 hw/virtio/vhost-vsock.c      |  2 +-
 8 files changed, 25 insertions(+), 11 deletions(-)

Comments

Michael S. Tsirkin Nov. 8, 2022, 9:32 a.m. UTC | #1
On Tue, Nov 08, 2022 at 09:23:04AM +0000, Alex Bennée wrote:
> The previous fix to virtio_device_started revealed a problem in its
> use by both the core and the device code. The core code should be able
> to handle the device "starting" while the VM isn't running to handle
> the restoration of migration state. To solve this dual use introduce a
> new helper for use by the vhost-user backends who all use it to feed a
> should_start variable.
> 
> We can also pick up a change vhost_user_blk_set_status while we are at
> it which follows the same pattern.
> 
> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>

why is this in this patchset?

> ---
>  include/hw/virtio/virtio.h   | 18 ++++++++++++++++++
>  hw/block/vhost-user-blk.c    |  6 +-----
>  hw/virtio/vhost-user-fs.c    |  2 +-
>  hw/virtio/vhost-user-gpio.c  |  2 +-
>  hw/virtio/vhost-user-i2c.c   |  2 +-
>  hw/virtio/vhost-user-rng.c   |  2 +-
>  hw/virtio/vhost-user-vsock.c |  2 +-
>  hw/virtio/vhost-vsock.c      |  2 +-
>  8 files changed, 25 insertions(+), 11 deletions(-)
> 
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index f41b4a7e64..3191c618f3 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -389,6 +389,24 @@ static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
>          return vdev->started;
>      }
>  
> +    return status & VIRTIO_CONFIG_S_DRIVER_OK;
> +}
> +
> +/**
> + * virtio_device_should_start() - check if device startable
> + * @vdev - the VirtIO device
> + * @status - the devices status bits
> + *
> + * This is similar to virtio_device_started() but also encapsulates a
> + * check on the VM status which would prevent a device starting
> + * anyway.
> + */
> +static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
> +{
> +    if (vdev->use_started) {
> +        return vdev->started;
> +    }
> +
>      if (!vdev->vm_running) {
>          return false;
>      }
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index 13bf5cc47a..8feaf12e4e 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -222,14 +222,10 @@ static void vhost_user_blk_stop(VirtIODevice *vdev)
>  static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserBlk *s = VHOST_USER_BLK(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>      Error *local_err = NULL;
>      int ret;
>  
> -    if (!vdev->vm_running) {
> -        should_start = false;
> -    }
> -
>      if (!s->connected) {
>          return;
>      }
> diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
> index ad0f91c607..1c40f42045 100644
> --- a/hw/virtio/vhost-user-fs.c
> +++ b/hw/virtio/vhost-user-fs.c
> @@ -123,7 +123,7 @@ static void vuf_stop(VirtIODevice *vdev)
>  static void vuf_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserFS *fs = VHOST_USER_FS(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&fs->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-user-gpio.c b/hw/virtio/vhost-user-gpio.c
> index 8b40fe450c..677d1c7730 100644
> --- a/hw/virtio/vhost-user-gpio.c
> +++ b/hw/virtio/vhost-user-gpio.c
> @@ -152,7 +152,7 @@ static void vu_gpio_stop(VirtIODevice *vdev)
>  static void vu_gpio_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      trace_virtio_gpio_set_status(status);
>  
> diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c
> index bc58b6c0d1..864eba695e 100644
> --- a/hw/virtio/vhost-user-i2c.c
> +++ b/hw/virtio/vhost-user-i2c.c
> @@ -93,7 +93,7 @@ static void vu_i2c_stop(VirtIODevice *vdev)
>  static void vu_i2c_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&i2c->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c
> index bc1f36c5ac..8b47287875 100644
> --- a/hw/virtio/vhost-user-rng.c
> +++ b/hw/virtio/vhost-user-rng.c
> @@ -90,7 +90,7 @@ static void vu_rng_stop(VirtIODevice *vdev)
>  static void vu_rng_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserRNG *rng = VHOST_USER_RNG(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&rng->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
> index 7b67e29d83..9431b9792c 100644
> --- a/hw/virtio/vhost-user-vsock.c
> +++ b/hw/virtio/vhost-user-vsock.c
> @@ -55,7 +55,7 @@ const VhostDevConfigOps vsock_ops = {
>  static void vuv_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c
> index 7dc3c73931..aa16d584ee 100644
> --- a/hw/virtio/vhost-vsock.c
> +++ b/hw/virtio/vhost-vsock.c
> @@ -70,7 +70,7 @@ static int vhost_vsock_set_running(VirtIODevice *vdev, int start)
>  static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>      int ret;
>  
>      if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
> -- 
> 2.34.1
Michael S. Tsirkin Nov. 8, 2022, 9:33 a.m. UTC | #2
On Tue, Nov 08, 2022 at 09:23:04AM +0000, Alex Bennée wrote:
> The previous fix to virtio_device_started revealed a problem in its
> use by both the core and the device code. The core code should be able
> to handle the device "starting" while the VM isn't running to handle
> the restoration of migration state. To solve this dual use introduce a
> new helper for use by the vhost-user backends who all use it to feed a
> should_start variable.
> 
> We can also pick up a change vhost_user_blk_set_status while we are at
> it which follows the same pattern.
> 
> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>

is this the same as the RFC?

> ---
>  include/hw/virtio/virtio.h   | 18 ++++++++++++++++++
>  hw/block/vhost-user-blk.c    |  6 +-----
>  hw/virtio/vhost-user-fs.c    |  2 +-
>  hw/virtio/vhost-user-gpio.c  |  2 +-
>  hw/virtio/vhost-user-i2c.c   |  2 +-
>  hw/virtio/vhost-user-rng.c   |  2 +-
>  hw/virtio/vhost-user-vsock.c |  2 +-
>  hw/virtio/vhost-vsock.c      |  2 +-
>  8 files changed, 25 insertions(+), 11 deletions(-)
> 
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index f41b4a7e64..3191c618f3 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -389,6 +389,24 @@ static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
>          return vdev->started;
>      }
>  
> +    return status & VIRTIO_CONFIG_S_DRIVER_OK;
> +}
> +
> +/**
> + * virtio_device_should_start() - check if device startable
> + * @vdev - the VirtIO device
> + * @status - the devices status bits
> + *
> + * This is similar to virtio_device_started() but also encapsulates a
> + * check on the VM status which would prevent a device starting
> + * anyway.
> + */
> +static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
> +{
> +    if (vdev->use_started) {
> +        return vdev->started;
> +    }
> +
>      if (!vdev->vm_running) {
>          return false;
>      }
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index 13bf5cc47a..8feaf12e4e 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -222,14 +222,10 @@ static void vhost_user_blk_stop(VirtIODevice *vdev)
>  static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserBlk *s = VHOST_USER_BLK(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>      Error *local_err = NULL;
>      int ret;
>  
> -    if (!vdev->vm_running) {
> -        should_start = false;
> -    }
> -
>      if (!s->connected) {
>          return;
>      }
> diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
> index ad0f91c607..1c40f42045 100644
> --- a/hw/virtio/vhost-user-fs.c
> +++ b/hw/virtio/vhost-user-fs.c
> @@ -123,7 +123,7 @@ static void vuf_stop(VirtIODevice *vdev)
>  static void vuf_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserFS *fs = VHOST_USER_FS(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&fs->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-user-gpio.c b/hw/virtio/vhost-user-gpio.c
> index 8b40fe450c..677d1c7730 100644
> --- a/hw/virtio/vhost-user-gpio.c
> +++ b/hw/virtio/vhost-user-gpio.c
> @@ -152,7 +152,7 @@ static void vu_gpio_stop(VirtIODevice *vdev)
>  static void vu_gpio_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      trace_virtio_gpio_set_status(status);
>  
> diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c
> index bc58b6c0d1..864eba695e 100644
> --- a/hw/virtio/vhost-user-i2c.c
> +++ b/hw/virtio/vhost-user-i2c.c
> @@ -93,7 +93,7 @@ static void vu_i2c_stop(VirtIODevice *vdev)
>  static void vu_i2c_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&i2c->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c
> index bc1f36c5ac..8b47287875 100644
> --- a/hw/virtio/vhost-user-rng.c
> +++ b/hw/virtio/vhost-user-rng.c
> @@ -90,7 +90,7 @@ static void vu_rng_stop(VirtIODevice *vdev)
>  static void vu_rng_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostUserRNG *rng = VHOST_USER_RNG(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&rng->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
> index 7b67e29d83..9431b9792c 100644
> --- a/hw/virtio/vhost-user-vsock.c
> +++ b/hw/virtio/vhost-user-vsock.c
> @@ -55,7 +55,7 @@ const VhostDevConfigOps vsock_ops = {
>  static void vuv_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>  
>      if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
>          return;
> diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c
> index 7dc3c73931..aa16d584ee 100644
> --- a/hw/virtio/vhost-vsock.c
> +++ b/hw/virtio/vhost-vsock.c
> @@ -70,7 +70,7 @@ static int vhost_vsock_set_running(VirtIODevice *vdev, int start)
>  static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
> -    bool should_start = virtio_device_started(vdev, status);
> +    bool should_start = virtio_device_should_start(vdev, status);
>      int ret;
>  
>      if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
> -- 
> 2.34.1
> 
> 
>
Alex Bennée Nov. 8, 2022, 10:23 a.m. UTC | #3
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Tue, Nov 08, 2022 at 09:23:04AM +0000, Alex Bennée wrote:
>> The previous fix to virtio_device_started revealed a problem in its
>> use by both the core and the device code. The core code should be able
>> to handle the device "starting" while the VM isn't running to handle
>> the restoration of migration state. To solve this dual use introduce a
>> new helper for use by the vhost-user backends who all use it to feed a
>> should_start variable.
>> 
>> We can also pick up a change vhost_user_blk_set_status while we are at
>> it which follows the same pattern.
>> 
>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>
> why is this in this patchset?

As per my cover letter:

  Most of these patches have been posted before as single patch RFCs. A
  couple are already scheduled through other trees so will drop out in
  due course

but I keep them in my tree until they are merged so I can continue to
soak test them (and have a stable base for my other WIP trees).
Michael S. Tsirkin Nov. 8, 2022, 10:26 a.m. UTC | #4
On Tue, Nov 08, 2022 at 10:23:15AM +0000, Alex Bennée wrote:
> 
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Tue, Nov 08, 2022 at 09:23:04AM +0000, Alex Bennée wrote:
> >> The previous fix to virtio_device_started revealed a problem in its
> >> use by both the core and the device code. The core code should be able
> >> to handle the device "starting" while the VM isn't running to handle
> >> the restoration of migration state. To solve this dual use introduce a
> >> new helper for use by the vhost-user backends who all use it to feed a
> >> should_start variable.
> >> 
> >> We can also pick up a change vhost_user_blk_set_status while we are at
> >> it which follows the same pattern.
> >> 
> >> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> >> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> >> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> >
> > why is this in this patchset?
> 
> As per my cover letter:
> 
>   Most of these patches have been posted before as single patch RFCs. A
>   couple are already scheduled through other trees so will drop out in
>   due course
> 
> but I keep them in my tree until they are merged so I can continue to
> soak test them (and have a stable base for my other WIP trees).

That's fine just pls don't double-post them on list, certainly
not as part of a patchset.

> -- 
> Alex Bennée
Alex Bennée Nov. 8, 2022, 11:21 a.m. UTC | #5
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Tue, Nov 08, 2022 at 10:23:15AM +0000, Alex Bennée wrote:
>> 
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Tue, Nov 08, 2022 at 09:23:04AM +0000, Alex Bennée wrote:
>> >> The previous fix to virtio_device_started revealed a problem in its
>> >> use by both the core and the device code. The core code should be able
>> >> to handle the device "starting" while the VM isn't running to handle
>> >> the restoration of migration state. To solve this dual use introduce a
>> >> new helper for use by the vhost-user backends who all use it to feed a
>> >> should_start variable.
>> >> 
>> >> We can also pick up a change vhost_user_blk_set_status while we are at
>> >> it which follows the same pattern.
>> >> 
>> >> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>> >> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> >> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> >
>> > why is this in this patchset?
>> 
>> As per my cover letter:
>> 
>>   Most of these patches have been posted before as single patch RFCs. A
>>   couple are already scheduled through other trees so will drop out in
>>   due course
>> 
>> but I keep them in my tree until they are merged so I can continue to
>> soak test them (and have a stable base for my other WIP trees).
>
> That's fine just pls don't double-post them on list, certainly
> not as part of a patchset.

Why not? Is this breaking some tooling?
Michael S. Tsirkin Nov. 8, 2022, 3:24 p.m. UTC | #6
On Tue, Nov 08, 2022 at 11:21:26AM +0000, Alex Bennée wrote:
> 
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Tue, Nov 08, 2022 at 10:23:15AM +0000, Alex Bennée wrote:
> >> 
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> > On Tue, Nov 08, 2022 at 09:23:04AM +0000, Alex Bennée wrote:
> >> >> The previous fix to virtio_device_started revealed a problem in its
> >> >> use by both the core and the device code. The core code should be able
> >> >> to handle the device "starting" while the VM isn't running to handle
> >> >> the restoration of migration state. To solve this dual use introduce a
> >> >> new helper for use by the vhost-user backends who all use it to feed a
> >> >> should_start variable.
> >> >> 
> >> >> We can also pick up a change vhost_user_blk_set_status while we are at
> >> >> it which follows the same pattern.
> >> >> 
> >> >> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> >> >> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> >> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> >> >> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> >> >
> >> > why is this in this patchset?
> >> 
> >> As per my cover letter:
> >> 
> >>   Most of these patches have been posted before as single patch RFCs. A
> >>   couple are already scheduled through other trees so will drop out in
> >>   due course
> >> 
> >> but I keep them in my tree until they are merged so I can continue to
> >> soak test them (and have a stable base for my other WIP trees).
> >
> > That's fine just pls don't double-post them on list, certainly
> > not as part of a patchset.
> 
> Why not? Is this breaking some tooling?

Yes, a patchset breaks git am if you try to apply only part of it.

Reposting creates work for reviewers - why should they have to read the same
patch twice?  In this case it also made me scratch my head trying to
figure out what to do about it.

But, if you are careful and maintain an ordered changelog after "---"
and there it says 
	changes since rfc:
		no changes, subject changed 

then this second part is less of a problem

> -- 
> Alex Bennée
Alex Bennée Nov. 8, 2022, 4:41 p.m. UTC | #7
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Tue, Nov 08, 2022 at 11:21:26AM +0000, Alex Bennée wrote:
>> 
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > On Tue, Nov 08, 2022 at 10:23:15AM +0000, Alex Bennée wrote:
>> >> 
>> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> >> 
>> >> > On Tue, Nov 08, 2022 at 09:23:04AM +0000, Alex Bennée wrote:
>> >> >> The previous fix to virtio_device_started revealed a problem in its
>> >> >> use by both the core and the device code. The core code should be able
>> >> >> to handle the device "starting" while the VM isn't running to handle
>> >> >> the restoration of migration state. To solve this dual use introduce a
>> >> >> new helper for use by the vhost-user backends who all use it to feed a
>> >> >> should_start variable.
>> >> >> 
>> >> >> We can also pick up a change vhost_user_blk_set_status while we are at
>> >> >> it which follows the same pattern.
>> >> >> 
>> >> >> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>> >> >> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>> >> >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> >> >> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> >> >
>> >> > why is this in this patchset?
>> >> 
>> >> As per my cover letter:
>> >> 
>> >>   Most of these patches have been posted before as single patch RFCs. A
>> >>   couple are already scheduled through other trees so will drop out in
>> >>   due course
>> >> 
>> >> but I keep them in my tree until they are merged so I can continue to
>> >> soak test them (and have a stable base for my other WIP trees).
>> >
>> > That's fine just pls don't double-post them on list, certainly
>> > not as part of a patchset.
>> 
>> Why not? Is this breaking some tooling?
>
> Yes patchset breaks git am if you try to apply part of it.
>
> Reposting creates work for reviewers - why should they have to read the same
> patch twice?  In this case it also made me scratch my head trying to
> figure out what to do about it.
>
> But, if you are careful and maintain an ordered changelog after "---"
> and there it says 
> 	changes since rfc:
> 		no changes, subject changed 
>
> then this second part is less of a problem

Ahh yes, I should have updated the changelog to point out that I added
the extra Fixes line as per the review. I guess you added that in your
PR? Anyway, it's dropped now that your PR has gone in.
Christian Borntraeger Nov. 14, 2022, 4:18 p.m. UTC | #8
Am 08.11.22 um 10:23 schrieb Alex Bennée:
> The previous fix to virtio_device_started revealed a problem in its
> use by both the core and the device code. The core code should be able
> to handle the device "starting" while the VM isn't running to handle
> the restoration of migration state. To solve this dual use introduce a
> new helper for use by the vhost-user backends who all use it to feed a
> should_start variable.
> 
> We can also pick up a change vhost_user_blk_set_status while we are at
> it which follows the same pattern.
> 
> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>

Hmmm, is this
commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
Author:     Alex Bennée <alex.bennee@linaro.org>
AuthorDate: Mon Nov 7 12:14:07 2022 +0000
Commit:     Michael S. Tsirkin <mst@redhat.com>
CommitDate: Mon Nov 7 14:08:18 2022 -0500

     hw/virtio: introduce virtio_device_should_start

an older version?

This does not seem to fix the regression that I have reported.
Michael S. Tsirkin Nov. 14, 2022, 4:37 p.m. UTC | #9
On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
> Am 08.11.22 um 10:23 schrieb Alex Bennée:
> > The previous fix to virtio_device_started revealed a problem in its
> > use by both the core and the device code. The core code should be able
> > to handle the device "starting" while the VM isn't running to handle
> > the restoration of migration state. To solve this dual use introduce a
> > new helper for use by the vhost-user backends who all use it to feed a
> > should_start variable.
> > 
> > We can also pick up a change vhost_user_blk_set_status while we are at
> > it which follows the same pattern.
> > 
> > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> > Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> 
> Hmmm, is this
> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> Author:     Alex Bennée <alex.bennee@linaro.org>
> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
> Commit:     Michael S. Tsirkin <mst@redhat.com>
> CommitDate: Mon Nov 7 14:08:18 2022 -0500
> 
>     hw/virtio: introduce virtio_device_should_start
> 
> and older version?

This is what got merged:
https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
This patch was sent after I merged the RFC.
I think the only difference is the commit log but I might be missing
something.

> This does not seem to fix the regression that I have reported.

This was applied on top of 9f6bcfd99f which IIUC does, right?
Alex Bennée Nov. 14, 2022, 4:43 p.m. UTC | #10
Christian Borntraeger <borntraeger@linux.ibm.com> writes:

> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>> The previous fix to virtio_device_started revealed a problem in its
>> use by both the core and the device code. The core code should be able
>> to handle the device "starting" while the VM isn't running to handle
>> the restoration of migration state. To solve this dual use introduce a
>> new helper for use by the vhost-user backends who all use it to feed a
>> should_start variable.
>> We can also pick up a change vhost_user_blk_set_status while we are
>> at
>> it which follows the same pattern.
>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to
>> virtio_device_started)
>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>
> Hmmm, is this
> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> Author:     Alex Bennée <alex.bennee@linaro.org>
> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
> Commit:     Michael S. Tsirkin <mst@redhat.com>
> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>
>     hw/virtio: introduce virtio_device_should_start
>
> and older version?

Only missing the additional Fixes line MST suggested in the review. I
should have made it clearer in the --- comment.

Which test is failing?
Christian Borntraeger Nov. 14, 2022, 4:55 p.m. UTC | #11
Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>> The previous fix to virtio_device_started revealed a problem in its
>>> use by both the core and the device code. The core code should be able
>>> to handle the device "starting" while the VM isn't running to handle
>>> the restoration of migration state. To solve this dual use introduce a
>>> new helper for use by the vhost-user backends who all use it to feed a
>>> should_start variable.
>>>
>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>> it which follows the same pattern.
>>>
>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>
>> Hmmm, is this
>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>> Author:     Alex Bennée <alex.bennee@linaro.org>
>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>
>>      hw/virtio: introduce virtio_device_should_start
>>
>> and older version?
> 
> This is what got merged:
> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
> This patch was sent after I merged the RFC.
> I think the only difference is the commit log but I might be missing
> something.
> 
>> This does not seem to fix the regression that I have reported.
> 
> This was applied on top of 9f6bcfd99f which IIUC does, right?
> 
> 

QEMU master still fails for me for suspend/resume to disk:

#0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
#1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
#2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
#3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
#4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
#5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
#6  0x000002aa1fe5e0ee in vmstate_save_state_v
     (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
#7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
#8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
#9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
     at ../migration/savevm.c:1393
#10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
#11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
#12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
#13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
#14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
#15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
#16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6

Michael, your previous branch did work if I recall correctly.
Michael S. Tsirkin Nov. 14, 2022, 5:10 p.m. UTC | #12
On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
> 
> 
> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
> > On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
> > > Am 08.11.22 um 10:23 schrieb Alex Bennée:
> > > > The previous fix to virtio_device_started revealed a problem in its
> > > > use by both the core and the device code. The core code should be able
> > > > to handle the device "starting" while the VM isn't running to handle
> > > > the restoration of migration state. To solve this dual use introduce a
> > > > new helper for use by the vhost-user backends who all use it to feed a
> > > > should_start variable.
> > > > 
> > > > We can also pick up a change vhost_user_blk_set_status while we are at
> > > > it which follows the same pattern.
> > > > 
> > > > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> > > > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> > > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > 
> > > Hmmm, is this
> > > commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> > > Author:     Alex Bennée <alex.bennee@linaro.org>
> > > AuthorDate: Mon Nov 7 12:14:07 2022 +0000
> > > Commit:     Michael S. Tsirkin <mst@redhat.com>
> > > CommitDate: Mon Nov 7 14:08:18 2022 -0500
> > > 
> > >      hw/virtio: introduce virtio_device_should_start
> > > 
> > > and older version?
> > 
> > This is what got merged:
> > https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
> > This patch was sent after I merged the RFC.
> > I think the only difference is the commit log but I might be missing
> > something.
> > 
> > > This does not seem to fix the regression that I have reported.
> > 
> > This was applied on top of 9f6bcfd99f which IIUC does, right?
> > 
> > 
> 
> QEMU master still fails for me for suspend/resume to disk:
> 
> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
> #5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>     (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
> #7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
> #8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
> #9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
>     at ../migration/savevm.c:1393
> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
> 
> Michael, your previous branch did work if I recall correctly.

That one was failing under github CI though (for reasons we didn't
really address, such as disconnect during stop causing a recursive
call to stop, but there you are).
Christian Borntraeger Nov. 14, 2022, 5:15 p.m. UTC | #13
Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
> On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>>
>>
>> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>>> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>>>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>>>> The previous fix to virtio_device_started revealed a problem in its
>>>>> use by both the core and the device code. The core code should be able
>>>>> to handle the device "starting" while the VM isn't running to handle
>>>>> the restoration of migration state. To solve this dual use introduce a
>>>>> new helper for use by the vhost-user backends who all use it to feed a
>>>>> should_start variable.
>>>>>
>>>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>>>> it which follows the same pattern.
>>>>>
>>>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>
>>>> Hmmm, is this
>>>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>>>> Author:     Alex Bennée <alex.bennee@linaro.org>
>>>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>>>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>>>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>>>
>>>>       hw/virtio: introduce virtio_device_should_start
>>>>
>>>> and older version?
>>>
>>> This is what got merged:
>>> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>>> This patch was sent after I merged the RFC.
>>> I think the only difference is the commit log but I might be missing
>>> something.
>>>
>>>> This does not seem to fix the regression that I have reported.
>>>
>>> This was applied on top of 9f6bcfd99f which IIUC does, right?
>>>
>>>
>>
>> QEMU master still fails for me for suspend/resume to disk:
>>
>> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>> #5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
>> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>>      (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
>> #7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
>> #8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
>> #9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
>>      at ../migration/savevm.c:1393
>> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
>> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>>
>> Michael, your previous branch did work if I recall correctly.
> 
> That one was failing under github CI though (for reasons we didn't
> really address, such as disconnect during stop causing a recursive
> call to stop, but there you are).
Even the double revert of everything?
So how do we proceed now?
Michael S. Tsirkin Nov. 14, 2022, 5:20 p.m. UTC | #14
On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
> 
> 
> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
> > On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
> > > 
> > > 
> > > Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
> > > > On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
> > > > > Am 08.11.22 um 10:23 schrieb Alex Bennée:
> > > > > > The previous fix to virtio_device_started revealed a problem in its
> > > > > > use by both the core and the device code. The core code should be able
> > > > > > to handle the device "starting" while the VM isn't running to handle
> > > > > > the restoration of migration state. To solve this dual use introduce a
> > > > > > new helper for use by the vhost-user backends who all use it to feed a
> > > > > > should_start variable.
> > > > > > 
> > > > > > We can also pick up a change vhost_user_blk_set_status while we are at
> > > > > > it which follows the same pattern.
> > > > > > 
> > > > > > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> > > > > > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> > > > > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> > > > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > 
> > > > > Hmmm, is this
> > > > > commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> > > > > Author:     Alex Bennée <alex.bennee@linaro.org>
> > > > > AuthorDate: Mon Nov 7 12:14:07 2022 +0000
> > > > > Commit:     Michael S. Tsirkin <mst@redhat.com>
> > > > > CommitDate: Mon Nov 7 14:08:18 2022 -0500
> > > > > 
> > > > >       hw/virtio: introduce virtio_device_should_start
> > > > > 
> > > > > and older version?
> > > > 
> > > > This is what got merged:
> > > > https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
> > > > This patch was sent after I merged the RFC.
> > > > I think the only difference is the commit log but I might be missing
> > > > something.
> > > > 
> > > > > This does not seem to fix the regression that I have reported.
> > > > 
> > > > This was applied on top of 9f6bcfd99f which IIUC does, right?
> > > > 
> > > > 
> > > 
> > > QEMU master still fails for me for suspend/resume to disk:
> > > 
> > > #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
> > > #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
> > > #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
> > > #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
> > > #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
> > > #5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
> > > #6  0x000002aa1fe5e0ee in vmstate_save_state_v
> > >      (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
> > > #7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
> > > #8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
> > > #9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
> > >      at ../migration/savevm.c:1393
> > > #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
> > > #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
> > > #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
> > > #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
> > > #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
> > > #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
> > > #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
> > > 
> > > Michael, your previous branch did work if I recall correctly.
> > 
> > That one was failing under github CI though (for reasons we didn't
> > really address, such as disconnect during stop causing a recursive
> > call to stop, but there you are).
> Even the double revert of everything?

I don't remember at this point.

> So how do we proceed now?

I'm hopeful Alex will come up with a fix.
Christian Borntraeger Nov. 15, 2022, 7:44 a.m. UTC | #15
Am 14.11.22 um 18:20 schrieb Michael S. Tsirkin:
[...]

>>>>>
>>>>>> This does not seem to fix the regression that I have reported.
>>>>>
>>>>> This was applied on top of 9f6bcfd99f which IIUC does, right?

Just double checked,

9f6bcfd99f was the patch that created the original problem, no?
Christian Borntraeger Nov. 15, 2022, 8:18 a.m. UTC | #16
Am 14.11.22 um 18:20 schrieb Michael S. Tsirkin:
> On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
>>
>>
>> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
>>> On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>>>>
>>>>
>>>> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>>>>> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>>>>>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>>>>>> The previous fix to virtio_device_started revealed a problem in its
>>>>>>> use by both the core and the device code. The core code should be able
>>>>>>> to handle the device "starting" while the VM isn't running to handle
>>>>>>> the restoration of migration state. To solve this dual use introduce a
>>>>>>> new helper for use by the vhost-user backends who all use it to feed a
>>>>>>> should_start variable.
>>>>>>>
>>>>>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>>>>>> it which follows the same pattern.
>>>>>>>
>>>>>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>>>>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>>>
>>>>>> Hmmm, is this
>>>>>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>>>>>> Author:     Alex Bennée <alex.bennee@linaro.org>
>>>>>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>>>>>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>>>>>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>>>>>
>>>>>>        hw/virtio: introduce virtio_device_should_start
>>>>>>
>>>>>> and older version?
>>>>>
>>>>> This is what got merged:
>>>>> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>>>>> This patch was sent after I merged the RFC.
>>>>> I think the only difference is the commit log but I might be missing
>>>>> something.
>>>>>
>>>>>> This does not seem to fix the regression that I have reported.
>>>>>
>>>>> This was applied on top of 9f6bcfd99f which IIUC does, right?
>>>>>
>>>>>
>>>>
>>>> QEMU master still fails for me for suspend/resume to disk:
>>>>
>>>> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>>>> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>>>> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>>>> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>>>> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>>>> #5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
>>>> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>>>>       (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
>>>> #7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
>>>> #8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
>>>> #9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
>>>>       at ../migration/savevm.c:1393
>>>> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
>>>> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>>>> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>>>> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>>>> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>>>> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>>>> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>>>>
>>>> Michael, your previous branch did work if I recall correctly.
>>>
>>> That one was failing under github CI though (for reasons we didn't
>>> really address, such as disconnect during stop causing a recursive
>>> call to stop, but there you are).
>> Even the double revert of everything?
> 
> I don't remember at this point.
> 
>> So how do we proceed now?
> 
> I'm hopeful Alex will come up with a fix.


The initial fix, adapted to qemu/master, does still work for me:

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index a973811cbfc6..fb3072838119 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -411,14 +411,14 @@ static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
   */
  static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
  {
-    if (vdev->use_started) {
-        return vdev->started;
-    }
-
      if (!vdev->vm_running) {
          return false;
      }
  
+    if (vdev->use_started) {
+        return vdev->started;
+    }
+
      return status & VIRTIO_CONFIG_S_DRIVER_OK;
  }
Michael S. Tsirkin Nov. 15, 2022, 9:05 a.m. UTC | #17
On Tue, Nov 15, 2022 at 09:18:27AM +0100, Christian Borntraeger wrote:
> 
> Am 14.11.22 um 18:20 schrieb Michael S. Tsirkin:
> > On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
> > > 
> > > 
> > > Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
> > > > On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
> > > > > 
> > > > > 
> > > > > Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
> > > > > > On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
> > > > > > > Am 08.11.22 um 10:23 schrieb Alex Bennée:
> > > > > > > > The previous fix to virtio_device_started revealed a problem in its
> > > > > > > > use by both the core and the device code. The core code should be able
> > > > > > > > to handle the device "starting" while the VM isn't running to handle
> > > > > > > > the restoration of migration state. To solve this dual use introduce a
> > > > > > > > new helper for use by the vhost-user backends who all use it to feed a
> > > > > > > > should_start variable.
> > > > > > > > 
> > > > > > > > We can also pick up a change vhost_user_blk_set_status while we are at
> > > > > > > > it which follows the same pattern.
> > > > > > > > 
> > > > > > > > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> > > > > > > > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> > > > > > > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> > > > > > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > > > 
> > > > > > > Hmmm, is this
> > > > > > > commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> > > > > > > Author:     Alex Bennée <alex.bennee@linaro.org>
> > > > > > > AuthorDate: Mon Nov 7 12:14:07 2022 +0000
> > > > > > > Commit:     Michael S. Tsirkin <mst@redhat.com>
> > > > > > > CommitDate: Mon Nov 7 14:08:18 2022 -0500
> > > > > > > 
> > > > > > >        hw/virtio: introduce virtio_device_should_start
> > > > > > > 
> > > > > > > and older version?
> > > > > > 
> > > > > > This is what got merged:
> > > > > > https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
> > > > > > This patch was sent after I merged the RFC.
> > > > > > I think the only difference is the commit log but I might be missing
> > > > > > something.
> > > > > > 
> > > > > > > This does not seem to fix the regression that I have reported.
> > > > > > 
> > > > > > This was applied on top of 9f6bcfd99f which IIUC does, right?
> > > > > > 
> > > > > > 
> > > > > 
> > > > > QEMU master still fails for me for suspend/resume to disk:
> > > > > 
> > > > > #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
> > > > > #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
> > > > > #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
> > > > > #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
> > > > > #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
> > > > > #5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
> > > > > #6  0x000002aa1fe5e0ee in vmstate_save_state_v
> > > > >       (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
> > > > > #7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
> > > > > #8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
> > > > > #9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
> > > > >       at ../migration/savevm.c:1393
> > > > > #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
> > > > > #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
> > > > > #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
> > > > > #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
> > > > > #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
> > > > > #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
> > > > > #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
> > > > > 
> > > > > Michael, your previous branch did work if I recall correctly.
> > > > 
> > > > That one was failing under github CI though (for reasons we didn't
> > > > really address, such as disconnect during stop causing a recursive
> > > > call to stop, but there you are).
> > > Even the double revert of everything?
> > 
> > I don't remember at this point.
> > 
> > > So how do we proceed now?
> > 
> > I'm hopeful Alex will come up with a fix.
> 
> 
> The initial fix changed to qemu/master does still work for me
> 
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index a973811cbfc6..fb3072838119 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -411,14 +411,14 @@ static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
>   */
>  static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
>  {
> -    if (vdev->use_started) {
> -        return vdev->started;
> -    }
> -
>      if (!vdev->vm_running) {
>          return false;
>      }
> +    if (vdev->use_started) {
> +        return vdev->started;
> +    }
> +
>      return status & VIRTIO_CONFIG_S_DRIVER_OK;
>  }

Hmm this makes sense to me. And with the new API the
fallout should be minimal. Let's see how it behaves on github.
It would be nice to fix the recursive stop problem properly
too but I'm not optimistic on that for this release.
Michael S. Tsirkin Nov. 15, 2022, 11:25 a.m. UTC | #18
On Tue, Nov 15, 2022 at 09:18:27AM +0100, Christian Borntraeger wrote:
> 
> Am 14.11.22 um 18:20 schrieb Michael S. Tsirkin:
> > On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
> > > 
> > > 
> > > Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
> > > > On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
> > > > > 
> > > > > 
> > > > > Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
> > > > > > On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
> > > > > > > Am 08.11.22 um 10:23 schrieb Alex Bennée:
> > > > > > > > The previous fix to virtio_device_started revealed a problem in its
> > > > > > > > use by both the core and the device code. The core code should be able
> > > > > > > > to handle the device "starting" while the VM isn't running to handle
> > > > > > > > the restoration of migration state. To solve this dual use introduce a
> > > > > > > > new helper for use by the vhost-user backends who all use it to feed a
> > > > > > > > should_start variable.
> > > > > > > > 
> > > > > > > > We can also pick up a change vhost_user_blk_set_status while we are at
> > > > > > > > it which follows the same pattern.
> > > > > > > > 
> > > > > > > > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> > > > > > > > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> > > > > > > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> > > > > > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > > > 
> > > > > > > Hmmm, is this
> > > > > > > commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> > > > > > > Author:     Alex Bennée <alex.bennee@linaro.org>
> > > > > > > AuthorDate: Mon Nov 7 12:14:07 2022 +0000
> > > > > > > Commit:     Michael S. Tsirkin <mst@redhat.com>
> > > > > > > CommitDate: Mon Nov 7 14:08:18 2022 -0500
> > > > > > > 
> > > > > > >        hw/virtio: introduce virtio_device_should_start
> > > > > > > 
> > > > > > > and older version?
> > > > > > 
> > > > > > This is what got merged:
> > > > > > https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
> > > > > > This patch was sent after I merged the RFC.
> > > > > > I think the only difference is the commit log but I might be missing
> > > > > > something.
> > > > > > 
> > > > > > > This does not seem to fix the regression that I have reported.
> > > > > > 
> > > > > > This was applied on top of 9f6bcfd99f which IIUC does, right?
> > > > > > 
> > > > > > 
> > > > > 
> > > > > QEMU master still fails for me for suspend/resume to disk:
> > > > > 
> > > > > #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
> > > > > #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
> > > > > #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
> > > > > #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
> > > > > #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
> > > > > #5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
> > > > > #6  0x000002aa1fe5e0ee in vmstate_save_state_v
> > > > >       (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
> > > > > #7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
> > > > > #8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
> > > > > #9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
> > > > >       at ../migration/savevm.c:1393
> > > > > #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
> > > > > #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
> > > > > #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
> > > > > #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
> > > > > #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
> > > > > #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
> > > > > #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
> > > > > 
> > > > > Michael, your previous branch did work if I recall correctly.
> > > > 
> > > > That one was failing under github CI though (for reasons we didn't
> > > > really address, such as disconnect during stop causing a recursive
> > > > call to stop, but there you are).
> > > Even the double revert of everything?
> > 
> > I don't remember at this point.
> > 
> > > So how do we proceed now?
> > 
> > I'm hopeful Alex will come up with a fix.
> 
> 
> The initial fix changed to qemu/master does still work for me
> 
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index a973811cbfc6..fb3072838119 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -411,14 +411,14 @@ static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
>   */
>  static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
>  {
> -    if (vdev->use_started) {
> -        return vdev->started;
> -    }
> -
>      if (!vdev->vm_running) {
>          return false;
>      }
> +    if (vdev->use_started) {
> +        return vdev->started;
> +    }
> +
>      return status & VIRTIO_CONFIG_S_DRIVER_OK;
>  }

Triggers failure on gitlab unfortunately:

https://gitlab.com/mstredhat/qemu/-/jobs/3323768122
Christian Borntraeger Nov. 15, 2022, 1:25 p.m. UTC | #19
Am 15.11.22 um 12:25 schrieb Michael S. Tsirkin:
> On Tue, Nov 15, 2022 at 09:18:27AM +0100, Christian Borntraeger wrote:
>>
>> Am 14.11.22 um 18:20 schrieb Michael S. Tsirkin:
>>> On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
>>>>
>>>>
>>>> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
>>>>> On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>>>>>>
>>>>>>
>>>>>> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>>>>>>> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>>>>>>>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>>>>>>>> The previous fix to virtio_device_started revealed a problem in its
>>>>>>>>> use by both the core and the device code. The core code should be able
>>>>>>>>> to handle the device "starting" while the VM isn't running to handle
>>>>>>>>> the restoration of migration state. To solve this dual use introduce a
>>>>>>>>> new helper for use by the vhost-user backends who all use it to feed a
>>>>>>>>> should_start variable.
>>>>>>>>>
>>>>>>>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>>>>>>>> it which follows the same pattern.
>>>>>>>>>
>>>>>>>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>>>>>>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>>>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>>>>>
>>>>>>>> Hmmm, is this
>>>>>>>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>>>>>>>> Author:     Alex Bennée <alex.bennee@linaro.org>
>>>>>>>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>>>>>>>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>>>>>>>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>>>>>>>
>>>>>>>>         hw/virtio: introduce virtio_device_should_start
>>>>>>>>
>>>>>>>> and older version?
>>>>>>>
>>>>>>> This is what got merged:
>>>>>>> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>>>>>>> This patch was sent after I merged the RFC.
>>>>>>> I think the only difference is the commit log but I might be missing
>>>>>>> something.
>>>>>>>
>>>>>>>> This does not seem to fix the regression that I have reported.
>>>>>>>
>>>>>>> This was applied on top of 9f6bcfd99f which IIUC does, right?
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> QEMU master still fails for me for suspend/resume to disk:
>>>>>>
>>>>>> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>>>>>> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>>>>>> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>>>>>> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>>>>>> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>>>>>> #5  0x000002aa1ffa8966 in vhost_vsock_common_pre_save (opaque=<optimized out>) at ../hw/virtio/vhost-vsock-common.c:203
>>>>>> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>>>>>>        (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0 <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8, vmdesc=vmdesc@entry=0x3fddc08eb30, version_id=version_id@entry=0) at ../migration/vmstate.c:329
>>>>>> #7  0x000002aa1fe5ebf8 in vmstate_save_state (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>, opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30) at ../migration/vmstate.c:317
>>>>>> #8  0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170, se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at ../migration/savevm.c:908
>>>>>> #9  0x000002aa1fe79584 in qemu_savevm_state_complete_precopy_non_iterable (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false, inactivate_disks=inactivate_disks@entry=true)
>>>>>>        at ../migration/savevm.c:1393
>>>>>> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false, inactivate_disks=inactivate_disks@entry=true) at ../migration/savevm.c:1459
>>>>>> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>>>>>> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>>>>>> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>>>>>> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>>>>>> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>>>>>> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>>>>>>
>>>>>> Michael, your previous branch did work if I recall correctly.
>>>>>
>>>>> That one was failing under github CI though (for reasons we didn't
>>>>> really address, such as disconnect during stop causing a recursive
>>>>> call to stop, but there you are).
>>>> Even the double revert of everything?
>>>
>>> I don't remember at this point.
>>>
>>>> So how do we proceed now?
>>>
>>> I'm hopeful Alex will come up with a fix.
>>
>>
>> The initial fix changed to qemu/master does still work for me
>>
>> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
>> index a973811cbfc6..fb3072838119 100644
>> --- a/include/hw/virtio/virtio.h
>> +++ b/include/hw/virtio/virtio.h
>> @@ -411,14 +411,14 @@ static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
>>    */
>>   static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
>>   {
>> -    if (vdev->use_started) {
>> -        return vdev->started;
>> -    }
>> -
>>       if (!vdev->vm_running) {
>>           return false;
>>       }
>> +    if (vdev->use_started) {
>> +        return vdev->started;
>> +    }
>> +
>>       return status & VIRTIO_CONFIG_S_DRIVER_OK;
>>   }
> 
> Triggers failure on gitlab unfortunately:
> 
> https://gitlab.com/mstredhat/qemu/-/jobs/3323768122

So maybe we should go ahead and revert the whole thing?
After all, 9f6bcfd99f mostly looks like a cleanup patch and not something that was really necessary.
Alex Bennée Nov. 15, 2022, 2:31 p.m. UTC | #20
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
>> 
>> 
>> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
>> > On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>> > > 
>> > > 
>> > > Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>> > > > On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>> > > > > Am 08.11.22 um 10:23 schrieb Alex Bennée:
>> > > > > > The previous fix to virtio_device_started revealed a problem in its
>> > > > > > use by both the core and the device code. The core code should be able
>> > > > > > to handle the device "starting" while the VM isn't running to handle
>> > > > > > the restoration of migration state. To solve this dual use introduce a
>> > > > > > new helper for use by the vhost-user backends who all use it to feed a
>> > > > > > should_start variable.
>> > > > > > 
>> > > > > > We can also pick up a change vhost_user_blk_set_status while we are at
>> > > > > > it which follows the same pattern.
>> > > > > > 
>> > > > > > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>> > > > > > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>> > > > > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> > > > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> > > > > 
>> > > > > Hmmm, is this
>> > > > > commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>> > > > > Author:     Alex Bennée <alex.bennee@linaro.org>
>> > > > > AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>> > > > > Commit:     Michael S. Tsirkin <mst@redhat.com>
>> > > > > CommitDate: Mon Nov 7 14:08:18 2022 -0500
>> > > > > 
>> > > > >       hw/virtio: introduce virtio_device_should_start
>> > > > > 
>> > > > > and older version?
>> > > > 
>> > > > This is what got merged:
>> > > > https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>> > > > This patch was sent after I merged the RFC.
>> > > > I think the only difference is the commit log but I might be missing
>> > > > something.
>> > > > 
>> > > > > This does not seem to fix the regression that I have reported.
>> > > > 
>> > > > This was applied on top of 9f6bcfd99f which IIUC does, right?
>> > > > 
>> > > > 
>> > > 
>> > > QEMU master still fails for me for suspend/resume to disk:
>> > > 
>> > > #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>> > > #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>> > > #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>> > > #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>> > > #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>> > > #5 0x000002aa1ffa8966 in vhost_vsock_common_pre_save
>> > > (opaque=<optimized out>) at
>> > > ../hw/virtio/vhost-vsock-common.c:203
>> > > #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>> > >      (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0
>> > > <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8,
>> > > vmdesc=vmdesc@entry=0x3fddc08eb30,
>> > > version_id=version_id@entry=0) at ../migration/vmstate.c:329
>> > > #7 0x000002aa1fe5ebf8 in vmstate_save_state
>> > > (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>,
>> > > opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30)
>> > > at ../migration/vmstate.c:317
>> > > #8 0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170,
>> > > se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at
>> > > ../migration/savevm.c:908
>> > > #9 0x000002aa1fe79584 in
>> > > qemu_savevm_state_complete_precopy_non_iterable
>> > > (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false,
>> > > inactivate_disks=inactivate_disks@entry=true)
>> > >      at ../migration/savevm.c:1393
>> > > #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy
>> > > (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false,
>> > > inactivate_disks=inactivate_disks@entry=true) at
>> > > ../migration/savevm.c:1459
>> > > #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>> > > #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>> > > #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>> > > #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>> > > #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>> > > #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>> > > 
>> > > Michael, your previous branch did work if I recall correctly.
>> > 
>> > That one was failing under github CI though (for reasons we didn't
>> > really address, such as disconnect during stop causing a recursive
>> > call to stop, but there you are).
>> Even the double revert of everything?
>
> I don't remember at this point.
>
>> So how do we proceed now?
>
> I'm hopeful Alex will come up with a fix.

I need to replicate the failing test for that. Which test is failing?
Christian Borntraeger Nov. 15, 2022, 3:09 p.m. UTC | #21
Am 15.11.22 um 15:31 schrieb Alex Bennée:
> 
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
>> On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
>>>
>>>
>>> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
>>>> On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>>>>>
>>>>>
>>>>> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>>>>>> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>>>>>>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>>>>>>> The previous fix to virtio_device_started revealed a problem in its
>>>>>>>> use by both the core and the device code. The core code should be able
>>>>>>>> to handle the device "starting" while the VM isn't running to handle
>>>>>>>> the restoration of migration state. To solve this dual use introduce a
>>>>>>>> new helper for use by the vhost-user backends who all use it to feed a
>>>>>>>> should_start variable.
>>>>>>>>
>>>>>>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>>>>>>> it which follows the same pattern.
>>>>>>>>
>>>>>>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>>>>>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>>>>
>>>>>>> Hmmm, is this
>>>>>>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>>>>>>> Author:     Alex Bennée <alex.bennee@linaro.org>
>>>>>>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>>>>>>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>>>>>>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>>>>>>
>>>>>>>        hw/virtio: introduce virtio_device_should_start
>>>>>>>
>>>>>>> and older version?
>>>>>>
>>>>>> This is what got merged:
>>>>>> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>>>>>> This patch was sent after I merged the RFC.
>>>>>> I think the only difference is the commit log but I might be missing
>>>>>> something.
>>>>>>
>>>>>>> This does not seem to fix the regression that I have reported.
>>>>>>
>>>>>> This was applied on top of 9f6bcfd99f which IIUC does, right?
>>>>>>
>>>>>>
>>>>>
>>>>> QEMU master still fails for me for suspend/resume to disk:
>>>>>
>>>>> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>>>>> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>>>>> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>>>>> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>>>>> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>>>>> #5 0x000002aa1ffa8966 in vhost_vsock_common_pre_save
>>>>> (opaque=<optimized out>) at
>>>>> ../hw/virtio/vhost-vsock-common.c:203
>>>>> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>>>>>       (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0
>>>>> <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8,
>>>>> vmdesc=vmdesc@entry=0x3fddc08eb30,
>>>>> version_id=version_id@entry=0) at ../migration/vmstate.c:329
>>>>> #7 0x000002aa1fe5ebf8 in vmstate_save_state
>>>>> (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>,
>>>>> opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30)
>>>>> at ../migration/vmstate.c:317
>>>>> #8 0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170,
>>>>> se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at
>>>>> ../migration/savevm.c:908
>>>>> #9 0x000002aa1fe79584 in
>>>>> qemu_savevm_state_complete_precopy_non_iterable
>>>>> (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false,
>>>>> inactivate_disks=inactivate_disks@entry=true)
>>>>>       at ../migration/savevm.c:1393
>>>>> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy
>>>>> (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false,
>>>>> inactivate_disks=inactivate_disks@entry=true) at
>>>>> ../migration/savevm.c:1459
>>>>> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>>>>> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>>>>> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>>>>> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>>>>> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>>>>> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>>>>>
>>>>> Michael, your previous branch did work if I recall correctly.
>>>>
>>>> That one was failing under github CI though (for reasons we didn't
>>>> really address, such as disconnect during stop causing a recursive
>>>> call to stop, but there you are).
>>> Even the double revert of everything?
>>
>> I don't remember at this point.
>>
>>> So how do we proceed now?
>>
>> I'm hopeful Alex will come up with a fix.
> 
> I need to replicate the failing test for that. Which test is failing?


Pretty much the same as before. guest with vsock, managedsave and restore.
Alex Bennée Nov. 15, 2022, 4:05 p.m. UTC | #22
Christian Borntraeger <borntraeger@linux.ibm.com> writes:

> Am 15.11.22 um 15:31 schrieb Alex Bennée:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>>> On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
>>>>
>>>>
>>>> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
>>>>> On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>>>>>>
>>>>>>
>>>>>> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>>>>>>> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>>>>>>>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>>>>>>>> The previous fix to virtio_device_started revealed a problem in its
>>>>>>>>> use by both the core and the device code. The core code should be able
>>>>>>>>> to handle the device "starting" while the VM isn't running to handle
>>>>>>>>> the restoration of migration state. To solve this dual use introduce a
>>>>>>>>> new helper for use by the vhost-user backends who all use it to feed a
>>>>>>>>> should_start variable.
>>>>>>>>>
>>>>>>>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>>>>>>>> it which follows the same pattern.
>>>>>>>>>
>>>>>>>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>>>>>>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>>>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>>>>>
>>>>>>>> Hmmm, is this
>>>>>>>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>>>>>>>> Author:     Alex Bennée <alex.bennee@linaro.org>
>>>>>>>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>>>>>>>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>>>>>>>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>>>>>>>
>>>>>>>>        hw/virtio: introduce virtio_device_should_start
>>>>>>>>
>>>>>>>> and older version?
>>>>>>>
>>>>>>> This is what got merged:
>>>>>>> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>>>>>>> This patch was sent after I merged the RFC.
>>>>>>> I think the only difference is the commit log but I might be missing
>>>>>>> something.
>>>>>>>
>>>>>>>> This does not seem to fix the regression that I have reported.
>>>>>>>
>>>>>>> This was applied on top of 9f6bcfd99f which IIUC does, right?
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> QEMU master still fails for me for suspend/resume to disk:
>>>>>>
>>>>>> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>>>>>> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>>>>>> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>>>>>> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>>>>>> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>>>>>> #5 0x000002aa1ffa8966 in vhost_vsock_common_pre_save
>>>>>> (opaque=<optimized out>) at
>>>>>> ../hw/virtio/vhost-vsock-common.c:203
>>>>>> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>>>>>>       (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0
>>>>>> <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8,
>>>>>> vmdesc=vmdesc@entry=0x3fddc08eb30,
>>>>>> version_id=version_id@entry=0) at ../migration/vmstate.c:329
>>>>>> #7 0x000002aa1fe5ebf8 in vmstate_save_state
>>>>>> (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>,
>>>>>> opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30)
>>>>>> at ../migration/vmstate.c:317
>>>>>> #8 0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170,
>>>>>> se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at
>>>>>> ../migration/savevm.c:908
>>>>>> #9 0x000002aa1fe79584 in
>>>>>> qemu_savevm_state_complete_precopy_non_iterable
>>>>>> (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false,
>>>>>> inactivate_disks=inactivate_disks@entry=true)
>>>>>>       at ../migration/savevm.c:1393
>>>>>> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy
>>>>>> (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false,
>>>>>> inactivate_disks=inactivate_disks@entry=true) at
>>>>>> ../migration/savevm.c:1459
>>>>>> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>>>>>> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>>>>>> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>>>>>> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>>>>>> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>>>>>> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>>>>>>
>>>>>> Michael, your previous branch did work if I recall correctly.
>>>>>
>>>>> That one was failing under github CI though (for reasons we didn't
>>>>> really address, such as disconnect during stop causing a recursive
>>>>> call to stop, but there you are).
>>>> Even the double revert of everything?
>>>
>>> I don't remember at this point.
>>>
>>>> So how do we proceed now?
>>>
>>> I'm hopeful Alex will come up with a fix.
>> I need to replicate the failing test for that. Which test is
>> failing?
>
>
> Pretty much the same as before. guest with vsock, managedsave and
> restore.

If this isn't in our test suite I'm going to need exact steps.
Christian Borntraeger Nov. 15, 2022, 4:40 p.m. UTC | #23
Am 15.11.22 um 17:05 schrieb Alex Bennée:
> 
> Christian Borntraeger <borntraeger@linux.ibm.com> writes:
> 
>> Am 15.11.22 um 15:31 schrieb Alex Bennée:
>>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>>
>>>> On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
>>>>>
>>>>>
>>>>> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
>>>>>> On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>>>>>>>
>>>>>>>
>>>>>>> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>>>>>>>> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>>>>>>>>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>>>>>>>>> The previous fix to virtio_device_started revealed a problem in its
>>>>>>>>>> use by both the core and the device code. The core code should be able
>>>>>>>>>> to handle the device "starting" while the VM isn't running to handle
>>>>>>>>>> the restoration of migration state. To solve this dual use introduce a
>>>>>>>>>> new helper for use by the vhost-user backends who all use it to feed a
>>>>>>>>>> should_start variable.
>>>>>>>>>>
>>>>>>>>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>>>>>>>>> it which follows the same pattern.
>>>>>>>>>>
>>>>>>>>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>>>>>>>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>>>>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>>>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>>>>>>
>>>>>>>>> Hmmm, is this
>>>>>>>>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>>>>>>>>> Author:     Alex Bennée <alex.bennee@linaro.org>
>>>>>>>>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>>>>>>>>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>>>>>>>>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>>>>>>>>
>>>>>>>>>         hw/virtio: introduce virtio_device_should_start
>>>>>>>>>
>>>>>>>>> and older version?
>>>>>>>>
>>>>>>>> This is what got merged:
>>>>>>>> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>>>>>>>> This patch was sent after I merged the RFC.
>>>>>>>> I think the only difference is the commit log but I might be missing
>>>>>>>> something.
>>>>>>>>
>>>>>>>>> This does not seem to fix the regression that I have reported.
>>>>>>>>
>>>>>>>> This was applied on top of 9f6bcfd99f which IIUC does, right?
>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>> QEMU master still fails for me for suspend/resume to disk:
>>>>>>>
>>>>>>> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>>>>>>> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>>>>>>> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>>>>>>> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>>>>>>> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>>>>>>> #5 0x000002aa1ffa8966 in vhost_vsock_common_pre_save
>>>>>>> (opaque=<optimized out>) at
>>>>>>> ../hw/virtio/vhost-vsock-common.c:203
>>>>>>> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>>>>>>>        (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0
>>>>>>> <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8,
>>>>>>> vmdesc=vmdesc@entry=0x3fddc08eb30,
>>>>>>> version_id=version_id@entry=0) at ../migration/vmstate.c:329
>>>>>>> #7 0x000002aa1fe5ebf8 in vmstate_save_state
>>>>>>> (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>,
>>>>>>> opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30)
>>>>>>> at ../migration/vmstate.c:317
>>>>>>> #8 0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170,
>>>>>>> se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at
>>>>>>> ../migration/savevm.c:908
>>>>>>> #9 0x000002aa1fe79584 in
>>>>>>> qemu_savevm_state_complete_precopy_non_iterable
>>>>>>> (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false,
>>>>>>> inactivate_disks=inactivate_disks@entry=true)
>>>>>>>        at ../migration/savevm.c:1393
>>>>>>> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy
>>>>>>> (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false,
>>>>>>> inactivate_disks=inactivate_disks@entry=true) at
>>>>>>> ../migration/savevm.c:1459
>>>>>>> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>>>>>>> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>>>>>>> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>>>>>>> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>>>>>>> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>>>>>>> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>>>>>>>
>>>>>>> Michael, your previous branch did work if I recall correctly.
>>>>>>
>>>>>> That one was failing under github CI though (for reasons we didn't
>>>>>> really address, such as disconnect during stop causing a recursive
>>>>>> call to stop, but there you are).
>>>>> Even the double revert of everything?
>>>>
>>>> I don't remember at this point.
>>>>
>>>>> So how do we proceed now?
>>>>
>>>> I'm hopeful Alex will come up with a fix.
>>> I need to replicate the failing test for that. Which test is
>>> failing?
>>
>>
>> Pretty much the same as before. guest with vsock, managedsave and
>> restore.
> 
> If this isn't in our test suite I'm going to need exact steps.

Just get any libvirt guest, add
     <vsock model='virtio'>
       <cid auto='yes'/>
     </vsock>

to your libvirt XML. Start the guest (with the new XML).
Run virsh managedsave - QEMU crashes, on both x86 and s390.
Christian Borntraeger Nov. 15, 2022, 4:46 p.m. UTC | #24
Am 15.11.22 um 17:40 schrieb Christian Borntraeger:
> 
> 
> Am 15.11.22 um 17:05 schrieb Alex Bennée:
>>
>> Christian Borntraeger <borntraeger@linux.ibm.com> writes:
>>
>>> Am 15.11.22 um 15:31 schrieb Alex Bennée:
>>>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>>>
>>>>> On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
>>>>>>
>>>>>>
>>>>>> Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
>>>>>>> On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
>>>>>>>>> On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
>>>>>>>>>> Am 08.11.22 um 10:23 schrieb Alex Bennée:
>>>>>>>>>>> The previous fix to virtio_device_started revealed a problem in its
>>>>>>>>>>> use by both the core and the device code. The core code should be able
>>>>>>>>>>> to handle the device "starting" while the VM isn't running to handle
>>>>>>>>>>> the restoration of migration state. To solve this dual use introduce a
>>>>>>>>>>> new helper for use by the vhost-user backends who all use it to feed a
>>>>>>>>>>> should_start variable.
>>>>>>>>>>>
>>>>>>>>>>> We can also pick up a change vhost_user_blk_set_status while we are at
>>>>>>>>>>> it which follows the same pattern.
>>>>>>>>>>>
>>>>>>>>>>> Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
>>>>>>>>>>> Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
>>>>>>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>>>>>>>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>>>>>>>
>>>>>>>>>> Hmmm, is this
>>>>>>>>>> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
>>>>>>>>>> Author:     Alex Bennée <alex.bennee@linaro.org>
>>>>>>>>>> AuthorDate: Mon Nov 7 12:14:07 2022 +0000
>>>>>>>>>> Commit:     Michael S. Tsirkin <mst@redhat.com>
>>>>>>>>>> CommitDate: Mon Nov 7 14:08:18 2022 -0500
>>>>>>>>>>
>>>>>>>>>>         hw/virtio: introduce virtio_device_should_start
>>>>>>>>>>
>>>>>>>>>> and older version?
>>>>>>>>>
>>>>>>>>> This is what got merged:
>>>>>>>>> https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
>>>>>>>>> This patch was sent after I merged the RFC.
>>>>>>>>> I think the only difference is the commit log but I might be missing
>>>>>>>>> something.
>>>>>>>>>
>>>>>>>>>> This does not seem to fix the regression that I have reported.
>>>>>>>>>
>>>>>>>>> This was applied on top of 9f6bcfd99f which IIUC does, right?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>> QEMU master still fails for me for suspend/resume to disk:
>>>>>>>>
>>>>>>>> #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
>>>>>>>> #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
>>>>>>>> #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
>>>>>>>> #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
>>>>>>>> #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
>>>>>>>> #5 0x000002aa1ffa8966 in vhost_vsock_common_pre_save
>>>>>>>> (opaque=<optimized out>) at
>>>>>>>> ../hw/virtio/vhost-vsock-common.c:203
>>>>>>>> #6  0x000002aa1fe5e0ee in vmstate_save_state_v
>>>>>>>>        (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0
>>>>>>>> <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8,
>>>>>>>> vmdesc=vmdesc@entry=0x3fddc08eb30,
>>>>>>>> version_id=version_id@entry=0) at ../migration/vmstate.c:329
>>>>>>>> #7 0x000002aa1fe5ebf8 in vmstate_save_state
>>>>>>>> (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>,
>>>>>>>> opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30)
>>>>>>>> at ../migration/vmstate.c:317
>>>>>>>> #8 0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170,
>>>>>>>> se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at
>>>>>>>> ../migration/savevm.c:908
>>>>>>>> #9 0x000002aa1fe79584 in
>>>>>>>> qemu_savevm_state_complete_precopy_non_iterable
>>>>>>>> (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false,
>>>>>>>> inactivate_disks=inactivate_disks@entry=true)
>>>>>>>>        at ../migration/savevm.c:1393
>>>>>>>> #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy
>>>>>>>> (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false,
>>>>>>>> inactivate_disks=inactivate_disks@entry=true) at
>>>>>>>> ../migration/savevm.c:1459
>>>>>>>> #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
>>>>>>>> #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
>>>>>>>> #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
>>>>>>>> #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
>>>>>>>> #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
>>>>>>>> #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
>>>>>>>>
>>>>>>>> Michael, your previous branch did work if I recall correctly.
>>>>>>>
>>>>>>> That one was failing under github CI though (for reasons we didn't
>>>>>>> really address, such as disconnect during stop causing a recursive
>>>>>>> call to stop, but there you are).
>>>>>> Even the double revert of everything?
>>>>>
>>>>> I don't remember at this point.
>>>>>
>>>>>> So how do we proceed now?
>>>>>
>>>>> I'm hopeful Alex will come up with a fix.
>>>> I need to replicate the failing test for that. Which test is
>>>> failing?
>>>
>>>
>>> Pretty much the same as before. guest with vsock, managedsave and
>>> restore.
>>
>> If this isn't in our test suite I'm going to need exact steps.
> 
> Just get any libvirt guest, add
>      <vsock model='virtio'>
>        <cid auto='yes'/>
>      </vsock>
> 
> to your libvirt xml. Start the guest (with the new xml).
> Run virsh managedsave - qemu crashes. On x86 and s390.


the libvirt log:

/home/cborntra/REPOS/qemu/build/x86_64-softmmu/qemu-system-x86_64 \
-name guest=f36,debug-threads=on \
-S \
-object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-1-f36/master-key.aes"}' \
-machine pc-i440fx-7.2,usb=off,dump-guest-core=off,memory-backend=pc.ram \
-accel kvm \
-cpu Cooperlake,ss=on,pdcm=on,hypervisor=on,tsc-adjust=on,avx512ifma=on,sha-ni=on,avx512vbmi=on,umip=on,avx512vbmi2=on,gfni=on,vaes=on,vpclmulqdq=on,avx512bitalg=on,avx512-vpopcntdq=on,rdpid=on,movdiri=on,movdir64b=on,fsrm=on,md-clear=on,xsaves=on,ibpb=on,ibrs=on,amd-stibp=on,amd-ssbd=on,hle=off,rtm=off,avx512-bf16=off,taa-no=off \
-m 2048 \
-object '{"qom-type":"memory-backend-ram","id":"pc.ram","size":2147483648}' \
-overcommit mem-lock=off \
-smp 2,sockets=2,cores=1,threads=1 \
-uuid 712590b2-fbd8-4a2f-a8e9-be33cb9ee0da \
-display none \
-no-user-config \
-nodefaults \
-chardev socket,id=charmonitor,fd=39,server=on,wait=off \
-mon chardev=charmonitor,id=monitor,mode=control \
-rtc base=utc,driftfix=slew \
-global kvm-pit.lost_tick_policy=delay \
-no-hpet \
-no-shutdown \
-global PIIX4_PM.disable_s3=1 \
-global PIIX4_PM.disable_s4=1 \
-boot strict=on \
-device ich9-usb-ehci1,id=usb,bus=pci.0,addr=0x3.0x7 \
-device ich9-usb-uhci1,masterbus=usb.0,firstport=0,bus=pci.0,multifunction=on,addr=0x3 \
-device ich9-usb-uhci2,masterbus=usb.0,firstport=2,bus=pci.0,addr=0x3.0x1 \
-device ich9-usb-uhci3,masterbus=usb.0,firstport=4,bus=pci.0,addr=0x3.0x2 \
-blockdev '{"driver":"file","filename":"/var/lib/libvirt/images/f36.qcow2","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"}' \
-blockdev '{"node-name":"libvirt-1-format","read-only":false,"driver":"qcow2","file":"libvirt-1-storage","backing":null}' \
-device ide-hd,bus=ide.0,unit=0,drive=libvirt-1-format,id=ide0-0-0,bootindex=1 \
-netdev user,id=hostnet0 \
-device e1000,netdev=hostnet0,id=net0,mac=52:54:00:20:ba:4a,bus=pci.0,addr=0x2 \
-chardev pty,id=charserial0 \
-device isa-serial,chardev=charserial0,id=serial0 \
-audiodev '{"id":"audio1","driver":"none"}' \
-device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 \
-sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \
-device vhost-vsock-pci,id=vsock0,guest-cid=3,vhostfd=35,bus=pci.0,addr=0x5 \
-msg timestamp=on
char device redirected to /dev/pts/1 (label charserial0)
qemu-system-x86_64: ../hw/virtio/vhost-vsock-common.c:203: vhost_vsock_common_pre_save: Assertion `!vhost_dev_is_started(&vvc->vhost_dev)' failed.
2022-11-15 16:38:46.096+0000: shutting down, reason=crashed
Michael S. Tsirkin Nov. 21, 2022, 10:37 p.m. UTC | #25
On Tue, Nov 15, 2022 at 05:46:58PM +0100, Christian Borntraeger wrote:
> 
> 
> Am 15.11.22 um 17:40 schrieb Christian Borntraeger:
> > 
> > 
> > Am 15.11.22 um 17:05 schrieb Alex Bennée:
> > > 
> > > Christian Borntraeger <borntraeger@linux.ibm.com> writes:
> > > 
> > > > Am 15.11.22 um 15:31 schrieb Alex Bennée:
> > > > > "Michael S. Tsirkin" <mst@redhat.com> writes:
> > > > > 
> > > > > > On Mon, Nov 14, 2022 at 06:15:30PM +0100, Christian Borntraeger wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > Am 14.11.22 um 18:10 schrieb Michael S. Tsirkin:
> > > > > > > > On Mon, Nov 14, 2022 at 05:55:09PM +0100, Christian Borntraeger wrote:
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Am 14.11.22 um 17:37 schrieb Michael S. Tsirkin:
> > > > > > > > > > On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
> > > > > > > > > > > Am 08.11.22 um 10:23 schrieb Alex Bennée:
> > > > > > > > > > > > The previous fix to virtio_device_started revealed a problem in its
> > > > > > > > > > > > use by both the core and the device code. The core code should be able
> > > > > > > > > > > > to handle the device "starting" while the VM isn't running to handle
> > > > > > > > > > > > the restoration of migration state. To solve this dual use introduce a
> > > > > > > > > > > > new helper for use by the vhost-user backends who all use it to feed a
> > > > > > > > > > > > should_start variable.
> > > > > > > > > > > > 
> > > > > > > > > > > > > We can also pick up a change to vhost_user_blk_set_status while we are
> > > > > > > > > > > > > at it, which follows the same pattern.
> > > > > > > > > > > > 
> > > > > > > > > > > > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
> > > > > > > > > > > > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> > > > > > > > > > > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> > > > > > > > > > > > Cc: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > > > > > > > 
> > > > > > > > > > > Hmmm, is this
> > > > > > > > > > > commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> > > > > > > > > > > Author:     Alex Bennée <alex.bennee@linaro.org>
> > > > > > > > > > > AuthorDate: Mon Nov 7 12:14:07 2022 +0000
> > > > > > > > > > > Commit:     Michael S. Tsirkin <mst@redhat.com>
> > > > > > > > > > > CommitDate: Mon Nov 7 14:08:18 2022 -0500
> > > > > > > > > > > 
> > > > > > > > > > >         hw/virtio: introduce virtio_device_should_start
> > > > > > > > > > > 
> > > > > > > > > > > > an older version?
> > > > > > > > > > 
> > > > > > > > > > This is what got merged:
> > > > > > > > > > https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
> > > > > > > > > > This patch was sent after I merged the RFC.
> > > > > > > > > > I think the only difference is the commit log but I might be missing
> > > > > > > > > > something.
> > > > > > > > > > 
> > > > > > > > > > > This does not seem to fix the regression that I have reported.
> > > > > > > > > > 
> > > > > > > > > > This was applied on top of 9f6bcfd99f which IIUC does, right?
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > QEMU master still fails for me for suspend/resume to disk:
> > > > > > > > > 
> > > > > > > > > #0  0x000003ff8e3980a6 in __pthread_kill_implementation () at /lib64/libc.so.6
> > > > > > > > > #1  0x000003ff8e348580 in raise () at /lib64/libc.so.6
> > > > > > > > > #2  0x000003ff8e32b5c0 in abort () at /lib64/libc.so.6
> > > > > > > > > #3  0x000003ff8e3409da in __assert_fail_base () at /lib64/libc.so.6
> > > > > > > > > #4  0x000003ff8e340a4e in  () at /lib64/libc.so.6
> > > > > > > > > #5 0x000002aa1ffa8966 in vhost_vsock_common_pre_save
> > > > > > > > > (opaque=<optimized out>) at
> > > > > > > > > ../hw/virtio/vhost-vsock-common.c:203
> > > > > > > > > #6  0x000002aa1fe5e0ee in vmstate_save_state_v
> > > > > > > > >        (f=f@entry=0x2aa21bdc170, vmsd=0x2aa204ac5f0
> > > > > > > > > <vmstate_virtio_vhost_vsock>, opaque=0x2aa21bac9f8,
> > > > > > > > > vmdesc=vmdesc@entry=0x3fddc08eb30,
> > > > > > > > > version_id=version_id@entry=0) at ../migration/vmstate.c:329
> > > > > > > > > #7 0x000002aa1fe5ebf8 in vmstate_save_state
> > > > > > > > > (f=f@entry=0x2aa21bdc170, vmsd=<optimized out>,
> > > > > > > > > opaque=<optimized out>, vmdesc_id=vmdesc_id@entry=0x3fddc08eb30)
> > > > > > > > > at ../migration/vmstate.c:317
> > > > > > > > > #8 0x000002aa1fe75bd0 in vmstate_save (f=f@entry=0x2aa21bdc170,
> > > > > > > > > se=se@entry=0x2aa21bdbe90, vmdesc=vmdesc@entry=0x3fddc08eb30) at
> > > > > > > > > ../migration/savevm.c:908
> > > > > > > > > #9 0x000002aa1fe79584 in
> > > > > > > > > qemu_savevm_state_complete_precopy_non_iterable
> > > > > > > > > (f=f@entry=0x2aa21bdc170, in_postcopy=in_postcopy@entry=false,
> > > > > > > > > inactivate_disks=inactivate_disks@entry=true)
> > > > > > > > >        at ../migration/savevm.c:1393
> > > > > > > > > #10 0x000002aa1fe79a96 in qemu_savevm_state_complete_precopy
> > > > > > > > > (f=0x2aa21bdc170, iterable_only=iterable_only@entry=false,
> > > > > > > > > inactivate_disks=inactivate_disks@entry=true) at
> > > > > > > > > ../migration/savevm.c:1459
> > > > > > > > > #11 0x000002aa1fe6d6ee in migration_completion (s=0x2aa218ef600) at ../migration/migration.c:3314
> > > > > > > > > #12 migration_iteration_run (s=0x2aa218ef600) at ../migration/migration.c:3761
> > > > > > > > > #13 migration_thread (opaque=opaque@entry=0x2aa218ef600) at ../migration/migration.c:3989
> > > > > > > > > #14 0x000002aa201f0b8c in qemu_thread_start (args=<optimized out>) at ../util/qemu-thread-posix.c:505
> > > > > > > > > #15 0x000003ff8e396248 in start_thread () at /lib64/libc.so.6
> > > > > > > > > #16 0x000003ff8e41183e in thread_start () at /lib64/libc.so.6
> > > > > > > > > 
> > > > > > > > > Michael, your previous branch did work if I recall correctly.
> > > > > > > > 
> > > > > > > > That one was failing under github CI though (for reasons we didn't
> > > > > > > > really address, such as disconnect during stop causing a recursive
> > > > > > > > call to stop, but there you are).
> > > > > > > Even the double revert of everything?
> > > > > > 
> > > > > > I don't remember at this point.
> > > > > > 
> > > > > > > So how do we proceed now?
> > > > > > 
> > > > > > I'm hopeful Alex will come up with a fix.
> > > > > I need to replicate the failing test for that. Which test is
> > > > > failing?
> > > > 
> > > > 
> > > > Pretty much the same as before. guest with vsock, managedsave and
> > > > restore.
> > > 
> > > If this isn't in our test suite I'm going to need exact steps.
> > 
> > Just get any libvirt guest, add
> >      <vsock model='virtio'>
> >        <cid auto='yes'/>
> >      </vsock>
> > 
> > to your libvirt xml. Start the guest (with the new xml).
> > Run virsh managedsave - qemu crashes. On x86 and s390.
> 
> 
> the libvirt log:
> 
> /home/cborntra/REPOS/qemu/build/x86_64-softmmu/qemu-system-x86_64 \
> -name guest=f36,debug-threads=on \
> -S \
> -object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-1-f36/master-key.aes"}' \
> -machine pc-i440fx-7.2,usb=off,dump-guest-core=off,memory-backend=pc.ram \
> -accel kvm \
> -cpu Cooperlake,ss=on,pdcm=on,hypervisor=on,tsc-adjust=on,avx512ifma=on,sha-ni=on,avx512vbmi=on,umip=on,avx512vbmi2=on,gfni=on,vaes=on,vpclmulqdq=on,avx512bitalg=on,avx512-vpopcntdq=on,rdpid=on,movdiri=on,movdir64b=on,fsrm=on,md-clear=on,xsaves=on,ibpb=on,ibrs=on,amd-stibp=on,amd-ssbd=on,hle=off,rtm=off,avx512-bf16=off,taa-no=off \
> -m 2048 \
> -object '{"qom-type":"memory-backend-ram","id":"pc.ram","size":2147483648}' \
> -overcommit mem-lock=off \
> -smp 2,sockets=2,cores=1,threads=1 \
> -uuid 712590b2-fbd8-4a2f-a8e9-be33cb9ee0da \
> -display none \
> -no-user-config \
> -nodefaults \
> -chardev socket,id=charmonitor,fd=39,server=on,wait=off \
> -mon chardev=charmonitor,id=monitor,mode=control \
> -rtc base=utc,driftfix=slew \
> -global kvm-pit.lost_tick_policy=delay \
> -no-hpet \
> -no-shutdown \
> -global PIIX4_PM.disable_s3=1 \
> -global PIIX4_PM.disable_s4=1 \
> -boot strict=on \
> -device ich9-usb-ehci1,id=usb,bus=pci.0,addr=0x3.0x7 \
> -device ich9-usb-uhci1,masterbus=usb.0,firstport=0,bus=pci.0,multifunction=on,addr=0x3 \
> -device ich9-usb-uhci2,masterbus=usb.0,firstport=2,bus=pci.0,addr=0x3.0x1 \
> -device ich9-usb-uhci3,masterbus=usb.0,firstport=4,bus=pci.0,addr=0x3.0x2 \
> -blockdev '{"driver":"file","filename":"/var/lib/libvirt/images/f36.qcow2","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"}' \
> -blockdev '{"node-name":"libvirt-1-format","read-only":false,"driver":"qcow2","file":"libvirt-1-storage","backing":null}' \
> -device ide-hd,bus=ide.0,unit=0,drive=libvirt-1-format,id=ide0-0-0,bootindex=1 \
> -netdev user,id=hostnet0 \
> -device e1000,netdev=hostnet0,id=net0,mac=52:54:00:20:ba:4a,bus=pci.0,addr=0x2 \
> -chardev pty,id=charserial0 \
> -device isa-serial,chardev=charserial0,id=serial0 \
> -audiodev '{"id":"audio1","driver":"none"}' \
> -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 \
> -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \
> -device vhost-vsock-pci,id=vsock0,guest-cid=3,vhostfd=35,bus=pci.0,addr=0x5 \
> -msg timestamp=on
> char device redirected to /dev/pts/1 (label charserial0)
> qemu-system-x86_64: ../hw/virtio/vhost-vsock-common.c:203: vhost_vsock_common_pre_save: Assertion `!vhost_dev_is_started(&vvc->vhost_dev)' failed.
> 2022-11-15 16:38:46.096+0000: shutting down, reason=crashed

Alex, were you able to replicate? Just curious.
Christian Borntraeger Nov. 23, 2022, 6:27 a.m. UTC | #26
Am 21.11.22 um 23:37 schrieb Michael S. Tsirkin:
[...]
>> qemu-system-x86_64: ../hw/virtio/vhost-vsock-common.c:203: vhost_vsock_common_pre_save: Assertion `!vhost_dev_is_started(&vvc->vhost_dev)' failed.
>> 2022-11-15 16:38:46.096+0000: shutting down, reason=crashed
> 
> Alex were you able to replicate? Just curious.

Ping?
diff mbox series

Patch

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index f41b4a7e64..3191c618f3 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -389,6 +389,24 @@  static inline bool virtio_device_started(VirtIODevice *vdev, uint8_t status)
         return vdev->started;
     }
 
+    return status & VIRTIO_CONFIG_S_DRIVER_OK;
+}
+
+/**
+ * virtio_device_should_start() - check if device startable
+ * @vdev - the VirtIO device
+ * @status - the device's status bits
+ *
+ * This is similar to virtio_device_started() but also encapsulates a
+ * check on the VM status which would prevent a device starting
+ * anyway.
+ */
+static inline bool virtio_device_should_start(VirtIODevice *vdev, uint8_t status)
+{
+    if (vdev->use_started) {
+        return vdev->started;
+    }
+
     if (!vdev->vm_running) {
         return false;
     }
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 13bf5cc47a..8feaf12e4e 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -222,14 +222,10 @@  static void vhost_user_blk_stop(VirtIODevice *vdev)
 static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
-    bool should_start = virtio_device_started(vdev, status);
+    bool should_start = virtio_device_should_start(vdev, status);
     Error *local_err = NULL;
     int ret;
 
-    if (!vdev->vm_running) {
-        should_start = false;
-    }
-
     if (!s->connected) {
         return;
     }
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index ad0f91c607..1c40f42045 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -123,7 +123,7 @@  static void vuf_stop(VirtIODevice *vdev)
 static void vuf_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostUserFS *fs = VHOST_USER_FS(vdev);
-    bool should_start = virtio_device_started(vdev, status);
+    bool should_start = virtio_device_should_start(vdev, status);
 
     if (vhost_dev_is_started(&fs->vhost_dev) == should_start) {
         return;
diff --git a/hw/virtio/vhost-user-gpio.c b/hw/virtio/vhost-user-gpio.c
index 8b40fe450c..677d1c7730 100644
--- a/hw/virtio/vhost-user-gpio.c
+++ b/hw/virtio/vhost-user-gpio.c
@@ -152,7 +152,7 @@  static void vu_gpio_stop(VirtIODevice *vdev)
 static void vu_gpio_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
-    bool should_start = virtio_device_started(vdev, status);
+    bool should_start = virtio_device_should_start(vdev, status);
 
     trace_virtio_gpio_set_status(status);
 
diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c
index bc58b6c0d1..864eba695e 100644
--- a/hw/virtio/vhost-user-i2c.c
+++ b/hw/virtio/vhost-user-i2c.c
@@ -93,7 +93,7 @@  static void vu_i2c_stop(VirtIODevice *vdev)
 static void vu_i2c_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
-    bool should_start = virtio_device_started(vdev, status);
+    bool should_start = virtio_device_should_start(vdev, status);
 
     if (vhost_dev_is_started(&i2c->vhost_dev) == should_start) {
         return;
diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c
index bc1f36c5ac..8b47287875 100644
--- a/hw/virtio/vhost-user-rng.c
+++ b/hw/virtio/vhost-user-rng.c
@@ -90,7 +90,7 @@  static void vu_rng_stop(VirtIODevice *vdev)
 static void vu_rng_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostUserRNG *rng = VHOST_USER_RNG(vdev);
-    bool should_start = virtio_device_started(vdev, status);
+    bool should_start = virtio_device_should_start(vdev, status);
 
     if (vhost_dev_is_started(&rng->vhost_dev) == should_start) {
         return;
diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
index 7b67e29d83..9431b9792c 100644
--- a/hw/virtio/vhost-user-vsock.c
+++ b/hw/virtio/vhost-user-vsock.c
@@ -55,7 +55,7 @@  const VhostDevConfigOps vsock_ops = {
 static void vuv_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
-    bool should_start = virtio_device_started(vdev, status);
+    bool should_start = virtio_device_should_start(vdev, status);
 
     if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
         return;
diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c
index 7dc3c73931..aa16d584ee 100644
--- a/hw/virtio/vhost-vsock.c
+++ b/hw/virtio/vhost-vsock.c
@@ -70,7 +70,7 @@  static int vhost_vsock_set_running(VirtIODevice *vdev, int start)
 static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
-    bool should_start = virtio_device_started(vdev, status);
+    bool should_start = virtio_device_should_start(vdev, status);
     int ret;
 
     if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {