diff mbox series

[v1] vdpa/mlx5: Restore the hardware used index after change map

Message ID 20210204073618.36336-1-elic@nvidia.com
State New
Headers show
Series [v1] vdpa/mlx5: Restore the hardware used index after change map | expand

Commit Message

Eli Cohen Feb. 4, 2021, 7:36 a.m. UTC
When a change of memory map occurs, the hardware resources are destroyed
and then re-created again with the new memory map. In such case, we need
to restore the hardware available and used indices. The driver failed to
restore the used index which is added here.

Also, since the driver also fails to reset the available and used
indices upon device reset, fix this here to avoid regression caused by
the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen <elic@nvidia.com>
---
v0 -> v1:
Clear indices upon device reset

 drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

Comments

Jason Wang Feb. 5, 2021, 3:57 a.m. UTC | #1
On 2021/2/4 下午3:36, Eli Cohen wrote:
> When a change of memory map occurs, the hardware resources are destroyed

> and then re-created again with the new memory map. In such case, we need

> to restore the hardware available and used indices. The driver failed to

> restore the used index which is added here.

>

> Also, since the driver also fails to reset the available and used

> indices upon device reset, fix this here to avoid regression caused by

> the fact that used index may not be zero upon device reset.

>

> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")

> Signed-off-by: Eli Cohen <elic@nvidia.com>

> ---

> v0 -> v1:

> Clear indices upon device reset



Acked-by: Jason Wang <jasowang@redhat.com>



>

>   drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>   1 file changed, 18 insertions(+)

>

> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> index 88dde3455bfd..b5fe6d2ad22f 100644

> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>   	u64 device_addr;

>   	u64 driver_addr;

>   	u16 avail_index;

> +	u16 used_index;

>   	bool ready;

>   	struct vdpa_callback cb;

>   	bool restore;

> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>   	u32 virtq_id;

>   	struct mlx5_vdpa_net *ndev;

>   	u16 avail_idx;

> +	u16 used_idx;

>   	int fw_state;

>   

>   	/* keep last in the struct */

> @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque

>   

>   	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);

>   	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);

> +	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);

>   	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,

>   		 get_features_12_3(ndev->mvdev.actual_features));

>   	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);

> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m

>   struct mlx5_virtq_attr {

>   	u8 state;

>   	u16 available_index;

> +	u16 used_index;

>   };

>   

>   static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,

> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>   	memset(attr, 0, sizeof(*attr));

>   	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>   	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);

> +	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);

>   	kfree(out);

>   	return 0;

>   

> @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)

>   	}

>   }

>   

> +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

> +{

> +	int i;

> +

> +	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

> +		ndev->vqs[i].avail_idx = 0;

> +		ndev->vqs[i].used_idx = 0;

> +	}

> +}

> +

>   /* TODO: cross-endian support */

>   static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)

>   {

> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>   		return err;

>   

>   	ri->avail_index = attr.available_index;

> +	ri->used_index = attr.used_index;

>   	ri->ready = mvq->ready;

>   	ri->num_ent = mvq->num_ent;

>   	ri->desc_addr = mvq->desc_addr;

> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)

>   			continue;

>   

>   		mvq->avail_idx = ri->avail_index;

> +		mvq->used_idx = ri->used_index;

>   		mvq->ready = ri->ready;

>   		mvq->num_ent = ri->num_ent;

>   		mvq->desc_addr = ri->desc_addr;

> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)

>   	if (!status) {

>   		mlx5_vdpa_info(mvdev, "performing device reset\n");

>   		teardown_driver(ndev);

> +		clear_virtqueues(ndev);

>   		mlx5_vdpa_destroy_mr(&ndev->mvdev);

>   		ndev->mvdev.status = 0;

>   		ndev->mvdev.mlx_features = 0;
Si-Wei Liu Feb. 5, 2021, 11:07 p.m. UTC | #2
On 2/3/2021 11:36 PM, Eli Cohen wrote:
> When a change of memory map occurs, the hardware resources are destroyed

> and then re-created again with the new memory map. In such case, we need

> to restore the hardware available and used indices. The driver failed to

> restore the used index which is added here.

>

> Also, since the driver also fails to reset the available and used

> indices upon device reset, fix this here to avoid regression caused by

> the fact that used index may not be zero upon device reset.

>

> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")

> Signed-off-by: Eli Cohen <elic@nvidia.com>

> ---

> v0 -> v1:

> Clear indices upon device reset

>

>   drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>   1 file changed, 18 insertions(+)

>

> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> index 88dde3455bfd..b5fe6d2ad22f 100644

> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>   	u64 device_addr;

>   	u64 driver_addr;

>   	u16 avail_index;

> +	u16 used_index;

>   	bool ready;

>   	struct vdpa_callback cb;

>   	bool restore;

> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>   	u32 virtq_id;

>   	struct mlx5_vdpa_net *ndev;

>   	u16 avail_idx;

> +	u16 used_idx;

>   	int fw_state;

>   

>   	/* keep last in the struct */

> @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque

>   

>   	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);

>   	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);

> +	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);

>   	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,

>   		 get_features_12_3(ndev->mvdev.actual_features));

>   	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);

> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m

>   struct mlx5_virtq_attr {

>   	u8 state;

>   	u16 available_index;

> +	u16 used_index;

>   };

>   

>   static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,

> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>   	memset(attr, 0, sizeof(*attr));

>   	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>   	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);

> +	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);

>   	kfree(out);

>   	return 0;

>   

> @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)

>   	}

>   }

>   

> +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

> +{

> +	int i;

> +

> +	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

> +		ndev->vqs[i].avail_idx = 0;

> +		ndev->vqs[i].used_idx = 0;

> +	}

> +}

> +

>   /* TODO: cross-endian support */

>   static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)

>   {

> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>   		return err;

>   

>   	ri->avail_index = attr.available_index;

> +	ri->used_index = attr.used_index;

>   	ri->ready = mvq->ready;

>   	ri->num_ent = mvq->num_ent;

>   	ri->desc_addr = mvq->desc_addr;

> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)

>   			continue;

>   

>   		mvq->avail_idx = ri->avail_index;

> +		mvq->used_idx = ri->used_index;

>   		mvq->ready = ri->ready;

>   		mvq->num_ent = ri->num_ent;

>   		mvq->desc_addr = ri->desc_addr;

> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)

>   	if (!status) {

>   		mlx5_vdpa_info(mvdev, "performing device reset\n");

>   		teardown_driver(ndev);

> +		clear_virtqueues(ndev);

The clearing looks fine at the first glance, as it aligns with the other 
state cleanups floating around at the same place. However, the thing is 
get_vq_state() is supposed to be called right after to get sync'ed with 
the latest internal avail_index from device while vq is stopped. The 
index was saved in the driver software at vq suspension, but before the 
virtq object is destroyed. We shouldn't clear the avail_index too early.

Possibly it can be postponed to where VIRTIO_CONFIG_S_DRIVER_OK gets set 
again, i.e. right before the setup_driver() in mlx5_vdpa_set_status()?

-Siwei

>   		mlx5_vdpa_destroy_mr(&ndev->mvdev);

>   		ndev->mvdev.status = 0;

>   		ndev->mvdev.mlx_features = 0;
Jason Wang Feb. 8, 2021, 4:27 a.m. UTC | #3
On 2021/2/6 上午7:07, Si-Wei Liu wrote:
>

>

> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>> When a change of memory map occurs, the hardware resources are destroyed

>> and then re-created again with the new memory map. In such case, we need

>> to restore the hardware available and used indices. The driver failed to

>> restore the used index which is added here.

>>

>> Also, since the driver also fails to reset the available and used

>> indices upon device reset, fix this here to avoid regression caused by

>> the fact that used index may not be zero upon device reset.

>>

>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 

>> devices")

>> Signed-off-by: Eli Cohen <elic@nvidia.com>

>> ---

>> v0 -> v1:

>> Clear indices upon device reset

>>

>>   drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>   1 file changed, 18 insertions(+)

>>

>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 

>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>> index 88dde3455bfd..b5fe6d2ad22f 100644

>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>       u64 device_addr;

>>       u64 driver_addr;

>>       u16 avail_index;

>> +    u16 used_index;

>>       bool ready;

>>       struct vdpa_callback cb;

>>       bool restore;

>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>       u32 virtq_id;

>>       struct mlx5_vdpa_net *ndev;

>>       u16 avail_idx;

>> +    u16 used_idx;

>>       int fw_state;

>>         /* keep last in the struct */

>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net 

>> *ndev, struct mlx5_vdpa_virtque

>>         obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, 

>> obj_context);

>>       MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, 

>> mvq->avail_idx);

>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, 

>> mvq->used_idx);

>>       MLX5_SET(virtio_net_q_object, obj_context, 

>> queue_feature_bit_mask_12_3,

>>            get_features_12_3(ndev->mvdev.actual_features));

>>       vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 

>> virtio_q_context);

>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net 

>> *ndev, struct mlx5_vdpa_virtqueue *m

>>   struct mlx5_virtq_attr {

>>       u8 state;

>>       u16 available_index;

>> +    u16 used_index;

>>   };

>>     static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct 

>> mlx5_vdpa_virtqueue *mvq,

>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net 

>> *ndev, struct mlx5_vdpa_virtqueu

>>       memset(attr, 0, sizeof(*attr));

>>       attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>       attr->available_index = MLX5_GET(virtio_net_q_object, 

>> obj_context, hw_available_index);

>> +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, 

>> hw_used_index);

>>       kfree(out);

>>       return 0;

>>   @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct 

>> mlx5_vdpa_net *ndev)

>>       }

>>   }

>>   +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>> +{

>> +    int i;

>> +

>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>> +        ndev->vqs[i].avail_idx = 0;

>> +        ndev->vqs[i].used_idx = 0;

>> +    }

>> +}

>> +

>>   /* TODO: cross-endian support */

>>   static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev 

>> *mvdev)

>>   {

>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct 

>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>           return err;

>>         ri->avail_index = attr.available_index;

>> +    ri->used_index = attr.used_index;

>>       ri->ready = mvq->ready;

>>       ri->num_ent = mvq->num_ent;

>>       ri->desc_addr = mvq->desc_addr;

>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct 

>> mlx5_vdpa_net *ndev)

>>               continue;

>>             mvq->avail_idx = ri->avail_index;

>> +        mvq->used_idx = ri->used_index;

>>           mvq->ready = ri->ready;

>>           mvq->num_ent = ri->num_ent;

>>           mvq->desc_addr = ri->desc_addr;

>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct 

>> vdpa_device *vdev, u8 status)

>>       if (!status) {

>>           mlx5_vdpa_info(mvdev, "performing device reset\n");

>>           teardown_driver(ndev);

>> +        clear_virtqueues(ndev);

> The clearing looks fine at the first glance, as it aligns with the 

> other state cleanups floating around at the same place. However, the 

> thing is get_vq_state() is supposed to be called right after to get 

> sync'ed with the latest internal avail_index from device while vq is 

> stopped. The index was saved in the driver software at vq suspension, 

> but before the virtq object is destroyed. We shouldn't clear the 

> avail_index too early.



Good point.

There's a limitation on the virtio spec and vDPA framework that we can 
not simply differ device suspending from device reset.

Need to think about that. I suggest a new state in [1], the issue is 
that people doesn't like the asynchronous API that it introduces.


>

> Possibly it can be postponed to where VIRTIO_CONFIG_S_DRIVER_OK gets 

> set again, i.e. right before the setup_driver() in 

> mlx5_vdpa_set_status()?



Looks like a good workaround.

Thanks


>

> -Siwei



[1] 
https://lists.oasis-open.org/archives/virtio-comment/202012/msg00029.html


>

>> mlx5_vdpa_destroy_mr(&ndev->mvdev);

>>           ndev->mvdev.status = 0;

>>           ndev->mvdev.mlx_features = 0;

>
Eli Cohen Feb. 8, 2021, 6:37 a.m. UTC | #4
On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:
> 

> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

> > 

> > 

> > On 2/3/2021 11:36 PM, Eli Cohen wrote:

> > > When a change of memory map occurs, the hardware resources are destroyed

> > > and then re-created again with the new memory map. In such case, we need

> > > to restore the hardware available and used indices. The driver failed to

> > > restore the used index which is added here.

> > > 

> > > Also, since the driver also fails to reset the available and used

> > > indices upon device reset, fix this here to avoid regression caused by

> > > the fact that used index may not be zero upon device reset.

> > > 

> > > Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5

> > > devices")

> > > Signed-off-by: Eli Cohen <elic@nvidia.com>

> > > ---

> > > v0 -> v1:

> > > Clear indices upon device reset

> > > 

> > >   drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

> > >   1 file changed, 18 insertions(+)

> > > 

> > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > index 88dde3455bfd..b5fe6d2ad22f 100644

> > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

> > >       u64 device_addr;

> > >       u64 driver_addr;

> > >       u16 avail_index;

> > > +    u16 used_index;

> > >       bool ready;

> > >       struct vdpa_callback cb;

> > >       bool restore;

> > > @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

> > >       u32 virtq_id;

> > >       struct mlx5_vdpa_net *ndev;

> > >       u16 avail_idx;

> > > +    u16 used_idx;

> > >       int fw_state;

> > >         /* keep last in the struct */

> > > @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net

> > > *ndev, struct mlx5_vdpa_virtque

> > >         obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

> > > obj_context);

> > >       MLX5_SET(virtio_net_q_object, obj_context, hw_available_index,

> > > mvq->avail_idx);

> > > +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

> > > mvq->used_idx);

> > >       MLX5_SET(virtio_net_q_object, obj_context,

> > > queue_feature_bit_mask_12_3,

> > >            get_features_12_3(ndev->mvdev.actual_features));

> > >       vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

> > > virtio_q_context);

> > > @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

> > > *ndev, struct mlx5_vdpa_virtqueue *m

> > >   struct mlx5_virtq_attr {

> > >       u8 state;

> > >       u16 available_index;

> > > +    u16 used_index;

> > >   };

> > >     static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct

> > > mlx5_vdpa_virtqueue *mvq,

> > > @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

> > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

> > >       memset(attr, 0, sizeof(*attr));

> > >       attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

> > >       attr->available_index = MLX5_GET(virtio_net_q_object,

> > > obj_context, hw_available_index);

> > > +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context,

> > > hw_used_index);

> > >       kfree(out);

> > >       return 0;

> > >   @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct

> > > mlx5_vdpa_net *ndev)

> > >       }

> > >   }

> > >   +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

> > > +{

> > > +    int i;

> > > +

> > > +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

> > > +        ndev->vqs[i].avail_idx = 0;

> > > +        ndev->vqs[i].used_idx = 0;

> > > +    }

> > > +}

> > > +

> > >   /* TODO: cross-endian support */

> > >   static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

> > > *mvdev)

> > >   {

> > > @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

> > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

> > >           return err;

> > >         ri->avail_index = attr.available_index;

> > > +    ri->used_index = attr.used_index;

> > >       ri->ready = mvq->ready;

> > >       ri->num_ent = mvq->num_ent;

> > >       ri->desc_addr = mvq->desc_addr;

> > > @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

> > > mlx5_vdpa_net *ndev)

> > >               continue;

> > >             mvq->avail_idx = ri->avail_index;

> > > +        mvq->used_idx = ri->used_index;

> > >           mvq->ready = ri->ready;

> > >           mvq->num_ent = ri->num_ent;

> > >           mvq->desc_addr = ri->desc_addr;

> > > @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

> > > vdpa_device *vdev, u8 status)

> > >       if (!status) {

> > >           mlx5_vdpa_info(mvdev, "performing device reset\n");

> > >           teardown_driver(ndev);

> > > +        clear_virtqueues(ndev);

> > The clearing looks fine at the first glance, as it aligns with the other

> > state cleanups floating around at the same place. However, the thing is

> > get_vq_state() is supposed to be called right after to get sync'ed with

> > the latest internal avail_index from device while vq is stopped. The

> > index was saved in the driver software at vq suspension, but before the

> > virtq object is destroyed. We shouldn't clear the avail_index too early.

> 

> 

> Good point.

> 

> There's a limitation on the virtio spec and vDPA framework that we can not

> simply differ device suspending from device reset.

> 


Are you talking about live migration where you reset the device but
still want to know how far it progressed in order to continue from the
same place in the new VM?

> Need to think about that. I suggest a new state in [1], the issue is that

> people doesn't like the asynchronous API that it introduces.

> 

> 

> > 

> > Possibly it can be postponed to where VIRTIO_CONFIG_S_DRIVER_OK gets set

> > again, i.e. right before the setup_driver() in mlx5_vdpa_set_status()?

> 

> 

> Looks like a good workaround.

> 

> Thanks

> 

> 

> > 

> > -Siwei

> 

> 

> [1]

> https://lists.oasis-open.org/archives/virtio-comment/202012/msg00029.html

> 

> 

> > 

> > > mlx5_vdpa_destroy_mr(&ndev->mvdev);

> > >           ndev->mvdev.status = 0;

> > >           ndev->mvdev.mlx_features = 0;

> > 

>
Jason Wang Feb. 8, 2021, 9:04 a.m. UTC | #5
On 2021/2/8 下午2:37, Eli Cohen wrote:
> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>

>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>> When a change of memory map occurs, the hardware resources are destroyed

>>>> and then re-created again with the new memory map. In such case, we need

>>>> to restore the hardware available and used indices. The driver failed to

>>>> restore the used index which is added here.

>>>>

>>>> Also, since the driver also fails to reset the available and used

>>>> indices upon device reset, fix this here to avoid regression caused by

>>>> the fact that used index may not be zero upon device reset.

>>>>

>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5

>>>> devices")

>>>> Signed-off-by: Eli Cohen <elic@nvidia.com>

>>>> ---

>>>> v0 -> v1:

>>>> Clear indices upon device reset

>>>>

>>>>    drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>    1 file changed, 18 insertions(+)

>>>>

>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>        u64 device_addr;

>>>>        u64 driver_addr;

>>>>        u16 avail_index;

>>>> +    u16 used_index;

>>>>        bool ready;

>>>>        struct vdpa_callback cb;

>>>>        bool restore;

>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>        u32 virtq_id;

>>>>        struct mlx5_vdpa_net *ndev;

>>>>        u16 avail_idx;

>>>> +    u16 used_idx;

>>>>        int fw_state;

>>>>          /* keep last in the struct */

>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net

>>>> *ndev, struct mlx5_vdpa_virtque

>>>>          obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>> obj_context);

>>>>        MLX5_SET(virtio_net_q_object, obj_context, hw_available_index,

>>>> mvq->avail_idx);

>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>> mvq->used_idx);

>>>>        MLX5_SET(virtio_net_q_object, obj_context,

>>>> queue_feature_bit_mask_12_3,

>>>>             get_features_12_3(ndev->mvdev.actual_features));

>>>>        vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

>>>> virtio_q_context);

>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>    struct mlx5_virtq_attr {

>>>>        u8 state;

>>>>        u16 available_index;

>>>> +    u16 used_index;

>>>>    };

>>>>      static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct

>>>> mlx5_vdpa_virtqueue *mvq,

>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>        memset(attr, 0, sizeof(*attr));

>>>>        attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>>        attr->available_index = MLX5_GET(virtio_net_q_object,

>>>> obj_context, hw_available_index);

>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context,

>>>> hw_used_index);

>>>>        kfree(out);

>>>>        return 0;

>>>>    @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct

>>>> mlx5_vdpa_net *ndev)

>>>>        }

>>>>    }

>>>>    +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>> +{

>>>> +    int i;

>>>> +

>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>> +        ndev->vqs[i].avail_idx = 0;

>>>> +        ndev->vqs[i].used_idx = 0;

>>>> +    }

>>>> +}

>>>> +

>>>>    /* TODO: cross-endian support */

>>>>    static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

>>>> *mvdev)

>>>>    {

>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>            return err;

>>>>          ri->avail_index = attr.available_index;

>>>> +    ri->used_index = attr.used_index;

>>>>        ri->ready = mvq->ready;

>>>>        ri->num_ent = mvq->num_ent;

>>>>        ri->desc_addr = mvq->desc_addr;

>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>> mlx5_vdpa_net *ndev)

>>>>                continue;

>>>>              mvq->avail_idx = ri->avail_index;

>>>> +        mvq->used_idx = ri->used_index;

>>>>            mvq->ready = ri->ready;

>>>>            mvq->num_ent = ri->num_ent;

>>>>            mvq->desc_addr = ri->desc_addr;

>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>> vdpa_device *vdev, u8 status)

>>>>        if (!status) {

>>>>            mlx5_vdpa_info(mvdev, "performing device reset\n");

>>>>            teardown_driver(ndev);

>>>> +        clear_virtqueues(ndev);

>>> The clearing looks fine at the first glance, as it aligns with the other

>>> state cleanups floating around at the same place. However, the thing is

>>> get_vq_state() is supposed to be called right after to get sync'ed with

>>> the latest internal avail_index from device while vq is stopped. The

>>> index was saved in the driver software at vq suspension, but before the

>>> virtq object is destroyed. We shouldn't clear the avail_index too early.

>>

>> Good point.

>>

>> There's a limitation on the virtio spec and vDPA framework that we can not

>> simply differ device suspending from device reset.

>>

> Are you talking about live migration where you reset the device but

> still want to know how far it progressed in order to continue from the

> same place in the new VM?



Yes. So if we want to support live migration at we need:

in src node:
1) suspend the device
2) get last_avail_idx via get_vq_state()

in the dst node:
3) set last_avail_idx via set_vq_state()
4) resume the device

So you can see, step 2 requires the device/driver not to forget the 
last_avail_idx.

The annoying thing is that, in the virtio spec there's no definition of 
device suspending. So we reuse set_status(0) right now for vq 
suspending. Then if we forget last_avail_idx in set_status(0), it will 
break the assumption of step 2).


>

>> Need to think about that. I suggest a new state in [1], the issue is that

>> people doesn't like the asynchronous API that it introduces.

>>

>>

>>> Possibly it can be postponed to where VIRTIO_CONFIG_S_DRIVER_OK gets set

>>> again, i.e. right before the setup_driver() in mlx5_vdpa_set_status()?

>>

>> Looks like a good workaround.



Rethink of this, this won't work for the step 4), if we reuse the 
S_DRING_OK for resuming.

The most clean way is to invent the feature in virtio spec and implement 
that in the driver.

Thanks



>>

>> Thanks

>>

>>

>>> -Siwei

>>

>> [1]

>> https://lists.oasis-open.org/archives/virtio-comment/202012/msg00029.html

>>

>>

>>>> mlx5_vdpa_destroy_mr(&ndev->mvdev);

>>>>            ndev->mvdev.status = 0;

>>>>            ndev->mvdev.mlx_features = 0;
Michael S. Tsirkin Feb. 8, 2021, 9:26 a.m. UTC | #6
On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:
> 

> On 2021/2/8 下午2:37, Eli Cohen wrote:

> > On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

> > > On 2021/2/6 上午7:07, Si-Wei Liu wrote:

> > > > 

> > > > On 2/3/2021 11:36 PM, Eli Cohen wrote:

> > > > > When a change of memory map occurs, the hardware resources are destroyed

> > > > > and then re-created again with the new memory map. In such case, we need

> > > > > to restore the hardware available and used indices. The driver failed to

> > > > > restore the used index which is added here.

> > > > > 

> > > > > Also, since the driver also fails to reset the available and used

> > > > > indices upon device reset, fix this here to avoid regression caused by

> > > > > the fact that used index may not be zero upon device reset.

> > > > > 

> > > > > Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5

> > > > > devices")

> > > > > Signed-off-by: Eli Cohen <elic@nvidia.com>

> > > > > ---

> > > > > v0 -> v1:

> > > > > Clear indices upon device reset

> > > > > 

> > > > >    drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

> > > > >    1 file changed, 18 insertions(+)

> > > > > 

> > > > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > index 88dde3455bfd..b5fe6d2ad22f 100644

> > > > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

> > > > >        u64 device_addr;

> > > > >        u64 driver_addr;

> > > > >        u16 avail_index;

> > > > > +    u16 used_index;

> > > > >        bool ready;

> > > > >        struct vdpa_callback cb;

> > > > >        bool restore;

> > > > > @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

> > > > >        u32 virtq_id;

> > > > >        struct mlx5_vdpa_net *ndev;

> > > > >        u16 avail_idx;

> > > > > +    u16 used_idx;

> > > > >        int fw_state;

> > > > >          /* keep last in the struct */

> > > > > @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net

> > > > > *ndev, struct mlx5_vdpa_virtque

> > > > >          obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

> > > > > obj_context);

> > > > >        MLX5_SET(virtio_net_q_object, obj_context, hw_available_index,

> > > > > mvq->avail_idx);

> > > > > +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

> > > > > mvq->used_idx);

> > > > >        MLX5_SET(virtio_net_q_object, obj_context,

> > > > > queue_feature_bit_mask_12_3,

> > > > >             get_features_12_3(ndev->mvdev.actual_features));

> > > > >        vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

> > > > > virtio_q_context);

> > > > > @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

> > > > > *ndev, struct mlx5_vdpa_virtqueue *m

> > > > >    struct mlx5_virtq_attr {

> > > > >        u8 state;

> > > > >        u16 available_index;

> > > > > +    u16 used_index;

> > > > >    };

> > > > >      static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct

> > > > > mlx5_vdpa_virtqueue *mvq,

> > > > > @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

> > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

> > > > >        memset(attr, 0, sizeof(*attr));

> > > > >        attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

> > > > >        attr->available_index = MLX5_GET(virtio_net_q_object,

> > > > > obj_context, hw_available_index);

> > > > > +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context,

> > > > > hw_used_index);

> > > > >        kfree(out);

> > > > >        return 0;

> > > > >    @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct

> > > > > mlx5_vdpa_net *ndev)

> > > > >        }

> > > > >    }

> > > > >    +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

> > > > > +{

> > > > > +    int i;

> > > > > +

> > > > > +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

> > > > > +        ndev->vqs[i].avail_idx = 0;

> > > > > +        ndev->vqs[i].used_idx = 0;

> > > > > +    }

> > > > > +}

> > > > > +

> > > > >    /* TODO: cross-endian support */

> > > > >    static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

> > > > > *mvdev)

> > > > >    {

> > > > > @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

> > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

> > > > >            return err;

> > > > >          ri->avail_index = attr.available_index;

> > > > > +    ri->used_index = attr.used_index;

> > > > >        ri->ready = mvq->ready;

> > > > >        ri->num_ent = mvq->num_ent;

> > > > >        ri->desc_addr = mvq->desc_addr;

> > > > > @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

> > > > > mlx5_vdpa_net *ndev)

> > > > >                continue;

> > > > >              mvq->avail_idx = ri->avail_index;

> > > > > +        mvq->used_idx = ri->used_index;

> > > > >            mvq->ready = ri->ready;

> > > > >            mvq->num_ent = ri->num_ent;

> > > > >            mvq->desc_addr = ri->desc_addr;

> > > > > @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

> > > > > vdpa_device *vdev, u8 status)

> > > > >        if (!status) {

> > > > >            mlx5_vdpa_info(mvdev, "performing device reset\n");

> > > > >            teardown_driver(ndev);

> > > > > +        clear_virtqueues(ndev);

> > > > The clearing looks fine at the first glance, as it aligns with the other

> > > > state cleanups floating around at the same place. However, the thing is

> > > > get_vq_state() is supposed to be called right after to get sync'ed with

> > > > the latest internal avail_index from device while vq is stopped. The

> > > > index was saved in the driver software at vq suspension, but before the

> > > > virtq object is destroyed. We shouldn't clear the avail_index too early.

> > > 

> > > Good point.

> > > 

> > > There's a limitation on the virtio spec and vDPA framework that we can not

> > > simply differ device suspending from device reset.

> > > 

> > Are you talking about live migration where you reset the device but

> > still want to know how far it progressed in order to continue from the

> > same place in the new VM?

> 

> 

> Yes. So if we want to support live migration at we need:

> 

> in src node:

> 1) suspend the device

> 2) get last_avail_idx via get_vq_state()

> 

> in the dst node:

> 3) set last_avail_idx via set_vq_state()

> 4) resume the device

> 

> So you can see, step 2 requires the device/driver not to forget the

> last_avail_idx.

> 

> The annoying thing is that, in the virtio spec there's no definition of

> device suspending. So we reuse set_status(0) right now for vq suspending.

> Then if we forget last_avail_idx in set_status(0), it will break the

> assumption of step 2).

> 

> 

> > 

> > > Need to think about that. I suggest a new state in [1], the issue is that

> > > people doesn't like the asynchronous API that it introduces.

> > > 

> > > 

> > > > Possibly it can be postponed to where VIRTIO_CONFIG_S_DRIVER_OK gets set

> > > > again, i.e. right before the setup_driver() in mlx5_vdpa_set_status()?

> > > 

> > > Looks like a good workaround.

> 

> 

> Rethink of this, this won't work for the step 4), if we reuse the S_DRING_OK

> for resuming.

> 

> The most clean way is to invent the feature in virtio spec and implement

> that in the driver.

> 

> Thanks


Given it's between parts of device (between qemu and host kernel)
I don't think we need it in the spec even. Just add a new ioctl.


> 

> 

> > > 

> > > Thanks

> > > 

> > > 

> > > > -Siwei

> > > 

> > > [1]

> > > https://lists.oasis-open.org/archives/virtio-comment/202012/msg00029.html

> > > 

> > > 

> > > > > mlx5_vdpa_destroy_mr(&ndev->mvdev);

> > > > >            ndev->mvdev.status = 0;

> > > > >            ndev->mvdev.mlx_features = 0;
Eli Cohen Feb. 8, 2021, 10:04 a.m. UTC | #7
On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:
> 

> On 2021/2/8 下午2:37, Eli Cohen wrote:

> > On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

> > > On 2021/2/6 上午7:07, Si-Wei Liu wrote:

> > > > 

> > > > On 2/3/2021 11:36 PM, Eli Cohen wrote:

> > > > > When a change of memory map occurs, the hardware resources are destroyed

> > > > > and then re-created again with the new memory map. In such case, we need

> > > > > to restore the hardware available and used indices. The driver failed to

> > > > > restore the used index which is added here.

> > > > > 

> > > > > Also, since the driver also fails to reset the available and used

> > > > > indices upon device reset, fix this here to avoid regression caused by

> > > > > the fact that used index may not be zero upon device reset.

> > > > > 

> > > > > Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5

> > > > > devices")

> > > > > Signed-off-by: Eli Cohen <elic@nvidia.com>

> > > > > ---

> > > > > v0 -> v1:

> > > > > Clear indices upon device reset

> > > > > 

> > > > >    drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

> > > > >    1 file changed, 18 insertions(+)

> > > > > 

> > > > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > index 88dde3455bfd..b5fe6d2ad22f 100644

> > > > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

> > > > >        u64 device_addr;

> > > > >        u64 driver_addr;

> > > > >        u16 avail_index;

> > > > > +    u16 used_index;

> > > > >        bool ready;

> > > > >        struct vdpa_callback cb;

> > > > >        bool restore;

> > > > > @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

> > > > >        u32 virtq_id;

> > > > >        struct mlx5_vdpa_net *ndev;

> > > > >        u16 avail_idx;

> > > > > +    u16 used_idx;

> > > > >        int fw_state;

> > > > >          /* keep last in the struct */

> > > > > @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net

> > > > > *ndev, struct mlx5_vdpa_virtque

> > > > >          obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

> > > > > obj_context);

> > > > >        MLX5_SET(virtio_net_q_object, obj_context, hw_available_index,

> > > > > mvq->avail_idx);

> > > > > +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

> > > > > mvq->used_idx);

> > > > >        MLX5_SET(virtio_net_q_object, obj_context,

> > > > > queue_feature_bit_mask_12_3,

> > > > >             get_features_12_3(ndev->mvdev.actual_features));

> > > > >        vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

> > > > > virtio_q_context);

> > > > > @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

> > > > > *ndev, struct mlx5_vdpa_virtqueue *m

> > > > >    struct mlx5_virtq_attr {

> > > > >        u8 state;

> > > > >        u16 available_index;

> > > > > +    u16 used_index;

> > > > >    };

> > > > >      static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct

> > > > > mlx5_vdpa_virtqueue *mvq,

> > > > > @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

> > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

> > > > >        memset(attr, 0, sizeof(*attr));

> > > > >        attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

> > > > >        attr->available_index = MLX5_GET(virtio_net_q_object,

> > > > > obj_context, hw_available_index);

> > > > > +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context,

> > > > > hw_used_index);

> > > > >        kfree(out);

> > > > >        return 0;

> > > > >    @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct

> > > > > mlx5_vdpa_net *ndev)

> > > > >        }

> > > > >    }

> > > > >    +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

> > > > > +{

> > > > > +    int i;

> > > > > +

> > > > > +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

> > > > > +        ndev->vqs[i].avail_idx = 0;

> > > > > +        ndev->vqs[i].used_idx = 0;

> > > > > +    }

> > > > > +}

> > > > > +

> > > > >    /* TODO: cross-endian support */

> > > > >    static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

> > > > > *mvdev)

> > > > >    {

> > > > > @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

> > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

> > > > >            return err;

> > > > >          ri->avail_index = attr.available_index;

> > > > > +    ri->used_index = attr.used_index;

> > > > >        ri->ready = mvq->ready;

> > > > >        ri->num_ent = mvq->num_ent;

> > > > >        ri->desc_addr = mvq->desc_addr;

> > > > > @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

> > > > > mlx5_vdpa_net *ndev)

> > > > >                continue;

> > > > >              mvq->avail_idx = ri->avail_index;

> > > > > +        mvq->used_idx = ri->used_index;

> > > > >            mvq->ready = ri->ready;

> > > > >            mvq->num_ent = ri->num_ent;

> > > > >            mvq->desc_addr = ri->desc_addr;

> > > > > @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

> > > > > vdpa_device *vdev, u8 status)

> > > > >        if (!status) {

> > > > >            mlx5_vdpa_info(mvdev, "performing device reset\n");

> > > > >            teardown_driver(ndev);

> > > > > +        clear_virtqueues(ndev);

> > > > The clearing looks fine at the first glance, as it aligns with the other

> > > > state cleanups floating around at the same place. However, the thing is

> > > > get_vq_state() is supposed to be called right after to get sync'ed with

> > > > the latest internal avail_index from device while vq is stopped. The

> > > > index was saved in the driver software at vq suspension, but before the

> > > > virtq object is destroyed. We shouldn't clear the avail_index too early.

> > > 

> > > Good point.

> > > 

> > > There's a limitation on the virtio spec and vDPA framework that we can not

> > > simply differ device suspending from device reset.

> > > 

> > Are you talking about live migration where you reset the device but

> > still want to know how far it progressed in order to continue from the

> > same place in the new VM?

> 

> 

> Yes. So if we want to support live migration at we need:

> 

> in src node:

> 1) suspend the device

> 2) get last_avail_idx via get_vq_state()

> 

> in the dst node:

> 3) set last_avail_idx via set_vq_state()

> 4) resume the device

> 

> So you can see, step 2 requires the device/driver not to forget the

> last_avail_idx.

> 


Just to be sure, what really matters here is the used index. Becuase the
vriqtueue itself is copied from the src VM to the dest VM. The available
index is alreay there and we know the hardware reads it from there.

So it puzzles me why is set_vq_state() we do not communicate the saved
used index.

> The annoying thing is that, in the virtio spec there's no definition of

> device suspending. So we reuse set_status(0) right now for vq suspending.

> Then if we forget last_avail_idx in set_status(0), it will break the

> assumption of step 2).

> 

> 

> > 

> > > Need to think about that. I suggest a new state in [1], the issue is that

> > > people doesn't like the asynchronous API that it introduces.

> > > 

> > > 

> > > > Possibly it can be postponed to where VIRTIO_CONFIG_S_DRIVER_OK gets set

> > > > again, i.e. right before the setup_driver() in mlx5_vdpa_set_status()?

> > > 

> > > Looks like a good workaround.

> 

> 

> Rethink of this, this won't work for the step 4), if we reuse the S_DRING_OK

> for resuming.

> 

> The most clean way is to invent the feature in virtio spec and implement

> that in the driver.

> 

> Thanks

> 

> 

> 

> > > 

> > > Thanks

> > > 

> > > 

> > > > -Siwei

> > > 

> > > [1]

> > > https://lists.oasis-open.org/archives/virtio-comment/202012/msg00029.html

> > > 

> > > 

> > > > > mlx5_vdpa_destroy_mr(&ndev->mvdev);

> > > > >            ndev->mvdev.status = 0;

> > > > >            ndev->mvdev.mlx_features = 0;

>
Jason Wang Feb. 9, 2021, 3:20 a.m. UTC | #8
On 2021/2/8 下午6:04, Eli Cohen wrote:
> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>> When a change of memory map occurs, the hardware resources are destroyed

>>>>>> and then re-created again with the new memory map. In such case, we need

>>>>>> to restore the hardware available and used indices. The driver failed to

>>>>>> restore the used index which is added here.

>>>>>>

>>>>>> Also, since the driver also fails to reset the available and used

>>>>>> indices upon device reset, fix this here to avoid regression caused by

>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>

>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5

>>>>>> devices")

>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>> ---

>>>>>> v0 -> v1:

>>>>>> Clear indices upon device reset

>>>>>>

>>>>>>     drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>     1 file changed, 18 insertions(+)

>>>>>>

>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>         u64 device_addr;

>>>>>>         u64 driver_addr;

>>>>>>         u16 avail_index;

>>>>>> +    u16 used_index;

>>>>>>         bool ready;

>>>>>>         struct vdpa_callback cb;

>>>>>>         bool restore;

>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>         u32 virtq_id;

>>>>>>         struct mlx5_vdpa_net *ndev;

>>>>>>         u16 avail_idx;

>>>>>> +    u16 used_idx;

>>>>>>         int fw_state;

>>>>>>           /* keep last in the struct */

>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net

>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>           obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>> obj_context);

>>>>>>         MLX5_SET(virtio_net_q_object, obj_context, hw_available_index,

>>>>>> mvq->avail_idx);

>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>>>> mvq->used_idx);

>>>>>>         MLX5_SET(virtio_net_q_object, obj_context,

>>>>>> queue_feature_bit_mask_12_3,

>>>>>>              get_features_12_3(ndev->mvdev.actual_features));

>>>>>>         vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

>>>>>> virtio_q_context);

>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>     struct mlx5_virtq_attr {

>>>>>>         u8 state;

>>>>>>         u16 available_index;

>>>>>> +    u16 used_index;

>>>>>>     };

>>>>>>       static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct

>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>         memset(attr, 0, sizeof(*attr));

>>>>>>         attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>>>>         attr->available_index = MLX5_GET(virtio_net_q_object,

>>>>>> obj_context, hw_available_index);

>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context,

>>>>>> hw_used_index);

>>>>>>         kfree(out);

>>>>>>         return 0;

>>>>>>     @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct

>>>>>> mlx5_vdpa_net *ndev)

>>>>>>         }

>>>>>>     }

>>>>>>     +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>>>> +{

>>>>>> +    int i;

>>>>>> +

>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>> +    }

>>>>>> +}

>>>>>> +

>>>>>>     /* TODO: cross-endian support */

>>>>>>     static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

>>>>>> *mvdev)

>>>>>>     {

>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>             return err;

>>>>>>           ri->avail_index = attr.available_index;

>>>>>> +    ri->used_index = attr.used_index;

>>>>>>         ri->ready = mvq->ready;

>>>>>>         ri->num_ent = mvq->num_ent;

>>>>>>         ri->desc_addr = mvq->desc_addr;

>>>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>>>> mlx5_vdpa_net *ndev)

>>>>>>                 continue;

>>>>>>               mvq->avail_idx = ri->avail_index;

>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>             mvq->ready = ri->ready;

>>>>>>             mvq->num_ent = ri->num_ent;

>>>>>>             mvq->desc_addr = ri->desc_addr;

>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>> vdpa_device *vdev, u8 status)

>>>>>>         if (!status) {

>>>>>>             mlx5_vdpa_info(mvdev, "performing device reset\n");

>>>>>>             teardown_driver(ndev);

>>>>>> +        clear_virtqueues(ndev);

>>>>> The clearing looks fine at the first glance, as it aligns with the other

>>>>> state cleanups floating around at the same place. However, the thing is

>>>>> get_vq_state() is supposed to be called right after to get sync'ed with

>>>>> the latest internal avail_index from device while vq is stopped. The

>>>>> index was saved in the driver software at vq suspension, but before the

>>>>> virtq object is destroyed. We shouldn't clear the avail_index too early.

>>>> Good point.

>>>>

>>>> There's a limitation on the virtio spec and vDPA framework that we can not

>>>> simply differ device suspending from device reset.

>>>>

>>> Are you talking about live migration where you reset the device but

>>> still want to know how far it progressed in order to continue from the

>>> same place in the new VM?

>> Yes. So if we want to support live migration at we need:

>>

>> in src node:

>> 1) suspend the device

>> 2) get last_avail_idx via get_vq_state()

>>

>> in the dst node:

>> 3) set last_avail_idx via set_vq_state()

>> 4) resume the device

>>

>> So you can see, step 2 requires the device/driver not to forget the

>> last_avail_idx.

>>

> Just to be sure, what really matters here is the used index. Becuase the

> vriqtueue itself is copied from the src VM to the dest VM. The available

> index is alreay there and we know the hardware reads it from there.



So for "last_avail_idx" I meant the hardware internal avail index. It's 
not stored in the virtqueue so we must migrate it from src to dest and 
set them through set_vq_state(). Then in the destination, the virtqueue 
can be restarted from that index.


>

> So it puzzles me why is set_vq_state() we do not communicate the saved

> used index.



We don't do that since:

1) if the hardware can sync its internal used index from the virtqueue 
during device, then we don't need it
2) if the hardware can not sync its internal used index, the driver (e.g 
as you did here) can do that.

But there's no way for the hardware to deduce the internal avail index 
from the virtqueue, that's why avail index is sycned.

Thanks


>
Eli Cohen Feb. 9, 2021, 6:12 a.m. UTC | #9
On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:
> 

> On 2021/2/8 下午6:04, Eli Cohen wrote:

> > On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

> > > On 2021/2/8 下午2:37, Eli Cohen wrote:

> > > > On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

> > > > > On 2021/2/6 上午7:07, Si-Wei Liu wrote:

> > > > > > On 2/3/2021 11:36 PM, Eli Cohen wrote:

> > > > > > > When a change of memory map occurs, the hardware resources are destroyed

> > > > > > > and then re-created again with the new memory map. In such case, we need

> > > > > > > to restore the hardware available and used indices. The driver failed to

> > > > > > > restore the used index which is added here.

> > > > > > > 

> > > > > > > Also, since the driver also fails to reset the available and used

> > > > > > > indices upon device reset, fix this here to avoid regression caused by

> > > > > > > the fact that used index may not be zero upon device reset.

> > > > > > > 

> > > > > > > Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5

> > > > > > > devices")

> > > > > > > Signed-off-by: Eli Cohen<elic@nvidia.com>

> > > > > > > ---

> > > > > > > v0 -> v1:

> > > > > > > Clear indices upon device reset

> > > > > > > 

> > > > > > >     drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

> > > > > > >     1 file changed, 18 insertions(+)

> > > > > > > 

> > > > > > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > index 88dde3455bfd..b5fe6d2ad22f 100644

> > > > > > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

> > > > > > >         u64 device_addr;

> > > > > > >         u64 driver_addr;

> > > > > > >         u16 avail_index;

> > > > > > > +    u16 used_index;

> > > > > > >         bool ready;

> > > > > > >         struct vdpa_callback cb;

> > > > > > >         bool restore;

> > > > > > > @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

> > > > > > >         u32 virtq_id;

> > > > > > >         struct mlx5_vdpa_net *ndev;

> > > > > > >         u16 avail_idx;

> > > > > > > +    u16 used_idx;

> > > > > > >         int fw_state;

> > > > > > >           /* keep last in the struct */

> > > > > > > @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net

> > > > > > > *ndev, struct mlx5_vdpa_virtque

> > > > > > >           obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

> > > > > > > obj_context);

> > > > > > >         MLX5_SET(virtio_net_q_object, obj_context, hw_available_index,

> > > > > > > mvq->avail_idx);

> > > > > > > +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

> > > > > > > mvq->used_idx);

> > > > > > >         MLX5_SET(virtio_net_q_object, obj_context,

> > > > > > > queue_feature_bit_mask_12_3,

> > > > > > >              get_features_12_3(ndev->mvdev.actual_features));

> > > > > > >         vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

> > > > > > > virtio_q_context);

> > > > > > > @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

> > > > > > > *ndev, struct mlx5_vdpa_virtqueue *m

> > > > > > >     struct mlx5_virtq_attr {

> > > > > > >         u8 state;

> > > > > > >         u16 available_index;

> > > > > > > +    u16 used_index;

> > > > > > >     };

> > > > > > >       static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct

> > > > > > > mlx5_vdpa_virtqueue *mvq,

> > > > > > > @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

> > > > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

> > > > > > >         memset(attr, 0, sizeof(*attr));

> > > > > > >         attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

> > > > > > >         attr->available_index = MLX5_GET(virtio_net_q_object,

> > > > > > > obj_context, hw_available_index);

> > > > > > > +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context,

> > > > > > > hw_used_index);

> > > > > > >         kfree(out);

> > > > > > >         return 0;

> > > > > > >     @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct

> > > > > > > mlx5_vdpa_net *ndev)

> > > > > > >         }

> > > > > > >     }

> > > > > > >     +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

> > > > > > > +{

> > > > > > > +    int i;

> > > > > > > +

> > > > > > > +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

> > > > > > > +        ndev->vqs[i].avail_idx = 0;

> > > > > > > +        ndev->vqs[i].used_idx = 0;

> > > > > > > +    }

> > > > > > > +}

> > > > > > > +

> > > > > > >     /* TODO: cross-endian support */

> > > > > > >     static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

> > > > > > > *mvdev)

> > > > > > >     {

> > > > > > > @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

> > > > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

> > > > > > >             return err;

> > > > > > >           ri->avail_index = attr.available_index;

> > > > > > > +    ri->used_index = attr.used_index;

> > > > > > >         ri->ready = mvq->ready;

> > > > > > >         ri->num_ent = mvq->num_ent;

> > > > > > >         ri->desc_addr = mvq->desc_addr;

> > > > > > > @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

> > > > > > > mlx5_vdpa_net *ndev)

> > > > > > >                 continue;

> > > > > > >               mvq->avail_idx = ri->avail_index;

> > > > > > > +        mvq->used_idx = ri->used_index;

> > > > > > >             mvq->ready = ri->ready;

> > > > > > >             mvq->num_ent = ri->num_ent;

> > > > > > >             mvq->desc_addr = ri->desc_addr;

> > > > > > > @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

> > > > > > > vdpa_device *vdev, u8 status)

> > > > > > >         if (!status) {

> > > > > > >             mlx5_vdpa_info(mvdev, "performing device reset\n");

> > > > > > >             teardown_driver(ndev);

> > > > > > > +        clear_virtqueues(ndev);

> > > > > > The clearing looks fine at the first glance, as it aligns with the other

> > > > > > state cleanups floating around at the same place. However, the thing is

> > > > > > get_vq_state() is supposed to be called right after to get sync'ed with

> > > > > > the latest internal avail_index from device while vq is stopped. The

> > > > > > index was saved in the driver software at vq suspension, but before the

> > > > > > virtq object is destroyed. We shouldn't clear the avail_index too early.

> > > > > Good point.

> > > > > 

> > > > > There's a limitation on the virtio spec and vDPA framework that we can not

> > > > > simply differ device suspending from device reset.

> > > > > 

> > > > Are you talking about live migration where you reset the device but

> > > > still want to know how far it progressed in order to continue from the

> > > > same place in the new VM?

> > > Yes. So if we want to support live migration at we need:

> > > 

> > > in src node:

> > > 1) suspend the device

> > > 2) get last_avail_idx via get_vq_state()

> > > 

> > > in the dst node:

> > > 3) set last_avail_idx via set_vq_state()

> > > 4) resume the device

> > > 

> > > So you can see, step 2 requires the device/driver not to forget the

> > > last_avail_idx.

> > > 

> > Just to be sure, what really matters here is the used index. Becuase the

> > vriqtueue itself is copied from the src VM to the dest VM. The available

> > index is alreay there and we know the hardware reads it from there.

> 

> 

> So for "last_avail_idx" I meant the hardware internal avail index. It's not

> stored in the virtqueue so we must migrate it from src to dest and set them

> through set_vq_state(). Then in the destination, the virtqueue can be

> restarted from that index.

> 


Consider this case: driver posted buffers till avail index becomes the
value 50. Hardware is executing but made it till 20 when virtqueue was
suspended due to live migration - this is indicated by hardware used
index equal 20. Now the vritqueue is copied to the new VM and the
hardware now has to continue execution from index 20. We need to tell
the hardware via configuring the last used_index. So why don't we
restore the used index?

> 

> > 

> > So it puzzles me why is set_vq_state() we do not communicate the saved

> > used index.

> 

> 

> We don't do that since:

> 

> 1) if the hardware can sync its internal used index from the virtqueue

> during device, then we don't need it

> 2) if the hardware can not sync its internal used index, the driver (e.g as

> you did here) can do that.

> 

> But there's no way for the hardware to deduce the internal avail index from

> the virtqueue, that's why avail index is sycned.

> 

> Thanks

> 

> 

> > 

>
Jason Wang Feb. 9, 2021, 6:37 a.m. UTC | #10
On 2021/2/9 下午2:12, Eli Cohen wrote:
> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>> When a change of memory map occurs, the hardware resources are destroyed

>>>>>>>> and then re-created again with the new memory map. In such case, we need

>>>>>>>> to restore the hardware available and used indices. The driver failed to

>>>>>>>> restore the used index which is added here.

>>>>>>>>

>>>>>>>> Also, since the driver also fails to reset the available and used

>>>>>>>> indices upon device reset, fix this here to avoid regression caused by

>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>

>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5

>>>>>>>> devices")

>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>> ---

>>>>>>>> v0 -> v1:

>>>>>>>> Clear indices upon device reset

>>>>>>>>

>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>

>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>          u64 device_addr;

>>>>>>>>          u64 driver_addr;

>>>>>>>>          u16 avail_index;

>>>>>>>> +    u16 used_index;

>>>>>>>>          bool ready;

>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>          bool restore;

>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>          u32 virtq_id;

>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>          u16 avail_idx;

>>>>>>>> +    u16 used_idx;

>>>>>>>>          int fw_state;

>>>>>>>>            /* keep last in the struct */

>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net

>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>            obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>> obj_context);

>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, hw_available_index,

>>>>>>>> mvq->avail_idx);

>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>>>>>> mvq->used_idx);

>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>               get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

>>>>>>>> virtio_q_context);

>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>          u8 state;

>>>>>>>>          u16 available_index;

>>>>>>>> +    u16 used_index;

>>>>>>>>      };

>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct

>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>>>>>>          attr->available_index = MLX5_GET(virtio_net_q_object,

>>>>>>>> obj_context, hw_available_index);

>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, obj_context,

>>>>>>>> hw_used_index);

>>>>>>>>          kfree(out);

>>>>>>>>          return 0;

>>>>>>>>      @@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct

>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>          }

>>>>>>>>      }

>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>>>>>> +{

>>>>>>>> +    int i;

>>>>>>>> +

>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>> +    }

>>>>>>>> +}

>>>>>>>> +

>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>      static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

>>>>>>>> *mvdev)

>>>>>>>>      {

>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>              return err;

>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>                  continue;

>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>          if (!status) {

>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device reset\n");

>>>>>>>>              teardown_driver(ndev);

>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>> The clearing looks fine at the first glance, as it aligns with the other

>>>>>>> state cleanups floating around at the same place. However, the thing is

>>>>>>> get_vq_state() is supposed to be called right after to get sync'ed with

>>>>>>> the latest internal avail_index from device while vq is stopped. The

>>>>>>> index was saved in the driver software at vq suspension, but before the

>>>>>>> virtq object is destroyed. We shouldn't clear the avail_index too early.

>>>>>> Good point.

>>>>>>

>>>>>> There's a limitation on the virtio spec and vDPA framework that we can not

>>>>>> simply differ device suspending from device reset.

>>>>>>

>>>>> Are you talking about live migration where you reset the device but

>>>>> still want to know how far it progressed in order to continue from the

>>>>> same place in the new VM?

>>>> Yes. So if we want to support live migration at we need:

>>>>

>>>> in src node:

>>>> 1) suspend the device

>>>> 2) get last_avail_idx via get_vq_state()

>>>>

>>>> in the dst node:

>>>> 3) set last_avail_idx via set_vq_state()

>>>> 4) resume the device

>>>>

>>>> So you can see, step 2 requires the device/driver not to forget the

>>>> last_avail_idx.

>>>>

>>> Just to be sure, what really matters here is the used index. Becuase the

>>> vriqtueue itself is copied from the src VM to the dest VM. The available

>>> index is alreay there and we know the hardware reads it from there.

>>

>> So for "last_avail_idx" I meant the hardware internal avail index. It's not

>> stored in the virtqueue so we must migrate it from src to dest and set them

>> through set_vq_state(). Then in the destination, the virtqueue can be

>> restarted from that index.

>>

> Consider this case: driver posted buffers till avail index becomes the

> value 50. Hardware is executing but made it till 20 when virtqueue was

> suspended due to live migration - this is indicated by hardware used

> index equal 20.



So in this case the used index in the virtqueue should be 20? Otherwise 
we need not sync used index itself but all the used entries that is not 
committed to the used ring.


> Now the vritqueue is copied to the new VM and the

> hardware now has to continue execution from index 20. We need to tell

> the hardware via configuring the last used_index.



If the hardware can not sync the index from the virtqueue, the driver 
can do the synchronization by make the last_used_idx equals to used 
index in the virtqueue.

Thanks


>   So why don't we

> restore the used index?

>

>>> So it puzzles me why is set_vq_state() we do not communicate the saved

>>> used index.

>>

>> We don't do that since:

>>

>> 1) if the hardware can sync its internal used index from the virtqueue

>> during device, then we don't need it

>> 2) if the hardware can not sync its internal used index, the driver (e.g as

>> you did here) can do that.

>>

>> But there's no way for the hardware to deduce the internal avail index from

>> the virtqueue, that's why avail index is sycned.

>>

>> Thanks

>>

>>
Si-Wei Liu Feb. 10, 2021, 2:30 a.m. UTC | #11
On 2/8/2021 10:37 PM, Jason Wang wrote:
>

> On 2021/2/9 下午2:12, Eli Cohen wrote:

>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>> When a change of memory map occurs, the hardware resources are 

>>>>>>>>> destroyed

>>>>>>>>> and then re-created again with the new memory map. In such 

>>>>>>>>> case, we need

>>>>>>>>> to restore the hardware available and used indices. The driver 

>>>>>>>>> failed to

>>>>>>>>> restore the used index which is added here.

>>>>>>>>>

>>>>>>>>> Also, since the driver also fails to reset the available and used

>>>>>>>>> indices upon device reset, fix this here to avoid regression 

>>>>>>>>> caused by

>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>

>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported 

>>>>>>>>> mlx5

>>>>>>>>> devices")

>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>> ---

>>>>>>>>> v0 -> v1:

>>>>>>>>> Clear indices upon device reset

>>>>>>>>>

>>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>

>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>          u64 device_addr;

>>>>>>>>>          u64 driver_addr;

>>>>>>>>>          u16 avail_index;

>>>>>>>>> +    u16 used_index;

>>>>>>>>>          bool ready;

>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>          bool restore;

>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>          u32 virtq_id;

>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>          u16 avail_idx;

>>>>>>>>> +    u16 used_idx;

>>>>>>>>>          int fw_state;

>>>>>>>>>            /* keep last in the struct */

>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>> mlx5_vdpa_net

>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>            obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>>> obj_context);

>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>> hw_available_index,

>>>>>>>>> mvq->avail_idx);

>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>>>>>>> mvq->used_idx);

>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

>>>>>>>>> virtio_q_context);

>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net

>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>          u8 state;

>>>>>>>>>          u16 available_index;

>>>>>>>>> +    u16 used_index;

>>>>>>>>>      };

>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net *ndev, 

>>>>>>>>> struct

>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>> obj_context, state);

>>>>>>>>>          attr->available_index = MLX5_GET(virtio_net_q_object,

>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>> obj_context,

>>>>>>>>> hw_used_index);

>>>>>>>>>          kfree(out);

>>>>>>>>>          return 0;

>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>          }

>>>>>>>>>      }

>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>>>>>>> +{

>>>>>>>>> +    int i;

>>>>>>>>> +

>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>> +    }

>>>>>>>>> +}

>>>>>>>>> +

>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>      static inline bool mlx5_vdpa_is_little_endian(struct 

>>>>>>>>> mlx5_vdpa_dev

>>>>>>>>> *mvdev)

>>>>>>>>>      {

>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>              return err;

>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>                  continue;

>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>          if (!status) {

>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device reset\n");

>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>> The clearing looks fine at the first glance, as it aligns with 

>>>>>>>> the other

>>>>>>>> state cleanups floating around at the same place. However, the 

>>>>>>>> thing is

>>>>>>>> get_vq_state() is supposed to be called right after to get 

>>>>>>>> sync'ed with

>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>> stopped. The

>>>>>>>> index was saved in the driver software at vq suspension, but 

>>>>>>>> before the

>>>>>>>> virtq object is destroyed. We shouldn't clear the avail_index 

>>>>>>>> too early.

>>>>>>> Good point.

>>>>>>>

>>>>>>> There's a limitation on the virtio spec and vDPA framework that 

>>>>>>> we can not

>>>>>>> simply differ device suspending from device reset.

>>>>>>>

>>>>>> Are you talking about live migration where you reset the device but

>>>>>> still want to know how far it progressed in order to continue 

>>>>>> from the

>>>>>> same place in the new VM?

>>>>> Yes. So if we want to support live migration at we need:

>>>>>

>>>>> in src node:

>>>>> 1) suspend the device

>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>

>>>>> in the dst node:

>>>>> 3) set last_avail_idx via set_vq_state()

>>>>> 4) resume the device

>>>>>

>>>>> So you can see, step 2 requires the device/driver not to forget the

>>>>> last_avail_idx.

>>>>>

>>>> Just to be sure, what really matters here is the used index. 

>>>> Becuase the

>>>> vriqtueue itself is copied from the src VM to the dest VM. The 

>>>> available

>>>> index is alreay there and we know the hardware reads it from there.

>>>

>>> So for "last_avail_idx" I meant the hardware internal avail index. 

>>> It's not

>>> stored in the virtqueue so we must migrate it from src to dest and 

>>> set them

>>> through set_vq_state(). Then in the destination, the virtqueue can be

>>> restarted from that index.

>>>

>> Consider this case: driver posted buffers till avail index becomes the

>> value 50. Hardware is executing but made it till 20 when virtqueue was

>> suspended due to live migration - this is indicated by hardware used

>> index equal 20.

>

>

> So in this case the used index in the virtqueue should be 20? 

> Otherwise we need not sync used index itself but all the used entries 

> that is not committed to the used ring.


In other word, for mlx5 vdpa there's no such internal last_avail_idx 
stuff maintained by the hardware, right? And the used_idx in the 
virtqueue is always in sync with the hardware used_index, and hardware 
is supposed to commit pending used buffers to the ring while bumping up 
the hardware used_index (and also committed to memory) altogether prior 
to suspension, is my understanding correct here? Double checking if this 
is the expected semantics of what 
modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should achieve.

If the above is true, then it looks to me for mlx5 vdpa we should really 
return h/w used_idx rather than the last_avail_idx through 
get_vq_state(), in order to reconstruct the virt queue state post live 
migration. For the set_map case, the internal last_avail_idx really 
doesn't matter, although both indices are saved and restored 
transparently as-is.

-Siwei

>

>

>> Now the vritqueue is copied to the new VM and the

>> hardware now has to continue execution from index 20. We need to tell

>> the hardware via configuring the last used_index.

>

>

> If the hardware can not sync the index from the virtqueue, the driver 

> can do the synchronization by make the last_used_idx equals to used 

> index in the virtqueue.

>

> Thanks

>

>

>>   So why don't we

>> restore the used index?

>>

>>>> So it puzzles me why is set_vq_state() we do not communicate the saved

>>>> used index.

>>>

>>> We don't do that since:

>>>

>>> 1) if the hardware can sync its internal used index from the virtqueue

>>> during device, then we don't need it

>>> 2) if the hardware can not sync its internal used index, the driver 

>>> (e.g as

>>> you did here) can do that.

>>>

>>> But there's no way for the hardware to deduce the internal avail 

>>> index from

>>> the virtqueue, that's why avail index is sycned.

>>>

>>> Thanks

>>>

>>>

>
Jason Wang Feb. 10, 2021, 3:53 a.m. UTC | #12
On 2021/2/10 上午10:30, Si-Wei Liu wrote:
>

>

> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>

>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>> When a change of memory map occurs, the hardware resources 

>>>>>>>>>> are destroyed

>>>>>>>>>> and then re-created again with the new memory map. In such 

>>>>>>>>>> case, we need

>>>>>>>>>> to restore the hardware available and used indices. The 

>>>>>>>>>> driver failed to

>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>

>>>>>>>>>> Also, since the driver also fails to reset the available and 

>>>>>>>>>> used

>>>>>>>>>> indices upon device reset, fix this here to avoid regression 

>>>>>>>>>> caused by

>>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>>

>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 

>>>>>>>>>> supported mlx5

>>>>>>>>>> devices")

>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>> ---

>>>>>>>>>> v0 -> v1:

>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>

>>>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>>

>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>          u64 device_addr;

>>>>>>>>>>          u64 driver_addr;

>>>>>>>>>>          u16 avail_index;

>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>          bool ready;

>>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>>          bool restore;

>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>          u32 virtq_id;

>>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>>          u16 avail_idx;

>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>          int fw_state;

>>>>>>>>>>            /* keep last in the struct */

>>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>            obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, 

>>>>>>>>>> in,

>>>>>>>>>> obj_context);

>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>> hw_available_index,

>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,

>>>>>>>>>> virtio_q_context);

>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct 

>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>>          u8 state;

>>>>>>>>>>          u16 available_index;

>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>      };

>>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net *ndev, 

>>>>>>>>>> struct

>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>> obj_context, state);

>>>>>>>>>>          attr->available_index = MLX5_GET(virtio_net_q_object,

>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>> obj_context,

>>>>>>>>>> hw_used_index);

>>>>>>>>>>          kfree(out);

>>>>>>>>>>          return 0;

>>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>          }

>>>>>>>>>>      }

>>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>>>>>>>> +{

>>>>>>>>>> +    int i;

>>>>>>>>>> +

>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>> +    }

>>>>>>>>>> +}

>>>>>>>>>> +

>>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>>      static inline bool mlx5_vdpa_is_little_endian(struct 

>>>>>>>>>> mlx5_vdpa_dev

>>>>>>>>>> *mvdev)

>>>>>>>>>>      {

>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>              return err;

>>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>                  continue;

>>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>          if (!status) {

>>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device reset\n");

>>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>> The clearing looks fine at the first glance, as it aligns with 

>>>>>>>>> the other

>>>>>>>>> state cleanups floating around at the same place. However, the 

>>>>>>>>> thing is

>>>>>>>>> get_vq_state() is supposed to be called right after to get 

>>>>>>>>> sync'ed with

>>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>>> stopped. The

>>>>>>>>> index was saved in the driver software at vq suspension, but 

>>>>>>>>> before the

>>>>>>>>> virtq object is destroyed. We shouldn't clear the avail_index 

>>>>>>>>> too early.

>>>>>>>> Good point.

>>>>>>>>

>>>>>>>> There's a limitation on the virtio spec and vDPA framework that 

>>>>>>>> we can not

>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>

>>>>>>> Are you talking about live migration where you reset the device but

>>>>>>> still want to know how far it progressed in order to continue 

>>>>>>> from the

>>>>>>> same place in the new VM?

>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>

>>>>>> in src node:

>>>>>> 1) suspend the device

>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>

>>>>>> in the dst node:

>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>> 4) resume the device

>>>>>>

>>>>>> So you can see, step 2 requires the device/driver not to forget the

>>>>>> last_avail_idx.

>>>>>>

>>>>> Just to be sure, what really matters here is the used index. 

>>>>> Becuase the

>>>>> vriqtueue itself is copied from the src VM to the dest VM. The 

>>>>> available

>>>>> index is alreay there and we know the hardware reads it from there.

>>>>

>>>> So for "last_avail_idx" I meant the hardware internal avail index. 

>>>> It's not

>>>> stored in the virtqueue so we must migrate it from src to dest and 

>>>> set them

>>>> through set_vq_state(). Then in the destination, the virtqueue can be

>>>> restarted from that index.

>>>>

>>> Consider this case: driver posted buffers till avail index becomes the

>>> value 50. Hardware is executing but made it till 20 when virtqueue was

>>> suspended due to live migration - this is indicated by hardware used

>>> index equal 20.

>>

>>

>> So in this case the used index in the virtqueue should be 20? 

>> Otherwise we need not sync used index itself but all the used entries 

>> that is not committed to the used ring.

>

> In other word, for mlx5 vdpa there's no such internal last_avail_idx 

> stuff maintained by the hardware, right? 



For each device it should have one otherwise it won't work correctly 
during stop/resume. See the codes mlx5_vdpa_get_vq_state() which calls 
query_virtqueue() that build commands to query "last_avail_idx" from the 
hardware:

     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 
MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 
MLX5_OBJ_TYPE_VIRTIO_NET_Q);
     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
     if (err)
         goto err_cmd;

     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
     memset(attr, 0, sizeof(*attr));
     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
     attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, 
hw_available_index);




> And the used_idx in the virtqueue is always in sync with the hardware 

> used_index, and hardware is supposed to commit pending used buffers to 

> the ring while bumping up the hardware used_index (and also committed 

> to memory) altogether prior to suspension, is my understanding correct 

> here? Double checking if this is the expected semantics of what 

> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should achieve.

>

> If the above is true, then it looks to me for mlx5 vdpa we should 

> really return h/w used_idx rather than the last_avail_idx through 

> get_vq_state(), in order to reconstruct the virt queue state post live 

> migration. For the set_map case, the internal last_avail_idx really 

> doesn't matter, although both indices are saved and restored 

> transparently as-is.



Right, a subtle thing here is that: for the device that might have can't 
not complete all virtqueue requests during vq suspending, the 
"last_avail_idx" might not be equal to the hardware used_idx. Thing 
might be true for the storage devices that needs to connect to a remote 
backend. But this is not the case of networking device, so 
last_avail_idx should be equal to hardware used_idx here. But using the 
"last_avail_idx" or hardware avail_idx should always be better in this 
case since it's guaranteed to correct and will have less confusion. We 
use this convention in other types of vhost backends (vhost-kernel, 
vhost-user).

So looking at mlx5_set_vq_state(), it probably won't work since it 
doesn't not set either hardware avail_idx or hardware used_idx:

static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
                   const struct vdpa_vq_state *state)
{
     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
         mlx5_vdpa_warn(mvdev, "can't modify available index\n");
         return -EINVAL;
     }

     mvq->avail_idx = state->avail_index;
     return 0;
}

Depends on the hardware, we should either set hardware used_idx or 
hardware avail_idx here.

I think we need to clarify how device is supposed to work in the virtio 
spec.

Thanks


>

> -Siwei

>

>>

>>

>>> Now the vritqueue is copied to the new VM and the

>>> hardware now has to continue execution from index 20. We need to tell

>>> the hardware via configuring the last used_index.

>>

>>

>> If the hardware can not sync the index from the virtqueue, the driver 

>> can do the synchronization by make the last_used_idx equals to used 

>> index in the virtqueue.

>>

>> Thanks

>>

>>

>>>   So why don't we

>>> restore the used index?

>>>

>>>>> So it puzzles me why is set_vq_state() we do not communicate the 

>>>>> saved

>>>>> used index.

>>>>

>>>> We don't do that since:

>>>>

>>>> 1) if the hardware can sync its internal used index from the virtqueue

>>>> during device, then we don't need it

>>>> 2) if the hardware can not sync its internal used index, the driver 

>>>> (e.g as

>>>> you did here) can do that.

>>>>

>>>> But there's no way for the hardware to deduce the internal avail 

>>>> index from

>>>> the virtqueue, that's why avail index is sycned.

>>>>

>>>> Thanks

>>>>

>>>>

>>

>
Si-Wei Liu Feb. 10, 2021, 8:59 a.m. UTC | #13
On 2/9/2021 7:53 PM, Jason Wang wrote:
>

> On 2021/2/10 上午10:30, Si-Wei Liu wrote:

>>

>>

>> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>>

>>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>>> When a change of memory map occurs, the hardware resources 

>>>>>>>>>>> are destroyed

>>>>>>>>>>> and then re-created again with the new memory map. In such 

>>>>>>>>>>> case, we need

>>>>>>>>>>> to restore the hardware available and used indices. The 

>>>>>>>>>>> driver failed to

>>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>>

>>>>>>>>>>> Also, since the driver also fails to reset the available and 

>>>>>>>>>>> used

>>>>>>>>>>> indices upon device reset, fix this here to avoid regression 

>>>>>>>>>>> caused by

>>>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>>>

>>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 

>>>>>>>>>>> supported mlx5

>>>>>>>>>>> devices")

>>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>>> ---

>>>>>>>>>>> v0 -> v1:

>>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>>

>>>>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>>>

>>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>>          u64 device_addr;

>>>>>>>>>>>          u64 driver_addr;

>>>>>>>>>>>          u16 avail_index;

>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>          bool ready;

>>>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>>>          bool restore;

>>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>>          u32 virtq_id;

>>>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>>>          u16 avail_idx;

>>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>>          int fw_state;

>>>>>>>>>>>            /* keep last in the struct */

>>>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>>            obj_context = 

>>>>>>>>>>> MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>>>>> obj_context);

>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>> hw_available_index,

>>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 

>>>>>>>>>>> obj_context,

>>>>>>>>>>> virtio_q_context);

>>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct 

>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>>>          u8 state;

>>>>>>>>>>>          u16 available_index;

>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>      };

>>>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net 

>>>>>>>>>>> *ndev, struct

>>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>> obj_context, state);

>>>>>>>>>>>          attr->available_index = MLX5_GET(virtio_net_q_object,

>>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>> obj_context,

>>>>>>>>>>> hw_used_index);

>>>>>>>>>>>          kfree(out);

>>>>>>>>>>>          return 0;

>>>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>          }

>>>>>>>>>>>      }

>>>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>>>>>>>>> +{

>>>>>>>>>>> +    int i;

>>>>>>>>>>> +

>>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>>> +    }

>>>>>>>>>>> +}

>>>>>>>>>>> +

>>>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>>>      static inline bool mlx5_vdpa_is_little_endian(struct 

>>>>>>>>>>> mlx5_vdpa_dev

>>>>>>>>>>> *mvdev)

>>>>>>>>>>>      {

>>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>>              return err;

>>>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>                  continue;

>>>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>>          if (!status) {

>>>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device 

>>>>>>>>>>> reset\n");

>>>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>>> The clearing looks fine at the first glance, as it aligns 

>>>>>>>>>> with the other

>>>>>>>>>> state cleanups floating around at the same place. However, 

>>>>>>>>>> the thing is

>>>>>>>>>> get_vq_state() is supposed to be called right after to get 

>>>>>>>>>> sync'ed with

>>>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>>>> stopped. The

>>>>>>>>>> index was saved in the driver software at vq suspension, but 

>>>>>>>>>> before the

>>>>>>>>>> virtq object is destroyed. We shouldn't clear the avail_index 

>>>>>>>>>> too early.

>>>>>>>>> Good point.

>>>>>>>>>

>>>>>>>>> There's a limitation on the virtio spec and vDPA framework 

>>>>>>>>> that we can not

>>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>>

>>>>>>>> Are you talking about live migration where you reset the device 

>>>>>>>> but

>>>>>>>> still want to know how far it progressed in order to continue 

>>>>>>>> from the

>>>>>>>> same place in the new VM?

>>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>>

>>>>>>> in src node:

>>>>>>> 1) suspend the device

>>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>>

>>>>>>> in the dst node:

>>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>>> 4) resume the device

>>>>>>>

>>>>>>> So you can see, step 2 requires the device/driver not to forget the

>>>>>>> last_avail_idx.

>>>>>>>

>>>>>> Just to be sure, what really matters here is the used index. 

>>>>>> Becuase the

>>>>>> vriqtueue itself is copied from the src VM to the dest VM. The 

>>>>>> available

>>>>>> index is alreay there and we know the hardware reads it from there.

>>>>>

>>>>> So for "last_avail_idx" I meant the hardware internal avail index. 

>>>>> It's not

>>>>> stored in the virtqueue so we must migrate it from src to dest and 

>>>>> set them

>>>>> through set_vq_state(). Then in the destination, the virtqueue can be

>>>>> restarted from that index.

>>>>>

>>>> Consider this case: driver posted buffers till avail index becomes the

>>>> value 50. Hardware is executing but made it till 20 when virtqueue was

>>>> suspended due to live migration - this is indicated by hardware used

>>>> index equal 20.

>>>

>>>

>>> So in this case the used index in the virtqueue should be 20? 

>>> Otherwise we need not sync used index itself but all the used 

>>> entries that is not committed to the used ring.

>>

>> In other word, for mlx5 vdpa there's no such internal last_avail_idx 

>> stuff maintained by the hardware, right? 

>

>

> For each device it should have one otherwise it won't work correctly 

> during stop/resume. See the codes mlx5_vdpa_get_vq_state() which calls 

> query_virtqueue() that build commands to query "last_avail_idx" from 

> the hardware:

>

>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 

> MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 

> MLX5_OBJ_TYPE_VIRTIO_NET_Q);

>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);

>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);

>     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);

>     if (err)

>         goto err_cmd;

>

>     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);

>     memset(attr, 0, sizeof(*attr));

>     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>     attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, 

> hw_available_index);

>

Eli should be able to correct me, but this hw_available_index might just 
be a cached value of virtqueue avail_index in the memory from the most 
recent sync. I doubt it's the one you talked about in software 
implementation. If I understand Eli correctly, hardware will always 
reload the latest avail_index from memory whenever it's being sync'ed 
again.

<quote>
The hardware always goes to read the available index from memory. The 
requirement to configure it when creating a new object is still a 
requirement defined by the interface so I must not violate interface 
requirments.
</quote>

If the hardware does everything perfectly that is able to flush pending 
requests, update descriptors, rings plus used indices all at once before 
the suspension, there's no need for hardware to maintain a separate 
internal index than the h/w used_index. The hardware can get started 
from the saved used_index upon resuming. I view this is of (hardware) 
implementation choices and thought it does not violate the virtio spec?


>

>

>

>> And the used_idx in the virtqueue is always in sync with the hardware 

>> used_index, and hardware is supposed to commit pending used buffers 

>> to the ring while bumping up the hardware used_index (and also 

>> committed to memory) altogether prior to suspension, is my 

>> understanding correct here? Double checking if this is the expected 

>> semantics of what 

>> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should achieve.

>>

>> If the above is true, then it looks to me for mlx5 vdpa we should 

>> really return h/w used_idx rather than the last_avail_idx through 

>> get_vq_state(), in order to reconstruct the virt queue state post 

>> live migration. For the set_map case, the internal last_avail_idx 

>> really doesn't matter, although both indices are saved and restored 

>> transparently as-is.

>

>

> Right, a subtle thing here is that: for the device that might have 

> can't not complete all virtqueue requests during vq suspending, the 

> "last_avail_idx" might not be equal to the hardware used_idx. Thing 

> might be true for the storage devices that needs to connect to a 

> remote backend. But this is not the case of networking device, so 

> last_avail_idx should be equal to hardware used_idx here. 

Eli, since it's your hardware, does it work this way? i.e. does the 
firmware interface see a case where virtqueue requests can't be 
completed before suspending vq?

> But using the "last_avail_idx" or hardware avail_idx should always be 

> better in this case since it's guaranteed to correct and will have 

> less confusion. We use this convention in other types of vhost 

> backends (vhost-kernel, vhost-user).

>

> So looking at mlx5_set_vq_state(), it probably won't work since it 

> doesn't not set either hardware avail_idx or hardware used_idx:

The saved mvq->avail_idx will be used to recreate hardware virtq object 
and the used index in create_virtqueue(), once status DRIVER_OK is set. 
I suspect we should pass the index to mvq->used_idx in 
mlx5_vdpa_set_vq_state() below instead.


Thanks,
-Siwei
>

> static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

>                   const struct vdpa_vq_state *state)

> {

>     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

>     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

>     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

>

>     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

>         mlx5_vdpa_warn(mvdev, "can't modify available index\n");

>         return -EINVAL;

>     }

>

>     mvq->avail_idx = state->avail_index;

>     return 0;

> }

>

> Depends on the hardware, we should either set hardware used_idx or 

> hardware avail_idx here.

>

> I think we need to clarify how device is supposed to work in the 

> virtio spec.

>

> Thanks

>

>

>>

>> -Siwei

>>

>>>

>>>

>>>> Now the vritqueue is copied to the new VM and the

>>>> hardware now has to continue execution from index 20. We need to tell

>>>> the hardware via configuring the last used_index.

>>>

>>>

>>> If the hardware can not sync the index from the virtqueue, the 

>>> driver can do the synchronization by make the last_used_idx equals 

>>> to used index in the virtqueue.

>>>

>>> Thanks

>>>

>>>

>>>>   So why don't we

>>>> restore the used index?

>>>>

>>>>>> So it puzzles me why is set_vq_state() we do not communicate the 

>>>>>> saved

>>>>>> used index.

>>>>>

>>>>> We don't do that since:

>>>>>

>>>>> 1) if the hardware can sync its internal used index from the 

>>>>> virtqueue

>>>>> during device, then we don't need it

>>>>> 2) if the hardware can not sync its internal used index, the 

>>>>> driver (e.g as

>>>>> you did here) can do that.

>>>>>

>>>>> But there's no way for the hardware to deduce the internal avail 

>>>>> index from

>>>>> the virtqueue, that's why avail index is sycned.

>>>>>

>>>>> Thanks

>>>>>

>>>>>

>>>

>>

>
Eli Cohen Feb. 10, 2021, 3:45 p.m. UTC | #14
On Wed, Feb 10, 2021 at 12:59:03AM -0800, Si-Wei Liu wrote:
> 

> 

> On 2/9/2021 7:53 PM, Jason Wang wrote:

> > 

> > On 2021/2/10 上午10:30, Si-Wei Liu wrote:

> > > 

> > > 

> > > On 2/8/2021 10:37 PM, Jason Wang wrote:

> > > > 

> > > > On 2021/2/9 下午2:12, Eli Cohen wrote:

> > > > > On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

> > > > > > On 2021/2/8 下午6:04, Eli Cohen wrote:

> > > > > > > On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

> > > > > > > > On 2021/2/8 下午2:37, Eli Cohen wrote:

> > > > > > > > > On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

> > > > > > > > > > On 2021/2/6 上午7:07, Si-Wei Liu wrote:

> > > > > > > > > > > On 2/3/2021 11:36 PM, Eli Cohen wrote:

> > > > > > > > > > > > When a change of memory map

> > > > > > > > > > > > occurs, the hardware resources

> > > > > > > > > > > > are destroyed

> > > > > > > > > > > > and then re-created again with

> > > > > > > > > > > > the new memory map. In such

> > > > > > > > > > > > case, we need

> > > > > > > > > > > > to restore the hardware

> > > > > > > > > > > > available and used indices. The

> > > > > > > > > > > > driver failed to

> > > > > > > > > > > > restore the used index which is added here.

> > > > > > > > > > > > 

> > > > > > > > > > > > Also, since the driver also

> > > > > > > > > > > > fails to reset the available and

> > > > > > > > > > > > used

> > > > > > > > > > > > indices upon device reset, fix

> > > > > > > > > > > > this here to avoid regression

> > > > > > > > > > > > caused by

> > > > > > > > > > > > the fact that used index may not be zero upon device reset.

> > > > > > > > > > > > 

> > > > > > > > > > > > Fixes: 1a86b377aa21 ("vdpa/mlx5:

> > > > > > > > > > > > Add VDPA driver for supported

> > > > > > > > > > > > mlx5

> > > > > > > > > > > > devices")

> > > > > > > > > > > > Signed-off-by: Eli Cohen<elic@nvidia.com>

> > > > > > > > > > > > ---

> > > > > > > > > > > > v0 -> v1:

> > > > > > > > > > > > Clear indices upon device reset

> > > > > > > > > > > > 

> > > > > > > > > > > >      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

> > > > > > > > > > > >      1 file changed, 18 insertions(+)

> > > > > > > > > > > > 

> > > > > > > > > > > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > > > > > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > > > > > > index 88dde3455bfd..b5fe6d2ad22f 100644

> > > > > > > > > > > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > > > > > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

> > > > > > > > > > > > @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

> > > > > > > > > > > >          u64 device_addr;

> > > > > > > > > > > >          u64 driver_addr;

> > > > > > > > > > > >          u16 avail_index;

> > > > > > > > > > > > +    u16 used_index;

> > > > > > > > > > > >          bool ready;

> > > > > > > > > > > >          struct vdpa_callback cb;

> > > > > > > > > > > >          bool restore;

> > > > > > > > > > > > @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

> > > > > > > > > > > >          u32 virtq_id;

> > > > > > > > > > > >          struct mlx5_vdpa_net *ndev;

> > > > > > > > > > > >          u16 avail_idx;

> > > > > > > > > > > > +    u16 used_idx;

> > > > > > > > > > > >          int fw_state;

> > > > > > > > > > > >            /* keep last in the struct */

> > > > > > > > > > > > @@ -804,6 +806,7 @@ static int

> > > > > > > > > > > > create_virtqueue(struct

> > > > > > > > > > > > mlx5_vdpa_net

> > > > > > > > > > > > *ndev, struct mlx5_vdpa_virtque

> > > > > > > > > > > >            obj_context =

> > > > > > > > > > > > MLX5_ADDR_OF(create_virtio_net_q_in,

> > > > > > > > > > > > in,

> > > > > > > > > > > > obj_context);

> > > > > > > > > > > >         

> > > > > > > > > > > > MLX5_SET(virtio_net_q_object,

> > > > > > > > > > > > obj_context, hw_available_index,

> > > > > > > > > > > > mvq->avail_idx);

> > > > > > > > > > > > +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

> > > > > > > > > > > > mvq->used_idx);

> > > > > > > > > > > >          MLX5_SET(virtio_net_q_object, obj_context,

> > > > > > > > > > > > queue_feature_bit_mask_12_3,

> > > > > > > > > > > > get_features_12_3(ndev->mvdev.actual_features));

> > > > > > > > > > > >          vq_ctx =

> > > > > > > > > > > > MLX5_ADDR_OF(virtio_net_q_object,

> > > > > > > > > > > > obj_context,

> > > > > > > > > > > > virtio_q_context);

> > > > > > > > > > > > @@ -1022,6 +1025,7 @@ static int

> > > > > > > > > > > > connect_qps(struct mlx5_vdpa_net

> > > > > > > > > > > > *ndev, struct mlx5_vdpa_virtqueue *m

> > > > > > > > > > > >      struct mlx5_virtq_attr {

> > > > > > > > > > > >          u8 state;

> > > > > > > > > > > >          u16 available_index;

> > > > > > > > > > > > +    u16 used_index;

> > > > > > > > > > > >      };

> > > > > > > > > > > >        static int

> > > > > > > > > > > > query_virtqueue(struct

> > > > > > > > > > > > mlx5_vdpa_net *ndev, struct

> > > > > > > > > > > > mlx5_vdpa_virtqueue *mvq,

> > > > > > > > > > > > @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

> > > > > > > > > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

> > > > > > > > > > > >          memset(attr, 0, sizeof(*attr));

> > > > > > > > > > > >          attr->state =

> > > > > > > > > > > > MLX5_GET(virtio_net_q_object,

> > > > > > > > > > > > obj_context, state);

> > > > > > > > > > > >          attr->available_index = MLX5_GET(virtio_net_q_object,

> > > > > > > > > > > > obj_context, hw_available_index);

> > > > > > > > > > > > +    attr->used_index =

> > > > > > > > > > > > MLX5_GET(virtio_net_q_object,

> > > > > > > > > > > > obj_context,

> > > > > > > > > > > > hw_used_index);

> > > > > > > > > > > >          kfree(out);

> > > > > > > > > > > >          return 0;

> > > > > > > > > > > >      @@ -1535,6 +1540,16 @@

> > > > > > > > > > > > static void

> > > > > > > > > > > > teardown_virtqueues(struct

> > > > > > > > > > > > mlx5_vdpa_net *ndev)

> > > > > > > > > > > >          }

> > > > > > > > > > > >      }

> > > > > > > > > > > >      +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

> > > > > > > > > > > > +{

> > > > > > > > > > > > +    int i;

> > > > > > > > > > > > +

> > > > > > > > > > > > +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

> > > > > > > > > > > > +        ndev->vqs[i].avail_idx = 0;

> > > > > > > > > > > > +        ndev->vqs[i].used_idx = 0;

> > > > > > > > > > > > +    }

> > > > > > > > > > > > +}

> > > > > > > > > > > > +

> > > > > > > > > > > >      /* TODO: cross-endian support */

> > > > > > > > > > > >      static inline bool

> > > > > > > > > > > > mlx5_vdpa_is_little_endian(struct

> > > > > > > > > > > > mlx5_vdpa_dev

> > > > > > > > > > > > *mvdev)

> > > > > > > > > > > >      {

> > > > > > > > > > > > @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

> > > > > > > > > > > > mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

> > > > > > > > > > > >              return err;

> > > > > > > > > > > >            ri->avail_index = attr.available_index;

> > > > > > > > > > > > +    ri->used_index = attr.used_index;

> > > > > > > > > > > >          ri->ready = mvq->ready;

> > > > > > > > > > > >          ri->num_ent = mvq->num_ent;

> > > > > > > > > > > >          ri->desc_addr = mvq->desc_addr;

> > > > > > > > > > > > @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

> > > > > > > > > > > > mlx5_vdpa_net *ndev)

> > > > > > > > > > > >                  continue;

> > > > > > > > > > > >                mvq->avail_idx = ri->avail_index;

> > > > > > > > > > > > +        mvq->used_idx = ri->used_index;

> > > > > > > > > > > >              mvq->ready = ri->ready;

> > > > > > > > > > > >              mvq->num_ent = ri->num_ent;

> > > > > > > > > > > >              mvq->desc_addr = ri->desc_addr;

> > > > > > > > > > > > @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

> > > > > > > > > > > > vdpa_device *vdev, u8 status)

> > > > > > > > > > > >          if (!status) {

> > > > > > > > > > > >             

> > > > > > > > > > > > mlx5_vdpa_info(mvdev,

> > > > > > > > > > > > "performing device reset\n");

> > > > > > > > > > > >              teardown_driver(ndev);

> > > > > > > > > > > > +        clear_virtqueues(ndev);

> > > > > > > > > > > The clearing looks fine at the first

> > > > > > > > > > > glance, as it aligns with the other

> > > > > > > > > > > state cleanups floating around at

> > > > > > > > > > > the same place. However, the thing

> > > > > > > > > > > is

> > > > > > > > > > > get_vq_state() is supposed to be

> > > > > > > > > > > called right after to get sync'ed

> > > > > > > > > > > with

> > > > > > > > > > > the latest internal avail_index from

> > > > > > > > > > > device while vq is stopped. The

> > > > > > > > > > > index was saved in the driver

> > > > > > > > > > > software at vq suspension, but

> > > > > > > > > > > before the

> > > > > > > > > > > virtq object is destroyed. We

> > > > > > > > > > > shouldn't clear the avail_index too

> > > > > > > > > > > early.

> > > > > > > > > > Good point.

> > > > > > > > > > 

> > > > > > > > > > There's a limitation on the virtio spec

> > > > > > > > > > and vDPA framework that we can not

> > > > > > > > > > simply differ device suspending from device reset.

> > > > > > > > > > 

> > > > > > > > > Are you talking about live migration where

> > > > > > > > > you reset the device but

> > > > > > > > > still want to know how far it progressed in

> > > > > > > > > order to continue from the

> > > > > > > > > same place in the new VM?

> > > > > > > > Yes. So if we want to support live migration at we need:

> > > > > > > > 

> > > > > > > > in src node:

> > > > > > > > 1) suspend the device

> > > > > > > > 2) get last_avail_idx via get_vq_state()

> > > > > > > > 

> > > > > > > > in the dst node:

> > > > > > > > 3) set last_avail_idx via set_vq_state()

> > > > > > > > 4) resume the device

> > > > > > > > 

> > > > > > > > So you can see, step 2 requires the device/driver not to forget the

> > > > > > > > last_avail_idx.

> > > > > > > > 

> > > > > > > Just to be sure, what really matters here is the

> > > > > > > used index. Becuase the

> > > > > > > vriqtueue itself is copied from the src VM to the

> > > > > > > dest VM. The available

> > > > > > > index is alreay there and we know the hardware reads it from there.

> > > > > > 

> > > > > > So for "last_avail_idx" I meant the hardware internal

> > > > > > avail index. It's not

> > > > > > stored in the virtqueue so we must migrate it from src

> > > > > > to dest and set them

> > > > > > through set_vq_state(). Then in the destination, the virtqueue can be

> > > > > > restarted from that index.

> > > > > > 

> > > > > Consider this case: driver posted buffers till avail index becomes the

> > > > > value 50. Hardware is executing but made it till 20 when virtqueue was

> > > > > suspended due to live migration - this is indicated by hardware used

> > > > > index equal 20.

> > > > 

> > > > 

> > > > So in this case the used index in the virtqueue should be 20?

> > > > Otherwise we need not sync used index itself but all the used

> > > > entries that is not committed to the used ring.

> > > 

> > > In other word, for mlx5 vdpa there's no such internal last_avail_idx

> > > stuff maintained by the hardware, right?

> > 

> > 

> > For each device it should have one otherwise it won't work correctly

> > during stop/resume. See the codes mlx5_vdpa_get_vq_state() which calls

> > query_virtqueue() that build commands to query "last_avail_idx" from the

> > hardware:

> > 

> >     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,

> > MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

> >     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,

> > MLX5_OBJ_TYPE_VIRTIO_NET_Q);

> >     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);

> >     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);

> >     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);

> >     if (err)

> >         goto err_cmd;

> > 

> >     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);

> >     memset(attr, 0, sizeof(*attr));

> >     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

> >     attr->available_index = MLX5_GET(virtio_net_q_object, obj_context,

> > hw_available_index);

> > 

> Eli should be able to correct me, but this hw_available_index might just be

> a cached value of virtqueue avail_index in the memory from the most recent

> sync. I doubt it's the one you talked about in software implementation. If I

> understand Eli correctly, hardware will always reload the latest avail_index

> from memory whenever it's being sync'ed again.


That's my understanding too. I am still trying to get confirmation from
hardware guys. Will send when I have them.

> 

> <quote>

> The hardware always goes to read the available index from memory. The

> requirement to configure it when creating a new object is still a

> requirement defined by the interface so I must not violate interface

> requirments.

> </quote>

> 

> If the hardware does everything perfectly that is able to flush pending

> requests, update descriptors, rings plus used indices all at once before the

> suspension, there's no need for hardware to maintain a separate internal

> index than the h/w used_index. The hardware can get started from the saved

> used_index upon resuming. I view this is of (hardware) implementation

> choices and thought it does not violate the virtio spec?

> 

> 

> > 

> > 

> > 

> > > And the used_idx in the virtqueue is always in sync with the

> > > hardware used_index, and hardware is supposed to commit pending used

> > > buffers to the ring while bumping up the hardware used_index (and

> > > also committed to memory) altogether prior to suspension, is my

> > > understanding correct here? Double checking if this is the expected

> > > semantics of what

> > > modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should

> > > achieve.

> > > 


That's my understanding too.

> > > If the above is true, then it looks to me for mlx5 vdpa we should

> > > really return h/w used_idx rather than the last_avail_idx through

> > > get_vq_state(), in order to reconstruct the virt queue state post

> > > live migration. For the set_map case, the internal last_avail_idx

> > > really doesn't matter, although both indices are saved and restored

> > > transparently as-is.

> > 


Right, that's what I think too. In fact, I discussed that already with
Jason over the phone yesterday. The conclusion was since the only thing
that matters for a network device is the used index, I tried to return
the hardware used index in get_vq_state (instead of hardware available
index) and restore the value I get in set_vq_state into the hardware
used index. I am still not done with this experiment but will update.

> > 

> > Right, a subtle thing here is that: for the device that might have can't

> > not complete all virtqueue requests during vq suspending, the

> > "last_avail_idx" might not be equal to the hardware used_idx. Thing

> > might be true for the storage devices that needs to connect to a remote

> > backend. But this is not the case of networking device, so

> > last_avail_idx should be equal to hardware used_idx here.

> Eli, since it's your hardware, does it work this way? i.e. does the firmware

> interface see a case where virtqueue requests can't be completed before

> suspending vq?

> 

This can happen regardless of what effort firmware does to complete all
pending requests. An example is the hot plugging of memory. The set map
call is called and mlx5 vdpa driver tears down the vitqueue objects
while the guest driver can still be posting requests to the virtqueue.
So we find ourselves in a situation and used index is behine the
available index.

> > But using the "last_avail_idx" or hardware avail_idx should always be

> > better in this case since it's guaranteed to correct and will have less

> > confusion. We use this convention in other types of vhost backends

> > (vhost-kernel, vhost-user).

> > 

> > So looking at mlx5_set_vq_state(), it probably won't work since it

> > doesn't not set either hardware avail_idx or hardware used_idx:

> The saved mvq->avail_idx will be used to recreate hardware virtq object and

> the used index in create_virtqueue(), once status DRIVER_OK is set. I

> suspect we should pass the index to mvq->used_idx in

> mlx5_vdpa_set_vq_state() below instead.

> 

Right, that's what I am checking but still no final conclusions. I need
to harness hardware guy to provide me with clear answers.
> 

> Thanks,

> -Siwei

> > 

> > static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

> >                   const struct vdpa_vq_state *state)

> > {

> >     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

> >     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

> >     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

> > 

> >     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

> >         mlx5_vdpa_warn(mvdev, "can't modify available index\n");

> >         return -EINVAL;

> >     }

> > 

> >     mvq->avail_idx = state->avail_index;

> >     return 0;

> > }

> > 

> > Depends on the hardware, we should either set hardware used_idx or

> > hardware avail_idx here.

> > 

> > I think we need to clarify how device is supposed to work in the virtio

> > spec.

> > 

> > Thanks

> > 

> > 

> > > 

> > > -Siwei

> > > 

> > > > 

> > > > 

> > > > > Now the vritqueue is copied to the new VM and the

> > > > > hardware now has to continue execution from index 20. We need to tell

> > > > > the hardware via configuring the last used_index.

> > > > 

> > > > 

> > > > If the hardware can not sync the index from the virtqueue, the

> > > > driver can do the synchronization by make the last_used_idx

> > > > equals to used index in the virtqueue.

> > > > 

> > > > Thanks

> > > > 

> > > > 

> > > > >   So why don't we

> > > > > restore the used index?

> > > > > 

> > > > > > > So it puzzles me why is set_vq_state() we do not

> > > > > > > communicate the saved

> > > > > > > used index.

> > > > > > 

> > > > > > We don't do that since:

> > > > > > 

> > > > > > 1) if the hardware can sync its internal used index from

> > > > > > the virtqueue

> > > > > > during device, then we don't need it

> > > > > > 2) if the hardware can not sync its internal used index,

> > > > > > the driver (e.g as

> > > > > > you did here) can do that.

> > > > > > 

> > > > > > But there's no way for the hardware to deduce the

> > > > > > internal avail index from

> > > > > > the virtqueue, that's why avail index is sycned.

> > > > > > 

> > > > > > Thanks

> > > > > > 

> > > > > > 

> > > > 

> > > 

> > 

>
Si-Wei Liu Feb. 17, 2021, 12:25 a.m. UTC | #15
On 2/10/2021 7:45 AM, Eli Cohen wrote:
> On Wed, Feb 10, 2021 at 12:59:03AM -0800, Si-Wei Liu wrote:

>>

>> On 2/9/2021 7:53 PM, Jason Wang wrote:

>>> On 2021/2/10 上午10:30, Si-Wei Liu wrote:

>>>>

>>>> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>>>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>>>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>>>>> When a change of memory map

>>>>>>>>>>>>> occurs, the hardware resources

>>>>>>>>>>>>> are destroyed

>>>>>>>>>>>>> and then re-created again with

>>>>>>>>>>>>> the new memory map. In such

>>>>>>>>>>>>> case, we need

>>>>>>>>>>>>> to restore the hardware

>>>>>>>>>>>>> available and used indices. The

>>>>>>>>>>>>> driver failed to

>>>>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>>>>

>>>>>>>>>>>>> Also, since the driver also

>>>>>>>>>>>>> fails to reset the available and

>>>>>>>>>>>>> used

>>>>>>>>>>>>> indices upon device reset, fix

>>>>>>>>>>>>> this here to avoid regression

>>>>>>>>>>>>> caused by

>>>>>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>>>>>

>>>>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5:

>>>>>>>>>>>>> Add VDPA driver for supported

>>>>>>>>>>>>> mlx5

>>>>>>>>>>>>> devices")

>>>>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>>>>> ---

>>>>>>>>>>>>> v0 -> v1:

>>>>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>>>>

>>>>>>>>>>>>>       drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>>>>>>>>       1 file changed, 18 insertions(+)

>>>>>>>>>>>>>

>>>>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>>>>           u64 device_addr;

>>>>>>>>>>>>>           u64 driver_addr;

>>>>>>>>>>>>>           u16 avail_index;

>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>           bool ready;

>>>>>>>>>>>>>           struct vdpa_callback cb;

>>>>>>>>>>>>>           bool restore;

>>>>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>>>>           u32 virtq_id;

>>>>>>>>>>>>>           struct mlx5_vdpa_net *ndev;

>>>>>>>>>>>>>           u16 avail_idx;

>>>>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>>>>           int fw_state;

>>>>>>>>>>>>>             /* keep last in the struct */

>>>>>>>>>>>>> @@ -804,6 +806,7 @@ static int

>>>>>>>>>>>>> create_virtqueue(struct

>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>>>>             obj_context =

>>>>>>>>>>>>> MLX5_ADDR_OF(create_virtio_net_q_in,

>>>>>>>>>>>>> in,

>>>>>>>>>>>>> obj_context);

>>>>>>>>>>>>>          

>>>>>>>>>>>>> MLX5_SET(virtio_net_q_object,

>>>>>>>>>>>>> obj_context, hw_available_index,

>>>>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>>>>           MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>>>>           vq_ctx =

>>>>>>>>>>>>> MLX5_ADDR_OF(virtio_net_q_object,

>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>> virtio_q_context);

>>>>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int

>>>>>>>>>>>>> connect_qps(struct mlx5_vdpa_net

>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>>>>       struct mlx5_virtq_attr {

>>>>>>>>>>>>>           u8 state;

>>>>>>>>>>>>>           u16 available_index;

>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>       };

>>>>>>>>>>>>>         static int

>>>>>>>>>>>>> query_virtqueue(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct

>>>>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>>>>           memset(attr, 0, sizeof(*attr));

>>>>>>>>>>>>>           attr->state =

>>>>>>>>>>>>> MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>>> obj_context, state);

>>>>>>>>>>>>>           attr->available_index = MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>>>>> +    attr->used_index =

>>>>>>>>>>>>> MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>> hw_used_index);

>>>>>>>>>>>>>           kfree(out);

>>>>>>>>>>>>>           return 0;

>>>>>>>>>>>>>       @@ -1535,6 +1540,16 @@

>>>>>>>>>>>>> static void

>>>>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>           }

>>>>>>>>>>>>>       }

>>>>>>>>>>>>>       +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>>>>>>>>>>> +{

>>>>>>>>>>>>> +    int i;

>>>>>>>>>>>>> +

>>>>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>>>>> +    }

>>>>>>>>>>>>> +}

>>>>>>>>>>>>> +

>>>>>>>>>>>>>       /* TODO: cross-endian support */

>>>>>>>>>>>>>       static inline bool

>>>>>>>>>>>>> mlx5_vdpa_is_little_endian(struct

>>>>>>>>>>>>> mlx5_vdpa_dev

>>>>>>>>>>>>> *mvdev)

>>>>>>>>>>>>>       {

>>>>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>>>>               return err;

>>>>>>>>>>>>>             ri->avail_index = attr.available_index;

>>>>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>>>>           ri->ready = mvq->ready;

>>>>>>>>>>>>>           ri->num_ent = mvq->num_ent;

>>>>>>>>>>>>>           ri->desc_addr = mvq->desc_addr;

>>>>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>                   continue;

>>>>>>>>>>>>>                 mvq->avail_idx = ri->avail_index;

>>>>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>>>>               mvq->ready = ri->ready;

>>>>>>>>>>>>>               mvq->num_ent = ri->num_ent;

>>>>>>>>>>>>>               mvq->desc_addr = ri->desc_addr;

>>>>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>>>>           if (!status) {

>>>>>>>>>>>>>              

>>>>>>>>>>>>> mlx5_vdpa_info(mvdev,

>>>>>>>>>>>>> "performing device reset\n");

>>>>>>>>>>>>>               teardown_driver(ndev);

>>>>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>>>>> The clearing looks fine at the first

>>>>>>>>>>>> glance, as it aligns with the other

>>>>>>>>>>>> state cleanups floating around at

>>>>>>>>>>>> the same place. However, the thing

>>>>>>>>>>>> is

>>>>>>>>>>>> get_vq_state() is supposed to be

>>>>>>>>>>>> called right after to get sync'ed

>>>>>>>>>>>> with

>>>>>>>>>>>> the latest internal avail_index from

>>>>>>>>>>>> device while vq is stopped. The

>>>>>>>>>>>> index was saved in the driver

>>>>>>>>>>>> software at vq suspension, but

>>>>>>>>>>>> before the

>>>>>>>>>>>> virtq object is destroyed. We

>>>>>>>>>>>> shouldn't clear the avail_index too

>>>>>>>>>>>> early.

>>>>>>>>>>> Good point.

>>>>>>>>>>>

>>>>>>>>>>> There's a limitation on the virtio spec

>>>>>>>>>>> and vDPA framework that we can not

>>>>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>>>>

>>>>>>>>>> Are you talking about live migration where

>>>>>>>>>> you reset the device but

>>>>>>>>>> still want to know how far it progressed in

>>>>>>>>>> order to continue from the

>>>>>>>>>> same place in the new VM?

>>>>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>>>>

>>>>>>>>> in src node:

>>>>>>>>> 1) suspend the device

>>>>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>>>>

>>>>>>>>> in the dst node:

>>>>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>>>>> 4) resume the device

>>>>>>>>>

>>>>>>>>> So you can see, step 2 requires the device/driver not to forget the

>>>>>>>>> last_avail_idx.

>>>>>>>>>

>>>>>>>> Just to be sure, what really matters here is the

>>>>>>>> used index. Becuase the

>>>>>>>> vriqtueue itself is copied from the src VM to the

>>>>>>>> dest VM. The available

>>>>>>>> index is alreay there and we know the hardware reads it from there.

>>>>>>> So for "last_avail_idx" I meant the hardware internal

>>>>>>> avail index. It's not

>>>>>>> stored in the virtqueue so we must migrate it from src

>>>>>>> to dest and set them

>>>>>>> through set_vq_state(). Then in the destination, the virtqueue can be

>>>>>>> restarted from that index.

>>>>>>>

>>>>>> Consider this case: driver posted buffers till avail index becomes the

>>>>>> value 50. Hardware is executing but made it till 20 when virtqueue was

>>>>>> suspended due to live migration - this is indicated by hardware used

>>>>>> index equal 20.

>>>>>

>>>>> So in this case the used index in the virtqueue should be 20?

>>>>> Otherwise we need not sync used index itself but all the used

>>>>> entries that is not committed to the used ring.

>>>> In other word, for mlx5 vdpa there's no such internal last_avail_idx

>>>> stuff maintained by the hardware, right?

>>>

>>> For each device it should have one otherwise it won't work correctly

>>> during stop/resume. See the codes mlx5_vdpa_get_vq_state() which calls

>>> query_virtqueue() that build commands to query "last_avail_idx" from the

>>> hardware:

>>>

>>>      MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,

>>> MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

>>>      MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,

>>> MLX5_OBJ_TYPE_VIRTIO_NET_Q);

>>>      MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);

>>>      MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);

>>>      err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);

>>>      if (err)

>>>          goto err_cmd;

>>>

>>>      obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);

>>>      memset(attr, 0, sizeof(*attr));

>>>      attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>      attr->available_index = MLX5_GET(virtio_net_q_object, obj_context,

>>> hw_available_index);

>>>

>> Eli should be able to correct me, but this hw_available_index might just be

>> a cached value of virtqueue avail_index in the memory from the most recent

>> sync. I doubt it's the one you talked about in software implementation. If I

>> understand Eli correctly, hardware will always reload the latest avail_index

>> from memory whenever it's being sync'ed again.

> That's my understanding too. I am still trying to get confirmation from

> hardware guys. Will send when I have them.

>

>> <quote>

>> The hardware always goes to read the available index from memory. The

>> requirement to configure it when creating a new object is still a

>> requirement defined by the interface so I must not violate interface

>> requirments.

>> </quote>

>>

>> If the hardware does everything perfectly that is able to flush pending

>> requests, update descriptors, rings plus used indices all at once before the

>> suspension, there's no need for hardware to maintain a separate internal

>> index than the h/w used_index. The hardware can get started from the saved

>> used_index upon resuming. I view this is of (hardware) implementation

>> choices and thought it does not violate the virtio spec?

>>

>>

>>>

>>>

>>>> And the used_idx in the virtqueue is always in sync with the

>>>> hardware used_index, and hardware is supposed to commit pending used

>>>> buffers to the ring while bumping up the hardware used_index (and

>>>> also committed to memory) altogether prior to suspension, is my

>>>> understanding correct here? Double checking if this is the expected

>>>> semantics of what

>>>> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should

>>>> achieve.

>>>>

> That's my understanding too.

>

>>>> If the above is true, then it looks to me for mlx5 vdpa we should

>>>> really return h/w used_idx rather than the last_avail_idx through

>>>> get_vq_state(), in order to reconstruct the virt queue state post

>>>> live migration. For the set_map case, the internal last_avail_idx

>>>> really doesn't matter, although both indices are saved and restored

>>>> transparently as-is.

> Right, that's what I think too. In fact, I discussed that already with

> Jason over the phone yesterday. The conclusion was since the only thing

> that matters for a network device is the used index, I tried to return

> the hardware used index in get_vq_state (instead of hardware available

> index) and restore the value I get in set_vq_state into the hardware

> used index. I am still not done with this experiment but will update.

>

>>> Right, a subtle thing here is that: for the device that might have can't

>>> not complete all virtqueue requests during vq suspending, the

>>> "last_avail_idx" might not be equal to the hardware used_idx. Thing

>>> might be true for the storage devices that needs to connect to a remote

>>> backend. But this is not the case of networking device, so

>>> last_avail_idx should be equal to hardware used_idx here.

>> Eli, since it's your hardware, does it work this way? i.e. does the firmware

>> interface see a case where virtqueue requests can't be completed before

>> suspending vq?

>>

> This can happen regardless of what effort firmware does to complete all

> pending requests. An example is the hot plugging of memory. The set map

> call is called and mlx5 vdpa driver tears down the vitqueue objects

> while the guest driver can still be posting requests to the virtqueue.

> So we find ourselves in a situation and used index is behine the

> available index.

The set_map case is a bit unusual from the migration or reboot case. 
Since suspend/resume on set_map is done in a single ioctl call, I assume 
the driver is responsible for saving and restoring the used index, and 
hardware can only update used_index to the memory until the memory maps 
are fully set up (i.e. after memory keys and virtq objects get created). 
Whereas for the migration and reboot case, used_index is required to be 
sync'ed to memory for all completed requests before suspension 
(exception: network Rx path where packet drop can be tolerated). The 
procedure to resume vDPA device between these two should differ I 
suppose, but did not seem see it in the code?

>

>>> But using the "last_avail_idx" or hardware avail_idx should always be

>>> better in this case since it's guaranteed to correct and will have less

>>> confusion. We use this convention in other types of vhost backends

>>> (vhost-kernel, vhost-user).

>>>

>>> So looking at mlx5_set_vq_state(), it probably won't work since it

>>> doesn't not set either hardware avail_idx or hardware used_idx:

>> The saved mvq->avail_idx will be used to recreate hardware virtq object and

>> the used index in create_virtqueue(), once status DRIVER_OK is set. I

>> suspect we should pass the index to mvq->used_idx in

>> mlx5_vdpa_set_vq_state() below instead.

>>

> Right, that's what I am checking but still no final conclusions. I need

> to harness hardware guy to provide me with clear answers.

OK. Could you update what you find from the hardware guy and let us know 
e.g. if the current firmware interface would suffice?

Thanks,
-Siwei
>> Thanks,

>> -Siwei

>>> static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

>>>                    const struct vdpa_vq_state *state)

>>> {

>>>      struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

>>>      struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

>>>      struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

>>>

>>>      if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

>>>          mlx5_vdpa_warn(mvdev, "can't modify available index\n");

>>>          return -EINVAL;

>>>      }

>>>

>>>      mvq->avail_idx = state->avail_index;

>>>      return 0;

>>> }

>>>

>>> Depends on the hardware, we should either set hardware used_idx or

>>> hardware avail_idx here.

>>>

>>> I think we need to clarify how device is supposed to work in the virtio

>>> spec.

>>>

>>> Thanks

>>>

>>>

>>>> -Siwei

>>>>

>>>>>

>>>>>> Now the vritqueue is copied to the new VM and the

>>>>>> hardware now has to continue execution from index 20. We need to tell

>>>>>> the hardware via configuring the last used_index.

>>>>>

>>>>> If the hardware can not sync the index from the virtqueue, the

>>>>> driver can do the synchronization by make the last_used_idx

>>>>> equals to used index in the virtqueue.

>>>>>

>>>>> Thanks

>>>>>

>>>>>

>>>>>>    So why don't we

>>>>>> restore the used index?

>>>>>>

>>>>>>>> So it puzzles me why is set_vq_state() we do not

>>>>>>>> communicate the saved

>>>>>>>> used index.

>>>>>>> We don't do that since:

>>>>>>>

>>>>>>> 1) if the hardware can sync its internal used index from

>>>>>>> the virtqueue

>>>>>>> during device, then we don't need it

>>>>>>> 2) if the hardware can not sync its internal used index,

>>>>>>> the driver (e.g as

>>>>>>> you did here) can do that.

>>>>>>>

>>>>>>> But there's no way for the hardware to deduce the

>>>>>>> internal avail index from

>>>>>>> the virtqueue, that's why avail index is sycned.

>>>>>>>

>>>>>>> Thanks

>>>>>>>

>>>>>>>
Eli Cohen Feb. 17, 2021, 6:51 a.m. UTC | #16
On Tue, Feb 16, 2021 at 04:25:20PM -0800, Si-Wei Liu wrote:
> 

> > > The saved mvq->avail_idx will be used to recreate hardware virtq object and

> > > the used index in create_virtqueue(), once status DRIVER_OK is set. I

> > > suspect we should pass the index to mvq->used_idx in

> > > mlx5_vdpa_set_vq_state() below instead.

> > > 

> > Right, that's what I am checking but still no final conclusions. I need

> > to harness hardware guy to provide me with clear answers.

> OK. Could you update what you find from the hardware guy and let us know

> e.g. if the current firmware interface would suffice?

> 


Te answer I got is that upon query_virtqueue, the hardware available and
used indices should always return the same value for virtqueues that
complete in order - that's the case for network virtqueues. The value
returned is the consumer index of the hardware. These values should be
provided when creating a virtqueue; in case of attaching to an existing
virtqueue (e.g. after suspend and resume), the values can be non zero.

Currently there's a bug in the firmware where for RX virtqueue, the
value returned for the available index is wrong. However, the value
returned for used index is the correct value.

Therefore, we need to return the hardware used index in get_vq_state()
and restore this value into both the new object's available and used
indices.
Jason Wang Feb. 18, 2021, 4:44 a.m. UTC | #17
On 2021/2/10 下午4:59, Si-Wei Liu wrote:
>

>

> On 2/9/2021 7:53 PM, Jason Wang wrote:

>>

>> On 2021/2/10 上午10:30, Si-Wei Liu wrote:

>>>

>>>

>>> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>>>

>>>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>>>> When a change of memory map occurs, the hardware resources 

>>>>>>>>>>>> are destroyed

>>>>>>>>>>>> and then re-created again with the new memory map. In such 

>>>>>>>>>>>> case, we need

>>>>>>>>>>>> to restore the hardware available and used indices. The 

>>>>>>>>>>>> driver failed to

>>>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>>>

>>>>>>>>>>>> Also, since the driver also fails to reset the available 

>>>>>>>>>>>> and used

>>>>>>>>>>>> indices upon device reset, fix this here to avoid 

>>>>>>>>>>>> regression caused by

>>>>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>>>>

>>>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 

>>>>>>>>>>>> supported mlx5

>>>>>>>>>>>> devices")

>>>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>>>> ---

>>>>>>>>>>>> v0 -> v1:

>>>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>>>

>>>>>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>>>>

>>>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>>>          u64 device_addr;

>>>>>>>>>>>>          u64 driver_addr;

>>>>>>>>>>>>          u16 avail_index;

>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>          bool ready;

>>>>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>>>>          bool restore;

>>>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>>>          u32 virtq_id;

>>>>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>>>>          u16 avail_idx;

>>>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>>>          int fw_state;

>>>>>>>>>>>>            /* keep last in the struct */

>>>>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>>>            obj_context = 

>>>>>>>>>>>> MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>>>>>> obj_context);

>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>> hw_available_index,

>>>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,

>>>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 

>>>>>>>>>>>> obj_context,

>>>>>>>>>>>> virtio_q_context);

>>>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct 

>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>>>>          u8 state;

>>>>>>>>>>>>          u16 available_index;

>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>      };

>>>>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net 

>>>>>>>>>>>> *ndev, struct

>>>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>> obj_context, state);

>>>>>>>>>>>>          attr->available_index = MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>> obj_context,

>>>>>>>>>>>> hw_used_index);

>>>>>>>>>>>>          kfree(out);

>>>>>>>>>>>>          return 0;

>>>>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>          }

>>>>>>>>>>>>      }

>>>>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

>>>>>>>>>>>> +{

>>>>>>>>>>>> +    int i;

>>>>>>>>>>>> +

>>>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>>>> +    }

>>>>>>>>>>>> +}

>>>>>>>>>>>> +

>>>>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>>>>      static inline bool mlx5_vdpa_is_little_endian(struct 

>>>>>>>>>>>> mlx5_vdpa_dev

>>>>>>>>>>>> *mvdev)

>>>>>>>>>>>>      {

>>>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>>>              return err;

>>>>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void restore_channels_info(struct

>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>                  continue;

>>>>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>>>          if (!status) {

>>>>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device 

>>>>>>>>>>>> reset\n");

>>>>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>>>> The clearing looks fine at the first glance, as it aligns 

>>>>>>>>>>> with the other

>>>>>>>>>>> state cleanups floating around at the same place. However, 

>>>>>>>>>>> the thing is

>>>>>>>>>>> get_vq_state() is supposed to be called right after to get 

>>>>>>>>>>> sync'ed with

>>>>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>>>>> stopped. The

>>>>>>>>>>> index was saved in the driver software at vq suspension, but 

>>>>>>>>>>> before the

>>>>>>>>>>> virtq object is destroyed. We shouldn't clear the 

>>>>>>>>>>> avail_index too early.

>>>>>>>>>> Good point.

>>>>>>>>>>

>>>>>>>>>> There's a limitation on the virtio spec and vDPA framework 

>>>>>>>>>> that we can not

>>>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>>>

>>>>>>>>> Are you talking about live migration where you reset the 

>>>>>>>>> device but

>>>>>>>>> still want to know how far it progressed in order to continue 

>>>>>>>>> from the

>>>>>>>>> same place in the new VM?

>>>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>>>

>>>>>>>> in src node:

>>>>>>>> 1) suspend the device

>>>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>>>

>>>>>>>> in the dst node:

>>>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>>>> 4) resume the device

>>>>>>>>

>>>>>>>> So you can see, step 2 requires the device/driver not to forget 

>>>>>>>> the

>>>>>>>> last_avail_idx.

>>>>>>>>

>>>>>>> Just to be sure, what really matters here is the used index. 

>>>>>>> Becuase the

>>>>>>> vriqtueue itself is copied from the src VM to the dest VM. The 

>>>>>>> available

>>>>>>> index is alreay there and we know the hardware reads it from there.

>>>>>>

>>>>>> So for "last_avail_idx" I meant the hardware internal avail 

>>>>>> index. It's not

>>>>>> stored in the virtqueue so we must migrate it from src to dest 

>>>>>> and set them

>>>>>> through set_vq_state(). Then in the destination, the virtqueue 

>>>>>> can be

>>>>>> restarted from that index.

>>>>>>

>>>>> Consider this case: driver posted buffers till avail index becomes 

>>>>> the

>>>>> value 50. Hardware is executing but made it till 20 when virtqueue 

>>>>> was

>>>>> suspended due to live migration - this is indicated by hardware used

>>>>> index equal 20.

>>>>

>>>>

>>>> So in this case the used index in the virtqueue should be 20? 

>>>> Otherwise we need not sync used index itself but all the used 

>>>> entries that is not committed to the used ring.

>>>

>>> In other word, for mlx5 vdpa there's no such internal last_avail_idx 

>>> stuff maintained by the hardware, right? 

>>

>>

>> For each device it should have one otherwise it won't work correctly 

>> during stop/resume. See the codes mlx5_vdpa_get_vq_state() which 

>> calls query_virtqueue() that build commands to query "last_avail_idx" 

>> from the hardware:

>>

>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 

>> MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 

>> MLX5_OBJ_TYPE_VIRTIO_NET_Q);

>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);

>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);

>>     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);

>>     if (err)

>>         goto err_cmd;

>>

>>     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, 

>> obj_context);

>>     memset(attr, 0, sizeof(*attr));

>>     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>     attr->available_index = MLX5_GET(virtio_net_q_object, 

>> obj_context, hw_available_index);

>>

> Eli should be able to correct me, but this hw_available_index might 

> just be a cached value of virtqueue avail_index in the memory from the 

> most recent sync. 



It should not, otherwise it will be a bug.


> I doubt it's the one you talked about in software implementation.



Actually not, it's a virtio general issue:

Consider there's not indices wrap. And:
- used_idx is the used index in the virtqueue
- last_used_idx is the used index maintained by the device, it points to 
the location where to put the next done requests to the used_ring
- avail_idx is the available index in the virtqueue
- last_avail_idx is the index maintained by the device, it points to the 
location where device need to read from the available.

So bascially, from device POV, it only cares the buffer that belong to 
itself which are [used_idx, avail_idx). So we have:

[used_idx, last_used_idx) The requests that have been completed by the 
device but not completed to the used ring (or at least used_idx is not 
updated).
[last_used_idx, last_avail_idx) The requests that are being processed by 
the device.
[last_avail_idx, avail_idx) The requests that are made available by the 
driver but not processed by the device.

During device stop/suspend, the device should:

- stop reading new request from available ring (or read until the end of 
descriptor chain)
- sync used_idx with last_used_idx. Otherwise we need a complicated but 
not necessary API to sync last_used_idx and the indices that are not 
committed to used ring (since device may complete the request out of order)

So we know used_idx == last_used_idx in this case, so we have:

[used_idx/last_used_idx, last_avail_idx) The requests that are being 
processed.
[last_avail_idx, avail_idx) The requests that are available for the 
driver but not yet processed.

For networking device, it's sufficient to think the requests are 
completed when TX/RX DMA are finished. So there's no requests that are 
being processed after the stop. In this case we had: used_idx == 
last_used_idx == last_avail_idx. Then we only had:

[used_idx/last_used_idx/last_avail_idx, avail_idx] The requests that are 
made available by the driver but not processed by the device. That's why 
you may think only used_idx matters here.

For block device, the completion of the request might require the 
communication with the remote backend, so we can't assume last_used_idx 
is equal to the last_avail_idx. Whether or not to wait for the drain the 
request is still being discussed[1].

So you can see, for all the cases, what really matters is the 
last_avail_idx. The device should know where it need to start reading 
for the next request, and it is not necessarily equal to last_used_idx 
or used_idx. What makes things a little bit easier is the networking 
device whose last_used_idx is equal to last_avail_idx.


> If I understand Eli correctly, hardware will always reload the latest 

> avail_index from memory whenever it's being sync'ed again.

>

> <quote>

> The hardware always goes to read the available index from memory. The 

> requirement to configure it when creating a new object is still a 

> requirement defined by the interface so I must not violate interface 

> requirments.

> </quote>

>

> If the hardware does everything perfectly that is able to flush 

> pending requests, update descriptors, rings plus used indices all at 

> once before the suspension, there's no need for hardware to maintain a 

> separate internal index than the h/w used_index. The hardware can get 

> started from the saved used_index upon resuming. I view this is of 

> (hardware) implementation choices and thought it does not violate the 

> virtio spec?



Yes, but as you said, it has a lot of assumptions which may not work for 
other type of devices. So what I refer "last_avail_idx" is probably the 
"used_idx" in your description here. It might be the same in this case 
for networking device.


>

>

>>

>>

>>

>>> And the used_idx in the virtqueue is always in sync with the 

>>> hardware used_index, and hardware is supposed to commit pending used 

>>> buffers to the ring while bumping up the hardware used_index (and 

>>> also committed to memory) altogether prior to suspension, is my 

>>> understanding correct here? Double checking if this is the expected 

>>> semantics of what 

>>> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should 

>>> achieve.

>>>

>>> If the above is true, then it looks to me for mlx5 vdpa we should 

>>> really return h/w used_idx rather than the last_avail_idx through 

>>> get_vq_state(), in order to reconstruct the virt queue state post 

>>> live migration. For the set_map case, the internal last_avail_idx 

>>> really doesn't matter, although both indices are saved and restored 

>>> transparently as-is.

>>

>>

>> Right, a subtle thing here is that: for the device that might have 

>> can't not complete all virtqueue requests during vq suspending, the 

>> "last_avail_idx" might not be equal to the hardware used_idx. Thing 

>> might be true for the storage devices that needs to connect to a 

>> remote backend. But this is not the case of networking device, so 

>> last_avail_idx should be equal to hardware used_idx here. 

> Eli, since it's your hardware, does it work this way? i.e. does the 

> firmware interface see a case where virtqueue requests can't be 

> completed before suspending vq?



For storage device, I think it can happen.


>

>> But using the "last_avail_idx" or hardware avail_idx should always be 

>> better in this case since it's guaranteed to correct and will have 

>> less confusion. We use this convention in other types of vhost 

>> backends (vhost-kernel, vhost-user).

>>

>> So looking at mlx5_set_vq_state(), it probably won't work since it 

>> doesn't not set either hardware avail_idx or hardware used_idx:

> The saved mvq->avail_idx will be used to recreate hardware virtq 

> object and the used index in create_virtqueue(), once status DRIVER_OK 

> is set. I suspect we should pass the index to mvq->used_idx in 

> mlx5_vdpa_set_vq_state() below instead.

>


It depends on what did mvq->used_idx meant? If it's last_used_idx, it 
should be the same with mvq->avail_idx for networking device.

Thanks


>

> Thanks,

> -Siwei

>>

>> static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

>>                   const struct vdpa_vq_state *state)

>> {

>>     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

>>     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

>>     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

>>

>>     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

>>         mlx5_vdpa_warn(mvdev, "can't modify available index\n");

>>         return -EINVAL;

>>     }

>>

>>     mvq->avail_idx = state->avail_index;

>>     return 0;

>> }

>>

>> Depends on the hardware, we should either set hardware used_idx or 

>> hardware avail_idx here.

>>

>> I think we need to clarify how device is supposed to work in the 

>> virtio spec.

>>

>> Thanks

>>

>>

>>>

>>> -Siwei

>>>

>>>>

>>>>

>>>>> Now the vritqueue is copied to the new VM and the

>>>>> hardware now has to continue execution from index 20. We need to tell

>>>>> the hardware via configuring the last used_index.

>>>>

>>>>

>>>> If the hardware can not sync the index from the virtqueue, the 

>>>> driver can do the synchronization by make the last_used_idx equals 

>>>> to used index in the virtqueue.

>>>>

>>>> Thanks

>>>>

>>>>

>>>>>   So why don't we

>>>>> restore the used index?

>>>>>

>>>>>>> So it puzzles me why is set_vq_state() we do not communicate the 

>>>>>>> saved

>>>>>>> used index.

>>>>>>

>>>>>> We don't do that since:

>>>>>>

>>>>>> 1) if the hardware can sync its internal used index from the 

>>>>>> virtqueue

>>>>>> during device, then we don't need it

>>>>>> 2) if the hardware can not sync its internal used index, the 

>>>>>> driver (e.g as

>>>>>> you did here) can do that.

>>>>>>

>>>>>> But there's no way for the hardware to deduce the internal avail 

>>>>>> index from

>>>>>> the virtqueue, that's why avail index is sycned.

>>>>>>

>>>>>> Thanks

>>>>>>

>>>>>>

>>>>

>>>

>>

>
Si-Wei Liu Feb. 18, 2021, 12:43 p.m. UTC | #18
On 2/17/2021 8:44 PM, Jason Wang wrote:
>

> On 2021/2/10 下午4:59, Si-Wei Liu wrote:

>>

>>

>> On 2/9/2021 7:53 PM, Jason Wang wrote:

>>>

>>> On 2021/2/10 上午10:30, Si-Wei Liu wrote:

>>>>

>>>>

>>>> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>>>>

>>>>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>>>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>>>>> When a change of memory map occurs, the hardware resources 

>>>>>>>>>>>>> are destroyed

>>>>>>>>>>>>> and then re-created again with the new memory map. In such 

>>>>>>>>>>>>> case, we need

>>>>>>>>>>>>> to restore the hardware available and used indices. The 

>>>>>>>>>>>>> driver failed to

>>>>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>>>>

>>>>>>>>>>>>> Also, since the driver also fails to reset the available 

>>>>>>>>>>>>> and used

>>>>>>>>>>>>> indices upon device reset, fix this here to avoid 

>>>>>>>>>>>>> regression caused by

>>>>>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>>>>>

>>>>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 

>>>>>>>>>>>>> supported mlx5

>>>>>>>>>>>>> devices")

>>>>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>>>>> ---

>>>>>>>>>>>>> v0 -> v1:

>>>>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>>>>

>>>>>>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 

>>>>>>>>>>>>> ++++++++++++++++++

>>>>>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>>>>>

>>>>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>>>>          u64 device_addr;

>>>>>>>>>>>>>          u64 driver_addr;

>>>>>>>>>>>>>          u16 avail_index;

>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>          bool ready;

>>>>>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>>>>>          bool restore;

>>>>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>>>>          u32 virtq_id;

>>>>>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>>>>>          u16 avail_idx;

>>>>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>>>>          int fw_state;

>>>>>>>>>>>>>            /* keep last in the struct */

>>>>>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>>>>            obj_context = 

>>>>>>>>>>>>> MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>>>>>>> obj_context);

>>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>> hw_available_index,

>>>>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>> hw_used_index,

>>>>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 

>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>> virtio_q_context);

>>>>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct 

>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>>>>>          u8 state;

>>>>>>>>>>>>>          u16 available_index;

>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>      };

>>>>>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net 

>>>>>>>>>>>>> *ndev, struct

>>>>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>> obj_context, state);

>>>>>>>>>>>>>          attr->available_index = 

>>>>>>>>>>>>> MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>> hw_used_index);

>>>>>>>>>>>>>          kfree(out);

>>>>>>>>>>>>>          return 0;

>>>>>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>          }

>>>>>>>>>>>>>      }

>>>>>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net 

>>>>>>>>>>>>> *ndev)

>>>>>>>>>>>>> +{

>>>>>>>>>>>>> +    int i;

>>>>>>>>>>>>> +

>>>>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>>>>> +    }

>>>>>>>>>>>>> +}

>>>>>>>>>>>>> +

>>>>>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>>>>>      static inline bool mlx5_vdpa_is_little_endian(struct 

>>>>>>>>>>>>> mlx5_vdpa_dev

>>>>>>>>>>>>> *mvdev)

>>>>>>>>>>>>>      {

>>>>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>>>>              return err;

>>>>>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void 

>>>>>>>>>>>>> restore_channels_info(struct

>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>                  continue;

>>>>>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct

>>>>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>>>>          if (!status) {

>>>>>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device 

>>>>>>>>>>>>> reset\n");

>>>>>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>>>>> The clearing looks fine at the first glance, as it aligns 

>>>>>>>>>>>> with the other

>>>>>>>>>>>> state cleanups floating around at the same place. However, 

>>>>>>>>>>>> the thing is

>>>>>>>>>>>> get_vq_state() is supposed to be called right after to get 

>>>>>>>>>>>> sync'ed with

>>>>>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>>>>>> stopped. The

>>>>>>>>>>>> index was saved in the driver software at vq suspension, 

>>>>>>>>>>>> but before the

>>>>>>>>>>>> virtq object is destroyed. We shouldn't clear the 

>>>>>>>>>>>> avail_index too early.

>>>>>>>>>>> Good point.

>>>>>>>>>>>

>>>>>>>>>>> There's a limitation on the virtio spec and vDPA framework 

>>>>>>>>>>> that we can not

>>>>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>>>>

>>>>>>>>>> Are you talking about live migration where you reset the 

>>>>>>>>>> device but

>>>>>>>>>> still want to know how far it progressed in order to continue 

>>>>>>>>>> from the

>>>>>>>>>> same place in the new VM?

>>>>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>>>>

>>>>>>>>> in src node:

>>>>>>>>> 1) suspend the device

>>>>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>>>>

>>>>>>>>> in the dst node:

>>>>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>>>>> 4) resume the device

>>>>>>>>>

>>>>>>>>> So you can see, step 2 requires the device/driver not to 

>>>>>>>>> forget the

>>>>>>>>> last_avail_idx.

>>>>>>>>>

>>>>>>>> Just to be sure, what really matters here is the used index. 

>>>>>>>> Becuase the

>>>>>>>> vriqtueue itself is copied from the src VM to the dest VM. The 

>>>>>>>> available

>>>>>>>> index is alreay there and we know the hardware reads it from 

>>>>>>>> there.

>>>>>>>

>>>>>>> So for "last_avail_idx" I meant the hardware internal avail 

>>>>>>> index. It's not

>>>>>>> stored in the virtqueue so we must migrate it from src to dest 

>>>>>>> and set them

>>>>>>> through set_vq_state(). Then in the destination, the virtqueue 

>>>>>>> can be

>>>>>>> restarted from that index.

>>>>>>>

>>>>>> Consider this case: driver posted buffers till avail index 

>>>>>> becomes the

>>>>>> value 50. Hardware is executing but made it till 20 when 

>>>>>> virtqueue was

>>>>>> suspended due to live migration - this is indicated by hardware used

>>>>>> index equal 20.

>>>>>

>>>>>

>>>>> So in this case the used index in the virtqueue should be 20? 

>>>>> Otherwise we need not sync used index itself but all the used 

>>>>> entries that is not committed to the used ring.

>>>>

>>>> In other word, for mlx5 vdpa there's no such internal 

>>>> last_avail_idx stuff maintained by the hardware, right? 

>>>

>>>

>>> For each device it should have one otherwise it won't work correctly 

>>> during stop/resume. See the codes mlx5_vdpa_get_vq_state() which 

>>> calls query_virtqueue() that build commands to query 

>>> "last_avail_idx" from the hardware:

>>>

>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 

>>> MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 

>>> MLX5_OBJ_TYPE_VIRTIO_NET_Q);

>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);

>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 

>>> ndev->mvdev.res.uid);

>>>     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);

>>>     if (err)

>>>         goto err_cmd;

>>>

>>>     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, 

>>> obj_context);

>>>     memset(attr, 0, sizeof(*attr));

>>>     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>     attr->available_index = MLX5_GET(virtio_net_q_object, 

>>> obj_context, hw_available_index);

>>>

>> Eli should be able to correct me, but this hw_available_index might 

>> just be a cached value of virtqueue avail_index in the memory from 

>> the most recent sync. 

>

>

> It should not, otherwise it will be a bug.

The hw_available_index not showing the correct last_avail_index value is 
technically a firmware bug, as Eli alluded to. That's why I had the 
original question for how this entire synchronization scheme could work 
if just saving and restoring this cached value. In my observation, the 
hw_available_index was seen far off from the hw_used_index post vq 
suspension, I just pointed out the fact that this hardware value is 
neither last_avail_idx nor last_used_idx that is useful to represent vq 
state.

However, the core question I'm having is should we care about fixing 
this in the firmware interface level (actually, the hardware 
implementation), or rather, just as said, some devices e.g. network 
device could live with the simplified form of assumption (used_idx ==  
last_used_idx  == last_avail_idx) where pending requests can be drained 
and completed before device is stopped or suspended. Despite of what 
device behavior would be expected or defined in the virtio spec, the 
get/set_vq_state() API should have very clear semantics: whether both 
forms (i.e. restore either one of used_idx or last_avail_idx is fine) 
are acceptable, or it needs to be religious to restore vq state via 
last_avail_idx only. In the latter case, both the mlx5 vdpa net driver 
and the firmware interface needs to be fixed to accommodate the stricter 
API requirement.

FWIW I don't think network (Ethernet) device should always assume 
(used_idx ==  last_used_idx  == last_avail_idx) while being stopped 
(were there virtio RDMA device in reality this assumption would break): 
it could well follow the golden rules in virtio S/W implementation to 
have separate last_used_idx and last_avail_idx in the hardware, then 
it'll be 100% API compliant.. But we know that such implementation is 
unnecessarily complicated for Ethernet device. Hence I thought there 
could be sort of reliefs for some driver/device and it's totally up to 
the device/driver to choose the implementation.

>

>

>> I doubt it's the one you talked about in software implementation.

>

>

> Actually not, it's a virtio general issue:

>

> Consider there's not indices wrap. And:

> - used_idx is the used index in the virtqueue

> - last_used_idx is the used index maintained by the device, it points 

> to the location where to put the next done requests to the used_ring

> - avail_idx is the available index in the virtqueue

> - last_avail_idx is the index maintained by the device, it points to 

> the location where device need to read from the available.

>

> So bascially, from device POV, it only cares the buffer that belong to 

> itself which are [used_idx, avail_idx). So we have:

>

> [used_idx, last_used_idx) The requests that have been completed by the 

> device but not completed to the used ring (or at least used_idx is not 

> updated).

> [last_used_idx, last_avail_idx) The requests that are being processed 

> by the device.

> [last_avail_idx, avail_idx) The requests that are made available by 

> the driver but not processed by the device.

>

> During device stop/suspend, the device should:

>

> - stop reading new request from available ring (or read until the end 

> of descriptor chain)

> - sync used_idx with last_used_idx. Otherwise we need a complicated 

> but not necessary API to sync last_used_idx and the indices that are 

> not committed to used ring (since device may complete the request out 

> of order)

>

> So we know used_idx == last_used_idx in this case, so we have:

>

> [used_idx/last_used_idx, last_avail_idx) The requests that are being 

> processed.

> [last_avail_idx, avail_idx) The requests that are available for the 

> driver but not yet processed.

>

> For networking device, it's sufficient to think the requests are 

> completed when TX/RX DMA are finished. So there's no requests that are 

> being processed after the stop. In this case we had: used_idx == 

> last_used_idx == last_avail_idx. Then we only had:

>

> [used_idx/last_used_idx/last_avail_idx, avail_idx] The requests that 

> are made available by the driver but not processed by the device. 

> That's why you may think only used_idx matters here.

>

> For block device, the completion of the request might require the 

> communication with the remote backend, so we can't assume 

> last_used_idx is equal to the last_avail_idx. Whether or not to wait 

> for the drain the request is still being discussed[1].

I guess this requirement is very subject to the specific storage setup 
(networked v.s. local) including the guest app/configuration (retry v.s. 
time out). IMHO there shouldn't be a definite yes-or-no answer here. But 
in general, for req-ack type of request if the completion (ack) cannot 
simply replicate across live migration, it should wait until the pending 
requests are completely drained.

>

> So you can see, for all the cases, what really matters is the 

> last_avail_idx. The device should know where it need to start reading 

> for the next request, and it is not necessarily equal to last_used_idx 

> or used_idx. What makes things a little bit easier is the networking 

> device whose last_used_idx is equal to last_avail_idx.

>

>

>> If I understand Eli correctly, hardware will always reload the latest 

>> avail_index from memory whenever it's being sync'ed again.

>>

>> <quote>

>> The hardware always goes to read the available index from memory. The 

>> requirement to configure it when creating a new object is still a 

>> requirement defined by the interface so I must not violate interface 

>> requirments.

>> </quote>

>>

>> If the hardware does everything perfectly that is able to flush 

>> pending requests, update descriptors, rings plus used indices all at 

>> once before the suspension, there's no need for hardware to maintain 

>> a separate internal index than the h/w used_index. The hardware can 

>> get started from the saved used_index upon resuming. I view this is 

>> of (hardware) implementation choices and thought it does not violate 

>> the virtio spec?

>

>

> Yes, but as you said, it has a lot of assumptions which may not work 

> for other type of devices. So what I refer "last_avail_idx" is 

> probably the "used_idx" in your description here. It might be the same 

> in this case for networking device.

>

>

>>

>>

>>>

>>>

>>>

>>>> And the used_idx in the virtqueue is always in sync with the 

>>>> hardware used_index, and hardware is supposed to commit pending 

>>>> used buffers to the ring while bumping up the hardware used_index 

>>>> (and also committed to memory) altogether prior to suspension, is 

>>>> my understanding correct here? Double checking if this is the 

>>>> expected semantics of what 

>>>> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should 

>>>> achieve.

>>>>

>>>> If the above is true, then it looks to me for mlx5 vdpa we should 

>>>> really return h/w used_idx rather than the last_avail_idx through 

>>>> get_vq_state(), in order to reconstruct the virt queue state post 

>>>> live migration. For the set_map case, the internal last_avail_idx 

>>>> really doesn't matter, although both indices are saved and restored 

>>>> transparently as-is.

>>>

>>>

>>> Right, a subtle thing here is that: for the device that might have 

>>> can't not complete all virtqueue requests during vq suspending, the 

>>> "last_avail_idx" might not be equal to the hardware used_idx. Thing 

>>> might be true for the storage devices that needs to connect to a 

>>> remote backend. But this is not the case of networking device, so 

>>> last_avail_idx should be equal to hardware used_idx here. 

>> Eli, since it's your hardware, does it work this way? i.e. does the 

>> firmware interface see a case where virtqueue requests can't be 

>> completed before suspending vq?

>

>

> For storage device, I think it can happen.

>

>

>>

>>> But using the "last_avail_idx" or hardware avail_idx should always 

>>> be better in this case since it's guaranteed to correct and will 

>>> have less confusion. We use this convention in other types of vhost 

>>> backends (vhost-kernel, vhost-user).

>>>

>>> So looking at mlx5_set_vq_state(), it probably won't work since it 

>>> doesn't not set either hardware avail_idx or hardware used_idx:

>> The saved mvq->avail_idx will be used to recreate hardware virtq 

>> object and the used index in create_virtqueue(), once status 

>> DRIVER_OK is set. I suspect we should pass the index to mvq->used_idx 

>> in mlx5_vdpa_set_vq_state() below instead.

>>

>

> It depends on what did mvq->used_idx meant? If it's last_used_idx, it 

> should be the same with mvq->avail_idx for networking device.

It's the last_used_idx. Note, Eli already posted a patch ("vdpa/mlx5: 
Fix suspend/resume index restoration") to repurpose set_vq_state() to 
restore both used_idx and avail_idx (both indices should be conceptually 
equal in that device model) across reset. Which implies the simplified 
assumption I mentioned earlier. The requirement of set_vq_state() API 
should make it clear if this kind of assumption is acceptable.


Thanks,
-Siwei


>

> Thanks

>

>

>>

>> Thanks,

>> -Siwei

>>>

>>> static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

>>>                   const struct vdpa_vq_state *state)

>>> {

>>>     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

>>>     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

>>>     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

>>>

>>>     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

>>>         mlx5_vdpa_warn(mvdev, "can't modify available index\n");

>>>         return -EINVAL;

>>>     }

>>>

>>>     mvq->avail_idx = state->avail_index;

>>>     return 0;

>>> }

>>>

>>> Depends on the hardware, we should either set hardware used_idx or 

>>> hardware avail_idx here.

>>>

>>> I think we need to clarify how device is supposed to work in the 

>>> virtio spec.

>>>

>>> Thanks

>>>

>>>

>>>>

>>>> -Siwei

>>>>

>>>>>

>>>>>

>>>>>> Now the vritqueue is copied to the new VM and the

>>>>>> hardware now has to continue execution from index 20. We need to 

>>>>>> tell

>>>>>> the hardware via configuring the last used_index.

>>>>>

>>>>>

>>>>> If the hardware can not sync the index from the virtqueue, the 

>>>>> driver can do the synchronization by make the last_used_idx equals 

>>>>> to used index in the virtqueue.

>>>>>

>>>>> Thanks

>>>>>

>>>>>

>>>>>>   So why don't we

>>>>>> restore the used index?

>>>>>>

>>>>>>>> So it puzzles me why is set_vq_state() we do not communicate 

>>>>>>>> the saved

>>>>>>>> used index.

>>>>>>>

>>>>>>> We don't do that since:

>>>>>>>

>>>>>>> 1) if the hardware can sync its internal used index from the 

>>>>>>> virtqueue

>>>>>>> during device, then we don't need it

>>>>>>> 2) if the hardware can not sync its internal used index, the 

>>>>>>> driver (e.g as

>>>>>>> you did here) can do that.

>>>>>>>

>>>>>>> But there's no way for the hardware to deduce the internal avail 

>>>>>>> index from

>>>>>>> the virtqueue, that's why avail index is sycned.

>>>>>>>

>>>>>>> Thanks

>>>>>>>

>>>>>>>

>>>>>

>>>>

>>>

>>

>
Jason Wang Feb. 19, 2021, 3:10 a.m. UTC | #19
On 2021/2/18 8:43 下午, Si-Wei Liu wrote:
>

>

> On 2/17/2021 8:44 PM, Jason Wang wrote:

>>

>> On 2021/2/10 下午4:59, Si-Wei Liu wrote:

>>>

>>>

>>> On 2/9/2021 7:53 PM, Jason Wang wrote:

>>>>

>>>> On 2021/2/10 上午10:30, Si-Wei Liu wrote:

>>>>>

>>>>>

>>>>> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>>>>>

>>>>>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>>>>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>>>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>>>>>> When a change of memory map occurs, the hardware 

>>>>>>>>>>>>>> resources are destroyed

>>>>>>>>>>>>>> and then re-created again with the new memory map. In 

>>>>>>>>>>>>>> such case, we need

>>>>>>>>>>>>>> to restore the hardware available and used indices. The 

>>>>>>>>>>>>>> driver failed to

>>>>>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>>>>>

>>>>>>>>>>>>>> Also, since the driver also fails to reset the available 

>>>>>>>>>>>>>> and used

>>>>>>>>>>>>>> indices upon device reset, fix this here to avoid 

>>>>>>>>>>>>>> regression caused by

>>>>>>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>>>>>>

>>>>>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 

>>>>>>>>>>>>>> supported mlx5

>>>>>>>>>>>>>> devices")

>>>>>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>>>>>> ---

>>>>>>>>>>>>>> v0 -> v1:

>>>>>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>>>>>

>>>>>>>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 

>>>>>>>>>>>>>> ++++++++++++++++++

>>>>>>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>>>>>>

>>>>>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>>>>>          u64 device_addr;

>>>>>>>>>>>>>>          u64 driver_addr;

>>>>>>>>>>>>>>          u16 avail_index;

>>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>>          bool ready;

>>>>>>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>>>>>>          bool restore;

>>>>>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>>>>>          u32 virtq_id;

>>>>>>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>>>>>>          u16 avail_idx;

>>>>>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>>>>>          int fw_state;

>>>>>>>>>>>>>>            /* keep last in the struct */

>>>>>>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>>>>>            obj_context = 

>>>>>>>>>>>>>> MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>>>>>>>> obj_context);

>>>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>>> hw_available_index,

>>>>>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>>> hw_used_index,

>>>>>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 

>>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>>> virtio_q_context);

>>>>>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct 

>>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>>>>>>          u8 state;

>>>>>>>>>>>>>>          u16 available_index;

>>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>>      };

>>>>>>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net 

>>>>>>>>>>>>>> *ndev, struct

>>>>>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>>> obj_context, state);

>>>>>>>>>>>>>>          attr->available_index = 

>>>>>>>>>>>>>> MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>>> hw_used_index);

>>>>>>>>>>>>>>          kfree(out);

>>>>>>>>>>>>>>          return 0;

>>>>>>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>>          }

>>>>>>>>>>>>>>      }

>>>>>>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net 

>>>>>>>>>>>>>> *ndev)

>>>>>>>>>>>>>> +{

>>>>>>>>>>>>>> +    int i;

>>>>>>>>>>>>>> +

>>>>>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>>>>>> +    }

>>>>>>>>>>>>>> +}

>>>>>>>>>>>>>> +

>>>>>>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>>>>>>      static inline bool mlx5_vdpa_is_little_endian(struct 

>>>>>>>>>>>>>> mlx5_vdpa_dev

>>>>>>>>>>>>>> *mvdev)

>>>>>>>>>>>>>>      {

>>>>>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>>>>>              return err;

>>>>>>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void 

>>>>>>>>>>>>>> restore_channels_info(struct

>>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>>                  continue;

>>>>>>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void 

>>>>>>>>>>>>>> mlx5_vdpa_set_status(struct

>>>>>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>>>>>          if (!status) {

>>>>>>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device 

>>>>>>>>>>>>>> reset\n");

>>>>>>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>>>>>> The clearing looks fine at the first glance, as it aligns 

>>>>>>>>>>>>> with the other

>>>>>>>>>>>>> state cleanups floating around at the same place. However, 

>>>>>>>>>>>>> the thing is

>>>>>>>>>>>>> get_vq_state() is supposed to be called right after to get 

>>>>>>>>>>>>> sync'ed with

>>>>>>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>>>>>>> stopped. The

>>>>>>>>>>>>> index was saved in the driver software at vq suspension, 

>>>>>>>>>>>>> but before the

>>>>>>>>>>>>> virtq object is destroyed. We shouldn't clear the 

>>>>>>>>>>>>> avail_index too early.

>>>>>>>>>>>> Good point.

>>>>>>>>>>>>

>>>>>>>>>>>> There's a limitation on the virtio spec and vDPA framework 

>>>>>>>>>>>> that we can not

>>>>>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>>>>>

>>>>>>>>>>> Are you talking about live migration where you reset the 

>>>>>>>>>>> device but

>>>>>>>>>>> still want to know how far it progressed in order to 

>>>>>>>>>>> continue from the

>>>>>>>>>>> same place in the new VM?

>>>>>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>>>>>

>>>>>>>>>> in src node:

>>>>>>>>>> 1) suspend the device

>>>>>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>>>>>

>>>>>>>>>> in the dst node:

>>>>>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>>>>>> 4) resume the device

>>>>>>>>>>

>>>>>>>>>> So you can see, step 2 requires the device/driver not to 

>>>>>>>>>> forget the

>>>>>>>>>> last_avail_idx.

>>>>>>>>>>

>>>>>>>>> Just to be sure, what really matters here is the used index. 

>>>>>>>>> Becuase the

>>>>>>>>> vriqtueue itself is copied from the src VM to the dest VM. The 

>>>>>>>>> available

>>>>>>>>> index is alreay there and we know the hardware reads it from 

>>>>>>>>> there.

>>>>>>>>

>>>>>>>> So for "last_avail_idx" I meant the hardware internal avail 

>>>>>>>> index. It's not

>>>>>>>> stored in the virtqueue so we must migrate it from src to dest 

>>>>>>>> and set them

>>>>>>>> through set_vq_state(). Then in the destination, the virtqueue 

>>>>>>>> can be

>>>>>>>> restarted from that index.

>>>>>>>>

>>>>>>> Consider this case: driver posted buffers till avail index 

>>>>>>> becomes the

>>>>>>> value 50. Hardware is executing but made it till 20 when 

>>>>>>> virtqueue was

>>>>>>> suspended due to live migration - this is indicated by hardware 

>>>>>>> used

>>>>>>> index equal 20.

>>>>>>

>>>>>>

>>>>>> So in this case the used index in the virtqueue should be 20? 

>>>>>> Otherwise we need not sync used index itself but all the used 

>>>>>> entries that is not committed to the used ring.

>>>>>

>>>>> In other word, for mlx5 vdpa there's no such internal 

>>>>> last_avail_idx stuff maintained by the hardware, right? 

>>>>

>>>>

>>>> For each device it should have one otherwise it won't work 

>>>> correctly during stop/resume. See the codes 

>>>> mlx5_vdpa_get_vq_state() which calls query_virtqueue() that build 

>>>> commands to query "last_avail_idx" from the hardware:

>>>>

>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 

>>>> MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 

>>>> MLX5_OBJ_TYPE_VIRTIO_NET_Q);

>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);

>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 

>>>> ndev->mvdev.res.uid);

>>>>     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, 

>>>> outlen);

>>>>     if (err)

>>>>         goto err_cmd;

>>>>

>>>>     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, 

>>>> obj_context);

>>>>     memset(attr, 0, sizeof(*attr));

>>>>     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>>     attr->available_index = MLX5_GET(virtio_net_q_object, 

>>>> obj_context, hw_available_index);

>>>>

>>> Eli should be able to correct me, but this hw_available_index might 

>>> just be a cached value of virtqueue avail_index in the memory from 

>>> the most recent sync. 

>>

>>

>> It should not, otherwise it will be a bug.

> The hw_available_index not showing the correct last_avail_index value 

> is technically a firmware bug, as Eli alluded to. That's why I had the 

> original question for how this entire synchronization scheme could 

> work if just saving and restoring this cached value. In my 

> observation, the hw_available_index was seen far off from the 

> hw_used_index post vq suspension, I just pointed out the fact that 

> this hardware value is neither last_avail_idx nor last_used_idx that 

> is useful to represent vq state.

>

> However, the core question I'm having is should we care about fixing 

> this in the firmware interface level (actually, the hardware 

> implementation), or rather, just as said, some devices e.g. network 

> device could live with the simplified form of assumption (used_idx ==  

> last_used_idx  == last_avail_idx) where pending requests can be 

> drained and completed before device is stopped or suspended. Despite 

> of what device behavior would be expected or defined in the virtio 

> spec, the get/set_vq_state() API should have very clear semantics: 

> whether both forms (i.e. restore either one of used_idx or 

> last_avail_idx is fine) are acceptable, or it needs to be religious to 

> restore vq state via last_avail_idx only. In the latter case, both the 

> mlx5 vdpa net driver and the firmware interface needs to be fixed to 

> accommodate the stricter API requirement.

>

> FWIW I don't think network (Ethernet) device should always assume 

> (used_idx ==  last_used_idx  == last_avail_idx) while being stopped 

> (were there virtio RDMA device in reality this assumption would break): 



Right, it's the choice of device and it would be more complicated if we 
had devices without this assumption. The main issue is whther or not we 
need to wait for the drain of the request and if such darining can only 
take reasonable time to finish.


> it could well follow the golden rules in virtio S/W implementation to 

> have separate last_used_idx and last_avail_idx in the hardware, then 

> it'll be 100% API compliant.. But we know that such implementation is 

> unnecessarily complicated for Ethernet device. Hence I thought there 

> could be sort of reliefs for some driver/device and it's totally up to 

> the device/driver to choose the implementation.



Yes, the current API leaves the room for last_used_idx but we may still 
need to migrate the indices (as I mentioned below).


>

>>

>>

>>> I doubt it's the one you talked about in software implementation.

>>

>>

>> Actually not, it's a virtio general issue:

>>

>> Consider there's not indices wrap. And:

>> - used_idx is the used index in the virtqueue

>> - last_used_idx is the used index maintained by the device, it points 

>> to the location where to put the next done requests to the used_ring

>> - avail_idx is the available index in the virtqueue

>> - last_avail_idx is the index maintained by the device, it points to 

>> the location where device need to read from the available.

>>

>> So bascially, from device POV, it only cares the buffer that belong 

>> to itself which are [used_idx, avail_idx). So we have:

>>

>> [used_idx, last_used_idx) The requests that have been completed by 

>> the device but not completed to the used ring (or at least used_idx 

>> is not updated).

>> [last_used_idx, last_avail_idx) The requests that are being processed 

>> by the device.

>> [last_avail_idx, avail_idx) The requests that are made available by 

>> the driver but not processed by the device.

>>

>> During device stop/suspend, the device should:

>>

>> - stop reading new request from available ring (or read until the end 

>> of descriptor chain)

>> - sync used_idx with last_used_idx. Otherwise we need a complicated 

>> but not necessary API to sync last_used_idx and the indices that are 

>> not committed to used ring (since device may complete the request out 

>> of order)

>>

>> So we know used_idx == last_used_idx in this case, so we have:

>>

>> [used_idx/last_used_idx, last_avail_idx) The requests that are being 

>> processed.

>> [last_avail_idx, avail_idx) The requests that are available for the 

>> driver but not yet processed.

>>

>> For networking device, it's sufficient to think the requests are 

>> completed when TX/RX DMA are finished. So there's no requests that 

>> are being processed after the stop. In this case we had: used_idx == 

>> last_used_idx == last_avail_idx. Then we only had:

>>

>> [used_idx/last_used_idx/last_avail_idx, avail_idx] The requests that 

>> are made available by the driver but not processed by the device. 

>> That's why you may think only used_idx matters here.

>>

>> For block device, the completion of the request might require the 

>> communication with the remote backend, so we can't assume 

>> last_used_idx is equal to the last_avail_idx. Whether or not to wait 

>> for the drain the request is still being discussed[1].

> I guess this requirement is very subject to the specific storage setup 

> (networked v.s. local) including the guest app/configuration (retry 

> v.s. time out). IMHO there shouldn't be a definite yes-or-no answer 

> here. But in general, for req-ack type of request if the completion 

> (ack) cannot simply replicate across live migration, it should wait 

> until the pending requests are completely drained.



Yes, and qemu may fail the migartion if the pending requets takes too 
long to be drained, that's why an asynchornous API is introduced in [1].


>

>>

>> So you can see, for all the cases, what really matters is the 

>> last_avail_idx. The device should know where it need to start reading 

>> for the next request, and it is not necessarily equal to 

>> last_used_idx or used_idx. What makes things a little bit easier is 

>> the networking device whose last_used_idx is equal to last_avail_idx.

>>

>>

>>> If I understand Eli correctly, hardware will always reload the 

>>> latest avail_index from memory whenever it's being sync'ed again.

>>>

>>> <quote>

>>> The hardware always goes to read the available index from memory. 

>>> The requirement to configure it when creating a new object is still 

>>> a requirement defined by the interface so I must not violate 

>>> interface requirments.

>>> </quote>

>>>

>>> If the hardware does everything perfectly that is able to flush 

>>> pending requests, update descriptors, rings plus used indices all at 

>>> once before the suspension, there's no need for hardware to maintain 

>>> a separate internal index than the h/w used_index. The hardware can 

>>> get started from the saved used_index upon resuming. I view this is 

>>> of (hardware) implementation choices and thought it does not violate 

>>> the virtio spec?

>>

>>

>> Yes, but as you said, it has a lot of assumptions which may not work 

>> for other type of devices. So what I refer "last_avail_idx" is 

>> probably the "used_idx" in your description here. It might be the 

>> same in this case for networking device.

>>

>>

>>>

>>>

>>>>

>>>>

>>>>

>>>>> And the used_idx in the virtqueue is always in sync with the 

>>>>> hardware used_index, and hardware is supposed to commit pending 

>>>>> used buffers to the ring while bumping up the hardware used_index 

>>>>> (and also committed to memory) altogether prior to suspension, is 

>>>>> my understanding correct here? Double checking if this is the 

>>>>> expected semantics of what 

>>>>> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should 

>>>>> achieve.

>>>>>

>>>>> If the above is true, then it looks to me for mlx5 vdpa we should 

>>>>> really return h/w used_idx rather than the last_avail_idx through 

>>>>> get_vq_state(), in order to reconstruct the virt queue state post 

>>>>> live migration. For the set_map case, the internal last_avail_idx 

>>>>> really doesn't matter, although both indices are saved and 

>>>>> restored transparently as-is.

>>>>

>>>>

>>>> Right, a subtle thing here is that: for the device that might have 

>>>> can't not complete all virtqueue requests during vq suspending, the 

>>>> "last_avail_idx" might not be equal to the hardware used_idx. Thing 

>>>> might be true for the storage devices that needs to connect to a 

>>>> remote backend. But this is not the case of networking device, so 

>>>> last_avail_idx should be equal to hardware used_idx here. 

>>> Eli, since it's your hardware, does it work this way? i.e. does the 

>>> firmware interface see a case where virtqueue requests can't be 

>>> completed before suspending vq?

>>

>>

>> For storage device, I think it can happen.

>>

>>

>>>

>>>> But using the "last_avail_idx" or hardware avail_idx should always 

>>>> be better in this case since it's guaranteed to correct and will 

>>>> have less confusion. We use this convention in other types of vhost 

>>>> backends (vhost-kernel, vhost-user).

>>>>

>>>> So looking at mlx5_set_vq_state(), it probably won't work since it 

>>>> doesn't not set either hardware avail_idx or hardware used_idx:

>>> The saved mvq->avail_idx will be used to recreate hardware virtq 

>>> object and the used index in create_virtqueue(), once status 

>>> DRIVER_OK is set. I suspect we should pass the index to 

>>> mvq->used_idx in mlx5_vdpa_set_vq_state() below instead.

>>>

>>

>> It depends on what did mvq->used_idx meant? If it's last_used_idx, it 

>> should be the same with mvq->avail_idx for networking device.

> It's the last_used_idx. Note, Eli already posted a patch ("vdpa/mlx5: 

> Fix suspend/resume index restoration") to repurpose set_vq_state() to 

> restore both used_idx and avail_idx (both indices should be 

> conceptually equal in that device model) across reset. 



Will check, burried by mails :(


> Which implies the simplified assumption I mentioned earlier. The 

> requirement of set_vq_state() API should make it clear if this kind of 

> assumption is acceptable.



Right now the value is exposed to userspace via GET_VRING_BASE, so only 
last_avail_idx is synced. If we need sync last_used_idx, we should also 
sync pending indices which requires more thoughts.

Thanks


>

>

> Thanks,

> -Siwei

>

>

>>

>> Thanks

>>

>>

>>>

>>> Thanks,

>>> -Siwei

>>>>

>>>> static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

>>>>                   const struct vdpa_vq_state *state)

>>>> {

>>>>     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

>>>>     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

>>>>     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

>>>>

>>>>     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

>>>>         mlx5_vdpa_warn(mvdev, "can't modify available index\n");

>>>>         return -EINVAL;

>>>>     }

>>>>

>>>>     mvq->avail_idx = state->avail_index;

>>>>     return 0;

>>>> }

>>>>

>>>> Depends on the hardware, we should either set hardware used_idx or 

>>>> hardware avail_idx here.

>>>>

>>>> I think we need to clarify how device is supposed to work in the 

>>>> virtio spec.

>>>>

>>>> Thanks

>>>>

>>>>

>>>>>

>>>>> -Siwei

>>>>>

>>>>>>

>>>>>>

>>>>>>> Now the vritqueue is copied to the new VM and the

>>>>>>> hardware now has to continue execution from index 20. We need to 

>>>>>>> tell

>>>>>>> the hardware via configuring the last used_index.

>>>>>>

>>>>>>

>>>>>> If the hardware can not sync the index from the virtqueue, the 

>>>>>> driver can do the synchronization by make the last_used_idx 

>>>>>> equals to used index in the virtqueue.

>>>>>>

>>>>>> Thanks

>>>>>>

>>>>>>

>>>>>>>   So why don't we

>>>>>>> restore the used index?

>>>>>>>

>>>>>>>>> So it puzzles me why is set_vq_state() we do not communicate 

>>>>>>>>> the saved

>>>>>>>>> used index.

>>>>>>>>

>>>>>>>> We don't do that since:

>>>>>>>>

>>>>>>>> 1) if the hardware can sync its internal used index from the 

>>>>>>>> virtqueue

>>>>>>>> during device, then we don't need it

>>>>>>>> 2) if the hardware can not sync its internal used index, the 

>>>>>>>> driver (e.g as

>>>>>>>> you did here) can do that.

>>>>>>>>

>>>>>>>> But there's no way for the hardware to deduce the internal 

>>>>>>>> avail index from

>>>>>>>> the virtqueue, that's why avail index is sycned.

>>>>>>>>

>>>>>>>> Thanks

>>>>>>>>

>>>>>>>>

>>>>>>

>>>>>

>>>>

>>>

>>

>
Si-Wei Liu Feb. 20, 2021, 2:05 a.m. UTC | #20
On 2/18/2021 7:10 PM, Jason Wang wrote:
>

> On 2021/2/18 8:43 下午, Si-Wei Liu wrote:

>>

>>

>> On 2/17/2021 8:44 PM, Jason Wang wrote:

>>>

>>> On 2021/2/10 下午4:59, Si-Wei Liu wrote:

>>>>

>>>>

>>>> On 2/9/2021 7:53 PM, Jason Wang wrote:

>>>>>

>>>>> On 2021/2/10 上午10:30, Si-Wei Liu wrote:

>>>>>>

>>>>>>

>>>>>> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>>>>>>

>>>>>>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>>>>>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>>>>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>>>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>>>>>>> When a change of memory map occurs, the hardware 

>>>>>>>>>>>>>>> resources are destroyed

>>>>>>>>>>>>>>> and then re-created again with the new memory map. In 

>>>>>>>>>>>>>>> such case, we need

>>>>>>>>>>>>>>> to restore the hardware available and used indices. The 

>>>>>>>>>>>>>>> driver failed to

>>>>>>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>> Also, since the driver also fails to reset the available 

>>>>>>>>>>>>>>> and used

>>>>>>>>>>>>>>> indices upon device reset, fix this here to avoid 

>>>>>>>>>>>>>>> regression caused by

>>>>>>>>>>>>>>> the fact that used index may not be zero upon device reset.

>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 

>>>>>>>>>>>>>>> supported mlx5

>>>>>>>>>>>>>>> devices")

>>>>>>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>>>>>>> ---

>>>>>>>>>>>>>>> v0 -> v1:

>>>>>>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>>      drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 

>>>>>>>>>>>>>>> ++++++++++++++++++

>>>>>>>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>>>>>>          u64 device_addr;

>>>>>>>>>>>>>>>          u64 driver_addr;

>>>>>>>>>>>>>>>          u16 avail_index;

>>>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>>>          bool ready;

>>>>>>>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>>>>>>>          bool restore;

>>>>>>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>>>>>>          u32 virtq_id;

>>>>>>>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>>>>>>>          u16 avail_idx;

>>>>>>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>>>>>>          int fw_state;

>>>>>>>>>>>>>>>            /* keep last in the struct */

>>>>>>>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>>>>>>            obj_context = 

>>>>>>>>>>>>>>> MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>>>>>>>>> obj_context);

>>>>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>>>> hw_available_index,

>>>>>>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>>>> hw_used_index,

>>>>>>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>>>>>>          MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 

>>>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>>>> virtio_q_context);

>>>>>>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct 

>>>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>>>>>>>          u8 state;

>>>>>>>>>>>>>>>          u16 available_index;

>>>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>>>      };

>>>>>>>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net 

>>>>>>>>>>>>>>> *ndev, struct

>>>>>>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>>>> obj_context, state);

>>>>>>>>>>>>>>>          attr->available_index = 

>>>>>>>>>>>>>>> MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>>>> hw_used_index);

>>>>>>>>>>>>>>>          kfree(out);

>>>>>>>>>>>>>>>          return 0;

>>>>>>>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>>>          }

>>>>>>>>>>>>>>>      }

>>>>>>>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net 

>>>>>>>>>>>>>>> *ndev)

>>>>>>>>>>>>>>> +{

>>>>>>>>>>>>>>> +    int i;

>>>>>>>>>>>>>>> +

>>>>>>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>>>>>>> +    }

>>>>>>>>>>>>>>> +}

>>>>>>>>>>>>>>> +

>>>>>>>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>>>>>>>      static inline bool 

>>>>>>>>>>>>>>> mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

>>>>>>>>>>>>>>> *mvdev)

>>>>>>>>>>>>>>>      {

>>>>>>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>>>>>>              return err;

>>>>>>>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void 

>>>>>>>>>>>>>>> restore_channels_info(struct

>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>>>                  continue;

>>>>>>>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void 

>>>>>>>>>>>>>>> mlx5_vdpa_set_status(struct

>>>>>>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>>>>>>          if (!status) {

>>>>>>>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device 

>>>>>>>>>>>>>>> reset\n");

>>>>>>>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>>>>>>> The clearing looks fine at the first glance, as it aligns 

>>>>>>>>>>>>>> with the other

>>>>>>>>>>>>>> state cleanups floating around at the same place. 

>>>>>>>>>>>>>> However, the thing is

>>>>>>>>>>>>>> get_vq_state() is supposed to be called right after to 

>>>>>>>>>>>>>> get sync'ed with

>>>>>>>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>>>>>>>> stopped. The

>>>>>>>>>>>>>> index was saved in the driver software at vq suspension, 

>>>>>>>>>>>>>> but before the

>>>>>>>>>>>>>> virtq object is destroyed. We shouldn't clear the 

>>>>>>>>>>>>>> avail_index too early.

>>>>>>>>>>>>> Good point.

>>>>>>>>>>>>>

>>>>>>>>>>>>> There's a limitation on the virtio spec and vDPA framework 

>>>>>>>>>>>>> that we can not

>>>>>>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>>>>>>

>>>>>>>>>>>> Are you talking about live migration where you reset the 

>>>>>>>>>>>> device but

>>>>>>>>>>>> still want to know how far it progressed in order to 

>>>>>>>>>>>> continue from the

>>>>>>>>>>>> same place in the new VM?

>>>>>>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>>>>>>

>>>>>>>>>>> in src node:

>>>>>>>>>>> 1) suspend the device

>>>>>>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>>>>>>

>>>>>>>>>>> in the dst node:

>>>>>>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>>>>>>> 4) resume the device

>>>>>>>>>>>

>>>>>>>>>>> So you can see, step 2 requires the device/driver not to 

>>>>>>>>>>> forget the

>>>>>>>>>>> last_avail_idx.

>>>>>>>>>>>

>>>>>>>>>> Just to be sure, what really matters here is the used index. 

>>>>>>>>>> Becuase the

>>>>>>>>>> vriqtueue itself is copied from the src VM to the dest VM. 

>>>>>>>>>> The available

>>>>>>>>>> index is alreay there and we know the hardware reads it from 

>>>>>>>>>> there.

>>>>>>>>>

>>>>>>>>> So for "last_avail_idx" I meant the hardware internal avail 

>>>>>>>>> index. It's not

>>>>>>>>> stored in the virtqueue so we must migrate it from src to dest 

>>>>>>>>> and set them

>>>>>>>>> through set_vq_state(). Then in the destination, the virtqueue 

>>>>>>>>> can be

>>>>>>>>> restarted from that index.

>>>>>>>>>

>>>>>>>> Consider this case: driver posted buffers till avail index 

>>>>>>>> becomes the

>>>>>>>> value 50. Hardware is executing but made it till 20 when 

>>>>>>>> virtqueue was

>>>>>>>> suspended due to live migration - this is indicated by hardware 

>>>>>>>> used

>>>>>>>> index equal 20.

>>>>>>>

>>>>>>>

>>>>>>> So in this case the used index in the virtqueue should be 20? 

>>>>>>> Otherwise we need not sync used index itself but all the used 

>>>>>>> entries that is not committed to the used ring.

>>>>>>

>>>>>> In other word, for mlx5 vdpa there's no such internal 

>>>>>> last_avail_idx stuff maintained by the hardware, right? 

>>>>>

>>>>>

>>>>> For each device it should have one otherwise it won't work 

>>>>> correctly during stop/resume. See the codes 

>>>>> mlx5_vdpa_get_vq_state() which calls query_virtqueue() that build 

>>>>> commands to query "last_avail_idx" from the hardware:

>>>>>

>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 

>>>>> MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 

>>>>> MLX5_OBJ_TYPE_VIRTIO_NET_Q);

>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);

>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 

>>>>> ndev->mvdev.res.uid);

>>>>>     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, 

>>>>> outlen);

>>>>>     if (err)

>>>>>         goto err_cmd;

>>>>>

>>>>>     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, 

>>>>> obj_context);

>>>>>     memset(attr, 0, sizeof(*attr));

>>>>>     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>>>     attr->available_index = MLX5_GET(virtio_net_q_object, 

>>>>> obj_context, hw_available_index);

>>>>>

>>>> Eli should be able to correct me, but this hw_available_index might 

>>>> just be a cached value of virtqueue avail_index in the memory from 

>>>> the most recent sync. 

>>>

>>>

>>> It should not, otherwise it will be a bug.

>> The hw_available_index not showing the correct last_avail_index value 

>> is technically a firmware bug, as Eli alluded to. That's why I had 

>> the original question for how this entire synchronization scheme 

>> could work if just saving and restoring this cached value. In my 

>> observation, the hw_available_index was seen far off from the 

>> hw_used_index post vq suspension, I just pointed out the fact that 

>> this hardware value is neither last_avail_idx nor last_used_idx that 

>> is useful to represent vq state.

>>

>> However, the core question I'm having is should we care about fixing 

>> this in the firmware interface level (actually, the hardware 

>> implementation), or rather, just as said, some devices e.g. network 

>> device could live with the simplified form of assumption (used_idx 

>> ==  last_used_idx  == last_avail_idx) where pending requests can be 

>> drained and completed before device is stopped or suspended. Despite 

>> of what device behavior would be expected or defined in the virtio 

>> spec, the get/set_vq_state() API should have very clear semantics: 

>> whether both forms (i.e. restore either one of used_idx or 

>> last_avail_idx is fine) are acceptable, or it needs to be religious 

>> to restore vq state via last_avail_idx only. In the latter case, both 

>> the mlx5 vdpa net driver and the firmware interface needs to be fixed 

>> to accommodate the stricter API requirement.

>>

>> FWIW I don't think network (Ethernet) device should always assume 

>> (used_idx ==  last_used_idx  == last_avail_idx) while being stopped 

>> (were there virtio RDMA device in reality this assumption would break): 

>

>

> Right, it's the choice of device and it would be more complicated if 

> we had devices without this assumption. The main issue is whther or 

> not we need to wait for the drain of the request and if such darining 

> can only take reasonable time to finish.

>

>

>> it could well follow the golden rules in virtio S/W implementation to 

>> have separate last_used_idx and last_avail_idx in the hardware, then 

>> it'll be 100% API compliant.. But we know that such implementation is 

>> unnecessarily complicated for Ethernet device. Hence I thought there 

>> could be sort of reliefs for some driver/device and it's totally up 

>> to the device/driver to choose the implementation.

>

>

> Yes, the current API leaves the room for last_used_idx but we may 

> still need to migrate the indices (as I mentioned below).

>

>

>>

>>>

>>>

>>>> I doubt it's the one you talked about in software implementation.

>>>

>>>

>>> Actually not, it's a virtio general issue:

>>>

>>> Consider there's not indices wrap. And:

>>> - used_idx is the used index in the virtqueue

>>> - last_used_idx is the used index maintained by the device, it 

>>> points to the location where to put the next done requests to the 

>>> used_ring

>>> - avail_idx is the available index in the virtqueue

>>> - last_avail_idx is the index maintained by the device, it points to 

>>> the location where device need to read from the available.

>>>

>>> So bascially, from device POV, it only cares the buffer that belong 

>>> to itself which are [used_idx, avail_idx). So we have:

>>>

>>> [used_idx, last_used_idx) The requests that have been completed by 

>>> the device but not completed to the used ring (or at least used_idx 

>>> is not updated).

>>> [last_used_idx, last_avail_idx) The requests that are being 

>>> processed by the device.

>>> [last_avail_idx, avail_idx) The requests that are made available by 

>>> the driver but not processed by the device.

>>>

>>> During device stop/suspend, the device should:

>>>

>>> - stop reading new request from available ring (or read until the 

>>> end of descriptor chain)

>>> - sync used_idx with last_used_idx. Otherwise we need a complicated 

>>> but not necessary API to sync last_used_idx and the indices that are 

>>> not committed to used ring (since device may complete the request 

>>> out of order)

>>>

>>> So we know used_idx == last_used_idx in this case, so we have:

>>>

>>> [used_idx/last_used_idx, last_avail_idx) The requests that are being 

>>> processed.

>>> [last_avail_idx, avail_idx) The requests that are available for the 

>>> driver but not yet processed.

>>>

>>> For networking device, it's sufficient to think the requests are 

>>> completed when TX/RX DMA are finished. So there's no requests that 

>>> are being processed after the stop. In this case we had: used_idx == 

>>> last_used_idx == last_avail_idx. Then we only had:

>>>

>>> [used_idx/last_used_idx/last_avail_idx, avail_idx] The requests that 

>>> are made available by the driver but not processed by the device. 

>>> That's why you may think only used_idx matters here.

>>>

>>> For block device, the completion of the request might require the 

>>> communication with the remote backend, so we can't assume 

>>> last_used_idx is equal to the last_avail_idx. Whether or not to wait 

>>> for the drain the request is still being discussed[1].

>> I guess this requirement is very subject to the specific storage 

>> setup (networked v.s. local) including the guest app/configuration 

>> (retry v.s. time out). IMHO there shouldn't be a definite yes-or-no 

>> answer here. But in general, for req-ack type of request if the 

>> completion (ack) cannot simply replicate across live migration, it 

>> should wait until the pending requests are completely drained.

>

>

> Yes, and qemu may fail the migartion if the pending requets takes too 

> long to be drained, that's why an asynchornous API is introduced in [1].


Just curious, what is the link for [1]? Sorry, I don't find it in your 
previous reply.

>

>

>>

>>>

>>> So you can see, for all the cases, what really matters is the 

>>> last_avail_idx. The device should know where it need to start 

>>> reading for the next request, and it is not necessarily equal to 

>>> last_used_idx or used_idx. What makes things a little bit easier is 

>>> the networking device whose last_used_idx is equal to last_avail_idx.

>>>

>>>

>>>> If I understand Eli correctly, hardware will always reload the 

>>>> latest avail_index from memory whenever it's being sync'ed again.

>>>>

>>>> <quote>

>>>> The hardware always goes to read the available index from memory. 

>>>> The requirement to configure it when creating a new object is still 

>>>> a requirement defined by the interface so I must not violate 

>>>> interface requirments.

>>>> </quote>

>>>>

>>>> If the hardware does everything perfectly that is able to flush 

>>>> pending requests, update descriptors, rings plus used indices all 

>>>> at once before the suspension, there's no need for hardware to 

>>>> maintain a separate internal index than the h/w used_index. The 

>>>> hardware can get started from the saved used_index upon resuming. I 

>>>> view this is of (hardware) implementation choices and thought it 

>>>> does not violate the virtio spec?

>>>

>>>

>>> Yes, but as you said, it has a lot of assumptions which may not work 

>>> for other type of devices. So what I refer "last_avail_idx" is 

>>> probably the "used_idx" in your description here. It might be the 

>>> same in this case for networking device.

>>>

>>>

>>>>

>>>>

>>>>>

>>>>>

>>>>>

>>>>>> And the used_idx in the virtqueue is always in sync with the 

>>>>>> hardware used_index, and hardware is supposed to commit pending 

>>>>>> used buffers to the ring while bumping up the hardware used_index 

>>>>>> (and also committed to memory) altogether prior to suspension, is 

>>>>>> my understanding correct here? Double checking if this is the 

>>>>>> expected semantics of what 

>>>>>> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should 

>>>>>> achieve.

>>>>>>

>>>>>> If the above is true, then it looks to me for mlx5 vdpa we should 

>>>>>> really return h/w used_idx rather than the last_avail_idx through 

>>>>>> get_vq_state(), in order to reconstruct the virt queue state post 

>>>>>> live migration. For the set_map case, the internal last_avail_idx 

>>>>>> really doesn't matter, although both indices are saved and 

>>>>>> restored transparently as-is.

>>>>>

>>>>>

>>>>> Right, a subtle thing here is that: for the device that might have 

>>>>> can't not complete all virtqueue requests during vq suspending, 

>>>>> the "last_avail_idx" might not be equal to the hardware used_idx. 

>>>>> Thing might be true for the storage devices that needs to connect 

>>>>> to a remote backend. But this is not the case of networking 

>>>>> device, so last_avail_idx should be equal to hardware used_idx here. 

>>>> Eli, since it's your hardware, does it work this way? i.e. does the 

>>>> firmware interface see a case where virtqueue requests can't be 

>>>> completed before suspending vq?

>>>

>>>

>>> For storage device, I think it can happen.

>>>

>>>

>>>>

>>>>> But using the "last_avail_idx" or hardware avail_idx should always 

>>>>> be better in this case since it's guaranteed to correct and will 

>>>>> have less confusion. We use this convention in other types of 

>>>>> vhost backends (vhost-kernel, vhost-user).

>>>>>

>>>>> So looking at mlx5_set_vq_state(), it probably won't work since it 

>>>>> doesn't not set either hardware avail_idx or hardware used_idx:

>>>> The saved mvq->avail_idx will be used to recreate hardware virtq 

>>>> object and the used index in create_virtqueue(), once status 

>>>> DRIVER_OK is set. I suspect we should pass the index to 

>>>> mvq->used_idx in mlx5_vdpa_set_vq_state() below instead.

>>>>

>>>

>>> It depends on what did mvq->used_idx meant? If it's last_used_idx, 

>>> it should be the same with mvq->avail_idx for networking device.

>> It's the last_used_idx. Note, Eli already posted a patch ("vdpa/mlx5: 

>> Fix suspend/resume index restoration") to repurpose set_vq_state() to 

>> restore both used_idx and avail_idx (both indices should be 

>> conceptually equal in that device model) across reset. 

>

>

> Will check, burried by mails :(


Understood. Will point you to the exact email showing the code change.

>

>

>> Which implies the simplified assumption I mentioned earlier. The 

>> requirement of set_vq_state() API should make it clear if this kind 

>> of assumption is acceptable.

>

>

> Right now the value is exposed to userspace via GET_VRING_BASE, so 

> only last_avail_idx is synced. If we need sync last_used_idx, we 

> should also sync pending indices which requires more thoughts.

Technically it doesn't sound right - crossing the boundary a bit even 
with simplified form of assumption. But depending on how userspace could 
make use of this API, it doesn't seem it breaks existing functionality 
for the moment.

PS, happy Lunar New Year. :)

-Siwei

>

> Thanks

>

>

>>

>>

>> Thanks,

>> -Siwei

>>

>>

>>>

>>> Thanks

>>>

>>>

>>>>

>>>> Thanks,

>>>> -Siwei

>>>>>

>>>>> static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

>>>>>                   const struct vdpa_vq_state *state)

>>>>> {

>>>>>     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

>>>>>     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

>>>>>     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

>>>>>

>>>>>     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

>>>>>         mlx5_vdpa_warn(mvdev, "can't modify available index\n");

>>>>>         return -EINVAL;

>>>>>     }

>>>>>

>>>>>     mvq->avail_idx = state->avail_index;

>>>>>     return 0;

>>>>> }

>>>>>

>>>>> Depends on the hardware, we should either set hardware used_idx or 

>>>>> hardware avail_idx here.

>>>>>

>>>>> I think we need to clarify how device is supposed to work in the 

>>>>> virtio spec.

>>>>>

>>>>> Thanks

>>>>>

>>>>>

>>>>>>

>>>>>> -Siwei

>>>>>>

>>>>>>>

>>>>>>>

>>>>>>>> Now the vritqueue is copied to the new VM and the

>>>>>>>> hardware now has to continue execution from index 20. We need 

>>>>>>>> to tell

>>>>>>>> the hardware via configuring the last used_index.

>>>>>>>

>>>>>>>

>>>>>>> If the hardware can not sync the index from the virtqueue, the 

>>>>>>> driver can do the synchronization by make the last_used_idx 

>>>>>>> equals to used index in the virtqueue.

>>>>>>>

>>>>>>> Thanks

>>>>>>>

>>>>>>>

>>>>>>>>   So why don't we

>>>>>>>> restore the used index?

>>>>>>>>

>>>>>>>>>> So it puzzles me why is set_vq_state() we do not communicate 

>>>>>>>>>> the saved

>>>>>>>>>> used index.

>>>>>>>>>

>>>>>>>>> We don't do that since:

>>>>>>>>>

>>>>>>>>> 1) if the hardware can sync its internal used index from the 

>>>>>>>>> virtqueue

>>>>>>>>> during device, then we don't need it

>>>>>>>>> 2) if the hardware can not sync its internal used index, the 

>>>>>>>>> driver (e.g as

>>>>>>>>> you did here) can do that.

>>>>>>>>>

>>>>>>>>> But there's no way for the hardware to deduce the internal 

>>>>>>>>> avail index from

>>>>>>>>> the virtqueue, that's why avail index is sycned.

>>>>>>>>>

>>>>>>>>> Thanks

>>>>>>>>>

>>>>>>>>>

>>>>>>>

>>>>>>

>>>>>

>>>>

>>>

>>

>
Jason Wang Feb. 20, 2021, 2:38 a.m. UTC | #21
On 2021/2/20 10:05 上午, Si-Wei Liu wrote:
>

>

> On 2/18/2021 7:10 PM, Jason Wang wrote:

>>

>> On 2021/2/18 8:43 下午, Si-Wei Liu wrote:

>>>

>>>

>>> On 2/17/2021 8:44 PM, Jason Wang wrote:

>>>>

>>>> On 2021/2/10 下午4:59, Si-Wei Liu wrote:

>>>>>

>>>>>

>>>>> On 2/9/2021 7:53 PM, Jason Wang wrote:

>>>>>>

>>>>>> On 2021/2/10 上午10:30, Si-Wei Liu wrote:

>>>>>>>

>>>>>>>

>>>>>>> On 2/8/2021 10:37 PM, Jason Wang wrote:

>>>>>>>>

>>>>>>>> On 2021/2/9 下午2:12, Eli Cohen wrote:

>>>>>>>>> On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

>>>>>>>>>> On 2021/2/8 下午6:04, Eli Cohen wrote:

>>>>>>>>>>> On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

>>>>>>>>>>>> On 2021/2/8 下午2:37, Eli Cohen wrote:

>>>>>>>>>>>>> On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

>>>>>>>>>>>>>> On 2021/2/6 上午7:07, Si-Wei Liu wrote:

>>>>>>>>>>>>>>> On 2/3/2021 11:36 PM, Eli Cohen wrote:

>>>>>>>>>>>>>>>> When a change of memory map occurs, the hardware 

>>>>>>>>>>>>>>>> resources are destroyed

>>>>>>>>>>>>>>>> and then re-created again with the new memory map. In 

>>>>>>>>>>>>>>>> such case, we need

>>>>>>>>>>>>>>>> to restore the hardware available and used indices. The 

>>>>>>>>>>>>>>>> driver failed to

>>>>>>>>>>>>>>>> restore the used index which is added here.

>>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>>> Also, since the driver also fails to reset the 

>>>>>>>>>>>>>>>> available and used

>>>>>>>>>>>>>>>> indices upon device reset, fix this here to avoid 

>>>>>>>>>>>>>>>> regression caused by

>>>>>>>>>>>>>>>> the fact that used index may not be zero upon device 

>>>>>>>>>>>>>>>> reset.

>>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>>> Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 

>>>>>>>>>>>>>>>> supported mlx5

>>>>>>>>>>>>>>>> devices")

>>>>>>>>>>>>>>>> Signed-off-by: Eli Cohen<elic@nvidia.com>

>>>>>>>>>>>>>>>> ---

>>>>>>>>>>>>>>>> v0 -> v1:

>>>>>>>>>>>>>>>> Clear indices upon device reset

>>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>>> drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++

>>>>>>>>>>>>>>>>      1 file changed, 18 insertions(+)

>>>>>>>>>>>>>>>>

>>>>>>>>>>>>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>>> index 88dde3455bfd..b5fe6d2ad22f 100644

>>>>>>>>>>>>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c

>>>>>>>>>>>>>>>> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {

>>>>>>>>>>>>>>>>          u64 device_addr;

>>>>>>>>>>>>>>>>          u64 driver_addr;

>>>>>>>>>>>>>>>>          u16 avail_index;

>>>>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>>>>          bool ready;

>>>>>>>>>>>>>>>>          struct vdpa_callback cb;

>>>>>>>>>>>>>>>>          bool restore;

>>>>>>>>>>>>>>>> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {

>>>>>>>>>>>>>>>>          u32 virtq_id;

>>>>>>>>>>>>>>>>          struct mlx5_vdpa_net *ndev;

>>>>>>>>>>>>>>>>          u16 avail_idx;

>>>>>>>>>>>>>>>> +    u16 used_idx;

>>>>>>>>>>>>>>>>          int fw_state;

>>>>>>>>>>>>>>>>            /* keep last in the struct */

>>>>>>>>>>>>>>>> @@ -804,6 +806,7 @@ static int create_virtqueue(struct 

>>>>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtque

>>>>>>>>>>>>>>>>            obj_context = 

>>>>>>>>>>>>>>>> MLX5_ADDR_OF(create_virtio_net_q_in, in,

>>>>>>>>>>>>>>>> obj_context);

>>>>>>>>>>>>>>>> MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>>>>> hw_available_index,

>>>>>>>>>>>>>>>> mvq->avail_idx);

>>>>>>>>>>>>>>>> +    MLX5_SET(virtio_net_q_object, obj_context, 

>>>>>>>>>>>>>>>> hw_used_index,

>>>>>>>>>>>>>>>> mvq->used_idx);

>>>>>>>>>>>>>>>> MLX5_SET(virtio_net_q_object, obj_context,

>>>>>>>>>>>>>>>> queue_feature_bit_mask_12_3,

>>>>>>>>>>>>>>>> get_features_12_3(ndev->mvdev.actual_features));

>>>>>>>>>>>>>>>>          vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 

>>>>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>>>>> virtio_q_context);

>>>>>>>>>>>>>>>> @@ -1022,6 +1025,7 @@ static int connect_qps(struct 

>>>>>>>>>>>>>>>> mlx5_vdpa_net

>>>>>>>>>>>>>>>> *ndev, struct mlx5_vdpa_virtqueue *m

>>>>>>>>>>>>>>>>      struct mlx5_virtq_attr {

>>>>>>>>>>>>>>>>          u8 state;

>>>>>>>>>>>>>>>>          u16 available_index;

>>>>>>>>>>>>>>>> +    u16 used_index;

>>>>>>>>>>>>>>>>      };

>>>>>>>>>>>>>>>>        static int query_virtqueue(struct mlx5_vdpa_net 

>>>>>>>>>>>>>>>> *ndev, struct

>>>>>>>>>>>>>>>> mlx5_vdpa_virtqueue *mvq,

>>>>>>>>>>>>>>>> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct

>>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu

>>>>>>>>>>>>>>>>          memset(attr, 0, sizeof(*attr));

>>>>>>>>>>>>>>>>          attr->state = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>>>>> obj_context, state);

>>>>>>>>>>>>>>>>          attr->available_index = 

>>>>>>>>>>>>>>>> MLX5_GET(virtio_net_q_object,

>>>>>>>>>>>>>>>> obj_context, hw_available_index);

>>>>>>>>>>>>>>>> +    attr->used_index = MLX5_GET(virtio_net_q_object, 

>>>>>>>>>>>>>>>> obj_context,

>>>>>>>>>>>>>>>> hw_used_index);

>>>>>>>>>>>>>>>>          kfree(out);

>>>>>>>>>>>>>>>>          return 0;

>>>>>>>>>>>>>>>>      @@ -1535,6 +1540,16 @@ static void 

>>>>>>>>>>>>>>>> teardown_virtqueues(struct

>>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>>>>          }

>>>>>>>>>>>>>>>>      }

>>>>>>>>>>>>>>>>      +static void clear_virtqueues(struct mlx5_vdpa_net 

>>>>>>>>>>>>>>>> *ndev)

>>>>>>>>>>>>>>>> +{

>>>>>>>>>>>>>>>> +    int i;

>>>>>>>>>>>>>>>> +

>>>>>>>>>>>>>>>> +    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {

>>>>>>>>>>>>>>>> +        ndev->vqs[i].avail_idx = 0;

>>>>>>>>>>>>>>>> +        ndev->vqs[i].used_idx = 0;

>>>>>>>>>>>>>>>> +    }

>>>>>>>>>>>>>>>> +}

>>>>>>>>>>>>>>>> +

>>>>>>>>>>>>>>>>      /* TODO: cross-endian support */

>>>>>>>>>>>>>>>>      static inline bool 

>>>>>>>>>>>>>>>> mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev

>>>>>>>>>>>>>>>> *mvdev)

>>>>>>>>>>>>>>>>      {

>>>>>>>>>>>>>>>> @@ -1610,6 +1625,7 @@ static int save_channel_info(struct

>>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu

>>>>>>>>>>>>>>>>              return err;

>>>>>>>>>>>>>>>>            ri->avail_index = attr.available_index;

>>>>>>>>>>>>>>>> +    ri->used_index = attr.used_index;

>>>>>>>>>>>>>>>>          ri->ready = mvq->ready;

>>>>>>>>>>>>>>>>          ri->num_ent = mvq->num_ent;

>>>>>>>>>>>>>>>>          ri->desc_addr = mvq->desc_addr;

>>>>>>>>>>>>>>>> @@ -1654,6 +1670,7 @@ static void 

>>>>>>>>>>>>>>>> restore_channels_info(struct

>>>>>>>>>>>>>>>> mlx5_vdpa_net *ndev)

>>>>>>>>>>>>>>>>                  continue;

>>>>>>>>>>>>>>>>                mvq->avail_idx = ri->avail_index;

>>>>>>>>>>>>>>>> +        mvq->used_idx = ri->used_index;

>>>>>>>>>>>>>>>>              mvq->ready = ri->ready;

>>>>>>>>>>>>>>>>              mvq->num_ent = ri->num_ent;

>>>>>>>>>>>>>>>>              mvq->desc_addr = ri->desc_addr;

>>>>>>>>>>>>>>>> @@ -1768,6 +1785,7 @@ static void 

>>>>>>>>>>>>>>>> mlx5_vdpa_set_status(struct

>>>>>>>>>>>>>>>> vdpa_device *vdev, u8 status)

>>>>>>>>>>>>>>>>          if (!status) {

>>>>>>>>>>>>>>>>              mlx5_vdpa_info(mvdev, "performing device 

>>>>>>>>>>>>>>>> reset\n");

>>>>>>>>>>>>>>>>              teardown_driver(ndev);

>>>>>>>>>>>>>>>> +        clear_virtqueues(ndev);

>>>>>>>>>>>>>>> The clearing looks fine at the first glance, as it 

>>>>>>>>>>>>>>> aligns with the other

>>>>>>>>>>>>>>> state cleanups floating around at the same place. 

>>>>>>>>>>>>>>> However, the thing is

>>>>>>>>>>>>>>> get_vq_state() is supposed to be called right after to 

>>>>>>>>>>>>>>> get sync'ed with

>>>>>>>>>>>>>>> the latest internal avail_index from device while vq is 

>>>>>>>>>>>>>>> stopped. The

>>>>>>>>>>>>>>> index was saved in the driver software at vq suspension, 

>>>>>>>>>>>>>>> but before the

>>>>>>>>>>>>>>> virtq object is destroyed. We shouldn't clear the 

>>>>>>>>>>>>>>> avail_index too early.

>>>>>>>>>>>>>> Good point.

>>>>>>>>>>>>>>

>>>>>>>>>>>>>> There's a limitation on the virtio spec and vDPA 

>>>>>>>>>>>>>> framework that we can not

>>>>>>>>>>>>>> simply differ device suspending from device reset.

>>>>>>>>>>>>>>

>>>>>>>>>>>>> Are you talking about live migration where you reset the 

>>>>>>>>>>>>> device but

>>>>>>>>>>>>> still want to know how far it progressed in order to 

>>>>>>>>>>>>> continue from the

>>>>>>>>>>>>> same place in the new VM?

>>>>>>>>>>>> Yes. So if we want to support live migration at we need:

>>>>>>>>>>>>

>>>>>>>>>>>> in src node:

>>>>>>>>>>>> 1) suspend the device

>>>>>>>>>>>> 2) get last_avail_idx via get_vq_state()

>>>>>>>>>>>>

>>>>>>>>>>>> in the dst node:

>>>>>>>>>>>> 3) set last_avail_idx via set_vq_state()

>>>>>>>>>>>> 4) resume the device

>>>>>>>>>>>>

>>>>>>>>>>>> So you can see, step 2 requires the device/driver not to 

>>>>>>>>>>>> forget the

>>>>>>>>>>>> last_avail_idx.

>>>>>>>>>>>>

>>>>>>>>>>> Just to be sure, what really matters here is the used index. 

>>>>>>>>>>> Becuase the

>>>>>>>>>>> vriqtueue itself is copied from the src VM to the dest VM. 

>>>>>>>>>>> The available

>>>>>>>>>>> index is alreay there and we know the hardware reads it from 

>>>>>>>>>>> there.

>>>>>>>>>>

>>>>>>>>>> So for "last_avail_idx" I meant the hardware internal avail 

>>>>>>>>>> index. It's not

>>>>>>>>>> stored in the virtqueue so we must migrate it from src to 

>>>>>>>>>> dest and set them

>>>>>>>>>> through set_vq_state(). Then in the destination, the 

>>>>>>>>>> virtqueue can be

>>>>>>>>>> restarted from that index.

>>>>>>>>>>

>>>>>>>>> Consider this case: driver posted buffers till avail index 

>>>>>>>>> becomes the

>>>>>>>>> value 50. Hardware is executing but made it till 20 when 

>>>>>>>>> virtqueue was

>>>>>>>>> suspended due to live migration - this is indicated by 

>>>>>>>>> hardware used

>>>>>>>>> index equal 20.

>>>>>>>>

>>>>>>>>

>>>>>>>> So in this case the used index in the virtqueue should be 20? 

>>>>>>>> Otherwise we need not sync used index itself but all the used 

>>>>>>>> entries that is not committed to the used ring.

>>>>>>>

>>>>>>> In other word, for mlx5 vdpa there's no such internal 

>>>>>>> last_avail_idx stuff maintained by the hardware, right? 

>>>>>>

>>>>>>

>>>>>> For each device it should have one otherwise it won't work 

>>>>>> correctly during stop/resume. See the codes 

>>>>>> mlx5_vdpa_get_vq_state() which calls query_virtqueue() that build 

>>>>>> commands to query "last_avail_idx" from the hardware:

>>>>>>

>>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 

>>>>>> MLX5_CMD_OP_QUERY_GENERAL_OBJECT);

>>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 

>>>>>> MLX5_OBJ_TYPE_VIRTIO_NET_Q);

>>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, 

>>>>>> mvq->virtq_id);

>>>>>>     MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 

>>>>>> ndev->mvdev.res.uid);

>>>>>>     err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, 

>>>>>> outlen);

>>>>>>     if (err)

>>>>>>         goto err_cmd;

>>>>>>

>>>>>>     obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, 

>>>>>> obj_context);

>>>>>>     memset(attr, 0, sizeof(*attr));

>>>>>>     attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);

>>>>>>     attr->available_index = MLX5_GET(virtio_net_q_object, 

>>>>>> obj_context, hw_available_index);

>>>>>>

>>>>> Eli should be able to correct me, but this hw_available_index 

>>>>> might just be a cached value of virtqueue avail_index in the 

>>>>> memory from the most recent sync. 

>>>>

>>>>

>>>> It should not, otherwise it will be a bug.

>>> The hw_available_index not showing the correct last_avail_index 

>>> value is technically a firmware bug, as Eli alluded to. That's why I 

>>> had the original question for how this entire synchronization scheme 

>>> could work if just saving and restoring this cached value. In my 

>>> observation, the hw_available_index was seen far off from the 

>>> hw_used_index post vq suspension, I just pointed out the fact that 

>>> this hardware value is neither last_avail_idx nor last_used_idx that 

>>> is useful to represent vq state.

>>>

>>> However, the core question I'm having is should we care about fixing 

>>> this in the firmware interface level (actually, the hardware 

>>> implementation), or rather, just as said, some devices e.g. network 

>>> device could live with the simplified form of assumption (used_idx 

>>> ==  last_used_idx  == last_avail_idx) where pending requests can be 

>>> drained and completed before device is stopped or suspended. Despite 

>>> of what device behavior would be expected or defined in the virtio 

>>> spec, the get/set_vq_state() API should have very clear semantics: 

>>> whether both forms (i.e. restore either one of used_idx or 

>>> last_avail_idx is fine) are acceptable, or it needs to be religious 

>>> to restore vq state via last_avail_idx only. In the latter case, 

>>> both the mlx5 vdpa net driver and the firmware interface needs to be 

>>> fixed to accommodate the stricter API requirement.

>>>

>>> FWIW I don't think network (Ethernet) device should always assume 

>>> (used_idx ==  last_used_idx  == last_avail_idx) while being stopped 

>>> (were there virtio RDMA device in reality this assumption would break): 

>>

>>

>> Right, it's the choice of device and it would be more complicated if 

>> we had devices without this assumption. The main issue is whther or 

>> not we need to wait for the drain of the request and if such darining 

>> can only take reasonable time to finish.

>>

>>

>>> it could well follow the golden rules in virtio S/W implementation 

>>> to have separate last_used_idx and last_avail_idx in the hardware, 

>>> then it'll be 100% API compliant.. But we know that such 

>>> implementation is unnecessarily complicated for Ethernet device. 

>>> Hence I thought there could be sort of reliefs for some 

>>> driver/device and it's totally up to the device/driver to choose the 

>>> implementation.

>>

>>

>> Yes, the current API leaves the room for last_used_idx but we may 

>> still need to migrate the indices (as I mentioned below).

>>

>>

>>>

>>>>

>>>>

>>>>> I doubt it's the one you talked about in software implementation.

>>>>

>>>>

>>>> Actually not, it's a virtio general issue:

>>>>

>>>> Consider there's not indices wrap. And:

>>>> - used_idx is the used index in the virtqueue

>>>> - last_used_idx is the used index maintained by the device, it 

>>>> points to the location where to put the next done requests to the 

>>>> used_ring

>>>> - avail_idx is the available index in the virtqueue

>>>> - last_avail_idx is the index maintained by the device, it points 

>>>> to the location where device need to read from the available.

>>>>

>>>> So bascially, from device POV, it only cares the buffer that belong 

>>>> to itself which are [used_idx, avail_idx). So we have:

>>>>

>>>> [used_idx, last_used_idx) The requests that have been completed by 

>>>> the device but not completed to the used ring (or at least used_idx 

>>>> is not updated).

>>>> [last_used_idx, last_avail_idx) The requests that are being 

>>>> processed by the device.

>>>> [last_avail_idx, avail_idx) The requests that are made available by 

>>>> the driver but not processed by the device.

>>>>

>>>> During device stop/suspend, the device should:

>>>>

>>>> - stop reading new request from available ring (or read until the 

>>>> end of descriptor chain)

>>>> - sync used_idx with last_used_idx. Otherwise we need a complicated 

>>>> but not necessary API to sync last_used_idx and the indices that 

>>>> are not committed to used ring (since device may complete the 

>>>> request out of order)

>>>>

>>>> So we know used_idx == last_used_idx in this case, so we have:

>>>>

>>>> [used_idx/last_used_idx, last_avail_idx) The requests that are 

>>>> being processed.

>>>> [last_avail_idx, avail_idx) The requests that are available for the 

>>>> driver but not yet processed.

>>>>

>>>> For networking device, it's sufficient to think the requests are 

>>>> completed when TX/RX DMA are finished. So there's no requests that 

>>>> are being processed after the stop. In this case we had: used_idx 

>>>> == last_used_idx == last_avail_idx. Then we only had:

>>>>

>>>> [used_idx/last_used_idx/last_avail_idx, avail_idx] The requests 

>>>> that are made available by the driver but not processed by the 

>>>> device. That's why you may think only used_idx matters here.

>>>>

>>>> For block device, the completion of the request might require the 

>>>> communication with the remote backend, so we can't assume 

>>>> last_used_idx is equal to the last_avail_idx. Whether or not to 

>>>> wait for the drain the request is still being discussed[1].

>>> I guess this requirement is very subject to the specific storage 

>>> setup (networked v.s. local) including the guest app/configuration 

>>> (retry v.s. time out). IMHO there shouldn't be a definite yes-or-no 

>>> answer here. But in general, for req-ack type of request if the 

>>> completion (ack) cannot simply replicate across live migration, it 

>>> should wait until the pending requests are completely drained.

>>

>>

>> Yes, and qemu may fail the migartion if the pending requets takes too 

>> long to be drained, that's why an asynchornous API is introduced in [1].

>

> Just curious, what is the link for [1]? Sorry, I don't find it in your 

> previous reply.



My bad, it should be 
https://lists.oasis-open.org/archives/virtio-comment/202012/msg00027.html


>

>>

>>

>>>

>>>>

>>>> So you can see, for all the cases, what really matters is the 

>>>> last_avail_idx. The device should know where it need to start 

>>>> reading for the next request, and it is not necessarily equal to 

>>>> last_used_idx or used_idx. What makes things a little bit easier is 

>>>> the networking device whose last_used_idx is equal to last_avail_idx.

>>>>

>>>>

>>>>> If I understand Eli correctly, hardware will always reload the 

>>>>> latest avail_index from memory whenever it's being sync'ed again.

>>>>>

>>>>> <quote>

>>>>> The hardware always goes to read the available index from memory. 

>>>>> The requirement to configure it when creating a new object is 

>>>>> still a requirement defined by the interface so I must not violate 

>>>>> interface requirments.

>>>>> </quote>

>>>>>

>>>>> If the hardware does everything perfectly that is able to flush 

>>>>> pending requests, update descriptors, rings plus used indices all 

>>>>> at once before the suspension, there's no need for hardware to 

>>>>> maintain a separate internal index than the h/w used_index. The 

>>>>> hardware can get started from the saved used_index upon resuming. 

>>>>> I view this is of (hardware) implementation choices and thought it 

>>>>> does not violate the virtio spec?

>>>>

>>>>

>>>> Yes, but as you said, it has a lot of assumptions which may not 

>>>> work for other type of devices. So what I refer "last_avail_idx" is 

>>>> probably the "used_idx" in your description here. It might be the 

>>>> same in this case for networking device.

>>>>

>>>>

>>>>>

>>>>>

>>>>>>

>>>>>>

>>>>>>

>>>>>>> And the used_idx in the virtqueue is always in sync with the 

>>>>>>> hardware used_index, and hardware is supposed to commit pending 

>>>>>>> used buffers to the ring while bumping up the hardware 

>>>>>>> used_index (and also committed to memory) altogether prior to 

>>>>>>> suspension, is my understanding correct here? Double checking if 

>>>>>>> this is the expected semantics of what 

>>>>>>> modify_virtqueue(MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) should 

>>>>>>> achieve.

>>>>>>>

>>>>>>> If the above is true, then it looks to me for mlx5 vdpa we 

>>>>>>> should really return h/w used_idx rather than the last_avail_idx 

>>>>>>> through get_vq_state(), in order to reconstruct the virt queue 

>>>>>>> state post live migration. For the set_map case, the internal 

>>>>>>> last_avail_idx really doesn't matter, although both indices are 

>>>>>>> saved and restored transparently as-is.

>>>>>>

>>>>>>

>>>>>> Right, a subtle thing here is that: for the device that might 

>>>>>> have can't not complete all virtqueue requests during vq 

>>>>>> suspending, the "last_avail_idx" might not be equal to the 

>>>>>> hardware used_idx. Thing might be true for the storage devices 

>>>>>> that needs to connect to a remote backend. But this is not the 

>>>>>> case of networking device, so last_avail_idx should be equal to 

>>>>>> hardware used_idx here. 

>>>>> Eli, since it's your hardware, does it work this way? i.e. does 

>>>>> the firmware interface see a case where virtqueue requests can't 

>>>>> be completed before suspending vq?

>>>>

>>>>

>>>> For storage device, I think it can happen.

>>>>

>>>>

>>>>>

>>>>>> But using the "last_avail_idx" or hardware avail_idx should 

>>>>>> always be better in this case since it's guaranteed to correct 

>>>>>> and will have less confusion. We use this convention in other 

>>>>>> types of vhost backends (vhost-kernel, vhost-user).

>>>>>>

>>>>>> So looking at mlx5_set_vq_state(), it probably won't work since 

>>>>>> it doesn't not set either hardware avail_idx or hardware used_idx:

>>>>> The saved mvq->avail_idx will be used to recreate hardware virtq 

>>>>> object and the used index in create_virtqueue(), once status 

>>>>> DRIVER_OK is set. I suspect we should pass the index to 

>>>>> mvq->used_idx in mlx5_vdpa_set_vq_state() below instead.

>>>>>

>>>>

>>>> It depends on what did mvq->used_idx meant? If it's last_used_idx, 

>>>> it should be the same with mvq->avail_idx for networking device.

>>> It's the last_used_idx. Note, Eli already posted a patch 

>>> ("vdpa/mlx5: Fix suspend/resume index restoration") to repurpose 

>>> set_vq_state() to restore both used_idx and avail_idx (both indices 

>>> should be conceptually equal in that device model) across reset. 

>>

>>

>> Will check, burried by mails :(

>

> Understood. Will point you to the exact email showing the code change.

>

>>

>>

>>> Which implies the simplified assumption I mentioned earlier. The 

>>> requirement of set_vq_state() API should make it clear if this kind 

>>> of assumption is acceptable.

>>

>>

>> Right now the value is exposed to userspace via GET_VRING_BASE, so 

>> only last_avail_idx is synced. If we need sync last_used_idx, we 

>> should also sync pending indices which requires more thoughts.

> Technically it doesn't sound right - crossing the boundary a bit even 

> with simplified form of assumption. But depending on how userspace 

> could make use of this API, it doesn't seem it breaks existing 

> functionality for the moment.



I don't get here, maybe you can explain a little bit more?


>

> PS, happy Lunar New Year. :)



Thanks :)


>

> -Siwei

>

>>

>> Thanks

>>

>>

>>>

>>>

>>> Thanks,

>>> -Siwei

>>>

>>>

>>>>

>>>> Thanks

>>>>

>>>>

>>>>>

>>>>> Thanks,

>>>>> -Siwei

>>>>>>

>>>>>> static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,

>>>>>>                   const struct vdpa_vq_state *state)

>>>>>> {

>>>>>>     struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

>>>>>>     struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

>>>>>>     struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];

>>>>>>

>>>>>>     if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {

>>>>>>         mlx5_vdpa_warn(mvdev, "can't modify available index\n");

>>>>>>         return -EINVAL;

>>>>>>     }

>>>>>>

>>>>>>     mvq->avail_idx = state->avail_index;

>>>>>>     return 0;

>>>>>> }

>>>>>>

>>>>>> Depends on the hardware, we should either set hardware used_idx 

>>>>>> or hardware avail_idx here.

>>>>>>

>>>>>> I think we need to clarify how device is supposed to work in the 

>>>>>> virtio spec.

>>>>>>

>>>>>> Thanks

>>>>>>

>>>>>>

>>>>>>>

>>>>>>> -Siwei

>>>>>>>

>>>>>>>>

>>>>>>>>

>>>>>>>>> Now the vritqueue is copied to the new VM and the

>>>>>>>>> hardware now has to continue execution from index 20. We need 

>>>>>>>>> to tell

>>>>>>>>> the hardware via configuring the last used_index.

>>>>>>>>

>>>>>>>>

>>>>>>>> If the hardware can not sync the index from the virtqueue, the 

>>>>>>>> driver can do the synchronization by make the last_used_idx 

>>>>>>>> equals to used index in the virtqueue.

>>>>>>>>

>>>>>>>> Thanks

>>>>>>>>

>>>>>>>>

>>>>>>>>>   So why don't we

>>>>>>>>> restore the used index?

>>>>>>>>>

>>>>>>>>>>> So it puzzles me why is set_vq_state() we do not communicate 

>>>>>>>>>>> the saved

>>>>>>>>>>> used index.

>>>>>>>>>>

>>>>>>>>>> We don't do that since:

>>>>>>>>>>

>>>>>>>>>> 1) if the hardware can sync its internal used index from the 

>>>>>>>>>> virtqueue

>>>>>>>>>> during device, then we don't need it

>>>>>>>>>> 2) if the hardware can not sync its internal used index, the 

>>>>>>>>>> driver (e.g as

>>>>>>>>>> you did here) can do that.

>>>>>>>>>>

>>>>>>>>>> But there's no way for the hardware to deduce the internal 

>>>>>>>>>> avail index from

>>>>>>>>>> the virtqueue, that's why avail index is sycned.

>>>>>>>>>>

>>>>>>>>>> Thanks

>>>>>>>>>>

>>>>>>>>>>

>>>>>>>>

>>>>>>>

>>>>>>

>>>>>

>>>>

>>>

>>

>
Si-Wei Liu Feb. 20, 2021, 3 a.m. UTC | #22
On 2/19/2021 6:38 PM, Jason Wang wrote:
>>>

>>>

>>> Right now the value is exposed to userspace via GET_VRING_BASE, so 

>>> only last_avail_idx is synced. If we need sync last_used_idx, we 

>>> should also sync pending indices which requires more thoughts.

>> Technically it doesn't sound right - crossing the boundary a bit even 

>> with simplified form of assumption. But depending on how userspace 

>> could make use of this API, it doesn't seem it breaks existing 

>> functionality for the moment.

>

>

> I don't get here, maybe you can explain a little bit more? 

Please refer to the email I just sent.

https://lore.kernel.org/lkml/033b0806-4037-5755-a1fa-91dbb58bab2e@oracle.com/

-Siwei
diff mbox series

Patch

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@  struct mlx5_vq_restore_info {
 	u64 device_addr;
 	u64 driver_addr;
 	u16 avail_index;
+	u16 used_index;
 	bool ready;
 	struct vdpa_callback cb;
 	bool restore;
@@ -121,6 +122,7 @@  struct mlx5_vdpa_virtqueue {
 	u32 virtq_id;
 	struct mlx5_vdpa_net *ndev;
 	u16 avail_idx;
+	u16 used_idx;
 	int fw_state;
 
 	/* keep last in the struct */
@@ -804,6 +806,7 @@  static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 
 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
 		 get_features_12_3(ndev->mvdev.actual_features));
 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
@@ -1022,6 +1025,7 @@  static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m
 struct mlx5_virtq_attr {
 	u8 state;
 	u16 available_index;
+	u16 used_index;
 };
 
 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@  static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
 	memset(attr, 0, sizeof(*attr));
 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
+	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
 	kfree(out);
 	return 0;
 
@@ -1535,6 +1540,16 @@  static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
 	}
 }
 
+static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+		ndev->vqs[i].avail_idx = 0;
+		ndev->vqs[i].used_idx = 0;
+	}
+}
+
 /* TODO: cross-endian support */
 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
 {
@@ -1610,6 +1625,7 @@  static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
 		return err;
 
 	ri->avail_index = attr.available_index;
+	ri->used_index = attr.used_index;
 	ri->ready = mvq->ready;
 	ri->num_ent = mvq->num_ent;
 	ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@  static void restore_channels_info(struct mlx5_vdpa_net *ndev)
 			continue;
 
 		mvq->avail_idx = ri->avail_index;
+		mvq->used_idx = ri->used_index;
 		mvq->ready = ri->ready;
 		mvq->num_ent = ri->num_ent;
 		mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@  static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
 	if (!status) {
 		mlx5_vdpa_info(mvdev, "performing device reset\n");
 		teardown_driver(ndev);
+		clear_virtqueues(ndev);
 		mlx5_vdpa_destroy_mr(&ndev->mvdev);
 		ndev->mvdev.status = 0;
 		ndev->mvdev.mlx_features = 0;