diff mbox series

[RFC,v2,2/5] vfio/type1: Check reserve region conflict and update iova list

Message ID 20180112164531.93712-3-shameerali.kolothum.thodi@huawei.com
State New
Headers show
Series vfio/type1: Add support for valid iova list management | expand

Commit Message

Shameerali Kolothum Thodi Jan. 12, 2018, 4:45 p.m. UTC
This retrieves the reserved regions associated with the dev group and
checks for conflicts with any existing dma mappings. It also updates
the iova list, excluding the reserved regions.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

---
 drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 2 deletions(-)

-- 
1.9.1

Comments

Alex Williamson Jan. 18, 2018, 12:04 a.m. UTC | #1
On Fri, 12 Jan 2018 16:45:28 +0000
Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> This retrieves the reserved regions associated with the dev group and
> checks for conflicts with any existing dma mappings. It also updates
> the iova list, excluding the reserved regions.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 159 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 11cbd49..7609070 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -28,6 +28,7 @@
>  #include <linux/device.h>
>  #include <linux/fs.h>
>  #include <linux/iommu.h>
> +#include <linux/list_sort.h>
>  #include <linux/module.h>
>  #include <linux/mm.h>
>  #include <linux/rbtree.h>
> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>  	return ret;
>  }
>  

/* list_sort helper */

> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> +	struct iommu_resv_region *ra, *rb;
> +
> +	ra = container_of(a, struct iommu_resv_region, list);
> +	rb = container_of(b, struct iommu_resv_region, list);
> +
> +	if (ra->start < rb->start)
> +		return -1;
> +	if (ra->start > rb->start)
> +		return 1;
> +	return 0;
> +}
> +
>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>  				struct list_head *head)
>  {
> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>  }
>  
>  /*
> + * Check reserved region conflicts with existing dma mappings
> + */
> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
> +				struct list_head *resv_regions)
> +{
> +	struct iommu_resv_region *region;
> +
> +	/* Check for conflict with existing dma mappings */
> +	list_for_each_entry(region, resv_regions, list) {
> +		if (vfio_find_dma_overlap(iommu, region->start,
> +				    region->start + region->length - 1))
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}

This basically does the same test as vfio_iommu_valid_aperture but
properly names it a conflict test.  Please be consistent.  Should this
also return bool?  "conflict" is a yes/no answer.
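
For illustration, a minimal sketch of the bool variant suggested above
(the function name and its caller's errno mapping are hypothetical, not
from the posted patch):

static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
				     struct list_head *resv_regions)
{
	struct iommu_resv_region *region;

	/* Same walk as the patch hunk above, phrased as a predicate */
	list_for_each_entry(region, resv_regions, list) {
		if (vfio_find_dma_overlap(iommu, region->start,
					  region->start + region->length - 1))
			return true;
	}

	return false;
}

The caller would then map the answer to an errno itself, e.g.
ret = vfio_iommu_resv_conflict(iommu, &group_resv_regions) ? -EINVAL : 0;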

> +
> +/*
>   * Adjust the iommu aperture window if new aperture is a valid one
>   */
>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>  	return 0;
>  }
>  
> +/*
> + * Check and update iova region list in case a reserved region
> + * overlaps the iommu iova range
> + */
> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
> +					struct list_head *resv_regions)

"resv_region" in previous function, just "resv" here, use consistent
names.  Also, what are we adjusting.  Maybe "exclude" is a better term.

> +{
> +	struct iommu_resv_region *resv;
> +	struct list_head *iova = &iommu->iova_list;
> +	struct vfio_iova *n, *next;
> +
> +	list_for_each_entry(resv, resv_regions, list) {
> +		phys_addr_t start, end;
> +
> +		start = resv->start;
> +		end = resv->start + resv->length - 1;
> +
> +		list_for_each_entry_safe(n, next, iova, list) {
> +			phys_addr_t a, b;
> +			int ret = 0;
> +
> +			a = n->start;
> +			b = n->end;

'a' and 'b' variables actually make this incredibly confusing.  Use
better variable names or just drop them entirely, it's much easier to
follow as n->start & n->end.

> +			/* No overlap */
> +			if ((start > b) || (end < a))
> +				continue;
> +			/* Split the current node and create holes */
> +			if (start > a)
> +				ret = vfio_insert_iova(a, start - 1, &n->list);
> +			if (!ret && end < b)
> +				ret = vfio_insert_iova(end + 1, b, &n->list);
> +			if (ret)
> +				return ret;
> +
> +			list_del(&n->list);

This is trickier than it appears and deserves some explanation.  AIUI,
we're actually inserting duplicate entries for the remainder at the
start of the range and then at the end of the range (and the order is
important here because we're inserting each before the current node),
and then we delete the current node.  So the iova_list is kept sorted
through this process, though temporarily includes some bogus, unordered
sub-sets.
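
To make that ordering explicit, here is the same split written out as a
sketch, with the behavior Alex describes spelled out in comments (logic
identical to the patch hunk, using n->start/n->end directly):

		/*
		 * Reserved region [start, end] overlaps iova node
		 * [n->start, n->end].  vfio_insert_iova() links each new
		 * node in just before n, so the left remainder goes in
		 * first, then the right remainder, and finally n itself
		 * is unlinked.  The list stays sorted throughout; it
		 * only briefly holds n alongside the duplicated
		 * remainder ranges.
		 */
		if (start > n->start)
			ret = vfio_insert_iova(n->start, start - 1, &n->list);
		if (!ret && end < n->end)
			ret = vfio_insert_iova(end + 1, n->end, &n->list);
		if (ret)
			return ret;

		list_del(&n->list);
		kfree(n);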

> +			kfree(n);
> +		}
> +	}
> +
> +	if (list_empty(iova))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>  					 struct iommu_group *iommu_group)
>  {
> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	bool resv_msi, msi_remap;
>  	phys_addr_t resv_msi_base;
>  	struct iommu_domain_geometry geo;
> +	struct list_head group_resv_regions;
> +	struct iommu_resv_region *resv, *resv_next;
>  
>  	mutex_lock(&iommu->lock);
>  
> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +	INIT_LIST_HEAD(&group_resv_regions);
> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> +
> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
> +	if (ret)
> +		goto out_detach;
> +
>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>  
>  	INIT_LIST_HEAD(&domain->group_list);
> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		    d->prot == domain->prot) {
>  			iommu_detach_group(domain->domain, iommu_group);
>  			if (!iommu_attach_group(d->domain, iommu_group)) {
> +				ret = vfio_iommu_iova_resv_adjust(iommu,
> +							&group_resv_regions);
> +				if (!ret)
> +					goto out_domain;

The above function is not without side effects if it fails, it's
altered the iova_list.  It needs to be valid for the remaining domains
if we're going to continue.

> +
>  				list_add(&group->next, &d->group_list);
>  				iommu_domain_free(domain->domain);
>  				kfree(domain);
> -				mutex_unlock(&iommu->lock);
> -				return 0;
> +				goto done;
>  			}
>  
>  			ret = iommu_attach_group(domain->domain, iommu_group);
> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_detach;
>  
> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
> +	if (ret)
> +		goto out_detach;

Can't we process the reserved regions once before we get here rather
than have two separate call points that do the same thing?  In order to
roll back from errors above, it seems like we need to copy iova_list
and work on the copy, installing it and deleting the original only on
success.
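
A sketch of that copy-and-swap approach (the helper name is
hypothetical; only vfio_insert_iova() is from this series):

/*
 * Duplicate the current iova_list into @copy.  On failure the caller
 * frees the partial copy and the original list is left untouched.
 * Since the source list is sorted and vfio_insert_iova() links each
 * node in before the passed head, walking in order appends at the
 * tail, keeping the copy sorted.
 */
static int vfio_iommu_iova_copy(struct list_head *iova,
				struct list_head *copy)
{
	struct vfio_iova *n;
	int ret;

	list_for_each_entry(n, iova, list) {
		ret = vfio_insert_iova(n->start, n->end, copy);
		if (ret)
			return ret;
	}
	return 0;
}

The attach path would then punch the reserved-region holes into the
copy and, only once nothing else can fail, free the nodes of the old
iommu->iova_list and splice the copy into its place.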

> +
>  	list_add(&domain->next, &iommu->domain_list);
>  
> +done:
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
>  	mutex_unlock(&iommu->lock);
>  
>  	return 0;
> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	iommu_detach_group(domain->domain, iommu_group);
>  out_domain:
>  	iommu_domain_free(domain->domain);
> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> +		kfree(resv);
>  out_free:
>  	kfree(domain);
>  	kfree(group);
> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>  	node->end = end;
>  }
>  
> +/*
> + * Called when a group is detached. The reserved regions for that
> + * group can be part of valid iova now. But since reserved regions
> + * may be duplicated among groups, populate the iova valid regions
> + * list again.
> + */
> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *d;
> +	struct vfio_group *g;
> +	struct vfio_iova *node, *tmp;
> +	struct iommu_resv_region *resv, *resv_next;
> +	struct list_head resv_regions;
> +	phys_addr_t start, end;
> +
> +	INIT_LIST_HEAD(&resv_regions);
> +
> +	list_for_each_entry(d, &iommu->domain_list, next) {
> +		list_for_each_entry(g, &d->group_list, next)
> +			iommu_get_group_resv_regions(g->iommu_group,
> +							 &resv_regions);
> +	}
> +
> +	if (list_empty(&resv_regions))
> +		return;
> +
> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
> +
> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> +	start = node->start;
> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> +	end = node->end;

list_sort() only sorts based on ->start, we added reserved regions for
all our groups to one list, we potentially have multiple entries with
the same ->start.  How can we be sure that the last one in the list
actually has the largest ->end value?

> +
> +	/* purge the iova list and create new one */
> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
> +		list_del(&node->list);
> +		kfree(node);
> +	}
> +
> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
> +			__func__);

Map requests "will" fail.  Is this the right error strategy?  Detaching
a group cannot fail.  Aren't we better off leaving the iova_list we had
in place?  If we cannot expand the iova aperture when a group is
removed, a user can continue unscathed.
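
As a sketch, that non-destructive strategy would rebuild into a
temporary list and keep the old one on any failure (hypothetical shape,
reusing the copy idea above; vfio_iommu_resv_exclude() stands for the
hole-punching loop applied to the copy):

	LIST_HEAD(iova_copy);

	/*
	 * Build the widened aperture as a single node, then re-punch
	 * the remaining reserved regions into the copy.  On failure,
	 * free the copy and leave iommu->iova_list untouched, so a
	 * detach never shrinks what the user could already map.
	 */
	if (vfio_insert_iova(start, end, &iova_copy) ||
	    vfio_iommu_resv_exclude(&iova_copy, &resv_regions)) {
		/* free iova_copy nodes; keep the current iova_list */
	} else {
		/* free the old iommu->iova_list, splice in iova_copy */
	}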

> +		goto done;
> +	}
> +
> +	/* adjust the iova with current reserved regions */
> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
> +		pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n",
> +			__func__);

Same.

> +done:
> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
> +		kfree(resv);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		break;
>  	}
>  
> +	vfio_iommu_iova_resv_refresh(iommu);
> +
>  detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }

Shameerali Kolothum Thodi Jan. 19, 2018, 9:48 a.m. UTC | #2
> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, January 18, 2018 12:05 AM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: eric.auger@redhat.com; pmorel@linux.vnet.ibm.com;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
> <xuwei5@huawei.com>
> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update
> iova list
> 
> On Fri, 12 Jan 2018 16:45:28 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
[...]
> 
> This basically does the same test as vfio_iommu_valid_aperture but
> properly names it a conflict test.  Please be consistent.  Should this
> also return bool?  "conflict" is a yes/no answer.

Ok.

> "resv_region" in previous function, just "resv" here, use consistent
> names.  Also, what are we adjusting?  Maybe "exclude" is a better term.

Ok.

> 'a' and 'b' variables actually make this incredibly confusing.  Use
> better variable names or just drop them entirely, it's much easier to
> follow as n->start & n->end.

I will drop the names and go with n->start & n->end.

> This is trickier than it appears and deserves some explanation.  AIUI,
> we're actually inserting duplicate entries for the remainder at the
> start of the range and then at the end of the range (and the order is
> important here because we're inserting each before the current node),
> and then we delete the current node.  So the iova_list is kept sorted
> through this process, though temporarily includes some bogus, unordered
> sub-sets.

Yes, that understanding is correct. I will add comments to make it clear.

> > +	if (list_empty(iova))
> > +		return -EINVAL;

The above is also not correct: the list cannot be empty. I think, as you
said below, we need to work on a copy.

> Can't we process the reserved regions once before we get here rather
> than have two separate call points that do the same thing?  In order to
> roll back from errors above, it seems like we need to copy iova_list
> and work on the copy, installing it and deleting the original only on
> success.

Correct. In case of error, the iova list needs to be rolled back to its
previous state. Yes, it looks like we have to work on a copy. I will
address this in the next revision.

> > +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> > +	start = node->start;
> > +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> > +	end = node->end;
> 
> list_sort() only sorts based on ->start, we added reserved regions for
> all our groups to one list, we potentially have multiple entries with
> the same ->start.  How can we be sure that the last one in the list
> actually has the largest ->end value?

Hmm.. the sorting is done on the reserved list. The start and end entries
are of the iova list, which is kept updated on _attach(). So I don't think
there is a problem here.

> Map requests "will" fail.  Is this the right error strategy?  Detaching
> a group cannot fail.  Aren't we better off leaving the iova_list we had
> in place?  If we cannot expand the iova aperture when a group is
> removed, a user can continue unscathed.

Ok. I think that's a better strategy, rather than trying to update the
iova list here. I will remove this.

Thanks,
Shameer

Alex Williamson Jan. 19, 2018, 3:45 p.m. UTC | #3
On Fri, 19 Jan 2018 09:48:22 +0000
Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> wrote:

> > > +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> > > +	start = node->start;
> > > +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> > > +	end = node->end;
> > 
> > list_sort() only sorts based on ->start, we added reserved regions for
> > all our groups to one list, we potentially have multiple entries with
> > the same ->start.  How can we be sure that the last one in the list
> > actually has the largest ->end value?
> 
> Hmm.. the sorting is done on the reserved list. The start and end entries
> are of the iova list, which is kept updated on _attach(). So I don't think
> there is a problem here.

Oops, yes you're right.  List confusion.  Thanks,

Alex

Eric Auger Jan. 23, 2018, 8:32 a.m. UTC | #4
Hi Shameer,

On 18/01/18 01:04, Alex Williamson wrote:
> On Fri, 12 Jan 2018 16:45:28 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
[...]
>> +	INIT_LIST_HEAD(&group_resv_regions);
>> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);

iommu_get_group_resv_regions returns a sorted list (see
iommu_insert_resv_regions kerneldoc comment). You can have overlapping
regions of different types though.

Thanks

Eric

Shameerali Kolothum Thodi Jan. 23, 2018, 12:16 p.m. UTC | #5
Hi Eric,

> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: Tuesday, January 23, 2018 8:32 AM
> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
> Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>
> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update
> iova list
> 
[...]
> >> +	INIT_LIST_HEAD(&group_resv_regions);
> >> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> >> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> 
> iommu_get_group_resv_regions returns a sorted list (see
> iommu_insert_resv_regions kerneldoc comment). You can have overlapping
> regions of different types though.

Hmm.. I am not sure. It looks like it is sorted only if the regions are of
the same type:

"* The new element is sorted by address with respect to the other
 * regions of the same type."

So, hypothetically, if there are two groups with regions like,

Group 1:
  Start     Size      Type
  0x0000    0x1000    1
  0x2000    0x1000    1
  0x5000    0x1000    1

Group 2:
  Start     Size      Type
  0x2000    0x4000    2
  0x7000    0x1000    1

then iommu_get_group_resv_regions() will return:

  0x0000    0x1000    1
  0x2000    0x1000    1
  0x5000    0x1000    1
  0x2000    0x4000    2
  0x7000    0x1000    1

But honestly I am not sure whether the above is a valid scenario or not.
I am happy to remove the sorting if such a case can never happen.

Please let me know.

Thanks,
Shameer

Eric Auger Jan. 23, 2018, 12:51 p.m. UTC | #6
Hi Shameer,

On 23/01/18 13:16, Shameerali Kolothum Thodi wrote:
> Hi Eric,

> 

>> -----Original Message-----

>> From: Auger Eric [mailto:eric.auger@redhat.com]

>> Sent: Tuesday, January 23, 2018 8:32 AM

>> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum

>> Thodi <shameerali.kolothum.thodi@huawei.com>

>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>

>> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update

>> iova list

>>

>> Hi Shameer,

>>

>> On 18/01/18 01:04, Alex Williamson wrote:

>>> On Fri, 12 Jan 2018 16:45:28 +0000

>>> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

>>>

>>>> This retrieves the reserved regions associated with dev group and

>>>> checks for conflicts with any existing dma mappings. Also update

>>>> the iova list excluding the reserved regions.

>>>>

>>>> Signed-off-by: Shameer Kolothum

>> <shameerali.kolothum.thodi@huawei.com>

>>>> ---

>>>>  drivers/vfio/vfio_iommu_type1.c | 161

>> +++++++++++++++++++++++++++++++++++++++-

>>>>  1 file changed, 159 insertions(+), 2 deletions(-)

>>>>

>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

>> b/drivers/vfio/vfio_iommu_type1.c

>>>> index 11cbd49..7609070 100644

>>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>>> @@ -28,6 +28,7 @@

>>>>  #include <linux/device.h>

>>>>  #include <linux/fs.h>

>>>>  #include <linux/iommu.h>

>>>> +#include <linux/list_sort.h>

>>>>  #include <linux/module.h>

>>>>  #include <linux/mm.h>

>>>>  #include <linux/rbtree.h>

>>>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct

>> iommu_group *group, phys_addr_t *base)

>>>>  	return ret;

>>>>  }

>>>>

>>>

>>> /* list_sort helper */

>>>

>>>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)

>>>> +{

>>>> +	struct iommu_resv_region *ra, *rb;

>>>> +

>>>> +	ra = container_of(a, struct iommu_resv_region, list);

>>>> +	rb = container_of(b, struct iommu_resv_region, list);

>>>> +

>>>> +	if (ra->start < rb->start)

>>>> +		return -1;

>>>> +	if (ra->start > rb->start)

>>>> +		return 1;

>>>> +	return 0;

>>>> +}

>>>> +

>>>>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,

>>>>  				struct list_head *head)

>>>>  {

>>>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct

>> vfio_iommu *iommu,

>>>>  }

>>>>

>>>>  /*

>>>> + * Check reserved region conflicts with existing dma mappings

>>>> + */

>>>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,

>>>> +				struct list_head *resv_regions)

>>>> +{

>>>> +	struct iommu_resv_region *region;

>>>> +

>>>> +	/* Check for conflict with existing dma mappings */

>>>> +	list_for_each_entry(region, resv_regions, list) {

>>>> +		if (vfio_find_dma_overlap(iommu, region->start,

>>>> +				    region->start + region->length - 1))

>>>> +			return -EINVAL;

>>>> +	}

>>>> +

>>>> +	return 0;

>>>> +}

>>>

>>> This basically does the same test as vfio_iommu_valid_aperture but

>>> properly names it a conflict test.  Please be consistent.  Should this

>>> also return bool, "conflict" is a yes/no answer.

>>>

>>>> +

>>>> +/*

>>>>   * Adjust the iommu aperture window if new aperture is a valid one

>>>>   */

>>>>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,

>>>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct

>> vfio_iommu *iommu,

>>>>  	return 0;

>>>>  }

>>>>

>>>> +/*

>>>> + * Check and update iova region list in case a reserved region

>>>> + * overlaps the iommu iova range

>>>> + */

>>>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,

>>>> +					struct list_head *resv_regions)

>>>

>>> "resv_region" in previous function, just "resv" here, use consistent

>>> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.

>>>

>>>> +{

>>>> +	struct iommu_resv_region *resv;

>>>> +	struct list_head *iova = &iommu->iova_list;

>>>> +	struct vfio_iova *n, *next;

>>>> +

>>>> +	list_for_each_entry(resv, resv_regions, list) {

>>>> +		phys_addr_t start, end;

>>>> +

>>>> +		start = resv->start;

>>>> +		end = resv->start + resv->length - 1;

>>>> +

>>>> +		list_for_each_entry_safe(n, next, iova, list) {

>>>> +			phys_addr_t a, b;

>>>> +			int ret = 0;

>>>> +

>>>> +			a = n->start;

>>>> +			b = n->end;

>>>

>>> 'a' and 'b' variables actually make this incredibly confusing.  Use

>>> better variable names or just drop them entirely, it's much easier to

>>> follow as n->start & n->end.

>>>

>>>> +			/* No overlap */

>>>> +			if ((start > b) || (end < a))

>>>> +				continue;

>>>> +			/* Split the current node and create holes */

>>>> +			if (start > a)

>>>> +				ret = vfio_insert_iova(a, start - 1, &n->list);

>>>> +			if (!ret && end < b)

>>>> +				ret = vfio_insert_iova(end + 1, b, &n->list);

>>>> +			if (ret)

>>>> +				return ret;

>>>> +

>>>> +			list_del(&n->list);

>>>

>>> This is trickier than it appears and deserves some explanation.  AIUI,

>>> we're actually inserting duplicate entries for the remainder at the

>>> start of the range and then at the end of the range (and the order is

>>> important here because we're inserting each before the current node),

>>> and then we delete the current node.  So the iova_list is kept sorted

>>> through this process, though temporarily includes some bogus, unordered

>>> sub-sets.

>>>

>>>> +			kfree(n);

>>>> +		}

>>>> +	}

>>>> +

>>>> +	if (list_empty(iova))

>>>> +		return -EINVAL;

>>>> +

>>>> +	return 0;

>>>> +}

>>>> +

>>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,

>>>>  					 struct iommu_group *iommu_group)

>>>>  {

>>>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  	bool resv_msi, msi_remap;

>>>>  	phys_addr_t resv_msi_base;

>>>>  	struct iommu_domain_geometry geo;

>>>> +	struct list_head group_resv_regions;

>>>> +	struct iommu_resv_region *resv, *resv_next;

>>>>

>>>>  	mutex_lock(&iommu->lock);

>>>>

>>>> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  	if (ret)

>>>>  		goto out_detach;

>>>>

>>>> +	INIT_LIST_HEAD(&group_resv_regions);

>>>> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);

>>>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);

>> iommu_get_group_resv_regions returns a sorted list (see

>> iommu_insert_resv_regions kerneldoc comment). You can have overlapping

>> regions of different types though.

> 

> Hmm..I am not sure. It looks like it is sorted only if the regions are of same type.

> 

> "* The new element is sorted by address with respect to the other

>  * regions of the same type."

> 

> So hypothetically if there are two groups with regions like,

> 

> Group 1.

>  Start       size            type  

>   0x0000   0x1000        1

>   0x2000   0x1000        1

>   0x5000   0x1000        1

> 

> Group 2

>   Start       size              type

>    0x2000  0x4000           2

>    0x7000   0x1000          1

> 

> Then the  iommu_get_group_resv_regions() will return,

> 

> 0x0000   0x1000        1

> 0x2000   0x1000        1

> 0x5000   0x1000        1

> 0x2000  0x4000         2

> 0x7000   0x1000        1  


Hum yes, I remember now, sorry. It was made on purpose to avoid to
display interleaved resv region types in
/sys/kernel/iommu_groups/reserved_regions. I think it gives a better
user experience.

Thanks

Eric
> 

> But honestly I am not sure the above is a valid scenario or not. I am

> happy to remove the sorting if such a case will never happen.

> 

> Please let me know.

> 

> Thanks,

> Shameer

> 

>> Eric

>>>> +

>>>> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);

>>>> +	if (ret)

>>>> +		goto out_detach;

>>>> +

>>>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

>>>>

>>>>  	INIT_LIST_HEAD(&domain->group_list);

>>>> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  		    d->prot == domain->prot) {

>>>>  			iommu_detach_group(domain->domain,

>> iommu_group);

>>>>  			if (!iommu_attach_group(d->domain, iommu_group)) {

>>>> +				ret = vfio_iommu_iova_resv_adjust(iommu,

>>>> +

>> 	&group_resv_regions);

>>>> +				if (!ret)

>>>> +					goto out_domain;

>>>

>>> The above function is not without side effects if it fails, it's

>>> altered the iova_list.  It needs to be valid for the remaining domains

>>> if we're going to continue.

>>>

>>>> +

>>>>  				list_add(&group->next, &d->group_list);

>>>>  				iommu_domain_free(domain->domain);

>>>>  				kfree(domain);

>>>> -				mutex_unlock(&iommu->lock);

>>>> -				return 0;

>>>> +				goto done;

>>>>  			}

>>>>

>>>>  			ret = iommu_attach_group(domain->domain,

>> iommu_group);

>>>> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  	if (ret)

>>>>  		goto out_detach;

>>>>

>>>> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);

>>>> +	if (ret)

>>>> +		goto out_detach;

>>>

>>> Can't we process the reserved regions once before we get here rather

>>> than have two separate call points that do the same thing?  In order to

>>> roll back from errors above, it seems like we need to copy iova_list

>>> and work on the copy, installing it and deleting the original only on

>>> success.

>>>

>>>> +

>>>>  	list_add(&domain->next, &iommu->domain_list);

>>>>

>>>> +done:

>>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)

>>>> +		kfree(resv);

>>>>  	mutex_unlock(&iommu->lock);

>>>>

>>>>  	return 0;

>>>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  	iommu_detach_group(domain->domain, iommu_group);

>>>>  out_domain:

>>>>  	iommu_domain_free(domain->domain);

>>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)

>>>> +		kfree(resv);

>>>>  out_free:

>>>>  	kfree(domain);

>>>>  	kfree(group);

>>>> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct

>> vfio_iommu *iommu)

>>>>  	node->end = end;

>>>>  }

>>>>

>>>> +/*

>>>> + * Called when a group is detached. The reserved regions for that

>>>> + * group can be part of valid iova now. But since reserved regions

>>>> + * may be duplicated among groups, populate the iova valid regions

>>>> +   list again.

>>>> + */

>>>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)

>>>> +{

>>>> +	struct vfio_domain *d;

>>>> +	struct vfio_group *g;

>>>> +	struct vfio_iova *node, *tmp;

>>>> +	struct iommu_resv_region *resv, *resv_next;

>>>> +	struct list_head resv_regions;

>>>> +	phys_addr_t start, end;

>>>> +

>>>> +	INIT_LIST_HEAD(&resv_regions);

>>>> +

>>>> +	list_for_each_entry(d, &iommu->domain_list, next) {

>>>> +		list_for_each_entry(g, &d->group_list, next)

>>>> +			iommu_get_group_resv_regions(g->iommu_group,

>>>> +							 &resv_regions);

>>>> +	}

>>>> +

>>>> +	if (list_empty(&resv_regions))

>>>> +		return;

>>>> +

>>>> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);

>>>> +

>>>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);

>>>> +	start = node->start;

>>>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);

>>>> +	end = node->end;

>>>

>>> list_sort() only sorts based on ->start, we added reserved regions for

>>> all our groups to one list, we potentially have multiple entries with

>>> the same ->start.  How can we be sure that the last one in the list

>>> actually has the largest ->end value?

>>>

>>>> +

>>>> +	/* purge the iova list and create new one */

>>>> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {

>>>> +		list_del(&node->list);

>>>> +		kfree(node);

>>>> +	}

>>>> +

>>>> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {

>>>> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map

>> request may fail\n",

>>>> +			__func__);

>>>

>>> Map requests "will" fail.  Is this the right error strategy?  Detaching

>>> a group cannot fail.  Aren't we better off leaving the iova_list we had

>>> in place?  If we cannot expand the iova aperture when a group is

>>> removed, a user can continue unscathed.

>>>

>>>> +		goto done;

>>>> +	}

>>>> +

>>>> +	/* adjust the iova with current reserved regions */

>>>> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))

>>>> +		pr_warn("%s: Failed to update iova list with reserve regions.

>> VFIO DMA map request may fail\n",

>>>> +			__func__);

>>>

>>> Same.

>>>

>>>> +done:

>>>> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)

>>>> +		kfree(resv);

>>>> +}

>>>> +

>>>>  static void vfio_iommu_type1_detach_group(void *iommu_data,

>>>>  					  struct iommu_group *iommu_group)

>>>>  {

>>>> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void

>> *iommu_data,

>>>>  		break;

>>>>  	}

>>>>

>>>> +	vfio_iommu_iova_resv_refresh(iommu);

>>>> +

>>>>  detach_group_done:

>>>>  	mutex_unlock(&iommu->lock);

>>>>  }

>>>
Shameerali Kolothum Thodi Jan. 23, 2018, 3:26 p.m. UTC | #7
> -----Original Message-----

> From: Auger Eric [mailto:eric.auger@redhat.com]

> Sent: Tuesday, January 23, 2018 12:52 PM

> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;

> Alex Williamson <alex.williamson@redhat.com>

> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>

> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update

> iova list

> 

> Hi Shameer,

> 

> On 23/01/18 13:16, Shameerali Kolothum Thodi wrote:

> > Hi Eric,

> >

> >> -----Original Message-----

> >> From: Auger Eric [mailto:eric.auger@redhat.com]

> >> Sent: Tuesday, January 23, 2018 8:32 AM

> >> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum

> >> Thodi <shameerali.kolothum.thodi@huawei.com>

> >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>

> >> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and

> update

> >> iova list

> >>

> >> Hi Shameer,

> >>

> >> On 18/01/18 01:04, Alex Williamson wrote:

> >>> On Fri, 12 Jan 2018 16:45:28 +0000

> >>> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> >>>

> >>>> This retrieves the reserved regions associated with dev group and

> >>>> checks for conflicts with any existing dma mappings. Also update

> >>>> the iova list excluding the reserved regions.

> >>>>

> >>>> Signed-off-by: Shameer Kolothum

> >> <shameerali.kolothum.thodi@huawei.com>

> >>>> ---

> >>>>  drivers/vfio/vfio_iommu_type1.c | 161

> >> +++++++++++++++++++++++++++++++++++++++-

> >>>>  1 file changed, 159 insertions(+), 2 deletions(-)

> >>>>

> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

> >> b/drivers/vfio/vfio_iommu_type1.c

> >>>> index 11cbd49..7609070 100644

> >>>> --- a/drivers/vfio/vfio_iommu_type1.c

> >>>> +++ b/drivers/vfio/vfio_iommu_type1.c

> >>>> @@ -28,6 +28,7 @@

> >>>>  #include <linux/device.h>

> >>>>  #include <linux/fs.h>

> >>>>  #include <linux/iommu.h>

> >>>> +#include <linux/list_sort.h>

> >>>>  #include <linux/module.h>

> >>>>  #include <linux/mm.h>

> >>>>  #include <linux/rbtree.h>

> >>>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct

> >> iommu_group *group, phys_addr_t *base)

> >>>>  	return ret;

> >>>>  }

> >>>>

> >>>

> >>> /* list_sort helper */

> >>>

> >>>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head

> *b)

> >>>> +{

> >>>> +	struct iommu_resv_region *ra, *rb;

> >>>> +

> >>>> +	ra = container_of(a, struct iommu_resv_region, list);

> >>>> +	rb = container_of(b, struct iommu_resv_region, list);

> >>>> +

> >>>> +	if (ra->start < rb->start)

> >>>> +		return -1;

> >>>> +	if (ra->start > rb->start)

> >>>> +		return 1;

> >>>> +	return 0;

> >>>> +}

> >>>> +

> >>>>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,

> >>>>  				struct list_head *head)

> >>>>  {

> >>>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct

> >> vfio_iommu *iommu,

> >>>>  }

> >>>>

> >>>>  /*

> >>>> + * Check reserved region conflicts with existing dma mappings

> >>>> + */

> >>>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,

> >>>> +				struct list_head *resv_regions)

> >>>> +{

> >>>> +	struct iommu_resv_region *region;

> >>>> +

> >>>> +	/* Check for conflict with existing dma mappings */

> >>>> +	list_for_each_entry(region, resv_regions, list) {

> >>>> +		if (vfio_find_dma_overlap(iommu, region->start,

> >>>> +				    region->start + region->length - 1))

> >>>> +			return -EINVAL;

> >>>> +	}

> >>>> +

> >>>> +	return 0;

> >>>> +}

> >>>

> >>> This basically does the same test as vfio_iommu_valid_aperture but

> >>> properly names it a conflict test.  Please be consistent.  Should this

> >>> also return bool, "conflict" is a yes/no answer.
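For illustration, the bool-returning form suggested there could look like this (a sketch; it reuses the patch's existing vfio_find_dma_overlap() helper):

    static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
                                         struct list_head *resv_regions)
    {
            struct iommu_resv_region *region;

            list_for_each_entry(region, resv_regions, list) {
                    if (vfio_find_dma_overlap(iommu, region->start,
                                    region->start + region->length - 1))
                            return true;    /* reserved range already mapped */
            }

            return false;
    }

with the caller turning a true result into -EINVAL.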

> >>>

> >>>> +

> >>>> +/*

> >>>>   * Adjust the iommu aperture window if new aperture is a valid one

> >>>>   */

> >>>>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,

> >>>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct

> >> vfio_iommu *iommu,

> >>>>  	return 0;

> >>>>  }

> >>>>

> >>>> +/*

> >>>> + * Check and update iova region list in case a reserved region

> >>>> + * overlaps the iommu iova range

> >>>> + */

> >>>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,

> >>>> +					struct list_head *resv_regions)

> >>>

> >>> "resv_region" in previous function, just "resv" here, use consistent

> >>> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.

> >>>

> >>>> +{

> >>>> +	struct iommu_resv_region *resv;

> >>>> +	struct list_head *iova = &iommu->iova_list;

> >>>> +	struct vfio_iova *n, *next;

> >>>> +

> >>>> +	list_for_each_entry(resv, resv_regions, list) {

> >>>> +		phys_addr_t start, end;

> >>>> +

> >>>> +		start = resv->start;

> >>>> +		end = resv->start + resv->length - 1;

> >>>> +

> >>>> +		list_for_each_entry_safe(n, next, iova, list) {

> >>>> +			phys_addr_t a, b;

> >>>> +			int ret = 0;

> >>>> +

> >>>> +			a = n->start;

> >>>> +			b = n->end;

> >>>

> >>> 'a' and 'b' variables actually make this incredibly confusing.  Use

> >>> better variable names or just drop them entirely, it's much easier to

> >>> follow as n->start & n->end.

> >>>

> >>>> +			/* No overlap */

> >>>> +			if ((start > b) || (end < a))

> >>>> +				continue;

> >>>> +			/* Split the current node and create holes */

> >>>> +			if (start > a)

> >>>> +				ret = vfio_insert_iova(a, start - 1, &n->list);

> >>>> +			if (!ret && end < b)

> >>>> +				ret = vfio_insert_iova(end + 1, b, &n->list);

> >>>> +			if (ret)

> >>>> +				return ret;

> >>>> +

> >>>> +			list_del(&n->list);

> >>>

> >>> This is trickier than it appears and deserves some explanation.  AIUI,

> >>> we're actually inserting duplicate entries for the remainder at the

> >>> start of the range and then at the end of the range (and the order is

> >>> important here because we're inserting each before the current node),

> >>> and then we delete the current node.  So the iova_list is kept sorted

> >>> through this process, though temporarily includes some bogus, unordered

> >>> sub-sets.

> >>>

> >>>> +			kfree(n);

> >>>> +		}

> >>>> +	}

> >>>> +

> >>>> +	if (list_empty(iova))

> >>>> +		return -EINVAL;

> >>>> +

> >>>> +	return 0;

> >>>> +}

> >>>> +

> >>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,

> >>>>  					 struct iommu_group *iommu_group)

> >>>>  {

> >>>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void

> >> *iommu_data,

> >>>>  	bool resv_msi, msi_remap;

> >>>>  	phys_addr_t resv_msi_base;

> >>>>  	struct iommu_domain_geometry geo;

> >>>> +	struct list_head group_resv_regions;

> >>>> +	struct iommu_resv_region *resv, *resv_next;

> >>>>

> >>>>  	mutex_lock(&iommu->lock);

> >>>>

> >>>> @@ -1404,6 +1484,14 @@ static int

> vfio_iommu_type1_attach_group(void

> >> *iommu_data,

> >>>>  	if (ret)

> >>>>  		goto out_detach;

> >>>>

> >>>> +	INIT_LIST_HEAD(&group_resv_regions);

> >>>> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);

> >>>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);

> >> iommu_get_group_resv_regions returns a sorted list (see

> >> iommu_insert_resv_regions kerneldoc comment). You can have overlapping

> >> regions of different types though.

> >

> > Hmm, I am not sure. It looks like it is sorted only if the regions are of

> > the same type.

> >

> > "* The new element is sorted by address with respect to the other

> >  * regions of the same type."

> >

> > So hypothetically if there are two groups with regions like,

> >

> > Group 1

> >   Start     Size      Type

> >   0x0000    0x1000    1

> >   0x2000    0x1000    1

> >   0x5000    0x1000    1

> >

> > Group 2

> >   Start     Size      Type

> >   0x2000    0x4000    2

> >   0x7000    0x1000    1

> >

> > Then the iommu_get_group_resv_regions() will return,

> >

> >   0x0000    0x1000    1

> >   0x2000    0x1000    1

> >   0x5000    0x1000    1

> >   0x2000    0x4000    2

> >   0x7000    0x1000    1

> 

> Hum, yes, I remember now, sorry. It was made on purpose, to avoid

> displaying interleaved resv region types in

> /sys/kernel/iommu_groups/reserved_regions. I think it gives a better

> user experience.


OK. However, I have a feeling that the sorting may not be required in this
patch. I will double-check the logic in vfio_iommu_iova_resv_adjust() and, if
possible, will remove the sorting.
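A quick userspace check of that hunch (a standalone model, not the kernel code): punching the same set of holes out of an interval in two different orders yields the same final ranges, so the exclude loop itself does not appear to depend on a sorted input.

    #include <stdio.h>

    struct range { unsigned long start, end; };     /* inclusive bounds */

    /* remove [s, e] from the first n entries of r[]; returns the new count */
    static int subtract(struct range *r, int n, unsigned long s,
                        unsigned long e)
    {
            struct range out[16];
            int i, m = 0;

            for (i = 0; i < n; i++) {
                    if (s > r[i].end || e < r[i].start) {
                            out[m++] = r[i];        /* untouched */
                            continue;
                    }
                    if (s > r[i].start)             /* left remainder */
                            out[m++] = (struct range){ r[i].start, s - 1 };
                    if (e < r[i].end)               /* right remainder */
                            out[m++] = (struct range){ e + 1, r[i].end };
            }
            for (i = 0; i < m; i++)
                    r[i] = out[i];
            return m;
    }

    int main(void)
    {
            struct range a[16] = { { 0x0, 0xffff } };
            struct range b[16] = { { 0x0, 0xffff } };
            int na = 1, nb = 1, i;

            /* same two holes, opposite orders */
            na = subtract(a, na, 0x7000, 0x7fff);
            na = subtract(a, na, 0x2000, 0x2fff);
            nb = subtract(b, nb, 0x2000, 0x2fff);
            nb = subtract(b, nb, 0x7000, 0x7fff);

            for (i = 0; i < na; i++)
                    printf("a: 0x%lx - 0x%lx\n", a[i].start, a[i].end);
            for (i = 0; i < nb; i++)
                    printf("b: 0x%lx - 0x%lx\n", b[i].start, b[i].end);
            return 0;
    }

Both orders print 0x0 - 0x1fff, 0x3000 - 0x6fff and 0x8000 - 0xffff. The sorting would only matter if later code assumed an ordered list, as in the first/last-entry reads Alex flagged.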

Thanks,
Shameer

> Thanks

> 

> Eric

> >

> > But honestly I am not sure whether the above is a valid scenario or not. I am

> > happy to remove the sorting if such a case can never happen.

> >

> > Please let me know.

> >

> > Thanks,

> > Shameer

> >

> >> Eric

> >>>> +

> >>>> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);

> >>>> +	if (ret)

> >>>> +		goto out_detach;

> >>>> +

> >>>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

> >>>>

> >>>>  	INIT_LIST_HEAD(&domain->group_list);

> >>>> @@ -1434,11 +1522,15 @@ static int

> vfio_iommu_type1_attach_group(void

> >> *iommu_data,

> >>>>  		    d->prot == domain->prot) {

> >>>>  			iommu_detach_group(domain->domain,

> >> iommu_group);

> >>>>  			if (!iommu_attach_group(d->domain, iommu_group)) {

> >>>> +				ret = vfio_iommu_iova_resv_adjust(iommu,

> >>>> +

> >> 	&group_resv_regions);

> >>>> +				if (!ret)

> >>>> +					goto out_domain;

> >>>

> >>> The above function is not without side effects if it fails, it's

> >>> altered the iova_list.  It needs to be valid for the remaining domains

> >>> if we're going to continue.

> >>>

> >>>> +

> >>>>  				list_add(&group->next, &d->group_list);

> >>>>  				iommu_domain_free(domain->domain);

> >>>>  				kfree(domain);

> >>>> -				mutex_unlock(&iommu->lock);

> >>>> -				return 0;

> >>>> +				goto done;

> >>>>  			}

> >>>>

> >>>>  			ret = iommu_attach_group(domain->domain,

> >> iommu_group);

> >>>> @@ -1465,8 +1557,15 @@ static int

> vfio_iommu_type1_attach_group(void

> >> *iommu_data,

> >>>>  	if (ret)

> >>>>  		goto out_detach;

> >>>>

> >>>> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);

> >>>> +	if (ret)

> >>>> +		goto out_detach;

> >>>

> >>> Can't we process the reserved regions once before we get here rather

> >>> than have two separate call points that do the same thing?  In order to

> >>> roll back from errors above, it seems like we need to copy iova_list

> >>> and work on the copy, installing it and deleting the original only on

> >>> success.

> >>>

> >>>> +

> >>>>  	list_add(&domain->next, &iommu->domain_list);

> >>>>

> >>>> +done:

> >>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)

> >>>> +		kfree(resv);

> >>>>  	mutex_unlock(&iommu->lock);

> >>>>

> >>>>  	return 0;

> >>>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void

> >> *iommu_data,

> >>>>  	iommu_detach_group(domain->domain, iommu_group);

> >>>>  out_domain:

> >>>>  	iommu_domain_free(domain->domain);

> >>>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)

> >>>> +		kfree(resv);

> >>>>  out_free:

> >>>>  	kfree(domain);

> >>>>  	kfree(group);

> >>>> @@ -1559,6 +1660,60 @@ static void

> vfio_iommu_iova_aper_refresh(struct

> >> vfio_iommu *iommu)

> >>>>  	node->end = end;

> >>>>  }

> >>>>

> >>>> +/*

> >>>> + * Called when a group is detached. The reserved regions for that

> >>>> + * group can be part of valid iova now. But since reserved regions

> >>>> + * may be duplicated among groups, populate the iova valid regions

> >>>> +   list again.

> >>>> + */

> >>>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)

> >>>> +{

> >>>> +	struct vfio_domain *d;

> >>>> +	struct vfio_group *g;

> >>>> +	struct vfio_iova *node, *tmp;

> >>>> +	struct iommu_resv_region *resv, *resv_next;

> >>>> +	struct list_head resv_regions;

> >>>> +	phys_addr_t start, end;

> >>>> +

> >>>> +	INIT_LIST_HEAD(&resv_regions);

> >>>> +

> >>>> +	list_for_each_entry(d, &iommu->domain_list, next) {

> >>>> +		list_for_each_entry(g, &d->group_list, next)

> >>>> +			iommu_get_group_resv_regions(g->iommu_group,

> >>>> +							 &resv_regions);

> >>>> +	}

> >>>> +

> >>>> +	if (list_empty(&resv_regions))

> >>>> +		return;

> >>>> +

> >>>> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);

> >>>> +

> >>>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);

> >>>> +	start = node->start;

> >>>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);

> >>>> +	end = node->end;

> >>>

> >>> list_sort() only sorts based on ->start, we added reserved regions for

> >>> all our groups to one list, we potentially have multiple entries with

> >>> the same ->start.  How can we be sure that the last one in the list

> >>> actually has the largest ->end value?

> >>>

> >>>> +

> >>>> +	/* purge the iova list and create new one */

> >>>> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {

> >>>> +		list_del(&node->list);

> >>>> +		kfree(node);

> >>>> +	}

> >>>> +

> >>>> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {

> >>>> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map

> >> request may fail\n",

> >>>> +			__func__);

> >>>

> >>> Map requests "will" fail.  Is this the right error strategy?  Detaching

> >>> a group cannot fail.  Aren't we better off leaving the iova_list we had

> >>> in place?  If we cannot expand the iova aperture when a group is

> >>> removed, a user can continue unscathed.

> >>>

> >>>> +		goto done;

> >>>> +	}

> >>>> +

> >>>> +	/* adjust the iova with current reserved regions */

> >>>> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))

> >>>> +		pr_warn("%s: Failed to update iova list with reserve regions.

> >> VFIO DMA map request may fail\n",

> >>>> +			__func__);

> >>>

> >>> Same.

> >>>

> >>>> +done:

> >>>> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)

> >>>> +		kfree(resv);

> >>>> +}

> >>>> +

> >>>>  static void vfio_iommu_type1_detach_group(void *iommu_data,

> >>>>  					  struct iommu_group *iommu_group)

> >>>>  {

> >>>> @@ -1617,6 +1772,8 @@ static void

> vfio_iommu_type1_detach_group(void

> >> *iommu_data,

> >>>>  		break;

> >>>>  	}

> >>>>

> >>>> +	vfio_iommu_iova_resv_refresh(iommu);

> >>>> +

> >>>>  detach_group_done:

> >>>>  	mutex_unlock(&iommu->lock);

> >>>>  }

> >>>
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 11cbd49..7609070 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -28,6 +28,7 @@ 
 #include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/iommu.h>
+#include <linux/list_sort.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/rbtree.h>
@@ -1199,6 +1200,20 @@  static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 	return ret;
 }
 
+static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct iommu_resv_region *ra, *rb;
+
+	ra = container_of(a, struct iommu_resv_region, list);
+	rb = container_of(b, struct iommu_resv_region, list);
+
+	if (ra->start < rb->start)
+		return -1;
+	if (ra->start > rb->start)
+		return 1;
+	return 0;
+}
+
 static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
 				struct list_head *head)
 {
@@ -1274,6 +1289,24 @@  static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
 }
 
 /*
+ * Check reserved region conflicts with existing dma mappings
+ */
+static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
+				struct list_head *resv_regions)
+{
+	struct iommu_resv_region *region;
+
+	/* Check for conflict with existing dma mappings */
+	list_for_each_entry(region, resv_regions, list) {
+		if (vfio_find_dma_overlap(iommu, region->start,
+				    region->start + region->length - 1))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
  * Adjust the iommu aperture window if new aperture is a valid one
  */
 static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
@@ -1316,6 +1349,51 @@  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
 	return 0;
 }
 
+/*
+ * Check and update iova region list in case a reserved region
+ * overlaps the iommu iova range
+ */
+static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
+					struct list_head *resv_regions)
+{
+	struct iommu_resv_region *resv;
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *n, *next;
+
+	list_for_each_entry(resv, resv_regions, list) {
+		phys_addr_t start, end;
+
+		start = resv->start;
+		end = resv->start + resv->length - 1;
+
+		list_for_each_entry_safe(n, next, iova, list) {
+			phys_addr_t a, b;
+			int ret = 0;
+
+			a = n->start;
+			b = n->end;
+			/* No overlap */
+			if ((start > b) || (end < a))
+				continue;
+			/* Split the current node and create holes */
+			if (start > a)
+				ret = vfio_insert_iova(a, start - 1, &n->list);
+			if (!ret && end < b)
+				ret = vfio_insert_iova(end + 1, b, &n->list);
+			if (ret)
+				return ret;
+
+			list_del(&n->list);
+			kfree(n);
+		}
+	}
+
+	if (list_empty(iova))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -1327,6 +1405,8 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	bool resv_msi, msi_remap;
 	phys_addr_t resv_msi_base;
 	struct iommu_domain_geometry geo;
+	struct list_head group_resv_regions;
+	struct iommu_resv_region *resv, *resv_next;
 
 	mutex_lock(&iommu->lock);
 
@@ -1404,6 +1484,14 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
+	INIT_LIST_HEAD(&group_resv_regions);
+	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
+	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
+
+	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
+	if (ret)
+		goto out_detach;
+
 	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
@@ -1434,11 +1522,15 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 		    d->prot == domain->prot) {
 			iommu_detach_group(domain->domain, iommu_group);
 			if (!iommu_attach_group(d->domain, iommu_group)) {
+				ret = vfio_iommu_iova_resv_adjust(iommu,
+							&group_resv_regions);
+				if (ret)
+					goto out_domain;
+
 				list_add(&group->next, &d->group_list);
 				iommu_domain_free(domain->domain);
 				kfree(domain);
-				mutex_unlock(&iommu->lock);
-				return 0;
+				goto done;
 			}
 
 			ret = iommu_attach_group(domain->domain, iommu_group);
@@ -1465,8 +1557,15 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
+	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
+	if (ret)
+		goto out_detach;
+
 	list_add(&domain->next, &iommu->domain_list);
 
+done:
+	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
+		kfree(resv);
 	mutex_unlock(&iommu->lock);
 
 	return 0;
@@ -1475,6 +1574,8 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	iommu_detach_group(domain->domain, iommu_group);
 out_domain:
 	iommu_domain_free(domain->domain);
+	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
+		kfree(resv);
 out_free:
 	kfree(domain);
 	kfree(group);
@@ -1559,6 +1660,60 @@  static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
 	node->end = end;
 }
 
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+ * list again.
+ */
+static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *d;
+	struct vfio_group *g;
+	struct vfio_iova *node, *tmp;
+	struct iommu_resv_region *resv, *resv_next;
+	struct list_head resv_regions;
+	phys_addr_t start, end;
+
+	INIT_LIST_HEAD(&resv_regions);
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		list_for_each_entry(g, &d->group_list, next)
+			iommu_get_group_resv_regions(g->iommu_group,
+							 &resv_regions);
+	}
+
+	if (list_empty(&resv_regions))
+		return;
+
+	list_sort(NULL, &resv_regions, vfio_resv_cmp);
+
+	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
+	start = node->start;
+	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
+	end = node->end;
+
+	/* purge the iova list and create new one */
+	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
+		pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
+			__func__);
+		goto done;
+	}
+
+	/* adjust the iova with current reserved regions */
+	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
+		pr_warn("%s: Failed to update iova list with reserved regions. VFIO DMA map request may fail\n",
+			__func__);
+done:
+	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
+		kfree(resv);
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -1617,6 +1772,8 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 		break;
 	}
 
+	vfio_iommu_iova_resv_refresh(iommu);
+
 detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
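As a worked example of the net effect of this patch (illustrative numbers): if a group attaches with an aperture of 0x0 - 0xffffffff and the software MSI window that several IOMMU drivers reserve at 0x8000000 with length 0x100000, vfio_iommu_iova_resv_adjust() leaves two nodes on iova_list:

	0x00000000 - 0x07ffffff
	0x08100000 - 0xffffffff

so the 0x8000000 - 0x80fffff hole is no longer advertised as valid iova space, and any pre-existing DMA mapping overlapping it would instead have made the attach fail with -EINVAL via vfio_iommu_resv_region_conflict().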