diff mbox series

[RFC,v2,1/5] vfio/type1: Introduce iova list and add iommu aperture validity check

Message ID 20180112164531.93712-2-shameerali.kolothum.thodi@huawei.com
State New
Headers show
Series vfio/type1: Add support for valid iova list management | expand

Commit Message

Shameerali Kolothum Thodi Jan. 12, 2018, 4:45 p.m. UTC
This introduces an iova list that is valid for dma mappings. Make
sure the new iommu aperture window is valid and doesn't conflict
with any existing dma mappings during attach. Also update the iova
list with new aperture window during attach/detach.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

---
 drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

-- 
1.9.1

Comments

Alex Williamson Jan. 18, 2018, 12:04 a.m. UTC | #1
On Fri, 12 Jan 2018 16:45:27 +0000
Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> This introduces an iova list that is valid for dma mappings. Make

> sure the new iommu aperture window is valid and doesn't conflict

> with any existing dma mappings during attach. Also update the iova

> list with new aperture window during attach/detach.

> 

> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

> ---

>  drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++

>  1 file changed, 177 insertions(+)

> 

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> index e30e29a..11cbd49 100644

> --- a/drivers/vfio/vfio_iommu_type1.c

> +++ b/drivers/vfio/vfio_iommu_type1.c

> @@ -60,6 +60,7 @@

>  

>  struct vfio_iommu {

>  	struct list_head	domain_list;

> +	struct list_head	iova_list;

>  	struct vfio_domain	*external_domain; /* domain for external user */

>  	struct mutex		lock;

>  	struct rb_root		dma_list;

> @@ -92,6 +93,12 @@ struct vfio_group {

>  	struct list_head	next;

>  };

>  

> +struct vfio_iova {

> +	struct list_head	list;

> +	phys_addr_t		start;

> +	phys_addr_t		end;

> +};


dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA
addresses, why are we using phys_addr_t?

> +

>  /*

>   * Guest RAM pinning working set or DMA target

>   */

> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)

>  	return ret;

>  }

>  

> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,

> +				struct list_head *head)

> +{

> +	struct vfio_iova *region;

> +

> +	region = kmalloc(sizeof(*region), GFP_KERNEL);

> +	if (!region)

> +		return -ENOMEM;

> +

> +	INIT_LIST_HEAD(&region->list);

> +	region->start = start;

> +	region->end = end;

> +

> +	list_add_tail(&region->list, head);

> +	return 0;

> +}


As I'm reading through this series, I'm learning that there are a lot
of assumptions and subtle details that should be documented.  For
instance, the IOMMU API only provides a single geometry and we build
upon that here as this patch creates a list, but there's only a single
entry for now.  The following patches carve that single iova range into
pieces and somewhat subtly use the list_head passed to keep the list
sorted, allowing the first/last_entry tricks used throughout.  Subtle
interfaces are prone to bugs.

> +

> +/*

> + * Find whether a mem region overlaps with existing dma mappings

> + */

> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,

> +				  phys_addr_t start, phys_addr_t end)

> +{

> +	struct rb_node *n = rb_first(&iommu->dma_list);

> +

> +	for (; n; n = rb_next(n)) {

> +		struct vfio_dma *dma;

> +

> +		dma = rb_entry(n, struct vfio_dma, node);

> +

> +		if (end < dma->iova)

> +			break;

> +		if (start >= dma->iova + dma->size)

> +			continue;

> +		return true;

> +	}

> +

> +	return false;

> +}


Why do we need this in addition to the existing vfio_find_dma()?  Why
doesn't this use the tree structure of the dma_list?

> +

> +/*

> + * Check the new iommu aperture is a valid one

> + */

> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,

> +				     phys_addr_t start,

> +				     phys_addr_t end)

> +{

> +	struct vfio_iova *first, *last;

> +	struct list_head *iova = &iommu->iova_list;

> +

> +	if (list_empty(iova))

> +		return 0;

> +

> +	/* Check if new one is outside the current aperture */


"Disjoint sets"

> +	first = list_first_entry(iova, struct vfio_iova, list);

> +	last = list_last_entry(iova, struct vfio_iova, list);

> +	if ((start > last->end) || (end < first->start))

> +		return -EINVAL;

> +

> +	/* Check for any existing dma mappings outside the new start */

> +	if (start > first->start) {

> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))

> +			return -EINVAL;

> +	}

> +

> +	/* Check for any existing dma mappings outside the new end */

> +	if (end < last->end) {

> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))

> +			return -EINVAL;

> +	}

> +

> +	return 0;

> +}


I think this returns an int because you want to use it for the return
value below, but it really seems like a bool question, ie. does this
aperture conflict with existing mappings.  Additionally, the aperture
is valid, it was provided to us by the IOMMU API, the question is
whether it conflicts.  Please also name consistently to the other
functions in this patch, vfio_iommu_aper_xxxx().

> +

> +/*

> + * Adjust the iommu aperture window if new aperture is a valid one

> + */

> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,

> +				      phys_addr_t start,

> +				      phys_addr_t end)


Perhaps "resize", "prune", or "shrink" to make it more clear what is
being adjusted?

> +{

> +	struct vfio_iova *node, *next;

> +	struct list_head *iova = &iommu->iova_list;

> +

> +	if (list_empty(iova))

> +		return vfio_insert_iova(start, end, iova);

> +

> +	/* Adjust iova list start */

> +	list_for_each_entry_safe(node, next, iova, list) {

> +		if (start < node->start)

> +			break;

> +		if ((start >= node->start) && (start <= node->end)) {


start == node->end results in a zero sized node.  s/<=/</

> +			node->start = start;

> +			break;

> +		}

> +		/* Delete nodes before new start */

> +		list_del(&node->list);

> +		kfree(node);

> +	}

> +

> +	/* Adjust iova list end */

> +	list_for_each_entry_safe(node, next, iova, list) {

> +		if (end > node->end)

> +			continue;

> +

> +		if ((end >= node->start) && (end <= node->end)) {


end == node->start results in a zero sized node.  s/>=/>/

> +			node->end = end;

> +			continue;

> +		}

> +		/* Delete nodes after new end */

> +		list_del(&node->list);

> +		kfree(node);

> +	}

> +

> +	return 0;

> +}

> +

>  static int vfio_iommu_type1_attach_group(void *iommu_data,

>  					 struct iommu_group *iommu_group)

>  {

> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,

>  	int ret;

>  	bool resv_msi, msi_remap;

>  	phys_addr_t resv_msi_base;

> +	struct iommu_domain_geometry geo;

>  

>  	mutex_lock(&iommu->lock);

>  

> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,

>  	if (ret)

>  		goto out_domain;

>  

> +	/* Get aperture info */

> +	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);

> +

> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,

> +					geo.aperture_end);

> +	if (ret)

> +		goto out_detach;

> +

>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

>  

>  	INIT_LIST_HEAD(&domain->group_list);

> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,

>  			goto out_detach;

>  	}

>  

> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,

> +					  geo.aperture_end);

> +	if (ret)

> +		goto out_detach;

> +

>  	list_add(&domain->next, &iommu->domain_list);

>  

>  	mutex_unlock(&iommu->lock);

> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)

>  	WARN_ON(iommu->notifier.head);

>  }

>  

> +/*

> + * Called when a domain is removed in detach. It is possible that

> + * the removed domain decided the iova aperture window. Modify the

> + * iova aperture with the smallest window among existing domains.

> + */

> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)

> +{

> +	struct vfio_domain *domain;

> +	struct iommu_domain_geometry geo;

> +	struct vfio_iova *node;

> +	phys_addr_t start = 0;

> +	phys_addr_t end = (phys_addr_t)~0;

> +

> +	list_for_each_entry(domain, &iommu->domain_list, next) {

> +		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,

> +				      &geo);

> +			if (geo.aperture_start > start)

> +				start = geo.aperture_start;

> +			if (geo.aperture_end < end)

> +				end = geo.aperture_end;

> +	}

> +

> +	/* modify iova aperture limits */

> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);

> +	node->start = start;

> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);

> +	node->end = end;


We can do this because the new aperture is the same or bigger than the
current aperture, never smaller.  That's not fully obvious and should
be noted in the comment.  Perhaps this function should be "expand"
rather than "refresh".

> +}

> +

>  static void vfio_iommu_type1_detach_group(void *iommu_data,

>  					  struct iommu_group *iommu_group)

>  {

> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,

>  			iommu_domain_free(domain->domain);

>  			list_del(&domain->next);

>  			kfree(domain);

> +			vfio_iommu_iova_aper_refresh(iommu);

>  		}

>  		break;

>  	}

> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)

>  	}

>  

>  	INIT_LIST_HEAD(&iommu->domain_list);

> +	INIT_LIST_HEAD(&iommu->iova_list);

>  	iommu->dma_list = RB_ROOT;

>  	mutex_init(&iommu->lock);

>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);

> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void *iommu_data)

>  {

>  	struct vfio_iommu *iommu = iommu_data;

>  	struct vfio_domain *domain, *domain_tmp;

> +	struct vfio_iova *iova, *iova_tmp;

>  

>  	if (iommu->external_domain) {

>  		vfio_release_domain(iommu->external_domain, true);

> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void *iommu_data)

>  		list_del(&domain->next);

>  		kfree(domain);

>  	}

> +

> +	list_for_each_entry_safe(iova, iova_tmp,

> +				 &iommu->iova_list, list) {

> +		list_del(&iova->list);

> +		kfree(iova);

> +	}

> +

>  	kfree(iommu);

>  }

>
Shameerali Kolothum Thodi Jan. 19, 2018, 9:47 a.m. UTC | #2
Hi Alex,

> -----Original Message-----

> From: Alex Williamson [mailto:alex.williamson@redhat.com]

> Sent: Thursday, January 18, 2018 12:05 AM

> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>

> Cc: eric.auger@redhat.com; pmorel@linux.vnet.ibm.com;

> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm

> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)

> <xuwei5@huawei.com>

> Subject: Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu

> aperture validity check

> 

> On Fri, 12 Jan 2018 16:45:27 +0000

> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> 

> > This introduces an iova list that is valid for dma mappings. Make

> > sure the new iommu aperture window is valid and doesn't conflict

> > with any existing dma mappings during attach. Also update the iova

> > list with new aperture window during attach/detach.

> >

> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

> > ---

> >  drivers/vfio/vfio_iommu_type1.c | 177

> ++++++++++++++++++++++++++++++++++++++++

> >  1 file changed, 177 insertions(+)

> >

> > diff --git a/drivers/vfio/vfio_iommu_type1.c

> b/drivers/vfio/vfio_iommu_type1.c

> > index e30e29a..11cbd49 100644

> > --- a/drivers/vfio/vfio_iommu_type1.c

> > +++ b/drivers/vfio/vfio_iommu_type1.c

> > @@ -60,6 +60,7 @@

> >

> >  struct vfio_iommu {

> >  	struct list_head	domain_list;

> > +	struct list_head	iova_list;

> >  	struct vfio_domain	*external_domain; /* domain for external user

> */

> >  	struct mutex		lock;

> >  	struct rb_root		dma_list;

> > @@ -92,6 +93,12 @@ struct vfio_group {

> >  	struct list_head	next;

> >  };

> >

> > +struct vfio_iova {

> > +	struct list_head	list;

> > +	phys_addr_t		start;

> > +	phys_addr_t		end;

> > +};

> 

> dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA

> addresses, why are we using phys_addr_t?


Ok. I will change that to dma_addr_t.

> > +

> >  /*

> >   * Guest RAM pinning working set or DMA target

> >   */

> > @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct

> iommu_group *group, phys_addr_t *base)

> >  	return ret;

> >  }

> >

> > +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,

> > +				struct list_head *head)

> > +{

> > +	struct vfio_iova *region;

> > +

> > +	region = kmalloc(sizeof(*region), GFP_KERNEL);

> > +	if (!region)

> > +		return -ENOMEM;

> > +

> > +	INIT_LIST_HEAD(&region->list);

> > +	region->start = start;

> > +	region->end = end;

> > +

> > +	list_add_tail(&region->list, head);

> > +	return 0;

> > +}

> 

> As I'm reading through this series, I'm learning that there are a lot

> of assumptions and subtle details that should be documented.  For

> instance, the IOMMU API only provides a single geometry and we build

> upon that here as this patch creates a list, but there's only a single

> entry for now.  The following patches carve that single iova range into

> pieces and somewhat subtly use the list_head passed to keep the list

> sorted, allowing the first/last_entry tricks used throughout.  Subtle

> interfaces are prone to bugs.


Agree. The iova list management logic needs to be documented properly.
I will address this in next revision.

> > +

> > +/*

> > + * Find whether a mem region overlaps with existing dma mappings

> > + */

> > +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,

> > +				  phys_addr_t start, phys_addr_t end)

> > +{

> > +	struct rb_node *n = rb_first(&iommu->dma_list);

> > +

> > +	for (; n; n = rb_next(n)) {

> > +		struct vfio_dma *dma;

> > +

> > +		dma = rb_entry(n, struct vfio_dma, node);

> > +

> > +		if (end < dma->iova)

> > +			break;

> > +		if (start >= dma->iova + dma->size)

> > +			continue;

> > +		return true;

> > +	}

> > +

> > +	return false;

> > +}

> 

> Why do we need this in addition to the existing vfio_find_dma()?  Why

> doesn't this use the tree structure of the dma_list?


Ok. I will take a look at the vfio_find_dma().

> > +

> > +/*

> > + * Check the new iommu aperture is a valid one

> > + */

> > +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,

> > +				     phys_addr_t start,

> > +				     phys_addr_t end)

> > +{

> > +	struct vfio_iova *first, *last;

> > +	struct list_head *iova = &iommu->iova_list;

> > +

> > +	if (list_empty(iova))

> > +		return 0;

> > +

> > +	/* Check if new one is outside the current aperture */

> 

> "Disjoint sets"

> 

> > +	first = list_first_entry(iova, struct vfio_iova, list);

> > +	last = list_last_entry(iova, struct vfio_iova, list);

> > +	if ((start > last->end) || (end < first->start))

> > +		return -EINVAL;

> > +

> > +	/* Check for any existing dma mappings outside the new start */

> > +	if (start > first->start) {

> > +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))

> > +			return -EINVAL;

> > +	}

> > +

> > +	/* Check for any existing dma mappings outside the new end */

> > +	if (end < last->end) {

> > +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))

> > +			return -EINVAL;

> > +	}

> > +

> > +	return 0;

> > +}

> 

> I think this returns an int because you want to use it for the return

> value below, but it really seems like a bool question, ie. does this

> aperture conflict with existing mappings.  Additionally, the aperture

> is valid, it was provided to us by the IOMMU API, the question is

> whether it conflicts.  Please also name consistently to the other

> functions in this patch, vfio_iommu_aper_xxxx().


Sure.

> > +

> > +/*

> > + * Adjust the iommu aperture window if new aperture is a valid one

> > + */

> > +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,

> > +				      phys_addr_t start,

> > +				      phys_addr_t end)

> 

> Perhaps "resize", "prune", or "shrink" to make it more clear what is

> being adjusted?


Ok.

> > +{

> > +	struct vfio_iova *node, *next;

> > +	struct list_head *iova = &iommu->iova_list;

> > +

> > +	if (list_empty(iova))

> > +		return vfio_insert_iova(start, end, iova);

> > +

> > +	/* Adjust iova list start */

> > +	list_for_each_entry_safe(node, next, iova, list) {

> > +		if (start < node->start)

> > +			break;

> > +		if ((start >= node->start) && (start <= node->end)) {

> 

> start == node->end results in a zero sized node.  s/<=/</


Ok.

> 

> > +			node->start = start;

> > +			break;

> > +		}

> > +		/* Delete nodes before new start */

> > +		list_del(&node->list);

> > +		kfree(node);

> > +	}

> > +

> > +	/* Adjust iova list end */

> > +	list_for_each_entry_safe(node, next, iova, list) {

> > +		if (end > node->end)

> > +			continue;

> > +

> > +		if ((end >= node->start) && (end <= node->end)) {

> 

> end == node->start results in a zero sized node.  s/>=/>/


Ok.

> > +			node->end = end;

> > +			continue;

> > +		}

> > +		/* Delete nodes after new end */

> > +		list_del(&node->list);

> > +		kfree(node);

> > +	}

> > +

> > +	return 0;

> > +}

> > +

> >  static int vfio_iommu_type1_attach_group(void *iommu_data,

> >  					 struct iommu_group *iommu_group)

> >  {

> > @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void

> *iommu_data,

> >  	int ret;

> >  	bool resv_msi, msi_remap;

> >  	phys_addr_t resv_msi_base;

> > +	struct iommu_domain_geometry geo;

> >

> >  	mutex_lock(&iommu->lock);

> >

> > @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void

> *iommu_data,

> >  	if (ret)

> >  		goto out_domain;

> >

> > +	/* Get aperture info */

> > +	iommu_domain_get_attr(domain->domain,

> DOMAIN_ATTR_GEOMETRY, &geo);

> > +

> > +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,

> > +					geo.aperture_end);

> > +	if (ret)

> > +		goto out_detach;

> > +

> >  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

> >

> >  	INIT_LIST_HEAD(&domain->group_list);

> > @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void

> *iommu_data,

> >  			goto out_detach;

> >  	}

> >

> > +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,

> > +					  geo.aperture_end);

> > +	if (ret)

> > +		goto out_detach;

> > +

> >  	list_add(&domain->next, &iommu->domain_list);

> >

> >  	mutex_unlock(&iommu->lock);

> > @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct

> vfio_iommu *iommu)

> >  	WARN_ON(iommu->notifier.head);

> >  }

> >

> > +/*

> > + * Called when a domain is removed in detach. It is possible that

> > + * the removed domain decided the iova aperture window. Modify the

> > + * iova aperture with the smallest window among existing domains.

> > + */

> > +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)

> > +{

> > +	struct vfio_domain *domain;

> > +	struct iommu_domain_geometry geo;

> > +	struct vfio_iova *node;

> > +	phys_addr_t start = 0;

> > +	phys_addr_t end = (phys_addr_t)~0;

> > +

> > +	list_for_each_entry(domain, &iommu->domain_list, next) {

> > +		iommu_domain_get_attr(domain->domain,

> DOMAIN_ATTR_GEOMETRY,

> > +				      &geo);

> > +			if (geo.aperture_start > start)

> > +				start = geo.aperture_start;

> > +			if (geo.aperture_end < end)

> > +				end = geo.aperture_end;

> > +	}

> > +

> > +	/* modify iova aperture limits */

> > +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);

> > +	node->start = start;

> > +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);

> > +	node->end = end;

> 

> We can do this because the new aperture is the same or bigger than the

> current aperture, never smaller.  That's not fully obvious and should

> be noted in the comment.  Perhaps this function should be "expand"

> rather than "refresh".


Ok. Based on the comments from patch#2, I will remove this aperture expand
logic as reserved region conflict handling for the expanded aperture might fail.
Looks like it is better to leave the smaller aperture so that the user can continue.

Thanks for going through this.

Shameer
 
> > +}

> > +

> >  static void vfio_iommu_type1_detach_group(void *iommu_data,

> >  					  struct iommu_group *iommu_group)

> >  {

> > @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void

> *iommu_data,

> >  			iommu_domain_free(domain->domain);

> >  			list_del(&domain->next);

> >  			kfree(domain);

> > +			vfio_iommu_iova_aper_refresh(iommu);

> >  		}

> >  		break;

> >  	}

> > @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned

> long arg)

> >  	}

> >

> >  	INIT_LIST_HEAD(&iommu->domain_list);

> > +	INIT_LIST_HEAD(&iommu->iova_list);

> >  	iommu->dma_list = RB_ROOT;

> >  	mutex_init(&iommu->lock);

> >  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);

> > @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void

> *iommu_data)

> >  {

> >  	struct vfio_iommu *iommu = iommu_data;

> >  	struct vfio_domain *domain, *domain_tmp;

> > +	struct vfio_iova *iova, *iova_tmp;

> >

> >  	if (iommu->external_domain) {

> >  		vfio_release_domain(iommu->external_domain, true);

> > @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void

> *iommu_data)

> >  		list_del(&domain->next);

> >  		kfree(domain);

> >  	}

> > +

> > +	list_for_each_entry_safe(iova, iova_tmp,

> > +				 &iommu->iova_list, list) {

> > +		list_del(&iova->list);

> > +		kfree(iova);

> > +	}

> > +

> >  	kfree(iommu);

> >  }

> >
Eric Auger Jan. 23, 2018, 8:25 a.m. UTC | #3
Hi Shameer,

On 18/01/18 01:04, Alex Williamson wrote:
> On Fri, 12 Jan 2018 16:45:27 +0000

> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> 

>> This introduces an iova list that is valid for dma mappings. Make

>> sure the new iommu aperture window is valid and doesn't conflict

>> with any existing dma mappings during attach. Also update the iova

>> list with new aperture window during attach/detach.

>>

>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

>> ---

>>  drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++

>>  1 file changed, 177 insertions(+)

>>

>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>> index e30e29a..11cbd49 100644

>> --- a/drivers/vfio/vfio_iommu_type1.c

>> +++ b/drivers/vfio/vfio_iommu_type1.c

>> @@ -60,6 +60,7 @@

>>  

>>  struct vfio_iommu {

>>  	struct list_head	domain_list;

>> +	struct list_head	iova_list;

>>  	struct vfio_domain	*external_domain; /* domain for external user */

>>  	struct mutex		lock;

>>  	struct rb_root		dma_list;

>> @@ -92,6 +93,12 @@ struct vfio_group {

>>  	struct list_head	next;

>>  };

>>  

>> +struct vfio_iova {

>> +	struct list_head	list;

>> +	phys_addr_t		start;

>> +	phys_addr_t		end;

>> +};

> 

> dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA

> addresses, why are we using phys_addr_t?

> 

>> +

>>  /*

>>   * Guest RAM pinning working set or DMA target

>>   */

>> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)

>>  	return ret;

>>  }

>>  

>> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,

>> +				struct list_head *head)

>> +{

>> +	struct vfio_iova *region;

>> +

>> +	region = kmalloc(sizeof(*region), GFP_KERNEL);

>> +	if (!region)

>> +		return -ENOMEM;

>> +

>> +	INIT_LIST_HEAD(&region->list);

>> +	region->start = start;

>> +	region->end = end;

>> +

>> +	list_add_tail(&region->list, head);

>> +	return 0;

>> +}

> 

> As I'm reading through this series, I'm learning that there are a lot

> of assumptions and subtle details that should be documented.  For

> instance, the IOMMU API only provides a single geometry and we build

> upon that here as this patch creates a list, but there's only a single

> entry for now.  The following patches carve that single iova range into

> pieces and somewhat subtly use the list_head passed to keep the list

> sorted, allowing the first/last_entry tricks used throughout.  Subtle

> interfaces are prone to bugs.

> 

>> +

>> +/*

>> + * Find whether a mem region overlaps with existing dma mappings

>> + */

>> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,

>> +				  phys_addr_t start, phys_addr_t end)

>> +{

>> +	struct rb_node *n = rb_first(&iommu->dma_list);

>> +

>> +	for (; n; n = rb_next(n)) {

>> +		struct vfio_dma *dma;

>> +

>> +		dma = rb_entry(n, struct vfio_dma, node);

>> +

>> +		if (end < dma->iova)

>> +			break;

>> +		if (start >= dma->iova + dma->size)

>> +			continue;

>> +		return true;

>> +	}

>> +

>> +	return false;

>> +}

> 

> Why do we need this in addition to the existing vfio_find_dma()?  Why

> doesn't this use the tree structure of the dma_list?

> 

>> +

>> +/*

>> + * Check the new iommu aperture is a valid one

>> + */

>> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,

>> +				     phys_addr_t start,

>> +				     phys_addr_t end)

>> +{

>> +	struct vfio_iova *first, *last;

>> +	struct list_head *iova = &iommu->iova_list;

>> +

>> +	if (list_empty(iova))

>> +		return 0;

>> +

>> +	/* Check if new one is outside the current aperture */

> 

> "Disjoint sets"

> 

>> +	first = list_first_entry(iova, struct vfio_iova, list);

>> +	last = list_last_entry(iova, struct vfio_iova, list);

>> +	if ((start > last->end) || (end < first->start))

>> +		return -EINVAL;

>> +

>> +	/* Check for any existing dma mappings outside the new start */

>> +	if (start > first->start) {

>> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))

>> +			return -EINVAL;

>> +	}

>> +

>> +	/* Check for any existing dma mappings outside the new end */

>> +	if (end < last->end) {

>> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))

>> +			return -EINVAL;

>> +	}

>> +

>> +	return 0;

>> +}

> 

> I think this returns an int because you want to use it for the return

> value below, but it really seems like a bool question, ie. does this

> aperture conflict with existing mappings.  Additionally, the aperture

> is valid, it was provided to us by the IOMMU API, the question is

> whether it conflicts.  Please also name consistently to the other

> functions in this patch, vfio_iommu_aper_xxxx().

> 

>> +

>> +/*

>> + * Adjust the iommu aperture window if new aperture is a valid one

>> + */

>> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,

>> +				      phys_addr_t start,

>> +				      phys_addr_t end)

> 

> Perhaps "resize", "prune", or "shrink" to make it more clear what is

> being adjusted?

> 

>> +{

>> +	struct vfio_iova *node, *next;

>> +	struct list_head *iova = &iommu->iova_list;

>> +

>> +	if (list_empty(iova))

>> +		return vfio_insert_iova(start, end, iova);

>> +

>> +	/* Adjust iova list start */

>> +	list_for_each_entry_safe(node, next, iova, list) {

>> +		if (start < node->start)

>> +			break;

>> +		if ((start >= node->start) && (start <= node->end)) {

> 

> start == node->end results in a zero sized node.  s/<=/</

> 

>> +			node->start = start;

>> +			break;

>> +		}

>> +		/* Delete nodes before new start */

>> +		list_del(&node->list);

>> +		kfree(node);

>> +	}

>> +

>> +	/* Adjust iova list end */

>> +	list_for_each_entry_safe(node, next, iova, list) {

>> +		if (end > node->end)

>> +			continue;

>> +

>> +		if ((end >= node->start) && (end <= node->end)) {

> 

> end == node->start results in a zero sized node.  s/>=/>/

> 

>> +			node->end = end;

>> +			continue;

>> +		}

>> +		/* Delete nodes after new end */

>> +		list_del(&node->list);

>> +		kfree(node);

>> +	}

>> +

>> +	return 0;

>> +}

>> +

>>  static int vfio_iommu_type1_attach_group(void *iommu_data,

>>  					 struct iommu_group *iommu_group)

>>  {

>> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,

>>  	int ret;

>>  	bool resv_msi, msi_remap;

>>  	phys_addr_t resv_msi_base;

>> +	struct iommu_domain_geometry geo;

>>  

>>  	mutex_lock(&iommu->lock);

>>  

>> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,

>>  	if (ret)

>>  		goto out_domain;

>>  

>> +	/* Get aperture info */

>> +	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);

>> +

>> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,

>> +					geo.aperture_end);

>> +	if (ret)

>> +		goto out_detach;

>> +

>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

>>  

>>  	INIT_LIST_HEAD(&domain->group_list);

>> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,

>>  			goto out_detach;

>>  	}

>>  

>> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,

>> +					  geo.aperture_end);

>> +	if (ret)

>> +		goto out_detach;

>> +

>>  	list_add(&domain->next, &iommu->domain_list);

>>  

>>  	mutex_unlock(&iommu->lock);

>> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)

>>  	WARN_ON(iommu->notifier.head);

>>  }

>>  

>> +/*

>> + * Called when a domain is removed in detach. It is possible that

>> + * the removed domain decided the iova aperture window. Modify the

>> + * iova aperture with the smallest window among existing domains.

>> + */

>> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)

>> +{

>> +	struct vfio_domain *domain;

>> +	struct iommu_domain_geometry geo;

>> +	struct vfio_iova *node;

>> +	phys_addr_t start = 0;

>> +	phys_addr_t end = (phys_addr_t)~0;

>> +

>> +	list_for_each_entry(domain, &iommu->domain_list, next) {

>> +		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,

>> +				      &geo);

>> +			if (geo.aperture_start > start)

>> +				start = geo.aperture_start;

>> +			if (geo.aperture_end < end)

>> +				end = geo.aperture_end;

>> +	}

>> +

>> +	/* modify iova aperture limits */

>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);

>> +	node->start = start;

>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);

>> +	node->end = end;

> 

> We can do this because the new aperture is the same or bigger than the

> current aperture, never smaller.  That's not fully obvious and should

> be noted in the comment.  Perhaps this function should be "expand"

> rather than "refresh".

This one is not obvious to me either:
assuming you have 2 domains, resp with aperture 1 and 2, resulting into
aperture 3. Holes are created by resv regions for instance. If you
remove domain 1, don't you get 4) instead of 2)?

1)   |------------|
 +
2) |---|    |--|       |-----|
=
3)   |-|    |--|


4) |---|    |----------------|

Thanks

Eric
> 

>> +}

>> +

>>  static void vfio_iommu_type1_detach_group(void *iommu_data,

>>  					  struct iommu_group *iommu_group)

>>  {

>> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,

>>  			iommu_domain_free(domain->domain);

>>  			list_del(&domain->next);

>>  			kfree(domain);

>> +			vfio_iommu_iova_aper_refresh(iommu);

>>  		}

>>  		break;

>>  	}

>> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)

>>  	}

>>  

>>  	INIT_LIST_HEAD(&iommu->domain_list);

>> +	INIT_LIST_HEAD(&iommu->iova_list);

>>  	iommu->dma_list = RB_ROOT;

>>  	mutex_init(&iommu->lock);

>>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);

>> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void *iommu_data)

>>  {

>>  	struct vfio_iommu *iommu = iommu_data;

>>  	struct vfio_domain *domain, *domain_tmp;

>> +	struct vfio_iova *iova, *iova_tmp;

>>  

>>  	if (iommu->external_domain) {

>>  		vfio_release_domain(iommu->external_domain, true);

>> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void *iommu_data)

>>  		list_del(&domain->next);

>>  		kfree(domain);

>>  	}

>> +

>> +	list_for_each_entry_safe(iova, iova_tmp,

>> +				 &iommu->iova_list, list) {

>> +		list_del(&iova->list);

>> +		kfree(iova);

>> +	}

>> +

>>  	kfree(iommu);

>>  }

>>  

>
Shameerali Kolothum Thodi Jan. 23, 2018, 10:04 a.m. UTC | #4
Hi Eric,

> -----Original Message-----

> From: Auger Eric [mailto:eric.auger@redhat.com]

> Sent: Tuesday, January 23, 2018 8:25 AM

> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum

> Thodi <shameerali.kolothum.thodi@huawei.com>

> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>

> Subject: Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu

> aperture validity check

> 

> Hi Shameer,

> 

> On 18/01/18 01:04, Alex Williamson wrote:

> > On Fri, 12 Jan 2018 16:45:27 +0000

> > Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> >

> >> This introduces an iova list that is valid for dma mappings. Make

> >> sure the new iommu aperture window is valid and doesn't conflict

> >> with any existing dma mappings during attach. Also update the iova

> >> list with new aperture window during attach/detach.

> >>

> >> Signed-off-by: Shameer Kolothum

> <shameerali.kolothum.thodi@huawei.com>

> >> ---

> >>  drivers/vfio/vfio_iommu_type1.c | 177

> ++++++++++++++++++++++++++++++++++++++++

> >>  1 file changed, 177 insertions(+)

> >>

> >> diff --git a/drivers/vfio/vfio_iommu_type1.c

> b/drivers/vfio/vfio_iommu_type1.c

> >> index e30e29a..11cbd49 100644

> >> --- a/drivers/vfio/vfio_iommu_type1.c

> >> +++ b/drivers/vfio/vfio_iommu_type1.c

> >> @@ -60,6 +60,7 @@

> >>

> >>  struct vfio_iommu {

> >>  	struct list_head	domain_list;

> >> +	struct list_head	iova_list;

> >>  	struct vfio_domain	*external_domain; /* domain for external user

> */

> >>  	struct mutex		lock;

> >>  	struct rb_root		dma_list;

> >> @@ -92,6 +93,12 @@ struct vfio_group {

> >>  	struct list_head	next;

> >>  };

> >>

> >> +struct vfio_iova {

> >> +	struct list_head	list;

> >> +	phys_addr_t		start;

> >> +	phys_addr_t		end;

> >> +};

> >

> > dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA

> > addresses, why are we using phys_addr_t?

> >

> >> +

> >>  /*

> >>   * Guest RAM pinning working set or DMA target

> >>   */

> >> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct

> iommu_group *group, phys_addr_t *base)

> >>  	return ret;

> >>  }

> >>

> >> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,

> >> +				struct list_head *head)

> >> +{

> >> +	struct vfio_iova *region;

> >> +

> >> +	region = kmalloc(sizeof(*region), GFP_KERNEL);

> >> +	if (!region)

> >> +		return -ENOMEM;

> >> +

> >> +	INIT_LIST_HEAD(&region->list);

> >> +	region->start = start;

> >> +	region->end = end;

> >> +

> >> +	list_add_tail(&region->list, head);

> >> +	return 0;

> >> +}

> >

> > As I'm reading through this series, I'm learning that there are a lot

> > of assumptions and subtle details that should be documented.  For

> > instance, the IOMMU API only provides a single geometry and we build

> > upon that here as this patch creates a list, but there's only a single

> > entry for now.  The following patches carve that single iova range into

> > pieces and somewhat subtly use the list_head passed to keep the list

> > sorted, allowing the first/last_entry tricks used throughout.  Subtle

> > interfaces are prone to bugs.

> >

> >> +

> >> +/*

> >> + * Find whether a mem region overlaps with existing dma mappings

> >> + */

> >> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,

> >> +				  phys_addr_t start, phys_addr_t end)

> >> +{

> >> +	struct rb_node *n = rb_first(&iommu->dma_list);

> >> +

> >> +	for (; n; n = rb_next(n)) {

> >> +		struct vfio_dma *dma;

> >> +

> >> +		dma = rb_entry(n, struct vfio_dma, node);

> >> +

> >> +		if (end < dma->iova)

> >> +			break;

> >> +		if (start >= dma->iova + dma->size)

> >> +			continue;

> >> +		return true;

> >> +	}

> >> +

> >> +	return false;

> >> +}

> >

> > Why do we need this in addition to the existing vfio_find_dma()?  Why

> > doesn't this use the tree structure of the dma_list?

> >

> >> +

> >> +/*

> >> + * Check the new iommu aperture is a valid one

> >> + */

> >> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,

> >> +				     phys_addr_t start,

> >> +				     phys_addr_t end)

> >> +{

> >> +	struct vfio_iova *first, *last;

> >> +	struct list_head *iova = &iommu->iova_list;

> >> +

> >> +	if (list_empty(iova))

> >> +		return 0;

> >> +

> >> +	/* Check if new one is outside the current aperture */

> >

> > "Disjoint sets"

> >

> >> +	first = list_first_entry(iova, struct vfio_iova, list);

> >> +	last = list_last_entry(iova, struct vfio_iova, list);

> >> +	if ((start > last->end) || (end < first->start))

> >> +		return -EINVAL;

> >> +

> >> +	/* Check for any existing dma mappings outside the new start */

> >> +	if (start > first->start) {

> >> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))

> >> +			return -EINVAL;

> >> +	}

> >> +

> >> +	/* Check for any existing dma mappings outside the new end */

> >> +	if (end < last->end) {

> >> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))

> >> +			return -EINVAL;

> >> +	}

> >> +

> >> +	return 0;

> >> +}

> >

> > I think this returns an int because you want to use it for the return

> > value below, but it really seems like a bool question, ie. does this

> > aperture conflict with existing mappings.  Additionally, the aperture

> > is valid, it was provided to us by the IOMMU API, the question is

> > whether it conflicts.  Please also name consistently to the other

> > functions in this patch, vfio_iommu_aper_xxxx().

> >

> >> +

> >> +/*

> >> + * Adjust the iommu aperture window if new aperture is a valid one

> >> + */

> >> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,

> >> +				      phys_addr_t start,

> >> +				      phys_addr_t end)

> >

> > Perhaps "resize", "prune", or "shrink" to make it more clear what is

> > being adjusted?

> >

> >> +{

> >> +	struct vfio_iova *node, *next;

> >> +	struct list_head *iova = &iommu->iova_list;

> >> +

> >> +	if (list_empty(iova))

> >> +		return vfio_insert_iova(start, end, iova);

> >> +

> >> +	/* Adjust iova list start */

> >> +	list_for_each_entry_safe(node, next, iova, list) {

> >> +		if (start < node->start)

> >> +			break;

> >> +		if ((start >= node->start) && (start <= node->end)) {

> >

> > start == node->end results in a zero sized node.  s/<=/</

> >

> >> +			node->start = start;

> >> +			break;

> >> +		}

> >> +		/* Delete nodes before new start */

> >> +		list_del(&node->list);

> >> +		kfree(node);

> >> +	}

> >> +

> >> +	/* Adjust iova list end */

> >> +	list_for_each_entry_safe(node, next, iova, list) {

> >> +		if (end > node->end)

> >> +			continue;

> >> +

> >> +		if ((end >= node->start) && (end <= node->end)) {

> >

> > end == node->start results in a zero sized node.  s/>=/>/

> >

> >> +			node->end = end;

> >> +			continue;

> >> +		}

> >> +		/* Delete nodes after new end */

> >> +		list_del(&node->list);

> >> +		kfree(node);

> >> +	}

> >> +

> >> +	return 0;

> >> +}

> >> +

> >>  static int vfio_iommu_type1_attach_group(void *iommu_data,

> >>  					 struct iommu_group *iommu_group)

> >>  {

> >> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void

> *iommu_data,

> >>  	int ret;

> >>  	bool resv_msi, msi_remap;

> >>  	phys_addr_t resv_msi_base;

> >> +	struct iommu_domain_geometry geo;

> >>

> >>  	mutex_lock(&iommu->lock);

> >>

> >> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void

> *iommu_data,

> >>  	if (ret)

> >>  		goto out_domain;

> >>

> >> +	/* Get aperture info */

> >> +	iommu_domain_get_attr(domain->domain,

> DOMAIN_ATTR_GEOMETRY, &geo);

> >> +

> >> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,

> >> +					geo.aperture_end);

> >> +	if (ret)

> >> +		goto out_detach;

> >> +

> >>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

> >>

> >>  	INIT_LIST_HEAD(&domain->group_list);

> >> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void

> *iommu_data,

> >>  			goto out_detach;

> >>  	}

> >>

> >> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,

> >> +					  geo.aperture_end);

> >> +	if (ret)

> >> +		goto out_detach;

> >> +

> >>  	list_add(&domain->next, &iommu->domain_list);

> >>

> >>  	mutex_unlock(&iommu->lock);

> >> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct

> vfio_iommu *iommu)

> >>  	WARN_ON(iommu->notifier.head);

> >>  }

> >>

> >> +/*

> >> + * Called when a domain is removed in detach. It is possible that

> >> + * the removed domain decided the iova aperture window. Modify the

> >> + * iova aperture with the smallest window among existing domains.

> >> + */

> >> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)

> >> +{

> >> +	struct vfio_domain *domain;

> >> +	struct iommu_domain_geometry geo;

> >> +	struct vfio_iova *node;

> >> +	phys_addr_t start = 0;

> >> +	phys_addr_t end = (phys_addr_t)~0;

> >> +

> >> +	list_for_each_entry(domain, &iommu->domain_list, next) {

> >> +		iommu_domain_get_attr(domain->domain,

> DOMAIN_ATTR_GEOMETRY,

> >> +				      &geo);

> >> +			if (geo.aperture_start > start)

> >> +				start = geo.aperture_start;

> >> +			if (geo.aperture_end < end)

> >> +				end = geo.aperture_end;

> >> +	}

> >> +

> >> +	/* modify iova aperture limits */

> >> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);

> >> +	node->start = start;

> >> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);

> >> +	node->end = end;

> >

> > We can do this because the new aperture is the same or bigger than the

> > current aperture, never smaller.  That's not fully obvious and should

> > be noted in the comment.  Perhaps this function should be "expand"

> > rather than "refresh".

> This one is not obvious to me either:

> assuming you have 2 domains, resp with aperture 1 and 2, resulting into

> aperture 3. Holes are created by resv regions for instance. If you

> remove domain 1, don't you get 4) instead of 2)?

> 

> 1)   |------------|

>  +

> 2) |---|    |--|       |-----|

> =

> 3)   |-|    |--|

> 

> 

> 4) |---|    |----------------|


That is true partially. But please remember that this patch is not aware of
any reserved regions yet. That is introduced in patch #2. So patch #1 and #2
together, the iova aperture might looks like 4) after this function call and once 
vfio_iommu_iova_resv_refresh() in patch #2 is done, the aperture will be
back to 2).

Hope I am clear. Please let me know.

In any case, based on comments by Alex, I will be removing this aperture/reserve
refresh functions and leave the iova list as it is when a group is detached. 

Thanks,
Shameer

> Thanks

> 

> Eric

> >

> >> +}

> >> +

> >>  static void vfio_iommu_type1_detach_group(void *iommu_data,

> >>  					  struct iommu_group *iommu_group)

> >>  {

> >> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void

> *iommu_data,

> >>  			iommu_domain_free(domain->domain);

> >>  			list_del(&domain->next);

> >>  			kfree(domain);

> >> +			vfio_iommu_iova_aper_refresh(iommu);

> >>  		}

> >>  		break;

> >>  	}

> >> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned

> long arg)

> >>  	}

> >>

> >>  	INIT_LIST_HEAD(&iommu->domain_list);

> >> +	INIT_LIST_HEAD(&iommu->iova_list);

> >>  	iommu->dma_list = RB_ROOT;

> >>  	mutex_init(&iommu->lock);

> >>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);

> >> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void

> *iommu_data)

> >>  {

> >>  	struct vfio_iommu *iommu = iommu_data;

> >>  	struct vfio_domain *domain, *domain_tmp;

> >> +	struct vfio_iova *iova, *iova_tmp;

> >>

> >>  	if (iommu->external_domain) {

> >>  		vfio_release_domain(iommu->external_domain, true);

> >> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void

> *iommu_data)

> >>  		list_del(&domain->next);

> >>  		kfree(domain);

> >>  	}

> >> +

> >> +	list_for_each_entry_safe(iova, iova_tmp,

> >> +				 &iommu->iova_list, list) {

> >> +		list_del(&iova->list);

> >> +		kfree(iova);

> >> +	}

> >> +

> >>  	kfree(iommu);

> >>  }

> >>

> >
Eric Auger Jan. 23, 2018, 11:20 a.m. UTC | #5
Hi Shameer,

On 23/01/18 11:04, Shameerali Kolothum Thodi wrote:
> Hi Eric,

> 

>> -----Original Message-----

>> From: Auger Eric [mailto:eric.auger@redhat.com]

>> Sent: Tuesday, January 23, 2018 8:25 AM

>> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum

>> Thodi <shameerali.kolothum.thodi@huawei.com>

>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-

>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry

>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>

>> Subject: Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu

>> aperture validity check

>>

>> Hi Shameer,

>>

>> On 18/01/18 01:04, Alex Williamson wrote:

>>> On Fri, 12 Jan 2018 16:45:27 +0000

>>> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

>>>

>>>> This introduces an iova list that is valid for dma mappings. Make

>>>> sure the new iommu aperture window is valid and doesn't conflict

>>>> with any existing dma mappings during attach. Also update the iova

>>>> list with new aperture window during attach/detach.

>>>>

>>>> Signed-off-by: Shameer Kolothum

>> <shameerali.kolothum.thodi@huawei.com>

>>>> ---

>>>>  drivers/vfio/vfio_iommu_type1.c | 177

>> ++++++++++++++++++++++++++++++++++++++++

>>>>  1 file changed, 177 insertions(+)

>>>>

>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c

>> b/drivers/vfio/vfio_iommu_type1.c

>>>> index e30e29a..11cbd49 100644

>>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>>> @@ -60,6 +60,7 @@

>>>>

>>>>  struct vfio_iommu {

>>>>  	struct list_head	domain_list;

>>>> +	struct list_head	iova_list;

>>>>  	struct vfio_domain	*external_domain; /* domain for external user

>> */

>>>>  	struct mutex		lock;

>>>>  	struct rb_root		dma_list;

>>>> @@ -92,6 +93,12 @@ struct vfio_group {

>>>>  	struct list_head	next;

>>>>  };

>>>>

>>>> +struct vfio_iova {

>>>> +	struct list_head	list;

>>>> +	phys_addr_t		start;

>>>> +	phys_addr_t		end;

>>>> +};

>>>

>>> dma_list uses dma_addr_t for the iova.  IOVAs are naturally DMA

>>> addresses, why are we using phys_addr_t?

>>>

>>>> +

>>>>  /*

>>>>   * Guest RAM pinning working set or DMA target

>>>>   */

>>>> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct

>> iommu_group *group, phys_addr_t *base)

>>>>  	return ret;

>>>>  }

>>>>

>>>> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,

>>>> +				struct list_head *head)

>>>> +{

>>>> +	struct vfio_iova *region;

>>>> +

>>>> +	region = kmalloc(sizeof(*region), GFP_KERNEL);

>>>> +	if (!region)

>>>> +		return -ENOMEM;

>>>> +

>>>> +	INIT_LIST_HEAD(&region->list);

>>>> +	region->start = start;

>>>> +	region->end = end;

>>>> +

>>>> +	list_add_tail(&region->list, head);

>>>> +	return 0;

>>>> +}

>>>

>>> As I'm reading through this series, I'm learning that there are a lot

>>> of assumptions and subtle details that should be documented.  For

>>> instance, the IOMMU API only provides a single geometry and we build

>>> upon that here as this patch creates a list, but there's only a single

>>> entry for now.  The following patches carve that single iova range into

>>> pieces and somewhat subtly use the list_head passed to keep the list

>>> sorted, allowing the first/last_entry tricks used throughout.  Subtle

>>> interfaces are prone to bugs.

>>>

>>>> +

>>>> +/*

>>>> + * Find whether a mem region overlaps with existing dma mappings

>>>> + */

>>>> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,

>>>> +				  phys_addr_t start, phys_addr_t end)

>>>> +{

>>>> +	struct rb_node *n = rb_first(&iommu->dma_list);

>>>> +

>>>> +	for (; n; n = rb_next(n)) {

>>>> +		struct vfio_dma *dma;

>>>> +

>>>> +		dma = rb_entry(n, struct vfio_dma, node);

>>>> +

>>>> +		if (end < dma->iova)

>>>> +			break;

>>>> +		if (start >= dma->iova + dma->size)

>>>> +			continue;

>>>> +		return true;

>>>> +	}

>>>> +

>>>> +	return false;

>>>> +}

>>>

>>> Why do we need this in addition to the existing vfio_find_dma()?  Why

>>> doesn't this use the tree structure of the dma_list?

>>>

>>>> +

>>>> +/*

>>>> + * Check the new iommu aperture is a valid one

>>>> + */

>>>> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,

>>>> +				     phys_addr_t start,

>>>> +				     phys_addr_t end)

>>>> +{

>>>> +	struct vfio_iova *first, *last;

>>>> +	struct list_head *iova = &iommu->iova_list;

>>>> +

>>>> +	if (list_empty(iova))

>>>> +		return 0;

>>>> +

>>>> +	/* Check if new one is outside the current aperture */

>>>

>>> "Disjoint sets"

>>>

>>>> +	first = list_first_entry(iova, struct vfio_iova, list);

>>>> +	last = list_last_entry(iova, struct vfio_iova, list);

>>>> +	if ((start > last->end) || (end < first->start))

>>>> +		return -EINVAL;

>>>> +

>>>> +	/* Check for any existing dma mappings outside the new start */

>>>> +	if (start > first->start) {

>>>> +		if (vfio_find_dma_overlap(iommu, first->start, start - 1))

>>>> +			return -EINVAL;

>>>> +	}

>>>> +

>>>> +	/* Check for any existing dma mappings outside the new end */

>>>> +	if (end < last->end) {

>>>> +		if (vfio_find_dma_overlap(iommu, end + 1, last->end))

>>>> +			return -EINVAL;

>>>> +	}

>>>> +

>>>> +	return 0;

>>>> +}

>>>

>>> I think this returns an int because you want to use it for the return

>>> value below, but it really seems like a bool question, ie. does this

>>> aperture conflict with existing mappings.  Additionally, the aperture

>>> is valid, it was provided to us by the IOMMU API, the question is

>>> whether it conflicts.  Please also name consistently to the other

>>> functions in this patch, vfio_iommu_aper_xxxx().

>>>

>>>> +

>>>> +/*

>>>> + * Adjust the iommu aperture window if new aperture is a valid one

>>>> + */

>>>> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,

>>>> +				      phys_addr_t start,

>>>> +				      phys_addr_t end)

>>>

>>> Perhaps "resize", "prune", or "shrink" to make it more clear what is

>>> being adjusted?

>>>

>>>> +{

>>>> +	struct vfio_iova *node, *next;

>>>> +	struct list_head *iova = &iommu->iova_list;

>>>> +

>>>> +	if (list_empty(iova))

>>>> +		return vfio_insert_iova(start, end, iova);

>>>> +

>>>> +	/* Adjust iova list start */

>>>> +	list_for_each_entry_safe(node, next, iova, list) {

>>>> +		if (start < node->start)

>>>> +			break;

>>>> +		if ((start >= node->start) && (start <= node->end)) {

>>>

>>> start == node->end results in a zero sized node.  s/<=/</

>>>

>>>> +			node->start = start;

>>>> +			break;

>>>> +		}

>>>> +		/* Delete nodes before new start */

>>>> +		list_del(&node->list);

>>>> +		kfree(node);

>>>> +	}

>>>> +

>>>> +	/* Adjust iova list end */

>>>> +	list_for_each_entry_safe(node, next, iova, list) {

>>>> +		if (end > node->end)

>>>> +			continue;

>>>> +

>>>> +		if ((end >= node->start) && (end <= node->end)) {

>>>

>>> end == node->start results in a zero sized node.  s/>=/>/

>>>

>>>> +			node->end = end;

>>>> +			continue;

>>>> +		}

>>>> +		/* Delete nodes after new end */

>>>> +		list_del(&node->list);

>>>> +		kfree(node);

>>>> +	}

>>>> +

>>>> +	return 0;

>>>> +}

>>>> +

>>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,

>>>>  					 struct iommu_group *iommu_group)

>>>>  {

>>>> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  	int ret;

>>>>  	bool resv_msi, msi_remap;

>>>>  	phys_addr_t resv_msi_base;

>>>> +	struct iommu_domain_geometry geo;

>>>>

>>>>  	mutex_lock(&iommu->lock);

>>>>

>>>> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  	if (ret)

>>>>  		goto out_domain;

>>>>

>>>> +	/* Get aperture info */

>>>> +	iommu_domain_get_attr(domain->domain,

>> DOMAIN_ATTR_GEOMETRY, &geo);

>>>> +

>>>> +	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,

>>>> +					geo.aperture_end);

>>>> +	if (ret)

>>>> +		goto out_detach;

>>>> +

>>>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

>>>>

>>>>  	INIT_LIST_HEAD(&domain->group_list);

>>>> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void

>> *iommu_data,

>>>>  			goto out_detach;

>>>>  	}

>>>>

>>>> +	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,

>>>> +					  geo.aperture_end);

>>>> +	if (ret)

>>>> +		goto out_detach;

>>>> +

>>>>  	list_add(&domain->next, &iommu->domain_list);

>>>>

>>>>  	mutex_unlock(&iommu->lock);

>>>> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct

>> vfio_iommu *iommu)

>>>>  	WARN_ON(iommu->notifier.head);

>>>>  }

>>>>

>>>> +/*

>>>> + * Called when a domain is removed in detach. It is possible that

>>>> + * the removed domain decided the iova aperture window. Modify the

>>>> + * iova aperture with the smallest window among existing domains.

>>>> + */

>>>> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)

>>>> +{

>>>> +	struct vfio_domain *domain;

>>>> +	struct iommu_domain_geometry geo;

>>>> +	struct vfio_iova *node;

>>>> +	phys_addr_t start = 0;

>>>> +	phys_addr_t end = (phys_addr_t)~0;

>>>> +

>>>> +	list_for_each_entry(domain, &iommu->domain_list, next) {

>>>> +		iommu_domain_get_attr(domain->domain,

>> DOMAIN_ATTR_GEOMETRY,

>>>> +				      &geo);

>>>> +			if (geo.aperture_start > start)

>>>> +				start = geo.aperture_start;

>>>> +			if (geo.aperture_end < end)

>>>> +				end = geo.aperture_end;

>>>> +	}

>>>> +

>>>> +	/* modify iova aperture limits */

>>>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);

>>>> +	node->start = start;

>>>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);

>>>> +	node->end = end;

>>>

>>> We can do this because the new aperture is the same or bigger than the

>>> current aperture, never smaller.  That's not fully obvious and should

>>> be noted in the comment.  Perhaps this function should be "expand"

>>> rather than "refresh".

>> This one is not obvious to me either:

>> assuming you have 2 domains, resp with aperture 1 and 2, resulting into

>> aperture 3. Holes are created by resv regions for instance. If you

>> remove domain 1, don't you get 4) instead of 2)?

>>

>> 1)   |------------|

>>  +

>> 2) |---|    |--|       |-----|

>> =

>> 3)   |-|    |--|

>>

>>

>> 4) |---|    |----------------|

> 

> That is true partially. But please remember that this patch is not aware of

> any reserved regions yet. That is introduced in patch #2. So patch #1 and #2

> together, the iova aperture might looks like 4) after this function call and once 

> vfio_iommu_iova_resv_refresh() in patch #2 is done, the aperture will be

> back to 2).

> 

> Hope I am clear. Please let me know.

Ah OK.
> 

> In any case, based on comments by Alex, I will be removing this aperture/reserve

> refresh functions and leave the iova list as it is when a group is detached. 

Looking forwarding to reviewing the next version then.

Thanks

Eric
> 

> Thanks,

> Shameer

> 

>> Thanks

>>

>> Eric

>>>

>>>> +}

>>>> +

>>>>  static void vfio_iommu_type1_detach_group(void *iommu_data,

>>>>  					  struct iommu_group *iommu_group)

>>>>  {

>>>> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void

>> *iommu_data,

>>>>  			iommu_domain_free(domain->domain);

>>>>  			list_del(&domain->next);

>>>>  			kfree(domain);

>>>> +			vfio_iommu_iova_aper_refresh(iommu);

>>>>  		}

>>>>  		break;

>>>>  	}

>>>> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned

>> long arg)

>>>>  	}

>>>>

>>>>  	INIT_LIST_HEAD(&iommu->domain_list);

>>>> +	INIT_LIST_HEAD(&iommu->iova_list);

>>>>  	iommu->dma_list = RB_ROOT;

>>>>  	mutex_init(&iommu->lock);

>>>>  	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);

>>>> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void

>> *iommu_data)

>>>>  {

>>>>  	struct vfio_iommu *iommu = iommu_data;

>>>>  	struct vfio_domain *domain, *domain_tmp;

>>>> +	struct vfio_iova *iova, *iova_tmp;

>>>>

>>>>  	if (iommu->external_domain) {

>>>>  		vfio_release_domain(iommu->external_domain, true);

>>>> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void

>> *iommu_data)

>>>>  		list_del(&domain->next);

>>>>  		kfree(domain);

>>>>  	}

>>>> +

>>>> +	list_for_each_entry_safe(iova, iova_tmp,

>>>> +				 &iommu->iova_list, list) {

>>>> +		list_del(&iova->list);

>>>> +		kfree(iova);

>>>> +	}

>>>> +

>>>>  	kfree(iommu);

>>>>  }

>>>>

>>>
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29a..11cbd49 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,6 +60,7 @@ 
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct list_head	iova_list;
 	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
@@ -92,6 +93,12 @@  struct vfio_group {
 	struct list_head	next;
 };
 
+struct vfio_iova {
+	struct list_head	list;
+	phys_addr_t		start;
+	phys_addr_t		end;
+};
+
 /*
  * Guest RAM pinning working set or DMA target
  */
@@ -1192,6 +1199,123 @@  static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 	return ret;
 }
 
+static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
+				struct list_head *head)
+{
+	struct vfio_iova *region;
+
+	region = kmalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&region->list);
+	region->start = start;
+	region->end = end;
+
+	list_add_tail(&region->list, head);
+	return 0;
+}
+
+/*
+ * Find whether a mem region overlaps with existing dma mappings
+ */
+static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
+				  phys_addr_t start, phys_addr_t end)
+{
+	struct rb_node *n = rb_first(&iommu->dma_list);
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+
+		if (end < dma->iova)
+			break;
+		if (start >= dma->iova + dma->size)
+			continue;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Check the new iommu aperture is a valid one
+ */
+static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
+				     phys_addr_t start,
+				     phys_addr_t end)
+{
+	struct vfio_iova *first, *last;
+	struct list_head *iova = &iommu->iova_list;
+
+	if (list_empty(iova))
+		return 0;
+
+	/* Check if new one is outside the current aperture */
+	first = list_first_entry(iova, struct vfio_iova, list);
+	last = list_last_entry(iova, struct vfio_iova, list);
+	if ((start > last->end) || (end < first->start))
+		return -EINVAL;
+
+	/* Check for any existing dma mappings outside the new start */
+	if (start > first->start) {
+		if (vfio_find_dma_overlap(iommu, first->start, start - 1))
+			return -EINVAL;
+	}
+
+	/* Check for any existing dma mappings outside the new end */
+	if (end < last->end) {
+		if (vfio_find_dma_overlap(iommu, end + 1, last->end))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Adjust the iommu aperture window if new aperture is a valid one
+ */
+static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
+				      phys_addr_t start,
+				      phys_addr_t end)
+{
+	struct vfio_iova *node, *next;
+	struct list_head *iova = &iommu->iova_list;
+
+	if (list_empty(iova))
+		return vfio_insert_iova(start, end, iova);
+
+	/* Adjust iova list start */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (start < node->start)
+			break;
+		if ((start >= node->start) && (start <= node->end)) {
+			node->start = start;
+			break;
+		}
+		/* Delete nodes before new start */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	/* Adjust iova list end */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (end > node->end)
+			continue;
+
+		if ((end >= node->start) && (end <= node->end)) {
+			node->end = end;
+			continue;
+		}
+		/* Delete nodes after new end */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	return 0;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -1202,6 +1326,7 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	int ret;
 	bool resv_msi, msi_remap;
 	phys_addr_t resv_msi_base;
+	struct iommu_domain_geometry geo;
 
 	mutex_lock(&iommu->lock);
 
@@ -1271,6 +1396,14 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_domain;
 
+	/* Get aperture info */
+	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
+
+	ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
+					geo.aperture_end);
+	if (ret)
+		goto out_detach;
+
 	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
@@ -1327,6 +1460,11 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 			goto out_detach;
 	}
 
+	ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
+					  geo.aperture_end);
+	if (ret)
+		goto out_detach;
+
 	list_add(&domain->next, &iommu->domain_list);
 
 	mutex_unlock(&iommu->lock);
@@ -1392,6 +1530,35 @@  static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
 	WARN_ON(iommu->notifier.head);
 }
 
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain decided the iova aperture window. Modify the
+ * iova aperture with the smallest window among existing domains.
+ */
+static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain;
+	struct iommu_domain_geometry geo;
+	struct vfio_iova *node;
+	phys_addr_t start = 0;
+	phys_addr_t end = (phys_addr_t)~0;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+				      &geo);
+			if (geo.aperture_start > start)
+				start = geo.aperture_start;
+			if (geo.aperture_end < end)
+				end = geo.aperture_end;
+	}
+
+	/* modify iova aperture limits */
+	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
+	node->start = start;
+	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
+	node->end = end;
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
@@ -1445,6 +1612,7 @@  static void vfio_iommu_type1_detach_group(void *iommu_data,
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
 			kfree(domain);
+			vfio_iommu_iova_aper_refresh(iommu);
 		}
 		break;
 	}
@@ -1475,6 +1643,7 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	}
 
 	INIT_LIST_HEAD(&iommu->domain_list);
+	INIT_LIST_HEAD(&iommu->iova_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
@@ -1502,6 +1671,7 @@  static void vfio_iommu_type1_release(void *iommu_data)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain, *domain_tmp;
+	struct vfio_iova *iova, *iova_tmp;
 
 	if (iommu->external_domain) {
 		vfio_release_domain(iommu->external_domain, true);
@@ -1517,6 +1687,13 @@  static void vfio_iommu_type1_release(void *iommu_data)
 		list_del(&domain->next);
 		kfree(domain);
 	}
+
+	list_for_each_entry_safe(iova, iova_tmp,
+				 &iommu->iova_list, list) {
+		list_del(&iova->list);
+		kfree(iova);
+	}
+
 	kfree(iommu);
 }