diff mbox

[RFC] vfio/type1: handle case where IOMMU does not support PAGE_SIZE size

Message ID 1446037965-2341-1-git-send-email-eric.auger@linaro.org
State New
Headers show

Commit Message

Auger Eric Oct. 28, 2015, 1:12 p.m. UTC
The current vfio_pgsize_bitmap code hides the supported IOMMU page
sizes smaller than PAGE_SIZE. As a result, if the IOMMU
does not support a PAGE_SIZE page, the alignment check on map/unmap
is done with larger page sizes, if any. This can fail although
the mapping could be done with pages smaller than PAGE_SIZE.

vfio_pgsize_bitmap is modified to expose the IOMMU page sizes
supported by all domains, even those smaller than PAGE_SIZE. The
alignment check on map/unmap is performed against PAGE_SIZE if the
minimum IOMMU page size is less than PAGE_SIZE, or against that
minimum page size otherwise (when it is greater than PAGE_SIZE).

Signed-off-by: Eric Auger <eric.auger@linaro.org>


---

This was tested on AMD Seattle with a 64kB page host. The ARM MMU 401
currently exposes 4kB, 2MB and 1GB page support. With a 64kB page host,
the map/unmap check is done against 2MB. Some alignment checks fail,
so VFIO_IOMMU_MAP_DMA fails while we could map using the 4kB IOMMU page
size.
---
 drivers/vfio/vfio_iommu_type1.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

-- 
1.9.1

Comments

Will Deacon Oct. 28, 2015, 3:37 p.m. UTC | #1
On Wed, Oct 28, 2015 at 01:12:45PM +0000, Eric Auger wrote:
> Current vfio_pgsize_bitmap code hides the supported IOMMU page

> sizes smaller than PAGE_SIZE. As a result, in case the IOMMU

> does not support PAGE_SIZE page, the alignment check on map/unmap

> is done with larger page sizes, if any. This can fail although

> mapping could be done with pages smaller than PAGE_SIZE.

> 

> vfio_pgsize_bitmap is modified to expose the IOMMU page sizes,

> supported by all domains, even those smaller than PAGE_SIZE. The

> alignment check on map is performed against PAGE_SIZE if the minimum

> IOMMU size is less than PAGE_SIZE or against the min page size greater

> than PAGE_SIZE.

> 

> Signed-off-by: Eric Auger <eric.auger@linaro.org>

> 

> ---

> 

> This was tested on AMD Seattle with 64kB page host. ARM MMU 401

> currently expose 4kB, 2MB and 1GB page support. With a 64kB page host,

> the map/unmap check is done against 2MB. Some alignment check fail

> so VFIO_IOMMU_MAP_DMA fail while we could map using 4kB IOMMU page

> size.

> ---

>  drivers/vfio/vfio_iommu_type1.c | 25 +++++++++++--------------

>  1 file changed, 11 insertions(+), 14 deletions(-)

> 

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> index 57d8c37..13fb974 100644

> --- a/drivers/vfio/vfio_iommu_type1.c

> +++ b/drivers/vfio/vfio_iommu_type1.c

> @@ -403,7 +403,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)

>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>  {

>  	struct vfio_domain *domain;

> -	unsigned long bitmap = PAGE_MASK;

> +	unsigned long bitmap = ULONG_MAX;

>  

>  	mutex_lock(&iommu->lock);

>  	list_for_each_entry(domain, &iommu->domain_list, next)

> @@ -416,20 +416,18 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

>  			     struct vfio_iommu_type1_dma_unmap *unmap)

>  {

> -	uint64_t mask;

>  	struct vfio_dma *dma;

>  	size_t unmapped = 0;

>  	int ret = 0;

> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

> +						PAGE_SIZE : min_pagesz;


max_t ?

>  

> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

> -

> -	if (unmap->iova & mask)

> +	if (!IS_ALIGNED(unmap->iova, requested_alignment))

>  		return -EINVAL;

> -	if (!unmap->size || unmap->size & mask)

> +	if (!unmap->size || !IS_ALIGNED(unmap->size, requested_alignment))

>  		return -EINVAL;

>  

> -	WARN_ON(mask & PAGE_MASK);

> -

>  	mutex_lock(&iommu->lock);

>  

>  	/*

> @@ -553,25 +551,24 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,

>  	size_t size = map->size;

>  	long npage;

>  	int ret = 0, prot = 0;

> -	uint64_t mask;

>  	struct vfio_dma *dma;

>  	unsigned long pfn;

> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

> +						PAGE_SIZE : min_pagesz;


Same code here. Perhaps you need another function to get the alignment?

Otherwise, this looks pretty straightforward to me; iommu_map will take
care of splitting up the requests to the IOMMU driver so they are in the
right chunks.

Will
Auger Eric Oct. 28, 2015, 5:10 p.m. UTC | #2
Hi Alex,
On 10/28/2015 05:27 PM, Alex Williamson wrote:
> On Wed, 2015-10-28 at 13:12 +0000, Eric Auger wrote:

>> Current vfio_pgsize_bitmap code hides the supported IOMMU page

>> sizes smaller than PAGE_SIZE. As a result, in case the IOMMU

>> does not support PAGE_SIZE page, the alignment check on map/unmap

>> is done with larger page sizes, if any. This can fail although

>> mapping could be done with pages smaller than PAGE_SIZE.

>>

>> vfio_pgsize_bitmap is modified to expose the IOMMU page sizes,

>> supported by all domains, even those smaller than PAGE_SIZE. The

>> alignment check on map is performed against PAGE_SIZE if the minimum

>> IOMMU size is less than PAGE_SIZE or against the min page size greater

>> than PAGE_SIZE.

>>

>> Signed-off-by: Eric Auger <eric.auger@linaro.org>

>>

>> ---

>>

>> This was tested on AMD Seattle with 64kB page host. ARM MMU 401

>> currently expose 4kB, 2MB and 1GB page support. With a 64kB page host,

>> the map/unmap check is done against 2MB. Some alignment check fail

>> so VFIO_IOMMU_MAP_DMA fail while we could map using 4kB IOMMU page

>> size.

>> ---

>>  drivers/vfio/vfio_iommu_type1.c | 25 +++++++++++--------------

>>  1 file changed, 11 insertions(+), 14 deletions(-)

>>

>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>> index 57d8c37..13fb974 100644

>> --- a/drivers/vfio/vfio_iommu_type1.c

>> +++ b/drivers/vfio/vfio_iommu_type1.c

>> @@ -403,7 +403,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)

>>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>  {

>>  	struct vfio_domain *domain;

>> -	unsigned long bitmap = PAGE_MASK;

>> +	unsigned long bitmap = ULONG_MAX;

> 

> Isn't this and removing the WARN_ON()s the only real change in this

> patch?  The rest looks like conversion to use IS_ALIGNED and the

> following test, that I don't really understand...

Yes basically you're right.
> 

>>  

>>  	mutex_lock(&iommu->lock);

>>  	list_for_each_entry(domain, &iommu->domain_list, next)

>> @@ -416,20 +416,18 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

>>  			     struct vfio_iommu_type1_dma_unmap *unmap)

>>  {

>> -	uint64_t mask;

>>  	struct vfio_dma *dma;

>>  	size_t unmapped = 0;

>>  	int ret = 0;

>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>> +						PAGE_SIZE : min_pagesz;

> 

> This one.  If we're going to support sub-PAGE_SIZE mappings, why do we

> care to cap alignment at PAGE_SIZE?

My intent in this patch isn't to allow the user-space to map/unmap
sub-PAGE_SIZE buffers. The new test makes sure the mapped area is bigger
than or equal to a host page, whatever the supported page sizes.

I noticed that chunk construction, pinning and many other things are
based on PAGE_SIZE and far be it from me to change that code! I want to
keep that minimal granularity for all those computations.

However, on the iommu side, I would like to rely on the fact that the
iommu driver is clever enough to choose the right page size, and even to
choose a size that is smaller than PAGE_SIZE if the latter is not supported.
> 

>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>> -

>> -	if (unmap->iova & mask)

>> +	if (!IS_ALIGNED(unmap->iova, requested_alignment))

>>  		return -EINVAL;

>> -	if (!unmap->size || unmap->size & mask)

>> +	if (!unmap->size || !IS_ALIGNED(unmap->size, requested_alignment))

>>  		return -EINVAL;

>>  

>> -	WARN_ON(mask & PAGE_MASK);

>> -

>>  	mutex_lock(&iommu->lock);

>>  

>>  	/*

>> @@ -553,25 +551,24 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,

>>  	size_t size = map->size;

>>  	long npage;

>>  	int ret = 0, prot = 0;

>> -	uint64_t mask;

>>  	struct vfio_dma *dma;

>>  	unsigned long pfn;

>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>> +						PAGE_SIZE : min_pagesz;

>>  

>>  	/* Verify that none of our __u64 fields overflow */

>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)

>>  		return -EINVAL;

>>  

>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>> -

>> -	WARN_ON(mask & PAGE_MASK);

>> -

>>  	/* READ/WRITE from device perspective */

>>  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)

>>  		prot |= IOMMU_WRITE;

>>  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)

>>  		prot |= IOMMU_READ;

>>  

>> -	if (!prot || !size || (size | iova | vaddr) & mask)

>> +	if (!prot || !size ||

>> +		!IS_ALIGNED(size | iova | vaddr, requested_alignment))

>>  		return -EINVAL;

>>  

>>  	/* Don't allow IOVA or virtual address wrap */

> 

> This is mostly ignoring the problems with sub-PAGE_SIZE mappings.  For

> instance, we can only pin on PAGE_SIZE and therefore we only do

> accounting on PAGE_SIZE, so if the user does 4K mappings across your 64K

> page, that page gets pinned and accounted 16 times.  Are we going to

> tell users that their locked memory limit needs to be 16x now?  The rest

> of the code would need an audit as well to see what other sub-page bugs

> might be hiding.  Thanks,

So if the user is not allowed to map sub-PAGE_SIZE buffers, accounting
is still based on PAGE_SIZE while the iommu mapping can be based on
sub-PAGE_SIZE pages. Am I misunderstanding something?

Best Regards

Eric
> 

> Alex

> 

> 

>
Will Deacon Oct. 28, 2015, 5:14 p.m. UTC | #3
On Wed, Oct 28, 2015 at 10:27:28AM -0600, Alex Williamson wrote:
> On Wed, 2015-10-28 at 13:12 +0000, Eric Auger wrote:

> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> > index 57d8c37..13fb974 100644

> > --- a/drivers/vfio/vfio_iommu_type1.c

> > +++ b/drivers/vfio/vfio_iommu_type1.c

> > @@ -403,7 +403,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)

> >  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

> >  {

> >  	struct vfio_domain *domain;

> > -	unsigned long bitmap = PAGE_MASK;

> > +	unsigned long bitmap = ULONG_MAX;

> 

> Isn't this and removing the WARN_ON()s the only real change in this

> patch?  The rest looks like conversion to use IS_ALIGNED and the

> following test, that I don't really understand...

> 

> >  

> >  	mutex_lock(&iommu->lock);

> >  	list_for_each_entry(domain, &iommu->domain_list, next)

> > @@ -416,20 +416,18 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

> >  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

> >  			     struct vfio_iommu_type1_dma_unmap *unmap)

> >  {

> > -	uint64_t mask;

> >  	struct vfio_dma *dma;

> >  	size_t unmapped = 0;

> >  	int ret = 0;

> > +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

> > +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

> > +						PAGE_SIZE : min_pagesz;

> 

> This one.  If we're going to support sub-PAGE_SIZE mappings, why do we

> care to cap alignment at PAGE_SIZE?


Eric can clarify, but I think the intention here is to have VFIO continue
doing things in PAGE_SIZE chunks precisely so that we don't have to rework
all of the pinning code etc. The IOMMU API can then deal with the smaller
page size.

> > -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

> > -

> > -	if (unmap->iova & mask)

> > +	if (!IS_ALIGNED(unmap->iova, requested_alignment))

> >  		return -EINVAL;

> > -	if (!unmap->size || unmap->size & mask)

> > +	if (!unmap->size || !IS_ALIGNED(unmap->size, requested_alignment))

> >  		return -EINVAL;

> >  

> > -	WARN_ON(mask & PAGE_MASK);

> > -

> >  	mutex_lock(&iommu->lock);

> >  

> >  	/*

> > @@ -553,25 +551,24 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,

> >  	size_t size = map->size;

> >  	long npage;

> >  	int ret = 0, prot = 0;

> > -	uint64_t mask;

> >  	struct vfio_dma *dma;

> >  	unsigned long pfn;

> > +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

> > +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

> > +						PAGE_SIZE : min_pagesz;

> >  

> >  	/* Verify that none of our __u64 fields overflow */

> >  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)

> >  		return -EINVAL;

> >  

> > -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

> > -

> > -	WARN_ON(mask & PAGE_MASK);

> > -

> >  	/* READ/WRITE from device perspective */

> >  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)

> >  		prot |= IOMMU_WRITE;

> >  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)

> >  		prot |= IOMMU_READ;

> >  

> > -	if (!prot || !size || (size | iova | vaddr) & mask)

> > +	if (!prot || !size ||

> > +		!IS_ALIGNED(size | iova | vaddr, requested_alignment))

> >  		return -EINVAL;

> >  

> >  	/* Don't allow IOVA or virtual address wrap */

> 

> This is mostly ignoring the problems with sub-PAGE_SIZE mappings.  For

> instance, we can only pin on PAGE_SIZE and therefore we only do

> accounting on PAGE_SIZE, so if the user does 4K mappings across your 64K

> page, that page gets pinned and accounted 16 times.  Are we going to

> tell users that their locked memory limit needs to be 16x now?  The rest

> of the code would need an audit as well to see what other sub-page bugs

> might be hiding.  Thanks,


I don't see that. The pinning all happens the same in VFIO, which can
then happily pass a 64k region to iommu_map. iommu_map will then call
->map in 4k chunks on the IOMMU driver ops.

Will
Auger Eric Oct. 28, 2015, 5:17 p.m. UTC | #4
Hi Will,
On 10/28/2015 06:14 PM, Will Deacon wrote:
> On Wed, Oct 28, 2015 at 10:27:28AM -0600, Alex Williamson wrote:

>> On Wed, 2015-10-28 at 13:12 +0000, Eric Auger wrote:

>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>>> index 57d8c37..13fb974 100644

>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>> @@ -403,7 +403,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)

>>>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>>  {

>>>  	struct vfio_domain *domain;

>>> -	unsigned long bitmap = PAGE_MASK;

>>> +	unsigned long bitmap = ULONG_MAX;

>>

>> Isn't this and removing the WARN_ON()s the only real change in this

>> patch?  The rest looks like conversion to use IS_ALIGNED and the

>> following test, that I don't really understand...

>>

>>>  

>>>  	mutex_lock(&iommu->lock);

>>>  	list_for_each_entry(domain, &iommu->domain_list, next)

>>> @@ -416,20 +416,18 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

>>>  			     struct vfio_iommu_type1_dma_unmap *unmap)

>>>  {

>>> -	uint64_t mask;

>>>  	struct vfio_dma *dma;

>>>  	size_t unmapped = 0;

>>>  	int ret = 0;

>>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>>> +						PAGE_SIZE : min_pagesz;

>>

>> This one.  If we're going to support sub-PAGE_SIZE mappings, why do we

>> care to cap alignment at PAGE_SIZE?

> 

> Eric can clarify, but I think the intention here is to have VFIO continue

> doing things in PAGE_SIZE chunks precisely so that we don't have to rework

> all of the pinning code etc.

That's my intention indeed ;-)

Thanks

Eric
 The IOMMU API can then deal with the smaller
> page size.


> 

>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>>> -

>>> -	if (unmap->iova & mask)

>>> +	if (!IS_ALIGNED(unmap->iova, requested_alignment))

>>>  		return -EINVAL;

>>> -	if (!unmap->size || unmap->size & mask)

>>> +	if (!unmap->size || !IS_ALIGNED(unmap->size, requested_alignment))

>>>  		return -EINVAL;

>>>  

>>> -	WARN_ON(mask & PAGE_MASK);

>>> -

>>>  	mutex_lock(&iommu->lock);

>>>  

>>>  	/*

>>> @@ -553,25 +551,24 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,

>>>  	size_t size = map->size;

>>>  	long npage;

>>>  	int ret = 0, prot = 0;

>>> -	uint64_t mask;

>>>  	struct vfio_dma *dma;

>>>  	unsigned long pfn;

>>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>>> +						PAGE_SIZE : min_pagesz;

>>>  

>>>  	/* Verify that none of our __u64 fields overflow */

>>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)

>>>  		return -EINVAL;

>>>  

>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>>> -

>>> -	WARN_ON(mask & PAGE_MASK);

>>> -

>>>  	/* READ/WRITE from device perspective */

>>>  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)

>>>  		prot |= IOMMU_WRITE;

>>>  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)

>>>  		prot |= IOMMU_READ;

>>>  

>>> -	if (!prot || !size || (size | iova | vaddr) & mask)

>>> +	if (!prot || !size ||

>>> +		!IS_ALIGNED(size | iova | vaddr, requested_alignment))

>>>  		return -EINVAL;

>>>  

>>>  	/* Don't allow IOVA or virtual address wrap */

>>

>> This is mostly ignoring the problems with sub-PAGE_SIZE mappings.  For

>> instance, we can only pin on PAGE_SIZE and therefore we only do

>> accounting on PAGE_SIZE, so if the user does 4K mappings across your 64K

>> page, that page gets pinned and accounted 16 times.  Are we going to

>> tell users that their locked memory limit needs to be 16x now?  The rest

>> of the code would need an audit as well to see what other sub-page bugs

>> might be hiding.  Thanks,

> 

> I don't see that. The pinning all happens the same in VFIO, which can

> then happily pass a 64k region to iommu_map. iommu_map will then call

> ->map in 4k chunks on the IOMMU driver ops.

> 

> Will

>
Auger Eric Oct. 28, 2015, 5:41 p.m. UTC | #5
Alex,
On 10/28/2015 06:28 PM, Alex Williamson wrote:
> On Wed, 2015-10-28 at 17:14 +0000, Will Deacon wrote:

>> On Wed, Oct 28, 2015 at 10:27:28AM -0600, Alex Williamson wrote:

>>> On Wed, 2015-10-28 at 13:12 +0000, Eric Auger wrote:

>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>>>> index 57d8c37..13fb974 100644

>>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>>> @@ -403,7 +403,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)

>>>>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>>>  {

>>>>  	struct vfio_domain *domain;

>>>> -	unsigned long bitmap = PAGE_MASK;

>>>> +	unsigned long bitmap = ULONG_MAX;

>>>

>>> Isn't this and removing the WARN_ON()s the only real change in this

>>> patch?  The rest looks like conversion to use IS_ALIGNED and the

>>> following test, that I don't really understand...

>>>

>>>>  

>>>>  	mutex_lock(&iommu->lock);

>>>>  	list_for_each_entry(domain, &iommu->domain_list, next)

>>>> @@ -416,20 +416,18 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>>>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

>>>>  			     struct vfio_iommu_type1_dma_unmap *unmap)

>>>>  {

>>>> -	uint64_t mask;

>>>>  	struct vfio_dma *dma;

>>>>  	size_t unmapped = 0;

>>>>  	int ret = 0;

>>>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>>>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>>>> +						PAGE_SIZE : min_pagesz;

>>>

>>> This one.  If we're going to support sub-PAGE_SIZE mappings, why do we

>>> care to cap alignment at PAGE_SIZE?

>>

>> Eric can clarify, but I think the intention here is to have VFIO continue

>> doing things in PAGE_SIZE chunks precisely so that we don't have to rework

>> all of the pinning code etc. The IOMMU API can then deal with the smaller

>> page size.

> 

> Gak, I read this wrong.  So really we're just artificially adding

> PAGE_SIZE as a supported IOMMU size so long as the IOMMU support

> something smaller than PAGE_SIZE, where PAGE_SIZE is obviously a

> multiple of that smaller size.  Ok, but should we just do this once in

> vfio_pgsize_bitmap()?  This is exactly why VT-d just reports ~(4k - 1)

> for the iommu bitmap.

Yes I can do this in vfio_pgsize_bitmap if you prefer.

Thanks

Eric

> 

>>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>>>> -

>>>> -	if (unmap->iova & mask)

>>>> +	if (!IS_ALIGNED(unmap->iova, requested_alignment))

>>>>  		return -EINVAL;

>>>> -	if (!unmap->size || unmap->size & mask)

>>>> +	if (!unmap->size || !IS_ALIGNED(unmap->size, requested_alignment))

>>>>  		return -EINVAL;

>>>>  

>>>> -	WARN_ON(mask & PAGE_MASK);

>>>> -

>>>>  	mutex_lock(&iommu->lock);

>>>>  

>>>>  	/*

>>>> @@ -553,25 +551,24 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,

>>>>  	size_t size = map->size;

>>>>  	long npage;

>>>>  	int ret = 0, prot = 0;

>>>> -	uint64_t mask;

>>>>  	struct vfio_dma *dma;

>>>>  	unsigned long pfn;

>>>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>>>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>>>> +						PAGE_SIZE : min_pagesz;

>>>>  

>>>>  	/* Verify that none of our __u64 fields overflow */

>>>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)

>>>>  		return -EINVAL;

>>>>  

>>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>>>> -

>>>> -	WARN_ON(mask & PAGE_MASK);

>>>> -

>>>>  	/* READ/WRITE from device perspective */

>>>>  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)

>>>>  		prot |= IOMMU_WRITE;

>>>>  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)

>>>>  		prot |= IOMMU_READ;

>>>>  

>>>> -	if (!prot || !size || (size | iova | vaddr) & mask)

>>>> +	if (!prot || !size ||

>>>> +		!IS_ALIGNED(size | iova | vaddr, requested_alignment))

>>>>  		return -EINVAL;

>>>>  

>>>>  	/* Don't allow IOVA or virtual address wrap */

>>>

>>> This is mostly ignoring the problems with sub-PAGE_SIZE mappings.  For

>>> instance, we can only pin on PAGE_SIZE and therefore we only do

>>> accounting on PAGE_SIZE, so if the user does 4K mappings across your 64K

>>> page, that page gets pinned and accounted 16 times.  Are we going to

>>> tell users that their locked memory limit needs to be 16x now?  The rest

>>> of the code would need an audit as well to see what other sub-page bugs

>>> might be hiding.  Thanks,

>>

>> I don't see that. The pinning all happens the same in VFIO, which can

>> then happily pass a 64k region to iommu_map. iommu_map will then call

>> ->map in 4k chunks on the IOMMU driver ops.

> 

> Yep, I see now that this isn't doing sub-page mappings.  Thanks,

> 

> Alex

>
Auger Eric Oct. 28, 2015, 5:48 p.m. UTC | #6
On 10/28/2015 06:37 PM, Alex Williamson wrote:
> On Wed, 2015-10-28 at 18:10 +0100, Eric Auger wrote:

>> Hi Alex,

>> On 10/28/2015 05:27 PM, Alex Williamson wrote:

>>> On Wed, 2015-10-28 at 13:12 +0000, Eric Auger wrote:

>>>> Current vfio_pgsize_bitmap code hides the supported IOMMU page

>>>> sizes smaller than PAGE_SIZE. As a result, in case the IOMMU

>>>> does not support PAGE_SIZE page, the alignment check on map/unmap

>>>> is done with larger page sizes, if any. This can fail although

>>>> mapping could be done with pages smaller than PAGE_SIZE.

>>>>

>>>> vfio_pgsize_bitmap is modified to expose the IOMMU page sizes,

>>>> supported by all domains, even those smaller than PAGE_SIZE. The

>>>> alignment check on map is performed against PAGE_SIZE if the minimum

>>>> IOMMU size is less than PAGE_SIZE or against the min page size greater

>>>> than PAGE_SIZE.

>>>>

>>>> Signed-off-by: Eric Auger <eric.auger@linaro.org>

>>>>

>>>> ---

>>>>

>>>> This was tested on AMD Seattle with 64kB page host. ARM MMU 401

>>>> currently expose 4kB, 2MB and 1GB page support. With a 64kB page host,

>>>> the map/unmap check is done against 2MB. Some alignment check fail

>>>> so VFIO_IOMMU_MAP_DMA fail while we could map using 4kB IOMMU page

>>>> size.

>>>> ---

>>>>  drivers/vfio/vfio_iommu_type1.c | 25 +++++++++++--------------

>>>>  1 file changed, 11 insertions(+), 14 deletions(-)

>>>>

>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>>>> index 57d8c37..13fb974 100644

>>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>>> @@ -403,7 +403,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)

>>>>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>>>  {

>>>>  	struct vfio_domain *domain;

>>>> -	unsigned long bitmap = PAGE_MASK;

>>>> +	unsigned long bitmap = ULONG_MAX;

>>>

>>> Isn't this and removing the WARN_ON()s the only real change in this

>>> patch?  The rest looks like conversion to use IS_ALIGNED and the

>>> following test, that I don't really understand...

>> Yes basically you're right.

> 

> 

> Ok, so with hopefully correcting my understand of what this does, isn't

> this effectively the same:

> 

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> index 57d8c37..7db4f5a 100644

> --- a/drivers/vfio/vfio_iommu_type1.c

> +++ b/drivers/vfio/vfio_iommu_type1.c

> @@ -403,13 +403,19 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, stru

>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>  {

>         struct vfio_domain *domain;

> -       unsigned long bitmap = PAGE_MASK;

> +       unsigned long bitmap = ULONG_MAX;

>  

>         mutex_lock(&iommu->lock);

>         list_for_each_entry(domain, &iommu->domain_list, next)

>                 bitmap &= domain->domain->ops->pgsize_bitmap;

>         mutex_unlock(&iommu->lock);

>  

> +       /* Some comment about how the IOMMU API splits requests */

> +       if (bitmap & ~PAGE_MASK) {

> +               bitmap &= PAGE_MASK;

> +               bitmap |= PAGE_SIZE;

> +       }

> +

>         return bitmap;

>  }

Yes, to me it is indeed the same
>  

> This would also expose to the user that we're accepting PAGE_SIZE, which

> we weren't before, so it was not quite right to just let them do it

> anyway.  I don't think we even need to get rid of the WARN_ONs, do we?

> Thanks,


The end-user might be worried by the latter. Personally I would get rid
of them, but that's definitely up to you.

Just let me know and I will respin.

Best Regards

Eric

> 

> Alex

> 

>>>

>>>>  

>>>>  	mutex_lock(&iommu->lock);

>>>>  	list_for_each_entry(domain, &iommu->domain_list, next)

>>>> @@ -416,20 +416,18 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>>>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

>>>>  			     struct vfio_iommu_type1_dma_unmap *unmap)

>>>>  {

>>>> -	uint64_t mask;

>>>>  	struct vfio_dma *dma;

>>>>  	size_t unmapped = 0;

>>>>  	int ret = 0;

>>>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>>>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>>>> +						PAGE_SIZE : min_pagesz;

>>>

>>> This one.  If we're going to support sub-PAGE_SIZE mappings, why do we

>>> care to cap alignment at PAGE_SIZE?

>> My intent in this patch isn't to allow the user-space to map/unmap

>> sub-PAGE_SIZE buffers. The new test makes sure the mapped area is bigger

>> or equal than a host page whatever the supported page sizes.

>>

>> I noticed that chunk construction, pinning and other many things are

>> based on PAGE_SIZE and far be it from me to change that code! I want to

>> keep that minimal granularity for all those computation.

>>

>> However on iommu side, I would like to rely on the fact the iommu driver

>> is clever enough to choose the right page size and even to choose a size

>> that is smaller than PAGE_SIZE if this latter is not supported.

>>>

>>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>>>> -

>>>> -	if (unmap->iova & mask)

>>>> +	if (!IS_ALIGNED(unmap->iova, requested_alignment))

>>>>  		return -EINVAL;

>>>> -	if (!unmap->size || unmap->size & mask)

>>>> +	if (!unmap->size || !IS_ALIGNED(unmap->size, requested_alignment))

>>>>  		return -EINVAL;

>>>>  

>>>> -	WARN_ON(mask & PAGE_MASK);

>>>> -

>>>>  	mutex_lock(&iommu->lock);

>>>>  

>>>>  	/*

>>>> @@ -553,25 +551,24 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,

>>>>  	size_t size = map->size;

>>>>  	long npage;

>>>>  	int ret = 0, prot = 0;

>>>> -	uint64_t mask;

>>>>  	struct vfio_dma *dma;

>>>>  	unsigned long pfn;

>>>> +	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));

>>>> +	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?

>>>> +						PAGE_SIZE : min_pagesz;

>>>>  

>>>>  	/* Verify that none of our __u64 fields overflow */

>>>>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)

>>>>  		return -EINVAL;

>>>>  

>>>> -	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

>>>> -

>>>> -	WARN_ON(mask & PAGE_MASK);

>>>> -

>>>>  	/* READ/WRITE from device perspective */

>>>>  	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)

>>>>  		prot |= IOMMU_WRITE;

>>>>  	if (map->flags & VFIO_DMA_MAP_FLAG_READ)

>>>>  		prot |= IOMMU_READ;

>>>>  

>>>> -	if (!prot || !size || (size | iova | vaddr) & mask)

>>>> +	if (!prot || !size ||

>>>> +		!IS_ALIGNED(size | iova | vaddr, requested_alignment))

>>>>  		return -EINVAL;

>>>>  

>>>>  	/* Don't allow IOVA or virtual address wrap */

>>>

>>> This is mostly ignoring the problems with sub-PAGE_SIZE mappings.  For

>>> instance, we can only pin on PAGE_SIZE and therefore we only do

>>> accounting on PAGE_SIZE, so if the user does 4K mappings across your 64K

>>> page, that page gets pinned and accounted 16 times.  Are we going to

>>> tell users that their locked memory limit needs to be 16x now?  The rest

>>> of the code would need an audit as well to see what other sub-page bugs

>>> might be hiding.  Thanks,

>> So if the user is not allowed to map sub-PAGE_SIZE buffers, accounting

>> still is based on PAGE_SIZE while iommu mapping can be based on

>> sub-PAGE_SIZE pages. I am misunderstanding something?

>>

>> Best Regards

>>

>> Eric

>>>

>>> Alex

>>>

>>>

>>>

>>

> 

> 

>
Will Deacon Oct. 28, 2015, 5:55 p.m. UTC | #7
On Wed, Oct 28, 2015 at 06:48:41PM +0100, Eric Auger wrote:
> On 10/28/2015 06:37 PM, Alex Williamson wrote:

> > Ok, so with hopefully correcting my understand of what this does, isn't

> > this effectively the same:

> > 

> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

> > index 57d8c37..7db4f5a 100644

> > --- a/drivers/vfio/vfio_iommu_type1.c

> > +++ b/drivers/vfio/vfio_iommu_type1.c

> > @@ -403,13 +403,19 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, stru

> >  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

> >  {

> >         struct vfio_domain *domain;

> > -       unsigned long bitmap = PAGE_MASK;

> > +       unsigned long bitmap = ULONG_MAX;

> >  

> >         mutex_lock(&iommu->lock);

> >         list_for_each_entry(domain, &iommu->domain_list, next)

> >                 bitmap &= domain->domain->ops->pgsize_bitmap;

> >         mutex_unlock(&iommu->lock);

> >  

> > +       /* Some comment about how the IOMMU API splits requests */

> > +       if (bitmap & ~PAGE_MASK) {

> > +               bitmap &= PAGE_MASK;

> > +               bitmap |= PAGE_SIZE;

> > +       }

> > +

> >         return bitmap;

> >  }

> Yes, to me it is indeed the same

> >  

> > This would also expose to the user that we're accepting PAGE_SIZE, which

> > we weren't before, so it was not quite right to just let them do it

> > anyway.  I don't think we even need to get rid of the WARN_ONs, do we?

> > Thanks,

> 

> The end-user might be afraid of those latter. Personally I would get rid

> of them but that's definitively up to you.


I think Alex's point is that the WARN_ON's won't trigger with this patch,
because he clears those lower bits in the bitmap.

Will
Auger Eric Oct. 28, 2015, 6 p.m. UTC | #8
On 10/28/2015 06:55 PM, Will Deacon wrote:
> On Wed, Oct 28, 2015 at 06:48:41PM +0100, Eric Auger wrote:

>> On 10/28/2015 06:37 PM, Alex Williamson wrote:

>>> Ok, so with hopefully correcting my understand of what this does, isn't

>>> this effectively the same:

>>>

>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c

>>> index 57d8c37..7db4f5a 100644

>>> --- a/drivers/vfio/vfio_iommu_type1.c

>>> +++ b/drivers/vfio/vfio_iommu_type1.c

>>> @@ -403,13 +403,19 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, stru

>>>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

>>>  {

>>>         struct vfio_domain *domain;

>>> -       unsigned long bitmap = PAGE_MASK;

>>> +       unsigned long bitmap = ULONG_MAX;

>>>  

>>>         mutex_lock(&iommu->lock);

>>>         list_for_each_entry(domain, &iommu->domain_list, next)

>>>                 bitmap &= domain->domain->ops->pgsize_bitmap;

>>>         mutex_unlock(&iommu->lock);

>>>  

>>> +       /* Some comment about how the IOMMU API splits requests */

>>> +       if (bitmap & ~PAGE_MASK) {

>>> +               bitmap &= PAGE_MASK;

>>> +               bitmap |= PAGE_SIZE;

>>> +       }

>>> +

>>>         return bitmap;

>>>  }

>> Yes, to me it is indeed the same

>>>  

>>> This would also expose to the user that we're accepting PAGE_SIZE, which

>>> we weren't before, so it was not quite right to just let them do it

>>> anyway.  I don't think we even need to get rid of the WARN_ONs, do we?

>>> Thanks,

>>

>> The end-user might be afraid of those latter. Personally I would get rid

>> of them but that's definitively up to you.

> 

> I think Alex's point is that the WARN_ON's won't trigger with this patch,

> because he clears those lower bits in the bitmap.

ah yes sure!

Thanks

Eric
> 

> Will

>
diff mbox

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 57d8c37..13fb974 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -403,7 +403,7 @@  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
 	struct vfio_domain *domain;
-	unsigned long bitmap = PAGE_MASK;
+	unsigned long bitmap = ULONG_MAX;
 
 	mutex_lock(&iommu->lock);
 	list_for_each_entry(domain, &iommu->domain_list, next)
@@ -416,20 +416,18 @@  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
-	uint64_t mask;
 	struct vfio_dma *dma;
 	size_t unmapped = 0;
 	int ret = 0;
+	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));
+	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?
+						PAGE_SIZE : min_pagesz;
 
-	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
-	if (unmap->iova & mask)
+	if (!IS_ALIGNED(unmap->iova, requested_alignment))
 		return -EINVAL;
-	if (!unmap->size || unmap->size & mask)
+	if (!unmap->size || !IS_ALIGNED(unmap->size, requested_alignment))
 		return -EINVAL;
 
-	WARN_ON(mask & PAGE_MASK);
-
 	mutex_lock(&iommu->lock);
 
 	/*
@@ -553,25 +551,24 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	size_t size = map->size;
 	long npage;
 	int ret = 0, prot = 0;
-	uint64_t mask;
 	struct vfio_dma *dma;
 	unsigned long pfn;
+	unsigned int min_pagesz = __ffs(vfio_pgsize_bitmap(iommu));
+	unsigned int requested_alignment = (min_pagesz < PAGE_SIZE) ?
+						PAGE_SIZE : min_pagesz;
 
 	/* Verify that none of our __u64 fields overflow */
 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
 		return -EINVAL;
 
-	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
-	WARN_ON(mask & PAGE_MASK);
-
 	/* READ/WRITE from device perspective */
 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
 		prot |= IOMMU_WRITE;
 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 		prot |= IOMMU_READ;
 
-	if (!prot || !size || (size | iova | vaddr) & mask)
+	if (!prot || !size ||
+		!IS_ALIGNED(size | iova | vaddr, requested_alignment))
 		return -EINVAL;
 
 	/* Don't allow IOVA or virtual address wrap */