
[1/6] iommu: Move IOVA power-of-2 roundup into allocator

Message ID 1616160348-29451-2-git-send-email-john.garry@huawei.com
State New
Series dma mapping/iommu: Allow IOMMU IOVA rcache range to be configured

Commit Message

John Garry March 19, 2021, 1:25 p.m. UTC
Move the IOVA size power-of-2 rcache roundup into the IOVA allocator.

This is to eventually make it possible to configure the upper limit of
the IOVA rcache range.

Signed-off-by: John Garry <john.garry@huawei.com>

---
 drivers/iommu/dma-iommu.c |  8 ------
 drivers/iommu/iova.c      | 51 ++++++++++++++++++++++++++-------------
 2 files changed, 34 insertions(+), 25 deletions(-)

-- 
2.26.2

Comments

Robin Murphy March 19, 2021, 4:13 p.m. UTC | #1
On 2021-03-19 13:25, John Garry wrote:
> Move the IOVA size power-of-2 rcache roundup into the IOVA allocator.
> 
> This is to eventually make it possible to be able to configure the upper
> limit of the IOVA rcache range.
> 
> Signed-off-by: John Garry <john.garry@huawei.com>
> ---
>   drivers/iommu/dma-iommu.c |  8 ------
>   drivers/iommu/iova.c      | 51 ++++++++++++++++++++++++++-------------
>   2 files changed, 34 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index af765c813cc8..15b7270a5c2a 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -429,14 +429,6 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
>   
>   	shift = iova_shift(iovad);
>   	iova_len = size >> shift;
> -	/*
> -	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
> -	 * will come back to bite us badly, so we have to waste a bit of space
> -	 * rounding up anything cacheable to make sure that can't happen. The
> -	 * order of the unadjusted size will still match upon freeing.
> -	 */
> -	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
> -		iova_len = roundup_pow_of_two(iova_len);
>   
>   	dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
>   
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index e6e2fa85271c..e62e9e30b30c 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -179,7 +179,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
>   
>   static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>   		unsigned long size, unsigned long limit_pfn,
> -			struct iova *new, bool size_aligned)
> +			struct iova *new, bool size_aligned, bool fast)
>   {
>   	struct rb_node *curr, *prev;
>   	struct iova *curr_iova;
> @@ -188,6 +188,15 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>   	unsigned long align_mask = ~0UL;
>   	unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn;
>   
> +	/*
> +	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
> +	 * will come back to bite us badly, so we have to waste a bit of space
> +	 * rounding up anything cacheable to make sure that can't happen. The
> +	 * order of the unadjusted size will still match upon freeing.
> +	 */
> +	if (fast && size < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
> +		size = roundup_pow_of_two(size);

If this transformation is only relevant to alloc_iova_fast(), and we 
have to add a special parameter here to tell whether we were called from 
alloc_iova_fast(), doesn't it seem more sensible to just do it in 
alloc_iova_fast() rather than here?

But then the API itself has no strict requirement that a pfn passed to 
free_iova_fast() wasn't originally allocated with alloc_iova(), so 
arguably hiding the adjustment away makes it less clear that the 
responsibility is really on any caller of free_iova_fast() to make sure 
they don't get things wrong.
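
For example, a caller that mixed the two halves of the API like this
(purely illustrative, not code from this patch) would quietly put a
wrong-sized entry into an rcache bucket:

	/* ~20 pfns via the slow path: the size is never rounded up */
	struct iova *iova = alloc_iova(iovad, 20, limit_pfn, true);
	...
	/* freed via the fast path: cached under the 32-pfn size class */
	free_iova_fast(iovad, iova->pfn_lo, 20);
	/* a later 32-pfn hit from that bucket only has 20 pfns reserved */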

Robin.

> +
>   	if (size_aligned)
>   		align_mask <<= fls_long(size - 1);
>   
> @@ -288,21 +297,10 @@ void iova_cache_put(void)
>   }
>   EXPORT_SYMBOL_GPL(iova_cache_put);
>   
> -/**
> - * alloc_iova - allocates an iova
> - * @iovad: - iova domain in question
> - * @size: - size of page frames to allocate
> - * @limit_pfn: - max limit address
> - * @size_aligned: - set if size_aligned address range is required
> - * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
> - * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
> - * flag is set then the allocated address iova->pfn_lo will be naturally
> - * aligned on roundup_power_of_two(size).
> - */
> -struct iova *
> -alloc_iova(struct iova_domain *iovad, unsigned long size,
> +static struct iova *
> +__alloc_iova(struct iova_domain *iovad, unsigned long size,
>   	unsigned long limit_pfn,
> -	bool size_aligned)
> +	bool size_aligned, bool fast)
>   {
>   	struct iova *new_iova;
>   	int ret;
> @@ -312,7 +310,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
>   		return NULL;
>   
>   	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
> -			new_iova, size_aligned);
> +			new_iova, size_aligned, fast);
>   
>   	if (ret) {
>   		free_iova_mem(new_iova);
> @@ -321,6 +319,25 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
>   
>   	return new_iova;
>   }
> +
> +/**
> + * alloc_iova - allocates an iova
> + * @iovad: - iova domain in question
> + * @size: - size of page frames to allocate
> + * @limit_pfn: - max limit address
> + * @size_aligned: - set if size_aligned address range is required
> + * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
> + * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
> + * flag is set then the allocated address iova->pfn_lo will be naturally
> + * aligned on roundup_power_of_two(size).
> + */
> +struct iova *
> +alloc_iova(struct iova_domain *iovad, unsigned long size,
> +	unsigned long limit_pfn,
> +	bool size_aligned)
> +{
> +	return __alloc_iova(iovad, size, limit_pfn, size_aligned, false);
> +}
>   EXPORT_SYMBOL_GPL(alloc_iova);
>   
>   static struct iova *
> @@ -433,7 +450,7 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
>   		return iova_pfn;
>   
>   retry:
> -	new_iova = alloc_iova(iovad, size, limit_pfn, true);
> +	new_iova = __alloc_iova(iovad, size, limit_pfn, true, true);
>   	if (!new_iova) {
>   		unsigned int cpu;
>   
>
John Garry March 19, 2021, 4:58 p.m. UTC | #2
On 19/03/2021 16:13, Robin Murphy wrote:
> On 2021-03-19 13:25, John Garry wrote:
>> Move the IOVA size power-of-2 rcache roundup into the IOVA allocator.
>>
>> This is to eventually make it possible to be able to configure the upper
>> limit of the IOVA rcache range.
>>
>> Signed-off-by: John Garry <john.garry@huawei.com>
>> ---
>>   drivers/iommu/dma-iommu.c |  8 ------
>>   drivers/iommu/iova.c      | 51 ++++++++++++++++++++++++++-------------
>>   2 files changed, 34 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
>> index af765c813cc8..15b7270a5c2a 100644
>> --- a/drivers/iommu/dma-iommu.c
>> +++ b/drivers/iommu/dma-iommu.c
>> @@ -429,14 +429,6 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
>>       shift = iova_shift(iovad);
>>       iova_len = size >> shift;
>> -    /*
>> -     * Freeing non-power-of-two-sized allocations back into the IOVA caches
>> -     * will come back to bite us badly, so we have to waste a bit of space
>> -     * rounding up anything cacheable to make sure that can't happen. The
>> -     * order of the unadjusted size will still match upon freeing.
>> -     */
>> -    if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
>> -        iova_len = roundup_pow_of_two(iova_len);
>>       dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index e6e2fa85271c..e62e9e30b30c 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -179,7 +179,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
>>   static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>>           unsigned long size, unsigned long limit_pfn,
>> -            struct iova *new, bool size_aligned)
>> +            struct iova *new, bool size_aligned, bool fast)
>>   {
>>       struct rb_node *curr, *prev;
>>       struct iova *curr_iova;
>> @@ -188,6 +188,15 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>>       unsigned long align_mask = ~0UL;
>>       unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn;
>> +    /*
>> +     * Freeing non-power-of-two-sized allocations back into the IOVA caches
>> +     * will come back to bite us badly, so we have to waste a bit of space
>> +     * rounding up anything cacheable to make sure that can't happen. The
>> +     * order of the unadjusted size will still match upon freeing.
>> +     */
>> +    if (fast && size < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
>> +        size = roundup_pow_of_two(size);
>
> If this transformation is only relevant to alloc_iova_fast(), and we
> have to add a special parameter here to tell whether we were called from
> alloc_iova_fast(), doesn't it seem more sensible to just do it in
> alloc_iova_fast() rather than here?


We have the restriction that anything we put in the rcache needs to be a
power-of-2.

So then we have the issue of how to dynamically increase this rcache
threshold. The problem is that we may have many devices associated with
the same domain. So, in theory, when we increase the threshold, we can't
assume that some other device won't try to fast free an IOVA which was
allocated prior to the increase and so was not rounded up.
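
To make that concrete, a hypothetical sequence (the sizes and the
old/new thresholds are invented purely for illustration) would be:

	/* old rcache limit: sizes of 32+ pfns are not cached, so not rounded up */
	iova_pfn = alloc_iova_fast(iovad, 40, limit_pfn, true);	/* stays 40 pfns */

	/* rcache threshold for the domain is then raised to cover 64-pfn IOVAs */

	free_iova_fast(iovad, iova_pfn, 40);	/* now cacheable: lands in the 64-pfn bucket */
	/* a later 64-pfn allocation may be handed back this 40-pfn region */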

I'm very open to better (or less bad) suggestions on how to do this ...

I could say that we only allow this for a group with a single device, so 
these sort of things don't have to be worried about, but even then the 
iommu_group internals are not readily accessible here.

>
> But then the API itself has no strict requirement that a pfn passed to
> free_iova_fast() wasn't originally allocated with alloc_iova(), so
> arguably hiding the adjustment away makes it less clear that the
> responsibility is really on any caller of free_iova_fast() to make sure
> they don't get things wrong.
>


alloc_iova() doesn't roundup to pow-of-2, so wouldn't it be broken to do 
that?

Cheers,
John
Robin Murphy March 19, 2021, 7:20 p.m. UTC | #3
On 2021-03-19 16:58, John Garry wrote:
> On 19/03/2021 16:13, Robin Murphy wrote:
>> On 2021-03-19 13:25, John Garry wrote:
>>> Move the IOVA size power-of-2 rcache roundup into the IOVA allocator.
>>>
>>> This is to eventually make it possible to be able to configure the upper
>>> limit of the IOVA rcache range.
>>>
>>> Signed-off-by: John Garry <john.garry@huawei.com>
>>> ---
>>>   drivers/iommu/dma-iommu.c |  8 ------
>>>   drivers/iommu/iova.c      | 51 ++++++++++++++++++++++++++-------------
>>>   2 files changed, 34 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
>>> index af765c813cc8..15b7270a5c2a 100644
>>> --- a/drivers/iommu/dma-iommu.c
>>> +++ b/drivers/iommu/dma-iommu.c
>>> @@ -429,14 +429,6 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
>>>       shift = iova_shift(iovad);
>>>       iova_len = size >> shift;
>>> -    /*
>>> -     * Freeing non-power-of-two-sized allocations back into the IOVA caches
>>> -     * will come back to bite us badly, so we have to waste a bit of space
>>> -     * rounding up anything cacheable to make sure that can't happen. The
>>> -     * order of the unadjusted size will still match upon freeing.
>>> -     */
>>> -    if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
>>> -        iova_len = roundup_pow_of_two(iova_len);
>>>       dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
>>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>>> index e6e2fa85271c..e62e9e30b30c 100644
>>> --- a/drivers/iommu/iova.c
>>> +++ b/drivers/iommu/iova.c
>>> @@ -179,7 +179,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
>>>   static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>>>           unsigned long size, unsigned long limit_pfn,
>>> -            struct iova *new, bool size_aligned)
>>> +            struct iova *new, bool size_aligned, bool fast)
>>>   {
>>>       struct rb_node *curr, *prev;
>>>       struct iova *curr_iova;
>>> @@ -188,6 +188,15 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>>>       unsigned long align_mask = ~0UL;
>>>       unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn;
>>> +    /*
>>> +     * Freeing non-power-of-two-sized allocations back into the IOVA caches
>>> +     * will come back to bite us badly, so we have to waste a bit of space
>>> +     * rounding up anything cacheable to make sure that can't happen. The
>>> +     * order of the unadjusted size will still match upon freeing.
>>> +     */
>>> +    if (fast && size < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
>>> +        size = roundup_pow_of_two(size);
>>
>> If this transformation is only relevant to alloc_iova_fast(), and we
>> have to add a special parameter here to tell whether we were called
>> from alloc_iova_fast(), doesn't it seem more sensible to just do it in
>> alloc_iova_fast() rather than here?
>
> We have the restriction that anything we put in the rcache needs be a
> power-of-2.


I was really only talking about the apparently silly structure of:

void foo(bool in_bar) {
	if (in_bar)
		//do thing
	...
}
void bar() {
	foo(true);
}

vs.:

void foo() {
	...
}
void bar() {
	//do thing
	foo();
}
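
In this case that would amount to something like the below (an untested
sketch against the code in this patch, so take the details with a grain
of salt):

unsigned long
alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
		unsigned long limit_pfn, bool flush_rcache)
{
	unsigned long iova_pfn;
	struct iova *new_iova;

	/*
	 * Do the rcache roundup in the one entry point that needs it,
	 * rather than threading a 'fast' flag down the allocation path.
	 */
	if (size < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
		size = roundup_pow_of_two(size);

	iova_pfn = iova_rcache_get(iovad, size, limit_pfn + 1);
	if (iova_pfn)
		return iova_pfn;

retry:
	new_iova = alloc_iova(iovad, size, limit_pfn, true);
	...
}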

> So then we have the issue of how to dynamically increase this rcache
> threshold. The problem is that we may have many devices associated with
> the same domain. So, in theory, we can't assume that when we increase
> the threshold that some other device will try to fast free an IOVA which
> was allocated prior to the increase and was not rounded up.
>
> I'm very open to better (or less bad) suggestions on how to do this ...


...but yes, regardless of exactly where it happens, rounding up or not 
is the problem for rcaches in general. I've said several times that my 
preferred approach is to not change it that dynamically at all, but 
instead treat it more like we treat the default domain type.

> I could say that we only allow this for a group with a single device, so
> these sort of things don't have to be worried about, but even then the
> iommu_group internals are not readily accessible here.
>
>> But then the API itself has no strict requirement that a pfn passed to
>> free_iova_fast() wasn't originally allocated with alloc_iova(), so
>> arguably hiding the adjustment away makes it less clear that the
>> responsibility is really on any caller of free_iova_fast() to make
>> sure they don't get things wrong.
>
> alloc_iova() doesn't roundup to pow-of-2, so wouldn't it be broken to do
> that?


Well, right now neither call rounds up, which is why iommu-dma takes 
care to avoid any issues by explicitly rounding up for itself 
beforehand. I'm just concerned that giving the impression that the API 
takes care of everything for itself will make it easier to write broken 
code in future, if that impression is in fact not entirely true.

I don't even think it's very likely that someone would manage to hit 
that rather wacky alloc/free pattern either way, I just know that 
getting wrong-sized things into the rcaches is an absolute sod to debug, 
so...

Robin.
John Garry March 22, 2021, 3:01 p.m. UTC | #4
On 19/03/2021 19:20, Robin Murphy wrote:

Hi Robin,

>> So then we have the issue of how to dynamically increase this rcache
>> threshold. The problem is that we may have many devices associated with
>> the same domain. So, in theory, we can't assume that when we increase
>> the threshold that some other device will try to fast free an IOVA which
>> was allocated prior to the increase and was not rounded up.
>>
>> I'm very open to better (or less bad) suggestions on how to do this ...
>
> ...but yes, regardless of exactly where it happens, rounding up or not
> is the problem for rcaches in general. I've said several times that my
> preferred approach is to not change it that dynamically at all, but
> instead treat it more like we treat the default domain type.
>


Can you remind me of that idea? I don't remember you mentioning using 
default domain handling as a reference in any context.

Thanks,
John
Robin Murphy March 31, 2021, 9:58 a.m. UTC | #5
On 2021-03-22 15:01, John Garry wrote:
> On 19/03/2021 19:20, Robin Murphy wrote:
>
> Hi Robin,
>
>>> So then we have the issue of how to dynamically increase this rcache
>>> threshold. The problem is that we may have many devices associated with
>>> the same domain. So, in theory, we can't assume that when we increase
>>> the threshold that some other device will try to fast free an IOVA which
>>> was allocated prior to the increase and was not rounded up.
>>>
>>> I'm very open to better (or less bad) suggestions on how to do this ...
>> ...but yes, regardless of exactly where it happens, rounding up or not
>> is the problem for rcaches in general. I've said several times that my
>> preferred approach is to not change it that dynamically at all, but
>> instead treat it more like we treat the default domain type.
>>
>
> Can you remind me of that idea? I don't remember you mentioning using
> default domain handling as a reference in any context.


Sorry if the phrasing was unclear there - the allusion to default 
domains is new, it just occurred to me that what we do there is in fact 
fairly close to what I've suggested previously for this. In that case, 
we have a global policy set by the command line, which *can* be 
overridden per-domain via sysfs at runtime, provided the user is willing 
to tear the whole thing down. Using a similar approach here would give a 
fair degree of flexibility but still mean that changes never have to be 
made dynamically to a live domain.

Robin.
John Garry April 6, 2021, 4:54 p.m. UTC | #6
>>>> So then we have the issue of how to dynamically increase this rcache
>>>> threshold. The problem is that we may have many devices associated with
>>>> the same domain. So, in theory, we can't assume that when we increase
>>>> the threshold that some other device will try to fast free an IOVA which
>>>> was allocated prior to the increase and was not rounded up.
>>>>
>>>> I'm very open to better (or less bad) suggestions on how to do this ...
>>> ...but yes, regardless of exactly where it happens, rounding up or not
>>> is the problem for rcaches in general. I've said several times that my
>>> preferred approach is to not change it that dynamically at all, but
>>> instead treat it more like we treat the default domain type.
>>>
>>
>> Can you remind me of that idea? I don't remember you mentioning using
>> default domain handling as a reference in any context.
>


Hi Robin,

> Sorry if the phrasing was unclear there - the allusion to default
> domains is new, it just occurred to me that what we do there is in fact
> fairly close to what I've suggested previously for this. In that case,
> we have a global policy set by the command line, which *can* be
> overridden per-domain via sysfs at runtime, provided the user is willing
> to tear the whole thing down. Using a similar approach here would give a
> fair degree of flexibility but still mean that changes never have to be
> made dynamically to a live domain.


So are you saying that we can handle it similarly to how we can now
handle changing the default domain for an IOMMU group via sysfs? If so,
that just is not practical here. The reason being that this particular
DMA engine provides the block device backing the / mount point, so if we
unbind the driver, we lose the / mount point.

And I am not sure if the end user would even know how to set such a 
tunable. Or, in this case, why the end user would not want the optimized 
range configured always.

I'd still rather if the device driver could provide info which can be 
used to configure this before or during probing.

Cheers,
John
John Garry April 14, 2021, 5:44 p.m. UTC | #7
On 06/04/2021 17:54, John Garry wrote:

Hi Robin,

>
>> Sorry if the phrasing was unclear there - the allusion to default
>> domains is new, it just occurred to me that what we do there is in
>> fact fairly close to what I've suggested previously for this. In that
>> case, we have a global policy set by the command line, which *can* be
>> overridden per-domain via sysfs at runtime, provided the user is
>> willing to tear the whole thing down. Using a similar approach here
>> would give a fair degree of flexibility but still mean that changes
>> never have to be made dynamically to a live domain.
>
> So are you saying that we can handle it similar to how we now can handle
> changing default domain for an IOMMU group via sysfs? If so, that just
> is not practical here. Reason being that this particular DMA engine
> provides the block device giving / mount point, so if we unbind the
> driver, we lose / mount point.
>
> And I am not sure if the end user would even know how to set such a
> tunable. Or, in this case, why the end user would not want the optimized
> range configured always.
>
> I'd still rather if the device driver could provide info which can be
> used to configure this before or during probing.


As a new solution, how about doing both of these:

a. Add a per-IOMMU group sysfs file to set this tunable. This works the
same as how we change the default domain, and has all the same
restrictions/steps. I think that this is what you are already suggesting.

b. Provide a DMA mapping API to set this value, similar to this current
series. In the IOMMU backend for that API, we record a new range value
and return -EPROBE_DEFER when successful. In the reprobe we reset the
default domain for the device's IOMMU group, with the IOVA domain rcache
range configured as previously requested. Again, allocating the new
default domain is similar to how we change the default domain type today.
This means that we don't play with a live domain. The downside is that we
need to defer the probe.
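
For (b), the driver-facing side might look roughly like the below
(the function name and constants are purely illustrative, nothing like
this exists yet):

	static int foo_probe(struct platform_device *pdev)
	{
		int ret;

		/* hypothetical API: hint the largest mapping size we care about */
		ret = dma_set_max_opt_size(&pdev->dev, SZ_1M);
		if (ret == -EPROBE_DEFER)
			/* group default domain is rebuilt with the new rcache range */
			return ret;
		...
	}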

Thanks,
John

Patch

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index af765c813cc8..15b7270a5c2a 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -429,14 +429,6 @@  static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
 
 	shift = iova_shift(iovad);
 	iova_len = size >> shift;
-	/*
-	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
-	 * will come back to bite us badly, so we have to waste a bit of space
-	 * rounding up anything cacheable to make sure that can't happen. The
-	 * order of the unadjusted size will still match upon freeing.
-	 */
-	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
-		iova_len = roundup_pow_of_two(iova_len);
 
 	dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);
 
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index e6e2fa85271c..e62e9e30b30c 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -179,7 +179,7 @@  iova_insert_rbtree(struct rb_root *root, struct iova *iova,
 
 static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 		unsigned long size, unsigned long limit_pfn,
-			struct iova *new, bool size_aligned)
+			struct iova *new, bool size_aligned, bool fast)
 {
 	struct rb_node *curr, *prev;
 	struct iova *curr_iova;
@@ -188,6 +188,15 @@  static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 	unsigned long align_mask = ~0UL;
 	unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn;
 
+	/*
+	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
+	 * will come back to bite us badly, so we have to waste a bit of space
+	 * rounding up anything cacheable to make sure that can't happen. The
+	 * order of the unadjusted size will still match upon freeing.
+	 */
+	if (fast && size < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+		size = roundup_pow_of_two(size);
+
 	if (size_aligned)
 		align_mask <<= fls_long(size - 1);
 
@@ -288,21 +297,10 @@  void iova_cache_put(void)
 }
 EXPORT_SYMBOL_GPL(iova_cache_put);
 
-/**
- * alloc_iova - allocates an iova
- * @iovad: - iova domain in question
- * @size: - size of page frames to allocate
- * @limit_pfn: - max limit address
- * @size_aligned: - set if size_aligned address range is required
- * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
- * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
- * flag is set then the allocated address iova->pfn_lo will be naturally
- * aligned on roundup_power_of_two(size).
- */
-struct iova *
-alloc_iova(struct iova_domain *iovad, unsigned long size,
+static struct iova *
+__alloc_iova(struct iova_domain *iovad, unsigned long size,
 	unsigned long limit_pfn,
-	bool size_aligned)
+	bool size_aligned, bool fast)
 {
 	struct iova *new_iova;
 	int ret;
@@ -312,7 +310,7 @@  alloc_iova(struct iova_domain *iovad, unsigned long size,
 		return NULL;
 
 	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
-			new_iova, size_aligned);
+			new_iova, size_aligned, fast);
 
 	if (ret) {
 		free_iova_mem(new_iova);
@@ -321,6 +319,25 @@  alloc_iova(struct iova_domain *iovad, unsigned long size,
 
 	return new_iova;
 }
+
+/**
+ * alloc_iova - allocates an iova
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * @size_aligned: - set if size_aligned address range is required
+ * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
+ * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
+ * flag is set then the allocated address iova->pfn_lo will be naturally
+ * aligned on roundup_power_of_two(size).
+ */
+struct iova *
+alloc_iova(struct iova_domain *iovad, unsigned long size,
+	unsigned long limit_pfn,
+	bool size_aligned)
+{
+	return __alloc_iova(iovad, size, limit_pfn, size_aligned, false);
+}
 EXPORT_SYMBOL_GPL(alloc_iova);
 
 static struct iova *
@@ -433,7 +450,7 @@  alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
 		return iova_pfn;
 
 retry:
-	new_iova = alloc_iova(iovad, size, limit_pfn, true);
+	new_iova = __alloc_iova(iovad, size, limit_pfn, true, true);
 	if (!new_iova) {
 		unsigned int cpu;