[v3,4/6] iommu/io-pgtable-arm: add support for non-strict mode

Message ID 1531376312-2192-5-git-send-email-thunder.leizhen@huawei.com
State New
Headers show
Series
  • add non-strict mode support for arm-smmu-v3
Related show

Commit Message

Leizhen (ThunderTown) July 12, 2018, 6:18 a.m.
To support the non-strict mode, now we only tlbi and sync for the strict
mode. But for the non-leaf case, always follow strict mode.

Use the lowest bit of the iova parameter to pass the strict mode:
0, IOMMU_STRICT;
1, IOMMU_NON_STRICT;
Treat 0 as IOMMU_STRICT, so that the unmap operation can compatible with
other IOMMUs which still use strict mode.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>

---
 drivers/iommu/io-pgtable-arm.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

-- 
1.8.3

Comments

Robin Murphy July 24, 2018, 10:25 p.m. | #1
On 2018-07-12 7:18 AM, Zhen Lei wrote:
> To support the non-strict mode, now we only tlbi and sync for the strict

> mode. But for the non-leaf case, always follow strict mode.

> 

> Use the lowest bit of the iova parameter to pass the strict mode:

> 0, IOMMU_STRICT;

> 1, IOMMU_NON_STRICT;

> Treat 0 as IOMMU_STRICT, so that the unmap operation can compatible with

> other IOMMUs which still use strict mode.

> 

> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>

> ---

>   drivers/iommu/io-pgtable-arm.c | 23 ++++++++++++++---------

>   1 file changed, 14 insertions(+), 9 deletions(-)

> 

> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c

> index 010a254..9234db3 100644

> --- a/drivers/iommu/io-pgtable-arm.c

> +++ b/drivers/iommu/io-pgtable-arm.c

> @@ -292,7 +292,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,

>   

>   static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>   			       unsigned long iova, size_t size, int lvl,

> -			       arm_lpae_iopte *ptep);

> +			       arm_lpae_iopte *ptep, int strict);

>   

>   static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>   				phys_addr_t paddr, arm_lpae_iopte prot,

> @@ -334,7 +334,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>   		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);

>   

>   		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);

> -		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))

> +		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp, IOMMU_STRICT) != sz))

>   			return -EINVAL;

>   	}

>   

> @@ -531,7 +531,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)

>   static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>   				       unsigned long iova, size_t size,

>   				       arm_lpae_iopte blk_pte, int lvl,

> -				       arm_lpae_iopte *ptep)

> +				       arm_lpae_iopte *ptep, int strict)


DMA code should never ever be splitting blocks anyway, and frankly the 
TLB maintenance here is dodgy enough (since we can't reasonably do 
break-before make as VMSA says we should) that I *really* don't want to 
introduce any possibility of making it more asynchronous. I'd much 
rather just hard-code the expectation of strict == true for this.

Robin.

>   {

>   	struct io_pgtable_cfg *cfg = &data->iop.cfg;

>   	arm_lpae_iopte pte, *tablep;

> @@ -576,15 +576,18 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>   	}

>   

>   	if (unmap_idx < 0)

> -		return __arm_lpae_unmap(data, iova, size, lvl, tablep);

> +		return __arm_lpae_unmap(data, iova, size, lvl, tablep, strict);

>   

>   	io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);

> +	if (!strict)

> +		io_pgtable_tlb_sync(&data->iop);

> +

>   	return size;

>   }

>   

>   static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>   			       unsigned long iova, size_t size, int lvl,

> -			       arm_lpae_iopte *ptep)

> +			       arm_lpae_iopte *ptep, int strict)

>   {

>   	arm_lpae_iopte pte;

>   	struct io_pgtable *iop = &data->iop;

> @@ -609,7 +612,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>   			io_pgtable_tlb_sync(iop);

>   			ptep = iopte_deref(pte, data);

>   			__arm_lpae_free_pgtable(data, lvl + 1, ptep);

> -		} else {

> +		} else if (strict) {

>   			io_pgtable_tlb_add_flush(iop, iova, size, size, true);

>   		}

>   

> @@ -620,25 +623,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>   		 * minus the part we want to unmap

>   		 */

>   		return arm_lpae_split_blk_unmap(data, iova, size, pte,

> -						lvl + 1, ptep);

> +						lvl + 1, ptep, strict);

>   	}

>   

>   	/* Keep on walkin' */

>   	ptep = iopte_deref(pte, data);

> -	return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep);

> +	return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep, strict);

>   }

>   

>   static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,

>   			     size_t size)

>   {

> +	int strict = ((iova & IOMMU_STRICT_MODE_MASK) == IOMMU_STRICT);

>   	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);

>   	arm_lpae_iopte *ptep = data->pgd;

>   	int lvl = ARM_LPAE_START_LVL(data);

>   

> +	iova &= ~IOMMU_STRICT_MODE_MASK;

>   	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))

>   		return 0;

>   

> -	return __arm_lpae_unmap(data, iova, size, lvl, ptep);

> +	return __arm_lpae_unmap(data, iova, size, lvl, ptep, strict);

>   }

>   

>   static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,

>
Leizhen (ThunderTown) July 26, 2018, 7:20 a.m. | #2
On 2018/7/25 6:25, Robin Murphy wrote:
> On 2018-07-12 7:18 AM, Zhen Lei wrote:

>> To support the non-strict mode, now we only tlbi and sync for the strict

>> mode. But for the non-leaf case, always follow strict mode.

>>

>> Use the lowest bit of the iova parameter to pass the strict mode:

>> 0, IOMMU_STRICT;

>> 1, IOMMU_NON_STRICT;

>> Treat 0 as IOMMU_STRICT, so that the unmap operation can compatible with

>> other IOMMUs which still use strict mode.

>>

>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>

>> ---

>>   drivers/iommu/io-pgtable-arm.c | 23 ++++++++++++++---------

>>   1 file changed, 14 insertions(+), 9 deletions(-)

>>

>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c

>> index 010a254..9234db3 100644

>> --- a/drivers/iommu/io-pgtable-arm.c

>> +++ b/drivers/iommu/io-pgtable-arm.c

>> @@ -292,7 +292,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,

>>     static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>                      unsigned long iova, size_t size, int lvl,

>> -                   arm_lpae_iopte *ptep);

>> +                   arm_lpae_iopte *ptep, int strict);

>>     static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>                   phys_addr_t paddr, arm_lpae_iopte prot,

>> @@ -334,7 +334,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>           size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);

>>             tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);

>> -        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))

>> +        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp, IOMMU_STRICT) != sz))

>>               return -EINVAL;

>>       }

>>   @@ -531,7 +531,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)

>>   static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>                          unsigned long iova, size_t size,

>>                          arm_lpae_iopte blk_pte, int lvl,

>> -                       arm_lpae_iopte *ptep)

>> +                       arm_lpae_iopte *ptep, int strict)

> 

> DMA code should never ever be splitting blocks anyway, and frankly the TLB maintenance here is dodgy enough (since we can't reasonably do break-before make as VMSA says we should) that I *really* don't want to introduce any possibility of making it more asynchronous. I'd much rather just hard-code the expectation of strict == true for this.


OK, I will hard-code strict=true for it.

But since it never ever be happened, why did not give a warning at the beginning?

> 

> Robin.

> 

>>   {

>>       struct io_pgtable_cfg *cfg = &data->iop.cfg;

>>       arm_lpae_iopte pte, *tablep;

>> @@ -576,15 +576,18 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>       }

>>         if (unmap_idx < 0)

>> -        return __arm_lpae_unmap(data, iova, size, lvl, tablep);

>> +        return __arm_lpae_unmap(data, iova, size, lvl, tablep, strict);

>>         io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);

>> +    if (!strict)

>> +        io_pgtable_tlb_sync(&data->iop);

>> +

>>       return size;

>>   }

>>     static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>                      unsigned long iova, size_t size, int lvl,

>> -                   arm_lpae_iopte *ptep)

>> +                   arm_lpae_iopte *ptep, int strict)

>>   {

>>       arm_lpae_iopte pte;

>>       struct io_pgtable *iop = &data->iop;

>> @@ -609,7 +612,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>               io_pgtable_tlb_sync(iop);

>>               ptep = iopte_deref(pte, data);

>>               __arm_lpae_free_pgtable(data, lvl + 1, ptep);

>> -        } else {

>> +        } else if (strict) {

>>               io_pgtable_tlb_add_flush(iop, iova, size, size, true);

>>           }

>>   @@ -620,25 +623,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>            * minus the part we want to unmap

>>            */

>>           return arm_lpae_split_blk_unmap(data, iova, size, pte,

>> -                        lvl + 1, ptep);

>> +                        lvl + 1, ptep, strict);

>>       }

>>         /* Keep on walkin' */

>>       ptep = iopte_deref(pte, data);

>> -    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep);

>> +    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep, strict);

>>   }

>>     static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,

>>                    size_t size)

>>   {

>> +    int strict = ((iova & IOMMU_STRICT_MODE_MASK) == IOMMU_STRICT);

>>       struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);

>>       arm_lpae_iopte *ptep = data->pgd;

>>       int lvl = ARM_LPAE_START_LVL(data);

>>   +    iova &= ~IOMMU_STRICT_MODE_MASK;

>>       if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))

>>           return 0;

>>   -    return __arm_lpae_unmap(data, iova, size, lvl, ptep);

>> +    return __arm_lpae_unmap(data, iova, size, lvl, ptep, strict);

>>   }

>>     static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,

>>

> 

> .

> 


-- 
Thanks!
BestRegards
Robin Murphy July 26, 2018, 2:35 p.m. | #3
On 2018-07-26 8:20 AM, Leizhen (ThunderTown) wrote:
> On 2018/7/25 6:25, Robin Murphy wrote:

>> On 2018-07-12 7:18 AM, Zhen Lei wrote:

>>> To support the non-strict mode, now we only tlbi and sync for the strict

>>> mode. But for the non-leaf case, always follow strict mode.

>>>

>>> Use the lowest bit of the iova parameter to pass the strict mode:

>>> 0, IOMMU_STRICT;

>>> 1, IOMMU_NON_STRICT;

>>> Treat 0 as IOMMU_STRICT, so that the unmap operation can compatible with

>>> other IOMMUs which still use strict mode.

>>>

>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>

>>> ---

>>>    drivers/iommu/io-pgtable-arm.c | 23 ++++++++++++++---------

>>>    1 file changed, 14 insertions(+), 9 deletions(-)

>>>

>>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c

>>> index 010a254..9234db3 100644

>>> --- a/drivers/iommu/io-pgtable-arm.c

>>> +++ b/drivers/iommu/io-pgtable-arm.c

>>> @@ -292,7 +292,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,

>>>      static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>                       unsigned long iova, size_t size, int lvl,

>>> -                   arm_lpae_iopte *ptep);

>>> +                   arm_lpae_iopte *ptep, int strict);

>>>      static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>>                    phys_addr_t paddr, arm_lpae_iopte prot,

>>> @@ -334,7 +334,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>>            size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);

>>>              tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);

>>> -        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))

>>> +        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp, IOMMU_STRICT) != sz))

>>>                return -EINVAL;

>>>        }

>>>    @@ -531,7 +531,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)

>>>    static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>>                           unsigned long iova, size_t size,

>>>                           arm_lpae_iopte blk_pte, int lvl,

>>> -                       arm_lpae_iopte *ptep)

>>> +                       arm_lpae_iopte *ptep, int strict)

>>

>> DMA code should never ever be splitting blocks anyway, and frankly the TLB maintenance here is dodgy enough (since we can't reasonably do break-before make as VMSA says we should) that I *really* don't want to introduce any possibility of making it more asynchronous. I'd much rather just hard-code the expectation of strict == true for this.

> 

> OK, I will hard-code strict=true for it.

> 

> But since it never ever be happened, why did not give a warning at the beginning?


Because DMA code is not the only caller of iommu_map/unmap. It's 
perfectly legal in the IOMMU API to partially unmap a previous mapping 
such that a block entry needs to be split. The DMA API, however, is a 
lot more constrined, and thus by construction the iommu-dma layer will 
never generate a block-splitting iommu_unmap() except as a result of 
illegal DMA API usage, and we obviously do not need to optimise for that 
(you will get a warning about mismatched unmaps under dma-debug, but 
it's a bit too expensive to police in the general case).

Robin.

>>>    {

>>>        struct io_pgtable_cfg *cfg = &data->iop.cfg;

>>>        arm_lpae_iopte pte, *tablep;

>>> @@ -576,15 +576,18 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>>        }

>>>          if (unmap_idx < 0)

>>> -        return __arm_lpae_unmap(data, iova, size, lvl, tablep);

>>> +        return __arm_lpae_unmap(data, iova, size, lvl, tablep, strict);

>>>          io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);

>>> +    if (!strict)

>>> +        io_pgtable_tlb_sync(&data->iop);

>>> +

>>>        return size;

>>>    }

>>>      static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>                       unsigned long iova, size_t size, int lvl,

>>> -                   arm_lpae_iopte *ptep)

>>> +                   arm_lpae_iopte *ptep, int strict)

>>>    {

>>>        arm_lpae_iopte pte;

>>>        struct io_pgtable *iop = &data->iop;

>>> @@ -609,7 +612,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>                io_pgtable_tlb_sync(iop);

>>>                ptep = iopte_deref(pte, data);

>>>                __arm_lpae_free_pgtable(data, lvl + 1, ptep);

>>> -        } else {

>>> +        } else if (strict) {

>>>                io_pgtable_tlb_add_flush(iop, iova, size, size, true);

>>>            }

>>>    @@ -620,25 +623,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>             * minus the part we want to unmap

>>>             */

>>>            return arm_lpae_split_blk_unmap(data, iova, size, pte,

>>> -                        lvl + 1, ptep);

>>> +                        lvl + 1, ptep, strict);

>>>        }

>>>          /* Keep on walkin' */

>>>        ptep = iopte_deref(pte, data);

>>> -    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep);

>>> +    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep, strict);

>>>    }

>>>      static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,

>>>                     size_t size)

>>>    {

>>> +    int strict = ((iova & IOMMU_STRICT_MODE_MASK) == IOMMU_STRICT);

>>>        struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);

>>>        arm_lpae_iopte *ptep = data->pgd;

>>>        int lvl = ARM_LPAE_START_LVL(data);

>>>    +    iova &= ~IOMMU_STRICT_MODE_MASK;

>>>        if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))

>>>            return 0;

>>>    -    return __arm_lpae_unmap(data, iova, size, lvl, ptep);

>>> +    return __arm_lpae_unmap(data, iova, size, lvl, ptep, strict);

>>>    }

>>>      static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,

>>>

>>

>> .

>>

>
Yang, Shunyong Aug. 6, 2018, 1:32 a.m. | #4
Hi, Robin,

On 2018/7/26 22:37, Robin Murphy wrote:
> On 2018-07-26 8:20 AM, Leizhen (ThunderTown) wrote:

>> On 2018/7/25 6:25, Robin Murphy wrote:

>>> On 2018-07-12 7:18 AM, Zhen Lei wrote:

>>>> To support the non-strict mode, now we only tlbi and sync for the strict

>>>> mode. But for the non-leaf case, always follow strict mode.

>>>>

>>>> Use the lowest bit of the iova parameter to pass the strict mode:

>>>> 0, IOMMU_STRICT;

>>>> 1, IOMMU_NON_STRICT;

>>>> Treat 0 as IOMMU_STRICT, so that the unmap operation can compatible with

>>>> other IOMMUs which still use strict mode.

>>>>

>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>

>>>> ---

>>>>    drivers/iommu/io-pgtable-arm.c | 23 ++++++++++++++---------

>>>>    1 file changed, 14 insertions(+), 9 deletions(-)

>>>>

>>>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c

>>>> index 010a254..9234db3 100644

>>>> --- a/drivers/iommu/io-pgtable-arm.c

>>>> +++ b/drivers/iommu/io-pgtable-arm.c

>>>> @@ -292,7 +292,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,

>>>>      static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>                       unsigned long iova, size_t size, int lvl,

>>>> -                   arm_lpae_iopte *ptep);

>>>> +                   arm_lpae_iopte *ptep, int strict);

>>>>      static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>>>                    phys_addr_t paddr, arm_lpae_iopte prot,

>>>> @@ -334,7 +334,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>>>            size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);

>>>>              tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);

>>>> -        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))

>>>> +        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp, IOMMU_STRICT) != sz))

>>>>                return -EINVAL;

>>>>        }

>>>>    @@ -531,7 +531,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)

>>>>    static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>>>                           unsigned long iova, size_t size,

>>>>                           arm_lpae_iopte blk_pte, int lvl,

>>>> -                       arm_lpae_iopte *ptep)

>>>> +                       arm_lpae_iopte *ptep, int strict)

>>>

>>> DMA code should never ever be splitting blocks anyway, and frankly the TLB maintenance here is dodgy enough (since we can't reasonably do break-before make as VMSA says we should) that I *really* don't want to introduce any possibility of making it more asynchronous. I'd much rather just hard-code the expectation of strict == true for this.

>>

>> OK, I will hard-code strict=true for it.

>>

>> But since it never ever be happened, why did not give a warning at the beginning?

> 

> Because DMA code is not the only caller of iommu_map/unmap. It's 

> perfectly legal in the IOMMU API to partially unmap a previous mapping 

> such that a block entry needs to be split. The DMA API, however, is a 

> lot more constrined, and thus by construction the iommu-dma layer will 

> never generate a block-splitting iommu_unmap() except as a result of 

> illegal DMA API usage, and we obviously do not need to optimise for that 

> (you will get a warning about mismatched unmaps under dma-debug, but 

> it's a bit too expensive to police in the general case).

> 


When I was reading the code around arm_lpae_split_blk_unmap(), I was
curious in which scenario a block will be split. Now with your comments
"Because DMA code is not the only caller of iommu_map/unmap", it seems
depending on the user.

Would you please explain this further? I mean besides DMA, which user
will use iommu_map/umap and how it split a block.

Thanks.
Shunyong.

> 

>>>>    {

>>>>        struct io_pgtable_cfg *cfg = &data->iop.cfg;

>>>>        arm_lpae_iopte pte, *tablep;

>>>> @@ -576,15 +576,18 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>>>        }

>>>>          if (unmap_idx < 0)

>>>> -        return __arm_lpae_unmap(data, iova, size, lvl, tablep);

>>>> +        return __arm_lpae_unmap(data, iova, size, lvl, tablep, strict);

>>>>          io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);

>>>> +    if (!strict)

>>>> +        io_pgtable_tlb_sync(&data->iop);

>>>> +

>>>>        return size;

>>>>    }

>>>>      static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>                       unsigned long iova, size_t size, int lvl,

>>>> -                   arm_lpae_iopte *ptep)

>>>> +                   arm_lpae_iopte *ptep, int strict)

>>>>    {

>>>>        arm_lpae_iopte pte;

>>>>        struct io_pgtable *iop = &data->iop;

>>>> @@ -609,7 +612,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>                io_pgtable_tlb_sync(iop);

>>>>                ptep = iopte_deref(pte, data);

>>>>                __arm_lpae_free_pgtable(data, lvl + 1, ptep);

>>>> -        } else {

>>>> +        } else if (strict) {

>>>>                io_pgtable_tlb_add_flush(iop, iova, size, size, true);

>>>>            }

>>>>    @@ -620,25 +623,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>             * minus the part we want to unmap

>>>>             */

>>>>            return arm_lpae_split_blk_unmap(data, iova, size, pte,

>>>> -                        lvl + 1, ptep);

>>>> +                        lvl + 1, ptep, strict);

>>>>        }

>>>>          /* Keep on walkin' */

>>>>        ptep = iopte_deref(pte, data);

>>>> -    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep);

>>>> +    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep, strict);

>>>>    }

>>>>      static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,

>>>>                     size_t size)

>>>>    {

>>>> +    int strict = ((iova & IOMMU_STRICT_MODE_MASK) == IOMMU_STRICT);

>>>>        struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);

>>>>        arm_lpae_iopte *ptep = data->pgd;

>>>>        int lvl = ARM_LPAE_START_LVL(data);

>>>>    +    iova &= ~IOMMU_STRICT_MODE_MASK;

>>>>        if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))

>>>>            return 0;

>>>>    -    return __arm_lpae_unmap(data, iova, size, lvl, ptep);

>>>> +    return __arm_lpae_unmap(data, iova, size, lvl, ptep, strict);

>>>>    }

>>>>      static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,

>>>>

>>>

>>> .

>>>

>>

> _______________________________________________

> iommu mailing list

> iommu@lists.linux-foundation.org

> https://lists.linuxfoundation.org/mailman/listinfo/iommu

>
Leizhen (ThunderTown) Aug. 14, 2018, 8:33 a.m. | #5
On 2018/8/6 9:32, Yang, Shunyong wrote:
> Hi, Robin,

> 

> On 2018/7/26 22:37, Robin Murphy wrote:

>> On 2018-07-26 8:20 AM, Leizhen (ThunderTown) wrote:

>>> On 2018/7/25 6:25, Robin Murphy wrote:

>>>> On 2018-07-12 7:18 AM, Zhen Lei wrote:

>>>>> To support the non-strict mode, now we only tlbi and sync for the strict

>>>>> mode. But for the non-leaf case, always follow strict mode.

>>>>>

>>>>> Use the lowest bit of the iova parameter to pass the strict mode:

>>>>> 0, IOMMU_STRICT;

>>>>> 1, IOMMU_NON_STRICT;

>>>>> Treat 0 as IOMMU_STRICT, so that the unmap operation can compatible with

>>>>> other IOMMUs which still use strict mode.

>>>>>

>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>

>>>>> ---

>>>>>    drivers/iommu/io-pgtable-arm.c | 23 ++++++++++++++---------

>>>>>    1 file changed, 14 insertions(+), 9 deletions(-)

>>>>>

>>>>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c

>>>>> index 010a254..9234db3 100644

>>>>> --- a/drivers/iommu/io-pgtable-arm.c

>>>>> +++ b/drivers/iommu/io-pgtable-arm.c

>>>>> @@ -292,7 +292,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,

>>>>>      static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>>                       unsigned long iova, size_t size, int lvl,

>>>>> -                   arm_lpae_iopte *ptep);

>>>>> +                   arm_lpae_iopte *ptep, int strict);

>>>>>      static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>>>>                    phys_addr_t paddr, arm_lpae_iopte prot,

>>>>> @@ -334,7 +334,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,

>>>>>            size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);

>>>>>              tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);

>>>>> -        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))

>>>>> +        if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp, IOMMU_STRICT) != sz))

>>>>>                return -EINVAL;

>>>>>        }

>>>>>    @@ -531,7 +531,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)

>>>>>    static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>>>>                           unsigned long iova, size_t size,

>>>>>                           arm_lpae_iopte blk_pte, int lvl,

>>>>> -                       arm_lpae_iopte *ptep)

>>>>> +                       arm_lpae_iopte *ptep, int strict)

>>>>

>>>> DMA code should never ever be splitting blocks anyway, and frankly the TLB maintenance here is dodgy enough (since we can't reasonably do break-before make as VMSA says we should) that I *really* don't want to introduce any possibility of making it more asynchronous. I'd much rather just hard-code the expectation of strict == true for this.

>>>

>>> OK, I will hard-code strict=true for it.

>>>

>>> But since it never ever be happened, why did not give a warning at the beginning?

>>

>> Because DMA code is not the only caller of iommu_map/unmap. It's 

>> perfectly legal in the IOMMU API to partially unmap a previous mapping 

>> such that a block entry needs to be split. The DMA API, however, is a 

>> lot more constrined, and thus by construction the iommu-dma layer will 

>> never generate a block-splitting iommu_unmap() except as a result of 

>> illegal DMA API usage, and we obviously do not need to optimise for that 

>> (you will get a warning about mismatched unmaps under dma-debug, but 

>> it's a bit too expensive to police in the general case).

>>

> 

> When I was reading the code around arm_lpae_split_blk_unmap(), I was

> curious in which scenario a block will be split. Now with your comments

> "Because DMA code is not the only caller of iommu_map/unmap", it seems

> depending on the user.

> 

> Would you please explain this further? I mean besides DMA, which user

> will use iommu_map/umap and how it split a block.


I also think that arm_lpae_split_blk_unmap() scenario is not exist, maybe
we should remove it, and give a warning for this wrong usage.

> 

> Thanks.

> Shunyong.

> 

>>

>>>>>    {

>>>>>        struct io_pgtable_cfg *cfg = &data->iop.cfg;

>>>>>        arm_lpae_iopte pte, *tablep;

>>>>> @@ -576,15 +576,18 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,

>>>>>        }

>>>>>          if (unmap_idx < 0)

>>>>> -        return __arm_lpae_unmap(data, iova, size, lvl, tablep);

>>>>> +        return __arm_lpae_unmap(data, iova, size, lvl, tablep, strict);

>>>>>          io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);

>>>>> +    if (!strict)

>>>>> +        io_pgtable_tlb_sync(&data->iop);

>>>>> +

>>>>>        return size;

>>>>>    }

>>>>>      static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>>                       unsigned long iova, size_t size, int lvl,

>>>>> -                   arm_lpae_iopte *ptep)

>>>>> +                   arm_lpae_iopte *ptep, int strict)

>>>>>    {

>>>>>        arm_lpae_iopte pte;

>>>>>        struct io_pgtable *iop = &data->iop;

>>>>> @@ -609,7 +612,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>>                io_pgtable_tlb_sync(iop);

>>>>>                ptep = iopte_deref(pte, data);

>>>>>                __arm_lpae_free_pgtable(data, lvl + 1, ptep);

>>>>> -        } else {

>>>>> +        } else if (strict) {

>>>>>                io_pgtable_tlb_add_flush(iop, iova, size, size, true);

>>>>>            }

>>>>>    @@ -620,25 +623,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,

>>>>>             * minus the part we want to unmap

>>>>>             */

>>>>>            return arm_lpae_split_blk_unmap(data, iova, size, pte,

>>>>> -                        lvl + 1, ptep);

>>>>> +                        lvl + 1, ptep, strict);

>>>>>        }

>>>>>          /* Keep on walkin' */

>>>>>        ptep = iopte_deref(pte, data);

>>>>> -    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep);

>>>>> +    return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep, strict);

>>>>>    }

>>>>>      static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,

>>>>>                     size_t size)

>>>>>    {

>>>>> +    int strict = ((iova & IOMMU_STRICT_MODE_MASK) == IOMMU_STRICT);

>>>>>        struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);

>>>>>        arm_lpae_iopte *ptep = data->pgd;

>>>>>        int lvl = ARM_LPAE_START_LVL(data);

>>>>>    +    iova &= ~IOMMU_STRICT_MODE_MASK;

>>>>>        if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))

>>>>>            return 0;

>>>>>    -    return __arm_lpae_unmap(data, iova, size, lvl, ptep);

>>>>> +    return __arm_lpae_unmap(data, iova, size, lvl, ptep, strict);

>>>>>    }

>>>>>      static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,

>>>>>

>>>>

>>>> .

>>>>

>>>

>> _______________________________________________

>> iommu mailing list

>> iommu@lists.linux-foundation.org

>> https://lists.linuxfoundation.org/mailman/listinfo/iommu

>>

> 

> 

> .

> 


-- 
Thanks!
BestRegards
Will Deacon Aug. 14, 2018, 8:35 a.m. | #6
On Tue, Aug 14, 2018 at 04:33:41PM +0800, Leizhen (ThunderTown) wrote:
> On 2018/8/6 9:32, Yang, Shunyong wrote:

> > On 2018/7/26 22:37, Robin Murphy wrote:

> >> Because DMA code is not the only caller of iommu_map/unmap. It's 

> >> perfectly legal in the IOMMU API to partially unmap a previous mapping 

> >> such that a block entry needs to be split. The DMA API, however, is a 

> >> lot more constrined, and thus by construction the iommu-dma layer will 

> >> never generate a block-splitting iommu_unmap() except as a result of 

> >> illegal DMA API usage, and we obviously do not need to optimise for that 

> >> (you will get a warning about mismatched unmaps under dma-debug, but 

> >> it's a bit too expensive to police in the general case).

> >>

> > 

> > When I was reading the code around arm_lpae_split_blk_unmap(), I was

> > curious in which scenario a block will be split. Now with your comments

> > "Because DMA code is not the only caller of iommu_map/unmap", it seems

> > depending on the user.

> > 

> > Would you please explain this further? I mean besides DMA, which user

> > will use iommu_map/umap and how it split a block.

> 

> I also think that arm_lpae_split_blk_unmap() scenario is not exist, maybe

> we should remove it, and give a warning for this wrong usage.


Can't it happen with VFIO?

Will
Robin Murphy Aug. 14, 2018, 10:02 a.m. | #7
On 14/08/18 09:35, Will Deacon wrote:
> On Tue, Aug 14, 2018 at 04:33:41PM +0800, Leizhen (ThunderTown) wrote:

>> On 2018/8/6 9:32, Yang, Shunyong wrote:

>>> On 2018/7/26 22:37, Robin Murphy wrote:

>>>> Because DMA code is not the only caller of iommu_map/unmap. It's

>>>> perfectly legal in the IOMMU API to partially unmap a previous mapping

>>>> such that a block entry needs to be split. The DMA API, however, is a

>>>> lot more constrined, and thus by construction the iommu-dma layer will

>>>> never generate a block-splitting iommu_unmap() except as a result of

>>>> illegal DMA API usage, and we obviously do not need to optimise for that

>>>> (you will get a warning about mismatched unmaps under dma-debug, but

>>>> it's a bit too expensive to police in the general case).

>>>>

>>>

>>> When I was reading the code around arm_lpae_split_blk_unmap(), I was

>>> curious in which scenario a block will be split. Now with your comments

>>> "Because DMA code is not the only caller of iommu_map/unmap", it seems

>>> depending on the user.

>>>

>>> Would you please explain this further? I mean besides DMA, which user

>>> will use iommu_map/umap and how it split a block.

>>

>> I also think that arm_lpae_split_blk_unmap() scenario is not exist, maybe

>> we should remove it, and give a warning for this wrong usage.

> 

> Can't it happen with VFIO?


...or GPU drivers, or anyone else managing their own IOMMU domain 
directly. A sequence like this is perfectly legal:

	iommu_map(domain, iova, paddr, SZ_8M, prot);
	...
	iommu_unmap(domain, iova + SZ_1M * 5, SZ_1M * 3);

where if iova and paddr happen to be suitably aligned, the map will lay 
down blocks, and the unmap will then have to split one of them into 
pages to remove half of it. We don't tear our hair out maintaining 
split_blk_unmap() for the fun of it :(

Robin.
Yang, Shunyong Aug. 15, 2018, 1:43 a.m. | #8
Hi, Robin,

On Tue, 2018-08-14 at 11:02 +0100, Robin Murphy wrote:
> On 14/08/18 09:35, Will Deacon wrote:

> > On Tue, Aug 14, 2018 at 04:33:41PM +0800, Leizhen (ThunderTown)

> > wrote:

> > > On 2018/8/6 9:32, Yang, Shunyong wrote:

> > > > On 2018/7/26 22:37, Robin Murphy wrote:

> > > > > Because DMA code is not the only caller of iommu_map/unmap.

> > > > > It's

> > > > > perfectly legal in the IOMMU API to partially unmap a

> > > > > previous mapping

> > > > > such that a block entry needs to be split. The DMA API,

> > > > > however, is a

> > > > > lot more constrined, and thus by construction the iommu-dma

> > > > > layer will

> > > > > never generate a block-splitting iommu_unmap() except as a

> > > > > result of

> > > > > illegal DMA API usage, and we obviously do not need to

> > > > > optimise for that

> > > > > (you will get a warning about mismatched unmaps under dma-

> > > > > debug, but

> > > > > it's a bit too expensive to police in the general case).

> > > > > 

> > > > 

> > > > When I was reading the code around arm_lpae_split_blk_unmap(),

> > > > I was

> > > > curious in which scenario a block will be split. Now with your

> > > > comments

> > > > "Because DMA code is not the only caller of iommu_map/unmap",

> > > > it seems

> > > > depending on the user.

> > > > 

> > > > Would you please explain this further? I mean besides DMA,

> > > > which user

> > > > will use iommu_map/umap and how it split a block.

> > > 

> > > I also think that arm_lpae_split_blk_unmap() scenario is not

> > > exist, maybe

> > > we should remove it, and give a warning for this wrong usage.

> > 

> > Can't it happen with VFIO?

> 

> ...or GPU drivers, or anyone else managing their own IOMMU domain 

> directly. A sequence like this is perfectly legal:

> 

> 	iommu_map(domain, iova, paddr, SZ_8M, prot);

> 	...

> 	iommu_unmap(domain, iova + SZ_1M * 5, SZ_1M * 3);

> 

> where if iova and paddr happen to be suitably aligned, the map will

> lay 

> down blocks, and the unmap will then have to split one of them into 

> pages to remove half of it. We don't tear our hair out maintaining 

> split_blk_unmap() for the fun of it :(


Thank you for the GPU example. But for VFIO, I remember all memory will
be   pinned in the early stage of emulator (such as qemu) start. So,
the split will occur at which operation? Maybe virtio balloon inflate?

Thanks.
Shunyong.

> 

> Robin.
Will Deacon Aug. 15, 2018, 7:33 a.m. | #9
On Wed, Aug 15, 2018 at 01:43:37AM +0000, Yang, Shunyong wrote:
> On Tue, 2018-08-14 at 11:02 +0100, Robin Murphy wrote:

> > On 14/08/18 09:35, Will Deacon wrote:

> > > On Tue, Aug 14, 2018 at 04:33:41PM +0800, Leizhen (ThunderTown)

> > > wrote:

> > > > On 2018/8/6 9:32, Yang, Shunyong wrote:

> > > > > On 2018/7/26 22:37, Robin Murphy wrote:

> > > > > > Because DMA code is not the only caller of iommu_map/unmap.

> > > > > > It's

> > > > > > perfectly legal in the IOMMU API to partially unmap a

> > > > > > previous mapping

> > > > > > such that a block entry needs to be split. The DMA API,

> > > > > > however, is a

> > > > > > lot more constrined, and thus by construction the iommu-dma

> > > > > > layer will

> > > > > > never generate a block-splitting iommu_unmap() except as a

> > > > > > result of

> > > > > > illegal DMA API usage, and we obviously do not need to

> > > > > > optimise for that

> > > > > > (you will get a warning about mismatched unmaps under dma-

> > > > > > debug, but

> > > > > > it's a bit too expensive to police in the general case).

> > > > > >

> > > > >

> > > > > When I was reading the code around arm_lpae_split_blk_unmap(),

> > > > > I was

> > > > > curious in which scenario a block will be split. Now with your

> > > > > comments

> > > > > "Because DMA code is not the only caller of iommu_map/unmap",

> > > > > it seems

> > > > > depending on the user.

> > > > >

> > > > > Would you please explain this further? I mean besides DMA,

> > > > > which user

> > > > > will use iommu_map/umap and how it split a block.

> > > >

> > > > I also think that arm_lpae_split_blk_unmap() scenario is not

> > > > exist, maybe

> > > > we should remove it, and give a warning for this wrong usage.

> > >

> > > Can't it happen with VFIO?

> >

> > ...or GPU drivers, or anyone else managing their own IOMMU domain

> > directly. A sequence like this is perfectly legal:

> >

> >     iommu_map(domain, iova, paddr, SZ_8M, prot);

> >     ...

> >     iommu_unmap(domain, iova + SZ_1M * 5, SZ_1M * 3);

> >

> > where if iova and paddr happen to be suitably aligned, the map will

> > lay

> > down blocks, and the unmap will then have to split one of them into

> > pages to remove half of it. We don't tear our hair out maintaining

> > split_blk_unmap() for the fun of it :(

>

> Thank you for the GPU example. But for VFIO, I remember all memory will

> be   pinned in the early stage of emulator (such as qemu) start. So,

> the split will occur at which operation? Maybe virtio balloon inflate?


My memory is pretty hazy here, but I was fairly sure that VFIO didn't
always unmap() with the same granularity as it map()'d, at least for
the v1 interface. Either way, split_blk_unmap() was written because it was
necessary at the time, rather than just for fun!

Will
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Will Deacon Aug. 15, 2018, 7:35 a.m. | #10
On Wed, Aug 15, 2018 at 08:33:01AM +0100, Will Deacon wrote:
> On Wed, Aug 15, 2018 at 01:43:37AM +0000, Yang, Shunyong wrote:

> > On Tue, 2018-08-14 at 11:02 +0100, Robin Murphy wrote:

> > > On 14/08/18 09:35, Will Deacon wrote:

> > > > On Tue, Aug 14, 2018 at 04:33:41PM +0800, Leizhen (ThunderTown)

> > > > wrote:

> > > > > On 2018/8/6 9:32, Yang, Shunyong wrote:

> > > > > > On 2018/7/26 22:37, Robin Murphy wrote:

> > > > > > > Because DMA code is not the only caller of iommu_map/unmap.

> > > > > > > It's

> > > > > > > perfectly legal in the IOMMU API to partially unmap a

> > > > > > > previous mapping

> > > > > > > such that a block entry needs to be split. The DMA API,

> > > > > > > however, is a

> > > > > > > lot more constrined, and thus by construction the iommu-dma

> > > > > > > layer will

> > > > > > > never generate a block-splitting iommu_unmap() except as a

> > > > > > > result of

> > > > > > > illegal DMA API usage, and we obviously do not need to

> > > > > > > optimise for that

> > > > > > > (you will get a warning about mismatched unmaps under dma-

> > > > > > > debug, but

> > > > > > > it's a bit too expensive to police in the general case).

> > > > > > >

> > > > > >

> > > > > > When I was reading the code around arm_lpae_split_blk_unmap(),

> > > > > > I was

> > > > > > curious in which scenario a block will be split. Now with your

> > > > > > comments

> > > > > > "Because DMA code is not the only caller of iommu_map/unmap",

> > > > > > it seems

> > > > > > depending on the user.

> > > > > >

> > > > > > Would you please explain this further? I mean besides DMA,

> > > > > > which user

> > > > > > will use iommu_map/umap and how it split a block.

> > > > >

> > > > > I also think that arm_lpae_split_blk_unmap() scenario is not

> > > > > exist, maybe

> > > > > we should remove it, and give a warning for this wrong usage.

> > > >

> > > > Can't it happen with VFIO?

> > >

> > > ...or GPU drivers, or anyone else managing their own IOMMU domain

> > > directly. A sequence like this is perfectly legal:

> > >

> > >     iommu_map(domain, iova, paddr, SZ_8M, prot);

> > >     ...

> > >     iommu_unmap(domain, iova + SZ_1M * 5, SZ_1M * 3);

> > >

> > > where if iova and paddr happen to be suitably aligned, the map will

> > > lay

> > > down blocks, and the unmap will then have to split one of them into

> > > pages to remove half of it. We don't tear our hair out maintaining

> > > split_blk_unmap() for the fun of it :(

> >

> > Thank you for the GPU example. But for VFIO, I remember all memory will

> > be   pinned in the early stage of emulator (such as qemu) start. So,

> > the split will occur at which operation? Maybe virtio balloon inflate?

> 

> My memory is pretty hazy here, but I was fairly sure that VFIO didn't

> always unmap() with the same granularity as it map()'d, at least for

> the v1 interface. Either way, split_blk_unmap() was written because it was

> necessary at the time, rather than just for fun!

> 

> Will

> IMPORTANT NOTICE: The contents of this email and any attachments are

> confidential and may also be privileged. If you are not the intended

> recipient, please notify the sender immediately and do not disclose the

> contents to any other person, use it for any purpose, or store or copy the

> information in any medium. Thank you.


Urgh, sorry about this threatening disclaimer ^^. Please disregard.

Will
Yang, Shunyong Aug. 16, 2018, 12:43 a.m. | #11
Hi, Will and Robin,

  Many thanks for your explanations.

Thanks.
Shunyong.

On 2018/8/15 15:35, Will Deacon wrote:
> On Wed, Aug 15, 2018 at 08:33:01AM +0100, Will Deacon wrote:

>> On Wed, Aug 15, 2018 at 01:43:37AM +0000, Yang, Shunyong wrote:

>>> On Tue, 2018-08-14 at 11:02 +0100, Robin Murphy wrote:

>>>> On 14/08/18 09:35, Will Deacon wrote:

>>>>> On Tue, Aug 14, 2018 at 04:33:41PM +0800, Leizhen (ThunderTown)

>>>>> wrote:

>>>>>> On 2018/8/6 9:32, Yang, Shunyong wrote:

>>>>>>> On 2018/7/26 22:37, Robin Murphy wrote:

>>>>>>>> Because DMA code is not the only caller of iommu_map/unmap.

>>>>>>>> It's

>>>>>>>> perfectly legal in the IOMMU API to partially unmap a

>>>>>>>> previous mapping

>>>>>>>> such that a block entry needs to be split. The DMA API,

>>>>>>>> however, is a

>>>>>>>> lot more constrined, and thus by construction the iommu-dma

>>>>>>>> layer will

>>>>>>>> never generate a block-splitting iommu_unmap() except as a

>>>>>>>> result of

>>>>>>>> illegal DMA API usage, and we obviously do not need to

>>>>>>>> optimise for that

>>>>>>>> (you will get a warning about mismatched unmaps under dma-

>>>>>>>> debug, but

>>>>>>>> it's a bit too expensive to police in the general case).

>>>>>>>>

>>>>>>>

>>>>>>> When I was reading the code around arm_lpae_split_blk_unmap(),

>>>>>>> I was

>>>>>>> curious in which scenario a block will be split. Now with your

>>>>>>> comments

>>>>>>> "Because DMA code is not the only caller of iommu_map/unmap",

>>>>>>> it seems

>>>>>>> depending on the user.

>>>>>>>

>>>>>>> Would you please explain this further? I mean besides DMA,

>>>>>>> which user

>>>>>>> will use iommu_map/umap and how it split a block.

>>>>>>

>>>>>> I also think that arm_lpae_split_blk_unmap() scenario is not

>>>>>> exist, maybe

>>>>>> we should remove it, and give a warning for this wrong usage.

>>>>>

>>>>> Can't it happen with VFIO?

>>>>

>>>> ...or GPU drivers, or anyone else managing their own IOMMU domain

>>>> directly. A sequence like this is perfectly legal:

>>>>

>>>>     iommu_map(domain, iova, paddr, SZ_8M, prot);

>>>>     ...

>>>>     iommu_unmap(domain, iova + SZ_1M * 5, SZ_1M * 3);

>>>>

>>>> where if iova and paddr happen to be suitably aligned, the map will

>>>> lay

>>>> down blocks, and the unmap will then have to split one of them into

>>>> pages to remove half of it. We don't tear our hair out maintaining

>>>> split_blk_unmap() for the fun of it :(

>>>

>>> Thank you for the GPU example. But for VFIO, I remember all memory will

>>> be   pinned in the early stage of emulator (such as qemu) start. So,

>>> the split will occur at which operation? Maybe virtio balloon inflate?

>>

>> My memory is pretty hazy here, but I was fairly sure that VFIO didn't

>> always unmap() with the same granularity as it map()'d, at least for

>> the v1 interface. Either way, split_blk_unmap() was written because it was

>> necessary at the time, rather than just for fun!

>>

>> Will

>> IMPORTANT NOTICE: The contents of this email and any attachments are

>> confidential and may also be privileged. If you are not the intended

>> recipient, please notify the sender immediately and do not disclose the

>> contents to any other person, use it for any purpose, or store or copy the

>> information in any medium. Thank you.

> 

> Urgh, sorry about this threatening disclaimer ^^. Please disregard.

> 

> Will

>

Patch

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 010a254..9234db3 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -292,7 +292,7 @@  static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
 
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			       unsigned long iova, size_t size, int lvl,
-			       arm_lpae_iopte *ptep);
+			       arm_lpae_iopte *ptep, int strict);
 
 static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 				phys_addr_t paddr, arm_lpae_iopte prot,
@@ -334,7 +334,7 @@  static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
 
 		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
-		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
+		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp, IOMMU_STRICT) != sz))
 			return -EINVAL;
 	}
 
@@ -531,7 +531,7 @@  static void arm_lpae_free_pgtable(struct io_pgtable *iop)
 static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
 				       unsigned long iova, size_t size,
 				       arm_lpae_iopte blk_pte, int lvl,
-				       arm_lpae_iopte *ptep)
+				       arm_lpae_iopte *ptep, int strict)
 {
 	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 	arm_lpae_iopte pte, *tablep;
@@ -576,15 +576,18 @@  static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
 	}
 
 	if (unmap_idx < 0)
-		return __arm_lpae_unmap(data, iova, size, lvl, tablep);
+		return __arm_lpae_unmap(data, iova, size, lvl, tablep, strict);
 
 	io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true);
+	if (!strict)
+		io_pgtable_tlb_sync(&data->iop);
+
 	return size;
 }
 
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			       unsigned long iova, size_t size, int lvl,
-			       arm_lpae_iopte *ptep)
+			       arm_lpae_iopte *ptep, int strict)
 {
 	arm_lpae_iopte pte;
 	struct io_pgtable *iop = &data->iop;
@@ -609,7 +612,7 @@  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			io_pgtable_tlb_sync(iop);
 			ptep = iopte_deref(pte, data);
 			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
-		} else {
+		} else if (strict) {
 			io_pgtable_tlb_add_flush(iop, iova, size, size, true);
 		}
 
@@ -620,25 +623,27 @@  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 		 * minus the part we want to unmap
 		 */
 		return arm_lpae_split_blk_unmap(data, iova, size, pte,
-						lvl + 1, ptep);
+						lvl + 1, ptep, strict);
 	}
 
 	/* Keep on walkin' */
 	ptep = iopte_deref(pte, data);
-	return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep);
+	return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep, strict);
 }
 
 static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
 			     size_t size)
 {
+	int strict = ((iova & IOMMU_STRICT_MODE_MASK) == IOMMU_STRICT);
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 	arm_lpae_iopte *ptep = data->pgd;
 	int lvl = ARM_LPAE_START_LVL(data);
 
+	iova &= ~IOMMU_STRICT_MODE_MASK;
 	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))
 		return 0;
 
-	return __arm_lpae_unmap(data, iova, size, lvl, ptep);
+	return __arm_lpae_unmap(data, iova, size, lvl, ptep, strict);
 }
 
 static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,