diff mbox series

[v2,2/8] iommu/arm-smmu-v3: Use S2FWB when available

Message ID 2-v2-621370057090+91fec-smmuv3_nesting_jgg@nvidia.com
State New
Headers show
Series Initial support for SMMUv3 nested translation | expand

Commit Message

Jason Gunthorpe Aug. 27, 2024, 3:51 p.m. UTC
Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field
works. When S2FWB is supported and enabled the IOPTE will force cachable
access to IOMMU_CACHE memory when nesting with a S1 and deny cachable
access otherwise.

When using a single stage of translation, a simple S2 domain, it doesn't
change anything as it is just a different encoding for the exsting mapping
of the IOMMU protection flags to cachability attributes.

However, when used with a nested S1, FWB has the effect of preventing the
guest from choosing a MemAttr in it's S1 that would cause ordinary DMA to
bypass the cache. Consistent with KVM we wish to deny the guest the
ability to become incoherent with cached memory the hypervisor believes is
cachable so we don't have to flush it.

Turn on S2FWB whenever the SMMU supports it and use it for all S2
mappings.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 +++++++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  3 +++
 drivers/iommu/io-pgtable-arm.c              | 27 +++++++++++++++++----
 include/linux/io-pgtable.h                  |  2 ++
 4 files changed, 38 insertions(+), 5 deletions(-)

Comments

Jason Gunthorpe Aug. 28, 2024, 6:30 p.m. UTC | #1
On Tue, Aug 27, 2024 at 12:48:13PM -0700, Nicolin Chen wrote:
> Hi Jason,
> 
> On Tue, Aug 27, 2024 at 12:51:32PM -0300, Jason Gunthorpe wrote:
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > index f5d9fd1f45bf49..9b3658aae21005 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -106,6 +106,18 @@
> >  #define ARM_LPAE_PTE_HAP_FAULT		(((arm_lpae_iopte)0) << 6)
> >  #define ARM_LPAE_PTE_HAP_READ		(((arm_lpae_iopte)1) << 6)
> >  #define ARM_LPAE_PTE_HAP_WRITE		(((arm_lpae_iopte)2) << 6)
> > +/*
> > + * For !FWB these code to:
> > + *  1111 = Normal outer write back cachable / Inner Write Back Cachable
> > + *         Permit S1 to override
> > + *  0101 = Normal Non-cachable / Inner Non-cachable
> > + *  0001 = Device / Device-nGnRE
> > + * For S2FWB these code:
> > + *  0110 Force Normal Write Back
> > + *  0101 Normal* is forced Normal-NC, Device unchanged
> > + *  0001 Force Device-nGnRE
> > + */
> > +#define ARM_LPAE_PTE_MEMATTR_FWB_WB	(((arm_lpae_iopte)0x6) << 2)
> 
> The other part looks good. Yet, would you mind sharing the location
> that defines this 0x6 explicitly?

I'm looking at an older one ARM DDI 0487F.c

D5.5.5 Stage 2 memory region type and Cacheability attributes when FEAT_S2FWB is implemented

The text talks about the bits in the PTE, not relative to the MEMATTR
field, so 6 << 2 encodes to:

 543210
 011000

Then see table D5-40 Effect of bit[4] == 1 on Cacheability and Memory Type)

 Bit[5] = 0 = is RES0.
 Bit[4] = 1 = determines the interpretation of bits [3:2].
 Bits[3:2] == 10 == Normal Write-Back 

Here Bit means 'bit of the PTE' because the MemAttr does not have 5
bits.

> Where it has the followings in D8.6.6:
>  "For stage 2 translations, if FEAT_MTE_PERM is not implemented, then
>   FEAT_S2FWB has all of the following effects on the MemAttr[3:2] bits:
>    - MemAttr[3] is RES0.
>    - The value of MemAttr[2] determines the interpretation of the
>      MemAttr[1:0] bits.

And here the text switches from talking about the PTE bits to the
Field bits. MemAttr[3] == PTE[5], and the above text matches D5.5

The use of numbering schemes relative to the start of the field and
also relative to the PTE is tricky.

Jason
Nicolin Chen Aug. 28, 2024, 7:47 p.m. UTC | #2
On Wed, Aug 28, 2024 at 03:30:37PM -0300, Jason Gunthorpe wrote:
> On Tue, Aug 27, 2024 at 12:48:13PM -0700, Nicolin Chen wrote:
> > Hi Jason,
> > 
> > On Tue, Aug 27, 2024 at 12:51:32PM -0300, Jason Gunthorpe wrote:
> > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > > index f5d9fd1f45bf49..9b3658aae21005 100644
> > > --- a/drivers/iommu/io-pgtable-arm.c
> > > +++ b/drivers/iommu/io-pgtable-arm.c
> > > @@ -106,6 +106,18 @@
> > >  #define ARM_LPAE_PTE_HAP_FAULT		(((arm_lpae_iopte)0) << 6)
> > >  #define ARM_LPAE_PTE_HAP_READ		(((arm_lpae_iopte)1) << 6)
> > >  #define ARM_LPAE_PTE_HAP_WRITE		(((arm_lpae_iopte)2) << 6)
> > > +/*
> > > + * For !FWB these code to:
> > > + *  1111 = Normal outer write back cachable / Inner Write Back Cachable
> > > + *         Permit S1 to override
> > > + *  0101 = Normal Non-cachable / Inner Non-cachable
> > > + *  0001 = Device / Device-nGnRE
> > > + * For S2FWB these code:
> > > + *  0110 Force Normal Write Back
> > > + *  0101 Normal* is forced Normal-NC, Device unchanged
> > > + *  0001 Force Device-nGnRE
> > > + */
> > > +#define ARM_LPAE_PTE_MEMATTR_FWB_WB	(((arm_lpae_iopte)0x6) << 2)
> > 
> > The other part looks good. Yet, would you mind sharing the location
> > that defines this 0x6 explicitly?
> 
> I'm looking at an older one ARM DDI 0487F.c
> 
> D5.5.5 Stage 2 memory region type and Cacheability attributes when FEAT_S2FWB is implemented
> 
> The text talks about the bits in the PTE, not relative to the MEMATTR
> field, so 6 << 2 encodes to:
> 
>  543210
>  011000
> 
> Then see table D5-40 Effect of bit[4] == 1 on Cacheability and Memory Type)
> 
>  Bit[5] = 0 = is RES0.
>  Bit[4] = 1 = determines the interpretation of bits [3:2].
>  Bits[3:2] == 10 == Normal Write-Back 
> 
> Here Bit means 'bit of the PTE' because the MemAttr does not have 5
> bits.
> 
> > Where it has the followings in D8.6.6:
> >  "For stage 2 translations, if FEAT_MTE_PERM is not implemented, then
> >   FEAT_S2FWB has all of the following effects on the MemAttr[3:2] bits:
> >    - MemAttr[3] is RES0.
> >    - The value of MemAttr[2] determines the interpretation of the
> >      MemAttr[1:0] bits.
> 
> And here the text switches from talking about the PTE bits to the
> Field bits. MemAttr[3] == PTE[5], and the above text matches D5.5
> 
> The use of numbering schemes relative to the start of the field and
> also relative to the PTE is tricky.

I download the version F.c, and the chapter looks cleaner than
the one in K.a. I guess the FEAT_MTE_PERM complicates that...

I double checked the K.a doc, and found a piece of pseudocode
that seems to confirm 0b0110 (0x6) is the correct value:

MemoryAttributes AArch64.S2ApplyFWBMemAttrs(MemoryAttributes s1_memattrs,
					    S2TTWParams walkparams,
					    bits(N) descriptor)
	MemoryAttributes memattrs;
	s2_attr = descriptor<5:2>;
	s2_sh = if walkparams.ds == '1' then walkparams.sh else descriptor<9:8>;
	s2_fnxs = descriptor<11>;
	if s2_attr<2> == '0' then // S2 Device, S1 any
		s2_device = DecodeDevice(s2_attr<1:0>);
		memattrs.memtype = MemType_Device;
		if s1_memattrs.memtype == MemType_Device then
			memattrs.device = S2CombineS1Device(s1_memattrs.device, s2_device);
		else
			memattrs.device = s2_device;
		memattrs.xs = s1_memattrs.xs;
	elsif s2_attr<1:0> == '11' then // S2 attr = S1 attr
		memattrs = s1_memattrs;
	elsif s2_attr<1:0> == '10' then // Force writeback

Thanks
Nicolin
Nicolin Chen Aug. 28, 2024, 7:50 p.m. UTC | #3
On Tue, Aug 27, 2024 at 12:51:32PM -0300, Jason Gunthorpe wrote:
> Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field
> works. When S2FWB is supported and enabled the IOPTE will force cachable
> access to IOMMU_CACHE memory when nesting with a S1 and deny cachable
> access otherwise.
> 
> When using a single stage of translation, a simple S2 domain, it doesn't
> change anything as it is just a different encoding for the exsting mapping
> of the IOMMU protection flags to cachability attributes.
> 
> However, when used with a nested S1, FWB has the effect of preventing the
> guest from choosing a MemAttr in it's S1 that would cause ordinary DMA to
> bypass the cache. Consistent with KVM we wish to deny the guest the
> ability to become incoherent with cached memory the hypervisor believes is
> cachable so we don't have to flush it.
> 
> Turn on S2FWB whenever the SMMU supports it and use it for all S2
> mappings.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tian, Kevin Aug. 30, 2024, 7:44 a.m. UTC | #4
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, August 27, 2024 11:52 PM
> 
> @@ -4189,6 +4193,13 @@ static int arm_smmu_device_hw_probe(struct
> arm_smmu_device *smmu)
> 
>  	/* IDR3 */
>  	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
> +	/*
> +	 * If for some reason the HW does not support DMA coherency then
> using
> +	 * S2FWB won't work. This will also disable nesting support.
> +	 */
> +	if (FIELD_GET(IDR3_FWB, reg) &&
> +	    (smmu->features & ARM_SMMU_FEAT_COHERENCY))
> +		smmu->features |= ARM_SMMU_FEAT_S2FWB;
>  	if (FIELD_GET(IDR3_RIL, reg))
>  		smmu->features |= ARM_SMMU_FEAT_RANGE_INV;

then also clear ARM_SMMU_FEAT_NESTING?
Nicolin Chen Aug. 30, 2024, 7:56 a.m. UTC | #5
On Fri, Aug 30, 2024 at 07:44:35AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, August 27, 2024 11:52 PM
> >
> > @@ -4189,6 +4193,13 @@ static int arm_smmu_device_hw_probe(struct
> > arm_smmu_device *smmu)
> >
> >       /* IDR3 */
> >       reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
> > +     /*
> > +      * If for some reason the HW does not support DMA coherency then
> > using
> > +      * S2FWB won't work. This will also disable nesting support.
> > +      */
> > +     if (FIELD_GET(IDR3_FWB, reg) &&
> > +         (smmu->features & ARM_SMMU_FEAT_COHERENCY))
> > +             smmu->features |= ARM_SMMU_FEAT_S2FWB;
> >       if (FIELD_GET(IDR3_RIL, reg))
> >               smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
> 
> then also clear ARM_SMMU_FEAT_NESTING?

S2FWB isn't the only HW option for nesting. Pls refer to PATCH-8:
https://lore.kernel.org/linux-iommu/8-v2-621370057090+91fec-smmuv3_nesting_jgg@nvidia.com/

+static struct iommu_domain *
+arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
[...]
+	/*
+	 * Must support some way to prevent the VM from bypassing the cache
+	 * because VFIO currently does not do any cache maintenance.
+	 */
+	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
+	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
+		return ERR_PTR(-EOPNOTSUPP);

Thanks
Nicolin
Tian, Kevin Aug. 30, 2024, 8:01 a.m. UTC | #6
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, August 30, 2024 3:56 PM
> 
> On Fri, Aug 30, 2024 at 07:44:35AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Tuesday, August 27, 2024 11:52 PM
> > >
> > > @@ -4189,6 +4193,13 @@ static int arm_smmu_device_hw_probe(struct
> > > arm_smmu_device *smmu)
> > >
> > >       /* IDR3 */
> > >       reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
> > > +     /*
> > > +      * If for some reason the HW does not support DMA coherency then
> > > using
> > > +      * S2FWB won't work. This will also disable nesting support.
> > > +      */
> > > +     if (FIELD_GET(IDR3_FWB, reg) &&
> > > +         (smmu->features & ARM_SMMU_FEAT_COHERENCY))
> > > +             smmu->features |= ARM_SMMU_FEAT_S2FWB;
> > >       if (FIELD_GET(IDR3_RIL, reg))
> > >               smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
> >
> > then also clear ARM_SMMU_FEAT_NESTING?
> 
> S2FWB isn't the only HW option for nesting. Pls refer to PATCH-8:
> https://lore.kernel.org/linux-iommu/8-v2-621370057090+91fec-
> smmuv3_nesting_jgg@nvidia.com/
> 
> +static struct iommu_domain *
> +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
> [...]
> +	/*
> +	 * Must support some way to prevent the VM from bypassing the
> cache
> +	 * because VFIO currently does not do any cache maintenance.
> +	 */
> +	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
> +	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
> +		return ERR_PTR(-EOPNOTSUPP);
> 

Yes, but if we guard the setting of the nesting bit upon those
conditions then it's simpler code in other paths by only looking
at one bit.
Mostafa Saleh Aug. 30, 2024, 3:12 p.m. UTC | #7
Hi Jason,

Sorry, I haven’t followed up on that, I was out for a while.

On Tue, Aug 27, 2024 at 12:51:32PM -0300, Jason Gunthorpe wrote:
> Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field
> works. When S2FWB is supported and enabled the IOPTE will force cachable
> access to IOMMU_CACHE memory when nesting with a S1 and deny cachable
> access otherwise.
> 
> When using a single stage of translation, a simple S2 domain, it doesn't
> change anything as it is just a different encoding for the exsting mapping
> of the IOMMU protection flags to cachability attributes.
> 
> However, when used with a nested S1, FWB has the effect of preventing the
> guest from choosing a MemAttr in it's S1 that would cause ordinary DMA to
> bypass the cache. Consistent with KVM we wish to deny the guest the
> ability to become incoherent with cached memory the hypervisor believes is
> cachable so we don't have to flush it.
> 
> Turn on S2FWB whenever the SMMU supports it and use it for all S2
> mappings.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 +++++++++
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  3 +++
>  drivers/iommu/io-pgtable-arm.c              | 27 +++++++++++++++++----
>  include/linux/io-pgtable.h                  |  2 ++
>  4 files changed, 38 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 531125f231b662..e2b97ad6d74b03 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1612,6 +1612,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
>  		FIELD_PREP(STRTAB_STE_1_EATS,
>  			   ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
>  
> +	if (smmu->features & ARM_SMMU_FEAT_S2FWB)
> +		target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB);
>  	if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR)
>  		target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
>  							  STRTAB_STE_1_SHCFG_INCOMING));
> @@ -2400,6 +2402,8 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
>  		pgtbl_cfg.oas = smmu->oas;
>  		fmt = ARM_64_LPAE_S2;
>  		finalise_stage_fn = arm_smmu_domain_finalise_s2;
> +		if (smmu->features & ARM_SMMU_FEAT_S2FWB)
> +			pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB;
>  		break;
>  	default:
>  		return -EINVAL;
> @@ -4189,6 +4193,13 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
>  
>  	/* IDR3 */
>  	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
> +	/*
> +	 * If for some reason the HW does not support DMA coherency then using
> +	 * S2FWB won't work. This will also disable nesting support.
> +	 */
> +	if (FIELD_GET(IDR3_FWB, reg) &&
> +	    (smmu->features & ARM_SMMU_FEAT_COHERENCY))
> +		smmu->features |= ARM_SMMU_FEAT_S2FWB;
I think that’s for the SMMU coherency which in theory is not related to the
master which FWB overrides, so this check is not correct.

What I meant in the previous thread that we should set FWB only for coherent
masters as (in attach s2):
	if (smmu->features & ARM_SMMU_FEAT_S2FWB && dev_is_dma_coherent(master->dev)
		// set S2FWB in STE

Thanks,
Mostafa
>  	if (FIELD_GET(IDR3_RIL, reg))
>  		smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
>  
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 8851a7abb5f0f3..7e8d2f36faebf3 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -55,6 +55,7 @@
>  #define IDR1_SIDSIZE			GENMASK(5, 0)
>  
>  #define ARM_SMMU_IDR3			0xc
> +#define IDR3_FWB			(1 << 8)
>  #define IDR3_RIL			(1 << 10)
>  
>  #define ARM_SMMU_IDR5			0x14
> @@ -258,6 +259,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
>  #define STRTAB_STE_1_S1CSH		GENMASK_ULL(7, 6)
>  
>  #define STRTAB_STE_1_S1STALLD		(1UL << 27)
> +#define STRTAB_STE_1_S2FWB		(1UL << 25)
>  
>  #define STRTAB_STE_1_EATS		GENMASK_ULL(29, 28)
>  #define STRTAB_STE_1_EATS_ABT		0UL
> @@ -700,6 +702,7 @@ struct arm_smmu_device {
>  #define ARM_SMMU_FEAT_ATTR_TYPES_OVR	(1 << 20)
>  #define ARM_SMMU_FEAT_HA		(1 << 21)
>  #define ARM_SMMU_FEAT_HD		(1 << 22)
> +#define ARM_SMMU_FEAT_S2FWB		(1 << 23)
>  	u32				features;
>  
>  #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index f5d9fd1f45bf49..9b3658aae21005 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -106,6 +106,18 @@
>  #define ARM_LPAE_PTE_HAP_FAULT		(((arm_lpae_iopte)0) << 6)
>  #define ARM_LPAE_PTE_HAP_READ		(((arm_lpae_iopte)1) << 6)
>  #define ARM_LPAE_PTE_HAP_WRITE		(((arm_lpae_iopte)2) << 6)
> +/*
> + * For !FWB these code to:
> + *  1111 = Normal outer write back cachable / Inner Write Back Cachable
> + *         Permit S1 to override
> + *  0101 = Normal Non-cachable / Inner Non-cachable
> + *  0001 = Device / Device-nGnRE
> + * For S2FWB these code:
> + *  0110 Force Normal Write Back
> + *  0101 Normal* is forced Normal-NC, Device unchanged
> + *  0001 Force Device-nGnRE
> + */
> +#define ARM_LPAE_PTE_MEMATTR_FWB_WB	(((arm_lpae_iopte)0x6) << 2)
>  #define ARM_LPAE_PTE_MEMATTR_OIWB	(((arm_lpae_iopte)0xf) << 2)
>  #define ARM_LPAE_PTE_MEMATTR_NC		(((arm_lpae_iopte)0x5) << 2)
>  #define ARM_LPAE_PTE_MEMATTR_DEV	(((arm_lpae_iopte)0x1) << 2)
> @@ -458,12 +470,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
>  	 */
>  	if (data->iop.fmt == ARM_64_LPAE_S2 ||
>  	    data->iop.fmt == ARM_32_LPAE_S2) {
> -		if (prot & IOMMU_MMIO)
> +		if (prot & IOMMU_MMIO) {
>  			pte |= ARM_LPAE_PTE_MEMATTR_DEV;
> -		else if (prot & IOMMU_CACHE)
> -			pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
> -		else
> +		} else if (prot & IOMMU_CACHE) {
> +			if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
> +				pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB;
> +			else
> +				pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
> +		} else {
>  			pte |= ARM_LPAE_PTE_MEMATTR_NC;
> +		}
>  	} else {
>  		if (prot & IOMMU_MMIO)
>  			pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
> @@ -932,7 +948,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
>  	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
>  			    IO_PGTABLE_QUIRK_ARM_TTBR1 |
>  			    IO_PGTABLE_QUIRK_ARM_OUTER_WBWA |
> -			    IO_PGTABLE_QUIRK_ARM_HD))
> +			    IO_PGTABLE_QUIRK_ARM_HD |
> +			    IO_PGTABLE_QUIRK_ARM_S2FWB))
>  		return NULL;
>  
>  	data = arm_lpae_alloc_pgtable(cfg);
> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
> index f9a81761bfceda..aff9b020b6dcc7 100644
> --- a/include/linux/io-pgtable.h
> +++ b/include/linux/io-pgtable.h
> @@ -87,6 +87,7 @@ struct io_pgtable_cfg {
>  	 *	attributes set in the TCR for a non-coherent page-table walker.
>  	 *
>  	 * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable.
> +	 * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits
>  	 */
>  	#define IO_PGTABLE_QUIRK_ARM_NS			BIT(0)
>  	#define IO_PGTABLE_QUIRK_NO_PERMS		BIT(1)
> @@ -95,6 +96,7 @@ struct io_pgtable_cfg {
>  	#define IO_PGTABLE_QUIRK_ARM_TTBR1		BIT(5)
>  	#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA		BIT(6)
>  	#define IO_PGTABLE_QUIRK_ARM_HD			BIT(7)
> +	#define IO_PGTABLE_QUIRK_ARM_S2FWB		BIT(8)
>  	unsigned long			quirks;
>  	unsigned long			pgsize_bitmap;
>  	unsigned int			ias;
> -- 
> 2.46.0
>
Jason Gunthorpe Aug. 30, 2024, 4:40 p.m. UTC | #8
On Fri, Aug 30, 2024 at 03:12:54PM +0000, Mostafa Saleh wrote:
> > +	/*
> > +	 * If for some reason the HW does not support DMA coherency then using
> > +	 * S2FWB won't work. This will also disable nesting support.
> > +	 */
> > +	if (FIELD_GET(IDR3_FWB, reg) &&
> > +	    (smmu->features & ARM_SMMU_FEAT_COHERENCY))
> > +		smmu->features |= ARM_SMMU_FEAT_S2FWB;
> I think that’s for the SMMU coherency which in theory is not related to the
> master which FWB overrides, so this check is not correct.

Yes, I agree, in theory.

However the driver today already links them together:

	case IOMMU_CAP_CACHE_COHERENCY:
		/* Assume that a coherent TCU implies coherent TBUs */
		return master->smmu->features & ARM_SMMU_FEAT_COHERENCY;

So this hunk was a continuation of that design.

> What I meant in the previous thread that we should set FWB only for coherent
> masters as (in attach s2):
> 	if (smmu->features & ARM_SMMU_FEAT_S2FWB && dev_is_dma_coherent(master->dev)
> 		// set S2FWB in STE

I think as I explained in that thread, it is not really correct
either. There is no reason to block using S2FWB for non-coherent
masters that are not used with VFIO. The page table will still place
the correct memattr according to the IOMMU_CACHE flag, S2FWB just
slightly changes the encoding.

For VFIO, non-coherent masters need to be blocked from VFIO entirely
and should never get even be allowed to get here.

If anything should be changed then it would be the above
IOMMU_CAP_CACHE_COHERENCY test, and I don't know if
dev_is_dma_coherent() would be correct there, or if it should do some
ACPI inspection or what.

So let's drop the above hunk, it already happens implicitly because
VFIO checks it via IOMMU_CAP_CACHE_COHERENCY and it makes more sense
to put the assumption in one place.

Thanks,
Jason
Jason Gunthorpe Sept. 3, 2024, 12:05 a.m. UTC | #9
On Mon, Sep 02, 2024 at 09:29:53AM +0000, Mostafa Saleh wrote:
> On Fri, Aug 30, 2024 at 01:40:19PM -0300, Jason Gunthorpe wrote:
> > On Fri, Aug 30, 2024 at 03:12:54PM +0000, Mostafa Saleh wrote:
> > > > +	/*
> > > > +	 * If for some reason the HW does not support DMA coherency then using
> > > > +	 * S2FWB won't work. This will also disable nesting support.
> > > > +	 */
> > > > +	if (FIELD_GET(IDR3_FWB, reg) &&
> > > > +	    (smmu->features & ARM_SMMU_FEAT_COHERENCY))
> > > > +		smmu->features |= ARM_SMMU_FEAT_S2FWB;
> > > I think that’s for the SMMU coherency which in theory is not related to the
> > > master which FWB overrides, so this check is not correct.
> > 
> > Yes, I agree, in theory.
> > 
> > However the driver today already links them together:
> > 
> > 	case IOMMU_CAP_CACHE_COHERENCY:
> > 		/* Assume that a coherent TCU implies coherent TBUs */
> > 		return master->smmu->features & ARM_SMMU_FEAT_COHERENCY;
> > 
> > So this hunk was a continuation of that design.
> > 
> > > What I meant in the previous thread that we should set FWB only for coherent
> > > masters as (in attach s2):
> > > 	if (smmu->features & ARM_SMMU_FEAT_S2FWB && dev_is_dma_coherent(master->dev)
> > > 		// set S2FWB in STE
> > 
> > I think as I explained in that thread, it is not really correct
> > either. There is no reason to block using S2FWB for non-coherent
> > masters that are not used with VFIO. The page table will still place
> > the correct memattr according to the IOMMU_CACHE flag, S2FWB just
> > slightly changes the encoding.
> 
> It’s not just the encoding that changes, as
> - Without FWB, stage-2 combine attributes
> - While with FWB, it overrides them.

You mean there is some incomming attribute in the transaction
(obviously not talking PCI here) and S2FWB combines with that?

> So a cacheable mapping in stage-2 can lead to a non-cacheable
> (or with different cachableitiy attributes) transaction based on the
> input. I am not sure though if there is such case in the kernel.

If the kernel supplies IOMMU_CACHE then the kernel also skips all the
cache flushing. So it would be a functional problem if combining was
causing a non-cachable access through a IOMMU_CACHE S2 already. The
DMA API would fail if that was the case.

> > If anything should be changed then it would be the above
> > IOMMU_CAP_CACHE_COHERENCY test, and I don't know if
> > dev_is_dma_coherent() would be correct there, or if it should do some
> > ACPI inspection or what.
> 
> I agree, I believe that this assumption is not accurate, I am not sure
> what is the right approach here, but in concept I think we shouldn’t
> enable FWB for non-coherent devices (using dev_is_dma_coherent() or
> other check)

The DMA API requires that the cachability rules it sets via
IOMMU_CACHE are followed. In this way the stricter behavior of S2FWB
is a benefit, not a draw back.

I'm still not seeing a problm here??

Jason
Mostafa Saleh Sept. 3, 2024, 7:57 a.m. UTC | #10
On Mon, Sep 02, 2024 at 09:05:46PM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 02, 2024 at 09:29:53AM +0000, Mostafa Saleh wrote:
> > On Fri, Aug 30, 2024 at 01:40:19PM -0300, Jason Gunthorpe wrote:
> > > On Fri, Aug 30, 2024 at 03:12:54PM +0000, Mostafa Saleh wrote:
> > > > > +	/*
> > > > > +	 * If for some reason the HW does not support DMA coherency then using
> > > > > +	 * S2FWB won't work. This will also disable nesting support.
> > > > > +	 */
> > > > > +	if (FIELD_GET(IDR3_FWB, reg) &&
> > > > > +	    (smmu->features & ARM_SMMU_FEAT_COHERENCY))
> > > > > +		smmu->features |= ARM_SMMU_FEAT_S2FWB;
> > > > I think that’s for the SMMU coherency which in theory is not related to the
> > > > master which FWB overrides, so this check is not correct.
> > > 
> > > Yes, I agree, in theory.
> > > 
> > > However the driver today already links them together:
> > > 
> > > 	case IOMMU_CAP_CACHE_COHERENCY:
> > > 		/* Assume that a coherent TCU implies coherent TBUs */
> > > 		return master->smmu->features & ARM_SMMU_FEAT_COHERENCY;
> > > 
> > > So this hunk was a continuation of that design.
> > > 
> > > > What I meant in the previous thread that we should set FWB only for coherent
> > > > masters as (in attach s2):
> > > > 	if (smmu->features & ARM_SMMU_FEAT_S2FWB && dev_is_dma_coherent(master->dev)
> > > > 		// set S2FWB in STE
> > > 
> > > I think as I explained in that thread, it is not really correct
> > > either. There is no reason to block using S2FWB for non-coherent
> > > masters that are not used with VFIO. The page table will still place
> > > the correct memattr according to the IOMMU_CACHE flag, S2FWB just
> > > slightly changes the encoding.
> > 
> > It’s not just the encoding that changes, as
> > - Without FWB, stage-2 combine attributes
> > - While with FWB, it overrides them.
> 
> You mean there is some incomming attribute in the transaction
> (obviously not talking PCI here) and S2FWB combines with that?

Yes, stuff as cacheability (as defined by Arm spec)
I am not sure about PCI, but according to the spec:
	“PCIe does not contain memory type attributes, and each transaction
	takes a system-defined memory type when it progresses into the system”

> 
> > So a cacheable mapping in stage-2 can lead to a non-cacheable
> > (or with different cachableitiy attributes) transaction based on the
> > input. I am not sure though if there is such case in the kernel.
> 
> If the kernel supplies IOMMU_CACHE then the kernel also skips all the
> cache flushing. So it would be a functional problem if combining was
> causing a non-cachable access through a IOMMU_CACHE S2 already. The
> DMA API would fail if that was the case.

Correct, but it’s not just about cacheable/non-cacheable, as I mentioned
it’s about other attributes also, this is a very niche case, and again I
am not sure if there are devices affected in the kernel, but I just
wanted to highlight it’s not just a different encoding for stage-2.

> 
> > > If anything should be changed then it would be the above
> > > IOMMU_CAP_CACHE_COHERENCY test, and I don't know if
> > > dev_is_dma_coherent() would be correct there, or if it should do some
> > > ACPI inspection or what.
> > 
> > I agree, I believe that this assumption is not accurate, I am not sure
> > what is the right approach here, but in concept I think we shouldn’t
> > enable FWB for non-coherent devices (using dev_is_dma_coherent() or
> > other check)
> 
> The DMA API requires that the cachability rules it sets via
> IOMMU_CACHE are followed. In this way the stricter behavior of S2FWB
> is a benefit, not a draw back.
> 
> I'm still not seeing a problm here??

Basically, I believe we shouldn’t set FWB blindly just because it’s supported,
I don’t see how it’s useful for stage-2 only domains.

And I believe making assumptions about VFIO (which actually is not correctly
enforced at the moment) is fragile, and we should only set FWB for coherent
devices in nested setup only where the VMM(or hypervisor) knows better than
the VM.

Thanks,
Mostafa

> 
> Jason
Jason Gunthorpe Sept. 3, 2024, 11:33 p.m. UTC | #11
On Tue, Sep 03, 2024 at 07:57:01AM +0000, Mostafa Saleh wrote:

> Basically, I believe we shouldn’t set FWB blindly just because it’s supported,
> I don’t see how it’s useful for stage-2 only domains.

And the only problem we can see is some niche scenario where incoming
memory attributes that are already requesting cachable combine to a
different kind of cachable?

> And I believe making assumptions about VFIO (which actually is not correctly
> enforced at the moment) is fragile.

VFIO requiring cachable is definately not fragile, and it also sets
the IOMMU_CACHE flag to indicate this. Revising VFIO to allow
non-cachable would be a signficant change and would also change what
IOMMU_CACHE flag it sets.

> and we should only set FWB for coherent
> devices in nested setup only where the VMM(or hypervisor) knows better than
> the VM.

I don't want to touch the 'only coherent devices' question. Last time
I tried to do that I got told every option was wrong.

I would be fine to only enable for nesting parent domains. It is
mandatory here and we definitely don't support non-cachable nesting
today.  Can we agree on that?

Keep in mind SMMU S2FWB is really new and probably very little HW
supports it right now. So we are not breaking anything existing
here. IMHO it is better to always enable the stricter features going
forward, and then evaluate an in-kernel opt-out if someone comes with
a concrete use case.

Jason
Shameerali Kolothum Thodi Sept. 4, 2024, 2:20 p.m. UTC | #12
> -----Original Message-----
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, August 27, 2024 4:52 PM
> To: acpica-devel@lists.linux.dev; Guohanjun (Hanjun Guo)
> <guohanjun@huawei.com>; iommu@lists.linux.dev; Joerg Roedel
> <joro@8bytes.org>; Kevin Tian <kevin.tian@intel.com>;
> kvm@vger.kernel.org; Len Brown <lenb@kernel.org>; linux-
> acpi@vger.kernel.org; linux-arm-kernel@lists.infradead.org; Lorenzo Pieralisi
> <lpieralisi@kernel.org>; Rafael J. Wysocki <rafael@kernel.org>; Robert
> Moore <robert.moore@intel.com>; Robin Murphy
> <robin.murphy@arm.com>; Sudeep Holla <sudeep.holla@arm.com>; Will
> Deacon <will@kernel.org>
> Cc: Alex Williamson <alex.williamson@redhat.com>; Eric Auger
> <eric.auger@redhat.com>; Jean-Philippe Brucker <jean-
> philippe@linaro.org>; Moritz Fischer <mdf@kernel.org>; Michael Shavit
> <mshavit@google.com>; Nicolin Chen <nicolinc@nvidia.com>;
> patches@lists.linux.dev; Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; Mostafa Saleh
> <smostafa@google.com>
> Subject: [PATCH v2 2/8] iommu/arm-smmu-v3: Use S2FWB when available
> 
> Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field
> works. When S2FWB is supported and enabled the IOPTE will force cachable
> access to IOMMU_CACHE memory when nesting with a S1 and deny cachable
> access otherwise.
> 
> When using a single stage of translation, a simple S2 domain, it doesn't
> change anything as it is just a different encoding for the exsting mapping
> of the IOMMU protection flags to cachability attributes.
> 
> However, when used with a nested S1, FWB has the effect of preventing the
> guest from choosing a MemAttr in it's S1 that would cause ordinary DMA to
> bypass the cache. Consistent with KVM we wish to deny the guest the
> ability to become incoherent with cached memory the hypervisor believes is
> cachable so we don't have to flush it.
> 
> Turn on S2FWB whenever the SMMU supports it and use it for all S2
> mappings.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---

(...)

> @@ -932,7 +948,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg
> *cfg, void *cookie)
>  	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
>  			    IO_PGTABLE_QUIRK_ARM_TTBR1 |
>  			    IO_PGTABLE_QUIRK_ARM_OUTER_WBWA |
> -			    IO_PGTABLE_QUIRK_ARM_HD))
> +			    IO_PGTABLE_QUIRK_ARM_HD |
> +			    IO_PGTABLE_QUIRK_ARM_S2FWB))
>  		return NULL;

This should be added to arm_64_lpae_alloc_pgtable_s2(), not here.

With the above fixed, I was able to assign a n/w VF dev to a Guest on a
test hardware that supports S2FWB.

However host kernel has this WARN message:
[ 1546.165105] WARNING: CPU: 5 PID: 7047 at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:1086 arm_smmu_entry_qword_diff+0x124/0x138
....
[ 1546.330312]  arm_smmu_entry_qword_diff+0x124/0x138
[ 1546.335090]  arm_smmu_write_entry+0x38/0x22c
[ 1546.339346]  arm_smmu_install_ste_for_dev+0x158/0x1ac
[ 1546.344383]  arm_smmu_attach_dev+0x138/0x240
[ 1546.348639]  __iommu_device_set_domain+0x7c/0x11c
[ 1546.353330]  __iommu_group_set_domain_internal+0x60/0x134
[ 1546.358714]  iommu_group_replace_domain+0x3c/0x68
[ 1546.363404]  iommufd_device_do_replace+0x334/0x398
[ 1546.368181]  iommufd_device_change_pt+0x26c/0x650
[ 1546.372871]  iommufd_device_replace+0x18/0x24
[ 1546.377214]  vfio_iommufd_physical_attach_ioas+0x28/0x68
[ 1546.382514]  vfio_df_ioctl_attach_pt+0x98/0x170


And when I tried to use the assigned n/w dev, it seems to do a reset
continuously.

root@localhost:/# ping 150.0.124.42
PING 150.0.124.42 (150.0.124.42): 56 data bytes
64 bytes from 150.0.124.42: seq=0 ttl=64 time=47.648 ms
[ 1395.958630] hns3 0000:c2:00.0 eth1: NETDEV WATCHDOG: CPU: 1: transmit queue 10 timed out 5260 ms
[ 1395.960187] hns3 0000:c2:00.0 eth1: DQL info last_cnt: 42, queued: 42, adj_limit: 0, completed: 0
[ 1395.961758] hns3 0000:c2:00.0 eth1: queue state: 0x6, delta msecs: 5260
[ 1395.962925] hns3 0000:c2:00.0 eth1: tx_timeout count: 1, queue id: 10, SW_NTU: 0x1, SW_NTC: 0x0, napi state: 16
[ 1395.964677] hns3 0000:c2:00.0 eth1: tx_pkts: 0, tx_bytes: 0, sw_err_cnt: 0, tx_pending: 0
[ 1395.966114] hns3 0000:c2:00.0 eth1: seg_pkt_cnt: 0, tx_more: 0, restart_queue: 0, tx_busy: 0
[ 1395.967598] hns3 0000:c2:00.0 eth1: tx_push: 1, tx_mem_doorbell: 0
[ 1395.968687] hns3 0000:c2:00.0 eth1: BD_NUM: 0x7f HW_HEAD: 0x0, HW_TAIL: 0x0, BD_ERR: 0x0, INT: 0x1
[ 1395.970291] hns3 0000:c2:00.0 eth1: RING_EN: 0x1, TC: 0x0, FBD_NUM: 0x0 FBD_OFT: 0x0, EBD_NUM: 0x400, EBD_OFT: 0x0
[ 1395.972134] hns3 0000:c2:00.0: received reset request from VF enet

All this works fine on a hardware without S2FWB though.

Also on this test hardware, it works fine with legacy VFIO assignment.

Not debugged further. Please let me know if you have any hunch.

Thanks,
Shameer
Jason Gunthorpe Sept. 4, 2024, 3 p.m. UTC | #13
On Wed, Sep 04, 2024 at 02:20:36PM +0000, Shameerali Kolothum Thodi wrote:

> This should be added to arm_64_lpae_alloc_pgtable_s2(), not here.

Woops! Yes:

-       /* The NS quirk doesn't apply at stage 2 */
-       if (cfg->quirks)
+       if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB))
                return NULL;

> With the above fixed, I was able to assign a n/w VF dev to a Guest on a
> test hardware that supports S2FWB.

Okay great
 
> However host kernel has this WARN message:
> [ 1546.165105] WARNING: CPU: 5 PID: 7047 at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:1086 arm_smmu_entry_qword_diff+0x124/0x138
> ....

Yes, my dumb mistake again, thanks for testing

@@ -1009,7 +1009,8 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
        /* S2 translates */
        if (cfg & BIT(1)) {
                used_bits[1] |=
-                       cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
+                       cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS |
+                                   STRTAB_STE_1_SHCFG);

> root@localhost:/# ping 150.0.124.42
> PING 150.0.124.42 (150.0.124.42): 56 data bytes
> 64 bytes from 150.0.124.42: seq=0 ttl=64 time=47.648 ms

So DMA is not totally broken if a packet flowed.

> [ 1395.958630] hns3 0000:c2:00.0 eth1: NETDEV WATCHDOG: CPU: 1: transmit queue 10 timed out 5260 ms

Timeout? Maybe interrupts are not working? Does /proc/interrupts
suggest that? That would point at the ITS mapping

Do you have all of Nicolin's extra patches in this kernel to make the
ITS work with nesting?
Mostafa Saleh Sept. 10, 2024, 10:55 a.m. UTC | #14
On Tue, Sep 03, 2024 at 08:33:40PM -0300, Jason Gunthorpe wrote:
> On Tue, Sep 03, 2024 at 07:57:01AM +0000, Mostafa Saleh wrote:
> 
> > Basically, I believe we shouldn’t set FWB blindly just because it’s supported,
> > I don’t see how it’s useful for stage-2 only domains.
> 
> And the only problem we can see is some niche scenario where incoming
> memory attributes that are already requesting cachable combine to a
> different kind of cachable?

No, it’s not about the niche scenario, as I mentioned I don’t think
we should enable FWB because it just exists. One can argue the opposite,
if S2FWB is no different why enable it?

AFAIU, FWB would be useful in cases where the hypervisor(or VMM) knows
better than the VM, for example some devices MMIO space are emulated so
they are normal memory and it’s more efficient to use memory attributes.

Taking into consideration all the hassle that can happen if non-coherent
devices use the wrong attribute, I’d suggest either set FWB only for
coherent devices (I know it’s not easy to define, but maybe be should?)
or we have a new CAP where the caller is aware of that. But I don’t think
the driver should decide that on behalf of the caller.

> 
> > And I believe making assumptions about VFIO (which actually is not correctly
> > enforced at the moment) is fragile.
> 
> VFIO requiring cachable is definately not fragile, and it also sets
> the IOMMU_CACHE flag to indicate this. Revising VFIO to allow
> non-cachable would be a signficant change and would also change what
> IOMMU_CACHE flag it sets.
> 

I meant the driver shouldn't assume the caller behaviour, if it's VFIO
or something new.

> > and we should only set FWB for coherent
> > devices in nested setup only where the VMM(or hypervisor) knows better than
> > the VM.
> 
> I don't want to touch the 'only coherent devices' question. Last time
> I tried to do that I got told every option was wrong.
> 
> I would be fine to only enable for nesting parent domains. It is
> mandatory here and we definitely don't support non-cachable nesting
> today.  Can we agree on that?
> 
Why is it mandatory?

I think a supporting point for this, is that KVM does the same for
the CPU, where it enables FWB for VMs if supported. I have this on
my list to study if that can be improved. But may be if we are out
of options that would be a start.

> Keep in mind SMMU S2FWB is really new and probably very little HW
> supports it right now. So we are not breaking anything existing
> here. IMHO it is better to always enable the stricter features going
> forward, and then evaluate an in-kernel opt-out if someone comes with
> a concrete use case.
> 

I agree, it’s unlikely that this breaks existing hardware, but I’d
be concerned if FWB is enabled unconditionally it breaks devices in
the future and we end up restricting it more.

Thanks,
Mostafa
> Jason
Shameerali Kolothum Thodi Sept. 10, 2024, 11:25 a.m. UTC | #15
> -----Original Message-----
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, September 4, 2024 4:00 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: acpica-devel@lists.linux.dev; Guohanjun (Hanjun Guo)
> <guohanjun@huawei.com>; iommu@lists.linux.dev; Joerg Roedel
> <joro@8bytes.org>; Kevin Tian <kevin.tian@intel.com>; kvm@vger.kernel.org;
> Len Brown <lenb@kernel.org>; linux-acpi@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; Lorenzo Pieralisi <lpieralisi@kernel.org>; Rafael J.
> Wysocki <rafael@kernel.org>; Robert Moore <robert.moore@intel.com>; Robin
> Murphy <robin.murphy@arm.com>; Sudeep Holla <sudeep.holla@arm.com>;
> Will Deacon <will@kernel.org>; Alex Williamson
> <alex.williamson@redhat.com>; Eric Auger <eric.auger@redhat.com>; Jean-
> Philippe Brucker <jean-philippe@linaro.org>; Moritz Fischer <mdf@kernel.org>;
> Michael Shavit <mshavit@google.com>; Nicolin Chen <nicolinc@nvidia.com>;
> patches@lists.linux.dev; Mostafa Saleh <smostafa@google.com>
> Subject: Re: [PATCH v2 2/8] iommu/arm-smmu-v3: Use S2FWB when available
> 
> On Wed, Sep 04, 2024 at 02:20:36PM +0000, Shameerali Kolothum Thodi wrote:
> 
> > This should be added to arm_64_lpae_alloc_pgtable_s2(), not here.
> 
> Woops! Yes:
> 
> -       /* The NS quirk doesn't apply at stage 2 */
> -       if (cfg->quirks)
> +       if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB))
>                 return NULL;
> 
> > With the above fixed, I was able to assign a n/w VF dev to a Guest on
> > a test hardware that supports S2FWB.
> 
> Okay great
> 
> > However host kernel has this WARN message:
> > [ 1546.165105] WARNING: CPU: 5 PID: 7047 at
> > drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:1086
> > arm_smmu_entry_qword_diff+0x124/0x138
> > ....
> 
> Yes, my dumb mistake again, thanks for testing
> 
> @@ -1009,7 +1009,8 @@ void arm_smmu_get_ste_used(const __le64 *ent,
> __le64 *used_bits)
>         /* S2 translates */
>         if (cfg & BIT(1)) {
>                 used_bits[1] |=
> -                       cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
> +                       cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS |
> +                                   STRTAB_STE_1_SHCFG);
> 
> > root@localhost:/# ping 150.0.124.42
> > PING 150.0.124.42 (150.0.124.42): 56 data bytes
> > 64 bytes from 150.0.124.42: seq=0 ttl=64 time=47.648 ms
> 
> So DMA is not totally broken if a packet flowed.
> 
> > [ 1395.958630] hns3 0000:c2:00.0 eth1: NETDEV WATCHDOG: CPU: 1:
> > transmit queue 10 timed out 5260 ms
> 
> Timeout? Maybe interrupts are not working? Does /proc/interrupts suggest
> that? That would point at the ITS mapping

Interrupt seems to be Ok in this case as I can see /proc/interrupts increasing.

> Do you have all of Nicolin's extra patches in this kernel to make the ITS work
> with nesting?

Yes. I am using his
 https://github.com/nicolinc/iommufd/commits/iommufd_viommu_p1-v2/

> From a page table POV, iommu_dma_get_msi_page() has:
> 
> 	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
> 
> So the ITS page should be:
> 
> 		if (prot & IOMMU_MMIO) {
> 			pte |= ARM_LPAE_PTE_MEMATTR_DEV;
> 
> Which which still looks right under S2FWB unless I've misread the manual?
> 
> > [ 1395.960187] hns3 0000:c2:00.0 eth1: DQL info last_cnt: 42, queued:
> > 42, adj_limit: 0, completed: 0 [ 1395.961758] hns3 0000:c2:00.0 eth1:
> > queue state: 0x6, delta msecs: 5260 [ 1395.962925] hns3 0000:c2:00.0
> > eth1: tx_timeout count: 1, queue id: 10, SW_NTU: 0x1, SW_NTC: 0x0,
> > napi state: 16 [ 1395.964677] hns3 0000:c2:00.0 eth1: tx_pkts: 0,
> > tx_bytes: 0, sw_err_cnt: 0, tx_pending: 0 [ 1395.966114] hns3
> > 0000:c2:00.0 eth1: seg_pkt_cnt: 0, tx_more: 0, restart_queue: 0,
> > tx_busy: 0 [ 1395.967598] hns3 0000:c2:00.0 eth1: tx_push: 1,
> > tx_mem_doorbell: 0 [ 1395.968687] hns3 0000:c2:00.0 eth1: BD_NUM: 0x7f
> > HW_HEAD: 0x0, HW_TAIL: 0x0, BD_ERR: 0x0, INT: 0x1 [ 1395.970291] hns3
> > 0000:c2:00.0 eth1: RING_EN: 0x1, TC: 0x0, FBD_NUM: 0x0 FBD_OFT: 0x0,
> > EBD_NUM: 0x400, EBD_OFT: 0x0 [ 1395.972134] hns3 0000:c2:00.0:
> > received reset request from VF enet
> >
> > All this works fine on a hardware without S2FWB though.
> >
> > Also on this test hardware, it works fine with legacy VFIO assignment.
> 
> So.. Legacy VFIO assignment will use the S1, no nesting and not enable S2FWB?

Yes S1
 
> Try to isolate if S2FWB is the exact cause by disabling it in the kernel on this
> system vs something else wrong?

It looks like not related to S2FWB. I tried  commenting out S2FWB and issue is still
there.  Probably something related to this test setup.

Thanks,
Shameer
Jason Gunthorpe Sept. 10, 2024, 8:22 p.m. UTC | #16
On Tue, Sep 10, 2024 at 10:55:51AM +0000, Mostafa Saleh wrote:
> On Tue, Sep 03, 2024 at 08:33:40PM -0300, Jason Gunthorpe wrote:
> > On Tue, Sep 03, 2024 at 07:57:01AM +0000, Mostafa Saleh wrote:
> > 
> > > Basically, I believe we shouldn’t set FWB blindly just because it’s supported,
> > > I don’t see how it’s useful for stage-2 only domains.
> > 
> > And the only problem we can see is some niche scenario where incoming
> > memory attributes that are already requesting cachable combine to a
> > different kind of cachable?
> 
> No, it’s not about the niche scenario, as I mentioned I don’t think
> we should enable FWB because it just exists. One can argue the opposite,
> if S2FWB is no different why enable it?

Well, I'd argue that it provides more certainty for the kernel that
the DMA API behavior is matched by HW behavior. But I don't feel strongly.

I adjusted the patch to only enable it for nesting parents.

> AFAIU, FWB would be useful in cases where the hypervisor(or VMM) knows
> better than the VM, for example some devices MMIO space are emulated so
> they are normal memory and it’s more efficient to use memory attributes.

Not quite, the purpose of FWB is to allow the hypervisor to avoid
costly cache flushing. It is specifically to protect the hypervisor
against a VM causing the caches to go incoherent.

Caches that are unexpectedly incoherent are a security problem for the
hypervisor.

> > > and we should only set FWB for coherent
> > > devices in nested setup only where the VMM(or hypervisor) knows better than
> > > the VM.
> > 
> > I don't want to touch the 'only coherent devices' question. Last time
> > I tried to do that I got told every option was wrong.
> > 
> > I would be fine to only enable for nesting parent domains. It is
> > mandatory here and we definitely don't support non-cachable nesting
> > today.  Can we agree on that?
> 
> Why is it mandatory?

Because iommufd/vfio doesn't have cache flushing.
 
> I think a supporting point for this, is that KVM does the same for
> the CPU, where it enables FWB for VMs if supported. I have this on
> my list to study if that can be improved. But may be if we are out
> of options that would be a start.

When KVM turns on S2FWB it stops doing cache flushing. As I understand
it S2FWB is significantly a performance optimization.

On the VFIO side we don't have cache flushing at all. So enforcing
cache consistency is mandatory for security.

For native VFIO we set IOMMU_CACHE and expect that the contract with
the IOMMU is that no cache flushing is required.

For nested we set S2FWB/CANWBS to prevent the VM from disabling VFIO's
IOMMU_CACHE and again the contract with the HW is that no cache
flushing is required.

Thus VFIO is security correct even though it doesn't cache flush.

None of this has anything to do with device coherence capability. It
is why I keep saying incoherent devices must be blocked from VFIO
because it cannot operate them securely/correctly.

Fixing that is a whole other topic, Yi has a series for it on x86 at
least..

Jason
Jason Gunthorpe Sept. 11, 2024, 10:52 p.m. UTC | #17
On Tue, Sep 10, 2024 at 11:25:55AM +0000, Shameerali Kolothum Thodi wrote:
> > Try to isolate if S2FWB is the exact cause by disabling it in the kernel on this
> > system vs something else wrong?
> 
> It looks like not related to S2FWB. I tried  commenting out S2FWB and issue is still
> there.  Probably something related to this test setup.

Okay, so not these patches. You managed to get some DMA through the
S2FWB so that is encouraging.

Thanks,
Jason
Mostafa Saleh Sept. 17, 2024, 9:48 a.m. UTC | #18
On Tue, Sep 10, 2024 at 05:22:51PM -0300, Jason Gunthorpe wrote:
> On Tue, Sep 10, 2024 at 10:55:51AM +0000, Mostafa Saleh wrote:
> > On Tue, Sep 03, 2024 at 08:33:40PM -0300, Jason Gunthorpe wrote:
> > > On Tue, Sep 03, 2024 at 07:57:01AM +0000, Mostafa Saleh wrote:
> > > 
> > > > Basically, I believe we shouldn’t set FWB blindly just because it’s supported,
> > > > I don’t see how it’s useful for stage-2 only domains.
> > > 
> > > And the only problem we can see is some niche scenario where incoming
> > > memory attributes that are already requesting cachable combine to a
> > > different kind of cachable?
> > 
> > No, it’s not about the niche scenario, as I mentioned I don’t think
> > we should enable FWB because it just exists. One can argue the opposite,
> > if S2FWB is no different why enable it?
> 
> Well, I'd argue that it provides more certainty for the kernel that
> the DMA API behavior is matched by HW behavior. But I don't feel strongly.
> 
> I adjusted the patch to only enable it for nesting parents.
> 
> > AFAIU, FWB would be useful in cases where the hypervisor(or VMM) knows
> > better than the VM, for example some devices MMIO space are emulated so
> > they are normal memory and it’s more efficient to use memory attributes.
> 
> Not quite, the purpose of FWB is to allow the hypervisor to avoid
> costly cache flushing. It is specifically to protect the hypervisor
> against a VM causing the caches to go incoherent.
> 
> Caches that are unexpectedly incoherent are a security problem for the
> hypervisor.

I see, thanks for explaining, I got confused about the device emulation case,
it’s also about corruption because of a mismatch of memory attributes,
something like:
https://bugzilla.redhat.com/show_bug.cgi?id=1679680

At the moment, I see KVM doesn’t really touch guest memory, but it does CMO for
guest map(in case memslot had already some data) and on unmap, which I
believe has significant performance improvement.

> 
> > > > and we should only set FWB for coherent
> > > > devices in nested setup only where the VMM(or hypervisor) knows better than
> > > > the VM.
> > > 
> > > I don't want to touch the 'only coherent devices' question. Last time
> > > I tried to do that I got told every option was wrong.
> > > 
> > > I would be fine to only enable for nesting parent domains. It is
> > > mandatory here and we definitely don't support non-cachable nesting
> > > today.  Can we agree on that?
> > 
> > Why is it mandatory?
> 
> Because iommufd/vfio doesn't have cache flushing.
>  

I see.

> > I think a supporting point for this, is that KVM does the same for
> > the CPU, where it enables FWB for VMs if supported. I have this on
> > my list to study if that can be improved. But may be if we are out
> > of options that would be a start.
> 
> When KVM turns on S2FWB it stops doing cache flushing. As I understand
> it S2FWB is significantly a performance optimization.
> 
> On the VFIO side we don't have cache flushing at all. So enforcing
> cache consistency is mandatory for security.
> 
> For native VFIO we set IOMMU_CACHE and expect that the contract with
> the IOMMU is that no cache flushing is required.
> 
> For nested we set S2FWB/CANWBS to prevent the VM from disabling VFIO's
> IOMMU_CACHE and again the contract with the HW is that no cache
> flushing is required.
> 
> Thus VFIO is security correct even though it doesn't cache flush.
> 
> None of this has anything to do with device coherence capability. It
> is why I keep saying incoherent devices must be blocked from VFIO
> because it cannot operate them securely/correctly.
> 
> Fixing that is a whole other topic, Yi has a series for it on x86 at
> least..

I see, that makes sense to only support it for nested domains on
the assumption they are only used for VFIO/IOMMUFD till we figure out
non-coherent devices, I guess you are referring to:
https://lore.kernel.org/all/ZltQ3PyHKiQmN9SU@nvidia.com/t/#me702dd242782393eb7769000c96702a0fed7f6ca

Thanks,
Mostafa
> 
> Jason
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 531125f231b662..e2b97ad6d74b03 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1612,6 +1612,8 @@  void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
 		FIELD_PREP(STRTAB_STE_1_EATS,
 			   ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
 
+	if (smmu->features & ARM_SMMU_FEAT_S2FWB)
+		target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB);
 	if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR)
 		target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
 							  STRTAB_STE_1_SHCFG_INCOMING));
@@ -2400,6 +2402,8 @@  static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
 		pgtbl_cfg.oas = smmu->oas;
 		fmt = ARM_64_LPAE_S2;
 		finalise_stage_fn = arm_smmu_domain_finalise_s2;
+		if (smmu->features & ARM_SMMU_FEAT_S2FWB)
+			pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB;
 		break;
 	default:
 		return -EINVAL;
@@ -4189,6 +4193,13 @@  static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 
 	/* IDR3 */
 	reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
+	/*
+	 * If for some reason the HW does not support DMA coherency then using
+	 * S2FWB won't work. This will also disable nesting support.
+	 */
+	if (FIELD_GET(IDR3_FWB, reg) &&
+	    (smmu->features & ARM_SMMU_FEAT_COHERENCY))
+		smmu->features |= ARM_SMMU_FEAT_S2FWB;
 	if (FIELD_GET(IDR3_RIL, reg))
 		smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 8851a7abb5f0f3..7e8d2f36faebf3 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -55,6 +55,7 @@ 
 #define IDR1_SIDSIZE			GENMASK(5, 0)
 
 #define ARM_SMMU_IDR3			0xc
+#define IDR3_FWB			(1 << 8)
 #define IDR3_RIL			(1 << 10)
 
 #define ARM_SMMU_IDR5			0x14
@@ -258,6 +259,7 @@  static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
 #define STRTAB_STE_1_S1CSH		GENMASK_ULL(7, 6)
 
 #define STRTAB_STE_1_S1STALLD		(1UL << 27)
+#define STRTAB_STE_1_S2FWB		(1UL << 25)
 
 #define STRTAB_STE_1_EATS		GENMASK_ULL(29, 28)
 #define STRTAB_STE_1_EATS_ABT		0UL
@@ -700,6 +702,7 @@  struct arm_smmu_device {
 #define ARM_SMMU_FEAT_ATTR_TYPES_OVR	(1 << 20)
 #define ARM_SMMU_FEAT_HA		(1 << 21)
 #define ARM_SMMU_FEAT_HD		(1 << 22)
+#define ARM_SMMU_FEAT_S2FWB		(1 << 23)
 	u32				features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index f5d9fd1f45bf49..9b3658aae21005 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -106,6 +106,18 @@ 
 #define ARM_LPAE_PTE_HAP_FAULT		(((arm_lpae_iopte)0) << 6)
 #define ARM_LPAE_PTE_HAP_READ		(((arm_lpae_iopte)1) << 6)
 #define ARM_LPAE_PTE_HAP_WRITE		(((arm_lpae_iopte)2) << 6)
+/*
+ * For !FWB these code to:
+ *  1111 = Normal outer write back cachable / Inner Write Back Cachable
+ *         Permit S1 to override
+ *  0101 = Normal Non-cachable / Inner Non-cachable
+ *  0001 = Device / Device-nGnRE
+ * For S2FWB these code:
+ *  0110 Force Normal Write Back
+ *  0101 Normal* is forced Normal-NC, Device unchanged
+ *  0001 Force Device-nGnRE
+ */
+#define ARM_LPAE_PTE_MEMATTR_FWB_WB	(((arm_lpae_iopte)0x6) << 2)
 #define ARM_LPAE_PTE_MEMATTR_OIWB	(((arm_lpae_iopte)0xf) << 2)
 #define ARM_LPAE_PTE_MEMATTR_NC		(((arm_lpae_iopte)0x5) << 2)
 #define ARM_LPAE_PTE_MEMATTR_DEV	(((arm_lpae_iopte)0x1) << 2)
@@ -458,12 +470,16 @@  static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
 	 */
 	if (data->iop.fmt == ARM_64_LPAE_S2 ||
 	    data->iop.fmt == ARM_32_LPAE_S2) {
-		if (prot & IOMMU_MMIO)
+		if (prot & IOMMU_MMIO) {
 			pte |= ARM_LPAE_PTE_MEMATTR_DEV;
-		else if (prot & IOMMU_CACHE)
-			pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
-		else
+		} else if (prot & IOMMU_CACHE) {
+			if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+				pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB;
+			else
+				pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
+		} else {
 			pte |= ARM_LPAE_PTE_MEMATTR_NC;
+		}
 	} else {
 		if (prot & IOMMU_MMIO)
 			pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
@@ -932,7 +948,8 @@  arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
 			    IO_PGTABLE_QUIRK_ARM_TTBR1 |
 			    IO_PGTABLE_QUIRK_ARM_OUTER_WBWA |
-			    IO_PGTABLE_QUIRK_ARM_HD))
+			    IO_PGTABLE_QUIRK_ARM_HD |
+			    IO_PGTABLE_QUIRK_ARM_S2FWB))
 		return NULL;
 
 	data = arm_lpae_alloc_pgtable(cfg);
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index f9a81761bfceda..aff9b020b6dcc7 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -87,6 +87,7 @@  struct io_pgtable_cfg {
 	 *	attributes set in the TCR for a non-coherent page-table walker.
 	 *
 	 * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable.
+	 * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits
 	 */
 	#define IO_PGTABLE_QUIRK_ARM_NS			BIT(0)
 	#define IO_PGTABLE_QUIRK_NO_PERMS		BIT(1)
@@ -95,6 +96,7 @@  struct io_pgtable_cfg {
 	#define IO_PGTABLE_QUIRK_ARM_TTBR1		BIT(5)
 	#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA		BIT(6)
 	#define IO_PGTABLE_QUIRK_ARM_HD			BIT(7)
+	#define IO_PGTABLE_QUIRK_ARM_S2FWB		BIT(8)
 	unsigned long			quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;