diff mbox series

[v2,8/8] iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED

Message ID 8-v2-621370057090+91fec-smmuv3_nesting_jgg@nvidia.com
State New
Headers show
Series Initial support for SMMUv3 nested translation | expand

Commit Message

Jason Gunthorpe Aug. 27, 2024, 3:51 p.m. UTC
For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting
as the parent and a user provided STE fragment that defines the CD table
and related data with addresses translated by the S2 iommu_domain.

The kernel only permits userspace to control certain allowed bits of the
STE that are safe for user/guest control.

IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
translation, but there is no way of knowing which S1 entries refer to a
range of S2.

For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
flush all ASIDs from the VMID after flushing the S2 on any change to the
S2.

Similarly we have to flush the entire ATC if the S2 is changed.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 217 +++++++++++++++++++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  20 ++
 include/uapi/linux/iommufd.h                |  20 ++
 3 files changed, 250 insertions(+), 7 deletions(-)

Comments

Nicolin Chen Aug. 27, 2024, 9:23 p.m. UTC | #1
On Tue, Aug 27, 2024 at 12:51:38PM -0300, Jason Gunthorpe wrote:
> For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting
> as the parent and a user provided STE fragment that defines the CD table
> and related data with addresses translated by the S2 iommu_domain.
> 
> The kernel only permits userspace to control certain allowed bits of the
> STE that are safe for user/guest control.
> 
> IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> translation, but there is no way of knowing which S1 entries refer to a
> range of S2.
> 
> For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
> flush all ASIDs from the VMID after flushing the S2 on any change to the
> S2.
> 
> Similarly we have to flush the entire ATC if the S2 is changed.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>

With some small nits:

> @@ -2192,6 +2255,16 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
>  	}
>  	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
>  
> +	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 &&
> +	    smmu_domain->nest_parent) {

smmu_domain->nest_parent alone is enough?
  
[---]
> +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
> +				      struct device *dev)
> +{
[..]
> +	if (arm_smmu_ssids_in_use(&master->cd_table) ||

This feels more like a -EBUSY as it would be unlikely able to
attach to a different nested domain?

> +	    nested_domain->s2_parent->smmu != master->smmu)
> +		return -EINVAL;
[---]

> +static struct iommu_domain *
> +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
> +			      struct iommu_domain *parent,
> +			      const struct iommu_user_data *user_data)
> +{
> +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct arm_smmu_nested_domain *nested_domain;
> +	struct arm_smmu_domain *smmu_parent;
> +	struct iommu_hwpt_arm_smmuv3 arg;
> +	unsigned int eats;
> +	unsigned int cfg;
> +	int ret;
> +
> +	if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	/*
> +	 * Must support some way to prevent the VM from bypassing the cache
> +	 * because VFIO currently does not do any cache maintenance.
> +	 */
> +	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
> +	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	ret = iommu_copy_struct_from_user(&arg, user_data,
> +					  IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	if (flags || !(master->smmu->features & ARM_SMMU_FEAT_TRANS_S1))
> +		return ERR_PTR(-EOPNOTSUPP);

A bit redundant to the first sanity against ARM_SMMU_FEAT_NESTING,
since ARM_SMMU_FEAT_NESTING includes ARM_SMMU_FEAT_TRANS_S1.

> +
> +	if (!(parent->type & __IOMMU_DOMAIN_PAGING))
> +		return ERR_PTR(-EINVAL);
> +
> +	smmu_parent = to_smmu_domain(parent);
> +	if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 ||

Maybe "!smmu_parent->nest_parent" instead.

[---]
> +	    smmu_parent->smmu != master->smmu)
> +		return ERR_PTR(-EINVAL);

It'd be slightly nicer if we do all the non-arg validations prior
to calling iommu_copy_struct_from_user(). Then, the following arg
validations would be closer to the copy().

> +
> +	/* EIO is reserved for invalid STE data. */
> +	if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
> +	    (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
> +		return ERR_PTR(-EIO);
[---]


>  /* The following are exposed for testing purposes. */
>  struct arm_smmu_entry_writer_ops;
>  struct arm_smmu_entry_writer {
> @@ -830,6 +849,7 @@ struct arm_smmu_master_domain {
>  	struct list_head devices_elm;
>  	struct arm_smmu_master *master;
>  	ioasid_t ssid;
> +	u8 nest_parent;

Would it be nicer to match with the one in struct arm_smmu_domain:
+	bool				nest_parent : 1;
?

> + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info
> + *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
> + *
> + * @ste: The first two double words of the user space Stream Table Entry for
> + *       a user stage-1 Context Descriptor Table. Must be little-endian.
> + *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
> + *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
> + *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD

It seems that word-1 is missing EATS.

Thanks
Nicolin
Jason Gunthorpe Aug. 28, 2024, 7:01 p.m. UTC | #2
On Tue, Aug 27, 2024 at 02:23:07PM -0700, Nicolin Chen wrote:
> On Tue, Aug 27, 2024 at 12:51:38PM -0300, Jason Gunthorpe wrote:
> > For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting
> > as the parent and a user provided STE fragment that defines the CD table
> > and related data with addresses translated by the S2 iommu_domain.
> > 
> > The kernel only permits userspace to control certain allowed bits of the
> > STE that are safe for user/guest control.
> > 
> > IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> > translation, but there is no way of knowing which S1 entries refer to a
> > range of S2.
> > 
> > For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
> > flush all ASIDs from the VMID after flushing the S2 on any change to the
> > S2.
> > 
> > Similarly we have to flush the entire ATC if the S2 is changed.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> 
> Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
> 
> With some small nits:
> 
> > @@ -2192,6 +2255,16 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
> >  	}
> >  	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
> >  
> > +	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 &&
> > +	    smmu_domain->nest_parent) {
> 
> smmu_domain->nest_parent alone is enough?

Yes, I thought I did that when Robin noted it.. 

> [---]
> > +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
> > +				      struct device *dev)
> > +{
> [..]
> > +	if (arm_smmu_ssids_in_use(&master->cd_table) ||
> 
> This feels more like a -EBUSY as it would be unlikely able to
> attach to a different nested domain?

Yeah, we did that in arm_smmu_attach_dev()

> > +static struct iommu_domain *
> > +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
> > +			      struct iommu_domain *parent,
> > +			      const struct iommu_user_data *user_data)
> > +{
> > +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> > +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > +	struct arm_smmu_nested_domain *nested_domain;
> > +	struct arm_smmu_domain *smmu_parent;
> > +	struct iommu_hwpt_arm_smmuv3 arg;
> > +	unsigned int eats;
> > +	unsigned int cfg;
> > +	int ret;
> > +
> > +	if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING))
> > +		return ERR_PTR(-EOPNOTSUPP);
> > +
> > +	/*
> > +	 * Must support some way to prevent the VM from bypassing the cache
> > +	 * because VFIO currently does not do any cache maintenance.
> > +	 */
> > +	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
> > +	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
> > +		return ERR_PTR(-EOPNOTSUPP);
> > +
> > +	ret = iommu_copy_struct_from_user(&arg, user_data,
> > +					  IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
> > +	if (ret)
> > +		return ERR_PTR(ret);
> > +
> > +	if (flags || !(master->smmu->features & ARM_SMMU_FEAT_TRANS_S1))
> > +		return ERR_PTR(-EOPNOTSUPP);
> 
> A bit redundant to the first sanity against ARM_SMMU_FEAT_NESTING,
> since ARM_SMMU_FEAT_NESTING includes ARM_SMMU_FEAT_TRANS_S1.

Yeah, I think this was ment to be up at the top

	if (flags || !(master->smmu->features & ARM_SMMU_FEAT_NESTING))
		return ERR_PTR(-EOPNOTSUPP);

> > +
> > +	if (!(parent->type & __IOMMU_DOMAIN_PAGING))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	smmu_parent = to_smmu_domain(parent);
> > +	if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 ||
> 
> Maybe "!smmu_parent->nest_parent" instead.

Hmm, yes.. Actually we can delete it, and the paging test above.

The core code checks it.

Though I think we missed owner validation there??

@@ -225,7 +225,8 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
        if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) ||
            !user_data->len || !ops->domain_alloc_user)
                return ERR_PTR(-EOPNOTSUPP);
-       if (parent->auto_domain || !parent->nest_parent)
+       if (parent->auto_domain || !parent->nest_parent ||
+           parent->common.domain->owner != ops)
                return ERR_PTR(-EINVAL);

Right??

> [---]
> > +	    smmu_parent->smmu != master->smmu)
> > +		return ERR_PTR(-EINVAL);
> 
> It'd be slightly nicer if we do all the non-arg validations prior
> to calling iommu_copy_struct_from_user(). Then, the following arg
> validations would be closer to the copy().

Sure

> >  struct arm_smmu_entry_writer {
> > @@ -830,6 +849,7 @@ struct arm_smmu_master_domain {
> >  	struct list_head devices_elm;
> >  	struct arm_smmu_master *master;
> >  	ioasid_t ssid;
> > +	u8 nest_parent;
> 
> Would it be nicer to match with the one in struct arm_smmu_domain:
> +	bool				nest_parent : 1;
> ?

Ah, lets just use bool

> > + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info
> > + *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
> > + *
> > + * @ste: The first two double words of the user space Stream Table Entry for
> > + *       a user stage-1 Context Descriptor Table. Must be little-endian.
> > + *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
> > + *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
> > + *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
> 
> It seems that word-1 is missing EATS.

Yes, this was missed

Jason
Nicolin Chen Aug. 28, 2024, 7:27 p.m. UTC | #3
On Wed, Aug 28, 2024 at 04:01:00PM -0300, Jason Gunthorpe wrote:

> > > +
> > > +	if (!(parent->type & __IOMMU_DOMAIN_PAGING))
> > > +		return ERR_PTR(-EINVAL);
> > > +
> > > +	smmu_parent = to_smmu_domain(parent);
> > > +	if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 ||
> > 
> > Maybe "!smmu_parent->nest_parent" instead.
> 
> Hmm, yes.. Actually we can delete it, and the paging test above.
> 
> The core code checks it.

Yea, we can rely on the core.

> Though I think we missed owner validation there??
> 
> @@ -225,7 +225,8 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
>         if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) ||
>             !user_data->len || !ops->domain_alloc_user)
>                 return ERR_PTR(-EOPNOTSUPP);
> -       if (parent->auto_domain || !parent->nest_parent)
> +       if (parent->auto_domain || !parent->nest_parent ||
> +           parent->common.domain->owner != ops)
>                 return ERR_PTR(-EINVAL);
> 
> Right??

Yea, this ensures the same driver.

> > [---]
> > > +	    smmu_parent->smmu != master->smmu)
> > > +		return ERR_PTR(-EINVAL);

Then, we only need this one.

Thanks
Nicolin
Tian, Kevin Aug. 30, 2024, 8:16 a.m. UTC | #4
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, August 27, 2024 11:52 PM
> 
> For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2
> iommu_domain acting
> as the parent and a user provided STE fragment that defines the CD table
> and related data with addresses translated by the S2 iommu_domain.
> 
> The kernel only permits userspace to control certain allowed bits of the
> STE that are safe for user/guest control.
> 
> IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> translation, but there is no way of knowing which S1 entries refer to a
> range of S2.
> 
> For the IOTLB we follow ARM's guidance and issue a
> CMDQ_OP_TLBI_NH_ALL to
> flush all ASIDs from the VMID after flushing the S2 on any change to the
> S2.
> 
> Similarly we have to flush the entire ATC if the S2 is changed.

it's clearer to mention that ATS is not supported at this point. 

> @@ -2614,7 +2687,8 @@ arm_smmu_find_master_domain(struct
> arm_smmu_domain *smmu_domain,
>  	list_for_each_entry(master_domain, &smmu_domain->devices,
>  			    devices_elm) {
>  		if (master_domain->master == master &&
> -		    master_domain->ssid == ssid)
> +		    master_domain->ssid == ssid &&
> +		    master_domain->nest_parent == nest_parent)
>  			return master_domain;
>  	}

there are two nest_parent flags in master_domain and smmu_domain.
Probably duplicating?

> +static struct iommu_domain *
> +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
> +			      struct iommu_domain *parent,
> +			      const struct iommu_user_data *user_data)
> +{
> +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct arm_smmu_nested_domain *nested_domain;
> +	struct arm_smmu_domain *smmu_parent;
> +	struct iommu_hwpt_arm_smmuv3 arg;
> +	unsigned int eats;
> +	unsigned int cfg;
> +	int ret;
> +
> +	if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	/*
> +	 * Must support some way to prevent the VM from bypassing the
> cache
> +	 * because VFIO currently does not do any cache maintenance.
> +	 */
> +	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
> +	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
> +		return ERR_PTR(-EOPNOTSUPP);

this can be saved if we guard the setting of NESTING upon them.

> +
> +	ret = iommu_copy_struct_from_user(&arg, user_data,
> +
> IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
> +	if (ret)
> +		return ERR_PTR(ret);

prefer to allocating resource after static condition checks below.

> +
> +	if (flags || !(master->smmu->features &
> ARM_SMMU_FEAT_TRANS_S1))
> +		return ERR_PTR(-EOPNOTSUPP);

Is it possible when NESTING is supported?

> +
> +	if (!(parent->type & __IOMMU_DOMAIN_PAGING))
> +		return ERR_PTR(-EINVAL);

Just check parent->nest_parent

> +
> +	smmu_parent = to_smmu_domain(parent);
> +	if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 ||
> +	    smmu_parent->smmu != master->smmu)
> +		return ERR_PTR(-EINVAL);

again S2 should be implied when parent->nest_parent is true.

> +
> +	/* EIO is reserved for invalid STE data. */
> +	if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
> +	    (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
> +		return ERR_PTR(-EIO);
> +
> +	cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg.ste[0]));
> +	if (cfg != STRTAB_STE_0_CFG_ABORT && cfg !=
> STRTAB_STE_0_CFG_BYPASS &&
> +	    cfg != STRTAB_STE_0_CFG_S1_TRANS)
> +		return ERR_PTR(-EIO);

If vSTE is invalid those bits can be ignored?

> +
> +	eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1]));
> +	if (eats != STRTAB_STE_1_EATS_ABT)
> +		return ERR_PTR(-EIO);
> +
> +	if (cfg != STRTAB_STE_0_CFG_S1_TRANS)
> +		eats = STRTAB_STE_1_EATS_ABT;

this check sounds redundant. If the last check passes then eats is
already set to _ABT. 

> 
> +/**
> + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor
> Table info
> + *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
> + *
> + * @ste: The first two double words of the user space Stream Table Entry
> for
> + *       a user stage-1 Context Descriptor Table. Must be little-endian.
> + *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW
> Spec)
> + *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
> + *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD

Not sure whether EATS should be documented here or not. It's handled 
but must be ZERO at this point.
Jason Gunthorpe Aug. 30, 2024, 2:13 p.m. UTC | #5
On Fri, Aug 30, 2024 at 08:16:27AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, August 27, 2024 11:52 PM
> > 
> > For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2
> > iommu_domain acting
> > as the parent and a user provided STE fragment that defines the CD table
> > and related data with addresses translated by the S2 iommu_domain.
> > 
> > The kernel only permits userspace to control certain allowed bits of the
> > STE that are safe for user/guest control.
> > 
> > IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> > translation, but there is no way of knowing which S1 entries refer to a
> > range of S2.
> > 
> > For the IOTLB we follow ARM's guidance and issue a
> > CMDQ_OP_TLBI_NH_ALL to
> > flush all ASIDs from the VMID after flushing the S2 on any change to the
> > S2.
> > 
> > Similarly we have to flush the entire ATC if the S2 is changed.
> 
> it's clearer to mention that ATS is not supported at this point. 

As things have ended up we need the viommu series to come along with
this to enable the full feature, and viommu supports ATS invalidation.

Ideally I'd like to merge them both together.

> > @@ -2614,7 +2687,8 @@ arm_smmu_find_master_domain(struct
> > arm_smmu_domain *smmu_domain,
> >  	list_for_each_entry(master_domain, &smmu_domain->devices,
> >  			    devices_elm) {
> >  		if (master_domain->master == master &&
> > -		    master_domain->ssid == ssid)
> > +		    master_domain->ssid == ssid &&
> > +		    master_domain->nest_parent == nest_parent)
> >  			return master_domain;
> >  	}
> 
> there are two nest_parent flags in master_domain and smmu_domain.
> Probably duplicating?

Sort of, sort of not..

This is a bit awkward right now because the arm_smmu_domain is still
per-instance, so the domain->nest_parent exists to control flushing of
the VMID

But we also have the per-attachment 'master_domain' struct, and there
the nest_parent controls flushing of the ATC.

In the end arm_smmu_domain will stop being per-instance and per-attach
'master_domain' would have the vmid and the nest_parent only. I'm
aiming for something like how VTD and RISCV are doing their flushing,
with a list of flush instructions attached to the domain.

So for now we have the in-between state where a S2 marked as parent
will avoid the ATC flush when directly attached to a RID but not the
ASID flush. Eventually we will be able to avoid both.

> > +static struct iommu_domain *
> > +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
> > +			      struct iommu_domain *parent,
> > +			      const struct iommu_user_data *user_data)
> > +{
> > +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> > +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > +	struct arm_smmu_nested_domain *nested_domain;
> > +	struct arm_smmu_domain *smmu_parent;
> > +	struct iommu_hwpt_arm_smmuv3 arg;
> > +	unsigned int eats;
> > +	unsigned int cfg;
> > +	int ret;
> > +
> > +	if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING))
> > +		return ERR_PTR(-EOPNOTSUPP);
> > +
> > +	/*
> > +	 * Must support some way to prevent the VM from bypassing the
> > cache
> > +	 * because VFIO currently does not do any cache maintenance.
> > +	 */
> > +	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
> > +	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
> > +		return ERR_PTR(-EOPNOTSUPP);
> 
> this can be saved if we guard the setting of NESTING upon them.

IOMMU_FWSPEC_PCI_RC_CANWBS is per-device, FEAT_NESTING is SMMU global,
they can't really be combined.

> > +
> > +	ret = iommu_copy_struct_from_user(&arg, user_data,
> > +
> > IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
> > +	if (ret)
> > +		return ERR_PTR(ret);
> 
> prefer to allocating resource after static condition checks below.
> 
> > +
> > +	if (flags || !(master->smmu->features &
> > ARM_SMMU_FEAT_TRANS_S1))
> > +		return ERR_PTR(-EOPNOTSUPP);
> 
> Is it possible when NESTING is supported?
> 
> > +
> > +	if (!(parent->type & __IOMMU_DOMAIN_PAGING))
> > +		return ERR_PTR(-EINVAL);
> 
> Just check parent->nest_parent
> 
> > +
> > +	smmu_parent = to_smmu_domain(parent);
> > +	if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 ||
> > +	    smmu_parent->smmu != master->smmu)
> > +		return ERR_PTR(-EINVAL);
> 
> again S2 should be implied when parent->nest_parent is true.

I think I did all of these for Nicolin

> > +
> > +	/* EIO is reserved for invalid STE data. */
> > +	if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
> > +	    (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
> > +		return ERR_PTR(-EIO);
> > +
> > +	cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg.ste[0]));
> > +	if (cfg != STRTAB_STE_0_CFG_ABORT && cfg !=
> > STRTAB_STE_0_CFG_BYPASS &&
> > +	    cfg != STRTAB_STE_0_CFG_S1_TRANS)
> > +		return ERR_PTR(-EIO);
> 
> If vSTE is invalid those bits can be ignored?

Yes, but also I was expecting the VMM to sanitize that.. Let's have
the kernel do it. Move the validation to a function and then:

static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg,
				  unsigned int *eats)
{
	unsigned int cfg;

	if (!(arg->ste[0] & STRTAB_STE_0_V)) {
		memset(arg->ste, 0, sizeof(arg->ste));
		return 0;
	}


> > +
> > +	eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1]));
> > +	if (eats != STRTAB_STE_1_EATS_ABT)
> > +		return ERR_PTR(-EIO);
> > +
> > +	if (cfg != STRTAB_STE_0_CFG_S1_TRANS)
> > +		eats = STRTAB_STE_1_EATS_ABT;
> 
> this check sounds redundant. If the last check passes then eats is
> already set to _ABT. 

Yes.. This hunk needs to go into this patch:

https://lore.kernel.org/linux-iommu/3962bef2ca6ab9bd06a52910f114345ecfe48ba6.1724776335.git.nicolinc@nvidia.com/T/#u

> > +/**
> > + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor
> > Table info
> > + *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
> > + *
> > + * @ste: The first two double words of the user space Stream Table Entry
> > for
> > + *       a user stage-1 Context Descriptor Table. Must be little-endian.
> > + *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW
> > Spec)
> > + *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
> > + *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
> 
> Not sure whether EATS should be documented here or not. It's handled 
> but must be ZERO at this point.

Let's put it in the above patch

Thanks,
Jason
Jason Gunthorpe Aug. 30, 2024, 2:39 p.m. UTC | #6
On Fri, Aug 30, 2024 at 08:16:27AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, August 27, 2024 11:52 PM
> > 
> > For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2
> > iommu_domain acting
> > as the parent and a user provided STE fragment that defines the CD table
> > and related data with addresses translated by the S2 iommu_domain.
> > 
> > The kernel only permits userspace to control certain allowed bits of the
> > STE that are safe for user/guest control.
> > 
> > IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> > translation, but there is no way of knowing which S1 entries refer to a
> > range of S2.
> > 
> > For the IOTLB we follow ARM's guidance and issue a
> > CMDQ_OP_TLBI_NH_ALL to
> > flush all ASIDs from the VMID after flushing the S2 on any change to the
> > S2.
> > 
> > Similarly we have to flush the entire ATC if the S2 is changed.
> 
> it's clearer to mention that ATS is not supported at this point.

I will also move all of this stuff to the ATS enablement patch

> > @@ -2614,7 +2687,8 @@ arm_smmu_find_master_domain(struct
> > arm_smmu_domain *smmu_domain,
> >  	list_for_each_entry(master_domain, &smmu_domain->devices,
> >  			    devices_elm) {
> >  		if (master_domain->master == master &&
> > -		    master_domain->ssid == ssid)
> > +		    master_domain->ssid == ssid &&
> > +		    master_domain->nest_parent == nest_parent)
> >  			return master_domain;
> >  	}
> 
> there are two nest_parent flags in master_domain and smmu_domain.
> Probably duplicating?

Including this

And I will rename master_domain->nest_parent to master_domain->nested_ats_flush
and it will derive from nest_domain->enable_ats.

Which I think will be much clearer..

Thanks,
Jason
Mostafa Saleh Aug. 30, 2024, 4:09 p.m. UTC | #7
Hi Jason,

On Tue, Aug 27, 2024 at 12:51:38PM -0300, Jason Gunthorpe wrote:
> For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting
> as the parent and a user provided STE fragment that defines the CD table
> and related data with addresses translated by the S2 iommu_domain.
> 
> The kernel only permits userspace to control certain allowed bits of the
> STE that are safe for user/guest control.
> 
> IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> translation, but there is no way of knowing which S1 entries refer to a
> range of S2.
> 
> For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
> flush all ASIDs from the VMID after flushing the S2 on any change to the
> S2.
> 
> Similarly we have to flush the entire ATC if the S2 is changed.
> 

I am still reviewing this patch, but just some quick questions.

1) How does userspace do IOTLB maintenance for S1 in that case?

2) Is there a reason the UAPI is designed this way?
The way I imagined this, is that userspace will pass the pointer to the CD
(+ format) not the STE (or part of it).
Making user space messing with shareability and cacheability of S1 CD access
feels odd. (Although CD configure page table access which is similar).

Thanks,
Mostafa

> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 217 +++++++++++++++++++-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  20 ++
>  include/uapi/linux/iommufd.h                |  20 ++
>  3 files changed, 250 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 8db3db6328f8b7..a21dce1f25cb95 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -295,6 +295,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>  	case CMDQ_OP_TLBI_NH_ASID:
>  		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
>  		fallthrough;
> +	case CMDQ_OP_TLBI_NH_ALL:
>  	case CMDQ_OP_TLBI_S12_VMALL:
>  		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
>  		break;
> @@ -1640,6 +1641,59 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
>  }
>  EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_s2_domain_ste);
>  
> +static void arm_smmu_make_nested_cd_table_ste(
> +	struct arm_smmu_ste *target, struct arm_smmu_master *master,
> +	struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
> +{
> +	arm_smmu_make_s2_domain_ste(target, master, nested_domain->s2_parent,
> +				    ats_enabled);
> +
> +	target->data[0] = cpu_to_le64(STRTAB_STE_0_V |
> +				      FIELD_PREP(STRTAB_STE_0_CFG,
> +						 STRTAB_STE_0_CFG_NESTED)) |
> +			  (nested_domain->ste[0] & ~STRTAB_STE_0_CFG);
> +	target->data[1] |= nested_domain->ste[1];
> +}
> +
> +/*
> + * Create a physical STE from the virtual STE that userspace provided when it
> + * created the nested domain. Using the vSTE userspace can request:
> + * - Non-valid STE
> + * - Abort STE
> + * - Bypass STE (install the S2, no CD table)
> + * - CD table STE (install the S2 and the userspace CD table)
> + */
> +static void arm_smmu_make_nested_domain_ste(
> +	struct arm_smmu_ste *target, struct arm_smmu_master *master,
> +	struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
> +{
> +	/*
> +	 * Userspace can request a non-valid STE through the nesting interface.
> +	 * We relay that into an abort physical STE with the intention that
> +	 * C_BAD_STE for this SID can be generated to userspace.
> +	 */
> +	if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
> +		arm_smmu_make_abort_ste(target);
> +		return;
> +	}
> +
> +	switch (FIELD_GET(STRTAB_STE_0_CFG,
> +			  le64_to_cpu(nested_domain->ste[0]))) {
> +	case STRTAB_STE_0_CFG_S1_TRANS:
> +		arm_smmu_make_nested_cd_table_ste(target, master, nested_domain,
> +						  ats_enabled);
> +		break;
> +	case STRTAB_STE_0_CFG_BYPASS:
> +		arm_smmu_make_s2_domain_ste(
> +			target, master, nested_domain->s2_parent, ats_enabled);
> +		break;
> +	case STRTAB_STE_0_CFG_ABORT:
> +	default:
> +		arm_smmu_make_abort_ste(target);
> +		break;
> +	}
> +}
> +
>  /*
>   * This can safely directly manipulate the STE memory without a sync sequence
>   * because the STE table has not been installed in the SMMU yet.
> @@ -2065,7 +2119,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>  		if (!master->ats_enabled)
>  			continue;
>  
> -		arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd);
> +		if (master_domain->nest_parent) {
> +			/*
> +			 * If a S2 used as a nesting parent is changed we have
> +			 * no option but to completely flush the ATC.
> +			 */
> +			arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
> +		} else {
> +			arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size,
> +						&cmd);
> +		}
>  
>  		for (i = 0; i < master->num_streams; i++) {
>  			cmd.atc.sid = master->streams[i].id;
> @@ -2192,6 +2255,16 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
>  	}
>  	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
>  
> +	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 &&
> +	    smmu_domain->nest_parent) {
> +		/*
> +		 * When the S2 domain changes all the nested S1 ASIDs have to be
> +		 * flushed too.
> +		 */
> +		cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
> +		arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
> +	}
> +
>  	/*
>  	 * Unfortunately, this can't be leaf-only since we may have
>  	 * zapped an entire table.
> @@ -2604,8 +2677,8 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
>  
>  static struct arm_smmu_master_domain *
>  arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
> -			    struct arm_smmu_master *master,
> -			    ioasid_t ssid)
> +			    struct arm_smmu_master *master, ioasid_t ssid,
> +			    bool nest_parent)
>  {
>  	struct arm_smmu_master_domain *master_domain;
>  
> @@ -2614,7 +2687,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
>  	list_for_each_entry(master_domain, &smmu_domain->devices,
>  			    devices_elm) {
>  		if (master_domain->master == master &&
> -		    master_domain->ssid == ssid)
> +		    master_domain->ssid == ssid &&
> +		    master_domain->nest_parent == nest_parent)
>  			return master_domain;
>  	}
>  	return NULL;
> @@ -2634,6 +2708,9 @@ to_smmu_domain_devices(struct iommu_domain *domain)
>  	if ((domain->type & __IOMMU_DOMAIN_PAGING) ||
>  	    domain->type == IOMMU_DOMAIN_SVA)
>  		return to_smmu_domain(domain);
> +	if (domain->type == IOMMU_DOMAIN_NESTED)
> +		return container_of(domain, struct arm_smmu_nested_domain,
> +				    domain)->s2_parent;
>  	return NULL;
>  }
>  
> @@ -2649,7 +2726,8 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
>  		return;
>  
>  	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> -	master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid);
> +	master_domain = arm_smmu_find_master_domain(
> +		smmu_domain, master, ssid, domain->type == IOMMU_DOMAIN_NESTED);
>  	if (master_domain) {
>  		list_del(&master_domain->devices_elm);
>  		kfree(master_domain);
> @@ -2664,6 +2742,7 @@ struct arm_smmu_attach_state {
>  	struct iommu_domain *old_domain;
>  	struct arm_smmu_master *master;
>  	bool cd_needs_ats;
> +	bool disable_ats;
>  	ioasid_t ssid;
>  	/* Resulting state */
>  	bool ats_enabled;
> @@ -2716,7 +2795,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
>  		 * enabled if we have arm_smmu_domain, those always have page
>  		 * tables.
>  		 */
> -		state->ats_enabled = arm_smmu_ats_supported(master);
> +		state->ats_enabled = !state->disable_ats &&
> +				     arm_smmu_ats_supported(master);
>  	}
>  
>  	if (smmu_domain) {
> @@ -2725,6 +2805,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
>  			return -ENOMEM;
>  		master_domain->master = master;
>  		master_domain->ssid = state->ssid;
> +		master_domain->nest_parent = new_domain->type ==
> +					       IOMMU_DOMAIN_NESTED;
>  
>  		/*
>  		 * During prepare we want the current smmu_domain and new
> @@ -3097,6 +3179,122 @@ static struct iommu_domain arm_smmu_blocked_domain = {
>  	.ops = &arm_smmu_blocked_ops,
>  };
>  
> +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
> +				      struct device *dev)
> +{
> +	struct arm_smmu_nested_domain *nested_domain =
> +		container_of(domain, struct arm_smmu_nested_domain, domain);
> +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +	struct arm_smmu_attach_state state = {
> +		.master = master,
> +		.old_domain = iommu_get_domain_for_dev(dev),
> +		.ssid = IOMMU_NO_PASID,
> +		/* Currently invalidation of ATC is not supported */
> +		.disable_ats = true,
> +	};
> +	struct arm_smmu_ste ste;
> +	int ret;
> +
> +	if (arm_smmu_ssids_in_use(&master->cd_table) ||
> +	    nested_domain->s2_parent->smmu != master->smmu)
> +		return -EINVAL;
> +
> +	mutex_lock(&arm_smmu_asid_lock);
> +	ret = arm_smmu_attach_prepare(&state, domain);
> +	if (ret) {
> +		mutex_unlock(&arm_smmu_asid_lock);
> +		return ret;
> +	}
> +
> +	arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
> +					state.ats_enabled);
> +	arm_smmu_install_ste_for_dev(master, &ste);
> +	arm_smmu_attach_commit(&state);
> +	mutex_unlock(&arm_smmu_asid_lock);
> +	return 0;
> +}
> +
> +static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
> +{
> +	kfree(container_of(domain, struct arm_smmu_nested_domain, domain));
> +}
> +
> +static const struct iommu_domain_ops arm_smmu_nested_ops = {
> +	.attach_dev = arm_smmu_attach_dev_nested,
> +	.free = arm_smmu_domain_nested_free,
> +};
> +
> +static struct iommu_domain *
> +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
> +			      struct iommu_domain *parent,
> +			      const struct iommu_user_data *user_data)
> +{
> +	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct arm_smmu_nested_domain *nested_domain;
> +	struct arm_smmu_domain *smmu_parent;
> +	struct iommu_hwpt_arm_smmuv3 arg;
> +	unsigned int eats;
> +	unsigned int cfg;
> +	int ret;
> +
> +	if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	/*
> +	 * Must support some way to prevent the VM from bypassing the cache
> +	 * because VFIO currently does not do any cache maintenance.
> +	 */
> +	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
> +	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	ret = iommu_copy_struct_from_user(&arg, user_data,
> +					  IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	if (flags || !(master->smmu->features & ARM_SMMU_FEAT_TRANS_S1))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	if (!(parent->type & __IOMMU_DOMAIN_PAGING))
> +		return ERR_PTR(-EINVAL);
> +
> +	smmu_parent = to_smmu_domain(parent);
> +	if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 ||
> +	    smmu_parent->smmu != master->smmu)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* EIO is reserved for invalid STE data. */
> +	if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
> +	    (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
> +		return ERR_PTR(-EIO);
> +
> +	cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg.ste[0]));
> +	if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
> +	    cfg != STRTAB_STE_0_CFG_S1_TRANS)
> +		return ERR_PTR(-EIO);
> +
> +	eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1]));
> +	if (eats != STRTAB_STE_1_EATS_ABT)
> +		return ERR_PTR(-EIO);
> +
> +	if (cfg != STRTAB_STE_0_CFG_S1_TRANS)
> +		eats = STRTAB_STE_1_EATS_ABT;
> +
> +	nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT);
> +	if (!nested_domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
> +	nested_domain->domain.ops = &arm_smmu_nested_ops;
> +	nested_domain->s2_parent = smmu_parent;
> +	nested_domain->ste[0] = arg.ste[0];
> +	nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
> +
> +	return &nested_domain->domain;
> +}
> +
>  static struct iommu_domain *
>  arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
>  			   struct iommu_domain *parent,
> @@ -3108,9 +3306,13 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
>  	struct arm_smmu_domain *smmu_domain;
>  	int ret;
>  
> +	if (parent)
> +		return arm_smmu_domain_alloc_nesting(dev, flags, parent,
> +						     user_data);
> +
>  	if (flags & ~PAGING_FLAGS)
>  		return ERR_PTR(-EOPNOTSUPP);
> -	if (parent || user_data)
> +	if (user_data)
>  		return ERR_PTR(-EOPNOTSUPP);
>  
>  	smmu_domain = arm_smmu_domain_alloc();
> @@ -3123,6 +3325,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
>  			goto err_free;
>  		}
>  		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
> +		smmu_domain->nest_parent = true;
>  	}
>  
>  	smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 4b05c81b181a82..b563cfedf22e91 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -240,6 +240,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
>  #define STRTAB_STE_0_CFG_BYPASS		4
>  #define STRTAB_STE_0_CFG_S1_TRANS	5
>  #define STRTAB_STE_0_CFG_S2_TRANS	6
> +#define STRTAB_STE_0_CFG_NESTED		7
>  
>  #define STRTAB_STE_0_S1FMT		GENMASK_ULL(5, 4)
>  #define STRTAB_STE_0_S1FMT_LINEAR	0
> @@ -291,6 +292,15 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
>  
>  #define STRTAB_STE_3_S2TTB_MASK		GENMASK_ULL(51, 4)
>  
> +/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */
> +#define STRTAB_STE_0_NESTING_ALLOWED                                         \
> +	cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
> +		    STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
> +#define STRTAB_STE_1_NESTING_ALLOWED                            \
> +	cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
> +		    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
> +		    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
> +
>  /*
>   * Context descriptors.
>   *
> @@ -508,6 +518,7 @@ struct arm_smmu_cmdq_ent {
>  			};
>  		} cfgi;
>  
> +		#define CMDQ_OP_TLBI_NH_ALL     0x10
>  		#define CMDQ_OP_TLBI_NH_ASID	0x11
>  		#define CMDQ_OP_TLBI_NH_VA	0x12
>  		#define CMDQ_OP_TLBI_EL2_ALL	0x20
> @@ -790,10 +801,18 @@ struct arm_smmu_domain {
>  	struct list_head		devices;
>  	spinlock_t			devices_lock;
>  	bool				enforce_cache_coherency : 1;
> +	bool				nest_parent : 1;
>  
>  	struct mmu_notifier		mmu_notifier;
>  };
>  
> +struct arm_smmu_nested_domain {
> +	struct iommu_domain domain;
> +	struct arm_smmu_domain *s2_parent;
> +
> +	__le64 ste[2];
> +};
> +
>  /* The following are exposed for testing purposes. */
>  struct arm_smmu_entry_writer_ops;
>  struct arm_smmu_entry_writer {
> @@ -830,6 +849,7 @@ struct arm_smmu_master_domain {
>  	struct list_head devices_elm;
>  	struct arm_smmu_master *master;
>  	ioasid_t ssid;
> +	u8 nest_parent;
>  };
>  
>  static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 83b6e1cd338d8f..76e9ad6c9403af 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -394,14 +394,34 @@ struct iommu_hwpt_vtd_s1 {
>  	__u32 __reserved;
>  };
>  
> +/**
> + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info
> + *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
> + *
> + * @ste: The first two double words of the user space Stream Table Entry for
> + *       a user stage-1 Context Descriptor Table. Must be little-endian.
> + *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
> + *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
> + *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
> + *
> + * -EIO will be returned if @ste is not legal or contains any non-allowed field.
> + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass
> + * nested domain will translate the same as the nesting parent.
> + */
> +struct iommu_hwpt_arm_smmuv3 {
> +	__aligned_le64 ste[2];
> +};
> +
>  /**
>   * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
>   * @IOMMU_HWPT_DATA_NONE: no data
>   * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
> + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
>   */
>  enum iommu_hwpt_data_type {
>  	IOMMU_HWPT_DATA_NONE = 0,
>  	IOMMU_HWPT_DATA_VTD_S1 = 1,
> +	IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
>  };
>  
>  /**
> -- 
> 2.46.0
>
Nicolin Chen Aug. 30, 2024, 4:59 p.m. UTC | #8
On Fri, Aug 30, 2024 at 04:09:04PM +0000, Mostafa Saleh wrote:
 
> On Tue, Aug 27, 2024 at 12:51:38PM -0300, Jason Gunthorpe wrote:
> > For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting
> > as the parent and a user provided STE fragment that defines the CD table
> > and related data with addresses translated by the S2 iommu_domain.
> >
> > The kernel only permits userspace to control certain allowed bits of the
> > STE that are safe for user/guest control.
> >
> > IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> > translation, but there is no way of knowing which S1 entries refer to a
> > range of S2.
> >
> > For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
> > flush all ASIDs from the VMID after flushing the S2 on any change to the
> > S2.
> >
> > Similarly we have to flush the entire ATC if the S2 is changed.
> >
> 
> I am still reviewing this patch, but just some quick questions.
> 
> 1) How does userspace do IOTLB maintenance for S1 in that case?

We do all the TLBI/ATC/CD invalidations via the VIOMMU uapi:
https://lore.kernel.org/linux-iommu/cover.1724776335.git.nicolinc@nvidia.com/

In another word, nesting support requires the VIOMMU p1 series
at least.

> 2) Is there a reason the UAPI is designed this way?
> The way I imagined this, is that userspace will pass the pointer to the CD
> (+ format) not the STE (or part of it).
> Making user space messing with shareability and cacheability of S1 CD access
> feels odd. (Although CD configure page table access which is similar).

Given that STE.S1ContextPtr itself is an IPA/GPA, it feels to me
that the HW is designed in such a fashion of user space managing
the CD table and its entries?

CD cache will be flushed if CFGI_CD{_ALL} is trapped. This would
be the same if we pass CD info via the uAPI.

What's the concern for shareability?

Thanks
Nicolin
Jason Gunthorpe Aug. 30, 2024, 5:04 p.m. UTC | #9
On Fri, Aug 30, 2024 at 04:09:04PM +0000, Mostafa Saleh wrote:
> Hi Jason,
> 
> On Tue, Aug 27, 2024 at 12:51:38PM -0300, Jason Gunthorpe wrote:
> > For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting
> > as the parent and a user provided STE fragment that defines the CD table
> > and related data with addresses translated by the S2 iommu_domain.
> > 
> > The kernel only permits userspace to control certain allowed bits of the
> > STE that are safe for user/guest control.
> > 
> > IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> > translation, but there is no way of knowing which S1 entries refer to a
> > range of S2.
> > 
> > For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
> > flush all ASIDs from the VMID after flushing the S2 on any change to the
> > S2.
> > 
> > Similarly we have to flush the entire ATC if the S2 is changed.
> > 
> 
> I am still reviewing this patch, but just some quick questions.
> 
> 1) How does userspace do IOTLB maintenance for S1 in that case?

See

https://lore.kernel.org/linux-iommu/cover.1724776335.git.nicolinc@nvidia.com

Patch 17

Really, this series and that series must be together. We have a patch
planning issue to sort out here as well, all 27 should go together
into the same merge window.

> 2) Is there a reason the UAPI is designed this way?
> The way I imagined this, is that userspace will pass the pointer to the CD
> (+ format) not the STE (or part of it).

Yes, we need more information from the STE than just that. EATS and
STALL for instance. And the cachability below. Who knows what else in
the future.

We also want to support the V=0, Bypass and Abort STE configurations
under the nesting domain (V, CFG required) so that the VIOMMU can
remain affiliated with the STE in all cases. This is necessary to
allow VIOMMU event reporting to always work.

Looking at the masks:

STRTAB_STE_0_NESTING_ALLOWED = 0xf80fffffffffffff
STRTAB_STE_1_NESTING_ALLOWED = 0x380000ff

So we do use alot of the bits. Reformatting from the native HW format
into something else doesn't seem better for VMM or kernel..

This is similar to the invalidation design where we also just forward
the invalidation command as is in native HW format, and how IDR is
done the same.

Overall this sort of direct transparency is how I prefer to see these
kinds of iommufd HW specific interfaces designed. From a lot of
experience here, arbitary marshall/unmarshall is often an
antipattern :)

> Making user space messing with shareability and cacheability of S1 CD access
> feels odd. (Although CD configure page table access which is similar).

As I understand it, the walk of the CD table will be constrained by
the S2FWB, just like all the other accesses by the guest.

So we just take a consistent approach of allowing the guest to provide
memattrs in the vSTE, CD, and S1 page table and rely on the HW's S2FWB
to police it.

As you say there are lots of memattr type bits under direct guest
control, it doesn't necessarily make alot of sense to permit
everything in those contexts and then add extra code to do something
different here.

Though I agree it looks odd, it is self-consistent.

Jason
Mostafa Saleh Sept. 2, 2024, 9:57 a.m. UTC | #10
On Fri, Aug 30, 2024 at 02:04:26PM -0300, Jason Gunthorpe wrote:
> On Fri, Aug 30, 2024 at 04:09:04PM +0000, Mostafa Saleh wrote:
> > Hi Jason,
> > 
> > On Tue, Aug 27, 2024 at 12:51:38PM -0300, Jason Gunthorpe wrote:
> > > For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting
> > > as the parent and a user provided STE fragment that defines the CD table
> > > and related data with addresses translated by the S2 iommu_domain.
> > > 
> > > The kernel only permits userspace to control certain allowed bits of the
> > > STE that are safe for user/guest control.
> > > 
> > > IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
> > > translation, but there is no way of knowing which S1 entries refer to a
> > > range of S2.
> > > 
> > > For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
> > > flush all ASIDs from the VMID after flushing the S2 on any change to the
> > > S2.
> > > 
> > > Similarly we have to flush the entire ATC if the S2 is changed.
> > > 
> > 
> > I am still reviewing this patch, but just some quick questions.
> > 
> > 1) How does userspace do IOTLB maintenance for S1 in that case?
> 
> See
> 
> https://lore.kernel.org/linux-iommu/cover.1724776335.git.nicolinc@nvidia.com
> 
> Patch 17

Thanks, I had this series on my radar, I will check it by this week.

>
> Really, this series and that series must be together. We have a patch
> planning issue to sort out here as well, all 27 should go together
> into the same merge window.
> 
> > 2) Is there a reason the UAPI is designed this way?
> > The way I imagined this, is that userspace will pass the pointer to the CD
> > (+ format) not the STE (or part of it).
> 
> Yes, we need more information from the STE than just that. EATS and
> STALL for instance. And the cachability below. Who knows what else in
> the future.

But for example if that was extended later, how can user space know
which fields are allowed and which are not?

> 
> We also want to support the V=0, Bypass and Abort STE configurations
> under the nesting domain (V, CFG required) so that the VIOMMU can
> remain affiliated with the STE in all cases. This is necessary to
> allow VIOMMU event reporting to always work.
> 
> Looking at the masks:
> 
> STRTAB_STE_0_NESTING_ALLOWED = 0xf80fffffffffffff
> STRTAB_STE_1_NESTING_ALLOWED = 0x380000ff
> 
> So we do use alot of the bits. Reformatting from the native HW format
> into something else doesn't seem better for VMM or kernel..
> 
> This is similar to the invalidation design where we also just forward
> the invalidation command as is in native HW format, and how IDR is
> done the same.
> 
> Overall this sort of direct transparency is how I prefer to see these
> kinds of iommufd HW specific interfaces designed. From a lot of
> experience here, arbitary marshall/unmarshall is often an
> antipattern :)

Is there any documentation for the (proposed) SMMUv3 UAPI for IOMMUFD?
I can understand reading IDRs from userspace (with some sanitation),
but adding some more logic to map vSTE to STE needs more care of what
kind of semantics are provided.

Also, I am working on similar interface for pKVM where we “paravirtualize”
the SMMU access for guests, it’s different semantics, but I hope we can
align that with IOMMUFD (but it’s nowhere near upstream now)

I see you are talking in LPC about IOMMUFD:
https://lore.kernel.org/linux-iommu/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/T/#m2dbb08f3bf8506a492bc7dda2de662e42371e683

Do you have any plans to talk about this also?

Thanks,
Mostafa
> 
> > Making user space messing with shareability and cacheability of S1 CD access
> > feels odd. (Although CD configure page table access which is similar).
> 
> As I understand it, the walk of the CD table will be constrained by
> the S2FWB, just like all the other accesses by the guest.
> 
> So we just take a consistent approach of allowing the guest to provide
> memattrs in the vSTE, CD, and S1 page table and rely on the HW's S2FWB
> to police it.
> 
> As you say there are lots of memattr type bits under direct guest
> control, it doesn't necessarily make alot of sense to permit
> everything in those contexts and then add extra code to do something
> different here.
> 
> Though I agree it looks odd, it is self-consistent.
> 
> Jason
Jason Gunthorpe Sept. 3, 2024, 12:30 a.m. UTC | #11
On Mon, Sep 02, 2024 at 09:57:45AM +0000, Mostafa Saleh wrote:
> > > 2) Is there a reason the UAPI is designed this way?
> > > The way I imagined this, is that userspace will pass the pointer to the CD
> > > (+ format) not the STE (or part of it).
> > 
> > Yes, we need more information from the STE than just that. EATS and
> > STALL for instance. And the cachability below. Who knows what else in
> > the future.
> 
> But for example if that was extended later, how can user space know
> which fields are allowed and which are not?

Changes the vSTE rules that require userspace being aware would have
to be signaled in the GET_INFO answer. This is the same process no
matter how you encode the STE bits in the structure.

This confirmation of kernel support would then be reflected in the
vIDRs to the VM and the VM could know to set the extended bits.

Otherwise setting an invalidate vSTE will fail the ioctl, the VMM can
log the event, generate an event and install an abort vSTE.

> > Overall this sort of direct transparency is how I prefer to see these
> > kinds of iommufd HW specific interfaces designed. From a lot of
> > experience here, arbitary marshall/unmarshall is often an
> > antipattern :)
> 
> Is there any documentation for the (proposed) SMMUv3 UAPI for IOMMUFD?

Just the comments in this series?

> I can understand reading IDRs from userspace (with some sanitation),
> but adding some more logic to map vSTE to STE needs more care of what
> kind of semantics are provided.

We can enhance the comment if you think it is not clear enough. It
lists the fields the userspace should pass through.

> Also, I am working on similar interface for pKVM where we “paravirtualize”
> the SMMU access for guests, it’s different semantics, but I hope we can
> align that with IOMMUFD (but it’s nowhere near upstream now)

Well, if you do paravirt where you just do map/unmap calls to the
hypervisor (ie classic virtio-iommu) then you don't need to do very
much.

If you want to do nesting, then IMHO, just present a real vSMMU. It is
already intended to be paravirtualized and this is what the
confidential compute people are going to be doing as well.

Otherwise I'd expect you'd get more value to align with the
virtio-iommu nesting stuff, where they have layed out what information
the VM needs. iommufd is not intended to be just jammed directly into
a VM. There is an expectation that a VMM will sit there on top and
massage things.

> I see you are talking in LPC about IOMMUFD:
> https://lore.kernel.org/linux-iommu/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/T/#m2dbb08f3bf8506a492bc7dda2de662e42371e683
> 
> Do you have any plans to talk about this also?

Nothing specific, this is LPC so if people in the room would like to
use the session for that then we can talk about it. Last year the room
wanted to talk about PASID mostly.

I haven't heard if someone is going to KVM forum to talk about
vSMMUv3? Eric? Nicolin do you know?

Jason
Nicolin Chen Sept. 3, 2024, 1:13 a.m. UTC | #12
On Mon, Sep 02, 2024 at 09:30:22PM -0300, Jason Gunthorpe wrote:

> > I see you are talking in LPC about IOMMUFD:
> > https://lore.kernel.org/linux-iommu/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/T/#m2dbb08f3bf8506a492bc7dda2de662e42371e683
> > 
> > Do you have any plans to talk about this also?
> 
> Nothing specific, this is LPC so if people in the room would like to
> use the session for that then we can talk about it. Last year the room
> wanted to talk about PASID mostly.
> 
> I haven't heard if someone is going to KVM forum to talk about
> vSMMUv3? Eric? Nicolin do you know?

I am not attending for seemingly in-person option only, nor sure
if there's a topic: it doesn't seem to have an official schedule
yet..

That being said, I think we do need a thread at least, for vSMMU
in the QEMU. The multi-vSMMU design requires to change the vSMMU
module to a pluggable one via QEMU command line, while the PCI-
vSMMU topology should be implemented in libvirt, either of which
needs some effort, where I haven't got bandwidth to spare. Given
that kernel patches are in a solid shape, I think I could start
to borrow some help. I'll draft an email to the qemu-devel list.
Folks joining the forum might be able to carry out a discussion.

Thanks
Nicolin
Mostafa Saleh Sept. 3, 2024, 9 a.m. UTC | #13
On Mon, Sep 02, 2024 at 09:30:22PM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 02, 2024 at 09:57:45AM +0000, Mostafa Saleh wrote:
> > > > 2) Is there a reason the UAPI is designed this way?
> > > > The way I imagined this, is that userspace will pass the pointer to the CD
> > > > (+ format) not the STE (or part of it).
> > > 
> > > Yes, we need more information from the STE than just that. EATS and
> > > STALL for instance. And the cachability below. Who knows what else in
> > > the future.
> > 
> > But for example if that was extended later, how can user space know
> > which fields are allowed and which are not?
> 
> Changes the vSTE rules that require userspace being aware would have
> to be signaled in the GET_INFO answer. This is the same process no
> matter how you encode the STE bits in the structure.
> 
How? And why changing that in the future is not a problem as sanitising IDRs?

> This confirmation of kernel support would then be reflected in the
> vIDRs to the VM and the VM could know to set the extended bits.
> 
> Otherwise setting an invalidate vSTE will fail the ioctl, the VMM can
> log the event, generate an event and install an abort vSTE.
> 
> > > Overall this sort of direct transparency is how I prefer to see these
> > > kinds of iommufd HW specific interfaces designed. From a lot of
> > > experience here, arbitary marshall/unmarshall is often an
> > > antipattern :)
> > 
> > Is there any documentation for the (proposed) SMMUv3 UAPI for IOMMUFD?
> 
> Just the comments in this series?

But this is a UAPI. How can userspace implement that if it has no
documentation, and how can it be maintained if there is no clear
interface with userspace with what is expected/returned...

> 
> > I can understand reading IDRs from userspace (with some sanitation),
> > but adding some more logic to map vSTE to STE needs more care of what
> > kind of semantics are provided.
> 
> We can enhance the comment if you think it is not clear enough. It
> lists the fields the userspace should pass through.
> 
> > Also, I am working on similar interface for pKVM where we “paravirtualize”
> > the SMMU access for guests, it’s different semantics, but I hope we can
> > align that with IOMMUFD (but it’s nowhere near upstream now)
> 
> Well, if you do paravirt where you just do map/unmap calls to the
> hypervisor (ie classic virtio-iommu) then you don't need to do very
> much.

But we have a different model, with virtio-iommu, it typically presents
the device to the VM and on the backend it calls VFIO MAP/UNMAP.
Although technically we can have virtio-iommu in the hypervisor (EL2),
that is a lot of complexit and increase in the TCB of pKVM.

For pKVM, the VMM is not trusted and the hypervisor would do the map/unmap...,
but the VMM will have to configure the virtual view of the device (Mapping of
endpoints to virtual endpoints, vIRQs…), this requires a userspace interface
to query some HW info (similar to VFIO VFIO_DEVICE_GET_IRQ_INFO and then mapping
it to a GSI through KVM, but for IOMMUs)
Though, this design is very early and in progress.

> 
> If you want to do nesting, then IMHO, just present a real vSMMU. It is
> already intended to be paravirtualized and this is what the
> confidential compute people are going to be doing as well.
> 
> Otherwise I'd expect you'd get more value to align with the
> virtio-iommu nesting stuff, where they have layed out what information
> the VM needs. iommufd is not intended to be just jammed directly into
> a VM. There is an expectation that a VMM will sit there on top and
> massage things.

I haven’t been keeping up with iommufd lately, I will try to spend more
time on that in the future.
But my idea is that we would create an IOMMUFD, attach it to a device and then
through some extra IOCTLs, we can configure some “virtual” topology for it which
then relies on KVM, again this is very early, and we need to support pKVM IOMMUs
in the host first (I plan to send v2 RFC soon for that)

> 
> > I see you are talking in LPC about IOMMUFD:
> > https://lore.kernel.org/linux-iommu/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/T/#m2dbb08f3bf8506a492bc7dda2de662e42371e683
> > 
> > Do you have any plans to talk about this also?
> 
> Nothing specific, this is LPC so if people in the room would like to
> use the session for that then we can talk about it. Last year the room
> wanted to talk about PASID mostly.
> 
> I haven't heard if someone is going to KVM forum to talk about
> vSMMUv3? Eric? Nicolin do you know?

I see, I won’t be in KVM forum, but I plan to attend LPC, we can discuss
further there if people are interested.

Thanks,
Mostafa

> 
> Jason
Jason Gunthorpe Sept. 3, 2024, 11:55 p.m. UTC | #14
On Tue, Sep 03, 2024 at 09:00:32AM +0000, Mostafa Saleh wrote:
> On Mon, Sep 02, 2024 at 09:30:22PM -0300, Jason Gunthorpe wrote:
> > On Mon, Sep 02, 2024 at 09:57:45AM +0000, Mostafa Saleh wrote:
> > > > > 2) Is there a reason the UAPI is designed this way?
> > > > > The way I imagined this, is that userspace will pass the pointer to the CD
> > > > > (+ format) not the STE (or part of it).
> > > > 
> > > > Yes, we need more information from the STE than just that. EATS and
> > > > STALL for instance. And the cachability below. Who knows what else in
> > > > the future.
> > > 
> > > But for example if that was extended later, how can user space know
> > > which fields are allowed and which are not?
> > 
> > Changes the vSTE rules that require userspace being aware would have
> > to be signaled in the GET_INFO answer. This is the same process no
> > matter how you encode the STE bits in the structure.
>
> How? 

--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -504,6 +504,11 @@ struct iommu_hw_info_vtd {
        __aligned_u64 ecap_reg;
 };
 
+enum {
+       /* The kernel understand field NEW in the STE */
+       IOMMU_HW_INFO_ARM_SMMUV3_VSTE_NEW = 1 << 0,
+};
+
 /**
  * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information
  *                                   (IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
@@ -514,6 +519,7 @@ struct iommu_hw_info_vtd {
  * @iidr: Information about the implementation and implementer of ARM SMMU,
  *        and architecture version supported
  * @aidr: ARM SMMU architecture version
+ * @kernel_capabilities: Bitmask of IOMMU_HW_INFO_ARM_SMMUV3_*
  *
  * For the details of @idr, @iidr and @aidr, please refer to the chapters
  * from 6.3.1 to 6.3.6 in the SMMUv3 Spec.
@@ -535,6 +541,7 @@ struct iommu_hw_info_arm_smmuv3 {
        __u32 idr[6];
        __u32 iidr;
        __u32 aidr;
+       __u32 kernel_capabilities;
 };
 
 /**

For example. There are all sorts of rules about 0 filling and things
that make this work trivially for the userspace.

> And why changing that in the future is not a problem as
> sanitising IDRs?

Reporting a static kernel capability through GET_INFO output is
easier/saner than providing some kind of policy flags in the GET_INFO
input to specify how the sanitization should work.

> > This confirmation of kernel support would then be reflected in the
> > vIDRs to the VM and the VM could know to set the extended bits.
> > 
> > Otherwise setting an invalidate vSTE will fail the ioctl, the VMM can
> > log the event, generate an event and install an abort vSTE.
> > 
> > > > Overall this sort of direct transparency is how I prefer to see these
> > > > kinds of iommufd HW specific interfaces designed. From a lot of
> > > > experience here, arbitary marshall/unmarshall is often an
> > > > antipattern :)
> > > 
> > > Is there any documentation for the (proposed) SMMUv3 UAPI for IOMMUFD?
> > 
> > Just the comments in this series?
> 
> But this is a UAPI. How can userspace implement that if it has no
> documentation, and how can it be maintained if there is no clear
> interface with userspace with what is expected/returned...

I'm not sure what you are looking for here? I don't think an entire
tutorial on how to build a paravirtualized vSMMU is appropriate to
put in comments?

The behavior of the vSTE processing as a single feature should be
understandable, and I think it is from the comments and code. If it
isn't, lets improve that.

There is definitely a jump from knowing how these point items work to
knowing how to build a para virtualized vSMMU in your VMM. This is
likely a gap of thousands of lines of code in userspace :\

> But we have a different model, with virtio-iommu, it typically presents
> the device to the VM and on the backend it calls VFIO MAP/UNMAP.

I thought pkvm's model was also map/unmap - so it could suppor HW
without nesting?

> Although technically we can have virtio-iommu in the hypervisor (EL2),
> that is a lot of complexit and increase in the TCB of pKVM.

That is too bad, it would be nice to not have to do everything new
from scratch to just get to the same outcome. :(

> I haven’t been keeping up with iommufd lately, I will try to spend more
> time on that in the future.
> But my idea is that we would create an IOMMUFD, attach it to a device and then
> through some extra IOCTLs, we can configure some “virtual” topology for it which
> then relies on KVM, again this is very early, and we need to support pKVM IOMMUs
> in the host first (I plan to send v2 RFC soon for that)

Most likely your needs here will match the needs of the confidential
compute people which are basically doing that same stuf. The way pKVM
wants to operate looks really similar to me to how the confidential
compute stuff wants to work where the VMM is untrusted and operations
are delegated to some kind of secure world.

So, for instance, AMD recently posted patches about how they would
create vPCI devices in their secure world, and there are various
things in the works for secure IOMMUs and so forth all with the
intention of not trusting the VMM, or permitting the VMM to compromise
the VM.

I would *really* like everyone to sit down and figure out how to
manage virtual device lifecycle in a single language!

> > I haven't heard if someone is going to KVM forum to talk about
> > vSMMUv3? Eric? Nicolin do you know?
> 
> I see, I won’t be in KVM forum, but I plan to attend LPC, we can discuss
> further there if people are interested.

Sure, definately, look forward to meeting you!

Jason
Mostafa Saleh Sept. 6, 2024, 11:07 a.m. UTC | #15
On Tue, Sep 03, 2024 at 08:55:32PM -0300, Jason Gunthorpe wrote:
> On Tue, Sep 03, 2024 at 09:00:32AM +0000, Mostafa Saleh wrote:
> > On Mon, Sep 02, 2024 at 09:30:22PM -0300, Jason Gunthorpe wrote:
> > > On Mon, Sep 02, 2024 at 09:57:45AM +0000, Mostafa Saleh wrote:
> > > > > > 2) Is there a reason the UAPI is designed this way?
> > > > > > The way I imagined this, is that userspace will pass the pointer to the CD
> > > > > > (+ format) not the STE (or part of it).
> > > > > 
> > > > > Yes, we need more information from the STE than just that. EATS and
> > > > > STALL for instance. And the cachability below. Who knows what else in
> > > > > the future.
> > > > 
> > > > But for example if that was extended later, how can user space know
> > > > which fields are allowed and which are not?
> > > 
> > > Changes the vSTE rules that require userspace being aware would have
> > > to be signaled in the GET_INFO answer. This is the same process no
> > > matter how you encode the STE bits in the structure.
> >
> > How? 
> 
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -504,6 +504,11 @@ struct iommu_hw_info_vtd {
>         __aligned_u64 ecap_reg;
>  };
>  
> +enum {
> +       /* The kernel understand field NEW in the STE */
> +       IOMMU_HW_INFO_ARM_SMMUV3_VSTE_NEW = 1 << 0,
> +};
> +
>  /**
>   * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information
>   *                                   (IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
> @@ -514,6 +519,7 @@ struct iommu_hw_info_vtd {
>   * @iidr: Information about the implementation and implementer of ARM SMMU,
>   *        and architecture version supported
>   * @aidr: ARM SMMU architecture version
> + * @kernel_capabilities: Bitmask of IOMMU_HW_INFO_ARM_SMMUV3_*
>   *
>   * For the details of @idr, @iidr and @aidr, please refer to the chapters
>   * from 6.3.1 to 6.3.6 in the SMMUv3 Spec.
> @@ -535,6 +541,7 @@ struct iommu_hw_info_arm_smmuv3 {
>         __u32 idr[6];
>         __u32 iidr;
>         __u32 aidr;
> +       __u32 kernel_capabilities;
>  };
>  
>  /**
> 
> For example. There are all sorts of rules about 0 filling and things
> that make this work trivially for the userspace.

I see, that makes sense to have.
However, I believe the UAPI can be more clear and solid in terms of
what is supported (maybe a typical struct with the CD, and some
extra configs?) I will give it a think.

> 
> > And why changing that in the future is not a problem as
> > sanitising IDRs?
> 
> Reporting a static kernel capability through GET_INFO output is
> easier/saner than providing some kind of policy flags in the GET_INFO
> input to specify how the sanitization should work.

I don’t think it’s “policy”, it’s just giving userspace the minimum
knowledge it needs to create the vSMMU, but again no really strong
opinion about that.

> 
> > > This confirmation of kernel support would then be reflected in the
> > > vIDRs to the VM and the VM could know to set the extended bits.
> > > 
> > > Otherwise setting an invalidate vSTE will fail the ioctl, the VMM can
> > > log the event, generate an event and install an abort vSTE.
> > > 
> > > > > Overall this sort of direct transparency is how I prefer to see these
> > > > > kinds of iommufd HW specific interfaces designed. From a lot of
> > > > > experience here, arbitary marshall/unmarshall is often an
> > > > > antipattern :)
> > > > 
> > > > Is there any documentation for the (proposed) SMMUv3 UAPI for IOMMUFD?
> > > 
> > > Just the comments in this series?
> > 
> > But this is a UAPI. How can userspace implement that if it has no
> > documentation, and how can it be maintained if there is no clear
> > interface with userspace with what is expected/returned...
> 
> I'm not sure what you are looking for here? I don't think an entire
> tutorial on how to build a paravirtualized vSMMU is appropriate to
> put in comments?

Sorry, I don’t think I was clear, I meant actual documentation for
the UAPI, as in RST files for example. If I want to support that
in kvmtool how can I implement it? I think we should have clear
docs for the UAPI with what is exposed from the driver, what are the
possible returns, expected behaviour of abort, bypass in the vSTE...,
it also makes it easier to reason about some of the choices.

> 
> The behavior of the vSTE processing as a single feature should be
> understandable, and I think it is from the comments and code. If it
> isn't, lets improve that.
> 
> There is definitely a jump from knowing how these point items work to
> knowing how to build a para virtualized vSMMU in your VMM. This is
> likely a gap of thousands of lines of code in userspace :\
> 
> > But we have a different model, with virtio-iommu, it typically presents
> > the device to the VM and on the backend it calls VFIO MAP/UNMAP.
> 
> I thought pkvm's model was also map/unmap - so it could suppor HW
> without nesting?
> 

Yes, it’s map/unmap based, but it has to be implemented in the
hypervisor, it doesn’t rely on VFIO.

Also, I have been looking at nesting recently (but for the host).

>
> > Although technically we can have virtio-iommu in the hypervisor (EL2),
> > that is a lot of complexit and increase in the TCB of pKVM.
> 
> That is too bad, it would be nice to not have to do everything new
> from scratch to just get to the same outcome. :(
> 

Yeah, I agree, yet a new pv interface :/
Although, it’s quite simple as it follows Linux IOMMU semantics with
HVC as transport and no in-memory data structures, queues...

Not implementing virtio in the hypervisor was an initial design choice
as it would be very challenging in terms of reasoning about the TCB.

> > I haven’t been keeping up with iommufd lately, I will try to spend more
> > time on that in the future.
> > But my idea is that we would create an IOMMUFD, attach it to a device and then
> > through some extra IOCTLs, we can configure some “virtual” topology for it which
> > then relies on KVM, again this is very early, and we need to support pKVM IOMMUs
> > in the host first (I plan to send v2 RFC soon for that)
> 
> Most likely your needs here will match the needs of the confidential
> compute people which are basically doing that same stuf. The way pKVM
> wants to operate looks really similar to me to how the confidential
> compute stuff wants to work where the VMM is untrusted and operations
> are delegated to some kind of secure world.
> 

Exactly.

> So, for instance, AMD recently posted patches about how they would
> create vPCI devices in their secure world, and there are various
> things in the works for secure IOMMUs and so forth all with the
> intention of not trusting the VMM, or permitting the VMM to compromise
> the VM.
> 

I have seen those, but didn't get the time to read them

> I would *really* like everyone to sit down and figure out how to
> manage virtual device lifecycle in a single language!

Yes, just like the guest_memfd work. There has been also
some work to unify some of the guest HVC bits:
https://lore.kernel.org/all/20240830130150.8568-1-will@kernel.org/

We should do the same for IOMMUs and IO.

> 
> > > I haven't heard if someone is going to KVM forum to talk about
> > > vSMMUv3? Eric? Nicolin do you know?
> > 
> > I see, I won’t be in KVM forum, but I plan to attend LPC, we can discuss
> > further there if people are interested.
> 
> Sure, definately, look forward to meeting you!

Great, me too!

Thanks,
Mostafa

> 
> Jason
Jason Gunthorpe Sept. 6, 2024, 1:34 p.m. UTC | #16
On Fri, Sep 06, 2024 at 11:07:47AM +0000, Mostafa Saleh wrote:

> However, I believe the UAPI can be more clear and solid in terms of
> what is supported (maybe a typical struct with the CD, and some
> extra configs?) I will give it a think.

I don't think breaking up the STE into fields in another struct is
going to be a big improvement, it adds more code and corner cases to
break up and reassemble it.

#define STRTAB_STE_0_NESTING_ALLOWED                                         \
	cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
		    STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
#define STRTAB_STE_1_NESTING_ALLOWED                            \
	cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
		    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
		    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)

It is 11 fields that would need to be recoded, that's alot.. Even if
you say the 3 cache ones are not needed it is still alot.

> > Reporting a static kernel capability through GET_INFO output is
> > easier/saner than providing some kind of policy flags in the GET_INFO
> > input to specify how the sanitization should work.
> 
> I don’t think it’s “policy”, it’s just giving userspace the minimum
> knowledge it needs to create the vSMMU, but again no really strong
> opinion about that.

There is no single "minimum knowledge" though, it depends on what the
VMM is able to support. IMHO once you go over to the "VMM has to
ignore bits it doesn't understand" you may as well just show
everything. Then the kernel side can't be wrong.

If the kernel side can be wrong, then you are back to handshaking
policy because the kernel can't assume that all existing VMMs wil not
rely on the kernel to do the masking.

> > > But this is a UAPI. How can userspace implement that if it has no
> > > documentation, and how can it be maintained if there is no clear
> > > interface with userspace with what is expected/returned...
> > 
> > I'm not sure what you are looking for here? I don't think an entire
> > tutorial on how to build a paravirtualized vSMMU is appropriate to
> > put in comments?
> 
> Sorry, I don’t think I was clear, I meant actual documentation for
> the UAPI, as in RST files for example. If I want to support that
> in kvmtool how can I implement it? 

Well, you need thousands of lines of code in kvtool to build a vIOMMU :)

Nicolin is looking at writing something, lets see.

I think for here we should focus on the comments being succinct but
sufficient to understand what the uAPI does itself.

> > I would *really* like everyone to sit down and figure out how to
> > manage virtual device lifecycle in a single language!
> 
> Yes, just like the guest_memfd work. There has been also
> some work to unify some of the guest HVC bits:
> https://lore.kernel.org/all/20240830130150.8568-1-will@kernel.org/

I think Dan Williams is being ringleader for the PCI side effort on CC

Jason
Jason Gunthorpe Sept. 6, 2024, 6:28 p.m. UTC | #17
On Fri, Aug 30, 2024 at 02:04:26PM -0300, Jason Gunthorpe wrote:

> Really, this series and that series must be together. We have a patch
> planning issue to sort out here as well, all 27 should go together
> into the same merge window.

I'm thinking strongly about moving the nesting code into
arm-smmuv3-nesting.c and wrapping it all in a kconfig. Similar to
SVA. Now that we see the viommu related code and more it will be a few
hundred lines there.

We'd leave the kconfig off until all of the parts are merged. There
are enough dependent series here that this seems to be the best
compromise.. Embedded cases can turn it off so it is longterm useful.

WDYT?

Jason
Nicolin Chen Sept. 6, 2024, 6:49 p.m. UTC | #18
On Fri, Sep 06, 2024 at 03:28:31PM -0300, Jason Gunthorpe wrote:
> On Fri, Aug 30, 2024 at 02:04:26PM -0300, Jason Gunthorpe wrote:
> 
> > Really, this series and that series must be together. We have a patch
> > planning issue to sort out here as well, all 27 should go together
> > into the same merge window.
> 
> I'm thinking strongly about moving the nesting code into
> arm-smmuv3-nesting.c and wrapping it all in a kconfig. Similar to
> SVA. Now that we see the viommu related code and more it will be a few
> hundred lines there.

+1 for this. I was thinking of doing that when I started drafting
patches, yet at that time we only had a couple of functions. Now,
with viommu_ops, it has grown.

> We'd leave the kconfig off until all of the parts are merged. There
> are enough dependent series here that this seems to be the best
> compromise.. Embedded cases can turn it off so it is longterm useful.

You mean doing that so as to merge two series separately? I wonder
if somebody might turn it on while the 2nd series isn't merge...

Thanks
Nicolin
Jason Gunthorpe Sept. 6, 2024, 11:15 p.m. UTC | #19
On Fri, Sep 06, 2024 at 11:49:08AM -0700, Nicolin Chen wrote:
> > We'd leave the kconfig off until all of the parts are merged. There
> > are enough dependent series here that this seems to be the best
> > compromise.. Embedded cases can turn it off so it is longterm useful.
> 
> You mean doing that so as to merge two series separately? 

Not just the two series, but the ITS fix too.

> I wonder if somebody might turn it on while the 2nd series isn't
> merge...

As long as distro's don't and I trust them to do a good job.

Jason
Mostafa Saleh Sept. 10, 2024, 11:12 a.m. UTC | #20
On Fri, Sep 06, 2024 at 10:34:44AM -0300, Jason Gunthorpe wrote:
> On Fri, Sep 06, 2024 at 11:07:47AM +0000, Mostafa Saleh wrote:
> 
> > However, I believe the UAPI can be more clear and solid in terms of
> > what is supported (maybe a typical struct with the CD, and some
> > extra configs?) I will give it a think.
> 
> I don't think breaking up the STE into fields in another struct is
> going to be a big improvement, it adds more code and corner cases to
> break up and reassemble it.
> 
> #define STRTAB_STE_0_NESTING_ALLOWED                                         \
> 	cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
> 		    STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
> #define STRTAB_STE_1_NESTING_ALLOWED                            \
> 	cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
> 		    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
> 		    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
> 
> It is 11 fields that would need to be recoded, that's alot.. Even if
> you say the 3 cache ones are not needed it is still alot.

I was thinking of providing a higher level semantics
(no need for caching, valid...), something like:

struct smmu_user_table {
	u64 cd_table;
	u32 smmu_cd_cfg;  /* linear or 2lvl,.... */
	u32 smmu_trans_cfg; /* Translate, bypass, abort */
	u32 dev_feat; /*ATS, STALL, …*/
};

I feel that is a bit more clear for user space? Instead of
partially setting the STE, and it should be easier to extend than
masking the STE.

I’am not opposed to the vSTE, I just feel it's loosely defined,
that's why I was asking for the docs.

> 
> > > Reporting a static kernel capability through GET_INFO output is
> > > easier/saner than providing some kind of policy flags in the GET_INFO
> > > input to specify how the sanitization should work.
> > 
> > I don’t think it’s “policy”, it’s just giving userspace the minimum
> > knowledge it needs to create the vSMMU, but again no really strong
> > opinion about that.
> 
> There is no single "minimum knowledge" though, it depends on what the
> VMM is able to support. IMHO once you go over to the "VMM has to
> ignore bits it doesn't understand" you may as well just show
> everything. Then the kernel side can't be wrong.
> 
> If the kernel side can be wrong, then you are back to handshaking
> policy because the kernel can't assume that all existing VMMs wil not
> rely on the kernel to do the masking.
>

I agree it’s tricky, again no strong opinion on that, although I doubt
that a VMM would care about all the SMMU features.

> > > > But this is a UAPI. How can userspace implement that if it has no
> > > > documentation, and how can it be maintained if there is no clear
> > > > interface with userspace with what is expected/returned...
> > > 
> > > I'm not sure what you are looking for here? I don't think an entire
> > > tutorial on how to build a paravirtualized vSMMU is appropriate to
> > > put in comments?
> > 
> > Sorry, I don’t think I was clear, I meant actual documentation for
> > the UAPI, as in RST files for example. If I want to support that
> > in kvmtool how can I implement it? 
> 
> Well, you need thousands of lines of code in kvtool to build a vIOMMU :)
> 
> Nicolin is looking at writing something, lets see.
> 
> I think for here we should focus on the comments being succinct but
> sufficient to understand what the uAPI does itself.
> 
Actually I think the opposite, I think UAPI docs is more important
here, especially for the vSTE, that's how we can compare the code to
what is expected from user-space.

> > > I would *really* like everyone to sit down and figure out how to
> > > manage virtual device lifecycle in a single language!
> > 
> > Yes, just like the guest_memfd work. There has been also
> > some work to unify some of the guest HVC bits:
> > https://lore.kernel.org/all/20240830130150.8568-1-will@kernel.org/
> 
> I think Dan Williams is being ringleader for the PCI side effort on CC

Thanks, I will try to spend some time on the secure VFIO work.

Thanks,
Mostafa

> 
> Jason
Jason Gunthorpe Sept. 15, 2024, 9:39 p.m. UTC | #21
On Tue, Sep 10, 2024 at 11:12:20AM +0000, Mostafa Saleh wrote:
> On Fri, Sep 06, 2024 at 10:34:44AM -0300, Jason Gunthorpe wrote:
> > On Fri, Sep 06, 2024 at 11:07:47AM +0000, Mostafa Saleh wrote:
> > 
> > > However, I believe the UAPI can be more clear and solid in terms of
> > > what is supported (maybe a typical struct with the CD, and some
> > > extra configs?) I will give it a think.
> > 
> > I don't think breaking up the STE into fields in another struct is
> > going to be a big improvement, it adds more code and corner cases to
> > break up and reassemble it.
> > 
> > #define STRTAB_STE_0_NESTING_ALLOWED                                         \
> > 	cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
> > 		    STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
> > #define STRTAB_STE_1_NESTING_ALLOWED                            \
> > 	cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
> > 		    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
> > 		    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
> > 
> > It is 11 fields that would need to be recoded, that's alot.. Even if
> > you say the 3 cache ones are not needed it is still alot.
> 
> I was thinking of providing a higher level semantics
> (no need for caching, valid...), something like:

Well, that isn't higher level semantics, really, it is just splitting
up the existing fields.

We do need to do something with valid as well as the VM can create a
non-valid STE and we still have to wrap a nesting domain around it to
ensure that event routing can work.

> struct smmu_user_table {
> 	u64 cd_table;
> 	u32 smmu_cd_cfg;  /* linear or 2lvl,.... */
> 	u32 smmu_trans_cfg; /* Translate, bypass, abort */
> 	u32 dev_feat; /*ATS, STALL, …*/
> };
> 
> I feel that is a bit more clear for user space?

Having done these sorts of interfaces over a long time, I belive it is
not. Deviating from the native HW format and re-marshalling into
something else is error prone and can become a problem when the
transformation from the well known HW format to the intermediate
format becomes a source of confusion too.

> Instead of partially setting the STE, and it should be easier to
> extend than masking the STE.

It is not going to partially set, it is going to validate a mask from
the original vSTE and if the mask fails then it will create a
non-valid STE instead.

We can't eliminate the mask because the VMM needs to mask and check
always no matter what the kernel interface is.

One option for the vmm is to just pass the vSTE entirely to the kernel
and let it validate it. If validation fails then use a V=0 STE
instead.
x
> I’am not opposed to the vSTE, I just feel it's loosely defined,
> that's why I was asking for the docs.

The kdoc lists all the fields and it is reflected directly to HW, and
there is a bitmask above being very explicit about what bits are
allowed. Where is the loosely defined you see?

If we broaden the mask down the road then we'd need some feature bits
to inform the VMM that the kernel supports a wider vSTE mask.

Thanks,
Jason
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 8db3db6328f8b7..a21dce1f25cb95 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -295,6 +295,7 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	case CMDQ_OP_TLBI_NH_ASID:
 		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
 		fallthrough;
+	case CMDQ_OP_TLBI_NH_ALL:
 	case CMDQ_OP_TLBI_S12_VMALL:
 		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
 		break;
@@ -1640,6 +1641,59 @@  void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
 }
 EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_s2_domain_ste);
 
+static void arm_smmu_make_nested_cd_table_ste(
+	struct arm_smmu_ste *target, struct arm_smmu_master *master,
+	struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+	arm_smmu_make_s2_domain_ste(target, master, nested_domain->s2_parent,
+				    ats_enabled);
+
+	target->data[0] = cpu_to_le64(STRTAB_STE_0_V |
+				      FIELD_PREP(STRTAB_STE_0_CFG,
+						 STRTAB_STE_0_CFG_NESTED)) |
+			  (nested_domain->ste[0] & ~STRTAB_STE_0_CFG);
+	target->data[1] |= nested_domain->ste[1];
+}
+
+/*
+ * Create a physical STE from the virtual STE that userspace provided when it
+ * created the nested domain. Using the vSTE userspace can request:
+ * - Non-valid STE
+ * - Abort STE
+ * - Bypass STE (install the S2, no CD table)
+ * - CD table STE (install the S2 and the userspace CD table)
+ */
+static void arm_smmu_make_nested_domain_ste(
+	struct arm_smmu_ste *target, struct arm_smmu_master *master,
+	struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+	/*
+	 * Userspace can request a non-valid STE through the nesting interface.
+	 * We relay that into an abort physical STE with the intention that
+	 * C_BAD_STE for this SID can be generated to userspace.
+	 */
+	if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
+		arm_smmu_make_abort_ste(target);
+		return;
+	}
+
+	switch (FIELD_GET(STRTAB_STE_0_CFG,
+			  le64_to_cpu(nested_domain->ste[0]))) {
+	case STRTAB_STE_0_CFG_S1_TRANS:
+		arm_smmu_make_nested_cd_table_ste(target, master, nested_domain,
+						  ats_enabled);
+		break;
+	case STRTAB_STE_0_CFG_BYPASS:
+		arm_smmu_make_s2_domain_ste(
+			target, master, nested_domain->s2_parent, ats_enabled);
+		break;
+	case STRTAB_STE_0_CFG_ABORT:
+	default:
+		arm_smmu_make_abort_ste(target);
+		break;
+	}
+}
+
 /*
  * This can safely directly manipulate the STE memory without a sync sequence
  * because the STE table has not been installed in the SMMU yet.
@@ -2065,7 +2119,16 @@  int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 		if (!master->ats_enabled)
 			continue;
 
-		arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd);
+		if (master_domain->nest_parent) {
+			/*
+			 * If a S2 used as a nesting parent is changed we have
+			 * no option but to completely flush the ATC.
+			 */
+			arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
+		} else {
+			arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size,
+						&cmd);
+		}
 
 		for (i = 0; i < master->num_streams; i++) {
 			cmd.atc.sid = master->streams[i].id;
@@ -2192,6 +2255,16 @@  static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
 	}
 	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
 
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 &&
+	    smmu_domain->nest_parent) {
+		/*
+		 * When the S2 domain changes all the nested S1 ASIDs have to be
+		 * flushed too.
+		 */
+		cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
+	}
+
 	/*
 	 * Unfortunately, this can't be leaf-only since we may have
 	 * zapped an entire table.
@@ -2604,8 +2677,8 @@  static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
 
 static struct arm_smmu_master_domain *
 arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
-			    struct arm_smmu_master *master,
-			    ioasid_t ssid)
+			    struct arm_smmu_master *master, ioasid_t ssid,
+			    bool nest_parent)
 {
 	struct arm_smmu_master_domain *master_domain;
 
@@ -2614,7 +2687,8 @@  arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
 	list_for_each_entry(master_domain, &smmu_domain->devices,
 			    devices_elm) {
 		if (master_domain->master == master &&
-		    master_domain->ssid == ssid)
+		    master_domain->ssid == ssid &&
+		    master_domain->nest_parent == nest_parent)
 			return master_domain;
 	}
 	return NULL;
@@ -2634,6 +2708,9 @@  to_smmu_domain_devices(struct iommu_domain *domain)
 	if ((domain->type & __IOMMU_DOMAIN_PAGING) ||
 	    domain->type == IOMMU_DOMAIN_SVA)
 		return to_smmu_domain(domain);
+	if (domain->type == IOMMU_DOMAIN_NESTED)
+		return container_of(domain, struct arm_smmu_nested_domain,
+				    domain)->s2_parent;
 	return NULL;
 }
 
@@ -2649,7 +2726,8 @@  static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
 		return;
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
-	master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid);
+	master_domain = arm_smmu_find_master_domain(
+		smmu_domain, master, ssid, domain->type == IOMMU_DOMAIN_NESTED);
 	if (master_domain) {
 		list_del(&master_domain->devices_elm);
 		kfree(master_domain);
@@ -2664,6 +2742,7 @@  struct arm_smmu_attach_state {
 	struct iommu_domain *old_domain;
 	struct arm_smmu_master *master;
 	bool cd_needs_ats;
+	bool disable_ats;
 	ioasid_t ssid;
 	/* Resulting state */
 	bool ats_enabled;
@@ -2716,7 +2795,8 @@  static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
 		 * enabled if we have arm_smmu_domain, those always have page
 		 * tables.
 		 */
-		state->ats_enabled = arm_smmu_ats_supported(master);
+		state->ats_enabled = !state->disable_ats &&
+				     arm_smmu_ats_supported(master);
 	}
 
 	if (smmu_domain) {
@@ -2725,6 +2805,8 @@  static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
 			return -ENOMEM;
 		master_domain->master = master;
 		master_domain->ssid = state->ssid;
+		master_domain->nest_parent = new_domain->type ==
+					       IOMMU_DOMAIN_NESTED;
 
 		/*
 		 * During prepare we want the current smmu_domain and new
@@ -3097,6 +3179,122 @@  static struct iommu_domain arm_smmu_blocked_domain = {
 	.ops = &arm_smmu_blocked_ops,
 };
 
+static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
+				      struct device *dev)
+{
+	struct arm_smmu_nested_domain *nested_domain =
+		container_of(domain, struct arm_smmu_nested_domain, domain);
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct arm_smmu_attach_state state = {
+		.master = master,
+		.old_domain = iommu_get_domain_for_dev(dev),
+		.ssid = IOMMU_NO_PASID,
+		/* Currently invalidation of ATC is not supported */
+		.disable_ats = true,
+	};
+	struct arm_smmu_ste ste;
+	int ret;
+
+	if (arm_smmu_ssids_in_use(&master->cd_table) ||
+	    nested_domain->s2_parent->smmu != master->smmu)
+		return -EINVAL;
+
+	mutex_lock(&arm_smmu_asid_lock);
+	ret = arm_smmu_attach_prepare(&state, domain);
+	if (ret) {
+		mutex_unlock(&arm_smmu_asid_lock);
+		return ret;
+	}
+
+	arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
+					state.ats_enabled);
+	arm_smmu_install_ste_for_dev(master, &ste);
+	arm_smmu_attach_commit(&state);
+	mutex_unlock(&arm_smmu_asid_lock);
+	return 0;
+}
+
+static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
+{
+	kfree(container_of(domain, struct arm_smmu_nested_domain, domain));
+}
+
+static const struct iommu_domain_ops arm_smmu_nested_ops = {
+	.attach_dev = arm_smmu_attach_dev_nested,
+	.free = arm_smmu_domain_nested_free,
+};
+
+static struct iommu_domain *
+arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
+			      struct iommu_domain *parent,
+			      const struct iommu_user_data *user_data)
+{
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct arm_smmu_nested_domain *nested_domain;
+	struct arm_smmu_domain *smmu_parent;
+	struct iommu_hwpt_arm_smmuv3 arg;
+	unsigned int eats;
+	unsigned int cfg;
+	int ret;
+
+	if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/*
+	 * Must support some way to prevent the VM from bypassing the cache
+	 * because VFIO currently does not do any cache maintenance.
+	 */
+	if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) &&
+	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	ret = iommu_copy_struct_from_user(&arg, user_data,
+					  IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (flags || !(master->smmu->features & ARM_SMMU_FEAT_TRANS_S1))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (!(parent->type & __IOMMU_DOMAIN_PAGING))
+		return ERR_PTR(-EINVAL);
+
+	smmu_parent = to_smmu_domain(parent);
+	if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 ||
+	    smmu_parent->smmu != master->smmu)
+		return ERR_PTR(-EINVAL);
+
+	/* EIO is reserved for invalid STE data. */
+	if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
+	    (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
+		return ERR_PTR(-EIO);
+
+	cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg.ste[0]));
+	if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
+	    cfg != STRTAB_STE_0_CFG_S1_TRANS)
+		return ERR_PTR(-EIO);
+
+	eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1]));
+	if (eats != STRTAB_STE_1_EATS_ABT)
+		return ERR_PTR(-EIO);
+
+	if (cfg != STRTAB_STE_0_CFG_S1_TRANS)
+		eats = STRTAB_STE_1_EATS_ABT;
+
+	nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT);
+	if (!nested_domain)
+		return ERR_PTR(-ENOMEM);
+
+	nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
+	nested_domain->domain.ops = &arm_smmu_nested_ops;
+	nested_domain->s2_parent = smmu_parent;
+	nested_domain->ste[0] = arg.ste[0];
+	nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
+
+	return &nested_domain->domain;
+}
+
 static struct iommu_domain *
 arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
 			   struct iommu_domain *parent,
@@ -3108,9 +3306,13 @@  arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
 	struct arm_smmu_domain *smmu_domain;
 	int ret;
 
+	if (parent)
+		return arm_smmu_domain_alloc_nesting(dev, flags, parent,
+						     user_data);
+
 	if (flags & ~PAGING_FLAGS)
 		return ERR_PTR(-EOPNOTSUPP);
-	if (parent || user_data)
+	if (user_data)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	smmu_domain = arm_smmu_domain_alloc();
@@ -3123,6 +3325,7 @@  arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
 			goto err_free;
 		}
 		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+		smmu_domain->nest_parent = true;
 	}
 
 	smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 4b05c81b181a82..b563cfedf22e91 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -240,6 +240,7 @@  static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
 #define STRTAB_STE_0_CFG_BYPASS		4
 #define STRTAB_STE_0_CFG_S1_TRANS	5
 #define STRTAB_STE_0_CFG_S2_TRANS	6
+#define STRTAB_STE_0_CFG_NESTED		7
 
 #define STRTAB_STE_0_S1FMT		GENMASK_ULL(5, 4)
 #define STRTAB_STE_0_S1FMT_LINEAR	0
@@ -291,6 +292,15 @@  static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
 
 #define STRTAB_STE_3_S2TTB_MASK		GENMASK_ULL(51, 4)
 
+/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */
+#define STRTAB_STE_0_NESTING_ALLOWED                                         \
+	cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
+		    STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
+#define STRTAB_STE_1_NESTING_ALLOWED                            \
+	cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
+		    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
+		    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
+
 /*
  * Context descriptors.
  *
@@ -508,6 +518,7 @@  struct arm_smmu_cmdq_ent {
 			};
 		} cfgi;
 
+		#define CMDQ_OP_TLBI_NH_ALL     0x10
 		#define CMDQ_OP_TLBI_NH_ASID	0x11
 		#define CMDQ_OP_TLBI_NH_VA	0x12
 		#define CMDQ_OP_TLBI_EL2_ALL	0x20
@@ -790,10 +801,18 @@  struct arm_smmu_domain {
 	struct list_head		devices;
 	spinlock_t			devices_lock;
 	bool				enforce_cache_coherency : 1;
+	bool				nest_parent : 1;
 
 	struct mmu_notifier		mmu_notifier;
 };
 
+struct arm_smmu_nested_domain {
+	struct iommu_domain domain;
+	struct arm_smmu_domain *s2_parent;
+
+	__le64 ste[2];
+};
+
 /* The following are exposed for testing purposes. */
 struct arm_smmu_entry_writer_ops;
 struct arm_smmu_entry_writer {
@@ -830,6 +849,7 @@  struct arm_smmu_master_domain {
 	struct list_head devices_elm;
 	struct arm_smmu_master *master;
 	ioasid_t ssid;
+	u8 nest_parent;
 };
 
 static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 83b6e1cd338d8f..76e9ad6c9403af 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -394,14 +394,34 @@  struct iommu_hwpt_vtd_s1 {
 	__u32 __reserved;
 };
 
+/**
+ * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info
+ *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
+ *
+ * @ste: The first two double words of the user space Stream Table Entry for
+ *       a user stage-1 Context Descriptor Table. Must be little-endian.
+ *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
+ *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
+ *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
+ *
+ * -EIO will be returned if @ste is not legal or contains any non-allowed field.
+ * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass
+ * nested domain will translate the same as the nesting parent.
+ */
+struct iommu_hwpt_arm_smmuv3 {
+	__aligned_le64 ste[2];
+};
+
 /**
  * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
  * @IOMMU_HWPT_DATA_NONE: no data
  * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
  */
 enum iommu_hwpt_data_type {
 	IOMMU_HWPT_DATA_NONE = 0,
 	IOMMU_HWPT_DATA_VTD_S1 = 1,
+	IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
 };
 
 /**