
[v1,15/16] iommu/tegra241-cmdqv: Add user-space use support

Message ID 30c7aff68c35040ee637629cb9fc2b6e7f83f76c.1744353300.git.nicolinc@nvidia.com
State New
Series iommufd: Add vIOMMU infrastructure (Part-4 vCMDQ)

Commit Message

Nicolin Chen April 11, 2025, 6:37 a.m. UTC
Add support via the vIOMMU infrastructure for the virtualization use case.

This basically allows a VMM to allocate VINTFs (as vIOMMU objects) and
assign VCMDQs (vCMDQ objects) to them. A VINTF's MMIO page0 can be mmap'd
to user space, so a VM can access it directly without a VMEXIT and the
corresponding hypercall.

As an initial version, the number of VCMDQs per VINTF is fixed to two.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  15 +
 include/uapi/linux/iommufd.h                  |  34 ++
 .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c     |   6 +-
 .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c    | 322 +++++++++++++++++-
 4 files changed, 370 insertions(+), 7 deletions(-)
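
For orientation, a minimal VMM-side sketch of the flow described above.
This is hypothetical glue code, not part of the patch: it assumes the
IOMMU_VIOMMU_ALLOC data_uptr/data_len plumbing added earlier in this
series, and that @out_vintf_page0_pgoff is in page units as its name
suggests.

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/iommufd.h>

/* Allocate a TEGRA241_CMDQV vIOMMU (i.e. a VINTF) on top of an S2
 * nesting-parent HWPT, then mmap the VINTF MMIO page0 so the VMM can
 * expose it to the guest directly. */
static void *map_vintf_page0(int iommufd, uint32_t dev_id, uint32_t s2_hwpt_id)
{
	struct iommu_viommu_tegra241_cmdqv data = {};
	struct iommu_viommu_alloc cmd = {
		.size = sizeof(cmd),
		.type = IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
		.dev_id = dev_id,
		.hwpt_id = s2_hwpt_id,
		/* data_uptr/data_len names assumed from this series */
		.data_uptr = (uintptr_t)&data,
		.data_len = sizeof(data),
	};

	if (ioctl(iommufd, IOMMU_VIOMMU_ALLOC, &cmd))
		return MAP_FAILED;

	/* Assuming the returned offset is in page units, per its name */
	return mmap(NULL, data.out_vintf_page0_pgsz, PROT_READ | PROT_WRITE,
		    MAP_SHARED, iommufd,
		    (off_t)data.out_vintf_page0_pgoff * sysconf(_SC_PAGESIZE));
}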

Comments

Tian, Kevin April 21, 2025, 8:37 a.m. UTC | #1
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, April 11, 2025 2:38 PM
> 
> Add support via the vIOMMU infrastructure for the virtualization use case.
> 
> This basically allows a VMM to allocate VINTFs (as vIOMMU objects) and
> assign VCMDQs (vCMDQ objects) to them. A VINTF's MMIO page0 can be mmap'd
> to user space, so a VM can access it directly without a VMEXIT and the
> corresponding hypercall.

it'd be helpful to add a bit more context, e.g. what page0 contains, SID
slots, vcmdq_base (mapped in the S2), etc., so it's easier to understand
it from the start instead of by reading the code.

> 
> As an initial version, the number of VCMDQs per VINTF is fixed to two.

so a user could map both VCMDQs of a VINTF even when only one
VCMDQ is created, given that the entire 64K page0 is eligible for mmap
once the VINTF is associated with a viommu?

no security issue given the VINTF is not shared, but conceptually, if
feasible (e.g. if the two VCMDQs' MMIO ranges sit in different 4K pages
of the VINTF page0), does it make sense to do per-VCMDQ mmap control
and return the mmap info at VCMDQ alloc?

> +static struct iommufd_vcmdq *
> +tegra241_cmdqv_vcmdq_alloc(struct iommufd_viommu *viommu,
> +			   const struct iommu_user_data *user_data)
> +{
> +	struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
> +	struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
> +	struct iommu_vcmdq_tegra241_cmdqv arg;
> +	struct tegra241_vcmdq *vcmdq;
> +	phys_addr_t q_base;
> +	char header[64];
> +	int ret;
> +
> +	ret = iommu_copy_struct_from_user(
> +		&arg, user_data, IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV, vcmdq_base);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	if (!arg.vcmdq_base || arg.vcmdq_base & ~VCMDQ_ADDR)
> +		return ERR_PTR(-EINVAL);
> +	if (!arg.vcmdq_log2size || arg.vcmdq_log2size > VCMDQ_LOG2SIZE)
> +		return ERR_PTR(-EINVAL);
> +	if (arg.vcmdq_id >= cmdqv->num_lvcmdqs_per_vintf)
> +		return ERR_PTR(-EINVAL);
> +	q_base = arm_smmu_domain_ipa_to_pa(vintf->s2_domain, arg.vcmdq_base);
> +	if (!q_base)
> +		return ERR_PTR(-EINVAL);
> +
> +	if (vintf->lvcmdqs[arg.vcmdq_id]) {
> +		vcmdq = vintf->lvcmdqs[arg.vcmdq_id];
> +
> +		/* deinit the previous setting as a reset, before re-init */
> +		tegra241_vcmdq_hw_deinit(vcmdq);
> +
> +		vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR;
> +		vcmdq->cmdq.q.q_base |= arg.vcmdq_log2size;
> +		tegra241_vcmdq_hw_init_user(vcmdq);
> +
> +		return &vcmdq->core;
> +	}

why not return -EBUSY here?

> +
> +	vcmdq = iommufd_vcmdq_alloc(viommu, struct tegra241_vcmdq, core);
> +	if (!vcmdq)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ret = tegra241_vintf_init_lvcmdq(vintf, arg.vcmdq_id, vcmdq);
> +	if (ret)
> +		goto free_vcmdq;
> +	dev_dbg(cmdqv->dev, "%sallocated\n",
> +		lvcmdq_error_header(vcmdq, header, 64));
> +
> +	vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR;
> +	vcmdq->cmdq.q.q_base |= arg.vcmdq_log2size;

could the queue span multiple pages? there is no guarantee
that the HPA of the guest queue would be contiguous :/
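
For a concrete picture of the payload being validated in the quoted code
above, a hypothetical VMM-side fill of the struct this patch defines (the
generic vCMDQ allocation ioctl that carries it comes from an earlier patch
in this series; guest_queue_ipa is a made-up variable):

	struct iommu_vcmdq_tegra241_cmdqv arg = {
		/* LVCMDQ slot within the VINTF */
		.vcmdq_id = 0,
		/* queue size is 1 << vcmdq_log2size */
		.vcmdq_log2size = 8,
		/* guest PA (IPA); translated via the S2 domain by the driver */
		.vcmdq_base = guest_queue_ipa,
	};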
Tian, Kevin April 23, 2025, 8:05 a.m. UTC | #2
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Tuesday, April 22, 2025 3:14 AM
> On Mon, Apr 21, 2025 at 08:37:40AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Friday, April 11, 2025 2:38 PM
> > >
> > > +
> > > +	vcmdq = iommufd_vcmdq_alloc(viommu, struct tegra241_vcmdq, core);
> > > +	if (!vcmdq)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	ret = tegra241_vintf_init_lvcmdq(vintf, arg.vcmdq_id, vcmdq);
> > > +	if (ret)
> > > +		goto free_vcmdq;
> > > +	dev_dbg(cmdqv->dev, "%sallocated\n",
> > > +		lvcmdq_error_header(vcmdq, header, 64));
> > > +
> > > +	vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR;
> > > +	vcmdq->cmdq.q.q_base |= arg.vcmdq_log2size;
> >
> > could the queue span multiple pages? there is no guarantee
> > that the HPA of the guest queue would be contiguous :/
> 
> It certainly can. The VMM must make sure the guest PAs are contiguous
> by using huge pages to back the guest RAM space. The kernel has no
> control of this and has to trust the VMM.
> 
> I'm adding a note here:
> 	/* User space ensures that the queue memory is physically contiguous */
> 
> And likely something similar in the uAPI header too.
> 

It's not a good idea to have the kernel trust the VMM. Also, I'm not
sure the contiguity is guaranteed all the time with huge pages
(e.g. if just using THP).

@Jason?

btw, does the smmu only read the cmdq or also update some fields
in the queue? If the latter, then it also brings a security hole,
as a malicious VMM could violate the contiguity requirement to
instruct the smmu to touch pages which don't belong to it...
Jason Gunthorpe April 24, 2025, 1:40 p.m. UTC | #3
On Wed, Apr 23, 2025 at 11:51:53PM -0700, Nicolin Chen wrote:
> On Wed, Apr 23, 2025 at 08:13:33PM -0300, Jason Gunthorpe wrote:
> > On Wed, Apr 23, 2025 at 11:31:29AM -0700, Nicolin Chen wrote:
> > 
> > > > It also needs to act like a mdev and lock down the part of the IOAS
> > > > that provides that memory so the pin can't be released and UAF things.
> > > 
> > > If I capture this correctly, the GPA->PA mapping is already done
> > > at the IOAS level for the S2 HWPT/domain, i.e. pages are already
> > > pinned. So we just need a pair of for-driver APIs to validate
> > > the contiguity and refcount the pages, calling iopt_area_add_access().
> > 
> > Yes, adding an access is the key thing, the access will give you a
> > page list which you can validate, but it also provides a way to
> > synchronize if a hostile userspace does an unmap.
> 
> The new APIs are very much like iommufd_access_pin/unpin_pages(). But
> to reduce the amount of code that we have to share with driver.o,
> I added a smaller iopt_area_get/put_access() that gets an access
> and increases/decreases the refcounts only.

Maybe the access should be obtained by the core code to avoid the
driver.o bloat? All the cmdq types need a memory buffer, right?
Perhaps that should just be generalized.

> > Meanwhile, I am wondering if we could use the known S2 domain to
> > translate the GPAs to PAs for the contiguity test, which feels a
> > little cleaner to do in an IOMMU driver vs. with a page list?

You still need the access, and the access already generates a page
list..

Jason
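
A minimal sketch of such a validation, using the existing for-driver
iommufd_access_pin_pages()/iommufd_access_unpin_pages() APIs. The helper
and its placement are hypothetical; it assumes a page-aligned base/size
and an already-attached struct iommufd_access:

#include <linux/iommufd.h>
#include <linux/mm.h>
#include <linux/slab.h>

/* Pin the guest queue through an iommufd access and verify that its
 * backing pages are physically contiguous, so a hostile VMM cannot
 * point the HW at pages it does not own. */
static int vcmdq_pin_and_check_contig(struct iommufd_access *access,
				      unsigned long base_ipa, size_t size,
				      phys_addr_t *out_pa)
{
	unsigned long npages = DIV_ROUND_UP(size, PAGE_SIZE);
	struct page **pages;
	unsigned long i;
	int ret = 0;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* Holds the pin until iommufd_access_unpin_pages() at free time */
	ret = iommufd_access_pin_pages(access, base_ipa, size, pages,
				       IOMMUFD_ACCESS_RW_WRITE);
	if (ret)
		goto out_free;

	for (i = 1; i < npages; i++) {
		if (page_to_phys(pages[i]) !=
		    page_to_phys(pages[0]) + i * PAGE_SIZE) {
			iommufd_access_unpin_pages(access, base_ipa, size);
			ret = -EINVAL;
			goto out_free;
		}
	}
	*out_pa = page_to_phys(pages[0]);
out_free:
	kfree(pages);
	return ret;
}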
Nicolin Chen April 24, 2025, 3:46 p.m. UTC | #4
On Thu, Apr 24, 2025 at 10:40:49AM -0300, Jason Gunthorpe wrote:
> On Wed, Apr 23, 2025 at 11:51:53PM -0700, Nicolin Chen wrote:
> > On Wed, Apr 23, 2025 at 08:13:33PM -0300, Jason Gunthorpe wrote:
> > > On Wed, Apr 23, 2025 at 11:31:29AM -0700, Nicolin Chen wrote:
> > > 
> > > > > It also needs to act like a mdev and lock down the part of the IOAS
> > > > > that provides that memory so the pin can't be released and UAF things.
> > > > 
> > > > If I capture this correctly, the GPA->PA mapping is already done
> > > > at the IOAS level for the S2 HWPT/domain, i.e. pages are already
> > > > pinned. So we just need a pair of for-driver APIs to validate
> > > > the contiguity and refcount the pages, calling iopt_area_add_access().
> > > 
> > > Yes, adding an access is the key thing, the access will give you a
> > > page list which you can validate, but it also provides a way to
> > > synchronize if a hostile userspace does an unmap.
> > 
> > The new APIs are very much like iommufd_access_pin/unpin_pages(). But
> > to reduce the amount of code that we have to share with driver.o,
> > I added a smaller iopt_area_get/put_access() that gets an access
> > and increases/decreases the refcounts only.
> 
> Maybe the access should be obtained by the core code to avoid the
> driver.o bloat? All the cmdq types need a memory buffer, right?
> Perhaps that should just be generalized.

Yes. AMD just confirmed that they needed a similar setup. I think
we can do that!

> > Meanwhile, I am wondering if we could use the known S2 domain to
> > translate the GPAs to PAs for the contiguity test, which feels a
> > little cleaner to do in an IOMMU driver vs. with a page list?
> 
> You still need the access, and the access already generates a page
> list..

Right. I was thinking it could save a few lines that fetch the
page list, but then that would need another round of translation,
which is unnecessary.

Thanks
Nicolin

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index a5835af72417..84f8dfdd9638 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -999,6 +999,14 @@  int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 				struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
 				bool sync);
 
+static inline phys_addr_t
+arm_smmu_domain_ipa_to_pa(struct arm_smmu_domain *smmu_domain, u64 ipa)
+{
+	if (WARN_ON_ONCE(smmu_domain->stage != ARM_SMMU_DOMAIN_S2))
+		return 0;
+	return iommu_iova_to_phys(&smmu_domain->domain, ipa);
+}
+
 #ifdef CONFIG_ARM_SMMU_V3_SVA
 bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
 bool arm_smmu_master_sva_supported(struct arm_smmu_master *master);
@@ -1075,9 +1083,16 @@  int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
 void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state);
 void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master);
 int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt);
+struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+			      const struct iommu_user_data *user_data);
+int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+			       struct iommu_user_data_array *array);
 #else
 #define arm_smmu_hw_info NULL
 #define arm_vsmmu_alloc NULL
+#define arm_vsmmu_alloc_domain_nested NULL
+#define arm_vsmmu_cache_invalidate NULL
 
 static inline int
 arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 428f1e9e6239..ce20f038b56b 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -952,10 +952,28 @@  struct iommu_fault_alloc {
  * enum iommu_viommu_type - Virtual IOMMU Type
  * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
  * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type
+ * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3
  */
 enum iommu_viommu_type {
 	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
 	IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1,
+	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2,
+};
+
+/**
+ * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual Interface
+ *                                      (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV)
+ * @out_vintf_page0_pgoff: Offset of the VINTF page0 for mmap syscall
+ * @out_vintf_page0_pgsz: Size of the VINTF page0 for mmap syscall
+ *
+ * Both @out_vintf_page0_pgoff and @out_vintf_page0_pgsz are given by the kernel
+ * for user space to mmap the VINTF page0 from the host physical address space
+ * to the guest physical address space, so that a guest kernel can directly R/W
+ * the VINTF page0 in order to control its virtual command queues.
+ */
+struct iommu_viommu_tegra241_cmdqv {
+	__aligned_u64 out_vintf_page0_pgoff;
+	__aligned_u64 out_vintf_page0_pgsz;
 };
 
 /**
@@ -1152,9 +1170,25 @@  struct iommu_veventq_alloc {
 /**
  * enum iommu_vcmdq_type - Virtual Queue Type
  * @IOMMU_VCMDQ_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3
  */
 enum iommu_vcmdq_data_type {
 	IOMMU_VCMDQ_TYPE_DEFAULT = 0,
+	IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV = 1,
+};
+
+/**
+ * struct iommu_vcmdq_tegra241_cmdqv - NVIDIA Tegra241's Virtual Command Queue
+ *                                      for its CMDQV Extension for ARM SMMUv3
+ *                                      (IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV)
+ * @vcmdq_id: logical ID of a virtual command queue in the vIOMMU instance
+ * @vcmdq_log2size: (1 << @vcmdq_log2size) will be the size of the vcmdq
+ * @vcmdq_base: guest physical address (IPA) of the vcmdq base
+ */
+struct iommu_vcmdq_tegra241_cmdqv {
+	__u32 vcmdq_id;
+	__u32 vcmdq_log2size;
+	__aligned_u64 vcmdq_base;
 };
 
 /**
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index aa8653af50f2..162872d82cd4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -216,7 +216,7 @@  static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg,
 	return 0;
 }
 
-static struct iommu_domain *
+struct iommu_domain *
 arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 			      const struct iommu_user_data *user_data)
 {
@@ -327,8 +327,8 @@  static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu,
 	return 0;
 }
 
-static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
-				      struct iommu_user_data_array *array)
+int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+			       struct iommu_user_data_array *array)
 {
 	struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
 	struct arm_smmu_device *smmu = vsmmu->smmu;
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index ba029f7d24ce..b778739f845a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -8,7 +8,9 @@ 
 #include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
 #include <linux/iommu.h>
+#include <linux/iommufd.h>
 #include <linux/iopoll.h>
+#include <uapi/linux/iommufd.h>
 
 #include <acpi/acpixf.h>
 
@@ -26,6 +28,7 @@ 
 #define  CMDQV_EN			BIT(0)
 
 #define TEGRA241_CMDQV_PARAM		0x0004
+#define  CMDQV_NUM_SID_PER_VM_LOG2	GENMASK(15, 12)
 #define  CMDQV_NUM_VINTF_LOG2		GENMASK(11, 8)
 #define  CMDQV_NUM_VCMDQ_LOG2		GENMASK(7, 4)
 
@@ -53,6 +56,9 @@ 
 #define  VINTF_STATUS			GENMASK(3, 1)
 #define  VINTF_ENABLED			BIT(0)
 
+#define TEGRA241_VINTF_SID_MATCH(s)	(0x0040 + 0x4*(s))
+#define TEGRA241_VINTF_SID_REPLACE(s)	(0x0080 + 0x4*(s))
+
 #define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \
 					(0x00C0 + 0x8*(m))
 #define  LVCMDQ_ERR_MAP_NUM_64		2
@@ -114,6 +120,7 @@  MODULE_PARM_DESC(bypass_vcmdq,
 
 /**
  * struct tegra241_vcmdq - Virtual Command Queue
+ * @core: Embedded iommufd_vcmdq structure
  * @idx: Global index in the CMDQV
  * @lidx: Local index in the VINTF
  * @enabled: Enable status
@@ -124,6 +131,8 @@  MODULE_PARM_DESC(bypass_vcmdq,
  * @page1: MMIO Page1 base address
  */
 struct tegra241_vcmdq {
+	struct iommufd_vcmdq core;
+
 	u16 idx;
 	u16 lidx;
 
@@ -136,18 +145,26 @@  struct tegra241_vcmdq {
 	void __iomem *page0;
 	void __iomem *page1;
 };
+#define core_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core)
 
 /**
  * struct tegra241_vintf - Virtual Interface
+ * @vsmmu: Embedded arm_vsmmu structure
  * @idx: Global index in the CMDQV
+ * @vmid: VMID for configuration
  * @enabled: Enable status
  * @hyp_own: Owned by hypervisor (in-kernel)
  * @cmdqv: Parent CMDQV pointer
  * @lvcmdqs: List of logical VCMDQ pointers
  * @base: MMIO base address
+ * @s2_domain: Stage-2 SMMU domain
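+ * @mmap_pgoff: Offset of the VINTF page0 for the mmap syscall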
+ * @sid_slots: Stream ID Slot allocator
  */
 struct tegra241_vintf {
+	struct arm_vsmmu vsmmu;
+
 	u16 idx;
+	u16 vmid;
 
 	bool enabled;
 	bool hyp_own;
@@ -156,6 +173,25 @@  struct tegra241_vintf {
 	struct tegra241_vcmdq **lvcmdqs;
 
 	void __iomem *base;
+	struct arm_smmu_domain *s2_domain;
+	unsigned long mmap_pgoff;
+
+	struct ida sid_slots;
+};
+#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core)
+
+/**
+ * struct tegra241_vintf_slot - Virtual Interface Stream ID Slot
+ * @core: Embedded iommufd_vdevice structure
+ * @vintf: Parent VINTF pointer
+ * @sid: Physical Stream ID
+ * @id: Slot index in the VINTF
+ */
+struct tegra241_vintf_slot {
+	struct iommufd_vdevice core;
+	struct tegra241_vintf *vintf;
+	u32 sid;
+	u8 idx;
 };
 
 /**
@@ -163,10 +199,12 @@  struct tegra241_vintf {
  * @smmu: SMMUv3 device
  * @dev: CMDQV device
  * @base: MMIO base address
+ * @base_phys: MMIO physical base address, for mmap
  * @irq: IRQ number
  * @num_vintfs: Total number of VINTFs
  * @num_vcmdqs: Total number of VCMDQs
  * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF
+ * @num_sids_per_vintf: Total number of SID replacements per VINTF
  * @vintf_ids: VINTF id allocator
  * @vintfs: List of VINTFs
  */
@@ -175,12 +213,14 @@  struct tegra241_cmdqv {
 	struct device *dev;
 
 	void __iomem *base;
+	phys_addr_t base_phys;
 	int irq;
 
 	/* CMDQV Hardware Params */
 	u16 num_vintfs;
 	u16 num_vcmdqs;
 	u16 num_lvcmdqs_per_vintf;
+	u16 num_sids_per_vintf;
 
 	struct ida vintf_ids;
 
@@ -379,6 +419,11 @@  static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
 	dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h);
 }
 
+static void _tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
+{
+	writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+}
+
 static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
 {
 	char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
@@ -388,7 +433,7 @@  static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
 	tegra241_vcmdq_hw_deinit(vcmdq);
 
 	/* Configure and enable VCMDQ */
-	writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+	_tegra241_vcmdq_hw_init(vcmdq);
 
 	ret = vcmdq_write_config(vcmdq, VCMDQ_EN);
 	if (ret) {
@@ -407,11 +452,16 @@  static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
 static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf)
 {
 	u16 lidx;
+	int slot;
 
 	for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++)
 		if (vintf->lvcmdqs && vintf->lvcmdqs[lidx])
 			tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]);
 	vintf_write_config(vintf, 0);
+	for (slot = 0; slot < vintf->cmdqv->num_sids_per_vintf; slot++) {
+		writel_relaxed(0, REG_VINTF(vintf, SID_REPLACE(slot)));
+		writel_relaxed(0, REG_VINTF(vintf, SID_MATCH(slot)));
+	}
 }
 
 static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own)
@@ -429,7 +479,8 @@  static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own)
 	 * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a
 	 * restricted set of supported commands.
 	 */
-	regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own);
+	regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) |
+		 FIELD_PREP(VINTF_VMID, vintf->vmid);
 	writel(regval, REG_VINTF(vintf, CONFIG));
 
 	ret = vintf_write_config(vintf, regval | VINTF_EN);
@@ -555,7 +606,9 @@  static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
 
 	dev_dbg(vintf->cmdqv->dev,
 		"%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64));
-	kfree(vcmdq);
+	/* A guest-owned VCMDQ is freed with its core by the iommufd core */
+	if (vcmdq->vintf->hyp_own)
+		kfree(vcmdq);
 }
 
 static struct tegra241_vcmdq *
@@ -594,6 +647,9 @@  tegra241_vintf_alloc_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
 
 static void tegra241_cmdqv_deinit_vintf(struct tegra241_cmdqv *cmdqv, u16 idx)
 {
+	if (cmdqv->vintfs[idx]->mmap_pgoff)
+		iommufd_ctx_free_mmap(cmdqv->vintfs[idx]->vsmmu.core.ictx,
+				      cmdqv->vintfs[idx]->mmap_pgoff);
 	kfree(cmdqv->vintfs[idx]->lvcmdqs);
 	ida_free(&cmdqv->vintf_ids, idx);
 	cmdqv->vintfs[idx] = NULL;
@@ -649,7 +705,11 @@  static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx)
 
 	dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx);
 	tegra241_cmdqv_deinit_vintf(cmdqv, idx);
-	kfree(vintf);
+	if (!vintf->hyp_own)
+		ida_destroy(&vintf->sid_slots);
+	/* A guest-owned VINTF is freed with its viommu by the iommufd core */
+	if (vintf->hyp_own)
+		kfree(vintf);
 }
 
 static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu)
@@ -677,10 +737,17 @@  static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu)
 	put_device(cmdqv->dev); /* smmu->impl_dev */
 }
 
+static struct arm_vsmmu *
+tegra241_cmdqv_vsmmu_alloc(struct arm_smmu_device *smmu,
+			   struct arm_smmu_domain *smmu_domain,
+			   struct iommufd_ctx *ictx, unsigned int viommu_type,
+			   const struct iommu_user_data *user_data);
+
 static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = {
 	.get_secondary_cmdq = tegra241_cmdqv_get_cmdq,
 	.device_reset = tegra241_cmdqv_hw_reset,
 	.device_remove = tegra241_cmdqv_remove,
+	.vsmmu_alloc = tegra241_cmdqv_vsmmu_alloc,
 };
 
 /* Probe Functions */
@@ -822,6 +889,7 @@  __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
 	cmdqv->irq = irq;
 	cmdqv->base = base;
 	cmdqv->dev = smmu->impl_dev;
+	cmdqv->base_phys = res->start;
 
 	if (cmdqv->irq > 0) {
 		ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr,
@@ -838,6 +906,8 @@  __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
 	cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
 	cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
 	cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs;
+	cmdqv->num_sids_per_vintf =
+		1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval);
 
 	cmdqv->vintfs =
 		kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL);
@@ -891,3 +961,247 @@  struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
 	put_device(smmu->impl_dev);
 	return ERR_PTR(-ENODEV);
 }
+
+/* User-space vIOMMU and vCMDQ Functions */
+
+static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq)
+{
+	char header[64];
+
+	/* Configure the vcmdq only; user space does the enabling */
+	_tegra241_vcmdq_hw_init(vcmdq);
+
+	dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n",
+		lvcmdq_error_header(vcmdq, header, 64),
+		vcmdq->cmdq.q.q_base & VCMDQ_ADDR,
+		1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE));
+	return 0;
+}
+
+static struct iommufd_vcmdq *
+tegra241_cmdqv_vcmdq_alloc(struct iommufd_viommu *viommu,
+			   const struct iommu_user_data *user_data)
+{
+	struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
+	struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+	struct iommu_vcmdq_tegra241_cmdqv arg;
+	struct tegra241_vcmdq *vcmdq;
+	phys_addr_t q_base;
+	char header[64];
+	int ret;
+
+	ret = iommu_copy_struct_from_user(
+		&arg, user_data, IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV, vcmdq_base);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!arg.vcmdq_base || arg.vcmdq_base & ~VCMDQ_ADDR)
+		return ERR_PTR(-EINVAL);
+	if (!arg.vcmdq_log2size || arg.vcmdq_log2size > VCMDQ_LOG2SIZE)
+		return ERR_PTR(-EINVAL);
+	if (arg.vcmdq_id >= cmdqv->num_lvcmdqs_per_vintf)
+		return ERR_PTR(-EINVAL);
+	q_base = arm_smmu_domain_ipa_to_pa(vintf->s2_domain, arg.vcmdq_base);
+	if (!q_base)
+		return ERR_PTR(-EINVAL);
+
+	if (vintf->lvcmdqs[arg.vcmdq_id]) {
+		vcmdq = vintf->lvcmdqs[arg.vcmdq_id];
+
+		/* deinit the previous setting as a reset, before re-init */
+		tegra241_vcmdq_hw_deinit(vcmdq);
+
+		vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR;
+		vcmdq->cmdq.q.q_base |= arg.vcmdq_log2size;
+		tegra241_vcmdq_hw_init_user(vcmdq);
+
+		return &vcmdq->core;
+	}
+
+	vcmdq = iommufd_vcmdq_alloc(viommu, struct tegra241_vcmdq, core);
+	if (!vcmdq)
+		return ERR_PTR(-ENOMEM);
+
+	ret = tegra241_vintf_init_lvcmdq(vintf, arg.vcmdq_id, vcmdq);
+	if (ret)
+		goto free_vcmdq;
+	dev_dbg(cmdqv->dev, "%sallocated\n",
+		lvcmdq_error_header(vcmdq, header, 64));
+
+	vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR;
+	vcmdq->cmdq.q.q_base |= arg.vcmdq_log2size;
+
+	ret = tegra241_vcmdq_hw_init_user(vcmdq);
+	if (ret)
+		goto free_vcmdq;
+	vintf->lvcmdqs[arg.vcmdq_id] = vcmdq;
+
+	return &vcmdq->core;
+free_vcmdq:
+	iommufd_struct_destroy(viommu->ictx, vcmdq, core);
+	return ERR_PTR(ret);
+}
+
+static void tegra241_cmdqv_vcmdq_free(struct iommufd_vcmdq *core)
+{
+	struct tegra241_vcmdq *vcmdq = core_to_vcmdq(core);
+
+	tegra241_vintf_remove_lvcmdq(vcmdq->vintf, vcmdq->lidx);
+
+	/* IOMMUFD core frees the memory of vcmdq and its core */
+}
+
+static void tegra241_cmdqv_viommu_destroy(struct iommufd_viommu *viommu)
+{
+	struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
+
+	tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx);
+
+	/* IOMMUFD core frees the memory of vintf and viommu */
+}
+
+static struct iommufd_vdevice *
+tegra241_cmdqv_vdevice_alloc(struct iommufd_viommu *viommu, struct device *dev,
+			     u64 dev_id)
+{
+	struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct arm_smmu_stream *stream = &master->streams[0];
+	struct tegra241_vintf_slot *slot;
+	int idx;
+
+	if (dev_id > UINT_MAX)
+		return ERR_PTR(-EINVAL);
+
+	slot = iommufd_vdevice_alloc(viommu, struct tegra241_vintf_slot, core);
+	if (!slot)
+		return ERR_PTR(-ENOMEM);
+
+	WARN_ON_ONCE(master->num_streams != 1);
+
+	/* Find an empty slot of SID_MATCH and SID_REPLACE */
+	idx = ida_alloc_max(&vintf->sid_slots,
+			    vintf->cmdqv->num_sids_per_vintf - 1, GFP_KERNEL);
+	if (idx < 0) {
+		iommufd_struct_destroy(viommu->ictx, slot, core);
+		return ERR_PTR(idx);
+	}
+
+	writel_relaxed(stream->id, REG_VINTF(vintf, SID_REPLACE(idx)));
+	writel_relaxed(dev_id << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(idx)));
+	dev_dbg(vintf->cmdqv->dev,
+		"VINTF%u: allocated a slot (%d) for pSID=%x, vSID=%x\n",
+		vintf->idx, idx, stream->id, (u32)dev_id);
+
+	slot->idx = idx;
+	slot->vintf = vintf;
+	slot->sid = stream->id;
+
+	return &slot->core;
+}
+
+static void tegra241_cmdqv_vdevice_destroy(struct iommufd_vdevice *vdev)
+{
+	struct tegra241_vintf_slot *slot =
+		container_of(vdev, struct tegra241_vintf_slot, core);
+	struct tegra241_vintf *vintf = slot->vintf;
+
+	writel_relaxed(0, REG_VINTF(vintf, SID_REPLACE(slot->idx)));
+	writel_relaxed(0, REG_VINTF(vintf, SID_MATCH(slot->idx)));
+	ida_free(&vintf->sid_slots, slot->idx);
+	dev_dbg(vintf->cmdqv->dev,
+		"VINTF%u: deallocated a slot (%d) for pSID=%x\n", vintf->idx,
+		slot->idx, slot->sid);
+
+	/* IOMMUFD core frees the memory of slot and vdev */
+}
+
+static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = {
+	.destroy = tegra241_cmdqv_viommu_destroy,
+	.alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+	.cache_invalidate = arm_vsmmu_cache_invalidate,
+	.vdevice_alloc = tegra241_cmdqv_vdevice_alloc,
+	.vdevice_destroy = tegra241_cmdqv_vdevice_destroy,
+	.vcmdq_alloc = tegra241_cmdqv_vcmdq_alloc,
+	.vcmdq_free = tegra241_cmdqv_vcmdq_free,
+};
+
+static struct arm_vsmmu *
+tegra241_cmdqv_vsmmu_alloc(struct arm_smmu_device *smmu,
+			   struct arm_smmu_domain *smmu_domain,
+			   struct iommufd_ctx *ictx, unsigned int viommu_type,
+			   const struct iommu_user_data *user_data)
+{
+	struct tegra241_cmdqv *cmdqv =
+		container_of(smmu, struct tegra241_cmdqv, smmu);
+	struct iommu_viommu_tegra241_cmdqv data;
+	struct tegra241_vintf *vintf;
+	phys_addr_t page0_base;
+	int ret;
+
+	if (viommu_type != IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV)
+		return ERR_PTR(-EOPNOTSUPP);
+	if (!smmu_domain || smmu_domain->stage != ARM_SMMU_DOMAIN_S2)
+		return ERR_PTR(-EINVAL);
+	if (!user_data)
+		return ERR_PTR(-EINVAL);
+
+	ret = iommu_copy_struct_from_user(&data, user_data,
+					  IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+					  out_vintf_page0_pgsz);
+	if (ret)
+		return ERR_PTR(ret);
+
+	vintf = iommufd_viommu_alloc(ictx, struct tegra241_vintf, vsmmu.core,
+				     &tegra241_cmdqv_viommu_ops);
+	if (!vintf)
+		return ERR_PTR(-ENOMEM);
+
+	ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf);
+	if (ret < 0) {
+		dev_err(cmdqv->dev, "no more available vintf\n");
+		goto free_vintf;
+	}
+
+	vintf->s2_domain = smmu_domain;
+	vintf->vmid = smmu_domain->s2_cfg.vmid;
+
+	ret = tegra241_vintf_hw_init(vintf, false);
+	if (ret)
+		goto deinit_vintf;
+
+	vintf->lvcmdqs = kcalloc(cmdqv->num_lvcmdqs_per_vintf,
+				 sizeof(*vintf->lvcmdqs), GFP_KERNEL);
+	if (!vintf->lvcmdqs) {
+		ret = -ENOMEM;
+		goto hw_deinit_vintf;
+	}
+
+	page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx);
+	ret = iommufd_ctx_alloc_mmap(ictx, page0_base, SZ_64K, true,
+				     &vintf->mmap_pgoff);
+	if (ret)
+		goto hw_deinit_vintf;
+
+	data.out_vintf_page0_pgsz = SZ_64K;
+	data.out_vintf_page0_pgoff = vintf->mmap_pgoff;
+	ret = iommu_copy_struct_to_user(user_data, &data,
+					IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+					out_vintf_page0_pgsz);
+	if (ret)
+		goto hw_deinit_vintf;
+
+	ida_init(&vintf->sid_slots);
+
+	dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx,
+		vintf->vmid);
+
+	return &vintf->vsmmu;
+hw_deinit_vintf:
+	tegra241_vintf_hw_deinit(vintf);
+deinit_vintf:
+	tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
+free_vintf:
+	iommufd_struct_destroy(ictx, vintf, vsmmu.core);
+	return ERR_PTR(ret);
+}