[v4,11/23] iommufd/viommu: Add IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl

Message ID f52937c027e2fd25d76bc47f4965ba46f82c77c0.1746757630.git.nicolinc@nvidia.com
State New
Series iommufd: Add vIOMMU infrastructure (Part-4 HW QUEUE)

Commit Message

Nicolin Chen May 9, 2025, 3:02 a.m. UTC
Introduce a new IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl for user space to allocate
a HW QUEUE object for a vIOMMU-specific HW-accelerated queue, e.g.:
 - NVIDIA's Virtual Command Queue
 - AMD vIOMMU's Command Buffer, Event Log Buffer, and PPR Log Buffer

This is a vIOMMU-based ioctl, so simply increase the refcount of the vIOMMU.
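
A sample userspace flow, for reference (illustrative only; the fd, IDs and
addresses are placeholders, and a real caller must pass a driver-specific
queue type, since IOMMU_HW_QUEUE_TYPE_DEFAULT is rejected as reserved):

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	static int alloc_hw_queue(int iommufd, __u32 viommu_id, __u32 type,
				  __u64 base_addr, __u64 length, __u32 *out_id)
	{
		struct iommu_hw_queue_alloc cmd = {
			.size = sizeof(cmd),
			.flags = 0,
			.viommu_id = viommu_id,
			.type = type,		/* driver-specific queue type */
			.index = 0,		/* first queue of this vIOMMU */
			.base_addr = base_addr,	/* guest PA, mapped in the nesting parent's IOAS */
			.length = length,
		};

		if (ioctl(iommufd, IOMMU_HW_QUEUE_ALLOC, &cmd))
			return -errno;
		*out_id = cmd.out_hw_queue_id;
		return 0;
	}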

Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |   2 +
 include/uapi/linux/iommufd.h            |  42 ++++++++++
 drivers/iommu/iommufd/main.c            |   6 ++
 drivers/iommu/iommufd/viommu.c          | 104 ++++++++++++++++++++++++
 4 files changed, 154 insertions(+)

Comments

Nicolin Chen May 15, 2025, 6:16 p.m. UTC | #1
On Thu, May 15, 2025 at 01:06:20PM -0300, Jason Gunthorpe wrote:
> On Thu, May 08, 2025 at 08:02:32PM -0700, Nicolin Chen wrote:
> > +/**
> > + * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC)
> > + * @size: sizeof(struct iommu_hw_queue_alloc)
> > + * @flags: Must be 0
> > + * @viommu_id: Virtual IOMMU ID to associate the HW queue with
> > + * @type: One of enum iommu_hw_queue_type
> > + * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue
> > + *         model
> > + * @out_hw_queue_id: The ID of the new HW queue
> > + * @base_addr: Base address of the queue memory in guest physical address space
> > + * @length: Length of the queue memory in the guest physical address space
> > + *
> > + * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which
> > + * allows HW to access a guest queue memory described by @base_addr and @length.
> > + * Upon success, the underlying physical pages of the guest queue memory will be
> > + * pinned to prevent VMM from unmapping them in the IOAS until the HW queue gets
> > + * destroyed.
> 
> Do we have a way to make the pinning optional?
> 
> As I understand AMD's system the iommu HW itself translates the
> base_addr through the S2 page table automatically, so it doesn't need
> pinned memory and physical addresses but just the IOVA.

Right. That's why we invented a flag, and it should probably be
extended to cover the pin step as well.

> Perhaps for this reason the pinning should be done with a function
> call from the driver?

But the whole point of doing this in the core was to avoid exposing the
iopt-related structures/functions to the driver, which would otherwise
hugely increase the size of driver.o?

> > +struct iommu_hw_queue_alloc {
> > +	__u32 size;
> > +	__u32 flags;
> > +	__u32 viommu_id;
> > +	__u32 type;
> > +	__u32 index;
> > +	__u32 out_hw_queue_id;
> > +	__aligned_u64 base_addr;
> 
> base addr should probably be called nesting_parent_iova  to match how
> we described the viommu hwpt:
> 
>  * @hwpt_id: ID of a nesting parent HWPT to associate to

Ack.

> > +	/*
> > +	 * The underlying physical pages must be pinned to prevent them from
> > +	 * being unmapped (via IOMMUFD_CMD_IOAS_UNMAP) during the life cycle
> > +	 * of the HW QUEUE object.
> > +	 */
> > +	rc = iopt_pin_pages(&hwpt->ioas->iopt, cmd->base_addr, cmd->length,
> > +			    pages, 0);
> 
> I don't think this actually works like this without an unmap
> callback. unmap will break:
> 
> 			iommufd_access_notify_unmap(iopt, area_first, length);
> 			/* Something is not responding to unmap requests. */
> 			tries++;
> 			if (WARN_ON(tries > 100))
> 				return -EDEADLOCK;
> 
> If it can't shoot down the pinning.

Hmm, I thought we want the unmap to fail until the VMM releases the HW
QUEUE first? In what case does the VMM want to unmap while still holding
the queue pages?

> Why did this need to change away from just a normal access? That ops
> and unmap callback are not optional things.
> 
> What vcmdq would do in the unmap callback is a question I'm not sure
> of...
> 
> This is more reason to put the pin/access in the driver so it can
> provide an unmap callback that can fix it up.

There are two types of "access" here... do you mean iommufd_access, i.e.
the vcmdq driver should hold an iommufd_access like an emulated VFIO
device driver does?
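
I.e. something roughly like this? (just a sketch; the my_queue_* names are
placeholders, and the access would be set up via iommufd_access_create() /
iommufd_access_attach() / iommufd_access_pin_pages() like an emulated VFIO
device does)

	#include <linux/iommufd.h>

	struct my_queue {
		struct iommufd_access *access;
		unsigned long base_addr;
		unsigned long length;
	};

	/* iommufd invokes this when the VMM unmaps the covering IOVA range */
	static void my_queue_unmap(void *data, unsigned long iova,
				   unsigned long length)
	{
		struct my_queue *q = data;

		/* would have to stop the HW from fetching the queue here */
		iommufd_access_unpin_pages(q->access, q->base_addr, q->length);
	}

	static const struct iommufd_access_ops my_queue_access_ops = {
		.needs_pin_pages = 1,
		.unmap = my_queue_unmap,
	};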

> I think this should have
> been done just by using the normal access mechanism, maybe with a
> simplifying wrapper for in-driver use. ie no need for patch #9

Still, the driver.o file would be very large, unlike VFIO, which just
depends on CONFIG_IOMMUFD?

Thanks
Nicolin
Nicolin Chen May 15, 2025, 8:32 p.m. UTC | #2
On Thu, May 15, 2025 at 03:59:38PM -0300, Jason Gunthorpe wrote:
> On Thu, May 15, 2025 at 11:16:45AM -0700, Nicolin Chen wrote:
> > > I don't think this actually works like this without an unmap
> > > callback. unmap will break:
> > > 
> > > 			iommufd_access_notify_unmap(iopt, area_first, length);
> > > 			/* Something is not responding to unmap requests. */
> > > 			tries++;
> > > 			if (WARN_ON(tries > 100))
> > > 				return -EDEADLOCK;
> > > 
> > > If it can't shoot down the pinning.
> > 
> > Hmm, I thought we want the unmap to fail until the VMM releases the HW
> > QUEUE first? In what case does the VMM want to unmap while still holding
> > the queue pages?
> 
> Well, if that is what we want to do then this needs to be revised
> somehow..

Yea, unless we have a strong reason to allow unmap while holding
the HW queue.

I think we could add a new flag:

 enum {
 	IOMMUFD_ACCESS_RW_READ = 0,
 	IOMMUFD_ACCESS_RW_WRITE = 1 << 0,
 	/* Set if the caller is in a kthread then rw will use kthread_use_mm() */
 	IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1,
+	IOMMUFD_ACCESS_NO_UNMAP = 1 << 3,
 
 	/* Only for use by selftest */
 	__IOMMUFD_ACCESS_RW_SLOW_PATH = 1 << 2,
 };

and reject iopt_unmap_iova_range().
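
Roughly (untested; the helper below doesn't exist and is only to show the
idea), the unmap path would then fail fast instead of retrying into
-EDEADLOCK:

	/* in iopt_unmap_iova_range(), before notifying the accesses */
	if (iopt_range_has_no_unmap_access(iopt, area_first, length))
		return -EBUSY;	/* a HW QUEUE still holds this range */
	iommufd_access_notify_unmap(iopt, area_first, length);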

Thanks
Nicolin

Patch

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 79160b039bc7..36a4e060982f 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -611,6 +611,8 @@  int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_hw_queue_destroy(struct iommufd_object *obj);
 
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index cc90299a08d9..001e2deb5a2d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -56,6 +56,7 @@  enum {
 	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
 	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
 	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
+	IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
 };
 
 /**
@@ -1147,4 +1148,45 @@  struct iommu_veventq_alloc {
 	__u32 __reserved;
 };
 #define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
+
+/**
+ * enum iommu_hw_queue_type - HW Queue Type
+ * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_hw_queue_type {
+	IOMMU_HW_QUEUE_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC)
+ * @size: sizeof(struct iommu_hw_queue_alloc)
+ * @flags: Must be 0
+ * @viommu_id: Virtual IOMMU ID to associate the HW queue with
+ * @type: One of enum iommu_hw_queue_type
+ * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue
+ *         model
+ * @out_hw_queue_id: The ID of the new HW queue
+ * @base_addr: Base address of the queue memory in guest physical address space
+ * @length: Length of the queue memory in the guest physical address space
+ *
+ * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which
+ * allows HW to access a guest queue memory described by @base_addr and @length.
+ * Upon success, the underlying physical pages of the guest queue memory will be
+ * pinned to prevent VMM from unmapping them in the IOAS until the HW queue gets
+ * destroyed.
+ *
+ * A vIOMMU can allocate multiple queues, but it must use a different @index to
+ * separate each allocation, e.g. HW queue0, HW queue1, ...
+ */
+struct iommu_hw_queue_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 index;
+	__u32 out_hw_queue_id;
+	__aligned_u64 base_addr;
+	__aligned_u64 length;
+};
+#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC)
 #endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 2b9ee9b4a424..10410e2f710a 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -292,6 +292,7 @@  union ucmd_buffer {
 	struct iommu_destroy destroy;
 	struct iommu_fault_alloc fault;
 	struct iommu_hw_info info;
+	struct iommu_hw_queue_alloc hw_queue;
 	struct iommu_hwpt_alloc hwpt;
 	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
 	struct iommu_hwpt_invalidate cache;
@@ -334,6 +335,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_fault_alloc, out_fault_fd),
 	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
 		 __reserved),
+	IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl,
+		 struct iommu_hw_queue_alloc, length),
 	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
@@ -490,6 +493,9 @@  static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_FAULT] = {
 		.destroy = iommufd_fault_destroy,
 	},
+	[IOMMUFD_OBJ_HW_QUEUE] = {
+		.destroy = iommufd_hw_queue_destroy,
+	},
 	[IOMMUFD_OBJ_HWPT_PAGING] = {
 		.destroy = iommufd_hwpt_paging_destroy,
 		.abort = iommufd_hwpt_paging_abort,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index a65153458a26..ef41aec0b5bf 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -170,3 +170,107 @@  int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(ucmd->ictx, &viommu->obj);
 	return rc;
 }
+
+void iommufd_hw_queue_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_hw_queue *hw_queue =
+		container_of(obj, struct iommufd_hw_queue, obj);
+	struct iommufd_viommu *viommu = hw_queue->viommu;
+
+	if (viommu->ops->hw_queue_destroy)
+		viommu->ops->hw_queue_destroy(hw_queue);
+	iopt_unpin_pages(&viommu->hwpt->ioas->iopt, hw_queue->base_addr,
+			 hw_queue->length);
+	refcount_dec(&viommu->obj.users);
+}
+
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hw_queue_alloc *cmd = ucmd->cmd;
+	struct iommufd_hw_queue *hw_queue;
+	struct iommufd_hwpt_paging *hwpt;
+	struct iommufd_viommu *viommu;
+	struct page **pages;
+	int max_npages, i;
+	u64 end;
+	int rc;
+
+	if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT)
+		return -EOPNOTSUPP;
+	if (!cmd->base_addr || !cmd->length)
+		return -EINVAL;
+	if (check_add_overflow(cmd->base_addr, cmd->length - 1, &end))
+		return -EOVERFLOW;
+
+	max_npages = DIV_ROUND_UP(cmd->length, PAGE_SIZE);
+	pages = kcalloc(max_npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+	if (IS_ERR(viommu)) {
+		rc = PTR_ERR(viommu);
+		goto out_free;
+	}
+	hwpt = viommu->hwpt;
+
+	if (!viommu->ops || !viommu->ops->hw_queue_alloc) {
+		rc = -EOPNOTSUPP;
+		goto out_put_viommu;
+	}
+
+	/* Quick test on the base address */
+	if (!iommu_iova_to_phys(hwpt->common.domain, cmd->base_addr)) {
+		rc = -ENXIO;
+		goto out_put_viommu;
+	}
+
+	/*
+	 * The underlying physical pages must be pinned to prevent them from
+	 * being unmapped (via IOMMUFD_CMD_IOAS_UNMAP) during the life cycle
+	 * of the HW QUEUE object.
+	 */
+	rc = iopt_pin_pages(&hwpt->ioas->iopt, cmd->base_addr, cmd->length,
+			    pages, 0);
+	if (rc)
+		goto out_put_viommu;
+
+	if (viommu->ops->flags & IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA) {
+		/* Validate if the underlying physical pages are contiguous */
+		for (i = 1; i < max_npages && pages[i]; i++) {
+			if (page_to_pfn(pages[i]) ==
+			    page_to_pfn(pages[i - 1]) + 1)
+				continue;
+			rc = -EFAULT;
+			goto out_unpin;
+		}
+	}
+
+	hw_queue = viommu->ops->hw_queue_alloc(viommu, cmd->type, cmd->index,
+					       cmd->base_addr, cmd->length);
+	if (IS_ERR(hw_queue)) {
+		rc = PTR_ERR(hw_queue);
+		goto out_unpin;
+	}
+
+	hw_queue->viommu = viommu;
+	refcount_inc(&viommu->obj.users);
+	hw_queue->ictx = ucmd->ictx;
+	hw_queue->length = cmd->length;
+	hw_queue->base_addr = cmd->base_addr;
+	cmd->out_hw_queue_id = hw_queue->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		iommufd_object_abort_and_destroy(ucmd->ictx, &hw_queue->obj);
+	else
+		iommufd_object_finalize(ucmd->ictx, &hw_queue->obj);
+	goto out_put_viommu;
+
+out_unpin:
+	iopt_unpin_pages(&hwpt->ioas->iopt, cmd->base_addr, cmd->length);
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+out_free:
+	kfree(pages);
+	return rc;
+}