diff mbox series

[RFCv2,09/13] iommufd: Add IOMMU_OPTION_SW_MSI_START/SIZE ioctls

Message ID d3cb1694e07be0e214dc44dcb2cb74f014606560.1736550979.git.nicolinc@nvidia.com
State New
Headers show
Series iommu: Add MSI mapping support with nested SMMU | expand

Commit Message

Nicolin Chen Jan. 11, 2025, 3:32 a.m. UTC
For systems that require MSI pages to be mapped into the IOMMU translation
the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default
recommended IOVA window to place these mappings. However, there is nothing
special about this address. And to support the RMR trick in VMM for nested
translation, the VMM needs to know what sw_msi window the kernel is using.
As there is no particular reason to force VMM to adopt the kernel default,
provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use
to directly specify the sw_msi window that it wants to use, which replaces
and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having
to build an API to discover the default IOMMU_RESV_SW_MSI.

Since iommufd now has its own sw_msi function, this is easy to implement.

To keep things simple, the parameters are global to the entire iommufd FD,
and will directly replace the IOMMU_RESV_SW_MSI values. The VMM must set
the values before creating any hwpt's to have any effect.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  4 +++
 include/uapi/linux/iommufd.h            | 18 ++++++++++++-
 drivers/iommu/iommufd/device.c          |  4 +++
 drivers/iommu/iommufd/io_pagetable.c    |  4 ++-
 drivers/iommu/iommufd/ioas.c            | 34 +++++++++++++++++++++++++
 drivers/iommu/iommufd/main.c            |  6 +++++
 6 files changed, 68 insertions(+), 2 deletions(-)

Comments

Tian, Kevin Jan. 23, 2025, 10:07 a.m. UTC | #1
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, January 11, 2025 11:32 AM
> 
> @@ -294,7 +294,9 @@ struct iommu_ioas_unmap {
> 
>  /**
>   * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
> - *                       ioctl(IOMMU_OPTION_HUGE_PAGES)
> + *                       ioctl(IOMMU_OPTION_HUGE_PAGES) and
> + *                       ioctl(IOMMU_OPTION_SW_MSI_START) and
> + *                       ioctl(IOMMU_OPTION_SW_MSI_SIZE)
>   * @IOMMU_OPTION_RLIMIT_MODE:
>   *    Change how RLIMIT_MEMLOCK accounting works. The caller must have
> privilege
>   *    to invoke this. Value 0 (default) is user based accounting, 1 uses process
> @@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
>   *    iommu mappings. Value 0 disables combining, everything is mapped to
>   *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
>   *    option, the object_id must be the IOAS ID.
> + * @IOMMU_OPTION_SW_MSI_START:
> + *    Change the base address of the IOMMU mapping region for MSI
> doorbell(s).
> + *    It must be set this before attaching a device to an IOAS/HWPT,

remove 'this'

> otherwise
> + *    this option will be not effective on that IOAS/HWPT. User can 

Do we want to explicitly check this instead of leaving it no effect
silently?

> choose to
> + *    let kernel pick a base address, by simply ignoring this option or setting
> + *    a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id
> must be 0
> + * @IOMMU_OPTION_SW_MSI_SIZE:
> + *    Change the size of the IOMMU mapping region for MSI doorbell(s). It
> must
> + *    be set this before attaching a device to an IOAS/HWPT, otherwise it
> won't
> + *    be effective on that IOAS/HWPT. The value is in MB, and the minimum
> value
> + *    is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
> + *    value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id
> must be 0

hmm there is no check on the minimal value and enable the effect
of value 0 in this patch.

>  iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
>  				    struct iommufd_hwpt_paging
> *hwpt_paging)
>  {
> +	struct iommufd_ctx *ictx = idev->ictx;
>  	int rc;
> 
>  	lockdep_assert_held(&idev->igroup->lock);
> 
> +	/* Override it with a user-programmed SW_MSI region */
> +	if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
> +		idev->igroup->sw_msi_start = ictx->sw_msi_start;
>  	rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
>  						 idev->dev,
>  						 &idev->igroup-
> >sw_msi_start);

what about moving above additions into 
iopt_table_enforce_dev_resv_regions() which is all about finding
a sw_msi address and can check the user setting internally?

> diff --git a/drivers/iommu/iommufd/io_pagetable.c
> b/drivers/iommu/iommufd/io_pagetable.c
> index 8a790e597e12..5d7f5ca1eecf 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct
> io_pagetable *iopt,
>  		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
>  			num_hw_msi++;
>  		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
> -			*sw_msi_start = resv->start;
> +			/* Bypass the driver-defined SW_MSI region, if preset
> */
> +			if (*sw_msi_start == PHYS_ADDR_MAX)
> +				*sw_msi_start = resv->start;

the code is not about bypass. Instead it's to use the driver-defined
region if user doesn't set it.
Jason Gunthorpe Jan. 29, 2025, 5:39 p.m. UTC | #2
On Wed, Jan 29, 2025 at 06:23:33PM +0100, Eric Auger wrote:
> >> IIUC the MSI window will then be different when using legacy VFIO
> >> assignment and iommufd backend.
> > ? They use the same, iommufd can have userspace override it. Then it
> > will ignore the reserved region.
> In current arm-smmu-v3.c you have
>         region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
>                                          prot, IOMMU_RESV_SW_MSI,
> GFP_KERNEL);
> 
> in arm_smmu_get_resv_regions()
> If you overwrite the default region, don't you need to expose the user
> defined resv region?

If it was overriden inside iommufd then the user told the kernel what
range to use to override it. I don't need to go back and report back
to userspace information that it already gave to the kernel..

> > Nothing using iommufd should parse that sysfs file.
> Right but aren't you still supposed to populate the sysfs files
> properly. This region must be carved out from the IOVA space, right?

The sysfs shouldn't be changed here based on how iommufd decides to
use the iova space. The sysfs reflects the information reported from
the driver and sw_msi should be understood as the driver's
recommendation when you view it from sysfs.

The actual reserved regions in effect for an iommufd object are
queried directly in iommufd and do not have a sysfs representation.

> >>> + * @IOMMU_OPTION_SW_MSI_START:
> >>> + *    Change the base address of the IOMMU mapping region for MSI doorbell(s).
> >>> + *    It must be set this before attaching a device to an IOAS/HWPT, otherwise
> >>> + *    this option will be not effective on that IOAS/HWPT. User can choose to
> >>> + *    let kernel pick a base address, by simply ignoring this option or setting
> >>> + *    a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0
> >> I think we should document it cannot be put at a random place either.
> > It can be put at any place a map can be placed.
> to me It cannot overlap with guest RAM IPA so userspace needs to be
> cautious about that

Yes, userspace needs to manage its own VM memory map to avoid
overlaps, but from an API perspective it can be placed anywhere that a
map can be placed.

Jason
Eric Auger Jan. 29, 2025, 5:49 p.m. UTC | #3
On 1/29/25 6:39 PM, Jason Gunthorpe wrote:
> On Wed, Jan 29, 2025 at 06:23:33PM +0100, Eric Auger wrote:
>>>> IIUC the MSI window will then be different when using legacy VFIO
>>>> assignment and iommufd backend.
>>> ? They use the same, iommufd can have userspace override it. Then it
>>> will ignore the reserved region.
>> In current arm-smmu-v3.c you have
>>         region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
>>                                          prot, IOMMU_RESV_SW_MSI,
>> GFP_KERNEL);
>>
>> in arm_smmu_get_resv_regions()
>> If you overwrite the default region, don't you need to expose the user
>> defined resv region?
> If it was overriden inside iommufd then the user told the kernel what
> range to use to override it. I don't need to go back and report back
> to userspace information that it already gave to the kernel..

Looks strange to me because info exposed in sysfs is wrong then. What if
someone else relies on this info, either at kernel level through the
get_resv_regions callback or from user space.
>
>>> Nothing using iommufd should parse that sysfs file.
>> Right but aren't you still supposed to populate the sysfs files
>> properly. This region must be carved out from the IOVA space, right?
> The sysfs shouldn't be changed here based on how iommufd decides to
> use the iova space. The sysfs reflects the information reported from
> the driver and sw_msi should be understood as the driver's
> recommendation when you view it from sysfs.
>
> The actual reserved regions in effect for an iommufd object are
> queried directly in iommufd and do not have a sysfs representation.
>
>>>>> + * @IOMMU_OPTION_SW_MSI_START:
>>>>> + *    Change the base address of the IOMMU mapping region for MSI doorbell(s).
>>>>> + *    It must be set this before attaching a device to an IOAS/HWPT, otherwise
>>>>> + *    this option will be not effective on that IOAS/HWPT. User can choose to
>>>>> + *    let kernel pick a base address, by simply ignoring this option or setting
>>>>> + *    a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0
>>>> I think we should document it cannot be put at a random place either.
>>> It can be put at any place a map can be placed.
>> to me It cannot overlap with guest RAM IPA so userspace needs to be
>> cautious about that
> Yes, userspace needs to manage its own VM memory map to avoid
> overlaps, but from an API perspective it can be placed anywhere that a
> map can be placed.
OK

Eric
>
> Jason
>
Jason Gunthorpe Jan. 29, 2025, 8:15 p.m. UTC | #4
On Wed, Jan 29, 2025 at 06:49:22PM +0100, Eric Auger wrote:
> > If it was overriden inside iommufd then the user told the kernel what
> > range to use to override it. I don't need to go back and report back
> > to userspace information that it already gave to the kernel..
> 
> Looks strange to me because info exposed in sysfs is wrong then. What if
> someone else relies on this info, either at kernel level through the
> get_resv_regions callback or from user space.

Nothing else should call get_resv_regions() because VFIO is bound to
the device and iommufd owns the domain. We expect some exclusivity
here :)

sysfs for sw_msi should be understood as reporting the driver
recommendation, not anything to do with the current MSI operation of
the device.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 3e83bbb5912c..9f071609f00b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -45,6 +45,9 @@  struct iommufd_ctx {
 	struct mutex sw_msi_lock;
 	struct list_head sw_msi_list;
 	unsigned int sw_msi_id;
+	/* User-programmed SW_MSI region, to override igroup->sw_msi_start */
+	phys_addr_t sw_msi_start;
+	size_t sw_msi_size;
 
 	u8 account_mode;
 	/* Compatibility with VFIO no iommu */
@@ -281,6 +284,7 @@  int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
 int iommufd_option_rlimit_mode(struct iommu_option *cmd,
 			       struct iommufd_ctx *ictx);
+int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx);
 
 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
 int iommufd_check_iova_range(struct io_pagetable *iopt,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 34810f6ae2b5..c864a201e502 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -294,7 +294,9 @@  struct iommu_ioas_unmap {
 
 /**
  * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
- *                       ioctl(IOMMU_OPTION_HUGE_PAGES)
+ *                       ioctl(IOMMU_OPTION_HUGE_PAGES) and
+ *                       ioctl(IOMMU_OPTION_SW_MSI_START) and
+ *                       ioctl(IOMMU_OPTION_SW_MSI_SIZE)
  * @IOMMU_OPTION_RLIMIT_MODE:
  *    Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
  *    to invoke this. Value 0 (default) is user based accounting, 1 uses process
@@ -304,10 +306,24 @@  struct iommu_ioas_unmap {
  *    iommu mappings. Value 0 disables combining, everything is mapped to
  *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
  *    option, the object_id must be the IOAS ID.
+ * @IOMMU_OPTION_SW_MSI_START:
+ *    Change the base address of the IOMMU mapping region for MSI doorbell(s).
+ *    It must be set this before attaching a device to an IOAS/HWPT, otherwise
+ *    this option will be not effective on that IOAS/HWPT. User can choose to
+ *    let kernel pick a base address, by simply ignoring this option or setting
+ *    a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0
+ * @IOMMU_OPTION_SW_MSI_SIZE:
+ *    Change the size of the IOMMU mapping region for MSI doorbell(s). It must
+ *    be set this before attaching a device to an IOAS/HWPT, otherwise it won't
+ *    be effective on that IOAS/HWPT. The value is in MB, and the minimum value
+ *    is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
+ *    value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id must be 0
  */
 enum iommufd_option {
 	IOMMU_OPTION_RLIMIT_MODE = 0,
 	IOMMU_OPTION_HUGE_PAGES = 1,
+	IOMMU_OPTION_SW_MSI_START = 2,
+	IOMMU_OPTION_SW_MSI_SIZE = 3,
 };
 
 /**
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index f75b3c23cd41..093a3bd798db 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -445,10 +445,14 @@  static int
 iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
 				    struct iommufd_hwpt_paging *hwpt_paging)
 {
+	struct iommufd_ctx *ictx = idev->ictx;
 	int rc;
 
 	lockdep_assert_held(&idev->igroup->lock);
 
+	/* Override it with a user-programmed SW_MSI region */
+	if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
+		idev->igroup->sw_msi_start = ictx->sw_msi_start;
 	rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
 						 idev->dev,
 						 &idev->igroup->sw_msi_start);
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 8a790e597e12..5d7f5ca1eecf 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -1446,7 +1446,9 @@  int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
 			num_hw_msi++;
 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
-			*sw_msi_start = resv->start;
+			/* Bypass the driver-defined SW_MSI region, if preset */
+			if (*sw_msi_start == PHYS_ADDR_MAX)
+				*sw_msi_start = resv->start;
 			num_sw_msi++;
 		}
 
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 1542c5fd10a8..3f4e25b660f9 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -620,6 +620,40 @@  int iommufd_option_rlimit_mode(struct iommu_option *cmd,
 	return -EOPNOTSUPP;
 }
 
+int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx)
+{
+	if (cmd->object_id)
+		return -EOPNOTSUPP;
+
+	if (cmd->op == IOMMU_OPTION_OP_GET) {
+		switch (cmd->option_id) {
+		case IOMMU_OPTION_SW_MSI_START:
+			cmd->val64 = (u64)ictx->sw_msi_start;
+			break;
+		case IOMMU_OPTION_SW_MSI_SIZE:
+			cmd->val64 = (u64)ictx->sw_msi_size;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+		return 0;
+	}
+	if (cmd->op == IOMMU_OPTION_OP_SET) {
+		switch (cmd->option_id) {
+		case IOMMU_OPTION_SW_MSI_START:
+			ictx->sw_msi_start = (phys_addr_t)cmd->val64;
+			break;
+		case IOMMU_OPTION_SW_MSI_SIZE:
+			ictx->sw_msi_size = (size_t)cmd->val64;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
 static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
 					  struct iommufd_ioas *ioas)
 {
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 7cc9497b7193..026297265c71 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -229,6 +229,8 @@  static int iommufd_fops_open(struct inode *inode, struct file *filp)
 	init_waitqueue_head(&ictx->destroy_wait);
 	mutex_init(&ictx->sw_msi_lock);
 	INIT_LIST_HEAD(&ictx->sw_msi_list);
+	ictx->sw_msi_start = PHYS_ADDR_MAX;
+	ictx->sw_msi_size = 0;
 	filp->private_data = ictx;
 	return 0;
 }
@@ -287,6 +289,10 @@  static int iommufd_option(struct iommufd_ucmd *ucmd)
 	case IOMMU_OPTION_RLIMIT_MODE:
 		rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
 		break;
+	case IOMMU_OPTION_SW_MSI_START:
+	case IOMMU_OPTION_SW_MSI_SIZE:
+		rc = iommufd_option_sw_msi(cmd, ucmd->ictx);
+		break;
 	case IOMMU_OPTION_HUGE_PAGES:
 		rc = iommufd_ioas_option(ucmd);
 		break;