diff mbox series

[RFC,19/30] vfio/pci: Add TSM TDI bind/unbind IOCTLs for TEE-IO support

Message ID 20250529053513.1592088-20-yilun.xu@linux.intel.com
State New
Headers show
Series Host side (KVM/VFIO/IOMMUFD) support for TDISP using TSM | expand

Commit Message

Xu Yilun May 29, 2025, 5:35 a.m. UTC
Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
expected to be called by userspace when CoCo VM issues TDI bind/unbind
command to VMM. Specifically for TDX Connect, these commands are some
secure Hypervisor call named GHCI (Guest-Hypervisor Communication
Interface).

The TSM TDI bind/unbind operations are expected to be initiated by a
running CoCo VM, which already have the legacy assigned device in place.
The TSM bind operation is to request VMM make all secure configurations
to support device work as a TDI, and then issue TDISP messages to move
the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.

Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
device to TDISP ERROR state.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 drivers/vfio/iommufd.c           | 22 ++++++++++
 drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
 include/linux/vfio.h             |  7 +++
 include/linux/vfio_pci_core.h    |  1 +
 include/uapi/linux/vfio.h        | 42 ++++++++++++++++++
 5 files changed, 146 insertions(+)

Comments

Aneesh Kumar K.V June 1, 2025, 10:45 a.m. UTC | #1
Xu Yilun <yilun.xu@linux.intel.com> writes:

> Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
> expected to be called by userspace when CoCo VM issues TDI bind/unbind
> command to VMM. Specifically for TDX Connect, these commands are some
> secure Hypervisor call named GHCI (Guest-Hypervisor Communication
> Interface).
>
> The TSM TDI bind/unbind operations are expected to be initiated by a
> running CoCo VM, which already have the legacy assigned device in place.
> The TSM bind operation is to request VMM make all secure configurations
> to support device work as a TDI, and then issue TDISP messages to move
> the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
>
> Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
> device to TDISP ERROR state.
>

Any reason these need to be a vfio ioctl instead of iommufd ioctl?
For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/

>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> ---
>  drivers/vfio/iommufd.c           | 22 ++++++++++
>  drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
>  include/linux/vfio.h             |  7 +++
>  include/linux/vfio_pci_core.h    |  1 +
>  include/uapi/linux/vfio.h        | 42 ++++++++++++++++++
>  5 files changed, 146 insertions(+)
>
> diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> index 3441d24538a8..33fd20ffaeee 100644
> --- a/drivers/vfio/iommufd.c
> +++ b/drivers/vfio/iommufd.c
> @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
>  	vdev->iommufd_attached = false;
>  }
>  EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
> +
> +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
> +{
> +	lockdep_assert_held(&vdev->dev_set->lock);
> +
> +	if (WARN_ON(!vdev->iommufd_device))
> +		return -EINVAL;
> +
> +	return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
> +
> +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
> +{
> +	lockdep_assert_held(&vdev->dev_set->lock);
> +
> +	if (WARN_ON(!vdev->iommufd_device))
> +		return;
> +
> +	iommufd_device_tsm_unbind(vdev->iommufd_device);
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 116964057b0b..92544e54c9c3 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
>  #if IS_ENABLED(CONFIG_EEH)
>  	eeh_dev_release(vdev->pdev);
>  #endif
> +
> +	if (vdev->is_tsm_bound) {
> +		vfio_iommufd_tsm_unbind(&vdev->vdev);
> +		pci_release_regions(vdev->pdev);
> +		vdev->is_tsm_bound = false;
> +	}
> +
>  	vfio_pci_core_disable(vdev);
>  
>  	vfio_pci_dma_buf_cleanup(vdev);
> @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
>  				  ioeventfd.fd);
>  }
>  
> +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
> +				   void __user *arg)
> +{
> +	unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
> +	struct vfio_pci_tsm_bind tsm_bind;
> +	struct pci_dev *pdev = vdev->pdev;
> +	int ret;
> +
> +	if (copy_from_user(&tsm_bind, arg, minsz))
> +		return -EFAULT;
> +
> +	if (tsm_bind.argsz < minsz || tsm_bind.flags)
> +		return -EINVAL;
> +
> +	mutex_lock(&vdev->vdev.dev_set->lock);
> +
> +	/* To ensure no host side MMIO access is possible */
> +	ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> +	if (ret)
> +		goto out_unlock;
>

This should be part of pci_tsm_bind() ? 

> +
> +	ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id);
> +	if (ret)
> +		goto out_release_region;
> +
> +	vdev->is_tsm_bound = true;
> +	mutex_unlock(&vdev->vdev.dev_set->lock);
> +
> +	return 0;
> +
> +out_release_region:
> +	pci_release_regions(pdev);
> +out_unlock:
> +	mutex_unlock(&vdev->vdev.dev_set->lock);
> +	return ret;
> +}
> +
> +static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev,
> +				     void __user *arg)
> +{
> +	unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags);
> +	struct vfio_pci_tsm_unbind tsm_unbind;
> +	struct pci_dev *pdev = vdev->pdev;
> +
> +	if (copy_from_user(&tsm_unbind, arg, minsz))
> +		return -EFAULT;
> +
> +	if (tsm_unbind.argsz < minsz || tsm_unbind.flags)
> +		return -EINVAL;
> +
> +	mutex_lock(&vdev->vdev.dev_set->lock);
> +
> +	if (!vdev->is_tsm_bound)
> +		return 0;
> +
> +	vfio_iommufd_tsm_unbind(&vdev->vdev);
> +	pci_release_regions(pdev);
> +	vdev->is_tsm_bound = false;
> +	mutex_unlock(&vdev->vdev.dev_set->lock);
> +
> +	return 0;
> +}
> +


-aneesh
Xu Yilun June 2, 2025, 2:43 p.m. UTC | #2
On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote:
> Xu Yilun <yilun.xu@linux.intel.com> writes:
> 
> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
> > expected to be called by userspace when CoCo VM issues TDI bind/unbind
> > command to VMM. Specifically for TDX Connect, these commands are some
> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication
> > Interface).
> >
> > The TSM TDI bind/unbind operations are expected to be initiated by a
> > running CoCo VM, which already have the legacy assigned device in place.
> > The TSM bind operation is to request VMM make all secure configurations
> > to support device work as a TDI, and then issue TDISP messages to move
> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
> >
> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
> > device to TDISP ERROR state.
> >
> 
> Any reason these need to be a vfio ioctl instead of iommufd ioctl?
> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/

A general reason is, the device driver - VFIO should be aware of the
bound state, and some operations break the bound state. VFIO should also
know some operations on bound may crash kernel because of platform TSM
firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping
in secure page tables cannot be unmapped before TDI STOP [1].

Specifically, for TDX Connect, the firmware enforces MMIO unmapping in
S-EPT would fail if TDI is bound. For AMD there seems also some
requirement about this but I need Alexey's confirmation.

[1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/

> 
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Wu Hao <hao.wu@intel.com>
> > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> > ---
> >  drivers/vfio/iommufd.c           | 22 ++++++++++
> >  drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
> >  include/linux/vfio.h             |  7 +++
> >  include/linux/vfio_pci_core.h    |  1 +
> >  include/uapi/linux/vfio.h        | 42 ++++++++++++++++++
> >  5 files changed, 146 insertions(+)
> >
> > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> > index 3441d24538a8..33fd20ffaeee 100644
> > --- a/drivers/vfio/iommufd.c
> > +++ b/drivers/vfio/iommufd.c
> > @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
> >  	vdev->iommufd_attached = false;
> >  }
> >  EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
> > +
> > +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
> > +{
> > +	lockdep_assert_held(&vdev->dev_set->lock);
> > +
> > +	if (WARN_ON(!vdev->iommufd_device))
> > +		return -EINVAL;
> > +
> > +	return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
> > +
> > +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
> > +{
> > +	lockdep_assert_held(&vdev->dev_set->lock);
> > +
> > +	if (WARN_ON(!vdev->iommufd_device))
> > +		return;
> > +
> > +	iommufd_device_tsm_unbind(vdev->iommufd_device);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 116964057b0b..92544e54c9c3 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
> >  #if IS_ENABLED(CONFIG_EEH)
> >  	eeh_dev_release(vdev->pdev);
> >  #endif
> > +
> > +	if (vdev->is_tsm_bound) {
> > +		vfio_iommufd_tsm_unbind(&vdev->vdev);
> > +		pci_release_regions(vdev->pdev);
> > +		vdev->is_tsm_bound = false;
> > +	}
> > +
> >  	vfio_pci_core_disable(vdev);
> >  
> >  	vfio_pci_dma_buf_cleanup(vdev);
> > @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> >  				  ioeventfd.fd);
> >  }
> >  
> > +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
> > +				   void __user *arg)
> > +{
> > +	unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
> > +	struct vfio_pci_tsm_bind tsm_bind;
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	int ret;
> > +
> > +	if (copy_from_user(&tsm_bind, arg, minsz))
> > +		return -EFAULT;
> > +
> > +	if (tsm_bind.argsz < minsz || tsm_bind.flags)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&vdev->vdev.dev_set->lock);
> > +
> > +	/* To ensure no host side MMIO access is possible */
> > +	ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> > +	if (ret)
> > +		goto out_unlock;
> >
> 
> This should be part of pci_tsm_bind() ? 

I'm not quite sure. My feeling is this method is specific to the VFIO
driver. Many other drivers just request regions on probe(); they could
never bind successfully if pci tsm hid this implementation internally.

Thanks,
Yilun
Xu Yilun June 5, 2025, 9:41 a.m. UTC | #3
On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote:
> Xu Yilun <yilun.xu@linux.intel.com> writes:
> 
> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote:
> >> Xu Yilun <yilun.xu@linux.intel.com> writes:
> >> 
> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind
> >> > command to VMM. Specifically for TDX Connect, these commands are some
> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication
> >> > Interface).
> >> >
> >> > The TSM TDI bind/unbind operations are expected to be initiated by a
> >> > running CoCo VM, which already have the legacy assigned device in place.
> >> > The TSM bind operation is to request VMM make all secure configurations
> >> > to support device work as a TDI, and then issue TDISP messages to move
> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
> >> >
> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
> >> > device to TDISP ERROR state.
> >> >
> >> 
> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl?
> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/
> >
> > A general reason is, the device driver - VFIO should be aware of the
> > bound state, and some operations break the bound state. VFIO should also
> > know some operations on bound may crash kernel because of platform TSM
> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping
> > in secure page tables cannot be unmapped before TDI STOP [1].
> >
> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in
> > S-EPT would fail if TDI is bound. For AMD there seems also some
> > requirement about this but I need Alexey's confirmation.
> >
> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/
> >
> 
> According to the TDISP specification (Section 11.2.6), clearing either
> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause
> the TDI to transition to an error state. To handle this gracefully, it
> seems necessary to unbind the TDI before modifying the BME or MSE bits.

Yes. But now the suggestion is never let VFIO do unbind, instead VFIO
should block these operations when device is bound.

> 
> If I understand correctly, we also need to unmap the Stage-2 mapping due
> to the issue described in commit
> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional
> reasons we would want to unmap the Stage-2 mapping for the BAR (as done
> in vfio_pci_zap_and_down_write_memory_lock)?

I think no more reason. 

> 
> Additionally, with TDX, it appears that before unmapping the Stage-2
> mapping for the BAR, we should first unbind the TDI (ie, move it to the
> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec,
> or is it driven by a different requirement?

No, this is not device side TDISP requirement. It is host side
requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share
with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals
unmap IOMMU S2 PT.

If we allow the IOMMU S2 PT to be unmapped while the TDI is running, the
host could fool the guest by just unmapping some PT entry and suppressing
the fault event. The guest would think a DMA write was successful when it
was not, which may cause a data integrity issue.

This is not a TDX specific problem, but different vendors have different
mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For
AMD, will trigger some HW protection called "ASID fence" [1]. Not sure
how ARM handles this?

https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/

Thanks,
Yilun

> 
> -aneesh
Jason Gunthorpe June 5, 2025, 3:10 p.m. UTC | #4
On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:

> > +
> > +	/* To ensure no host side MMIO access is possible */
> > +	ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> > +	if (ret)
> > +		goto out_unlock;
> > +
> >
> 
> I am hitting failures here with similar changes. Can you share the Qemu
> changes needed to make this pci_request_regions_exclusive successful.
> Also after the TDI is unbound, we want the region ownership backto
> "vfio-pci" so that things continue to work as non-secure device. I don't
> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> release the region ownership.

Again, IMHO, we should not be doing this dynamically. VFIO should do
pci_request_regions_exclusive() once at the very start and it should
stay that way.

There is no reason to change it dynamically.

The only decision to make is if all vfio should switch to exclusive
mode or if we need to make it optional for userspace.

Jason
Aneesh Kumar K.V June 5, 2025, 4:09 p.m. UTC | #5
Xu Yilun <yilun.xu@linux.intel.com> writes:

> On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote:
>> Xu Yilun <yilun.xu@linux.intel.com> writes:
>> 
>> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote:
>> >> Xu Yilun <yilun.xu@linux.intel.com> writes:
>> >> 
>> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
>> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind
>> >> > command to VMM. Specifically for TDX Connect, these commands are some
>> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication
>> >> > Interface).
>> >> >
>> >> > The TSM TDI bind/unbind operations are expected to be initiated by a
>> >> > running CoCo VM, which already have the legacy assigned device in place.
>> >> > The TSM bind operation is to request VMM make all secure configurations
>> >> > to support device work as a TDI, and then issue TDISP messages to move
>> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
>> >> >
>> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
>> >> > device to TDISP ERROR state.
>> >> >
>> >> 
>> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl?
>> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/
>> >
>> > A general reason is, the device driver - VFIO should be aware of the
>> > bound state, and some operations break the bound state. VFIO should also
>> > know some operations on bound may crash kernel because of platform TSM
>> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping
>> > in secure page tables cannot be unmapped before TDI STOP [1].
>> >
>> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in
>> > S-EPT would fail if TDI is bound. For AMD there seems also some
>> > requirement about this but I need Alexey's confirmation.
>> >
>> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/
>> >
>> 
>> According to the TDISP specification (Section 11.2.6), clearing either
>> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause
>> the TDI to transition to an error state. To handle this gracefully, it
>> seems necessary to unbind the TDI before modifying the BME or MSE bits.
>
> Yes. But now the suggestion is never let VFIO do unbind, instead VFIO
> should block these operations when device is bound.
>
>> 
>> If I understand correctly, we also need to unmap the Stage-2 mapping due
>> to the issue described in commit
>> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional
>> reasons we would want to unmap the Stage-2 mapping for the BAR (as done
>> in vfio_pci_zap_and_down_write_memory_lock)?
>
> I think no more reason. 
>
>> 
>> Additionally, with TDX, it appears that before unmapping the Stage-2
>> mapping for the BAR, we should first unbind the TDI (ie, move it to the
>> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec,
>> or is it driven by a different requirement?
>
> No, this is not device side TDISP requirement. It is host side
> requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share
> with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals
> unmap IOMMU S2 PT.
>
> If we allow IOMMU S2 PT unmapped when TDI is running, host could fool
> guest by just unmap some PT entry and suppress the fault event. Guest
> thought a DMA writting is successful but it is not and may cause
> data integrity issue.
>
> This is not a TDX specific problem, but different vendors has different
> mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For
> AMD, will trigger some HW protection called "ASID fence" [1]. Not sure
> how ARM handles this?
>
> https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/
>

MMIO/BAR Unmapping:
If the stage-2 mapping is removed while the device is in a locked state—a
scenario that ARM permits—the granule transitions to the RIPAS_DESTROYED and
HIPAS_UNASSIGNED states. Any MMIO or CPU access to such a granule will trigger a
non-emulatable data abort, which is forwarded to the non-secure hypervisor
(e.g., KVM).

However, at this point, the system cannot make further progress. The unmapping
was initiated by the host without coordination from the guest, leaving the
granule in a broken state.

A more robust workflow would involve the guest first transitioning the granule
to RIPAS_EMPTY, followed by the host unmapping the stage-2 entry.

IOMMU Page Table Unmap:
Both the CPU and the SMMU can share the stage-2 page table. If the non-secure
host unmaps an entry from this shared page table, the affected granule again
transitions to RIPAS_DESTROYED and HIPAS_UNASSIGNED.

In this case, a DMA transaction—(SMMU is configured by the Realm Management
Monitor,RMM)—can be terminated. This typically results in an event being
recorded in the event queue which can be read by RMM.

However, interrupt delivery remains under non-secure host control, and
the guest may not be immediately aware that the DMA transaction was
terminated. I am currently confirming this behavior with the design team
and will follow up once I have clarity.

-aneesh
Jason Gunthorpe June 5, 2025, 4:33 p.m. UTC | #6
On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
> Jason Gunthorpe <jgg@nvidia.com> writes:
> 
> > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
> >
> >> > +
> >> > +	/* To ensure no host side MMIO access is possible */
> >> > +	ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> >> > +	if (ret)
> >> > +		goto out_unlock;
> >> > +
> >> >
> >> 
> >> I am hitting failures here with similar changes. Can you share the Qemu
> >> changes needed to make this pci_request_regions_exclusive successful.
> >> Also after the TDI is unbound, we want the region ownership backto
> >> "vfio-pci" so that things continue to work as non-secure device. I don't
> >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> >> release the region ownership.
> >
> > Again, IMHO, we should not be doing this dynamically. VFIO should do
> > pci_request_regions_exclusive() once at the very start and it should
> > stay that way.
> >
> > There is no reason to change it dynamically.
> >
> > The only decision to make is if all vfio should switch to exclusive
> > mode or if we need to make it optional for userspace.
> 
> We only need the exclusive mode when the device is operating in secure
> mode, correct? That suggests we’ll need to dynamically toggle this
> setting based on the device’s security state.

No, if the decision is that VFIO should allow this to be controlled by
userspace then userspace will tell iommufd to run in regions_exclusive
mode prior to opening the vfio cdev and VFIO will still do it once at
open time and never change it.

The only thing request_regions does is block other drivers outside
vfio from using this memory space. There is no reason at all to change
this dynamically. A CC VMM using VFIO will never use a driver outside
VFIO to touch the VFIO controlled memory.

Jason
Xu Yilun June 6, 2025, 4:26 a.m. UTC | #7
On Thu, Jun 05, 2025 at 01:33:39PM -0300, Jason Gunthorpe wrote:
> On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
> > Jason Gunthorpe <jgg@nvidia.com> writes:
> > 
> > > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
> > >
> > >> > +
> > >> > +	/* To ensure no host side MMIO access is possible */
> > >> > +	ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> > >> > +	if (ret)
> > >> > +		goto out_unlock;
> > >> > +
> > >> >
> > >> 
> > >> I am hitting failures here with similar changes. Can you share the Qemu
> > >> changes needed to make this pci_request_regions_exclusive successful.

Jason has described the suggested static lockdown flow and we could
try on that.  I just wanna help position your immediate failure.

Maybe you still have QEMU mmapped the MMIO region.

int vfio_pci_core_mmap()
{
...

	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
...
}

Even for static lockdown, userspace should not mmap the MMIOs anymore.

Thanks,
Yilun

> > >> Also after the TDI is unbound, we want the region ownership backto
> > >> "vfio-pci" so that things continue to work as non-secure device. I don't
> > >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> > >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> > >> release the region ownership.
> > >
> > > Again, IMHO, we should not be doing this dynamically. VFIO should do
> > > pci_request_regions_exclusive() once at the very start and it should
> > > stay that way.
> > >
> > > There is no reason to change it dynamically.
> > >
> > > The only decision to make is if all vfio should switch to exclusive
> > > mode or if we need to make it optional for userspace.
> > 
> > We only need the exclusive mode when the device is operating in secure
> > mode, correct? That suggests we’ll need to dynamically toggle this
> > setting based on the device’s security state.
> 
> No, if the decision is that VFIO should allow this to be controlled by
> userspace then userspace will tell iommufd to run in regions_exclusive
> mode prior to opening the vfio cdev and VFIO will still do it once at
> open time and never change it.
> 
> The only thing request_regions does is block other drivers outside
> vfio from using this memory space. There is no reason at all to change
> this dynamically. A CC VMM using VFIO will never use a driver outside
> VFIO to touch the VFIO controlled memory.
> 
> Jason
Jason Gunthorpe June 6, 2025, 12:09 p.m. UTC | #8
On Fri, Jun 06, 2025 at 03:02:49PM +0530, Aneesh Kumar K.V wrote:
> Jason Gunthorpe <jgg@nvidia.com> writes:
> 
> > On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
> >> Jason Gunthorpe <jgg@nvidia.com> writes:
> >> 
> >> > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
> >> >
> >> >> > +
> >> >> > +	/* To ensure no host side MMIO access is possible */
> >> >> > +	ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> >> >> > +	if (ret)
> >> >> > +		goto out_unlock;
> >> >> > +
> >> >> >
> >> >> 
> >> >> I am hitting failures here with similar changes. Can you share the Qemu
> >> >> changes needed to make this pci_request_regions_exclusive successful.
> >> >> Also after the TDI is unbound, we want the region ownership backto
> >> >> "vfio-pci" so that things continue to work as non-secure device. I don't
> >> >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> >> >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> >> >> release the region ownership.
> >> >
> >> > Again, IMHO, we should not be doing this dynamically. VFIO should do
> >> > pci_request_regions_exclusive() once at the very start and it should
> >> > stay that way.
> >> >
> >> > There is no reason to change it dynamically.
> >> >
> >> > The only decision to make is if all vfio should switch to exclusive
> >> > mode or if we need to make it optional for userspace.
> >> 
> >> We only need the exclusive mode when the device is operating in secure
> >> mode, correct? That suggests we’ll need to dynamically toggle this
> >> setting based on the device’s security state.
> >
> > No, if the decision is that VFIO should allow this to be controlled by
> > userspace then userspace will tell iommufd to run in regions_exclusive
> > mode prior to opening the vfio cdev and VFIO will still do it once at
> > open time and never change it.
> 
> So this will be handled by setting
> vdevice::flags = IOMMUFD_PCI_REGION_EXCLUSIVE in

Not like that.. I would suggest a global vfio sysfs or module parameter, or
maybe a iommufd ictx global option:

 IOMMU_OPTION(IOMMU_OPTION_OP_SET, IOMMU_OPTION_EXCLUSIVE_RANGES)

You want something simple here, not tied to vdevice or very dynamic.

The use cases for non-exclusive ranges are very narrow, IMHO

> and vfio_pci_core_mmap() will do
> 
> 	if (!vdev->barmap[index]) {
> 
> 		if (core_vdev->iommufd_device &&
> 		    iommufd_vdevice_region_exclusive(core_vdev->iommufd_device))
> 			ret = pci_request_selected_regions_exclusive(pdev,
> 							1 << index, "vfio-pci");
> 		else
> 			ret = pci_request_selected_regions(pdev,
> 						1 << index, "vfio-pci");

And IMHO, these should be moved to probe time or at least FD open
time, not at mmap time...

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 3441d24538a8..33fd20ffaeee 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -297,3 +297,25 @@  void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
 	vdev->iommufd_attached = false;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
+
+int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device))
+		return -EINVAL;
+
+	return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
+
+void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device))
+		return;
+
+	iommufd_device_tsm_unbind(vdev->iommufd_device);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 116964057b0b..92544e54c9c3 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -692,6 +692,13 @@  void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 #if IS_ENABLED(CONFIG_EEH)
 	eeh_dev_release(vdev->pdev);
 #endif
+
+	if (vdev->is_tsm_bound) {
+		vfio_iommufd_tsm_unbind(&vdev->vdev);
+		pci_release_regions(vdev->pdev);
+		vdev->is_tsm_bound = false;
+	}
+
 	vfio_pci_core_disable(vdev);
 
 	vfio_pci_dma_buf_cleanup(vdev);
@@ -1447,6 +1454,69 @@  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
 				  ioeventfd.fd);
 }
 
+static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
+				   void __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
+	struct vfio_pci_tsm_bind tsm_bind;
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+
+	if (copy_from_user(&tsm_bind, arg, minsz))
+		return -EFAULT;
+
+	if (tsm_bind.argsz < minsz || tsm_bind.flags)
+		return -EINVAL;
+
+	mutex_lock(&vdev->vdev.dev_set->lock);
+
+	/* To ensure no host side MMIO access is possible */
+	ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
+	if (ret)
+		goto out_unlock;
+
+	ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id);
+	if (ret)
+		goto out_release_region;
+
+	vdev->is_tsm_bound = true;
+	mutex_unlock(&vdev->vdev.dev_set->lock);
+
+	return 0;
+
+out_release_region:
+	pci_release_regions(pdev);
+out_unlock:
+	mutex_unlock(&vdev->vdev.dev_set->lock);
+	return ret;
+}
+
+static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev,
+				     void __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags);
+	struct vfio_pci_tsm_unbind tsm_unbind;
+	struct pci_dev *pdev = vdev->pdev;
+
+	if (copy_from_user(&tsm_unbind, arg, minsz))
+		return -EFAULT;
+
+	if (tsm_unbind.argsz < minsz || tsm_unbind.flags)
+		return -EINVAL;
+
+	mutex_lock(&vdev->vdev.dev_set->lock);
+
+	if (!vdev->is_tsm_bound)
+		return 0;
+
+	vfio_iommufd_tsm_unbind(&vdev->vdev);
+	pci_release_regions(pdev);
+	vdev->is_tsm_bound = false;
+	mutex_unlock(&vdev->vdev.dev_set->lock);
+
+	return 0;
+}
+
 long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 			 unsigned long arg)
 {
@@ -1471,6 +1541,10 @@  long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 		return vfio_pci_ioctl_reset(vdev, uarg);
 	case VFIO_DEVICE_SET_IRQS:
 		return vfio_pci_ioctl_set_irqs(vdev, uarg);
+	case VFIO_DEVICE_TSM_BIND:
+		return vfio_pci_ioctl_tsm_bind(vdev, uarg);
+	case VFIO_DEVICE_TSM_UNBIND:
+		return vfio_pci_ioctl_tsm_unbind(vdev, uarg);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index d521d2c01a92..747b94bb9758 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -70,6 +70,7 @@  struct vfio_device {
 	struct iommufd_device *iommufd_device;
 	struct ida pasids;
 	u8 iommufd_attached:1;
+	u8 iommufd_tsm_bound:1;
 #endif
 	u8 cdev_opened:1;
 #ifdef CONFIG_DEBUG_FS
@@ -155,6 +156,8 @@  int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
 void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
 int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
 void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev);
+int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id);
+void vfio_iommufd_tsm_unbind(struct vfio_device *vdev);
 #else
 static inline struct iommufd_ctx *
 vfio_iommufd_device_ictx(struct vfio_device *vdev)
@@ -190,6 +193,10 @@  vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx)
 	((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
 #define vfio_iommufd_emulated_detach_ioas \
 	((void (*)(struct vfio_device *vdev)) NULL)
+#define vfio_iommufd_tsm_bind \
+	((int (*)(struct vfio_device *vdev, u32 vdevice_id)) NULL)
+#define vfio_iommufd_tsm_unbind \
+	((void (*)(struct vfio_device *vdev)) NULL)
 #endif
 
 static inline bool vfio_device_cdev_opened(struct vfio_device *device)
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index da5d8955ae56..b2982100221f 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -80,6 +80,7 @@  struct vfio_pci_core_device {
 	bool			needs_pm_restore:1;
 	bool			pm_intx_masked:1;
 	bool			pm_runtime_engaged:1;
+	bool			is_tsm_bound:1;
 	struct pci_saved_state	*pci_saved_state;
 	struct pci_saved_state	*pm_save;
 	int			ioeventfds_nr;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9445fa36efd3..16bd93a5b427 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1493,6 +1493,48 @@  struct vfio_device_feature_dma_buf {
 	struct vfio_region_dma_range dma_ranges[];
 };
 
+/*
+ * Upon VFIO_DEVICE_TSM_BIND, put the device in the TSM Bind state.
+ *
+ * @argsz:	User filled size of this data.
+ * @flags:	Must be 0.
+ * @vdevice_id:	Input the target id which can represent a vdevice allocated
+ *		via the iommufd subsystem.
+ * @pad:	Reserved; pads the structure to an 8-byte boundary.
+ *
+ * The vdevice holds all virtualization information needed for TSM Bind.
+ * TSM Bind means the host finishes all host-side trusted configurations to
+ * build a TEE Device Interface (TDI), then puts the TDI in the TDISP
+ * CONFIG_LOCKED or RUN state, waiting for the guest's attestation. IOMMUFD
+ * finds all virtualization information from vdevice_id and executes the TSM
+ * Bind. VFIO should be aware that some operations (e.g. reset, toggle MSE,
+ * private MMIO access) to the physical device impact TSM Bind, so it must
+ * never do them, or do them only after TSM Unbind.
+ * This IOCTL is only allowed on cdev fds.
+ */
+struct vfio_pci_tsm_bind {
+	__u32	argsz;
+	__u32	flags;
+	__u32	vdevice_id;
+	__u32	pad;
+};
+
+#define VFIO_DEVICE_TSM_BIND		_IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * Upon VFIO_DEVICE_TSM_UNBIND, put the device in the TSM Unbind state.
+ *
+ * @argsz:	User filled size of this data.
+ * @flags:	Must be 0.
+ *
+ * TSM Unbind means the host removes all trusted configurations and puts the
+ * TDI in the CONFIG_UNLOCKED TDISP state.
+ */
+struct vfio_pci_tsm_unbind {
+	__u32	argsz;
+	__u32	flags;
+};
+
+#define VFIO_DEVICE_TSM_UNBIND		_IO(VFIO_TYPE, VFIO_BASE + 23)
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**