| Message ID | 20250529053513.1592088-20-yilun.xu@linux.intel.com |
|---|---|
| State | New |
| Headers | show |
| Series | Host side (KVM/VFIO/IOMMUFD) support for TDISP using TSM | expand |
Xu Yilun <yilun.xu@linux.intel.com> writes: > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > expected to be called by userspace when CoCo VM issues TDI bind/unbind > command to VMM. Specifically for TDX Connect, these commands are some > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > Interface). > > The TSM TDI bind/unbind operations are expected to be initiated by a > running CoCo VM, which already have the legacy assigned device in place. > The TSM bind operation is to request VMM make all secure configurations > to support device work as a TDI, and then issue TDISP messages to move > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > device to TDISP ERROR state. > Any reason these need to be a vfio ioctl instead of iommufd ioctl? For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Wu Hao <hao.wu@intel.com> > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com> > --- > drivers/vfio/iommufd.c | 22 ++++++++++ > drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++ > include/linux/vfio.h | 7 +++ > include/linux/vfio_pci_core.h | 1 + > include/uapi/linux/vfio.h | 42 ++++++++++++++++++ > 5 files changed, 146 insertions(+) > > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c > index 3441d24538a8..33fd20ffaeee 100644 > --- a/drivers/vfio/iommufd.c > +++ b/drivers/vfio/iommufd.c > @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev) > vdev->iommufd_attached = false; > } > EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas); > + > +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id) > +{ > + lockdep_assert_held(&vdev->dev_set->lock); > + > + if (WARN_ON(!vdev->iommufd_device)) > + return -EINVAL; > + > + return iommufd_device_tsm_bind(vdev->iommufd_device, 
vdevice_id); > +} > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind); > + > +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev) > +{ > + lockdep_assert_held(&vdev->dev_set->lock); > + > + if (WARN_ON(!vdev->iommufd_device)) > + return; > + > + iommufd_device_tsm_unbind(vdev->iommufd_device); > +} > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind); > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index 116964057b0b..92544e54c9c3 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) > #if IS_ENABLED(CONFIG_EEH) > eeh_dev_release(vdev->pdev); > #endif > + > + if (vdev->is_tsm_bound) { > + vfio_iommufd_tsm_unbind(&vdev->vdev); > + pci_release_regions(vdev->pdev); > + vdev->is_tsm_bound = false; > + } > + > vfio_pci_core_disable(vdev); > > vfio_pci_dma_buf_cleanup(vdev); > @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, > ioeventfd.fd); > } > > +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev, > + void __user *arg) > +{ > + unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id); > + struct vfio_pci_tsm_bind tsm_bind; > + struct pci_dev *pdev = vdev->pdev; > + int ret; > + > + if (copy_from_user(&tsm_bind, arg, minsz)) > + return -EFAULT; > + > + if (tsm_bind.argsz < minsz || tsm_bind.flags) > + return -EINVAL; > + > + mutex_lock(&vdev->vdev.dev_set->lock); > + > + /* To ensure no host side MMIO access is possible */ > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > + if (ret) > + goto out_unlock; > This should be part of pci_tsm_bind() ? 
> + > + ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id); > + if (ret) > + goto out_release_region; > + > + vdev->is_tsm_bound = true; > + mutex_unlock(&vdev->vdev.dev_set->lock); > + > + return 0; > + > +out_release_region: > + pci_release_regions(pdev); > +out_unlock: > + mutex_unlock(&vdev->vdev.dev_set->lock); > + return ret; > +} > + > +static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev, > + void __user *arg) > +{ > + unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags); > + struct vfio_pci_tsm_unbind tsm_unbind; > + struct pci_dev *pdev = vdev->pdev; > + > + if (copy_from_user(&tsm_unbind, arg, minsz)) > + return -EFAULT; > + > + if (tsm_unbind.argsz < minsz || tsm_unbind.flags) > + return -EINVAL; > + > + mutex_lock(&vdev->vdev.dev_set->lock); > + > + if (!vdev->is_tsm_bound) > + return 0; > + > + vfio_iommufd_tsm_unbind(&vdev->vdev); > + pci_release_regions(pdev); > + vdev->is_tsm_bound = false; > + mutex_unlock(&vdev->vdev.dev_set->lock); > + > + return 0; > +} > + -aneesh
On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: > Xu Yilun <yilun.xu@linux.intel.com> writes: > > > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > > expected to be called by userspace when CoCo VM issues TDI bind/unbind > > command to VMM. Specifically for TDX Connect, these commands are some > > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > > Interface). > > > > The TSM TDI bind/unbind operations are expected to be initiated by a > > running CoCo VM, which already have the legacy assigned device in place. > > The TSM bind operation is to request VMM make all secure configurations > > to support device work as a TDI, and then issue TDISP messages to move > > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > > > > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > > device to TDISP ERROR state. > > > > Any reason these need to be a vfio ioctl instead of iommufd ioctl? > For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ A general reason is, the device driver - VFIO should be aware of the bound state, and some operations break the bound state. VFIO should also know some operations on bound may crash kernel because of platform TSM firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping in secure page tables cannot be unmapped before TDI STOP [1]. Specifically, for TDX Connect, the firmware enforces MMIO unmapping in S-EPT would fail if TDI is bound. For AMD there seems also some requirement about this but I need Alexey's confirmation. 
[1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > > > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > > Signed-off-by: Wu Hao <hao.wu@intel.com> > > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com> > > --- > > drivers/vfio/iommufd.c | 22 ++++++++++ > > drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++ > > include/linux/vfio.h | 7 +++ > > include/linux/vfio_pci_core.h | 1 + > > include/uapi/linux/vfio.h | 42 ++++++++++++++++++ > > 5 files changed, 146 insertions(+) > > > > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c > > index 3441d24538a8..33fd20ffaeee 100644 > > --- a/drivers/vfio/iommufd.c > > +++ b/drivers/vfio/iommufd.c > > @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev) > > vdev->iommufd_attached = false; > > } > > EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas); > > + > > +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id) > > +{ > > + lockdep_assert_held(&vdev->dev_set->lock); > > + > > + if (WARN_ON(!vdev->iommufd_device)) > > + return -EINVAL; > > + > > + return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id); > > +} > > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind); > > + > > +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev) > > +{ > > + lockdep_assert_held(&vdev->dev_set->lock); > > + > > + if (WARN_ON(!vdev->iommufd_device)) > > + return; > > + > > + iommufd_device_tsm_unbind(vdev->iommufd_device); > > +} > > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind); > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > > index 116964057b0b..92544e54c9c3 100644 > > --- a/drivers/vfio/pci/vfio_pci_core.c > > +++ b/drivers/vfio/pci/vfio_pci_core.c > > @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) > > #if IS_ENABLED(CONFIG_EEH) > > eeh_dev_release(vdev->pdev); > > #endif > > + > > + if (vdev->is_tsm_bound) { > > + vfio_iommufd_tsm_unbind(&vdev->vdev); > > + 
pci_release_regions(vdev->pdev); > > + vdev->is_tsm_bound = false; > > + } > > + > > vfio_pci_core_disable(vdev); > > > > vfio_pci_dma_buf_cleanup(vdev); > > @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, > > ioeventfd.fd); > > } > > > > +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev, > > + void __user *arg) > > +{ > > + unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id); > > + struct vfio_pci_tsm_bind tsm_bind; > > + struct pci_dev *pdev = vdev->pdev; > > + int ret; > > + > > + if (copy_from_user(&tsm_bind, arg, minsz)) > > + return -EFAULT; > > + > > + if (tsm_bind.argsz < minsz || tsm_bind.flags) > > + return -EINVAL; > > + > > + mutex_lock(&vdev->vdev.dev_set->lock); > > + > > + /* To ensure no host side MMIO access is possible */ > > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > > + if (ret) > > + goto out_unlock; > > > > This should be part of pci_tsm_bind() ? I'm not quite sure. My feelig is this method is specific for VFIO driver. Many other drivers just request regions on probe(), they can never bind successfully if pci tsm hide this implementation internally. Thanks, Yilun
On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: > Xu Yilun <yilun.xu@linux.intel.com> writes: > > > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: > >> Xu Yilun <yilun.xu@linux.intel.com> writes: > >> > >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind > >> > command to VMM. Specifically for TDX Connect, these commands are some > >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > >> > Interface). > >> > > >> > The TSM TDI bind/unbind operations are expected to be initiated by a > >> > running CoCo VM, which already have the legacy assigned device in place. > >> > The TSM bind operation is to request VMM make all secure configurations > >> > to support device work as a TDI, and then issue TDISP messages to move > >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > >> > > >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > >> > device to TDISP ERROR state. > >> > > >> > >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? > >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > > > > A general reason is, the device driver - VFIO should be aware of the > > bound state, and some operations break the bound state. VFIO should also > > know some operations on bound may crash kernel because of platform TSM > > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping > > in secure page tables cannot be unmapped before TDI STOP [1]. > > > > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in > > S-EPT would fail if TDI is bound. For AMD there seems also some > > requirement about this but I need Alexey's confirmation. 
> > > > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > > > According to the TDISP specification (Section 11.2.6), clearing either > the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause > the TDI to transition to an error state. To handle this gracefully, it > seems necessary to unbind the TDI before modifying the BME or MSE bits. Yes. But now the suggestion is never let VFIO do unbind, instead VFIO should block these operations when device is bound. > > If I understand correctly, we also need to unmap the Stage-2 mapping due > to the issue described in commit > abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional > reasons we would want to unmap the Stage-2 mapping for the BAR (as done > in vfio_pci_zap_and_down_write_memory_lock)? I think no more reason. > > Additionally, with TDX, it appears that before unmapping the Stage-2 > mapping for the BAR, we should first unbind the TDI (ie, move it to the > "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, > or is it driven by a different requirement? No, this is not device side TDISP requirement. It is host side requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals unmap IOMMU S2 PT. If we allow IOMMU S2 PT unmapped when TDI is running, host could fool guest by just unmap some PT entry and suppress the fault event. Guest thought a DMA writting is successful but it is not and may cause data integrity issue. This is not a TDX specific problem, but different vendors has different mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For AMD, will trigger some HW protection called "ASID fence" [1]. Not sure how ARM handles this? https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ Thanks, Yilun > > -aneesh
On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > > + > > + /* To ensure no host side MMIO access is possible */ > > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > > + if (ret) > > + goto out_unlock; > > + > > > > I am hitting failures here with similar changes. Can you share the Qemu > changes needed to make this pci_request_regions_exclusive successful. > Also after the TDI is unbound, we want the region ownership back to > "vfio-pci" so that things continue to work as a non-secure device. I don't > see us doing that. I could add a pci_bar_deactivate/pci_bar_activate in > userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > release the region ownership. Again, IMHO, we should not be doing this dynamically. VFIO should do pci_request_regions_exclusive() once at the very start and it should stay that way. There is no reason to change it dynamically. The only decision to make is if all vfio should switch to exclusive mode or if we need to make it optional for userspace. Jason
Xu Yilun <yilun.xu@linux.intel.com> writes: > On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: >> >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are >> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind >> >> > command to VMM. Specifically for TDX Connect, these commands are some >> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication >> >> > Interface). >> >> > >> >> > The TSM TDI bind/unbind operations are expected to be initiated by a >> >> > running CoCo VM, which already have the legacy assigned device in place. >> >> > The TSM bind operation is to request VMM make all secure configurations >> >> > to support device work as a TDI, and then issue TDISP messages to move >> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. >> >> > >> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead >> >> > device to TDISP ERROR state. >> >> > >> >> >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? >> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ >> > >> > A general reason is, the device driver - VFIO should be aware of the >> > bound state, and some operations break the bound state. VFIO should also >> > know some operations on bound may crash kernel because of platform TSM >> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping >> > in secure page tables cannot be unmapped before TDI STOP [1]. >> > >> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in >> > S-EPT would fail if TDI is bound. For AMD there seems also some >> > requirement about this but I need Alexey's confirmation. 
>> > >> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ >> > >> >> According to the TDISP specification (Section 11.2.6), clearing either >> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause >> the TDI to transition to an error state. To handle this gracefully, it >> seems necessary to unbind the TDI before modifying the BME or MSE bits. > > Yes. But now the suggestion is never let VFIO do unbind, instead VFIO > should block these operations when device is bound. > >> >> If I understand correctly, we also need to unmap the Stage-2 mapping due >> to the issue described in commit >> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional >> reasons we would want to unmap the Stage-2 mapping for the BAR (as done >> in vfio_pci_zap_and_down_write_memory_lock)? > > I think no more reason. > >> >> Additionally, with TDX, it appears that before unmapping the Stage-2 >> mapping for the BAR, we should first unbind the TDI (ie, move it to the >> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, >> or is it driven by a different requirement? > > No, this is not device side TDISP requirement. It is host side > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > unmap IOMMU S2 PT. > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > guest by just unmap some PT entry and suppress the fault event. Guest > thought a DMA writting is successful but it is not and may cause > data integrity issue. > > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? 
> > https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > MMIO/BAR Unmapping: If the stage-2 mapping is removed while the device is in a locked state—a scenario that ARM permits—the granule transitions to the RIPAS_DESTROYED and HIPAS_UNASSIGNED states. Any MMIO or CPU access to such a granule will trigger a non-emulatable data abort, which is forwarded to the non-secure hypervisor (e.g., KVM). However, at this point, the system cannot make further progress. The unmapping was initiated by the host without coordination from the guest, leaving the granule in a broken state. A more robust workflow would involve the guest first transitioning the granule to RIPAS_EMPTY, followed by the host unmapping the stage-2 entry. IOMMU Page Table Unmap: Both the CPU and the SMMU can share the stage-2 page table. If the non-secure host unmaps an entry from this shared page table, the affected granule again transitions to RIPAS_DESTROYED and HIPAS_UNASSIGNED. In this case, a DMA transaction—(SMMU is configured by the Realm Management Monitor,RMM)—can be terminated. This typically results in an event being recorded in the event queue which can be read by RMM. However, interrupt delivery remains under non-secure host control, and the guest may not be immediately aware that the DMA transaction was terminated. I am currently confirming this behavior with the design team and will follow up once I have clarity. -aneesh
On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote: > Jason Gunthorpe <jgg@nvidia.com> writes: > > > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > > > >> > + > >> > + /* To ensure no host side MMIO access is possible */ > >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > >> > + if (ret) > >> > + goto out_unlock; > >> > + > >> > > >> > >> I am hitting failures here with similar changes. Can you share the Qemu > >> changes needed to make this pci_request_regions_exclusive successful. > >> Also after the TDI is unbound, we want the region ownership backto > >> "vfio-pci" so that things continue to work as non-secure device. I don't > >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in > >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > >> release the region ownership. > > > > Again, IMHO, we should not be doing this dynamically. VFIO should do > > pci_request_regions_exclusive() once at the very start and it should > > stay that way. > > > > There is no reason to change it dynamically. > > > > The only decision to make is if all vfio should switch to exclusive > > mode or if we need to make it optional for userspace. > > We only need the exclusive mode when the device is operating in secure > mode, correct? That suggests we’ll need to dynamically toggle this > setting based on the device’s security state. No, if the decision is that VFIO should allow this to be controlled by userspace then userspace will tell iommufd to run in regions_exclusive mode prior to opening the vfio cdev and VFIO will still do it once at open time and never change it. The only thing request_regions does is block other drivers outside vfio from using this memory space. There is no reason at all to change this dynamically. A CC VMM using VFIO will never use a driver outside VFIO to touch the VFIO controlled memory. Jason
On Thu, Jun 05, 2025 at 01:33:39PM -0300, Jason Gunthorpe wrote: > On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote: > > Jason Gunthorpe <jgg@nvidia.com> writes: > > > > > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > > > > > >> > + > > >> > + /* To ensure no host side MMIO access is possible */ > > >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > > >> > + if (ret) > > >> > + goto out_unlock; > > >> > + > > >> > > > >> > > >> I am hitting failures here with similar changes. Can you share the Qemu > > >> changes needed to make this pci_request_regions_exclusive successful. Jason has described the suggested static lockdown flow and we could try on that. I just wanna help position your immediate failure. Maybe you still have QEMU mmapped the MMIO region. int vfio_pci_core_mmap() { ... if (!vdev->barmap[index]) { ret = pci_request_selected_regions(pdev, 1 << index, "vfio-pci"); ... } Even for static lockdown, userspace should not mmap the MMIOs anymore. Thanks, Yilun > > >> Also after the TDI is unbound, we want the region ownership backto > > >> "vfio-pci" so that things continue to work as non-secure device. I don't > > >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in > > >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > > >> release the region ownership. > > > > > > Again, IMHO, we should not be doing this dynamically. VFIO should do > > > pci_request_regions_exclusive() once at the very start and it should > > > stay that way. > > > > > > There is no reason to change it dynamically. > > > > > > The only decision to make is if all vfio should switch to exclusive > > > mode or if we need to make it optional for userspace. > > > > We only need the exclusive mode when the device is operating in secure > > mode, correct? That suggests we’ll need to dynamically toggle this > > setting based on the device’s security state. 
> > No, if the decision is that VFIO should allow this to be controlled by > userspace then userspace will tell iommufd to run in regions_exclusive > mode prior to opening the vfio cdev and VFIO will still do it once at > open time and never change it. > > The only thing request_regions does is block other drivers outside > vfio from using this memory space. There is no reason at all to change > this dynamically. A CC VMM using VFIO will never use a driver outside > VFIO to touch the VFIO controlled memory. > > Jason
On Fri, Jun 06, 2025 at 03:02:49PM +0530, Aneesh Kumar K.V wrote: > Jason Gunthorpe <jgg@nvidia.com> writes: > > > On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote: > >> Jason Gunthorpe <jgg@nvidia.com> writes: > >> > >> > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > >> > > >> >> > + > >> >> > + /* To ensure no host side MMIO access is possible */ > >> >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > >> >> > + if (ret) > >> >> > + goto out_unlock; > >> >> > + > >> >> > > >> >> > >> >> I am hitting failures here with similar changes. Can you share the Qemu > >> >> changes needed to make this pci_request_regions_exclusive successful. > >> >> Also after the TDI is unbound, we want the region ownership backto > >> >> "vfio-pci" so that things continue to work as non-secure device. I don't > >> >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in > >> >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > >> >> release the region ownership. > >> > > >> > Again, IMHO, we should not be doing this dynamically. VFIO should do > >> > pci_request_regions_exclusive() once at the very start and it should > >> > stay that way. > >> > > >> > There is no reason to change it dynamically. > >> > > >> > The only decision to make is if all vfio should switch to exclusive > >> > mode or if we need to make it optional for userspace. > >> > >> We only need the exclusive mode when the device is operating in secure > >> mode, correct? That suggests we’ll need to dynamically toggle this > >> setting based on the device’s security state. > > > > No, if the decision is that VFIO should allow this to be controlled by > > userspace then userspace will tell iommufd to run in regions_exclusive > > mode prior to opening the vfio cdev and VFIO will still do it once at > > open time and never change it. 
> > So this will be handled by setting > vdevice::flags = IOMMUFD_PCI_REGION_EXCLUSIVE in Not like that.. I would suggest a global vfio sysfs or module parameter, or maybe a iommufd ictx global option: IOMMU_OPTION(IOMMU_OPTION_OP_SET, IOMMU_OPTION_EXCLUSIVE_RANGES) You want something simple here, not tied to vdevice or very dynamic. The use cases for non-exclusive ranges are very narrow, IMHO > and vfio_pci_core_mmap() will do > > if (!vdev->barmap[index]) { > > if (core_vdev->iommufd_device && > iommufd_vdevice_region_exclusive(core_vdev->iommufd_device)) > ret = pci_request_selected_regions_exclusive(pdev, > 1 << index, "vfio-pci"); > else > ret = pci_request_selected_regions(pdev, > 1 << index, "vfio-pci"); And IMHO, these should be moved to probe time or at least FD open time, not at mmap time... Jason
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 3441d24538a8..33fd20ffaeee 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev) vdev->iommufd_attached = false; } EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas); + +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return -EINVAL; + + return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id); +} +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind); + +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return; + + iommufd_device_tsm_unbind(vdev->iommufd_device); +} +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 116964057b0b..92544e54c9c3 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #if IS_ENABLED(CONFIG_EEH) eeh_dev_release(vdev->pdev); #endif + + if (vdev->is_tsm_bound) { + vfio_iommufd_tsm_unbind(&vdev->vdev); + pci_release_regions(vdev->pdev); + vdev->is_tsm_bound = false; + } + vfio_pci_core_disable(vdev); vfio_pci_dma_buf_cleanup(vdev); @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, ioeventfd.fd); } +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id); + struct vfio_pci_tsm_bind tsm_bind; + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (copy_from_user(&tsm_bind, arg, minsz)) + return -EFAULT; + + if (tsm_bind.argsz < minsz || tsm_bind.flags) + return -EINVAL; + + mutex_lock(&vdev->vdev.dev_set->lock); + + /* To ensure no host 
side MMIO access is possible */ + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); + if (ret) + goto out_unlock; + + ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id); + if (ret) + goto out_release_region; + + vdev->is_tsm_bound = true; + mutex_unlock(&vdev->vdev.dev_set->lock); + + return 0; + +out_release_region: + pci_release_regions(pdev); +out_unlock: + mutex_unlock(&vdev->vdev.dev_set->lock); + return ret; +} + +static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags); + struct vfio_pci_tsm_unbind tsm_unbind; + struct pci_dev *pdev = vdev->pdev; + + if (copy_from_user(&tsm_unbind, arg, minsz)) + return -EFAULT; + + if (tsm_unbind.argsz < minsz || tsm_unbind.flags) + return -EINVAL; + + mutex_lock(&vdev->vdev.dev_set->lock); + + if (!vdev->is_tsm_bound) + return 0; + + vfio_iommufd_tsm_unbind(&vdev->vdev); + pci_release_regions(pdev); + vdev->is_tsm_bound = false; + mutex_unlock(&vdev->vdev.dev_set->lock); + + return 0; +} + long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -1471,6 +1541,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_reset(vdev, uarg); case VFIO_DEVICE_SET_IRQS: return vfio_pci_ioctl_set_irqs(vdev, uarg); + case VFIO_DEVICE_TSM_BIND: + return vfio_pci_ioctl_tsm_bind(vdev, uarg); + case VFIO_DEVICE_TSM_UNBIND: + return vfio_pci_ioctl_tsm_unbind(vdev, uarg); default: return -ENOTTY; } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index d521d2c01a92..747b94bb9758 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -70,6 +70,7 @@ struct vfio_device { struct iommufd_device *iommufd_device; struct ida pasids; u8 iommufd_attached:1; + u8 iommufd_tsm_bound:1; #endif u8 cdev_opened:1; #ifdef CONFIG_DEBUG_FS @@ -155,6 +156,8 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev, void 
vfio_iommufd_emulated_unbind(struct vfio_device *vdev); int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev); +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id); +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev); #else static inline struct iommufd_ctx * vfio_iommufd_device_ictx(struct vfio_device *vdev) @@ -190,6 +193,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #define vfio_iommufd_emulated_detach_ioas \ ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_tsm_bind \ + ((int (*)(struct vfio_device *vdev, u32 vdevice_id)) NULL) +#define vfio_iommufd_tsm_unbind \ + ((void (*)(struct vfio_device *vdev)) NULL) #endif static inline bool vfio_device_cdev_opened(struct vfio_device *device) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index da5d8955ae56..b2982100221f 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -80,6 +80,7 @@ struct vfio_pci_core_device { bool needs_pm_restore:1; bool pm_intx_masked:1; bool pm_runtime_engaged:1; + bool is_tsm_bound:1; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 9445fa36efd3..16bd93a5b427 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1493,6 +1493,48 @@ struct vfio_device_feature_dma_buf { struct vfio_region_dma_range dma_ranges[]; }; +/* + * Upon VFIO_DEVICE_TSM_BIND, Put the device in TSM Bind state. + * + * @argsz: User filled size of this data. + * @flags: Must be 0. + * @vdevice_id: Input the target id which can represent an vdevice allocated + * via iommufd subsystem. + * + * The vdevice holds all virtualization information needed for TSM Bind. 
+ * TSM Bind means host finishes all host side trusted configurations to build + * a Tee Device Interface(TDI), then put the TDI in TDISP CONFIG_LOCKED or RUN + * state, waiting for guest's attestation. IOMMUFD finds all virtualization + * information from vdevice_id, and executes the TSM Bind. VFIO should be aware + * some operations (e.g. reset, toggle MSE, private MMIO access) to physical + * device impacts TSM Bind, so never do them or do them only after TSM Unbind. + * This IOCTL is only allowed on cdev fds. + */ +struct vfio_pci_tsm_bind { + __u32 argsz; + __u32 flags; + __u32 vdevice_id; + __u32 pad; +}; + +#define VFIO_DEVICE_TSM_BIND _IO(VFIO_TYPE, VFIO_BASE + 22) + +/* + * Upon VFIO_DEVICE_TSM_UNBIND, put the device in TSM Unbind state. + * + * @argsz: User filled size of this data. + * @flags: Must be 0. + * + * TSM Unbind means host removes all trusted configurations, and put the TDI in + * CONFIG_UNLOCKED TDISP state. + */ +struct vfio_pci_tsm_unbind { + __u32 argsz; + __u32 flags; +}; + +#define VFIO_DEVICE_TSM_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23) + /* -------- API for Type1 VFIO IOMMU -------- */ /**