Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
expected to be called by userspace when CoCo VM issues TDI bind/unbind
command to VMM. Specifically for TDX Connect, these commands are some
secure Hypervisor call named GHCI (Guest-Hypervisor Communication
Interface).
The TSM TDI bind/unbind operations are expected to be initiated by a
running CoCo VM, which already has the legacy assigned device in place.
The TSM bind operation is to request VMM make all secure configurations
to support device work as a TDI, and then issue TDISP messages to move
the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
Do TSM Unbind before vfio_pci_core_disable(); otherwise it will put the
device into the TDISP ERROR state.
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
drivers/vfio/iommufd.c | 22 ++++++++++
drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
include/linux/vfio.h | 7 +++
include/linux/vfio_pci_core.h | 1 +
include/uapi/linux/vfio.h | 42 ++++++++++++++++++
5 files changed, 146 insertions(+)
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 3441d24538a8..33fd20ffaeee 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
vdev->iommufd_attached = false;
}
EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
+
+int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
+{
+ lockdep_assert_held(&vdev->dev_set->lock);
+
+ if (WARN_ON(!vdev->iommufd_device))
+ return -EINVAL;
+
+ return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
+
+void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
+{
+ lockdep_assert_held(&vdev->dev_set->lock);
+
+ if (WARN_ON(!vdev->iommufd_device))
+ return;
+
+ iommufd_device_tsm_unbind(vdev->iommufd_device);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 116964057b0b..92544e54c9c3 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
#if IS_ENABLED(CONFIG_EEH)
eeh_dev_release(vdev->pdev);
#endif
+
+ if (vdev->is_tsm_bound) {
+ vfio_iommufd_tsm_unbind(&vdev->vdev);
+ pci_release_regions(vdev->pdev);
+ vdev->is_tsm_bound = false;
+ }
+
vfio_pci_core_disable(vdev);
vfio_pci_dma_buf_cleanup(vdev);
@@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
ioeventfd.fd);
}
+static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
+ void __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
+ struct vfio_pci_tsm_bind tsm_bind;
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+
+ if (copy_from_user(&tsm_bind, arg, minsz))
+ return -EFAULT;
+
+ if (tsm_bind.argsz < minsz || tsm_bind.flags)
+ return -EINVAL;
+
+ mutex_lock(&vdev->vdev.dev_set->lock);
+
+ /* To ensure no host side MMIO access is possible */
+ ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
+ if (ret)
+ goto out_unlock;
+
+ ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id);
+ if (ret)
+ goto out_release_region;
+
+ vdev->is_tsm_bound = true;
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+
+ return 0;
+
+out_release_region:
+ pci_release_regions(pdev);
+out_unlock:
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+ return ret;
+}
+
+static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev,
+ void __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags);
+ struct vfio_pci_tsm_unbind tsm_unbind;
+ struct pci_dev *pdev = vdev->pdev;
+
+ if (copy_from_user(&tsm_unbind, arg, minsz))
+ return -EFAULT;
+
+ if (tsm_unbind.argsz < minsz || tsm_unbind.flags)
+ return -EINVAL;
+
+ mutex_lock(&vdev->vdev.dev_set->lock);
+
+ if (!vdev->is_tsm_bound)
+ return 0;
+
+ vfio_iommufd_tsm_unbind(&vdev->vdev);
+ pci_release_regions(pdev);
+ vdev->is_tsm_bound = false;
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+
+ return 0;
+}
+
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg)
{
@@ -1471,6 +1541,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
return vfio_pci_ioctl_reset(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
return vfio_pci_ioctl_set_irqs(vdev, uarg);
+ case VFIO_DEVICE_TSM_BIND:
+ return vfio_pci_ioctl_tsm_bind(vdev, uarg);
+ case VFIO_DEVICE_TSM_UNBIND:
+ return vfio_pci_ioctl_tsm_unbind(vdev, uarg);
default:
return -ENOTTY;
}
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index d521d2c01a92..747b94bb9758 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -70,6 +70,7 @@ struct vfio_device {
struct iommufd_device *iommufd_device;
struct ida pasids;
u8 iommufd_attached:1;
+ u8 iommufd_tsm_bound:1;
#endif
u8 cdev_opened:1;
#ifdef CONFIG_DEBUG_FS
@@ -155,6 +156,8 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev);
+int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id);
+void vfio_iommufd_tsm_unbind(struct vfio_device *vdev);
#else
static inline struct iommufd_ctx *
vfio_iommufd_device_ictx(struct vfio_device *vdev)
@@ -190,6 +193,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx)
((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
#define vfio_iommufd_emulated_detach_ioas \
((void (*)(struct vfio_device *vdev)) NULL)
+#define vfio_iommufd_tsm_bind \
+ ((int (*)(struct vfio_device *vdev, u32 vdevice_id)) NULL)
+#define vfio_iommufd_tsm_unbind \
+ ((void (*)(struct vfio_device *vdev)) NULL)
#endif
static inline bool vfio_device_cdev_opened(struct vfio_device *device)
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index da5d8955ae56..b2982100221f 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -80,6 +80,7 @@ struct vfio_pci_core_device {
bool needs_pm_restore:1;
bool pm_intx_masked:1;
bool pm_runtime_engaged:1;
+ bool is_tsm_bound:1;
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9445fa36efd3..16bd93a5b427 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1493,6 +1493,48 @@ struct vfio_device_feature_dma_buf {
struct vfio_region_dma_range dma_ranges[];
};
+/*
+ * Upon VFIO_DEVICE_TSM_BIND, put the device in TSM Bind state.
+ *
+ * @argsz: User filled size of this data.
+ * @flags: Must be 0.
+ * @vdevice_id: Input the target id which can represent a vdevice allocated
+ * via iommufd subsystem.
+ *
+ * The vdevice holds all virtualization information needed for TSM Bind.
+ * TSM Bind means host finishes all host side trusted configurations to build
+ * a TEE Device Interface (TDI), then put the TDI in TDISP CONFIG_LOCKED or RUN
+ * state, waiting for guest's attestation. IOMMUFD finds all virtualization
+ * information from vdevice_id, and executes the TSM Bind. VFIO should be aware
+ * some operations (e.g. reset, toggle MSE, private MMIO access) to physical
+ * device impacts TSM Bind, so never do them or do them only after TSM Unbind.
+ * This IOCTL is only allowed on cdev fds.
+ */
+struct vfio_pci_tsm_bind {
+ __u32 argsz;
+ __u32 flags;
+ __u32 vdevice_id;
+ __u32 pad;
+};
+
+#define VFIO_DEVICE_TSM_BIND _IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * Upon VFIO_DEVICE_TSM_UNBIND, put the device in TSM Unbind state.
+ *
+ * @argsz: User filled size of this data.
+ * @flags: Must be 0.
+ *
+ * TSM Unbind means the host removes all trusted configurations and puts the
+ * TDI in the CONFIG_UNLOCKED TDISP state.
+ */
+struct vfio_pci_tsm_unbind {
+ __u32 argsz;
+ __u32 flags;
+};
+
+#define VFIO_DEVICE_TSM_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23)
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
--
2.25.1
Xu Yilun <yilun.xu@linux.intel.com> writes: > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > expected to be called by userspace when CoCo VM issues TDI bind/unbind > command to VMM. Specifically for TDX Connect, these commands are some > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > Interface). > > The TSM TDI bind/unbind operations are expected to be initiated by a > running CoCo VM, which already have the legacy assigned device in place. > The TSM bind operation is to request VMM make all secure configurations > to support device work as a TDI, and then issue TDISP messages to move > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > device to TDISP ERROR state. > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Wu Hao <hao.wu@intel.com> > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com> > .... > + > + /* To ensure no host side MMIO access is possible */ > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > + if (ret) > + goto out_unlock; > + > I am hitting failures here with similar changes. Can you share the Qemu changes needed to make this pci_request_regions_exclusive successful. Also after the TDI is unbound, we want the region ownership backto "vfio-pci" so that things continue to work as non-secure device. I don't see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in userspace which will result in vfio_unmap()/vfio_map(). But that doesn't release the region ownership. > + ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id); > + if (ret) > + goto out_release_region; > + > + vdev->is_tsm_bound = true; > + mutex_unlock(&vdev->vdev.dev_set->lock); > + > + return 0; > + > +out_release_region: > + pci_release_regions(pdev); > +out_unlock: > + mutex_unlock(&vdev->vdev.dev_set->lock); > + return ret; > +} -aneesh
On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > > + > > + /* To ensure no host side MMIO access is possible */ > > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > > + if (ret) > > + goto out_unlock; > > + > > > > I am hitting failures here with similar changes. Can you share the Qemu > changes needed to make this pci_request_regions_exclusive successful. > Also after the TDI is unbound, we want the region ownership backto > "vfio-pci" so that things continue to work as non-secure device. I don't > see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in > userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > release the region ownership. Again, IMHO, we should not be doing this dynamically. VFIO should do pci_request_regions_exclusive() once at the very start and it should stay that way. There is no reason to change it dynamically. The only decision to make is if all vfio should switch to exclusive mode or if we need to make it optional for userspace. Jason
Jason Gunthorpe <jgg@nvidia.com> writes: > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > >> > + >> > + /* To ensure no host side MMIO access is possible */ >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); >> > + if (ret) >> > + goto out_unlock; >> > + >> > >> >> I am hitting failures here with similar changes. Can you share the Qemu >> changes needed to make this pci_request_regions_exclusive successful. >> Also after the TDI is unbound, we want the region ownership backto >> "vfio-pci" so that things continue to work as non-secure device. I don't >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't >> release the region ownership. > > Again, IMHO, we should not be doing this dynamically. VFIO should do > pci_request_regions_exclusive() once at the very start and it should > stay that way. > > There is no reason to change it dynamically. > > The only decision to make is if all vfio should switch to exclusive > mode or if we need to make it optional for userspace. We only need the exclusive mode when the device is operating in secure mode, correct? That suggests we’ll need to dynamically toggle this setting based on the device’s security state. -aneesh
On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote: > Jason Gunthorpe <jgg@nvidia.com> writes: > > > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > > > >> > + > >> > + /* To ensure no host side MMIO access is possible */ > >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > >> > + if (ret) > >> > + goto out_unlock; > >> > + > >> > > >> > >> I am hitting failures here with similar changes. Can you share the Qemu > >> changes needed to make this pci_request_regions_exclusive successful. > >> Also after the TDI is unbound, we want the region ownership backto > >> "vfio-pci" so that things continue to work as non-secure device. I don't > >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in > >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > >> release the region ownership. > > > > Again, IMHO, we should not be doing this dynamically. VFIO should do > > pci_request_regions_exclusive() once at the very start and it should > > stay that way. > > > > There is no reason to change it dynamically. > > > > The only decision to make is if all vfio should switch to exclusive > > mode or if we need to make it optional for userspace. > > We only need the exclusive mode when the device is operating in secure > mode, correct? That suggests we’ll need to dynamically toggle this > setting based on the device’s security state. No, if the decision is that VFIO should allow this to be controlled by userspace then userspace will tell iommufd to run in regions_exclusive mode prior to opening the vfio cdev and VFIO will still do it once at open time and never change it. The only thing request_regions does is block other drivers outside vfio from using this memory space. There is no reason at all to change this dynamically. A CC VMM using VFIO will never use a driver outside VFIO to touch the VFIO controlled memory. Jason
Jason Gunthorpe <jgg@nvidia.com> writes:
> On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
>> Jason Gunthorpe <jgg@nvidia.com> writes:
>>
>> > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
>> >
>> >> > +
>> >> > + /* To ensure no host side MMIO access is possible */
>> >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
>> >> > + if (ret)
>> >> > + goto out_unlock;
>> >> > +
>> >> >
>> >>
>> >> I am hitting failures here with similar changes. Can you share the Qemu
>> >> changes needed to make this pci_request_regions_exclusive successful.
>> >> Also after the TDI is unbound, we want the region ownership backto
>> >> "vfio-pci" so that things continue to work as non-secure device. I don't
>> >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
>> >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
>> >> release the region ownership.
>> >
>> > Again, IMHO, we should not be doing this dynamically. VFIO should do
>> > pci_request_regions_exclusive() once at the very start and it should
>> > stay that way.
>> >
>> > There is no reason to change it dynamically.
>> >
>> > The only decision to make is if all vfio should switch to exclusive
>> > mode or if we need to make it optional for userspace.
>>
>> We only need the exclusive mode when the device is operating in secure
>> mode, correct? That suggests we’ll need to dynamically toggle this
>> setting based on the device’s security state.
>
> No, if the decision is that VFIO should allow this to be controlled by
> userspace then userspace will tell iommufd to run in regions_exclusive
> mode prior to opening the vfio cdev and VFIO will still do it once at
> open time and never change it.
>
So this will be handled by setting
vdevice::flags = IOMMUFD_PCI_REGION_EXCLUSIVE in
iommufd_vdevice_alloc_ioctl()? And we set this flag when starting a
secure guest, regardless of whether the device is TEE-capable or not
and vfio_pci_core_mmap() will do
if (!vdev->barmap[index]) {
if (core_vdev->iommufd_device &&
iommufd_vdevice_region_exclusive(core_vdev->iommufd_device))
ret = pci_request_selected_regions_exclusive(pdev,
1 << index, "vfio-pci");
else
ret = pci_request_selected_regions(pdev,
1 << index, "vfio-pci");
>
> The only thing request_regions does is block other drivers outside
> vfio from using this memory space. There is no reason at all to change
> this dynamically. A CC VMM using VFIO will never use a driver outside
> VFIO to touch the VFIO controlled memory.
>
> Jason
-aneesh
On Fri, Jun 06, 2025 at 03:02:49PM +0530, Aneesh Kumar K.V wrote:
> Jason Gunthorpe <jgg@nvidia.com> writes:
>
> > On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
> >> Jason Gunthorpe <jgg@nvidia.com> writes:
> >>
> >> > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
> >> >
> >> >> > +
> >> >> > + /* To ensure no host side MMIO access is possible */
> >> >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> >> >> > + if (ret)
> >> >> > + goto out_unlock;
> >> >> > +
> >> >> >
> >> >>
> >> >> I am hitting failures here with similar changes. Can you share the Qemu
> >> >> changes needed to make this pci_request_regions_exclusive successful.
> >> >> Also after the TDI is unbound, we want the region ownership backto
> >> >> "vfio-pci" so that things continue to work as non-secure device. I don't
> >> >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> >> >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> >> >> release the region ownership.
> >> >
> >> > Again, IMHO, we should not be doing this dynamically. VFIO should do
> >> > pci_request_regions_exclusive() once at the very start and it should
> >> > stay that way.
> >> >
> >> > There is no reason to change it dynamically.
> >> >
> >> > The only decision to make is if all vfio should switch to exclusive
> >> > mode or if we need to make it optional for userspace.
> >>
> >> We only need the exclusive mode when the device is operating in secure
> >> mode, correct? That suggests we’ll need to dynamically toggle this
> >> setting based on the device’s security state.
> >
> > No, if the decision is that VFIO should allow this to be controlled by
> > userspace then userspace will tell iommufd to run in regions_exclusive
> > mode prior to opening the vfio cdev and VFIO will still do it once at
> > open time and never change it.
>
> So this will be handled by setting
> vdevice::flags = IOMMUFD_PCI_REGION_EXCLUSIVE in
Not like that. I would suggest a global vfio sysfs or module parameter, or
maybe an iommufd ictx global option:
IOMMU_OPTION(IOMMU_OPTION_OP_SET, IOMMU_OPTION_EXCLUSIVE_RANGES)
You want something simple here, not tied to vdevice or very dynamic.
The use cases for non-exclusive ranges are very narrow, IMHO
> and vfio_pci_core_mmap() will do
>
> if (!vdev->barmap[index]) {
>
> if (core_vdev->iommufd_device &&
> iommufd_vdevice_region_exclusive(core_vdev->iommufd_device))
> ret = pci_request_selected_regions_exclusive(pdev,
> 1 << index, "vfio-pci");
> else
> ret = pci_request_selected_regions(pdev,
> 1 << index, "vfio-pci");
And IMHO, these should be moved to probe time or at least FD open
time, not at mmap time...
Jason
On Thu, Jun 05, 2025 at 01:33:39PM -0300, Jason Gunthorpe wrote:
> On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
> > Jason Gunthorpe <jgg@nvidia.com> writes:
> >
> > > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
> > >
> > >> > +
> > >> > + /* To ensure no host side MMIO access is possible */
> > >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> > >> > + if (ret)
> > >> > + goto out_unlock;
> > >> > +
> > >> >
> > >>
> > >> I am hitting failures here with similar changes. Can you share the Qemu
> > >> changes needed to make this pci_request_regions_exclusive successful.
Jason has described the suggested static lockdown flow and we could
try that. I just want to help pinpoint your immediate failure.
Maybe QEMU still has the MMIO region mmapped.
int vfio_pci_core_mmap()
{
...
if (!vdev->barmap[index]) {
ret = pci_request_selected_regions(pdev,
1 << index, "vfio-pci");
...
}
Even for static lockdown, userspace should not mmap the MMIOs anymore.
Thanks,
Yilun
> > >> Also after the TDI is unbound, we want the region ownership backto
> > >> "vfio-pci" so that things continue to work as non-secure device. I don't
> > >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> > >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> > >> release the region ownership.
> > >
> > > Again, IMHO, we should not be doing this dynamically. VFIO should do
> > > pci_request_regions_exclusive() once at the very start and it should
> > > stay that way.
> > >
> > > There is no reason to change it dynamically.
> > >
> > > The only decision to make is if all vfio should switch to exclusive
> > > mode or if we need to make it optional for userspace.
> >
> > We only need the exclusive mode when the device is operating in secure
> > mode, correct? That suggests we’ll need to dynamically toggle this
> > setting based on the device’s security state.
>
> No, if the decision is that VFIO should allow this to be controlled by
> userspace then userspace will tell iommufd to run in regions_exclusive
> mode prior to opening the vfio cdev and VFIO will still do it once at
> open time and never change it.
>
> The only thing request_regions does is block other drivers outside
> vfio from using this memory space. There is no reason at all to change
> this dynamically. A CC VMM using VFIO will never use a driver outside
> VFIO to touch the VFIO controlled memory.
>
> Jason
Xu Yilun <yilun.xu@linux.intel.com> writes:
> Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
> expected to be called by userspace when CoCo VM issues TDI bind/unbind
> command to VMM. Specifically for TDX Connect, these commands are some
> secure Hypervisor call named GHCI (Guest-Hypervisor Communication
> Interface).
>
> The TSM TDI bind/unbind operations are expected to be initiated by a
> running CoCo VM, which already have the legacy assigned device in place.
> The TSM bind operation is to request VMM make all secure configurations
> to support device work as a TDI, and then issue TDISP messages to move
> the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
>
> Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
> device to TDISP ERROR state.
>
Any reason these need to be a vfio ioctl instead of iommufd ioctl?
For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> ---
> drivers/vfio/iommufd.c | 22 ++++++++++
> drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
> include/linux/vfio.h | 7 +++
> include/linux/vfio_pci_core.h | 1 +
> include/uapi/linux/vfio.h | 42 ++++++++++++++++++
> 5 files changed, 146 insertions(+)
>
> diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> index 3441d24538a8..33fd20ffaeee 100644
> --- a/drivers/vfio/iommufd.c
> +++ b/drivers/vfio/iommufd.c
> @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
> vdev->iommufd_attached = false;
> }
> EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
> +
> +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
> +{
> + lockdep_assert_held(&vdev->dev_set->lock);
> +
> + if (WARN_ON(!vdev->iommufd_device))
> + return -EINVAL;
> +
> + return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
> +
> +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
> +{
> + lockdep_assert_held(&vdev->dev_set->lock);
> +
> + if (WARN_ON(!vdev->iommufd_device))
> + return;
> +
> + iommufd_device_tsm_unbind(vdev->iommufd_device);
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 116964057b0b..92544e54c9c3 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
> #if IS_ENABLED(CONFIG_EEH)
> eeh_dev_release(vdev->pdev);
> #endif
> +
> + if (vdev->is_tsm_bound) {
> + vfio_iommufd_tsm_unbind(&vdev->vdev);
> + pci_release_regions(vdev->pdev);
> + vdev->is_tsm_bound = false;
> + }
> +
> vfio_pci_core_disable(vdev);
>
> vfio_pci_dma_buf_cleanup(vdev);
> @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> ioeventfd.fd);
> }
>
> +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
> + void __user *arg)
> +{
> + unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
> + struct vfio_pci_tsm_bind tsm_bind;
> + struct pci_dev *pdev = vdev->pdev;
> + int ret;
> +
> + if (copy_from_user(&tsm_bind, arg, minsz))
> + return -EFAULT;
> +
> + if (tsm_bind.argsz < minsz || tsm_bind.flags)
> + return -EINVAL;
> +
> + mutex_lock(&vdev->vdev.dev_set->lock);
> +
> + /* To ensure no host side MMIO access is possible */
> + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> + if (ret)
> + goto out_unlock;
>
This should be part of pci_tsm_bind() ?
> +
> + ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id);
> + if (ret)
> + goto out_release_region;
> +
> + vdev->is_tsm_bound = true;
> + mutex_unlock(&vdev->vdev.dev_set->lock);
> +
> + return 0;
> +
> +out_release_region:
> + pci_release_regions(pdev);
> +out_unlock:
> + mutex_unlock(&vdev->vdev.dev_set->lock);
> + return ret;
> +}
> +
> +static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev,
> + void __user *arg)
> +{
> + unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags);
> + struct vfio_pci_tsm_unbind tsm_unbind;
> + struct pci_dev *pdev = vdev->pdev;
> +
> + if (copy_from_user(&tsm_unbind, arg, minsz))
> + return -EFAULT;
> +
> + if (tsm_unbind.argsz < minsz || tsm_unbind.flags)
> + return -EINVAL;
> +
> + mutex_lock(&vdev->vdev.dev_set->lock);
> +
> + if (!vdev->is_tsm_bound)
> + return 0;
> +
> + vfio_iommufd_tsm_unbind(&vdev->vdev);
> + pci_release_regions(pdev);
> + vdev->is_tsm_bound = false;
> + mutex_unlock(&vdev->vdev.dev_set->lock);
> +
> + return 0;
> +}
> +
-aneesh
On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote:
> Xu Yilun <yilun.xu@linux.intel.com> writes:
>
> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
> > expected to be called by userspace when CoCo VM issues TDI bind/unbind
> > command to VMM. Specifically for TDX Connect, these commands are some
> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication
> > Interface).
> >
> > The TSM TDI bind/unbind operations are expected to be initiated by a
> > running CoCo VM, which already have the legacy assigned device in place.
> > The TSM bind operation is to request VMM make all secure configurations
> > to support device work as a TDI, and then issue TDISP messages to move
> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
> >
> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
> > device to TDISP ERROR state.
> >
>
> Any reason these need to be a vfio ioctl instead of iommufd ioctl?
> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/
A general reason is that the device driver - VFIO - should be aware of the
bound state, since some operations break the bound state. VFIO should also
know that some operations while bound may crash the kernel because of the
platform TSM firmware's enforcement, e.g. zapping MMIO, because private MMIO
mappings in secure page tables cannot be unmapped before TDI STOP [1].
Specifically, for TDX Connect, the firmware enforces MMIO unmapping in
S-EPT would fail if TDI is bound. For AMD there seems also some
requirement about this but I need Alexey's confirmation.
[1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/
>
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Wu Hao <hao.wu@intel.com>
> > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> > ---
> > drivers/vfio/iommufd.c | 22 ++++++++++
> > drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
> > include/linux/vfio.h | 7 +++
> > include/linux/vfio_pci_core.h | 1 +
> > include/uapi/linux/vfio.h | 42 ++++++++++++++++++
> > 5 files changed, 146 insertions(+)
> >
> > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> > index 3441d24538a8..33fd20ffaeee 100644
> > --- a/drivers/vfio/iommufd.c
> > +++ b/drivers/vfio/iommufd.c
> > @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
> > vdev->iommufd_attached = false;
> > }
> > EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
> > +
> > +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
> > +{
> > + lockdep_assert_held(&vdev->dev_set->lock);
> > +
> > + if (WARN_ON(!vdev->iommufd_device))
> > + return -EINVAL;
> > +
> > + return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
> > +
> > +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
> > +{
> > + lockdep_assert_held(&vdev->dev_set->lock);
> > +
> > + if (WARN_ON(!vdev->iommufd_device))
> > + return;
> > +
> > + iommufd_device_tsm_unbind(vdev->iommufd_device);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 116964057b0b..92544e54c9c3 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
> > #if IS_ENABLED(CONFIG_EEH)
> > eeh_dev_release(vdev->pdev);
> > #endif
> > +
> > + if (vdev->is_tsm_bound) {
> > + vfio_iommufd_tsm_unbind(&vdev->vdev);
> > + pci_release_regions(vdev->pdev);
> > + vdev->is_tsm_bound = false;
> > + }
> > +
> > vfio_pci_core_disable(vdev);
> >
> > vfio_pci_dma_buf_cleanup(vdev);
> > @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> > ioeventfd.fd);
> > }
> >
> > +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
> > + void __user *arg)
> > +{
> > + unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
> > + struct vfio_pci_tsm_bind tsm_bind;
> > + struct pci_dev *pdev = vdev->pdev;
> > + int ret;
> > +
> > + if (copy_from_user(&tsm_bind, arg, minsz))
> > + return -EFAULT;
> > +
> > + if (tsm_bind.argsz < minsz || tsm_bind.flags)
> > + return -EINVAL;
> > +
> > + mutex_lock(&vdev->vdev.dev_set->lock);
> > +
> > + /* To ensure no host side MMIO access is possible */
> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> > + if (ret)
> > + goto out_unlock;
> >
>
> This should be part of pci_tsm_bind() ?
I'm not quite sure. My feeling is this method is specific to the VFIO
driver. Many other drivers just request regions on probe(); they can
never bind successfully if PCI TSM hides this implementation internally.
Thanks,
Yilun
Xu Yilun <yilun.xu@linux.intel.com> writes: > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind >> > command to VMM. Specifically for TDX Connect, these commands are some >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication >> > Interface). >> > >> > The TSM TDI bind/unbind operations are expected to be initiated by a >> > running CoCo VM, which already have the legacy assigned device in place. >> > The TSM bind operation is to request VMM make all secure configurations >> > to support device work as a TDI, and then issue TDISP messages to move >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. >> > >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead >> > device to TDISP ERROR state. >> > >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > > A general reason is, the device driver - VFIO should be aware of the > bound state, and some operations break the bound state. VFIO should also > know some operations on bound may crash kernel because of platform TSM > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping > in secure page tables cannot be unmapped before TDI STOP [1]. > > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in > S-EPT would fail if TDI is bound. For AMD there seems also some > requirement about this but I need Alexey's confirmation. > > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > According to the TDISP specification (Section 11.2.6), clearing either the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause the TDI to transition to an error state. 
To handle this gracefully, it seems necessary to unbind the TDI before modifying the BME or MSE bits. If I understand correctly, we also need to unmap the Stage-2 mapping due to the issue described in commit abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional reasons we would want to unmap the Stage-2 mapping for the BAR (as done in vfio_pci_zap_and_down_write_memory_lock)? Additionally, with TDX, it appears that before unmapping the Stage-2 mapping for the BAR, we should first unbind the TDI (ie, move it to the "unlock" state?) Is this step related to Section 11.2.6 of the TDISP spec, or is it driven by a different requirement? -aneesh
On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: > Xu Yilun <yilun.xu@linux.intel.com> writes: > > > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: > >> Xu Yilun <yilun.xu@linux.intel.com> writes: > >> > >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind > >> > command to VMM. Specifically for TDX Connect, these commands are some > >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > >> > Interface). > >> > > >> > The TSM TDI bind/unbind operations are expected to be initiated by a > >> > running CoCo VM, which already have the legacy assigned device in place. > >> > The TSM bind operation is to request VMM make all secure configurations > >> > to support device work as a TDI, and then issue TDISP messages to move > >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > >> > > >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > >> > device to TDISP ERROR state. > >> > > >> > >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? > >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > > > > A general reason is, the device driver - VFIO should be aware of the > > bound state, and some operations break the bound state. VFIO should also > > know some operations on bound may crash kernel because of platform TSM > > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping > > in secure page tables cannot be unmapped before TDI STOP [1]. > > > > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in > > S-EPT would fail if TDI is bound. For AMD there seems also some > > requirement about this but I need Alexey's confirmation. 
> > > > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > > > According to the TDISP specification (Section 11.2.6), clearing either > the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause > the TDI to transition to an error state. To handle this gracefully, it > seems necessary to unbind the TDI before modifying the BME or MSE bits. Yes. But now the suggestion is never let VFIO do unbind, instead VFIO should block these operations when device is bound. > > If I understand correctly, we also need to unmap the Stage-2 mapping due > to the issue described in commit > abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional > reasons we would want to unmap the Stage-2 mapping for the BAR (as done > in vfio_pci_zap_and_down_write_memory_lock)? I think no more reason. > > Additionally, with TDX, it appears that before unmapping the Stage-2 > mapping for the BAR, we should first unbind the TDI (ie, move it to the > "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, > or is it driven by a different requirement? No, this is not device side TDISP requirement. It is host side requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals unmap IOMMU S2 PT. If we allow IOMMU S2 PT unmapped when TDI is running, host could fool guest by just unmap some PT entry and suppress the fault event. Guest thought a DMA writting is successful but it is not and may cause data integrity issue. This is not a TDX specific problem, but different vendors has different mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For AMD, will trigger some HW protection called "ASID fence" [1]. Not sure how ARM handles this? https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ Thanks, Yilun > > -aneesh
Xu Yilun <yilun.xu@linux.intel.com> writes: > On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: >> >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are >> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind >> >> > command to VMM. Specifically for TDX Connect, these commands are some >> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication >> >> > Interface). >> >> > >> >> > The TSM TDI bind/unbind operations are expected to be initiated by a >> >> > running CoCo VM, which already have the legacy assigned device in place. >> >> > The TSM bind operation is to request VMM make all secure configurations >> >> > to support device work as a TDI, and then issue TDISP messages to move >> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. >> >> > >> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead >> >> > device to TDISP ERROR state. >> >> > >> >> >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? >> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ >> > >> > A general reason is, the device driver - VFIO should be aware of the >> > bound state, and some operations break the bound state. VFIO should also >> > know some operations on bound may crash kernel because of platform TSM >> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping >> > in secure page tables cannot be unmapped before TDI STOP [1]. >> > >> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in >> > S-EPT would fail if TDI is bound. For AMD there seems also some >> > requirement about this but I need Alexey's confirmation. 
>> > >> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ >> > >> >> According to the TDISP specification (Section 11.2.6), clearing either >> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause >> the TDI to transition to an error state. To handle this gracefully, it >> seems necessary to unbind the TDI before modifying the BME or MSE bits. > > Yes. But now the suggestion is never let VFIO do unbind, instead VFIO > should block these operations when device is bound. > >> >> If I understand correctly, we also need to unmap the Stage-2 mapping due >> to the issue described in commit >> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional >> reasons we would want to unmap the Stage-2 mapping for the BAR (as done >> in vfio_pci_zap_and_down_write_memory_lock)? > > I think no more reason. > >> >> Additionally, with TDX, it appears that before unmapping the Stage-2 >> mapping for the BAR, we should first unbind the TDI (ie, move it to the >> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, >> or is it driven by a different requirement? > > No, this is not device side TDISP requirement. It is host side > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > unmap IOMMU S2 PT. > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > guest by just unmap some PT entry and suppress the fault event. Guest > thought a DMA writting is successful but it is not and may cause > data integrity issue. > I am still trying to find more details here. How did the guest conclude DMA writing is successful? Guest would timeout waiting for DMA to complete if the host hides the interrupt delivery of failed DMA transfer? > > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. 
For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? > > https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > Thanks, > Yilun > -aneesh
On Mon, Jun 16, 2025 at 01:46:42PM +0530, Aneesh Kumar K.V wrote: > Xu Yilun <yilun.xu@linux.intel.com> writes: > > > On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: > >> Xu Yilun <yilun.xu@linux.intel.com> writes: > >> > >> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: > >> >> Xu Yilun <yilun.xu@linux.intel.com> writes: > >> >> > >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > >> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind > >> >> > command to VMM. Specifically for TDX Connect, these commands are some > >> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > >> >> > Interface). > >> >> > > >> >> > The TSM TDI bind/unbind operations are expected to be initiated by a > >> >> > running CoCo VM, which already have the legacy assigned device in place. > >> >> > The TSM bind operation is to request VMM make all secure configurations > >> >> > to support device work as a TDI, and then issue TDISP messages to move > >> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > >> >> > > >> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > >> >> > device to TDISP ERROR state. > >> >> > > >> >> > >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? > >> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > >> > > >> > A general reason is, the device driver - VFIO should be aware of the > >> > bound state, and some operations break the bound state. VFIO should also > >> > know some operations on bound may crash kernel because of platform TSM > >> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping > >> > in secure page tables cannot be unmapped before TDI STOP [1]. > >> > > >> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in > >> > S-EPT would fail if TDI is bound. 
For AMD there seems also some > >> > requirement about this but I need Alexey's confirmation. > >> > > >> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > >> > > >> > >> According to the TDISP specification (Section 11.2.6), clearing either > >> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause > >> the TDI to transition to an error state. To handle this gracefully, it > >> seems necessary to unbind the TDI before modifying the BME or MSE bits. > > > > Yes. But now the suggestion is never let VFIO do unbind, instead VFIO > > should block these operations when device is bound. > > > >> > >> If I understand correctly, we also need to unmap the Stage-2 mapping due > >> to the issue described in commit > >> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional > >> reasons we would want to unmap the Stage-2 mapping for the BAR (as done > >> in vfio_pci_zap_and_down_write_memory_lock)? > > > > I think no more reason. > > > >> > >> Additionally, with TDX, it appears that before unmapping the Stage-2 > >> mapping for the BAR, we should first unbind the TDI (ie, move it to the > >> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, > >> or is it driven by a different requirement? > > > > No, this is not device side TDISP requirement. It is host side > > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > > unmap IOMMU S2 PT. > > > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > > guest by just unmap some PT entry and suppress the fault event. Guest > > thought a DMA writting is successful but it is not and may cause > > data integrity issue. > > > > I am still trying to find more details here. How did the guest conclude > DMA writing is successful? Traditionally VMM is the trusted entity. If there is no IOMMU fault reported, guest assumes DMA writing is successful. 
> Guest would timeout waiting for DMA to complete There is no *generic* mechanism to detect or wait for a single DMA write completion. They are "posted" in terms of PCIe. Thanks, Yilun > if the host hides the interrupt delivery of failed DMA transfer? > > > > > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? > > https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > Thanks, > Yilun > > > > -aneesh
Xu Yilun <yilun.xu@linux.intel.com> writes: > On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: >> >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are >> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind >> >> > command to VMM. Specifically for TDX Connect, these commands are some >> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication >> >> > Interface). >> >> > >> >> > The TSM TDI bind/unbind operations are expected to be initiated by a >> >> > running CoCo VM, which already have the legacy assigned device in place. >> >> > The TSM bind operation is to request VMM make all secure configurations >> >> > to support device work as a TDI, and then issue TDISP messages to move >> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. >> >> > >> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead >> >> > device to TDISP ERROR state. >> >> > >> >> >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? >> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ >> > >> > A general reason is, the device driver - VFIO should be aware of the >> > bound state, and some operations break the bound state. VFIO should also >> > know some operations on bound may crash kernel because of platform TSM >> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping >> > in secure page tables cannot be unmapped before TDI STOP [1]. >> > >> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in >> > S-EPT would fail if TDI is bound. For AMD there seems also some >> > requirement about this but I need Alexey's confirmation. 
>> > >> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ >> > >> >> According to the TDISP specification (Section 11.2.6), clearing either >> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause >> the TDI to transition to an error state. To handle this gracefully, it >> seems necessary to unbind the TDI before modifying the BME or MSE bits. > > Yes. But now the suggestion is never let VFIO do unbind, instead VFIO > should block these operations when device is bound. > >> >> If I understand correctly, we also need to unmap the Stage-2 mapping due >> to the issue described in commit >> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional >> reasons we would want to unmap the Stage-2 mapping for the BAR (as done >> in vfio_pci_zap_and_down_write_memory_lock)? > > I think no more reason. > >> >> Additionally, with TDX, it appears that before unmapping the Stage-2 >> mapping for the BAR, we should first unbind the TDI (ie, move it to the >> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, >> or is it driven by a different requirement? > > No, this is not device side TDISP requirement. It is host side > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > unmap IOMMU S2 PT. > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > guest by just unmap some PT entry and suppress the fault event. Guest > thought a DMA writting is successful but it is not and may cause > data integrity issue. > > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? 
> > https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > MMIO/BAR Unmapping: If the stage-2 mapping is removed while the device is in a locked state—a scenario that ARM permits—the granule transitions to the RIPAS_DESTROYED and HIPAS_UNASSIGNED states. Any MMIO or CPU access to such a granule will trigger a non-emulatable data abort, which is forwarded to the non-secure hypervisor (e.g., KVM). However, at this point, the system cannot make further progress. The unmapping was initiated by the host without coordination from the guest, leaving the granule in a broken state. A more robust workflow would involve the guest first transitioning the granule to RIPAS_EMPTY, followed by the host unmapping the stage-2 entry. IOMMU Page Table Unmap: Both the CPU and the SMMU can share the stage-2 page table. If the non-secure host unmaps an entry from this shared page table, the affected granule again transitions to RIPAS_DESTROYED and HIPAS_UNASSIGNED. In this case, a DMA transaction—(SMMU is configured by the Realm Management Monitor,RMM)—can be terminated. This typically results in an event being recorded in the event queue which can be read by RMM. However, interrupt delivery remains under non-secure host control, and the guest may not be immediately aware that the DMA transaction was terminated. I am currently confirming this behavior with the design team and will follow up once I have clarity. -aneesh
On Thu, Jun 05, 2025 at 05:41:17PM +0800, Xu Yilun wrote: > No, this is not device side TDISP requirement. It is host side > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > unmap IOMMU S2 PT. > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > guest by just unmap some PT entry and suppress the fault event. Guest > thought a DMA writting is successful but it is not and may cause > data integrity issue. So, TDX prevents *any* unmap, even of normal memory, from the S2 while a guest is running? Seems extreme? MMIO isn't special, if you have a rule like that for such a security reason it should cover all of the S2. > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? This seems even more extreme, if the guest gets a bad DMA address into the device then the entire device gets killed? No chance to debug it? Jason
On Thu, Jun 05, 2025 at 12:09:16PM -0300, Jason Gunthorpe wrote: > On Thu, Jun 05, 2025 at 05:41:17PM +0800, Xu Yilun wrote: > > > No, this is not device side TDISP requirement. It is host side > > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > > unmap IOMMU S2 PT. > > > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > > guest by just unmap some PT entry and suppress the fault event. Guest > > thought a DMA writting is successful but it is not and may cause > > data integrity issue. > > So, TDX prevents *any* unmap, even of normal memory, from the S2 while > a guest is running? Seems extreme? Prevents any unmap *not intended* by guest, even for normal memory. Guest could show its unmapping intention by issuing a "page release" firmware call, then host is OK to unmap. This is for normal memory. For MMIO, guest implicitly shows the intention by unbinding the TDI first. > > MMIO isn't special, if you have a rule like that for such a security > reason it should cover all of the S2. It does. Thanks, Yilun > > > This is not a TDX specific problem, but different vendors has different > > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > > how ARM handles this? > > This seems even more extreme, if the guest gets a bad DMA address into > the device then the entire device gets killed? No chance to debug it? > > Jason >
© 2016 - 2025 Red Hat, Inc.