Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
expected to be called by userspace when CoCo VM issues TDI bind/unbind
command to VMM. Specifically for TDX Connect, these commands are some
secure Hypervisor call named GHCI (Guest-Hypervisor Communication
Interface).
The TSM TDI bind/unbind operations are expected to be initiated by a
running CoCo VM, which already has the legacy assigned device in place.
The TSM bind operation is to request VMM make all secure configurations
to support device work as a TDI, and then issue TDISP messages to move
the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
Do TSM Unbind before vfio_pci_core_disable(); otherwise it will put the
device into the TDISP ERROR state.
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
drivers/vfio/iommufd.c | 22 ++++++++++
drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
include/linux/vfio.h | 7 +++
include/linux/vfio_pci_core.h | 1 +
include/uapi/linux/vfio.h | 42 ++++++++++++++++++
5 files changed, 146 insertions(+)
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 3441d24538a8..33fd20ffaeee 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
vdev->iommufd_attached = false;
}
EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
+
+int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
+{
+ lockdep_assert_held(&vdev->dev_set->lock);
+
+ if (WARN_ON(!vdev->iommufd_device))
+ return -EINVAL;
+
+ return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
+
+void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
+{
+ lockdep_assert_held(&vdev->dev_set->lock);
+
+ if (WARN_ON(!vdev->iommufd_device))
+ return;
+
+ iommufd_device_tsm_unbind(vdev->iommufd_device);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 116964057b0b..92544e54c9c3 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
#if IS_ENABLED(CONFIG_EEH)
eeh_dev_release(vdev->pdev);
#endif
+
+ if (vdev->is_tsm_bound) {
+ vfio_iommufd_tsm_unbind(&vdev->vdev);
+ pci_release_regions(vdev->pdev);
+ vdev->is_tsm_bound = false;
+ }
+
vfio_pci_core_disable(vdev);
vfio_pci_dma_buf_cleanup(vdev);
@@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
ioeventfd.fd);
}
+static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
+ void __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
+ struct vfio_pci_tsm_bind tsm_bind;
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+
+ if (copy_from_user(&tsm_bind, arg, minsz))
+ return -EFAULT;
+
+ if (tsm_bind.argsz < minsz || tsm_bind.flags)
+ return -EINVAL;
+
+ mutex_lock(&vdev->vdev.dev_set->lock);
+
+ /* To ensure no host side MMIO access is possible */
+ ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
+ if (ret)
+ goto out_unlock;
+
+ ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id);
+ if (ret)
+ goto out_release_region;
+
+ vdev->is_tsm_bound = true;
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+
+ return 0;
+
+out_release_region:
+ pci_release_regions(pdev);
+out_unlock:
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+ return ret;
+}
+
+static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev,
+ void __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags);
+ struct vfio_pci_tsm_unbind tsm_unbind;
+ struct pci_dev *pdev = vdev->pdev;
+
+ if (copy_from_user(&tsm_unbind, arg, minsz))
+ return -EFAULT;
+
+ if (tsm_unbind.argsz < minsz || tsm_unbind.flags)
+ return -EINVAL;
+
+ mutex_lock(&vdev->vdev.dev_set->lock);
+
+ if (!vdev->is_tsm_bound)
+ return 0;
+
+ vfio_iommufd_tsm_unbind(&vdev->vdev);
+ pci_release_regions(pdev);
+ vdev->is_tsm_bound = false;
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+
+ return 0;
+}
+
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg)
{
@@ -1471,6 +1541,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
return vfio_pci_ioctl_reset(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
return vfio_pci_ioctl_set_irqs(vdev, uarg);
+ case VFIO_DEVICE_TSM_BIND:
+ return vfio_pci_ioctl_tsm_bind(vdev, uarg);
+ case VFIO_DEVICE_TSM_UNBIND:
+ return vfio_pci_ioctl_tsm_unbind(vdev, uarg);
default:
return -ENOTTY;
}
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index d521d2c01a92..747b94bb9758 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -70,6 +70,7 @@ struct vfio_device {
struct iommufd_device *iommufd_device;
struct ida pasids;
u8 iommufd_attached:1;
+ u8 iommufd_tsm_bound:1;
#endif
u8 cdev_opened:1;
#ifdef CONFIG_DEBUG_FS
@@ -155,6 +156,8 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev);
+int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id);
+void vfio_iommufd_tsm_unbind(struct vfio_device *vdev);
#else
static inline struct iommufd_ctx *
vfio_iommufd_device_ictx(struct vfio_device *vdev)
@@ -190,6 +193,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx)
((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
#define vfio_iommufd_emulated_detach_ioas \
((void (*)(struct vfio_device *vdev)) NULL)
+#define vfio_iommufd_tsm_bind \
+ ((int (*)(struct vfio_device *vdev, u32 vdevice_id)) NULL)
+#define vfio_iommufd_tsm_unbind \
+ ((void (*)(struct vfio_device *vdev)) NULL)
#endif
static inline bool vfio_device_cdev_opened(struct vfio_device *device)
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index da5d8955ae56..b2982100221f 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -80,6 +80,7 @@ struct vfio_pci_core_device {
bool needs_pm_restore:1;
bool pm_intx_masked:1;
bool pm_runtime_engaged:1;
+ bool is_tsm_bound:1;
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9445fa36efd3..16bd93a5b427 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1493,6 +1493,48 @@ struct vfio_device_feature_dma_buf {
struct vfio_region_dma_range dma_ranges[];
};
+/*
+ * Upon VFIO_DEVICE_TSM_BIND, put the device in TSM Bind state.
+ *
+ * @argsz: User filled size of this data.
+ * @flags: Must be 0.
+ * @vdevice_id: Input the target id which can represent a vdevice allocated
+ * via iommufd subsystem.
+ *
+ * The vdevice holds all virtualization information needed for TSM Bind.
+ * TSM Bind means host finishes all host side trusted configurations to build
+ * a TEE Device Interface (TDI), then put the TDI in TDISP CONFIG_LOCKED or RUN
+ * state, waiting for guest's attestation. IOMMUFD finds all virtualization
+ * information from vdevice_id, and executes the TSM Bind. VFIO should be aware
+ * some operations (e.g. reset, toggle MSE, private MMIO access) to physical
+ * device impacts TSM Bind, so never do them or do them only after TSM Unbind.
+ * This IOCTL is only allowed on cdev fds.
+ */
+struct vfio_pci_tsm_bind {
+ __u32 argsz;
+ __u32 flags;
+ __u32 vdevice_id;
+ __u32 pad;
+};
+
+#define VFIO_DEVICE_TSM_BIND _IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * Upon VFIO_DEVICE_TSM_UNBIND, put the device in TSM Unbind state.
+ *
+ * @argsz: User filled size of this data.
+ * @flags: Must be 0.
+ *
+ * TSM Unbind means the host removes all trusted configurations and puts the
+ * TDI in the CONFIG_UNLOCKED TDISP state.
+ */
+struct vfio_pci_tsm_unbind {
+ __u32 argsz;
+ __u32 flags;
+};
+
+#define VFIO_DEVICE_TSM_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23)
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
--
2.25.1
Xu Yilun <yilun.xu@linux.intel.com> writes: > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > expected to be called by userspace when CoCo VM issues TDI bind/unbind > command to VMM. Specifically for TDX Connect, these commands are some > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > Interface). > > The TSM TDI bind/unbind operations are expected to be initiated by a > running CoCo VM, which already have the legacy assigned device in place. > The TSM bind operation is to request VMM make all secure configurations > to support device work as a TDI, and then issue TDISP messages to move > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > device to TDISP ERROR state. > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Wu Hao <hao.wu@intel.com> > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com> > .... > + > + /* To ensure no host side MMIO access is possible */ > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > + if (ret) > + goto out_unlock; > + > I am hitting failures here with similar changes. Can you share the Qemu changes needed to make this pci_request_regions_exclusive successful. Also after the TDI is unbound, we want the region ownership backto "vfio-pci" so that things continue to work as non-secure device. I don't see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in userspace which will result in vfio_unmap()/vfio_map(). But that doesn't release the region ownership. > + ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id); > + if (ret) > + goto out_release_region; > + > + vdev->is_tsm_bound = true; > + mutex_unlock(&vdev->vdev.dev_set->lock); > + > + return 0; > + > +out_release_region: > + pci_release_regions(pdev); > +out_unlock: > + mutex_unlock(&vdev->vdev.dev_set->lock); > + return ret; > +} -aneesh
On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > > + > > + /* To ensure no host side MMIO access is possible */ > > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > > + if (ret) > > + goto out_unlock; > > + > > > > I am hitting failures here with similar changes. Can you share the Qemu > changes needed to make this pci_request_regions_exclusive successful. > Also after the TDI is unbound, we want the region ownership backto > "vfio-pci" so that things continue to work as non-secure device. I don't > see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in > userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > release the region ownership. Again, IMHO, we should not be doing this dynamically. VFIO should do pci_request_regions_exclusive() once at the very start and it should stay that way. There is no reason to change it dynamically. The only decision to make is if all vfio should switch to exclusive mode or if we need to make it optional for userspace. Jason
Jason Gunthorpe <jgg@nvidia.com> writes: > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > >> > + >> > + /* To ensure no host side MMIO access is possible */ >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); >> > + if (ret) >> > + goto out_unlock; >> > + >> > >> >> I am hitting failures here with similar changes. Can you share the Qemu >> changes needed to make this pci_request_regions_exclusive successful. >> Also after the TDI is unbound, we want the region ownership backto >> "vfio-pci" so that things continue to work as non-secure device. I don't >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't >> release the region ownership. > > Again, IMHO, we should not be doing this dynamically. VFIO should do > pci_request_regions_exclusive() once at the very start and it should > stay that way. > > There is no reason to change it dynamically. > > The only decision to make is if all vfio should switch to exclusive > mode or if we need to make it optional for userspace. We only need the exclusive mode when the device is operating in secure mode, correct? That suggests we’ll need to dynamically toggle this setting based on the device’s security state. -aneesh
On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote: > Jason Gunthorpe <jgg@nvidia.com> writes: > > > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote: > > > >> > + > >> > + /* To ensure no host side MMIO access is possible */ > >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm"); > >> > + if (ret) > >> > + goto out_unlock; > >> > + > >> > > >> > >> I am hitting failures here with similar changes. Can you share the Qemu > >> changes needed to make this pci_request_regions_exclusive successful. > >> Also after the TDI is unbound, we want the region ownership backto > >> "vfio-pci" so that things continue to work as non-secure device. I don't > >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in > >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't > >> release the region ownership. > > > > Again, IMHO, we should not be doing this dynamically. VFIO should do > > pci_request_regions_exclusive() once at the very start and it should > > stay that way. > > > > There is no reason to change it dynamically. > > > > The only decision to make is if all vfio should switch to exclusive > > mode or if we need to make it optional for userspace. > > We only need the exclusive mode when the device is operating in secure > mode, correct? That suggests we’ll need to dynamically toggle this > setting based on the device’s security state. No, if the decision is that VFIO should allow this to be controlled by userspace then userspace will tell iommufd to run in regions_exclusive mode prior to opening the vfio cdev and VFIO will still do it once at open time and never change it. The only thing request_regions does is block other drivers outside vfio from using this memory space. There is no reason at all to change this dynamically. A CC VMM using VFIO will never use a driver outside VFIO to touch the VFIO controlled memory. Jason
Jason Gunthorpe <jgg@nvidia.com> writes:
> On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
>> Jason Gunthorpe <jgg@nvidia.com> writes:
>>
>> > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
>> >
>> >> > +
>> >> > + /* To ensure no host side MMIO access is possible */
>> >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
>> >> > + if (ret)
>> >> > + goto out_unlock;
>> >> > +
>> >> >
>> >>
>> >> I am hitting failures here with similar changes. Can you share the Qemu
>> >> changes needed to make this pci_request_regions_exclusive successful.
>> >> Also after the TDI is unbound, we want the region ownership backto
>> >> "vfio-pci" so that things continue to work as non-secure device. I don't
>> >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
>> >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
>> >> release the region ownership.
>> >
>> > Again, IMHO, we should not be doing this dynamically. VFIO should do
>> > pci_request_regions_exclusive() once at the very start and it should
>> > stay that way.
>> >
>> > There is no reason to change it dynamically.
>> >
>> > The only decision to make is if all vfio should switch to exclusive
>> > mode or if we need to make it optional for userspace.
>>
>> We only need the exclusive mode when the device is operating in secure
>> mode, correct? That suggests we’ll need to dynamically toggle this
>> setting based on the device’s security state.
>
> No, if the decision is that VFIO should allow this to be controlled by
> userspace then userspace will tell iommufd to run in regions_exclusive
> mode prior to opening the vfio cdev and VFIO will still do it once at
> open time and never change it.
>
So this will be handled by setting
vdevice::flags = IOMMUFD_PCI_REGION_EXCLUSIVE in
iommufd_vdevice_alloc_ioctl()? And we set this flag when starting a
secure guest, regardless of whether the device is TEE-capable or not
and vfio_pci_core_mmap() will do
if (!vdev->barmap[index]) {
if (core_vdev->iommufd_device &&
iommufd_vdevice_region_exclusive(core_vdev->iommufd_device))
ret = pci_request_selected_regions_exclusive(pdev,
1 << index, "vfio-pci");
else
ret = pci_request_selected_regions(pdev,
1 << index, "vfio-pci");
>
> The only thing request_regions does is block other drivers outside
> vfio from using this memory space. There is no reason at all to change
> this dynamically. A CC VMM using VFIO will never use a driver outside
> VFIO to touch the VFIO controlled memory.
>
> Jason
-aneesh
On Fri, Jun 06, 2025 at 03:02:49PM +0530, Aneesh Kumar K.V wrote:
> Jason Gunthorpe <jgg@nvidia.com> writes:
>
> > On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
> >> Jason Gunthorpe <jgg@nvidia.com> writes:
> >>
> >> > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
> >> >
> >> >> > +
> >> >> > + /* To ensure no host side MMIO access is possible */
> >> >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> >> >> > + if (ret)
> >> >> > + goto out_unlock;
> >> >> > +
> >> >> >
> >> >>
> >> >> I am hitting failures here with similar changes. Can you share the Qemu
> >> >> changes needed to make this pci_request_regions_exclusive successful.
> >> >> Also after the TDI is unbound, we want the region ownership backto
> >> >> "vfio-pci" so that things continue to work as non-secure device. I don't
> >> >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> >> >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> >> >> release the region ownership.
> >> >
> >> > Again, IMHO, we should not be doing this dynamically. VFIO should do
> >> > pci_request_regions_exclusive() once at the very start and it should
> >> > stay that way.
> >> >
> >> > There is no reason to change it dynamically.
> >> >
> >> > The only decision to make is if all vfio should switch to exclusive
> >> > mode or if we need to make it optional for userspace.
> >>
> >> We only need the exclusive mode when the device is operating in secure
> >> mode, correct? That suggests we’ll need to dynamically toggle this
> >> setting based on the device’s security state.
> >
> > No, if the decision is that VFIO should allow this to be controlled by
> > userspace then userspace will tell iommufd to run in regions_exclusive
> > mode prior to opening the vfio cdev and VFIO will still do it once at
> > open time and never change it.
>
> So this will be handled by setting
> vdevice::flags = IOMMUFD_PCI_REGION_EXCLUSIVE in
Not like that. I would suggest a global vfio sysfs or module parameter, or
maybe an iommufd ictx global option:
IOMMU_OPTION(IOMMU_OPTION_OP_SET, IOMMU_OPTION_EXCLUSIVE_RANGES)
You want something simple here, not tied to vdevice or very dynamic.
The use cases for non-exclusive ranges are very narrow, IMHO
> and vfio_pci_core_mmap() will do
>
> if (!vdev->barmap[index]) {
>
> if (core_vdev->iommufd_device &&
> iommufd_vdevice_region_exclusive(core_vdev->iommufd_device))
> ret = pci_request_selected_regions_exclusive(pdev,
> 1 << index, "vfio-pci");
> else
> ret = pci_request_selected_regions(pdev,
> 1 << index, "vfio-pci");
And IMHO, these should be moved to probe time or at least FD open
time, not at mmap time...
Jason
On Thu, Jun 05, 2025 at 01:33:39PM -0300, Jason Gunthorpe wrote:
> On Thu, Jun 05, 2025 at 09:47:01PM +0530, Aneesh Kumar K.V wrote:
> > Jason Gunthorpe <jgg@nvidia.com> writes:
> >
> > > On Thu, Jun 05, 2025 at 05:33:52PM +0530, Aneesh Kumar K.V wrote:
> > >
> > >> > +
> > >> > + /* To ensure no host side MMIO access is possible */
> > >> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> > >> > + if (ret)
> > >> > + goto out_unlock;
> > >> > +
> > >> >
> > >>
> > >> I am hitting failures here with similar changes. Can you share the Qemu
> > >> changes needed to make this pci_request_regions_exclusive successful.
Jason has described the suggested static lockdown flow and we could
try that. I just want to help pinpoint your immediate failure.
Maybe QEMU still has the MMIO region mmapped.
int vfio_pci_core_mmap()
{
...
if (!vdev->barmap[index]) {
ret = pci_request_selected_regions(pdev,
1 << index, "vfio-pci");
...
}
Even for static lockdown, userspace should not mmap the MMIOs anymore.
Thanks,
Yilun
> > >> Also after the TDI is unbound, we want the region ownership backto
> > >> "vfio-pci" so that things continue to work as non-secure device. I don't
> > >> see we doing that. I could add a pci_bar_deactivate/pci_bar_activate in
> > >> userspace which will result in vfio_unmap()/vfio_map(). But that doesn't
> > >> release the region ownership.
> > >
> > > Again, IMHO, we should not be doing this dynamically. VFIO should do
> > > pci_request_regions_exclusive() once at the very start and it should
> > > stay that way.
> > >
> > > There is no reason to change it dynamically.
> > >
> > > The only decision to make is if all vfio should switch to exclusive
> > > mode or if we need to make it optional for userspace.
> >
> > We only need the exclusive mode when the device is operating in secure
> > mode, correct? That suggests we’ll need to dynamically toggle this
> > setting based on the device’s security state.
>
> No, if the decision is that VFIO should allow this to be controlled by
> userspace then userspace will tell iommufd to run in regions_exclusive
> mode prior to opening the vfio cdev and VFIO will still do it once at
> open time and never change it.
>
> The only thing request_regions does is block other drivers outside
> vfio from using this memory space. There is no reason at all to change
> this dynamically. A CC VMM using VFIO will never use a driver outside
> VFIO to touch the VFIO controlled memory.
>
> Jason
Xu Yilun <yilun.xu@linux.intel.com> writes:
> Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
> expected to be called by userspace when CoCo VM issues TDI bind/unbind
> command to VMM. Specifically for TDX Connect, these commands are some
> secure Hypervisor call named GHCI (Guest-Hypervisor Communication
> Interface).
>
> The TSM TDI bind/unbind operations are expected to be initiated by a
> running CoCo VM, which already have the legacy assigned device in place.
> The TSM bind operation is to request VMM make all secure configurations
> to support device work as a TDI, and then issue TDISP messages to move
> the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
>
> Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
> device to TDISP ERROR state.
>
Any reason these need to be a vfio ioctl instead of iommufd ioctl?
For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> ---
> drivers/vfio/iommufd.c | 22 ++++++++++
> drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
> include/linux/vfio.h | 7 +++
> include/linux/vfio_pci_core.h | 1 +
> include/uapi/linux/vfio.h | 42 ++++++++++++++++++
> 5 files changed, 146 insertions(+)
>
> diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> index 3441d24538a8..33fd20ffaeee 100644
> --- a/drivers/vfio/iommufd.c
> +++ b/drivers/vfio/iommufd.c
> @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
> vdev->iommufd_attached = false;
> }
> EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
> +
> +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
> +{
> + lockdep_assert_held(&vdev->dev_set->lock);
> +
> + if (WARN_ON(!vdev->iommufd_device))
> + return -EINVAL;
> +
> + return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
> +
> +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
> +{
> + lockdep_assert_held(&vdev->dev_set->lock);
> +
> + if (WARN_ON(!vdev->iommufd_device))
> + return;
> +
> + iommufd_device_tsm_unbind(vdev->iommufd_device);
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 116964057b0b..92544e54c9c3 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
> #if IS_ENABLED(CONFIG_EEH)
> eeh_dev_release(vdev->pdev);
> #endif
> +
> + if (vdev->is_tsm_bound) {
> + vfio_iommufd_tsm_unbind(&vdev->vdev);
> + pci_release_regions(vdev->pdev);
> + vdev->is_tsm_bound = false;
> + }
> +
> vfio_pci_core_disable(vdev);
>
> vfio_pci_dma_buf_cleanup(vdev);
> @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> ioeventfd.fd);
> }
>
> +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
> + void __user *arg)
> +{
> + unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
> + struct vfio_pci_tsm_bind tsm_bind;
> + struct pci_dev *pdev = vdev->pdev;
> + int ret;
> +
> + if (copy_from_user(&tsm_bind, arg, minsz))
> + return -EFAULT;
> +
> + if (tsm_bind.argsz < minsz || tsm_bind.flags)
> + return -EINVAL;
> +
> + mutex_lock(&vdev->vdev.dev_set->lock);
> +
> + /* To ensure no host side MMIO access is possible */
> + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> + if (ret)
> + goto out_unlock;
>
This should be part of pci_tsm_bind() ?
> +
> + ret = vfio_iommufd_tsm_bind(&vdev->vdev, tsm_bind.vdevice_id);
> + if (ret)
> + goto out_release_region;
> +
> + vdev->is_tsm_bound = true;
> + mutex_unlock(&vdev->vdev.dev_set->lock);
> +
> + return 0;
> +
> +out_release_region:
> + pci_release_regions(pdev);
> +out_unlock:
> + mutex_unlock(&vdev->vdev.dev_set->lock);
> + return ret;
> +}
> +
> +static int vfio_pci_ioctl_tsm_unbind(struct vfio_pci_core_device *vdev,
> + void __user *arg)
> +{
> + unsigned long minsz = offsetofend(struct vfio_pci_tsm_unbind, flags);
> + struct vfio_pci_tsm_unbind tsm_unbind;
> + struct pci_dev *pdev = vdev->pdev;
> +
> + if (copy_from_user(&tsm_unbind, arg, minsz))
> + return -EFAULT;
> +
> + if (tsm_unbind.argsz < minsz || tsm_unbind.flags)
> + return -EINVAL;
> +
> + mutex_lock(&vdev->vdev.dev_set->lock);
> +
> + if (!vdev->is_tsm_bound)
> + return 0;
> +
> + vfio_iommufd_tsm_unbind(&vdev->vdev);
> + pci_release_regions(pdev);
> + vdev->is_tsm_bound = false;
> + mutex_unlock(&vdev->vdev.dev_set->lock);
> +
> + return 0;
> +}
> +
-aneesh
On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote:
> Xu Yilun <yilun.xu@linux.intel.com> writes:
>
> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are
> > expected to be called by userspace when CoCo VM issues TDI bind/unbind
> > command to VMM. Specifically for TDX Connect, these commands are some
> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication
> > Interface).
> >
> > The TSM TDI bind/unbind operations are expected to be initiated by a
> > running CoCo VM, which already have the legacy assigned device in place.
> > The TSM bind operation is to request VMM make all secure configurations
> > to support device work as a TDI, and then issue TDISP messages to move
> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation.
> >
> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead
> > device to TDISP ERROR state.
> >
>
> Any reason these need to be a vfio ioctl instead of iommufd ioctl?
> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/
A general reason is that the device driver - VFIO - should be aware of the
bound state, since some operations break the bound state. VFIO should also
know that some operations while bound may crash the kernel because of the
platform TSM firmware's enforcement, e.g. zapping MMIO, because private MMIO
mappings in secure page tables cannot be unmapped before TDI STOP [1].
Specifically, for TDX Connect, the firmware enforces MMIO unmapping in
S-EPT would fail if TDI is bound. For AMD there seems also some
requirement about this but I need Alexey's confirmation.
[1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/
>
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Wu Hao <hao.wu@intel.com>
> > Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> > ---
> > drivers/vfio/iommufd.c | 22 ++++++++++
> > drivers/vfio/pci/vfio_pci_core.c | 74 ++++++++++++++++++++++++++++++++
> > include/linux/vfio.h | 7 +++
> > include/linux/vfio_pci_core.h | 1 +
> > include/uapi/linux/vfio.h | 42 ++++++++++++++++++
> > 5 files changed, 146 insertions(+)
> >
> > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> > index 3441d24538a8..33fd20ffaeee 100644
> > --- a/drivers/vfio/iommufd.c
> > +++ b/drivers/vfio/iommufd.c
> > @@ -297,3 +297,25 @@ void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
> > vdev->iommufd_attached = false;
> > }
> > EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
> > +
> > +int vfio_iommufd_tsm_bind(struct vfio_device *vdev, u32 vdevice_id)
> > +{
> > + lockdep_assert_held(&vdev->dev_set->lock);
> > +
> > + if (WARN_ON(!vdev->iommufd_device))
> > + return -EINVAL;
> > +
> > + return iommufd_device_tsm_bind(vdev->iommufd_device, vdevice_id);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_bind);
> > +
> > +void vfio_iommufd_tsm_unbind(struct vfio_device *vdev)
> > +{
> > + lockdep_assert_held(&vdev->dev_set->lock);
> > +
> > + if (WARN_ON(!vdev->iommufd_device))
> > + return;
> > +
> > + iommufd_device_tsm_unbind(vdev->iommufd_device);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_iommufd_tsm_unbind);
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 116964057b0b..92544e54c9c3 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -692,6 +692,13 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
> > #if IS_ENABLED(CONFIG_EEH)
> > eeh_dev_release(vdev->pdev);
> > #endif
> > +
> > + if (vdev->is_tsm_bound) {
> > + vfio_iommufd_tsm_unbind(&vdev->vdev);
> > + pci_release_regions(vdev->pdev);
> > + vdev->is_tsm_bound = false;
> > + }
> > +
> > vfio_pci_core_disable(vdev);
> >
> > vfio_pci_dma_buf_cleanup(vdev);
> > @@ -1447,6 +1454,69 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> > ioeventfd.fd);
> > }
> >
> > +static int vfio_pci_ioctl_tsm_bind(struct vfio_pci_core_device *vdev,
> > + void __user *arg)
> > +{
> > + unsigned long minsz = offsetofend(struct vfio_pci_tsm_bind, vdevice_id);
> > + struct vfio_pci_tsm_bind tsm_bind;
> > + struct pci_dev *pdev = vdev->pdev;
> > + int ret;
> > +
> > + if (copy_from_user(&tsm_bind, arg, minsz))
> > + return -EFAULT;
> > +
> > + if (tsm_bind.argsz < minsz || tsm_bind.flags)
> > + return -EINVAL;
> > +
> > + mutex_lock(&vdev->vdev.dev_set->lock);
> > +
> > + /* To ensure no host side MMIO access is possible */
> > + ret = pci_request_regions_exclusive(pdev, "vfio-pci-tsm");
> > + if (ret)
> > + goto out_unlock;
> >
>
> This should be part of pci_tsm_bind() ?
I'm not quite sure. My feeling is this method is specific to the VFIO
driver. Many other drivers just request regions on probe(); they can
never bind successfully if PCI TSM hides this implementation internally.
Thanks,
Yilun
Xu Yilun <yilun.xu@linux.intel.com> writes: > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind >> > command to VMM. Specifically for TDX Connect, these commands are some >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication >> > Interface). >> > >> > The TSM TDI bind/unbind operations are expected to be initiated by a >> > running CoCo VM, which already have the legacy assigned device in place. >> > The TSM bind operation is to request VMM make all secure configurations >> > to support device work as a TDI, and then issue TDISP messages to move >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. >> > >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead >> > device to TDISP ERROR state. >> > >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > > A general reason is, the device driver - VFIO should be aware of the > bound state, and some operations break the bound state. VFIO should also > know some operations on bound may crash kernel because of platform TSM > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping > in secure page tables cannot be unmapped before TDI STOP [1]. > > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in > S-EPT would fail if TDI is bound. For AMD there seems also some > requirement about this but I need Alexey's confirmation. > > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > According to the TDISP specification (Section 11.2.6), clearing either the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause the TDI to transition to an error state. 
To handle this gracefully, it seems necessary to unbind the TDI before modifying the BME or MSE bits. If I understand correctly, we also need to unmap the Stage-2 mapping due to the issue described in commit abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional reasons we would want to unmap the Stage-2 mapping for the BAR (as done in vfio_pci_zap_and_down_write_memory_lock)? Additionally, with TDX, it appears that before unmapping the Stage-2 mapping for the BAR, we should first unbind the TDI (ie, move it to the "unlock" state?) Is this step related to Section 11.2.6 of the TDISP spec, or is it driven by a different requirement? -aneesh
On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: > Xu Yilun <yilun.xu@linux.intel.com> writes: > > > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: > >> Xu Yilun <yilun.xu@linux.intel.com> writes: > >> > >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind > >> > command to VMM. Specifically for TDX Connect, these commands are some > >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > >> > Interface). > >> > > >> > The TSM TDI bind/unbind operations are expected to be initiated by a > >> > running CoCo VM, which already have the legacy assigned device in place. > >> > The TSM bind operation is to request VMM make all secure configurations > >> > to support device work as a TDI, and then issue TDISP messages to move > >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > >> > > >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > >> > device to TDISP ERROR state. > >> > > >> > >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? > >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > > > > A general reason is, the device driver - VFIO should be aware of the > > bound state, and some operations break the bound state. VFIO should also > > know some operations on bound may crash kernel because of platform TSM > > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping > > in secure page tables cannot be unmapped before TDI STOP [1]. > > > > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in > > S-EPT would fail if TDI is bound. For AMD there seems also some > > requirement about this but I need Alexey's confirmation. 
> > > > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > > > According to the TDISP specification (Section 11.2.6), clearing either > the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause > the TDI to transition to an error state. To handle this gracefully, it > seems necessary to unbind the TDI before modifying the BME or MSE bits. Yes. But now the suggestion is never let VFIO do unbind, instead VFIO should block these operations when device is bound. > > If I understand correctly, we also need to unmap the Stage-2 mapping due > to the issue described in commit > abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional > reasons we would want to unmap the Stage-2 mapping for the BAR (as done > in vfio_pci_zap_and_down_write_memory_lock)? I think no more reason. > > Additionally, with TDX, it appears that before unmapping the Stage-2 > mapping for the BAR, we should first unbind the TDI (ie, move it to the > "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, > or is it driven by a different requirement? No, this is not device side TDISP requirement. It is host side requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals unmap IOMMU S2 PT. If we allow IOMMU S2 PT unmapped when TDI is running, host could fool guest by just unmap some PT entry and suppress the fault event. Guest thought a DMA writting is successful but it is not and may cause data integrity issue. This is not a TDX specific problem, but different vendors has different mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For AMD, will trigger some HW protection called "ASID fence" [1]. Not sure how ARM handles this? https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ Thanks, Yilun > > -aneesh
Xu Yilun <yilun.xu@linux.intel.com> writes: > On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: >> >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are >> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind >> >> > command to VMM. Specifically for TDX Connect, these commands are some >> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication >> >> > Interface). >> >> > >> >> > The TSM TDI bind/unbind operations are expected to be initiated by a >> >> > running CoCo VM, which already have the legacy assigned device in place. >> >> > The TSM bind operation is to request VMM make all secure configurations >> >> > to support device work as a TDI, and then issue TDISP messages to move >> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. >> >> > >> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead >> >> > device to TDISP ERROR state. >> >> > >> >> >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? >> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ >> > >> > A general reason is, the device driver - VFIO should be aware of the >> > bound state, and some operations break the bound state. VFIO should also >> > know some operations on bound may crash kernel because of platform TSM >> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping >> > in secure page tables cannot be unmapped before TDI STOP [1]. >> > >> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in >> > S-EPT would fail if TDI is bound. For AMD there seems also some >> > requirement about this but I need Alexey's confirmation. 
>> > >> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ >> > >> >> According to the TDISP specification (Section 11.2.6), clearing either >> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause >> the TDI to transition to an error state. To handle this gracefully, it >> seems necessary to unbind the TDI before modifying the BME or MSE bits. > > Yes. But now the suggestion is never let VFIO do unbind, instead VFIO > should block these operations when device is bound. > >> >> If I understand correctly, we also need to unmap the Stage-2 mapping due >> to the issue described in commit >> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional >> reasons we would want to unmap the Stage-2 mapping for the BAR (as done >> in vfio_pci_zap_and_down_write_memory_lock)? > > I think no more reason. > >> >> Additionally, with TDX, it appears that before unmapping the Stage-2 >> mapping for the BAR, we should first unbind the TDI (ie, move it to the >> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, >> or is it driven by a different requirement? > > No, this is not device side TDISP requirement. It is host side > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > unmap IOMMU S2 PT. > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > guest by just unmap some PT entry and suppress the fault event. Guest > thought a DMA writting is successful but it is not and may cause > data integrity issue. > I am still trying to find more details here. How did the guest conclude DMA writing is successful? Guest would timeout waiting for DMA to complete if the host hides the interrupt delivery of failed DMA transfer? > > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. 
For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? > > https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > Thanks, > Yilun > -aneesh
On Mon, Jun 16, 2025 at 01:46:42PM +0530, Aneesh Kumar K.V wrote: > Xu Yilun <yilun.xu@linux.intel.com> writes: > > > On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: > >> Xu Yilun <yilun.xu@linux.intel.com> writes: > >> > >> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: > >> >> Xu Yilun <yilun.xu@linux.intel.com> writes: > >> >> > >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are > >> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind > >> >> > command to VMM. Specifically for TDX Connect, these commands are some > >> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication > >> >> > Interface). > >> >> > > >> >> > The TSM TDI bind/unbind operations are expected to be initiated by a > >> >> > running CoCo VM, which already have the legacy assigned device in place. > >> >> > The TSM bind operation is to request VMM make all secure configurations > >> >> > to support device work as a TDI, and then issue TDISP messages to move > >> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. > >> >> > > >> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead > >> >> > device to TDISP ERROR state. > >> >> > > >> >> > >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? > >> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ > >> > > >> > A general reason is, the device driver - VFIO should be aware of the > >> > bound state, and some operations break the bound state. VFIO should also > >> > know some operations on bound may crash kernel because of platform TSM > >> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping > >> > in secure page tables cannot be unmapped before TDI STOP [1]. > >> > > >> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in > >> > S-EPT would fail if TDI is bound. 
For AMD there seems also some > >> > requirement about this but I need Alexey's confirmation. > >> > > >> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > >> > > >> > >> According to the TDISP specification (Section 11.2.6), clearing either > >> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause > >> the TDI to transition to an error state. To handle this gracefully, it > >> seems necessary to unbind the TDI before modifying the BME or MSE bits. > > > > Yes. But now the suggestion is never let VFIO do unbind, instead VFIO > > should block these operations when device is bound. > > > >> > >> If I understand correctly, we also need to unmap the Stage-2 mapping due > >> to the issue described in commit > >> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional > >> reasons we would want to unmap the Stage-2 mapping for the BAR (as done > >> in vfio_pci_zap_and_down_write_memory_lock)? > > > > I think no more reason. > > > >> > >> Additionally, with TDX, it appears that before unmapping the Stage-2 > >> mapping for the BAR, we should first unbind the TDI (ie, move it to the > >> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, > >> or is it driven by a different requirement? > > > > No, this is not device side TDISP requirement. It is host side > > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > > unmap IOMMU S2 PT. > > > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > > guest by just unmap some PT entry and suppress the fault event. Guest > > thought a DMA writting is successful but it is not and may cause > > data integrity issue. > > > > I am still trying to find more details here. How did the guest conclude > DMA writing is successful? Traditionally VMM is the trusted entity. If there is no IOMMU fault reported, guest assumes DMA writing is successful. 
> Guest would timeout waiting for DMA to complete There is no *generic* mechanism to detect or wait for a single DMA write completion. They are "posted" in terms of PCIe. Thanks, Yilun > if the host hides the interrupt delivery of failed DMA transfer? > > > > > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? > > https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > > Thanks, > Yilun > > > > -aneesh
Xu Yilun <yilun.xu@linux.intel.com> writes: > On Wed, Jun 04, 2025 at 07:07:18PM +0530, Aneesh Kumar K.V wrote: >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> > On Sun, Jun 01, 2025 at 04:15:32PM +0530, Aneesh Kumar K.V wrote: >> >> Xu Yilun <yilun.xu@linux.intel.com> writes: >> >> >> >> > Add new IOCTLs to do TSM based TDI bind/unbind. These IOCTLs are >> >> > expected to be called by userspace when CoCo VM issues TDI bind/unbind >> >> > command to VMM. Specifically for TDX Connect, these commands are some >> >> > secure Hypervisor call named GHCI (Guest-Hypervisor Communication >> >> > Interface). >> >> > >> >> > The TSM TDI bind/unbind operations are expected to be initiated by a >> >> > running CoCo VM, which already have the legacy assigned device in place. >> >> > The TSM bind operation is to request VMM make all secure configurations >> >> > to support device work as a TDI, and then issue TDISP messages to move >> >> > the TDI to CONFIG_LOCKED or RUN state, waiting for guest's attestation. >> >> > >> >> > Do TSM Unbind before vfio_pci_core_disable(), otherwise will lead >> >> > device to TDISP ERROR state. >> >> > >> >> >> >> Any reason these need to be a vfio ioctl instead of iommufd ioctl? >> >> For ex: https://lore.kernel.org/all/20250529133757.462088-3-aneesh.kumar@kernel.org/ >> > >> > A general reason is, the device driver - VFIO should be aware of the >> > bound state, and some operations break the bound state. VFIO should also >> > know some operations on bound may crash kernel because of platform TSM >> > firmware's enforcement. E.g. zapping MMIO, because private MMIO mapping >> > in secure page tables cannot be unmapped before TDI STOP [1]. >> > >> > Specifically, for TDX Connect, the firmware enforces MMIO unmapping in >> > S-EPT would fail if TDI is bound. For AMD there seems also some >> > requirement about this but I need Alexey's confirmation. 
>> > >> > [1] https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ >> > >> >> According to the TDISP specification (Section 11.2.6), clearing either >> the Bus Master Enable (BME) or Memory Space Enable (MSE) bits will cause >> the TDI to transition to an error state. To handle this gracefully, it >> seems necessary to unbind the TDI before modifying the BME or MSE bits. > > Yes. But now the suggestion is never let VFIO do unbind, instead VFIO > should block these operations when device is bound. > >> >> If I understand correctly, we also need to unmap the Stage-2 mapping due >> to the issue described in commit >> abafbc551fddede3e0a08dee1dcde08fc0eb8476. Are there any additional >> reasons we would want to unmap the Stage-2 mapping for the BAR (as done >> in vfio_pci_zap_and_down_write_memory_lock)? > > I think no more reason. > >> >> Additionally, with TDX, it appears that before unmapping the Stage-2 >> mapping for the BAR, we should first unbind the TDI (ie, move it to the >> "unlock" state?) Is this step related Section 11.2.6 of the TDISP spec, >> or is it driven by a different requirement? > > No, this is not device side TDISP requirement. It is host side > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > unmap IOMMU S2 PT. > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > guest by just unmap some PT entry and suppress the fault event. Guest > thought a DMA writting is successful but it is not and may cause > data integrity issue. > > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? 
> > https://lore.kernel.org/all/aDnXxk46kwrOcl0i@yilunxu-OptiPlex-7050/ > MMIO/BAR Unmapping: If the stage-2 mapping is removed while the device is in a locked state—a scenario that ARM permits—the granule transitions to the RIPAS_DESTROYED and HIPAS_UNASSIGNED states. Any MMIO or CPU access to such a granule will trigger a non-emulatable data abort, which is forwarded to the non-secure hypervisor (e.g., KVM). However, at this point, the system cannot make further progress. The unmapping was initiated by the host without coordination from the guest, leaving the granule in a broken state. A more robust workflow would involve the guest first transitioning the granule to RIPAS_EMPTY, followed by the host unmapping the stage-2 entry. IOMMU Page Table Unmap: Both the CPU and the SMMU can share the stage-2 page table. If the non-secure host unmaps an entry from this shared page table, the affected granule again transitions to RIPAS_DESTROYED and HIPAS_UNASSIGNED. In this case, a DMA transaction—(SMMU is configured by the Realm Management Monitor,RMM)—can be terminated. This typically results in an event being recorded in the event queue which can be read by RMM. However, interrupt delivery remains under non-secure host control, and the guest may not be immediately aware that the DMA transaction was terminated. I am currently confirming this behavior with the design team and will follow up once I have clarity. -aneesh
On Thu, Jun 05, 2025 at 05:41:17PM +0800, Xu Yilun wrote: > No, this is not device side TDISP requirement. It is host side > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > unmap IOMMU S2 PT. > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > guest by just unmap some PT entry and suppress the fault event. Guest > thought a DMA writting is successful but it is not and may cause > data integrity issue. So, TDX prevents *any* unmap, even of normal memory, from the S2 while a guest is running? Seems extreme? MMIO isn't special, if you have a rule like that for such a security reason it should cover all of the S2. > This is not a TDX specific problem, but different vendors has different > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > how ARM handles this? This seems even more extreme, if the guest gets a bad DMA address into the device then the entire device gets killed? No chance to debug it? Jason
On Thu, Jun 05, 2025 at 12:09:16PM -0300, Jason Gunthorpe wrote: > On Thu, Jun 05, 2025 at 05:41:17PM +0800, Xu Yilun wrote: > > > No, this is not device side TDISP requirement. It is host side > > requirement to fix DMA silent drop issue. TDX enforces CPU S2 PT share > > with IOMMU S2 PT (does ARM do the same?), so unmap CPU S2 PT in KVM equals > > unmap IOMMU S2 PT. > > > > If we allow IOMMU S2 PT unmapped when TDI is running, host could fool > > guest by just unmap some PT entry and suppress the fault event. Guest > > thought a DMA writting is successful but it is not and may cause > > data integrity issue. > > So, TDX prevents *any* unmap, even of normal memory, from the S2 while > a guest is running? Seems extreme? Prevents any unmap *not intended* by guest, even for normal memory. Guest could show its unmapping intention by issuing a "page release" firmware call, then host is OK to unmap. This is for normal memory. For MMIO, guest implicitly shows the intention by unbinding the TDI first. > > MMIO isn't special, if you have a rule like that for such a security > reason it should cover all of the S2. It does. Thanks, Yilun > > > This is not a TDX specific problem, but different vendors has different > > mechanisms for this. For TDX, firmware fails the MMIO unmap for S2. For > > AMD, will trigger some HW protection called "ASID fence" [1]. Not sure > > how ARM handles this? > > This seems even more extreme, if the guest gets a bad DMA address into > the device then the entire device gets killed? No chance to debug it? > > Jason >
© 2016 - 2025 Red Hat, Inc.