[v4] vfio/xe: Add driver variant for Xe VF migration

[PATCH v4 28/28] vfio/xe: Add device specific vfio_pci driver variant for Intel graphics

Posted by Michał Winiarski 3 months ago

In addition to generic VFIO PCI functionality, the driver implements
VFIO migration uAPI, allowing userspace to enable migration for Intel
Graphics SR-IOV Virtual Functions.
The driver binds to VF device, and uses API exposed by Xe driver bound
to PF device to control VF device state and transfer the migration data.

Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 MAINTAINERS                  |   7 +
 drivers/vfio/pci/Kconfig     |   2 +
 drivers/vfio/pci/Makefile    |   2 +
 drivers/vfio/pci/xe/Kconfig  |  12 +
 drivers/vfio/pci/xe/Makefile |   3 +
 drivers/vfio/pci/xe/main.c   | 556 +++++++++++++++++++++++++++++++++++
 6 files changed, 582 insertions(+)
 create mode 100644 drivers/vfio/pci/xe/Kconfig
 create mode 100644 drivers/vfio/pci/xe/Makefile
 create mode 100644 drivers/vfio/pci/xe/main.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9e941f983e27e..5558707d47d4f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27008,6 +27008,13 @@ L:	virtualization@lists.linux.dev
 S:	Maintained
 F:	drivers/vfio/pci/virtio
 
+VFIO XE PCI DRIVER
+M:	Michał Winiarski <michal.winiarski@intel.com>
+L:	kvm@vger.kernel.org
+L:	intel-xe@lists.freedesktop.org
+S:	Supported
+F:	drivers/vfio/pci/xe
+
 VGA_SWITCHEROO
 R:	Lukas Wunner <lukas@wunner.de>
 S:	Maintained
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f546652..c100f0ab87f2d 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
 
 source "drivers/vfio/pci/qat/Kconfig"
 
+source "drivers/vfio/pci/xe/Kconfig"
+
 endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c8..f5d46aa9347b9 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -19,3 +19,5 @@ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
 obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
 
 obj-$(CONFIG_QAT_VFIO_PCI) += qat/
+
+obj-$(CONFIG_XE_VFIO_PCI) += xe/
diff --git a/drivers/vfio/pci/xe/Kconfig b/drivers/vfio/pci/xe/Kconfig
new file mode 100644
index 0000000000000..787be88268685
--- /dev/null
+++ b/drivers/vfio/pci/xe/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config XE_VFIO_PCI
+	tristate "VFIO support for Intel Graphics"
+	depends on DRM_XE
+	select VFIO_PCI_CORE
+	help
+	  This option enables vendor-specific VFIO driver for Intel Graphics.
+	  In addition to generic VFIO PCI functionality, it implements VFIO
+	  migration uAPI allowing userspace to enable migration for
+	  Intel Graphics SR-IOV Virtual Functions supported by the Xe driver.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/xe/Makefile b/drivers/vfio/pci/xe/Makefile
new file mode 100644
index 0000000000000..13aa0fd192cd4
--- /dev/null
+++ b/drivers/vfio/pci/xe/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o
+xe-vfio-pci-y := main.o
diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c
new file mode 100644
index 0000000000000..1caa64163be9f
--- /dev/null
+++ b/drivers/vfio/pci/xe/main.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include <drm/intel/xe_sriov_vfio.h>
+#include <drm/intel/pciids.h>
+
+struct xe_vfio_pci_migration_file {
+	struct file *filp;
+	/* serializes accesses to migration data */
+	struct mutex lock;
+	struct xe_vfio_pci_core_device *xe_vdev;
+};
+
+struct xe_vfio_pci_core_device {
+	struct vfio_pci_core_device core_device;
+	struct xe_device *xe;
+	/* VF number used by PF, Xe HW/FW components use vfid indexing starting from 1 */
+	unsigned int vfid;
+	u8 migrate_cap:1;
+	u8 deferred_reset:1;
+	/* protects migration state */
+	struct mutex state_mutex;
+	enum vfio_device_mig_state mig_state;
+	/* protects the reset_done flow */
+	spinlock_t reset_lock;
+	struct xe_vfio_pci_migration_file *migf;
+};
+
+#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev)
+
+static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf)
+{
+	struct xe_vfio_pci_core_device *xe_vdev = migf->xe_vdev;
+
+	mutex_lock(&migf->lock);
+	xe_vdev->migf = NULL;
+	mutex_unlock(&migf->lock);
+}
+
+static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev)
+{
+	if (xe_vdev->migf)
+		xe_vfio_pci_disable_file(xe_vdev->migf);
+
+	xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+}
+
+static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev)
+{
+	mutex_lock(&xe_vdev->state_mutex);
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev)
+{
+again:
+	spin_lock(&xe_vdev->reset_lock);
+	if (xe_vdev->deferred_reset) {
+		xe_vdev->deferred_reset = false;
+		spin_unlock(&xe_vdev->reset_lock);
+		xe_vfio_pci_reset(xe_vdev);
+		goto again;
+	}
+	mutex_unlock(&xe_vdev->state_mutex);
+	spin_unlock(&xe_vdev->reset_lock);
+}
+
+static void xe_vfio_pci_reset_done(struct pci_dev *pdev)
+{
+	struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
+	int ret;
+
+	if (!xe_vdev->vfid)
+		return;
+
+	/*
+	 * VF FLR requires additional processing done by PF driver.
+	 * The processing is done after FLR is already finished from PCIe
+	 * perspective.
+	 * In order to avoid a scenario where VF is used while PF processing
+	 * is still in progress, additional synchronization point is needed.
+	 */
+	ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid);
+	if (ret)
+		dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);
+
+	if (!xe_vdev->migrate_cap)
+		return;
+
+	/*
+	 * As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock we need to prevent ABBA deadlock
+	 * with the state_mutex and mm_lock.
+	 * In case the state_mutex was taken already we defer the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&xe_vdev->reset_lock);
+	xe_vdev->deferred_reset = true;
+	if (!mutex_trylock(&xe_vdev->state_mutex)) {
+		spin_unlock(&xe_vdev->reset_lock);
+		return;
+	}
+	spin_unlock(&xe_vdev->reset_lock);
+	xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+	xe_vfio_pci_reset(xe_vdev);
+}
+
+static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
+	.reset_done = xe_vfio_pci_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static int xe_vfio_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct xe_vfio_pci_core_device *xe_vdev =
+		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+	struct vfio_pci_core_device *vdev = &xe_vdev->core_device;
+	int ret;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	vfio_pci_core_finish_enable(vdev);
+
+	return 0;
+}
+
+static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp)
+{
+	struct xe_vfio_pci_migration_file *migf = filp->private_data;
+
+	xe_vfio_pci_disable_file(migf);
+	mutex_destroy(&migf->lock);
+	kfree(migf);
+
+	return 0;
+}
+
+static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
+{
+	struct xe_vfio_pci_migration_file *migf = filp->private_data;
+	ssize_t ret;
+
+	if (pos)
+		return -ESPIPE;
+
+	mutex_lock(&migf->lock);
+	ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
+	mutex_unlock(&migf->lock);
+
+	return ret;
+}
+
+static const struct file_operations xe_vfio_pci_save_fops = {
+	.owner = THIS_MODULE,
+	.read = xe_vfio_pci_save_read,
+	.release = xe_vfio_pci_release_file,
+	.llseek = noop_llseek,
+};
+
+static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf,
+					size_t len, loff_t *pos)
+{
+	struct xe_vfio_pci_migration_file *migf = filp->private_data;
+	ssize_t ret;
+
+	if (pos)
+		return -ESPIPE;
+
+	mutex_lock(&migf->lock);
+	ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
+	mutex_unlock(&migf->lock);
+
+	return ret;
+}
+
+static const struct file_operations xe_vfio_pci_resume_fops = {
+	.owner = THIS_MODULE,
+	.write = xe_vfio_pci_resume_write,
+	.release = xe_vfio_pci_release_file,
+	.llseek = noop_llseek,
+};
+
+static const char *vfio_dev_state_str(u32 state)
+{
+	switch (state) {
+	case VFIO_DEVICE_STATE_RUNNING: return "running";
+	case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p";
+	case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy";
+	case VFIO_DEVICE_STATE_STOP: return "stop";
+	case VFIO_DEVICE_STATE_RESUMING: return "resuming";
+	case VFIO_DEVICE_STATE_ERROR: return "error";
+	default: return "";
+	}
+}
+
+enum xe_vfio_pci_file_type {
+	XE_VFIO_FILE_SAVE = 0,
+	XE_VFIO_FILE_RESUME,
+};
+
+static struct xe_vfio_pci_migration_file *
+xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev,
+		       enum xe_vfio_pci_file_type type)
+{
+	struct xe_vfio_pci_migration_file *migf;
+	const struct file_operations *fops;
+	int flags;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops;
+	flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY;
+	migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags);
+	if (IS_ERR(migf->filp)) {
+		kfree(migf);
+		return ERR_CAST(migf->filp);
+	}
+
+	mutex_init(&migf->lock);
+	migf->xe_vdev = xe_vdev;
+	xe_vdev->migf = migf;
+
+	stream_open(migf->filp->f_inode, migf->filp);
+
+	return migf;
+}
+
+static struct file *
+xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new)
+{
+	u32 cur = xe_vdev->mig_state;
+	int ret;
+
+	dev_dbg(xe_vdev_to_dev(xe_vdev),
+		"state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new));
+
+	/*
+	 * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't
+	 * have the capability to selectively block outgoing p2p DMA transfers.
+	 * While the device is allowing BAR accesses when the VF is stopped, it
+	 * is not processing any new workload requests, effectively stopping
+	 * any outgoing DMA transfers (not just p2p).
+	 * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and
+	 * will be migrated to target VF during stop-copy.
+	 */
+	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+		ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid);
+		if (ret)
+			goto err;
+
+		return NULL;
+	}
+
+	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
+	    (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
+		return NULL;
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+		ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid);
+		if (ret)
+			goto err;
+
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		struct xe_vfio_pci_migration_file *migf;
+
+		migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
+		if (IS_ERR(migf)) {
+			ret = PTR_ERR(migf);
+			goto err;
+		}
+		get_file(migf->filp);
+
+		ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid);
+		if (ret) {
+			fput(migf->filp);
+			goto err;
+		}
+
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+		if (xe_vdev->migf) {
+			fput(xe_vdev->migf->filp);
+			xe_vfio_pci_disable_file(xe_vdev->migf);
+		}
+
+		ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid);
+		if (ret)
+			goto err;
+
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+		struct xe_vfio_pci_migration_file *migf;
+
+		migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME);
+		if (IS_ERR(migf)) {
+			ret = PTR_ERR(migf);
+			goto err;
+		}
+		get_file(migf->filp);
+
+		ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid);
+		if (ret) {
+			fput(migf->filp);
+			goto err;
+		}
+
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+		if (xe_vdev->migf) {
+			fput(xe_vdev->migf->filp);
+			xe_vfio_pci_disable_file(xe_vdev->migf);
+		}
+
+		ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid);
+		if (ret)
+			goto err;
+
+		return NULL;
+	}
+
+	WARN(true, "Unknown state transition %d->%d", cur, new);
+	return ERR_PTR(-EINVAL);
+
+err:
+	dev_dbg(xe_vdev_to_dev(xe_vdev),
+		"Failed to transition state: %s->%s err=%d\n",
+		vfio_dev_state_str(cur), vfio_dev_state_str(new), ret);
+	return ERR_PTR(ret);
+}
+
+static struct file *
+xe_vfio_pci_set_device_state(struct vfio_device *core_vdev,
+			     enum vfio_device_mig_state new_state)
+{
+	struct xe_vfio_pci_core_device *xe_vdev =
+		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+	enum vfio_device_mig_state next_state;
+	struct file *f = NULL;
+	int ret;
+
+	xe_vfio_pci_state_mutex_lock(xe_vdev);
+	while (new_state != xe_vdev->mig_state) {
+		ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state,
+					      new_state, &next_state);
+		if (ret) {
+			xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid);
+			f = ERR_PTR(ret);
+			break;
+		}
+		f = xe_vfio_set_state(xe_vdev, next_state);
+		if (IS_ERR(f))
+			break;
+
+		xe_vdev->mig_state = next_state;
+
+		/* Multiple state transitions with non-NULL file in the middle */
+		if (f && new_state != xe_vdev->mig_state) {
+			fput(f);
+			f = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+	return f;
+}
+
+static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev,
+					enum vfio_device_mig_state *curr_state)
+{
+	struct xe_vfio_pci_core_device *xe_vdev =
+		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+	xe_vfio_pci_state_mutex_lock(xe_vdev);
+	*curr_state = xe_vdev->mig_state;
+	xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+	return 0;
+}
+
+static int xe_vfio_pci_get_data_size(struct vfio_device *vdev,
+				     unsigned long *stop_copy_length)
+{
+	struct xe_vfio_pci_core_device *xe_vdev =
+		container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+	xe_vfio_pci_state_mutex_lock(xe_vdev);
+	*stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid);
+	xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+	return 0;
+}
+
+static const struct vfio_migration_ops xe_vfio_pci_migration_ops = {
+	.migration_set_state = xe_vfio_pci_set_device_state,
+	.migration_get_state = xe_vfio_pci_get_device_state,
+	.migration_get_data_size = xe_vfio_pci_get_data_size,
+};
+
+static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev)
+{
+	struct vfio_device *core_vdev = &xe_vdev->core_device.vdev;
+	struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
+	struct xe_device *xe = xe_sriov_vfio_get_pf(pdev);
+	int ret;
+
+	if (!xe)
+		return;
+	if (!xe_sriov_vfio_migration_supported(xe))
+		return;
+
+	ret = pci_iov_vf_id(pdev);
+	if (ret < 0)
+		return;
+
+	mutex_init(&xe_vdev->state_mutex);
+	spin_lock_init(&xe_vdev->reset_lock);
+
+	/* Xe HW/FW components use vfid indexing starting from 1 */
+	xe_vdev->vfid = ret + 1;
+	xe_vdev->xe = xe;
+	xe_vdev->migrate_cap = true;
+
+	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+	core_vdev->mig_ops = &xe_vfio_pci_migration_ops;
+}
+
+static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev)
+{
+	if (!xe_vdev->migrate_cap)
+		return;
+
+	mutex_destroy(&xe_vdev->state_mutex);
+}
+
+static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev)
+{
+	struct xe_vfio_pci_core_device *xe_vdev =
+		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+	xe_vfio_pci_migration_init(xe_vdev);
+
+	return vfio_pci_core_init_dev(core_vdev);
+}
+
+static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev)
+{
+	struct xe_vfio_pci_core_device *xe_vdev =
+		container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+	xe_vfio_pci_migration_fini(xe_vdev);
+}
+
+static const struct vfio_device_ops xe_vfio_pci_ops = {
+	.name = "xe-vfio-pci",
+	.init = xe_vfio_pci_init_dev,
+	.release = xe_vfio_pci_release_dev,
+	.open_device = xe_vfio_pci_open_device,
+	.close_device = vfio_pci_core_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read = vfio_pci_core_read,
+	.write = vfio_pci_core_write,
+	.mmap = vfio_pci_core_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
+	.bind_iommufd = vfio_iommufd_physical_bind,
+	.unbind_iommufd = vfio_iommufd_physical_unbind,
+	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct xe_vfio_pci_core_device *xe_vdev;
+	int ret;
+
+	xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev,
+				    &xe_vfio_pci_ops);
+	if (IS_ERR(xe_vdev))
+		return PTR_ERR(xe_vdev);
+
+	dev_set_drvdata(&pdev->dev, &xe_vdev->core_device);
+
+	ret = vfio_pci_core_register_device(&xe_vdev->core_device);
+	if (ret) {
+		vfio_put_device(&xe_vdev->core_device.vdev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void xe_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
+
+	vfio_pci_core_unregister_device(&xe_vdev->core_device);
+	vfio_put_device(&xe_vdev->core_device.vdev);
+}
+
+#define INTEL_PCI_VFIO_DEVICE(_id) { \
+	PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \
+}
+
+static const struct pci_device_id xe_vfio_pci_table[] = {
+	INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE),
+	INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE),
+	INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE),
+	{}
+};
+MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table);
+
+static struct pci_driver xe_vfio_pci_driver = {
+	.name = "xe-vfio-pci",
+	.id_table = xe_vfio_pci_table,
+	.probe = xe_vfio_pci_probe,
+	.remove = xe_vfio_pci_remove,
+	.err_handler = &xe_vfio_pci_err_handlers,
+	.driver_managed_dma = true,
+};
+module_pci_driver(xe_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>");
+MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics");
-- 
2.51.2

RE: [PATCH v4 28/28] vfio/xe: Add device specific vfio_pci driver variant for Intel graphics

Posted by Tian, Kevin 3 months ago

> From: Winiarski, Michal <michal.winiarski@intel.com>
> Sent: Wednesday, November 5, 2025 11:10 PM
> 
> In addition to generic VFIO PCI functionality, the driver implements
> VFIO migration uAPI, allowing userspace to enable migration for Intel
> Graphics SR-IOV Virtual Functions.
> The driver binds to VF device, and uses API exposed by Xe driver bound
> to PF device to control VF device state and transfer the migration data.

"The driver binds to VF device and uses API exposed by Xe driver to
transfer the VF migration data under the control of PF device."

> +config XE_VFIO_PCI
> +	tristate "VFIO support for Intel Graphics"
> +	depends on DRM_XE
> +	select VFIO_PCI_CORE
> +	help
> +	  This option enables vendor-specific VFIO driver for Intel Graphics.
> +	  In addition to generic VFIO PCI functionality, it implements VFIO
> +	  migration uAPI allowing userspace to enable migration for
> +	  Intel Graphics SR-IOV Virtual Functions supported by the Xe driver.

hmm another "vendor-specific"...

> +struct xe_vfio_pci_core_device {
> +	struct vfio_pci_core_device core_device;
> +	struct xe_device *xe;
> +	/* VF number used by PF, Xe HW/FW components use vfid indexing
> starting from 1 */

Having both PF and Xe HW/FW is a bit noising. could be:

/* PF internal control uses vfid index starting from 1 */

> +
> +static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device
> *xe_vdev)
> +{
> +	mutex_lock(&xe_vdev->state_mutex);
> +}
> +
> +/*
> + * This function is called in all state_mutex unlock cases to
> + * handle a 'deferred_reset' if exists.
> + */
> +static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device
> *xe_vdev)
> +{
> +again:
> +	spin_lock(&xe_vdev->reset_lock);
> +	if (xe_vdev->deferred_reset) {
> +		xe_vdev->deferred_reset = false;
> +		spin_unlock(&xe_vdev->reset_lock);
> +		xe_vfio_pci_reset(xe_vdev);
> +		goto again;
> +	}
> +	mutex_unlock(&xe_vdev->state_mutex);
> +	spin_unlock(&xe_vdev->reset_lock);
> +}

this deferred_reset logic is a mlx unique thing. See:

https://lore.kernel.org/kvm/20240220132459.GM13330@nvidia.com/

Re: [PATCH v4 28/28] vfio/xe: Add device specific vfio_pci driver variant for Intel graphics

Posted by Winiarski, Michal 3 months ago

On Thu, Nov 06, 2025 at 09:20:36AM +0100, Tian, Kevin wrote:
> > From: Winiarski, Michal <michal.winiarski@intel.com>
> > Sent: Wednesday, November 5, 2025 11:10 PM
> > 
> > In addition to generic VFIO PCI functionality, the driver implements
> > VFIO migration uAPI, allowing userspace to enable migration for Intel
> > Graphics SR-IOV Virtual Functions.
> > The driver binds to VF device, and uses API exposed by Xe driver bound
> > to PF device to control VF device state and transfer the migration data.
> 
> "The driver binds to VF device and uses API exposed by Xe driver to
> transfer the VF migration data under the control of PF device."

Ok.

> 
> > +config XE_VFIO_PCI
> > +	tristate "VFIO support for Intel Graphics"
> > +	depends on DRM_XE
> > +	select VFIO_PCI_CORE
> > +	help
> > +	  This option enables vendor-specific VFIO driver for Intel Graphics.
> > +	  In addition to generic VFIO PCI functionality, it implements VFIO
> > +	  migration uAPI allowing userspace to enable migration for
> > +	  Intel Graphics SR-IOV Virtual Functions supported by the Xe driver.
> 
> hmm another "vendor-specific"...

Ooops. I'll switch to "device specific driver variant" naming here as
well.

> 
> > +struct xe_vfio_pci_core_device {
> > +	struct vfio_pci_core_device core_device;
> > +	struct xe_device *xe;
> > +	/* VF number used by PF, Xe HW/FW components use vfid indexing
> > starting from 1 */
> 
> Having both PF and Xe HW/FW is a bit noising. could be:
> 
> /* PF internal control uses vfid index starting from 1 */

Ok.

> 
> > +
> > +static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device
> > *xe_vdev)
> > +{
> > +	mutex_lock(&xe_vdev->state_mutex);
> > +}
> > +
> > +/*
> > + * This function is called in all state_mutex unlock cases to
> > + * handle a 'deferred_reset' if exists.
> > + */
> > +static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device
> > *xe_vdev)
> > +{
> > +again:
> > +	spin_lock(&xe_vdev->reset_lock);
> > +	if (xe_vdev->deferred_reset) {
> > +		xe_vdev->deferred_reset = false;
> > +		spin_unlock(&xe_vdev->reset_lock);
> > +		xe_vfio_pci_reset(xe_vdev);
> > +		goto again;
> > +	}
> > +	mutex_unlock(&xe_vdev->state_mutex);
> > +	spin_unlock(&xe_vdev->reset_lock);
> > +}
> 
> this deferred_reset logic is a mlx unique thing. See:
> 
> https://lore.kernel.org/kvm/20240220132459.GM13330@nvidia.com/

Interesting, that doesn't match my observations.

[] ======================================================
[] WARNING: possible circular locking dependency detected
[] 6.18.0-rc3-xe+ #90 Tainted: G S   U
[] ------------------------------------------------------
[] qemu-system-x86/4375 is trying to acquire lock:
[] ff1100015af3ec30 (&migf->lock){+.+.}-{3:3}, at: xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
[]
   but task is already holding lock:
[] ff1100014c541a58 (&xe_vdev->state_mutex){+.+.}-{3:3}, at: xe_vfio_pci_set_device_state+0x43/0x440 [xe_vfio_pci]
[]
   which lock already depends on the new lock.

[]
   the existing dependency chain (in reverse order) is:
[]
   -> #3 (&xe_vdev->state_mutex){+.+.}-{3:3}:
[]        __mutex_lock+0xba/0x1110
[]        mutex_lock_nested+0x1b/0x30
[]        xe_vfio_pci_reset_done+0x2b/0xc0 [xe_vfio_pci]
[]        pci_try_reset_function+0xd7/0x150
[]        vfio_pci_core_ioctl+0x7f1/0xf20 [vfio_pci_core]
[]        vfio_device_fops_unl_ioctl+0x163/0x310 [vfio]
[]        __se_sys_ioctl+0x71/0xc0
[]        __x64_sys_ioctl+0x1d/0x30
[]        x64_sys_call+0x6ac/0xe50
[]        do_syscall_64+0xa1/0x560
[]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[]
   -> #2 (&vdev->memory_lock){++++}-{3:3}:
[]        down_read+0x41/0x180
[]        vfio_pci_mmap_huge_fault+0xec/0x310 [vfio_pci_core]
[]        handle_mm_fault+0x8aa/0x13b0
[]        fixup_user_fault+0x124/0x280
[]        vaddr_get_pfns+0x1d2/0x420 [vfio_iommu_type1]
[]        vfio_pin_pages_remote+0x173/0x610 [vfio_iommu_type1]
[]        vfio_pin_map_dma+0xfd/0x340 [vfio_iommu_type1]
[]        vfio_iommu_type1_ioctl+0xfdf/0x1290 [vfio_iommu_type1]
[]        vfio_fops_unl_ioctl+0x106/0x340 [vfio]
[]        __se_sys_ioctl+0x71/0xc0
[]        __x64_sys_ioctl+0x1d/0x30
[]        x64_sys_call+0x6ac/0xe50
[]        do_syscall_64+0xa1/0x560
[]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[]
   -> #1 (&mm->mmap_lock){++++}-{3:3}:
[]        __might_fault+0x56/0x90
[]        _copy_to_user+0x23/0x70
[]        xe_sriov_migration_data_read+0x17b/0x3f0 [xe]
[]        xe_sriov_vfio_data_read+0x44/0x60 [xe]
[]        xe_vfio_pci_save_read+0x55/0x80 [xe_vfio_pci]
[]        vfs_read+0xc0/0x300
[]        ksys_read+0x79/0xf0
[]        __x64_sys_read+0x1b/0x30
[]        x64_sys_call+0xcc4/0xe50
[]        do_syscall_64+0xa1/0x560
[]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[]
   -> #0 (&migf->lock){+.+.}-{3:3}:
[]        __lock_acquire+0x1aff/0x3450
[]        lock_acquire+0xde/0x280
[]        __mutex_lock+0xba/0x1110
[]        mutex_lock_nested+0x1b/0x30
[]        xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
[]        vfio_ioctl_device_feature_mig_device_state+0x9c/0x1b0 [vfio]
[]        vfio_device_fops_unl_ioctl+0x289/0x310 [vfio]
[]        __se_sys_ioctl+0x71/0xc0
[]        __x64_sys_ioctl+0x1d/0x30
[]        x64_sys_call+0x6ac/0xe50
[]        do_syscall_64+0xa1/0x560
[]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[]
   other info that might help us debug this:

[] Chain exists of:
     &migf->lock --> &vdev->memory_lock --> &xe_vdev->state_mutex

[]  Possible unsafe locking scenario:

[]        CPU0                    CPU1
[]        ----                    ----
[]   lock(&xe_vdev->state_mutex);
[]                                lock(&vdev->memory_lock);
[]                                lock(&xe_vdev->state_mutex);
[]   lock(&migf->lock);
[]
    *** DEADLOCK ***

[] 1 lock held by qemu-system-x86/4375:
[]  #0: ff1100014c541a58 (&xe_vdev->state_mutex){+.+.}-{3:3}, at: xe_vfio_pci_set_device_state+0x43/0x440 [xe_vfio_pci]
[]
   stack backtrace:
[] CPU: 18 UID: 0 PID: 4375 Comm: qemu-system-x86 Tainted: G S   U              6.18.0-rc3-xe+ #90 PREEMPT(voluntary)
[] Tainted: [S]=CPU_OUT_OF_SPEC, [U]=USER
[] Hardware name: Intel Corporation WHITLEY/WHITLEY, BIOS SE5C6200.86B.0027.P18.2206090856 06/09/2022
[] Call Trace:
[]  <TASK>
[]  __dump_stack+0x19/0x30
[]  dump_stack_lvl+0x66/0x90
[]  dump_stack+0x10/0x14
[]  print_circular_bug+0x2fd/0x310
[]  check_noncircular+0x139/0x160
[]  __lock_acquire+0x1aff/0x3450
[]  ? vprintk_emit+0x3ec/0x560
[]  ? dev_vprintk_emit+0x162/0x1c0
[]  lock_acquire+0xde/0x280
[]  ? xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
[]  ? xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
[]  __mutex_lock+0xba/0x1110
[]  ? xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
[]  mutex_lock_nested+0x1b/0x30
[]  xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
[]  vfio_ioctl_device_feature_mig_device_state+0x9c/0x1b0 [vfio]
[]  vfio_device_fops_unl_ioctl+0x289/0x310 [vfio]
[]  __se_sys_ioctl+0x71/0xc0
[]  ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
[]  __x64_sys_ioctl+0x1d/0x30
[]  x64_sys_call+0x6ac/0xe50
[]  do_syscall_64+0xa1/0x560
[]  ? __lock_acquire+0x73f/0x3450
[]  ? __lock_acquire+0x73f/0x3450
[]  ? __lock_acquire+0x73f/0x3450
[]  ? lock_release+0x10b/0x340
[]  ? wp_page_reuse+0x82/0x100
[]  ? lock_release+0x10b/0x340
[]  ? wp_page_reuse+0xcc/0x100
[]  ? lock_acquire+0xde/0x280
[]  ? count_memcg_event_mm+0x20/0x170
[]  ? lock_is_held_type+0x8f/0x140
[]  ? lock_release+0x10b/0x340
[]  ? count_memcg_event_mm+0x20/0x170
[]  ? count_memcg_event_mm+0x20/0x170
[]  ? count_memcg_event_mm+0x20/0x170
[]  ? count_memcg_event_mm+0x114/0x170
[]  ? handle_mm_fault+0x1300/0x13b0
[]  ? handle_mm_fault+0x3c/0x13b0
[]  ? lock_vma_under_rcu+0x8c/0x230
[]  ? lock_release+0x10b/0x340
[]  ? exc_page_fault+0x77/0xf0
[]  ? irqentry_exit_to_user_mode+0x100/0x130
[]  ? irqentry_exit+0x31/0x80
[]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
[] RIP: 0033:0x70dff032eb1d
[] Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
[] RSP: 002b:00007ffcc0367ff0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[] RAX: ffffffffffffffda RBX: 00005748e046d600 RCX: 000070dff032eb1d
[] RDX: 00007ffcc0368080 RSI: 0000000000003b75 RDI: 000000000000001d
[] RBP: 00007ffcc0368040 R08: 00000005748df663 R09: 0000000000000007
[] R10: 00005748df663060 R11: 0000000000000246 R12: 0000000000000001
[] R13: 0000000000000000 R14: 00005748e055f0b0 R15: 00007ffcc0368080
[]  </TASK>

In short:

0: set_device_state
xe_vdev->state_mutex : migf->lock
1: data_read
migf->lock : mm->mmap_lock
2: vfio_pin_dma
mm->mmap_lock : vdev->memory_lock
3: vfio_pci_ioctl_reset
vdev->memory_lock : xe_vdev->state_mutex

In other words:
set_device_state takes xe_vdev->state_mutex and blocks on migf->lock,
data_read takes migf->lock and blocks on mm->mmap_lock
vfio_pin_dma takes mm->mmap_lock and blocks on vdev->memory_lock
reset takes vdev->memory_lock and blocks on xe_vdev->state_mutex

copy_to_user/copy_from_user doesn't have to be called under state_mutex,
it just needs to be taken under migf->lock.
The deferred reset trick exists because migf->lock needs to be taken
under state_mutex as part of reset_done callback, which completes the
chain and triggers the lockdep splat.

To me, it looks like something generic, that will have impact on any
device specific driver variant.
What am I missing?

I wonder if drivers that don't implement the deferred reset trick were
ever executed with lockdep enabled.

(BTW: Looking at it in more depth again - I do need to revisit the
disable_fd flow on xe-vfio-pci side, so do expect small changes on that
front in next revision)

Thanks,
-Michał

RE: [PATCH v4 28/28] vfio/xe: Add device specific vfio_pci driver variant for Intel graphics

Posted by Tian, Kevin 3 months ago

> From: Winiarski, Michal <michal.winiarski@intel.com>
> Sent: Thursday, November 6, 2025 6:56 PM
> 
> On Thu, Nov 06, 2025 at 09:20:36AM +0100, Tian, Kevin wrote:
> >
> > this deferred_reset logic is a mlx unique thing. See:
> >
> > https://lore.kernel.org/kvm/20240220132459.GM13330@nvidia.com/
> 
> Interesting, that doesn't match my observations.
> 
> [] ======================================================
> [] WARNING: possible circular locking dependency detected
> [] 6.18.0-rc3-xe+ #90 Tainted: G S   U
> [] ------------------------------------------------------
> [] qemu-system-x86/4375 is trying to acquire lock:
> [] ff1100015af3ec30 (&migf->lock){+.+.}-{3:3}, at:
> xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
> []
>    but task is already holding lock:
> [] ff1100014c541a58 (&xe_vdev->state_mutex){+.+.}-{3:3}, at:
> xe_vfio_pci_set_device_state+0x43/0x440 [xe_vfio_pci]
> []
>    which lock already depends on the new lock.
> 
> []
>    the existing dependency chain (in reverse order) is:
> []
>    -> #3 (&xe_vdev->state_mutex){+.+.}-{3:3}:
> []        __mutex_lock+0xba/0x1110
> []        mutex_lock_nested+0x1b/0x30
> []        xe_vfio_pci_reset_done+0x2b/0xc0 [xe_vfio_pci]
> []        pci_try_reset_function+0xd7/0x150
> []        vfio_pci_core_ioctl+0x7f1/0xf20 [vfio_pci_core]
> []        vfio_device_fops_unl_ioctl+0x163/0x310 [vfio]
> []        __se_sys_ioctl+0x71/0xc0
> []        __x64_sys_ioctl+0x1d/0x30
> []        x64_sys_call+0x6ac/0xe50
> []        do_syscall_64+0xa1/0x560
> []        entry_SYSCALL_64_after_hwframe+0x4b/0x53
> []
>    -> #2 (&vdev->memory_lock){++++}-{3:3}:
> []        down_read+0x41/0x180
> []        vfio_pci_mmap_huge_fault+0xec/0x310 [vfio_pci_core]
> []        handle_mm_fault+0x8aa/0x13b0
> []        fixup_user_fault+0x124/0x280
> []        vaddr_get_pfns+0x1d2/0x420 [vfio_iommu_type1]
> []        vfio_pin_pages_remote+0x173/0x610 [vfio_iommu_type1]
> []        vfio_pin_map_dma+0xfd/0x340 [vfio_iommu_type1]
> []        vfio_iommu_type1_ioctl+0xfdf/0x1290 [vfio_iommu_type1]
> []        vfio_fops_unl_ioctl+0x106/0x340 [vfio]
> []        __se_sys_ioctl+0x71/0xc0
> []        __x64_sys_ioctl+0x1d/0x30
> []        x64_sys_call+0x6ac/0xe50
> []        do_syscall_64+0xa1/0x560
> []        entry_SYSCALL_64_after_hwframe+0x4b/0x53
> []
>    -> #1 (&mm->mmap_lock){++++}-{3:3}:
> []        __might_fault+0x56/0x90
> []        _copy_to_user+0x23/0x70
> []        xe_sriov_migration_data_read+0x17b/0x3f0 [xe]
> []        xe_sriov_vfio_data_read+0x44/0x60 [xe]
> []        xe_vfio_pci_save_read+0x55/0x80 [xe_vfio_pci]
> []        vfs_read+0xc0/0x300
> []        ksys_read+0x79/0xf0
> []        __x64_sys_read+0x1b/0x30
> []        x64_sys_call+0xcc4/0xe50
> []        do_syscall_64+0xa1/0x560
> []        entry_SYSCALL_64_after_hwframe+0x4b/0x53
> []
>    -> #0 (&migf->lock){+.+.}-{3:3}:
> []        __lock_acquire+0x1aff/0x3450
> []        lock_acquire+0xde/0x280
> []        __mutex_lock+0xba/0x1110
> []        mutex_lock_nested+0x1b/0x30
> []        xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
> []        vfio_ioctl_device_feature_mig_device_state+0x9c/0x1b0 [vfio]
> []        vfio_device_fops_unl_ioctl+0x289/0x310 [vfio]
> []        __se_sys_ioctl+0x71/0xc0
> []        __x64_sys_ioctl+0x1d/0x30
> []        x64_sys_call+0x6ac/0xe50
> []        do_syscall_64+0xa1/0x560
> []        entry_SYSCALL_64_after_hwframe+0x4b/0x53
> []
>    other info that might help us debug this:
> 
> [] Chain exists of:
>      &migf->lock --> &vdev->memory_lock --> &xe_vdev->state_mutex
> 
> []  Possible unsafe locking scenario:
> 
> []        CPU0                    CPU1
> []        ----                    ----
> []   lock(&xe_vdev->state_mutex);
> []                                lock(&vdev->memory_lock);
> []                                lock(&xe_vdev->state_mutex);
> []   lock(&migf->lock);
> []
>     *** DEADLOCK ***
> 
> [] 1 lock held by qemu-system-x86/4375:
> []  #0: ff1100014c541a58 (&xe_vdev->state_mutex){+.+.}-{3:3}, at:
> xe_vfio_pci_set_device_state+0x43/0x440 [xe_vfio_pci]
> []
>    stack backtrace:
> [] CPU: 18 UID: 0 PID: 4375 Comm: qemu-system-x86 Tainted: G S   U
> 6.18.0-rc3-xe+ #90 PREEMPT(voluntary)
> [] Tainted: [S]=CPU_OUT_OF_SPEC, [U]=USER
> [] Hardware name: Intel Corporation WHITLEY/WHITLEY, BIOS
> SE5C6200.86B.0027.P18.2206090856 06/09/2022
> [] Call Trace:
> []  <TASK>
> []  __dump_stack+0x19/0x30
> []  dump_stack_lvl+0x66/0x90
> []  dump_stack+0x10/0x14
> []  print_circular_bug+0x2fd/0x310
> []  check_noncircular+0x139/0x160
> []  __lock_acquire+0x1aff/0x3450
> []  ? vprintk_emit+0x3ec/0x560
> []  ? dev_vprintk_emit+0x162/0x1c0
> []  lock_acquire+0xde/0x280
> []  ? xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
> []  ? xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
> []  __mutex_lock+0xba/0x1110
> []  ? xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
> []  mutex_lock_nested+0x1b/0x30
> []  xe_vfio_pci_set_device_state+0x22b/0x440 [xe_vfio_pci]
> []  vfio_ioctl_device_feature_mig_device_state+0x9c/0x1b0 [vfio]
> []  vfio_device_fops_unl_ioctl+0x289/0x310 [vfio]
> []  __se_sys_ioctl+0x71/0xc0
> []  ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
> []  __x64_sys_ioctl+0x1d/0x30
> []  x64_sys_call+0x6ac/0xe50
> []  do_syscall_64+0xa1/0x560
> []  ? __lock_acquire+0x73f/0x3450
> []  ? __lock_acquire+0x73f/0x3450
> []  ? __lock_acquire+0x73f/0x3450
> []  ? lock_release+0x10b/0x340
> []  ? wp_page_reuse+0x82/0x100
> []  ? lock_release+0x10b/0x340
> []  ? wp_page_reuse+0xcc/0x100
> []  ? lock_acquire+0xde/0x280
> []  ? count_memcg_event_mm+0x20/0x170
> []  ? lock_is_held_type+0x8f/0x140
> []  ? lock_release+0x10b/0x340
> []  ? count_memcg_event_mm+0x20/0x170
> []  ? count_memcg_event_mm+0x20/0x170
> []  ? count_memcg_event_mm+0x20/0x170
> []  ? count_memcg_event_mm+0x114/0x170
> []  ? handle_mm_fault+0x1300/0x13b0
> []  ? handle_mm_fault+0x3c/0x13b0
> []  ? lock_vma_under_rcu+0x8c/0x230
> []  ? lock_release+0x10b/0x340
> []  ? exc_page_fault+0x77/0xf0
> []  ? irqentry_exit_to_user_mode+0x100/0x130
> []  ? irqentry_exit+0x31/0x80
> []  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> [] RIP: 0033:0x70dff032eb1d
> [] Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00
> 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff
> 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
> [] RSP: 002b:00007ffcc0367ff0 EFLAGS: 00000246 ORIG_RAX:
> 0000000000000010
> [] RAX: ffffffffffffffda RBX: 00005748e046d600 RCX: 000070dff032eb1d
> [] RDX: 00007ffcc0368080 RSI: 0000000000003b75 RDI: 000000000000001d
> [] RBP: 00007ffcc0368040 R08: 00000005748df663 R09: 0000000000000007
> [] R10: 00005748df663060 R11: 0000000000000246 R12: 0000000000000001
> [] R13: 0000000000000000 R14: 00005748e055f0b0 R15: 00007ffcc0368080
> []  </TASK>
> 
> In short:
> 
> 0: set_device_state
> xe_vdev->state_mutex : migf->lock
> 1: data_read
> migf->lock : mm->mmap_lock
> 2: vfio_pin_dma
> mm->mmap_lock : vdev->memory_lock
> 3: vfio_pci_ioctl_reset
> vdev->memory_lock : xe_vdev->state_mutex

oh that's a good spot!

Previous deadlock requires 3 parties, due to copy_from/to_user()
under state_mutex:

0: set_device_state
vdev->state_mutex : mm->mmap_lock
2: vfio_pin_dma
mm->mmap_lock : vdev->memory_lock
3: vfio_pci_ioctl_reset
vdev->memory_lock : vdev->state_mutex

Now with migf->lock and the additional path of data_read it becomes
a 4-parties game. and looks it's common.

> 
> In other words:
> set_device_state takes xe_vdev->state_mutex and blocks on migf->lock,
> data_read takes migf->lock and blocks on mm->mmap_lock
> vfio_pin_dma takes mm->mmap_lock and blocks on vdev->memory_lock
> reset takes vdev->memory_lock and blocks on xe_vdev->state_mutex
> 
> copy_to_user/copy_from_user doesn't have to be called under state_mutex,
> it just needs to be taken under migf->lock.
> The deferred reset trick exists because migf->lock needs to be taken
> under state_mutex as part of reset_done callback, which completes the
> chain and triggers the lockdep splat.

this chain doesn't even reach migf->lock in the reset path. It's triggered
already, when acquiring state_mutex.

> 
> To me, it looks like something generic, that will have impact on any
> device specific driver variant.
> What am I missing?
> 
> I wonder if drivers that don't implement the deferred reset trick were
> ever executed with lockdep enabled.
> 

@Jason, @Yishai, @Shameer, @Giovanni, @Brett:

Sounds it's a right thing to pull back the deferred reset trick into
every driver. anything overlooked?

Re: [PATCH v4 28/28] vfio/xe: Add device specific vfio_pci driver variant for Intel graphics

Posted by Jason Gunthorpe 3 months ago

On Fri, Nov 07, 2025 at 03:10:33AM +0000, Tian, Kevin wrote:
> > To me, it looks like something generic, that will have impact on any
> > device specific driver variant.
> > What am I missing?
> > 
> > I wonder if drivers that don't implement the deferred reset trick were
> > ever executed with lockdep enabled.
> > 
> 
> @Jason, @Yishai, @Shameer, @Giovanni, @Brett:
> 
> Sounds it's a right thing to pull back the deferred reset trick into
> every driver. anything overlooked?

It does seem like we should probably do something in the core code to
help this and remove the duplication.

I guess it makes sense the read/write lock would become entangled too.

Jason

RE: [PATCH v4 28/28] vfio/xe: Add device specific vfio_pci driver variant for Intel graphics

Posted by Tian, Kevin 3 months ago

> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Saturday, November 8, 2025 8:48 AM
> 
> On Fri, Nov 07, 2025 at 03:10:33AM +0000, Tian, Kevin wrote:
> > > To me, it looks like something generic, that will have impact on any
> > > device specific driver variant.
> > > What am I missing?
> > >
> > > I wonder if drivers that don't implement the deferred reset trick were
> > > ever executed with lockdep enabled.
> > >
> >
> > @Jason, @Yishai, @Shameer, @Giovanni, @Brett:
> >
> > Sounds it's a right thing to pull back the deferred reset trick into
> > every driver. anything overlooked?
> 
> It does seem like we should probably do something in the core code to
> help this and remove the duplication.

from backport p.o.v. it might be easier to first fix each driver 
independently then remove the duplication in upstream? 

> 
> I guess it makes sense the read/write lock would become entangled too.
> 

looks so

Re: [PATCH v4 28/28] vfio/xe: Add device specific vfio_pci driver variant for Intel graphics

Posted by Jason Gunthorpe 3 months ago

On Sat, Nov 08, 2025 at 01:05:55AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Saturday, November 8, 2025 8:48 AM
> > 
> > On Fri, Nov 07, 2025 at 03:10:33AM +0000, Tian, Kevin wrote:
> > > > To me, it looks like something generic, that will have impact on any
> > > > device specific driver variant.
> > > > What am I missing?
> > > >
> > > > I wonder if drivers that don't implement the deferred reset trick were
> > > > ever executed with lockdep enabled.
> > > >
> > >
> > > @Jason, @Yishai, @Shameer, @Giovanni, @Brett:
> > >
> > > Sounds it's a right thing to pull back the deferred reset trick into
> > > every driver. anything overlooked?
> > 
> > It does seem like we should probably do something in the core code to
> > help this and remove the duplication.
> 
> from backport p.o.v. it might be easier to first fix each driver 
> independently then remove the duplication in upstream? 

If it hasn't bothered anyone yet I wouldn't stress about backporting..

Maybe those drivers do work for some unknown reason?

Plus it is *really* hard to actually hit this deadlock..

Jason