In addition to generic VFIO PCI functionality, the driver implements the
VFIO migration uAPI, allowing userspace to enable migration for Intel
Graphics SR-IOV Virtual Functions.

The driver binds to the VF device and uses the API exposed by the Xe
driver to transfer the VF migration data under the control of the PF
device.

Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
MAINTAINERS | 7 +
drivers/vfio/pci/Kconfig | 2 +
drivers/vfio/pci/Makefile | 2 +
drivers/vfio/pci/xe/Kconfig | 12 +
drivers/vfio/pci/xe/Makefile | 3 +
drivers/vfio/pci/xe/main.c | 568 +++++++++++++++++++++++++++++++++++
6 files changed, 594 insertions(+)
create mode 100644 drivers/vfio/pci/xe/Kconfig
create mode 100644 drivers/vfio/pci/xe/Makefile
create mode 100644 drivers/vfio/pci/xe/main.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 3e3373fb59100..72486df08b9f7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27012,6 +27012,13 @@ L: virtualization@lists.linux.dev
S: Maintained
F: drivers/vfio/pci/virtio
+VFIO XE PCI DRIVER
+M: Michał Winiarski <michal.winiarski@intel.com>
+L: kvm@vger.kernel.org
+L: intel-xe@lists.freedesktop.org
+S: Supported
+F: drivers/vfio/pci/xe
+
VGA_SWITCHEROO
R: Lukas Wunner <lukas@wunner.de>
S: Maintained
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f546652..c100f0ab87f2d 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
source "drivers/vfio/pci/qat/Kconfig"
+source "drivers/vfio/pci/xe/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c8..f5d46aa9347b9 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -19,3 +19,5 @@ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
obj-$(CONFIG_QAT_VFIO_PCI) += qat/
+
+obj-$(CONFIG_XE_VFIO_PCI) += xe/
diff --git a/drivers/vfio/pci/xe/Kconfig b/drivers/vfio/pci/xe/Kconfig
new file mode 100644
index 0000000000000..4253f2a86ca1f
--- /dev/null
+++ b/drivers/vfio/pci/xe/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config XE_VFIO_PCI
+ tristate "VFIO support for Intel Graphics"
+ depends on DRM_XE
+ select VFIO_PCI_CORE
+ help
+ This option enables device specific VFIO driver variant for Intel Graphics.
+ In addition to generic VFIO PCI functionality, it implements VFIO
+ migration uAPI allowing userspace to enable migration for
+ Intel Graphics SR-IOV Virtual Functions supported by the Xe driver.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/xe/Makefile b/drivers/vfio/pci/xe/Makefile
new file mode 100644
index 0000000000000..13aa0fd192cd4
--- /dev/null
+++ b/drivers/vfio/pci/xe/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o
+xe-vfio-pci-y := main.o
diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c
new file mode 100644
index 0000000000000..ce0ed82ee4d31
--- /dev/null
+++ b/drivers/vfio/pci/xe/main.c
@@ -0,0 +1,568 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include <drm/intel/xe_sriov_vfio.h>
+#include <drm/intel/pciids.h>
+
+struct xe_vfio_pci_migration_file {
+ struct file *filp;
+ /* serializes accesses to migration data */
+ struct mutex lock;
+ bool disabled;
+ struct xe_vfio_pci_core_device *xe_vdev;
+};
+
+struct xe_vfio_pci_core_device {
+ struct vfio_pci_core_device core_device;
+ struct xe_device *xe;
+ /* PF internal control uses vfid index starting from 1 */
+ unsigned int vfid;
+ u8 migrate_cap:1;
+ u8 deferred_reset:1;
+ /* protects migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ /* protects the reset_done flow */
+ spinlock_t reset_lock;
+ struct xe_vfio_pci_migration_file *migf;
+};
+
+#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev)
+
+static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->disabled = true;
+ mutex_unlock(&migf->lock);
+}
+
+static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ xe_vfio_pci_disable_file(xe_vdev->migf);
+ fput(xe_vdev->migf->filp);
+ xe_vdev->migf = NULL;
+}
+
+static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ if (xe_vdev->migf)
+ xe_vfio_pci_put_file(xe_vdev);
+
+ xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+}
+
+static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ mutex_lock(&xe_vdev->state_mutex);
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev)
+{
+again:
+ spin_lock(&xe_vdev->reset_lock);
+ if (xe_vdev->deferred_reset) {
+ xe_vdev->deferred_reset = false;
+ spin_unlock(&xe_vdev->reset_lock);
+ xe_vfio_pci_reset(xe_vdev);
+ goto again;
+ }
+ mutex_unlock(&xe_vdev->state_mutex);
+ spin_unlock(&xe_vdev->reset_lock);
+}
+
+static void xe_vfio_pci_reset_done(struct pci_dev *pdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
+ int ret;
+
+ if (!xe_vdev->vfid)
+ return;
+
+ /*
+ * VF FLR requires additional processing done by PF driver.
+ * The processing is done after FLR is already finished from PCIe
+ * perspective.
+ * In order to avoid a scenario where VF is used while PF processing
+ * is still in progress, additional synchronization point is needed.
+ */
+ ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);
+
+ if (!xe_vdev->migrate_cap)
+ return;
+
+ /*
+ * As the higher VFIO layers are holding locks across reset and using
+ * those same locks with the mm_lock we need to prevent ABBA deadlock
+ * with the state_mutex and mm_lock.
+ * In case the state_mutex was taken already we defer the cleanup work
+ * to the unlock flow of the other running context.
+ */
+ spin_lock(&xe_vdev->reset_lock);
+ xe_vdev->deferred_reset = true;
+ if (!mutex_trylock(&xe_vdev->state_mutex)) {
+ spin_unlock(&xe_vdev->reset_lock);
+ return;
+ }
+ spin_unlock(&xe_vdev->reset_lock);
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ xe_vfio_pci_reset(xe_vdev);
+}
+
+static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
+ .reset_done = xe_vfio_pci_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static int xe_vfio_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+ struct vfio_pci_core_device *vdev = &xe_vdev->core_device;
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ vfio_pci_core_finish_enable(vdev);
+
+ return 0;
+}
+
+static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp)
+{
+ struct xe_vfio_pci_migration_file *migf = filp->private_data;
+
+ xe_vfio_pci_disable_file(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+
+ return 0;
+}
+
+static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
+{
+ struct xe_vfio_pci_migration_file *migf = filp->private_data;
+ ssize_t ret;
+
+ if (pos)
+ return -ESPIPE;
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ mutex_unlock(&migf->lock);
+ return -ENODEV;
+ }
+
+ ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
+ mutex_unlock(&migf->lock);
+
+ return ret;
+}
+
+static const struct file_operations xe_vfio_pci_save_fops = {
+ .owner = THIS_MODULE,
+ .read = xe_vfio_pci_save_read,
+ .release = xe_vfio_pci_release_file,
+ .llseek = noop_llseek,
+};
+
+static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct xe_vfio_pci_migration_file *migf = filp->private_data;
+ ssize_t ret;
+
+ if (pos)
+ return -ESPIPE;
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ mutex_unlock(&migf->lock);
+ return -ENODEV;
+ }
+
+ ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
+ mutex_unlock(&migf->lock);
+
+ return ret;
+}
+
+static const struct file_operations xe_vfio_pci_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = xe_vfio_pci_resume_write,
+ .release = xe_vfio_pci_release_file,
+ .llseek = noop_llseek,
+};
+
+static const char *vfio_dev_state_str(u32 state)
+{
+ switch (state) {
+ case VFIO_DEVICE_STATE_RUNNING: return "running";
+ case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p";
+ case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy";
+ case VFIO_DEVICE_STATE_STOP: return "stop";
+ case VFIO_DEVICE_STATE_RESUMING: return "resuming";
+ case VFIO_DEVICE_STATE_ERROR: return "error";
+ default: return "";
+ }
+}
+
+enum xe_vfio_pci_file_type {
+ XE_VFIO_FILE_SAVE = 0,
+ XE_VFIO_FILE_RESUME,
+};
+
+static struct xe_vfio_pci_migration_file *
+xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev,
+ enum xe_vfio_pci_file_type type)
+{
+ struct xe_vfio_pci_migration_file *migf;
+ const struct file_operations *fops;
+ int flags;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops;
+ flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY;
+ migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags);
+ if (IS_ERR(migf->filp)) {
+ int err = PTR_ERR(migf->filp);
+
+ kfree(migf);
+ return ERR_PTR(err);
+ }
+
+ mutex_init(&migf->lock);
+ migf->xe_vdev = xe_vdev;
+ xe_vdev->migf = migf;
+
+ stream_open(migf->filp->f_inode, migf->filp);
+
+ return migf;
+}
+
+static struct file *
+xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new)
+{
+ u32 cur = xe_vdev->mig_state;
+ int ret;
+
+ dev_dbg(xe_vdev_to_dev(xe_vdev),
+ "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new));
+
+ /*
+ * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't
+ * have the capability to selectively block outgoing p2p DMA transfers.
+ * While the device is allowing BAR accesses when the VF is stopped, it
+ * is not processing any new workload requests, effectively stopping
+ * any outgoing DMA transfers (not just p2p).
+ * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and
+ * will be migrated to target VF during stop-copy.
+ */
+ if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
+ return NULL;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+ ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct xe_vfio_pci_migration_file *migf;
+
+ migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
+ if (IS_ERR(migf)) {
+ ret = PTR_ERR(migf);
+ goto err;
+ }
+ get_file(migf->filp);
+
+ ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid);
+ if (ret) {
+ fput(migf->filp);
+ goto err;
+ }
+
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+ if (xe_vdev->migf)
+ xe_vfio_pci_put_file(xe_vdev);
+
+ ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct xe_vfio_pci_migration_file *migf;
+
+ migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME);
+ if (IS_ERR(migf)) {
+ ret = PTR_ERR(migf);
+ goto err;
+ }
+ get_file(migf->filp);
+
+ ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid);
+ if (ret) {
+ fput(migf->filp);
+ goto err;
+ }
+
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ if (xe_vdev->migf)
+ xe_vfio_pci_put_file(xe_vdev);
+
+ ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ WARN(true, "Unknown state transition %d->%d", cur, new);
+ return ERR_PTR(-EINVAL);
+
+err:
+ dev_dbg(xe_vdev_to_dev(xe_vdev),
+ "Failed to transition state: %s->%s err=%d\n",
+ vfio_dev_state_str(cur), vfio_dev_state_str(new), ret);
+ return ERR_PTR(ret);
+}
+
+static struct file *
+xe_vfio_pci_set_device_state(struct vfio_device *core_vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *f = NULL;
+ int ret;
+
+ xe_vfio_pci_state_mutex_lock(xe_vdev);
+ while (new_state != xe_vdev->mig_state) {
+ ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid);
+ f = ERR_PTR(ret);
+ break;
+ }
+ f = xe_vfio_set_state(xe_vdev, next_state);
+ if (IS_ERR(f))
+ break;
+
+ xe_vdev->mig_state = next_state;
+
+ /* Multiple state transitions with non-NULL file in the middle */
+ if (f && new_state != xe_vdev->mig_state) {
+ fput(f);
+ f = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ return f;
+}
+
+static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_state_mutex_lock(xe_vdev);
+ *curr_state = xe_vdev->mig_state;
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ return 0;
+}
+
+static int xe_vfio_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_state_mutex_lock(xe_vdev);
+ *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid);
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ return 0;
+}
+
+static const struct vfio_migration_ops xe_vfio_pci_migration_ops = {
+ .migration_set_state = xe_vfio_pci_set_device_state,
+ .migration_get_state = xe_vfio_pci_get_device_state,
+ .migration_get_data_size = xe_vfio_pci_get_data_size,
+};
+
+static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ struct vfio_device *core_vdev = &xe_vdev->core_device.vdev;
+ struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
+ struct xe_device *xe = xe_sriov_vfio_get_pf(pdev);
+ int ret;
+
+ if (!xe)
+ return;
+ if (!xe_sriov_vfio_migration_supported(xe))
+ return;
+
+ ret = pci_iov_vf_id(pdev);
+ if (ret < 0)
+ return;
+
+ mutex_init(&xe_vdev->state_mutex);
+ spin_lock_init(&xe_vdev->reset_lock);
+
+ /* PF internal control uses vfid index starting from 1 */
+ xe_vdev->vfid = ret + 1;
+ xe_vdev->xe = xe;
+ xe_vdev->migrate_cap = true;
+
+ core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+ core_vdev->mig_ops = &xe_vfio_pci_migration_ops;
+}
+
+static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ if (!xe_vdev->migrate_cap)
+ return;
+
+ mutex_destroy(&xe_vdev->state_mutex);
+}
+
+static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_migration_init(xe_vdev);
+
+ return vfio_pci_core_init_dev(core_vdev);
+}
+
+static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_migration_fini(xe_vdev);
+}
+
+static const struct vfio_device_ops xe_vfio_pci_ops = {
+ .name = "xe-vfio-pci",
+ .init = xe_vfio_pci_init_dev,
+ .release = xe_vfio_pci_release_dev,
+ .open_device = xe_vfio_pci_open_device,
+ .close_device = vfio_pci_core_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct xe_vfio_pci_core_device *xe_vdev;
+ int ret;
+
+ xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev,
+ &xe_vfio_pci_ops);
+ if (IS_ERR(xe_vdev))
+ return PTR_ERR(xe_vdev);
+
+ dev_set_drvdata(&pdev->dev, &xe_vdev->core_device);
+
+ ret = vfio_pci_core_register_device(&xe_vdev->core_device);
+ if (ret) {
+ vfio_put_device(&xe_vdev->core_device.vdev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void xe_vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&xe_vdev->core_device);
+ vfio_put_device(&xe_vdev->core_device.vdev);
+}
+
+#define INTEL_PCI_VFIO_DEVICE(_id) { \
+ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \
+}
+
+static const struct pci_device_id xe_vfio_pci_table[] = {
+ INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE),
+ INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE),
+ INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE),
+ {}
+};
+MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table);
+
+static struct pci_driver xe_vfio_pci_driver = {
+ .name = "xe-vfio-pci",
+ .id_table = xe_vfio_pci_table,
+ .probe = xe_vfio_pci_probe,
+ .remove = xe_vfio_pci_remove,
+ .err_handler = &xe_vfio_pci_err_handlers,
+ .driver_managed_dma = true,
+};
+module_pci_driver(xe_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>");
+MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics");
--
2.51.2
> From: Winiarski, Michal <michal.winiarski@intel.com>
> Sent: Tuesday, November 11, 2025 9:05 AM
> +
> + /*
> + * As the higher VFIO layers are holding locks across reset and using
> + * those same locks with the mm_lock we need to prevent ABBA
> deadlock
> + * with the state_mutex and mm_lock.
> + * In case the state_mutex was taken already we defer the cleanup
> work
> + * to the unlock flow of the other running context.
> + */
> + spin_lock(&xe_vdev->reset_lock);
> + xe_vdev->deferred_reset = true;
> + if (!mutex_trylock(&xe_vdev->state_mutex)) {
> + spin_unlock(&xe_vdev->reset_lock);
> + return;
> + }
> + spin_unlock(&xe_vdev->reset_lock);
> + xe_vfio_pci_state_mutex_unlock(xe_vdev);
> +
> + xe_vfio_pci_reset(xe_vdev);
> +}
Jason suggested to do this in the core given it's common [1].
If you disagree, then please raise it and get consensus in that thread
instead of rushing to post a new version...
[1] https://lore.kernel.org/all/20251108004754.GD1859178@ziepe.ca/
On Tue, Nov 11, 2025 at 02:38:19AM +0100, Tian, Kevin wrote:
> > From: Winiarski, Michal <michal.winiarski@intel.com>
> > Sent: Tuesday, November 11, 2025 9:05 AM
> > +
> > + /*
> > + * As the higher VFIO layers are holding locks across reset and using
> > + * those same locks with the mm_lock we need to prevent ABBA
> > deadlock
> > + * with the state_mutex and mm_lock.
> > + * In case the state_mutex was taken already we defer the cleanup
> > work
> > + * to the unlock flow of the other running context.
> > + */
> > + spin_lock(&xe_vdev->reset_lock);
> > + xe_vdev->deferred_reset = true;
> > + if (!mutex_trylock(&xe_vdev->state_mutex)) {
> > + spin_unlock(&xe_vdev->reset_lock);
> > + return;
> > + }
> > + spin_unlock(&xe_vdev->reset_lock);
> > + xe_vfio_pci_state_mutex_unlock(xe_vdev);
> > +
> > + xe_vfio_pci_reset(xe_vdev);
> > +}
>
> Jason suggested to do this in the core given it's common [1].
>
> If you disagree, then please raise it and get consensus in that thread
> instead of rushing to post a new version...
>
> [1] https://lore.kernel.org/all/20251108004754.GD1859178@ziepe.ca/
Hi,
I agree that it should be done in the core eventually.
I didn't view it as something blocking next revision, as the discussion
was in the context of converting every driver, which is something that
probably shouldn't be done as part of this series.
Note that the v6.19 feature pull for Xe is likely going to happen
tomorrow, so that's part of the reason for "rushing" the next version.
I wanted to collect all the r-bs on Xe side to be prepared for that.
If any parts of this series need to go through Xe tree, it will need to
be merged there soon (or wait all the way until v6.20 / v7).
Thanks,
-Michał
> From: Winiarski, Michal <michal.winiarski@intel.com>
> Sent: Tuesday, November 11, 2025 4:26 PM
>
> On Tue, Nov 11, 2025 at 02:38:19AM +0100, Tian, Kevin wrote:
> > > From: Winiarski, Michal <michal.winiarski@intel.com>
> > > Sent: Tuesday, November 11, 2025 9:05 AM
> > > +
> > > + /*
> > > + * As the higher VFIO layers are holding locks across reset and using
> > > + * those same locks with the mm_lock we need to prevent ABBA
> > > deadlock
> > > + * with the state_mutex and mm_lock.
> > > + * In case the state_mutex was taken already we defer the cleanup
> > > work
> > > + * to the unlock flow of the other running context.
> > > + */
> > > + spin_lock(&xe_vdev->reset_lock);
> > > + xe_vdev->deferred_reset = true;
> > > + if (!mutex_trylock(&xe_vdev->state_mutex)) {
> > > + spin_unlock(&xe_vdev->reset_lock);
> > > + return;
> > > + }
> > > + spin_unlock(&xe_vdev->reset_lock);
> > > + xe_vfio_pci_state_mutex_unlock(xe_vdev);
> > > +
> > > + xe_vfio_pci_reset(xe_vdev);
> > > +}
> >
> > Jason suggested to do this in the core given it's common [1].
> >
> > If you disagree, then please raise it and get consensus in that thread
> > instead of rushing to post a new version...
> >
> > [1] https://lore.kernel.org/all/20251108004754.GD1859178@ziepe.ca/
>
> Hi,
>
> I agree that it should be done in the core eventually.
> I didn't view it as something blocking next revision, as the discussion
> was in the context of converting every driver, which is something that
> probably shouldn't be done as part of this series.
well it doesn't make much sense to push a new driver specific
implementation when the core approach is preferred.
>
> Note that the v6.19 feature pull for Xe is likely going to happen
> tomorrow, so that's part of the reason for "rushing" the next version.
> I wanted to collect all the r-bs on Xe side to be prepared for that.
> If any parts of this series need to go through Xe tree, it will need to
> be merged there soon (or wait all the way until v6.20 / v7).
at least the v5 cover-letter should tell something about this plan.
instead of leaving unaddressed opens in previous version not
mentioned at all.
then I'll leave to Alex and Rodrigo to decide the merge plan. From
my side I didn’t feel very risky having Xe patches and VFIO patches
go in the mainline separately - the remaining open is mostly
contained in vfio side.
But now only one VFIO variant driver reviewer (me) looked at this
series in depth. Jason gave some valuable inputs but I'm afraid
he hasn't done a thorough review yet. Not sure we are at a point
with confidence that the interface between VFIO/Xe has been finalized...
On Tue, Nov 11, 2025 at 10:53:16AM +0100, Tian, Kevin wrote:
> > From: Winiarski, Michal <michal.winiarski@intel.com>
> > Sent: Tuesday, November 11, 2025 4:26 PM
> >
> > On Tue, Nov 11, 2025 at 02:38:19AM +0100, Tian, Kevin wrote:
> > > > From: Winiarski, Michal <michal.winiarski@intel.com>
> > > > Sent: Tuesday, November 11, 2025 9:05 AM
> > > > +
> > > > + /*
> > > > + * As the higher VFIO layers are holding locks across reset and using
> > > > + * those same locks with the mm_lock we need to prevent ABBA
> > > > deadlock
> > > > + * with the state_mutex and mm_lock.
> > > > + * In case the state_mutex was taken already we defer the cleanup
> > > > work
> > > > + * to the unlock flow of the other running context.
> > > > + */
> > > > + spin_lock(&xe_vdev->reset_lock);
> > > > + xe_vdev->deferred_reset = true;
> > > > + if (!mutex_trylock(&xe_vdev->state_mutex)) {
> > > > + spin_unlock(&xe_vdev->reset_lock);
> > > > + return;
> > > > + }
> > > > + spin_unlock(&xe_vdev->reset_lock);
> > > > + xe_vfio_pci_state_mutex_unlock(xe_vdev);
> > > > +
> > > > + xe_vfio_pci_reset(xe_vdev);
> > > > +}
> > >
> > > Jason suggested to do this in the core given it's common [1].
> > >
> > > If you disagree, then please raise it and get consensus in that thread
> > > instead of rushing to post a new version...
> > >
> > > [1] https://lore.kernel.org/all/20251108004754.GD1859178@ziepe.ca/
> >
> > Hi,
> >
> > I agree that it should be done in the core eventually.
> > I didn't view it as something blocking next revision, as the discussion
> > was in the context of converting every driver, which is something that
> > probably shouldn't be done as part of this series.
>
> well it doesn't make much sense to push a new driver specific
> implementation when the core approach is preferred.
This would generally mean that accepting any new VFIO driver variant
would be blocked until core approach materializes.
Jason, can you confirm that this is indeed what you have in mind?
Just to determine how urgent the core-side changes are, and whether
there's anything we can do to help with that.
> >
> > Note that the v6.19 feature pull for Xe is likely going to happen
> > tomorrow, so that's part of the reason for "rushing" the next version.
> > I wanted to collect all the r-bs on Xe side to be prepared for that.
> > If any parts of this series need to go through Xe tree, it will need to
> > be merged there soon (or wait all the way until v6.20 / v7).
>
> at least the v5 cover-letter should tell something about this plan.
> instead of leaving unaddressed opens in previous version not
> mentioned at all.
>
> then I'll leave to Alex and Rodrigo to decide the merge plan. From
> my side I didn’t feel very risky having Xe patches and VFIO patches
> go in the mainline separately - the remaining open is mostly
> contained in vfio side.
>
> But now only one VFIO variant driver reviewer (me) looked at this
> series in depth. Jason gave some valuable inputs but I'm afraid
> he hasn't done a thorough review yet. Not sure we are at a point
> with confidence that the interface between VFIO/Xe has been finalized...
I posted a subset of this series separately for inclusion in Xe tree:
https://lore.kernel.org/intel-xe/20251112132220.516975-1-michal.winiarski@intel.com/
If there are any changes requested to the interface and it impacts the
underlying implementation, we'll sort it out on Xe side.
Thanks,
-Michał
On Wed, Nov 12, 2025 at 02:46:08PM +0100, Winiarski, Michal wrote:
> > > I agree that it should be done in the core eventually.
> > > I didn't view it as something blocking next revision, as the discussion
> > > was in the context of converting every driver, which is something that
> > > probably shouldn't be done as part of this series.
> >
> > well it doesn't make much sense to push a new driver specific
> > implementation when the core approach is preferred.
>
> This would generally mean that accepting any new VFIO driver variant
> would be blocked until core approach materializes.
>
> Jason, can you confirm that this is indeed what you have in mind?
> Just to determine how urgent the core-side changes are, and whether
> there's anything we can do to help with that.

A core approach would be nice, but I also haven't looked at what it
would be like.

I think if you post a small series trying to build one and convert
some of the existing drivers it would be sufficient to let this go
ahead.

Jason
On Mon, Nov 17, 2025 at 01:41:17PM -0400, Jason Gunthorpe wrote:
> On Wed, Nov 12, 2025 at 02:46:08PM +0100, Winiarski, Michal wrote:
> > > > I agree that it should be done in the core eventually.
> > > > I didn't view it as something blocking next revision, as the discussion
> > > > was in the context of converting every driver, which is something that
> > > > probably shouldn't be done as part of this series.
> > >
> > > well it doesn't make much sense to push a new driver specific
> > > implementation when the core approach is preferred.
> >
> > This would generally mean that accepting any new VFIO driver variant
> > would be blocked until core approach materializes.
> >
> > Jason, can you confirm that this is indeed what you have in mind?
> > Just to determine how urgent the core-side changes are, and whether
> > there's anything we can do to help with that.
>
> A core approach would be nice, but I also haven't looked at what it
> would be like.
>
> I think if you post a small series trying to build one and convert
> some of the existing drivers it would be sufficient to let this go
> ahead.
>
> Jason

I posted a series that attempts to do just that.
https://lore.kernel.org/lkml/20251120123647.3522082-1-michal.winiarski@intel.com/

I would appreciate if we could move forward with the review of this
series independently. It should be relatively straightforward to convert
this driver once we're able to get an alignment on specific core-side
solution.

-Michał
> From: Winiarski, Michal <michal.winiarski@intel.com>
> Sent: Thursday, November 20, 2025 8:40 PM
>
> On Mon, Nov 17, 2025 at 01:41:17PM -0400, Jason Gunthorpe wrote:
> > On Wed, Nov 12, 2025 at 02:46:08PM +0100, Winiarski, Michal wrote:
> > > > > I agree that it should be done in the core eventually.
> > > > > I didn't view it as something blocking next revision, as the discussion
> > > > > was in the context of converting every driver, which is something that
> > > > > probably shouldn't be done as part of this series.
> > > >
> > > > well it doesn't make much sense to push a new driver specific
> > > > implementation when the core approach is preferred.
> > >
> > > This would generally mean that accepting any new VFIO driver variant
> > > would be blocked until core approach materializes.
> > >
> > > Jason, can you confirm that this is indeed what you have in mind?
> > > Just to determine how urgent the core-side changes are, and whether
> > > there's anything we can do to help with that.
> >
> > A core approach would be nice, but I also haven't looked at what it
> > would be like.
> >
> > I think if you post a small series trying to build one and convert
> > some of the existing drivers it would be sufficient to let this go
> > ahead.
> >
> > Jason
>
> I posted a series that attempts to do just that.
> https://lore.kernel.org/lkml/20251120123647.3522082-1-
> michal.winiarski@intel.com/
>
> I would appreciate if we could move forward with the review of this
> series independently. It should be relatively straightforward to convert
> this driver once we're able to get an alignment on specific core-side
> solution.
>

that core series is simple. so whichever goes first is ok to me.

for what it stands here:

Reviewed-by: Kevin Tian <kevin.tian@intel.com>