[PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver

Posted by Chris Li 4 months, 3 weeks ago, as part of a 10-patch series
After the list of preserved devices is constructed, the PCI subsystem can
now forward the liveupdate request to the driver.

The PCI subsystem saves and restores a u64 of data through the LUO
callbacks. For each device, the PCI subsystem preserves a "struct
pci_dev_ser", which contains the path (domain + bus + devfn) and a
per-device u64 of data.
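
For illustration, the path packs into the u32 like this (mirroring
pci_get_device_path() below; the example values are made up):

	/* domain in bits 31:16, bus in bits 15:8, devfn in bits 7:0 */
	u32 path = (pci_domain_nr(pdev->bus) << 16) | pci_dev_id(pdev);
	/* e.g. domain 0x0000, bus 0x3a, devfn 0x10 -> path 0x00003a10 */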

The device driver uses this u64 area to store its driver state. The
device live update callbacks look very similar to the LUO subsystem
callbacks, with the "void *arg" argument changed to "struct device *dev".

In the prepare callback, the PCI subsystem allocates and then preserves
a folio big enough to hold the device count and an array with the state
of every requested device (struct pci_dev_ser).

The PCI subsystem then just forwards the liveupdate callback, with the
u64 data pointer aimed at the u64 field of that device's entry in the
state array.

If a device fails the prepare callback, every device that has already
completed the prepare callback successfully gets the cancel callback to
clean up its saved state. That cleanup is the one special case in which
the full list is not walked.

In the other live update callbacks, every device on the preserved
device list gets the callback with its own u64 data field.
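
As an illustration, a driver opting in would look roughly like this
(hypothetical "foo" driver; foo_save_state()/foo_discard_state() are
made-up helpers, not part of this series):

	static int foo_lu_prepare(struct device *dev, u64 *data)
	{
		/* Serialize device state; the handle lands in the
		 * per-device u64 that the PCI subsystem preserves. */
		return foo_save_state(to_pci_dev(dev), data);
	}

	static void foo_lu_cancel(struct device *dev, u64 data)
	{
		/* Live update aborted: drop whatever prepare() saved. */
		foo_discard_state(data);
	}

	static const struct dev_liveupdate_ops foo_lu_ops = {
		.prepare = foo_lu_prepare,
		.cancel  = foo_lu_cancel,
	};

	/* hooked up through the new device_driver member:
	 *	foo_pci_driver.driver.lu = &foo_lu_ops;
	 * (under CONFIG_LIVEUPDATE) */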

Signed-off-by: Chris Li <chrisl@kernel.org>
---
 drivers/pci/liveupdate.c       | 203 +++++++++++++++++++++++++++++++++++++++--
 include/linux/dev_liveupdate.h |  23 +++++
 include/linux/device/driver.h  |   6 ++
 3 files changed, 223 insertions(+), 9 deletions(-)

diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index e8891844b8194dabf8d1e8e2d74d9c701bd741ca..2b215c224fb78c908579b0d22be713e1dc7ca21f 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -9,11 +9,25 @@
 #define dev_fmt(fmt) "PCI liveupdate: " fmt
 
 #include <linux/types.h>
+#include <linux/kexec_handover.h>
 #include <linux/liveupdate.h>
 #include "pci.h"
 
 #define PCI_SUBSYSTEM_NAME "pci"
 
+static LIST_HEAD(preserved_devices);
+
+struct pci_dev_ser {
+	u32	path;		/* domain + bus + devfn */
+	u32	flags;
+	u64	driver_data;	/* driver data */
+};
+
+struct pci_ser {
+	u32 count;
+	struct pci_dev_ser devs[];
+};
+
 static void stack_push_buses(struct list_head *stack, struct list_head *buses)
 {
 	struct pci_bus *bus;
@@ -70,42 +84,213 @@ static int build_liveupdate_devices(struct list_head *head)
 	return count;
 }
 
+static void dev_cleanup_liveupdate(struct device *dev)
+{
+	dev->lu.flags &= ~LU_DEPENDED;
+	list_del_init(&dev->lu.lu_next);
+}
+
 static void cleanup_liveupdate_devices(struct list_head *head)
 {
 	struct device *d, *n;
 
-	list_for_each_entry_safe(d, n, head, lu.lu_next) {
-		d->lu.flags &= ~LU_DEPENDED;
-		list_del_init(&d->lu.lu_next);
+	list_for_each_entry_safe(d, n, head, lu.lu_next)
+		dev_cleanup_liveupdate(d);
+}
+
+static void cleanup_liveupdate_state(struct pci_ser *pci_state)
+{
+	struct folio *folio = virt_to_folio(pci_state);
+
+	kho_unpreserve_folio(folio);
+	folio_put(folio);
+}
+
+static void pci_call_cancel(struct pci_ser *pci_state)
+{
+	struct pci_dev_ser *si = pci_state->devs;
+	struct device *dev, *next;
+
+	list_for_each_entry_safe(dev, next, &preserved_devices, lu.lu_next) {
+		struct pci_dev_ser *s = si++;
+
+		if (!dev->driver)
+			panic("PCI liveupdate cancel: %s has no driver", dev_name(dev));
+		if (!dev->driver->lu)
+			panic("PCI liveupdate cancel: %s driver %s does not support liveupdate",
+			      dev_name(dev), dev->driver->name ? : "(null name)");
+		if (dev->driver->lu->cancel)
+			dev->driver->lu->cancel(dev, s->driver_data);
+		dev_cleanup_liveupdate(dev);
 	}
 }
 
-static int pci_liveupdate_prepare(void *arg, u64 *data)
+static int pci_get_device_path(struct pci_dev *pdev)
+{
+	return (pci_domain_nr(pdev->bus) << 16) | pci_dev_id(pdev);
+}
+
+static int pci_save_device_state(struct device *dev, struct pci_dev_ser *s)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	s->path = pci_get_device_path(pdev);
+	s->flags = dev->lu.flags;
+	return 0;
+}
+
+static int pci_call_prepare(struct pci_ser *pci_state,
+			    struct list_head *devices)
+{
+	struct pci_dev_ser *pdev_state_current = pci_state->devs;
+	struct device *dev, *next;
+	int ret;
+	const char *reason;
+
+	list_for_each_entry_safe(dev, next, devices, lu.lu_next) {
+		struct pci_dev_ser *s = pdev_state_current++;
+
+		if (!dev->driver) {
+			reason = "no driver";
+			ret = -ENOENT;
+			goto cancel;
+		}
+		if (!dev->driver->lu) {
+			reason = "driver does not support liveupdate";
+			ret = -EPERM;
+			goto cancel;
+		}
+		ret = pci_save_device_state(dev, s);
+		if (ret) {
+			reason = "save device state failed";
+			goto cancel;
+		}
+		if (dev->driver->lu->prepare) {
+			ret = dev->driver->lu->prepare(dev, &s->driver_data);
+			if (ret) {
+				reason = "prepare() failed";
+				goto cancel;
+			}
+		}
+		list_move_tail(&dev->lu.lu_next, &preserved_devices);
+	}
+	return 0;
+
+cancel:
+	dev_err(dev, "luo prepare failed %d (%s)\n", ret, reason);
+	pci_call_cancel(pci_state);
+	return ret;
+}
+
+static int __pci_liveupdate_prepare(void *arg, u64 *data)
 {
 	LIST_HEAD(requested_devices);
+	struct pci_ser *pci_state;
+	int ret;
+	int count = build_liveupdate_devices(&requested_devices);
+	int size = sizeof(*pci_state) + sizeof(pci_state->devs[0]) * count;
+	int order = get_order(size);
+	struct folio *folio;
 
-	pr_info("prepare data[%llx]\n", *data);
+	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+	if (!folio) {
+		ret = -ENOMEM;
+		goto cleanup_device;
+	}
 
-	pci_lock_rescan_remove();
-	down_write(&pci_bus_sem);
+	pci_state = folio_address(folio);
+	pci_state->count = count;
+
+	ret = kho_preserve_folio(folio);
+	if (ret) {
+		pr_err("kho_preserve_folio failed\n");
+		goto release_folio;
+	}
+
+	ret = pci_call_prepare(pci_state, &requested_devices);
+	if (ret)
+		goto unpreserve;
 
-	build_liveupdate_devices(&requested_devices);
+	*data = __pa(pci_state);
+	pr_info("prepare data[%llx]\n", *data);
+	return 0;
+
+unpreserve:
+	kho_unpreserve_folio(folio);
+release_folio:
+	folio_put(folio);
+cleanup_device:
 	cleanup_liveupdate_devices(&requested_devices);
+	return ret;
+}
 
+static int pci_liveupdate_prepare(void *arg, u64 *data)
+{
+	int ret;
+
+	pci_lock_rescan_remove();
+	down_write(&pci_bus_sem);
+	ret = __pci_liveupdate_prepare(arg, data);
 	up_write(&pci_bus_sem);
 	pci_unlock_rescan_remove();
+	return ret;
+}
+
+static int pci_call_freeze(struct pci_ser *pci_state, struct list_head *devlist)
+{
+	struct pci_dev_ser *n = pci_state->devs;
+	struct device *dev;
+	int ret = 0;
+
+	list_for_each_entry(dev, devlist, lu.lu_next) {
+		struct pci_dev_ser *s = n++;
+
+		if (!dev->driver) {
+			if (!dev->parent)
+				continue;
+			panic("PCI liveupdate freeze: %s has no driver", dev_name(dev));
+		}
+		if (!dev->driver->lu || !dev->driver->lu->freeze)
+			continue;
+		ret = dev->driver->lu->freeze(dev, &s->driver_data);
+		if (ret) {
+			dev_err(dev, "luo freeze failed %d\n", ret);
+			pci_call_cancel(pci_state);
+			return ret;
+		}
+	}
 	return 0;
 }
 
 static int pci_liveupdate_freeze(void *arg, u64 *data)
 {
+	struct pci_ser *pci_state = phys_to_virt(*data);
+	int ret;
+
 	pr_info("freeze data[%llx]\n", *data);
-	return 0;
+	pci_lock_rescan_remove();
+	down_write(&pci_bus_sem);
+
+	ret = pci_call_freeze(pci_state, &preserved_devices);
+
+	up_write(&pci_bus_sem);
+	pci_unlock_rescan_remove();
+	return ret;
 }
 
 static void pci_liveupdate_cancel(void *arg, u64 data)
 {
+	struct pci_ser *pci_state = phys_to_virt(data);
+
 	pr_info("cancel data[%llx]\n", data);
+	pci_lock_rescan_remove();
+	down_write(&pci_bus_sem);
+
+	pci_call_cancel(pci_state);
+	cleanup_liveupdate_state(pci_state);
+
+	up_write(&pci_bus_sem);
+	pci_unlock_rescan_remove();
 }
 
 static void pci_liveupdate_finish(void *arg, u64 data)
diff --git a/include/linux/dev_liveupdate.h b/include/linux/dev_liveupdate.h
index 72297cba08a999e89f7bc0997dabdbe14e0aa12c..80a723c7701ac4ddc2ddd03d0ffc9cc5a62a6083 100644
--- a/include/linux/dev_liveupdate.h
+++ b/include/linux/dev_liveupdate.h
@@ -20,6 +20,8 @@ enum liveupdate_flag {
 #define	LU_REQUESTED	(LU_BUSMASTER)
 #define	LU_DEPENDED	(LU_BUSMASTER_BRIDGE)
 
+struct device;
+
 /**
  * struct dev_liveupdate - Device state for live update operations
  * @lu_next:	List head for linking the device into live update
@@ -40,5 +42,26 @@ struct dev_liveupdate {
 	bool visited:1;
 };
 
+/**
+ * struct dev_liveupdate_ops - Live Update callback functions
+ * @prepare:     Prepare the device for the upcoming state transition. Drivers
+ *               and buses should save the necessary device state.
+ * @freeze:      A final notification before the system jumps to the new kernel.
+ *               Called from reboot() syscall.
+ * @cancel:      Cancel the live update process. Driver should clean
+ *               up any saved state if necessary.
+ * @finish:      The system has completed a transition. Drivers and buses should
+ *               have already restored the previously saved device state.
+ *               Clean up any remaining saved state or reset unreclaimed devices.
+ *
+ * This structure is used by drivers and buses to hold the callbacks from LUO.
+ */
+struct dev_liveupdate_ops {
+	int (*prepare)(struct device *dev, u64 *data);
+	int (*freeze)(struct device *dev, u64 *data);
+	void (*cancel)(struct device *dev, u64 data);
+	void (*finish)(struct device *dev, u64 data);
+};
+
 #endif /* CONFIG_LIVEUPDATE */
 #endif /* _LINUX_DEV_LIVEUPDATE_H */
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index cd8e0f0a634be9ea63ff22e89d66ada3b1a9eaf2..b2ba469cc3065a412f02230c62e811af19c4d2c6 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -19,6 +19,7 @@
 #include <linux/pm.h>
 #include <linux/device/bus.h>
 #include <linux/module.h>
+#include <linux/dev_liveupdate.h>
 
 /**
  * enum probe_type - device driver probe type to try
@@ -80,6 +81,8 @@ enum probe_type {
  *		it is bound to the driver.
  * @pm:		Power management operations of the device which matched
  *		this driver.
+ * @lu:		Live update callbacks; notify the device of live update
+ *		state transitions and allow preserving the device across reboot.
  * @coredump:	Called when sysfs entry is written to. The device driver
  *		is expected to call the dev_coredump API resulting in a
  *		uevent.
@@ -116,6 +119,9 @@ struct device_driver {
 	const struct attribute_group **dev_groups;
 
 	const struct dev_pm_ops *pm;
+#ifdef CONFIG_LIVEUPDATE
+	const struct dev_liveupdate_ops *lu;
+#endif
 	void (*coredump) (struct device *dev);
 
 	struct driver_private *p;

-- 
2.51.0.384.g4c02a37b29-goog
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Greg Kroah-Hartman 4 months, 1 week ago
On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
>  include/linux/dev_liveupdate.h |  23 +++++
>  include/linux/device/driver.h  |   6 ++

Driver core changes under the guise of only PCI changes?  Please no.

Break this series out properly, get the driver core stuff working FIRST,
then show how multiple busses will work with them (i.e. you usually need
3 to know if you got it right).

I'm guessing you will need/want PCI, platform, and something else?

thanks,

greg k-h
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
>
> On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> >  include/linux/dev_liveupdate.h |  23 +++++
> >  include/linux/device/driver.h  |   6 ++
>
> Driver core changes under the guise of only PCI changes?  Please no.

There is a reason why I use the device struct rather than the pci_dev
struct even though liveupdate currently only works with PCI devices.
It comes down to the fact that pci_bus and pci_host_bridge are not
pci_dev structs. We need something that is common across all three
types of PCI-related struct I care about (pci_dev, pci_bus,
pci_host_bridge), and the device struct is the common base among them.
I can move the dev_liveupdate struct into pci_bus, pci_host_bridge and
pci_dev independently. That would be more contained inside PCI, not
touching the device struct, but the patch would be bigger because the
data structure is spread into different structs. Do you have a
preference which way to go?
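
To illustrate the layout I am describing (a simplified sketch; the real
structs have many more members):

	/* All three embed a struct device, so a single dev_liveupdate
	 * member in struct device covers them all: */
	struct pci_dev         { /* ... */ struct device dev; /* ... */ };
	struct pci_bus         { /* ... */ struct device dev; /* ... */ };
	struct pci_host_bridge { /* ... */ struct device dev; /* ... */ };

	/* reached uniformly as pdev->dev.lu, bus->dev.lu, bridge->dev.lu */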

> Break this series out properly, get the driver core stuff working FIRST,
> then show how multiple busses will work with them (i.e. you usually need
> 3 to know if you got it right).

By multiple buses, do you mean different types of bus, e.g. USB, PCI
and others, or are 3 pci_bus instances good enough? Right now we have
no intention to support bus types other than PCI. The liveupdate is
about preserving GPU context across kernel upgrades. Suggestions
welcome.

> I'm guessing you will need/want PCI, platform, and something else?

This series only cares about PCI. The LUO series has subsystems, and
the PCI liveupdate code is registered as an LUO subsystem. I guess the
subsystem is close to the platform you have in mind? LUO also has the
memfd in addition to the subsystems.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Greg Kroah-Hartman 4 months, 1 week ago
On Thu, Oct 02, 2025 at 01:38:56PM -0700, Chris Li wrote:
> On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
> >
> > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> > >  include/linux/dev_liveupdate.h |  23 +++++
> > >  include/linux/device/driver.h  |   6 ++
> >
> > Driver core changes under the guise of only PCI changes?  Please no.
> 
> There is a reason why I use the device struct rather than the pci_dev
> struct even though liveupdate currently only works with PCI devices.
> It comes down to the fact that the pci_bus and pci_host_bridge are not
> pci_dev struct. We need something that is common across all those
> three types of PCI related struct I care about(pci_dev, pci_bus,
> pci_host_bridge). The device struct is just common around those. I can
> move the dev_liveupdate struct into pci_bus, pci_host_bridge and
> pci_dev independently. That will be more contained inside PCI, not
> touching the device struct. The patch would be bigger because the data
> structure is spread into different structs. Do you have a preference
> which way to go?

If you only are caring about one single driver, don't mess with a
subsystem or the driver core, just change the driver.  My objection here
was that you were claiming it was a PCI change, yet it was actually only
touching the driver core which means that all devices in the systems for
all Linux users will be affected.

> > Break this series out properly, get the driver core stuff working FIRST,
> > then show how multiple busses will work with them (i.e. you usually need
> > 3 to know if you got it right).
> 
> Multiple buses you mean different types of bus, e.g. USB, PCI and
> others or 3 pci_bus is good enough? Right now we have no intention to
> support bus types other than PCI devices. The liveupdate is about
> preserving the GPU context cross kernel upgrade. Suggestion welcome.

So all of this is just for one single driver.  Ugh.  Just do it in the
single driver then, don't mess with the driver core, or even the PCI
core.  Just make it specific to the driver and then none of us will even
notice the mess that this all creates :)

thanks,

greg k-h
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Thu, Oct 2, 2025 at 11:19 PM Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
>
> On Thu, Oct 02, 2025 at 01:38:56PM -0700, Chris Li wrote:
> > On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org> wrote:
> > >
> > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> > > >  include/linux/dev_liveupdate.h |  23 +++++
> > > >  include/linux/device/driver.h  |   6 ++
> > >
> > > Driver core changes under the guise of only PCI changes?  Please no.
> >
> > There is a reason why I use the device struct rather than the pci_dev
> > struct even though liveupdate currently only works with PCI devices.
> > It comes down to the fact that the pci_bus and pci_host_bridge are not
> > pci_dev struct. We need something that is common across all those
> > three types of PCI related struct I care about(pci_dev, pci_bus,
> > pci_host_bridge). The device struct is just common around those. I can
> > move the dev_liveupdate struct into pci_bus, pci_host_bridge and
> > pci_dev independently. That will be more contained inside PCI, not
> > touching the device struct. The patch would be bigger because the data
> > structure is spread into different structs. Do you have a preference
> > which way to go?
>
> If you only are caring about one single driver, don't mess with a
> subsystem or the driver core, just change the driver.  My objection here

It is more than just one driver: we have vfio-pci, idpf, pci-pf-stub
and possibly the nvme driver.
The change needs to happen in PCI enumeration and probing as well,
which is outside of the driver code.

> was that you were claiming it was a PCI change, yet it was actually only
> touching the driver core which means that all devices in the systems for

In theory all devices can be liveupdate preserved, but right now we
only support PCI.
I can look into containing the change in PCI only and not touching the
device struct, if that is what you mean. I recall I tried that
previously and failed because bus->bridge is a device struct rather
than a pci_dev or pci_host_bridge. I can try harder not to touch the
device struct. The patch will be bigger and more complex than it is
right now, but at least the damage is limited to PCI if successful.

> all Linux users will be affected.

I understand your concerns. I was wishing one day all devices could
support liveupdate, but that is not the case right now.

> > > Break this series out properly, get the driver core stuff working FIRST,
> > > then show how multiple busses will work with them (i.e. you usually need
> > > 3 to know if you got it right).
> >
> > Multiple buses you mean different types of bus, e.g. USB, PCI and
> > others or 3 pci_bus is good enough? Right now we have no intention to
> > support bus types other than PCI devices. The liveupdate is about
> > preserving the GPU context cross kernel upgrade. Suggestion welcome.
>
> So all of this is just for one single driver.  Ugh.  Just do it in the
> single driver then, don't mess with the driver core, or even the PCI

Not a single driver. It is the whole PCI core. Or, in your book, is
the whole of PCI just one single driver?

> core.  Just make it specific to the driver and then none of us will even
> notice the mess that this all creates :)

OK. Let me try that PCI-only approach again and push it harder. I will
report back.

Thanks for the feedback.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Greg Kroah-Hartman 4 months, 1 week ago
On Fri, Oct 03, 2025 at 12:26:01AM -0700, Chris Li wrote:
> On Thu, Oct 2, 2025 at 11:19 PM Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
> >
> > On Thu, Oct 02, 2025 at 01:38:56PM -0700, Chris Li wrote:
> > > On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman
> > > <gregkh@linuxfoundation.org> wrote:
> > > >
> > > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> > > > >  include/linux/dev_liveupdate.h |  23 +++++
> > > > >  include/linux/device/driver.h  |   6 ++
> > > >
> > > > Driver core changes under the guise of only PCI changes?  Please no.
> > >
> > > There is a reason why I use the device struct rather than the pci_dev
> > > struct even though liveupdate currently only works with PCI devices.
> > > It comes down to the fact that the pci_bus and pci_host_bridge are not
> > > pci_dev struct. We need something that is common across all those
> > > three types of PCI related struct I care about(pci_dev, pci_bus,
> > > pci_host_bridge). The device struct is just common around those. I can
> > > move the dev_liveupdate struct into pci_bus, pci_host_bridge and
> > > pci_dev independently. That will be more contained inside PCI, not
> > > touching the device struct. The patch would be bigger because the data
> > > structure is spread into different structs. Do you have a preference
> > > which way to go?
> >
> > If you only are caring about one single driver, don't mess with a
> > subsystem or the driver core, just change the driver.  My objection here
> 
> It is more than just one driver, we have vfio-pci, idpf, pci-pf-stub
> and possible nvme driver.

Why is nvme considered a "GPU" that needs context saved?

> The change needs to happen in the PCI enumeration and probing as well,
> that is outside of the driver code.

So all just PCI drivers?  Then keep this in PCI-only please, and don't
touch the driver core.

> > was that you were claiming it was a PCI change, yet it was actually only
> > touching the driver core which means that all devices in the systems for
> 
> In theory all the devices can be liveupdate preserved. But now we only
> support PCI.

Then for now, only focus on PCI.

thanks,

greg k-h
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Fri, Oct 3, 2025 at 5:26 AM Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
>
> On Fri, Oct 03, 2025 at 12:26:01AM -0700, Chris Li wrote:
>
> > It is more than just one driver, we have vfio-pci, idpf, pci-pf-stub
> > and possible nvme driver.
>
> Why is nvme considered a "GPU" that needs context saved?

NVME is not a GPU. The internal reason to have NVME participate in the
liveupdate is because the NVME shutdown of the IO queue is very slow,
it contributes the largest chunk of delay in the black out window for
liveupdate. The NVME participation is just an optimization to avoid
resetting the NVME queue. Consider it as (optional ) speed
optimization.

> > The change needs to happen in the PCI enumeration and probing as well,
> > that is outside of the driver code.
>
> So all just PCI drivers?  Then keep this in PCI-only please, and don't
> touch the driver core.

Ack. Will do.

>
> > > was that you were claiming it was a PCI change, yet it was actually only
> > > touching the driver core which means that all devices in the systems for
> >
> > In theory all the devices can be liveupdate preserved. But now we only
> > support PCI.
>
> Then for now, only focus on PCI.

Agree, thanks for the alignment.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by David Matlack 4 months, 1 week ago
On Fri, Oct 3, 2025 at 10:49 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Oct 3, 2025 at 5:26 AM Greg Kroah-Hartman
> <gregkh@linuxfoundation.org> wrote:
> >
> > On Fri, Oct 03, 2025 at 12:26:01AM -0700, Chris Li wrote:
> >
> > > It is more than just one driver, we have vfio-pci, idpf, pci-pf-stub
> > > and possible nvme driver.
> >
> > Why is nvme considered a "GPU" that needs context saved?
>
> NVME is not a GPU. The internal reason to have NVME participate in the
> liveupdate is because the NVME shutdown of the IO queue is very slow,
> it contributes the largest chunk of delay in the black out window for
> liveupdate. The NVME participation is just an optimization to avoid
> resetting the NVME queue. Consider it as (optional ) speed
> optimization.

This is not true. We haven't made any changes to the nvme driver
within Google for Live Update.

The reason I mentioned nvme in another email chain is because Google
has some hosts where we want to preserve VFs bound to vfio-pci across
Live Update where the PF is bound to nvme. But Jason is suggesting we
seriously explore switching the PF driver to vfio-pci before trying to
upstream nvme support for Live Update, which I think is fair.
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Fri, Oct 3, 2025 at 11:28 AM David Matlack <dmatlack@google.com> wrote:
>
> On Fri, Oct 3, 2025 at 10:49 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Fri, Oct 3, 2025 at 5:26 AM Greg Kroah-Hartman
> > NVME is not a GPU. The internal reason to have NVME participate in the
> > liveupdate is because the NVME shutdown of the IO queue is very slow,
> > it contributes the largest chunk of delay in the black out window for
> > liveupdate. The NVME participation is just an optimization to avoid
> > resetting the NVME queue. Consider it as (optional ) speed
> > optimization.
>
> This is not true. We haven't made any changes to the nvme driver
> within Google for Live Update.
>
> The reason I mentioned nvme in another email chain is because Google
> has some hosts where we want to preserve VFs bound to vfio-pci across
> Live Update where the PF is bound to nvme. But Jason is suggesting we
> seriously explore switching the PF driver to vfio-pci before trying to
> upstream nvme support for Live Update, which I think is fair.

Ah, thanks for the clarification and sorry for my confusion. I think I
was thinking of a different storage driver, not the NVME you have in
mind.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Jason Gunthorpe 4 months, 1 week ago
On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> After the list of preserved devices is constructed, the PCI subsystem can
> now forward the liveupdate request to the driver.

This also seems completely backwards for how iommufd should be
working. It doesn't want callbacks triggered on prepare, it wants to
drive everything from its own ioctl.

Let's just do one thing at a time please and make this series about
iommufd to match the other luo series for iommufd.

non-iommufd cases can be proposed in their own series.

Jason
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Mon, Sep 29, 2025 at 10:48 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> > After the list of preserved devices is constructed, the PCI subsystem can
> > now forward the liveupdate request to the driver.
>
> This also seems completely backwards for how iommufd should be
> working. It doesn't want callbacks triggered on prepare, it wants to
> drive everything from its own ioctl.

This series is about basic PCI device support, not IOMMUFD.

> Let's just do one thing at a time please and make this series about
> iommufd to match the other luo series for iommufd.

I am confused by you.

> non-iommufd cases can be proposed in their own series.

This is that non-iommufd series.


Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Jason Gunthorpe 4 months, 1 week ago
On Mon, Sep 29, 2025 at 07:11:06PM -0700, Chris Li wrote:
> On Mon, Sep 29, 2025 at 10:48 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> > > After the list of preserved devices is constructed, the PCI subsystem can
> > > now forward the liveupdate request to the driver.
> >
> > This also seems completely backwards for how iommufd should be
> > working. It doesn't want callbacks triggered on prepare, it wants to
> > drive everything from its own ioctl.
> 
> This series is about basic PCI device support, not IOMMUFD.
> 
> > Let's just do one thing at a time please and make this series about
> > iommufd to match the other luo series for iommufd.
> 
> I am confused by you.
> 
> > non-iommufd cases can be proposed in their own series.
> 
> This is that non-iommufd series.

Then don't do generic devices until we get iommufd done and you have a
meaningful in-tree driver to consume what you are adding.

Jason
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Tue, Sep 30, 2025 at 9:38 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > > non-iommufd cases can be proposed in their own series.
> >
> > This is that non-iommufd series.
>
> Then don't do generic devices until we get iommufd done and you have a
> meaningful in-tree driver to consume what you are adding.

Thanks for the suggestion. See the above explanation of why I add to
the device struct: the data is needed by pci_dev, pci_bus and
pci_host_bridge alike, all PCI related, and the device struct is their
common base.

I can move them into the three PCI-related structs separately and
leave the device struct alone. Will that be better?

I did add the pci-lu-test driver to consume and test the LUO/PCI
series for the bus master bits. Are you suggesting the LUO/PCI work
should wait until VFIO can consume the LUO/PCI changes and then be
sent out as one series?

That will delay the LUO/PCI work a bit, waiting for the VFIO side.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by David Matlack 4 months, 1 week ago
On 2025-09-30 01:38 PM, Jason Gunthorpe wrote:
> On Mon, Sep 29, 2025 at 07:11:06PM -0700, Chris Li wrote:
> > On Mon, Sep 29, 2025 at 10:48 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote:
> > > > After the list of preserved devices is constructed, the PCI subsystem can
> > > > now forward the liveupdate request to the driver.
> > >
> > > This also seems completely backwards for how iommufd should be
> > > working. It doesn't want callbacks triggered on prepare, it wants to
> > > drive everything from its own ioctl.
> > 
> > This series is about basic PCI device support, not IOMMUFD.
> > 
> > > Let's just do one thing at a time please and make this series about
> > > iommufd to match the other luo series for iommufd.
> > 
> > I am confused by you.
> > 
> > > non-iommufd cases can be proposed in their own series.
> > 
> > This is that non-iommufd series.
> 
> Then don't do generic devices until we get iommufd done and you have a
> meaningful in-tree driver to consume what you are adding.

I agree with Jason. I don't think we can reasonably make the argument
that we need this series until we have actual use cases for it.

I think we should focus on vfio-pci device preservation next, and use
that to incrementally drive whatever changes are necessary to the PCI
and generic device layer bit by bit.

For example, once we have basic vfio-pci device preservation working, we
can start thinking about how to handle when that device is a VF, and we
have to start also preserving the SR-IOV state on the PF and get the PF
driver involved in the process. At that point we can discuss how to
solve that specific problem. Maybe the solution will look something like
this series, maybe it will look like something else. There is open
design space.

Without approaching it this way, I don't see how we can't reasonably
argue that anything in this series is necessary. And I suspect some
parts of this series truly are unnecessary, at least in the short term.
In our internal implementation, the only dependent device that truly
needed to participate is the PF driver when a VF is preserved.
Everything else (e.g. pcieport callbacks) have just been no-ops.
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Thu, Oct 2, 2025 at 11:54 AM David Matlack <dmatlack@google.com> wrote:
> > Then don't do generic devices until we get iommufd done and you have a
> > meaningful in-tree driver to consume what you are adding.
>
> I agree with Jason. I don't think we can reasonably make the argument
> that we need this series until we have actualy use-cases for it.
>
> I think we should focus on vfio-pci device preservation next, and use
> that to incrementally drive whatever changes are necessary to the PCI
> and generic device layer bit by bit.

The feedback I got for the PCI V1 was to make it as minimal as
possible. We agreed preserving the BUS MASTER bit is the first minimal
step, and that is what I did in the V2 phase I series: only the bus
master. I think the pci-lu-test driver did demo the bus master bit; it
is not vfio yet. At least that was the plan shared in the upstream
alignment meeting.

> For example, once we a basic vfio-pci device preservation working, we
> can start thinking about how to handle when that device is a VF, and we
> have to start also preserving the SR-IOV state on the PF and get the PF

SR-IOV is a much bigger step than the BUS MASTER bit. I recall at one
point in the upstream discussion meeting we agreed not to do SR-IOV as
the first step. I am not opposed to it; we need to get to vfio and
SR-IOV eventually. I just feel that PCI + VFIO + SR-IOV will be a much
bigger series, and I worry the series size is not friendly for
reviewers. I wish there could be smaller, digestible incremental steps.

> driver involved in the process. At that point we can discuss how to
> solve that specific problem. Maybe the solution will look something like
> this series, maybe it will look like something else. There is open
> design space.

Yes, doable; it just delays the LUO/PCI series a bit and makes it a
much bigger series.

> Without approaching it this way, I don't see how we can't reasonably
> argue that anything in this series is necessary. And I suspect some
> parts of this series truly are unnecessary, at least in the short term.

You have me on the double negatives; I am never very good at those.
If the bigger series is what we want, I can do that. It will just add
some latency to get to the VFIO part.

> In our internal implementation, the only dependent device that truly
> needed to participate is the PF driver when a VF is preserved.
> Everything else (e.g. pcieport callbacks) have just been no-ops.

Your VF device does not need to preserve DMA? If you want to preserve
DMA, the bus master bit is required, and the pcieport driver for the
PCI-PCI bridge is also required. I am not sure a pure VF and PF setup
without any DMA makes practical sense.
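
(For reference, the bit in question; a minimal sketch of setting it by
hand, though drivers normally just call pci_set_master():)

	u16 cmd;

	/* PCI_COMMAND_MASTER gates DMA; normal device teardown clears
	 * it, which is exactly what we want to avoid across the live
	 * update. */
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	cmd |= PCI_COMMAND_MASTER;
	pci_write_config_word(pdev, PCI_COMMAND, cmd);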

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by David Matlack 4 months, 1 week ago
On Thu, Oct 2, 2025 at 1:58 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Thu, Oct 2, 2025 at 11:54 AM David Matlack <dmatlack@google.com> wrote:
> > > Then don't do generic devices until we get iommufd done and you have a
> > > meaningful in-tree driver to consume what you are adding.
> >
> > I agree with Jason. I don't think we can reasonably make the argument
> > that we need this series until we have actualy use-cases for it.
> >
> > I think we should focus on vfio-pci device preservation next, and use
> > that to incrementally drive whatever changes are necessary to the PCI
> > and generic device layer bit by bit.
>
> The feedback I got for the PCI V1 was to make it as minimal as
> possible. We agree preserving the BUS MASTER bit is the first minimal
> step. That is what I did in the V2 phase I series. Only the bus
> master. I think the pci-lu-test driver did demo the bus master bit, it
> is not vfio yet. At least that was the plan shared in the upstream
> alignment meeting.

What do the driver callbacks in patch 3 and patches 5-8 have to do
with preserving the bus master bit? That's half the series.

>
> > For example, once we a basic vfio-pci device preservation working, we
> > can start thinking about how to handle when that device is a VF, and we
> > have to start also preserving the SR-IOV state on the PF and get the PF
>
> SR-IOV is a much bigger step than the BUS Master bit. I recall at one
> point in the upstream discussion meeting that we don't do SR-IOV as
> the first step. I am not opposed to it, we need to get to vfio and
> SR-IOV eventually. I just feel that the PCI + VFIO + SR-IOV will be a
> much bigger series. I worry the series size is not friendly for
> reviewers. I wish there could be smaller incremental steps digestible.

SR-IOV is not a first step, of course. That's not what I said. I'm
saying SR-IOV is future work that could justify some of the larger
changes in this series (e.g. driver callbacks). So I am suggesting we
revisit those changes when we are working on SR-IOV.

>
> > driver involved in the process. At that point we can discuss how to
> > solve that specific problem. Maybe the solution will look something like
> > this series, maybe it will look like something else. There is open
> > design space.
>
> Yes doable, just will delay the LUO/PCI series by a bit and a much
> bigger series.

There will not be one "LUO/PCI series". There will be many incremental steps.

>
> > Without approaching it this way, I don't see how we can't reasonably
> > argue that anything in this series is necessary. And I suspect some
> > parts of this series truly are unnecessary, at least in the short term.
>
> You have me on the double negatives, always not very good at those.
> If the bigger series is what we want, I can do that. Just will have
> some latency to get the VFIO.

Oops, typo on my end. I meant "I don't see how we _can_ reasonably
argue". I am saying we don't have enough justification (yet) for a lot
of the code changes in this series.

>
> > In our internal implementation, the only dependent device that truly
> > needed to participate is the PF driver when a VF is preserved.
> > Everything else (e.g. pcieport callbacks) have just been no-ops.
>
> Your VF device does not need to preserve DMA? If you want to preserve
> DMA the bus master bit is required, and the pcieport driver for the
> PCI-PCI bridge is also required. I am not sure pure VF and PF without
> any DMA makes practical sense.

I'm saying the only drivers that actually needed to implement Live
Update driver callbacks have been vfio-pci and PF drivers. The
vfio-pci support doesn't exist yet upstream, and we are planning to
use FD preservation there. So we don't know if driver callbacks will
be needed for that. And we don't care about PF drivers until we get to
supporting SR-IOV. So the driver callbacks all seem unnecessary at
this point.

I totally agree we need to avoid clearing the bus master bit. Let's
focus on that.
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Thu, Oct 2, 2025 at 2:31 PM David Matlack <dmatlack@google.com> wrote:
>
> On Thu, Oct 2, 2025 at 1:58 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Thu, Oct 2, 2025 at 11:54 AM David Matlack <dmatlack@google.com> wrote:
> > > > Then don't do generic devices until we get iommufd done and you have a
> > > > meaningful in-tree driver to consume what you are adding.
> > >
> > > I agree with Jason. I don't think we can reasonably make the argument
> > > that we need this series until we have actualy use-cases for it.
> > >
> > > I think we should focus on vfio-pci device preservation next, and use
> > > that to incrementally drive whatever changes are necessary to the PCI
> > > and generic device layer bit by bit.
> >
> > The feedback I got for the PCI V1 was to make it as minimal as
> > possible. We agree preserving the BUS MASTER bit is the first minimal
> > step. That is what I did in the V2 phase I series. Only the bus
> > master. I think the pci-lu-test driver did demo the bus master bit, it
> > is not vfio yet. At least that was the plan shared in the upstream
> > alignment meeting.
>
> What do the driver callbacks in patch 3 and patches 5-8 have to do
> with preserving the bus master bit? That's half the series.

I was thinking the pcieport driver was needed for preserving the bus
master bit on the bridge. I just realized that the actual bus master
bit preservation is at the pci_dev level, so the pcieport driver
callback is effectively a no-op. Yes, I can delete those from the
series and not save driver data at all. Point taken.
>
> >
> > > For example, once we a basic vfio-pci device preservation working, we
> > > can start thinking about how to handle when that device is a VF, and we
> > > have to start also preserving the SR-IOV state on the PF and get the PF
> >
> > SR-IOV is a much bigger step than the BUS Master bit. I recall at one
> > point in the upstream discussion meeting that we don't do SR-IOV as
> > the first step. I am not opposed to it, we need to get to vfio and
> > SR-IOV eventually. I just feel that the PCI + VFIO + SR-IOV will be a
> > much bigger series. I worry the series size is not friendly for
> > reviewers. I wish there could be smaller incremental steps digestible.
>
> SR-IOV is not a first step, of course. That's not what I said. I'm
> saying SR-IOV is future work that could justify some of the larger
> changes in this series (e.g. driver callbacks). So I am suggesting we
> revisit those changes when we are working on SR-IOV.

Just to confirm my understanding is aligned with yours: we remove the
driver callbacks from the PCI series until vfio SR-IOV adds them back.

> > > driver involved in the process. At that point we can discuss how to
> > > solve that specific problem. Maybe the solution will look something like
> > > this series, maybe it will look like something else. There is open
> > > design space.
> >
> > Yes doable, just will delay the LUO/PCI series by a bit and a much
> > bigger series.
>
> There will not be one "LUO/PCI series". There will be many incremental steps.

Oh, that is good then. I can still keep the PCI series as one incremental step.

>
> >
> > > Without approaching it this way, I don't see how we can't reasonably
> > > argue that anything in this series is necessary. And I suspect some
> > > parts of this series truly are unnecessary, at least in the short term.
> >
> > You have me on the double negatives, always not very good at those.
> > If the bigger series is what we want, I can do that. Just will have
> > some latency to get the VFIO.
>
> Oops, typo on my end. I meant "I don't see how we _can_ reasonably
> argue". I am saying we don't have enough justification (yet) for a lot
> of the code changes in this series.

Ack for removing driver callbacks in this series.

> > > In our internal implementation, the only dependent device that truly
> > > needed to participate is the PF driver when a VF is preserved.
> > > Everything else (e.g. pcieport callbacks) have just been no-ops.
> >
> > Your VF device does not need to preserve DMA? If you want to preserve
> > DMA the bus master bit is required, and the pcieport driver for the
> > PCI-PCI bridge is also required. I am not sure pure VF and PF without
> > any DMA makes practical sense.
>
> I'm saying the only drivers that actually needed to implement Live
> Update driver callbacks have been vfio-pci and PF drivers. The
> vfio-pci support doesn't exist yet upstream, and we are planning to
> use FD preservation there. So we don't know if driver callbacks will
> be needed for that. And we don't care about PF drivers until we get to
> supporting SR-IOV. So the driver callbacks all seem unnecessary at
> this point.

Ack.

>
> I totally agree we need to avoid clearing the bus master bit. Let's
> focus on that.

Ack.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Jason Gunthorpe 4 months, 1 week ago
On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote:
> I'm saying the only drivers that actually needed to implement Live
> Update driver callbacks have been vfio-pci and PF drivers. The
> vfio-pci support doesn't exist yet upstream, and we are planning to
> use FD preservation there. So we don't know if driver callbacks will
> be needed for that. 

I don't expect driver callbacks trough the pci subsystem, and I think
they should be removed from this series.

As I said the flow is backwards from what we want. The vfio driver
gets a luo FD from an ioctl, and it goes through and calls into luo
and pci with that session object to do all the required serialization.

Any required callbacks should be routed through luo based on creating
preserved objects within luo and providing ops to luo.

There is no other way to properly link things to sessions.

> And we don't care about PF drivers until we get to
> supporting SR-IOV. So the driver callbacks all seem unnecessary at
> this point.

I guess we will see, but I'm hoping we can get quite far using
vfio-pci as the SRIOV PF driver and don't need to try to get a big PF
in-kernel driver entangled in this.

Jason
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote:
> > I'm saying the only drivers that actually needed to implement Live
> > Update driver callbacks have been vfio-pci and PF drivers. The
> > vfio-pci support doesn't exist yet upstream, and we are planning to
> > use FD preservation there. So we don't know if driver callbacks will
> > be needed for that.
>
> I don't expect driver callbacks trough the pci subsystem, and I think
> they should be removed from this series.

Per suggestion from David as well. I will remove the driver callback
from the PCI series.

>
> As I said the flow is backwards from what we want. The vfio driver
> gets a luo FD from an ioctl, and it goes through and calls into luo
> and pci with that session object to do all the required serialization.
>
> Any required callbacks should be routed through luo based on creating
> preserved objects within luo and providing ops to luo.
>
> There is no other way to properly link things to sessions.

As David pointed out in the other email, the PCI series also supports
other non-vfio PCI devices, which do not have the FD and FD-related
sessions. That is the original intent of the LUO PCI subsystem.

> > And we don't care about PF drivers until we get to
> > supporting SR-IOV. So the driver callbacks all seem unnecessary at
> > this point.
>
> I guess we will see, but I'm hoping we can get quite far using
> vfio-pci as the SRIOV PF driver and don't need to try to get a big PF
> in-kernel driver entangled in this.

Yes, vfio-pci is a big series as well. Using a NIC might be an easier
way to get the PF DMA working with live update, but that will be
thrown away once we have vfio-pci as the real user. Actually, getting
the pci-pf-stub driver working would be a smaller, reasonable step to
justify the PF support in LUO PCI.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Jason Gunthorpe 4 months, 1 week ago
On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote:

> As David pointed out in the other email, the PCI also supports other
> non vfio PCI devices which do not have the FD and FD related sessions.
> That is the original intent for the LUO PCI subsystem.

This doesn't make sense. We don't know how to solve this problem yet,
but I'm pretty confident we will need to inject a FD and session into
these drivers too.

> away once we have the vfio-pci as the real user. Actually getting the
> pci-pf-stub driver working would be a smaller and reasonable step to
> justify the PF support in LUO PCI.

In this contex pci-pf-stub is useless, just use vfio-pci as the SRIOV
stub. I wouldn't invest in it. Especially since it creates more
complexity because we don't have an obvious way to get the session FD.

Jason
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Chris Li 4 months, 1 week ago
On Fri, Oct 3, 2025 at 5:06 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote:
>
> > As David pointed out in the other email, the PCI also supports other
> > non vfio PCI devices which do not have the FD and FD related sessions.
> > That is the original intent for the LUO PCI subsystem.
>
> This doesn't make sense. We don't know how to solve this problem yet,
> but I'm pretty confident we will need to inject a FD and session into
> these drivers too.

Ack. I can start hacking on hooking the PCI layer up to the vfio FD
and sessions. I am not sure how to do that at this point yet; I will
give it a stab and report back.

>
> > away once we have the vfio-pci as the real user. Actually getting the
> > pci-pf-stub driver working would be a smaller and reasonable step to
> > justify the PF support in LUO PCI.
>
> In this contex pci-pf-stub is useless, just use vfio-pci as the SRIOV
> stub. I wouldn't invest in it. Especially since it creates more
> complexity because we don't have an obvious way to get the session FD.

Ack. I will not do pci-pf-stub then.

Chris
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by David Matlack 4 months, 1 week ago
On Fri, Oct 3, 2025 at 5:06 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote:
>
> > As David pointed out in the other email, the PCI also supports other
> > non vfio PCI devices which do not have the FD and FD related sessions.
> > That is the original intent for the LUO PCI subsystem.
>
> This doesn't make sense. We don't know how to solve this problem yet,
> but I'm pretty confident we will need to inject a FD and session into
> these drivers too.

Google's LUO PCI subsystem (i.e. this series) predated a lot of the
discussion about FD preservation and needed to support the legacy vfio
container/group model. Outside of vfio-pci, the only other drivers
that participate are the PF drivers (pci-pf-stub and idpf), but they
just register empty callbacks.

So from an upstream perspective we don't really have a use case for
callbacks. Chris, I saw in your other email that you agree with
dropping them in the next version, so it sounds like we are aligned
then.

Vipin Sharma is working on the vfio-pci MVP series. Vipin, if you
anticipate VFIO is going to need driver callbacks on top of the LUO FD
callbacks, please chime in here.
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Vipin Sharma 4 months, 1 week ago
On Fri, Oct 3, 2025 at 9:27 AM David Matlack <dmatlack@google.com> wrote:
>
> On Fri, Oct 3, 2025 at 5:06 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote:
> >
> > > As David pointed out in the other email, the PCI also supports other
> > > non vfio PCI devices which do not have the FD and FD related sessions.
> > > That is the original intent for the LUO PCI subsystem.
> >
> > This doesn't make sense. We don't know how to solve this problem yet,
> > but I'm pretty confident we will need to inject a FD and session into
> > these drivers too.
>
> Google's LUO PCI subsystem (i.e. this series) predated a lot of the
> discussion about FD preservation and needed to support the legacy vfio
> container/group model. Outside of vfio-pci, the only other drivers
> that participate are the PF drivers (pci-pf-stub and idpf), but they
> just register empty callbacks.
>
> So from an upstream perspective we don't really have a usecase for
> callbacks. Chris ,I saw in your other email that you agree with
> dropping them in the next version, so it sounds like we are aligned
> then.
>
> Vipin Sharma is working on the vfio-pci MVP series. Vipin, if you
> anticipate VFIO is going to need driver callbacks on top of the LUO FD
> callbacks, please chime in here.

Currently, LUO FD callbacks are the only ones I need. I think we are
fine for now without PCI callbacks. I will have better clarity next
week but for now I am only using LUO FD callbacks.
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by David Matlack 4 months, 1 week ago
On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote:
> > And we don't care about PF drivers until we get to
> > supporting SR-IOV. So the driver callbacks all seem unnecessary at
> > this point.
>
> I guess we will see, but I'm hoping we can get quite far using
> vfio-pci as the SRIOV PF driver and don't need to try to get a big PF
> in-kernel driver entangled in this.

So far we have had to support vfio-pci, pci-pf-stub, and idpf as PF
drivers, and nvme looks like it's coming soon :(
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Jason Gunthorpe 4 months, 1 week ago
On Thu, Oct 02, 2025 at 04:42:17PM -0700, David Matlack wrote:
> On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote:
> > > And we don't care about PF drivers until we get to
> > > supporting SR-IOV. So the driver callbacks all seem unnecessary at
> > > this point.
> >
> > I guess we will see, but I'm hoping we can get quite far using
> > vfio-pci as the SRIOV PF driver and don't need to try to get a big PF
> > in-kernel driver entangled in this.
> 
> So far we have had to support vfio-pci, pci-pf-stub, and idpf as PF
> drivers, and nvme looks like it's coming soon :(

How much effort did you put into moving them to vfio though? Hack Hack
in the kernel is easy, but upstreaming may be very hard :\

Shutting down enough of the PF kernel driver to safely kexec is almost
the same as unbinding it completely.

Jason
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by David Matlack 4 months, 1 week ago
On Fri, Oct 3, 2025 at 5:04 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Oct 02, 2025 at 04:42:17PM -0700, David Matlack wrote:
> > On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote:
> > > > And we don't care about PF drivers until we get to
> > > > supporting SR-IOV. So the driver callbacks all seem unnecessary at
> > > > this point.
> > >
> > > I guess we will see, but I'm hoping we can get quite far using
> > > vfio-pci as the SRIOV PF driver and don't need to try to get a big PF
> > > in-kernel driver entangled in this.
> >
> > So far we have had to support vfio-pci, pci-pf-stub, and idpf as PF
> > drivers, and nvme looks like it's coming soon :(
>
> How much effort did you put into moving them to vfio though? Hack Hack
> in the kernel is easy, but upstreaming may be very hard :\
>
> Shutting down enough of the PF kernel driver to safely kexec is almost
> the same as unbinding it completely.

I think it's totally fair to tell us to replace pci-pf-stub with
vfio-pci. That gets rid of one PF driver.

idpf cannot be easily replaced with vfio-pci, since the PF is also
used for host networking. Brian Vazquez from Google will be giving a
talk about the idpf support at LPC so we can revisit this topic there.
We took the approach of only preserving the SR-IOV configuration in
the PF, everything else gets reset (so no DMA mapping preservation, no
driver state preservation, etc.).

We haven't looked into nvme yet so we'll have to revisit that discussion later.
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Jason Gunthorpe 4 months, 1 week ago
On Fri, Oct 03, 2025 at 09:03:36AM -0700, David Matlack wrote:
> > Shutting down enough of the PF kernel driver to safely kexec is almost
> > the same as unbinding it completely.
> 
> I think it's totally fair to tell us to replace pci-pf-stub with
> vfio-pci. That gets rid of one PF driver.
> 
> idpf cannot be easily replaced with vfio-pci, since the PF is also
> used for host networking.

Run host networking on a VF instead?

> Brian Vazquez from Google will be giving a
> talk about the idpf support at LPC so we can revisit this topic there.
> We took the approach of only preserving the SR-IOV configuration in
> the PF, everything else gets reset (so no DMA mapping preservation, no
> driver state preservation, etc.).

Yes, that's pretty much what you'd have to do, it sure would be nice
to have some helper to manage this to minimize driver work. It really
is remove the existing driver and just leave it idle unless luo fails
then rebind it..

> We haven't looked into nvme yet so we'll have to revisit that discussion later.

Put any host storage on a NVMe VF?

Jason
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by Pasha Tatashin 4 months, 1 week ago
On Fri, Oct 3, 2025 at 12:16 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Oct 03, 2025 at 09:03:36AM -0700, David Matlack wrote:
> > > Shutting down enough of the PF kernel driver to safely kexec is almost
> > > the same as unbinding it completely.
> >
> > I think it's totally fair to tell us to replace pci-pf-stub with
> > vfio-pci. That gets rid of one PF driver.
> >
> > idpf cannot be easily replaced with vfio-pci, since the PF is also
> > used for host networking.
>
> Run host networking on a VF instead?

There is a plan for this, but not immediately. In upstream, I suspect
vfio-pci is all we need, and other drivers can be added when it is really
necessary.

>
> > Brian Vazquez from Google will be giving a
> > talk about the idpf support at LPC so we can revisit this topic there.
> > We took the approach of only preserving the SR-IOV configuration in
> > the PF, everything else gets reset (so no DMA mapping preservation, no
> > driver state preservation, etc.).
>
> Yes, that's pretty much what you'd have to do, it sure would be nice
> to have some helper to manage this to minimize driver work. It really
> is remove the existing driver and just leave it idle unless luo fails
> then rebind it..
>
> > We haven't looked into nvme yet so we'll have to revisit that discussion later.
>
> Put any host storage on a NVMe VF?
>
> Jason
Re: [PATCH v2 03/10] PCI/LUO: Forward prepare()/freeze()/cancel() callbacks to driver
Posted by David Matlack 4 months, 1 week ago
On Fri, Oct 3, 2025 at 9:28 AM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> On Fri, Oct 3, 2025 at 12:16 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Fri, Oct 03, 2025 at 09:03:36AM -0700, David Matlack wrote:
> > > > Shutting down enough of the PF kernel driver to safely kexec is almost
> > > > the same as unbinding it completely.
> > >
> > > I think it's totally fair to tell us to replace pci-pf-stub with
> > > vfio-pci. That gets rid of one PF driver.
> > >
> > > idpf cannot be easily replaced with vfio-pci, since the PF is also
> > > used for host networking.
> >
> > Run host networking on a VF instead?
>
> There is a plan for this, but not immediately. In upstream, I suspect
> vfio-pci is all we need, and other drivers can be added when it really
> necessary.

Sounds good to me.