After the list of preserved devices is constructed, the PCI subsystem can
now forward the liveupdate request to the driver.
The PCI subsystem saves and restores a u64 of data through the LUO
callbacks. For each device, the PCI subsystem preserves a "dev_state"
struct (struct pci_dev_ser), which contains the packed path (domain +
bus + devfn) and a per-device u64 data field.
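For illustration only (the helper name below is hypothetical and not
part of this patch), the packed path could be decoded on the restore
side with the standard PCI lookup helper, assuming the
domain << 16 | bus << 8 | devfn encoding that pci_get_device_path()
uses later in this patch:

  /* Hypothetical helper, not part of this patch. */
  static struct pci_dev *pci_find_preserved_dev(u32 path)
  {
          /* path layout: domain << 16 | bus << 8 | devfn */
          return pci_get_domain_bus_and_slot(path >> 16,
                                             PCI_BUS_NUM(path & 0xffff),
                                             path & 0xff);
  }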
The device driver uses that u64 field to store its own driver state.
The device live update callbacks look very similar to the LUO subsystem
callbacks, with "void *arg" replaced by "struct device *dev".
In the prepare callback, the PCI subsystem allocates and then preserves
a folio big enough to hold the count and an array of all requested
device states (struct pci_dev_ser). The PCI subsystem simply forwards
each liveupdate callback with the u64 data pointing at the u64 field of
that device's entry in the state array.
If a device fails the prepare callback, every device that already
completed prepare successfully gets the cancel callback to clean up its
saved state. That cleanup is the one special case where the full list
is not walked.
In the other live update callbacks, every device on the preserved
device list gets the callback with its own u64 data field.
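As an illustration of the driver-side contract (the "foo" names below
are hypothetical and not part of this patch), a driver could preserve
its own state in a KHO folio and hand the physical address back through
the per-device u64 slot:

  static int foo_lu_prepare(struct device *dev, u64 *data)
  {
          struct folio *folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0);
          int ret;

          if (!folio)
                  return -ENOMEM;

          ret = kho_preserve_folio(folio);
          if (ret) {
                  folio_put(folio);
                  return ret;
          }

          /* ... serialize driver state into folio_address(folio) ... */
          *data = __pa(folio_address(folio));
          return 0;
  }

  static void foo_lu_cancel(struct device *dev, u64 data)
  {
          struct folio *folio = virt_to_folio(phys_to_virt(data));

          kho_unpreserve_folio(folio);
          folio_put(folio);
  }

  static const struct dev_liveupdate_ops foo_lu_ops = {
          .prepare = foo_lu_prepare,
          .cancel  = foo_lu_cancel,
  };

The driver would then point its driver->lu at foo_lu_ops (under
CONFIG_LIVEUPDATE), and the PCI subsystem forwards the prepare, freeze,
cancel and finish calls with that device's u64 slot.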
Signed-off-by: Chris Li <chrisl@kernel.org>
---
drivers/pci/liveupdate.c | 203 +++++++++++++++++++++++++++++++++++++++--
include/linux/dev_liveupdate.h | 23 +++++
include/linux/device/driver.h | 6 ++
3 files changed, 223 insertions(+), 9 deletions(-)
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index e8891844b8194dabf8d1e8e2d74d9c701bd741ca..2b215c224fb78c908579b0d22be713e1dc7ca21f 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -9,11 +9,25 @@
#define dev_fmt(fmt) "PCI liveupdate: " fmt
#include <linux/types.h>
+#include <linux/kexec_handover.h>
#include <linux/liveupdate.h>
#include "pci.h"
#define PCI_SUBSYSTEM_NAME "pci"
+static LIST_HEAD(preserved_devices);
+
+struct pci_dev_ser {
+ u32 path; /* domain + bus + slot + fn */
+ u32 flags;
+ u64 driver_data; /* driver data */
+};
+
+struct pci_ser {
+ u32 count;
+ struct pci_dev_ser devs[];
+};
+
static void stack_push_buses(struct list_head *stack, struct list_head *buses)
{
struct pci_bus *bus;
@@ -70,42 +84,213 @@ static int build_liveupdate_devices(struct list_head *head)
return count;
}
+static void dev_cleanup_liveupdate(struct device *dev)
+{
+ dev->lu.flags &= ~LU_DEPENDED;
+ list_del_init(&dev->lu.lu_next);
+}
+
static void cleanup_liveupdate_devices(struct list_head *head)
{
struct device *d, *n;
- list_for_each_entry_safe(d, n, head, lu.lu_next) {
- d->lu.flags &= ~LU_DEPENDED;
- list_del_init(&d->lu.lu_next);
+ list_for_each_entry_safe(d, n, head, lu.lu_next)
+ dev_cleanup_liveupdate(d);
+}
+
+static void cleanup_liveupdate_state(struct pci_ser *pci_state)
+{
+ struct folio *folio = virt_to_folio(pci_state);
+
+ kho_unpreserve_folio(folio);
+ folio_put(folio);
+}
+
+static void pci_call_cancel(struct pci_ser *pci_state)
+{
+ struct pci_dev_ser *si = pci_state->devs;
+ struct device *dev, *next;
+
+ list_for_each_entry_safe(dev, next, &preserved_devices, lu.lu_next) {
+ struct pci_dev_ser *s = si++;
+
+ if (!dev->driver)
+ panic("PCI liveupdate cancel: %s has no driver", dev_name(dev));
+ if (!dev->driver->lu)
+ panic("PCI liveupdate cancel: %s driver %s does not support liveupdate",
+ dev_name(dev), dev->driver->name ? : "(null name)");
+ if (dev->driver->lu->cancel)
+ dev->driver->lu->cancel(dev, s->driver_data);
+ dev_cleanup_liveupdate(dev);
}
}
-static int pci_liveupdate_prepare(void *arg, u64 *data)
+static int pci_get_device_path(struct pci_dev *pdev)
+{
+ return (pci_domain_nr(pdev->bus) << 16) | pci_dev_id(pdev);
+}
+
+static int pci_save_device_state(struct device *dev, struct pci_dev_ser *s)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ s->path = pci_get_device_path(pdev);
+ s->flags = dev->lu.flags;
+ return 0;
+}
+
+static int pci_call_prepare(struct pci_ser *pci_state,
+ struct list_head *devices)
+{
+ struct pci_dev_ser *pdev_state_current = pci_state->devs;
+ struct device *dev, *next;
+ int ret;
+ char *reason;
+
+ list_for_each_entry_safe(dev, next, devices, lu.lu_next) {
+ struct pci_dev_ser *s = pdev_state_current++;
+
+ if (!dev->driver) {
+ reason = "no driver";
+ ret = -ENOENT;
+ goto cancel;
+ }
+ if (!dev->driver->lu) {
+ reason = "driver does not support liveupdate";
+ ret = -EPERM;
+ goto cancel;
+ }
+ ret = pci_save_device_state(dev, s);
+ if (ret) {
+ reason = "save device state failed";
+ goto cancel;
+ }
+ if (dev->driver->lu->prepare) {
+ ret = dev->driver->lu->prepare(dev, &s->driver_data);
+ if (ret) {
+ reason = "prepare() failed";
+ goto cancel;
+ }
+ }
+ list_move_tail(&dev->lu.lu_next, &preserved_devices);
+ }
+ return 0;
+
+cancel:
+ dev_err(dev, "luo prepare failed %d (%s)\n", ret, reason);
+ pci_call_cancel(pci_state);
+ return ret;
+}
+
+static int __pci_liveupdate_prepare(void *arg, u64 *data)
{
LIST_HEAD(requested_devices);
+ struct pci_ser *pci_state;
+ int ret;
+ int count = build_liveupdate_devices(&requested_devices);
+ int size = sizeof(*pci_state) + sizeof(pci_state->devs[0]) * count;
+ int order = get_order(size);
+ struct folio *folio;
- pr_info("prepare data[%llx]\n", *data);
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+ if (!folio) {
+ ret = -ENOMEM;
+ goto cleanup_device;
+ }
- pci_lock_rescan_remove();
- down_write(&pci_bus_sem);
+ pci_state = folio_address(folio);
+ pci_state->count = count;
+
+ ret = kho_preserve_folio(folio);
+ if (ret) {
+ pr_err("kho_preserve_folio failed\n");
+ goto release_folio;
+ }
+
+ ret = pci_call_prepare(pci_state, &requested_devices);
+ if (ret)
+ goto unpreserve;
- build_liveupdate_devices(&requested_devices);
+ *data = __pa(pci_state);
+ pr_info("prepare data[%llx]\n", *data);
+ return 0;
+
+unpreserve:
+ kho_unpreserve_folio(folio);
+release_folio:
+ folio_put(folio);
+cleanup_device:
cleanup_liveupdate_devices(&requested_devices);
+ return ret;
+}
+static int pci_liveupdate_prepare(void *arg, u64 *data)
+{
+ int ret;
+
+ pci_lock_rescan_remove();
+ down_write(&pci_bus_sem);
+ ret = __pci_liveupdate_prepare(arg, data);
up_write(&pci_bus_sem);
pci_unlock_rescan_remove();
+ return ret;
+}
+
+static int pci_call_freeze(struct pci_ser *pci_state, struct list_head *devlist)
+{
+ struct pci_dev_ser *n = pci_state->devs;
+ struct device *dev;
+ int ret = 0;
+
+ list_for_each_entry(dev, devlist, lu.lu_next) {
+ struct pci_dev_ser *s = n++;
+
+ if (!dev->driver) {
+ if (!dev->parent)
+ continue;
+ panic("PCI liveupdate freeze: %s has no driver", dev_name(dev));
+ }
+ if (!dev->driver->lu->freeze)
+ continue;
+ ret = dev->driver->lu->freeze(dev, &s->driver_data);
+ if (ret) {
+ dev_err(dev, "luo freeze failed %d\n", ret);
+ pci_call_cancel(pci_state);
+ return ret;
+ }
+ }
return 0;
}
static int pci_liveupdate_freeze(void *arg, u64 *data)
{
+ struct pci_ser *pci_state = phys_to_virt(*data);
+ int ret;
+
pr_info("freeze data[%llx]\n", *data);
- return 0;
+ pci_lock_rescan_remove();
+ down_write(&pci_bus_sem);
+
+ ret = pci_call_freeze(pci_state, &preserved_devices);
+
+ up_write(&pci_bus_sem);
+ pci_unlock_rescan_remove();
+ return ret;
}
static void pci_liveupdate_cancel(void *arg, u64 data)
{
+ struct pci_ser *pci_state = phys_to_virt(data);
+
pr_info("cancel data[%llx]\n", data);
+ pci_lock_rescan_remove();
+ down_write(&pci_bus_sem);
+
+ pci_call_cancel(pci_state);
+ cleanup_liveupdate_state(pci_state);
+
+ up_write(&pci_bus_sem);
+ pci_unlock_rescan_remove();
}
static void pci_liveupdate_finish(void *arg, u64 data)
diff --git a/include/linux/dev_liveupdate.h b/include/linux/dev_liveupdate.h
index 72297cba08a999e89f7bc0997dabdbe14e0aa12c..80a723c7701ac4ddc2ddd03d0ffc9cc5a62a6083 100644
--- a/include/linux/dev_liveupdate.h
+++ b/include/linux/dev_liveupdate.h
@@ -20,6 +20,8 @@ enum liveupdate_flag {
#define LU_REQUESTED (LU_BUSMASTER)
#define LU_DEPENDED (LU_BUSMASTER_BRIDGE)
+struct device;
+
/**
* struct dev_liveupdate - Device state for live update operations
* @lu_next: List head for linking the device into live update
@@ -40,5 +42,26 @@ struct dev_liveupdate {
bool visited:1;
};
+/**
+ * struct dev_liveupdate_ops - Live Update callback functions
+ * @prepare: Prepare the device for the upcoming state transition. Drivers
+ * and buses should save the necessary device state.
+ * @freeze: A final notification before the system jumps to the new kernel.
+ * Called from reboot() syscall.
+ * @cancel: Cancel the live update process. Drivers should clean
+ * up any saved state if necessary.
+ * @finish: The system has completed a transition. Drivers and buses should
+ * have already restored the previously saved device state.
+ * Clean up any saved state or reset any unreclaimed device.
+ *
+ * This structure is used by drivers and buses to hold the callbacks from LUO.
+ */
+struct dev_liveupdate_ops {
+ int (*prepare)(struct device *dev, u64 *data);
+ int (*freeze)(struct device *dev, u64 *data);
+ void (*cancel)(struct device *dev, u64 data);
+ void (*finish)(struct device *dev, u64 data);
+};
+
#endif /* CONFIG_LIVEUPDATE */
#endif /* _LINUX_DEV_LIVEUPDATE_H */
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index cd8e0f0a634be9ea63ff22e89d66ada3b1a9eaf2..b2ba469cc3065a412f02230c62e811af19c4d2c6 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -19,6 +19,7 @@
#include <linux/pm.h>
#include <linux/device/bus.h>
#include <linux/module.h>
+#include <linux/dev_liveupdate.h>
/**
* enum probe_type - device driver probe type to try
@@ -80,6 +81,8 @@ enum probe_type {
* it is bound to the driver.
* @pm: Power management operations of the device which matched
* this driver.
+ * @lu: Live update callbacks; notify the device of the live
+ * update state and allow preserving the device across reboot.
* @coredump: Called when sysfs entry is written to. The device driver
* is expected to call the dev_coredump API resulting in a
* uevent.
@@ -116,6 +119,9 @@ struct device_driver {
const struct attribute_group **dev_groups;
const struct dev_pm_ops *pm;
+#ifdef CONFIG_LIVEUPDATE
+ const struct dev_liveupdate_ops *lu;
+#endif
void (*coredump) (struct device *dev);
struct driver_private *p;
--
2.51.0.384.g4c02a37b29-goog
On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > include/linux/dev_liveupdate.h | 23 +++++ > include/linux/device/driver.h | 6 ++ Driver core changes under the guise of only PCI changes? Please no. Break this series out properly, get the driver core stuff working FIRST, then show how multiple busses will work with them (i.e. you usually need 3 to know if you got it right). I'm guessing you will need/want PCI, platform, and something else? thanks, greg k-h
On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote: > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > > include/linux/dev_liveupdate.h | 23 +++++ > > include/linux/device/driver.h | 6 ++ > > Driver core changes under the guise of only PCI changes? Please no. There is a reason why I use the device struct rather than the pci_dev struct even though liveupdate currently only works with PCI devices. It comes down to the fact that the pci_bus and pci_host_bridge are not pci_dev struct. We need something that is common across all those three types of PCI related struct I care about(pci_dev, pci_bus, pci_host_bridge). The device struct is just common around those. I can move the dev_liveupdate struct into pci_bus, pci_host_bridge and pci_dev independently. That will be more contained inside PCI, not touching the device struct. The patch would be bigger because the data structure is spread into different structs. Do you have a preference which way to go? > Break this series out properly, get the driver core stuff working FIRST, > then show how multiple busses will work with them (i.e. you usually need > 3 to know if you got it right). Multiple buses you mean different types of bus, e.g. USB, PCI and others or 3 pci_bus is good enough? Right now we have no intention to support bus types other than PCI devices. The liveupdate is about preserving the GPU context cross kernel upgrade. Suggestion welcome. > I'm guessing you will need/want PCI, platform, and something else? This series only cares about PCI. The LUO series has subsystems. The PCI livedupate code is registered as an LUO subsystem. I guess the subsystem is close to the platform you have in mind? LUO also has the memfd in addition to the subsystem. Chris
On Thu, Oct 02, 2025 at 01:38:56PM -0700, Chris Li wrote: > On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman > <gregkh@linuxfoundation.org> wrote: > > > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > > > include/linux/dev_liveupdate.h | 23 +++++ > > > include/linux/device/driver.h | 6 ++ > > > > Driver core changes under the guise of only PCI changes? Please no. > > There is a reason why I use the device struct rather than the pci_dev > struct even though liveupdate currently only works with PCI devices. > It comes down to the fact that the pci_bus and pci_host_bridge are not > pci_dev struct. We need something that is common across all those > three types of PCI related struct I care about(pci_dev, pci_bus, > pci_host_bridge). The device struct is just common around those. I can > move the dev_liveupdate struct into pci_bus, pci_host_bridge and > pci_dev independently. That will be more contained inside PCI, not > touching the device struct. The patch would be bigger because the data > structure is spread into different structs. Do you have a preference > which way to go? If you only are caring about one single driver, don't mess with a subsystem or the driver core, just change the driver. My objection here was that you were claiming it was a PCI change, yet it was actually only touching the driver core which means that all devices in the systems for all Linux users will be affected. > > Break this series out properly, get the driver core stuff working FIRST, > > then show how multiple busses will work with them (i.e. you usually need > > 3 to know if you got it right). > > Multiple buses you mean different types of bus, e.g. USB, PCI and > others or 3 pci_bus is good enough? Right now we have no intention to > support bus types other than PCI devices. The liveupdate is about > preserving the GPU context cross kernel upgrade. Suggestion welcome. So all of this is just for one single driver. Ugh. Just do it in the single driver then, don't mess with the driver core, or even the PCI core. Just make it specific to the driver and then none of us will even notice the mess that this all creates :) thanks, greg k-h
On Thu, Oct 2, 2025 at 11:19 PM Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote: > > On Thu, Oct 02, 2025 at 01:38:56PM -0700, Chris Li wrote: > > On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman > > <gregkh@linuxfoundation.org> wrote: > > > > > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > > > > include/linux/dev_liveupdate.h | 23 +++++ > > > > include/linux/device/driver.h | 6 ++ > > > > > > Driver core changes under the guise of only PCI changes? Please no. > > > > There is a reason why I use the device struct rather than the pci_dev > > struct even though liveupdate currently only works with PCI devices. > > It comes down to the fact that the pci_bus and pci_host_bridge are not > > pci_dev struct. We need something that is common across all those > > three types of PCI related struct I care about(pci_dev, pci_bus, > > pci_host_bridge). The device struct is just common around those. I can > > move the dev_liveupdate struct into pci_bus, pci_host_bridge and > > pci_dev independently. That will be more contained inside PCI, not > > touching the device struct. The patch would be bigger because the data > > structure is spread into different structs. Do you have a preference > > which way to go? > > If you only are caring about one single driver, don't mess with a > subsystem or the driver core, just change the driver. My objection here It is more than just one driver, we have vfio-pci, idpf, pci-pf-stub and possible nvme driver. The change needs to happen in the PCI enumeration and probing as well, that is outside of the driver code. > was that you were claiming it was a PCI change, yet it was actually only > touching the driver core which means that all devices in the systems for In theory all the devices can be liveupdate preserved. But now we only support PCI. I can look into containing the change in PCI only and not touching the device struct if that is what you mean. I recall I tried that previously and failed because bus->bridge is a device struct rather than pci_dev or pci_host_bridge. I can try harder not to touch device structs. Patch will be bigger and more complex than this right now. But at least the damage is limited to PCI only if successful. > all Linux users will be affected. I understand your concerns. I was wishing one day all devices could support liveupdate, but that is not the case right now. > > > Break this series out properly, get the driver core stuff working FIRST, > > > then show how multiple busses will work with them (i.e. you usually need > > > 3 to know if you got it right). > > > > Multiple buses you mean different types of bus, e.g. USB, PCI and > > others or 3 pci_bus is good enough? Right now we have no intention to > > support bus types other than PCI devices. The liveupdate is about > > preserving the GPU context cross kernel upgrade. Suggestion welcome. > > So all of this is just for one single driver. Ugh. Just do it in the > single driver then, don't mess with the driver core, or even the PCI Not a single driver. It is the whole PCI core. Or in your book the whole PCI is just one single driver? > core. Just make it specific to the driver and then none of us will even > notice the mess that this all creates :) OK. Let me try that PCI only approach again and try it harder. I will report back. Thanks for the feedback. Chris
On Fri, Oct 03, 2025 at 12:26:01AM -0700, Chris Li wrote: > On Thu, Oct 2, 2025 at 11:19 PM Greg Kroah-Hartman > <gregkh@linuxfoundation.org> wrote: > > > > On Thu, Oct 02, 2025 at 01:38:56PM -0700, Chris Li wrote: > > > On Tue, Sep 30, 2025 at 8:30 AM Greg Kroah-Hartman > > > <gregkh@linuxfoundation.org> wrote: > > > > > > > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > > > > > include/linux/dev_liveupdate.h | 23 +++++ > > > > > include/linux/device/driver.h | 6 ++ > > > > > > > > Driver core changes under the guise of only PCI changes? Please no. > > > > > > There is a reason why I use the device struct rather than the pci_dev > > > struct even though liveupdate currently only works with PCI devices. > > > It comes down to the fact that the pci_bus and pci_host_bridge are not > > > pci_dev struct. We need something that is common across all those > > > three types of PCI related struct I care about(pci_dev, pci_bus, > > > pci_host_bridge). The device struct is just common around those. I can > > > move the dev_liveupdate struct into pci_bus, pci_host_bridge and > > > pci_dev independently. That will be more contained inside PCI, not > > > touching the device struct. The patch would be bigger because the data > > > structure is spread into different structs. Do you have a preference > > > which way to go? > > > > If you only are caring about one single driver, don't mess with a > > subsystem or the driver core, just change the driver. My objection here > > It is more than just one driver, we have vfio-pci, idpf, pci-pf-stub > and possible nvme driver. Why is nvme considered a "GPU" that needs context saved? > The change needs to happen in the PCI enumeration and probing as well, > that is outside of the driver code. So all just PCI drivers? Then keep this in PCI-only please, and don't touch the driver core. > > was that you were claiming it was a PCI change, yet it was actually only > > touching the driver core which means that all devices in the systems for > > In theory all the devices can be liveupdate preserved. But now we only > support PCI. Then for now, only focus on PCI. thanks, greg k-h
On Fri, Oct 3, 2025 at 5:26 AM Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote: > > On Fri, Oct 03, 2025 at 12:26:01AM -0700, Chris Li wrote: > > > It is more than just one driver, we have vfio-pci, idpf, pci-pf-stub > > and possible nvme driver. > > Why is nvme considered a "GPU" that needs context saved? NVME is not a GPU. The internal reason to have NVME participate in the liveupdate is because the NVME shutdown of the IO queue is very slow, it contributes the largest chunk of delay in the black out window for liveupdate. The NVME participation is just an optimization to avoid resetting the NVME queue. Consider it as (optional ) speed optimization. > > The change needs to happen in the PCI enumeration and probing as well, > > that is outside of the driver code. > > So all just PCI drivers? Then keep this in PCI-only please, and don't > touch the driver core. Ack. Will do. > > > > was that you were claiming it was a PCI change, yet it was actually only > > > touching the driver core which means that all devices in the systems for > > > > In theory all the devices can be liveupdate preserved. But now we only > > support PCI. > > Then for now, only focus on PCI. Agree, thanks for the alignment. Chris
On Fri, Oct 3, 2025 at 10:49 AM Chris Li <chrisl@kernel.org> wrote: > > On Fri, Oct 3, 2025 at 5:26 AM Greg Kroah-Hartman > <gregkh@linuxfoundation.org> wrote: > > > > On Fri, Oct 03, 2025 at 12:26:01AM -0700, Chris Li wrote: > > > > > It is more than just one driver, we have vfio-pci, idpf, pci-pf-stub > > > and possible nvme driver. > > > > Why is nvme considered a "GPU" that needs context saved? > > NVME is not a GPU. The internal reason to have NVME participate in the > liveupdate is because the NVME shutdown of the IO queue is very slow, > it contributes the largest chunk of delay in the black out window for > liveupdate. The NVME participation is just an optimization to avoid > resetting the NVME queue. Consider it as (optional ) speed > optimization. This is not true. We haven't made any changes to the nvme driver within Google for Live Update. The reason I mentioned nvme in another email chain is because Google has some hosts where we want to preserve VFs bound to vfio-pci across Live Update where the PF is bound to nvme. But Jason is suggesting we seriously explore switching the PF driver to vfio-pci before trying to upstream nvme support for Live Update, which I think is fair.
On Fri, Oct 3, 2025 at 11:28 AM David Matlack <dmatlack@google.com> wrote: > > On Fri, Oct 3, 2025 at 10:49 AM Chris Li <chrisl@kernel.org> wrote: > > > > On Fri, Oct 3, 2025 at 5:26 AM Greg Kroah-Hartman > > NVME is not a GPU. The internal reason to have NVME participate in the > > liveupdate is because the NVME shutdown of the IO queue is very slow, > > it contributes the largest chunk of delay in the black out window for > > liveupdate. The NVME participation is just an optimization to avoid > > resetting the NVME queue. Consider it as (optional ) speed > > optimization. > > This is not true. We haven't made any changes to the nvme driver > within Google for Live Update. > > The reason I mentioned nvme in another email chain is because Google > has some hosts where we want to preserve VFs bound to vfio-pci across > Live Update where the PF is bound to nvme. But Jason is suggesting we > seriously explore switching the PF driver to vfio-pci before trying to > upstream nvme support for Live Update, which I think is fair. Ah, thanks for the clarification and sorry for my confusion. I think I was thinking of a different storage driver, not the NVME you have in mind. Chris
On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > After the list of preserved devices is constructed, the PCI subsystem can > now forward the liveupdate request to the driver. This also seems completely backwards for how iommufd should be working. It doesn't want callbacks triggered on prepare, it wants to drive everything from its own ioctl. Let's just do one thing at a time please and make this series about iommufd to match the other luo series for iommufd. non-iommufd cases can be proposed in their own series. Jason
On Mon, Sep 29, 2025 at 10:48 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > > After the list of preserved devices is constructed, the PCI subsystem can > > now forward the liveupdate request to the driver. > > This also seems completely backwards for how iommufd should be > working. It doesn't want callbacks triggered on prepare, it wants to > drive everything from its own ioctl. This series is about basic PCI device support, not IOMMUFD. > Let's just do one thing at a time please and make this series about > iommufd to match the other luo series for iommufd. I am confused by you. > non-iommufd cases can be proposed in their own series. This is that non-iommufd series. Chris
On Mon, Sep 29, 2025 at 07:11:06PM -0700, Chris Li wrote: > On Mon, Sep 29, 2025 at 10:48 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > > > After the list of preserved devices is constructed, the PCI subsystem can > > > now forward the liveupdate request to the driver. > > > > This also seems completely backwards for how iommufd should be > > working. It doesn't want callbacks triggered on prepare, it wants to > > drive everything from its own ioctl. > > This series is about basic PCI device support, not IOMMUFD. > > > Let's just do one thing at a time please and make this series about > > iommufd to match the other luo series for iommufd. > > I am confused by you. > > > non-iommufd cases can be proposed in their own series. > > This is that non-iommufd series. Then don't do generic devices until we get iommufd done and you have a meaningful in-tree driver to consume what you are adding. Jason
On Tue, Sep 30, 2025 at 9:38 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > non-iommufd cases can be proposed in their own series. > > > > This is that non-iommufd series. > > Then don't do generic devices until we get iommufd done and you have a > meaningful in-tree driver to consume what you are adding. Thanks for the suggestion. See the above explanation why I add to device struct. Because it was contained in both pci_dev, pci_bus and pci_host_bridge struct. All PCI related, but device struct is the common base. I can move them into three PCI related struct separately then leave the device struct alone. Will that be better? I did add the pci-lu-test driver to consume and test the LUO/PCI series for the bus master bits. Are you suggesting the LUO/PCI should wait until VFIO can consume the LUO/PCI changes then send them out as a series. That will delay the LUO/PCI a bit for the VFIO. Chris
On 2025-09-30 01:38 PM, Jason Gunthorpe wrote: > On Mon, Sep 29, 2025 at 07:11:06PM -0700, Chris Li wrote: > > On Mon, Sep 29, 2025 at 10:48 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > > On Tue, Sep 16, 2025 at 12:45:11AM -0700, Chris Li wrote: > > > > After the list of preserved devices is constructed, the PCI subsystem can > > > > now forward the liveupdate request to the driver. > > > > > > This also seems completely backwards for how iommufd should be > > > working. It doesn't want callbacks triggered on prepare, it wants to > > > drive everything from its own ioctl. > > > > This series is about basic PCI device support, not IOMMUFD. > > > > > Let's just do one thing at a time please and make this series about > > > iommufd to match the other luo series for iommufd. > > > > I am confused by you. > > > > > non-iommufd cases can be proposed in their own series. > > > > This is that non-iommufd series. > > Then don't do generic devices until we get iommufd done and you have a > meaningful in-tree driver to consume what you are adding. I agree with Jason. I don't think we can reasonably make the argument that we need this series until we have actualy use-cases for it. I think we should focus on vfio-pci device preservation next, and use that to incrementally drive whatever changes are necessary to the PCI and generic device layer bit by bit. For example, once we a basic vfio-pci device preservation working, we can start thinking about how to handle when that device is a VF, and we have to start also preserving the SR-IOV state on the PF and get the PF driver involved in the process. At that point we can discuss how to solve that specific problem. Maybe the solution will look something like this series, maybe it will look like something else. There is open design space. Without approaching it this way, I don't see how we can't reasonably argue that anything in this series is necessary. And I suspect some parts of this series truly are unnecessary, at least in the short term. In our internal implementation, the only dependent device that truly needed to participate is the PF driver when a VF is preserved. Everything else (e.g. pcieport callbacks) have just been no-ops.
On Thu, Oct 2, 2025 at 11:54 AM David Matlack <dmatlack@google.com> wrote: > > Then don't do generic devices until we get iommufd done and you have a > > meaningful in-tree driver to consume what you are adding. > > I agree with Jason. I don't think we can reasonably make the argument > that we need this series until we have actualy use-cases for it. > > I think we should focus on vfio-pci device preservation next, and use > that to incrementally drive whatever changes are necessary to the PCI > and generic device layer bit by bit. The feedback I got for the PCI V1 was to make it as minimal as possible. We agree preserving the BUS MASTER bit is the first minimal step. That is what I did in the V2 phase I series. Only the bus master. I think the pci-lu-test driver did demo the bus master bit, it is not vfio yet. At least that was the plan shared in the upstream alignment meeting. > For example, once we a basic vfio-pci device preservation working, we > can start thinking about how to handle when that device is a VF, and we > have to start also preserving the SR-IOV state on the PF and get the PF SR-IOV is a much bigger step than the BUS Master bit. I recall at one point in the upstream discussion meeting that we don't do SR-IOV as the first step. I am not opposed to it, we need to get to vfio and SR-IOV eventually. I just feel that the PCI + VFIO + SR-IOV will be a much bigger series. I worry the series size is not friendly for reviewers. I wish there could be smaller incremental steps digestible. > driver involved in the process. At that point we can discuss how to > solve that specific problem. Maybe the solution will look something like > this series, maybe it will look like something else. There is open > design space. Yes doable, just will delay the LUO/PCI series by a bit and a much bigger series. > Without approaching it this way, I don't see how we can't reasonably > argue that anything in this series is necessary. And I suspect some > parts of this series truly are unnecessary, at least in the short term. You have me on the double negatives, always not very good at those. If the bigger series is what we want, I can do that. Just will have some latency to get the VFIO. > In our internal implementation, the only dependent device that truly > needed to participate is the PF driver when a VF is preserved. > Everything else (e.g. pcieport callbacks) have just been no-ops. Your VF device does not need to preserve DMA? If you want to preserve DMA the bus master bit is required, and the pcieport driver for the PCI-PCI bridge is also required. I am not sure pure VF and PF without any DMA makes practical sense. Chris
On Thu, Oct 2, 2025 at 1:58 PM Chris Li <chrisl@kernel.org> wrote: > > On Thu, Oct 2, 2025 at 11:54 AM David Matlack <dmatlack@google.com> wrote: > > > Then don't do generic devices until we get iommufd done and you have a > > > meaningful in-tree driver to consume what you are adding. > > > > I agree with Jason. I don't think we can reasonably make the argument > > that we need this series until we have actualy use-cases for it. > > > > I think we should focus on vfio-pci device preservation next, and use > > that to incrementally drive whatever changes are necessary to the PCI > > and generic device layer bit by bit. > > The feedback I got for the PCI V1 was to make it as minimal as > possible. We agree preserving the BUS MASTER bit is the first minimal > step. That is what I did in the V2 phase I series. Only the bus > master. I think the pci-lu-test driver did demo the bus master bit, it > is not vfio yet. At least that was the plan shared in the upstream > alignment meeting. What do the driver callbacks in patch 3 and patches 5-8 have to do with preserving the bus master bit? That's half the series. > > > For example, once we a basic vfio-pci device preservation working, we > > can start thinking about how to handle when that device is a VF, and we > > have to start also preserving the SR-IOV state on the PF and get the PF > > SR-IOV is a much bigger step than the BUS Master bit. I recall at one > point in the upstream discussion meeting that we don't do SR-IOV as > the first step. I am not opposed to it, we need to get to vfio and > SR-IOV eventually. I just feel that the PCI + VFIO + SR-IOV will be a > much bigger series. I worry the series size is not friendly for > reviewers. I wish there could be smaller incremental steps digestible. SR-IOV is not a first step, of course. That's not what I said. I'm saying SR-IOV is future work that could justify some of the larger changes in this series (e.g. driver callbacks). So I am suggesting we revisit those changes when we are working on SR-IOV. > > > driver involved in the process. At that point we can discuss how to > > solve that specific problem. Maybe the solution will look something like > > this series, maybe it will look like something else. There is open > > design space. > > Yes doable, just will delay the LUO/PCI series by a bit and a much > bigger series. There will not be one "LUO/PCI series". There will be many incremental steps. > > > Without approaching it this way, I don't see how we can't reasonably > > argue that anything in this series is necessary. And I suspect some > > parts of this series truly are unnecessary, at least in the short term. > > You have me on the double negatives, always not very good at those. > If the bigger series is what we want, I can do that. Just will have > some latency to get the VFIO. Oops, typo on my end. I meant "I don't see how we _can_ reasonably argue". I am saying we don't have enough justification (yet) for a lot of the code changes in this series. > > > In our internal implementation, the only dependent device that truly > > needed to participate is the PF driver when a VF is preserved. > > Everything else (e.g. pcieport callbacks) have just been no-ops. > > Your VF device does not need to preserve DMA? If you want to preserve > DMA the bus master bit is required, and the pcieport driver for the > PCI-PCI bridge is also required. I am not sure pure VF and PF without > any DMA makes practical sense. 
I'm saying the only drivers that actually needed to implement Live Update driver callbacks have been vfio-pci and PF drivers. The vfio-pci support doesn't exist yet upstream, and we are planning to use FD preservation there. So we don't know if driver callbacks will be needed for that. And we don't care about PF drivers until we get to supporting SR-IOV. So the driver callbacks all seem unnecessary at this point. I totally agree we need to avoid clearing the bus master bit. Let's focus on that.
On Thu, Oct 2, 2025 at 2:31 PM David Matlack <dmatlack@google.com> wrote: > > On Thu, Oct 2, 2025 at 1:58 PM Chris Li <chrisl@kernel.org> wrote: > > > > On Thu, Oct 2, 2025 at 11:54 AM David Matlack <dmatlack@google.com> wrote: > > > > Then don't do generic devices until we get iommufd done and you have a > > > > meaningful in-tree driver to consume what you are adding. > > > > > > I agree with Jason. I don't think we can reasonably make the argument > > > that we need this series until we have actualy use-cases for it. > > > > > > I think we should focus on vfio-pci device preservation next, and use > > > that to incrementally drive whatever changes are necessary to the PCI > > > and generic device layer bit by bit. > > > > The feedback I got for the PCI V1 was to make it as minimal as > > possible. We agree preserving the BUS MASTER bit is the first minimal > > step. That is what I did in the V2 phase I series. Only the bus > > master. I think the pci-lu-test driver did demo the bus master bit, it > > is not vfio yet. At least that was the plan shared in the upstream > > alignment meeting. > > What do the driver callbacks in patch 3 and patches 5-8 have to do > with preserving the bus master bit? That's half the series. I was thinking the pcie driver is for preserving the bus master bit on the bridge. I just realized that the actual bus master bit preserve is at the pci_dev level. The pcie driver call back is effectively no op. Yes, I can delete those from the series and not save driver data at all. Points taken. > > > > > > For example, once we a basic vfio-pci device preservation working, we > > > can start thinking about how to handle when that device is a VF, and we > > > have to start also preserving the SR-IOV state on the PF and get the PF > > > > SR-IOV is a much bigger step than the BUS Master bit. I recall at one > > point in the upstream discussion meeting that we don't do SR-IOV as > > the first step. I am not opposed to it, we need to get to vfio and > > SR-IOV eventually. I just feel that the PCI + VFIO + SR-IOV will be a > > much bigger series. I worry the series size is not friendly for > > reviewers. I wish there could be smaller incremental steps digestible. > > SR-IOV is not a first step, of course. That's not what I said. I'm > saying SR-IOV is future work that could justify some of the larger > changes in this series (e.g. driver callbacks). So I am suggesting we > revisit those changes when we are working on SR-IOV. Just to confirm my understanding aligned with you. We remove the driver callbacks in PCI series until the vfio SR-IOV to add them back. > > > driver involved in the process. At that point we can discuss how to > > > solve that specific problem. Maybe the solution will look something like > > > this series, maybe it will look like something else. There is open > > > design space. > > > > Yes doable, just will delay the LUO/PCI series by a bit and a much > > bigger series. > > There will not be one "LUO/PCI series". There will be many incremental steps. Oh, that is good then. I can still keep the PCI series as one incremental step. > > > > > > Without approaching it this way, I don't see how we can't reasonably > > > argue that anything in this series is necessary. And I suspect some > > > parts of this series truly are unnecessary, at least in the short term. > > > > You have me on the double negatives, always not very good at those. > > If the bigger series is what we want, I can do that. Just will have > > some latency to get the VFIO. 
> > Oops, typo on my end. I meant "I don't see how we _can_ reasonably > argue". I am saying we don't have enough justification (yet) for a lot > of the code changes in this series. Ack for removing driver callbacks in this series. > > > In our internal implementation, the only dependent device that truly > > > needed to participate is the PF driver when a VF is preserved. > > > Everything else (e.g. pcieport callbacks) have just been no-ops. > > > > Your VF device does not need to preserve DMA? If you want to preserve > > DMA the bus master bit is required, and the pcieport driver for the > > PCI-PCI bridge is also required. I am not sure pure VF and PF without > > any DMA makes practical sense. > > I'm saying the only drivers that actually needed to implement Live > Update driver callbacks have been vfio-pci and PF drivers. The > vfio-pci support doesn't exist yet upstream, and we are planning to > use FD preservation there. So we don't know if driver callbacks will > be needed for that. And we don't care about PF drivers until we get to > supporting SR-IOV. So the driver callbacks all seem unnecessary at > this point. Ack. > > I totally agree we need to avoid clearing the bus master bit. Let's > focus on that. Ack. Chris
On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote: > I'm saying the only drivers that actually needed to implement Live > Update driver callbacks have been vfio-pci and PF drivers. The > vfio-pci support doesn't exist yet upstream, and we are planning to > use FD preservation there. So we don't know if driver callbacks will > be needed for that. I don't expect driver callbacks trough the pci subsystem, and I think they should be removed from this series. As I said the flow is backwards from what we want. The vfio driver gets a luo FD from an ioctl, and it goes through and calls into luo and pci with that session object to do all the required serialization. Any required callbacks should be routed through luo based on creating preserved objects within luo and providing ops to luo. There is no other way to properly link things to sessions. > And we don't care about PF drivers until we get to > supporting SR-IOV. So the driver callbacks all seem unnecessary at > this point. I guess we will see, but I'm hoping we can get quite far using vfio-pci as the SRIOV PF driver and don't need to try to get a big PF in-kernel driver entangled in this. Jason
On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote: > > I'm saying the only drivers that actually needed to implement Live > > Update driver callbacks have been vfio-pci and PF drivers. The > > vfio-pci support doesn't exist yet upstream, and we are planning to > > use FD preservation there. So we don't know if driver callbacks will > > be needed for that. > > I don't expect driver callbacks trough the pci subsystem, and I think > they should be removed from this series. Per suggestion from David as well. I will remove the driver callback from the PCI series. > > As I said the flow is backwards from what we want. The vfio driver > gets a luo FD from an ioctl, and it goes through and calls into luo > and pci with that session object to do all the required serialization. > > Any required callbacks should be routed through luo based on creating > preserved objects within luo and providing ops to luo. > > There is no other way to properly link things to sessions. As David pointed out in the other email, the PCI also supports other non vfio PCI devices which do not have the FD and FD related sessions. That is the original intent for the LUO PCI subsystem. > > And we don't care about PF drivers until we get to > > supporting SR-IOV. So the driver callbacks all seem unnecessary at > > this point. > > I guess we will see, but I'm hoping we can get quite far using > vfio-pci as the SRIOV PF driver and don't need to try to get a big PF > in-kernel driver entangled in this. Yes, vfio-pci is a big series as well. Getting a NIC might be easier to get the PF DMA working with a live update but that will be thrown away once we have the vfio-pci as the real user. Actually getting the pci-pf-stub driver working would be a smaller and reasonable step to justify the PF support in LUO PCI. Chris
On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote: > As David pointed out in the other email, the PCI also supports other > non vfio PCI devices which do not have the FD and FD related sessions. > That is the original intent for the LUO PCI subsystem. This doesn't make sense. We don't know how to solve this problem yet, but I'm pretty confident we will need to inject a FD and session into these drivers too. > away once we have the vfio-pci as the real user. Actually getting the > pci-pf-stub driver working would be a smaller and reasonable step to > justify the PF support in LUO PCI. In this contex pci-pf-stub is useless, just use vfio-pci as the SRIOV stub. I wouldn't invest in it. Especially since it creates more complexity because we don't have an obvious way to get the session FD. Jason
On Fri, Oct 3, 2025 at 5:06 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote: > > > As David pointed out in the other email, the PCI also supports other > > non vfio PCI devices which do not have the FD and FD related sessions. > > That is the original intent for the LUO PCI subsystem. > > This doesn't make sense. We don't know how to solve this problem yet, > but I'm pretty confident we will need to inject a FD and session into > these drivers too. Ack. I can start hacking on hook up the PCI layer to the vfio FD and sessions. Not sure how to do that at this point yet, I will give it a stab and report back. > > > away once we have the vfio-pci as the real user. Actually getting the > > pci-pf-stub driver working would be a smaller and reasonable step to > > justify the PF support in LUO PCI. > > In this contex pci-pf-stub is useless, just use vfio-pci as the SRIOV > stub. I wouldn't invest in it. Especially since it creates more > complexity because we don't have an obvious way to get the session FD. Ack. I will not do pci-pf-stub then. Chris
On Fri, Oct 3, 2025 at 5:06 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote: > > > As David pointed out in the other email, the PCI also supports other > > non vfio PCI devices which do not have the FD and FD related sessions. > > That is the original intent for the LUO PCI subsystem. > > This doesn't make sense. We don't know how to solve this problem yet, > but I'm pretty confident we will need to inject a FD and session into > these drivers too. Google's LUO PCI subsystem (i.e. this series) predated a lot of the discussion about FD preservation and needed to support the legacy vfio container/group model. Outside of vfio-pci, the only other drivers that participate are the PF drivers (pci-pf-stub and idpf), but they just register empty callbacks. So from an upstream perspective we don't really have a usecase for callbacks. Chris ,I saw in your other email that you agree with dropping them in the next version, so it sounds like we are aligned then. Vipin Sharma is working on the vfio-pci MVP series. Vipin, if you anticipate VFIO is going to need driver callbacks on top of the LUO FD callbacks, please chime in here.
On Fri, Oct 3, 2025 at 9:27 AM David Matlack <dmatlack@google.com> wrote: > > On Fri, Oct 3, 2025 at 5:06 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > On Thu, Oct 02, 2025 at 10:24:59PM -0700, Chris Li wrote: > > > > > As David pointed out in the other email, the PCI also supports other > > > non vfio PCI devices which do not have the FD and FD related sessions. > > > That is the original intent for the LUO PCI subsystem. > > > > This doesn't make sense. We don't know how to solve this problem yet, > > but I'm pretty confident we will need to inject a FD and session into > > these drivers too. > > Google's LUO PCI subsystem (i.e. this series) predated a lot of the > discussion about FD preservation and needed to support the legacy vfio > container/group model. Outside of vfio-pci, the only other drivers > that participate are the PF drivers (pci-pf-stub and idpf), but they > just register empty callbacks. > > So from an upstream perspective we don't really have a usecase for > callbacks. Chris ,I saw in your other email that you agree with > dropping them in the next version, so it sounds like we are aligned > then. > > Vipin Sharma is working on the vfio-pci MVP series. Vipin, if you > anticipate VFIO is going to need driver callbacks on top of the LUO FD > callbacks, please chime in here. Currently, LUO FD callbacks are the only ones I need. I think we are fine for now without PCI callbacks. I will have better clarity next week but for now I am only using LUO FD callbacks.
On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote: > > And we don't care about PF drivers until we get to > > supporting SR-IOV. So the driver callbacks all seem unnecessary at > > this point. > > I guess we will see, but I'm hoping we can get quite far using > vfio-pci as the SRIOV PF driver and don't need to try to get a big PF > in-kernel driver entangled in this. So far we have had to support vfio-pci, pci-pf-stub, and idpf as PF drivers, and nvme looks like it's coming soon :(
On Thu, Oct 02, 2025 at 04:42:17PM -0700, David Matlack wrote: > On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote: > > > And we don't care about PF drivers until we get to > > > supporting SR-IOV. So the driver callbacks all seem unnecessary at > > > this point. > > > > I guess we will see, but I'm hoping we can get quite far using > > vfio-pci as the SRIOV PF driver and don't need to try to get a big PF > > in-kernel driver entangled in this. > > So far we have had to support vfio-pci, pci-pf-stub, and idpf as PF > drivers, and nvme looks like it's coming soon :( How much effort did you put into moving them to vfio though? Hack Hack in the kernel is easy, but upstreaming may be very hard :\ Shutting down enough of the PF kernel driver to safely kexec is almost the same as unbinding it completely. Jason
On Fri, Oct 3, 2025 at 5:04 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Thu, Oct 02, 2025 at 04:42:17PM -0700, David Matlack wrote: > > On Thu, Oct 2, 2025 at 4:21 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > On Thu, Oct 02, 2025 at 02:31:08PM -0700, David Matlack wrote: > > > > And we don't care about PF drivers until we get to > > > > supporting SR-IOV. So the driver callbacks all seem unnecessary at > > > > this point. > > > > > > I guess we will see, but I'm hoping we can get quite far using > > > vfio-pci as the SRIOV PF driver and don't need to try to get a big PF > > > in-kernel driver entangled in this. > > > > So far we have had to support vfio-pci, pci-pf-stub, and idpf as PF > > drivers, and nvme looks like it's coming soon :( > > How much effort did you put into moving them to vfio though? Hack Hack > in the kernel is easy, but upstreaming may be very hard :\ > > Shutting down enough of the PF kernel driver to safely kexec is almost > the same as unbinding it completely. I think it's totally fair to tell us to replace pci-pf-stub with vfio-pci. That gets rid of one PF driver. idpf cannot be easily replaced with vfio-pci, since the PF is also used for host networking. Brian Vazquez from Google will be giving a talk about the idpf support at LPC so we can revisit this topic there. We took the approach of only preserving the SR-IOV configuration in the PF, everything else gets reset (so no DMA mapping preservation, no driver state preservation, etc.). We haven't looked into nvme yet so we'll have to revisit that discussion later.
On Fri, Oct 03, 2025 at 09:03:36AM -0700, David Matlack wrote: > > Shutting down enough of the PF kernel driver to safely kexec is almost > > the same as unbinding it completely. > > I think it's totally fair to tell us to replace pci-pf-stub with > vfio-pci. That gets rid of one PF driver. > > idpf cannot be easily replaced with vfio-pci, since the PF is also > used for host networking. Run host networking on a VF instead? > Brian Vazquez from Google will be giving a > talk about the idpf support at LPC so we can revisit this topic there. > We took the approach of only preserving the SR-IOV configuration in > the PF, everything else gets reset (so no DMA mapping preservation, no > driver state preservation, etc.). Yes, that's pretty much what you'd have to do, it sure would be nice to have some helper to manage this to minimize driver work. It really is remove the existing driver and just leave it idle unless luo fails then rebind it.. > We haven't looked into nvme yet so we'll have to revisit that discussion later. Put any host storage on a NVMe VF? Jason
On Fri, Oct 3, 2025 at 12:16 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Fri, Oct 03, 2025 at 09:03:36AM -0700, David Matlack wrote: > > > Shutting down enough of the PF kernel driver to safely kexec is almost > > > the same as unbinding it completely. > > > > I think it's totally fair to tell us to replace pci-pf-stub with > > vfio-pci. That gets rid of one PF driver. > > > > idpf cannot be easily replaced with vfio-pci, since the PF is also > > used for host networking. > > Run host networking on a VF instead? There is a plan for this, but not immediately. In upstream, I suspect vfio-pci is all we need, and other drivers can be added when it really necessary. > > > Brian Vazquez from Google will be giving a > > talk about the idpf support at LPC so we can revisit this topic there. > > We took the approach of only preserving the SR-IOV configuration in > > the PF, everything else gets reset (so no DMA mapping preservation, no > > driver state preservation, etc.). > > Yes, that's pretty much what you'd have to do, it sure would be nice > to have some helper to manage this to minimize driver work. It really > is remove the existing driver and just leave it idle unless luo fails > then rebind it.. > > > We haven't looked into nvme yet so we'll have to revisit that discussion later. > > Put any host storage on a NVMe VF? > > Jason
On Fri, Oct 3, 2025 at 9:28 AM Pasha Tatashin <pasha.tatashin@soleen.com> wrote: > > On Fri, Oct 3, 2025 at 12:16 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > On Fri, Oct 03, 2025 at 09:03:36AM -0700, David Matlack wrote: > > > > Shutting down enough of the PF kernel driver to safely kexec is almost > > > > the same as unbinding it completely. > > > > > > I think it's totally fair to tell us to replace pci-pf-stub with > > > vfio-pci. That gets rid of one PF driver. > > > > > > idpf cannot be easily replaced with vfio-pci, since the PF is also > > > used for host networking. > > > > Run host networking on a VF instead? > > There is a plan for this, but not immediately. In upstream, I suspect > vfio-pci is all we need, and other drivers can be added when it really > necessary. Sounds good to me.