Add an API to enable the PCI subsystem to track all devices that are
preserved across a Live Update, including both incoming devices (passed
from the previous kernel) and outgoing devices (passed to the next
kernel).
Use PCI segment number and BDF to keep track of devices across Live
Update. This means the kernel must keep both identifiers constant across
a Live Update for any preserved device. VFs are not supported for now,
since that requires preserving SR-IOV state on the device to ensure the
same number of VFs appear after kexec and with the same BDFs.
Drivers that preserve devices across Live Update can now register their
struct liveupdate_file_handler with the PCI subsystem so that the PCI
subsystem can allocate and manage File-Lifecycle-Bound (FLB) global data
to track the list of incoming and outgoing preserved devices.
pci_liveupdate_register_fh(driver_fh)
pci_liveupdate_unregister_fh(driver_fh)
Drivers can notify the PCI subsystem whenever a device is preserved and
unpreserved with the following APIs:
pci_liveupdate_outgoing_preserve(pci_dev)
pci_liveupdate_outgoing_unpreserve(pci_dev)
After a Live Update, the PCI subsystem can fetch its FLB global data
from the previous kernel from the Live Update Orchestrator (LUO) to
determine which devices are preserved. This API is also made available
for drivers to use to check if a device was preserved before userspace
retrieves the file for it.
pci_liveupdate_incoming_is_preserved(pci_dev)
Once a driver has finished restoring an incoming preserved device, it
can notify the PCI subsystem with the following call:
pci_liveupdate_incoming_finish(pci_dev)
This will be used in subsequent commits by the vfio-pci driver to
preserve VFIO devices across Live Update.
Signed-off-by: David Matlack <dmatlack@google.com>
---
drivers/pci/Makefile | 1 +
drivers/pci/liveupdate.c | 248 ++++++++++++++++++++++++++++++++++++
include/linux/kho/abi/pci.h | 53 ++++++++
include/linux/pci.h | 38 ++++++
4 files changed, 340 insertions(+)
create mode 100644 drivers/pci/liveupdate.c
create mode 100644 include/linux/kho/abi/pci.h
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 67647f1880fb..0cb43e10e71d 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_SYSFS) += pci-sysfs.o slot.o
obj-$(CONFIG_ACPI) += pci-acpi.o
obj-$(CONFIG_GENERIC_PCI_IOMAP) += iomap.o
+obj-$(CONFIG_LIVEUPDATE) += liveupdate.o
endif
obj-$(CONFIG_OF) += of.o
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
new file mode 100644
index 000000000000..f9bb97f3bada
--- /dev/null
+++ b/drivers/pci/liveupdate.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * David Matlack <dmatlack@google.com>
+ */
+
+#include <linux/bsearch.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/pci.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/sort.h>
+
+static DEFINE_MUTEX(pci_flb_outgoing_lock);
+static DEFINE_MUTEX(pci_flb_incoming_lock);
+
+static int pci_flb_preserve(struct liveupdate_flb_op_args *args)
+{
+ struct pci_dev *dev = NULL;
+ struct folio *folio;
+ unsigned int order;
+ int nr_devices = 0;
+ int ret;
+
+ /*
+ * Calculate the maximum number of devices based on what's present
+ * on the system currently (including VFs) to size the folio holding
+ * struct pci_ser. This is not perfect given devices could be
+ * hotplugged, but it's also unlikely that all devices in the system are
+ * going to be preserved anyway.
+ */
+ for_each_pci_dev(dev) {
+ if (dev->is_virtfn)
+ continue;
+
+ nr_devices += 1 + pci_sriov_get_totalvfs(dev);
+ }
+
+ order = get_order(offsetof(struct pci_ser, devices[nr_devices + 1]));
+
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+ if (!folio)
+ return -ENOMEM;
+
+ ret = kho_preserve_folio(folio);
+ if (ret) {
+ folio_put(folio);
+ return ret;
+ }
+
+ args->obj = folio_address(folio);
+ args->data = virt_to_phys(args->obj);
+
+ return 0;
+}
+
+static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
+{
+ struct pci_ser *ser = args->obj;
+ struct folio *folio = virt_to_folio(ser);
+
+ WARN_ON_ONCE(ser->nr_devices);
+ kho_unpreserve_folio(folio);
+ folio_put(folio);
+}
+
+static int pci_flb_retrieve(struct liveupdate_flb_op_args *args)
+{
+ struct folio *folio;
+
+ folio = kho_restore_folio(args->data);
+ if (!folio)
+ panic("Unable to restore preserved FLB data from KHO (0x%llx)\n", args->data);
+
+ args->obj = folio_address(folio);
+ return 0;
+}
+
+static void pci_flb_finish(struct liveupdate_flb_op_args *args)
+{
+ struct pci_ser *ser = args->obj;
+
+ /*
+ * Sanity check that all devices have been finished via
+ * pci_liveupdate_incoming_finish().
+ */
+ WARN_ON_ONCE(ser->nr_devices);
+ folio_put(virt_to_folio(ser));
+}
+
+static struct liveupdate_flb_ops pci_liveupdate_flb_ops = {
+ .preserve = pci_flb_preserve,
+ .unpreserve = pci_flb_unpreserve,
+ .retrieve = pci_flb_retrieve,
+ .finish = pci_flb_finish,
+ .owner = THIS_MODULE,
+};
+
+static struct liveupdate_flb pci_liveupdate_flb = {
+ .ops = &pci_liveupdate_flb_ops,
+ .compatible = PCI_LUO_FLB_COMPATIBLE,
+};
+
+#define INIT_PCI_DEV_SER(_dev) { \
+ .domain = pci_domain_nr((_dev)->bus), \
+ .bdf = pci_dev_id(_dev), \
+}
+
+static int pci_dev_ser_cmp(const void *__a, const void *__b)
+{
+ const struct pci_dev_ser *a = __a, *b = __b;
+
+ return cmp_int(a->domain << 16 | a->bdf, b->domain << 16 | b->bdf);
+}
+
+static struct pci_dev_ser *pci_ser_find(struct pci_ser *ser, struct pci_dev *dev)
+{
+ const struct pci_dev_ser key = INIT_PCI_DEV_SER(dev);
+
+ return bsearch(&key, ser->devices, ser->nr_devices,
+ sizeof(key), pci_dev_ser_cmp);
+}
+
+static int pci_ser_delete(struct pci_ser *ser, struct pci_dev *dev)
+{
+ struct pci_dev_ser *dev_ser;
+ int i;
+
+ dev_ser = pci_ser_find(ser, dev);
+ if (!dev_ser)
+ return -ENOENT;
+
+ for (i = dev_ser - ser->devices; i < ser->nr_devices - 1; i++)
+ ser->devices[i] = ser->devices[i + 1];
+
+ ser->nr_devices--;
+ return 0;
+}
+
+static int max_nr_devices(struct pci_ser *ser)
+{
+ u64 size;
+
+ size = folio_size(virt_to_folio(ser));
+ size -= offsetof(struct pci_ser, devices);
+
+ return size / sizeof(struct pci_dev_ser);
+}
+
+int pci_liveupdate_outgoing_preserve(struct pci_dev *dev)
+{
+ struct pci_dev_ser new = INIT_PCI_DEV_SER(dev);
+ struct pci_ser *ser;
+ int i, ret;
+
+ /* VFs are not supported yet due to BDF instability across kexec */
+ if (dev->is_virtfn)
+ return -EINVAL;
+
+ guard(mutex)(&pci_flb_outgoing_lock);
+
+ ret = liveupdate_flb_get_outgoing(&pci_liveupdate_flb, (void **)&ser);
+ if (ret)
+ return ret;
+
+ if (ser->nr_devices == max_nr_devices(ser))
+ return -E2BIG;
+
+ for (i = ser->nr_devices; i > 0; i--) {
+ struct pci_dev_ser *prev = &ser->devices[i - 1];
+ int cmp = pci_dev_ser_cmp(&new, prev);
+
+ /* This device is already preserved. */
+ if (cmp == 0)
+ return 0;
+
+ if (cmp > 0)
+ break;
+
+ ser->devices[i] = *prev;
+ }
+
+ ser->devices[i] = new;
+ ser->nr_devices++;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_outgoing_preserve);
+
+void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev)
+{
+ struct pci_ser *ser;
+ int ret;
+
+ guard(mutex)(&pci_flb_outgoing_lock);
+
+ ret = liveupdate_flb_get_outgoing(&pci_liveupdate_flb, (void **)&ser);
+ if (WARN_ON_ONCE(ret))
+ return;
+
+ WARN_ON_ONCE(pci_ser_delete(ser, dev));
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_outgoing_unpreserve);
+
+bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev)
+{
+ struct pci_ser *ser;
+ int ret;
+
+ guard(mutex)(&pci_flb_incoming_lock);
+
+ ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&ser);
+ if (ret)
+ return false;
+
+ return pci_ser_find(ser, dev);
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_incoming_is_preserved);
+
+void pci_liveupdate_incoming_finish(struct pci_dev *dev)
+{
+ struct pci_ser *ser;
+ int ret;
+
+ guard(mutex)(&pci_flb_incoming_lock);
+
+ ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&ser);
+ if (WARN_ON_ONCE(ret))
+ return;
+
+ WARN_ON_ONCE(pci_ser_delete(ser, dev));
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_incoming_finish);
+
+int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh)
+{
+ return liveupdate_register_flb(fh, &pci_liveupdate_flb);
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_register_fh);
+
+int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh)
+{
+ return liveupdate_unregister_flb(fh, &pci_liveupdate_flb);
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_unregister_fh);
diff --git a/include/linux/kho/abi/pci.h b/include/linux/kho/abi/pci.h
new file mode 100644
index 000000000000..53744b6f191a
--- /dev/null
+++ b/include/linux/kho/abi/pci.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * David Matlack <dmatlack@google.com>
+ */
+
+#ifndef _LINUX_KHO_ABI_PCI_H
+#define _LINUX_KHO_ABI_PCI_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: PCI File-Lifecycle Bound (FLB) Live Update ABI
+ *
+ * This header defines the ABI for preserving core PCI state across kexec using
+ * Live Update File-Lifecycle Bound (FLB) data.
+ *
+ * This interface is a contract. Any modification to any of the serialization
+ * structs defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the PCI_LUO_FLB_COMPATIBLE string.
+ */
+
+#define PCI_LUO_FLB_COMPATIBLE "pci-v1"
+
+/**
+ * struct pci_dev_ser - Serialized state about a single PCI device.
+ *
+ * @domain: The device's PCI domain number (segment).
+ * @bdf: The device's PCI bus, device, and function number.
+ */
+struct pci_dev_ser {
+ u16 domain;
+ u16 bdf;
+} __packed;
+
+/**
+ * struct pci_ser - PCI Subsystem Live Update State
+ *
+ * This struct tracks state about all devices that are being preserved across
+ * a Live Update for the next kernel.
+ *
+ * @nr_devices: The number of devices that were preserved.
+ * @devices: Flexible array of pci_dev_ser structs for each device. Guaranteed
+ * to be sorted ascending by domain and bdf.
+ */
+struct pci_ser {
+ u64 nr_devices;
+ struct pci_dev_ser devices[];
+} __packed;
+
+#endif /* _LINUX_KHO_ABI_PCI_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..6a3c2d7e5b82 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -40,6 +40,7 @@
#include <linux/resource_ext.h>
#include <linux/msi_api.h>
#include <uapi/linux/pci.h>
+#include <linux/liveupdate.h>
#include <linux/pci_ids.h>
@@ -2795,4 +2796,41 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type);
WARN_ONCE(condition, "%s %s: " fmt, \
dev_driver_string(&(pdev)->dev), pci_name(pdev), ##arg)
+#ifdef CONFIG_LIVEUPDATE
+int pci_liveupdate_outgoing_preserve(struct pci_dev *dev);
+void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev);
+bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev);
+void pci_liveupdate_incoming_finish(struct pci_dev *dev);
+int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh);
+int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh);
+#else /* !CONFIG_LIVEUPDATE */
+static inline int pci_liveupdate_outgoing_preserve(struct pci_dev *dev)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev)
+{
+}
+
+static inline bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev)
+{
+ return false;
+}
+
+static inline void pci_liveupdate_incoming_finish(struct pci_dev *dev)
+{
+}
+
+static inline int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* !CONFIG_LIVEUPDATE */
+
#endif /* LINUX_PCI_H */
--
2.52.0.487.g5c8c507ade-goog
> +static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
> +{
> + struct pci_ser *ser = args->obj;
> + struct folio *folio = virt_to_folio(ser);
> +
> + WARN_ON_ONCE(ser->nr_devices);
> + kho_unpreserve_folio(folio);
> + folio_put(folio);
Here, and in other places in this series, I would use:
https://lore.kernel.org/all/20251114190002.3311679-4-pasha.tatashin@soleen.com
kho_alloc_preserve(size_t size)
kho_unpreserve_free(void *mem)
kho_restore_free(void *mem)
Pasha
On Sat, Nov 29, 2025 at 12:15 PM Pasha Tatashin
<pasha.tatashin@soleen.com> wrote:
>
> > +static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
> > +{
> > + struct pci_ser *ser = args->obj;
> > + struct folio *folio = virt_to_folio(ser);
> > +
> > + WARN_ON_ONCE(ser->nr_devices);
> > + kho_unpreserve_folio(folio);
> > + folio_put(folio);
>
> Here, and in other places in this series, I would use:
> https://lore.kernel.org/all/20251114190002.3311679-4-pasha.tatashin@soleen.com
>
> kho_alloc_preserve(size_t size)
> kho_unpreserve_free(void *mem)
> kho_restore_free(void *mem)
Will do, thanks for the suggestion.
On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote: > Add an API to enable the PCI subsystem to track all devices that are > preserved across a Live Update, including both incoming devices (passed > from the previous kernel) and outgoing devices (passed to the next > kernel). > > Use PCI segment number and BDF to keep track of devices across Live > Update. This means the kernel must keep both identifiers constant across > a Live Update for any preserved device. While bus numbers will *usually* stay the same across next and previous kernel, there are exceptions. E.g. if "pci=assign-busses" is specified on the command line, the kernel will re-assign bus numbers on every boot. The most portable way to identify PCI devices across kernels is to store their path from the root down the hierarchy. Because the bus number might change but the device/function number on each bus stays the same. This is what EFI does with device paths: https://uefi.org/specs/UEFI/2.10/10_Protocols_Device_Path_Protocol.html Example: Acpi(PNP0A03,0)/Pci(1E|0)/Pci(0|0) Source: https://raw.githubusercontent.com/tianocore-docs/edk2-UefiDriverWritersGuide/main/3_foundation/39_uefi_device_paths/README.9.md We've got a device path *parser* in drivers/firmware/efi/dev-path-parser.c, but we don't have a *generator* for device paths in the kernel yet. Thanks, Lukas
On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote: > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote: > > Add an API to enable the PCI subsystem to track all devices that are > > preserved across a Live Update, including both incoming devices (passed > > from the previous kernel) and outgoing devices (passed to the next > > kernel). > > > > Use PCI segment number and BDF to keep track of devices across Live > > Update. This means the kernel must keep both identifiers constant across > > a Live Update for any preserved device. > > While bus numbers will *usually* stay the same across next and previous > kernel, there are exceptions. E.g. if "pci=assign-busses" is specified > on the command line, the kernel will re-assign bus numbers on every boot. Stuff like this has to be disabled for this live update stuff, if the bus numbers are changed it will break the active use of the iommu across the kexec. So while what you say is all technically true, I'm not sure this is necessary. Jason
On Sat, Nov 29, 2025 at 7:51 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote: > > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote: > > > Add an API to enable the PCI subsystem to track all devices that are > > > preserved across a Live Update, including both incoming devices (passed > > > from the previous kernel) and outgoing devices (passed to the next > > > kernel). > > > > > > Use PCI segment number and BDF to keep track of devices across Live > > > Update. This means the kernel must keep both identifiers constant across > > > a Live Update for any preserved device. > > > > While bus numbers will *usually* stay the same across next and previous > > kernel, there are exceptions. E.g. if "pci=assign-busses" is specified > > on the command line, the kernel will re-assign bus numbers on every boot. > > Stuff like this has to be disabled for this live update stuff, if the > bus numbers are changed it will break the active use of the iommu > across the kexec. > > So while what you say is all technically true, I'm not sure this is > necessary. I agree. However, Lukas's comment made me wonder about the future: if we eventually need to preserve non-PCI devices (like a TPM), should we be designing a common identification mechanism for all buses now? Or should we settle on BDF for PCI and invent stable identifiers for other bus types as they become necessary? Pasha > > Jason
On Sat, Nov 29, 2025 at 08:20:34PM -0500, Pasha Tatashin wrote: > On Sat, Nov 29, 2025 at 7:51 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote: > > > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote: > > > > Add an API to enable the PCI subsystem to track all devices that are > > > > preserved across a Live Update, including both incoming devices (passed > > > > from the previous kernel) and outgoing devices (passed to the next > > > > kernel). > > > > > > > > Use PCI segment number and BDF to keep track of devices across Live > > > > Update. This means the kernel must keep both identifiers constant across > > > > a Live Update for any preserved device. > > > > > > While bus numbers will *usually* stay the same across next and previous > > > kernel, there are exceptions. E.g. if "pci=assign-busses" is specified > > > on the command line, the kernel will re-assign bus numbers on every boot. > > > > Stuff like this has to be disabled for this live update stuff, if the > > bus numbers are changed it will break the active use of the iommu > > across the kexec. > > > > So while what you say is all technically true, I'm not sure this is > > necessary. > > I agree. However, Lukas's comment made me wonder about the future: if > we eventually need to preserve non-PCI devices (like a TPM), should we > be designing a common identification mechanism for all buses now? Or > should we settle on BDF for PCI and invent stable identifiers for > other bus types as they become necessary? Well, at least PCI subsystem should use BDF.. You are probably right that the matching of preserved data to a struct device should be more general though. Jason
> > > So while what you say is all technically true, I'm not sure this is > > > necessary. > > > > I agree. However, Lukas's comment made me wonder about the future: if > > we eventually need to preserve non-PCI devices (like a TPM), should we > > be designing a common identification mechanism for all buses now? Or > > should we settle on BDF for PCI and invent stable identifiers for > > other bus types as they become necessary? > > Well, at least PCI subsystem should use BDF.. BDF must be stable for PCI live update to work correctly. > You are probably right that the matching of preserved data to a struct > device should be more general though. Right, we need a mechanism to ensure early in boot that any preserved device does not auto-bind to a driver later in boot. Using the UEFI Device Path format seems like a good way not to re-invent something that already exists. For example, while a preserved PCI device looks like this: Acpi(PNP0A03,0)/Pci(1E|0)/Pci(0|0) (Luka's example) We can seamlessly support other device types later using their native paths without changing the identification schema: TPM: Acpi(PNP0C31,0) IPMI/BMC: Acpi(PNP0A03,0)/Pci(1F|0)/BMC(1,0xCA2) NVMe (PCI-attached): Acpi(PNP0A03,0)/Pci(1C|0)/Pci(0|0)/NVMe(1,00-00-...) etc... Pasha
On 2025-12-01 09:29 AM, Jason Gunthorpe wrote:
> On Sat, Nov 29, 2025 at 08:20:34PM -0500, Pasha Tatashin wrote:
> > On Sat, Nov 29, 2025 at 7:51 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote:
> > > > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote:
> > > > > Add an API to enable the PCI subsystem to track all devices that are
> > > > > preserved across a Live Update, including both incoming devices (passed
> > > > > from the previous kernel) and outgoing devices (passed to the next
> > > > > kernel).
> > > > >
> > > > > Use PCI segment number and BDF to keep track of devices across Live
> > > > > Update. This means the kernel must keep both identifiers constant across
> > > > > a Live Update for any preserved device.
> > > >
> > > > While bus numbers will *usually* stay the same across next and previous
> > > > kernel, there are exceptions. E.g. if "pci=assign-busses" is specified
> > > > on the command line, the kernel will re-assign bus numbers on every boot.
> > >
> > > Stuff like this has to be disabled for this live update stuff, if the
> > > bus numbers are changed it will break the active use of the iommu
> > > across the kexec.
> > >
> > > So while what you say is all technically true, I'm not sure this is
> > > necessary.
> >
> > I agree. However, Lukas's comment made me wonder about the future: if
> > we eventually need to preserve non-PCI devices (like a TPM), should we
> > be designing a common identification mechanism for all buses now? Or
> > should we settle on BDF for PCI and invent stable identifiers for
> > other bus types as they become necessary?
>
> Well, at least PCI subsystem should use BDF..
>
> You are probably right that the matching of preserved data to a struct
> device should be more general though.
Lukas' suggestion would also make it more reliable to detect bus numbers
changing during a Live Update. We can play whack-a-mole with things like
assign-busses, but there will be a risk that we miss something or
something changes in the future.
Perhaps it would make sense to rely on BDF in the PCI subsystem in the
short term and enforce bus number stability manually (e.g. see patch at
the bottom), and then explore stable device paths as a future
improvement to make PCI device preservation more reliable and also to
enable other bus types?
To handle pci=assign-busses, perhaps something like this? Are there any
other places where the kernel could change busses?
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0ce98e18b5a8..2e1e1aa385a8 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1331,6 +1331,20 @@ static bool pci_ea_fixed_busnrs(struct pci_dev *dev, u8 *sec, u8 *sub)
return true;
}
+static bool pci_assign_all_busses(void)
+{
+ /*
+ * During a Live Update, do not assign new bus numbers. Use bus numbers
+ * assigned by the firmware and the previous kernel. Bus numbers must
+ * remain constant so that devices preserved across the Live Update can
+ * use the IOMMU uninterrupted.
+ */
+ if (liveupdate_count())
+ return false;
+
+ return pcibios_assign_all_busses();
+}
+
/*
* pci_scan_bridge_extend() - Scan buses behind a bridge
* @bus: Parent bus the bridge is on
@@ -1404,7 +1418,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
- if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
+ if ((secondary || subordinate) && !pci_assign_all_busses() &&
!is_cardbus && !broken) {
unsigned int cmax, buses;
@@ -1441,13 +1455,16 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
if (subordinate > max)
max = subordinate;
} else {
+ pci_WARN_ONCE(dev, liveupdate_count(),
+ "Assigning new bus numbers during a Live Update! [%u %u %u %u]\n",
+ secondary, subordinate, is_cardbus, broken);
/*
* We need to assign a number to this bus which we always
* do in the second pass.
*/
if (!pass) {
- if (pcibios_assign_all_busses() || broken || is_cardbus)
+ if (pci_assign_all_busses() || broken || is_cardbus)
/*
* Temporarily disable forwarding of the
@@ -1522,7 +1539,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
max+i+1))
break;
while (parent->parent) {
- if ((!pcibios_assign_all_busses()) &&
+ if ((!pci_assign_all_busses()) &&
(parent->busn_res.end > max) &&
(parent->busn_res.end <= max+i)) {
j = 1;
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index b913d63eab5f..87a4982d0eb1 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -219,6 +219,7 @@ struct liveupdate_flb {
/* Return true if live update orchestrator is enabled */
bool liveupdate_enabled(void);
+int liveupdate_count(void);
/* Called during kexec to tell LUO that entered into reboot */
int liveupdate_reboot(void);
@@ -241,6 +242,11 @@ static inline bool liveupdate_enabled(void)
return false;
}
+static inline int liveupdate_count(void)
+{
+ return 0;
+}
+
static inline int liveupdate_reboot(void)
{
return 0;
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 69298d82f404..2f273397bd41 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -256,6 +256,13 @@ bool liveupdate_enabled(void)
{
return luo_global.enabled;
}
+EXPORT_SYMBOL_GPL(liveupdate_enabled);
+
+int liveupdate_count(void)
+{
+ return luo_global.liveupdate_num;
+}
+EXPORT_SYMBOL_GPL(liveupdate_count);
/**
* DOC: LUO ioctl Interface
On Sat, Nov 29, 2025 at 5:34 AM Lukas Wunner <lukas@wunner.de> wrote: > > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote: > > Add an API to enable the PCI subsystem to track all devices that are > > preserved across a Live Update, including both incoming devices (passed > > from the previous kernel) and outgoing devices (passed to the next > > kernel). > > > > Use PCI segment number and BDF to keep track of devices across Live > > Update. This means the kernel must keep both identifiers constant across > > a Live Update for any preserved device. > > While bus numbers will *usually* stay the same across next and previous > kernel, there are exceptions. E.g. if "pci=assign-busses" is specified > on the command line, the kernel will re-assign bus numbers on every boot. > > The most portable way to identify PCI devices across kernels is to > store their path from the root down the hierarchy. Because the bus > number might change but the device/function number on each bus stays > the same. > > This is what EFI does with device paths: > https://uefi.org/specs/UEFI/2.10/10_Protocols_Device_Path_Protocol.html > > Example: > Acpi(PNP0A03,0)/Pci(1E|0)/Pci(0|0) > > Source: > https://raw.githubusercontent.com/tianocore-docs/edk2-UefiDriverWritersGuide/main/3_foundation/39_uefi_device_paths/README.9.md > > We've got a device path *parser* in drivers/firmware/efi/dev-path-parser.c, > but we don't have a *generator* for device paths in the kernel yet. Hi Lukas, Thanks for the input. You are right that bus numbers can change in standard boot scenarios. However, for Live Update, we skip firmware, and we would likely list pci=assign-busses as an unsupported parameter. So, BDF should be sufficient. That said, if there is a better method using a stable hierarchical path, and more importantly, if that method can be extended to other bus types, we are open to considering it. The main hurdle is that we would need a way to generate this stable path in the kernel and also parse it during early boot. Thanks, Pasha
© 2016 - 2025 Red Hat, Inc.