[PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update

David Matlack posted 21 patches 2 months, 1 week ago
[PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by David Matlack 2 months, 1 week ago
Add an API to enable the PCI subsystem to track all devices that are
preserved across a Live Update, including both incoming devices (passed
from the previous kernel) and outgoing devices (passed to the next
kernel).

Use PCI segment number and BDF to keep track of devices across Live
Update. This means the kernel must keep both identifiers constant across
a Live Update for any preserved device. VFs are not supported for now,
since that requires preserving SR-IOV state on the device to ensure the
same number of VFs appear after kexec and with the same BDFs.

Drivers that preserve devices across Live Update can now register their
struct liveupdate_file_handler with the PCI subsystem so that the PCI
subsystem can allocate and manage File-Lifecycle-Bound (FLB) global data
to track the list of incoming and outgoing preserved devices.

  pci_liveupdate_register_fh(driver_fh)
  pci_liveupdate_unregister_fh(driver_fh)

Drivers can notify the PCI subsystem whenever a device is preserved and
unpreserved with the following APIs:

  pci_liveupdate_outgoing_preserve(pci_dev)
  pci_liveupdate_outgoing_unpreserve(pci_dev)

After a Live Update, the PCI subsystem can fetch its FLB global data
from the previous kernel from the Live Update Orchestrator (LUO) to
determine which devices are preserved. This API is also made available
for drivers to use to check if a device was preserved before userspace
retrieves the file for it.

  pci_liveupdate_incoming_is_preserved(pci_dev)

Once a driver has finished restoring an incoming preserved device, it
can notify the PCI subsystem with the following call:

  pci_liveupdate_incoming_finish(pci_dev)

This will be used in subsequent commits by the vfio-pci driver to
preserve VFIO devices across Live Update.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 drivers/pci/Makefile        |   1 +
 drivers/pci/liveupdate.c    | 248 ++++++++++++++++++++++++++++++++++++
 include/linux/kho/abi/pci.h |  53 ++++++++
 include/linux/pci.h         |  38 ++++++
 4 files changed, 340 insertions(+)
 create mode 100644 drivers/pci/liveupdate.c
 create mode 100644 include/linux/kho/abi/pci.h

diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 67647f1880fb..0cb43e10e71d 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_PROC_FS)		+= proc.o
 obj-$(CONFIG_SYSFS)		+= pci-sysfs.o slot.o
 obj-$(CONFIG_ACPI)		+= pci-acpi.o
 obj-$(CONFIG_GENERIC_PCI_IOMAP) += iomap.o
+obj-$(CONFIG_LIVEUPDATE)	+= liveupdate.o
 endif
 
 obj-$(CONFIG_OF)		+= of.o
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
new file mode 100644
index 000000000000..f9bb97f3bada
--- /dev/null
+++ b/drivers/pci/liveupdate.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * David Matlack <dmatlack@google.com>
+ */
+
+#include <linux/bsearch.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/pci.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/sort.h>
+
+static DEFINE_MUTEX(pci_flb_outgoing_lock);
+static DEFINE_MUTEX(pci_flb_incoming_lock);
+
+/*
+ * FLB ->preserve() callback: allocate the zeroed folio that backs struct
+ * pci_ser, pass it to kho_preserve_folio(), and publish it through @args.
+ *
+ * On success args->obj holds the folio's kernel virtual address and
+ * args->data the matching physical address. Returns 0, -ENOMEM if the folio
+ * cannot be allocated, or the error reported by kho_preserve_folio().
+ */
+static int pci_flb_preserve(struct liveupdate_flb_op_args *args)
+{
+	struct pci_dev *pdev = NULL;
+	struct folio *folio;
+	int max_devs = 0;
+	int err;
+
+	/*
+	 * Size the array from what is present right now: every non-VF device
+	 * counts once plus its total number of VFs. Hotplug can make this
+	 * estimate stale, but it is also unlikely that every device in the
+	 * system is going to be preserved anyway.
+	 */
+	for_each_pci_dev(pdev) {
+		if (!pdev->is_virtfn)
+			max_devs += 1 + pci_sriov_get_totalvfs(pdev);
+	}
+
+	/* Header plus max_devs + 1 array entries. */
+	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO,
+			    get_order(offsetof(struct pci_ser, devices[max_devs + 1])));
+	if (!folio)
+		return -ENOMEM;
+
+	err = kho_preserve_folio(folio);
+	if (err) {
+		folio_put(folio);
+		return err;
+	}
+
+	args->obj = folio_address(folio);
+	args->data = virt_to_phys(args->obj);
+
+	return 0;
+}
+
+/*
+ * FLB ->unpreserve() callback: undo pci_flb_preserve() by withdrawing the
+ * folio from KHO and dropping the reference taken at allocation time.
+ */
+static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
+{
+	struct pci_ser *ser = args->obj;
+	struct folio *folio;
+
+	/* The device list is expected to be empty by the time we get here. */
+	WARN_ON_ONCE(ser->nr_devices);
+
+	folio = virt_to_folio(ser);
+	kho_unpreserve_folio(folio);
+	folio_put(folio);
+}
+
+/*
+ * FLB ->retrieve() callback: restore the folio whose physical address was
+ * passed in args->data and expose its virtual address through args->obj.
+ *
+ * Always returns 0; failure to restore the folio panics the kernel.
+ */
+static int pci_flb_retrieve(struct liveupdate_flb_op_args *args)
+{
+	struct folio *restored = kho_restore_folio(args->data);
+
+	if (!restored)
+		panic("Unable to restore preserved FLB data from KHO (0x%llx)\n", args->data);
+
+	args->obj = folio_address(restored);
+	return 0;
+}
+
+/*
+ * FLB ->finish() callback: release the folio holding the incoming struct
+ * pci_ser.
+ */
+static void pci_flb_finish(struct liveupdate_flb_op_args *args)
+{
+	struct pci_ser *ser = args->obj;
+	struct folio *folio = virt_to_folio(ser);
+
+	/*
+	 * Each pci_liveupdate_incoming_finish() call removes one entry, so a
+	 * non-empty list here means some device was never finished.
+	 */
+	WARN_ON_ONCE(ser->nr_devices);
+	folio_put(folio);
+}
+
+/* Lifecycle callbacks for the PCI subsystem's FLB data. */
+static struct liveupdate_flb_ops pci_liveupdate_flb_ops = {
+	.owner		= THIS_MODULE,
+	.preserve	= pci_flb_preserve,
+	.unpreserve	= pci_flb_unpreserve,
+	.retrieve	= pci_flb_retrieve,
+	.finish		= pci_flb_finish,
+};
+
+/* The single FLB object shared by every file handler registered below. */
+static struct liveupdate_flb pci_liveupdate_flb = {
+	.compatible	= PCI_LUO_FLB_COMPATIBLE,
+	.ops		= &pci_liveupdate_flb_ops,
+};
+
+/* Initializer for a struct pci_dev_ser that identifies @_pdev. */
+#define INIT_PCI_DEV_SER(_pdev) {			\
+	.bdf = pci_dev_id(_pdev),			\
+	.domain = pci_domain_nr((_pdev)->bus),		\
+}
+
+/*
+ * Three-way comparison of two struct pci_dev_ser entries, ordering first by
+ * domain and then by bdf. Used both for sorted insertion and for bsearch().
+ *
+ * The key is built as a u32: a u16 domain is promoted to (signed) int, so
+ * shifting a domain >= 0x8000 left by 16 would land in the sign bit and make
+ * those domains sort *before* domain 0, breaking the "sorted ascending"
+ * guarantee documented for struct pci_ser.
+ *
+ * Returns a negative value, zero, or a positive value if @__a sorts before,
+ * equal to, or after @__b.
+ */
+static int pci_dev_ser_cmp(const void *__a, const void *__b)
+{
+	const struct pci_dev_ser *a = __a, *b = __b;
+	u32 key_a = (u32)a->domain << 16 | a->bdf;
+	u32 key_b = (u32)b->domain << 16 | b->bdf;
+
+	return cmp_int(key_a, key_b);
+}
+
+/*
+ * Binary-search @ser for the entry describing @dev.
+ *
+ * Returns a pointer into ser->devices, or NULL if @dev is not listed. Relies
+ * on the array being sorted according to pci_dev_ser_cmp().
+ */
+static struct pci_dev_ser *pci_ser_find(struct pci_ser *ser, struct pci_dev *dev)
+{
+	const struct pci_dev_ser needle = INIT_PCI_DEV_SER(dev);
+	struct pci_dev_ser *table = ser->devices;
+
+	return bsearch(&needle, table, ser->nr_devices,
+		       sizeof(*table), pci_dev_ser_cmp);
+}
+
+/*
+ * Remove the entry for @dev from @ser, closing the gap so that the array
+ * stays dense and sorted.
+ *
+ * Returns 0 on success or -ENOENT if @dev has no entry.
+ */
+static int pci_ser_delete(struct pci_ser *ser, struct pci_dev *dev)
+{
+	struct pci_dev_ser *victim = pci_ser_find(ser, dev);
+	u64 pos, last;
+
+	if (!victim)
+		return -ENOENT;
+
+	/* Slide every later entry down by one slot. */
+	last = ser->nr_devices - 1;
+	for (pos = victim - ser->devices; pos < last; pos++)
+		ser->devices[pos] = ser->devices[pos + 1];
+
+	ser->nr_devices = last;
+	return 0;
+}
+
+/*
+ * Number of struct pci_dev_ser entries that fit in the folio backing @ser,
+ * i.e. the folio size minus the struct pci_ser header, in units of entries.
+ */
+static int max_nr_devices(struct pci_ser *ser)
+{
+	u64 bytes = folio_size(virt_to_folio(ser));
+
+	bytes -= offsetof(struct pci_ser, devices);
+	return bytes / sizeof(struct pci_dev_ser);
+}
+
+/**
+ * pci_liveupdate_outgoing_preserve() - Record @dev as preserved for the next
+ *                                      kernel.
+ * @dev: The PCI device being preserved.
+ *
+ * Inserts @dev into the outgoing struct pci_ser, keeping the array sorted by
+ * pci_dev_ser_cmp(). Preserving a device that is already listed is a no-op.
+ *
+ * Return: 0 on success (including when @dev was already preserved), -EINVAL
+ * if @dev is a VF, -E2BIG if the array is full, or the error returned by
+ * liveupdate_flb_get_outgoing().
+ */
+int pci_liveupdate_outgoing_preserve(struct pci_dev *dev)
+{
+	struct pci_dev_ser new = INIT_PCI_DEV_SER(dev);
+	struct pci_ser *ser;
+	int i, ret;
+
+	/* VFs are not supported yet due to BDF instability across kexec */
+	if (dev->is_virtfn)
+		return -EINVAL;
+
+	guard(mutex)(&pci_flb_outgoing_lock);
+
+	ret = liveupdate_flb_get_outgoing(&pci_liveupdate_flb, (void **)&ser);
+	if (ret)
+		return ret;
+
+	/*
+	 * Look for an existing entry *before* touching the array. Detecting
+	 * the duplicate in the middle of the shifting loop below would leave
+	 * the already-moved entries behind: with [A, B, C, D] and a repeated
+	 * B, C would be duplicated and D pushed past nr_devices. Checking
+	 * first also lets a repeated preserve succeed on a full array.
+	 */
+	if (pci_ser_find(ser, dev))
+		return 0;
+
+	if (ser->nr_devices >= max_nr_devices(ser))
+		return -E2BIG;
+
+	/* Shift larger entries up by one slot to open the insertion point. */
+	for (i = ser->nr_devices; i > 0; i--) {
+		struct pci_dev_ser *prev = &ser->devices[i - 1];
+
+		if (pci_dev_ser_cmp(&new, prev) > 0)
+			break;
+
+		ser->devices[i] = *prev;
+	}
+
+	ser->devices[i] = new;
+	ser->nr_devices++;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_outgoing_preserve);
+
+/**
+ * pci_liveupdate_outgoing_unpreserve() - Drop @dev from the outgoing list.
+ * @dev: The PCI device that is no longer being preserved.
+ *
+ * Warns (once) if the outgoing FLB data cannot be obtained or if @dev was not
+ * listed in it.
+ */
+void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev)
+{
+	struct pci_ser *ser;
+	int ret;
+
+	guard(mutex)(&pci_flb_outgoing_lock);
+
+	ret = liveupdate_flb_get_outgoing(&pci_liveupdate_flb, (void **)&ser);
+	if (!WARN_ON_ONCE(ret))
+		WARN_ON_ONCE(pci_ser_delete(ser, dev));
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_outgoing_unpreserve);
+
+/**
+ * pci_liveupdate_incoming_is_preserved() - Was @dev preserved by the previous
+ *                                          kernel?
+ * @dev: The PCI device to look up.
+ *
+ * Return: true if @dev is listed in the incoming FLB data, false if it is not
+ * listed or if the incoming data cannot be obtained.
+ */
+bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev)
+{
+	struct pci_ser *incoming;
+
+	guard(mutex)(&pci_flb_incoming_lock);
+
+	if (liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&incoming))
+		return false;
+
+	return pci_ser_find(incoming, dev) != NULL;
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_incoming_is_preserved);
+
+/**
+ * pci_liveupdate_incoming_finish() - Drop @dev from the incoming list.
+ * @dev: The PCI device whose restoration has completed.
+ *
+ * Warns (once) if the incoming FLB data cannot be obtained or if @dev was not
+ * listed in it.
+ */
+void pci_liveupdate_incoming_finish(struct pci_dev *dev)
+{
+	struct pci_ser *ser;
+	int ret;
+
+	guard(mutex)(&pci_flb_incoming_lock);
+
+	ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&ser);
+	if (!WARN_ON_ONCE(ret))
+		WARN_ON_ONCE(pci_ser_delete(ser, dev));
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_incoming_finish);
+
+/**
+ * pci_liveupdate_register_fh() - Attach the PCI FLB to a file handler.
+ * @fh: The driver's Live Update file handler.
+ *
+ * Return: whatever liveupdate_register_flb() returns.
+ */
+int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh)
+{
+	struct liveupdate_flb *flb = &pci_liveupdate_flb;
+
+	return liveupdate_register_flb(fh, flb);
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_register_fh);
+
+/**
+ * pci_liveupdate_unregister_fh() - Detach the PCI FLB from a file handler.
+ * @fh: The driver's Live Update file handler.
+ *
+ * Return: whatever liveupdate_unregister_flb() returns.
+ */
+int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh)
+{
+	struct liveupdate_flb *flb = &pci_liveupdate_flb;
+
+	return liveupdate_unregister_flb(fh, flb);
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_unregister_fh);
diff --git a/include/linux/kho/abi/pci.h b/include/linux/kho/abi/pci.h
new file mode 100644
index 000000000000..53744b6f191a
--- /dev/null
+++ b/include/linux/kho/abi/pci.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * David Matlack <dmatlack@google.com>
+ */
+
+#ifndef _LINUX_KHO_ABI_PCI_H
+#define _LINUX_KHO_ABI_PCI_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: PCI File-Lifecycle Bound (FLB) Live Update ABI
+ *
+ * This header defines the ABI for preserving core PCI state across kexec using
+ * Live Update File-Lifecycle Bound (FLB) data.
+ *
+ * This interface is a contract. Any modification to any of the serialization
+ * structs defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the PCI_LUO_FLB_COMPATIBLE string.
+ */
+
+#define PCI_LUO_FLB_COMPATIBLE "pci-v1"
+
+/**
+ * struct pci_dev_ser - Serialized state about a single PCI device.
+ *
+ * Identifies one preserved device. Entries are ordered and looked up by the
+ * (domain, bdf) pair.
+ *
+ * @domain: The device's PCI domain number (segment), stored in 16 bits.
+ *          NOTE(review): the writer fills this from pci_domain_nr(); confirm
+ *          that no supported platform uses domain numbers above 0xffff.
+ * @bdf: The device's PCI bus, device, and function number, as returned by
+ *       pci_dev_id().
+ */
+struct pci_dev_ser {
+	u16 domain;
+	u16 bdf;
+} __packed;
+
+/**
+ * struct pci_ser - PCI Subsystem Live Update State
+ *
+ * This struct tracks state about all devices that are being preserved across
+ * a Live Update for the next kernel.
+ *
+ * @nr_devices: The number of devices that were preserved, i.e. the number of
+ *              valid entries at the start of @devices.
+ * @devices: Flexible array of pci_dev_ser structs for each device. Guaranteed
+ *           to be sorted ascending by domain and bdf. The reader locates
+ *           entries with a binary search, so the ordering is part of the ABI.
+ */
+struct pci_ser {
+	u64 nr_devices;
+	struct pci_dev_ser devices[];
+} __packed;
+
+#endif /* _LINUX_KHO_ABI_PCI_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..6a3c2d7e5b82 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -40,6 +40,7 @@
 #include <linux/resource_ext.h>
 #include <linux/msi_api.h>
 #include <uapi/linux/pci.h>
+#include <linux/liveupdate.h>
 
 #include <linux/pci_ids.h>
 
@@ -2795,4 +2796,41 @@ void pci_uevent_ers(struct pci_dev *pdev, enum  pci_ers_result err_type);
 	WARN_ONCE(condition, "%s %s: " fmt, \
 		  dev_driver_string(&(pdev)->dev), pci_name(pdev), ##arg)
 
+/*
+ * Live Update tracking of preserved PCI devices.
+ *
+ * The "outgoing" calls maintain the list of devices handed to the next
+ * kernel; the "incoming" calls query and consume the list received from the
+ * previous kernel. Drivers attach the PCI FLB data to their file handler with
+ * pci_liveupdate_register_fh().
+ */
+#ifdef CONFIG_LIVEUPDATE
+int pci_liveupdate_outgoing_preserve(struct pci_dev *dev);
+void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev);
+bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev);
+void pci_liveupdate_incoming_finish(struct pci_dev *dev);
+int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh);
+int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh);
+#else /* !CONFIG_LIVEUPDATE */
+/*
+ * Stubs: nothing can be preserved, no device is ever reported as preserved,
+ * and registration fails with -EOPNOTSUPP.
+ */
+static inline int pci_liveupdate_outgoing_preserve(struct pci_dev *dev)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev)
+{
+}
+
+static inline bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev)
+{
+	return false;
+}
+
+static inline void pci_liveupdate_incoming_finish(struct pci_dev *dev)
+{
+}
+
+static inline int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* !CONFIG_LIVEUPDATE */
+
 #endif /* LINUX_PCI_H */
-- 
2.52.0.487.g5c8c507ade-goog
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Pasha Tatashin 2 months, 1 week ago
> +static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
> +{
> +       struct pci_ser *ser = args->obj;
> +       struct folio *folio = virt_to_folio(ser);
> +
> +       WARN_ON_ONCE(ser->nr_devices);
> +       kho_unpreserve_folio(folio);
> +       folio_put(folio);

Here, and in other places in this series, I would use:
https://lore.kernel.org/all/20251114190002.3311679-4-pasha.tatashin@soleen.com

kho_alloc_preserve(size_t size)
kho_unpreserve_free(void *mem)
kho_restore_free(void *mem)

Pasha
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by David Matlack 2 months, 1 week ago
On Sat, Nov 29, 2025 at 12:15 PM Pasha Tatashin
<pasha.tatashin@soleen.com> wrote:
>
> > +static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
> > +{
> > +       struct pci_ser *ser = args->obj;
> > +       struct folio *folio = virt_to_folio(ser);
> > +
> > +       WARN_ON_ONCE(ser->nr_devices);
> > +       kho_unpreserve_folio(folio);
> > +       folio_put(folio);
>
> Here, and in other places in this series, I would use:
> https://lore.kernel.org/all/20251114190002.3311679-4-pasha.tatashin@soleen.com
>
> kho_alloc_preserve(size_t size)
> kho_unpreserve_free(void *mem)
> kho_restore_free(void *mem)

Will do, thanks for the suggestion.
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Lukas Wunner 2 months, 1 week ago
On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote:
> Add an API to enable the PCI subsystem to track all devices that are
> preserved across a Live Update, including both incoming devices (passed
> from the previous kernel) and outgoing devices (passed to the next
> kernel).
> 
> Use PCI segment number and BDF to keep track of devices across Live
> Update. This means the kernel must keep both identifiers constant across
> a Live Update for any preserved device.

While bus numbers will *usually* stay the same across next and previous
kernel, there are exceptions.  E.g. if "pci=assign-busses" is specified
on the command line, the kernel will re-assign bus numbers on every boot.

The most portable way to identify PCI devices across kernels is to
store their path from the root down the hierarchy.  Because the bus
number might change but the device/function number on each bus stays
the same.

This is what EFI does with device paths:
https://uefi.org/specs/UEFI/2.10/10_Protocols_Device_Path_Protocol.html

Example:
Acpi(PNP0A03,0)/Pci(1E|0)/Pci(0|0)

Source:
https://raw.githubusercontent.com/tianocore-docs/edk2-UefiDriverWritersGuide/main/3_foundation/39_uefi_device_paths/README.9.md

We've got a device path *parser* in drivers/firmware/efi/dev-path-parser.c,
but we don't have a *generator* for device paths in the kernel yet.

Thanks,

Lukas
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Jason Gunthorpe 2 months, 1 week ago
On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote:
> On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote:
> > Add an API to enable the PCI subsystem to track all devices that are
> > preserved across a Live Update, including both incoming devices (passed
> > from the previous kernel) and outgoing devices (passed to the next
> > kernel).
> > 
> > Use PCI segment number and BDF to keep track of devices across Live
> > Update. This means the kernel must keep both identifiers constant across
> > a Live Update for any preserved device.
> 
> While bus numbers will *usually* stay the same across next and previous
> kernel, there are exceptions.  E.g. if "pci=assign-busses" is specified
> on the command line, the kernel will re-assign bus numbers on every boot.

Stuff like this has to be disabled for this live update stuff, if the
bus numbers are changed it will break the active use of the iommu
across the kexec.

So while what you say is all technically true, I'm not sure this is
necessary.

Jason
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Pasha Tatashin 2 months, 1 week ago
On Sat, Nov 29, 2025 at 7:51 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote:
> > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote:
> > > Add an API to enable the PCI subsystem to track all devices that are
> > > preserved across a Live Update, including both incoming devices (passed
> > > from the previous kernel) and outgoing devices (passed to the next
> > > kernel).
> > >
> > > Use PCI segment number and BDF to keep track of devices across Live
> > > Update. This means the kernel must keep both identifiers constant across
> > > a Live Update for any preserved device.
> >
> > While bus numbers will *usually* stay the same across next and previous
> > kernel, there are exceptions.  E.g. if "pci=assign-busses" is specified
> > on the command line, the kernel will re-assign bus numbers on every boot.
>
> Stuff like this has to be disabled for this live update stuff, if the
> bus numbers are changed it will break the active use of the iommu
> across the kexec.
>
> So while what you say is all technically true, I'm not sure this is
> necessary.

I agree. However, Lukas's comment made me wonder about the future: if
we eventually need to preserve non-PCI devices (like a TPM), should we
be designing a common identification mechanism for all buses now? Or
should we settle on BDF for PCI and invent stable identifiers for
other bus types as they become necessary?

Pasha

>
> Jason
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Jason Gunthorpe 2 months, 1 week ago
On Sat, Nov 29, 2025 at 08:20:34PM -0500, Pasha Tatashin wrote:
> On Sat, Nov 29, 2025 at 7:51 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote:
> > > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote:
> > > > Add an API to enable the PCI subsystem to track all devices that are
> > > > preserved across a Live Update, including both incoming devices (passed
> > > > from the previous kernel) and outgoing devices (passed to the next
> > > > kernel).
> > > >
> > > > Use PCI segment number and BDF to keep track of devices across Live
> > > > Update. This means the kernel must keep both identifiers constant across
> > > > a Live Update for any preserved device.
> > >
> > > While bus numbers will *usually* stay the same across next and previous
> > > kernel, there are exceptions.  E.g. if "pci=assign-busses" is specified
> > > on the command line, the kernel will re-assign bus numbers on every boot.
> >
> > Stuff like this has to be disabled for this live update stuff, if the
> > bus numbers are changed it will break the active use of the iommu
> > across the kexec.
> >
> > So while what you say is all technically true, I'm not sure this is
> > necessary.
> 
> I agree. However, Lukas's comment made me wonder about the future: if
> we eventually need to preserve non-PCI devices (like a TPM), should we
> be designing a common identification mechanism for all buses now? Or
> should we settle on BDF for PCI and invent stable identifiers for
> other bus types as they become necessary?

Well, at least PCI subsystem should use BDF..

You are probably right that the matching of preserved data to a struct
device should be more general though.

Jason
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Pasha Tatashin 2 months, 1 week ago
> > > So while what you say is all technically true, I'm not sure this is
> > > necessary.
> >
> > I agree. However, Lukas's comment made me wonder about the future: if
> > we eventually need to preserve non-PCI devices (like a TPM), should we
> > be designing a common identification mechanism for all buses now? Or
> > should we settle on BDF for PCI and invent stable identifiers for
> > other bus types as they become necessary?
>
> Well, at least PCI subsystem should use BDF..

BDF must be stable for PCI live update to work correctly.

> You are probably right that the matching of preserved data to a struct
> device should be more general though.

Right, we need a mechanism to ensure early in boot that any preserved
device does not auto-bind to a driver later in boot.

Using the UEFI Device Path format seems like a good way not to
re-invent something that already exists.  For example, while a
preserved PCI device looks like this:
Acpi(PNP0A03,0)/Pci(1E|0)/Pci(0|0) (Lukas's example)

We can seamlessly support other device types later using their native
paths without changing the identification schema:

TPM: Acpi(PNP0C31,0)
IPMI/BMC: Acpi(PNP0A03,0)/Pci(1F|0)/BMC(1,0xCA2)
NVMe (PCI-attached): Acpi(PNP0A03,0)/Pci(1C|0)/Pci(0|0)/NVMe(1,00-00-...)
etc...

Pasha
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by David Matlack 2 months, 1 week ago
On 2025-12-01 09:29 AM, Jason Gunthorpe wrote:
> On Sat, Nov 29, 2025 at 08:20:34PM -0500, Pasha Tatashin wrote:
> > On Sat, Nov 29, 2025 at 7:51 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Sat, Nov 29, 2025 at 11:34:49AM +0100, Lukas Wunner wrote:
> > > > On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote:
> > > > > Add an API to enable the PCI subsystem to track all devices that are
> > > > > preserved across a Live Update, including both incoming devices (passed
> > > > > from the previous kernel) and outgoing devices (passed to the next
> > > > > kernel).
> > > > >
> > > > > Use PCI segment number and BDF to keep track of devices across Live
> > > > > Update. This means the kernel must keep both identifiers constant across
> > > > > a Live Update for any preserved device.
> > > >
> > > > While bus numbers will *usually* stay the same across next and previous
> > > > kernel, there are exceptions.  E.g. if "pci=assign-busses" is specified
> > > > on the command line, the kernel will re-assign bus numbers on every boot.
> > >
> > > Stuff like this has to be disabled for this live update stuff, if the
> > > bus numbers are changed it will break the active use of the iommu
> > > across the kexec.
> > >
> > > So while what you say is all technically true, I'm not sure this is
> > > necessary.
> > 
> > I agree. However, Lukas's comment made me wonder about the future: if
> > we eventually need to preserve non-PCI devices (like a TPM), should we
> > be designing a common identification mechanism for all buses now? Or
> > should we settle on BDF for PCI and invent stable identifiers for
> > other bus types as they become necessary?
> 
> Well, at least PCI subsystem should use BDF..
> 
> You are probably right that the matching of preserved data to a struct
> device should be more general though.

Lukas' suggestion would also make it more reliable to detect bus numbers
changing during a Live Update. We can play whack-a-mole with things like
assign-busses, but there will be a risk that we miss something or
something changes in the future.

Perhaps it would make sense to rely on BDF in the PCI subsystem in the
short term and enforce bus number stability manually (e.g. see patch at
the bottom), and then explore stable device paths as a future
improvement to make PCI device preservation more reliable and also to
enable other bus types?

To handle pci=assign-busses, perhaps something like this? Are there any
other places where the kernel could change busses?

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0ce98e18b5a8..2e1e1aa385a8 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1331,6 +1331,20 @@ static bool pci_ea_fixed_busnrs(struct pci_dev *dev, u8 *sec, u8 *sub)
 	return true;
 }
 
+/*
+ * Live Update aware wrapper around pcibios_assign_all_busses().
+ *
+ * During a Live Update, do not assign new bus numbers: keep the ones set up
+ * by the firmware and the previous kernel. Bus numbers must remain constant
+ * so that devices preserved across the Live Update can use the IOMMU
+ * uninterrupted. Hence reassignment is refused whenever liveupdate_count()
+ * is non-zero.
+ */
+static bool pci_assign_all_busses(void)
+{
+	return !liveupdate_count() && pcibios_assign_all_busses();
+}
+
 /*
  * pci_scan_bridge_extend() - Scan buses behind a bridge
  * @bus: Parent bus the bridge is on
@@ -1404,7 +1418,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 	pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
 			      bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
 
-	if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
+	if ((secondary || subordinate) && !pci_assign_all_busses() &&
 	    !is_cardbus && !broken) {
 		unsigned int cmax, buses;
 
@@ -1441,13 +1455,16 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 		if (subordinate > max)
 			max = subordinate;
 	} else {
+		pci_WARN_ONCE(dev, liveupdate_count(),
+			      "Assigning new bus numbers during a Live Update! [%u %u %u %u]\n",
+			      secondary, subordinate, is_cardbus, broken);
 
 		/*
 		 * We need to assign a number to this bus which we always
 		 * do in the second pass.
 		 */
 		if (!pass) {
-			if (pcibios_assign_all_busses() || broken || is_cardbus)
+			if (pci_assign_all_busses() || broken || is_cardbus)
 
 				/*
 				 * Temporarily disable forwarding of the
@@ -1522,7 +1539,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 							max+i+1))
 					break;
 				while (parent->parent) {
-					if ((!pcibios_assign_all_busses()) &&
+					if ((!pci_assign_all_busses()) &&
 					    (parent->busn_res.end > max) &&
 					    (parent->busn_res.end <= max+i)) {
 						j = 1;
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index b913d63eab5f..87a4982d0eb1 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -219,6 +219,7 @@ struct liveupdate_flb {
 
 /* Return true if live update orchestrator is enabled */
 bool liveupdate_enabled(void);
+int liveupdate_count(void);
 
 /* Called during kexec to tell LUO that entered into reboot */
 int liveupdate_reboot(void);
@@ -241,6 +242,11 @@ static inline bool liveupdate_enabled(void)
 	return false;
 }
 
+/* Stub variant (presumably the build without Live Update support): always 0. */
+static inline int liveupdate_count(void)
+{
+	return 0;
+}
+
 static inline int liveupdate_reboot(void)
 {
 	return 0;
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 69298d82f404..2f273397bd41 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -256,6 +256,13 @@ bool liveupdate_enabled(void)
 {
 	return luo_global.enabled;
 }
+EXPORT_SYMBOL_GPL(liveupdate_enabled);
+
+int liveupdate_count(void)
+{
+	return luo_global.liveupdate_num;
+}
+EXPORT_SYMBOL_GPL(liveupdate_count);
 
 /**
  * DOC: LUO ioctl Interface
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Lukas Wunner 2 months, 1 week ago
On Mon, Dec 01, 2025 at 06:54:11PM +0000, David Matlack wrote:
> To handle pci=assign-busses, perhaps something like this? Are there any
> other places where the kernel could change busses?

In theory the algorithm to assign bus numbers could change from
one kernel version to the next.  Ilpo (+cc) is currently reworking
the resource allocation algorithm.  That work primarily covers
MMIO window sizing, but bus numbers are resources as well and
could be affected by changes.  Resource allocation code is
already quite convoluted and sprinkling liveupdate special cases
all over it may not be received with enthusiasm. ;)

Of course in practice, changes to the algorithm do not happen often
and the kernel will preserve bus numbers as set by BIOS.  Only if it
detects incorrect bus assignments or if forced via the command line
will the kernel re-assign bus numbers.

But you do gain a bit of reliability if you don't assume bus numbers
to stay the same and instead use the "path from root" approach to
identify devices.

Thanks,

Lukas
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Jason Gunthorpe 2 months, 1 week ago
On Tue, Dec 02, 2025 at 07:20:23AM +0100, Lukas Wunner wrote:

> But you do gain a bit of reliability if you don't assume bus numbers
> to stay the same and instead use the "path from root" approach to
> identify devices.

Again, that's not reliability it is subtle bugs. The device is active
during KHO, you CAN NOT do any resource reassignment, not bus numbers,
not mmio. It must be fully disabled.

Jason
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Chris Li 2 months, 1 week ago
Hi Lukas,

Sorry I am late to the party.

On Tue, Dec 2, 2025 at 6:59 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Tue, Dec 02, 2025 at 07:20:23AM +0100, Lukas Wunner wrote:
>
> > But you do gain a bit of reliability if you don't assume bus numbers
> > to stay the same and instead use the "path from root" approach to
> > identify devices.
>
> Again, that's not reliability it is subtle bugs. The device is active
> during KHO, you CAN NOT do any resource reassignment, not bus numbers,
> not mmio. It must be fully disabled.

I agree with Jason. The bus number is used in the low level hardware
to do the DMA transfer. The bus number can not change for a device
during liveupdate with pending DMA transfer. The BDF MUST remain the
same as the liveupdate with DMA transfer requirement. Given the BDF
remains the same, using the path from root doesn't buy you more
protections. It just makes the patch more complicated but achieves the
same thing. That is why I chose the BDF approach for the PCI
liveupdate subsystem in the first place. To keep it simple.

Jason, please correct me if I am wrong. My understanding is that not
only the device that is actively doing the DMA requires the bus number
to stay the same, I think all the parent bridge, all the way to the
root PCI host bridge, bus number must remain the same. After all, the
DMA will need to route through the parent bridges.

Another point is that the same machine can have multiple PCI
host bridges. Each PCI host bridge bus number is acquired from the
ACPI table walk. I am not aware of any way to get the slot number of
the PCI host bridge. Lukas, do you know how to get the PCI host bridge
slot number to form a path?

Chris
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Lukas Wunner 2 months, 1 week ago
On Tue, Dec 02, 2025 at 08:36:53PM +0400, Chris Li wrote:
> On Tue, Dec 2, 2025 at 6:59 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > The device is active
> > during KHO, you CAN NOT do any resource reassignment, not bus numbers,
> > not mmio. It must be fully disabled.
> 
> I agree with Jason. The bus number is used in the low level hardware
> to do the DMA transfer. The bus number can not change for a device
> during liveupdate with pending DMA transfer. The BDF MUST remain the
> same as the liveupdate with DMA transfer requirement.

Thank you both for the explanation.

> Another point is that the same machine can have multiple PCI
> host bridges. Each PCI host bridge bus number is acquired from the
> ACPI table walk. I am not aware of any way to get the slot number of
> the PCI host bridge. Lukas, do you know how to get the PCI host bridge
> slot number to form a path?

Host bridges are identified by the segment number.  On ACPI-based systems,
it's retrieved by acpi_pci_root_add() through invocation of the _SEG method.

Thanks,

Lukas
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Jason Gunthorpe 2 months, 1 week ago
On Tue, Dec 02, 2025 at 08:36:53PM +0400, Chris Li wrote:

> Jason, please correct me if I am wrong. My understanding is that not
> only the device that is actively doing the DMA requires the bus number
> to stay the same, I think all the parent bridge, all the way to the
> root PCI host bridge, bus number must remain the same. After all, the
> DMA will need to route through the parent bridges.

The completions need to route back through the parent bridges, so yes
you cannot do anything to disturb RID based routing in the active
fabric either, which also means few changes to the subordinate bus
range of any bridge are possible.

Jason
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Chris Li 2 months, 1 week ago
On Tue, Dec 2, 2025 at 10:19 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Tue, Dec 02, 2025 at 08:36:53PM +0400, Chris Li wrote:
>
> > Jason, please correct me if I am wrong. My understanding is that not
> > only the device that is actively doing the DMA requires the bus number
> > to stay the same, I think all the parent bridge, all the way to the
> > root PCI host bridge, bus number must remain the same. After all, the
> > DMA will need to route through the parent bridges.
>
> The completions need to route back through the parent bridges, so yes
> you cannot do anything to disturb RID based routing in the active
> fabric either, which also means few changes to the subordinate bus
> range of any bridge are possible.

Thank you Jason for the confirmation.

Lukas, that means if we are using the path, we will need to save the
bus number along each path node. Different liveupdate devices might
share the parent bridges, we might want to de-duplicate that. Then you
end up with something very similar to the BDF design, where the path
part is just redundant if you have BDF.

That is what I meant previously: using the BDF has the same protections
as the path design, just simpler.

Chris
Re: [PATCH 02/21] PCI: Add API to track PCI devices preserved across Live Update
Posted by Pasha Tatashin 2 months, 1 week ago
On Sat, Nov 29, 2025 at 5:34 AM Lukas Wunner <lukas@wunner.de> wrote:
>
> On Wed, Nov 26, 2025 at 07:35:49PM +0000, David Matlack wrote:
> > Add an API to enable the PCI subsystem to track all devices that are
> > preserved across a Live Update, including both incoming devices (passed
> > from the previous kernel) and outgoing devices (passed to the next
> > kernel).
> >
> > Use PCI segment number and BDF to keep track of devices across Live
> > Update. This means the kernel must keep both identifiers constant across
> > a Live Update for any preserved device.
>
> While bus numbers will *usually* stay the same across next and previous
> kernel, there are exceptions.  E.g. if "pci=assign-busses" is specified
> on the command line, the kernel will re-assign bus numbers on every boot.
>
> The most portable way to identify PCI devices across kernels is to
> store their path from the root down the hierarchy.  Because the bus
> number might change but the device/function number on each bus stays
> the same.
>
> This is what EFI does with device paths:
> https://uefi.org/specs/UEFI/2.10/10_Protocols_Device_Path_Protocol.html
>
> Example:
> Acpi(PNP0A03,0)/Pci(1E|0)/Pci(0|0)
>
> Source:
> https://raw.githubusercontent.com/tianocore-docs/edk2-UefiDriverWritersGuide/main/3_foundation/39_uefi_device_paths/README.9.md
>
> We've got a device path *parser* in drivers/firmware/efi/dev-path-parser.c,
> but we don't have a *generator* for device paths in the kernel yet.

Hi Lukas,

Thanks for the input.

You are right that bus numbers can change in standard boot scenarios.
However, for Live Update, we skip firmware, and we would likely list
pci=assign-busses as an unsupported parameter. So, BDF should be
sufficient.

That said, if there is a better method using a stable hierarchical
path, and more importantly, if that method can be extended to other
bus types, we are open to considering it. The main hurdle is that we
would need a way to generate this stable path in the kernel and also
parse it during early boot.

Thanks,
Pasha