Integrate the LUO with the KHO framework to enable passing LUO state
across a kexec reboot.
When LUO is transitioned to a "prepared" state, it tells KHO to
finalize, so all memory segments that were added to KHO preservation
list are getting preserved. After "Prepared" state no new segments
can be preserved. If LUO is canceled, it also tells KHO to cancel the
serialization, and therefore, later LUO can go back into the prepared
state.
This patch introduces the following changes:
- At late_initcall time, allocate a KHO-preserved FDT blob and register it with KHO as the "LUO" subtree.
- Populate this FDT with a LUO compatibility string ("luo-v1") and the "liveupdate-number" counter.
LUO now depends on `CONFIG_KEXEC_HANDOVER`. The core state transition
logic (`luo_do_*_calls`) remains unimplemented in this patch.
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
include/linux/liveupdate/abi/luo.h | 54 ++++++++++
kernel/liveupdate/luo_core.c | 153 ++++++++++++++++++++++++++++-
2 files changed, 206 insertions(+), 1 deletion(-)
create mode 100644 include/linux/liveupdate/abi/luo.h
diff --git a/include/linux/liveupdate/abi/luo.h b/include/linux/liveupdate/abi/luo.h
new file mode 100644
index 000000000000..9483a294287f
--- /dev/null
+++ b/include/linux/liveupdate/abi/luo.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator ABI
+ *
+ * This header defines the stable Application Binary Interface used by the
+ * Live Update Orchestrator to pass state from a pre-update kernel to a
+ * post-update kernel. The ABI is built upon the Kexec HandOver framework
+ * and uses a Flattened Device Tree to describe the preserved data.
+ *
+ * This interface is a contract. Any modification to the FDT structure, node
+ * properties, compatible strings, or the layout of the `__packed` serialization
+ * structures defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the relevant `_COMPATIBLE` string to
+ * prevent a new kernel from misinterpreting data from an old kernel.
+ *
+ * FDT Structure Overview:
+ * The entire LUO state is encapsulated within a single KHO entry named "LUO".
+ * This entry contains an FDT with the following layout:
+ *
+ * .. code-block:: none
+ *
+ * / {
+ * compatible = "luo-v1";
+ * liveupdate-number = <...>;
+ * };
+ *
+ * Main LUO Node (/):
+ *
+ * - compatible: "luo-v1"
+ * Identifies the overall LUO ABI version.
+ * - liveupdate-number: u64
+ * A counter tracking the number of successful live updates performed.
+ */
+
+#ifndef _LINUX_LIVEUPDATE_ABI_LUO_H
+#define _LINUX_LIVEUPDATE_ABI_LUO_H
+
+/*
+ * The LUO FDT hooks all LUO state for sessions, fds, etc.
+ * In the root it also carries the "liveupdate-number" 64-bit property that
+ * corresponds to the number of live-updates performed on this machine.
+ */
+#define LUO_FDT_SIZE PAGE_SIZE
+#define LUO_FDT_KHO_ENTRY_NAME "LUO"
+#define LUO_FDT_COMPATIBLE "luo-v1"
+#define LUO_FDT_LIVEUPDATE_NUM "liveupdate-number"
+
+#endif /* _LINUX_LIVEUPDATE_ABI_LUO_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 0e1ab19fa1cd..4a213b262b9f 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -42,11 +42,24 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
#include <linux/kobject.h>
+#include <linux/libfdt.h>
#include <linux/liveupdate.h>
+#include <linux/liveupdate/abi/luo.h>
+#include <linux/mm.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#include "kexec_handover_internal.h"
static struct {
bool enabled;
+ void *fdt_out;
+ void *fdt_in;
+ u64 liveupdate_num;
} luo_global;
static int __init early_liveupdate_param(char *buf)
@@ -55,6 +68,129 @@ static int __init early_liveupdate_param(char *buf)
}
early_param("liveupdate", early_liveupdate_param);
+static int __init luo_early_startup(void)
+{
+ phys_addr_t fdt_phys;
+ int err, ln_size;
+ const void *ptr;
+
+ if (!kho_is_enabled()) {
+ if (liveupdate_enabled())
+ pr_warn("Disabling liveupdate because KHO is disabled\n");
+ luo_global.enabled = false;
+ return 0;
+ }
+
+ /* Retrieve LUO subtree, and verify its format. */
+ err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT) {
+ pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
+ LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+ }
+
+ luo_global.fdt_in = phys_to_virt(fdt_phys);
+ err = fdt_node_check_compatible(luo_global.fdt_in, 0,
+ LUO_FDT_COMPATIBLE);
+ if (err) {
+ pr_err("FDT '%s' is incompatible with '%s' [%d]\n",
+ LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err);
+
+ return -EINVAL;
+ }
+
+ ln_size = 0;
+ ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM,
+ &ln_size);
+ if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) {
+ pr_err("Unable to get live update number '%s' [%d]\n",
+ LUO_FDT_LIVEUPDATE_NUM, ln_size);
+
+ return -EINVAL;
+ }
+
+ luo_global.liveupdate_num = get_unaligned((u64 *)ptr);
+ pr_info("Retrieved live update data, liveupdate number: %lld\n",
+ luo_global.liveupdate_num);
+
+ return 0;
+}
+
+static int __init liveupdate_early_init(void)
+{
+ int err;
+
+ err = luo_early_startup();
+ if (err) {
+ pr_err("The incoming tree failed to initialize properly [%pe], disabling live update\n",
+ ERR_PTR(err));
+ luo_global.enabled = false;
+ }
+
+ return err;
+}
+early_initcall(liveupdate_early_init);
+
+/* Called during boot to create outgoing LUO fdt tree */
+static int __init luo_fdt_setup(void)
+{
+ const u64 ln = luo_global.liveupdate_num + 1;
+ void *fdt_out;
+ int err;
+
+ fdt_out = kho_alloc_preserve(LUO_FDT_SIZE);
+ if (IS_ERR(fdt_out)) {
+ pr_err("failed to allocate/preserve FDT memory\n");
+ return PTR_ERR(fdt_out);
+ }
+
+ err = fdt_create(fdt_out, LUO_FDT_SIZE);
+ err |= fdt_finish_reservemap(fdt_out);
+ err |= fdt_begin_node(fdt_out, "");
+ err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+ err |= fdt_end_node(fdt_out);
+ err |= fdt_finish(fdt_out);
+ if (err)
+ goto exit_free;
+
+ err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+ if (err)
+ goto exit_free;
+ luo_global.fdt_out = fdt_out;
+
+ return 0;
+
+exit_free:
+ kho_unpreserve_free(fdt_out);
+ pr_err("failed to prepare LUO FDT: %d\n", err);
+
+ return err;
+}
+
+/*
+ * late initcall because it initializes the outgoing tree that is needed only
+ * once userspace starts using /dev/liveupdate.
+ */
+static int __init luo_late_startup(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_fdt_setup();
+ if (err)
+ luo_global.enabled = false;
+
+ return err;
+}
+late_initcall(luo_late_startup);
+
/* Public Functions */
/**
@@ -69,7 +205,22 @@ early_param("liveupdate", early_liveupdate_param);
*/
int liveupdate_reboot(void)
{
- return 0;
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = kho_finalize();
+ if (err) {
+ pr_err("kho_finalize failed %d\n", err);
+ /*
+ * kho_finalize() may return libfdt errors; to avoid passing unknown
+ * errors to userspace, change this to -EAGAIN.
+ */
+ err = -EAGAIN;
+ }
+
+ return err;
}
/**
--
2.52.0.rc1.455.g30608eb744-goog
On Sat, Nov 15, 2025 at 06:33:48PM -0500, Pasha Tatashin wrote:
> Integrate the LUO with the KHO framework to enable passing LUO state
> across a kexec reboot.
>
> When LUO is transitioned to a "prepared" state, it tells KHO to
> finalize, so all memory segments that were added to KHO preservation
> list are getting preserved. After "Prepared" state no new segments
> can be preserved. If LUO is canceled, it also tells KHO to cancel the
> serialization, and therefore, later LUO can go back into the prepared
> state.
>
> This patch introduces the following changes:
> - During the KHO finalization phase allocate FDT blob.
This happens much earlier, isn't it?
> - Populate this FDT with a LUO compatibility string ("luo-v1").
>
> LUO now depends on `CONFIG_KEXEC_HANDOVER`. The core state transition
> logic (`luo_do_*_calls`) remains unimplemented in this patch.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> ---
> include/linux/liveupdate/abi/luo.h | 54 ++++++++++
> kernel/liveupdate/luo_core.c | 153 ++++++++++++++++++++++++++++-
> 2 files changed, 206 insertions(+), 1 deletion(-)
> create mode 100644 include/linux/liveupdate/abi/luo.h
>
> diff --git a/include/linux/liveupdate/abi/luo.h b/include/linux/liveupdate/abi/luo.h
> new file mode 100644
> index 000000000000..9483a294287f
> --- /dev/null
> +++ b/include/linux/liveupdate/abi/luo.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * Copyright (c) 2025, Google LLC.
> + * Pasha Tatashin <pasha.tatashin@soleen.com>
> + */
> +
> +/**
> + * DOC: Live Update Orchestrator ABI
> + *
> + * This header defines the stable Application Binary Interface used by the
> + * Live Update Orchestrator to pass state from a pre-update kernel to a
> + * post-update kernel. The ABI is built upon the Kexec HandOver framework
> + * and uses a Flattened Device Tree to describe the preserved data.
> + *
> + * This interface is a contract. Any modification to the FDT structure, node
> + * properties, compatible strings, or the layout of the `__packed` serialization
> + * structures defined here constitutes a breaking change. Such changes require
> + * incrementing the version number in the relevant `_COMPATIBLE` string to
> + * prevent a new kernel from misinterpreting data from an old kernel.
I'd add a sentence that stresses that ABI changes are possible as long as they
include changes to the FDT version.
This is indeed implied by the last paragraph, but I think it's worth
spelling it explicitly.
Another thing that I think this should mention is that compatibility is
only guaranteed for the kernels that use the same ABI version.
> + *
> + * FDT Structure Overview:
> + * The entire LUO state is encapsulated within a single KHO entry named "LUO".
> + * This entry contains an FDT with the following layout:
> + *
> + * .. code-block:: none
> + *
> + * / {
> + * compatible = "luo-v1";
> + * liveupdate-number = <...>;
> + * };
> + *
> + * Main LUO Node (/):
> + *
> + * - compatible: "luo-v1"
> + * Identifies the overall LUO ABI version.
> + * - liveupdate-number: u64
> + * A counter tracking the number of successful live updates performed.
> + */
...
> +static int __init liveupdate_early_init(void)
> +{
> + int err;
> +
> + err = luo_early_startup();
> + if (err) {
> + pr_err("The incoming tree failed to initialize properly [%pe], disabling live update\n",
> + ERR_PTR(err));
How do we report this to the userspace?
I think the decision what to do in this case belongs there. Even if it's
down to choosing between plain kexec and full reboot, it's still a policy
that should be implemented in userspace.
> + luo_global.enabled = false;
> + }
> +
> + return err;
> +}
> +early_initcall(liveupdate_early_init);
--
Sincerely yours,
Mike.
On Sun, Nov 16, 2025 at 7:43 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Sat, Nov 15, 2025 at 06:33:48PM -0500, Pasha Tatashin wrote:
> > Integrate the LUO with the KHO framework to enable passing LUO state
> > across a kexec reboot.
> >
> > When LUO is transitioned to a "prepared" state, it tells KHO to
> > finalize, so all memory segments that were added to KHO preservation
> > list are getting preserved. After "Prepared" state no new segments
> > can be preserved. If LUO is canceled, it also tells KHO to cancel the
> > serialization, and therefore, later LUO can go back into the prepared
> > state.
> >
> > This patch introduces the following changes:
> > - During the KHO finalization phase allocate FDT blob.
>
> This happens much earlier, isn't it?
It does; this commit log needs to be updated. It still talks about
prepare/cancel, which have been replaced with preserve/unfreeze
since v5.
>
> > - Populate this FDT with a LUO compatibility string ("luo-v1").
> >
> > LUO now depends on `CONFIG_KEXEC_HANDOVER`. The core state transition
> > logic (`luo_do_*_calls`) remains unimplemented in this patch.
> >
> > Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> > ---
> > include/linux/liveupdate/abi/luo.h | 54 ++++++++++
> > kernel/liveupdate/luo_core.c | 153 ++++++++++++++++++++++++++++-
> > 2 files changed, 206 insertions(+), 1 deletion(-)
> > create mode 100644 include/linux/liveupdate/abi/luo.h
> >
> > diff --git a/include/linux/liveupdate/abi/luo.h b/include/linux/liveupdate/abi/luo.h
> > new file mode 100644
> > index 000000000000..9483a294287f
> > --- /dev/null
> > +++ b/include/linux/liveupdate/abi/luo.h
> > @@ -0,0 +1,54 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +/*
> > + * Copyright (c) 2025, Google LLC.
> > + * Pasha Tatashin <pasha.tatashin@soleen.com>
> > + */
> > +
> > +/**
> > + * DOC: Live Update Orchestrator ABI
> > + *
> > + * This header defines the stable Application Binary Interface used by the
> > + * Live Update Orchestrator to pass state from a pre-update kernel to a
> > + * post-update kernel. The ABI is built upon the Kexec HandOver framework
> > + * and uses a Flattened Device Tree to describe the preserved data.
> > + *
> > + * This interface is a contract. Any modification to the FDT structure, node
> > + * properties, compatible strings, or the layout of the `__packed` serialization
> > + * structures defined here constitutes a breaking change. Such changes require
> > + * incrementing the version number in the relevant `_COMPATIBLE` string to
> > + * prevent a new kernel from misinterpreting data from an old kernel.
>
> I'd add a sentence that stresses that ABI changes are possible as long they
> include changes to the FDT version.
> This is indeed implied by the last paragraph, but I think it's worth
> spelling it explicitly.
>
> Another thing that I think this should mention is that compatibility is
> only guaranteed for the kernels that use the same ABI version.
Sure, I will add both.
> > + *
> > + * FDT Structure Overview:
> > + * The entire LUO state is encapsulated within a single KHO entry named "LUO".
> > + * This entry contains an FDT with the following layout:
> > + *
> > + * .. code-block:: none
> > + *
> > + * / {
> > + * compatible = "luo-v1";
> > + * liveupdate-number = <...>;
> > + * };
> > + *
> > + * Main LUO Node (/):
> > + *
> > + * - compatible: "luo-v1"
> > + * Identifies the overall LUO ABI version.
> > + * - liveupdate-number: u64
> > + * A counter tracking the number of successful live updates performed.
> > + */
> ...
>
> > +static int __init liveupdate_early_init(void)
> > +{
> > + int err;
> > +
> > + err = luo_early_startup();
> > + if (err) {
> > + pr_err("The incoming tree failed to initialize properly [%pe], disabling live update\n",
> > + ERR_PTR(err));
>
> How do we report this to the userspace?
> I think the decision what to do in this case belongs there. Even if it's
> down to choosing between plain kexec and full reboot, it's still a policy
> that should be implemented in userspace.
I agree that policy belongs in userspace, and that is how we designed
it. In this specific failure case (ABI mismatch or corrupt FDT), the
preserved state is unrecoverable by the kernel. We cannot parse the
incoming data, so we cannot offer it to userspace.
We report this state by not registering the /dev/liveupdate device.
When the userspace agent attempts to initialize, it receives ENOENT.
At that point, the agent exercises its policy:
- Check dmesg for the specific error and report the failure to the
fleet control plane.
- Trigger a fresh (kexec or cold) reboot to reset unreclaimable resources.
Pasha
On Sun, Nov 16, 2025 at 09:55:30AM -0500, Pasha Tatashin wrote:
> On Sun, Nov 16, 2025 at 7:43 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > > +static int __init liveupdate_early_init(void)
> > > +{
> > > + int err;
> > > +
> > > + err = luo_early_startup();
> > > + if (err) {
> > > + pr_err("The incoming tree failed to initialize properly [%pe], disabling live update\n",
> > > + ERR_PTR(err));
> >
> > How do we report this to the userspace?
> > I think the decision what to do in this case belongs there. Even if it's
> > down to choosing between plain kexec and full reboot, it's still a policy
> > that should be implemented in userspace.
>
> I agree that policy belongs in userspace, and that is how we designed
> it. In this specific failure case (ABI mismatch or corrupt FDT), the
> preserved state is unrecoverable by the kernel. We cannot parse the
> incoming data, so we cannot offer it to userspace.
>
> We report this state by not registering the /dev/liveupdate device.
> When the userspace agent attempts to initialize, it receives ENOENT.
> At that point, the agent exercises its policy:
>
> - Check dmesg for the specific error and report the failure to the
> fleet control plane.
Hmm, this is not nice. I think we still should register /dev/liveupdate and
let userspace discover this error via /dev/liveupdate ABIs.
> - Trigger a fresh (kexec or cold) reboot to reset unreclaimable resources.
>
> Pasha
>
--
Sincerely yours,
Mike.
On Sun, Nov 16, 2025 at 2:16 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Sun, Nov 16, 2025 at 09:55:30AM -0500, Pasha Tatashin wrote:
> > On Sun, Nov 16, 2025 at 7:43 AM Mike Rapoport <rppt@kernel.org> wrote:
> > >
> > > > +static int __init liveupdate_early_init(void)
> > > > +{
> > > > + int err;
> > > > +
> > > > + err = luo_early_startup();
> > > > + if (err) {
> > > > + pr_err("The incoming tree failed to initialize properly [%pe], disabling live update\n",
> > > > + ERR_PTR(err));
> > >
> > > How do we report this to the userspace?
> > > I think the decision what to do in this case belongs there. Even if it's
> > > down to choosing between plain kexec and full reboot, it's still a policy
> > > that should be implemented in userspace.
> >
> > I agree that policy belongs in userspace, and that is how we designed
> > it. In this specific failure case (ABI mismatch or corrupt FDT), the
> > preserved state is unrecoverable by the kernel. We cannot parse the
> > incoming data, so we cannot offer it to userspace.
> >
> > We report this state by not registering the /dev/liveupdate device.
> > When the userspace agent attempts to initialize, it receives ENOENT.
> > At that point, the agent exercises its policy:
> >
> > - Check dmesg for the specific error and report the failure to the
> > fleet control plane.
>
> Hmm, this is not nice. I think we still should register /dev/liveupdate and
> let userspace discover this error via /dev/liveupdate ABIs.
Not registering the device is the correct approach here for two reasons:
1. This follows the standard Linux driver pattern. If a driver fails
to initialize its underlying resources (hardware, firmware, or in this
case, the incoming FDT), it does not register a character device.
2. Registering a "zombie" device that exists solely to return errors
adds significant complexity. We would need to introduce a specific
"broken" state to the state machine and add checks to IOCTLs to reject
commands with a specific error code.
Pasha
On Mon, Nov 17, 2025 at 01:29:47PM -0500, Pasha Tatashin wrote:
> On Sun, Nov 16, 2025 at 2:16 PM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Sun, Nov 16, 2025 at 09:55:30AM -0500, Pasha Tatashin wrote:
> > > On Sun, Nov 16, 2025 at 7:43 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > >
> > > > > +static int __init liveupdate_early_init(void)
> > > > > +{
> > > > > + int err;
> > > > > +
> > > > > + err = luo_early_startup();
> > > > > + if (err) {
> > > > > + pr_err("The incoming tree failed to initialize properly [%pe], disabling live update\n",
> > > > > + ERR_PTR(err));
> > > >
> > > > How do we report this to the userspace?
> > > > I think the decision what to do in this case belongs there. Even if it's
> > > > down to choosing between plain kexec and full reboot, it's still a policy
> > > > that should be implemented in userspace.
> > >
> > > I agree that policy belongs in userspace, and that is how we designed
> > > it. In this specific failure case (ABI mismatch or corrupt FDT), the
> > > preserved state is unrecoverable by the kernel. We cannot parse the
> > > incoming data, so we cannot offer it to userspace.
> > >
> > > We report this state by not registering the /dev/liveupdate device.
> > > When the userspace agent attempts to initialize, it receives ENOENT.
> > > At that point, the agent exercises its policy:
> > >
> > > - Check dmesg for the specific error and report the failure to the
> > > fleet control plane.
> >
> > Hmm, this is not nice. I think we still should register /dev/liveupdate and
> > let userspace discover this error via /dev/liveupdate ABIs.
>
> Not registering the device is the correct approach here for two reasons:
>
> 1. This follows the standard Linux driver pattern. If a driver fails
> to initialize its underlying resources (hardware, firmware, or in this
> case, the incoming FDT), it does not register a character device.
> 2. Registering a "zombie" device that exists solely to return errors
> adds significant complexity. We would need to introduce a specific
> "broken" state to the state machine and add checks to IOCTLs to reject
> commands with a specific error code.
You can avoid that complexity if you register the device with a different
fops, but that's technicality.
Your point about treating the incoming FDT as an underlying resource that
failed to initialize makes sense, but nevertheless userspace needs a
reliable way to detect it and parsing dmesg is not something we should rely
on.
> Pasha
--
Sincerely yours,
Mike.
> You can avoid that complexity if you register the device with a different
> fops, but that's technicality.
>
> Your point about treating the incoming FDT as an underlying resource that
> failed to initialize makes sense, but nevertheless userspace needs a
> reliable way to detect it and parsing dmesg is not something we should rely
> on.
I see two solutions:
1. LUO fails to retrieve the preserved data, the user gets informed by
not finding /dev/liveupdate, and studying the dmesg for what has
happened (in reality in fleets version mismatches should not be
happening, those should be detected in quals).
2. Create a zombie device to return some errno on open, and still
study dmesg to understand what really happened.
I think that 1 is better
Pasha
On Mon, Nov 17, 2025 at 11:22:54PM -0500, Pasha Tatashin wrote: > > You can avoid that complexity if you register the device with a different > > fops, but that's technicality. > > > > Your point about treating the incoming FDT as an underlying resource that > > failed to initialize makes sense, but nevertheless userspace needs a > > reliable way to detect it and parsing dmesg is not something we should rely > > on. > > I see two solutions: > > 1. LUO fails to retrieve the preserved data, the user gets informed by > not finding /dev/liveupdate, and studying the dmesg for what has > happened (in reality in fleets version mismatches should not be > happening, those should be detected in quals). > 2. Create a zombie device to return some errno on open, and still > study dmesg to understand what really happened. User should not study dmesg. We need another solution. What's wrong with e.g. ioctl()? > I think that 1 is better > > Pasha -- Sincerely yours, Mike.
On Tue, Nov 18, 2025 at 01:21:34PM +0200, Mike Rapoport wrote: > On Mon, Nov 17, 2025 at 11:22:54PM -0500, Pasha Tatashin wrote: > > > You can avoid that complexity if you register the device with a different > > > fops, but that's technicality. > > > > > > Your point about treating the incoming FDT as an underlying resource that > > > failed to initialize makes sense, but nevertheless userspace needs a > > > reliable way to detect it and parsing dmesg is not something we should rely > > > on. > > > > I see two solutions: > > > > 1. LUO fails to retrieve the preserved data, the user gets informed by > > not finding /dev/liveupdate, and studying the dmesg for what has > > happened (in reality in fleets version mismatches should not be > > happening, those should be detected in quals). > > 2. Create a zombie device to return some errno on open, and still > > study dmesg to understand what really happened. > > User should not study dmesg. We need another solution. > What's wrong with e.g. ioctl()? It seems very dangerous to even boot at all if the next kernel doesn't understand the serialization information.. IMHO I think we should not even be thinking about this, it is up to the predecessor environment to prevent it from happening. The ideas to use ELF metadata/etc to allow a pre-flight validation are the right solution. If we get into the next kernel and it receives information it cannot process it should just BUG_ON and die, or some broad equivalent. It is a catastrophic orchestration error, and we don't need some fine grain recovery or userspace visibility. Crash dump the system and reboot it. IOW, I would not invest time in this. Jason
On Tue, Nov 18, 2025 at 10:03:00AM -0400, Jason Gunthorpe wrote: > On Tue, Nov 18, 2025 at 01:21:34PM +0200, Mike Rapoport wrote: > > On Mon, Nov 17, 2025 at 11:22:54PM -0500, Pasha Tatashin wrote: > > > > You can avoid that complexity if you register the device with a different > > > > fops, but that's technicality. > > > > > > > > Your point about treating the incoming FDT as an underlying resource that > > > > failed to initialize makes sense, but nevertheless userspace needs a > > > > reliable way to detect it and parsing dmesg is not something we should rely > > > > on. > > > > > > I see two solutions: > > > > > > 1. LUO fails to retrieve the preserved data, the user gets informed by > > > not finding /dev/liveupdate, and studying the dmesg for what has > > > happened (in reality in fleets version mismatches should not be > > > happening, those should be detected in quals). > > > 2. Create a zombie device to return some errno on open, and still > > > study dmesg to understand what really happened. > > > > User should not study dmesg. We need another solution. > > What's wrong with e.g. ioctl()? > > It seems very dangerous to even boot at all if the next kernel doesn't > understand the serialization information.. > > IMHO I think we should not even be thinking about this, it is up to > the predecessor environment to prevent it from happening. The ideas to > use ELF metadata/etc to allow a pre-flight validation are the right > solution. > > If we get into the next kernel and it receives information it cannot > process it should just BUG_ON and die, or some broad equivalent. > It is a catastrophic orchestration error, and we don't need some fine > grain recovery or userspace visibility. Crash dump the system and > reboot it. I was under impression Pasha wanted to get up to the userspace no matter what. panic() in liveupdate_early_init() makes perfect sense to me. Parsing dmesg does not. > IOW, I would not invest time in this. > > Jason -- Sincerely yours, Mike.
On Tue, Nov 18, 2025 at 10:06 AM Mike Rapoport <rppt@kernel.org> wrote: > > On Tue, Nov 18, 2025 at 10:03:00AM -0400, Jason Gunthorpe wrote: > > On Tue, Nov 18, 2025 at 01:21:34PM +0200, Mike Rapoport wrote: > > > On Mon, Nov 17, 2025 at 11:22:54PM -0500, Pasha Tatashin wrote: > > > > > You can avoid that complexity if you register the device with a different > > > > > fops, but that's technicality. > > > > > > > > > > Your point about treating the incoming FDT as an underlying resource that > > > > > failed to initialize makes sense, but nevertheless userspace needs a > > > > > reliable way to detect it and parsing dmesg is not something we should rely > > > > > on. > > > > > > > > I see two solutions: > > > > > > > > 1. LUO fails to retrieve the preserved data, the user gets informed by > > > > not finding /dev/liveupdate, and studying the dmesg for what has > > > > happened (in reality in fleets version mismatches should not be > > > > happening, those should be detected in quals). > > > > 2. Create a zombie device to return some errno on open, and still > > > > study dmesg to understand what really happened. > > > > > > User should not study dmesg. We need another solution. > > > What's wrong with e.g. ioctl()? > > > > It seems very dangerous to even boot at all if the next kernel doesn't > > understand the serialization information.. > > > > IMHO I think we should not even be thinking about this, it is up to > > the predecessor environment to prevent it from happening. The ideas to > > use ELF metadata/etc to allow a pre-flight validation are the right > > solution. 100% agreed, this is the goal. > > If we get into the next kernel and it receives information it cannot > > process it should just BUG_ON and die, or some broad equivalent. 
I initially had a panic() that would kill the kernel, but after further consideration, I realized that we can still boot into "maintenance" mode and allow the user to decide when and how to reboot the machine back to a normal state. Crashing during early boot has its own disadvantages: the crash kernel is not available. Also, because live-update has to be very fast, the console is likely to be disabled. Therefore, getting to userspace and allowing the user to investigate what happened (e.g., automatically retrieving dmesg or a core dump and filing a bug) before rebooting seems like the most sensible approach. This won't leak data, as /dev/liveupdate is completely disabled, so nothing preserved in memory will be recoverable. Pasha
On Tue, Nov 18, 2025 at 10:18:28AM -0500, Pasha Tatashin wrote: > On Tue, Nov 18, 2025 at 10:06 AM Mike Rapoport <rppt@kernel.org> wrote: > > > > On Tue, Nov 18, 2025 at 10:03:00AM -0400, Jason Gunthorpe wrote: > > > On Tue, Nov 18, 2025 at 01:21:34PM +0200, Mike Rapoport wrote: > > > > On Mon, Nov 17, 2025 at 11:22:54PM -0500, Pasha Tatashin wrote: > > > > > > You can avoid that complexity if you register the device with a different > > > > > > fops, but that's technicality. > > > > > > > > > > > > Your point about treating the incoming FDT as an underlying resource that > > > > > > failed to initialize makes sense, but nevertheless userspace needs a > > > > > > reliable way to detect it and parsing dmesg is not something we should rely > > > > > > on. > > > > > > > > > > I see two solutions: > > > > > > > > > > 1. LUO fails to retrieve the preserved data, the user gets informed by > > > > > not finding /dev/liveupdate, and studying the dmesg for what has > > > > > happened (in reality in fleets version mismatches should not be > > > > > happening, those should be detected in quals). > > > > > 2. Create a zombie device to return some errno on open, and still > > > > > study dmesg to understand what really happened. > > > > > > > > User should not study dmesg. We need another solution. > > > > What's wrong with e.g. ioctl()? > > > > > > It seems very dangerous to even boot at all if the next kernel doesn't > > > understand the serialization information.. > > > > > > IMHO I think we should not even be thinking about this, it is up to > > > the predecessor environment to prevent it from happening. The ideas to > > > use ELF metadata/etc to allow a pre-flight validation are the right > > > solution. > > 100% agreed, this is the goal. > > > > If we get into the next kernel and it receives information it cannot > > > process it should just BUG_ON and die, or some broad equivalent. 
> > I initially had a panic() that would kill the kernel, but after > further consideration, I realized that we can still boot into > "maintenance" mode and allow the user to decide when and how to reboot > the machine back to a normal state. > This won't leak data, as /dev/liveupdate is completely disabled, so > nothing preserved in memory will be recoverable. This seems reasonable, but it is still dangerous. At the minimum the KHO startup either needs to succeed, panic, or fail to online most of the memory (ie run from the safe region only) The above approach works better for things like VFIO or memfd where you can boot significantly safely. Not sure about iommu though, if iommu doesn't deserialize properly then it probably corrupts all memory too. Jason
> > This won't leak data, as /dev/liveupdate is completely disabled, so > > nothing preserved in memory will be recoverable. > > This seems reasonable, but it is still dangerous. > > At the minimum the KHO startup either needs to succeed, panic, or fail > to online most of the memory (ie run from the safe region only) Allowing degrade booting using only scratch memory sounds like a very good compromise. This allows the live-update boot to stay alive as a sort of "crash kernel," particularly since kdump functionality is not available here. However, it would require some work in KHO to enable such a feature. > The above approach works better for things like VFIO or memfd where > you can boot significantly safely. Not sure about iommu though, if > iommu doesn't deserialize properly then it probably corrupts all > memory too. Yes, DMA may corrupt memory if KHO is broken, *but* we are discussing broken LUO recovering, the KHO preserved memory should still stay as preserved but unretriable, so DMA activity should only happen to those regions... Pasha > > Jason
On Tue, Nov 18, 2025 at 10:46:35AM -0500, Pasha Tatashin wrote: > > > This won't leak data, as /dev/liveupdate is completely disabled, so > > > nothing preserved in memory will be recoverable. > > > > This seems reasonable, but it is still dangerous. > > > > At the minimum the KHO startup either needs to succeed, panic, or fail > > to online most of the memory (ie run from the safe region only) > > Allowing degrade booting using only scratch memory sounds like a very > good compromise. This allows the live-update boot to stay alive as a > sort of "crash kernel," particularly since kdump functionality is not > available here. However, it would require some work in KHO to enable > such a feature. > > > The above approach works better for things like VFIO or memfd where > > you can boot significantly safely. Not sure about iommu though, if > > iommu doesn't deserialize properly then it probably corrupts all > > memory too. > > Yes, DMA may corrupt memory if KHO is broken, *but* we are discussing > broken LUO recovering, the KHO preserved memory should still stay as > preserved but unretriable, so DMA activity should only happen to those > regions... If the iommu is not preserved then normal iommu boot will possibly set the translation to identity and it will scribble over random memory. You can't rely on the translation being present and only reaching kho preserved memory if the iommu can't restore itself. Jason
On Tue, Nov 18, 2025 at 11:15 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Tue, Nov 18, 2025 at 10:46:35AM -0500, Pasha Tatashin wrote: > > > > This won't leak data, as /dev/liveupdate is completely disabled, so > > > > nothing preserved in memory will be recoverable. > > > > > > This seems reasonable, but it is still dangerous. > > > > > > At the minimum the KHO startup either needs to succeed, panic, or fail > > > to online most of the memory (ie run from the safe region only) > > > > Allowing degrade booting using only scratch memory sounds like a very > > good compromise. This allows the live-update boot to stay alive as a > > sort of "crash kernel," particularly since kdump functionality is not > > available here. However, it would require some work in KHO to enable > > such a feature. > > > > > The above approach works better for things like VFIO or memfd where > > > you can boot significantly safely. Not sure about iommu though, if > > > iommu doesn't deserialize properly then it probably corrupts all > > > memory too. > > > > Yes, DMA may corrupt memory if KHO is broken, *but* we are discussing > > broken LUO recovering, the KHO preserved memory should still stay as > > preserved but unretriable, so DMA activity should only happen to those > > regions... > > If the iommu is not preserved then normal iommu boot will possibly set > the translation the identiy and it will scribble over random memory. > > You can't rely on the translation being present and only reaching kho > preserved memroy if the iommu can't restore itself. In this case, we cannot even rely on having "safe" memory, i.e. this scratch only boot to preserve dmesg/core etc, this is unfortunate. Is there a way to avoid defaulting to identity mode when we are booting into the "maintenance" mode? Thanks, Pasha > > Jason
On Tue, Nov 18, 2025 at 05:07:15PM -0500, Pasha Tatashin wrote: > In this case, we cannot even rely on having "safe" memory, i.e. this > scratch only boot to preserve dmesg/core etc, this is unfortunate. Is > there a way to avoid defaulting to identify mode when we are booting > into the "maintenance" mode? Maybe one could be created? It's tricky though because you also really want to block drivers from using the iommu if you don't know they are quieted and you can't do that without parsing the KHO data, which you can't do because it doesn't understand it.. IDK, I think the "maintenance" mode is something that is probably best effort and shouldn't be relied on. It will work if the iommu data is restored or other lucky conditions hit, so it is not useless, but it is certainly not robust or guaranteed. You are better to squirt a panic message out of the serial port and hope for the best I guess. Jason
On Tue, Nov 18, 2025 at 6:25 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Tue, Nov 18, 2025 at 05:07:15PM -0500, Pasha Tatashin wrote: > > > In this case, we cannot even rely on having "safe" memory, i.e. this > > scratch only boot to preserve dmesg/core etc, this is unfortunate. Is > > there a way to avoid defaulting to identify mode when we are booting > > into the "maintenance" mode? > > Maybe one could be created? > > It's tricky though because you also really want to block drivers from > using the iommu if you don't know they are quieted and you can't do > that without parsing the KHO data, which you can't do because it > doesn't understand it.. > > IDK, I think the "maintenance" mode is something that is probably best > effort and shouldn't be relied on. It will work if the iommu data is > restored or other lucky conditions hit, so it is not useless, but it > is certainly not robust or guaranteed. Right, even kdump has always been best-effort; many types of crashes do not make it to the crash kernel. > You are better to squirt a panic message out of the serial port and For early boot LUO mismatches, or if FLB data is inaccessible for any reason, devices might go rogue, so triggering a panic during boot is appropriate. However, session and file data structures are deserialized later, when /dev/liveupdate is first opened by userspace. If deserialization fails at that stage, I think we should simply fail the open(/dev/liveupdate) call with an error such as -EIO. Pasha
On Tue, Nov 18, 2025 at 10:03:03PM -0500, Pasha Tatashin wrote: > On Tue, Nov 18, 2025 at 6:25 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Tue, Nov 18, 2025 at 05:07:15PM -0500, Pasha Tatashin wrote: > > > > > In this case, we cannot even rely on having "safe" memory, i.e. this > > > scratch only boot to preserve dmesg/core etc, this is unfortunate. Is > > > there a way to avoid defaulting to identify mode when we are booting > > > into the "maintenance" mode? > > > > Maybe one could be created? > > > > It's tricky though because you also really want to block drivers from > > using the iommu if you don't know they are quieted and you can't do > > that without parsing the KHO data, which you can't do because it > > doesn't understand it.. > > > > IDK, I think the "maintenance" mode is something that is probably best > > effort and shouldn't be relied on. It will work if the iommu data is > > restored or other lucky conditions hit, so it is not useless, but it > > is certainly not robust or guaranteed. > > Right, even kdump has always been best-effort; many types of crashes > do not make it to the crash kernel. > > > You are better to squirt a panic message out of the serial port and > > For early boot LUO mismatches, or if FLB data is inaccessible for any > reason, devices might go rogue, so triggering a panic during boot is > appropriate. > > However, session and file data structures are deserialized later, when > /dev/liveupdate is first opened by userspace. If deserialization fails > at that stage, I think we should simply fail the open(/dev/liveupdate) > call with an error such as -EIO. That seems reasonable, if you reached this point then it is probably OK. Most likely the prior kernel should mark some critical things like kho, iommu and pci data as 'mandatory early boot' and if the new kernel doesn't use them then blow up right away. Jason
© 2016 - 2026 Red Hat, Inc.