Introduce the user-space interface for the Live Update Orchestrator
via ioctl commands, enabling external control over the live update
process and management of preserved resources.
Create a character device at /dev/liveupdate. Access
to this device requires the CAP_SYS_ADMIN capability.
A new uAPI header, <uapi/linux/liveupdate.h>, defines the necessary
structures. The magic number is registered in
Documentation/userspace-api/ioctl/ioctl-number.rst.
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
.../userspace-api/ioctl/ioctl-number.rst | 2 +
include/linux/liveupdate.h | 36 +--
include/uapi/linux/liveupdate.h | 265 ++++++++++++++++++
kernel/liveupdate/Makefile | 1 +
kernel/liveupdate/luo_ioctl.c | 178 ++++++++++++
5 files changed, 447 insertions(+), 35 deletions(-)
create mode 100644 include/uapi/linux/liveupdate.h
create mode 100644 kernel/liveupdate/luo_ioctl.c
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index bc91756bde73..8368aa05b4df 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -380,6 +380,8 @@ Code Seq# Include File Comments
0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver
0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver
<mailto:linux-hyperv@vger.kernel.org>
+0xBA all uapi/linux/liveupdate.h Pasha Tatashin
+ <mailto:pasha.tatashin@soleen.com>
0xC0 00-0F linux/usb/iowarrior.h
0xCA 00-0F uapi/misc/cxl.h Dead since 6.15
0xCA 10-2F uapi/misc/ocxl.h
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index 28a8aa4cafca..970447de5d8c 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -10,6 +10,7 @@
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <uapi/linux/liveupdate.h>
/**
* enum liveupdate_event - Events that trigger live update callbacks.
@@ -53,41 +54,6 @@ enum liveupdate_event {
LIVEUPDATE_CANCEL,
};
-/**
- * enum liveupdate_state - Defines the possible states of the live update
- * orchestrator.
- * @LIVEUPDATE_STATE_UNDEFINED: State has not yet been initialized.
- * @LIVEUPDATE_STATE_NORMAL: Default state, no live update in progress.
- * @LIVEUPDATE_STATE_PREPARED: Live update is prepared for reboot; the
- * LIVEUPDATE_PREPARE callbacks have completed
- * successfully.
- * Devices might operate in a limited state
- * for example the participating devices might
- * not be allowed to unbind, and also the
- * setting up of new DMA mappings might be
- * disabled in this state.
- * @LIVEUPDATE_STATE_FROZEN: The final reboot event
- * (%LIVEUPDATE_FREEZE) has been sent, and the
- * system is performing its final state saving
- * within the "blackout window". User
- * workloads must be suspended. The actual
- * reboot (kexec) into the next kernel is
- * imminent.
- * @LIVEUPDATE_STATE_UPDATED: The system has rebooted into the next
- * kernel via live update the system is now
- * running the next kernel, awaiting the
- * finish event.
- *
- * These states track the progress and outcome of a live update operation.
- */
-enum liveupdate_state {
- LIVEUPDATE_STATE_UNDEFINED = 0,
- LIVEUPDATE_STATE_NORMAL = 1,
- LIVEUPDATE_STATE_PREPARED = 2,
- LIVEUPDATE_STATE_FROZEN = 3,
- LIVEUPDATE_STATE_UPDATED = 4,
-};
-
struct file;
/**
diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
new file mode 100644
index 000000000000..7b12a1073c3c
--- /dev/null
+++ b/include/uapi/linux/liveupdate.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/*
+ * Userspace interface for /dev/liveupdate
+ * Live Update Orchestrator
+ *
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _UAPI_LIVEUPDATE_H
+#define _UAPI_LIVEUPDATE_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * enum liveupdate_state - Defines the possible states of the live update
+ * orchestrator.
+ * @LIVEUPDATE_STATE_UNDEFINED: State has not yet been initialized.
+ * @LIVEUPDATE_STATE_NORMAL: Default state, no live update in progress.
+ * @LIVEUPDATE_STATE_PREPARED: Live update is prepared for reboot; the
+ * LIVEUPDATE_PREPARE callbacks have completed
+ * successfully.
+ * Devices might operate in a limited state
+ * for example the participating devices might
+ * not be allowed to unbind, and also the
+ * setting up of new DMA mappings might be
+ * disabled in this state.
+ * @LIVEUPDATE_STATE_FROZEN: The final reboot event
+ * (%LIVEUPDATE_FREEZE) has been sent, and the
+ * system is performing its final state saving
+ * within the "blackout window". User
+ * workloads must be suspended. The actual
+ * reboot (kexec) into the next kernel is
+ * imminent.
+ * @LIVEUPDATE_STATE_UPDATED: The system has rebooted into the next
+ * kernel via live update; the system is now
+ * running the next kernel, awaiting the
+ * finish event.
+ *
+ * These states track the progress and outcome of a live update operation.
+ */
+enum liveupdate_state {
+ LIVEUPDATE_STATE_UNDEFINED = 0,
+ LIVEUPDATE_STATE_NORMAL = 1,
+ LIVEUPDATE_STATE_PREPARED = 2,
+ LIVEUPDATE_STATE_FROZEN = 3,
+ LIVEUPDATE_STATE_UPDATED = 4,
+};
+
+/**
+ * struct liveupdate_fd - Holds parameters for preserving and restoring file
+ * descriptors across live update.
+ * @fd: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file
+ * descriptor to be preserved.
+ * Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor
+ * representing the fully restored kernel resource.
+ * @flags: Unused, reserved for future expansion, must be set to 0.
+ * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token
+ * associated with the preserved resource.
+ * Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously
+ * provided to the preserve ioctl for the resource to be restored.
+ *
+ * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE
+ * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types
+ * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their
+ * underlying kernel state preserved across a live update cycle.
+ *
+ * To preserve an FD, user space passes this struct to
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the
+ * kernel uses the @token field to uniquely identify the preserved FD.
+ *
+ * After the live update transition, user space passes the struct populated with
+ * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token
+ * to find the preserved state and, on success, populates the @fd field with a
+ * new file descriptor referring to the restored resource.
+ */
+struct liveupdate_fd {
+ int fd;
+ __u32 flags;
+ __aligned_u64 token;
+};
+
+/* The ioctl type, documented in ioctl-number.rst */
+#define LIVEUPDATE_IOCTL_TYPE 0xBA
+
+/**
+ * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file
+ * descriptor.
+ *
+ * Argument: Pointer to &struct liveupdate_fd.
+ *
+ * User sets the @fd field identifying the file descriptor to preserve
+ * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
+ * and its dependencies are supported for preservation. If validation passes,
+ * the kernel marks the FD internally and *initiates the process* of preparing
+ * its state for saving. The actual snapshotting of the state typically occurs
+ * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
+ * some finalization might occur during freeze.
+ * On successful validation and initiation, the kernel associates the @token
+ * field, an opaque identifier, with the resource being preserved.
+ * This token confirms the FD is targeted for preservation and is required for
+ * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update.
+ *
+ * Return: 0 on success (validation passed, preservation initiated), negative
+ * error code on failure (e.g., unsupported FD type, dependency issue,
+ * validation failed).
+ */
+#define LIVEUPDATE_IOCTL_FD_PRESERVE \
+ _IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd)
+
+/**
+ * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the
+ * preservation list.
+ *
+ * Argument: Pointer to __u64 token.
+ *
+ * Allows user space to explicitly remove a file descriptor from the set of
+ * items marked as potentially preservable. User space provides a pointer to the
+ * __u64 @token that was previously returned by a successful
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE call (potentially from a prior, possibly
+ * cancelled, live update attempt). The kernel reads the token value from the
+ * provided user-space address.
+ *
+ * On success, the kernel removes the corresponding entry (identified by the
+ * token value read from the user pointer) from its internal preservation list.
+ * The provided @token (representing the now-removed entry) becomes invalid
+ * after this call.
+ *
+ * Return: 0 on success, negative error code on failure (e.g., -EBUSY or -EINVAL
+ * if not in %LIVEUPDATE_STATE_NORMAL, bad address provided, invalid token value
+ * read, token not found).
+ */
+#define LIVEUPDATE_IOCTL_FD_UNPRESERVE \
+ _IOW(LIVEUPDATE_IOCTL_TYPE, 0x01, __u64)
+
+/**
+ * LIVEUPDATE_IOCTL_FD_RESTORE - Restore a previously preserved file descriptor.
+ *
+ * Argument: Pointer to &struct liveupdate_fd.
+ *
+ * User sets the @token field to the value obtained from a successful
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE call before the live update. On success,
+ * the kernel restores the state (saved during the PREPARE/FREEZE phases)
+ * associated with the token and populates the @fd field with a new file
+ * descriptor referencing the restored resource in the current (new) kernel.
+ * This operation must be performed *before* signaling completion via
+ * %LIVEUPDATE_IOCTL_FINISH.
+ *
+ * Return: 0 on success, negative error code on failure (e.g., invalid token).
+ */
+#define LIVEUPDATE_IOCTL_FD_RESTORE \
+ _IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd)
+
+/**
+ * LIVEUPDATE_IOCTL_GET_STATE - Query the current state of the live update
+ * orchestrator.
+ *
+ * Argument: Pointer to &enum liveupdate_state.
+ *
+ * The kernel fills the enum value pointed to by the argument with the current
+ * state of the live update subsystem. Possible states are:
+ *
+ * - %LIVEUPDATE_STATE_NORMAL: Default state; no live update operation is
+ * currently in progress.
+ * - %LIVEUPDATE_STATE_PREPARED: The preparation phase (triggered by
+ * %LIVEUPDATE_IOCTL_PREPARE) has completed
+ * successfully. The system is ready for the
+ * reboot transition. Note that some
+ * device operations (e.g., unbinding, new DMA
+ * mappings) might be restricted in this state.
+ * - %LIVEUPDATE_STATE_UPDATED: The system has successfully rebooted into the
+ * new kernel via live update. It is now running
+ * the new kernel code and is awaiting the
+ * completion signal from user space via
+ * %LIVEUPDATE_IOCTL_FINISH after
+ * restoration tasks are done.
+ *
+ * See the definition of &enum liveupdate_state for more details on each state.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+#define LIVEUPDATE_IOCTL_GET_STATE \
+ _IOR(LIVEUPDATE_IOCTL_TYPE, 0x03, enum liveupdate_state)
+
+/**
+ * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state
+ * saving.
+ *
+ * Argument: None.
+ *
+ * Initiates the live update preparation phase. This action corresponds to
+ * the internal %LIVEUPDATE_PREPARE. This typically triggers the saving process
+ * for items marked via the PRESERVE ioctls. This typically occurs *before*
+ * the "blackout window", while user applications (e.g., VMs) may still be
+ * running. Kernel subsystems receiving the %LIVEUPDATE_PREPARE event should
+ * serialize necessary state. This command does not transfer data.
+ *
+ * Return: 0 on success, negative error code on failure. Transitions state
+ * towards %LIVEUPDATE_STATE_PREPARED on success.
+ */
+#define LIVEUPDATE_IOCTL_PREPARE \
+ _IO(LIVEUPDATE_IOCTL_TYPE, 0x04)
+
+/**
+ * LIVEUPDATE_IOCTL_CANCEL - Cancel the live update preparation phase.
+ *
+ * Argument: None.
+ *
+ * Notifies the live update subsystem to abort the preparation sequence
+ * potentially initiated by %LIVEUPDATE_IOCTL_PREPARE. This action
+ * typically corresponds to the internal %LIVEUPDATE_CANCEL kernel event,
+ * which might also be triggered automatically if the PREPARE stage fails
+ * internally.
+ *
+ * When triggered, subsystems receiving the %LIVEUPDATE_CANCEL event should
+ * revert any state changes or actions taken specifically for the aborted
+ * prepare phase (e.g., discard partially serialized state). The kernel
+ * releases resources allocated specifically for this *aborted preparation
+ * attempt*.
+ *
+ * This operation cancels the current *attempt* to prepare for a live update
+ * but does **not** remove previously validated items from the internal list
+ * of potentially preservable resources. Consequently, preservation tokens
+ * previously generated by successful %LIVEUPDATE_IOCTL_FD_PRESERVE calls
+ * generally **remain valid** as identifiers for those potentially preservable
+ * resources. However, since the system state returns towards
+ * %LIVEUPDATE_STATE_NORMAL, user space must initiate a new live update sequence
+ * (starting with %LIVEUPDATE_IOCTL_PREPARE) to proceed with an update
+ * using these (or other) tokens.
+ *
+ * This command does not transfer data. Kernel callbacks for the
+ * %LIVEUPDATE_CANCEL event must not fail.
+ *
+ * Return: 0 on success, negative error code on failure. Transitions state back
+ * towards %LIVEUPDATE_STATE_NORMAL on success.
+ */
+#define LIVEUPDATE_IOCTL_CANCEL \
+ _IO(LIVEUPDATE_IOCTL_TYPE, 0x06)
+
+/**
+ * LIVEUPDATE_IOCTL_FINISH - Signal restoration completion and trigger
+ * cleanup.
+ *
+ * Argument: None.
+ *
+ * Signals that user space has completed all necessary restoration actions in
+ * the new kernel (after a live update reboot). This action corresponds to the
+ * internal %LIVEUPDATE_FINISH kernel event. Calling this ioctl triggers the
+ * cleanup phase: any resources that were successfully preserved but were *not*
+ * subsequently restored (reclaimed) via the RESTORE ioctls will have their
+ * preserved state discarded and associated kernel resources released. Involved
+ * devices may be reset. All desired restorations *must* be completed *before*
+ * this. Kernel callbacks for the %LIVEUPDATE_FINISH event must not fail.
+ * Successfully completing this phase transitions the system state from
+ * %LIVEUPDATE_STATE_UPDATED back to %LIVEUPDATE_STATE_NORMAL. This command does
+ * not transfer data.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+#define LIVEUPDATE_IOCTL_FINISH \
+ _IO(LIVEUPDATE_IOCTL_TYPE, 0x07)
+
+#endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index b5054140b9a9..cb3ea380f6b9 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
obj-$(CONFIG_LIVEUPDATE) += luo_core.o
obj-$(CONFIG_LIVEUPDATE) += luo_files.o
+obj-$(CONFIG_LIVEUPDATE) += luo_ioctl.o
obj-$(CONFIG_LIVEUPDATE) += luo_subsystems.o
diff --git a/kernel/liveupdate/luo_ioctl.c b/kernel/liveupdate/luo_ioctl.c
new file mode 100644
index 000000000000..3de1d243df5a
--- /dev/null
+++ b/kernel/liveupdate/luo_ioctl.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a misc character device, typically found at ``/dev/liveupdate``,
+ * which allows privileged userspace applications (requiring %CAP_SYS_ADMIN) to
+ * manage and monitor the LUO state machine and associated resources like
+ * preservable file descriptors.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+static int luo_ioctl_fd_restore(struct liveupdate_fd *luo_fd)
+{
+ struct file *file;
+ int ret;
+ int fd;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ pr_err("Failed to allocate new fd: %d\n", fd);
+ return fd;
+ }
+
+ ret = luo_retrieve_file(luo_fd->token, &file);
+ if (ret < 0) {
+ put_unused_fd(fd);
+
+ return ret;
+ }
+
+ fd_install(fd, file);
+ luo_fd->fd = fd;
+
+ return 0;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (filep->f_flags & O_EXCL)
+ return -EINVAL;
+
+ return 0;
+}
+
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct liveupdate_fd luo_fd;
+ enum liveupdate_state state;
+ int ret = 0;
+ u64 token;
+
+ if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE)
+ return -ENOTTY;
+
+ switch (cmd) {
+ case LIVEUPDATE_IOCTL_GET_STATE:
+ state = liveupdate_get_state();
+ if (copy_to_user(argp, &state, sizeof(state)))
+ ret = -EFAULT;
+ break;
+
+ case LIVEUPDATE_IOCTL_PREPARE:
+ ret = luo_prepare();
+ break;
+
+ case LIVEUPDATE_IOCTL_FINISH:
+ ret = luo_finish();
+ break;
+
+ case LIVEUPDATE_IOCTL_CANCEL:
+ ret = luo_cancel();
+ break;
+
+ case LIVEUPDATE_IOCTL_FD_PRESERVE:
+ if (copy_from_user(&luo_fd, argp, sizeof(luo_fd))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = luo_register_file(luo_fd.token, luo_fd.fd);
+ if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) {
+ WARN_ON_ONCE(luo_unregister_file(luo_fd.token));
+ ret = -EFAULT;
+ }
+ break;
+
+ case LIVEUPDATE_IOCTL_FD_UNPRESERVE:
+ if (copy_from_user(&token, argp, sizeof(u64))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = luo_unregister_file(token);
+ break;
+
+ case LIVEUPDATE_IOCTL_FD_RESTORE:
+ if (copy_from_user(&luo_fd, argp, sizeof(luo_fd))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = luo_ioctl_fd_restore(&luo_fd);
+ if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd)))
+ ret = -EFAULT;
+ break;
+
+ default:
+ pr_warn("ioctl: unknown command nr: 0x%x\n", _IOC_NR(cmd));
+ ret = -ENOTTY;
+ break;
+ }
+
+ return ret;
+}
+
+static const struct file_operations fops = {
+ .owner = THIS_MODULE,
+ .open = luo_open,
+ .unlocked_ioctl = luo_ioctl,
+};
+
+static struct miscdevice liveupdate_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "liveupdate",
+ .fops = &fops,
+};
+
+static int __init liveupdate_init(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = misc_register(&liveupdate_miscdev);
+ if (err < 0) {
+ pr_err("Failed to register misc device '%s': %d\n",
+ liveupdate_miscdev.name, err);
+ }
+
+ return err;
+}
+module_init(liveupdate_init);
+
+static void __exit liveupdate_exit(void)
+{
+ misc_deregister(&liveupdate_miscdev);
+}
+module_exit(liveupdate_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pasha Tatashin");
+MODULE_DESCRIPTION("Live Update Orchestrator");
+MODULE_VERSION("0.1");
--
2.50.0.727.gbf7dc18ff4-goog
On Wed, Jul 23, 2025 at 02:46:29PM +0000, Pasha Tatashin wrote:
> Introduce the user-space interface for the Live Update Orchestrator
> via ioctl commands, enabling external control over the live update
> process and management of preserved resources.
I strongly recommend copying something like fwctl (which is copying
iommufd, which is copying some other best practices). I will try to
outline the main points below.
The design of the fwctl scheme allows a lot of options for ABI
compatible future extensions and I very strongly recommend that
complex ioctl style APIs be built with that in mind. I have so many
scars from trying to undo fixed ABI design :)
> +/**
> + * struct liveupdate_fd - Holds parameters for preserving and restoring file
> + * descriptors across live update.
> + * @fd: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file
> + * descriptor to be preserved.
> + * Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor
> + * representing the fully restored kernel resource.
> + * @flags: Unused, reserved for future expansion, must be set to 0.
> + * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token
> + * preserved for preserved resource.
> + * Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously
> + * provided to the preserve ioctl for the resource to be restored.
> + *
> + * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE
> + * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types
> + * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their
> + * underlying kernel state preserved across a live update cycle.
> + *
> + * To preserve an FD, user space passes this struct to
> + * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the
> + * kernel uses the @token field to uniquly associate the preserved FD.
> + *
> + * After the live update transition, user space passes the struct populated with
> + * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token
> + * to find the preserved state and, on success, populates the @fd field with a
> + * new file descriptor referring to the restored resource.
> + */
> +struct liveupdate_fd {
> + int fd;
'int' should not appear in uapi structs. Fds are __s32
> + __u32 flags;
> + __aligned_u64 token;
> +};
> +
> +/* The ioctl type, documented in ioctl-number.rst */
> +#define LIVEUPDATE_IOCTL_TYPE 0xBA
I have found it very helpful to organize the ioctl numbering like this:
#define IOMMUFD_TYPE (';')
enum {
IOMMUFD_CMD_BASE = 0x80,
IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
IOMMUFD_CMD_IOAS_ALLOC = 0x81,
IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82,
[..]
#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
The numbers should be tightly packed and non-overlapping. It becomes
difficult to manage this if the numbers are sprinkled all over the
file. The above structuring will enforce git am conflicts if things
get muddled up. Saved me a few times already in iommufd.
> +/**
> + * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file
> + * descriptor.
> + *
> + * Argument: Pointer to &struct liveupdate_fd.
> + *
> + * User sets the @fd field identifying the file descriptor to preserve
> + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
> + * and its dependencies are supported for preservation. If validation passes,
> + * the kernel marks the FD internally and *initiates the process* of preparing
> + * its state for saving. The actual snapshotting of the state typically occurs
> + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
> + * some finalization might occur during freeze.
> + * On successful validation and initiation, the kernel uses the @token
> + * field with an opaque identifier representing the resource being preserved.
> + * This token confirms the FD is targeted for preservation and is required for
> + * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update.
> + *
> + * Return: 0 on success (validation passed, preservation initiated), negative
> + * error code on failure (e.g., unsupported FD type, dependency issue,
> + * validation failed).
> + */
> +#define LIVEUPDATE_IOCTL_FD_PRESERVE \
> + _IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd)
From a kdoc perspective I find it works much better to attach the kdoc
to the struct, not the ioctl:
/**
* struct iommu_destroy - ioctl(IOMMU_DESTROY)
* @size: sizeof(struct iommu_destroy)
* @id: iommufd object ID to destroy. Can be any destroyable object type.
*
* Destroy any object held within iommufd.
*/
struct iommu_destroy {
__u32 size;
__u32 id;
};
#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
Generates this kdoc:
https://docs.kernel.org/userspace-api/iommufd.html#c.iommu_destroy
You should also make sure to link the uapi header into the kdoc build
under the "userspace API" chapter.
The structs should also be self-describing. I am fairly strongly
against using the size mechanism in the _IOW macro, it is instantly
ABI incompatible and basically impossible to deal with from userspace.
Hence why the IOMMUFD version is _IO().
This means stick a size member in the first 4 bytes of every
struct. More on this later..
> +/**
> + * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the
> + * preservation list.
> + *
> + * Argument: Pointer to __u64 token.
Every ioctl should have a struct, with the size header. If you want to
do more down the road you cannot do so using this structure.
> +#define LIVEUPDATE_IOCTL_FD_RESTORE \
> + _IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd)
Strongly recommend that every ioctl have a unique struct. Sharing
structs makes future extensibility harder.
> +/**
> + * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state
> + * saving.
Perhaps these just want to be a single 'set state' ioctl with an enum
input argument?
> @@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
> obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
> obj-$(CONFIG_LIVEUPDATE) += luo_core.o
> obj-$(CONFIG_LIVEUPDATE) += luo_files.o
> +obj-$(CONFIG_LIVEUPDATE) += luo_ioctl.o
> obj-$(CONFIG_LIVEUPDATE) += luo_subsystems.o
I don't think luo is modular, but I think it is generally better to
write the kbuilds as though it was anyhow if it has a lot of files:
iommufd-y := \
device.o \
eventq.o \
hw_pagetable.o \
io_pagetable.o \
ioas.o \
main.o \
pages.o \
vfio_compat.o \
viommu.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
Basically don't repeat obj-$(CONFIG_LIVEUPDATE), every one of those
lines creates a new module (if it was modular)
> +static int luo_open(struct inode *inodep, struct file *filep)
> +{
> + if (!capable(CAP_SYS_ADMIN))
> + return -EACCES;
IMHO file system permissions should control permission to open. No
capable check.
> + if (filep->f_flags & O_EXCL)
> + return -EINVAL;
O_EXCL doesn't really do anything for cdev, I'd drop this.
The open should have an atomic to check for single open though.
> +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
> +{
> + void __user *argp = (void __user *)arg;
> + struct liveupdate_fd luo_fd;
> + enum liveupdate_state state;
> + int ret = 0;
> + u64 token;
> +
> + if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE)
> + return -ENOTTY;
The generic parse/dispatch from fwctl is a really good idea here, you
can cut and paste it, change the names. It makes it really easy to manage future extensibility:
List the ops and their structs:
static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
IOCTL_OP(FWCTL_INFO, fwctl_cmd_info, struct fwctl_info, out_device_data),
IOCTL_OP(FWCTL_RPC, fwctl_cmd_rpc, struct fwctl_rpc, out),
};
Index the list and copy_from_user the struct describing the op:
static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct fwctl_uctx *uctx = filp->private_data;
const struct fwctl_ioctl_op *op;
struct fwctl_ucmd ucmd = {};
union fwctl_ucmd_buffer buf;
unsigned int nr;
int ret;
nr = _IOC_NR(cmd);
if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
return -ENOIOCTLCMD;
op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
if (op->ioctl_num != cmd)
return -ENOIOCTLCMD;
ucmd.uctx = uctx;
ucmd.cmd = &buf;
ucmd.ubuffer = (void __user *)arg;
// This is reading/checking the standard 4 byte size header:
ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
if (ret)
return ret;
if (ucmd.user_size < op->min_size)
return -EINVAL;
ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
ucmd.user_size);
Removes a bunch of boiler plate and easy to make wrong copy_from_users
in the ioctls. Centralizes size validation, zero padding checking/etc.
> + ret = luo_register_file(luo_fd.token, luo_fd.fd);
> + if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) {
> + WARN_ON_ONCE(luo_unregister_file(luo_fd.token));
> + ret = -EFAULT;
Then for extensibility you'd copy back the struct:
static int ucmd_respond(struct fwctl_ucmd *ucmd, size_t cmd_len)
{
if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
min_t(size_t, ucmd->user_size, cmd_len)))
return -EFAULT;
return 0;
}
Which truncates it/etc according to some ABI extensibility rules.
> +static int __init liveupdate_init(void)
> +{
> + int err;
> +
> + if (!liveupdate_enabled())
> + return 0;
> +
> + err = misc_register(&liveupdate_miscdev);
> + if (err < 0) {
> + pr_err("Failed to register misc device '%s': %d\n",
> + liveupdate_miscdev.name, err);
Should remove most of the pr_err's, here too IMHO..
Jason
On Tue, Jul 29, 2025 at 12:35 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Wed, Jul 23, 2025 at 02:46:29PM +0000, Pasha Tatashin wrote:
> > Introduce the user-space interface for the Live Update Orchestrator
> > via ioctl commands, enabling external control over the live update
> > process and management of preserved resources.
>
> I strongly recommend copying something like fwctl (which is copying
> iommufd, which is copying some other best practices). I will try to
> outline the main points below.
>
> The design of the fwctl scheme allows a lot of options for ABI
> compatible future extensions and I very strongly recommend that
> complex ioctl style APIs be built with that in mind. I have so many
> scars from trying to undo fixed ABI design :)
Thank you for bringing this up, I have reviewed fwctl ioctl
implementation, and also iommufd ioctl, and I made the necessary
changes to make luo similar.
> > +/**
> > + * struct liveupdate_fd - Holds parameters for preserving and restoring file
> > + * descriptors across live update.
> > + * @fd: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file
> > + * descriptor to be preserved.
> > + * Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor
> > + * representing the fully restored kernel resource.
> > + * @flags: Unused, reserved for future expansion, must be set to 0.
> > + * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token
> > + * preserved for preserved resource.
> > + * Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously
> > + * provided to the preserve ioctl for the resource to be restored.
> > + *
> > + * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE
> > + * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types
> > + * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their
> > + * underlying kernel state preserved across a live update cycle.
> > + *
> > + * To preserve an FD, user space passes this struct to
> > + * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the
> > + * kernel uses the @token field to uniquely associate the preserved FD.
> > + *
> > + * After the live update transition, user space passes the struct populated with
> > + * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token
> > + * to find the preserved state and, on success, populates the @fd field with a
> > + * new file descriptor referring to the restored resource.
> > + */
> > +struct liveupdate_fd {
> > + int fd;
>
> 'int' should not appear in uapi structs. Fds are __s32
done
>
> > + __u32 flags;
> > + __aligned_u64 token;
> > +};
> > +
> > +/* The ioctl type, documented in ioctl-number.rst */
> > +#define LIVEUPDATE_IOCTL_TYPE 0xBA
>
> I have found it very helpful to organize the ioctl numbering like this:
>
> #define IOMMUFD_TYPE (';')
>
> enum {
> IOMMUFD_CMD_BASE = 0x80,
> IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
> IOMMUFD_CMD_IOAS_ALLOC = 0x81,
> IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82,
> [..]
>
> #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
>
> The numbers should be tightly packed and non-overlapping. It becomes
> difficult to manage this if the numbers are sprinkled all over the
> file. The above structuring will enforce git am conflicts if things
> get muddled up. Saved me a few times already in iommufd.
Done
>
> > +/**
> > + * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file
> > + * descriptor.
> > + *
> > + * Argument: Pointer to &struct liveupdate_fd.
> > + *
> > + * User sets the @fd field identifying the file descriptor to preserve
> > + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
> > + * and its dependencies are supported for preservation. If validation passes,
> > + * the kernel marks the FD internally and *initiates the process* of preparing
> > + * its state for saving. The actual snapshotting of the state typically occurs
> > + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
> > + * some finalization might occur during freeze.
> > + * On successful validation and initiation, the kernel uses the @token
> > + * field with an opaque identifier representing the resource being preserved.
> > + * This token confirms the FD is targeted for preservation and is required for
> > + * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update.
> > + *
> > + * Return: 0 on success (validation passed, preservation initiated), negative
> > + * error code on failure (e.g., unsupported FD type, dependency issue,
> > + * validation failed).
> > + */
> > +#define LIVEUPDATE_IOCTL_FD_PRESERVE \
> > + _IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd)
>
> From a kdoc perspective I find it works much better to attach the kdoc
> to the struct, not the ioctl:
>
> /**
> * struct iommu_destroy - ioctl(IOMMU_DESTROY)
> * @size: sizeof(struct iommu_destroy)
> * @id: iommufd object ID to destroy. Can be any destroyable object type.
> *
> * Destroy any object held within iommufd.
> */
> struct iommu_destroy {
> __u32 size;
> __u32 id;
> };
> #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
>
> Generates this kdoc:
>
> https://docs.kernel.org/userspace-api/iommufd.html#c.iommu_destroy
Agreed, done the same as above.
>
> You should also make sure to link the uapi header into the kdoc build
> under the "userspace API" chapter.
>
> The structs should also be self-describing. I am fairly strongly
> against using the size mechanism in the _IOW macro, it is instantly
> ABI incompatible and basically impossible to deal with from userspace.
>
> Hence why the IOMMUFD version is _IO().
Right, I came to the same conclusion while reviewing fwctl, I replaced
everything with pure _IO().
>
> This means stick a size member in the first 4 bytes of every
> struct. More on this later..
>
> > +/**
> > + * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the
> > + * preservation list.
> > + *
> > + * Argument: Pointer to __u64 token.
>
> Every ioctl should have a struct, with the size header. If you want to
> do more down the road, you cannot without using such a structure.
Done
>
> > +#define LIVEUPDATE_IOCTL_FD_RESTORE \
> > + _IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd)
>
> Strongly recommend that every ioctl have a unique struct. Sharing
> structs makes future extensibility harder.
Done
>
> > +/**
> > + * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state
> > + * saving.
>
> Perhaps these just want to be a single 'set state' ioctl with an enum
> input argument?
Added an ioctl: LIVEUPDATE_SET_EVENT, and all events
PREPARE/FINISH/CANCEL are now done through it.
>
> > @@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
> > obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
> > obj-$(CONFIG_LIVEUPDATE) += luo_core.o
> > obj-$(CONFIG_LIVEUPDATE) += luo_files.o
> > +obj-$(CONFIG_LIVEUPDATE) += luo_ioctl.o
> > obj-$(CONFIG_LIVEUPDATE) += luo_subsystems.o
>
> I don't think luo is modular, but I think it is generally better to
> write the kbuilds as though it was anyhow if it has a lot of files:
>
> iommufd-y := \
> device.o \
> eventq.o \
> hw_pagetable.o \
> io_pagetable.o \
> ioas.o \
> main.o \
> pages.o \
> vfio_compat.o \
> viommu.o
> obj-$(CONFIG_IOMMUFD) += iommufd.o
Done
>
> Basically don't repeat obj-$(CONFIG_LIVEUPDATE), every one of those
> lines creates a new module (if it was modular)
>
> > +static int luo_open(struct inode *inodep, struct file *filep)
> > +{
> > + if (!capable(CAP_SYS_ADMIN))
> > + return -EACCES;
>
> IMHO file system permissions should control permission to open. No
> capable check.
Removed
>
> > + if (filep->f_flags & O_EXCL)
> > + return -EINVAL;
>
> O_EXCL doesn't really do anything for cdev, I'd drop this.
>
> The open should have an atomic to check for single open though.
Removed, and added an enforcement for a single open.
>
> > +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
> > +{
> > + void __user *argp = (void __user *)arg;
> > + struct liveupdate_fd luo_fd;
> > + enum liveupdate_state state;
> > + int ret = 0;
> > + u64 token;
> > +
> > + if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE)
> > + return -ENOTTY;
>
> The generic parse/dispatch from fwctl is a really good idea here, you
> can cut and paste it, change the names. It makes it really easy to manage future extensibility:
>
> List the ops and their structs:
>
> static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
> IOCTL_OP(FWCTL_INFO, fwctl_cmd_info, struct fwctl_info, out_device_data),
> IOCTL_OP(FWCTL_RPC, fwctl_cmd_rpc, struct fwctl_rpc, out),
> };
>
> Index the list and copy_from_user the struct describing the op:
>
> static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
> unsigned long arg)
> {
> struct fwctl_uctx *uctx = filp->private_data;
> const struct fwctl_ioctl_op *op;
> struct fwctl_ucmd ucmd = {};
> union fwctl_ucmd_buffer buf;
> unsigned int nr;
> int ret;
>
> nr = _IOC_NR(cmd);
> if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
> return -ENOIOCTLCMD;
>
> op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
> if (op->ioctl_num != cmd)
> return -ENOIOCTLCMD;
>
> ucmd.uctx = uctx;
> ucmd.cmd = &buf;
> ucmd.ubuffer = (void __user *)arg;
> // This is reading/checking the standard 4 byte size header:
> ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
> if (ret)
> return ret;
>
> if (ucmd.user_size < op->min_size)
> return -EINVAL;
>
> ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
> ucmd.user_size);
>
>
> Removes a bunch of boilerplate and easy-to-get-wrong copy_from_user calls
> in the ioctls. Centralizes size validation, zero padding checking/etc.
Yeap, implemented as above.
>
> > + ret = luo_register_file(luo_fd.token, luo_fd.fd);
> > + if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) {
> > + WARN_ON_ONCE(luo_unregister_file(luo_fd.token));
> > + ret = -EFAULT;
>
> Then for extensibility you'd copy back the struct:
>
> static int ucmd_respond(struct fwctl_ucmd *ucmd, size_t cmd_len)
> {
> if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
> min_t(size_t, ucmd->user_size, cmd_len)))
> return -EFAULT;
> return 0;
> }
>
> Which truncates it/etc according to some ABI extensibility rules.
>
> > +static int __init liveupdate_init(void)
> > +{
> > + int err;
> > +
> > + if (!liveupdate_enabled())
> > + return 0;
> > +
> > + err = misc_register(&liveupdate_miscdev);
> > + if (err < 0) {
> > + pr_err("Failed to register misc device '%s': %d\n",
> > + liveupdate_miscdev.name, err);
>
> Should remove most of the pr_err's, here too IMHO..
Removed.
>
> Jason
Thanks a lot for the thorough review!
Pasha
© 2016 - 2026 Red Hat, Inc.