Introduce the user-space interface for the Live Update Orchestrator
via ioctl commands, enabling external control over the live update
process and management of preserved resources.
Create a character device at /dev/liveupdate. Access
to this device requires the CAP_SYS_ADMIN capability.
A new uAPI header, <uapi/linux/liveupdate.h>, defines the necessary
structures. The magic number is registered in
Documentation/userspace-api/ioctl/ioctl-number.rst.
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
.../userspace-api/ioctl/ioctl-number.rst | 2 +
include/linux/liveupdate.h | 36 +--
include/uapi/linux/liveupdate.h | 265 ++++++++++++++++++
kernel/liveupdate/Makefile | 1 +
kernel/liveupdate/luo_ioctl.c | 178 ++++++++++++
5 files changed, 447 insertions(+), 35 deletions(-)
create mode 100644 include/uapi/linux/liveupdate.h
create mode 100644 kernel/liveupdate/luo_ioctl.c
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index bc91756bde73..8368aa05b4df 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -380,6 +380,8 @@ Code Seq# Include File Comments
0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver
0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver
<mailto:linux-hyperv@vger.kernel.org>
+0xBA all uapi/linux/liveupdate.h Pasha Tatashin
+ <mailto:pasha.tatashin@soleen.com>
0xC0 00-0F linux/usb/iowarrior.h
0xCA 00-0F uapi/misc/cxl.h Dead since 6.15
0xCA 10-2F uapi/misc/ocxl.h
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index 28a8aa4cafca..970447de5d8c 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -10,6 +10,7 @@
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <uapi/linux/liveupdate.h>
/**
* enum liveupdate_event - Events that trigger live update callbacks.
@@ -53,41 +54,6 @@ enum liveupdate_event {
LIVEUPDATE_CANCEL,
};
-/**
- * enum liveupdate_state - Defines the possible states of the live update
- * orchestrator.
- * @LIVEUPDATE_STATE_UNDEFINED: State has not yet been initialized.
- * @LIVEUPDATE_STATE_NORMAL: Default state, no live update in progress.
- * @LIVEUPDATE_STATE_PREPARED: Live update is prepared for reboot; the
- * LIVEUPDATE_PREPARE callbacks have completed
- * successfully.
- * Devices might operate in a limited state
- * for example the participating devices might
- * not be allowed to unbind, and also the
- * setting up of new DMA mappings might be
- * disabled in this state.
- * @LIVEUPDATE_STATE_FROZEN: The final reboot event
- * (%LIVEUPDATE_FREEZE) has been sent, and the
- * system is performing its final state saving
- * within the "blackout window". User
- * workloads must be suspended. The actual
- * reboot (kexec) into the next kernel is
- * imminent.
- * @LIVEUPDATE_STATE_UPDATED: The system has rebooted into the next
- * kernel via live update the system is now
- * running the next kernel, awaiting the
- * finish event.
- *
- * These states track the progress and outcome of a live update operation.
- */
-enum liveupdate_state {
- LIVEUPDATE_STATE_UNDEFINED = 0,
- LIVEUPDATE_STATE_NORMAL = 1,
- LIVEUPDATE_STATE_PREPARED = 2,
- LIVEUPDATE_STATE_FROZEN = 3,
- LIVEUPDATE_STATE_UPDATED = 4,
-};
-
struct file;
/**
diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
new file mode 100644
index 000000000000..7b12a1073c3c
--- /dev/null
+++ b/include/uapi/linux/liveupdate.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/*
+ * Userspace interface for /dev/liveupdate
+ * Live Update Orchestrator
+ *
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _UAPI_LIVEUPDATE_H
+#define _UAPI_LIVEUPDATE_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * enum liveupdate_state - Defines the possible states of the live update
+ * orchestrator.
+ * @LIVEUPDATE_STATE_UNDEFINED: State has not yet been initialized.
+ * @LIVEUPDATE_STATE_NORMAL: Default state, no live update in progress.
+ * @LIVEUPDATE_STATE_PREPARED: Live update is prepared for reboot; the
+ * %LIVEUPDATE_PREPARE callbacks have completed
+ * successfully.
+ * Devices might operate in a limited state;
+ * for example, the participating devices might
+ * not be allowed to unbind, and also the
+ * setting up of new DMA mappings might be
+ * disabled in this state.
+ * @LIVEUPDATE_STATE_FROZEN: The final reboot event
+ * (%LIVEUPDATE_FREEZE) has been sent, and the
+ * system is performing its final state saving
+ * within the "blackout window". User
+ * workloads must be suspended. The actual
+ * reboot (kexec) into the next kernel is
+ * imminent.
+ * @LIVEUPDATE_STATE_UPDATED: The system has rebooted into the next
+ * kernel via live update; the system is now
+ * running the next kernel, awaiting the
+ * finish event.
+ *
+ * These states track the progress and outcome of a live update operation.
+ */
+enum liveupdate_state {
+ LIVEUPDATE_STATE_UNDEFINED = 0,
+ LIVEUPDATE_STATE_NORMAL = 1,
+ LIVEUPDATE_STATE_PREPARED = 2,
+ LIVEUPDATE_STATE_FROZEN = 3,
+ LIVEUPDATE_STATE_UPDATED = 4,
+};
+
+/**
+ * struct liveupdate_fd - Holds parameters for preserving and restoring file
+ *                        descriptors across live update.
+ * @fd:    Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file
+ *         descriptor to be preserved.
+ *         Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor
+ *         representing the fully restored kernel resource.
+ * @flags: Unused, reserved for future expansion, must be set to 0.
+ * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token
+ *         supplied by user space to identify the preserved resource.
+ *         Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously
+ *         provided to the preserve ioctl for the resource to be restored.
+ *
+ * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE
+ * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types
+ * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their
+ * underlying kernel state preserved across a live update cycle.
+ *
+ * To preserve an FD, user space passes this struct to
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd and @token fields set. On
+ * success, the kernel uses @token to uniquely identify the preserved FD.
+ *
+ * After the live update transition, user space passes the struct populated with
+ * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token
+ * to find the preserved state and, on success, populates the @fd field with a
+ * new file descriptor referring to the restored resource.
+ */
+struct liveupdate_fd {
+	/* Fixed-width type: plain 'int' is avoided in uAPI structures. */
+	__s32 fd;
+	__u32 flags;
+	__aligned_u64 token;
+};
+
+/*
+ * The ioctl type (magic number) shared by all /dev/liveupdate commands,
+ * documented in Documentation/userspace-api/ioctl/ioctl-number.rst
+ */
+#define LIVEUPDATE_IOCTL_TYPE 0xBA
+
+/**
+ * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file
+ * descriptor.
+ *
+ * Argument: Pointer to &struct liveupdate_fd.
+ *
+ * User sets the @fd field identifying the file descriptor to preserve
+ * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
+ * and its dependencies are supported for preservation. If validation passes,
+ * the kernel marks the FD internally and *initiates the process* of preparing
+ * its state for saving. The actual snapshotting of the state typically occurs
+ * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
+ * some finalization might occur during freeze.
+ * On successful validation and initiation, the kernel associates the @token
+ * field, supplied by user space, with the resource being preserved.
+ * This token identifies the FD targeted for preservation and is required for
+ * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update.
+ *
+ * Return: 0 on success (validation passed, preservation initiated), negative
+ * error code on failure (e.g., unsupported FD type, dependency issue,
+ * validation failed).
+ */
+#define LIVEUPDATE_IOCTL_FD_PRESERVE \
+ _IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd)
+
+/**
+ * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the
+ * preservation list.
+ *
+ * Argument: Pointer to __u64 token.
+ *
+ * Allows user space to explicitly remove a file descriptor from the set of
+ * items marked as potentially preservable. User space provides a pointer to the
+ * __u64 @token that was previously passed to a successful
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE call (potentially from a prior, possibly
+ * cancelled, live update attempt). The kernel reads the token value from the
+ * provided user-space address.
+ *
+ * On success, the kernel removes the corresponding entry (identified by the
+ * token value read from the user pointer) from its internal preservation list.
+ * The provided @token (representing the now-removed entry) becomes invalid
+ * after this call.
+ *
+ * Return: 0 on success, negative error code on failure (e.g., -EBUSY or -EINVAL
+ * if not in %LIVEUPDATE_STATE_NORMAL, bad address provided, invalid token value
+ * read, token not found).
+ */
+#define LIVEUPDATE_IOCTL_FD_UNPRESERVE \
+ _IOW(LIVEUPDATE_IOCTL_TYPE, 0x01, __u64)
+
+/**
+ * LIVEUPDATE_IOCTL_FD_RESTORE - Restore a previously preserved file descriptor.
+ *
+ * Argument: Pointer to &struct liveupdate_fd.
+ *
+ * User sets the @token field to the value passed to a successful
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE call before the live update. On success,
+ * the kernel restores the state (saved during the PREPARE/FREEZE phases)
+ * associated with the token and populates the @fd field with a new file
+ * descriptor referencing the restored resource in the current (new) kernel.
+ * The new descriptor has the close-on-exec flag set.
+ * This operation must be performed *before* signaling completion via
+ * %LIVEUPDATE_IOCTL_FINISH.
+ *
+ * Return: 0 on success, negative error code on failure (e.g., invalid token).
+ */
+#define LIVEUPDATE_IOCTL_FD_RESTORE \
+ _IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd)
+
+/**
+ * LIVEUPDATE_IOCTL_GET_STATE - Query the current state of the live update
+ * orchestrator.
+ *
+ * Argument: Pointer to &enum liveupdate_state.
+ *
+ * The kernel fills the enum value pointed to by the argument with the current
+ * state of the live update subsystem. Possible states include:
+ *
+ * - %LIVEUPDATE_STATE_NORMAL: Default state; no live update operation is
+ * currently in progress.
+ * - %LIVEUPDATE_STATE_PREPARED: The preparation phase (triggered by
+ * %LIVEUPDATE_IOCTL_PREPARE) has completed
+ * successfully. The system is ready for the
+ * reboot transition. Note that some
+ * device operations (e.g., unbinding, new DMA
+ * mappings) might be restricted in this state.
+ * - %LIVEUPDATE_STATE_UPDATED: The system has successfully rebooted into the
+ * new kernel via live update. It is now running
+ * the new kernel code and is awaiting the
+ * completion signal from user space via
+ * %LIVEUPDATE_IOCTL_FINISH after
+ * restoration tasks are done.
+ *
+ * See the definition of &enum liveupdate_state for more details on each state.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+#define LIVEUPDATE_IOCTL_GET_STATE \
+ _IOR(LIVEUPDATE_IOCTL_TYPE, 0x03, enum liveupdate_state)
+
+/**
+ * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state
+ * saving.
+ *
+ * Argument: None.
+ *
+ * Initiates the live update preparation phase. This action corresponds to
+ * the internal %LIVEUPDATE_PREPARE kernel event, which typically triggers the
+ * saving process for items marked via the PRESERVE ioctls. It typically occurs
+ * *before* the "blackout window", while user applications (e.g., VMs) may
+ * still be running. Kernel subsystems receiving the %LIVEUPDATE_PREPARE event
+ * should serialize necessary state. This command does not transfer data.
+ *
+ * Return: 0 on success, negative error code on failure. Transitions state
+ * towards %LIVEUPDATE_STATE_PREPARED on success.
+ */
+#define LIVEUPDATE_IOCTL_PREPARE \
+ _IO(LIVEUPDATE_IOCTL_TYPE, 0x04)
+
+/**
+ * LIVEUPDATE_IOCTL_CANCEL - Cancel the live update preparation phase.
+ *
+ * Argument: None.
+ *
+ * Notifies the live update subsystem to abort the preparation sequence
+ * potentially initiated by %LIVEUPDATE_IOCTL_PREPARE. This action
+ * typically corresponds to the internal %LIVEUPDATE_CANCEL kernel event,
+ * which might also be triggered automatically if the PREPARE stage fails
+ * internally.
+ *
+ * When triggered, subsystems receiving the %LIVEUPDATE_CANCEL event should
+ * revert any state changes or actions taken specifically for the aborted
+ * prepare phase (e.g., discard partially serialized state). The kernel
+ * releases resources allocated specifically for this *aborted preparation
+ * attempt*.
+ *
+ * This operation cancels the current *attempt* to prepare for a live update
+ * but does **not** remove previously validated items from the internal list
+ * of potentially preservable resources. Consequently, preservation tokens
+ * previously registered by successful %LIVEUPDATE_IOCTL_FD_PRESERVE calls
+ * generally **remain valid** as identifiers for those potentially preservable
+ * resources. However, since the system state returns towards
+ * %LIVEUPDATE_STATE_NORMAL, user space must initiate a new live update sequence
+ * (starting with %LIVEUPDATE_IOCTL_PREPARE) to proceed with an update
+ * using these (or other) tokens.
+ *
+ * This command does not transfer data. Kernel callbacks for the
+ * %LIVEUPDATE_CANCEL event must not fail.
+ *
+ * Return: 0 on success, negative error code on failure. Transitions state back
+ * towards %LIVEUPDATE_STATE_NORMAL on success.
+ */
+#define LIVEUPDATE_IOCTL_CANCEL \
+ _IO(LIVEUPDATE_IOCTL_TYPE, 0x06)
+/*
+ * NOTE(review): command number 0x05 is skipped (PREPARE is 0x04, CANCEL is
+ * 0x06) -- confirm whether it is intentionally reserved.
+ */
+
+/**
+ * LIVEUPDATE_IOCTL_FINISH - Signal restoration completion and trigger
+ * cleanup.
+ *
+ * Argument: None.
+ *
+ * Signals that user space has completed all necessary restoration actions in
+ * the new kernel (after a live update reboot). This action corresponds to the
+ * internal %LIVEUPDATE_FINISH kernel event. Calling this ioctl triggers the
+ * cleanup phase: any resources that were successfully preserved but were *not*
+ * subsequently restored (reclaimed) via the RESTORE ioctls will have their
+ * preserved state discarded and associated kernel resources released. Involved
+ * devices may be reset. All desired restorations *must* be completed *before*
+ * this. Kernel callbacks for the %LIVEUPDATE_FINISH event must not fail.
+ * Successfully completing this phase transitions the system state from
+ * %LIVEUPDATE_STATE_UPDATED back to %LIVEUPDATE_STATE_NORMAL. This command does
+ * not transfer data.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+#define LIVEUPDATE_IOCTL_FINISH \
+ _IO(LIVEUPDATE_IOCTL_TYPE, 0x07)
+
+#endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index b5054140b9a9..cb3ea380f6b9 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
obj-$(CONFIG_LIVEUPDATE) += luo_core.o
obj-$(CONFIG_LIVEUPDATE) += luo_files.o
+obj-$(CONFIG_LIVEUPDATE) += luo_ioctl.o
obj-$(CONFIG_LIVEUPDATE) += luo_subsystems.o
diff --git a/kernel/liveupdate/luo_ioctl.c b/kernel/liveupdate/luo_ioctl.c
new file mode 100644
index 000000000000..3de1d243df5a
--- /dev/null
+++ b/kernel/liveupdate/luo_ioctl.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a misc character device, typically found at ``/dev/liveupdate``,
+ * which allows privileged userspace applications (requiring %CAP_SYS_ADMIN) to
+ * manage and monitor the LUO state machine and associated resources like
+ * preservable file descriptors.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+/*
+ * Restore the preserved file identified by luo_fd->token and install it under
+ * a freshly reserved O_CLOEXEC descriptor.
+ *
+ * On success the descriptor number is stored in luo_fd->fd and 0 is returned.
+ * On failure a negative errno is returned and the reserved descriptor, if any,
+ * is released again.
+ */
+static int luo_ioctl_fd_restore(struct liveupdate_fd *luo_fd)
+{
+	struct file *restored;
+	int new_fd, err;
+
+	/* Reserve the descriptor number before retrieving the file. */
+	new_fd = get_unused_fd_flags(O_CLOEXEC);
+	if (new_fd < 0) {
+		pr_err("Failed to allocate new fd: %d\n", new_fd);
+		return new_fd;
+	}
+
+	err = luo_retrieve_file(luo_fd->token, &restored);
+	if (err >= 0) {
+		fd_install(new_fd, restored);
+		luo_fd->fd = new_fd;
+		return 0;
+	}
+
+	/* Retrieval failed: hand the unused descriptor number back. */
+	put_unused_fd(new_fd);
+	return err;
+}
+
+/*
+ * open() handler for the liveupdate device: callers without CAP_SYS_ADMIN get
+ * -EACCES, and opens requesting O_EXCL are rejected with -EINVAL.
+ */
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+	const bool wants_excl = filep->f_flags & O_EXCL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	return wants_excl ? -EINVAL : 0;
+}
+
+/*
+ * luo_ioctl - dispatch an ioctl issued on the liveupdate device.
+ * @filep: the open device file (not used by the dispatcher).
+ * @cmd:   ioctl command; its type must be LIVEUPDATE_IOCTL_TYPE.
+ * @arg:   user-space pointer to the command argument, for commands that
+ *         take one.
+ *
+ * Return: 0 on success; -ENOTTY for a foreign or unknown command, -EFAULT if
+ * the argument cannot be copied from or to user space, -EINVAL if the reserved
+ * flags field of struct liveupdate_fd is non-zero, or otherwise the value
+ * returned by the LUO helper that handles the command.
+ */
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct liveupdate_fd luo_fd;
+	enum liveupdate_state state;
+	int ret = 0;
+	u64 token;
+
+	if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE)
+		return -ENOTTY;
+
+	switch (cmd) {
+	case LIVEUPDATE_IOCTL_GET_STATE:
+		state = liveupdate_get_state();
+		if (copy_to_user(argp, &state, sizeof(state)))
+			ret = -EFAULT;
+		break;
+
+	case LIVEUPDATE_IOCTL_PREPARE:
+		ret = luo_prepare();
+		break;
+
+	case LIVEUPDATE_IOCTL_FINISH:
+		ret = luo_finish();
+		break;
+
+	case LIVEUPDATE_IOCTL_CANCEL:
+		ret = luo_cancel();
+		break;
+
+	case LIVEUPDATE_IOCTL_FD_PRESERVE:
+		if (copy_from_user(&luo_fd, argp, sizeof(luo_fd))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/*
+		 * flags is documented as "must be set to 0"; enforce it so the
+		 * field can be given a meaning later without breaking callers.
+		 */
+		if (luo_fd.flags) {
+			ret = -EINVAL;
+			break;
+		}
+
+		/*
+		 * The command is _IOW and nothing in luo_fd is modified here,
+		 * so there is nothing to copy back; writing to the user buffer
+		 * could only fail a registration that already succeeded.
+		 */
+		ret = luo_register_file(luo_fd.token, luo_fd.fd);
+		break;
+
+	case LIVEUPDATE_IOCTL_FD_UNPRESERVE:
+		if (copy_from_user(&token, argp, sizeof(u64))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = luo_unregister_file(token);
+		break;
+
+	case LIVEUPDATE_IOCTL_FD_RESTORE:
+		if (copy_from_user(&luo_fd, argp, sizeof(luo_fd))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/* Same reserved-field rule as for FD_PRESERVE. */
+		if (luo_fd.flags) {
+			ret = -EINVAL;
+			break;
+		}
+
+		/*
+		 * NOTE(review): luo_ioctl_fd_restore() has already installed
+		 * the new descriptor when the copy below runs, so a failed
+		 * copy_to_user() leaves it open in the caller's table without
+		 * reporting its number -- confirm whether that is acceptable.
+		 */
+		ret = luo_ioctl_fd_restore(&luo_fd);
+		if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd)))
+			ret = -EFAULT;
+		break;
+
+	default:
+		pr_warn("ioctl: unknown command nr: 0x%x\n", _IOC_NR(cmd));
+		ret = -ENOTTY;
+		break;
+	}
+
+	return ret;
+}
+
+/* File operations of the liveupdate device: only open and ioctl are provided. */
+static const struct file_operations fops = {
+ .owner = THIS_MODULE,
+ .open = luo_open,
+ .unlocked_ioctl = luo_ioctl,
+};
+
+/* Misc device named "liveupdate"; its minor number is allocated dynamically. */
+static struct miscdevice liveupdate_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "liveupdate",
+ .fops = &fops,
+};
+
+/* Set once misc_register() has succeeded, so exit only undoes what init did. */
+static bool liveupdate_miscdev_registered;
+
+/*
+ * Register the liveupdate misc device when live update is enabled.
+ *
+ * Returns 0 on success, and also when live update is disabled (in which case
+ * no device is registered); otherwise the negative errno from misc_register().
+ */
+static int __init liveupdate_init(void)
+{
+	int err;
+
+	if (!liveupdate_enabled())
+		return 0;
+
+	err = misc_register(&liveupdate_miscdev);
+	if (err < 0) {
+		pr_err("Failed to register misc device '%s': %d\n",
+		       liveupdate_miscdev.name, err);
+		return err;
+	}
+
+	liveupdate_miscdev_registered = true;
+
+	return 0;
+}
+module_init(liveupdate_init);
+
+/*
+ * Deregister the misc device, but only if liveupdate_init() registered it:
+ * init returns 0 without registering when live update is disabled, and a
+ * miscdevice that was never registered must not be deregistered.
+ */
+static void __exit liveupdate_exit(void)
+{
+	if (liveupdate_miscdev_registered)
+		misc_deregister(&liveupdate_miscdev);
+}
+module_exit(liveupdate_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pasha Tatashin");
+MODULE_DESCRIPTION("Live Update Orchestrator");
+MODULE_VERSION("0.1");
--
2.50.0.727.gbf7dc18ff4-goog
On Wed, Jul 23, 2025 at 02:46:29PM +0000, Pasha Tatashin wrote: > Introduce the user-space interface for the Live Update Orchestrator > via ioctl commands, enabling external control over the live update > process and management of preserved resources. I strongly recommend copying something like fwctl (which is copying iommufd, which is copying some other best practices). I will try to outline the main points below. The design of the fwctl scheme allows alot of options for ABI compatible future extensions and I very strongly recommend that complex ioctl style APIs be built with that in mind. I have so many scars from trying to undo fixed ABI design :) > +/** > + * struct liveupdate_fd - Holds parameters for preserving and restoring file > + * descriptors across live update. > + * @fd: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file > + * descriptor to be preserved. > + * Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor > + * representing the fully restored kernel resource. > + * @flags: Unused, reserved for future expansion, must be set to 0. > + * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token > + * preserved for preserved resource. > + * Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously > + * provided to the preserve ioctl for the resource to be restored. > + * > + * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE > + * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types > + * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their > + * underlying kernel state preserved across a live update cycle. > + * > + * To preserve an FD, user space passes this struct to > + * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the > + * kernel uses the @token field to uniquly associate the preserved FD. 
> + * > + * After the live update transition, user space passes the struct populated with > + * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token > + * to find the preserved state and, on success, populates the @fd field with a > + * new file descriptor referring to the restored resource. > + */ > +struct liveupdate_fd { > + int fd; 'int' should not appear in uapi structs. Fds are __s32 > + __u32 flags; > + __aligned_u64 token; > +}; > + > +/* The ioctl type, documented in ioctl-number.rst */ > +#define LIVEUPDATE_IOCTL_TYPE 0xBA I have found it very helpful to organize the ioctl numbering like this: #define IOMMUFD_TYPE (';') enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, IOMMUFD_CMD_IOAS_ALLOC = 0x81, IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, [..] #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) The numbers should be tightly packed and non-overlapping. It becomes difficult to manage this if the numbers are sprinkled all over the file. The above structuring will enforce git am conflicts if things get muddled up. Saved me a few times already in iommufd. > +/** > + * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file > + * descriptor. > + * > + * Argument: Pointer to &struct liveupdate_fd. > + * > + * User sets the @fd field identifying the file descriptor to preserve > + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type > + * and its dependencies are supported for preservation. If validation passes, > + * the kernel marks the FD internally and *initiates the process* of preparing > + * its state for saving. The actual snapshotting of the state typically occurs > + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though > + * some finalization might occur during freeze. > + * On successful validation and initiation, the kernel uses the @token > + * field with an opaque identifier representing the resource being preserved. 
> + * This token confirms the FD is targeted for preservation and is required for > + * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update. > + * > + * Return: 0 on success (validation passed, preservation initiated), negative > + * error code on failure (e.g., unsupported FD type, dependency issue, > + * validation failed). > + */ > +#define LIVEUPDATE_IOCTL_FD_PRESERVE \ > + _IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd) From a kdoc perspective I find it works much better to attach the kdoc to the struct, not the ioctl: /** * struct iommu_destroy - ioctl(IOMMU_DESTROY) * @size: sizeof(struct iommu_destroy) * @id: iommufd object ID to destroy. Can be any destroyable object type. * * Destroy any object held within iommufd. */ struct iommu_destroy { __u32 size; __u32 id; }; #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) Generates this kdoc: https://docs.kernel.org/userspace-api/iommufd.html#c.iommu_destroy You should also make sure to link the uapi header into the kdoc build under the "userspace API" chaper. The structs should also be self-describing. I am fairly strongly against using the size mechanism in the _IOW macro, it is instantly ABI incompatible and basically impossible to deal with from userspace. Hence why the IOMMFD version is _IO(). This means stick a size member in the first 4 bytes of every struct. More on this later.. > +/** > + * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the > + * preservation list. > + * > + * Argument: Pointer to __u64 token. Every ioctl should have a struct, with the size header. If you want to do more down the road you can not using this structure. > +#define LIVEUPDATE_IOCTL_FD_RESTORE \ > + _IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd) Strongly recommend that every ioctl have a unique struct. Sharing structs makes future extend-ability harder. > +/** > + * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state > + * saving. 
Perhaps these just want to be a single 'set state' ioctl with an enum input argument? > @@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o > obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o > obj-$(CONFIG_LIVEUPDATE) += luo_core.o > obj-$(CONFIG_LIVEUPDATE) += luo_files.o > +obj-$(CONFIG_LIVEUPDATE) += luo_ioctl.o > obj-$(CONFIG_LIVEUPDATE) += luo_subsystems.o I don't think luo is modular, but I think it is generally better to write the kbuilds as though it was anyhow if it has a lot of files: iommufd-y := \ device.o \ eventq.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ main.o \ pages.o \ vfio_compat.o \ viommu.o obj-$(CONFIG_IOMMUFD) += iommufd.o Basically don't repeat obj-$(CONFIG_LIVEUPDATE), every one of those lines creates a new module (if it was modular) > +static int luo_open(struct inode *inodep, struct file *filep) > +{ > + if (!capable(CAP_SYS_ADMIN)) > + return -EACCES; IMHO file system permissions should control permission to open. No capable check. > + if (filep->f_flags & O_EXCL) > + return -EINVAL; O_EXCL doesn't really do anything for cdev, I'd drop this. The open should have an atomic to check for single open though. > +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) > +{ > + void __user *argp = (void __user *)arg; > + struct liveupdate_fd luo_fd; > + enum liveupdate_state state; > + int ret = 0; > + u64 token; > + > + if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE) > + return -ENOTTY; The generic parse/disptach from fwctl is a really good idea here, you can cut and paste it, change the names. 
It makes it really easy to manage future extensibility: List the ops and their structs: static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = { IOCTL_OP(FWCTL_INFO, fwctl_cmd_info, struct fwctl_info, out_device_data), IOCTL_OP(FWCTL_RPC, fwctl_cmd_rpc, struct fwctl_rpc, out), }; Index the list and copy_from_user the struct desribing the opt: static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct fwctl_uctx *uctx = filp->private_data; const struct fwctl_ioctl_op *op; struct fwctl_ucmd ucmd = {}; union fwctl_ucmd_buffer buf; unsigned int nr; int ret; nr = _IOC_NR(cmd); if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops)) return -ENOIOCTLCMD; op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE]; if (op->ioctl_num != cmd) return -ENOIOCTLCMD; ucmd.uctx = uctx; ucmd.cmd = &buf; ucmd.ubuffer = (void __user *)arg; // This is reading/checking the standard 4 byte size header: ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); if (ret) return ret; if (ucmd.user_size < op->min_size) return -EINVAL; ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, ucmd.user_size); Removes a bunch of boiler plate and easy to make wrong copy_from_users in the ioctls. Centralizes size validation, zero padding checking/etc. > + ret = luo_register_file(luo_fd.token, luo_fd.fd); > + if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) { > + WARN_ON_ONCE(luo_unregister_file(luo_fd.token)); > + ret = -EFAULT; Then for extensibility you'd copy back the struct: static int ucmd_respond(struct fwctl_ucmd *ucmd, size_t cmd_len) { if (copy_to_user(ucmd->ubuffer, ucmd->cmd, min_t(size_t, ucmd->user_size, cmd_len))) return -EFAULT; return 0; } Which truncates it/etc according to some ABI extensibility rules. 
> +static int __init liveupdate_init(void) > +{ > + int err; > + > + if (!liveupdate_enabled()) > + return 0; > + > + err = misc_register(&liveupdate_miscdev); > + if (err < 0) { > + pr_err("Failed to register misc device '%s': %d\n", > + liveupdate_miscdev.name, err); Should remove most of the pr_err's, here too IMHO.. Jason
On Tue, Jul 29, 2025 at 12:35 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Wed, Jul 23, 2025 at 02:46:29PM +0000, Pasha Tatashin wrote: > > Introduce the user-space interface for the Live Update Orchestrator > > via ioctl commands, enabling external control over the live update > > process and management of preserved resources. > > I strongly recommend copying something like fwctl (which is copying > iommufd, which is copying some other best practices). I will try to > outline the main points below. > > The design of the fwctl scheme allows alot of options for ABI > compatible future extensions and I very strongly recommend that > complex ioctl style APIs be built with that in mind. I have so many > scars from trying to undo fixed ABI design :) Thank you for bringing this up, I have reviewed fwctl ioctl implementation, and also iommufd ioctl, and I made the necessary changes to make luo similar. > > +/** > > + * struct liveupdate_fd - Holds parameters for preserving and restoring file > > + * descriptors across live update. > > + * @fd: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file > > + * descriptor to be preserved. > > + * Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor > > + * representing the fully restored kernel resource. > > + * @flags: Unused, reserved for future expansion, must be set to 0. > > + * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token > > + * preserved for preserved resource. > > + * Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously > > + * provided to the preserve ioctl for the resource to be restored. > > + * > > + * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE > > + * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types > > + * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their > > + * underlying kernel state preserved across a live update cycle. 
> > + * > > + * To preserve an FD, user space passes this struct to > > + * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the > > + * kernel uses the @token field to uniquly associate the preserved FD. > > + * > > + * After the live update transition, user space passes the struct populated with > > + * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token > > + * to find the preserved state and, on success, populates the @fd field with a > > + * new file descriptor referring to the restored resource. > > + */ > > +struct liveupdate_fd { > > + int fd; > > 'int' should not appear in uapi structs. Fds are __s32 done > > > + __u32 flags; > > + __aligned_u64 token; > > +}; > > + > > +/* The ioctl type, documented in ioctl-number.rst */ > > +#define LIVEUPDATE_IOCTL_TYPE 0xBA > > I have found it very helpful to organize the ioctl numbering like this: > > #define IOMMUFD_TYPE (';') > > enum { > IOMMUFD_CMD_BASE = 0x80, > IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, > IOMMUFD_CMD_IOAS_ALLOC = 0x81, > IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, > [..] > > #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) > > The numbers should be tightly packed and non-overlapping. It becomes > difficult to manage this if the numbers are sprinkled all over the > file. The above structuring will enforce git am conflicts if things > get muddled up. Saved me a few times already in iommufd. Done > > > +/** > > + * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file > > + * descriptor. > > + * > > + * Argument: Pointer to &struct liveupdate_fd. > > + * > > + * User sets the @fd field identifying the file descriptor to preserve > > + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type > > + * and its dependencies are supported for preservation. If validation passes, > > + * the kernel marks the FD internally and *initiates the process* of preparing > > + * its state for saving. 
The actual snapshotting of the state typically occurs > > + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though > > + * some finalization might occur during freeze. > > + * On successful validation and initiation, the kernel uses the @token > > + * field with an opaque identifier representing the resource being preserved. > > + * This token confirms the FD is targeted for preservation and is required for > > + * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update. > > + * > > + * Return: 0 on success (validation passed, preservation initiated), negative > > + * error code on failure (e.g., unsupported FD type, dependency issue, > > + * validation failed). > > + */ > > +#define LIVEUPDATE_IOCTL_FD_PRESERVE \ > > + _IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd) > > From a kdoc perspective I find it works much better to attach the kdoc > to the struct, not the ioctl: > > /** > * struct iommu_destroy - ioctl(IOMMU_DESTROY) > * @size: sizeof(struct iommu_destroy) > * @id: iommufd object ID to destroy. Can be any destroyable object type. > * > * Destroy any object held within iommufd. > */ > struct iommu_destroy { > __u32 size; > __u32 id; > }; > #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) > > Generates this kdoc: > > https://docs.kernel.org/userspace-api/iommufd.html#c.iommu_destroy Agreed, done the same as above. > > You should also make sure to link the uapi header into the kdoc build > under the "userspace API" chapter. > > The structs should also be self-describing. I am fairly strongly > against using the size mechanism in the _IOW macro, it is instantly > ABI incompatible and basically impossible to deal with from userspace. > > Hence why the IOMMUFD version is _IO(). Right, I came to the same conclusion while reviewing fwctl, I replaced everything with pure _IO(). > > This means stick a size member in the first 4 bytes of every > struct. More on this later.. 
> > > +/** > > + * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the > > + * preservation list. > > + * > > + * Argument: Pointer to __u64 token. > > Every ioctl should have a struct, with the size header. If you want to > do more down the road you can not using this structure. Done > > > +#define LIVEUPDATE_IOCTL_FD_RESTORE \ > > + _IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd) > > Strongly recommend that every ioctl have a unique struct. Sharing > structs makes future extensibility harder. Done > > > +/** > > + * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state > > + * saving. > > Perhaps these just want to be a single 'set state' ioctl with an enum > input argument? Added an IOCTL: LIVEUPDATE_SET_EVENT, and all events PREPARE/FINISH/CANCEL are now done through it. > > > @@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o > > obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o > > obj-$(CONFIG_LIVEUPDATE) += luo_core.o > > obj-$(CONFIG_LIVEUPDATE) += luo_files.o > > +obj-$(CONFIG_LIVEUPDATE) += luo_ioctl.o > > obj-$(CONFIG_LIVEUPDATE) += luo_subsystems.o > > I don't think luo is modular, but I think it is generally better to > write the kbuilds as though it was anyhow if it has a lot of files: > > iommufd-y := \ > device.o \ > eventq.o \ > hw_pagetable.o \ > io_pagetable.o \ > ioas.o \ > main.o \ > pages.o \ > vfio_compat.o \ > viommu.o > obj-$(CONFIG_IOMMUFD) += iommufd.o Done > > Basically don't repeat obj-$(CONFIG_LIVEUPDATE), every one of those > lines creates a new module (if it was modular) > > > +static int luo_open(struct inode *inodep, struct file *filep) > > +{ > > + if (!capable(CAP_SYS_ADMIN)) > > + return -EACCES; > > IMHO file system permissions should control permission to open. No > capable check. Removed > > > + if (filep->f_flags & O_EXCL) > > + return -EINVAL; > > O_EXCL doesn't really do anything for cdev, I'd drop this. 
> > The open should have an atomic to check for single open though. Removed, and added an enforcement for a single open. > > > +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) > > +{ > > + void __user *argp = (void __user *)arg; > > + struct liveupdate_fd luo_fd; > > + enum liveupdate_state state; > > + int ret = 0; > > + u64 token; > > + > > + if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE) > > + return -ENOTTY; > > The generic parse/dispatch from fwctl is a really good idea here, you > can cut and paste it, change the names. It makes it really easy to manage future extensibility: > > List the ops and their structs: > > static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = { > IOCTL_OP(FWCTL_INFO, fwctl_cmd_info, struct fwctl_info, out_device_data), > IOCTL_OP(FWCTL_RPC, fwctl_cmd_rpc, struct fwctl_rpc, out), > }; > > Index the list and copy_from_user the struct describing the op: > > static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd, > unsigned long arg) > { > struct fwctl_uctx *uctx = filp->private_data; > const struct fwctl_ioctl_op *op; > struct fwctl_ucmd ucmd = {}; > union fwctl_ucmd_buffer buf; > unsigned int nr; > int ret; > > nr = _IOC_NR(cmd); > if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops)) > return -ENOIOCTLCMD; > > op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE]; > if (op->ioctl_num != cmd) > return -ENOIOCTLCMD; > > ucmd.uctx = uctx; > ucmd.cmd = &buf; > ucmd.ubuffer = (void __user *)arg; > // This is reading/checking the standard 4 byte size header: > ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); > if (ret) > return ret; > > if (ucmd.user_size < op->min_size) > return -EINVAL; > > ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, > ucmd.user_size); > > > Removes a bunch of boilerplate and easy to make wrong copy_from_users > in the ioctls. Centralizes size validation, zero padding checking/etc. Yep, implemented as above. 
> > > + ret = luo_register_file(luo_fd.token, luo_fd.fd); > > + if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) { > > + WARN_ON_ONCE(luo_unregister_file(luo_fd.token)); > > + ret = -EFAULT; > > Then for extensibility you'd copy back the struct: > > static int ucmd_respond(struct fwctl_ucmd *ucmd, size_t cmd_len) > { > if (copy_to_user(ucmd->ubuffer, ucmd->cmd, > min_t(size_t, ucmd->user_size, cmd_len))) > return -EFAULT; > return 0; > } > > Which truncates it/etc according to some ABI extensibility rules. > > > +static int __init liveupdate_init(void) > > +{ > > + int err; > > + > > + if (!liveupdate_enabled()) > > + return 0; > > + > > + err = misc_register(&liveupdate_miscdev); > > + if (err < 0) { > > + pr_err("Failed to register misc device '%s': %d\n", > > + liveupdate_miscdev.name, err); > > Should remove most of the pr_err's, here too IMHO.. Removed. > > Jason Thanks a lot for the thorough review! Pasha
© 2016 - 2025 Red Hat, Inc.