[PATCH v2 16/32] liveupdate: luo_ioctl: add ioctl interface

Pasha Tatashin posted 32 patches 2 months, 2 weeks ago
There is a newer version of this series
[PATCH v2 16/32] liveupdate: luo_ioctl: add ioctl interface
Posted by Pasha Tatashin 2 months, 2 weeks ago
Introduce the user-space interface for the Live Update Orchestrator
via ioctl commands, enabling external control over the live update
process and management of preserved resources.

Create a character device at /dev/liveupdate. Access
to this device requires the CAP_SYS_ADMIN capability.

A new uAPI header, <uapi/linux/liveupdate.h>, defines the necessary
structures. The magic number is registered in
Documentation/userspace-api/ioctl/ioctl-number.rst.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
 .../userspace-api/ioctl/ioctl-number.rst      |   2 +
 include/linux/liveupdate.h                    |  36 +--
 include/uapi/linux/liveupdate.h               | 265 ++++++++++++++++++
 kernel/liveupdate/Makefile                    |   1 +
 kernel/liveupdate/luo_ioctl.c                 | 178 ++++++++++++
 5 files changed, 447 insertions(+), 35 deletions(-)
 create mode 100644 include/uapi/linux/liveupdate.h
 create mode 100644 kernel/liveupdate/luo_ioctl.c

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index bc91756bde73..8368aa05b4df 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -380,6 +380,8 @@ Code  Seq#    Include File                                           Comments
 0xB8  01-02  uapi/misc/mrvl_cn10k_dpi.h                              Marvell CN10K DPI driver
 0xB8  all    uapi/linux/mshv.h                                       Microsoft Hyper-V /dev/mshv driver
                                                                      <mailto:linux-hyperv@vger.kernel.org>
+0xBA  all    uapi/linux/liveupdate.h                                 Pasha Tatashin
+                                                                     <mailto:pasha.tatashin@soleen.com>
 0xC0  00-0F  linux/usb/iowarrior.h
 0xCA  00-0F  uapi/misc/cxl.h                                         Dead since 6.15
 0xCA  10-2F  uapi/misc/ocxl.h
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index 28a8aa4cafca..970447de5d8c 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -10,6 +10,7 @@
 #include <linux/bug.h>
 #include <linux/types.h>
 #include <linux/list.h>
+#include <uapi/linux/liveupdate.h>
 
 /**
  * enum liveupdate_event - Events that trigger live update callbacks.
@@ -53,41 +54,6 @@ enum liveupdate_event {
 	LIVEUPDATE_CANCEL,
 };
 
-/**
- * enum liveupdate_state - Defines the possible states of the live update
- * orchestrator.
- * @LIVEUPDATE_STATE_UNDEFINED:      State has not yet been initialized.
- * @LIVEUPDATE_STATE_NORMAL:         Default state, no live update in progress.
- * @LIVEUPDATE_STATE_PREPARED:       Live update is prepared for reboot; the
- *                                   LIVEUPDATE_PREPARE callbacks have completed
- *                                   successfully.
- *                                   Devices might operate in a limited state
- *                                   for example the participating devices might
- *                                   not be allowed to unbind, and also the
- *                                   setting up of new DMA mappings might be
- *                                   disabled in this state.
- * @LIVEUPDATE_STATE_FROZEN:         The final reboot event
- *                                   (%LIVEUPDATE_FREEZE) has been sent, and the
- *                                   system is performing its final state saving
- *                                   within the "blackout window". User
- *                                   workloads must be suspended. The actual
- *                                   reboot (kexec) into the next kernel is
- *                                   imminent.
- * @LIVEUPDATE_STATE_UPDATED:        The system has rebooted into the next
- *                                   kernel via live update the system is now
- *                                   running the next kernel, awaiting the
- *                                   finish event.
- *
- * These states track the progress and outcome of a live update operation.
- */
-enum liveupdate_state  {
-	LIVEUPDATE_STATE_UNDEFINED = 0,
-	LIVEUPDATE_STATE_NORMAL = 1,
-	LIVEUPDATE_STATE_PREPARED = 2,
-	LIVEUPDATE_STATE_FROZEN = 3,
-	LIVEUPDATE_STATE_UPDATED = 4,
-};
-
 struct file;
 
 /**
diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h
new file mode 100644
index 000000000000..7b12a1073c3c
--- /dev/null
+++ b/include/uapi/linux/liveupdate.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/*
+ * Userspace interface for /dev/liveupdate
+ * Live Update Orchestrator
+ *
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _UAPI_LIVEUPDATE_H
+#define _UAPI_LIVEUPDATE_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * enum liveupdate_state - Defines the possible states of the live update
+ * orchestrator.
+ * @LIVEUPDATE_STATE_UNDEFINED:      State has not yet been initialized.
+ * @LIVEUPDATE_STATE_NORMAL:         Default state, no live update in progress.
+ * @LIVEUPDATE_STATE_PREPARED:       Live update is prepared for reboot; the
+ *                                   LIVEUPDATE_PREPARE callbacks have completed
+ *                                   successfully.
+ *                                   Devices might operate in a limited state
+ *                                   for example the participating devices might
+ *                                   not be allowed to unbind, and also the
+ *                                   setting up of new DMA mappings might be
+ *                                   disabled in this state.
+ * @LIVEUPDATE_STATE_FROZEN:         The final reboot event
+ *                                   (%LIVEUPDATE_FREEZE) has been sent, and the
+ *                                   system is performing its final state saving
+ *                                   within the "blackout window". User
+ *                                   workloads must be suspended. The actual
+ *                                   reboot (kexec) into the next kernel is
+ *                                   imminent.
+ * @LIVEUPDATE_STATE_UPDATED:        The system has rebooted into the next
+ *                                   kernel via live update the system is now
+ *                                   running the next kernel, awaiting the
+ *                                   finish event.
+ *
+ * These states track the progress and outcome of a live update operation.
+ */
+enum liveupdate_state  {
+	LIVEUPDATE_STATE_UNDEFINED = 0,
+	LIVEUPDATE_STATE_NORMAL = 1,
+	LIVEUPDATE_STATE_PREPARED = 2,
+	LIVEUPDATE_STATE_FROZEN = 3,
+	LIVEUPDATE_STATE_UPDATED = 4,
+};
+
+/**
+ * struct liveupdate_fd - Holds parameters for preserving and restoring file
+ * descriptors across live update.
+ * @fd:    Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file
+ *         descriptor to be preserved.
+ *         Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor
+ *         representing the fully restored kernel resource.
+ * @flags: Unused, reserved for future expansion, must be set to 0.
+ * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token
+ *         preserved for preserved resource.
+ *         Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously
+ *         provided to the preserve ioctl for the resource to be restored.
+ *
+ * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE
+ * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types
+ * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their
+ * underlying kernel state preserved across a live update cycle.
+ *
+ * To preserve an FD, user space passes this struct to
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the
+ * kernel uses the @token field to uniquly associate the preserved FD.
+ *
+ * After the live update transition, user space passes the struct populated with
+ * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token
+ * to find the preserved state and, on success, populates the @fd field with a
+ * new file descriptor referring to the restored resource.
+ */
+struct liveupdate_fd {
+	int		fd;
+	__u32		flags;
+	__aligned_u64	token;
+};
+
+/* The ioctl type, documented in ioctl-number.rst */
+#define LIVEUPDATE_IOCTL_TYPE		0xBA
+
+/**
+ * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file
+ * descriptor.
+ *
+ * Argument: Pointer to &struct liveupdate_fd.
+ *
+ * User sets the @fd field identifying the file descriptor to preserve
+ * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
+ * and its dependencies are supported for preservation. If validation passes,
+ * the kernel marks the FD internally and *initiates the process* of preparing
+ * its state for saving. The actual snapshotting of the state typically occurs
+ * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
+ * some finalization might occur during freeze.
+ * On successful validation and initiation, the kernel uses the @token
+ * field with an opaque identifier representing the resource being preserved.
+ * This token confirms the FD is targeted for preservation and is required for
+ * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update.
+ *
+ * Return: 0 on success (validation passed, preservation initiated), negative
+ * error code on failure (e.g., unsupported FD type, dependency issue,
+ * validation failed).
+ */
+#define LIVEUPDATE_IOCTL_FD_PRESERVE					\
+	_IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd)
+
+/**
+ * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the
+ * preservation list.
+ *
+ * Argument: Pointer to __u64 token.
+ *
+ * Allows user space to explicitly remove a file descriptor from the set of
+ * items marked as potentially preservable. User space provides a pointer to the
+ * __u64 @token that was previously returned by a successful
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE call (potentially from a prior, possibly
+ * cancelled, live update attempt). The kernel reads the token value from the
+ * provided user-space address.
+ *
+ * On success, the kernel removes the corresponding entry (identified by the
+ * token value read from the user pointer) from its internal preservation list.
+ * The provided @token (representing the now-removed entry) becomes invalid
+ * after this call.
+ *
+ * Return: 0 on success, negative error code on failure (e.g., -EBUSY or -EINVAL
+ * if not in %LIVEUPDATE_STATE_NORMAL, bad address provided, invalid token value
+ * read, token not found).
+ */
+#define LIVEUPDATE_IOCTL_FD_UNPRESERVE					\
+	_IOW(LIVEUPDATE_IOCTL_TYPE, 0x01, __u64)
+
+/**
+ * LIVEUPDATE_IOCTL_FD_RESTORE - Restore a previously preserved file descriptor.
+ *
+ * Argument: Pointer to &struct liveupdate_fd.
+ *
+ * User sets the @token field to the value obtained from a successful
+ * %LIVEUPDATE_IOCTL_FD_PRESERVE call before the live update. On success,
+ * the kernel restores the state (saved during the PREPARE/FREEZE phases)
+ * associated with the token and populates the @fd field with a new file
+ * descriptor referencing the restored resource in the current (new) kernel.
+ * This operation must be performed *before* signaling completion via
+ * %LIVEUPDATE_IOCTL_FINISH.
+ *
+ * Return: 0 on success, negative error code on failure (e.g., invalid token).
+ */
+#define LIVEUPDATE_IOCTL_FD_RESTORE					\
+	_IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd)
+
+/**
+ * LIVEUPDATE_IOCTL_GET_STATE - Query the current state of the live update
+ * orchestrator.
+ *
+ * Argument: Pointer to &enum liveupdate_state.
+ *
+ * The kernel fills the enum value pointed to by the argument with the current
+ * state of the live update subsystem. Possible states are:
+ *
+ * - %LIVEUPDATE_STATE_NORMAL:   Default state; no live update operation is
+ *                               currently in progress.
+ * - %LIVEUPDATE_STATE_PREPARED: The preparation phase (triggered by
+ *                               %LIVEUPDATE_IOCTL_PREPARE) has completed
+ *                               successfully. The system is ready for the
+ *                               reboot transition. Note that some
+ *                               device operations (e.g., unbinding, new DMA
+ *                               mappings) might be restricted in this state.
+ * - %LIVEUPDATE_STATE_UPDATED:  The system has successfully rebooted into the
+ *                               new kernel via live update. It is now running
+ *                               the new kernel code and is awaiting the
+ *                               completion signal from user space via
+ *                               %LIVEUPDATE_IOCTL_FINISH after
+ *                               restoration tasks are done.
+ *
+ * See the definition of &enum liveupdate_state for more details on each state.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+#define LIVEUPDATE_IOCTL_GET_STATE					\
+	_IOR(LIVEUPDATE_IOCTL_TYPE, 0x03, enum liveupdate_state)
+
+/**
+ * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state
+ * saving.
+ *
+ * Argument: None.
+ *
+ * Initiates the live update preparation phase. This action corresponds to
+ * the internal %LIVEUPDATE_PREPARE. This typically triggers the saving process
+ * for items marked via the PRESERVE ioctls. This typically occurs *before*
+ * the "blackout window", while user applications (e.g., VMs) may still be
+ * running. Kernel subsystems receiving the %LIVEUPDATE_PREPARE event should
+ * serialize necessary state. This command does not transfer data.
+ *
+ * Return: 0 on success, negative error code on failure. Transitions state
+ * towards %LIVEUPDATE_STATE_PREPARED on success.
+ */
+#define LIVEUPDATE_IOCTL_PREPARE					\
+	_IO(LIVEUPDATE_IOCTL_TYPE, 0x04)
+
+/**
+ * LIVEUPDATE_IOCTL_CANCEL - Cancel the live update preparation phase.
+ *
+ * Argument: None.
+ *
+ * Notifies the live update subsystem to abort the preparation sequence
+ * potentially initiated by %LIVEUPDATE_IOCTL_PREPARE. This action
+ * typically corresponds to the internal %LIVEUPDATE_CANCEL kernel event,
+ * which might also be triggered automatically if the PREPARE stage fails
+ * internally.
+ *
+ * When triggered, subsystems receiving the %LIVEUPDATE_CANCEL event should
+ * revert any state changes or actions taken specifically for the aborted
+ * prepare phase (e.g., discard partially serialized state). The kernel
+ * releases resources allocated specifically for this *aborted preparation
+ * attempt*.
+ *
+ * This operation cancels the current *attempt* to prepare for a live update
+ * but does **not** remove previously validated items from the internal list
+ * of potentially preservable resources. Consequently, preservation tokens
+ * previously generated by successful %LIVEUPDATE_IOCTL_FD_PRESERVE or calls
+ * generally **remain valid** as identifiers for those potentially preservable
+ * resources. However, since the system state returns towards
+ * %LIVEUPDATE_STATE_NORMAL, user space must initiate a new live update sequence
+ * (starting with %LIVEUPDATE_IOCTL_PREPARE) to proceed with an update
+ * using these (or other) tokens.
+ *
+ * This command does not transfer data. Kernel callbacks for the
+ * %LIVEUPDATE_CANCEL event must not fail.
+ *
+ * Return: 0 on success, negative error code on failure. Transitions state back
+ * towards %LIVEUPDATE_STATE_NORMAL on success.
+ */
+#define LIVEUPDATE_IOCTL_CANCEL						\
+	_IO(LIVEUPDATE_IOCTL_TYPE, 0x06)
+
+/**
+ * LIVEUPDATE_IOCTL_EVENT_FINISH - Signal restoration completion and trigger
+ * cleanup.
+ *
+ * Argument: None.
+ *
+ * Signals that user space has completed all necessary restoration actions in
+ * the new kernel (after a live update reboot). This action corresponds to the
+ * internal %LIVEUPDATE_FINISH kernel event. Calling this ioctl triggers the
+ * cleanup phase: any resources that were successfully preserved but were *not*
+ * subsequently restored (reclaimed) via the RESTORE ioctls will have their
+ * preserved state discarded and associated kernel resources released. Involved
+ * devices may be reset. All desired restorations *must* be completed *before*
+ * this. Kernel callbacks for the %LIVEUPDATE_FINISH event must not fail.
+ * Successfully completing this phase transitions the system state from
+ * %LIVEUPDATE_STATE_UPDATED back to %LIVEUPDATE_STATE_NORMAL. This command does
+ * not transfer data.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+#define LIVEUPDATE_IOCTL_FINISH						\
+	_IO(LIVEUPDATE_IOCTL_TYPE, 0x07)
+
+#endif /* _UAPI_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index b5054140b9a9..cb3ea380f6b9 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
 obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)	+= kexec_handover_debug.o
 obj-$(CONFIG_LIVEUPDATE)		+= luo_core.o
 obj-$(CONFIG_LIVEUPDATE)		+= luo_files.o
+obj-$(CONFIG_LIVEUPDATE)		+= luo_ioctl.o
 obj-$(CONFIG_LIVEUPDATE)		+= luo_subsystems.o
diff --git a/kernel/liveupdate/luo_ioctl.c b/kernel/liveupdate/luo_ioctl.c
new file mode 100644
index 000000000000..3de1d243df5a
--- /dev/null
+++ b/kernel/liveupdate/luo_ioctl.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a misc character device, typically found at ``/dev/liveupdate``,
+ * which allows privileged userspace applications (requiring %CAP_SYS_ADMIN) to
+ * manage and monitor the LUO state machine and associated resources like
+ * preservable file descriptors.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+static int luo_ioctl_fd_restore(struct liveupdate_fd *luo_fd)
+{
+	struct file *file;
+	int ret;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		pr_err("Failed to allocate new fd: %d\n", fd);
+		return fd;
+	}
+
+	ret = luo_retrieve_file(luo_fd->token, &file);
+	if (ret < 0) {
+		put_unused_fd(fd);
+
+		return ret;
+	}
+
+	fd_install(fd, file);
+	luo_fd->fd = fd;
+
+	return 0;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (filep->f_flags & O_EXCL)
+		return -EINVAL;
+
+	return 0;
+}
+
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct liveupdate_fd luo_fd;
+	enum liveupdate_state state;
+	int ret = 0;
+	u64 token;
+
+	if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE)
+		return -ENOTTY;
+
+	switch (cmd) {
+	case LIVEUPDATE_IOCTL_GET_STATE:
+		state = liveupdate_get_state();
+		if (copy_to_user(argp, &state, sizeof(state)))
+			ret = -EFAULT;
+		break;
+
+	case LIVEUPDATE_IOCTL_PREPARE:
+		ret = luo_prepare();
+		break;
+
+	case LIVEUPDATE_IOCTL_FINISH:
+		ret = luo_finish();
+		break;
+
+	case LIVEUPDATE_IOCTL_CANCEL:
+		ret = luo_cancel();
+		break;
+
+	case LIVEUPDATE_IOCTL_FD_PRESERVE:
+		if (copy_from_user(&luo_fd, argp, sizeof(luo_fd))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = luo_register_file(luo_fd.token, luo_fd.fd);
+		if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) {
+			WARN_ON_ONCE(luo_unregister_file(luo_fd.token));
+			ret = -EFAULT;
+		}
+		break;
+
+	case LIVEUPDATE_IOCTL_FD_UNPRESERVE:
+		if (copy_from_user(&token, argp, sizeof(u64))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = luo_unregister_file(token);
+		break;
+
+	case LIVEUPDATE_IOCTL_FD_RESTORE:
+		if (copy_from_user(&luo_fd, argp, sizeof(luo_fd))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = luo_ioctl_fd_restore(&luo_fd);
+		if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd)))
+			ret = -EFAULT;
+		break;
+
+	default:
+		pr_warn("ioctl: unknown command nr: 0x%x\n", _IOC_NR(cmd));
+		ret = -ENOTTY;
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations fops = {
+	.owner          = THIS_MODULE,
+	.open           = luo_open,
+	.unlocked_ioctl = luo_ioctl,
+};
+
+static struct miscdevice liveupdate_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name  = "liveupdate",
+	.fops  = &fops,
+};
+
+static int __init liveupdate_init(void)
+{
+	int err;
+
+	if (!liveupdate_enabled())
+		return 0;
+
+	err = misc_register(&liveupdate_miscdev);
+	if (err < 0) {
+		pr_err("Failed to register misc device '%s': %d\n",
+		       liveupdate_miscdev.name, err);
+	}
+
+	return err;
+}
+module_init(liveupdate_init);
+
+static void __exit liveupdate_exit(void)
+{
+	misc_deregister(&liveupdate_miscdev);
+}
+module_exit(liveupdate_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pasha Tatashin");
+MODULE_DESCRIPTION("Live Update Orchestrator");
+MODULE_VERSION("0.1");
-- 
2.50.0.727.gbf7dc18ff4-goog
Re: [PATCH v2 16/32] liveupdate: luo_ioctl: add ioctl interface
Posted by Jason Gunthorpe 2 months, 1 week ago
On Wed, Jul 23, 2025 at 02:46:29PM +0000, Pasha Tatashin wrote:
> Introduce the user-space interface for the Live Update Orchestrator
> via ioctl commands, enabling external control over the live update
> process and management of preserved resources.

I strongly recommend copying something like fwctl (which is copying
iommufd, which is copying some other best practices). I will try to
outline the main points below.

The design of the fwctl scheme allows alot of options for ABI
compatible future extensions and I very strongly recommend that
complex ioctl style APIs be built with that in mind. I have so many
scars from trying to undo fixed ABI design :)

> +/**
> + * struct liveupdate_fd - Holds parameters for preserving and restoring file
> + * descriptors across live update.
> + * @fd:    Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file
> + *         descriptor to be preserved.
> + *         Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor
> + *         representing the fully restored kernel resource.
> + * @flags: Unused, reserved for future expansion, must be set to 0.
> + * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token
> + *         preserved for preserved resource.
> + *         Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously
> + *         provided to the preserve ioctl for the resource to be restored.
> + *
> + * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE
> + * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types
> + * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their
> + * underlying kernel state preserved across a live update cycle.
> + *
> + * To preserve an FD, user space passes this struct to
> + * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the
> + * kernel uses the @token field to uniquly associate the preserved FD.
> + *
> + * After the live update transition, user space passes the struct populated with
> + * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token
> + * to find the preserved state and, on success, populates the @fd field with a
> + * new file descriptor referring to the restored resource.
> + */
> +struct liveupdate_fd {
> +	int		fd;

'int' should not appear in uapi structs. Fds are __s32

> +	__u32		flags;
> +	__aligned_u64	token;
> +};
> +
> +/* The ioctl type, documented in ioctl-number.rst */
> +#define LIVEUPDATE_IOCTL_TYPE		0xBA

I have found it very helpful to organize the ioctl numbering like this:

#define IOMMUFD_TYPE (';')

enum {
	IOMMUFD_CMD_BASE = 0x80,
	IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
	IOMMUFD_CMD_IOAS_ALLOC = 0x81,
	IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82,
[..]

#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)

The numbers should be tightly packed and non-overlapping. It becomes
difficult to manage this if the numbers are sprinkled all over the
file. The above structuring will enforce git am conflicts if things
get muddled up. Saved me a few times already in iommufd.

> +/**
> + * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file
> + * descriptor.
> + *
> + * Argument: Pointer to &struct liveupdate_fd.
> + *
> + * User sets the @fd field identifying the file descriptor to preserve
> + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
> + * and its dependencies are supported for preservation. If validation passes,
> + * the kernel marks the FD internally and *initiates the process* of preparing
> + * its state for saving. The actual snapshotting of the state typically occurs
> + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
> + * some finalization might occur during freeze.
> + * On successful validation and initiation, the kernel uses the @token
> + * field with an opaque identifier representing the resource being preserved.
> + * This token confirms the FD is targeted for preservation and is required for
> + * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update.
> + *
> + * Return: 0 on success (validation passed, preservation initiated), negative
> + * error code on failure (e.g., unsupported FD type, dependency issue,
> + * validation failed).
> + */
> +#define LIVEUPDATE_IOCTL_FD_PRESERVE					\
> +	_IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd)

From a kdoc perspective I find it works much better to attach the kdoc
to the struct, not the ioctl:

/**
 * struct iommu_destroy - ioctl(IOMMU_DESTROY)
 * @size: sizeof(struct iommu_destroy)
 * @id: iommufd object ID to destroy. Can be any destroyable object type.
 *
 * Destroy any object held within iommufd.
 */
struct iommu_destroy {
	__u32 size;
	__u32 id;
};
#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)

Generates this kdoc:

https://docs.kernel.org/userspace-api/iommufd.html#c.iommu_destroy

You should also make sure to link the uapi header into the kdoc build
under the "userspace API" chaper.

The structs should also be self-describing. I am fairly strongly
against using the size mechanism in the _IOW macro, it is instantly
ABI incompatible and basically impossible to deal with from userspace.

Hence why the IOMMFD version is _IO().

This means stick a size member in the first 4 bytes of every
struct. More on this later..

> +/**
> + * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the
> + * preservation list.
> + *
> + * Argument: Pointer to __u64 token.

Every ioctl should have a struct, with the size header. If you want to
do more down the road you can not using this structure.

> +#define LIVEUPDATE_IOCTL_FD_RESTORE					\
> +	_IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd)

Strongly recommend that every ioctl have a unique struct. Sharing
structs makes future extend-ability harder.

> +/**
> + * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state
> + * saving.

Perhaps these just want to be a single 'set state' ioctl with an enum
input argument?

> @@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER)		+= kexec_handover.o
>  obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)	+= kexec_handover_debug.o
>  obj-$(CONFIG_LIVEUPDATE)		+= luo_core.o
>  obj-$(CONFIG_LIVEUPDATE)		+= luo_files.o
> +obj-$(CONFIG_LIVEUPDATE)		+= luo_ioctl.o
>  obj-$(CONFIG_LIVEUPDATE)		+= luo_subsystems.o

I don't think luo is modular, but I think it is generally better to
write the kbuilds as though it was anyhow if it has a lot of files:

iommufd-y := \
	device.o \
	eventq.o \
	hw_pagetable.o \
	io_pagetable.o \
	ioas.o \
	main.o \
	pages.o \
	vfio_compat.o \
	viommu.o
obj-$(CONFIG_IOMMUFD) += iommufd.o

Basically don't repeat obj-$(CONFIG_LIVEUPDATE), every one of those
lines creates a new module (if it was modular)

> +static int luo_open(struct inode *inodep, struct file *filep)
> +{
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;

IMHO file system permissions should control permission to open. No
capable check.

> +	if (filep->f_flags & O_EXCL)
> +		return -EINVAL;

O_EXCL doesn't really do anything for cdev, I'd drop this.

The open should have an atomic to check for single open though.

> +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
> +{
> +	void __user *argp = (void __user *)arg;
> +	struct liveupdate_fd luo_fd;
> +	enum liveupdate_state state;
> +	int ret = 0;
> +	u64 token;
> +
> +	if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE)
> +		return -ENOTTY;

The generic parse/disptach from fwctl is a really good idea here, you
can cut and paste it, change the names. It makes it really easy to manage future extensibility:

List the ops and their structs:

static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
	IOCTL_OP(FWCTL_INFO, fwctl_cmd_info, struct fwctl_info, out_device_data),
	IOCTL_OP(FWCTL_RPC, fwctl_cmd_rpc, struct fwctl_rpc, out),
};

Index the list and copy_from_user the struct desribing the opt:

static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
			       unsigned long arg)
{
	struct fwctl_uctx *uctx = filp->private_data;
	const struct fwctl_ioctl_op *op;
	struct fwctl_ucmd ucmd = {};
	union fwctl_ucmd_buffer buf;
	unsigned int nr;
	int ret;

	nr = _IOC_NR(cmd);
	if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
		return -ENOIOCTLCMD;

	op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
	if (op->ioctl_num != cmd)
		return -ENOIOCTLCMD;

	ucmd.uctx = uctx;
	ucmd.cmd = &buf;
	ucmd.ubuffer = (void __user *)arg;
        // This is reading/checking the standard 4 byte size header:
	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
	if (ret)
		return ret;

	if (ucmd.user_size < op->min_size)
		return -EINVAL;

	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
				    ucmd.user_size);


Removes a bunch of boiler plate and easy to make wrong copy_from_users
in the ioctls. Centralizes size validation, zero padding checking/etc.

> +		ret = luo_register_file(luo_fd.token, luo_fd.fd);
> +		if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) {
> +			WARN_ON_ONCE(luo_unregister_file(luo_fd.token));
> +			ret = -EFAULT;

Then for extensibility you'd copy back the struct:

static int ucmd_respond(struct fwctl_ucmd *ucmd, size_t cmd_len)
{
	if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
			 min_t(size_t, ucmd->user_size, cmd_len)))
		return -EFAULT;
	return 0;
}

Which truncates it/etc according to some ABI extensibility rules.

> +static int __init liveupdate_init(void)
> +{
> +	int err;
> +
> +	if (!liveupdate_enabled())
> +		return 0;
> +
> +	err = misc_register(&liveupdate_miscdev);
> +	if (err < 0) {
> +		pr_err("Failed to register misc device '%s': %d\n",
> +		       liveupdate_miscdev.name, err);

Should remove most of the pr_err's, here too IMHO..

Jason
Re: [PATCH v2 16/32] liveupdate: luo_ioctl: add ioctl interface
Posted by Pasha Tatashin 2 months ago
On Tue, Jul 29, 2025 at 12:35 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Wed, Jul 23, 2025 at 02:46:29PM +0000, Pasha Tatashin wrote:
> > Introduce the user-space interface for the Live Update Orchestrator
> > via ioctl commands, enabling external control over the live update
> > process and management of preserved resources.
>
> I strongly recommend copying something like fwctl (which is copying
> iommufd, which is copying some other best practices). I will try to
> outline the main points below.
>
> The design of the fwctl scheme allows alot of options for ABI
> compatible future extensions and I very strongly recommend that
> complex ioctl style APIs be built with that in mind. I have so many
> scars from trying to undo fixed ABI design :)

Thank you for bringing this up, I have reviewed fwctl ioctl
implementation, and also iommufd ioctl, and I made the necessary
changes to make luo similar.

> > +/**
> > + * struct liveupdate_fd - Holds parameters for preserving and restoring file
> > + * descriptors across live update.
> > + * @fd:    Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: The user-space file
> > + *         descriptor to be preserved.
> > + *         Output for %LIVEUPDATE_IOCTL_FD_RESTORE: The new file descriptor
> > + *         representing the fully restored kernel resource.
> > + * @flags: Unused, reserved for future expansion, must be set to 0.
> > + * @token: Input for %LIVEUPDATE_IOCTL_FD_PRESERVE: An opaque, unique token
> > + *         preserved for preserved resource.
> > + *         Input for %LIVEUPDATE_IOCTL_FD_RESTORE: The token previously
> > + *         provided to the preserve ioctl for the resource to be restored.
> > + *
> > + * This structure is used as the argument for the %LIVEUPDATE_IOCTL_FD_PRESERVE
> > + * and %LIVEUPDATE_IOCTL_FD_RESTORE ioctls. These ioctls allow specific types
> > + * of file descriptors (for example memfd, kvm, iommufd, and VFIO) to have their
> > + * underlying kernel state preserved across a live update cycle.
> > + *
> > + * To preserve an FD, user space passes this struct to
> > + * %LIVEUPDATE_IOCTL_FD_PRESERVE with the @fd field set. On success, the
> > + * kernel uses the @token field to uniquly associate the preserved FD.
> > + *
> > + * After the live update transition, user space passes the struct populated with
> > + * the *same* @token to %LIVEUPDATE_IOCTL_FD_RESTORE. The kernel uses the @token
> > + * to find the preserved state and, on success, populates the @fd field with a
> > + * new file descriptor referring to the restored resource.
> > + */
> > +struct liveupdate_fd {
> > +     int             fd;
>
> 'int' should not appear in uapi structs. Fds are __s32

done

>
> > +     __u32           flags;
> > +     __aligned_u64   token;
> > +};
> > +
> > +/* The ioctl type, documented in ioctl-number.rst */
> > +#define LIVEUPDATE_IOCTL_TYPE                0xBA
>
> I have found it very helpful to organize the ioctl numbering like this:
>
> #define IOMMUFD_TYPE (';')
>
> enum {
>         IOMMUFD_CMD_BASE = 0x80,
>         IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
>         IOMMUFD_CMD_IOAS_ALLOC = 0x81,
>         IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82,
> [..]
>
> #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
>
> The numbers should be tightly packed and non-overlapping. It becomes
> difficult to manage this if the numbers are sprinkled all over the
> file. The above structuring will enforce git am conflicts if things
> get muddled up. Saved me a few times already in iommufd.

Done

>
> > +/**
> > + * LIVEUPDATE_IOCTL_FD_PRESERVE - Validate and initiate preservation for a file
> > + * descriptor.
> > + *
> > + * Argument: Pointer to &struct liveupdate_fd.
> > + *
> > + * User sets the @fd field identifying the file descriptor to preserve
> > + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type
> > + * and its dependencies are supported for preservation. If validation passes,
> > + * the kernel marks the FD internally and *initiates the process* of preparing
> > + * its state for saving. The actual snapshotting of the state typically occurs
> > + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though
> > + * some finalization might occur during freeze.
> > + * On successful validation and initiation, the kernel uses the @token
> > + * field with an opaque identifier representing the resource being preserved.
> > + * This token confirms the FD is targeted for preservation and is required for
> > + * the subsequent %LIVEUPDATE_IOCTL_FD_RESTORE call after the live update.
> > + *
> > + * Return: 0 on success (validation passed, preservation initiated), negative
> > + * error code on failure (e.g., unsupported FD type, dependency issue,
> > + * validation failed).
> > + */
> > +#define LIVEUPDATE_IOCTL_FD_PRESERVE                                 \
> > +     _IOW(LIVEUPDATE_IOCTL_TYPE, 0x00, struct liveupdate_fd)
>
> From a kdoc perspective I find it works much better to attach the kdoc
> to the struct, not the ioctl:
>
> /**
>  * struct iommu_destroy - ioctl(IOMMU_DESTROY)
>  * @size: sizeof(struct iommu_destroy)
>  * @id: iommufd object ID to destroy. Can be any destroyable object type.
>  *
>  * Destroy any object held within iommufd.
>  */
> struct iommu_destroy {
>         __u32 size;
>         __u32 id;
> };
> #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
>
> Generates this kdoc:
>
> https://docs.kernel.org/userspace-api/iommufd.html#c.iommu_destroy

Agreed, done the same as above.

>
> You should also make sure to link the uapi header into the kdoc build
> under the "userspace API" chaper.
>
> The structs should also be self-describing. I am fairly strongly
> against using the size mechanism in the _IOW macro, it is instantly
> ABI incompatible and basically impossible to deal with from userspace.
>
> Hence why the IOMMFD version is _IO().

Right, I came to the same conclusion while reviewing fwctl, I replaced
everything with pure _IO().

>
> This means stick a size member in the first 4 bytes of every
> struct. More on this later..
>
> > +/**
> > + * LIVEUPDATE_IOCTL_FD_UNPRESERVE - Remove a file descriptor from the
> > + * preservation list.
> > + *
> > + * Argument: Pointer to __u64 token.
>
> Every ioctl should have a struct, with the size header. If you want to
> do more down the road you can not using this structure.

Done

>
> > +#define LIVEUPDATE_IOCTL_FD_RESTORE                                  \
> > +     _IOWR(LIVEUPDATE_IOCTL_TYPE, 0x02, struct liveupdate_fd)
>
> Strongly recommend that every ioctl have a unique struct. Sharing
> structs makes future extend-ability harder.

Done

>
> > +/**
> > + * LIVEUPDATE_IOCTL_PREPARE - Initiate preparation phase and trigger state
> > + * saving.
>
> Perhaps these just want to be a single 'set state' ioctl with an enum
> input argument?

Added a IOCTL: LIVEUPDATE_SET_EVENT, and all events
PREPARE/FINISH/CANCEL are now done through it.

>
> > @@ -7,4 +7,5 @@ obj-$(CONFIG_KEXEC_HANDOVER)          += kexec_handover.o
> >  obj-$(CONFIG_KEXEC_HANDOVER_DEBUG)   += kexec_handover_debug.o
> >  obj-$(CONFIG_LIVEUPDATE)             += luo_core.o
> >  obj-$(CONFIG_LIVEUPDATE)             += luo_files.o
> > +obj-$(CONFIG_LIVEUPDATE)             += luo_ioctl.o
> >  obj-$(CONFIG_LIVEUPDATE)             += luo_subsystems.o
>
> I don't think luo is modular, but I think it is generally better to
> write the kbuilds as though it was anyhow if it has a lot of files:
>
> iommufd-y := \
>         device.o \
>         eventq.o \
>         hw_pagetable.o \
>         io_pagetable.o \
>         ioas.o \
>         main.o \
>         pages.o \
>         vfio_compat.o \
>         viommu.o
> obj-$(CONFIG_IOMMUFD) += iommufd.o

Done

>
> Basically don't repeat obj-$(CONFIG_LIVEUPDATE), every one of those
> lines creates a new module (if it was modular)
>
> > +static int luo_open(struct inode *inodep, struct file *filep)
> > +{
> > +     if (!capable(CAP_SYS_ADMIN))
> > +             return -EACCES;
>
> IMHO file system permissions should control permission to open. No
> capable check.

Removed

>
> > +     if (filep->f_flags & O_EXCL)
> > +             return -EINVAL;
>
> O_EXCL doesn't really do anything for cdev, I'd drop this.
>
> The open should have an atomic to check for single open though.

Removed, and added an enforcement for a single open.

>
> > +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
> > +{
> > +     void __user *argp = (void __user *)arg;
> > +     struct liveupdate_fd luo_fd;
> > +     enum liveupdate_state state;
> > +     int ret = 0;
> > +     u64 token;
> > +
> > +     if (_IOC_TYPE(cmd) != LIVEUPDATE_IOCTL_TYPE)
> > +             return -ENOTTY;
>
> The generic parse/disptach from fwctl is a really good idea here, you
> can cut and paste it, change the names. It makes it really easy to manage future extensibility:
>
> List the ops and their structs:
>
> static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
>         IOCTL_OP(FWCTL_INFO, fwctl_cmd_info, struct fwctl_info, out_device_data),
>         IOCTL_OP(FWCTL_RPC, fwctl_cmd_rpc, struct fwctl_rpc, out),
> };
>
> Index the list and copy_from_user the struct desribing the opt:
>
> static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
>                                unsigned long arg)
> {
>         struct fwctl_uctx *uctx = filp->private_data;
>         const struct fwctl_ioctl_op *op;
>         struct fwctl_ucmd ucmd = {};
>         union fwctl_ucmd_buffer buf;
>         unsigned int nr;
>         int ret;
>
>         nr = _IOC_NR(cmd);
>         if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
>                 return -ENOIOCTLCMD;
>
>         op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
>         if (op->ioctl_num != cmd)
>                 return -ENOIOCTLCMD;
>
>         ucmd.uctx = uctx;
>         ucmd.cmd = &buf;
>         ucmd.ubuffer = (void __user *)arg;
>         // This is reading/checking the standard 4 byte size header:
>         ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
>         if (ret)
>                 return ret;
>
>         if (ucmd.user_size < op->min_size)
>                 return -EINVAL;
>
>         ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
>                                     ucmd.user_size);
>
>
> Removes a bunch of boiler plate and easy to make wrong copy_from_users
> in the ioctls. Centralizes size validation, zero padding checking/etc.

Yeap, implemented as  above.

>
> > +             ret = luo_register_file(luo_fd.token, luo_fd.fd);
> > +             if (!ret && copy_to_user(argp, &luo_fd, sizeof(luo_fd))) {
> > +                     WARN_ON_ONCE(luo_unregister_file(luo_fd.token));
> > +                     ret = -EFAULT;
>
> Then for extensibility you'd copy back the struct:
>
> static int ucmd_respond(struct fwctl_ucmd *ucmd, size_t cmd_len)
> {
>         if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
>                          min_t(size_t, ucmd->user_size, cmd_len)))
>                 return -EFAULT;
>         return 0;
> }
>
> Which truncates it/etc according to some ABI extensibility rules.
>
> > +static int __init liveupdate_init(void)
> > +{
> > +     int err;
> > +
> > +     if (!liveupdate_enabled())
> > +             return 0;
> > +
> > +     err = misc_register(&liveupdate_miscdev);
> > +     if (err < 0) {
> > +             pr_err("Failed to register misc device '%s': %d\n",
> > +                    liveupdate_miscdev.name, err);
>
> Should remove most of the pr_err's, here too IMHO..

Removed.

>
> Jason

Thanks a lot for the thorough review!

Pasha