[PATCH v3 29/30] luo: allow preserving memfd

Pasha Tatashin posted 30 patches 1 month, 4 weeks ago
There is a newer version of this series
[PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 1 month, 4 weeks ago
From: Pratyush Yadav <ptyadav@amazon.de>

The ability to preserve a memfd allows userspace to use KHO and LUO to
transfer its memory contents to the next kernel. This is useful in many
ways. For one, it can be used with IOMMUFD as the backing store for
IOMMU page tables. Preserving IOMMUFD is essential for performing a
hypervisor live update with passthrough devices. memfd support provides
the first building block for making that possible.

For another, applications with a large amount of memory that takes time
to reconstruct, reboots to consume kernel upgrades can be very
expensive. memfd with LUO gives those applications reboot-persistent
memory that they can use to quickly save and reconstruct that state.

While memfd is backed by either hugetlbfs or shmem, currently only
support on shmem is added. To be more precise, support for anonymous
shmem files is added.

The handover to the next kernel is not transparent. All the properties
of the file are not preserved; only its memory contents, position, and
size. The recreated file gets the UID and GID of the task doing the
restore, and the task's cgroup gets charged with the memory.

After LUO is in prepared state, the file cannot grow or shrink, and all
its pages are pinned to avoid migrations and swapping. The file can
still be read from or written to.

Co-developed-by: Changyuan Lyu <changyuanl@google.com>
Signed-off-by: Changyuan Lyu <changyuanl@google.com>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
---
 MAINTAINERS    |   2 +
 mm/Makefile    |   1 +
 mm/memfd_luo.c | 507 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 510 insertions(+)
 create mode 100644 mm/memfd_luo.c

diff --git a/MAINTAINERS b/MAINTAINERS
index b88b77977649..7421d21672f3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14209,6 +14209,7 @@ F:	tools/testing/selftests/livepatch/
 
 LIVE UPDATE
 M:	Pasha Tatashin <pasha.tatashin@soleen.com>
+R:	Pratyush Yadav <pratyush@kernel.org>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-kernel-liveupdate
@@ -14218,6 +14219,7 @@ F:	Documentation/userspace-api/liveupdate.rst
 F:	include/linux/liveupdate.h
 F:	include/uapi/linux/liveupdate.h
 F:	kernel/liveupdate/
+F:	mm/memfd_luo.c
 F:	tools/testing/selftests/liveupdate/
 
 LLC (802.2)
diff --git a/mm/Makefile b/mm/Makefile
index ef54aa615d9d..0a9936ffc172 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o
 obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
+obj-$(CONFIG_LIVEUPDATE) += memfd_luo.o
 obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
 ifdef CONFIG_SWAP
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
new file mode 100644
index 000000000000..0c91b40a2080
--- /dev/null
+++ b/mm/memfd_luo.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ * Changyuan Lyu <changyuanl@google.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/file.h>
+#include <linux/io.h>
+#include <linux/libfdt.h>
+#include <linux/liveupdate.h>
+#include <linux/kexec_handover.h>
+#include <linux/shmem_fs.h>
+#include <linux/bits.h>
+#include "internal.h"
+
+static const char memfd_luo_compatible[] = "memfd-v1";
+
+#define PRESERVED_PFN_MASK		GENMASK(63, 12)
+#define PRESERVED_PFN_SHIFT		12
+#define PRESERVED_FLAG_DIRTY		BIT(0)
+#define PRESERVED_FLAG_UPTODATE		BIT(1)
+
+#define PRESERVED_FOLIO_PFN(desc)	(((desc) & PRESERVED_PFN_MASK) >> PRESERVED_PFN_SHIFT)
+#define PRESERVED_FOLIO_FLAGS(desc)	((desc) & ~PRESERVED_PFN_MASK)
+#define PRESERVED_FOLIO_MKDESC(pfn, flags) (((pfn) << PRESERVED_PFN_SHIFT) | (flags))
+
+struct memfd_luo_preserved_folio {
+	/*
+	 * The folio descriptor is made of 2 parts. The bottom 12 bits are used
+	 * for storing flags, the others for storing the PFN.
+	 */
+	u64 foliodesc;
+	u64 index;
+};
+
+static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
+				     struct folio **folios,
+				     unsigned int nr_folios)
+{
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < nr_folios; i++) {
+		struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+		struct folio *folio = folios[i];
+		unsigned int flags = 0;
+		unsigned long pfn;
+
+		err = kho_preserve_folio(folio);
+		if (err)
+			goto err_unpreserve;
+
+		pfn = folio_pfn(folio);
+		if (folio_test_dirty(folio))
+			flags |= PRESERVED_FLAG_DIRTY;
+		if (folio_test_uptodate(folio))
+			flags |= PRESERVED_FLAG_UPTODATE;
+
+		pfolio->foliodesc = PRESERVED_FOLIO_MKDESC(pfn, flags);
+		pfolio->index = folio->index;
+	}
+
+	return 0;
+
+err_unpreserve:
+	i--;
+	for (; i >= 0; i--)
+		WARN_ON_ONCE(kho_unpreserve_folio(folios[i]));
+	return err;
+}
+
+static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
+					unsigned int nr_folios)
+{
+	unsigned int i;
+
+	for (i = 0; i < nr_folios; i++) {
+		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+		struct folio *folio;
+
+		if (!pfolio->foliodesc)
+			continue;
+
+		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
+
+		kho_unpreserve_folio(folio);
+		unpin_folio(folio);
+	}
+}
+
+static void *memfd_luo_create_fdt(unsigned long size)
+{
+	unsigned int order = get_order(size);
+	struct folio *fdt_folio;
+	int err = 0;
+	void *fdt;
+
+	if (order > MAX_PAGE_ORDER)
+		return NULL;
+
+	fdt_folio = folio_alloc(GFP_KERNEL, order);
+	if (!fdt_folio)
+		return NULL;
+
+	fdt = folio_address(fdt_folio);
+
+	err |= fdt_create(fdt, (1 << (order + PAGE_SHIFT)));
+	err |= fdt_finish_reservemap(fdt);
+	err |= fdt_begin_node(fdt, "");
+	if (err)
+		goto free;
+
+	return fdt;
+
+free:
+	folio_put(fdt_folio);
+	return NULL;
+}
+
+static int memfd_luo_finish_fdt(void *fdt)
+{
+	int err;
+
+	err = fdt_end_node(fdt);
+	if (err)
+		return err;
+
+	return fdt_finish(fdt);
+}
+
+static int memfd_luo_prepare(struct liveupdate_file_handler *handler,
+			     struct file *file, u64 *data)
+{
+	struct memfd_luo_preserved_folio *preserved_folios;
+	struct inode *inode = file_inode(file);
+	unsigned int max_folios, nr_folios = 0;
+	int err = 0, preserved_size;
+	struct folio **folios;
+	long size, nr_pinned;
+	pgoff_t offset;
+	void *fdt;
+	u64 pos;
+
+	if (WARN_ON_ONCE(!shmem_file(file)))
+		return -EINVAL;
+
+	inode_lock(inode);
+	shmem_i_mapping_freeze(inode, true);
+
+	size = i_size_read(inode);
+	if ((PAGE_ALIGN(size) / PAGE_SIZE) > UINT_MAX) {
+		err = -E2BIG;
+		goto err_unlock;
+	}
+
+	/*
+	 * Guess the number of folios based on inode size. Real number might end
+	 * up being smaller if there are higher order folios.
+	 */
+	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
+	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
+	if (!folios) {
+		err = -ENOMEM;
+		goto err_unfreeze;
+	}
+
+	/*
+	 * Pin the folios so they don't move around behind our back. This also
+	 * ensures none of the folios are in CMA -- which ensures they don't
+	 * fall in KHO scratch memory. It also moves swapped out folios back to
+	 * memory.
+	 *
+	 * A side effect of doing this is that it allocates a folio for all
+	 * indices in the file. This might waste memory on sparse memfds. If
+	 * that is really a problem in the future, we can have a
+	 * memfd_pin_folios() variant that does not allocate a page on empty
+	 * slots.
+	 */
+	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
+				     &offset);
+	if (nr_pinned < 0) {
+		err = nr_pinned;
+		pr_err("failed to pin folios: %d\n", err);
+		goto err_free_folios;
+	}
+	/* nr_pinned won't be more than max_folios which is also unsigned int. */
+	nr_folios = (unsigned int)nr_pinned;
+
+	preserved_size = sizeof(struct memfd_luo_preserved_folio) * nr_folios;
+	if (check_mul_overflow(sizeof(struct memfd_luo_preserved_folio),
+			       nr_folios, &preserved_size)) {
+		err = -E2BIG;
+		goto err_unpin;
+	}
+
+	/*
+	 * Most of the space should be taken by preserved folios. So take its
+	 * size, plus a page for other properties.
+	 */
+	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
+	if (!fdt) {
+		err = -ENOMEM;
+		goto err_unpin;
+	}
+
+	pos = file->f_pos;
+	err = fdt_property(fdt, "pos", &pos, sizeof(pos));
+	if (err)
+		goto err_free_fdt;
+
+	err = fdt_property(fdt, "size", &size, sizeof(size));
+	if (err)
+		goto err_free_fdt;
+
+	err = fdt_property_placeholder(fdt, "folios", preserved_size,
+				       (void **)&preserved_folios);
+	if (err) {
+		pr_err("Failed to reserve folios property in FDT: %s\n",
+		       fdt_strerror(err));
+		err = -ENOMEM;
+		goto err_free_fdt;
+	}
+
+	err = memfd_luo_preserve_folios(preserved_folios, folios, nr_folios);
+	if (err)
+		goto err_free_fdt;
+
+	err = memfd_luo_finish_fdt(fdt);
+	if (err)
+		goto err_unpreserve;
+
+	err = kho_preserve_folio(virt_to_folio(fdt));
+	if (err)
+		goto err_unpreserve;
+
+	kvfree(folios);
+	inode_unlock(inode);
+
+	*data = virt_to_phys(fdt);
+	return 0;
+
+err_unpreserve:
+	memfd_luo_unpreserve_folios(preserved_folios, nr_folios);
+err_free_fdt:
+	folio_put(virt_to_folio(fdt));
+err_unpin:
+	unpin_folios(folios, nr_pinned);
+err_free_folios:
+	kvfree(folios);
+err_unfreeze:
+	shmem_i_mapping_freeze(inode, false);
+err_unlock:
+	inode_unlock(inode);
+	return err;
+}
+
+static int memfd_luo_freeze(struct liveupdate_file_handler *handler,
+			    struct file *file, u64 *data)
+{
+	u64 pos = file->f_pos;
+	void *fdt;
+	int err;
+
+	if (WARN_ON_ONCE(!*data))
+		return -EINVAL;
+
+	fdt = phys_to_virt(*data);
+
+	/*
+	 * The pos or size might have changed since prepare. Everything else
+	 * stays the same.
+	 */
+	err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos));
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void memfd_luo_cancel(struct liveupdate_file_handler *handler,
+			     struct file *file, u64 data)
+{
+	const struct memfd_luo_preserved_folio *pfolios;
+	struct inode *inode = file_inode(file);
+	struct folio *fdt_folio;
+	void *fdt;
+	int len;
+
+	if (WARN_ON_ONCE(!data))
+		return;
+
+	inode_lock(inode);
+	shmem_i_mapping_freeze(inode, false);
+
+	fdt = phys_to_virt(data);
+	fdt_folio = virt_to_folio(fdt);
+	pfolios = fdt_getprop(fdt, 0, "folios", &len);
+	if (pfolios)
+		memfd_luo_unpreserve_folios(pfolios, len / sizeof(*pfolios));
+
+	kho_unpreserve_folio(fdt_folio);
+	folio_put(fdt_folio);
+	inode_unlock(inode);
+}
+
+static struct folio *memfd_luo_get_fdt(u64 data)
+{
+	return kho_restore_folio((phys_addr_t)data);
+}
+
+static void memfd_luo_finish(struct liveupdate_file_handler *handler,
+			     struct file *file, u64 data, bool reclaimed)
+{
+	const struct memfd_luo_preserved_folio *pfolios;
+	struct folio *fdt_folio;
+	int len;
+
+	if (reclaimed)
+		return;
+
+	fdt_folio = memfd_luo_get_fdt(data);
+
+	pfolios = fdt_getprop(folio_address(fdt_folio), 0, "folios", &len);
+	if (pfolios)
+		memfd_luo_unpreserve_folios(pfolios, len / sizeof(*pfolios));
+
+	folio_put(fdt_folio);
+}
+
+static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data,
+			      struct file **file_p)
+{
+	const struct memfd_luo_preserved_folio *pfolios;
+	int nr_pfolios, len, ret = 0, i = 0;
+	struct address_space *mapping;
+	struct folio *folio, *fdt_folio;
+	const u64 *pos, *size;
+	struct inode *inode;
+	struct file *file;
+	const void *fdt;
+
+	fdt_folio = memfd_luo_get_fdt(data);
+	if (!fdt_folio)
+		return -ENOENT;
+
+	fdt = page_to_virt(folio_page(fdt_folio, 0));
+
+	pfolios = fdt_getprop(fdt, 0, "folios", &len);
+	if (!pfolios || len % sizeof(*pfolios)) {
+		pr_err("invalid 'folios' property\n");
+		ret = -EINVAL;
+		goto put_fdt;
+	}
+	nr_pfolios = len / sizeof(*pfolios);
+
+	size = fdt_getprop(fdt, 0, "size", &len);
+	if (!size || len != sizeof(u64)) {
+		pr_err("invalid 'size' property\n");
+		ret = -EINVAL;
+		goto put_folios;
+	}
+
+	pos = fdt_getprop(fdt, 0, "pos", &len);
+	if (!pos || len != sizeof(u64)) {
+		pr_err("invalid 'pos' property\n");
+		ret = -EINVAL;
+		goto put_folios;
+	}
+
+	file = shmem_file_setup("", 0, VM_NORESERVE);
+
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		pr_err("failed to setup file: %d\n", ret);
+		goto put_folios;
+	}
+
+	inode = file->f_inode;
+	mapping = inode->i_mapping;
+	vfs_setpos(file, *pos, MAX_LFS_FILESIZE);
+
+	for (; i < nr_pfolios; i++) {
+		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+		phys_addr_t phys;
+		u64 index;
+		int flags;
+
+		if (!pfolio->foliodesc)
+			continue;
+
+		phys = PFN_PHYS(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
+		folio = kho_restore_folio(phys);
+		if (!folio) {
+			pr_err("Unable to restore folio at physical address: %llx\n",
+			       phys);
+			goto put_file;
+		}
+		index = pfolio->index;
+		flags = PRESERVED_FOLIO_FLAGS(pfolio->foliodesc);
+
+		/* Set up the folio for insertion. */
+		/*
+		 * TODO: Should find a way to unify this and
+		 * shmem_alloc_and_add_folio().
+		 */
+		__folio_set_locked(folio);
+		__folio_set_swapbacked(folio);
+
+		ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
+		if (ret) {
+			pr_err("shmem: failed to charge folio index %d: %d\n",
+			       i, ret);
+			goto unlock_folio;
+		}
+
+		ret = shmem_add_to_page_cache(folio, mapping, index, NULL,
+					      mapping_gfp_mask(mapping));
+		if (ret) {
+			pr_err("shmem: failed to add to page cache folio index %d: %d\n",
+			       i, ret);
+			goto unlock_folio;
+		}
+
+		if (flags & PRESERVED_FLAG_UPTODATE)
+			folio_mark_uptodate(folio);
+		if (flags & PRESERVED_FLAG_DIRTY)
+			folio_mark_dirty(folio);
+
+		ret = shmem_inode_acct_blocks(inode, 1);
+		if (ret) {
+			pr_err("shmem: failed to account folio index %d: %d\n",
+			       i, ret);
+			goto unlock_folio;
+		}
+
+		shmem_recalc_inode(inode, 1, 0);
+		folio_add_lru(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	inode->i_size = *size;
+	*file_p = file;
+	folio_put(fdt_folio);
+	return 0;
+
+unlock_folio:
+	folio_unlock(folio);
+	folio_put(folio);
+put_file:
+	fput(file);
+	i++;
+put_folios:
+	for (; i < nr_pfolios; i++) {
+		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+
+		folio = kho_restore_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
+		if (folio)
+			folio_put(folio);
+	}
+
+put_fdt:
+	folio_put(fdt_folio);
+	return ret;
+}
+
+static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
+				   struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	return shmem_file(file) && !inode->i_nlink;
+}
+
+static const struct liveupdate_file_ops memfd_luo_file_ops = {
+	.prepare = memfd_luo_prepare,
+	.freeze = memfd_luo_freeze,
+	.cancel = memfd_luo_cancel,
+	.finish = memfd_luo_finish,
+	.retrieve = memfd_luo_retrieve,
+	.can_preserve = memfd_luo_can_preserve,
+	.owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler memfd_luo_handler = {
+	.ops = &memfd_luo_file_ops,
+	.compatible = memfd_luo_compatible,
+};
+
+static int __init memfd_luo_init(void)
+{
+	int err;
+
+	err = liveupdate_register_file_handler(&memfd_luo_handler);
+	if (err)
+		pr_err("Could not register luo filesystem handler: %d\n", err);
+
+	return err;
+}
+late_initcall(memfd_luo_init);
-- 
2.50.1.565.gc32cd1483b-goog
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month, 1 week ago
On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:

> +	/*
> +	 * Most of the space should be taken by preserved folios. So take its
> +	 * size, plus a page for other properties.
> +	 */
> +	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
> +	if (!fdt) {
> +		err = -ENOMEM;
> +		goto err_unpin;
> +	}

This doesn't seem to have any versioning scheme, it really should..

> +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
> +				       (void **)&preserved_folios);
> +	if (err) {
> +		pr_err("Failed to reserve folios property in FDT: %s\n",
> +		       fdt_strerror(err));
> +		err = -ENOMEM;
> +		goto err_free_fdt;
> +	}

Yuk.

This really wants some luo helper

'luo alloc array'
'luo restore array'
'luo free array'

Which would get a linearized list of pages in the vmap to hold the
array and then allocate some structure to record the page list and
return back the u64 of the phys_addr of the top of the structure to
store in whatever.

Getting fdt to allocate the array inside the fds is just not going to
work for anything of size.

> +	for (; i < nr_pfolios; i++) {
> +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> +		phys_addr_t phys;
> +		u64 index;
> +		int flags;
> +
> +		if (!pfolio->foliodesc)
> +			continue;
> +
> +		phys = PFN_PHYS(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> +		folio = kho_restore_folio(phys);
> +		if (!folio) {
> +			pr_err("Unable to restore folio at physical address: %llx\n",
> +			       phys);
> +			goto put_file;
> +		}
> +		index = pfolio->index;
> +		flags = PRESERVED_FOLIO_FLAGS(pfolio->foliodesc);
> +
> +		/* Set up the folio for insertion. */
> +		/*
> +		 * TODO: Should find a way to unify this and
> +		 * shmem_alloc_and_add_folio().
> +		 */
> +		__folio_set_locked(folio);
> +		__folio_set_swapbacked(folio);
> 
> +		ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
> +		if (ret) {
> +			pr_err("shmem: failed to charge folio index %d: %d\n",
> +			       i, ret);
> +			goto unlock_folio;
> +		}

[..]

> +		folio_add_lru(folio);
> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}

Probably some consolidation will be needed to make this less
duplicated..

But overall I think just using the memfd_luo_preserved_folio as the
serialization is entirely file, I don't think this needs anything more
complicated.

What it does need is an alternative to the FDT with versioning.

Which seems to me to be entirely fine as:

 struct memfd_luo_v0 {
    __aligned_u64 size;
    __aligned_u64 pos;
    __aligned_u64 folios;
 };

 struct memfd_luo_v0 memfd_luo_v0 = {.size = size, pos = file->f_pos, folios = folios};
 luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);

Which also shows the actual data needing to be serialized comes from
more than one struct and has to be marshaled in code, somehow, to a
single struct.

Then I imagine a fairly simple forwards/backwards story. If something
new is needed that is non-optional, lets say you compress the folios
list to optimize holes:

 struct memfd_luo_v1 {
    __aligned_u64 size;
    __aligned_u64 pos;
    __aligned_u64 folios_list_with_holes;
 };

Obviously a v0 kernel cannot parse this, but in this case a v1 aware
kernel could optionally duplicate and write out the v0 format as well:

 luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
 luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);

Then the rule is fairly simple, when the sucessor kernel goes to
deserialize it asks luo for the versions it supports:

 if (luo_restore_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1))
    restore_v1(&memfd_luo_v1)
 else if (luo_restore_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0))
    restore_v0(&memfd_luo_v0)
 else
    luo_failure("Do not understand this");

luo core just manages this list of versioned data per serialized
object. There is only one version per object.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Mike Rapoport 1 month ago
On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
> 
> > +	/*
> > +	 * Most of the space should be taken by preserved folios. So take its
> > +	 * size, plus a page for other properties.
> > +	 */
> > +	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
> > +	if (!fdt) {
> > +		err = -ENOMEM;
> > +		goto err_unpin;
> > +	}
> 
> This doesn't seem to have any versioning scheme, it really should..
> 
> > +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
> > +				       (void **)&preserved_folios);
> > +	if (err) {
> > +		pr_err("Failed to reserve folios property in FDT: %s\n",
> > +		       fdt_strerror(err));
> > +		err = -ENOMEM;
> > +		goto err_free_fdt;
> > +	}
> 
> Yuk.
> 
> This really wants some luo helper
> 
> 'luo alloc array'
> 'luo restore array'
> 'luo free array'

We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1

Will wait for kbuild and then send proper patches.
 

-- 
Sincerely yours,
Mike.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month ago
Hi Mike,

On Mon, Sep 01 2025, Mike Rapoport wrote:

> On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
>> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
>> 
>> > +	/*
>> > +	 * Most of the space should be taken by preserved folios. So take its
>> > +	 * size, plus a page for other properties.
>> > +	 */
>> > +	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
>> > +	if (!fdt) {
>> > +		err = -ENOMEM;
>> > +		goto err_unpin;
>> > +	}
>> 
>> This doesn't seem to have any versioning scheme, it really should..
>> 
>> > +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
>> > +				       (void **)&preserved_folios);
>> > +	if (err) {
>> > +		pr_err("Failed to reserve folios property in FDT: %s\n",
>> > +		       fdt_strerror(err));
>> > +		err = -ENOMEM;
>> > +		goto err_free_fdt;
>> > +	}
>> 
>> Yuk.
>> 
>> This really wants some luo helper
>> 
>> 'luo alloc array'
>> 'luo restore array'
>> 'luo free array'
>
> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
>
> Will wait for kbuild and then send proper patches.

I have been working on something similar, but in a more generic way.

I have implemented a sparse KHO-preservable array (called kho_array)
with xarray like properties. It can take in 4-byte aligned pointers and
supports saving non-pointer values similar to xa_mk_value(). For now it
doesn't support multi-index entries, but if needed the data format can
be extended to support it as well.

The structure is very similar to what you have implemented. It uses a
linked list of pages with some metadata at the head of each page.

I have used it for memfd preservation, and I think it is quite
versatile. For example, your kho_preserve_vmalloc() can be very easily
built on top of this kho_array by simply saving each physical page
address at consecutive indices in the array.

The code is still WIP and currently a bit hacky, but I will clean it up
in a couple days and I think it should be ready for posting. You can
find the current version at [0][1]. Would be good to hear your thoughts,
and if you agree with the approach, I can also port
kho_preserve_vmalloc() to work on top of kho_array as well.

[0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af
[1] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=5eb0d7316274a9c87acaeedd86941979fc4baf96

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Mike Rapoport 1 month ago
Hi Pratyush,

On Mon, Sep 01, 2025 at 07:01:38PM +0200, Pratyush Yadav wrote:
> Hi Mike,
> 
> On Mon, Sep 01 2025, Mike Rapoport wrote:
> 
> > On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
> >> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
> >> 
> >> > +	/*
> >> > +	 * Most of the space should be taken by preserved folios. So take its
> >> > +	 * size, plus a page for other properties.
> >> > +	 */
> >> > +	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
> >> > +	if (!fdt) {
> >> > +		err = -ENOMEM;
> >> > +		goto err_unpin;
> >> > +	}
> >> 
> >> This doesn't seem to have any versioning scheme, it really should..
> >> 
> >> > +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
> >> > +				       (void **)&preserved_folios);
> >> > +	if (err) {
> >> > +		pr_err("Failed to reserve folios property in FDT: %s\n",
> >> > +		       fdt_strerror(err));
> >> > +		err = -ENOMEM;
> >> > +		goto err_free_fdt;
> >> > +	}
> >> 
> >> Yuk.
> >> 
> >> This really wants some luo helper
> >> 
> >> 'luo alloc array'
> >> 'luo restore array'
> >> 'luo free array'
> >
> > We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> > https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
> >
> > Will wait for kbuild and then send proper patches.
> 
> I have been working on something similar, but in a more generic way.
> 
> I have implemented a sparse KHO-preservable array (called kho_array)
> with xarray like properties. It can take in 4-byte aligned pointers and
> supports saving non-pointer values similar to xa_mk_value(). For now it
> doesn't support multi-index entries, but if needed the data format can
> be extended to support it as well.
> 
> The structure is very similar to what you have implemented. It uses a
> linked list of pages with some metadata at the head of each page.
> 
> I have used it for memfd preservation, and I think it is quite
> versatile. For example, your kho_preserve_vmalloc() can be very easily
> built on top of this kho_array by simply saving each physical page
> address at consecutive indices in the array.

I've started to work on something similar to your kho_array for memfd case
and then I thought that since we know the size of the array we can simply
vmalloc it and preserve vmalloc, and that lead me to implementing
preservation of vmalloc :)

I like the idea to have kho_array for cases when we don't know the amount
of data to preserve in advance, but for memfd as it's currently
implemented I think that allocating and preserving vmalloc is simpler.

As for porting kho_preserve_vmalloc() to kho_array, I also feel that it
would just make kho_preserve_vmalloc() more complex and I'd rather simplify
it even more, e.g. with preallocating all the pages that preserve indices
in advance.
 
> The code is still WIP and currently a bit hacky, but I will clean it up
> in a couple days and I think it should be ready for posting. You can
> find the current version at [0][1]. Would be good to hear your thoughts,
> and if you agree with the approach, I can also port
> kho_preserve_vmalloc() to work on top of kho_array as well.
> 
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=5eb0d7316274a9c87acaeedd86941979fc4baf96
> 
> -- 
> Regards,
> Pratyush Yadav

-- 
Sincerely yours,
Mike.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month ago
Hi Mike,

On Tue, Sep 02 2025, Mike Rapoport wrote:

> Hi Pratyush,
>
> On Mon, Sep 01, 2025 at 07:01:38PM +0200, Pratyush Yadav wrote:
>> Hi Mike,
>> 
>> On Mon, Sep 01 2025, Mike Rapoport wrote:
>> 
>> > On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
>> >> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
>> >> 
>> >> > +	/*
>> >> > +	 * Most of the space should be taken by preserved folios. So take its
>> >> > +	 * size, plus a page for other properties.
>> >> > +	 */
>> >> > +	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
>> >> > +	if (!fdt) {
>> >> > +		err = -ENOMEM;
>> >> > +		goto err_unpin;
>> >> > +	}
>> >> 
>> >> This doesn't seem to have any versioning scheme, it really should..
>> >> 
>> >> > +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
>> >> > +				       (void **)&preserved_folios);
>> >> > +	if (err) {
>> >> > +		pr_err("Failed to reserve folios property in FDT: %s\n",
>> >> > +		       fdt_strerror(err));
>> >> > +		err = -ENOMEM;
>> >> > +		goto err_free_fdt;
>> >> > +	}
>> >> 
>> >> Yuk.
>> >> 
>> >> This really wants some luo helper
>> >> 
>> >> 'luo alloc array'
>> >> 'luo restore array'
>> >> 'luo free array'
>> >
>> > We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
>> > https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
>> >
>> > Will wait for kbuild and then send proper patches.
>> 
>> I have been working on something similar, but in a more generic way.
>> 
>> I have implemented a sparse KHO-preservable array (called kho_array)
>> with xarray like properties. It can take in 4-byte aligned pointers and
>> supports saving non-pointer values similar to xa_mk_value(). For now it
>> doesn't support multi-index entries, but if needed the data format can
>> be extended to support it as well.
>> 
>> The structure is very similar to what you have implemented. It uses a
>> linked list of pages with some metadata at the head of each page.
>> 
>> I have used it for memfd preservation, and I think it is quite
>> versatile. For example, your kho_preserve_vmalloc() can be very easily
>> built on top of this kho_array by simply saving each physical page
>> address at consecutive indices in the array.
>
> I've started to work on something similar to your kho_array for memfd case
> and then I thought that since we know the size of the array we can simply
> vmalloc it and preserve vmalloc, and that lead me to implementing
> preservation of vmalloc :)
>
> I like the idea to have kho_array for cases when we don't know the amount
> of data to preserve in advance, but for memfd as it's currently
> implemented I think that allocating and preserving vmalloc is simpler.
>
> As for porting kho_preserve_vmalloc() to kho_array, I also feel that it
> would just make kho_preserve_vmalloc() more complex and I'd rather simplify
> it even more, e.g. with preallocating all the pages that preserve indices
> in advance.

I think there are two parts here. One is the data format of the KHO
array and the other is the way to build it. I think the format is quite
simple and versatile, and we can have many strategies of building it.

For example, if you are only concerned with pre-allocating data, I can
very well add a way to initialize the KHO array with with a fixed size
up front.

Beyond that, I think KHO array will actually make kho_preserve_vmalloc()
simpler since it won't have to deal with the linked list traversal
logic. It can just do ka_for_each() and just get all the pages. We can
also convert the preservation bitmaps to use it so the linked list logic
is in one place, and others just build on top of it.

>  
>> The code is still WIP and currently a bit hacky, but I will clean it up
>> in a couple days and I think it should be ready for posting. You can
>> find the current version at [0][1]. Would be good to hear your thoughts,
>> and if you agree with the approach, I can also port
>> kho_preserve_vmalloc() to work on top of kho_array as well.
>> 
>> [0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=5eb0d7316274a9c87acaeedd86941979fc4baf96
>> 
>> -- 
>> Regards,
>> Pratyush Yadav

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Mike Rapoport 1 month ago
Hi Pratyush,

On Wed, Sep 03, 2025 at 04:17:15PM +0200, Pratyush Yadav wrote:
> On Tue, Sep 02 2025, Mike Rapoport wrote:
> >
> > As for porting kho_preserve_vmalloc() to kho_array, I also feel that it
> > would just make kho_preserve_vmalloc() more complex and I'd rather simplify
> > it even more, e.g. with preallocating all the pages that preserve indices
> > in advance.
> 
> I think there are two parts here. One is the data format of the KHO
> array and the other is the way to build it. I think the format is quite
> simple and versatile, and we can have many strategies of building it.
> 
> For example, if you are only concerned with pre-allocating data, I can
> very well add a way to initialize the KHO array with with a fixed size
> up front.

I wasn't concerned with preallocation vs allocating a page at a time, I
though with preallocation the vmalloc code will become even simpler, but
it's not :)
 
> Beyond that, I think KHO array will actually make kho_preserve_vmalloc()
> simpler since it won't have to deal with the linked list traversal
> logic. It can just do ka_for_each() and just get all the pages.
>
> We can also convert the preservation bitmaps to use it so the linked list
> logic is in one place, and others just build on top of it.

I disagree. The boilerplate to initialize and iterate the kho_array will
not make neither vmalloc nor bitmaps preservation simpler IMO.

And for bitmaps Pasha and Jason M. are anyway working on a different data
structure already, so if their proposal moves forward converting bitmap
preservation to anything would be a wasted effort.

> -- 
> Regards,
> Pratyush Yadav

-- 
Sincerely yours,
Mike.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month ago
Hi Mike,

On Wed, Sep 03 2025, Mike Rapoport wrote:
> On Wed, Sep 03, 2025 at 04:17:15PM +0200, Pratyush Yadav wrote:
>> On Tue, Sep 02 2025, Mike Rapoport wrote:
>> >
>> > As for porting kho_preserve_vmalloc() to kho_array, I also feel that it
>> > would just make kho_preserve_vmalloc() more complex and I'd rather simplify
>> > it even more, e.g. with preallocating all the pages that preserve indices
>> > in advance.
[...]
>  
>> Beyond that, I think KHO array will actually make kho_preserve_vmalloc()
>> simpler since it won't have to deal with the linked list traversal
>> logic. It can just do ka_for_each() and just get all the pages.
>>
>> We can also convert the preservation bitmaps to use it so the linked list
>> logic is in one place, and others just build on top of it.
>
> I disagree. The boilerplate to initialize and iterate the kho_array will
> not make neither vmalloc nor bitmaps preservation simpler IMO.

I have done 80% of the work on this already, so let's do this: I will do
the rest of the 20% and publish the patches. Then you and Jason can have
a look and if you still think it's not worth it, I am fine shelving it
for now and revisiting later when there might be a stronger case.

>
> And for bitmaps Pasha and Jason M. are anyway working on a different data
> structure already, so if their proposal moves forward converting bitmap
> preservation to anything would be a wasted effort.

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 1 month ago
On Mon, Sep 1, 2025 at 4:23 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
> > On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
> >
> > > +   /*
> > > +    * Most of the space should be taken by preserved folios. So take its
> > > +    * size, plus a page for other properties.
> > > +    */
> > > +   fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
> > > +   if (!fdt) {
> > > +           err = -ENOMEM;
> > > +           goto err_unpin;
> > > +   }
> >
> > This doesn't seem to have any versioning scheme, it really should..
> >
> > > +   err = fdt_property_placeholder(fdt, "folios", preserved_size,
> > > +                                  (void **)&preserved_folios);
> > > +   if (err) {
> > > +           pr_err("Failed to reserve folios property in FDT: %s\n",
> > > +                  fdt_strerror(err));
> > > +           err = -ENOMEM;
> > > +           goto err_free_fdt;
> > > +   }
> >
> > Yuk.
> >
> > This really wants some luo helper
> >
> > 'luo alloc array'
> > 'luo restore array'
> > 'luo free array'
>
> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1

The patch looks okay to me, but it doesn't support holes in vmap
areas. While that is likely acceptable for vmalloc, it could be a
problem if we want to preserve memfd with holes and using vmap
preservation as a method, which would require a different approach.
Still, this would help with preserving memfd.

However, I wonder if we should add a separate preservation library on
top of the kho and not as part of kho (or at least keep them in a
separate file from core logic). This would allow us to preserve more
advanced data structures such as this and define preservation version
control, similar to Jason's store_object/restore_object proposal.

>
> Will wait for kbuild and then send proper patches.
>
>
> --
> Sincerely yours,
> Mike.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Mike Rapoport 1 month ago
On Mon, Sep 01, 2025 at 04:54:15PM +0000, Pasha Tatashin wrote:
> On Mon, Sep 1, 2025 at 4:23 PM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
> > > On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
> > >
> > > > +   /*
> > > > +    * Most of the space should be taken by preserved folios. So take its
> > > > +    * size, plus a page for other properties.
> > > > +    */
> > > > +   fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
> > > > +   if (!fdt) {
> > > > +           err = -ENOMEM;
> > > > +           goto err_unpin;
> > > > +   }
> > >
> > > This doesn't seem to have any versioning scheme, it really should..
> > >
> > > > +   err = fdt_property_placeholder(fdt, "folios", preserved_size,
> > > > +                                  (void **)&preserved_folios);
> > > > +   if (err) {
> > > > +           pr_err("Failed to reserve folios property in FDT: %s\n",
> > > > +                  fdt_strerror(err));
> > > > +           err = -ENOMEM;
> > > > +           goto err_free_fdt;
> > > > +   }
> > >
> > > Yuk.
> > >
> > > This really wants some luo helper
> > >
> > > 'luo alloc array'
> > > 'luo restore array'
> > > 'luo free array'
> >
> > We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> > https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
> 
> The patch looks okay to me, but it doesn't support holes in vmap
> areas. While that is likely acceptable for vmalloc, it could be a
> problem if we want to preserve memfd with holes and using vmap
> preservation as a method, which would require a different approach.
> Still, this would help with preserving memfd.

I can't say I understand what you mean by "holes in vmap areas". We anyway
get an array of folios in memfd_pin_folios() and at that point we know
exactly how many folios there is. So we can do something like

	preserved_folios = vmalloc_array(nr_folios, sizeof(*preserved_folios));
	memfd_luo_preserve_folios(preserved_folios, folios, nr_folios);
	kho_preserve_vmalloc(preserved_folios, &folios_info);

> However, I wonder if we should add a separate preservation library on
> top of the kho and not as part of kho (or at least keep them in a
> separate file from core logic). This would allow us to preserve more
> advanced data structures such as this and define preservation version
> control, similar to Jason's store_object/restore_object proposal.

kho_preserve_vmalloc() seems quite basic and I don't think it should be
separated from kho core. kho_array is already planned in a separate file :)
 
> > Will wait for kbuild and then send proper patches.
> >
> >
> > --
> > Sincerely yours,
> > Mike.
> 

-- 
Sincerely yours,
Mike.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month ago
Hi Pasha,

On Mon, Sep 01 2025, Pasha Tatashin wrote:

> On Mon, Sep 1, 2025 at 4:23 PM Mike Rapoport <rppt@kernel.org> wrote:
>>
>> On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
>> > On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
>> >
>> > > +   /*
>> > > +    * Most of the space should be taken by preserved folios. So take its
>> > > +    * size, plus a page for other properties.
>> > > +    */
>> > > +   fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
>> > > +   if (!fdt) {
>> > > +           err = -ENOMEM;
>> > > +           goto err_unpin;
>> > > +   }
>> >
>> > This doesn't seem to have any versioning scheme, it really should..
>> >
>> > > +   err = fdt_property_placeholder(fdt, "folios", preserved_size,
>> > > +                                  (void **)&preserved_folios);
>> > > +   if (err) {
>> > > +           pr_err("Failed to reserve folios property in FDT: %s\n",
>> > > +                  fdt_strerror(err));
>> > > +           err = -ENOMEM;
>> > > +           goto err_free_fdt;
>> > > +   }
>> >
>> > Yuk.
>> >
>> > This really wants some luo helper
>> >
>> > 'luo alloc array'
>> > 'luo restore array'
>> > 'luo free array'
>>
>> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
>> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
>
> The patch looks okay to me, but it doesn't support holes in vmap
> areas. While that is likely acceptable for vmalloc, it could be a
> problem if we want to preserve memfd with holes and using vmap
> preservation as a method, which would require a different approach.
> Still, this would help with preserving memfd.

I agree. I think we should do it the other way round. Build a sparse
array first, and then use that to build vmap preservation. Our emails
seem to have crossed, but see my reply to Mike [0] that describes my
idea a bit more, along with WIP code.

[0] https://lore.kernel.org/lkml/mafs0ldmyw1hp.fsf@kernel.org/

>
> However, I wonder if we should add a separate preservation library on
> top of the kho and not as part of kho (or at least keep them in a
> separate file from core logic). This would allow us to preserve more
> advanced data structures such as this and define preservation version
> control, similar to Jason's store_object/restore_object proposal.

This is how I have done it in my code: created a separate file called
kho_array.c. If we have enough such data structures, we can probably
move it under kernel/liveupdate/lib/.

As for the store_object/restore_object proposal: see an alternate idea
at [1].

[1] https://lore.kernel.org/lkml/mafs0h5xmw12a.fsf@kernel.org/

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 1 month ago
> >> > This really wants some luo helper
> >> >
> >> > 'luo alloc array'
> >> > 'luo restore array'
> >> > 'luo free array'
> >>
> >> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> >> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
> >
> > The patch looks okay to me, but it doesn't support holes in vmap
> > areas. While that is likely acceptable for vmalloc, it could be a
> > problem if we want to preserve memfd with holes and using vmap
> > preservation as a method, which would require a different approach.
> > Still, this would help with preserving memfd.
>
> I agree. I think we should do it the other way round. Build a sparse
> array first, and then use that to build vmap preservation. Our emails

Yes, sparse array support would help both: vmalloc and memfd preservation.

> seem to have crossed, but see my reply to Mike [0] that describes my
> idea a bit more, along with WIP code.
>
> [0] https://lore.kernel.org/lkml/mafs0ldmyw1hp.fsf@kernel.org/
>
> >
> > However, I wonder if we should add a separate preservation library on
> > top of the kho and not as part of kho (or at least keep them in a
> > separate file from core logic). This would allow us to preserve more
> > advanced data structures such as this and define preservation version
> > control, similar to Jason's store_object/restore_object proposal.
>
> This is how I have done it in my code: created a separate file called
> kho_array.c. If we have enough such data structures, we can probably
> move it under kernel/liveupdate/lib/.

Yes, let's place it under kernel/liveupdate/lib/. We will add more
preservation types over time.

> As for the store_object/restore_object proposal: see an alternate idea
> at [1].
>
> [1] https://lore.kernel.org/lkml/mafs0h5xmw12a.fsf@kernel.org/

What you are proposing makes sense. We can update the LUO API to be
responsible for passing the compatible string outside of the data
payload. However, I think we first need to settle on the actual API
for storing and restoring a versioned blob of data and place that code
into kernel/liveupdate/lib/. Depending on which API we choose, we can
then modify the LUO to work accordingly.

>
> --
> Regards,
> Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month ago
On Mon, Sep 01, 2025 at 07:02:46PM +0000, Pasha Tatashin wrote:
> > >> > This really wants some luo helper
> > >> >
> > >> > 'luo alloc array'
> > >> > 'luo restore array'
> > >> > 'luo free array'
> > >>
> > >> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> > >> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
> > >
> > > The patch looks okay to me, but it doesn't support holes in vmap
> > > areas. While that is likely acceptable for vmalloc, it could be a
> > > problem if we want to preserve memfd with holes and using vmap
> > > preservation as a method, which would require a different approach.
> > > Still, this would help with preserving memfd.
> >
> > I agree. I think we should do it the other way round. Build a sparse
> > array first, and then use that to build vmap preservation. Our emails
> 
> Yes, sparse array support would help both: vmalloc and memfd preservation.

Why? vmalloc is always full popoulated, no sparseness..

And again in real systems we expect memfd to be fully populated too.

I wouldn't invest any time in something like this right now. Just be
inefficient if there is sparseness for some reason.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 1 month ago
> > > > The patch looks okay to me, but it doesn't support holes in vmap
> > > > areas. While that is likely acceptable for vmalloc, it could be a
> > > > problem if we want to preserve memfd with holes and using vmap
> > > > preservation as a method, which would require a different approach.
> > > > Still, this would help with preserving memfd.
> > >
> > > I agree. I think we should do it the other way round. Build a sparse
> > > array first, and then use that to build vmap preservation. Our emails
> >
> > Yes, sparse array support would help both: vmalloc and memfd preservation.
>
> Why? vmalloc is always full popoulated, no sparseness..

vmalloc is always fully populated, but if we add support for
preserving an area with holes, it can also be used for preserving
vmalloc. By the way, I don't like calling it *vmalloc* preservation
because we aren't preserving the original virtual addresses; we are
preserving a list of pages that are reassembled into a virtually
contiguous area. Maybe kho map, or kho page map, not sure, but vmalloc
does not sound right to me.

> And again in real systems we expect memfd to be fully populated too.

I thought so too, but we already have a use case for slightly sparse
memfd, unfortunately, that becomes *very* inefficient when fully
populated.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Mike Rapoport 1 month ago
On Wed, Sep 03, 2025 at 03:59:40PM +0000, Pasha Tatashin wrote:
> > 
> > And again in real systems we expect memfd to be fully populated too.
> 
> I thought so too, but we already have a use case for slightly sparse
> memfd, unfortunately, that becomes *very* inefficient when fully
> populated.

Wait, regardless of how sparse memfd is, once you memfd_pin_folios() the
number of folios to preserve is known and the metadata to preserve is a
fully populated array.

-- 
Sincerely yours,
Mike.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month ago
On Wed, Sep 03, 2025 at 03:59:40PM +0000, Pasha Tatashin wrote:

> vmalloc is always fully populated, but if we add support for
> preserving an area with holes, it can also be used for preserving
> vmalloc. 

Why? If you can't create it with vmap what is the point?

> By the way, I don't like calling it *vmalloc* preservation
> because we aren't preserving the original virtual addresses; we are
> preserving a list of pages that are reassembled into a virtually
> contiguous area. Maybe kho map, or kho page map, not sure, but vmalloc
> does not sound right to me.

No preservation retains the virtual address, that is pretty much
universal.

It is vmalloc preservation because the flow is

 x = vmalloc()
 kho_preserve_vmalloc(x, &preserved)
 [..]
 x = kho_restore_vmalloc(preserved)
 vfree(x)

It is the same naming as folio preservation. Upon restore you get a
vmalloc() back.

> > And again in real systems we expect memfd to be fully populated too.
> 
> I thought so too, but we already have a use case for slightly sparse
> memfd, unfortunately, that becomes *very* inefficient when fully
> populated.

Really? Why not use multiple memfds :(

So maybe you need to do optimized sparseness in memfd :(

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Chris Li 1 month ago
On Tue, Aug 26, 2025 at 9:20 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
>
> > +     /*
> > +      * Most of the space should be taken by preserved folios. So take its
> > +      * size, plus a page for other properties.
> > +      */
> > +     fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
> > +     if (!fdt) {
> > +             err = -ENOMEM;
> > +             goto err_unpin;
> > +     }
>
> This doesn't seem to have any versioning scheme, it really should..
>
> > +     err = fdt_property_placeholder(fdt, "folios", preserved_size,
> > +                                    (void **)&preserved_folios);
> > +     if (err) {
> > +             pr_err("Failed to reserve folios property in FDT: %s\n",
> > +                    fdt_strerror(err));
> > +             err = -ENOMEM;
> > +             goto err_free_fdt;
> > +     }
>
> Yuk.
>
> This really wants some luo helper
>
> 'luo alloc array'
> 'luo restore array'
> 'luo free array'

Yes, that will be one step forward.

Another idea is that having a middle layer manages the life cycle of
the reserved memory for you. Kind of like a slab allocator for the
preserved memory. It allows bulk free if there is an error on the live
update prepare(), you need to free all previously allocated memory
anyway. If there is some preserved memory that needs to stay after a
long term after the live update kernel boot up, use some special flags
to indicate so don't mix the free_all pool.
>
> Which would get a linearized list of pages in the vmap to hold the
> array and then allocate some structure to record the page list and
> return back the u64 of the phys_addr of the top of the structure to
> store in whatever.
>
> Getting fdt to allocate the array inside the fds is just not going to
> work for anything of size.
>
> > +     for (; i < nr_pfolios; i++) {
> > +             const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> > +             phys_addr_t phys;
> > +             u64 index;
> > +             int flags;
> > +
> > +             if (!pfolio->foliodesc)
> > +                     continue;
> > +
> > +             phys = PFN_PHYS(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> > +             folio = kho_restore_folio(phys);
> > +             if (!folio) {
> > +                     pr_err("Unable to restore folio at physical address: %llx\n",
> > +                            phys);
> > +                     goto put_file;
> > +             }
> > +             index = pfolio->index;
> > +             flags = PRESERVED_FOLIO_FLAGS(pfolio->foliodesc);
> > +
> > +             /* Set up the folio for insertion. */
> > +             /*
> > +              * TODO: Should find a way to unify this and
> > +              * shmem_alloc_and_add_folio().
> > +              */
> > +             __folio_set_locked(folio);
> > +             __folio_set_swapbacked(folio);
> >
> > +             ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
> > +             if (ret) {
> > +                     pr_err("shmem: failed to charge folio index %d: %d\n",
> > +                            i, ret);
> > +                     goto unlock_folio;
> > +             }
>
> [..]
>
> > +             folio_add_lru(folio);
> > +             folio_unlock(folio);
> > +             folio_put(folio);
> > +     }
>
> Probably some consolidation will be needed to make this less
> duplicated..
>
> But overall I think just using the memfd_luo_preserved_folio as the
> serialization is entirely file, I don't think this needs anything more
> complicated.
>
> What it does need is an alternative to the FDT with versioning.
>
> Which seems to me to be entirely fine as:
>
>  struct memfd_luo_v0 {
>     __aligned_u64 size;
>     __aligned_u64 pos;
>     __aligned_u64 folios;
>  };
>
>  struct memfd_luo_v0 memfd_luo_v0 = {.size = size, pos = file->f_pos, folios = folios};
>  luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
>
> Which also shows the actual data needing to be serialized comes from
> more than one struct and has to be marshaled in code, somehow, to a
> single struct.
>
> Then I imagine a fairly simple forwards/backwards story. If something
> new is needed that is non-optional, lets say you compress the folios
> list to optimize holes:
>
>  struct memfd_luo_v1 {
>     __aligned_u64 size;
>     __aligned_u64 pos;
>     __aligned_u64 folios_list_with_holes;
>  };
>
> Obviously a v0 kernel cannot parse this, but in this case a v1 aware
> kernel could optionally duplicate and write out the v0 format as well:
>
>  luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
>  luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);

Question: Do we have a matching FDT node to match the memfd C
structure hierarchy? Otherwise all the C struct will lump into one FDT
node. Maybe one FDT node for all C struct is fine. Then there is a
risk of overflowing the 4K buffer limit on the FDT node.

I would like to get independent of FDT for the versioning.

FDT on the top level sounds OK. Not ideal but workable. We are getting
deeper and deeper into complex internal data structures. Do we still
want every data structure referenced by a FDT identifier?

> Then the rule is fairly simple, when the sucessor kernel goes to
> deserialize it asks luo for the versions it supports:
>
>  if (luo_restore_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1))
>     restore_v1(&memfd_luo_v1)
>  else if (luo_restore_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0))
>     restore_v0(&memfd_luo_v0)
>  else
>     luo_failure("Do not understand this");
>
> luo core just manages this list of versioned data per serialized
> object. There is only one version per object.

Obviously, this can be done.

Is that approach you want to expand to every other C struct as well?
See the above FDT node complexity.

I am getting the feeling that we are hand crafting screws to build an
airplane. Can it be done? Of course. Does it scale well? I am not
sure. There are many developers who are currently hand-crafting this
kind of screws to be used on the different components of the airplane.

We need a machine that can stamp out screws with our specifications,
faster. I want such a machine. Other developers might want one as
well.

The initial discussion of the idea of such a machine is pretty
discouraged. There are huge communication barriers because of the
fixation on hand crafted screws. I understand exploring such machine
ideas alone might distract the engineer from hand crafting more
screws, one of them might realize that, oh, I want such a machine as
well.

At this stage, do you see that exploring such a machine idea can be
beneficial or harmful to the project? If such an idea is considered
harmful, we should stop discussing such an idea at all. Go back to
building more batches of hand crafted screws, which are waiting by the
next critical component.

Also if such a machine can produce screws up to your specification,
but it has a different look and feel than the hand crafted screws. We
can stamp out the screw faster.  Would you consider putting such a
machined screw on your most critical component of the engine?

Best Regards,

Chris
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month ago
On Fri, Aug 29, 2025 at 12:18:43PM -0700, Chris Li wrote:

> Another idea is that having a middle layer manages the life cycle of
> the reserved memory for you. Kind of like a slab allocator for the
> preserved memory. 

If you want a slab allocator then I think you should make slab
preservable.. Don't need more allocators :\

> Question: Do we have a matching FDT node to match the memfd C
> structure hierarchy? Otherwise all the C struct will lump into one FDT
> node. Maybe one FDT node for all C struct is fine. Then there is a
> risk of overflowing the 4K buffer limit on the FDT node.

I thought you were getting rid of FDT? My suggestion was to be taken
as a FDT replacement..

You need some kind of hierarchy of identifiers, things like memfd
should chain off some higher level luo object for a file descriptor.

PCI should be the same, but not fd based.

It may be that luo maintains some flat dictionary of
  string -> [object type, version, u64 ptr]*

And if you want to serialize that the optimal path would be to have a
vmalloc of all the strings and a vmalloc of the [] data, sort of like
the kho array idea.

> At this stage, do you see that exploring such a machine idea can be
> beneficial or harmful to the project? If such an idea is considered
> harmful, we should stop discussing such an idea at all. Go back to
> building more batches of hand crafted screws, which are waiting by the
> next critical component.

I haven't heard a compelling idea that will obviously make things
better.. Adding more layers and complexity is not better.

Your BTF proposal doesn't seem to benifit memfd at all, it was focused
on extracting data directly from an existing struct which I feel very
strongly we should never do.

The above dictionary, I also don't see how BTF helps. It is such a
special encoding. Yes you could make some elaborate serialization
infrastructure, like FDT, but we have all been saying FDT is too hard
to use and too much code. I'm not sure I'm convinced there is really a
better middle ground :\

IMHO if there is some way to improve this it still yet to be found,
and I think we don't well understand what we need to serialize just
yet.

Smaller ideas like preserve the vmalloc will make big improvement
already.

Lets not race ahead until we understand the actual problem properly.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Chris Li 1 month ago
On Tue, Sep 2, 2025 at 6:42 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Fri, Aug 29, 2025 at 12:18:43PM -0700, Chris Li wrote:
>
> > Another idea is that having a middle layer manages the life cycle of
> > the reserved memory for you. Kind of like a slab allocator for the
> > preserved memory.
>
> If you want a slab allocator then I think you should make slab
> preservable.. Don't need more allocators :\

Sure, we can reuse the slab allocator to add the KHO function to it. I
consider that as the implementation detail side, I haven't even
started yet. I just want to point out that we might want to have a
high level library to take care of the life cycle of the preserved
memory. Less boilerplate code for the caller.

> > Question: Do we have a matching FDT node to match the memfd C
> > structure hierarchy? Otherwise all the C struct will lump into one FDT
> > node. Maybe one FDT node for all C struct is fine. Then there is a
> > risk of overflowing the 4K buffer limit on the FDT node.
>
> I thought you were getting rid of FDT? My suggestion was to be taken
> as a FDT replacement..

Thanks for the clarification. Yes, I do want to get rid of FDT, very much so.

If we are not using FDT, adding an object might change the underlying
C structure layout causing a chain reaction of C struct change back to
the root. That is where I assume you might be still using FDT. I see
your later comments address that with a list of objects. I will
discuss it there.

> You need some kind of hierarchy of identifiers, things like memfd
> should chain off some higher level luo object for a file descriptor.

Ack.

>
> PCI should be the same, but not fd based.

Ack.

> It may be that luo maintains some flat dictionary of
>   string -> [object type, version, u64 ptr]*

I see, got it. That answers my question of how to add a new object
without changing the C structure layout. You are using a list of the
same C structure. When adding more objects to it, just add more items
to the list. This part of the boiler plate detail is not mentioned in
your original suggestion.  I understand your proposal better now.

> And if you want to serialize that the optimal path would be to have a
> vmalloc of all the strings and a vmalloc of the [] data, sort of like
> the kho array idea.

The KHO array idea is already implemented in the existing KHO code or
that is something new you want to propose?

Then we will have to know the combined size of the string up front,
similar to the FDT story. Ideally the list can incrementally add items
to it. May be stored as a list as raw pointer without vmalloc
first,then have a final pass vmalloc and serialize the string and
data.

With the additional detail above, I would like to point out something
I have observed earlier: even though the core idea of the native C
struct is simple and intuitive, the end of end implementation is not.
When we compare C struct implementation, we need to include all those
additional boilerplate details as a whole, otherwise it is not a apple
to apple comparison.

> > At this stage, do you see that exploring such a machine idea can be
> > beneficial or harmful to the project? If such an idea is considered
> > harmful, we should stop discussing such an idea at all. Go back to
> > building more batches of hand crafted screws, which are waiting by the
> > next critical component.
>
> I haven't heard a compelling idea that will obviously make things
> better.. Adding more layers and complexity is not better.

Yes, I completely understand how you reason it, and I agree with your
assessment.

I like to add to that you have been heavily discounting the
boilerplate stuff in the C struct solution. Here is where our view
point might different:
If the "more layer" has its counterpart in the C struct solution as
well, then it is not "more", it is the necessary evil. We need to
compare apples to apples.

> Your BTF proposal doesn't seem to benifit memfd at all, it was focused
> on extracting data directly from an existing struct which I feel very
> strongly we should never do.

From data flow point of view, the data is get from a C struct and
eventually store into a C struct. That is no way around that. That is
the necessary evil if you automate this process. Hey, there is also no
rule saying that you can't use a bounce buffer of some kind of manual
control in between.

It is just a way to automate stuff to reduce the boilerplate. We can
put different label on that and escalate that label or concept is bad.
Your C struct has the exact same thing pulling data from the C struct
and storing into C struct. It is just the label we are arguing. This
label is good and that label is bad. Underlying it has the similar
common necessary evil.

> The above dictionary, I also don't see how BTF helps. It is such a
> special encoding. Yes you could make some elaborate serialization
> infrastructure, like FDT, but we have all been saying FDT is too hard
> to use and too much code. I'm not sure I'm convinced there is really a

Are you ready to be connived? If you keep this as a religion you can
never be convinced.

The reason FDT is too hard to use have other reason. FDT is design to
be constructed by offline tools. In kernel mostly just read only. We
are using FDT outside of its original design parameter. It does not
mean that some thing (the machine) specially design for this purpose
can't be build and easier to use.

> better middle ground :\

With due respect, it sounds like you have the risk of judging
something you haven't fully understood. I feel that a baby, my baby,
has been thrown out with the bathwater.

As a test of water for the above statement, can you describe my idea
equal or better than I do so it passes the test of I say: "yes, this
is exactly what I am trying to build".

That is the communication barrier I am talking about. I estimate at
this rate it will take us about 15 email exchanges to get to the core
stuff. It might be much quicker to lock you and me in a room, Only
release us when you and I can describe each other's viewpoint at a
mutual satisfactory level. I understand your time is precious, and I
don't want to waste your time. I fully respect and comply with your
decision. If you want me to stop now, I can stop. No question asked.

That gets back to my original question, do we already have a ruling
that even the discussion of "the machine" idea is forbidden.

> IMHO if there is some way to improve this it still yet to be found,

In my mind, I have found it. I have to get over the communication
barrier to plead my case to you. You can issue a preliminary ruling to
dismiss my case. I just wish you fully understood the case facts
before you make such a ruling.

> and I think we don't well understand what we need to serialize just
> yet.

That may be true, we don't have 100% understanding of what needs to be
serialized.  On the other hand, it is not 0% either. Based on what we
understand, we can already use "the machine" to help us do what we
know much more effectively. Of course, there is a trade off for
developing "the machine". It takes extra time and the complexity to
maintain such a machine. I fully understand that.

> Smaller ideas like preserve the vmalloc will make big improvement
> already.

Yes, I totally agree. It is a local optimization we can do, it might
not be the global optimized though. "the machine" might not use
vmalloc at all, all this small incremental change will be throw away
once we have "the machine".

I put this situation in the airplane story, yes, we build diamond
plated filers to produce the hand craft screws faster. The missing
opportunity is that, if we have "the machine" earlier, we can pump out
machined screws much faster at scale, minus the time to build the
machine, it might still be an overall win. We don't need to use
diamond plated filter if we have the machine.

> Lets not race ahead until we understand the actual problem properly.

Is that the final ruling? It feels like so. Just clarifying what I am receiving.

I feel a much stronger sense of urgency than you though.  The stakes
are high, currently you already have four departments can use this
common serialization library right now:
1) PCI
2) VFIO
3) IOMMU
4) Memfd.

We are getting into the more complex data structures. If we merge this
into the mainline, it is much harder to pull them out later.
Basically, this is a done deal. That is why I am putting my reputation
and my job on the line to pitch "the machine" idea. It is a very risky
move, I fully understand that.

Chris
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month ago
On Wed, Sep 03, 2025 at 05:01:15AM -0700, Chris Li wrote:

> > And if you want to serialize that the optimal path would be to have a
> > vmalloc of all the strings and a vmalloc of the [] data, sort of like
> > the kho array idea.
> 
> The KHO array idea is already implemented in the existing KHO code or
> that is something new you want to propose?

Pratyush has proposed it

> Then we will have to know the combined size of the string up front,
> similar to the FDT story. Ideally the list can incrementally add items
> to it. May be stored as a list as raw pointer without vmalloc
> first,then have a final pass vmalloc and serialize the string and
> data.

There are many options, and the dynamic extendability from the KHO
array might be a good fit here. But you can also just store the
serializations in a linked list and then write them out.

> With the additional detail above, I would like to point out something
> I have observed earlier: even though the core idea of the native C
> struct is simple and intuitive, the end of end implementation is not.
> When we compare C struct implementation, we need to include all those
> additional boilerplate details as a whole, otherwise it is not a apple
> to apple comparison.

You need all of this anyhow, BTF doesn't create version meta data,
evaluate which version are suitable, or de-serialize complex rbtree or
linked lists structures.

> > Your BTF proposal doesn't seem to benifit memfd at all, it was focused
> > on extracting data directly from an existing struct which I feel very
> > strongly we should never do.
> 
> From data flow point of view, the data is get from a C struct and
> eventually store into a C struct. That is no way around that. That is
> the necessary evil if you automate this process. Hey, there is also no
> rule saying that you can't use a bounce buffer of some kind of manual
> control in between.

Yeah but if I already wrote the code to make the required C struct
there only difference is 'memcpy c struct' vs 'serialze with btf c
struct' and that isn't meaningful.

If the boilerplate is around arrays of C structs and things then the
KHO array proposal is a good direction to de-duplicate code.

> It is just a way to automate stuff to reduce the boilerplate. 

You haven't clearly spelled out what the boilerplate even is, this was
my feedback to you to be very clear on what is being improved.

> I feel a much stronger sense of urgency than you though.  The stakes
> are high, currently you already have four departments can use this
> common serialization library right now:
> 1) PCI
> 2) VFIO
> 3) IOMMU
> 4) Memfd.

We don't know what they actually need to write out, we haven't seen
any patches. 

Let's start with simple patches and deal with the fundamental problems
like versioning, then you can come with ideas to optimize if it turns
out there is something to improve here.

I'm not convinced PCI (a few bits per struct pci_device to start),
memfd (xarray) and IOMMU (dictionaries of HW physical pointers) share
a significant overlap of serialization requirements beyond luo level
managing the objects and versioning.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 3 weeks, 4 days ago
On Thu, Sep 04 2025, Jason Gunthorpe wrote:

> On Wed, Sep 03, 2025 at 05:01:15AM -0700, Chris Li wrote:
>
>> > And if you want to serialize that the optimal path would be to have a
>> > vmalloc of all the strings and a vmalloc of the [] data, sort of like
>> > the kho array idea.
>> 
>> The KHO array idea is already implemented in the existing KHO code or
>> that is something new you want to propose?
>
> Pratyush has proposed it

I just sent out the RFC:
https://lore.kernel.org/linux-mm/20250909144426.33274-1-pratyush@kernel.org/T/#u

[...]

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Mike Rapoport 1 month, 1 week ago
On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
> 
> > +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
> > +				       (void **)&preserved_folios);
> > +	if (err) {
> > +		pr_err("Failed to reserve folios property in FDT: %s\n",
> > +		       fdt_strerror(err));
> > +		err = -ENOMEM;
> > +		goto err_free_fdt;
> > +	}
> 
> Yuk.
> 
> This really wants some luo helper
> 
> 'luo alloc array'
> 'luo restore array'
> 'luo free array'
> 
> Which would get a linearized list of pages in the vmap to hold the
> array and then allocate some structure to record the page list and
> return back the u64 of the phys_addr of the top of the structure to
> store in whatever.
> 
> Getting fdt to allocate the array inside the fds is just not going to
> work for anything of size.

I agree that we need a side-car structure for preserving large (potentially
sparse) arrays, but I think it should be a part of KHO rather than LUO.

-- 
Sincerely yours,
Mike.
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Chris Li 1 month ago
On Thu, Aug 28, 2025 at 12:14 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
> > On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
> >
> > > +   err = fdt_property_placeholder(fdt, "folios", preserved_size,
> > > +                                  (void **)&preserved_folios);
> > > +   if (err) {
> > > +           pr_err("Failed to reserve folios property in FDT: %s\n",
> > > +                  fdt_strerror(err));
> > > +           err = -ENOMEM;
> > > +           goto err_free_fdt;
> > > +   }
> >
> > Yuk.
> >
> > This really wants some luo helper
> >
> > 'luo alloc array'
> > 'luo restore array'
> > 'luo free array'
> >
> > Which would get a linearized list of pages in the vmap to hold the
> > array and then allocate some structure to record the page list and
> > return back the u64 of the phys_addr of the top of the structure to
> > store in whatever.
> >
> > Getting fdt to allocate the array inside the fds is just not going to
> > work for anything of size.
>
> I agree that we need a side-car structure for preserving large (potentially
> sparse) arrays, but I think it should be a part of KHO rather than LUO.

I agree this can be used by components outside of LUO as well. Ideally
as some helper library so every component can use it. I don't have a
strong opinion on KHO or the stand alone library. I am fine with both.

Chris
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month, 1 week ago
Hi Jason,

Thanks for the review.

On Tue, Aug 26 2025, Jason Gunthorpe wrote:

> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
>
>> +	/*
>> +	 * Most of the space should be taken by preserved folios. So take its
>> +	 * size, plus a page for other properties.
>> +	 */
>> +	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
>> +	if (!fdt) {
>> +		err = -ENOMEM;
>> +		goto err_unpin;
>> +	}
>
> This doesn't seem to have any versioning scheme, it really should..

It does. See the "compatible" property.

    static const char memfd_luo_compatible[] = "memfd-v1";

static struct liveupdate_file_handler memfd_luo_handler = {
	.ops = &memfd_luo_file_ops,
	.compatible = memfd_luo_compatible,
};

This goes into the LUO FDT:

	static int luo_files_to_fdt(struct xarray *files_xa_out)
	[...]
	xa_for_each(files_xa_out, token, h) {
		[...]
		ret = fdt_property_string(luo_file_fdt_out, "compatible",
					  h->fh->compatible);

So this function only gets called for the version 1.

>
>> +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
>> +				       (void **)&preserved_folios);
>> +	if (err) {
>> +		pr_err("Failed to reserve folios property in FDT: %s\n",
>> +		       fdt_strerror(err));
>> +		err = -ENOMEM;
>> +		goto err_free_fdt;
>> +	}
>
> Yuk.
>
> This really wants some luo helper
>
> 'luo alloc array'
> 'luo restore array'
> 'luo free array'
>
> Which would get a linearized list of pages in the vmap to hold the
> array and then allocate some structure to record the page list and
> return back the u64 of the phys_addr of the top of the structure to
> store in whatever.
>
> Getting fdt to allocate the array inside the fds is just not going to
> work for anything of size.

Yep, I agree. This version already runs into size limits of around 1 GiB
due to the FDT being limited to MAX_PAGE_ORDER, since that is the
largest contiguous piece of memory folio_alloc() can give us. On top,
FDT is only limited to 32 bits. While very large, it isn't unreasonable
to expect metadata exceeding that for some use cases (4 GiB is only 0.4%
of 1 TiB and there are systems a lot larger than that around).

I think we need something a luo_xarray data structure that users like
memfd (and later hugetlb and guest_memfd and maybe others) can build to
make serialization easier. It will cover both contiguous arrays and
arrays with some holes in them.

I did it this way mainly to keep things simple and get things out. But
Pasha already mentioned he is running into this limit for some tests, so
I think I will experiment around with a serialized xarray design.

>
>> +	for (; i < nr_pfolios; i++) {
>> +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> +		phys_addr_t phys;
>> +		u64 index;
>> +		int flags;
>> +
>> +		if (!pfolio->foliodesc)
>> +			continue;
>> +
>> +		phys = PFN_PHYS(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> +		folio = kho_restore_folio(phys);
>> +		if (!folio) {
>> +			pr_err("Unable to restore folio at physical address: %llx\n",
>> +			       phys);
>> +			goto put_file;
>> +		}
>> +		index = pfolio->index;
>> +		flags = PRESERVED_FOLIO_FLAGS(pfolio->foliodesc);
>> +
>> +		/* Set up the folio for insertion. */
>> +		/*
>> +		 * TODO: Should find a way to unify this and
>> +		 * shmem_alloc_and_add_folio().
>> +		 */
>> +		__folio_set_locked(folio);
>> +		__folio_set_swapbacked(folio);
>> 
>> +		ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
>> +		if (ret) {
>> +			pr_err("shmem: failed to charge folio index %d: %d\n",
>> +			       i, ret);
>> +			goto unlock_folio;
>> +		}
>
> [..]
>
>> +		folio_add_lru(folio);
>> +		folio_unlock(folio);
>> +		folio_put(folio);
>> +	}
>
> Probably some consolidation will be needed to make this less
> duplicated..

Maybe. I do have that as a TODO item, but I took a quick look today and
I am not sure if it will make things simple enough. There are a few
places that add a folio to the shmem page cache, and all of them have
subtle differences and consolidating them all might be tricky. Let me
give it a shot...

>
> But overall I think just using the memfd_luo_preserved_folio as the
> serialization is entirely file, I don't think this needs anything more
> complicated.
>
> What it does need is an alternative to the FDT with versioning.

As I explained above, the versioning is already there. Beyond that, why
do you think a raw C struct is better than FDT? It is just another way
of expressing the same information. FDT is a bit more cumbersome to
write and read, but comes at the benefit of more introspect-ability.

>
> Which seems to me to be entirely fine as:
>
>  struct memfd_luo_v0 {
>     __aligned_u64 size;
>     __aligned_u64 pos;
>     __aligned_u64 folios;
>  };
>
>  struct memfd_luo_v0 memfd_luo_v0 = {.size = size, pos = file->f_pos, folios = folios};
>  luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
>
> Which also shows the actual data needing to be serialized comes from
> more than one struct and has to be marshaled in code, somehow, to a
> single struct.
>
> Then I imagine a fairly simple forwards/backwards story. If something
> new is needed that is non-optional, lets say you compress the folios
> list to optimize holes:
>
>  struct memfd_luo_v1 {
>     __aligned_u64 size;
>     __aligned_u64 pos;
>     __aligned_u64 folios_list_with_holes;
>  };
>
> Obviously a v0 kernel cannot parse this, but in this case a v1 aware
> kernel could optionally duplicate and write out the v0 format as well:
>
>  luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
>  luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);

I think what you describe here is essentially how LUO works currently,
just that the mechanisms are a bit different.

For example, instead of the subsystem calling luo_store_object(), the
LUO core calls back into the subsystem at the appropriate time to let it
populate the object. See memfd_luo_prepare() and the data argument. The
version is decided by the compatible string with which the handler was
registered.

Since LUO knows when to start serializing what, I think this flow of
calling into the subsystem and letting it fill in an object that LUO
tracks and hands over makes a lot of sense.

>
> Then the rule is fairly simple, when the sucessor kernel goes to
> deserialize it asks luo for the versions it supports:
>
>  if (luo_restore_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1))
>     restore_v1(&memfd_luo_v1)
>  else if (luo_restore_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0))
>     restore_v0(&memfd_luo_v0)
>  else
>     luo_failure("Do not understand this");

Similarly, on restore side, the new kernel can register handlers of all
the versions it can deal with, and LUO core takes care of calling into
the right callback. See  memfd_luo_retrieve() for example. If we now have
a v2, the new kernel can simply define a new handler for v2 and add a
new memfd_luo_retrieve_v2().

>
> luo core just manages this list of versioned data per serialized
> object. There is only one version per object.

This also holds true.

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month, 1 week ago
On Wed, Aug 27, 2025 at 05:03:55PM +0200, Pratyush Yadav wrote:

> I think we need something a luo_xarray data structure that users like
> memfd (and later hugetlb and guest_memfd and maybe others) can build to
> make serialization easier. It will cover both contiguous arrays and
> arrays with some holes in them.

I'm not sure xarray is the right way to go, it is very complex data
structure and building a kho variation of it seems like it is a huge
amount of work.

I'd stick with simple kvalloc type approaches until we really run into
trouble.

You can always map a sparse xarray into a kvalloc linear list by
including the xarray index in each entry.

Especially for memfd where we don't actually expect any sparsity in
real uses cases there is no reason to invest a huge effort to optimize
for it..

> As I explained above, the versioning is already there. Beyond that, why
> do you think a raw C struct is better than FDT? It is just another way
> of expressing the same information. FDT is a bit more cumbersome to
> write and read, but comes at the benefit of more introspect-ability.

Doesn't have the size limitations, is easier to work list, runs
faster.

> >  luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
> >  luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);
> 
> I think what you describe here is essentially how LUO works currently,
> just that the mechanisms are a bit different.

The bit different is a very important bit though :)

The versioning should be first class, not hidden away as some emergent
property of registering multiple serializers or something like that.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month ago
Hi Jason,

On Thu, Aug 28 2025, Jason Gunthorpe wrote:

> On Wed, Aug 27, 2025 at 05:03:55PM +0200, Pratyush Yadav wrote:
>
>> I think we need something a luo_xarray data structure that users like
>> memfd (and later hugetlb and guest_memfd and maybe others) can build to
>> make serialization easier. It will cover both contiguous arrays and
>> arrays with some holes in them.
>
> I'm not sure xarray is the right way to go, it is very complex data
> structure and building a kho variation of it seems like it is a huge
> amount of work.
>
> I'd stick with simple kvalloc type approaches until we really run into
> trouble.
>
> You can always map a sparse xarray into a kvalloc linear list by
> including the xarray index in each entry.
>
> Especially for memfd where we don't actually expect any sparsity in
> real uses cases there is no reason to invest a huge effort to optimize
> for it..

Full xarray is too complex, sure. But I think a simple sparse array with
xarray-like properties (4-byte pointers, values using xa_mk_value()) is
fairly simple to implement. More advanced features of xarray like
multi-index entries can be added later if needed.

In fact, I have a WIP version of such an array and have used it for
memfd preservation, and it looks quite alright to me. You can find the
code at [0]. It is roughly 300 lines of code. I still need to clean it
up to make it post-able, but it does work.

Building kvalloc on top of this becomes trivial.

[0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af

>
>> As I explained above, the versioning is already there. Beyond that, why
>> do you think a raw C struct is better than FDT? It is just another way
>> of expressing the same information. FDT is a bit more cumbersome to
>> write and read, but comes at the benefit of more introspect-ability.
>
> Doesn't have the size limitations, is easier to work list, runs
> faster.
>
>> >  luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
>> >  luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);
>> 
>> I think what you describe here is essentially how LUO works currently,
>> just that the mechanisms are a bit different.
>
> The bit different is a very important bit though :)
>
> The versioning should be first class, not hidden away as some emergent
> property of registering multiple serializers or something like that.

That makes sense. How about some simple changes to the LUO interfaces to
make the version more prominent:

	int (*prepare)(struct liveupdate_file_handler *handler,
		       struct file *file, u64 *data, char **compatible);

This lets the subsystem fill in the compatible (AKA version) (string
here, but you can make it an integer if you want) when it serialized its
data.

And on restore side, LUO can pass in the compatible:

	int (*retrieve)(struct liveupdate_file_handler *handler,
			u64 data, char *compatible, struct file **file);


-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month ago
On Mon, Sep 01, 2025 at 07:10:53PM +0200, Pratyush Yadav wrote:
> Building kvalloc on top of this becomes trivial.
> 
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af

This isn't really an array, it is a non-seekable serialization of
key/values with some optimization for consecutive keys. IMHO it is
most useful if you don't know the size of the thing you want to
serialize in advance since it has a nice dynamic append.

But if you do know the size, I think it makes more sense just to do a
preserving vmalloc and write out a linear array..

So, it could be useful, but I wouldn't use it for memfd, the vmalloc
approach is better and we shouldn't optimize for sparsness which
should never happen.

> > The versioning should be first class, not hidden away as some emergent
> > property of registering multiple serializers or something like that.
> 
> That makes sense. How about some simple changes to the LUO interfaces to
> make the version more prominent:
> 
> 	int (*prepare)(struct liveupdate_file_handler *handler,
> 		       struct file *file, u64 *data, char **compatible);

Yeah, something more integrated with the ops is better.

You could list the supported versions in the ops itself

  const char **supported_deserialize_versions;

And let the luo framework find the right versions.

But for prepare I would expect an inbetween object:

	int (*prepare)(struct liveupdate_file_handler *handler,
	    	       struct luo_object *obj, struct file *file);

And then you'd do function calls on 'obj' to store 'data' per version.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month ago
Hi Jason,

On Tue, Sep 02 2025, Jason Gunthorpe wrote:

> On Mon, Sep 01, 2025 at 07:10:53PM +0200, Pratyush Yadav wrote:
>> Building kvalloc on top of this becomes trivial.
>> 
>> [0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af
>
> This isn't really an array, it is a non-seekable serialization of
> key/values with some optimization for consecutive keys. IMHO it is

Sure, an array is not the best name for the thing. Call it whatever,
maybe a "sparse collection of pointers". But I hope you get the idea.

> most useful if you don't know the size of the thing you want to
> serialize in advance since it has a nice dynamic append.
>
> But if you do know the size, I think it makes more sense just to do a
> preserving vmalloc and write out a linear array..

I think there are two separate parts here. One is the data format and
the other is the data builder.

The format itself is quite simple. It is a linked list of discontiguous
pages that holds a set of pointers. We use that idea already for the
preserved pages bitmap. Mike's vmalloc preservation patches also use the
same idea, just with a small variation.

The builder part (ka_iter in my patches) is an abstraction on top to
build the data structure. I designed it with the nice dynamic append
property since it seemed like a nice and convenient design, but we can
have it define the size statically as well. The underlying data format
won't change.

>
> So, it could be useful, but I wouldn't use it for memfd, the vmalloc
> approach is better and we shouldn't optimize for sparsness which
> should never happen.

I disagree. I think we are re-inventing the same data format with minor
variations. I think we should define extensible fundamental data formats
first, and then use those as the building blocks for the rest of our
serialization logic.

I think KHO array does exactly that. It provides the fundamental
serialization for a collection of pointers, and other serialization use
cases can then build on top of it. For example, the preservation bitmaps
can get rid of their linked list logic and just use KHO array to hold
and retrieve its bitmaps. It will make the serialization simpler.
Similar argument for vmalloc preservation.

I also don't get why you think sparseness "should never happen". For
memfd for example, you say in one of your other emails that "And again
in real systems we expect memfd to be fully populated too." Which
systems and use cases do you have in mind? Why do you think people won't
want a sparse memfd?

And finally, from a data format perspective, the sparseness only adds a
small bit of complexity (the startpos for each kho_array_page).
Everything else is practically the same as a continuous array.

All in all, I think KHO array is going to prove useful and will make
serialization for subsystems easier. I think sparseness will also prove
useful but it is not a hill I want to die on. I am fine with starting
with a non-sparse array if people really insist. But I do think we
should go with KHO array as a base instead of re-inventing the linked
list of pages again and again.

>
>> > The versioning should be first class, not hidden away as some emergent
>> > property of registering multiple serializers or something like that.
>> 
>> That makes sense. How about some simple changes to the LUO interfaces to
>> make the version more prominent:
>> 
>> 	int (*prepare)(struct liveupdate_file_handler *handler,
>> 		       struct file *file, u64 *data, char **compatible);
>
> Yeah, something more integrated with the ops is better.
>
> You could list the supported versions in the ops itself
>
>   const char **supported_deserialize_versions;
>
> And let the luo framework find the right versions.
>
> But for prepare I would expect an inbetween object:
>
> 	int (*prepare)(struct liveupdate_file_handler *handler,
> 	    	       struct luo_object *obj, struct file *file);
>
> And then you'd do function calls on 'obj' to store 'data' per version.

What do you mean by "data per version"? I think there should be only one
version of the serialized object. Multiple versions of the same thing
will get ugly real quick.

Other than that, I think this could work well. I am guessing luo_object
stores the version and gives us a way to query it on the other side. I
think if we are letting LUO manage supported versions, it should be
richer than just a list of strings. I think it should include a ops
structure for deserializing each version. That would encapsulate the
versioning more cleanly.

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month ago
On Wed, Sep 03, 2025 at 04:10:37PM +0200, Pratyush Yadav wrote:

> > So, it could be useful, but I wouldn't use it for memfd, the vmalloc
> > approach is better and we shouldn't optimize for sparsness which
> > should never happen.
> 
> I disagree. I think we are re-inventing the same data format with minor
> variations. I think we should define extensible fundamental data formats
> first, and then use those as the building blocks for the rest of our
> serialization logic.

page, vmalloc, slab seem to me to be the fundamental units of memory
management in linux, so they should get KHO support.

If you want to preserve a known-sized array you use vmalloc and then
write out the per-list items. If it is a dictionary/sparse array then
you write an index with each item too. This is all trivial and doesn't
really need more abstraction in of itself, IMHO.

> cases can then build on top of it. For example, the preservation bitmaps
> can get rid of their linked list logic and just use KHO array to hold
> and retrieve its bitmaps. It will make the serialization simpler.

I don't think the bitmaps should, the serialization here is very
special because it is not actually preserved, it just exists for the
time while the new kernel runs in scratch and is insta freed once the
allocators start up.

> I also don't get why you think sparseness "should never happen". For
> memfd for example, you say in one of your other emails that "And again
> in real systems we expect memfd to be fully populated too." Which
> systems and use cases do you have in mind? Why do you think people won't
> want a sparse memfd?

memfd should principally be used to back VM memory, and I expect VM
memory to be fully populated. Why would it be sparse?

> All in all, I think KHO array is going to prove useful and will make
> serialization for subsystems easier. I think sparseness will also prove
> useful but it is not a hill I want to die on. I am fine with starting
> with a non-sparse array if people really insist. But I do think we
> should go with KHO array as a base instead of re-inventing the linked
> list of pages again and again.

The two main advantages I see to the kho array design vs vmalloc is
that it should be a bit faster as it doesn't establish a vmap, and it
handles unknown size lists much better.

Are these important considerations? IDK.

As I said to Chris, I think we should see more examples of what we
actually need before assuming any certain datastructure is the best
choice.

So I'd stick to simpler open coded things and go back and improve them
than start out building the wrong shared data structure.

How about have at least three luo clients that show meaningful benefit
before proposing something beyond the fundamental page, vmalloc, slab
things?

> What do you mean by "data per version"? I think there should be only one
> version of the serialized object. Multiple versions of the same thing
> will get ugly real quick.

If you want to support backwards/forwards compatability then you
probably should support multiple versions as well. Otherwise it
could become quite hard to make downgrades..

Ideally I'd want to remove the upstream code for obsolete versions
fairly quickly so I'd imagine kernels will want to generate both
versions during the transition period and then eventually newer
kernels will only accept the new version.

I've argued before that the extended matrix of any kernel version to
any other kernel version should lie with the distro/CSP making the
kernel fork. They know what their upgrade sequence will be so they can
manage any missing versions to make it work.

Upstream should do like v6.1 to v6.2 only or something similarly well
constrained. I think this is a reasonable trade off to get subsystem
maintainers to even accept this stuff at all.

> Other than that, I think this could work well. I am guessing luo_object
> stores the version and gives us a way to query it on the other side. I
> think if we are letting LUO manage supported versions, it should be
> richer than just a list of strings. I think it should include a ops
> structure for deserializing each version. That would encapsulate the
> versioning more cleanly.

Yeah, sounds about right

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month ago
Hi Jason,

On Wed, Sep 03 2025, Jason Gunthorpe wrote:

> On Wed, Sep 03, 2025 at 04:10:37PM +0200, Pratyush Yadav wrote:
>
>> > So, it could be useful, but I wouldn't use it for memfd, the vmalloc
>> > approach is better and we shouldn't optimize for sparsness which
>> > should never happen.
>> 
>> I disagree. I think we are re-inventing the same data format with minor
>> variations. I think we should define extensible fundamental data formats
>> first, and then use those as the building blocks for the rest of our
>> serialization logic.
>
> page, vmalloc, slab seem to me to be the fundamental units of memory
> management in linux, so they should get KHO support.
>
> If you want to preserve a known-sized array you use vmalloc and then
> write out the per-list items. If it is a dictionary/sparse array then
> you write an index with each item too. This is all trivial and doesn't
> really need more abstraction in of itself, IMHO.

We will use up double the space for tracking metadata, but maybe that is
fine until we start seeing bigger memfds in real workloads.

>
>> cases can then build on top of it. For example, the preservation bitmaps
>> can get rid of their linked list logic and just use KHO array to hold
>> and retrieve its bitmaps. It will make the serialization simpler.
>
> I don't think the bitmaps should, the serialization here is very
> special because it is not actually preserved, it just exists for the
> time while the new kernel runs in scratch and is insta freed once the
> allocators start up.

I don't think it matters if they are preserved or not. The serialization
and deserialization is independent of that. You can very well create a
KHO array that you don't KHO-preserve. On next boot, you can still use
it, you just have to be careful of doing it while scratch-only. Same as
we do now.

>
>> I also don't get why you think sparseness "should never happen". For
>> memfd for example, you say in one of your other emails that "And again
>> in real systems we expect memfd to be fully populated too." Which
>> systems and use cases do you have in mind? Why do you think people won't
>> want a sparse memfd?
>
> memfd should principally be used to back VM memory, and I expect VM
> memory to be fully populated. Why would it be sparse?

For the _hypervisor_ live update case, sure. Though even there, I have a
feeling we will start seeing userspace components on the hypervisor use
memfd for stashing some of their state. Pasha has already mentioned they
have a use case for a memfd that is not VM memory.

But hypervisor live upadte isn't the only use case for LUO. We are
looking at enabling state preservation for "normal" userspace
applications. Think big storage nodes with memory in order of TiB. Those
can use a memfd to back their caches so on a kernel upgrade the caches
don't have to be re-fetched. Sparseness is to be expected for such use
cases.

>
>> All in all, I think KHO array is going to prove useful and will make
>> serialization for subsystems easier. I think sparseness will also prove
>> useful but it is not a hill I want to die on. I am fine with starting
>> with a non-sparse array if people really insist. But I do think we
>> should go with KHO array as a base instead of re-inventing the linked
>> list of pages again and again.
>
> The two main advantages I see to the kho array design vs vmalloc is
> that it should be a bit faster as it doesn't establish a vmap, and it
> handles unknown size lists much better.
>
> Are these important considerations? IDK.
>
> As I said to Chris, I think we should see more examples of what we
> actually need before assuming any certain datastructure is the best
> choice.
>
> So I'd stick to simpler open coded things and go back and improve them
> than start out building the wrong shared data structure.
>
> How about have at least three luo clients that show meaningful benefit
> before proposing something beyond the fundamental page, vmalloc, slab
> things?

I think the fundamentals themselves get some benefit. But anyway, since
I have done most of the work on this feature anyway, I will do the rest
and send the patches out. Then you can have a look and if you're still
not convinced, I am fine shelving it for now to revisit later when a
stronger case can be made.

>
>> What do you mean by "data per version"? I think there should be only one
>> version of the serialized object. Multiple versions of the same thing
>> will get ugly real quick.
>
> If you want to support backwards/forwards compatability then you
> probably should support multiple versions as well. Otherwise it
> could become quite hard to make downgrades..

Hmm, forward can work regardless since a newer kernel should speak older
formats too, but for backwards it makes sense to have an older version.

But perhaps it might be a better idea to come up with a mechanism for
the kernel to discover which formats the "next" kernel speaks so it can
for one decide whether it can do the live update at all, and for another
which formats it should use. Maybe we give a way for luod to choose
formats, and give it the responsibility for doing these checks?

>
> Ideally I'd want to remove the upstream code for obsolete versions
> fairly quickly so I'd imagine kernels will want to generate both
> versions during the transition period and then eventually newer
> kernels will only accept the new version.
>
> I've argued before that the extended matrix of any kernel version to
> any other kernel version should lie with the distro/CSP making the
> kernel fork. They know what their upgrade sequence will be so they can
> manage any missing versions to make it work.
>
> Upstream should do like v6.1 to v6.2 only or something similarly well
> constrained. I think this is a reasonable trade off to get subsystem
> maintainers to even accept this stuff at all.
[...]

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month ago
On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote:

> I don't think it matters if they are preserved or not. The serialization
> and deserialization is independent of that. You can very well create a
> KHO array that you don't KHO-preserve. On next boot, you can still use
> it, you just have to be careful of doing it while scratch-only. Same as
> we do now.

The KHO array machinery itself can't preserve its own memory
either.

> For the _hypervisor_ live update case, sure. Though even there, I have a
> feeling we will start seeing userspace components on the hypervisor use
> memfd for stashing some of their state. 

Sure, but don't make excessively sparse memfds for kexec use, why
should that be hard?

> applications. Think big storage nodes with memory in order of TiB. Those
> can use a memfd to back their caches so on a kernel upgrade the caches
> don't have to be re-fetched. Sparseness is to be expected for such use
> cases.

Oh? I'm surpised you'd have sparseness there. sparseness seems like
such a weird feature to want to rely on :\

> But perhaps it might be a better idea to come up with a mechanism for
> the kernel to discover which formats the "next" kernel speaks so it can
> for one decide whether it can do the live update at all, and for another
> which formats it should use. Maybe we give a way for luod to choose
> formats, and give it the responsibility for doing these checks?

I have felt that we should catalog the formats&versions the kernel can
read/write in some way during kbuild.

Maybe this turns into a sysfs directory of all the data with an
'enable_write' flag that luod could set to 0 to optimize.

And maybe this could be a kbuild report that luod could parse to do
this optimization.

And maybe distro/csps use this information mechanically to check if
version pairs are kexec compatible.

Which re-enforces my feeling that the formats/version should be first
class concepts, every version should be registered and luo should
sequence calling the code for the right version at the right time.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 3 weeks, 4 days ago
On Thu, Sep 04 2025, Jason Gunthorpe wrote:

> On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote:
>
>> I don't think it matters if they are preserved or not. The serialization
>> and deserialization is independent of that. You can very well create a
>> KHO array that you don't KHO-preserve. On next boot, you can still use
>> it, you just have to be careful of doing it while scratch-only. Same as
>> we do now.
>
> The KHO array machinery itself can't preserve its own memory
> either.

It can. Maybe it couldn't in the version I showed you, but now it can.
See kho_array_preserve() in
https://lore.kernel.org/linux-mm/20250909144426.33274-2-pratyush@kernel.org/

>
>> For the _hypervisor_ live update case, sure. Though even there, I have a
>> feeling we will start seeing userspace components on the hypervisor use
>> memfd for stashing some of their state. 
>
> Sure, but don't make excessively sparse memfds for kexec use, why
> should that be hard?

Sure, I don't think they should be excessively sparse. But _some_ level
of sparseness can be there.

>
>> applications. Think big storage nodes with memory in order of TiB. Those
>> can use a memfd to back their caches so on a kernel upgrade the caches
>> don't have to be re-fetched. Sparseness is to be expected for such use
>> cases.
>
> Oh? I'm surpised you'd have sparseness there. sparseness seems like
> such a weird feature to want to rely on :\
>
>> But perhaps it might be a better idea to come up with a mechanism for
>> the kernel to discover which formats the "next" kernel speaks so it can
>> for one decide whether it can do the live update at all, and for another
>> which formats it should use. Maybe we give a way for luod to choose
>> formats, and give it the responsibility for doing these checks?
>
> I have felt that we should catalog the formats&versions the kernel can
> read/write in some way during kbuild.
>
> Maybe this turns into a sysfs directory of all the data with an
> 'enable_write' flag that luod could set to 0 to optimize.
>
> And maybe this could be a kbuild report that luod could parse to do
> this optimization.

Or maybe we put that information in a ELF section in the kernel image?
Not sure how feasible it would be for tooling to read but I think that
would very closely associate the versions info with the kernel. The
other option might be to put it somewhere with modules I guess.

>
> And maybe distro/csps use this information mechanically to check if
> version pairs are kexec compatible.
>
> Which re-enforces my feeling that the formats/version should be first
> class concepts, every version should be registered and luo should
> sequence calling the code for the right version at the right time.
>
> Jason

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 3 weeks, 4 days ago
On Tue, Sep 9, 2025 at 10:53 AM Pratyush Yadav <me@yadavpratyush.com> wrote:
>
> On Thu, Sep 04 2025, Jason Gunthorpe wrote:
>
> > On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote:
> >
> >> I don't think it matters if they are preserved or not. The serialization
> >> and deserialization is independent of that. You can very well create a
> >> KHO array that you don't KHO-preserve. On next boot, you can still use
> >> it, you just have to be careful of doing it while scratch-only. Same as
> >> we do now.
> >
> > The KHO array machinery itself can't preserve its own memory
> > either.
>
> It can. Maybe it couldn't in the version I showed you, but now it can.
> See kho_array_preserve() in
> https://lore.kernel.org/linux-mm/20250909144426.33274-2-pratyush@kernel.org/
>
> >
> >> For the _hypervisor_ live update case, sure. Though even there, I have a
> >> feeling we will start seeing userspace components on the hypervisor use
> >> memfd for stashing some of their state.
> >
> > Sure, but don't make excessively sparse memfds for kexec use, why
> > should that be hard?
>
> Sure, I don't think they should be excessively sparse. But _some_ level
> of sparseness can be there.

This is right; loosely sparse memfd support is needed. However, an
excessively sparse preservation will be inefficient for LU, unless we
change the backing to be from a separate pool of physical pages that
is always preserved. If we do that, it would probably make sense only
for guestmemfd and only if we ever decide to support overcommitted
VMs. I suspect it is not something that we currently need to worry
about.

> >> applications. Think big storage nodes with memory in order of TiB. Those
> >> can use a memfd to back their caches so on a kernel upgrade the caches
> >> don't have to be re-fetched. Sparseness is to be expected for such use
> >> cases.
> >
> > Oh? I'm surpised you'd have sparseness there. sparseness seems like
> > such a weird feature to want to rely on :\
> >
> >> But perhaps it might be a better idea to come up with a mechanism for
> >> the kernel to discover which formats the "next" kernel speaks so it can
> >> for one decide whether it can do the live update at all, and for another
> >> which formats it should use. Maybe we give a way for luod to choose
> >> formats, and give it the responsibility for doing these checks?
> >
> > I have felt that we should catalog the formats&versions the kernel can
> > read/write in some way during kbuild.
> >
> > Maybe this turns into a sysfs directory of all the data with an
> > 'enable_write' flag that luod could set to 0 to optimize.
> >
> > And maybe this could be a kbuild report that luod could parse to do
> > this optimization.
>
> Or maybe we put that information in a ELF section in the kernel image?
> Not sure how feasible it would be for tooling to read but I think that
> would very closely associate the versions info with the kernel. The
> other option might be to put it somewhere with modules I guess.

To me, all this sounds like hardening, which, while important, can be
added later. The pre-kexec check for compatibility can be defined and
implemented once we have all live update components ready
(KHO/LUO/PCI/IOMMU/VFIO/MEMFD), once we stabilize the versioning
story, and once we start discussing update stability.

Currently, we've agreed that there are no stability guarantees.
Sometime in the future, we may guarantee minor-to-minor stability, and
later, stable-to-stable. Once we start working on minor-to-minor
stability, it would be a good idea to also add hardening where a
pre-live update would check for compatibility.

In reality, this is not something that is high priority for cloud
providers, because these kinds of incompatibilities would be found
during qualification; the kernel will fail to update by detecting a
version mismatch during boot instead of during shutdown.

> > And maybe distro/csps use this information mechanically to check if
> > version pairs are kexec compatible.
> >
> > Which re-enforces my feeling that the formats/version should be first
> > class concepts, every version should be registered and luo should
> > sequence calling the code for the right version at the right time.
> >
> > Jason
>
> --
> Regards,
> Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 3 weeks, 4 days ago
On Tue, Sep 09 2025, Pasha Tatashin wrote:

> On Tue, Sep 9, 2025 at 10:53 AM Pratyush Yadav <me@yadavpratyush.com> wrote:
>>
>> On Thu, Sep 04 2025, Jason Gunthorpe wrote:
>>
>> > On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote:
[...]
>> >> But perhaps it might be a better idea to come up with a mechanism for
>> >> the kernel to discover which formats the "next" kernel speaks so it can
>> >> for one decide whether it can do the live update at all, and for another
>> >> which formats it should use. Maybe we give a way for luod to choose
>> >> formats, and give it the responsibility for doing these checks?
>> >
>> > I have felt that we should catalog the formats&versions the kernel can
>> > read/write in some way during kbuild.
>> >
>> > Maybe this turns into a sysfs directory of all the data with an
>> > 'enable_write' flag that luod could set to 0 to optimize.
>> >
>> > And maybe this could be a kbuild report that luod could parse to do
>> > this optimization.
>>
>> Or maybe we put that information in a ELF section in the kernel image?
>> Not sure how feasible it would be for tooling to read but I think that
>> would very closely associate the versions info with the kernel. The
>> other option might be to put it somewhere with modules I guess.
>
> To me, all this sounds like hardening, which, while important, can be
> added later. The pre-kexec check for compatibility can be defined and
> implemented once we have all live update components ready
> (KHO/LUO/PCI/IOMMU/VFIO/MEMFD), once we stabilize the versioning
> story, and once we start discussing update stability.

Right. I don't think this is something the current LUO patches have to
solve. This is for later down the line.

>
> Currently, we've agreed that there are no stability guarantees.
> Sometime in the future, we may guarantee minor-to-minor stability, and
> later, stable-to-stable. Once we start working on minor-to-minor
> stability, it would be a good idea to also add hardening where a
> pre-live update would check for compatibility.
>
> In reality, this is not something that is high priority for cloud
> providers, because these kinds of incompatibilities would be found
> during qualification; the kernel will fail to update by detecting a
> version mismatch during boot instead of during shutdown.

I think it would help with making a wider range of roll back and forward
options available. For example, if your current kernel can speak version
A and B, and you are rolling back to a kernel that only speaks A, this
information can be used to choose the right serialization formats.

[...]

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 3 weeks, 4 days ago
> I think it would help with making a wider range of roll back and forward
> options available. For example, if your current kernel can speak version
> A and B, and you are rolling back to a kernel that only speaks A, this
> information can be used to choose the right serialization formats.

At least for upstream, we discussed not to support rolling back (this
can be revised in the future), but for now rollback is something that
would need to be taken care of downstream.

Pasha
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 3 weeks, 4 days ago
On Tue, Sep 09, 2025 at 11:40:18AM -0400, Pasha Tatashin wrote:
> In reality, this is not something that is high priority for cloud
> providers, because these kinds of incompatibilities would be found
> during qualification; the kernel will fail to update by detecting a
> version mismatch during boot instead of during shutdown.

Given I expect CSPs will have to add-in specific version support for
their own special version-pair needs, I think it would be helpful in
the long run to have a tool that reported what versions a kernel build
wrote and parsed. Test-to-learn the same information sounds a bit too
difficult.

Something with ELF is probably how to do that, but I don't imagine a
use in a runtime check consuming this information.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 3 weeks, 4 days ago
On Tue, Sep 9, 2025 at 11:54 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Tue, Sep 09, 2025 at 11:40:18AM -0400, Pasha Tatashin wrote:
> > In reality, this is not something that is high priority for cloud
> > providers, because these kinds of incompatibilities would be found
> > during qualification; the kernel will fail to update by detecting a
> > version mismatch during boot instead of during shutdown.
>
> Given I expect CSPs will have to add-in specific version support for
> their own special version-pair needs, I think it would be helpful in
> the long run to have a tool that reported what versions a kernel build
> wrote and parsed. Test-to-learn the same information sounds a bit too
> difficult.

Yes, I agree. My point was only about the near term: it's just not a
priority at the moment. This won't block us in the future, as we can
always add a tooling later to inject the required ELF segments for
pre-live update checks.

Pasha
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 3 weeks, 4 days ago
On Tue, Sep 09, 2025 at 12:30:35PM -0400, Pasha Tatashin wrote:
> On Tue, Sep 9, 2025 at 11:54 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Tue, Sep 09, 2025 at 11:40:18AM -0400, Pasha Tatashin wrote:
> > > In reality, this is not something that is high priority for cloud
> > > providers, because these kinds of incompatibilities would be found
> > > during qualification; the kernel will fail to update by detecting a
> > > version mismatch during boot instead of during shutdown.
> >
> > Given I expect CSPs will have to add-in specific version support for
> > their own special version-pair needs, I think it would be helpful in
> > the long run to have a tool that reported what versions a kernel build
> > wrote and parsed. Test-to-learn the same information sounds a bit too
> > difficult.
> 
> Yes, I agree. My point was only about the near term: it's just not a
> priority at the moment. This won't block us in the future, as we can
> always add a tooling later to inject the required ELF segments for
> pre-live update checks.

Yes, but lets design things to have this kind of logical code model
where there are specific serializations, with specific versions that
are at least discoverably by greping for some struct luo_xxx_ops or
whatever.

Let's avoid open coding versioning stuff where it is hard to find and
hard to later make a manifest out of

Jason

Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 3 weeks, 4 days ago
> Yes, but lets design things to have this kind of logical code model
> where there are specific serializations, with specific versions that
> are at least discoverably by greping for some struct luo_xxx_ops or
> whatever.
>
> Let's avoid open coding versioning stuff where it is hard to find and
> hard to later make a manifest out of

Fully agreed, the versioning has to be centralized (not open coded,
and verified in a single place) & discoverable.

>
> Jason
>
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Chris Li 1 month, 1 week ago
On Thu, Aug 28, 2025 at 5:43 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Wed, Aug 27, 2025 at 05:03:55PM +0200, Pratyush Yadav wrote:
>
> > I think we need something a luo_xarray data structure that users like
> > memfd (and later hugetlb and guest_memfd and maybe others) can build to
> > make serialization easier. It will cover both contiguous arrays and
> > arrays with some holes in them.
>
> I'm not sure xarray is the right way to go, it is very complex data
> structure and building a kho variation of it seems like it is a huge
> amount of work.
>
> I'd stick with simple kvalloc type approaches until we really run into
> trouble.
>
> You can always map a sparse xarray into a kvalloc linear list by
> including the xarray index in each entry.

Each entry will be 16 byte, 8 for index and 8 for XAvalue, right?

> Especially for memfd where we don't actually expect any sparsity in
> real uses cases there is no reason to invest a huge effort to optimize
> for it..

Ack.

>
> > As I explained above, the versioning is already there. Beyond that, why
> > do you think a raw C struct is better than FDT? It is just another way
> > of expressing the same information. FDT is a bit more cumbersome to
> > write and read, but comes at the benefit of more introspect-ability.
>
> Doesn't have the size limitations, is easier to work list, runs
> faster.

Yes, especially when you have a large array.

Chris
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Vipin Sharma 1 month, 3 weeks ago
On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> From: Pratyush Yadav <ptyadav@amazon.de>
> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> +					unsigned int nr_folios)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < nr_folios; i++) {
> +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> +		struct folio *folio;
> +
> +		if (!pfolio->foliodesc)
> +			continue;
> +
> +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> +
> +		kho_unpreserve_folio(folio);

This one is missing WARN_ON_ONCE() similar to the one in
memfd_luo_preserve_folios().

> +		unpin_folio(folio);
> +	}
> +}
> +
> +static void *memfd_luo_create_fdt(unsigned long size)
> +{
> +	unsigned int order = get_order(size);
> +	struct folio *fdt_folio;
> +	int err = 0;
> +	void *fdt;
> +
> +	if (order > MAX_PAGE_ORDER)
> +		return NULL;
> +
> +	fdt_folio = folio_alloc(GFP_KERNEL, order);

__GFP_ZERO should also be used here. Otherwise this can lead to
unintentional passing of old kernel memory.

> +static int memfd_luo_prepare(struct liveupdate_file_handler *handler,
> +			     struct file *file, u64 *data)
> +{
> +	struct memfd_luo_preserved_folio *preserved_folios;
> +	struct inode *inode = file_inode(file);
> +	unsigned int max_folios, nr_folios = 0;
> +	int err = 0, preserved_size;
> +	struct folio **folios;
> +	long size, nr_pinned;
> +	pgoff_t offset;
> +	void *fdt;
> +	u64 pos;
> +
> +	if (WARN_ON_ONCE(!shmem_file(file)))
> +		return -EINVAL;

This one is only check for shmem_file, whereas in
memfd_luo_can_preserve() there is check for inode->i_nlink also. Is that
not needed here?

> +
> +	inode_lock(inode);
> +	shmem_i_mapping_freeze(inode, true);
> +
> +	size = i_size_read(inode);
> +	if ((PAGE_ALIGN(size) / PAGE_SIZE) > UINT_MAX) {
> +		err = -E2BIG;
> +		goto err_unlock;
> +	}
> +
> +	/*
> +	 * Guess the number of folios based on inode size. Real number might end
> +	 * up being smaller if there are higher order folios.
> +	 */
> +	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
> +	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);

__GFP_ZERO?

> +static int memfd_luo_freeze(struct liveupdate_file_handler *handler,
> +			    struct file *file, u64 *data)
> +{
> +	u64 pos = file->f_pos;
> +	void *fdt;
> +	int err;
> +
> +	if (WARN_ON_ONCE(!*data))
> +		return -EINVAL;
> +
> +	fdt = phys_to_virt(*data);
> +
> +	/*
> +	 * The pos or size might have changed since prepare. Everything else
> +	 * stays the same.
> +	 */
> +	err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos));
> +	if (err)
> +		return err;

Comment is talking about pos and size but code is only updating pos. 

> +static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data,
> +			      struct file **file_p)
> +{
> +	const struct memfd_luo_preserved_folio *pfolios;
> +	int nr_pfolios, len, ret = 0, i = 0;
> +	struct address_space *mapping;
> +	struct folio *folio, *fdt_folio;
> +	const u64 *pos, *size;
> +	struct inode *inode;
> +	struct file *file;
> +	const void *fdt;
> +
> +	fdt_folio = memfd_luo_get_fdt(data);
> +	if (!fdt_folio)
> +		return -ENOENT;
> +
> +	fdt = page_to_virt(folio_page(fdt_folio, 0));
> +
> +	pfolios = fdt_getprop(fdt, 0, "folios", &len);
> +	if (!pfolios || len % sizeof(*pfolios)) {
> +		pr_err("invalid 'folios' property\n");

Print should clearly state that error is because fields is not found or
len is not multiple of sizeof(*pfolios).
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month, 3 weeks ago
Hi Vipin,

Thanks for the review.

On Tue, Aug 12 2025, Vipin Sharma wrote:

> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> From: Pratyush Yadav <ptyadav@amazon.de>
>> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> +					unsigned int nr_folios)
>> +{
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < nr_folios; i++) {
>> +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> +		struct folio *folio;
>> +
>> +		if (!pfolio->foliodesc)
>> +			continue;
>> +
>> +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> +
>> +		kho_unpreserve_folio(folio);
>
> This one is missing WARN_ON_ONCE() similar to the one in
> memfd_luo_preserve_folios().

Right, will add.

>
>> +		unpin_folio(folio);

Looking at this code caught my eye. This can also be called from LUO's
finish callback if no one claimed the memfd after live update. In that
case, unpin_folio() is going to underflow the pincount or refcount on
the folio since after the kexec, the folio is no longer pinned. We
should only be doing folio_put().

I think this function should take a argument to specify which of these
cases it is dealing with.

>> +	}
>> +}
>> +
>> +static void *memfd_luo_create_fdt(unsigned long size)
>> +{
>> +	unsigned int order = get_order(size);
>> +	struct folio *fdt_folio;
>> +	int err = 0;
>> +	void *fdt;
>> +
>> +	if (order > MAX_PAGE_ORDER)
>> +		return NULL;
>> +
>> +	fdt_folio = folio_alloc(GFP_KERNEL, order);
>
> __GFP_ZERO should also be used here. Otherwise this can lead to
> unintentional passing of old kernel memory.

fdt_create() zeroes out the buffer so this should not be a problem.

>
>> +static int memfd_luo_prepare(struct liveupdate_file_handler *handler,
>> +			     struct file *file, u64 *data)
>> +{
>> +	struct memfd_luo_preserved_folio *preserved_folios;
>> +	struct inode *inode = file_inode(file);
>> +	unsigned int max_folios, nr_folios = 0;
>> +	int err = 0, preserved_size;
>> +	struct folio **folios;
>> +	long size, nr_pinned;
>> +	pgoff_t offset;
>> +	void *fdt;
>> +	u64 pos;
>> +
>> +	if (WARN_ON_ONCE(!shmem_file(file)))
>> +		return -EINVAL;
>
> This one is only check for shmem_file, whereas in
> memfd_luo_can_preserve() there is check for inode->i_nlink also. Is that
> not needed here?

Actually, this should never happen since the LUO can_preserve() callback
should make sure of this. I think it would be perfectly fine to just
drop this check. I only added it because I was being extra careful.

>
>> +
>> +	inode_lock(inode);
>> +	shmem_i_mapping_freeze(inode, true);
>> +
>> +	size = i_size_read(inode);
>> +	if ((PAGE_ALIGN(size) / PAGE_SIZE) > UINT_MAX) {
>> +		err = -E2BIG;
>> +		goto err_unlock;
>> +	}
>> +
>> +	/*
>> +	 * Guess the number of folios based on inode size. Real number might end
>> +	 * up being smaller if there are higher order folios.
>> +	 */
>> +	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
>> +	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
>
> __GFP_ZERO?

Why? This is only used in this function and gets freed on return. And
the function only looks at the elements that get initialized by
memfd_pin_folios().

>
>> +static int memfd_luo_freeze(struct liveupdate_file_handler *handler,
>> +			    struct file *file, u64 *data)
>> +{
>> +	u64 pos = file->f_pos;
>> +	void *fdt;
>> +	int err;
>> +
>> +	if (WARN_ON_ONCE(!*data))
>> +		return -EINVAL;
>> +
>> +	fdt = phys_to_virt(*data);
>> +
>> +	/*
>> +	 * The pos or size might have changed since prepare. Everything else
>> +	 * stays the same.
>> +	 */
>> +	err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos));
>> +	if (err)
>> +		return err;
>
> Comment is talking about pos and size but code is only updating pos. 

Right. Comment is out of date. size can no longer change since prepare.
So will update the comment.

>
>> +static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data,
>> +			      struct file **file_p)
>> +{
>> +	const struct memfd_luo_preserved_folio *pfolios;
>> +	int nr_pfolios, len, ret = 0, i = 0;
>> +	struct address_space *mapping;
>> +	struct folio *folio, *fdt_folio;
>> +	const u64 *pos, *size;
>> +	struct inode *inode;
>> +	struct file *file;
>> +	const void *fdt;
>> +
>> +	fdt_folio = memfd_luo_get_fdt(data);
>> +	if (!fdt_folio)
>> +		return -ENOENT;
>> +
>> +	fdt = page_to_virt(folio_page(fdt_folio, 0));
>> +
>> +	pfolios = fdt_getprop(fdt, 0, "folios", &len);
>> +	if (!pfolios || len % sizeof(*pfolios)) {
>> +		pr_err("invalid 'folios' property\n");
>
> Print should clearly state that error is because fields is not found or
> len is not multiple of sizeof(*pfolios).

Eh, there is already too much boilerplate one has to write (and read)
for parsing the FDT. Is there really a need for an extra 3-4 lines of
code for _each_ property that is parsed?

Long term, I think we shouldn't be doing this manually anyway. I think
the maintainable path forward is to define a schema for the serialized
data and have a parser that takes in the schema and gives out a parsed
struct, doing all sorts of checks in the process.

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 12:29 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>
> Hi Vipin,
>
> Thanks for the review.
>
> On Tue, Aug 12 2025, Vipin Sharma wrote:
>
> > On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> >> From: Pratyush Yadav <ptyadav@amazon.de>
> >> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> >> +                                    unsigned int nr_folios)
> >> +{
> >> +    unsigned int i;
> >> +
> >> +    for (i = 0; i < nr_folios; i++) {
> >> +            const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> >> +            struct folio *folio;
> >> +
> >> +            if (!pfolio->foliodesc)
> >> +                    continue;
> >> +
> >> +            folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> >> +
> >> +            kho_unpreserve_folio(folio);
> >
> > This one is missing WARN_ON_ONCE() similar to the one in
> > memfd_luo_preserve_folios().
>
> Right, will add.
>
> >
> >> +            unpin_folio(folio);
>
> Looking at this code caught my eye. This can also be called from LUO's
> finish callback if no one claimed the memfd after live update. In that
> case, unpin_folio() is going to underflow the pincount or refcount on
> the folio since after the kexec, the folio is no longer pinned. We
> should only be doing folio_put().
>
> I think this function should take a argument to specify which of these
> cases it is dealing with.
>
> >> +    }
> >> +}
> >> +
> >> +static void *memfd_luo_create_fdt(unsigned long size)
> >> +{
> >> +    unsigned int order = get_order(size);
> >> +    struct folio *fdt_folio;
> >> +    int err = 0;
> >> +    void *fdt;
> >> +
> >> +    if (order > MAX_PAGE_ORDER)
> >> +            return NULL;
> >> +
> >> +    fdt_folio = folio_alloc(GFP_KERNEL, order);
> >
> > __GFP_ZERO should also be used here. Otherwise this can lead to
> > unintentional passing of old kernel memory.
>
> fdt_create() zeroes out the buffer so this should not be a problem.

You are right, fdt_create() zeroes the whole buffer, however, I wonder
if it could be `optimized` to only clear only the header part of FDT,
not the rest and this could potentially lead us to send an FDT buffer
that contains both a valid FDT and the trailing bits contain data from
old kernel.

Pasha
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month, 3 weeks ago
On Wed, Aug 13 2025, Pasha Tatashin wrote:

> On Wed, Aug 13, 2025 at 12:29 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>>
>> Hi Vipin,
>>
>> Thanks for the review.
>>
>> On Tue, Aug 12 2025, Vipin Sharma wrote:
>>
>> > On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> >> From: Pratyush Yadav <ptyadav@amazon.de>
>> >> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> >> +                                    unsigned int nr_folios)
>> >> +{
>> >> +    unsigned int i;
>> >> +
>> >> +    for (i = 0; i < nr_folios; i++) {
>> >> +            const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> >> +            struct folio *folio;
>> >> +
>> >> +            if (!pfolio->foliodesc)
>> >> +                    continue;
>> >> +
>> >> +            folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> >> +
>> >> +            kho_unpreserve_folio(folio);
>> >
>> > This one is missing WARN_ON_ONCE() similar to the one in
>> > memfd_luo_preserve_folios().
>>
>> Right, will add.
>>
>> >
>> >> +            unpin_folio(folio);
>>
>> Looking at this code caught my eye. This can also be called from LUO's
>> finish callback if no one claimed the memfd after live update. In that
>> case, unpin_folio() is going to underflow the pincount or refcount on
>> the folio since after the kexec, the folio is no longer pinned. We
>> should only be doing folio_put().
>>
>> I think this function should take a argument to specify which of these
>> cases it is dealing with.
>>
>> >> +    }
>> >> +}
>> >> +
>> >> +static void *memfd_luo_create_fdt(unsigned long size)
>> >> +{
>> >> +    unsigned int order = get_order(size);
>> >> +    struct folio *fdt_folio;
>> >> +    int err = 0;
>> >> +    void *fdt;
>> >> +
>> >> +    if (order > MAX_PAGE_ORDER)
>> >> +            return NULL;
>> >> +
>> >> +    fdt_folio = folio_alloc(GFP_KERNEL, order);
>> >
>> > __GFP_ZERO should also be used here. Otherwise this can lead to
>> > unintentional passing of old kernel memory.
>>
>> fdt_create() zeroes out the buffer so this should not be a problem.
>
> You are right, fdt_create() zeroes the whole buffer, however, I wonder
> if it could be `optimized` to only clear only the header part of FDT,
> not the rest and this could potentially lead us to send an FDT buffer
> that contains both a valid FDT and the trailing bits contain data from
> old kernel.

Fair enough. At least the API documentation does not say anything about
the state of the buffer. My main concern was around performance since
the FDT can be multiple megabytes long for big memfds. Anyway, this
isn't in the blackout window so perhaps we can live with it. Will add
the GFP_ZERO.

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Greg KH 1 month, 3 weeks ago
On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> > From: Pratyush Yadav <ptyadav@amazon.de>
> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> > +					unsigned int nr_folios)
> > +{
> > +	unsigned int i;
> > +
> > +	for (i = 0; i < nr_folios; i++) {
> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> > +		struct folio *folio;
> > +
> > +		if (!pfolio->foliodesc)
> > +			continue;
> > +
> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> > +
> > +		kho_unpreserve_folio(folio);
> 
> This one is missing WARN_ON_ONCE() similar to the one in
> memfd_luo_preserve_folios().

So you really want to cause a machine to reboot and get a CVE issued for
this, if it could be triggered?  That's bold :)

Please don't.  If that can happen, handle the issue and move on, don't
crash boxes.

thanks,

greg k-h
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month, 3 weeks ago
On Wed, Aug 13 2025, Greg KH wrote:

> On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
>> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> > From: Pratyush Yadav <ptyadav@amazon.de>
>> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> > +					unsigned int nr_folios)
>> > +{
>> > +	unsigned int i;
>> > +
>> > +	for (i = 0; i < nr_folios; i++) {
>> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> > +		struct folio *folio;
>> > +
>> > +		if (!pfolio->foliodesc)
>> > +			continue;
>> > +
>> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> > +
>> > +		kho_unpreserve_folio(folio);
>> 
>> This one is missing WARN_ON_ONCE() similar to the one in
>> memfd_luo_preserve_folios().
>
> So you really want to cause a machine to reboot and get a CVE issued for
> this, if it could be triggered?  That's bold :)
>
> Please don't.  If that can happen, handle the issue and move on, don't
> crash boxes.

Why would a WARN() crash the machine? That is what BUG() does, not
WARN().

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Greg KH 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> On Wed, Aug 13 2025, Greg KH wrote:
> 
> > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> >> > From: Pratyush Yadav <ptyadav@amazon.de>
> >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> >> > +					unsigned int nr_folios)
> >> > +{
> >> > +	unsigned int i;
> >> > +
> >> > +	for (i = 0; i < nr_folios; i++) {
> >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> >> > +		struct folio *folio;
> >> > +
> >> > +		if (!pfolio->foliodesc)
> >> > +			continue;
> >> > +
> >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> >> > +
> >> > +		kho_unpreserve_folio(folio);
> >> 
> >> This one is missing WARN_ON_ONCE() similar to the one in
> >> memfd_luo_preserve_folios().
> >
> > So you really want to cause a machine to reboot and get a CVE issued for
> > this, if it could be triggered?  That's bold :)
> >
> > Please don't.  If that can happen, handle the issue and move on, don't
> > crash boxes.
> 
> Why would a WARN() crash the machine? That is what BUG() does, not
> WARN().

See 'panic_on_warn' which is enabled in a few billion Linux systems
these days :(
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
> On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> > On Wed, Aug 13 2025, Greg KH wrote:
> > 
> > > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> > >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> > >> > From: Pratyush Yadav <ptyadav@amazon.de>
> > >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> > >> > +					unsigned int nr_folios)
> > >> > +{
> > >> > +	unsigned int i;
> > >> > +
> > >> > +	for (i = 0; i < nr_folios; i++) {
> > >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> > >> > +		struct folio *folio;
> > >> > +
> > >> > +		if (!pfolio->foliodesc)
> > >> > +			continue;
> > >> > +
> > >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> > >> > +
> > >> > +		kho_unpreserve_folio(folio);
> > >> 
> > >> This one is missing WARN_ON_ONCE() similar to the one in
> > >> memfd_luo_preserve_folios().
> > >
> > > So you really want to cause a machine to reboot and get a CVE issued for
> > > this, if it could be triggered?  That's bold :)
> > >
> > > Please don't.  If that can happen, handle the issue and move on, don't
> > > crash boxes.
> > 
> > Why would a WARN() crash the machine? That is what BUG() does, not
> > WARN().
> 
> See 'panic_on_warn' which is enabled in a few billion Linux systems
> these days :(

This has been discussed so many times already:

https://lwn.net/Articles/969923/

When someone tried to formalize this "don't use WARN_ON" position 
in the coding-style.rst it was NAK'd:

https://lwn.net/ml/linux-kernel/10af93f8-83f2-48ce-9bc3-80fe4c60082c@redhat.com/

Based on Linus's opposition to the idea:

https://lore.kernel.org/all/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com/

Use the warn ons. Make sure they can't be triggered by userspace. Use
them to detect corruption/malfunction in the kernel.

In this case if kho_unpreserve_folio() fails in this call chain it
means some error unwind is wrongly happening out of sequence, and we
are now forced to leak memory. Unwind is not something that userspace
should be controlling, so of course we want a WARN_ON here.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month, 3 weeks ago
On Wed, Aug 13 2025, Jason Gunthorpe wrote:

> On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
>> On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
>> > On Wed, Aug 13 2025, Greg KH wrote:
>> > 
>> > > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
>> > >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> > >> > From: Pratyush Yadav <ptyadav@amazon.de>
>> > >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> > >> > +					unsigned int nr_folios)
>> > >> > +{
>> > >> > +	unsigned int i;
>> > >> > +
>> > >> > +	for (i = 0; i < nr_folios; i++) {
>> > >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> > >> > +		struct folio *folio;
>> > >> > +
>> > >> > +		if (!pfolio->foliodesc)
>> > >> > +			continue;
>> > >> > +
>> > >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
>> > >> > +
>> > >> > +		kho_unpreserve_folio(folio);
>> > >> 
>> > >> This one is missing WARN_ON_ONCE() similar to the one in
>> > >> memfd_luo_preserve_folios().
>> > >
>> > > So you really want to cause a machine to reboot and get a CVE issued for
>> > > this, if it could be triggered?  That's bold :)
>> > >
>> > > Please don't.  If that can happen, handle the issue and move on, don't
>> > > crash boxes.
>> > 
>> > Why would a WARN() crash the machine? That is what BUG() does, not
>> > WARN().
>> 
>> See 'panic_on_warn' which is enabled in a few billion Linux systems
>> these days :(
>
> This has been discussed so many times already:
>
> https://lwn.net/Articles/969923/
>
> When someone tried to formalize this "don't use WARN_ON" position 
> in the coding-style.rst it was NAK'd:
>
> https://lwn.net/ml/linux-kernel/10af93f8-83f2-48ce-9bc3-80fe4c60082c@redhat.com/
>
> Based on Linus's opposition to the idea:
>
> https://lore.kernel.org/all/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com/
>
> Use the warn ons. Make sure they can't be triggered by userspace. Use
> them to detect corruption/malfunction in the kernel.
>
> In this case if kho_unpreserve_folio() fails in this call chain it
> means some error unwind is wrongly happening out of sequence, and we
> are now forced to leak memory. Unwind is not something that userspace
> should be controlling, so of course we want a WARN_ON here.

Yep. And if we are saying WARN() should never be used then doesn't that
make panic_on_warn a no-op? What is even the point of that option then?

Here, we are unable to unpreserve a folio that we have preserved. This
isn't a normal error that we expect to happen. This should _not_ happen
unless something has gone horribly wrong.

For example, the calls to kho_preserve_folio() don't WARN(), since that
can fail for various reasons. They just return the error up the call
chain. As an analogy, allocating a page can fail, and it is quite
reasonable to expect the code to not throw out WARN()s for that. But if
for some reason you can't free a page that you allocated, this is very
unexpected and should WARN(). Of course, in Linux the page free APIs
don't even return a status, but I hope you get my point.

If I were a system administrator who sets panic_on_warn, I would _want_
the system to crash so no further damage happens and I can collect
logs/crash dumps to investigate later. Without the WARN(), I never get a
chance to debug and my system breaks silently. For all others, the
kernel goes on with some possibly corrupted/broken state.

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Greg KH 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
> > On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> > > On Wed, Aug 13 2025, Greg KH wrote:
> > > 
> > > > On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> > > >> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> > > >> > From: Pratyush Yadav <ptyadav@amazon.de>
> > > >> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> > > >> > +					unsigned int nr_folios)
> > > >> > +{
> > > >> > +	unsigned int i;
> > > >> > +
> > > >> > +	for (i = 0; i < nr_folios; i++) {
> > > >> > +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> > > >> > +		struct folio *folio;
> > > >> > +
> > > >> > +		if (!pfolio->foliodesc)
> > > >> > +			continue;
> > > >> > +
> > > >> > +		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
> > > >> > +
> > > >> > +		kho_unpreserve_folio(folio);
> > > >> 
> > > >> This one is missing WARN_ON_ONCE() similar to the one in
> > > >> memfd_luo_preserve_folios().
> > > >
> > > > So you really want to cause a machine to reboot and get a CVE issued for
> > > > this, if it could be triggered?  That's bold :)
> > > >
> > > > Please don't.  If that can happen, handle the issue and move on, don't
> > > > crash boxes.
> > > 
> > > Why would a WARN() crash the machine? That is what BUG() does, not
> > > WARN().
> > 
> > See 'panic_on_warn' which is enabled in a few billion Linux systems
> > these days :(
> 
> This has been discussed so many times already:
> 
> https://lwn.net/Articles/969923/
> 
> When someone tried to formalize this "don't use WARN_ON" position 
> in the coding-style.rst it was NAK'd:
> 
> https://lwn.net/ml/linux-kernel/10af93f8-83f2-48ce-9bc3-80fe4c60082c@redhat.com/
> 
> Based on Linus's opposition to the idea:
> 
> https://lore.kernel.org/all/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com/
> 
> Use the warn ons. Make sure they can't be triggered by userspace. Use
> them to detect corruption/malfunction in the kernel.
> 
> In this case if kho_unpreserve_folio() fails in this call chain it
> means some error unwind is wrongly happening out of sequence, and we
> are now forced to leak memory. Unwind is not something that userspace
> should be controlling, so of course we want a WARN_ON here.

"should be" is the key here.  And it's not obvious from this patch if
that's true or not, which is why I mentioned it.

I will keep bringing this up, given the HUGE number of CVEs I keep
assigning each week for when userspace hits WARN_ON() calls until that
flow starts to die out either because we don't keep adding new calls, OR
we finally fix them all.  Both would be good...

thanks,

greg k-h
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 03:00:08PM +0200, Greg KH wrote:
> > In this case if kho_unpreserve_folio() fails in this call chain it
> > means some error unwind is wrongly happening out of sequence, and we
> > are now forced to leak memory. Unwind is not something that userspace
> > should be controlling, so of course we want a WARN_ON here.
> 
> "should be" is the key here.  And it's not obvious from this patch if
> that's true or not, which is why I mentioned it.
> 
> I will keep bringing this up, given the HUGE number of CVEs I keep
> assigning each week for when userspace hits WARN_ON() calls until that
> flow starts to die out either because we don't keep adding new calls, OR
> we finally fix them all.  Both would be good...

WARN or not, userspace triggering permanently leaking kernel memory is
a CVE worthy bug in of itself.

So even if userspace triggers this I'd rather have the warn than the
difficult to find leak.

I don't know what your CVEs are, but I get a decent number of
userspace hits a WARN bug from with syzkaller, and they are all bugs
in the kernel. Bugs that should probably get CVEs even without the
crash on WARN issue anyhow. The WARN made them discoverable cheaply.

The most recent was a userspace triggerable arthimetic overflow
corrupted a datastructure and a WARN caught it, syzkaller found it,
and we fixed it before it became a splashy exploit with a web
site.

Removing bug catching to reduce CVEs because we don't find the bugs
anymore seems like the wrong direction to me.

Jason
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month, 3 weeks ago
On Wed, Aug 13 2025, Greg KH wrote:

> On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
[...]
>> Use the warn ons. Make sure they can't be triggered by userspace. Use
>> them to detect corruption/malfunction in the kernel.
>> 
>> In this case if kho_unpreserve_folio() fails in this call chain it
>> means some error unwind is wrongly happening out of sequence, and we
>> are now forced to leak memory. Unwind is not something that userspace
>> should be controlling, so of course we want a WARN_ON here.
>
> "should be" is the key here.  And it's not obvious from this patch if
> that's true or not, which is why I mentioned it.
>
> I will keep bringing this up, given the HUGE number of CVEs I keep
> assigning each week for when userspace hits WARN_ON() calls until that
> flow starts to die out either because we don't keep adding new calls, OR
> we finally fix them all.  Both would be good...

Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
I'd guess one reason is overwhelming system console which can cause a
denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

-- 
Regards,
Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Greg KH 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 03:37:03PM +0200, Pratyush Yadav wrote:
> On Wed, Aug 13 2025, Greg KH wrote:
> 
> > On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> [...]
> >> Use the warn ons. Make sure they can't be triggered by userspace. Use
> >> them to detect corruption/malfunction in the kernel.
> >> 
> >> In this case if kho_unpreserve_folio() fails in this call chain it
> >> means some error unwind is wrongly happening out of sequence, and we
> >> are now forced to leak memory. Unwind is not something that userspace
> >> should be controlling, so of course we want a WARN_ON here.
> >
> > "should be" is the key here.  And it's not obvious from this patch if
> > that's true or not, which is why I mentioned it.
> >
> > I will keep bringing this up, given the HUGE number of CVEs I keep
> > assigning each week for when userspace hits WARN_ON() calls until that
> > flow starts to die out either because we don't keep adding new calls, OR
> > we finally fix them all.  Both would be good...
> 
> Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> I'd guess one reason is overwhelming system console which can cause a
> denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

If panic_on_warn is set, this will cause the machine to crash/reboot,
which is considered a "vulnerability" by the CVE.org definition.  If a
user can trigger this, it gets a CVE assigned to it.

hope this helps,

greg k-h
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 1:37 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>
> On Wed, Aug 13 2025, Greg KH wrote:
>
> > On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> [...]
> >> Use the warn ons. Make sure they can't be triggered by userspace. Use
> >> them to detect corruption/malfunction in the kernel.
> >>
> >> In this case if kho_unpreserve_folio() fails in this call chain it
> >> means some error unwind is wrongly happening out of sequence, and we
> >> are now forced to leak memory. Unwind is not something that userspace
> >> should be controlling, so of course we want a WARN_ON here.
> >
> > "should be" is the key here.  And it's not obvious from this patch if
> > that's true or not, which is why I mentioned it.
> >
> > I will keep bringing this up, given the HUGE number of CVEs I keep
> > assigning each week for when userspace hits WARN_ON() calls until that
> > flow starts to die out either because we don't keep adding new calls, OR
> > we finally fix them all.  Both would be good...
>
> Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> I'd guess one reason is overwhelming system console which can cause a
> denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

My understanding that it is vulnerability only if it can be triggered
from userspace, otherwise it is a preferred method to give a notice
that something is very wrong.

Given the large number of machines that have panic_on_warn, a reliable
kernel crash that is triggered from userspace is a vulnerability(?).

Pasha

>
> --
> Regards,
> Pratyush Yadav
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Greg KH 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 01:41:51PM +0000, Pasha Tatashin wrote:
> On Wed, Aug 13, 2025 at 1:37 PM Pratyush Yadav <pratyush@kernel.org> wrote:
> >
> > On Wed, Aug 13 2025, Greg KH wrote:
> >
> > > On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> > [...]
> > >> Use the warn ons. Make sure they can't be triggered by userspace. Use
> > >> them to detect corruption/malfunction in the kernel.
> > >>
> > >> In this case if kho_unpreserve_folio() fails in this call chain it
> > >> means some error unwind is wrongly happening out of sequence, and we
> > >> are now forced to leak memory. Unwind is not something that userspace
> > >> should be controlling, so of course we want a WARN_ON here.
> > >
> > > "should be" is the key here.  And it's not obvious from this patch if
> > > that's true or not, which is why I mentioned it.
> > >
> > > I will keep bringing this up, given the HUGE number of CVEs I keep
> > > assigning each week for when userspace hits WARN_ON() calls until that
> > > flow starts to die out either because we don't keep adding new calls, OR
> > > we finally fix them all.  Both would be good...
> >
> > Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> > I'd guess one reason is overwhelming system console which can cause a
> > denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?
> 
> My understanding that it is vulnerability only if it can be triggered
> from userspace, otherwise it is a preferred method to give a notice
> that something is very wrong.
> 
> Given the large number of machines that have panic_on_warn, a reliable
> kernel crash that is triggered from userspace is a vulnerability(?).

Yes, and so is a unreliable one :)
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pasha Tatashin 1 month, 3 weeks ago
> +static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
> +                                    struct folio **folios,
> +                                    unsigned int nr_folios)
> +{
> +       unsigned int i;

Should be 'long i'

Otherwise in err_unpreserve we get into an infinite loop. Thank you
Josh Hilke for noticing this.

Pasha

> +       int err;
> +
> +       for (i = 0; i < nr_folios; i++) {
> +               struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> +               struct folio *folio = folios[i];
> +               unsigned int flags = 0;
> +               unsigned long pfn;
> +
> +               err = kho_preserve_folio(folio);
> +               if (err)
> +                       goto err_unpreserve;
> +
> +               pfn = folio_pfn(folio);
> +               if (folio_test_dirty(folio))
> +                       flags |= PRESERVED_FLAG_DIRTY;
> +               if (folio_test_uptodate(folio))
> +                       flags |= PRESERVED_FLAG_UPTODATE;
> +
> +               pfolio->foliodesc = PRESERVED_FOLIO_MKDESC(pfn, flags);
> +               pfolio->index = folio->index;
> +       }
> +
> +       return 0;
> +
> +err_unpreserve:
> +       i--;
> +       for (; i >= 0; i--)
> +               WARN_ON_ONCE(kho_unpreserve_folio(folios[i]));
> +       return err;
> +}
> +
Re: [PATCH v3 29/30] luo: allow preserving memfd
Posted by Pratyush Yadav 1 month, 3 weeks ago
On Fri, Aug 08 2025, Pasha Tatashin wrote:

>> +static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
>> +                                    struct folio **folios,
>> +                                    unsigned int nr_folios)
>> +{
>> +       unsigned int i;
>
> Should be 'long i'
>
> Otherwise in err_unpreserve we get into an infinite loop. Thank you
> Josh Hilke for noticing this.

Good catch! Will fix.
>
>> +       int err;
>> +
>> +       for (i = 0; i < nr_folios; i++) {
>> +               struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
>> +               struct folio *folio = folios[i];
>> +               unsigned int flags = 0;
>> +               unsigned long pfn;
>> +
>> +               err = kho_preserve_folio(folio);
>> +               if (err)
>> +                       goto err_unpreserve;
>> +
>> +               pfn = folio_pfn(folio);
>> +               if (folio_test_dirty(folio))
>> +                       flags |= PRESERVED_FLAG_DIRTY;
>> +               if (folio_test_uptodate(folio))
>> +                       flags |= PRESERVED_FLAG_UPTODATE;
>> +
>> +               pfolio->foliodesc = PRESERVED_FOLIO_MKDESC(pfn, flags);
>> +               pfolio->index = folio->index;
>> +       }
>> +
>> +       return 0;
>> +
>> +err_unpreserve:
>> +       i--;
>> +       for (; i >= 0; i--)
>> +               WARN_ON_ONCE(kho_unpreserve_folio(folios[i]));
>> +       return err;
>> +}
>> +

-- 
Regards,
Pratyush Yadav