From: Pratyush Yadav <ptyadav@amazon.de>
The ability to preserve a memfd allows userspace to use KHO and LUO to
transfer its memory contents to the next kernel. This is useful in many
ways. For one, it can be used with IOMMUFD as the backing store for
IOMMU page tables. Preserving IOMMUFD is essential for performing a
hypervisor live update with passthrough devices. memfd support provides
the first building block for making that possible.

For another, for applications with a large amount of memory that takes
time to reconstruct, reboots to consume kernel upgrades can be very
expensive. memfd with LUO gives those applications reboot-persistent
memory that they can use to quickly save and reconstruct that state.

While memfd can be backed by either hugetlbfs or shmem, only shmem is
supported for now. To be more precise, support is added for anonymous
shmem files.

The handover to the next kernel is not transparent: not all properties
of the file are preserved, only its memory contents, position, and
size. The recreated file gets the UID and GID of the task doing the
restore, and that task's cgroup is charged with the memory.

Once LUO is in the prepared state, the file cannot grow or shrink, and
all its pages are pinned to avoid migration and swapping. The file can
still be read from or written to.
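
The expected usage from userspace is roughly the following. This is an
illustrative sketch only: it uses plain memfd/mmap calls, and the step
that actually hands the fd over to LUO is left as a placeholder since
the LUO uAPI is defined elsewhere in this series.

  /*
   * Illustrative sketch only. The LUO interface for registering the fd
   * for preservation is not shown here; see the LUO uAPI patches.
   */
  #define _GNU_SOURCE
  #include <sys/mman.h>
  #include <string.h>
  #include <unistd.h>

  static int save_state(const void *state, size_t len)
  {
          int fd = memfd_create("luo-state", 0);
          void *p;

          if (fd < 0)
                  return -1;
          if (ftruncate(fd, len) < 0)
                  goto err;

          p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
          if (p == MAP_FAILED)
                  goto err;
          memcpy(p, state, len);  /* these contents survive the handover */
          munmap(p, len);

          /* Hand 'fd' to LUO for preservation here (uAPI not shown). */
          return fd;
  err:
          close(fd);
          return -1;
  }
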
Co-developed-by: Changyuan Lyu <changyuanl@google.com>
Signed-off-by: Changyuan Lyu <changyuanl@google.com>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
---
MAINTAINERS | 2 +
mm/Makefile | 1 +
mm/memfd_luo.c | 507 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 510 insertions(+)
create mode 100644 mm/memfd_luo.c
diff --git a/MAINTAINERS b/MAINTAINERS
index b88b77977649..7421d21672f3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14209,6 +14209,7 @@ F: tools/testing/selftests/livepatch/
LIVE UPDATE
M: Pasha Tatashin <pasha.tatashin@soleen.com>
+R: Pratyush Yadav <pratyush@kernel.org>
L: linux-kernel@vger.kernel.org
S: Maintained
F: Documentation/ABI/testing/sysfs-kernel-liveupdate
@@ -14218,6 +14219,7 @@ F: Documentation/userspace-api/liveupdate.rst
F: include/linux/liveupdate.h
F: include/uapi/linux/liveupdate.h
F: kernel/liveupdate/
+F: mm/memfd_luo.c
F: tools/testing/selftests/liveupdate/
LLC (802.2)
diff --git a/mm/Makefile b/mm/Makefile
index ef54aa615d9d..0a9936ffc172 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
+obj-$(CONFIG_LIVEUPDATE) += memfd_luo.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
new file mode 100644
index 000000000000..0c91b40a2080
--- /dev/null
+++ b/mm/memfd_luo.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ * Changyuan Lyu <changyuanl@google.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/file.h>
+#include <linux/io.h>
+#include <linux/libfdt.h>
+#include <linux/liveupdate.h>
+#include <linux/kexec_handover.h>
+#include <linux/shmem_fs.h>
+#include <linux/bits.h>
+#include "internal.h"
+
+static const char memfd_luo_compatible[] = "memfd-v1";
+
+#define PRESERVED_PFN_MASK GENMASK(63, 12)
+#define PRESERVED_PFN_SHIFT 12
+#define PRESERVED_FLAG_DIRTY BIT(0)
+#define PRESERVED_FLAG_UPTODATE BIT(1)
+
+#define PRESERVED_FOLIO_PFN(desc) (((desc) & PRESERVED_PFN_MASK) >> PRESERVED_PFN_SHIFT)
+#define PRESERVED_FOLIO_FLAGS(desc) ((desc) & ~PRESERVED_PFN_MASK)
+#define PRESERVED_FOLIO_MKDESC(pfn, flags) (((pfn) << PRESERVED_PFN_SHIFT) | (flags))
+
+struct memfd_luo_preserved_folio {
+ /*
+ * The folio descriptor is made of 2 parts. The bottom 12 bits are used
+ * for storing flags, the others for storing the PFN.
+ */
+ u64 foliodesc;
+ u64 index;
+};
+
+static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
+ struct folio **folios,
+ unsigned int nr_folios)
+{
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < nr_folios; i++) {
+ struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+ struct folio *folio = folios[i];
+ unsigned int flags = 0;
+ unsigned long pfn;
+
+ err = kho_preserve_folio(folio);
+ if (err)
+ goto err_unpreserve;
+
+ pfn = folio_pfn(folio);
+ if (folio_test_dirty(folio))
+ flags |= PRESERVED_FLAG_DIRTY;
+ if (folio_test_uptodate(folio))
+ flags |= PRESERVED_FLAG_UPTODATE;
+
+ pfolio->foliodesc = PRESERVED_FOLIO_MKDESC(pfn, flags);
+ pfolio->index = folio->index;
+ }
+
+ return 0;
+
+err_unpreserve:
+ /* i is unsigned, so count down with a post-decrement test */
+ while (i--)
+ WARN_ON_ONCE(kho_unpreserve_folio(folios[i]));
+ return err;
+}
+
+static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
+ unsigned int nr_folios)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_folios; i++) {
+ const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+ struct folio *folio;
+
+ if (!pfolio->foliodesc)
+ continue;
+
+ folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
+
+ kho_unpreserve_folio(folio);
+ unpin_folio(folio);
+ }
+}
+
+static void *memfd_luo_create_fdt(unsigned long size)
+{
+ unsigned int order = get_order(size);
+ struct folio *fdt_folio;
+ int err = 0;
+ void *fdt;
+
+ if (order > MAX_PAGE_ORDER)
+ return NULL;
+
+ fdt_folio = folio_alloc(GFP_KERNEL, order);
+ if (!fdt_folio)
+ return NULL;
+
+ fdt = folio_address(fdt_folio);
+
+ err |= fdt_create(fdt, (1 << (order + PAGE_SHIFT)));
+ err |= fdt_finish_reservemap(fdt);
+ err |= fdt_begin_node(fdt, "");
+ if (err)
+ goto free;
+
+ return fdt;
+
+free:
+ folio_put(fdt_folio);
+ return NULL;
+}
+
+static int memfd_luo_finish_fdt(void *fdt)
+{
+ int err;
+
+ err = fdt_end_node(fdt);
+ if (err)
+ return err;
+
+ return fdt_finish(fdt);
+}
+
+static int memfd_luo_prepare(struct liveupdate_file_handler *handler,
+ struct file *file, u64 *data)
+{
+ struct memfd_luo_preserved_folio *preserved_folios;
+ struct inode *inode = file_inode(file);
+ unsigned int max_folios, nr_folios = 0;
+ int err = 0, preserved_size;
+ struct folio **folios;
+ long size, nr_pinned;
+ pgoff_t offset;
+ void *fdt;
+ u64 pos;
+
+ if (WARN_ON_ONCE(!shmem_file(file)))
+ return -EINVAL;
+
+ inode_lock(inode);
+ shmem_i_mapping_freeze(inode, true);
+
+ size = i_size_read(inode);
+ if ((PAGE_ALIGN(size) / PAGE_SIZE) > UINT_MAX) {
+ err = -E2BIG;
+ goto err_unlock;
+ }
+
+ /*
+ * Guess the number of folios based on inode size. Real number might end
+ * up being smaller if there are higher order folios.
+ */
+ max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
+ folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
+ if (!folios) {
+ err = -ENOMEM;
+ goto err_unfreeze;
+ }
+
+ /*
+ * Pin the folios so they don't move around behind our back. This also
+ * ensures none of the folios are in CMA -- which ensures they don't
+ * fall in KHO scratch memory. It also moves swapped out folios back to
+ * memory.
+ *
+ * A side effect of doing this is that it allocates a folio for all
+ * indices in the file. This might waste memory on sparse memfds. If
+ * that is really a problem in the future, we can have a
+ * memfd_pin_folios() variant that does not allocate a page on empty
+ * slots.
+ */
+ nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
+ &offset);
+ if (nr_pinned < 0) {
+ err = nr_pinned;
+ pr_err("failed to pin folios: %d\n", err);
+ goto err_free_folios;
+ }
+ /* nr_pinned won't be more than max_folios which is also unsigned int. */
+ nr_folios = (unsigned int)nr_pinned;
+
+ preserved_size = sizeof(struct memfd_luo_preserved_folio) * nr_folios;
+ if (check_mul_overflow(sizeof(struct memfd_luo_preserved_folio),
+ nr_folios, &preserved_size)) {
+ err = -E2BIG;
+ goto err_unpin;
+ }
+
+ /*
+ * Most of the space should be taken by preserved folios. So take its
+ * size, plus a page for other properties.
+ */
+ fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
+ if (!fdt) {
+ err = -ENOMEM;
+ goto err_unpin;
+ }
+
+ pos = file->f_pos;
+ err = fdt_property(fdt, "pos", &pos, sizeof(pos));
+ if (err)
+ goto err_free_fdt;
+
+ err = fdt_property(fdt, "size", &size, sizeof(size));
+ if (err)
+ goto err_free_fdt;
+
+ err = fdt_property_placeholder(fdt, "folios", preserved_size,
+ (void **)&preserved_folios);
+ if (err) {
+ pr_err("Failed to reserve folios property in FDT: %s\n",
+ fdt_strerror(err));
+ err = -ENOMEM;
+ goto err_free_fdt;
+ }
+
+ err = memfd_luo_preserve_folios(preserved_folios, folios, nr_folios);
+ if (err)
+ goto err_free_fdt;
+
+ err = memfd_luo_finish_fdt(fdt);
+ if (err)
+ goto err_unpreserve;
+
+ err = kho_preserve_folio(virt_to_folio(fdt));
+ if (err)
+ goto err_unpreserve;
+
+ kvfree(folios);
+ inode_unlock(inode);
+
+ *data = virt_to_phys(fdt);
+ return 0;
+
+err_unpreserve:
+ memfd_luo_unpreserve_folios(preserved_folios, nr_folios);
+err_free_fdt:
+ folio_put(virt_to_folio(fdt));
+err_unpin:
+ unpin_folios(folios, nr_pinned);
+err_free_folios:
+ kvfree(folios);
+err_unfreeze:
+ shmem_i_mapping_freeze(inode, false);
+err_unlock:
+ inode_unlock(inode);
+ return err;
+}
+
+static int memfd_luo_freeze(struct liveupdate_file_handler *handler,
+ struct file *file, u64 *data)
+{
+ u64 pos = file->f_pos;
+ void *fdt;
+ int err;
+
+ if (WARN_ON_ONCE(!*data))
+ return -EINVAL;
+
+ fdt = phys_to_virt(*data);
+
+ /*
+ * Only the file position can have changed since prepare: the mapping
+ * is frozen, so the size and everything else stay the same.
+ */
+ err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos));
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static void memfd_luo_cancel(struct liveupdate_file_handler *handler,
+ struct file *file, u64 data)
+{
+ const struct memfd_luo_preserved_folio *pfolios;
+ struct inode *inode = file_inode(file);
+ struct folio *fdt_folio;
+ void *fdt;
+ int len;
+
+ if (WARN_ON_ONCE(!data))
+ return;
+
+ inode_lock(inode);
+ shmem_i_mapping_freeze(inode, false);
+
+ fdt = phys_to_virt(data);
+ fdt_folio = virt_to_folio(fdt);
+ pfolios = fdt_getprop(fdt, 0, "folios", &len);
+ if (pfolios)
+ memfd_luo_unpreserve_folios(pfolios, len / sizeof(*pfolios));
+
+ kho_unpreserve_folio(fdt_folio);
+ folio_put(fdt_folio);
+ inode_unlock(inode);
+}
+
+static struct folio *memfd_luo_get_fdt(u64 data)
+{
+ return kho_restore_folio((phys_addr_t)data);
+}
+
+static void memfd_luo_finish(struct liveupdate_file_handler *handler,
+ struct file *file, u64 data, bool reclaimed)
+{
+ const struct memfd_luo_preserved_folio *pfolios;
+ struct folio *fdt_folio;
+ int len;
+
+ if (reclaimed)
+ return;
+
+ fdt_folio = memfd_luo_get_fdt(data);
+
+ pfolios = fdt_getprop(folio_address(fdt_folio), 0, "folios", &len);
+ if (pfolios)
+ memfd_luo_unpreserve_folios(pfolios, len / sizeof(*pfolios));
+
+ folio_put(fdt_folio);
+}
+
+static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data,
+ struct file **file_p)
+{
+ const struct memfd_luo_preserved_folio *pfolios;
+ int nr_pfolios, len, ret = 0, i = 0;
+ struct address_space *mapping;
+ struct folio *folio, *fdt_folio;
+ const u64 *pos, *size;
+ struct inode *inode;
+ struct file *file;
+ const void *fdt;
+
+ fdt_folio = memfd_luo_get_fdt(data);
+ if (!fdt_folio)
+ return -ENOENT;
+
+ fdt = page_to_virt(folio_page(fdt_folio, 0));
+
+ pfolios = fdt_getprop(fdt, 0, "folios", &len);
+ if (!pfolios || len % sizeof(*pfolios)) {
+ pr_err("invalid 'folios' property\n");
+ ret = -EINVAL;
+ goto put_fdt;
+ }
+ nr_pfolios = len / sizeof(*pfolios);
+
+ size = fdt_getprop(fdt, 0, "size", &len);
+ if (!size || len != sizeof(u64)) {
+ pr_err("invalid 'size' property\n");
+ ret = -EINVAL;
+ goto put_folios;
+ }
+
+ pos = fdt_getprop(fdt, 0, "pos", &len);
+ if (!pos || len != sizeof(u64)) {
+ pr_err("invalid 'pos' property\n");
+ ret = -EINVAL;
+ goto put_folios;
+ }
+
+ file = shmem_file_setup("", 0, VM_NORESERVE);
+
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ pr_err("failed to setup file: %d\n", ret);
+ goto put_folios;
+ }
+
+ inode = file->f_inode;
+ mapping = inode->i_mapping;
+ vfs_setpos(file, *pos, MAX_LFS_FILESIZE);
+
+ for (; i < nr_pfolios; i++) {
+ const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+ phys_addr_t phys;
+ u64 index;
+ int flags;
+
+ if (!pfolio->foliodesc)
+ continue;
+
+ phys = PFN_PHYS(PRESERVED_FOLIO_PFN(pfolio->foliodesc));
+ folio = kho_restore_folio(phys);
+ if (!folio) {
+ pr_err("Unable to restore folio at phys %llx\n", phys);
+ ret = -ENOENT;
+ goto put_file;
+ }
+ index = pfolio->index;
+ flags = PRESERVED_FOLIO_FLAGS(pfolio->foliodesc);
+
+ /* Set up the folio for insertion. */
+ /*
+ * TODO: Should find a way to unify this and
+ * shmem_alloc_and_add_folio().
+ */
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
+ if (ret) {
+ pr_err("shmem: failed to charge folio index %d: %d\n",
+ i, ret);
+ goto unlock_folio;
+ }
+
+ ret = shmem_add_to_page_cache(folio, mapping, index, NULL,
+ mapping_gfp_mask(mapping));
+ if (ret) {
+ pr_err("shmem: failed to add to page cache folio index %d: %d\n",
+ i, ret);
+ goto unlock_folio;
+ }
+
+ if (flags & PRESERVED_FLAG_UPTODATE)
+ folio_mark_uptodate(folio);
+ if (flags & PRESERVED_FLAG_DIRTY)
+ folio_mark_dirty(folio);
+
+ ret = shmem_inode_acct_blocks(inode, 1);
+ if (ret) {
+ pr_err("shmem: failed to account folio index %d: %d\n",
+ i, ret);
+ goto unlock_folio;
+ }
+
+ shmem_recalc_inode(inode, 1, 0);
+ folio_add_lru(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ inode->i_size = *size;
+ *file_p = file;
+ folio_put(fdt_folio);
+ return 0;
+
+unlock_folio:
+ folio_unlock(folio);
+ folio_put(folio);
+put_file:
+ fput(file);
+ i++;
+put_folios:
+ for (; i < nr_pfolios; i++) {
+ const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
+
+ folio = kho_restore_folio(PFN_PHYS(PRESERVED_FOLIO_PFN(pfolio->foliodesc)));
+ if (folio)
+ folio_put(folio);
+ }
+
+put_fdt:
+ folio_put(fdt_folio);
+ return ret;
+}
+
+static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
+ struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ return shmem_file(file) && !inode->i_nlink;
+}
+
+static const struct liveupdate_file_ops memfd_luo_file_ops = {
+ .prepare = memfd_luo_prepare,
+ .freeze = memfd_luo_freeze,
+ .cancel = memfd_luo_cancel,
+ .finish = memfd_luo_finish,
+ .retrieve = memfd_luo_retrieve,
+ .can_preserve = memfd_luo_can_preserve,
+ .owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler memfd_luo_handler = {
+ .ops = &memfd_luo_file_ops,
+ .compatible = memfd_luo_compatible,
+};
+
+static int __init memfd_luo_init(void)
+{
+ int err;
+
+ err = liveupdate_register_file_handler(&memfd_luo_handler);
+ if (err)
+ pr_err("Could not register luo filesystem handler: %d\n", err);
+
+ return err;
+}
+late_initcall(memfd_luo_init);
--
2.50.1.565.gc32cd1483b-goog
On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:

> +	/*
> +	 * Most of the space should be taken by preserved folios. So take its
> +	 * size, plus a page for other properties.
> +	 */
> +	fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE);
> +	if (!fdt) {
> +		err = -ENOMEM;
> +		goto err_unpin;
> +	}

This doesn't seem to have any versioning scheme, it really should..

> +	err = fdt_property_placeholder(fdt, "folios", preserved_size,
> +				       (void **)&preserved_folios);
> +	if (err) {
> +		pr_err("Failed to reserve folios property in FDT: %s\n",
> +		       fdt_strerror(err));
> +		err = -ENOMEM;
> +		goto err_free_fdt;
> +	}

Yuk.

This really wants some luo helper

 'luo alloc array'
 'luo restore array'
 'luo free array'

Which would get a linearized list of pages in the vmap to hold the
array and then allocate some structure to record the page list and
return back the u64 of the phys_addr of the top of the structure to
store in whatever.

Getting fdt to allocate the array inside the fds is just not going to
work for anything of size.

> +	for (; i < nr_pfolios; i++) {
> +		const struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> +		phys_addr_t phys;
> +		u64 index;
> +		int flags;
[...]
> +		ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
> +		if (ret) {
> +			pr_err("shmem: failed to charge folio index %d: %d\n",
> +			       i, ret);
> +			goto unlock_folio;
> +		}
[..]
> +		folio_add_lru(folio);
> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}

Probably some consolidation will be needed to make this less
duplicated..

But overall I think just using the memfd_luo_preserved_folio as the
serialization is entirely fine, I don't think this needs anything more
complicated.

What it does need is an alternative to the FDT with versioning.

Which seems to me to be entirely fine as:

struct memfd_luo_v0 {
	__aligned_u64 size;
	__aligned_u64 pos;
	__aligned_u64 folios;
};

struct memfd_luo_v0 memfd_luo_v0 = {.size = size, pos = file->f_pos, folios = folios};
luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);

Which also shows the actual data needing to be serialized comes from
more than one struct and has to be marshaled in code, somehow, to a
single struct.

Then I imagine a fairly simple forwards/backwards story. If something
new is needed that is non-optional, lets say you compress the folios
list to optimize holes:

struct memfd_luo_v1 {
	__aligned_u64 size;
	__aligned_u64 pos;
	__aligned_u64 folios_list_with_holes;
};

Obviously a v0 kernel cannot parse this, but in this case a v1 aware
kernel could optionally duplicate and write out the v0 format as well:

luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);

Then the rule is fairly simple, when the successor kernel goes to
deserialize it asks luo for the versions it supports:

if (luo_restore_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1))
	restore_v1(&memfd_luo_v1)
else if (luo_restore_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0))
	restore_v0(&memfd_luo_v0)
else
	luo_failure("Do not understand this");

luo core just manages this list of versioned data per serialized
object. There is only one version per object.

Jason
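
For readability, the fragments above can be pulled together into one
self-contained sketch. Everything in it is hypothetical:
luo_store_object() does not exist, its signature is only a guess at
what is described above, and the fd identifier is abstracted into a
plain token argument.

	/* Hypothetical sketch of the versioned-object idea above. */
	#include <linux/types.h>

	struct memfd_luo_v0 {
		__aligned_u64 size;
		__aligned_u64 pos;
		__aligned_u64 folios;	/* e.g. phys addr of the preserved folio array */
	};

	/* Assumed helper, not an existing kernel API. */
	int luo_store_object(const void *obj, size_t len, u64 token, u32 version);

	static int memfd_luo_store_v0(u64 token, u64 size, u64 pos, u64 folios_phys)
	{
		struct memfd_luo_v0 v0 = {
			.size = size,
			.pos = pos,
			.folios = folios_phys,
		};

		/* One versioned blob per object; a successor kernel asks for version 0. */
		return luo_store_object(&v0, sizeof(v0), token, 0);
	}
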
On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
[...]
> This really wants some luo helper
>
>  'luo alloc array'
>  'luo restore array'
>  'luo free array'

We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1

Will wait for kbuild and then send proper patches.

-- 
Sincerely yours,
Mike.
Hi Mike,

On Mon, Sep 01 2025, Mike Rapoport wrote:
> On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
[...]
> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
>
> Will wait for kbuild and then send proper patches.

I have been working on something similar, but in a more generic way.

I have implemented a sparse KHO-preservable array (called kho_array)
with xarray like properties. It can take in 4-byte aligned pointers and
supports saving non-pointer values similar to xa_mk_value(). For now it
doesn't support multi-index entries, but if needed the data format can
be extended to support it as well.

The structure is very similar to what you have implemented. It uses a
linked list of pages with some metadata at the head of each page.

I have used it for memfd preservation, and I think it is quite
versatile. For example, your kho_preserve_vmalloc() can be very easily
built on top of this kho_array by simply saving each physical page
address at consecutive indices in the array.

The code is still WIP and currently a bit hacky, but I will clean it up
in a couple days and I think it should be ready for posting. You can
find the current version at [0][1]. Would be good to hear your thoughts,
and if you agree with the approach, I can also port
kho_preserve_vmalloc() to work on top of kho_array as well.

[0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af
[1] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=5eb0d7316274a9c87acaeedd86941979fc4baf96

-- 
Regards,
Pratyush Yadav
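
To make the kho_array idea above a bit more concrete, here is a rough
sketch of what a call site might look like. All of these names
(struct kho_array, ka_store(), ka_mk_value(), kho_preserve_kho_array())
are hypothetical placeholders for the WIP code linked above, not an
existing kernel API.

	/*
	 * Hypothetical sketch only: the kho_array type and the ka_*() /
	 * kho_preserve_kho_array() helpers are assumptions, not existing code.
	 */
	#include <linux/mm.h>

	static int sketch_preserve_folios(struct kho_array *ka,
					  struct folio **folios,
					  unsigned int nr_folios)
	{
		unsigned int i;
		int err;

		for (i = 0; i < nr_folios; i++) {
			/* Indices may be sparse; unused slots simply stay empty. */
			err = ka_store(ka, folios[i]->index,
				       ka_mk_value(folio_pfn(folios[i])));
			if (err)
				return err;
		}

		/* Hand the whole array to KHO for preservation across kexec. */
		return kho_preserve_kho_array(ka);
	}
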
Hi Pratyush,

On Mon, Sep 01, 2025 at 07:01:38PM +0200, Pratyush Yadav wrote:
> On Mon, Sep 01 2025, Mike Rapoport wrote:
> > We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> > https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
[...]
> I have implemented a sparse KHO-preservable array (called kho_array)
> with xarray like properties. It can take in 4-byte aligned pointers and
> supports saving non-pointer values similar to xa_mk_value(). For now it
> doesn't support multi-index entries, but if needed the data format can
> be extended to support it as well.
>
> The structure is very similar to what you have implemented. It uses a
> linked list of pages with some metadata at the head of each page.
>
> I have used it for memfd preservation, and I think it is quite
> versatile. For example, your kho_preserve_vmalloc() can be very easily
> built on top of this kho_array by simply saving each physical page
> address at consecutive indices in the array.

I've started to work on something similar to your kho_array for the
memfd case and then I thought that since we know the size of the array
we can simply vmalloc it and preserve the vmalloc, and that led me to
implementing preservation of vmalloc :)

I like the idea of having kho_array for cases when we don't know the
amount of data to preserve in advance, but for memfd as it's currently
implemented I think that allocating and preserving a vmalloc is simpler.

As for porting kho_preserve_vmalloc() to kho_array, I also feel that it
would just make kho_preserve_vmalloc() more complex and I'd rather
simplify it even more, e.g. by preallocating all the pages that preserve
indices in advance.

[...]

-- 
Sincerely yours,
Mike.
Hi Mike,

On Tue, Sep 02 2025, Mike Rapoport wrote:
> On Mon, Sep 01, 2025 at 07:01:38PM +0200, Pratyush Yadav wrote:
[...]
> As for porting kho_preserve_vmalloc() to kho_array, I also feel that it
> would just make kho_preserve_vmalloc() more complex and I'd rather
> simplify it even more, e.g. by preallocating all the pages that preserve
> indices in advance.

I think there are two parts here. One is the data format of the KHO
array and the other is the way to build it. I think the format is quite
simple and versatile, and we can have many strategies of building it.

For example, if you are only concerned with pre-allocating data, I can
very well add a way to initialize the KHO array with a fixed size up
front.

Beyond that, I think KHO array will actually make kho_preserve_vmalloc()
simpler since it won't have to deal with the linked list traversal
logic. It can just do ka_for_each() and get all the pages.

We can also convert the preservation bitmaps to use it so the linked
list logic is in one place, and others just build on top of it.

[...]

-- 
Regards,
Pratyush Yadav
Hi Pratyush,

On Wed, Sep 03, 2025 at 04:17:15PM +0200, Pratyush Yadav wrote:
> On Tue, Sep 02 2025, Mike Rapoport wrote:
> >
> > As for porting kho_preserve_vmalloc() to kho_array, I also feel that it
> > would just make kho_preserve_vmalloc() more complex and I'd rather
> > simplify it even more, e.g. by preallocating all the pages that preserve
> > indices in advance.
>
> I think there are two parts here. One is the data format of the KHO
> array and the other is the way to build it. I think the format is quite
> simple and versatile, and we can have many strategies of building it.
>
> For example, if you are only concerned with pre-allocating data, I can
> very well add a way to initialize the KHO array with a fixed size up
> front.

I wasn't concerned with preallocation vs allocating a page at a time, I
thought with preallocation the vmalloc code would become even simpler,
but it's not :)

> Beyond that, I think KHO array will actually make kho_preserve_vmalloc()
> simpler since it won't have to deal with the linked list traversal
> logic. It can just do ka_for_each() and get all the pages.
>
> We can also convert the preservation bitmaps to use it so the linked
> list logic is in one place, and others just build on top of it.

I disagree. The boilerplate to initialize and iterate the kho_array will
not make either vmalloc or bitmap preservation simpler IMO.

And for bitmaps Pasha and Jason M. are anyway working on a different
data structure already, so if their proposal moves forward converting
bitmap preservation to anything would be a wasted effort.

-- 
Sincerely yours,
Mike.
Hi Mike,

On Wed, Sep 03 2025, Mike Rapoport wrote:
> On Wed, Sep 03, 2025 at 04:17:15PM +0200, Pratyush Yadav wrote:
[...]
> I disagree. The boilerplate to initialize and iterate the kho_array will
> not make either vmalloc or bitmap preservation simpler IMO.

I have done 80% of the work on this already, so let's do this: I will do
the remaining 20% and publish the patches. Then you and Jason can have a
look and if you still think it's not worth it, I am fine shelving it for
now and revisiting later when there might be a stronger case.

> And for bitmaps Pasha and Jason M. are anyway working on a different
> data structure already, so if their proposal moves forward converting
> bitmap preservation to anything would be a wasted effort.

-- 
Regards,
Pratyush Yadav
On Mon, Sep 1, 2025 at 4:23 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote:
[...]
> > This really wants some luo helper
> >
> >  'luo alloc array'
> >  'luo restore array'
> >  'luo free array'
>
> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1

The patch looks okay to me, but it doesn't support holes in vmap
areas. While that is likely acceptable for vmalloc, it could be a
problem if we want to preserve a memfd with holes using vmap
preservation as the method, which would require a different approach.
Still, this would help with preserving memfd.

However, I wonder if we should add a separate preservation library on
top of the kho and not as part of kho (or at least keep them in a
separate file from core logic). This would allow us to preserve more
advanced data structures such as this and define preservation version
control, similar to Jason's store_object/restore_object proposal.

> Will wait for kbuild and then send proper patches.
>
> --
> Sincerely yours,
> Mike.
On Mon, Sep 01, 2025 at 04:54:15PM +0000, Pasha Tatashin wrote:
> On Mon, Sep 1, 2025 at 4:23 PM Mike Rapoport <rppt@kernel.org> wrote:
> > We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> > https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
>
> The patch looks okay to me, but it doesn't support holes in vmap
> areas. While that is likely acceptable for vmalloc, it could be a
> problem if we want to preserve a memfd with holes using vmap
> preservation as the method, which would require a different approach.
> Still, this would help with preserving memfd.

I can't say I understand what you mean by "holes in vmap areas". We
anyway get an array of folios in memfd_pin_folios() and at that point we
know exactly how many folios there are. So we can do something like

	preserved_folios = vmalloc_array(nr_folios, sizeof(*preserved_folios));
	memfd_luo_preserve_folios(preserved_folios, folios, nr_folios);
	kho_preserve_vmalloc(preserved_folios, &folios_info);

> However, I wonder if we should add a separate preservation library on
> top of the kho and not as part of kho (or at least keep them in a
> separate file from core logic). This would allow us to preserve more
> advanced data structures such as this and define preservation version
> control, similar to Jason's store_object/restore_object proposal.

kho_preserve_vmalloc() seems quite basic and I don't think it should be
separated from kho core. kho_array is already planned in a separate
file :)

> > Will wait for kbuild and then send proper patches.

-- 
Sincerely yours,
Mike.
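
To round out that flow, the restore side in the next kernel might look
roughly like the sketch below. This is only an illustration against the
draft branch linked above: the exact signatures of
kho_preserve_vmalloc()/kho_restore_vmalloc() and the folios_info handle
are assumptions, not a settled API.

	/*
	 * Sketch only: kho_restore_vmalloc() and the u64 folios_info handle
	 * are assumptions about the draft API, not existing kernel code.
	 */
	#include <linux/vmalloc.h>

	static struct memfd_luo_preserved_folio *
	sketch_restore_preserved_folios(u64 folios_info, unsigned int *nr_folios)
	{
		struct memfd_luo_preserved_folio *pfolios;
		size_t size;

		/* Rebuild a virtually contiguous view of the preserved pages. */
		pfolios = kho_restore_vmalloc(folios_info, &size);
		if (!pfolios)
			return NULL;

		*nr_folios = size / sizeof(*pfolios);
		/* Entries can now be walked just like memfd_luo_retrieve() does. */
		return pfolios;
	}
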
Hi Pasha,

On Mon, Sep 01 2025, Pasha Tatashin wrote:
> On Mon, Sep 1, 2025 at 4:23 PM Mike Rapoport <rppt@kernel.org> wrote:
[...]
> The patch looks okay to me, but it doesn't support holes in vmap
> areas. While that is likely acceptable for vmalloc, it could be a
> problem if we want to preserve a memfd with holes using vmap
> preservation as the method, which would require a different approach.
> Still, this would help with preserving memfd.

I agree. I think we should do it the other way round. Build a sparse
array first, and then use that to build vmap preservation. Our emails
seem to have crossed, but see my reply to Mike [0] that describes my
idea a bit more, along with WIP code.

[0] https://lore.kernel.org/lkml/mafs0ldmyw1hp.fsf@kernel.org/

> However, I wonder if we should add a separate preservation library on
> top of the kho and not as part of kho (or at least keep them in a
> separate file from core logic). This would allow us to preserve more
> advanced data structures such as this and define preservation version
> control, similar to Jason's store_object/restore_object proposal.

This is how I have done it in my code: created a separate file called
kho_array.c. If we have enough such data structures, we can probably
move it under kernel/liveupdate/lib/.

As for the store_object/restore_object proposal: see an alternate idea
at [1].

[1] https://lore.kernel.org/lkml/mafs0h5xmw12a.fsf@kernel.org/

-- 
Regards,
Pratyush Yadav
> >> > This really wants some luo helper
> >> >
> >> >  'luo alloc array'
> >> >  'luo restore array'
> >> >  'luo free array'
> >>
> >> We can just add kho_{preserve,restore}_vmalloc(). I've drafted it here:
> >> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=kho/vmalloc/v1
> >
> > The patch looks okay to me, but it doesn't support holes in vmap
> > areas. While that is likely acceptable for vmalloc, it could be a
> > problem if we want to preserve a memfd with holes using vmap
> > preservation as the method, which would require a different approach.
> > Still, this would help with preserving memfd.
>
> I agree. I think we should do it the other way round. Build a sparse
> array first, and then use that to build vmap preservation. Our emails
> seem to have crossed, but see my reply to Mike [0] that describes my
> idea a bit more, along with WIP code.

Yes, sparse array support would help both: vmalloc and memfd preservation.

> [0] https://lore.kernel.org/lkml/mafs0ldmyw1hp.fsf@kernel.org/
>
> This is how I have done it in my code: created a separate file called
> kho_array.c. If we have enough such data structures, we can probably
> move it under kernel/liveupdate/lib/.

Yes, let's place it under kernel/liveupdate/lib/. We will add more
preservation types over time.

> As for the store_object/restore_object proposal: see an alternate idea
> at [1].
>
> [1] https://lore.kernel.org/lkml/mafs0h5xmw12a.fsf@kernel.org/

What you are proposing makes sense. We can update the LUO API to be
responsible for passing the compatible string outside of the data
payload. However, I think we first need to settle on the actual API for
storing and restoring a versioned blob of data and place that code into
kernel/liveupdate/lib/. Depending on which API we choose, we can then
modify the LUO to work accordingly.

> --
> Regards,
> Pratyush Yadav
On Mon, Sep 01, 2025 at 07:02:46PM +0000, Pasha Tatashin wrote:
[...]
> > I agree. I think we should do it the other way round. Build a sparse
> > array first, and then use that to build vmap preservation.
>
> Yes, sparse array support would help both: vmalloc and memfd preservation.

Why? vmalloc is always fully populated, no sparseness..

And again in real systems we expect memfd to be fully populated too.

I wouldn't invest any time in something like this right now. Just be
inefficient if there is sparseness for some reason.

Jason
> > > > The patch looks okay to me, but it doesn't support holes in vmap
> > > > areas. While that is likely acceptable for vmalloc, it could be a
> > > > problem if we want to preserve a memfd with holes using vmap
> > > > preservation as the method, which would require a different approach.
> > > > Still, this would help with preserving memfd.
> > >
> > > I agree. I think we should do it the other way round. Build a sparse
> > > array first, and then use that to build vmap preservation.
> >
> > Yes, sparse array support would help both: vmalloc and memfd preservation.
>
> Why? vmalloc is always fully populated, no sparseness..

vmalloc is always fully populated, but if we add support for
preserving an area with holes, it can also be used for preserving
vmalloc. By the way, I don't like calling it *vmalloc* preservation
because we aren't preserving the original virtual addresses; we are
preserving a list of pages that are reassembled into a virtually
contiguous area. Maybe kho map, or kho page map, not sure, but vmalloc
does not sound right to me.

> And again in real systems we expect memfd to be fully populated too.

I thought so too, but we already have a use case for slightly sparse
memfd that, unfortunately, becomes *very* inefficient when fully
populated.
On Wed, Sep 03, 2025 at 03:59:40PM +0000, Pasha Tatashin wrote:
> > And again in real systems we expect memfd to be fully populated too.
>
> I thought so too, but we already have a use case for slightly sparse
> memfd that, unfortunately, becomes *very* inefficient when fully
> populated.

Wait, regardless of how sparse memfd is, once you memfd_pin_folios()
the number of folios to preserve is known and the metadata to preserve
is a fully populated array.

-- 
Sincerely yours,
Mike.
On Wed, Sep 03, 2025 at 03:59:40PM +0000, Pasha Tatashin wrote:
> vmalloc is always fully populated, but if we add support for
> preserving an area with holes, it can also be used for preserving
> vmalloc.

Why? If you can't create it with vmap what is the point?

> By the way, I don't like calling it *vmalloc* preservation
> because we aren't preserving the original virtual addresses; we are
> preserving a list of pages that are reassembled into a virtually
> contiguous area. Maybe kho map, or kho page map, not sure, but vmalloc
> does not sound right to me.

No preservation retains the virtual address, that is pretty much
universal.

It is vmalloc preservation because the flow is

   x = vmalloc()
   kho_preserve_vmalloc(x, &preserved)
   [..]
   x = kho_restore_vmalloc(preserved)
   vfree(x)

It is the same naming as folio preservation. Upon restore you get a
vmalloc() back.

> > And again in real systems we expect memfd to be fully populated too.
>
> I thought so too, but we already have a use case for slightly sparse
> memfd that, unfortunately, becomes *very* inefficient when fully
> populated.

Really? Why not use multiple memfds :(

So maybe you need to do optimized sparseness in memfd :(

Jason
On Tue, Aug 26, 2025 at 9:20 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote:
[...]
> This really wants some luo helper
>
>  'luo alloc array'
>  'luo restore array'
>  'luo free array'

Yes, that will be one step forward.

Another idea is having a middle layer that manages the life cycle of
the preserved memory for you. Kind of like a slab allocator for the
preserved memory. It allows bulk free: if there is an error on the live
update prepare(), you need to free all previously allocated memory
anyway. If there is some preserved memory that needs to stay for a long
term after the live update kernel boots up, use some special flags to
indicate that so it doesn't mix with the free_all pool.

> Which would get a linearized list of pages in the vmap to hold the
> array and then allocate some structure to record the page list and
> return back the u64 of the phys_addr of the top of the structure to
> store in whatever.
>
> Getting fdt to allocate the array inside the fds is just not going to
> work for anything of size.
[...]
> What it does need is an alternative to the FDT with versioning.
>
> Which seems to me to be entirely fine as:
>
> struct memfd_luo_v0 {
> 	__aligned_u64 size;
> 	__aligned_u64 pos;
> 	__aligned_u64 folios;
> };
>
> struct memfd_luo_v0 memfd_luo_v0 = {.size = size, pos = file->f_pos, folios = folios};
> luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);

Question: Do we have a matching FDT node to match the memfd C
structure hierarchy? Otherwise all the C structs will lump into one FDT
node. Maybe one FDT node for all C structs is fine. Then there is a
risk of overflowing the 4K buffer limit on the FDT node.

I would like to get independent of FDT for the versioning. FDT on the
top level sounds OK. Not ideal but workable. We are getting deeper and
deeper into complex internal data structures. Do we still want every
data structure referenced by a FDT identifier?

> Then the rule is fairly simple, when the successor kernel goes to
> deserialize it asks luo for the versions it supports:
>
> if (luo_restore_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1))
> 	restore_v1(&memfd_luo_v1)
> else if (luo_restore_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0))
> 	restore_v0(&memfd_luo_v0)
> else
> 	luo_failure("Do not understand this");
>
> luo core just manages this list of versioned data per serialized
> object. There is only one version per object.

Obviously, this can be done. Is that the approach you want to expand to
every other C struct as well? See the above FDT node complexity.

I am getting the feeling that we are hand crafting screws to build an
airplane. Can it be done? Of course. Does it scale well? I am not sure.

There are many developers who are currently hand-crafting this kind of
screw to be used on the different components of the airplane. We need a
machine that can stamp out screws to our specifications, faster. I want
such a machine. Other developers might want one as well. The initial
discussion of the idea of such a machine is pretty discouraged. There
are huge communication barriers because of the fixation on hand crafted
screws. I understand exploring such machine ideas alone might distract
the engineer from hand crafting more screws, but one of them might
realize that, oh, I want such a machine as well.

At this stage, do you see that exploring such a machine idea can be
beneficial or harmful to the project? If such an idea is considered
harmful, we should stop discussing such an idea at all, and go back to
building more batches of hand crafted screws, which are awaited by the
next critical component.

Also, if such a machine can produce screws up to your specification,
but with a different look and feel than the hand crafted screws, we can
stamp out the screws faster. Would you consider putting such a machined
screw on the most critical component of your engine?

Best Regards,

Chris
On Fri, Aug 29, 2025 at 12:18:43PM -0700, Chris Li wrote:

> Another idea is having a middle layer that manages the life cycle of
> the preserved memory for you. Kind of like a slab allocator for the
> preserved memory.

If you want a slab allocator then I think you should make slab
preservable.. Don't need more allocators :\

> Question: Do we have a matching FDT node to match the memfd C
> structure hierarchy? Otherwise all the C structs will lump into one FDT
> node. Maybe one FDT node for all C structs is fine. Then there is a
> risk of overflowing the 4K buffer limit on the FDT node.

I thought you were getting rid of FDT? My suggestion was to be taken
as a FDT replacement..

You need some kind of hierarchy of identifiers, things like memfd
should chain off some higher level luo object for a file descriptor.

PCI should be the same, but not fd based.

It may be that luo maintains some flat dictionary of

  string -> [object type, version, u64 ptr]*

And if you want to serialize that the optimal path would be to have a
vmalloc of all the strings and a vmalloc of the [] data, sort of like
the kho array idea.

> At this stage, do you see that exploring such a machine idea can be
> beneficial or harmful to the project? If such an idea is considered
> harmful, we should stop discussing such an idea at all, and go back to
> building more batches of hand crafted screws, which are awaited by the
> next critical component.

I haven't heard a compelling idea that will obviously make things
better.. Adding more layers and complexity is not better.

Your BTF proposal doesn't seem to benefit memfd at all, it was focused
on extracting data directly from an existing struct which I feel very
strongly we should never do.

The above dictionary, I also don't see how BTF helps. It is such a
special encoding. Yes you could make some elaborate serialization
infrastructure, like FDT, but we have all been saying FDT is too hard
to use and too much code. I'm not sure I'm convinced there is really a
better middle ground :\

IMHO if there is some way to improve this it is still yet to be found,
and I think we don't well understand what we need to serialize just
yet.

Smaller ideas like preserving the vmalloc will make a big improvement
already.

Let's not race ahead until we understand the actual problem properly.

Jason
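
To make the flat dictionary idea above a bit more concrete, here is a
rough sketch of what such an entry table could look like. None of these
names exist in the kernel today; the struct, its fields, and the
serialization layout are purely illustrative assumptions about the idea
described above.

	/* Hypothetical sketch; not an existing kernel API or format. */
	#include <linux/types.h>

	struct luo_object_entry {
		u64 name;		/* phys addr of a NUL-terminated identifier string */
		u32 object_type;	/* hypothetical: which handler parses the payload */
		u32 version;		/* format version of the payload */
		u64 data;		/* phys addr of the payload, e.g. a preserved array */
	};

	/*
	 * Serialization could then be two linear regions handed to KHO: one
	 * holding all identifier strings back to back, and one holding the
	 * luo_object_entry array itself, e.g. both preserved with the
	 * vmalloc preservation discussed earlier in this thread.
	 */
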
On Tue, Sep 2, 2025 at 6:42 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Fri, Aug 29, 2025 at 12:18:43PM -0700, Chris Li wrote: > > > Another idea is that having a middle layer manages the life cycle of > > the reserved memory for you. Kind of like a slab allocator for the > > preserved memory. > > If you want a slab allocator then I think you should make slab > preservable.. Don't need more allocators :\ Sure, we can reuse the slab allocator to add the KHO function to it. I consider that as the implementation detail side, I haven't even started yet. I just want to point out that we might want to have a high level library to take care of the life cycle of the preserved memory. Less boilerplate code for the caller. > > Question: Do we have a matching FDT node to match the memfd C > > structure hierarchy? Otherwise all the C struct will lump into one FDT > > node. Maybe one FDT node for all C struct is fine. Then there is a > > risk of overflowing the 4K buffer limit on the FDT node. > > I thought you were getting rid of FDT? My suggestion was to be taken > as a FDT replacement.. Thanks for the clarification. Yes, I do want to get rid of FDT, very much so. If we are not using FDT, adding an object might change the underlying C structure layout causing a chain reaction of C struct change back to the root. That is where I assume you might be still using FDT. I see your later comments address that with a list of objects. I will discuss it there. > You need some kind of hierarchy of identifiers, things like memfd > should chain off some higher level luo object for a file descriptor. Ack. > > PCI should be the same, but not fd based. Ack. > It may be that luo maintains some flat dictionary of > string -> [object type, version, u64 ptr]* I see, got it. That answers my question of how to add a new object without changing the C structure layout. You are using a list of the same C structure. When adding more objects to it, just add more items to the list. This part of the boiler plate detail is not mentioned in your original suggestion. I understand your proposal better now. > And if you want to serialize that the optimal path would be to have a > vmalloc of all the strings and a vmalloc of the [] data, sort of like > the kho array idea. The KHO array idea is already implemented in the existing KHO code or that is something new you want to propose? Then we will have to know the combined size of the string up front, similar to the FDT story. Ideally the list can incrementally add items to it. May be stored as a list as raw pointer without vmalloc first,then have a final pass vmalloc and serialize the string and data. With the additional detail above, I would like to point out something I have observed earlier: even though the core idea of the native C struct is simple and intuitive, the end of end implementation is not. When we compare C struct implementation, we need to include all those additional boilerplate details as a whole, otherwise it is not a apple to apple comparison. > > At this stage, do you see that exploring such a machine idea can be > > beneficial or harmful to the project? If such an idea is considered > > harmful, we should stop discussing such an idea at all. Go back to > > building more batches of hand crafted screws, which are waiting by the > > next critical component. > > I haven't heard a compelling idea that will obviously make things > better.. Adding more layers and complexity is not better. 
Yes, I completely understand how you reason it, and I agree with your assessment. I would like to add that you have been heavily discounting the boilerplate stuff in the C struct solution. Here is where our viewpoints might differ: if the "more layers" have their counterparts in the C struct solution as well, then it is not "more", it is the necessary evil. We need to compare apples to apples.

> Your BTF proposal doesn't seem to benefit memfd at all; it was focused on extracting data directly from an existing struct, which I feel very strongly we should never do.

From a data flow point of view, the data is read from a C struct and eventually stored into a C struct. There is no way around that. That is the necessary evil if you automate this process. Hey, there is also no rule saying that you can't use a bounce buffer or some kind of manual control in between. It is just a way to automate stuff to reduce the boilerplate. We can put different labels on that and argue that this label or that concept is bad. Your C struct approach has the exact same thing: pulling data from a C struct and storing it into a C struct. It is just the labels we are arguing about, this label is good and that label is bad; underneath there is the same common necessary evil.

> As for the above dictionary, I also don't see how BTF helps. It is such a special encoding. Yes, you could make some elaborate serialization infrastructure, like FDT, but we have all been saying FDT is too hard to use and too much code. I'm not sure I'm convinced there is really a

Are you ready to be convinced? If you keep this as a religion you can never be convinced. The reasons FDT is too hard to use lie elsewhere: FDT is designed to be constructed by offline tools, and in the kernel it is mostly just read-only. We are using FDT outside of its original design parameters. That does not mean that something (the machine) specially designed for this purpose can't be built and be easier to use.

> better middle ground :\

With due respect, it sounds like you run the risk of judging something you haven't fully understood. I feel that a baby, my baby, has been thrown out with the bathwater.

As a test of the above statement, can you describe my idea as well as or better than I can, so that it passes the test of me saying: "yes, this is exactly what I am trying to build"? That is the communication barrier I am talking about. I estimate at this rate it will take us about 15 email exchanges to get to the core stuff. It might be much quicker to lock you and me in a room and only release us when you and I can describe each other's viewpoint at a mutually satisfactory level.

I understand your time is precious, and I don't want to waste it. I fully respect and will comply with your decision. If you want me to stop now, I can stop. No questions asked. That gets back to my original question: do we already have a ruling that even the discussion of "the machine" idea is forbidden?

> IMHO if there is some way to improve this it has yet to be found,

In my mind, I have found it. I have to get over the communication barrier to plead my case to you. You can issue a preliminary ruling to dismiss my case; I just wish you fully understood the case facts before you make such a ruling.

> and I think we don't yet understand well what we need to serialize.

That may be true; we don't have a 100% understanding of what needs to be serialized. On the other hand, it is not 0% either. Based on what we understand, we can already use "the machine" to help us do what we know much more effectively.
Of course, there is a trade-off for developing "the machine". It takes extra time, plus the complexity of maintaining such a machine. I fully understand that.

> Smaller ideas like preserving the vmalloc will already be a big improvement.

Yes, I totally agree. It is a local optimization we can do, though it might not be the global optimum. "The machine" might not use vmalloc at all; all these small incremental changes will be thrown away once we have "the machine". To put this in the airplane story: yes, we build diamond-plated files to produce the hand-crafted screws faster. The missed opportunity is that, if we had "the machine" earlier, we could pump out machined screws much faster at scale; minus the time to build the machine, it might still be an overall win. We wouldn't need the diamond-plated files if we had the machine.

> Let's not race ahead until we understand the actual problem properly.

Is that the final ruling? It feels like it. Just clarifying what I am receiving.

I feel a much stronger sense of urgency than you do, though. The stakes are high: you already have four departments that can use this common serialization library right now:
1) PCI
2) VFIO
3) IOMMU
4) Memfd.

We are getting into the more complex data structures. If we merge this into the mainline, it is much harder to pull it out later. Basically, this is a done deal. That is why I am putting my reputation and my job on the line to pitch "the machine" idea. It is a very risky move; I fully understand that.

Chris
On Wed, Sep 03, 2025 at 05:01:15AM -0700, Chris Li wrote:
> > And if you want to serialize that, the optimal path would be to have a vmalloc of all the strings and a vmalloc of the [] data, sort of like the kho array idea.
>
> Is the KHO array idea already implemented in the existing KHO code, or is that something new you want to propose?

Pratyush has proposed it.

> Then we will have to know the combined size of the strings up front, similar to the FDT story. Ideally the list can incrementally add items to it. Maybe store it as a list of raw pointers without vmalloc first, then have a final pass that vmallocs and serializes the strings and data.

There are many options, and the dynamic extendability from the KHO array might be a good fit here. But you can also just store the serializations in a linked list and then write them out.

> With the additional detail above, I would like to point out something I have observed earlier: even though the core idea of the native C struct is simple and intuitive, the end-to-end implementation is not. When we compare the C struct implementation, we need to include all those additional boilerplate details as a whole, otherwise it is not an apples-to-apples comparison.

You need all of this anyhow; BTF doesn't create version metadata, evaluate which versions are suitable, or deserialize complex rbtree or linked list structures.

> > Your BTF proposal doesn't seem to benefit memfd at all; it was focused on extracting data directly from an existing struct, which I feel very strongly we should never do.
>
> From a data flow point of view, the data is read from a C struct and eventually stored into a C struct. There is no way around that. That is the necessary evil if you automate this process. Hey, there is also no rule saying that you can't use a bounce buffer or some kind of manual control in between.

Yeah, but if I already wrote the code to make the required C struct, the only difference is 'memcpy the C struct' vs 'serialize the C struct with BTF', and that isn't meaningful.

If the boilerplate is around arrays of C structs and things, then the KHO array proposal is a good direction to de-duplicate code.

> It is just a way to automate stuff to reduce the boilerplate.

You haven't clearly spelled out what the boilerplate even is; this was my feedback to you, to be very clear on what is being improved.

> I feel a much stronger sense of urgency than you do, though. The stakes are high: you already have four departments that can use this common serialization library right now:
> 1) PCI
> 2) VFIO
> 3) IOMMU
> 4) Memfd.

We don't know what they actually need to write out; we haven't seen any patches. Let's start with simple patches and deal with the fundamental problems like versioning, then you can come back with ideas to optimize if it turns out there is something to improve here.

I'm not convinced PCI (a few bits per struct pci_device to start), memfd (xarray) and IOMMU (dictionaries of HW physical pointers) share a significant overlap of serialization requirements beyond the luo level managing the objects and versioning.

Jason
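For the "store the serializations in a linked list and then write them out" option, a minimal sketch could be the following; struct luo_ser_item and luo_ser_flatten() are invented names, not part of the current series:

  #include <linux/list.h>
  #include <linux/string.h>
  #include <linux/vmalloc.h>

  /* One serialized blob per object, collected as objects are prepared. */
  struct luo_ser_item {
  	struct list_head node;
  	size_t len;
  	void *blob;
  };

  /* Final pass: copy all collected blobs into one linear allocation. */
  static void *luo_ser_flatten(struct list_head *items, size_t *out_len)
  {
  	struct luo_ser_item *item;
  	size_t total = 0, off = 0;
  	void *buf;

  	list_for_each_entry(item, items, node)
  		total += item->len;
  	if (!total)
  		return NULL;

  	buf = vmalloc(total);
  	if (!buf)
  		return NULL;

  	list_for_each_entry(item, items, node) {
  		memcpy(buf + off, item->blob, item->len);
  		off += item->len;
  	}

  	*out_len = total;
  	return buf;
  }

The resulting buffer (and any strings it refers to) would still need to be preserved across kexec by whatever page or vmalloc preservation KHO ends up providing.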
On Thu, Sep 04 2025, Jason Gunthorpe wrote:
> On Wed, Sep 03, 2025 at 05:01:15AM -0700, Chris Li wrote:
>
>> > And if you want to serialize that, the optimal path would be to have a vmalloc of all the strings and a vmalloc of the [] data, sort of like the kho array idea.
>>
>> Is the KHO array idea already implemented in the existing KHO code, or is that something new you want to propose?
>
> Pratyush has proposed it.

I just sent out the RFC:
https://lore.kernel.org/linux-mm/20250909144426.33274-1-pratyush@kernel.org/T/#u

[...]

--
Regards,
Pratyush Yadav
On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote: > On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote: > > > + err = fdt_property_placeholder(fdt, "folios", preserved_size, > > + (void **)&preserved_folios); > > + if (err) { > > + pr_err("Failed to reserve folios property in FDT: %s\n", > > + fdt_strerror(err)); > > + err = -ENOMEM; > > + goto err_free_fdt; > > + } > > Yuk. > > This really wants some luo helper > > 'luo alloc array' > 'luo restore array' > 'luo free array' > > Which would get a linearized list of pages in the vmap to hold the > array and then allocate some structure to record the page list and > return back the u64 of the phys_addr of the top of the structure to > store in whatever. > > Getting fdt to allocate the array inside the fds is just not going to > work for anything of size. I agree that we need a side-car structure for preserving large (potentially sparse) arrays, but I think it should be a part of KHO rather than LUO. -- Sincerely yours, Mike.
On Thu, Aug 28, 2025 at 12:14 AM Mike Rapoport <rppt@kernel.org> wrote: > > On Tue, Aug 26, 2025 at 01:20:19PM -0300, Jason Gunthorpe wrote: > > On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote: > > > > > + err = fdt_property_placeholder(fdt, "folios", preserved_size, > > > + (void **)&preserved_folios); > > > + if (err) { > > > + pr_err("Failed to reserve folios property in FDT: %s\n", > > > + fdt_strerror(err)); > > > + err = -ENOMEM; > > > + goto err_free_fdt; > > > + } > > > > Yuk. > > > > This really wants some luo helper > > > > 'luo alloc array' > > 'luo restore array' > > 'luo free array' > > > > Which would get a linearized list of pages in the vmap to hold the > > array and then allocate some structure to record the page list and > > return back the u64 of the phys_addr of the top of the structure to > > store in whatever. > > > > Getting fdt to allocate the array inside the fds is just not going to > > work for anything of size. > > I agree that we need a side-car structure for preserving large (potentially > sparse) arrays, but I think it should be a part of KHO rather than LUO. I agree this can be used by components outside of LUO as well. Ideally as some helper library so every component can use it. I don't have a strong opinion on KHO or the stand alone library. I am fine with both. Chris
Hi Jason, Thanks for the review. On Tue, Aug 26 2025, Jason Gunthorpe wrote: > On Thu, Aug 07, 2025 at 01:44:35AM +0000, Pasha Tatashin wrote: > >> + /* >> + * Most of the space should be taken by preserved folios. So take its >> + * size, plus a page for other properties. >> + */ >> + fdt = memfd_luo_create_fdt(PAGE_ALIGN(preserved_size) + PAGE_SIZE); >> + if (!fdt) { >> + err = -ENOMEM; >> + goto err_unpin; >> + } > > This doesn't seem to have any versioning scheme, it really should.. It does. See the "compatible" property. static const char memfd_luo_compatible[] = "memfd-v1"; static struct liveupdate_file_handler memfd_luo_handler = { .ops = &memfd_luo_file_ops, .compatible = memfd_luo_compatible, }; This goes into the LUO FDT: static int luo_files_to_fdt(struct xarray *files_xa_out) [...] xa_for_each(files_xa_out, token, h) { [...] ret = fdt_property_string(luo_file_fdt_out, "compatible", h->fh->compatible); So this function only gets called for the version 1. > >> + err = fdt_property_placeholder(fdt, "folios", preserved_size, >> + (void **)&preserved_folios); >> + if (err) { >> + pr_err("Failed to reserve folios property in FDT: %s\n", >> + fdt_strerror(err)); >> + err = -ENOMEM; >> + goto err_free_fdt; >> + } > > Yuk. > > This really wants some luo helper > > 'luo alloc array' > 'luo restore array' > 'luo free array' > > Which would get a linearized list of pages in the vmap to hold the > array and then allocate some structure to record the page list and > return back the u64 of the phys_addr of the top of the structure to > store in whatever. > > Getting fdt to allocate the array inside the fds is just not going to > work for anything of size. Yep, I agree. This version already runs into size limits of around 1 GiB due to the FDT being limited to MAX_PAGE_ORDER, since that is the largest contiguous piece of memory folio_alloc() can give us. On top, FDT is only limited to 32 bits. While very large, it isn't unreasonable to expect metadata exceeding that for some use cases (4 GiB is only 0.4% of 1 TiB and there are systems a lot larger than that around). I think we need something a luo_xarray data structure that users like memfd (and later hugetlb and guest_memfd and maybe others) can build to make serialization easier. It will cover both contiguous arrays and arrays with some holes in them. I did it this way mainly to keep things simple and get things out. But Pasha already mentioned he is running into this limit for some tests, so I think I will experiment around with a serialized xarray design. > >> + for (; i < nr_pfolios; i++) { >> + const struct memfd_luo_preserved_folio *pfolio = &pfolios[i]; >> + phys_addr_t phys; >> + u64 index; >> + int flags; >> + >> + if (!pfolio->foliodesc) >> + continue; >> + >> + phys = PFN_PHYS(PRESERVED_FOLIO_PFN(pfolio->foliodesc)); >> + folio = kho_restore_folio(phys); >> + if (!folio) { >> + pr_err("Unable to restore folio at physical address: %llx\n", >> + phys); >> + goto put_file; >> + } >> + index = pfolio->index; >> + flags = PRESERVED_FOLIO_FLAGS(pfolio->foliodesc); >> + >> + /* Set up the folio for insertion. */ >> + /* >> + * TODO: Should find a way to unify this and >> + * shmem_alloc_and_add_folio(). >> + */ >> + __folio_set_locked(folio); >> + __folio_set_swapbacked(folio); >> >> + ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping)); >> + if (ret) { >> + pr_err("shmem: failed to charge folio index %d: %d\n", >> + i, ret); >> + goto unlock_folio; >> + } > > [..] 
>
>> +	folio_add_lru(folio);
>> +	folio_unlock(folio);
>> +	folio_put(folio);
>> +	}
>
> Probably some consolidation will be needed to make this less duplicated..

Maybe. I do have that as a TODO item, but I took a quick look today and I am not sure if it will make things simple enough. There are a few places that add a folio to the shmem page cache, and all of them have subtle differences, so consolidating them all might be tricky. Let me give it a shot...

> But overall I think just using the memfd_luo_preserved_folio as the serialization is entirely fine, I don't think this needs anything more complicated.
>
> What it does need is an alternative to the FDT with versioning.

As I explained above, the versioning is already there. Beyond that, why do you think a raw C struct is better than FDT? It is just another way of expressing the same information. FDT is a bit more cumbersome to write and read, but comes with the benefit of more introspectability.

> Which seems to me to be entirely fine as:
>
> struct memfd_luo_v0 {
>	__aligned_u64 size;
>	__aligned_u64 pos;
>	__aligned_u64 folios;
> };
>
> struct memfd_luo_v0 memfd_luo_v0 = {.size = size, .pos = file->f_pos, .folios = folios};
> luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
>
> Which also shows the actual data needing to be serialized comes from more than one struct and has to be marshaled in code, somehow, to a single struct.
>
> Then I imagine a fairly simple forwards/backwards story. If something new is needed that is non-optional, let's say you compress the folios list to optimize holes:
>
> struct memfd_luo_v1 {
>	__aligned_u64 size;
>	__aligned_u64 pos;
>	__aligned_u64 folios_list_with_holes;
> };
>
> Obviously a v0 kernel cannot parse this, but in this case a v1-aware kernel could optionally duplicate and write out the v0 format as well:
>
> luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
> luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);

I think what you describe here is essentially how LUO works currently, just that the mechanisms are a bit different. For example, instead of the subsystem calling luo_store_object(), the LUO core calls back into the subsystem at the appropriate time to let it populate the object. See memfd_luo_prepare() and the data argument. The version is decided by the compatible string with which the handler was registered.

Since LUO knows when to start serializing what, I think this flow of calling into the subsystem and letting it fill in an object that LUO tracks and hands over makes a lot of sense.

> Then the rule is fairly simple, when the successor kernel goes to deserialize it asks luo for the versions it supports:
>
> if (luo_restore_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1))
>	restore_v1(&memfd_luo_v1)
> else if (luo_restore_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0))
>	restore_v0(&memfd_luo_v0)
> else
>	luo_failure("Do not understand this");

Similarly, on the restore side, the new kernel can register handlers for all the versions it can deal with, and the LUO core takes care of calling into the right callback. See memfd_luo_retrieve() for example. If we now have a v2, the new kernel can simply define a new handler for v2 and add a new memfd_luo_retrieve_v2().

> luo core just manages this list of versioned data per serialized object.
There is only one version per object. This also holds true. -- Regards, Pratyush Yadav
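Under that scheme, a hypothetical "memfd-v2" format would simply be another handler registered next to the v1 one quoted above; everything named v2 below is made up and does not exist anywhere today:

  static const char memfd_luo_v2_compatible[] = "memfd-v2";

  /* memfd_luo_v2_file_ops would carry the v2 prepare/retrieve callbacks. */
  static struct liveupdate_file_handler memfd_luo_v2_handler = {
  	.ops = &memfd_luo_v2_file_ops,
  	.compatible = memfd_luo_v2_compatible,
  };

LUO would then pick the handler whose compatible string matches what the previous kernel recorded, the same way it matches "memfd-v1" today.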
On Wed, Aug 27, 2025 at 05:03:55PM +0200, Pratyush Yadav wrote:
> I think we need something a luo_xarray data structure that users like memfd (and later hugetlb and guest_memfd and maybe others) can build to make serialization easier. It will cover both contiguous arrays and arrays with some holes in them.

I'm not sure xarray is the right way to go; it is a very complex data structure and building a kho variation of it seems like a huge amount of work.

I'd stick with simple kvalloc-type approaches until we really run into trouble.

You can always map a sparse xarray into a kvalloc linear list by including the xarray index in each entry.

Especially for memfd, where we don't actually expect any sparsity in real use cases, there is no reason to invest a huge effort to optimize for it..

> As I explained above, the versioning is already there. Beyond that, why do you think a raw C struct is better than FDT? It is just another way of expressing the same information. FDT is a bit more cumbersome to write and read, but comes with the benefit of more introspectability.

Doesn't have the size limitations, is easier to work with, runs faster.

> > luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
> > luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);
>
> I think what you describe here is essentially how LUO works currently, just that the mechanisms are a bit different.

The bit different is a very important bit though :)

The versioning should be first-class, not hidden away as some emergent property of registering multiple serializers or something like that.

Jason
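To picture the "index in each entry" flattening, a sketch of a fixed-size entry plus the fill loop might be (the entry struct and helper are invented; the real patch packs the pfn and flag bits into foliodesc):

  #include <linux/mm.h>
  #include <linux/types.h>

  /* 16 bytes per entry: the file offset plus the packed folio descriptor. */
  struct memfd_luo_folio_entry {
  	__aligned_u64 index;		/* page offset within the file */
  	__aligned_u64 foliodesc;	/* pfn plus flag bits, as in the patch */
  };

  /* entries[] is assumed to be a kvcalloc()'d linear array of nr_folios slots. */
  static void memfd_luo_fill_entries(struct folio **folios, unsigned int nr_folios,
  				   struct memfd_luo_folio_entry *entries)
  {
  	unsigned int i;

  	for (i = 0; i < nr_folios; i++) {
  		entries[i].index = folios[i]->index;
  		entries[i].foliodesc = folio_pfn(folios[i]);	/* flag bits omitted here */
  	}
  }

With the index stored per entry, holes in the file simply never get an entry, while a fully populated file costs the same 16 bytes per folio either way.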
Hi Jason,

On Thu, Aug 28 2025, Jason Gunthorpe wrote:
> On Wed, Aug 27, 2025 at 05:03:55PM +0200, Pratyush Yadav wrote:
>
>> I think we need something a luo_xarray data structure that users like memfd (and later hugetlb and guest_memfd and maybe others) can build to make serialization easier. It will cover both contiguous arrays and arrays with some holes in them.
>
> I'm not sure xarray is the right way to go; it is a very complex data structure and building a kho variation of it seems like a huge amount of work.
>
> I'd stick with simple kvalloc-type approaches until we really run into trouble.
>
> You can always map a sparse xarray into a kvalloc linear list by including the xarray index in each entry.
>
> Especially for memfd, where we don't actually expect any sparsity in real use cases, there is no reason to invest a huge effort to optimize for it..

Full xarray is too complex, sure. But I think a simple sparse array with xarray-like properties (4-byte pointers, values using xa_mk_value()) is fairly simple to implement. More advanced features of xarray like multi-index entries can be added later if needed.

In fact, I have a WIP version of such an array and have used it for memfd preservation, and it looks quite alright to me. You can find the code at [0]. It is roughly 300 lines of code. I still need to clean it up to make it post-able, but it does work. Building kvalloc on top of this becomes trivial.

[0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af

>
>> As I explained above, the versioning is already there. Beyond that, why do you think a raw C struct is better than FDT? It is just another way of expressing the same information. FDT is a bit more cumbersome to write and read, but comes with the benefit of more introspectability.
>
> Doesn't have the size limitations, is easier to work with, runs faster.
>
>> > luo_store_object(&memfd_luo_v0, sizeof(memfd_luo_v0), <.. identifier for this fd..>, /*version=*/0);
>> > luo_store_object(&memfd_luo_v1, sizeof(memfd_luo_v1), <.. identifier for this fd..>, /*version=*/1);
>>
>> I think what you describe here is essentially how LUO works currently, just that the mechanisms are a bit different.
>
> The bit different is a very important bit though :)
>
> The versioning should be first-class, not hidden away as some emergent property of registering multiple serializers or something like that.

That makes sense. How about some simple changes to the LUO interfaces to make the version more prominent:

  int (*prepare)(struct liveupdate_file_handler *handler,
  	       struct file *file, u64 *data, char **compatible);

This lets the subsystem fill in the compatible (AKA version) (string here, but you can make it an integer if you want) when it serializes its data. And on the restore side, LUO can pass in the compatible:

  int (*retrieve)(struct liveupdate_file_handler *handler, u64 data,
  		char *compatible, struct file **file);

--
Regards,
Pratyush Yadav
On Mon, Sep 01, 2025 at 07:10:53PM +0200, Pratyush Yadav wrote: > Building kvalloc on top of this becomes trivial. > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af This isn't really an array, it is a non-seekable serialization of key/values with some optimization for consecutive keys. IMHO it is most useful if you don't know the size of the thing you want to serialize in advance since it has a nice dynamic append. But if you do know the size, I think it makes more sense just to do a preserving vmalloc and write out a linear array.. So, it could be useful, but I wouldn't use it for memfd, the vmalloc approach is better and we shouldn't optimize for sparsness which should never happen. > > The versioning should be first class, not hidden away as some emergent > > property of registering multiple serializers or something like that. > > That makes sense. How about some simple changes to the LUO interfaces to > make the version more prominent: > > int (*prepare)(struct liveupdate_file_handler *handler, > struct file *file, u64 *data, char **compatible); Yeah, something more integrated with the ops is better. You could list the supported versions in the ops itself const char **supported_deserialize_versions; And let the luo framework find the right versions. But for prepare I would expect an inbetween object: int (*prepare)(struct liveupdate_file_handler *handler, struct luo_object *obj, struct file *file); And then you'd do function calls on 'obj' to store 'data' per version. Jason
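As a sketch of what "versions listed in the ops" could look like, under the assumption that LUO grows an intermediate object as suggested above (struct luo_object and all the field names below are hypothetical):

  struct file;
  struct liveupdate_file_handler;
  struct luo_object;	/* opaque handle owned by the LUO core */

  struct liveupdate_file_version {
  	const char *compatible;		/* e.g. "memfd-v1" */
  	int (*retrieve)(struct luo_object *obj, struct file **filep);
  };

  struct liveupdate_file_ops_sketch {
  	int (*prepare)(struct liveupdate_file_handler *handler,
  		       struct luo_object *obj, struct file *file);
  	/* NULL-terminated table of formats this kernel can deserialize. */
  	const struct liveupdate_file_version *supported_deserialize_versions;
  };

The LUO core could then walk supported_deserialize_versions[] on the restore side and dispatch to the matching retrieve callback, keeping version selection out of each subsystem's own code.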
Hi Jason, On Tue, Sep 02 2025, Jason Gunthorpe wrote: > On Mon, Sep 01, 2025 at 07:10:53PM +0200, Pratyush Yadav wrote: >> Building kvalloc on top of this becomes trivial. >> >> [0] https://git.kernel.org/pub/scm/linux/kernel/git/pratyush/linux.git/commit/?h=kho-array&id=cf4c04c1e9ac854e3297018ad6dada17c54a59af > > This isn't really an array, it is a non-seekable serialization of > key/values with some optimization for consecutive keys. IMHO it is Sure, an array is not the best name for the thing. Call it whatever, maybe a "sparse collection of pointers". But I hope you get the idea. > most useful if you don't know the size of the thing you want to > serialize in advance since it has a nice dynamic append. > > But if you do know the size, I think it makes more sense just to do a > preserving vmalloc and write out a linear array.. I think there are two separate parts here. One is the data format and the other is the data builder. The format itself is quite simple. It is a linked list of discontiguous pages that holds a set of pointers. We use that idea already for the preserved pages bitmap. Mike's vmalloc preservation patches also use the same idea, just with a small variation. The builder part (ka_iter in my patches) is an abstraction on top to build the data structure. I designed it with the nice dynamic append property since it seemed like a nice and convenient design, but we can have it define the size statically as well. The underlying data format won't change. > > So, it could be useful, but I wouldn't use it for memfd, the vmalloc > approach is better and we shouldn't optimize for sparsness which > should never happen. I disagree. I think we are re-inventing the same data format with minor variations. I think we should define extensible fundamental data formats first, and then use those as the building blocks for the rest of our serialization logic. I think KHO array does exactly that. It provides the fundamental serialization for a collection of pointers, and other serialization use cases can then build on top of it. For example, the preservation bitmaps can get rid of their linked list logic and just use KHO array to hold and retrieve its bitmaps. It will make the serialization simpler. Similar argument for vmalloc preservation. I also don't get why you think sparseness "should never happen". For memfd for example, you say in one of your other emails that "And again in real systems we expect memfd to be fully populated too." Which systems and use cases do you have in mind? Why do you think people won't want a sparse memfd? And finally, from a data format perspective, the sparseness only adds a small bit of complexity (the startpos for each kho_array_page). Everything else is practically the same as a continuous array. All in all, I think KHO array is going to prove useful and will make serialization for subsystems easier. I think sparseness will also prove useful but it is not a hill I want to die on. I am fine with starting with a non-sparse array if people really insist. But I do think we should go with KHO array as a base instead of re-inventing the linked list of pages again and again. > >> > The versioning should be first class, not hidden away as some emergent >> > property of registering multiple serializers or something like that. >> >> That makes sense. 
How about some simple changes to the LUO interfaces to >> make the version more prominent: >> >> int (*prepare)(struct liveupdate_file_handler *handler, >> struct file *file, u64 *data, char **compatible); > > Yeah, something more integrated with the ops is better. > > You could list the supported versions in the ops itself > > const char **supported_deserialize_versions; > > And let the luo framework find the right versions. > > But for prepare I would expect an inbetween object: > > int (*prepare)(struct liveupdate_file_handler *handler, > struct luo_object *obj, struct file *file); > > And then you'd do function calls on 'obj' to store 'data' per version. What do you mean by "data per version"? I think there should be only one version of the serialized object. Multiple versions of the same thing will get ugly real quick. Other than that, I think this could work well. I am guessing luo_object stores the version and gives us a way to query it on the other side. I think if we are letting LUO manage supported versions, it should be richer than just a list of strings. I think it should include a ops structure for deserializing each version. That would encapsulate the versioning more cleanly. -- Regards, Pratyush Yadav
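For reference, the data format described above (a chain of discontiguous pages, each knowing the index of its first entry) could be pictured roughly as below; the real layout is the one in the kho-array branch linked earlier, and these field names are only an illustrative guess:

  #include <linux/types.h>

  /* One page in the chain; entries[] fills the rest of the page. */
  struct kho_array_page {
  	__aligned_u64 next;		/* phys addr of the next page, 0 at the end */
  	__aligned_u64 startpos;		/* index of the first entry in this page */
  	__aligned_u64 nr_entries;	/* number of used slots in entries[] */
  	__aligned_u64 entries[];	/* values or phys addrs, xarray-style */
  };

Sparseness then only costs the per-page startpos, while a dense array degenerates into consecutive pages with consecutive startpos values.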
On Wed, Sep 03, 2025 at 04:10:37PM +0200, Pratyush Yadav wrote: > > So, it could be useful, but I wouldn't use it for memfd, the vmalloc > > approach is better and we shouldn't optimize for sparsness which > > should never happen. > > I disagree. I think we are re-inventing the same data format with minor > variations. I think we should define extensible fundamental data formats > first, and then use those as the building blocks for the rest of our > serialization logic. page, vmalloc, slab seem to me to be the fundamental units of memory management in linux, so they should get KHO support. If you want to preserve a known-sized array you use vmalloc and then write out the per-list items. If it is a dictionary/sparse array then you write an index with each item too. This is all trivial and doesn't really need more abstraction in of itself, IMHO. > cases can then build on top of it. For example, the preservation bitmaps > can get rid of their linked list logic and just use KHO array to hold > and retrieve its bitmaps. It will make the serialization simpler. I don't think the bitmaps should, the serialization here is very special because it is not actually preserved, it just exists for the time while the new kernel runs in scratch and is insta freed once the allocators start up. > I also don't get why you think sparseness "should never happen". For > memfd for example, you say in one of your other emails that "And again > in real systems we expect memfd to be fully populated too." Which > systems and use cases do you have in mind? Why do you think people won't > want a sparse memfd? memfd should principally be used to back VM memory, and I expect VM memory to be fully populated. Why would it be sparse? > All in all, I think KHO array is going to prove useful and will make > serialization for subsystems easier. I think sparseness will also prove > useful but it is not a hill I want to die on. I am fine with starting > with a non-sparse array if people really insist. But I do think we > should go with KHO array as a base instead of re-inventing the linked > list of pages again and again. The two main advantages I see to the kho array design vs vmalloc is that it should be a bit faster as it doesn't establish a vmap, and it handles unknown size lists much better. Are these important considerations? IDK. As I said to Chris, I think we should see more examples of what we actually need before assuming any certain datastructure is the best choice. So I'd stick to simpler open coded things and go back and improve them than start out building the wrong shared data structure. How about have at least three luo clients that show meaningful benefit before proposing something beyond the fundamental page, vmalloc, slab things? > What do you mean by "data per version"? I think there should be only one > version of the serialized object. Multiple versions of the same thing > will get ugly real quick. If you want to support backwards/forwards compatability then you probably should support multiple versions as well. Otherwise it could become quite hard to make downgrades.. Ideally I'd want to remove the upstream code for obsolete versions fairly quickly so I'd imagine kernels will want to generate both versions during the transition period and then eventually newer kernels will only accept the new version. I've argued before that the extended matrix of any kernel version to any other kernel version should lie with the distro/CSP making the kernel fork. 
They know what their upgrade sequence will be so they can manage any missing versions to make it work. Upstream should do like v6.1 to v6.2 only or something similarly well constrained. I think this is a reasonable trade off to get subsystem maintainers to even accept this stuff at all. > Other than that, I think this could work well. I am guessing luo_object > stores the version and gives us a way to query it on the other side. I > think if we are letting LUO manage supported versions, it should be > richer than just a list of strings. I think it should include a ops > structure for deserializing each version. That would encapsulate the > versioning more cleanly. Yeah, sounds about right Jason
Hi Jason, On Wed, Sep 03 2025, Jason Gunthorpe wrote: > On Wed, Sep 03, 2025 at 04:10:37PM +0200, Pratyush Yadav wrote: > >> > So, it could be useful, but I wouldn't use it for memfd, the vmalloc >> > approach is better and we shouldn't optimize for sparsness which >> > should never happen. >> >> I disagree. I think we are re-inventing the same data format with minor >> variations. I think we should define extensible fundamental data formats >> first, and then use those as the building blocks for the rest of our >> serialization logic. > > page, vmalloc, slab seem to me to be the fundamental units of memory > management in linux, so they should get KHO support. > > If you want to preserve a known-sized array you use vmalloc and then > write out the per-list items. If it is a dictionary/sparse array then > you write an index with each item too. This is all trivial and doesn't > really need more abstraction in of itself, IMHO. We will use up double the space for tracking metadata, but maybe that is fine until we start seeing bigger memfds in real workloads. > >> cases can then build on top of it. For example, the preservation bitmaps >> can get rid of their linked list logic and just use KHO array to hold >> and retrieve its bitmaps. It will make the serialization simpler. > > I don't think the bitmaps should, the serialization here is very > special because it is not actually preserved, it just exists for the > time while the new kernel runs in scratch and is insta freed once the > allocators start up. I don't think it matters if they are preserved or not. The serialization and deserialization is independent of that. You can very well create a KHO array that you don't KHO-preserve. On next boot, you can still use it, you just have to be careful of doing it while scratch-only. Same as we do now. > >> I also don't get why you think sparseness "should never happen". For >> memfd for example, you say in one of your other emails that "And again >> in real systems we expect memfd to be fully populated too." Which >> systems and use cases do you have in mind? Why do you think people won't >> want a sparse memfd? > > memfd should principally be used to back VM memory, and I expect VM > memory to be fully populated. Why would it be sparse? For the _hypervisor_ live update case, sure. Though even there, I have a feeling we will start seeing userspace components on the hypervisor use memfd for stashing some of their state. Pasha has already mentioned they have a use case for a memfd that is not VM memory. But hypervisor live upadte isn't the only use case for LUO. We are looking at enabling state preservation for "normal" userspace applications. Think big storage nodes with memory in order of TiB. Those can use a memfd to back their caches so on a kernel upgrade the caches don't have to be re-fetched. Sparseness is to be expected for such use cases. > >> All in all, I think KHO array is going to prove useful and will make >> serialization for subsystems easier. I think sparseness will also prove >> useful but it is not a hill I want to die on. I am fine with starting >> with a non-sparse array if people really insist. But I do think we >> should go with KHO array as a base instead of re-inventing the linked >> list of pages again and again. > > The two main advantages I see to the kho array design vs vmalloc is > that it should be a bit faster as it doesn't establish a vmap, and it > handles unknown size lists much better. > > Are these important considerations? IDK. 
> > As I said to Chris, I think we should see more examples of what we > actually need before assuming any certain datastructure is the best > choice. > > So I'd stick to simpler open coded things and go back and improve them > than start out building the wrong shared data structure. > > How about have at least three luo clients that show meaningful benefit > before proposing something beyond the fundamental page, vmalloc, slab > things? I think the fundamentals themselves get some benefit. But anyway, since I have done most of the work on this feature anyway, I will do the rest and send the patches out. Then you can have a look and if you're still not convinced, I am fine shelving it for now to revisit later when a stronger case can be made. > >> What do you mean by "data per version"? I think there should be only one >> version of the serialized object. Multiple versions of the same thing >> will get ugly real quick. > > If you want to support backwards/forwards compatability then you > probably should support multiple versions as well. Otherwise it > could become quite hard to make downgrades.. Hmm, forward can work regardless since a newer kernel should speak older formats too, but for backwards it makes sense to have an older version. But perhaps it might be a better idea to come up with a mechanism for the kernel to discover which formats the "next" kernel speaks so it can for one decide whether it can do the live update at all, and for another which formats it should use. Maybe we give a way for luod to choose formats, and give it the responsibility for doing these checks? > > Ideally I'd want to remove the upstream code for obsolete versions > fairly quickly so I'd imagine kernels will want to generate both > versions during the transition period and then eventually newer > kernels will only accept the new version. > > I've argued before that the extended matrix of any kernel version to > any other kernel version should lie with the distro/CSP making the > kernel fork. They know what their upgrade sequence will be so they can > manage any missing versions to make it work. > > Upstream should do like v6.1 to v6.2 only or something similarly well > constrained. I think this is a reasonable trade off to get subsystem > maintainers to even accept this stuff at all. [...] -- Regards, Pratyush Yadav
On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote: > I don't think it matters if they are preserved or not. The serialization > and deserialization is independent of that. You can very well create a > KHO array that you don't KHO-preserve. On next boot, you can still use > it, you just have to be careful of doing it while scratch-only. Same as > we do now. The KHO array machinery itself can't preserve its own memory either. > For the _hypervisor_ live update case, sure. Though even there, I have a > feeling we will start seeing userspace components on the hypervisor use > memfd for stashing some of their state. Sure, but don't make excessively sparse memfds for kexec use, why should that be hard? > applications. Think big storage nodes with memory in order of TiB. Those > can use a memfd to back their caches so on a kernel upgrade the caches > don't have to be re-fetched. Sparseness is to be expected for such use > cases. Oh? I'm surpised you'd have sparseness there. sparseness seems like such a weird feature to want to rely on :\ > But perhaps it might be a better idea to come up with a mechanism for > the kernel to discover which formats the "next" kernel speaks so it can > for one decide whether it can do the live update at all, and for another > which formats it should use. Maybe we give a way for luod to choose > formats, and give it the responsibility for doing these checks? I have felt that we should catalog the formats&versions the kernel can read/write in some way during kbuild. Maybe this turns into a sysfs directory of all the data with an 'enable_write' flag that luod could set to 0 to optimize. And maybe this could be a kbuild report that luod could parse to do this optimization. And maybe distro/csps use this information mechanically to check if version pairs are kexec compatible. Which re-enforces my feeling that the formats/version should be first class concepts, every version should be registered and luo should sequence calling the code for the right version at the right time. Jason
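One hypothetical shape for such a catalog (every path under "formats/" below is invented; only /sys/kernel/liveupdate itself exists in the series) would be:

  /sys/kernel/liveupdate/formats/memfd-v1/readable      # 1 if this kernel can deserialize it
  /sys/kernel/liveupdate/formats/memfd-v1/writable      # 1 if this kernel can serialize it
  /sys/kernel/liveupdate/formats/memfd-v1/enable_write  # luod writes 0 here to suppress the format
  /sys/kernel/liveupdate/formats/memfd-v2/...

luod, or a distro qualification tool, could then compare the formats directories of two kernels to decide whether a live update between them is possible.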
On Thu, Sep 04 2025, Jason Gunthorpe wrote: > On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote: > >> I don't think it matters if they are preserved or not. The serialization >> and deserialization is independent of that. You can very well create a >> KHO array that you don't KHO-preserve. On next boot, you can still use >> it, you just have to be careful of doing it while scratch-only. Same as >> we do now. > > The KHO array machinery itself can't preserve its own memory > either. It can. Maybe it couldn't in the version I showed you, but now it can. See kho_array_preserve() in https://lore.kernel.org/linux-mm/20250909144426.33274-2-pratyush@kernel.org/ > >> For the _hypervisor_ live update case, sure. Though even there, I have a >> feeling we will start seeing userspace components on the hypervisor use >> memfd for stashing some of their state. > > Sure, but don't make excessively sparse memfds for kexec use, why > should that be hard? Sure, I don't think they should be excessively sparse. But _some_ level of sparseness can be there. > >> applications. Think big storage nodes with memory in order of TiB. Those >> can use a memfd to back their caches so on a kernel upgrade the caches >> don't have to be re-fetched. Sparseness is to be expected for such use >> cases. > > Oh? I'm surpised you'd have sparseness there. sparseness seems like > such a weird feature to want to rely on :\ > >> But perhaps it might be a better idea to come up with a mechanism for >> the kernel to discover which formats the "next" kernel speaks so it can >> for one decide whether it can do the live update at all, and for another >> which formats it should use. Maybe we give a way for luod to choose >> formats, and give it the responsibility for doing these checks? > > I have felt that we should catalog the formats&versions the kernel can > read/write in some way during kbuild. > > Maybe this turns into a sysfs directory of all the data with an > 'enable_write' flag that luod could set to 0 to optimize. > > And maybe this could be a kbuild report that luod could parse to do > this optimization. Or maybe we put that information in a ELF section in the kernel image? Not sure how feasible it would be for tooling to read but I think that would very closely associate the versions info with the kernel. The other option might be to put it somewhere with modules I guess. > > And maybe distro/csps use this information mechanically to check if > version pairs are kexec compatible. > > Which re-enforces my feeling that the formats/version should be first > class concepts, every version should be registered and luo should > sequence calling the code for the right version at the right time. > > Jason -- Regards, Pratyush Yadav
On Tue, Sep 9, 2025 at 10:53 AM Pratyush Yadav <me@yadavpratyush.com> wrote: > > On Thu, Sep 04 2025, Jason Gunthorpe wrote: > > > On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote: > > > >> I don't think it matters if they are preserved or not. The serialization > >> and deserialization is independent of that. You can very well create a > >> KHO array that you don't KHO-preserve. On next boot, you can still use > >> it, you just have to be careful of doing it while scratch-only. Same as > >> we do now. > > > > The KHO array machinery itself can't preserve its own memory > > either. > > It can. Maybe it couldn't in the version I showed you, but now it can. > See kho_array_preserve() in > https://lore.kernel.org/linux-mm/20250909144426.33274-2-pratyush@kernel.org/ > > > > >> For the _hypervisor_ live update case, sure. Though even there, I have a > >> feeling we will start seeing userspace components on the hypervisor use > >> memfd for stashing some of their state. > > > > Sure, but don't make excessively sparse memfds for kexec use, why > > should that be hard? > > Sure, I don't think they should be excessively sparse. But _some_ level > of sparseness can be there. This is right; loosely sparse memfd support is needed. However, an excessively sparse preservation will be inefficient for LU, unless we change the backing to be from a separate pool of physical pages that is always preserved. If we do that, it would probably make sense only for guestmemfd and only if we ever decide to support overcommitted VMs. I suspect it is not something that we currently need to worry about. > >> applications. Think big storage nodes with memory in order of TiB. Those > >> can use a memfd to back their caches so on a kernel upgrade the caches > >> don't have to be re-fetched. Sparseness is to be expected for such use > >> cases. > > > > Oh? I'm surpised you'd have sparseness there. sparseness seems like > > such a weird feature to want to rely on :\ > > > >> But perhaps it might be a better idea to come up with a mechanism for > >> the kernel to discover which formats the "next" kernel speaks so it can > >> for one decide whether it can do the live update at all, and for another > >> which formats it should use. Maybe we give a way for luod to choose > >> formats, and give it the responsibility for doing these checks? > > > > I have felt that we should catalog the formats&versions the kernel can > > read/write in some way during kbuild. > > > > Maybe this turns into a sysfs directory of all the data with an > > 'enable_write' flag that luod could set to 0 to optimize. > > > > And maybe this could be a kbuild report that luod could parse to do > > this optimization. > > Or maybe we put that information in a ELF section in the kernel image? > Not sure how feasible it would be for tooling to read but I think that > would very closely associate the versions info with the kernel. The > other option might be to put it somewhere with modules I guess. To me, all this sounds like hardening, which, while important, can be added later. The pre-kexec check for compatibility can be defined and implemented once we have all live update components ready (KHO/LUO/PCI/IOMMU/VFIO/MEMFD), once we stabilize the versioning story, and once we start discussing update stability. Currently, we've agreed that there are no stability guarantees. Sometime in the future, we may guarantee minor-to-minor stability, and later, stable-to-stable. 
Once we start working on minor-to-minor stability, it would be a good idea to also add hardening where a pre-live update would check for compatibility. In reality, this is not something that is high priority for cloud providers, because these kinds of incompatibilities would be found during qualification; the kernel will fail to update by detecting a version mismatch during boot instead of during shutdown. > > And maybe distro/csps use this information mechanically to check if > > version pairs are kexec compatible. > > > > Which re-enforces my feeling that the formats/version should be first > > class concepts, every version should be registered and luo should > > sequence calling the code for the right version at the right time. > > > > Jason > > -- > Regards, > Pratyush Yadav
On Tue, Sep 09 2025, Pasha Tatashin wrote: > On Tue, Sep 9, 2025 at 10:53 AM Pratyush Yadav <me@yadavpratyush.com> wrote: >> >> On Thu, Sep 04 2025, Jason Gunthorpe wrote: >> >> > On Thu, Sep 04, 2025 at 02:57:35PM +0200, Pratyush Yadav wrote: [...] >> >> But perhaps it might be a better idea to come up with a mechanism for >> >> the kernel to discover which formats the "next" kernel speaks so it can >> >> for one decide whether it can do the live update at all, and for another >> >> which formats it should use. Maybe we give a way for luod to choose >> >> formats, and give it the responsibility for doing these checks? >> > >> > I have felt that we should catalog the formats&versions the kernel can >> > read/write in some way during kbuild. >> > >> > Maybe this turns into a sysfs directory of all the data with an >> > 'enable_write' flag that luod could set to 0 to optimize. >> > >> > And maybe this could be a kbuild report that luod could parse to do >> > this optimization. >> >> Or maybe we put that information in a ELF section in the kernel image? >> Not sure how feasible it would be for tooling to read but I think that >> would very closely associate the versions info with the kernel. The >> other option might be to put it somewhere with modules I guess. > > To me, all this sounds like hardening, which, while important, can be > added later. The pre-kexec check for compatibility can be defined and > implemented once we have all live update components ready > (KHO/LUO/PCI/IOMMU/VFIO/MEMFD), once we stabilize the versioning > story, and once we start discussing update stability. Right. I don't think this is something the current LUO patches have to solve. This is for later down the line. > > Currently, we've agreed that there are no stability guarantees. > Sometime in the future, we may guarantee minor-to-minor stability, and > later, stable-to-stable. Once we start working on minor-to-minor > stability, it would be a good idea to also add hardening where a > pre-live update would check for compatibility. > > In reality, this is not something that is high priority for cloud > providers, because these kinds of incompatibilities would be found > during qualification; the kernel will fail to update by detecting a > version mismatch during boot instead of during shutdown. I think it would help with making a wider range of roll back and forward options available. For example, if your current kernel can speak version A and B, and you are rolling back to a kernel that only speaks A, this information can be used to choose the right serialization formats. [...] -- Regards, Pratyush Yadav
> I think it would help with making a wider range of roll back and forward > options available. For example, if your current kernel can speak version > A and B, and you are rolling back to a kernel that only speaks A, this > information can be used to choose the right serialization formats. At least for upstream, we discussed not to support rolling back (this can be revised in the future), but for now rollback is something that would need to be taken care of downstream. Pasha
On Tue, Sep 09, 2025 at 11:40:18AM -0400, Pasha Tatashin wrote: > In reality, this is not something that is high priority for cloud > providers, because these kinds of incompatibilities would be found > during qualification; the kernel will fail to update by detecting a > version mismatch during boot instead of during shutdown. Given I expect CSPs will have to add-in specific version support for their own special version-pair needs, I think it would be helpful in the long run to have a tool that reported what versions a kernel build wrote and parsed. Test-to-learn the same information sounds a bit too difficult. Something with ELF is probably how to do that, but I don't imagine a use in a runtime check consuming this information. Jason
On Tue, Sep 9, 2025 at 11:54 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Tue, Sep 09, 2025 at 11:40:18AM -0400, Pasha Tatashin wrote: > > In reality, this is not something that is high priority for cloud > > providers, because these kinds of incompatibilities would be found > > during qualification; the kernel will fail to update by detecting a > > version mismatch during boot instead of during shutdown. > > Given I expect CSPs will have to add-in specific version support for > their own special version-pair needs, I think it would be helpful in > the long run to have a tool that reported what versions a kernel build > wrote and parsed. Test-to-learn the same information sounds a bit too > difficult. Yes, I agree. My point was only about the near term: it's just not a priority at the moment. This won't block us in the future, as we can always add a tooling later to inject the required ELF segments for pre-live update checks. Pasha
On Tue, Sep 09, 2025 at 12:30:35PM -0400, Pasha Tatashin wrote:
> On Tue, Sep 9, 2025 at 11:54 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Tue, Sep 09, 2025 at 11:40:18AM -0400, Pasha Tatashin wrote:
> > > In reality, this is not something that is high priority for cloud providers, because these kinds of incompatibilities would be found during qualification; the kernel will fail to update by detecting a version mismatch during boot instead of during shutdown.
> >
> > Given I expect CSPs will have to add-in specific version support for their own special version-pair needs, I think it would be helpful in the long run to have a tool that reported what versions a kernel build wrote and parsed. Test-to-learn the same information sounds a bit too difficult.
>
> Yes, I agree. My point was only about the near term: it's just not a priority at the moment. This won't block us in the future, as we can always add a tooling later to inject the required ELF segments for pre-live update checks.

Yes, but let's design things to have this kind of logical code model where there are specific serializations, with specific versions that are at least discoverable by grepping for some struct luo_xxx_ops or whatever.

Let's avoid open-coding versioning stuff where it is hard to find and hard to later make a manifest out of.

Jason
> Yes, but let's design things to have this kind of logical code model where there are specific serializations, with specific versions that are at least discoverable by grepping for some struct luo_xxx_ops or whatever.
>
> Let's avoid open-coding versioning stuff where it is hard to find and hard to later make a manifest out of.

Fully agreed: the versioning has to be centralized (not open-coded, and verified in a single place) and discoverable.

> Jason
On Thu, Aug 28, 2025 at 5:43 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Wed, Aug 27, 2025 at 05:03:55PM +0200, Pratyush Yadav wrote:
>
> > I think we need something a luo_xarray data structure that users like memfd (and later hugetlb and guest_memfd and maybe others) can build to make serialization easier. It will cover both contiguous arrays and arrays with some holes in them.
>
> I'm not sure xarray is the right way to go; it is a very complex data structure and building a kho variation of it seems like a huge amount of work.
>
> I'd stick with simple kvalloc-type approaches until we really run into trouble.
>
> You can always map a sparse xarray into a kvalloc linear list by including the xarray index in each entry.

Each entry will be 16 bytes, 8 for the index and 8 for the XA value, right?

> Especially for memfd, where we don't actually expect any sparsity in real use cases, there is no reason to invest a huge effort to optimize for it..

Ack.

> > As I explained above, the versioning is already there. Beyond that, why do you think a raw C struct is better than FDT? It is just another way of expressing the same information. FDT is a bit more cumbersome to write and read, but comes with the benefit of more introspectability.
>
> Doesn't have the size limitations, is easier to work with, runs faster.

Yes, especially when you have a large array.

Chris
On 2025-08-07 01:44:35, Pasha Tatashin wrote: > From: Pratyush Yadav <ptyadav@amazon.de> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios, > + unsigned int nr_folios) > +{ > + unsigned int i; > + > + for (i = 0; i < nr_folios; i++) { > + const struct memfd_luo_preserved_folio *pfolio = &pfolios[i]; > + struct folio *folio; > + > + if (!pfolio->foliodesc) > + continue; > + > + folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolio->foliodesc)); > + > + kho_unpreserve_folio(folio); This one is missing WARN_ON_ONCE() similar to the one in memfd_luo_preserve_folios(). > + unpin_folio(folio); > + } > +} > + > +static void *memfd_luo_create_fdt(unsigned long size) > +{ > + unsigned int order = get_order(size); > + struct folio *fdt_folio; > + int err = 0; > + void *fdt; > + > + if (order > MAX_PAGE_ORDER) > + return NULL; > + > + fdt_folio = folio_alloc(GFP_KERNEL, order); __GFP_ZERO should also be used here. Otherwise this can lead to unintentional passing of old kernel memory. > +static int memfd_luo_prepare(struct liveupdate_file_handler *handler, > + struct file *file, u64 *data) > +{ > + struct memfd_luo_preserved_folio *preserved_folios; > + struct inode *inode = file_inode(file); > + unsigned int max_folios, nr_folios = 0; > + int err = 0, preserved_size; > + struct folio **folios; > + long size, nr_pinned; > + pgoff_t offset; > + void *fdt; > + u64 pos; > + > + if (WARN_ON_ONCE(!shmem_file(file))) > + return -EINVAL; This one is only check for shmem_file, whereas in memfd_luo_can_preserve() there is check for inode->i_nlink also. Is that not needed here? > + > + inode_lock(inode); > + shmem_i_mapping_freeze(inode, true); > + > + size = i_size_read(inode); > + if ((PAGE_ALIGN(size) / PAGE_SIZE) > UINT_MAX) { > + err = -E2BIG; > + goto err_unlock; > + } > + > + /* > + * Guess the number of folios based on inode size. Real number might end > + * up being smaller if there are higher order folios. > + */ > + max_folios = PAGE_ALIGN(size) / PAGE_SIZE; > + folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL); __GFP_ZERO? > +static int memfd_luo_freeze(struct liveupdate_file_handler *handler, > + struct file *file, u64 *data) > +{ > + u64 pos = file->f_pos; > + void *fdt; > + int err; > + > + if (WARN_ON_ONCE(!*data)) > + return -EINVAL; > + > + fdt = phys_to_virt(*data); > + > + /* > + * The pos or size might have changed since prepare. Everything else > + * stays the same. > + */ > + err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos)); > + if (err) > + return err; Comment is talking about pos and size but code is only updating pos. > +static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data, > + struct file **file_p) > +{ > + const struct memfd_luo_preserved_folio *pfolios; > + int nr_pfolios, len, ret = 0, i = 0; > + struct address_space *mapping; > + struct folio *folio, *fdt_folio; > + const u64 *pos, *size; > + struct inode *inode; > + struct file *file; > + const void *fdt; > + > + fdt_folio = memfd_luo_get_fdt(data); > + if (!fdt_folio) > + return -ENOENT; > + > + fdt = page_to_virt(folio_page(fdt_folio, 0)); > + > + pfolios = fdt_getprop(fdt, 0, "folios", &len); > + if (!pfolios || len % sizeof(*pfolios)) { > + pr_err("invalid 'folios' property\n"); Print should clearly state that error is because fields is not found or len is not multiple of sizeof(*pfolios).
Hi Vipin,

Thanks for the review.

On Tue, Aug 12 2025, Vipin Sharma wrote:

> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
>> From: Pratyush Yadav <ptyadav@amazon.de>
>> +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
>> +					  unsigned int nr_folios)
>> +{
>> [...]
>> +		kho_unpreserve_folio(folio);
>
> This one is missing WARN_ON_ONCE() similar to the one in
> memfd_luo_preserve_folios().

Right, will add.

>
>> +		unpin_folio(folio);

Looking at this code caught my eye. This can also be called from LUO's
finish callback if no one claimed the memfd after live update. In that
case, unpin_folio() is going to underflow the pincount or refcount on
the folio, since after the kexec the folio is no longer pinned. We
should only be doing folio_put().

I think this function should take an argument to specify which of these
cases it is dealing with.

>> +	}
>> +}
>> +
>> +static void *memfd_luo_create_fdt(unsigned long size)
>> +{
>> [...]
>> +	fdt_folio = folio_alloc(GFP_KERNEL, order);
>
> __GFP_ZERO should also be used here. Otherwise this can lead to
> unintentional passing of old kernel memory.

fdt_create() zeroes out the buffer so this should not be a problem.

>
>> +static int memfd_luo_prepare(struct liveupdate_file_handler *handler,
>> +			     struct file *file, u64 *data)
>> +{
>> [...]
>> +	if (WARN_ON_ONCE(!shmem_file(file)))
>> +		return -EINVAL;
>
> This one only checks for shmem_file(), whereas memfd_luo_can_preserve()
> also checks inode->i_nlink. Is that check not needed here?

Actually, this should never happen since the LUO can_preserve() callback
should make sure of this. I think it would be perfectly fine to just
drop this check. I only added it because I was being extra careful.

>
>> +	/*
>> +	 * Guess the number of folios based on inode size. Real number might end
>> +	 * up being smaller if there are higher order folios.
>> +	 */
>> +	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
>> +	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
>
> __GFP_ZERO?

Why? This is only used in this function and gets freed on return. And
the function only looks at the elements that get initialized by
memfd_pin_folios().

>
>> +static int memfd_luo_freeze(struct liveupdate_file_handler *handler,
>> +			    struct file *file, u64 *data)
>> +{
>> [...]
>> +	/*
>> +	 * The pos or size might have changed since prepare. Everything else
>> +	 * stays the same.
>> +	 */
>> +	err = fdt_setprop(fdt, 0, "pos", &pos, sizeof(pos));
>> +	if (err)
>> +		return err;
>
> The comment talks about pos and size, but the code only updates pos.

Right, the comment is out of date. size can no longer change since
prepare, so I will update the comment.

>
>> +static int memfd_luo_retrieve(struct liveupdate_file_handler *handler, u64 data,
>> +			      struct file **file_p)
>> +{
>> [...]
>> +	pfolios = fdt_getprop(fdt, 0, "folios", &len);
>> +	if (!pfolios || len % sizeof(*pfolios)) {
>> +		pr_err("invalid 'folios' property\n");
>
> The print should clearly state whether the error is because the property
> was not found or because len is not a multiple of sizeof(*pfolios).

Eh, there is already too much boilerplate one has to write (and read)
for parsing the FDT. Is there really a need for an extra 3-4 lines of
code for _each_ property that is parsed?

Long term, I think we shouldn't be doing this manually anyway. I think
the maintainable path forward is to define a schema for the serialized
data and have a parser that takes in the schema and gives out a parsed
struct, doing all sorts of checks in the process.

-- 
Regards,
Pratyush Yadav
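To make the suggested split concrete, here is a rough sketch under the assumption that the helper grows a caller-supplied flag; the parameter name and the post-kexec branch are assumptions drawn from the discussion, not the final code.

/* Sketch only: distinguish a prepare rollback from the post-liveupdate finish path. */
static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
					unsigned int nr_folios, bool pre_kexec)
{
	unsigned int i;

	for (i = 0; i < nr_folios; i++) {
		struct folio *folio;

		if (!pfolios[i].foliodesc)
			continue;

		folio = pfn_folio(PRESERVED_FOLIO_PFN(pfolios[i].foliodesc));

		if (pre_kexec) {
			/* Rolling back prepare: the folio is still KHO-preserved and pinned. */
			WARN_ON_ONCE(kho_unpreserve_folio(folio));
			unpin_folio(folio);
		} else {
			/* Unclaimed file after live update: only a plain reference is held. */
			folio_put(folio);
		}
	}
}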
On Wed, Aug 13, 2025 at 12:29 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>
> On Tue, Aug 12 2025, Vipin Sharma wrote:
>
> > [...]
> >> +	fdt_folio = folio_alloc(GFP_KERNEL, order);
> >
> > __GFP_ZERO should also be used here. Otherwise this can lead to
> > unintentional passing of old kernel memory.
>
> fdt_create() zeroes out the buffer so this should not be a problem.

You are right, fdt_create() zeroes the whole buffer. However, I wonder
if it could be "optimized" to clear only the header part of the FDT and
not the rest, which could potentially lead us to send an FDT buffer that
contains both a valid FDT and trailing data from the old kernel.

Pasha
On Wed, Aug 13 2025, Pasha Tatashin wrote:

> On Wed, Aug 13, 2025 at 12:29 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>> [...]
>> fdt_create() zeroes out the buffer so this should not be a problem.
>
> You are right, fdt_create() zeroes the whole buffer. However, I wonder
> if it could be "optimized" to clear only the header part of the FDT and
> not the rest, which could potentially lead us to send an FDT buffer that
> contains both a valid FDT and trailing data from the old kernel.

Fair enough. At least the API documentation does not say anything about
the state of the buffer.

My main concern was around performance, since the FDT can be multiple
megabytes long for big memfds. Anyway, this isn't in the blackout window
so perhaps we can live with it.

Will add the __GFP_ZERO.

-- 
Regards,
Pratyush Yadav
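For reference, a minimal sketch of the allocation change being agreed on above; only the __GFP_ZERO flag is new, and the NULL check is included for completeness.

	/* Zero the whole folio so no old-kernel data can trail a valid FDT. */
	fdt_folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
	if (!fdt_folio)
		return NULL;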
On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
> On 2025-08-07 01:44:35, Pasha Tatashin wrote:
> > From: Pratyush Yadav <ptyadav@amazon.de>
> > +static void memfd_luo_unpreserve_folios(const struct memfd_luo_preserved_folio *pfolios,
> > +					 unsigned int nr_folios)
> > +{
> > [...]
> > +		kho_unpreserve_folio(folio);
>
> This one is missing WARN_ON_ONCE() similar to the one in
> memfd_luo_preserve_folios().

So you really want to cause a machine to reboot and get a CVE issued for
this, if it could be triggered? That's bold :)

Please don't. If that can happen, handle the issue and move on, don't
crash boxes.

thanks,

greg k-h
On Wed, Aug 13 2025, Greg KH wrote:

> On Tue, Aug 12, 2025 at 11:34:37PM -0700, Vipin Sharma wrote:
>> [...]
>> This one is missing WARN_ON_ONCE() similar to the one in
>> memfd_luo_preserve_folios().
>
> So you really want to cause a machine to reboot and get a CVE issued for
> this, if it could be triggered? That's bold :)
>
> Please don't. If that can happen, handle the issue and move on, don't
> crash boxes.

Why would a WARN() crash the machine? That is what BUG() does, not
WARN().

-- 
Regards,
Pratyush Yadav
On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> On Wed, Aug 13 2025, Greg KH wrote:
> > [...]
> > So you really want to cause a machine to reboot and get a CVE issued for
> > this, if it could be triggered? That's bold :)
> >
> > Please don't. If that can happen, handle the issue and move on, don't
> > crash boxes.
>
> Why would a WARN() crash the machine? That is what BUG() does, not
> WARN().

See 'panic_on_warn' which is enabled in a few billion Linux systems
these days :(
On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
> On Wed, Aug 13, 2025 at 02:02:07PM +0200, Pratyush Yadav wrote:
> > [...]
> > Why would a WARN() crash the machine? That is what BUG() does, not
> > WARN().
>
> See 'panic_on_warn' which is enabled in a few billion Linux systems
> these days :(

This has been discussed so many times already:

https://lwn.net/Articles/969923/

When someone tried to formalize this "don't use WARN_ON" position in
coding-style.rst it was NAK'd:

https://lwn.net/ml/linux-kernel/10af93f8-83f2-48ce-9bc3-80fe4c60082c@redhat.com/

based on Linus's opposition to the idea:

https://lore.kernel.org/all/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com/

Use the WARN_ONs. Make sure they can't be triggered by userspace. Use
them to detect corruption/malfunction in the kernel.

In this case if kho_unpreserve_folio() fails in this call chain it
means some error unwind is wrongly happening out of sequence, and we
are now forced to leak memory. Unwind is not something that userspace
should be controlling, so of course we want a WARN_ON here.

Jason
On Wed, Aug 13 2025, Jason Gunthorpe wrote:

> On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
>> See 'panic_on_warn' which is enabled in a few billion Linux systems
>> these days :(
>
> [...]
>
> Use the WARN_ONs. Make sure they can't be triggered by userspace. Use
> them to detect corruption/malfunction in the kernel.
>
> In this case if kho_unpreserve_folio() fails in this call chain it
> means some error unwind is wrongly happening out of sequence, and we
> are now forced to leak memory. Unwind is not something that userspace
> should be controlling, so of course we want a WARN_ON here.

Yep. And if we are saying WARN() should never be used, then doesn't that
make panic_on_warn a no-op? What is even the point of that option then?

Here, we are unable to unpreserve a folio that we have preserved. This
isn't a normal error that we expect to happen. This should _not_ happen
unless something has gone horribly wrong. For example, the calls to
kho_preserve_folio() don't WARN(), since that can fail for various
reasons. They just return the error up the call chain.

As an analogy, allocating a page can fail, and it is quite reasonable to
expect the code to not throw out WARN()s for that. But if for some
reason you can't free a page that you allocated, that is very unexpected
and should WARN(). Of course, in Linux the page free APIs don't even
return a status, but I hope you get my point.

If I were a system administrator who sets panic_on_warn, I would _want_
the system to crash so no further damage happens and I can collect
logs/crash dumps to investigate later. Without the WARN(), I never get a
chance to debug and my system breaks silently. For all others, the
kernel goes on with some possibly corrupted/broken state.

-- 
Regards,
Pratyush Yadav
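A condensed illustration of the distinction being drawn here, as an editorial sketch rather than code from the series; the function names are hypothetical, and the point is only that expected failures are returned while failures that indicate corrupted kernel state are WARNed about.

/* Editorial sketch: expected errors propagate, should-never-happen failures WARN. */
static int example_preserve(struct folio *folio)
{
	int err;

	/* Preservation can legitimately fail (e.g. -ENOMEM): return it, no WARN. */
	err = kho_preserve_folio(folio);
	if (err)
		return err;

	return 0;
}

static void example_unpreserve(struct folio *folio)
{
	/*
	 * We preserved this folio ourselves, so failing to unpreserve it means
	 * kernel state is corrupted and memory will leak. WARN so the bug is
	 * visible (and found by fuzzers) instead of failing silently.
	 */
	WARN_ON_ONCE(kho_unpreserve_folio(folio));
}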
On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
> On Wed, Aug 13, 2025 at 02:14:23PM +0200, Greg KH wrote:
> > See 'panic_on_warn' which is enabled in a few billion Linux systems
> > these days :(
>
> [...]
>
> Use the WARN_ONs. Make sure they can't be triggered by userspace. Use
> them to detect corruption/malfunction in the kernel.
>
> In this case if kho_unpreserve_folio() fails in this call chain it
> means some error unwind is wrongly happening out of sequence, and we
> are now forced to leak memory. Unwind is not something that userspace
> should be controlling, so of course we want a WARN_ON here.

"should be" is the key here. And it's not obvious from this patch if
that's true or not, which is why I mentioned it.

I will keep bringing this up, given the HUGE number of CVEs I keep
assigning each week for when userspace hits WARN_ON() calls, until that
flow starts to die out, either because we don't keep adding new calls or
we finally fix them all. Both would be good...

thanks,

greg k-h
On Wed, Aug 13, 2025 at 03:00:08PM +0200, Greg KH wrote:
> > In this case if kho_unpreserve_folio() fails in this call chain it
> > means some error unwind is wrongly happening out of sequence, and we
> > are now forced to leak memory. Unwind is not something that userspace
> > should be controlling, so of course we want a WARN_ON here.
>
> "should be" is the key here. And it's not obvious from this patch if
> that's true or not, which is why I mentioned it.
>
> I will keep bringing this up, given the HUGE number of CVEs I keep
> assigning each week for when userspace hits WARN_ON() calls, until that
> flow starts to die out, either because we don't keep adding new calls or
> we finally fix them all. Both would be good...

WARN or not, userspace triggering a permanent leak of kernel memory is a
CVE-worthy bug in and of itself. So even if userspace triggers this, I'd
rather have the WARN than the difficult-to-find leak.

I don't know what your CVEs are, but I get a decent number of "userspace
hits a WARN" bugs from syzkaller, and they are all bugs in the kernel.
Bugs that should probably get CVEs even without the crash-on-WARN issue
anyhow. The WARN made them discoverable cheaply.

The most recent was a userspace-triggerable arithmetic overflow that
corrupted a data structure; a WARN caught it, syzkaller found it, and we
fixed it before it became a splashy exploit with a web site.

Removing bug catching to reduce CVEs because we don't find the bugs
anymore seems like the wrong direction to me.

Jason
On Wed, Aug 13 2025, Greg KH wrote:

> On Wed, Aug 13, 2025 at 09:41:40AM -0300, Jason Gunthorpe wrote:
[...]
>> Use the WARN_ONs. Make sure they can't be triggered by userspace. Use
>> them to detect corruption/malfunction in the kernel.
>>
>> In this case if kho_unpreserve_folio() fails in this call chain it
>> means some error unwind is wrongly happening out of sequence, and we
>> are now forced to leak memory. Unwind is not something that userspace
>> should be controlling, so of course we want a WARN_ON here.
>
> "should be" is the key here. And it's not obvious from this patch if
> that's true or not, which is why I mentioned it.
>
> I will keep bringing this up, given the HUGE number of CVEs I keep
> assigning each week for when userspace hits WARN_ON() calls, until that
> flow starts to die out, either because we don't keep adding new calls or
> we finally fix them all. Both would be good...

Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
I'd guess one reason is overwhelming the system console, which can cause
a denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

-- 
Regards,
Pratyush Yadav
On Wed, Aug 13, 2025 at 03:37:03PM +0200, Pratyush Yadav wrote:
> On Wed, Aug 13 2025, Greg KH wrote:
> [...]
> > I will keep bringing this up, given the HUGE number of CVEs I keep
> > assigning each week for when userspace hits WARN_ON() calls, until that
> > flow starts to die out, either because we don't keep adding new calls or
> > we finally fix them all. Both would be good...
>
> Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> I'd guess one reason is overwhelming the system console, which can cause
> a denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

If panic_on_warn is set, this will cause the machine to crash/reboot,
which is considered a "vulnerability" by the CVE.org definition. If a
user can trigger this, it gets a CVE assigned to it.

hope this helps,

greg k-h
On Wed, Aug 13, 2025 at 1:37 PM Pratyush Yadav <pratyush@kernel.org> wrote:
>
> On Wed, Aug 13 2025, Greg KH wrote:
> [...]
>
> Out of curiosity, why is hitting a WARN_ON() considered a vulnerability?
> I'd guess one reason is overwhelming the system console, which can cause
> a denial of service, but what about WARN_ON_ONCE() or WARN_RATELIMIT()?

My understanding is that it is a vulnerability only if it can be
triggered from userspace; otherwise it is the preferred way to give
notice that something is very wrong.

Given the large number of machines that have panic_on_warn, a reliable
kernel crash that is triggered from userspace is a vulnerability(?).

Pasha
On Wed, Aug 13, 2025 at 01:41:51PM +0000, Pasha Tatashin wrote:
> [...]
>
> My understanding is that it is a vulnerability only if it can be
> triggered from userspace; otherwise it is the preferred way to give
> notice that something is very wrong.
>
> Given the large number of machines that have panic_on_warn, a reliable
> kernel crash that is triggered from userspace is a vulnerability(?).

Yes, and so is an unreliable one :)
> +static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
> +				     struct folio **folios,
> +				     unsigned int nr_folios)
> +{
> +	unsigned int i;

Should be 'long i'

Otherwise in err_unpreserve we get into an infinite loop. Thank you
Josh Hilke for noticing this.

Pasha

> +	int err;
> +
> +	for (i = 0; i < nr_folios; i++) {
> +		struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
> +		struct folio *folio = folios[i];
> +		unsigned int flags = 0;
> +		unsigned long pfn;
> +
> +		err = kho_preserve_folio(folio);
> +		if (err)
> +			goto err_unpreserve;
> +
> +		pfn = folio_pfn(folio);
> +		if (folio_test_dirty(folio))
> +			flags |= PRESERVED_FLAG_DIRTY;
> +		if (folio_test_uptodate(folio))
> +			flags |= PRESERVED_FLAG_UPTODATE;
> +
> +		pfolio->foliodesc = PRESERVED_FOLIO_MKDESC(pfn, flags);
> +		pfolio->index = folio->index;
> +	}
> +
> +	return 0;
> +
> +err_unpreserve:
> +	i--;
> +	for (; i >= 0; i--)
> +		WARN_ON_ONCE(kho_unpreserve_folio(folios[i]));
> +	return err;
> +}
> +
On Fri, Aug 08 2025, Pasha Tatashin wrote:

>> +static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
>> +				      struct folio **folios,
>> +				      unsigned int nr_folios)
>> +{
>> +	unsigned int i;
>
> Should be 'long i'
>
> Otherwise in err_unpreserve we get into an infinite loop. Thank you
> Josh Hilke for noticing this.

Good catch! Will fix.

[...]

-- 
Regards,
Pratyush Yadav
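For clarity, here is a sketch of the agreed fix with the loop index made signed so the error unwind can terminate; it mirrors the quoted function and is illustrative rather than the final committed version.

static int memfd_luo_preserve_folios(struct memfd_luo_preserved_folio *pfolios,
				     struct folio **folios,
				     unsigned int nr_folios)
{
	long i;		/* signed, so the unwind loop below can exit */
	int err;

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_preserved_folio *pfolio = &pfolios[i];
		struct folio *folio = folios[i];
		unsigned int flags = 0;
		unsigned long pfn;

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		pfn = folio_pfn(folio);
		if (folio_test_dirty(folio))
			flags |= PRESERVED_FLAG_DIRTY;
		if (folio_test_uptodate(folio))
			flags |= PRESERVED_FLAG_UPTODATE;

		pfolio->foliodesc = PRESERVED_FOLIO_MKDESC(pfn, flags);
		pfolio->index = folio->index;
	}

	return 0;

err_unpreserve:
	i--;
	/* With "unsigned int i", i >= 0 is always true and this loop never ends. */
	for (; i >= 0; i--)
		WARN_ON_ONCE(kho_unpreserve_folio(folios[i]));
	return err;
}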