[PATCH v2 1/3] kho: Adopt KHO radix tree data structures

Jason Miu posted 3 patches 3 months, 3 weeks ago
There is a newer version of this series
[PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by Jason Miu 3 months, 3 weeks ago
Introduce a radix tree data structure for tracking preserved
memory pages in KHO, which will replace the current xarray-based
implementation.

The primary motivation for this change is to eliminate the need for
serialization. By marking preserved pages directly in the new KHO
radix tree and passing them to the next kernel, the entire
serialization process can be removed. This ultimately allows for the
removal of the KHO finalize and abort states, simplifying the overall
design.

The preserved page physical address and its order are encoded into a
single value. The KHO radix tree has multiple levels of nodes, where
each node is a table containing descriptors to the next level of
nodes. The encoded value is split and its parts are stored along the
tree traversal. The traversal ends at a `kho_bitmap_table`, where
each bit represents a single preserved page.

Instead of serializing the memory map, the first kernel stores the KHO
radix tree root in the FDT. This KHO radix tree root is passed to the
second kernel after kexec, hence eliminating the KHO finalize and
abort states.

The second kernel walks the passed-in KHO radix tree from its root. It
restores the memory pages and their orders by decoding the value
stored in the KHO radix tree.

This architectural shift to using a shared radix tree structure
simplifies the KHO design and eliminates the overhead of serializing
and deserializing the preserved memory map.

Signed-off-by: Jason Miu <jasonmiu@google.com>
---
 MAINTAINERS                                   |   2 +
 include/linux/kexec_handover.h                |  23 +-
 .../linux/live_update/abi/kexec_handover.h    |  10 +
 kernel/kexec_handover.c                       | 861 +++++++++---------
 4 files changed, 458 insertions(+), 438 deletions(-)
 create mode 100644 include/linux/live_update/abi/kexec_handover.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 7ab92f471cb8..046a333ba0d3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13550,12 +13550,14 @@ KEXEC HANDOVER (KHO)
 M:	Alexander Graf <graf@amazon.com>
 M:	Mike Rapoport <rppt@kernel.org>
 M:	Changyuan Lyu <changyuanl@google.com>
+M:	Jason Miu <jasonmiu@google.com>
 L:	kexec@lists.infradead.org
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	Documentation/admin-guide/mm/kho.rst
 F:	Documentation/core-api/kho/*
 F:	include/linux/kexec_handover.h
+F:	include/linux/live_update/abi/kexec_handover.h
 F:	kernel/kexec_handover.c
 F:	tools/testing/selftests/kho/
 
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 348844cffb13..bc2f9e060a79 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -19,23 +19,6 @@ enum kho_event {
 struct folio;
 struct notifier_block;
 
-#define DECLARE_KHOSER_PTR(name, type) \
-	union {                        \
-		phys_addr_t phys;      \
-		type ptr;              \
-	} name
-#define KHOSER_STORE_PTR(dest, val)               \
-	({                                        \
-		typeof(val) v = val;              \
-		typecheck(typeof((dest).ptr), v); \
-		(dest).phys = virt_to_phys(v);    \
-	})
-#define KHOSER_LOAD_PTR(src)                                                 \
-	({                                                                   \
-		typeof(src) s = src;                                         \
-		(typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \
-	})
-
 struct kho_serialization;
 
 #ifdef CONFIG_KEXEC_HANDOVER
@@ -45,6 +28,7 @@ int kho_preserve_folio(struct folio *folio);
 int kho_preserve_phys(phys_addr_t phys, size_t size);
 struct folio *kho_restore_folio(phys_addr_t phys);
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
+int kho_remove_subtree(const char *name);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
 
 int register_kho_notifier(struct notifier_block *nb);
@@ -86,6 +70,11 @@ static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_remove_subtree(const char *name)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int register_kho_notifier(struct notifier_block *nb)
 {
 	return -EOPNOTSUPP;
diff --git a/include/linux/live_update/abi/kexec_handover.h b/include/linux/live_update/abi/kexec_handover.h
new file mode 100644
index 000000000000..3154e5c33851
--- /dev/null
+++ b/include/linux/live_update/abi/kexec_handover.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ABI__KEXEC_HANDOVER_H
+#define _ABI__KEXEC_HANDOVER_H
+
+#define KHO_FDT_COMPATIBLE "kho-v2"
+#define PROP_PRESERVED_PAGE_RADIX_TREE "preserved-page-radix-tree"
+#define PROP_SUB_FDT "fdt"
+
+#endif
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index ecd1ac210dbd..2fc5975690a7 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -15,9 +15,12 @@
 #include <linux/kexec_handover.h>
 #include <linux/libfdt.h>
 #include <linux/list.h>
+#include <linux/log2.h>
 #include <linux/memblock.h>
 #include <linux/notifier.h>
 #include <linux/page-isolation.h>
+#include <linux/rwsem.h>
+#include <linux/live_update/abi/kexec_handover.h>
 
 #include <asm/early_ioremap.h>
 
@@ -28,10 +31,6 @@
 #include "../mm/internal.h"
 #include "kexec_internal.h"
 
-#define KHO_FDT_COMPATIBLE "kho-v1"
-#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
-#define PROP_SUB_FDT "fdt"
-
 static bool kho_enable __ro_after_init;
 
 bool kho_is_enabled(void)
@@ -46,143 +45,305 @@ static int __init kho_parse_enable(char *p)
 }
 early_param("kho", kho_parse_enable);
 
+typedef int (*kho_radix_tree_walk_callback_t)(unsigned long encoded);
+
 /*
- * Keep track of memory that is to be preserved across KHO.
+ * The KHO radix tree tracks preserved memory pages. It is a hierarchical
+ * structure that starts with a single root `kho_radix_tree`. This single
+ * tree stores pages of all orders.
+ *
+ * This is achieved by encoding the page's physical address and its order into
+ * a single `unsigned long` value. This encoded value is then used to traverse
+ * the tree.
+ *
+ * The tree hierarchy is shown below:
+ *
+ * kho_radix_tree_root
+ * +-------------------+
+ * |     Level 5       | (struct kho_radix_tree)
+ * +-------------------+
+ *   |
+ *   v
+ * +-------------------+
+ * |     Level 4       | (struct kho_radix_tree)
+ * +-------------------+
+ *   |
+ *   | ... (intermediate levels)
+ *   |
+ *   v
+ * +-------------------+
+ * |      Level 0      | (struct kho_bitmap_table)
+ * +-------------------+
  *
- * The serializing side uses two levels of xarrays to manage chunks of per-order
- * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
- * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
- * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
- * 512K of bitmap memory will be needed for order 0.
+ * The following diagram illustrates how the encoded value is split into
+ * indices for the tree levels, with a PAGE_SIZE of 4KB:
  *
- * This approach is fully incremental, as the serialization progresses folios
- * can continue be aggregated to the tracker. The final step, immediately prior
- * to kexec would serialize the xarray information into a linked list for the
- * successor kernel to parse.
+ *      63:60   59:51    50:42    41:33    32:24    23:15         14:0
+ * +---------+--------+--------+--------+--------+--------+-----------------+
+ * |    0    |  Lv 5  |  Lv 4  |  Lv 3  |  Lv 2  |  Lv 1  |  Lv 0 (bitmap)  |
+ * +---------+--------+--------+--------+--------+--------+-----------------+
+ *
+ * Each `kho_radix_tree` (Levels 1-5) and `kho_bitmap_table` (Level 0) is
+ * PAGE_SIZE. Each entry in a `kho_radix_tree` is a descriptor (a physical
+ * address) pointing to the next level node. For Level 1 `kho_radix_tree`
+ * nodes, these descriptors point to a `kho_bitmap_table`. The final
+ * `kho_bitmap_table` is a bitmap where each set bit represents a single
+ * preserved page.
  */
+struct kho_radix_tree {
+	phys_addr_t table[PAGE_SIZE / sizeof(phys_addr_t)];
+};
 
-#define PRESERVE_BITS (512 * 8)
-
-struct kho_mem_phys_bits {
-	DECLARE_BITMAP(preserve, PRESERVE_BITS);
+struct kho_bitmap_table {
+	unsigned long bitmaps[PAGE_SIZE / sizeof(unsigned long)];
 };
 
-struct kho_mem_phys {
+/*
+ * The KHO radix tree tracks preserved pages by encoding a page's physical
+ * address (pa) and its order into a single unsigned long value. This value
+ * is then used to traverse the tree. The encoded value is composed of two
+ * parts: the 'order bits' in the upper part and the 'page offset' in the
+ * lower part.
+ *
+ *   <-- Higher Bits ------------------------------------ Lower Bits -->
+ *  +--------------------------+-----------------------------------------+
+ *  |        Order Bits        |               Page Offset               |
+ *  +--------------------------+-----------------------------------------+
+ *  | ... 0 0 1 0 0 ...        | pa >> (PAGE_SHIFT + order)              |
+ *  +--------------------------+-----------------------------------------+
+ *            ^
+ *            |
+ *  This single '1' bit's position
+ *  uniquely identifies the 'order'.
+ *
+ *
+ * Page Offset:
+ * The 'page offset' is the physical address normalized for its order. It
+ * effectively represents the page offset for the given order.
+ *
+ * Order Bits:
+ * The 'order bits' encode the page order by setting a single bit at a
+ * specific position. The position of this bit itself represents the order.
+ *
+ * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
+ * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
+ * offset occupies bits [0-51]. For order 0, the order bit is set at
+ * position 52.
+ *
+ * As the order increases, the number of bits required for the 'page offset'
+ * decreases. For example, order 1 requires one less bit for its page
+ * offset. This allows its order bit to be set at position 51,
+ * i.e. shifting right, without conflicting with the page offset bits.
+ *
+ * This design stores pages of all sizes (orders) in a single 6-level table.  It
+ * efficiently shares lower table levels, especially due to common zero top
+ * address bits, allowing a single, efficient algorithm to manage all pages.
+ */
+enum kho_radix_consts {
+	/* The bit position of a 0-order page; only 64-bit systems are supported */
+	ORDER_0_LG2 = 64 - PAGE_SHIFT,
+	/* Bit number used for each level of tables */
+	TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
+	/* Bit number used for lowest level of bitmaps */
+	BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
 	/*
-	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
-	 * to order.
+	 * The total tree depth is the number of intermediate levels
+	 * and 1 bitmap level.
 	 */
-	struct xarray phys_bits;
+	TREE_MAX_DEPTH = DIV_ROUND_UP(ORDER_0_LG2 - BITMAP_SIZE_LG2,
+				      TABLE_SIZE_LG2) + 1,
 };
 
-struct kho_mem_track {
-	/* Points to kho_mem_phys, each order gets its own bitmap tree */
-	struct xarray orders;
-};
+/*
+ * `kho_radix_tree_root` points to a page that serves as the root of the
+ * KHO radix tree. This page is allocated during KHO module initialization.
+ * Its physical address is written to the FDT and passed to the next kernel
+ * during kexec.
+ */
+static struct kho_radix_tree *kho_radix_tree_root;
+static DECLARE_RWSEM(kho_radix_tree_root_sem);
 
-struct khoser_mem_chunk;
+static inline phys_addr_t kho_radix_tree_desc(struct kho_radix_tree *va)
+{
+	return (phys_addr_t)virt_to_phys(va);
+}
 
-struct kho_serialization {
-	struct page *fdt;
-	struct list_head fdt_list;
-	struct dentry *sub_fdt_dir;
-	struct kho_mem_track track;
-	/* First chunk of serialized preserved memory map */
-	struct khoser_mem_chunk *preserved_mem_map;
-};
+static inline struct kho_radix_tree *kho_radix_tree(phys_addr_t desc)
+{
+	return (struct kho_radix_tree *)(phys_to_virt(desc));
+}
 
-static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
+static struct kho_radix_tree *kho_alloc_radix_tree(void)
 {
-	void *elm, *res;
+	return (struct kho_radix_tree *)get_zeroed_page(GFP_KERNEL);
+}
 
-	elm = xa_load(xa, index);
-	if (elm)
-		return elm;
+static unsigned long kho_radix_encode(phys_addr_t pa, unsigned int order)
+{
+	/* Order bits part */
+	unsigned long h = 1UL << (ORDER_0_LG2 - order);
+	/* Page offset part */
+	unsigned long l = pa >> (PAGE_SHIFT + order);
 
-	elm = kzalloc(sz, GFP_KERNEL);
-	if (!elm)
-		return ERR_PTR(-ENOMEM);
+	return h | l;
+}
 
-	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
-	if (xa_is_err(res))
-		res = ERR_PTR(xa_err(res));
+static phys_addr_t kho_radix_decode(unsigned long encoded, unsigned int *order)
+{
+	unsigned int order_bit = fls64(encoded);
+	phys_addr_t pa;
 
-	if (res) {
-		kfree(elm);
-		return res;
-	}
+	/* order_bit is numbered starting at 1 from fls64 */
+	*order = ORDER_0_LG2 - order_bit + 1;
+	/* The order is discarded by the shift */
+	pa = encoded << (PAGE_SHIFT + *order);
 
-	return elm;
+	return pa;
 }
 
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
-			     unsigned long end_pfn)
+static unsigned long kho_radix_get_index(unsigned long encoded,
+					 unsigned int level)
 {
-	struct kho_mem_phys_bits *bits;
-	struct kho_mem_phys *physxa;
-
-	while (pfn < end_pfn) {
-		const unsigned int order =
-			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
-		const unsigned long pfn_high = pfn >> order;
+	int s;
 
-		physxa = xa_load(&track->orders, order);
-		if (!physxa)
-			continue;
+	if (level == 0)
+		return encoded % (1 << BITMAP_SIZE_LG2);
 
-		bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
-		if (!bits)
-			continue;
+	s = ((level - 1) * TABLE_SIZE_LG2) + BITMAP_SIZE_LG2;
+	return (encoded >> s) % (1 << TABLE_SIZE_LG2);
+}
 
-		clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+static int kho_radix_set_bitmap(struct kho_bitmap_table *bit_tlb,
+				unsigned long offset)
+{
+	if (!bit_tlb ||
+	    offset >= PAGE_SIZE * BITS_PER_BYTE)
+		return -EINVAL;
 
-		pfn += 1 << order;
-	}
+	__set_bit(offset, bit_tlb->bitmaps);
+	return 0;
 }
 
-static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
-				unsigned int order)
+static int kho_radix_preserve_page(phys_addr_t pa, unsigned int order)
 {
-	struct kho_mem_phys_bits *bits;
-	struct kho_mem_phys *physxa, *new_physxa;
-	const unsigned long pfn_high = pfn >> order;
+	unsigned long encoded = kho_radix_encode(pa, order);
+	struct kho_radix_tree *current_tree, *new_tree;
+	struct kho_bitmap_table *bitmap_table;
+	unsigned int tree_level = TREE_MAX_DEPTH - 1;
+	int err = 0;
+	int i, idx;
 
-	might_sleep();
+	down_write(&kho_radix_tree_root_sem);
 
-	physxa = xa_load(&track->orders, order);
-	if (!physxa) {
-		int err;
+	if (!kho_radix_tree_root) {
+		err = -EINVAL;
+		goto out;
+	}
 
-		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
-		if (!new_physxa)
-			return -ENOMEM;
+	current_tree = kho_radix_tree_root;
 
-		xa_init(&new_physxa->phys_bits);
-		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
-				    GFP_KERNEL);
+	/* Go from high levels to low levels */
+	for (i = tree_level; i >= 0; i--) {
+		idx = kho_radix_get_index(encoded, i);
 
-		err = xa_err(physxa);
-		if (err || physxa) {
-			xa_destroy(&new_physxa->phys_bits);
-			kfree(new_physxa);
+		if (i == 0) {
+			bitmap_table = (struct kho_bitmap_table *)current_tree;
+			err = kho_radix_set_bitmap(bitmap_table, idx);
+			goto out;
+		}
+
+		if (!current_tree->table[idx]) {
+			new_tree = kho_alloc_radix_tree();
+			if (!new_tree) {
+				err = -ENOMEM;
+				goto out;
+			}
 
-			if (err)
-				return err;
+			current_tree->table[idx] =
+				kho_radix_tree_desc(new_tree);
+			current_tree = new_tree;
 		} else {
-			physxa = new_physxa;
+			current_tree = kho_radix_tree(current_tree->table[idx]);
 		}
 	}
 
-	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
-				sizeof(*bits));
-	if (IS_ERR(bits))
-		return PTR_ERR(bits);
+out:
+	up_write(&kho_radix_tree_root_sem);
+	return err;
+}
+
+static int kho_radix_walk_bitmaps(struct kho_bitmap_table *bit_tlb,
+				  unsigned long encoded,
+				  kho_radix_tree_walk_callback_t cb)
+{
+	unsigned long *bitmap = (unsigned long *)bit_tlb;
+	unsigned int i;
+	int err = 0;
+
+	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
+		err = cb(encoded | i);
+		if (err)
+			return err;
+	}
 
-	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+	return 0;
+}
+
+static int kho_radix_walk_trees(struct kho_radix_tree *root, unsigned int level,
+				unsigned long start,
+				kho_radix_tree_walk_callback_t cb)
+{
+	struct kho_radix_tree *next_tree;
+	struct kho_bitmap_table *bitmap_table;
+	unsigned long encoded, i;
+	unsigned int level_shift;
+	int err = 0;
+
+	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
+		if (root->table[i]) {
+			level_shift = ((level - 1) * TABLE_SIZE_LG2) +
+				BITMAP_SIZE_LG2;
+			encoded = start | (i << level_shift);
+
+			next_tree = kho_radix_tree(root->table[i]);
+
+			if (level > 1) {
+				err = kho_radix_walk_trees(next_tree, level - 1,
+							   encoded, cb);
+				if (err)
+					return err;
+			} else {
+				/*
+				 * we are at level 1,
+				 * next_tree is pointing to the level 0 bitmap.
+				 */
+				bitmap_table = (struct kho_bitmap_table *)next_tree;
+				err = kho_radix_walk_bitmaps(bitmap_table, encoded, cb);
+				if (err)
+					return err;
+			}
+		}
+	}
 
 	return 0;
 }
 
+struct kho_serialization {
+	struct page *fdt;
+	struct list_head fdt_list;
+	struct dentry *sub_fdt_dir;
+	struct mutex fdt_mutex;		/* Lock for the output FDT */
+};
+
+static int __kho_preserve_order(unsigned long pfn, unsigned int order)
+{
+	unsigned long pa = PFN_PHYS(pfn);
+
+	might_sleep();
+
+	return kho_radix_preserve_page(pa, order);
+}
+
 /* almost as free_reserved_page(), just don't free the page */
 static void kho_restore_page(struct page *page, unsigned int order)
 {
@@ -224,152 +385,49 @@ struct folio *kho_restore_folio(phys_addr_t phys)
 }
 EXPORT_SYMBOL_GPL(kho_restore_folio);
 
-/* Serialize and deserialize struct kho_mem_phys across kexec
- *
- * Record all the bitmaps in a linked list of pages for the next kernel to
- * process. Each chunk holds bitmaps of the same order and each block of bitmaps
- * starts at a given physical address. This allows the bitmaps to be sparse. The
- * xarray is used to store them in a tree while building up the data structure,
- * but the KHO successor kernel only needs to process them once in order.
- *
- * All of this memory is normal kmalloc() memory and is not marked for
- * preservation. The successor kernel will remain isolated to the scratch space
- * until it completes processing this list. Once processed all the memory
- * storing these ranges will be marked as free.
- */
-
-struct khoser_mem_bitmap_ptr {
-	phys_addr_t phys_start;
-	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
-};
-
-struct khoser_mem_chunk_hdr {
-	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
-	unsigned int order;
-	unsigned int num_elms;
-};
-
-#define KHOSER_BITMAP_SIZE                                   \
-	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
-	 sizeof(struct khoser_mem_bitmap_ptr))
-
-struct khoser_mem_chunk {
-	struct khoser_mem_chunk_hdr hdr;
-	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
-};
-
-static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
-
-static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
-					  unsigned long order)
-{
-	struct khoser_mem_chunk *chunk;
-
-	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!chunk)
-		return NULL;
-	chunk->hdr.order = order;
-	if (cur_chunk)
-		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
-	return chunk;
-}
-
-static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
-{
-	struct khoser_mem_chunk *chunk = first_chunk;
-
-	while (chunk) {
-		struct khoser_mem_chunk *tmp = chunk;
-
-		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
-		kfree(tmp);
-	}
-}
-
-static int kho_mem_serialize(struct kho_serialization *ser)
+static int __init kho_radix_walk_trees_memblock_callback(unsigned long encoded)
 {
-	struct khoser_mem_chunk *first_chunk = NULL;
-	struct khoser_mem_chunk *chunk = NULL;
-	struct kho_mem_phys *physxa;
-	unsigned long order;
-
-	xa_for_each(&ser->track.orders, order, physxa) {
-		struct kho_mem_phys_bits *bits;
-		unsigned long phys;
-
-		chunk = new_chunk(chunk, order);
-		if (!chunk)
-			goto err_free;
-
-		if (!first_chunk)
-			first_chunk = chunk;
-
-		xa_for_each(&physxa->phys_bits, phys, bits) {
-			struct khoser_mem_bitmap_ptr *elm;
+	unsigned int order;
+	unsigned long pa;
+	struct page *page;
+	int sz;
 
-			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
-				chunk = new_chunk(chunk, order);
-				if (!chunk)
-					goto err_free;
-			}
+	pa = kho_radix_decode(encoded, &order);
 
-			elm = &chunk->bitmaps[chunk->hdr.num_elms];
-			chunk->hdr.num_elms++;
-			elm->phys_start = (phys * PRESERVE_BITS)
-					  << (order + PAGE_SHIFT);
-			KHOSER_STORE_PTR(elm->bitmap, bits);
-		}
-	}
+	sz = 1 << (order + PAGE_SHIFT);
+	page = phys_to_page(pa);
 
-	ser->preserved_mem_map = first_chunk;
+	/* Reserve the memory preserved in KHO radix tree in memblock */
+	memblock_reserve(pa, sz);
+	memblock_reserved_mark_noinit(pa, sz);
+	page->private = order;
 
 	return 0;
-
-err_free:
-	kho_mem_ser_free(first_chunk);
-	return -ENOMEM;
-}
-
-static void __init deserialize_bitmap(unsigned int order,
-				      struct khoser_mem_bitmap_ptr *elm)
-{
-	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
-	unsigned long bit;
-
-	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
-		int sz = 1 << (order + PAGE_SHIFT);
-		phys_addr_t phys =
-			elm->phys_start + (bit << (order + PAGE_SHIFT));
-		struct page *page = phys_to_page(phys);
-
-		memblock_reserve(phys, sz);
-		memblock_reserved_mark_noinit(phys, sz);
-		page->private = order;
-	}
 }
 
 static void __init kho_mem_deserialize(const void *fdt)
 {
-	struct khoser_mem_chunk *chunk;
+	struct kho_radix_tree *tree_root;
 	const phys_addr_t *mem;
 	int len;
 
-	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+	/* Retrieve the KHO radix tree from passed-in FDT. */
+	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_PAGE_RADIX_TREE, &len);
 
 	if (!mem || len != sizeof(*mem)) {
-		pr_err("failed to get preserved memory bitmaps\n");
+		pr_err("failed to get preserved KHO memory tree\n");
 		return;
 	}
 
-	chunk = *mem ? phys_to_virt(*mem) : NULL;
-	while (chunk) {
-		unsigned int i;
+	tree_root = *mem ?
+		(struct kho_radix_tree *)phys_to_virt(*mem) :
+		NULL;
 
-		for (i = 0; i != chunk->hdr.num_elms; i++)
-			deserialize_bitmap(chunk->hdr.order,
-					   &chunk->bitmaps[i]);
-		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
-	}
+	if (!tree_root)
+		return;
+
+	kho_radix_walk_trees(tree_root, TREE_MAX_DEPTH - 1,
+			     0, kho_radix_walk_trees_memblock_callback);
 }
 
 /*
@@ -599,6 +657,35 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
 	return 0;
 }
 
+static int kho_debugfs_fdt_remove(struct list_head *list, const char *name)
+{
+	struct fdt_debugfs *f, *tmp;
+
+	list_for_each_entry_safe(f, tmp, list, list) {
+		if (strcmp(f->file->d_name.name, name) == 0) {
+			debugfs_remove(f->file);
+			list_del(&f->list);
+			kfree(f);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+struct kho_out {
+	struct blocking_notifier_head chain_head;
+	struct dentry *dir;
+	struct kho_serialization ser;
+};
+
+static struct kho_out kho_out = {
+	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
+	.ser = {
+		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
+	},
+};
+
 /**
  * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
  * @ser: serialization control object passed by KHO notifiers.
@@ -616,43 +703,90 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
  */
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
 {
-	int err = 0;
+	void *root_fdt = page_to_virt(kho_out.ser.fdt);
 	u64 phys = (u64)virt_to_phys(fdt);
-	void *root = page_to_virt(ser->fdt);
+	int offset, err;
 
-	err |= fdt_begin_node(root, name);
-	err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
-	err |= fdt_end_node(root);
+	guard(mutex)(&kho_out.ser.fdt_mutex);
 
-	if (err)
-		return err;
+	offset = fdt_path_offset(root_fdt, "/");
+	if (offset < 0) {
+		pr_err("Failed to find the root FDT node: %s\n",
+		       fdt_strerror(offset));
+		return (offset == -FDT_ERR_NOTFOUND) ? -ENOENT : -EINVAL;
+	}
+
+	offset = fdt_add_subnode(root_fdt, offset, name);
+	if (offset < 0) {
+		pr_err("Failed to add root FDT subnode '%s': %s\n",
+		       name, fdt_strerror(offset));
+		return -ENOSPC;
+	}
+
+	err = fdt_setprop(root_fdt, offset, PROP_SUB_FDT,
+			  &phys, sizeof(phys));
+	if (err) {
+		pr_err("Failed to set property for FDT subnode '%s': %s\n",
+		       name, fdt_strerror(err));
+		fdt_del_node(root_fdt, offset);
+		return -ENOSPC;
+	}
 
-	return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
+	return kho_debugfs_fdt_add(&kho_out.ser.fdt_list,
+				   kho_out.ser.sub_fdt_dir, name, fdt);
 }
 EXPORT_SYMBOL_GPL(kho_add_subtree);
 
-struct kho_out {
-	struct blocking_notifier_head chain_head;
+/**
+ * kho_remove_subtree - remove a previously added sub FDT.
+ * @name: name of the sub tree to remove.
+ *
+ * Removes a child node named @name from the KHO root FDT that was
+ * previously added with kho_add_subtree().
+ *
+ * The corresponding debugfs blob entry at
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` is also removed.
+ *
+ * Return: 0 on success, error code on failure.
+ */
+int kho_remove_subtree(const char *name)
+{
+	void *root_fdt = page_to_virt(kho_out.ser.fdt);
+	int offset, err;
 
-	struct dentry *dir;
+	guard(mutex)(&kho_out.ser.fdt_mutex);
 
-	struct mutex lock; /* protects KHO FDT finalization */
+	offset = fdt_path_offset(root_fdt, "/");
+	if (offset < 0) {
+		pr_err("Failed to find the root FDT node: %s\n",
+		       fdt_strerror(offset));
+		err = (offset == -FDT_ERR_NOTFOUND) ? -ENOENT : -EINVAL;
+		goto remove_debugfs;
+	}
 
-	struct kho_serialization ser;
-	bool finalized;
-};
+	offset = fdt_subnode_offset(root_fdt, offset, name);
+	if (offset < 0) {
+		pr_err("Error finding root FDT subnode '%s': %s\n", name,
+		       fdt_strerror(offset));
+		err = (offset == -FDT_ERR_NOTFOUND) ? -ENOENT : -EINVAL;
+		goto remove_debugfs;
+	}
 
-static struct kho_out kho_out = {
-	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
-	.lock = __MUTEX_INITIALIZER(kho_out.lock),
-	.ser = {
-		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
-		.track = {
-			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
-		},
-	},
-	.finalized = false,
-};
+	err = fdt_del_node(root_fdt, offset);
+	if (err < 0) {
+		pr_err("Failed to delete subnode '%s': %s\n", name,
+		       fdt_strerror(err));
+		err = -EINVAL;
+		goto remove_debugfs;
+	}
+
+remove_debugfs:
+	if (kho_debugfs_fdt_remove(&kho_out.ser.fdt_list, name) && !err)
+		err = -ENOENT;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_remove_subtree);
 
 int register_kho_notifier(struct notifier_block *nb)
 {
@@ -679,12 +813,8 @@ int kho_preserve_folio(struct folio *folio)
 {
 	const unsigned long pfn = folio_pfn(folio);
 	const unsigned int order = folio_order(folio);
-	struct kho_mem_track *track = &kho_out.ser.track;
-
-	if (kho_out.finalized)
-		return -EBUSY;
 
-	return __kho_preserve_order(track, pfn, order);
+	return __kho_preserve_order(pfn, order);
 }
 EXPORT_SYMBOL_GPL(kho_preserve_folio);
 
@@ -701,14 +831,8 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio);
 int kho_preserve_phys(phys_addr_t phys, size_t size)
 {
 	unsigned long pfn = PHYS_PFN(phys);
-	unsigned long failed_pfn = 0;
-	const unsigned long start_pfn = pfn;
 	const unsigned long end_pfn = PHYS_PFN(phys + size);
 	int err = 0;
-	struct kho_mem_track *track = &kho_out.ser.track;
-
-	if (kho_out.finalized)
-		return -EBUSY;
 
 	if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
 		return -EINVAL;
@@ -717,19 +841,14 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
 		const unsigned int order =
 			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
 
-		err = __kho_preserve_order(track, pfn, order);
-		if (err) {
-			failed_pfn = pfn;
-			break;
-		}
+		err = __kho_preserve_order(pfn, order);
+		if (err)
+			return err;
 
 		pfn += 1 << order;
 	}
 
-	if (err)
-		__kho_unpreserve(track, start_pfn, failed_pfn);
-
-	return err;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(kho_preserve_phys);
 
@@ -737,150 +856,6 @@ EXPORT_SYMBOL_GPL(kho_preserve_phys);
 
 static struct dentry *debugfs_root;
 
-static int kho_out_update_debugfs_fdt(void)
-{
-	int err = 0;
-	struct fdt_debugfs *ff, *tmp;
-
-	if (kho_out.finalized) {
-		err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
-					  "fdt", page_to_virt(kho_out.ser.fdt));
-	} else {
-		list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
-			debugfs_remove(ff->file);
-			list_del(&ff->list);
-			kfree(ff);
-		}
-	}
-
-	return err;
-}
-
-static int kho_abort(void)
-{
-	int err;
-	unsigned long order;
-	struct kho_mem_phys *physxa;
-
-	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
-		struct kho_mem_phys_bits *bits;
-		unsigned long phys;
-
-		xa_for_each(&physxa->phys_bits, phys, bits)
-			kfree(bits);
-
-		xa_destroy(&physxa->phys_bits);
-		kfree(physxa);
-	}
-	xa_destroy(&kho_out.ser.track.orders);
-
-	if (kho_out.ser.preserved_mem_map) {
-		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
-		kho_out.ser.preserved_mem_map = NULL;
-	}
-
-	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
-					   NULL);
-	err = notifier_to_errno(err);
-
-	if (err)
-		pr_err("Failed to abort KHO finalization: %d\n", err);
-
-	return err;
-}
-
-static int kho_finalize(void)
-{
-	int err = 0;
-	u64 *preserved_mem_map;
-	void *fdt = page_to_virt(kho_out.ser.fdt);
-
-	err |= fdt_create(fdt, PAGE_SIZE);
-	err |= fdt_finish_reservemap(fdt);
-	err |= fdt_begin_node(fdt, "");
-	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
-	/**
-	 * Reserve the preserved-memory-map property in the root FDT, so
-	 * that all property definitions will precede subnodes created by
-	 * KHO callers.
-	 */
-	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
-					sizeof(*preserved_mem_map),
-					(void **)&preserved_mem_map);
-	if (err)
-		goto abort;
-
-	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
-	if (err)
-		goto abort;
-
-	err = blocking_notifier_call_chain(&kho_out.chain_head,
-					   KEXEC_KHO_FINALIZE, &kho_out.ser);
-	err = notifier_to_errno(err);
-	if (err)
-		goto abort;
-
-	err = kho_mem_serialize(&kho_out.ser);
-	if (err)
-		goto abort;
-
-	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);
-
-	err |= fdt_end_node(fdt);
-	err |= fdt_finish(fdt);
-
-abort:
-	if (err) {
-		pr_err("Failed to convert KHO state tree: %d\n", err);
-		kho_abort();
-	}
-
-	return err;
-}
-
-static int kho_out_finalize_get(void *data, u64 *val)
-{
-	mutex_lock(&kho_out.lock);
-	*val = kho_out.finalized;
-	mutex_unlock(&kho_out.lock);
-
-	return 0;
-}
-
-static int kho_out_finalize_set(void *data, u64 _val)
-{
-	int ret = 0;
-	bool val = !!_val;
-
-	mutex_lock(&kho_out.lock);
-
-	if (val == kho_out.finalized) {
-		if (kho_out.finalized)
-			ret = -EEXIST;
-		else
-			ret = -ENOENT;
-		goto unlock;
-	}
-
-	if (val)
-		ret = kho_finalize();
-	else
-		ret = kho_abort();
-
-	if (ret)
-		goto unlock;
-
-	kho_out.finalized = val;
-	ret = kho_out_update_debugfs_fdt();
-
-unlock:
-	mutex_unlock(&kho_out.lock);
-	return ret;
-}
-
-DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
-			 kho_out_finalize_set, "%llu\n");
-
 static int scratch_phys_show(struct seq_file *m, void *v)
 {
 	for (int i = 0; i < kho_scratch_cnt; i++)
@@ -921,11 +896,6 @@ static __init int kho_out_debugfs_init(void)
 	if (IS_ERR(f))
 		goto err_rmdir;
 
-	f = debugfs_create_file("finalize", 0600, dir, NULL,
-				&fops_kho_out_finalize);
-	if (IS_ERR(f))
-		goto err_rmdir;
-
 	kho_out.dir = dir;
 	kho_out.ser.sub_fdt_dir = sub_fdt_dir;
 	return 0;
@@ -1037,6 +1007,36 @@ static __init int kho_in_debugfs_init(const void *fdt)
 	return err;
 }
 
+static int kho_out_fdt_init(void)
+{
+	void *fdt = page_to_virt(kho_out.ser.fdt);
+	u64 preserved_radix_tree_pa;
+	int err = 0;
+
+	err = fdt_create_empty_tree(fdt, PAGE_SIZE);
+	if (err)
+		goto out;
+
+	err = fdt_setprop_string(fdt, 0, "compatible", KHO_FDT_COMPATIBLE);
+	if (err)
+		goto out;
+
+	down_read(&kho_radix_tree_root_sem);
+	preserved_radix_tree_pa = (u64)virt_to_phys(kho_radix_tree_root);
+	up_read(&kho_radix_tree_root_sem);
+
+	err = fdt_setprop(fdt, 0, PROP_PRESERVED_PAGE_RADIX_TREE,
+			  &preserved_radix_tree_pa, sizeof(u64));
+	if (err)
+		goto out;
+
+out:
+	if (err)
+		pr_err("Failed to convert KHO memory tree: %d\n", err);
+
+	return err;
+}
+
 static __init int kho_init(void)
 {
 	int err = 0;
@@ -1051,15 +1051,31 @@ static __init int kho_init(void)
 		goto err_free_scratch;
 	}
 
+	down_write(&kho_radix_tree_root_sem);
+	kho_radix_tree_root = (struct kho_radix_tree *)
+		kzalloc(PAGE_SIZE, GFP_KERNEL);
+	up_write(&kho_radix_tree_root_sem);
+	if (!kho_radix_tree_root) {
+		err = -ENOMEM;
+		goto err_free_fdt;
+	}
+
+	err = kho_out_fdt_init();
+	if (err)
+		goto err_free_kho_radix_tree_root;
+
 	debugfs_root = debugfs_create_dir("kho", NULL);
 	if (IS_ERR(debugfs_root)) {
 		err = -ENOENT;
-		goto err_free_fdt;
+		goto err_free_kho_radix_tree_root;
 	}
 
 	err = kho_out_debugfs_init();
 	if (err)
-		goto err_free_fdt;
+		goto err_free_kho_radix_tree_root;
+
+	/* Preserve the memory page of FDT for the next kernel */
+	kho_preserve_phys(page_to_phys(kho_out.ser.fdt), PAGE_SIZE);
 
 	if (fdt) {
 		err = kho_in_debugfs_init(fdt);
@@ -1087,6 +1103,9 @@ static __init int kho_init(void)
 
 	return 0;
 
+err_free_kho_radix_tree_root:
+	kfree(kho_radix_tree_root);
+	kho_radix_tree_root = NULL;
 err_free_fdt:
 	put_page(kho_out.ser.fdt);
 	kho_out.ser.fdt = NULL;
@@ -1100,7 +1119,7 @@ static __init int kho_init(void)
 	kho_enable = false;
 	return err;
 }
-late_initcall(kho_init);
+subsys_initcall(kho_init);
 
 static void __init kho_release_scratch(void)
 {
-- 
2.51.0.858.gf9c4a03a3a-goog
Re: [PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by Mike Rapoport 3 months, 2 weeks ago
On Mon, Oct 20, 2025 at 03:03:04AM -0700, Jason Miu wrote:
> Introduce a radix tree data structure for tracking preserved
> memory pages in KHO, which will replace the current xarray-based
> implementation.
> 
> The primary motivation for this change is to eliminate the need for
> serialization. By marking preserved pages directly in the new KHO
> radix tree and passing them to the next kernel, the entire
> serialization process can be removed. This ultimately allows for the
> removal of the KHO finalize and abort states, simplifying the overall
> design.
> 
> The preserved page physical address and its order are encoded into a
> value. The KHO radix tree has multiple levels of nodes where each node
> is a table containing a descriptor to the next level of nodes. The
> encoded value gets split and its parts are stored along the tree
> traversal. The tree traversal ends with the `kho_bitmap_table`, where
> each bit represents a single preserved page.
> 
> Instead of serializing the memory map, the first kernel stores the KHO
> radix tree root in the FDT. This KHO radix tree root is passed to the
> second kernel after kexec, hence eliminating the KHO finalize and abort
> states.
> 
> The second kernel walks the passed-in KHO radix tree from its root. It
> restores the memory pages and their orders by decoding the value
> stored in the KHO radix tree.
> 
> This architectural shift to using a shared radix tree structure
> simplifies the KHO design and eliminates the overhead of serializing
> and deserializing the preserved memory map.
> 
> Signed-off-by: Jason Miu <jasonmiu@google.com>
> ---
>  MAINTAINERS                                   |   2 +
>  include/linux/kexec_handover.h                |  23 +-
>  .../linux/live_update/abi/kexec_handover.h    |  10 +
>  kernel/kexec_handover.c                       | 861 +++++++++---------
>  4 files changed, 458 insertions(+), 438 deletions(-)
>  create mode 100644 include/linux/live_update/abi/kexec_handover.h
> 
> diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
> index 348844cffb13..bc2f9e060a79 100644
> --- a/include/linux/kexec_handover.h
> +++ b/include/linux/kexec_handover.h
> @@ -45,6 +28,7 @@ int kho_preserve_folio(struct folio *folio);
>  int kho_preserve_phys(phys_addr_t phys, size_t size);
>  struct folio *kho_restore_folio(phys_addr_t phys);
>  int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
> +int kho_remove_subtree(const char *name);

This change seems unrelated.

>  int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
>  
>  int register_kho_notifier(struct notifier_block *nb);
> @@ -86,6 +70,11 @@ static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
>  	return -EOPNOTSUPP;
>  }
>  
> +static inline int kho_remove_subtree(const char *name)
> +{
> +	return -EOPNOTSUPP;
> +}
> +

Ditto.

>  static inline int register_kho_notifier(struct notifier_block *nb)
>  {
>  	return -EOPNOTSUPP;
> diff --git a/include/linux/live_update/abi/kexec_handover.h b/include/linux/live_update/abi/kexec_handover.h
> new file mode 100644
> index 000000000000..3154e5c33851
> --- /dev/null
> +++ b/include/linux/live_update/abi/kexec_handover.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _ABI__KEXEC_HANDOVER_H
> +#define _ABI__KEXEC_HANDOVER_H
> +
> +#define KHO_FDT_COMPATIBLE "kho-v2"
> +#define PROP_PRESERVED_PAGE_RADIX_TREE "preserved-page-radix-tree"

The property describes the preserved memory, it's not important if it's a
radix tree, a linked list or whatever else, so I don't think
page-radix-tree is a good name.

IMHO we can keep the preserved-memory-map name because version bump ensures
we won't attempt to parse v1 data structure.

> +#define PROP_SUB_FDT "fdt"
> +
> +#endif
> diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
> index ecd1ac210dbd..2fc5975690a7 100644
> --- a/kernel/kexec_handover.c
> +++ b/kernel/kexec_handover.c
> @@ -46,143 +45,305 @@ static int __init kho_parse_enable(char *p)
>  }
>  early_param("kho", kho_parse_enable);
>  
> +typedef int (*kho_radix_tree_walk_callback_t)(unsigned long encoded);
> +
>  /*
> - * Keep track of memory that is to be preserved across KHO.

Please leave this sentence.

> + * The KHO radix tree tracks preserved memory pages. It is a hierarchical
> + * structure that starts with a single root `kho_radix_tree`. This single
> + * tree stores pages of all orders.

I'd combine this comment and the comment above declaration of enum
kho_radix_consts into something like this:

/*
 * KHO tracks preserved memory using a radix tree data structure. Each node of
 * the tree is PAGE_SIZE and it is PAGE_SIZE aligned.
 *
 * The leaf nodes of the tree are bitmaps where each set bit represents a
 * single preserved page. The intermediate and root nodes of the tree are
 * tables of physical addresses that point to a lower level node:
 *
 * kho_radix_tree_root
 * +-------------------+
 * |     Level 5       | (struct kho_radix_tree)
 * +-------------------+
 *   |
 *   v
 * +-------------------+
 * |     Level 4       | (struct kho_radix_tree)
 * +-------------------+
 *   |
 *   | ... (intermediate levels)
 *   |
 *   v
 * +-------------------+
 * |      Level 0      | (struct kho_bitmap_table)
 * +-------------------+
 *
 * The physical address of the preserved page and its order are encoded into a
 * single unsigned long value where higher bits represent the order and the
 * lower bits represent the page offset:
 *
 *   <-- Higher Bits ------------------------------------ Lower Bits -->
 *  +--------------------------+-----------------------------------------+
 *  |        Order Bits        |               Page Offset               |
 *  +--------------------------+-----------------------------------------+
 *  |0| ... 0 0 0 1            | pa >> (PAGE_SHIFT + 0)                  |
 *  |1| ... 0 0 0 0 1            | pa >> (PAGE_SHIFT + 1)                |
 *  |2| ... 0 0 0 0 0 1            | pa >> (PAGE_SHIFT + 2)              |
 *  +------------------------------+-------------------------------------+
 *                  ^
 *                  |
 *  This single '1' bit's position uniquely identifies the 'order'.
 *
 * Page Offset:
 * The 'page offset' is the physical address normalized for its order. It
 * effectively represents the page offset for the given order.
 *
 * Order Bits:
 * The 'order bits' encode the page order by setting a single bit at a
 * specific position. The position of this bit itself represents the order.
 *
 * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
 * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
 * offset occupies bits [0-51]. For order 0, the order bit is set at
 * position 52.
 *
 * As the order increases, the number of bits required for the 'page offset'
 * decreases. For example, order 1 requires one less bit for its page
 * offset. This allows its order bit to be set at position 51,
 * i.e. shifting right, without conflicting with the page offset bits.
 *
 * Bitfields in this encoded value are used as indices into the tables for
 * upper levels and as bitmap index for the lowest level.
 *
 * The following diagram illustrates how the encoded value is split into
 * indices for the tree levels, with PAGE_SIZE of 4KB:
 *
 *      63:60   59:51    50:42    41:33    32:24    23:15         14:0
 * +---------+--------+--------+--------+--------+--------+-----------------+
 * |    0    |  Lv 5  |  Lv 4  |  Lv 3  |  Lv 2  |  Lv 1  |  Lv 0 (bitmap)  |
 * +---------+--------+--------+--------+--------+--------+-----------------+
 *
 * This design stores pages of all sizes (orders) in a single 6-level table.
 * It efficiently shares lower table levels, especially due to common zero top
 * address bits, allowing a single, efficient algorithm to manage all pages.
 *
 * < a paragraph about memory consumption >
 */

> + *
> + * This is achieved by encoding the page's physical address and its order into
> + * a single `unsigned long` value. This encoded value is then used to traverse
> + * the tree.

...

> + * Each `kho_radix_tree` (Levels 1-5) and `kho_bitmap_table` (Level 0) is
> + * PAGE_SIZE. Each entry in a `kho_radix_tree` is a descriptor (a physical
> + * address) pointing to the next level node. For Level 1 `kho_radix_tree`
> + * nodes, these descriptors point to a `kho_bitmap_table`. The final
> + * `kho_bitmap_table` is a bitmap where each set bit represents a single
> + * preserved page.
>   */

Please add an empty line between description and the declaration.

> +struct kho_radix_tree {

I think the name should reflect that this data structure is for tracking
memory in KHO. Otherwise it reads like a generic data structure that can be
used with KHO, like kho_array Pratyush proposed a while ago.

Maybe struct kho_mem_radix_tree? Or even drop the 'radix', I don't think
it's really important here.

> +	phys_addr_t table[PAGE_SIZE / sizeof(phys_addr_t)];
> +};
>  
> -#define PRESERVE_BITS (512 * 8)
> -
> -struct kho_mem_phys_bits {
> -	DECLARE_BITMAP(preserve, PRESERVE_BITS);
> +struct kho_bitmap_table {
> +	unsigned long bitmaps[PAGE_SIZE / sizeof(unsigned long)];

What's wrong with kho_mem_phys_bits and DECLARE_BITMAP?
Simply adjust PRESERVE_BITS to match PAGE_SIZE.

>  };
>  
> -struct kho_mem_phys {
> +/*
> + * The KHO radix tree tracks preserved pages by encoding a page's physical

...

> + * address bits, allowing a single, efficient algorithm to manage all pages.
> + */
> +enum kho_radix_consts {

I think this should be declared before the structs, so that there will be
one long description before the enum, structs and code.

> +	/* The bit position of a 0-order page, only supports 64bits system */
> +	ORDER_0_LG2 = 64 - PAGE_SHIFT,
> +	/* Bit number used for each level of tables */
> +	TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
> +	/* Bit number used for lowest level of bitmaps */
> +	BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
>  	/*
> -	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
> -	 * to order.
> +	 * The total tree depth is the number of intermediate levels
> +	 * and 1 bitmap level.
>  	 */
> -	struct xarray phys_bits;
> +	TREE_MAX_DEPTH = DIV_ROUND_UP(ORDER_0_LG2 - BITMAP_SIZE_LG2,
> +				      TABLE_SIZE_LG2) + 1,
>  };
>  
> -struct kho_mem_track {
> -	/* Points to kho_mem_phys, each order gets its own bitmap tree */
> -	struct xarray orders;
> -};
> +/*
> + * `kho_radix_tree_root` points to a page thats serves as the root of the
> + * KHO radix tree. This page is allocated during KHO module initialization.
> + * Its physical address is written to the FDT and passed to the next kernel
> + * during kexec.
> + */
> +static struct kho_radix_tree *kho_radix_tree_root;
> +static DECLARE_RWSEM(kho_radix_tree_root_sem);

I believe it's clearer to have struct kho_mem_track containing the pointer
to root and the semaphore, e.g.

struct kho_mem_track {
	struct kho_mem_tree *root;
	struct rw_semaphore sem;
};
  
> -struct khoser_mem_chunk;
> +static inline phys_addr_t kho_radix_tree_desc(struct kho_radix_tree *va)

I think kho_mem_tree is a better prefix than kho_radix.

> +{
> +	return (phys_addr_t)virt_to_phys(va);
> +}
>  
> -struct kho_serialization {
> -	struct page *fdt;
> -	struct list_head fdt_list;
> -	struct dentry *sub_fdt_dir;
> -	struct kho_mem_track track;
> -	/* First chunk of serialized preserved memory map */
> -	struct khoser_mem_chunk *preserved_mem_map;
> -};
> +static inline struct kho_radix_tree *kho_radix_tree(phys_addr_t desc)

This is not really a 'desc', this is a plain physical address, i.e. 'pa'.
It's probably will be clearer to just use phys_to_virt() and virt_to_phys()
directly rather than introduce these wrappers.

> +{
> +	return (struct kho_radix_tree *)(phys_to_virt(desc));
> +}
>  
> -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
> +static struct kho_radix_tree *kho_alloc_radix_tree(void)
>  {
> -	void *elm, *res;
> +	return (struct kho_radix_tree *)get_zeroed_page(GFP_KERNEL);
> +}
>  
> -	elm = xa_load(xa, index);
> -	if (elm)
> -		return elm;
> +static unsigned long kho_radix_encode(phys_addr_t pa, unsigned int order)

This name is really cryptic. Encode to what? Can't say I have great ideas,
but this name should reflect that physical address and order are encoded
into combined index into the tree.

Maybe _encode_indices()?

> +{
> +	/* Order bits part */
> +	unsigned long h = 1UL << (ORDER_0_LG2 - order);
> +	/* Page offset part */
> +	unsigned long l = pa >> (PAGE_SHIFT + order);
>  
> -	elm = kzalloc(sz, GFP_KERNEL);
> -	if (!elm)
> -		return ERR_PTR(-ENOMEM);
> +	return h | l;
> +}
>  
> -	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
> -	if (xa_is_err(res))
> -		res = ERR_PTR(xa_err(res));
> +static phys_addr_t kho_radix_decode(unsigned long encoded, unsigned int *order)

The same here. Maybe

	_decode_indices(unsigned long indices, unsigned int *order)

> +{
> +	unsigned int order_bit = fls64(encoded);
> +	phys_addr_t pa;
>  
> -	if (res) {
> -		kfree(elm);
> -		return res;
> -	}
> +	/* order_bit is numbered starting at 1 from fls64 */
> +	*order = ORDER_0_LG2 - order_bit + 1;
> +	/* The order is discarded by the shift */
> +	pa = encoded << (PAGE_SHIFT + *order);
>  
> -	return elm;
> +	return pa;
>  }
>  
> -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
> -			     unsigned long end_pfn)
> +static unsigned long kho_radix_get_index(unsigned long encoded,

I don't like 'encoded' here and below as well.

> +					 unsigned int level)
>  {
> -	struct kho_mem_phys_bits *bits;
> -	struct kho_mem_phys *physxa;
> -
> -	while (pfn < end_pfn) {
> -		const unsigned int order =
> -			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
> -		const unsigned long pfn_high = pfn >> order;
> +	int s;
>  
> -		physxa = xa_load(&track->orders, order);
> -		if (!physxa)
> -			continue;
> +	if (level == 0)
> +		return encoded % (1 << BITMAP_SIZE_LG2);
>  
> -		bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
> -		if (!bits)
> -			continue;
> +	s = ((level - 1) * TABLE_SIZE_LG2) + BITMAP_SIZE_LG2;
> +	return (encoded >> s) % (1 << TABLE_SIZE_LG2);
> +}
>  
> -		clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
> +static int kho_radix_set_bitmap(struct kho_bitmap_table *bit_tlb,

                                                           ^ bitmap

> +				unsigned long offset)
> +{
> +	if (!bit_tlb ||
> +	    offset >= PAGE_SIZE * BITS_PER_BYTE)

No need for two lines here

> +		return -EINVAL;
>  
> -		pfn += 1 << order;
> -	}
> +	__set_bit(offset, bit_tlb->bitmaps);
> +	return 0;
>  }
>  
> -static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
> -				unsigned int order)
> +static int kho_radix_preserve_page(phys_addr_t pa, unsigned int order)
>  {
> -	struct kho_mem_phys_bits *bits;
> -	struct kho_mem_phys *physxa, *new_physxa;
> -	const unsigned long pfn_high = pfn >> order;
> +	unsigned long encoded = kho_radix_encode(pa, order);
> +	struct kho_radix_tree *current_tree, *new_tree;

current_ is excessive IMHO and 'node' seems to be a better name for a tree
iterator:

	struct kho_radix_tree *node;

is fine, the new_tree can be declared in the scope that uses it.

> +	struct kho_bitmap_table *bitmap_table;

_table is excessive on both the type name and variable name

> +	unsigned int tree_level = TREE_MAX_DEPTH - 1;

Just 'level' is enough.

> +	unsigned int err = 0;

err is signed.

> +	unsigned int i, idx;
>  
> -	might_sleep();
> +	down_write(&kho_radix_tree_root_sem);
>  
> -	physxa = xa_load(&track->orders, order);
> -	if (!physxa) {
> -		int err;
> +	if (!kho_radix_tree_root) {

Isn't this an assert? 
Also I don't see concurrency for accessing the root, the assertion can be
outside the lock.

> +		err = -EINVAL;
> +		goto out;
> +	}
>  
> -		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
> -		if (!new_physxa)
> -			return -ENOMEM;
> +	current_tree = kho_radix_tree_root;
>  
> -		xa_init(&new_physxa->phys_bits);
> -		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
> -				    GFP_KERNEL);
> +	/* Go from high levels to low levels */
> +	for (i = tree_level; i >= 0; i--) {
> +		idx = kho_radix_get_index(encoded, i);
>  
> -		err = xa_err(physxa);
> -		if (err || physxa) {
> -			xa_destroy(&new_physxa->phys_bits);
> -			kfree(new_physxa);
> +		if (i == 0) {
> +			bitmap_table = (struct kho_bitmap_table *)current_tree;
> +			err = kho_radix_set_bitmap(bitmap_table, idx);

I'd consider making the loop until 'i > 0' and putting this outside the
loop.

> +			goto out;
> +		}
> +

		if (node->table[idx]) {
			node = kho_radix_tree(node->table[idx]);
			continue;
		}

and reduce the indentation for allocating a new table.

> +		if (!current_tree->table[idx]) {
> +			new_tree = kho_alloc_radix_tree();
> +			if (!new_tree) {
> +				err = -ENOMEM;
> +				goto out;
> +			}
>  
> -			if (err)
> -				return err;
> +			current_tree->table[idx] =
> +				kho_radix_tree_desc(new_tree);
> +			current_tree = new_tree;
>  		} else {
> -			physxa = new_physxa;
> +			current_tree = kho_radix_tree(current_tree->table[idx]);
>  		}
>  	}
>  
> -	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
> -				sizeof(*bits));
> -	if (IS_ERR(bits))
> -		return PTR_ERR(bits);
> +out:
> +	up_write(&kho_radix_tree_root_sem);
> +	return err;
> +}
> +
> +static int kho_radix_walk_bitmaps(struct kho_bitmap_table *bit_tlb,
> +				  unsigned long encoded,
> +				  kho_radix_tree_walk_callback_t cb)
> +{
> +	unsigned long *bitmap = (unsigned long *)bit_tlb;
> +	unsigned int i;
> +	int err = 0;
> +
> +	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
> +		err = cb(encoded | i);
> +		if (err)
> +			return err;
> +	}
>  
> -	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
> +	return 0;
> +}
> +
> +static int kho_radix_walk_trees(struct kho_radix_tree *root, unsigned int level,

static int kho_radix_walk_tree(struct kho_radix_tree *parent, ...

> +				unsigned long start,
> +				kho_radix_tree_walk_callback_t cb)
> +{
> +	struct kho_radix_tree *next_tree;

	struct kho_radix_tree *node;

> +	struct kho_bitmap_table *bitmap_table;

                                ^ bitmap;

> +	unsigned long encoded, i;
> +	unsigned int level_shift;

This can move to the scope where it's used and 'shift' would be enough.

> +	int err = 0;

...

> +static int __kho_preserve_order(unsigned long pfn, unsigned int order)
> +{
> +	unsigned long pa = PFN_PHYS(pfn);
> +
> +	might_sleep();
> +
> +	return kho_radix_preserve_page(pa, order);

I don't think that __kho_preserve_order() wrapper is useful, just rename
kho_radix_preserve_page to __kho_preserve_order()

> +}
> +
>  /* almost as free_reserved_page(), just don't free the page */
>  static void kho_restore_page(struct page *page, unsigned int order)
>  {
> @@ -224,152 +385,49 @@ struct folio *kho_restore_folio(phys_addr_t phys)
>  }
>  EXPORT_SYMBOL_GPL(kho_restore_folio);

...

> -static int kho_mem_serialize(struct kho_serialization *ser)
> +static int __init kho_radix_walk_trees_memblock_callback(unsigned long encoded)

I wonder if we expect other callbacks for tree traversal?
For now there's only one case when we traverse the tree, so it seems
simpler to just merge the code here into kho_radix_walk_bitmaps() and
rename kho_radix_walk_trees() and kho_radix_walk_bitmaps() to reflect that
they are reserving the preserved memory.

>  {
> +	unsigned int order;
> +	unsigned long pa;
> +	struct page *page;
> +	int sz;
> 
> +	pa = kho_radix_decode(encoded, &order);
> +	sz = 1 << (order + PAGE_SHIFT);
> +	page = phys_to_page(pa);
>  
> +	/* Reserve the memory preserved in KHO radix tree in memblock */
> +	memblock_reserve(pa, sz);
> +	memblock_reserved_mark_noinit(pa, sz);
> +	page->private = order;
>  }
>  
>  static void __init kho_mem_deserialize(const void *fdt)

Well, it's not deserialize anymore.

>  {
> @@ -599,6 +657,35 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
>  	return 0;
>  }

It seems that a lot of the changes below are not related to changing the
mechanism for tracking the preserved memory.

I'm stopping here.

-- 
Sincerely yours,
Mike.
Re: [PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by Jason Miu 3 months, 1 week ago
Hi Mike,

Thank you very much for providing your comments.

On Mon, Oct 27, 2025 at 4:53 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Mon, Oct 20, 2025 at 03:03:04AM -0700, Jason Miu wrote:
> > Introduce a radix tree data structure for tracking preserved
> > memory pages in KHO, which will replace the current xarray-based
> > implementation.
> >
> > The primary motivation for this change is to eliminate the need for
> > serialization. By marking preserved pages directly in the new KHO
> > radix tree and passing them to the next kernel, the entire
> > serialization process can be removed. This ultimately allows for the
> > removal of the KHO finalize and abort states, simplifying the overall
> > design.
> >
> > The preserved page physical address and its order are encoded into a
> > value. The KHO radix tree has multiple levels of nodes where each node
> > is a table containing a descriptor to the next level of nodes. The
> > encoded value gets split and its parts are stored along the tree
> > traversal. The tree traversal ends with the `kho_bitmap_table`, where
> > each bit represents a single preserved page.
> >
> > Instead of serializing the memory map, the first kernel stores the KHO
> > radix tree root in the FDT. This KHO radix tree root is passed to the
> > second kernel after kexec, hence eliminating the KHO finalize and abort
> > states.
> >
> > The second kernel walks the passed-in KHO radix tree from its root. It
> > restores the memory pages and their orders by decoding the value
> > stored in the KHO radix tree.
> >
> > This architectural shift to using a shared radix tree structure
> > simplifies the KHO design and eliminates the overhead of serializing
> > and deserializing the preserved memory map.
> >
> > Signed-off-by: Jason Miu <jasonmiu@google.com>
> > ---
> >  MAINTAINERS                                   |   2 +
> >  include/linux/kexec_handover.h                |  23 +-
> >  .../linux/live_update/abi/kexec_handover.h    |  10 +
> >  kernel/kexec_handover.c                       | 861 +++++++++---------
> >  4 files changed, 458 insertions(+), 438 deletions(-)
> >  create mode 100644 include/linux/live_update/abi/kexec_handover.h
> >
> > diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
> > index 348844cffb13..bc2f9e060a79 100644
> > --- a/include/linux/kexec_handover.h
> > +++ b/include/linux/kexec_handover.h
> > @@ -45,6 +28,7 @@ int kho_preserve_folio(struct folio *folio);
> >  int kho_preserve_phys(phys_addr_t phys, size_t size);
> >  struct folio *kho_restore_folio(phys_addr_t phys);
> >  int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
> > +int kho_remove_subtree(const char *name);
>
> This change seems unrelated.

I want to add the "kho_remove_subtree()" as a counterpart of the
"kho_add_subtree()". You are right that "kho_remove_subtree()" is not
very related to this radix tree data struct change. Do you prefer I
split this kho_remove_subtree() change as another patch in this
series, or do you prefer another independent patch series? (I like the
latter option more personally)

>
> >  int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
> >
> >  int register_kho_notifier(struct notifier_block *nb);
> > @@ -86,6 +70,11 @@ static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
> >       return -EOPNOTSUPP;
> >  }
> >
> > +static inline int kho_remove_subtree(const char *name)
> > +{
> > +     return -EOPNOTSUPP;
> > +}
> > +
>
> Ditto.
>
> >  static inline int register_kho_notifier(struct notifier_block *nb)
> >  {
> >       return -EOPNOTSUPP;
> > diff --git a/include/linux/live_update/abi/kexec_handover.h b/include/linux/live_update/abi/kexec_handover.h
> > new file mode 100644
> > index 000000000000..3154e5c33851
> > --- /dev/null
> > +++ b/include/linux/live_update/abi/kexec_handover.h
> > @@ -0,0 +1,10 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +#ifndef _ABI__KEXEC_HANDOVER_H
> > +#define _ABI__KEXEC_HANDOVER_H
> > +
> > +#define KHO_FDT_COMPATIBLE "kho-v2"
> > +#define PROP_PRESERVED_PAGE_RADIX_TREE "preserved-page-radix-tree"
>
> The property describes the preserved memory, it's not important if it's a
> radix tree, a linked list or whatever else, so I don't think
> page-radix-tree is a good name.
>
> IMHO we can keep the preserved-memory-map name because version bump ensures
> we won't attempt to parse v1 data structure.
>

Sure, I can just keep the preserved-memory-map name.

> > +#define PROP_SUB_FDT "fdt"
> > +
> > +#endif
> > diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
> > index ecd1ac210dbd..2fc5975690a7 100644
> > --- a/kernel/kexec_handover.c
> > +++ b/kernel/kexec_handover.c
> > @@ -46,143 +45,305 @@ static int __init kho_parse_enable(char *p)
> >  }
> >  early_param("kho", kho_parse_enable);
> >
> > +typedef int (*kho_radix_tree_walk_callback_t)(unsigned long encoded);
> > +
> >  /*
> > - * Keep track of memory that is to be preserved across KHO.
>
> Please leave this sentence.
>
> > + * The KHO radix tree tracks preserved memory pages. It is a hierarchical
> > + * structure that starts with a single root `kho_radix_tree`. This single
> > + * tree stores pages of all orders.
>
> I'd combine this comment and the comment above declaration of enum
> kho_radix_consts into something like this:
>

Yes, I will move the kho_radix_consts up so its comment will be
combined and stay close to the comment of kho_radix_tree.

> /*
>  * KHO tracks preserved memory using a radix tree data structure. Each node of
>  * the tree is PAGE_SIZE and it is PAGE_SIZE aligned.
>  *
>  * The leaf nodes of the tree are bitmaps where each set bit represents a
>  * single preserved page. The intermediate and root nodes of the tree are
>  * tables of physical addresses that point to a lower level node:
>  *
>  * kho_radix_tree_root
>  * +-------------------+
>  * |     Level 5       | (struct kho_radix_tree)
>  * +-------------------+
>  *   |
>  *   v
>  * +-------------------+
>  * |     Level 4       | (struct kho_radix_tree)
>  * +-------------------+
>  *   |
>  *   | ... (intermediate levels)
>  *   |
>  *   v
>  * +-------------------+
>  * |      Level 0      | (struct kho_bitmap_table)
>  * +-------------------+
>  *
>  * The physical address of the preserved page and its order are encoded into a
>  * single unsigned long value where higher bits represent the order and the
>  * lower bits represent the page offset:
>  *
>  *   <-- Higher Bits ------------------------------------ Lower Bits -->
>  *  +--------------------------+-----------------------------------------+
>  *  |        Order Bits        |               Page Offset               |
>  *  +--------------------------+-----------------------------------------+
>  *  |0| ... 0 0 0 1            | pa >> (PAGE_SHIFT + 0)                  |
>  *  |1| ... 0 0 0 0 1            | pa >> (PAGE_SHIFT + 1)                |
>  *  |2| ... 0 0 0 0 0 1            | pa >> (PAGE_SHIFT + 2)              |
>  *  +------------------------------+-------------------------------------+
>  *                  ^
>  *                  |
>  *  This single '1' bit's position uniquely identifies the 'order'.
>  *
>  * Page Offset:
>  * The 'page offset' is the physical address normalized for its order. It
>  * effectively represents the page offset for the given order.
>  *
>  * Order Bits:
>  * The 'order bits' encode the page order by setting a single bit at a
>  * specific position. The position of this bit itself represents the order.
>  *
>  * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
>  * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
>  * offset occupies bits [0-51]. For order 0, the order bit is set at
>  * position 52.
>  *
>  * As the order increases, the number of bits required for the 'page offset'
>  * decreases. For example, order 1 requires one less bit for its page
>  * offset. This allows its order bit to be set at position 51,
>  * i.e. shifting right, without conflicting with the page offset bits.
>  *
>  * Bitfields in this encoded value are used as indices into the tables for
>  * upper levels and as bitmap index for the lowest level.
>  *
>  * The following diagram illustrates how the encoded value is split into
>  * indices for the tree levels, with PAGE_SIZE of 4KB:
>  *
>  *      63:60   59:51    50:42    41:33    32:24    23:15         14:0
>  * +---------+--------+--------+--------+--------+--------+-----------------+
>  * |    0    |  Lv 5  |  Lv 4  |  Lv 3  |  Lv 2  |  Lv 1  |  Lv 0 (bitmap)  |
>  * +---------+--------+--------+--------+--------+--------+-----------------+
>  *
>  * This design stores pages of all sizes (orders) in a single 6-level table.
>  * It efficiently shares lower table levels, especially due to common zero top
>  * address bits, allowing a single, efficient algorithm to manage all pages.
>  *
>  * < a paragraph about memory consumption >
>  */
>
> > + *
> > + * This is achieved by encoding the page's physical address and its order into
> > + * a single `unsigned long` value. This encoded value is then used to traverse
> > + * the tree.
>
> ...
>
> > + * Each `kho_radix_tree` (Levels 1-5) and `kho_bitmap_table` (Level 0) is
> > + * PAGE_SIZE. Each entry in a `kho_radix_tree` is a descriptor (a physical
> > + * address) pointing to the next level node. For Level 1 `kho_radix_tree`
> > + * nodes, these descriptors point to a `kho_bitmap_table`. The final
> > + * `kho_bitmap_table` is a bitmap where each set bit represents a single
> > + * preserved page.
> >   */
>
> Please add an empty line between description and the declaration.
>
> > +struct kho_radix_tree {
>
> I think the name should reflect that this data structure is for tracking
> memory in KHO. Otherwise it reads like a generic data structure that can be
> used with KHO, like kho_array Pratyush proposed a while ago.
>
> Maybe struct kho_mem_radix_tree? Or even drop the 'radix', I don't think
> it's really important here.
>

I use the 'radix' here as we need a step to convert the page address
to a radix tree key. I agree that `kho_mem_radix_tree` sounds better.

> > +     phys_addr_t table[PAGE_SIZE / sizeof(phys_addr_t)];
> > +};
> >
> > -#define PRESERVE_BITS (512 * 8)
> > -
> > -struct kho_mem_phys_bits {
> > -     DECLARE_BITMAP(preserve, PRESERVE_BITS);
> > +struct kho_bitmap_table {
> > +     unsigned long bitmaps[PAGE_SIZE / sizeof(unsigned long)];
>
> What's wrong with kho_mem_phys_bits and DECLARE_BITMAP?
> Simply adjust PRESERVE_BITS to match PAGE_SIZE.
>

I used `kho_bitmap_table` as I wanted to highlight it as a patch of
bitmap that has to be in the size of PAGE_SIZE. I found the original
`kho_mem_phys_bits` harder to understand as a bit in this bitmap is
referring to one memory page. I can update it as

#define PRESERVE_PAGE_BITS (PAGE_SIZE * 8)
struct kho_bitmap {
    DECLARE_BITMAP(preserve_page, PRESERVE_PAGE_BITS);
};

what do you think?

> >  };
> >
> > -struct kho_mem_phys {
> > +/*
> > + * The KHO radix tree tracks preserved pages by encoding a page's physical
>
> ...
>
> > + * address bits, allowing a single, efficient algorithm to manage all pages.
> > + */
> > +enum kho_radix_consts {
>
> I think this should be declared before the structs, so that there will be
> one long description before the enum, structs and code.
>

yes, will move this enum up.

> > +     /* The bit position of a 0-order page, only supports 64bits system */
> > +     ORDER_0_LG2 = 64 - PAGE_SHIFT,
> > +     /* Bit number used for each level of tables */
> > +     TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
> > +     /* Bit number used for lowest level of bitmaps */
> > +     BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
> >       /*
> > -      * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
> > -      * to order.
> > +      * The total tree depth is the number of intermediate levels
> > +      * and 1 bitmap level.
> >        */
> > -     struct xarray phys_bits;
> > +     TREE_MAX_DEPTH = DIV_ROUND_UP(ORDER_0_LG2 - BITMAP_SIZE_LG2,
> > +                                   TABLE_SIZE_LG2) + 1,
> >  };
> >
> > -struct kho_mem_track {
> > -     /* Points to kho_mem_phys, each order gets its own bitmap tree */
> > -     struct xarray orders;
> > -};
> > +/*
> > + * `kho_radix_tree_root` points to a page thats serves as the root of the
> > + * KHO radix tree. This page is allocated during KHO module initialization.
> > + * Its physical address is written to the FDT and passed to the next kernel
> > + * during kexec.
> > + */
> > +static struct kho_radix_tree *kho_radix_tree_root;
> > +static DECLARE_RWSEM(kho_radix_tree_root_sem);
>
> I believe it's clearer to have struct kho_mem_track containing the pointer
> to root and the semaphore, e.g.
>
> struct kho_mem_track {
>         struct kho_mem_tree *root;
>         struct rw_semaphore sem;
> };
>

Sure.

> > -struct khoser_mem_chunk;
> > +static inline phys_addr_t kho_radix_tree_desc(struct kho_radix_tree *va)
>
> I think kho_mem_tree is a better prefix than kho_radix.

As above, I tend to use `kho_mem_radix_tree`.

>
> > +{
> > +     return (phys_addr_t)virt_to_phys(va);
> > +}
> >
> > -struct kho_serialization {
> > -     struct page *fdt;
> > -     struct list_head fdt_list;
> > -     struct dentry *sub_fdt_dir;
> > -     struct kho_mem_track track;
> > -     /* First chunk of serialized preserved memory map */
> > -     struct khoser_mem_chunk *preserved_mem_map;
> > -};
> > +static inline struct kho_radix_tree *kho_radix_tree(phys_addr_t desc)
>
> This is not really a 'desc', this is a plain physical address, i.e. 'pa'.
> It's probably will be clearer to just use phys_to_virt() and virt_to_phys()
> directly rather than introduce these wrappers.

We had some previous discussions about using helper functions for
converting between the va and pa. But I agree with what you said that
"this is not really a 'desc'". How about "kho_radix_tree(phys_addr_t
node_pa)", to align with the variable name of the tree iterator below?

>
> > +{
> > +     return (struct kho_radix_tree *)(phys_to_virt(desc));
> > +}
> >
> > -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
> > +static struct kho_radix_tree *kho_alloc_radix_tree(void)
> >  {
> > -     void *elm, *res;
> > +     return (struct kho_radix_tree *)get_zeroed_page(GFP_KERNEL);
> > +}
> >
> > -     elm = xa_load(xa, index);
> > -     if (elm)
> > -             return elm;
> > +static unsigned long kho_radix_encode(phys_addr_t pa, unsigned int order)
>
> This name is really cryptic. Encode to what? Can't say I have great ideas,
> but this name should reflect that physical address and order are encoded
> into combined index into the tree.
>
> Maybe _encode_indices()?

The "indices" can also refer to an entry of the table in a KHO mem
radix tree node. We are actually encoding a radix tree key, so maybe
kho_radix_encode_key()?

>
> > +{
> > +     /* Order bits part */
> > +     unsigned long h = 1UL << (ORDER_0_LG2 - order);
> > +     /* Page offset part */
> > +     unsigned long l = pa >> (PAGE_SHIFT + order);
> >
> > -     elm = kzalloc(sz, GFP_KERNEL);
> > -     if (!elm)
> > -             return ERR_PTR(-ENOMEM);
> > +     return h | l;
> > +}
> >
> > -     res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
> > -     if (xa_is_err(res))
> > -             res = ERR_PTR(xa_err(res));
> > +static phys_addr_t kho_radix_decode(unsigned long encoded, unsigned int *order)
>
> The same here. Maybe
>
>         _decode_indices(unsigned long indices, unsigned int *order)
>
> > +{
> > +     unsigned int order_bit = fls64(encoded);
> > +     phys_addr_t pa;
> >
> > -     if (res) {
> > -             kfree(elm);
> > -             return res;
> > -     }
> > +     /* order_bit is numbered starting at 1 from fls64 */
> > +     *order = ORDER_0_LG2 - order_bit + 1;
> > +     /* The order is discarded by the shift */
> > +     pa = encoded << (PAGE_SHIFT + *order);
> >
> > -     return elm;
> > +     return pa;
> >  }
> >
> > -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
> > -                          unsigned long end_pfn)
> > +static unsigned long kho_radix_get_index(unsigned long encoded,
>
> I don't like 'encoded' here and below as well.
>

If we agree with the name above, I think we can use
"kho_radix_get_index(unsigned long radix_key)" here.

> > +                                      unsigned int level)
> >  {
> > -     struct kho_mem_phys_bits *bits;
> > -     struct kho_mem_phys *physxa;
> > -
> > -     while (pfn < end_pfn) {
> > -             const unsigned int order =
> > -                     min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
> > -             const unsigned long pfn_high = pfn >> order;
> > +     int s;
> >
> > -             physxa = xa_load(&track->orders, order);
> > -             if (!physxa)
> > -                     continue;
> > +     if (level == 0)
> > +             return encoded % (1 << BITMAP_SIZE_LG2);
> >
> > -             bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
> > -             if (!bits)
> > -                     continue;
> > +     s = ((level - 1) * TABLE_SIZE_LG2) + BITMAP_SIZE_LG2;
> > +     return (encoded >> s) % (1 << TABLE_SIZE_LG2);
> > +}
> >
> > -             clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
> > +static int kho_radix_set_bitmap(struct kho_bitmap_table *bit_tlb,
>
>                                                            ^ bitmap

Yes, let's just call it `struct kho_bitmap`.

>
> > +                             unsigned long offset)
> > +{
> > +     if (!bit_tlb ||
> > +         offset >= PAGE_SIZE * BITS_PER_BYTE)
>
> No need for two lines here
>
> > +             return -EINVAL;
> >
> > -             pfn += 1 << order;
> > -     }
> > +     __set_bit(offset, bit_tlb->bitmaps);
> > +     return 0;
> >  }
> >
> > -static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
> > -                             unsigned int order)
> > +static int kho_radix_preserve_page(phys_addr_t pa, unsigned int order)
> >  {
> > -     struct kho_mem_phys_bits *bits;
> > -     struct kho_mem_phys *physxa, *new_physxa;
> > -     const unsigned long pfn_high = pfn >> order;
> > +     unsigned long encoded = kho_radix_encode(pa, order);
> > +     struct kho_radix_tree *current_tree, *new_tree;
>
> current_ is excessive IMHO and 'node' seems to be a better name for a tree
> iterator:
>
>         struct kho_radix_tree *node;
>
> is fine, the new_tree can be declared in the scope that uses it.
>
> > +     struct kho_bitmap_table *bitmap_table;
>
> _table is excessive on both the type name and variable name
>
> > +     unsigned int tree_level = TREE_MAX_DEPTH - 1;
>
> Just 'level' is enough.
>
> > +     unsigned int err = 0;
>
> err is signed.
>
> > +     unsigned int i, idx;
> >
> > -     might_sleep();
> > +     down_write(&kho_radix_tree_root_sem);
> >
> > -     physxa = xa_load(&track->orders, order);
> > -     if (!physxa) {
> > -             int err;
> > +     if (!kho_radix_tree_root) {
>
> Isn't this an assert?
> Also I don't see concurrency for accessing the root, the assertion can be
> outside the lock.
>

Yes, I will add a WARN_ON here.

> > +             err = -EINVAL;
> > +             goto out;
> > +     }
> >
> > -             new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
> > -             if (!new_physxa)
> > -                     return -ENOMEM;
> > +     current_tree = kho_radix_tree_root;
> >
> > -             xa_init(&new_physxa->phys_bits);
> > -             physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
> > -                                 GFP_KERNEL);
> > +     /* Go from high levels to low levels */
> > +     for (i = tree_level; i >= 0; i--) {
> > +             idx = kho_radix_get_index(encoded, i);
> >
> > -             err = xa_err(physxa);
> > -             if (err || physxa) {
> > -                     xa_destroy(&new_physxa->phys_bits);
> > -                     kfree(new_physxa);
> > +             if (i == 0) {
> > +                     bitmap_table = (struct kho_bitmap_table *)current_tree;
> > +                     err = kho_radix_set_bitmap(bitmap_table, idx);
>
> I'd consider making the loop until 'i > 0' and putting this outside the
> loop.
>
> > +                     goto out;
> > +             }
> > +
>
>                 if (node->table[idx]) {
>                         node = kho_radix_tree(node->table[idx]);
>                         continue;
>                 }
>
> and reduce the indentation for allocating a new table.
>
> > +             if (!current_tree->table[idx]) {
> > +                     new_tree = kho_alloc_radix_tree();
> > +                     if (!new_tree) {
> > +                             err = -ENOMEM;
> > +                             goto out;
> > +                     }
> >
> > -                     if (err)
> > -                             return err;
> > +                     current_tree->table[idx] =
> > +                             kho_radix_tree_desc(new_tree);
> > +                     current_tree = new_tree;
> >               } else {
> > -                     physxa = new_physxa;
> > +                     current_tree = kho_radix_tree(current_tree->table[idx]);
> >               }
> >       }
> >
> > -     bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
> > -                             sizeof(*bits));
> > -     if (IS_ERR(bits))
> > -             return PTR_ERR(bits);
> > +out:
> > +     up_write(&kho_radix_tree_root_sem);
> > +     return err;
> > +}
> > +
> > +static int kho_radix_walk_bitmaps(struct kho_bitmap_table *bit_tlb,
> > +                               unsigned long encoded,
> > +                               kho_radix_tree_walk_callback_t cb)
> > +{
> > +     unsigned long *bitmap = (unsigned long *)bit_tlb;
> > +     unsigned int i;
> > +     int err = 0;
> > +
> > +     for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
> > +             err = cb(encoded | i);
> > +             if (err)
> > +                     return err;
> > +     }
> >
> > -     set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
> > +     return 0;
> > +}
> > +
> > +static int kho_radix_walk_trees(struct kho_radix_tree *root, unsigned int level,
>
> static int kho_radix_walk_tree(struct kho_radix_tree *parent, ...
>
> > +                             unsigned long start,
> > +                             kho_radix_tree_walk_callback_t cb)
> > +{
> > +     struct kho_radix_tree *next_tree;
>
>         struct kho_radix_tree *node;
>
> > +     struct kho_bitmap_table *bitmap_table;
>
>                                 ^ bitmap;
>
> > +     unsigned long encoded, i;
> > +     unsigned int level_shift;
>
> This can move to the scope where it's used and 'shift' would be enough.
>
> > +     int err = 0;
>
> ...
>
> > +static int __kho_preserve_order(unsigned long pfn, unsigned int order)
> > +{
> > +     unsigned long pa = PFN_PHYS(pfn);
> > +
> > +     might_sleep();
> > +
> > +     return kho_radix_preserve_page(pa, order);
>
> I don't think that __kho_preserve_order() wrapper is useful, just rename
> kho_radix_preserve_page to __kho_preserve_order()

Will merge both.

>
> > +}
> > +
> >  /* almost as free_reserved_page(), just don't free the page */
> >  static void kho_restore_page(struct page *page, unsigned int order)
> >  {
> > @@ -224,152 +385,49 @@ struct folio *kho_restore_folio(phys_addr_t phys)
> >  }
> >  EXPORT_SYMBOL_GPL(kho_restore_folio);
>
> ...
>
> > -static int kho_mem_serialize(struct kho_serialization *ser)
> > +static int __init kho_radix_walk_trees_memblock_callback(unsigned long encoded)
>
> I wonder if we expect other callbacks for tree traversal?
> For now there's only one case when we traverse the tree, so it seems
> simpler to just merge the code here into kho_radix_walk_bitmaps() and
> rename kho_radix_walk_trees() and kho_radix_walk_bitmaps() to reflect that
> they are reserving the preserved memory.
>

Currently only `kho_radix_walk_trees_memblock_callback()` is used in
tree walking, but having a callback in the tree walking helps with
debugging the KHO mem tree. I can imagine that we could have different
build options for different callbacks. If there is no big objection, I
tend to keep the callback.

> >  {
> > +     unsigned int order;
> > +     unsigned long pa;
> > +     struct page *page;
> > +     int sz;
> >
> > +     pa = kho_radix_decode(encoded, &order);
> > +     sz = 1 << (order + PAGE_SHIFT);
> > +     page = phys_to_page(pa);
> >
> > +     /* Reserve the memory preserved in KHO radix tree in memblock */
> > +     memblock_reserve(pa, sz);
> > +     memblock_reserved_mark_noinit(pa, sz);
> > +     page->private = order;
> >  }
> >
> >  static void __init kho_mem_deserialize(const void *fdt)
>
> Well, it's not deserialize anymore.

Yeah, I did think about this name too... How about  kho_mem_retrieve() ?

>
> >  {
> > @@ -599,6 +657,35 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
> >       return 0;
> >  }
>
> It seems that a lot of the changes below are not related to changing the
> mechanism for tracking the preserved memory.
>
> I'm stopping here.
>
> --
> Sincerely yours,
> Mike.

Thanks again, Mike.
Re: [PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Mon, Oct 20, 2025 at 03:03:04AM -0700, Jason Miu wrote:

> +static struct kho_radix_tree *kho_alloc_radix_tree(void)
>  {
> +	return (struct kho_radix_tree *)get_zeroed_page(GFP_KERNEL);
> +}

I was reading the thread over here:

https://lore.kernel.org/all/20151222210435.GB20997@ZenIV.linux.org.uk/

And I guess this stuff should just use
  kzalloc(sizeof(struct kho_radix_tree), GFP_KERNEL);

Jason
Re: [PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Mon, Oct 20, 2025 at 03:03:04AM -0700, Jason Miu wrote:
> - * Keep track of memory that is to be preserved across KHO.
> + * The KHO radix tree tracks preserved memory pages. It is a hierarchical
> + * structure that starts with a single root `kho_radix_tree`. This single
> + * tree stores pages of all orders.
> + *
> + * This is achieved by encoding the page's physical address and its order into
> + * a single `unsigned long` value. This encoded value is then used to traverse
> + * the tree.
> + *
> + * The tree hierarchy is shown below:
> + *
> + * kho_radix_tree_root
> + * +-------------------+
> + * |     Level 5       | (struct kho_radix_tree)
> + * +-------------------+
> + *   |
> + *   v
> + * +-------------------+
> + * |     Level 4       | (struct kho_radix_tree)
> + * +-------------------+
> + *   |
> + *   | ... (intermediate levels)
> + *   |
> + *   v
> + * +-------------------+
> + * |      Level 0      | (struct kho_bitmap_table)
> + * +-------------------+
>   *
> - * The serializing side uses two levels of xarrays to manage chunks of per-order
> - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
> - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
> - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
> - * 512K of bitmap memory will be needed for order 0.

I think it is valuable to preserve this justification for
bitmaps. There was a lot of debate over bitmaps vs ranges and this is
the advantage of bitmaps. It is a bit subtle..

> +/*
> + * The KHO radix tree tracks preserved pages by encoding a page's physical
> + * address (pa) and its order into a single unsigned long value. This value
> + * is then used to traverse the tree. The encoded value is composed of two
> + * parts: the 'order bits' in the upper part and the 'page offset' in the
> + * lower part.
> + *
> + *   <-- Higher Bits ------------------------------------ Lower Bits -->
> + *  +--------------------------+-----------------------------------------+
> + *  |        Order Bits        |               Page Offset               |
> + *  +--------------------------+-----------------------------------------+
> + *  | ... 0 0 1 0 0 ...        | pa >> (PAGE_SHIFT + order)              |
> + *  +--------------------------+-----------------------------------------+
> + *            ^
> + *            |
> + *  This single '1' bit's position
> + *  uniquely identifies the 'order'.
> + *
> + *
> + * Page Offset:
> + * The 'page offset' is the physical address normalized for its order. It
> + * effectively represents the page offset for the given order.
> + *
> + * Order Bits:
> + * The 'order bits' encode the page order by setting a single bit at a
> + * specific position. The position of this bit itself represents the order.
> + *
> + * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
> + * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
> + * offset occupies bits [0-51]. For order 0, the order bit is set at
> + * position 52.
> + *
> + * As the order increases, the number of bits required for the 'page offset'
> + * decreases. For example, order 1 requires one less bit for its page
> + * offset. This allows its order bit to be set at position 51,
> + * i.e. shifting right, without conflicting with the page offset bits.

This description is correct, but the diagram is misleading. Maybe like this:

 *  |0| ... 0 0 0 1          | pa >> (PAGE_SHIFT + 0)              |
 *  |1| ... 0 0 0 0 1        | pa >> (PAGE_SHIFT + 1)              |
 *  |2| ... 0 0 0 0 0 1      | pa >> (PAGE_SHIFT + 2)              |
 [..]


> + *
> + * This design stores pages of all sizes (orders) in a single 6-level table.  It
> + * efficiently shares lower table levels, especially due to common zero top
> + * address bits, allowing a single, efficient algorithm to manage all pages.
> + */
> +enum kho_radix_consts {
> +	/* The bit position of a 0-order page, only supports 64bits system */
> +	ORDER_0_LG2 = 64 - PAGE_SHIFT,
> +	/* Bit number used for each level of tables */
> +	TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
> +	/* Bit number used for lowest level of bitmaps */
> +	BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),

"bit number" is a bit confusing when using a lg2 terms..

        /* Size of the table in kho_radix_tree, in lg2 */
	TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t))

	/* Number of bits in the kho_bitmap_table, in lg2 */
	BITMAP_SIZE_LG2 = const_ilog2(BITS_PER_BYTE * PAGE_SIZE)

Then use the constants in the structs:

struct kho_radix_tree {
       phys_addr_t table[1 << TABLE_SIZE_LG2];
};
struct kho_bitmap_table {
       unsigned long bitmaps[(1 << BITMAP_SIZE_LG2) / BITS_PER_LONG];
};

> -struct khoser_mem_chunk;
> +static inline phys_addr_t kho_radix_tree_desc(struct kho_radix_tree *va)
> +{
> +	return (phys_addr_t)virt_to_phys(va);
> +}

virt_to_phys already returns phys_addr_t ?

> +static inline struct kho_radix_tree *kho_radix_tree(phys_addr_t desc)
> +{
> +	return (struct kho_radix_tree *)(phys_to_virt(desc));
> +}

phys_to_virt returns void *, no need for a cast

> +static int kho_radix_set_bitmap(struct kho_bitmap_table *bit_tlb,
> +				unsigned long offset)
> +{
> +	if (!bit_tlb ||
> +	    offset >= PAGE_SIZE * BITS_PER_BYTE)
> +		return -EINVAL;

WARN_ON ? These are assertions aren't they?

> +static int kho_radix_walk_trees(struct kho_radix_tree *root, unsigned int level,
> +				unsigned long start,
> +				kho_radix_tree_walk_callback_t cb)
> +{
> +	struct kho_radix_tree *next_tree;
> +	struct kho_bitmap_table *bitmap_table;
> +	unsigned long encoded, i;
> +	unsigned int level_shift;
> +	int err = 0;
> +
> +	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
> +		if (root->table[i]) {


if (!root->table[i])
   continue

Remove a level of indentation here

> +static int __kho_preserve_order(unsigned long pfn, unsigned int order)
> +{
> +	unsigned long pa = PFN_PHYS(pfn);
> +
> +	might_sleep();
> +
> +	return kho_radix_preserve_page(pa, order);

might_sleep should be in kho_radix_preserve_page() ? The might should
be placed around if statements that might avoid a sleep, and that is
not in this function.

>  static void __init kho_mem_deserialize(const void *fdt)
>  {
> +	struct kho_radix_tree *tree_root;
>  	const phys_addr_t *mem;
>  	int len;
>  
> +	/* Retrieve the KHO radix tree from passed-in FDT. */
> +	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_PAGE_RADIX_TREE, &len);
>  
>  	if (!mem || len != sizeof(*mem)) {
> +		pr_err("failed to get preserved KHO memory tree\n");
>  		return;
>  	}
>  
> +	tree_root = *mem ?
> +		(struct kho_radix_tree *)phys_to_virt(*mem) :
> +		NULL;
>  
> +	if (!tree_root)
> +		return;

Seems weird?

if (!*mem)
   return;

tree_root = phys_to_virt(*mem)
kho_radix_walk_trees(tree_root, TREE_MAX_DEPTH - 1, 0,
			     kho_radix_walk_trees_memblock_callback);


This is pretty good now, IMHO

Jason
Re: [PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by Jason Miu 3 months, 1 week ago
On Thu, Oct 23, 2025 at 10:43 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Mon, Oct 20, 2025 at 03:03:04AM -0700, Jason Miu wrote:
> > - * Keep track of memory that is to be preserved across KHO.
> > + * The KHO radix tree tracks preserved memory pages. It is a hierarchical
> > + * structure that starts with a single root `kho_radix_tree`. This single
> > + * tree stores pages of all orders.
> > + *
> > + * This is achieved by encoding the page's physical address and its order into
> > + * a single `unsigned long` value. This encoded value is then used to traverse
> > + * the tree.
> > + *
> > + * The tree hierarchy is shown below:
> > + *
> > + * kho_radix_tree_root
> > + * +-------------------+
> > + * |     Level 5       | (struct kho_radix_tree)
> > + * +-------------------+
> > + *   |
> > + *   v
> > + * +-------------------+
> > + * |     Level 4       | (struct kho_radix_tree)
> > + * +-------------------+
> > + *   |
> > + *   | ... (intermediate levels)
> > + *   |
> > + *   v
> > + * +-------------------+
> > + * |      Level 0      | (struct kho_bitmap_table)
> > + * +-------------------+
> >   *
> > - * The serializing side uses two levels of xarrays to manage chunks of per-order
> > - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
> > - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
> > - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
> > - * 512K of bitmap memory will be needed for order 0.
>
> I think it is valuable to preserve this justification for
> bitmaps. There was a lot of debate over bitmaps vs ranges and this is
> the advantage of bitmaps. It is a bit subtle..

Sure, I will update the description about using the bitmap, along with
the new values.


> > +/*
> > + * The KHO radix tree tracks preserved pages by encoding a page's physical
> > + * address (pa) and its order into a single unsigned long value. This value
> > + * is then used to traverse the tree. The encoded value is composed of two
> > + * parts: the 'order bits' in the upper part and the 'page offset' in the
> > + * lower part.
> > + *
> > + *   <-- Higher Bits ------------------------------------ Lower Bits -->
> > + *  +--------------------------+-----------------------------------------+
> > + *  |        Order Bits        |               Page Offset               |
> > + *  +--------------------------+-----------------------------------------+
> > + *  | ... 0 0 1 0 0 ...        | pa >> (PAGE_SHIFT + order)              |
> > + *  +--------------------------+-----------------------------------------+
> > + *            ^
> > + *            |
> > + *  This single '1' bit's position
> > + *  uniquely identifies the 'order'.
> > + *
> > + *
> > + * Page Offset:
> > + * The 'page offset' is the physical address normalized for its order. It
> > + * effectively represents the page offset for the given order.
> > + *
> > + * Order Bits:
> > + * The 'order bits' encode the page order by setting a single bit at a
> > + * specific position. The position of this bit itself represents the order.
> > + *
> > + * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
> > + * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
> > + * offset occupies bits [0-51]. For order 0, the order bit is set at
> > + * position 52.
> > + *
> > + * As the order increases, the number of bits required for the 'page offset'
> > + * decreases. For example, order 1 requires one less bit for its page
> > + * offset. This allows its order bit to be set at position 51,
> > + * i.e. shifting right, without conflicting with the page offset bits.
>
> This description is correct, but the diagram is misleading. Maybe like this:
>
>  *  |0| ... 0 0 0 1          | pa >> (PAGE_SHIFT + 0)              |
>  *  |1| ... 0 0 0 0 1        | pa >> (PAGE_SHIFT + 1)              |
>  *  |2| ... 0 0 0 0 0 1      | pa >> (PAGE_SHIFT + 2)              |
>  [..]
>

I see, giving new examples makes the order bit positions much clearer.

>
> > + *
> > + * This design stores pages of all sizes (orders) in a single 6-level table.  It
> > + * efficiently shares lower table levels, especially due to common zero top
> > + * address bits, allowing a single, efficient algorithm to manage all pages.
> > + */
> > +enum kho_radix_consts {
> > +     /* The bit position of a 0-order page, only supports 64bits system */
> > +     ORDER_0_LG2 = 64 - PAGE_SHIFT,
> > +     /* Bit number used for each level of tables */
> > +     TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
> > +     /* Bit number used for lowest level of bitmaps */
> > +     BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
>
> "bit number" is a bit confusing when using a lg2 terms..
>
>         /* Size of the table in kho_radix_tree, in lg2 */
>         TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t))
>
>         /* Number of bits in the kho_bitmap_table, in lg2 */
>         BITMAP_SIZE_LG2 = const_ilog2(BITS_PER_BYTE * PAGE_SIZE)
>
> Then use the constants in the structs:
>
> struct kho_radix_tree {
>        phys_addr_t table[1 << TABLE_SIZE_LG2];
> };
> struct kho_bitmap_table {
>        unsigned long bitmaps[(1 << BITMAP_SIZE_LG2) / BITS_PER_LONG];
> };
>
> > -struct khoser_mem_chunk;
> > +static inline phys_addr_t kho_radix_tree_desc(struct kho_radix_tree *va)
> > +{
> > +     return (phys_addr_t)virt_to_phys(va);
> > +}
>
> virt_to_phys already returns phys_addr_t ?
>
> > +static inline struct kho_radix_tree *kho_radix_tree(phys_addr_t desc)
> > +{
> > +     return (struct kho_radix_tree *)(phys_to_virt(desc));
> > +}
>
> phys_to_virt returns void *, no need for a cast
>

Yup, will update the types in both functions.

> > +static int kho_radix_set_bitmap(struct kho_bitmap_table *bit_tlb,
> > +                             unsigned long offset)
> > +{
> > +     if (!bit_tlb ||
> > +         offset >= PAGE_SIZE * BITS_PER_BYTE)
> > +             return -EINVAL;
>
> WARN_ON ? These are assertions aren't they?

Yes, will add an assertion here. Or should we use "BUG_ON"? If this
error happens, something is quite wrong and I do not think it is
recoverable.

>
> > +static int kho_radix_walk_trees(struct kho_radix_tree *root, unsigned int level,
> > +                             unsigned long start,
> > +                             kho_radix_tree_walk_callback_t cb)
> > +{
> > +     struct kho_radix_tree *next_tree;
> > +     struct kho_bitmap_table *bitmap_table;
> > +     unsigned long encoded, i;
> > +     unsigned int level_shift;
> > +     int err = 0;
> > +
> > +     for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
> > +             if (root->table[i]) {
>
>
> if (!root->table[i])
>    continue
>
> Remove a level of indentation here
>
> > +static int __kho_preserve_order(unsigned long pfn, unsigned int order)
> > +{
> > +     unsigned long pa = PFN_PHYS(pfn);
> > +
> > +     might_sleep();
> > +
> > +     return kho_radix_preserve_page(pa, order);
>
> might_sleep should be in kho_radix_preserve_page() ? The might should
> be placed around if statements that might avoid a sleep, and that is
> not in this function.
>

I got other feedback that we can merge kho_radix_preserve_page() and
__kho_preserve_order(), so the might_sleep() will end up next to the if
statements.

> >  static void __init kho_mem_deserialize(const void *fdt)
> >  {
> > +     struct kho_radix_tree *tree_root;
> >       const phys_addr_t *mem;
> >       int len;
> >
> > +     /* Retrieve the KHO radix tree from passed-in FDT. */
> > +     mem = fdt_getprop(fdt, 0, PROP_PRESERVED_PAGE_RADIX_TREE, &len);
> >
> >       if (!mem || len != sizeof(*mem)) {
> > +             pr_err("failed to get preserved KHO memory tree\n");
> >               return;
> >       }
> >
> > +     tree_root = *mem ?
> > +             (struct kho_radix_tree *)phys_to_virt(*mem) :
> > +             NULL;
> >
> > +     if (!tree_root)
> > +             return;
>
> Seems weird?
>
> if (!*mem)
>    return;
>
> tree_root = phys_to_virt(*mem)
> kho_radix_walk_trees(tree_root, TREE_MAX_DEPTH - 1, 0,
>                              kho_radix_walk_trees_memblock_callback);
>

Yup, this looks more natural; my mind was set on checking tree_root.

>
> This is pretty good now, IMHO
>
> Jason
Re: [PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by David Matlack 3 months, 2 weeks ago
On Mon, Oct 20, 2025 at 3:03 AM Jason Miu <jasonmiu@google.com> wrote:
> diff --git a/include/linux/live_update/abi/kexec_handover.h b/include/linux/live_update/abi/kexec_handover.h
> new file mode 100644

The need for this directory also came up in Vipin's VFIO series [1],
so let's align on a directory we can all use.

Should we s/live_update/liveupdate/ to align with the file/directory
naming convention Pasha is using in LUO (no underscore)? [2]

Otherwise, LGTM.

[1] https://lore.kernel.org/kvm/20251018231126.GS3938986@ziepe.ca/
[2] https://lore.kernel.org/lkml/20250929010321.3462457-1-pasha.tatashin@soleen.com/
Re: [PATCH v2 1/3] kho: Adopt KHO radix tree data structures
Posted by Jason Miu 3 months, 2 weeks ago
On Wed, Oct 22, 2025 at 8:51 AM David Matlack <dmatlack@google.com> wrote:
>
> On Mon, Oct 20, 2025 at 3:03 AM Jason Miu <jasonmiu@google.com> wrote:
> > diff --git a/include/linux/live_update/abi/kexec_handover.h b/include/linux/live_update/abi/kexec_handover.h
> > new file mode 100644
>
> The need for this directory also came up in Vipin's VFIO series [1],
> so let's align on a directory we can all use.
>
> Should we s/live_update/liveupdate/ to align with the file/directory
> naming convention Pasha is using in LUO (no underscore)? [2]
>
> Otherwise, LGTM.
>
> [1] https://lore.kernel.org/kvm/20251018231126.GS3938986@ziepe.ca/
> [2] https://lore.kernel.org/lkml/20250929010321.3462457-1-pasha.tatashin@soleen.com/

Thanks for the suggestion. Yes, let's follow the convention in LUO
[1]. Will use "include/linux/liveupdate/abi/kexec_handover.h" in this
patch and share the "include/linux/liveupdate/abi/" path for
liveupdate components.

[1] https://lore.kernel.org/lkml/20250929010321.3462457-7-pasha.tatashin@soleen.com/

--
Jason Miu