[PATCH v3 3/4] kho: Adopt radix tree for preserved memory tracking

Jason Miu posted 4 patches 2 months ago
There is a newer version of this series
[PATCH v3 3/4] kho: Adopt radix tree for preserved memory tracking
Posted by Jason Miu 2 months ago
Introduce a radix tree implementation for tracking preserved memory pages
and switch the KHO memory tracking mechanism to use it. This lays the
groundwork for a stateless KHO implementation that eliminates the need for
serialization and the associated "finalize" state.

This patch introduces the core radix tree data structures and constants to
the KHO ABI. It adds the radix tree node and leaf structures, along with
documentation for the radix tree key encoding scheme that combines a page's
physical address and order.

To support broader use by other kernel subsystems, such as hugetlb
preservation, the core radix tree manipulation functions are exported as
a public API.

The xarray-based memory tracking is replaced with this new radix tree
implementation. The core KHO preservation and unpreservation functions are
wired up to use the radix tree helpers. On boot, the second kernel restores
the preserved memory map by walking the radix tree whose root physical
address is passed via the FDT.

The ABI `compatible` version is bumped to "kho-v2" to reflect the
structural changes in the preserved memory map and sub-FDT property
names.

Signed-off-by: Jason Miu <jasonmiu@google.com>
---
 Documentation/core-api/kho/concepts.rst   |   2 +-
 Documentation/core-api/kho/fdt.rst        |   7 +
 Documentation/core-api/kho/index.rst      |   1 +
 Documentation/core-api/kho/radix_tree.rst |  17 +
 include/linux/kho/abi/kexec_handover.h    | 124 +++-
 include/linux/kho_radix_tree.h            |  81 +++
 kernel/liveupdate/kexec_handover.c        | 658 ++++++++++++----------
 7 files changed, 568 insertions(+), 322 deletions(-)
 create mode 100644 Documentation/core-api/kho/radix_tree.rst
 create mode 100644 include/linux/kho_radix_tree.h

diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst
index e96893937286..d38bcaa951e4 100644
--- a/Documentation/core-api/kho/concepts.rst
+++ b/Documentation/core-api/kho/concepts.rst
@@ -71,7 +71,7 @@ in the FDT. That state is called the KHO finalization phase.
 Public API
 ==========
 .. kernel-doc:: kernel/liveupdate/kexec_handover.c
-   :export:
+   :identifiers: kho_is_enabled kho_restore_folio kho_restore_pages kho_add_subtree kho_remove_subtree kho_preserve_folio kho_unpreserve_folio kho_preserve_pages kho_unpreserve_pages kho_preserve_vmalloc kho_unpreserve_vmalloc kho_restore_vmalloc kho_alloc_preserve kho_unpreserve_free kho_restore_free is_kho_boot kho_retrieve_subtree
 
 Internal API
 ============
diff --git a/Documentation/core-api/kho/fdt.rst b/Documentation/core-api/kho/fdt.rst
index 4202b67fcdf9..56993d091b32 100644
--- a/Documentation/core-api/kho/fdt.rst
+++ b/Documentation/core-api/kho/fdt.rst
@@ -20,8 +20,15 @@ Kexec Handover ABI for vmalloc Preservation
 .. kernel-doc:: include/linux/kho/abi/kexec_handover.h
    :doc: Kexec Handover ABI for vmalloc Preservation
 
+Keep track of memory that is to be preserved across KHO
+=======================================================
+
+.. kernel-doc:: include/linux/kho/abi/kexec_handover.h
+   :doc: Keep track of memory that is to be preserved across KHO.
+
 See Also
 ========
 
 - :doc:`/admin-guide/mm/kho`
 - :doc:`/core-api/kho/concepts`
+- :doc:`/core-api/kho/radix_tree`
diff --git a/Documentation/core-api/kho/index.rst b/Documentation/core-api/kho/index.rst
index 0c63b0c5c143..68cdf332076e 100644
--- a/Documentation/core-api/kho/index.rst
+++ b/Documentation/core-api/kho/index.rst
@@ -9,5 +9,6 @@ Kexec Handover Subsystem
 
    concepts
    fdt
+   radix_tree
 
 .. only::  subproject and html
diff --git a/Documentation/core-api/kho/radix_tree.rst b/Documentation/core-api/kho/radix_tree.rst
new file mode 100644
index 000000000000..9523ade75d9c
--- /dev/null
+++ b/Documentation/core-api/kho/radix_tree.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+KHO Radix Tree
+====================
+
+Description
+===========
+
+.. kernel-doc:: include/linux/kho_radix_tree.h
+   :doc: Kexec Handover Radix Tree
+
+Public API
+==========
+
+.. kernel-doc:: kernel/liveupdate/kexec_handover.c
+   :identifiers: kho_radix_encode_key kho_radix_decode_key kho_radix_add_page kho_radix_del_page kho_radix_walk_tree
diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h
index 74f4fa67e458..bdda2fe67353 100644
--- a/include/linux/kho/abi/kexec_handover.h
+++ b/include/linux/kho/abi/kexec_handover.h
@@ -10,6 +10,8 @@
 #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H
 #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H
 
+#include <linux/bits.h>
+#include <linux/log2.h>
 #include <linux/types.h>
 
 /**
@@ -35,25 +37,25 @@
  *   parses this FDT to locate and restore the preserved data.::
  *
  *     / {
- *         compatible = "kho-v1";
+ *         compatible = "kho-v2";
  *
  *         preserved-memory-map = <0x...>;
  *
  *         <subnode-name-1> {
- *             fdt = <0x...>;
+ *             preserved-data = <0x...>;
  *         };
  *
  *         <subnode-name-2> {
- *             fdt = <0x...>;
+ *             preserved-data = <0x...>;
  *         };
  *               ... ...
  *         <subnode-name-N> {
- *             fdt = <0x...>;
+ *             preserved-data = <0x...>;
  *         };
  *     };
  *
  *   Root KHO Node (/):
- *     - compatible: "kho-v1"
+ *     - compatible: "kho-v2"
  *
  *       Indentifies the overall KHO ABI version.
  *
@@ -68,20 +70,20 @@
  *     is provided by the subsystem that uses KHO for preserving its
  *     data.
  *
- *     - fdt: u64
+ *     - preserved-data: u64
  *
- *       Physical address pointing to a subnode FDT blob that is also
+ *       Physical address pointing to a subnode data blob that is also
  *       being preserved.
  */
 
 /* The compatible string for the KHO FDT root node. */
-#define KHO_FDT_COMPATIBLE "kho-v1"
+#define KHO_FDT_COMPATIBLE "kho-v2"
 
 /* The FDT property for the preserved memory map. */
 #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map"
 
 /* The FDT property for sub-FDTs. */
-#define KHO_FDT_SUB_TREE_PROP_NAME "fdt"
+#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data"
 
 /**
  * DOC: Kexec Handover ABI for vmalloc Preservation
@@ -159,4 +161,108 @@ struct kho_vmalloc {
 	unsigned short order;
 };
 
+/**
+ * DOC: Keep track of memory that is to be preserved across KHO.
+ *
+ * KHO tracks preserved memory using a radix tree data structure. Each node of
+ * the tree is PAGE_SIZE. The leaf nodes are bitmaps where each set bit
+ * represents a single preserved page. The intermediate nodes are tables of
+ * physical addresses that point to a lower level node.
+ *
+ * The tree hierarchy is shown below::
+ *
+ *   root
+ *   +-------------------+
+ *   |     Level 5       | (struct kho_radix_node)
+ *   +-------------------+
+ *     |
+ *     v
+ *   +-------------------+
+ *   |     Level 4       | (struct kho_radix_node)
+ *   +-------------------+
+ *     |
+ *     | ... (intermediate levels)
+ *     |
+ *     v
+ *   +-------------------+
+ *   |      Level 0      | (struct kho_radix_leaf)
+ *   +-------------------+
+ *
+ * This is achieved by encoding the page's physical address (pa) and its order
+ * into a single unsigned long value. This value is a key then used to traverse
+ * the tree. The encoded key value is composed of two parts: the 'order bit' in
+ * the upper part and the 'page offset' in the lower part.::
+ *
+ *   +------------+-----------------------------+--------------------------+
+ *   | Page Order | Order Bit                   | Page Offset              |
+ *   +------------+-----------------------------+--------------------------+
+ *   | 0          | ...000100 ... (at bit 52)   | pa >> (PAGE_SHIFT + 0)   |
+ *   | 1          | ...000010 ... (at bit 51)   | pa >> (PAGE_SHIFT + 1)   |
+ *   | 2          | ...000001 ... (at bit 50)   | pa >> (PAGE_SHIFT + 2)   |
+ *   | ...        | ...                         | ...                      |
+ *   +------------+-----------------------------+--------------------------+
+ *
+ * Page Offset:
+ * The 'page offset' is the physical address normalized for its order. It
+ * effectively represents the page offset for the given order.
+ *
+ * Order Bit:
+ * The 'order bit' encodes the page order by setting a single bit at a
+ * specific position. The position of this bit itself represents the order.
+ *
+ * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
+ * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
+ * offset occupies bits [0-51]. For order 0, the order bit is set at
+ * position 52.
+ *
+ * The following diagram illustrates how the encoded key value is split into
+ * indices for the tree levels, with PAGE_SIZE of 4KB::
+ *
+ *        63:60   59:51    50:42    41:33    32:24    23:15         14:0
+ *   +---------+--------+--------+--------+--------+--------+-----------------+
+ *   |    0    |  Lv 5  |  Lv 4  |  Lv 3  |  Lv 2  |  Lv 1  |  Lv 0 (bitmap)  |
+ *   +---------+--------+--------+--------+--------+--------+-----------------+
+ *
+ * This design stores pages of all sizes (orders) in a single 6-level table.
+ * It efficiently shares lower table levels, especially due to common zero top
+ * address bits, allowing a single, efficient algorithm to manage all pages.
+ * This bitmap approach also offers memory efficiency; for example, a 512KB
+ * bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE = 4KB.
+ *
+ * The data structures defined here are part of the KHO ABI. Any modification
+ * to these structures that breaks backward compatibility must be accompanied by
+ * an update to the "compatible" string. This ensures that a newer kernel can
+ * correctly interpret the data passed by an older kernel.
+ */
+
+/*
+ * Defines constants for the KHO radix tree structure, used to track preserved
+ * memory. These constants govern the indexing, sizing, and depth of the tree.
+ */
+enum kho_radix_consts {
+	/* Bit position of the order marker for a 0-order page */
+	KHO_ORDER_0_LG2 = 64 - PAGE_SHIFT,
+
+	/* Size of the table in struct kho_radix_node, in lg2 */
+	KHO_TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
+
+	/* Number of bits in the struct kho_radix_leaf bitmap, in lg2 */
+	KHO_BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
+
+	/*
+	 * The total tree depth is the number of intermediate levels
+	 * and 1 bitmap level.
+	 */
+	KHO_TREE_MAX_DEPTH = DIV_ROUND_UP(KHO_ORDER_0_LG2 - KHO_BITMAP_SIZE_LG2,
+					  KHO_TABLE_SIZE_LG2) + 1,
+};
+
+struct kho_radix_node {
+	u64 table[1 << KHO_TABLE_SIZE_LG2];
+};
+
+struct kho_radix_leaf {
+	DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LG2);
+};
+
 #endif	/* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */
diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
new file mode 100644
index 000000000000..5101a04f6ae6
--- /dev/null
+++ b/include/linux/kho_radix_tree.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LIVEUPDATE_KEXEC_HANDOVER_RADIX_TREE_H
+#define _LIVEUPDATE_KEXEC_HANDOVER_RADIX_TREE_H
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+
+/**
+ * DOC: Kexec Handover Radix Tree
+ *
+ * This is a radix tree implementation for tracking physical memory pages
+ * across kexec transitions. It was developed for the KHO mechanism but is
+ * designed for broader use by any subsystem that needs to preserve pages.
+ *
+ * The radix tree is a multi-level tree where leaf nodes are bitmaps
+ * representing individual pages. To allow pages of different sizes (orders)
+ * to be stored efficiently in a single tree, it uses a unique key encoding
+ * scheme. Each key is an unsigned long that combines a page's physical
+ * address and its order.
+ *
+ * Client code is responsible for allocating the root node of the tree and
+ * managing its lifecycle, and must use the tree data structures defined in
+ * the KHO ABI, `include/linux/kho/abi/kexec_handover.h`.
+ */
+
+struct kho_radix_node;
+
+typedef int (*kho_radix_tree_walk_callback_t)(unsigned long radix_key);
+
+#ifdef CONFIG_KEXEC_HANDOVER
+
+unsigned long kho_radix_encode_key(phys_addr_t pa, unsigned int order);
+
+phys_addr_t kho_radix_decode_key(unsigned long radix_key,
+				 unsigned int *order);
+
+int kho_radix_add_page(struct kho_radix_node *root, unsigned long pfn,
+		       unsigned int order);
+
+void kho_radix_del_page(struct kho_radix_node *root, unsigned long pfn,
+			unsigned int order);
+
+int kho_radix_walk_tree(struct kho_radix_node *root, unsigned int level,
+			unsigned long start, kho_radix_tree_walk_callback_t cb);
+
+#else  /* #ifdef CONFIG_KEXEC_HANDOVER */
+
+static inline unsigned long kho_radix_encode_key(phys_addr_t pa,
+						 unsigned int order)
+{
+	return 0;
+}
+
+static inline phys_addr_t kho_radix_decode_key(unsigned long radix_key,
+					       unsigned int *order)
+{
+	/* Keep the out-parameter deterministic even in the stub. */
+	*order = 0;
+	return 0;
+}
+
+static inline int kho_radix_add_page(struct kho_radix_node *root,
+				     unsigned long pfn, unsigned int order)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void kho_radix_del_page(struct kho_radix_node *root,
+				      unsigned long pfn, unsigned int order) { }
+
+static inline int kho_radix_walk_tree(struct kho_radix_node *root,
+				      unsigned int level,
+				      unsigned long start,
+				      kho_radix_tree_walk_callback_t cb)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* #ifdef CONFIG_KEXEC_HANDOVER */
+
+#endif	/* _LIVEUPDATE_KEXEC_HANDOVER_RADIX_TREE_H */
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index a180b3367e8f..81bac82c8672 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
  * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
  * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
+ * Copyright (C) 2025 Google LLC, Jason Miu <jasonmiu@google.com>
  */
 
 #define pr_fmt(fmt) "KHO: " fmt
@@ -15,6 +16,7 @@
 #include <linux/count_zeros.h>
 #include <linux/kexec.h>
 #include <linux/kexec_handover.h>
+#include <linux/kho_radix_tree.h>
 #include <linux/kho/abi/kexec_handover.h>
 #include <linux/libfdt.h>
 #include <linux/list.h>
@@ -66,155 +68,302 @@ static int __init kho_parse_enable(char *p)
 early_param("kho", kho_parse_enable);
 
 /*
- * Keep track of memory that is to be preserved across KHO.
- *
- * The serializing side uses two levels of xarrays to manage chunks of per-order
- * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
- * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0
- * allocations each bitmap will cover 128M of address space. Thus, for 16G of
- * memory at most 512K of bitmap memory will be needed for order 0.
- *
- * This approach is fully incremental, as the serialization progresses folios
- * can continue be aggregated to the tracker. The final step, immediately prior
- * to kexec would serialize the xarray information into a linked list for the
- * successor kernel to parse.
+ * `root` points to a page that serves as the root of the
+ * KHO radix tree. This page is allocated during KHO module initialization.
+ * Its physical address is written to the FDT and passed to the next kernel
+ * during kexec.
  */
-
-#define PRESERVE_BITS (PAGE_SIZE * 8)
-
-struct kho_mem_phys_bits {
-	DECLARE_BITMAP(preserve, PRESERVE_BITS);
-};
-
-static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);
-
-struct kho_mem_phys {
-	/*
-	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
-	 * to order.
-	 */
-	struct xarray phys_bits;
-};
-
 struct kho_mem_track {
-	/* Points to kho_mem_phys, each order gets its own bitmap tree */
-	struct xarray orders;
+	struct kho_radix_node *root;
+	struct rw_semaphore sem;
 };
 
-struct khoser_mem_chunk;
-
 struct kho_out {
-	void *fdt;
-	bool finalized;
-	struct mutex lock; /* protects KHO FDT finalization */
-
 	struct kho_mem_track track;
+	void *fdt;
+	struct mutex lock; /* protects KHO FDT */
 	struct kho_debugfs dbg;
 };
 
 static struct kho_out kho_out = {
-	.lock = __MUTEX_INITIALIZER(kho_out.lock),
 	.track = {
-		.orders = XARRAY_INIT(kho_out.track.orders, 0),
+		.sem = __RWSEM_INITIALIZER(kho_out.track.sem),
 	},
-	.finalized = false,
+	.lock = __MUTEX_INITIALIZER(kho_out.lock),
 };
 
-static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
+/**
+ * kho_radix_encode_key - Encodes a physical address and order into a radix key.
+ * @pa: The physical address of the page.
+ * @order: The order of the page.
+ *
+ * This function combines a page's physical address and its order into a
+ * single unsigned long, which is used as a key for all radix tree
+ * operations.
+ *
+ * Return: The encoded unsigned long key.
+ */
+unsigned long kho_radix_encode_key(phys_addr_t pa, unsigned int order)
 {
-	void *res = xa_load(xa, index);
+	/* Order bits part */
+	unsigned long h = 1UL << (KHO_ORDER_0_LG2 - order);
+	/* Page offset part */
+	unsigned long l = pa >> (PAGE_SHIFT + order);
 
-	if (res)
-		return res;
+	return h | l;
+}
+EXPORT_SYMBOL_GPL(kho_radix_encode_key);
 
-	void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
+/**
+ * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
+ * @radix_key: The unsigned long key to decode.
+ * @order: An output parameter, a pointer to an unsigned int where the decoded
+ *         page order will be stored.
+ *
+ * This function reverses the encoding performed by kho_radix_encode_key(),
+ * extracting the original physical address and page order from a given key.
+ *
+ * Return: The decoded physical address.
+ */
+phys_addr_t kho_radix_decode_key(unsigned long radix_key,
+				 unsigned int *order)
+{
+	unsigned int order_bit = fls64(radix_key);
+	phys_addr_t pa;
 
-	if (!elm)
-		return ERR_PTR(-ENOMEM);
+	/* order_bit is numbered starting at 1 from fls64 */
+	*order = KHO_ORDER_0_LG2 - order_bit + 1;
+	/* The order is discarded by the shift */
+	pa = radix_key << (PAGE_SHIFT + *order);
 
-	if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
-		return ERR_PTR(-EINVAL);
+	return pa;
+}
+EXPORT_SYMBOL_GPL(kho_radix_decode_key);
+
+static unsigned long kho_radix_get_index(unsigned long radix_key,
+					 unsigned int level)
+{
+	int s;
 
-	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
-	if (xa_is_err(res))
-		return ERR_PTR(xa_err(res));
-	else if (res)
-		return res;
+	if (level == 0)
+		return radix_key % (1 << KHO_BITMAP_SIZE_LG2);
 
-	return no_free_ptr(elm);
+	s = ((level - 1) * KHO_TABLE_SIZE_LG2) + KHO_BITMAP_SIZE_LG2;
+	return (radix_key >> s) % (1 << KHO_TABLE_SIZE_LG2);
 }
 
-static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
-				   unsigned int order)
+/**
+ * kho_radix_add_page - Marks a page as preserved in the radix tree.
+ * @root: The root of the radix tree.
+ * @pfn: The page frame number of the page to preserve.
+ * @order: The order of the page.
+ *
+ * This function traverses the radix tree based on the key derived from @pfn
+ * and @order. It sets the corresponding bit in the leaf bitmap to mark the
+ * page for preservation. If intermediate nodes do not exist along the path,
+ * they are allocated and added to the tree.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+int kho_radix_add_page(struct kho_radix_node *root,
+		       unsigned long pfn, unsigned int order)
 {
-	struct kho_mem_phys_bits *bits;
-	struct kho_mem_phys *physxa;
-	const unsigned long pfn_high = pfn >> order;
+	phys_addr_t pa = PFN_PHYS(pfn);
+	unsigned long radix_key = kho_radix_encode_key(pa, order);
+	struct kho_radix_node *node, *new_tree;
+	struct kho_radix_node *link_parent = NULL;
+	struct kho_radix_leaf *leaf;
+	unsigned int i, idx, link_idx = 0;
+	int err = 0;
 
-	physxa = xa_load(&track->orders, order);
-	if (WARN_ON_ONCE(!physxa))
-		return;
+	/*
+	 * This array stores pointers to newly allocated intermediate radix tree
+	 * nodes along the insertion path. In case of an error during node
+	 * allocation or insertion, these stored pointers are used to free
+	 * the partially allocated path, preventing memory leaks.
+	 */
+	struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };
 
-	bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
-	if (WARN_ON_ONCE(!bits))
-		return;
+	might_sleep();
 
-	clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+	node = root;
+
+	/* Go from high levels to low levels */
+	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
+		idx = kho_radix_get_index(radix_key, i);
+
+		if (node->table[idx]) {
+			node = phys_to_virt((phys_addr_t)node->table[idx]);
+			continue;
+		}
+
+		/* Next node is empty, create a new node for it */
+		new_tree = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
+		if (!new_tree) {
+			err = -ENOMEM;
+			goto err_free_alloc_nodes;
+		}
+
+		/*
+		 * Remember where the new subtree was attached to the
+		 * pre-existing tree, so the link can be severed again if a
+		 * later allocation fails.
+		 */
+		if (!link_parent) {
+			link_parent = node;
+			link_idx = idx;
+		}
+
+		node->table[idx] = virt_to_phys(new_tree);
+		node = new_tree;
+
+		intermediate_nodes[i] = new_tree;
+	}
+
+	/* Handle the leaf level bitmap (level 0) */
+	idx = kho_radix_get_index(radix_key, 0);
+	leaf = (struct kho_radix_leaf *)node;
+	__set_bit(idx, leaf->bitmap);
+
+	return 0;
+
+err_free_alloc_nodes:
+	/*
+	 * Unlink the partially built path first; otherwise the surviving
+	 * parent node would keep pointing at freed pages, leading to a
+	 * use-after-free on the next insertion into this range.
+	 */
+	if (link_parent)
+		link_parent->table[link_idx] = 0;
+
+	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
+		if (intermediate_nodes[i])
+			free_page((unsigned long)intermediate_nodes[i]);
+	}
+
+	return err;
 }
+EXPORT_SYMBOL_GPL(kho_radix_add_page);
 
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
-			     unsigned long end_pfn)
+/**
+ * kho_radix_del_page - Removes a page's preservation status from the radix tree.
+ * @root: The root of the radix tree.
+ * @pfn: The page frame number of the page to unpreserve.
+ * @order: The order of the page.
+ *
+ * This function traverses the radix tree and clears the bit corresponding to
+ * the page, effectively removing its "preserved" status. It does not free
+ * the tree's intermediate nodes, even if they become empty.
+ */
+void kho_radix_del_page(struct kho_radix_node *root, unsigned long pfn,
+			unsigned int order)
 {
-	unsigned int order;
+	unsigned long radix_key = kho_radix_encode_key(PFN_PHYS(pfn), order);
+	unsigned int tree_level = KHO_TREE_MAX_DEPTH - 1;
+	struct kho_radix_node *node;
+	struct kho_radix_leaf *leaf;
+	unsigned int i, idx;
 
-	while (pfn < end_pfn) {
-		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+	might_sleep();
 
-		__kho_unpreserve_order(track, pfn, order);
+	node = root;
 
-		pfn += 1 << order;
+	/* Go from high levels to low levels */
+	for (i = tree_level; i > 0; i--) {
+		idx = kho_radix_get_index(radix_key, i);
+
+		/*
+		 * Attempting to delete a page that has not been preserved,
+		 * return with a warning.
+		 */
+		if (WARN_ON(!node->table[idx]))
+			return;
+
+		node = phys_to_virt((phys_addr_t)node->table[idx]);
 	}
+
+	/* Handle the leaf level bitmap (level 0) */
+	idx = kho_radix_get_index(radix_key, 0);
+	leaf = (struct kho_radix_leaf *)node;
+	__clear_bit(idx, leaf->bitmap);
 }
+EXPORT_SYMBOL_GPL(kho_radix_del_page);
 
-static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
-				unsigned int order)
+static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
+			       unsigned long radix_key,
+			       kho_radix_tree_walk_callback_t cb)
 {
-	struct kho_mem_phys_bits *bits;
-	struct kho_mem_phys *physxa, *new_physxa;
-	const unsigned long pfn_high = pfn >> order;
+	unsigned long *bitmap = (unsigned long *)leaf;
+	unsigned int i;
+	int err;
 
-	might_sleep();
-	physxa = xa_load(&track->orders, order);
-	if (!physxa) {
-		int err;
+	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
+		err = cb(radix_key | i);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/**
+ * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page.
+ * @root: A pointer to the root node of the radix tree to walk.
+ * @level: The starting level for the walk (typically KHO_TREE_MAX_DEPTH - 1).
+ * @start: The initial key prefix for the walk (typically 0).
+ * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
+ *      invoked for each preserved page found in the tree. The callback receives
+ *      the full radix key of the preserved page.
+ *
+ * This function walks the radix tree, searching from the specified top level
+ * (@level) down to the lowest level (level 0). For each preserved page found,
+ * it invokes the provided callback, passing the page's fully reconstructed
+ * radix key.
+ *
+ * Return: 0 if the walk completed the specified subtree, or the non-zero return
+ *         value from the callback that stopped the walk.
+ */
+int kho_radix_walk_tree(struct kho_radix_node *root, unsigned int level,
+			unsigned long start, kho_radix_tree_walk_callback_t cb)
+{
+	struct kho_radix_node *node;
+	struct kho_radix_leaf *leaf;
+	unsigned long radix_key, i;
+	int err;
 
-		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
-		if (!new_physxa)
-			return -ENOMEM;
+	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
+		if (!root->table[i])
+			continue;
+
+		unsigned int shift;
 
-		xa_init(&new_physxa->phys_bits);
-		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
-				    GFP_KERNEL);
+		shift = ((level - 1) * KHO_TABLE_SIZE_LG2) +
+			KHO_BITMAP_SIZE_LG2;
+		radix_key = start | (i << shift);
 
-		err = xa_err(physxa);
-		if (err || physxa) {
-			xa_destroy(&new_physxa->phys_bits);
-			kfree(new_physxa);
+		node = phys_to_virt((phys_addr_t)root->table[i]);
 
+		if (level > 1) {
+			err = kho_radix_walk_tree(node, level - 1,
+						  radix_key, cb);
 			if (err)
 				return err;
 		} else {
-			physxa = new_physxa;
+			/*
+			 * We are at level 1; node points to a level 0
+			 * bitmap. Keep iterating so that every populated
+			 * entry of this node is visited, instead of
+			 * stopping at the first one.
+			 */
+			leaf = (struct kho_radix_leaf *)node;
+			err = kho_radix_walk_leaf(leaf, radix_key, cb);
+			if (err)
+				return err;
 		}
 	}
 
-	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
-	if (IS_ERR(bits))
-		return PTR_ERR(bits);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
+
 
-	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
 
-	return 0;
+static void __kho_unpreserve(unsigned long pfn, unsigned long end_pfn)
+{
+	struct kho_mem_track *track = &kho_out.track;
+	unsigned int order;
+
+	if (WARN_ON_ONCE(!track->root))
+		return;
+
+	down_write(&track->sem);
+	while (pfn < end_pfn) {
+		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+		kho_radix_del_page(track->root, pfn, order);
+
+		pfn += 1 << order;
+	}
+	up_write(&track->sem);
 }
 
 static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
@@ -228,9 +377,10 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
 
 	info.page_private = page->private;
 	/*
-	 * deserialize_bitmap() only sets the magic on the head page. This magic
-	 * check also implicitly makes sure phys is order-aligned since for
-	 * non-order-aligned phys addresses, magic will never be set.
+	 * kho_radix_walk_tree_memblock_callback() only sets the magic on the
+	 * head page. This magic check also implicitly makes sure phys is
+	 * order-aligned since for non-order-aligned phys addresses, magic will
+	 * never be set.
 	 */
 	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
 		return NULL;
@@ -301,195 +451,30 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
 }
 EXPORT_SYMBOL_GPL(kho_restore_pages);
 
-/* Serialize and deserialize struct kho_mem_phys across kexec
- *
- * Record all the bitmaps in a linked list of pages for the next kernel to
- * process. Each chunk holds bitmaps of the same order and each block of bitmaps
- * starts at a given physical address. This allows the bitmaps to be sparse. The
- * xarray is used to store them in a tree while building up the data structure,
- * but the KHO successor kernel only needs to process them once in order.
- *
- * All of this memory is normal kmalloc() memory and is not marked for
- * preservation. The successor kernel will remain isolated to the scratch space
- * until it completes processing this list. Once processed all the memory
- * storing these ranges will be marked as free.
- */
-
-struct khoser_mem_bitmap_ptr {
-	phys_addr_t phys_start;
-	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
-};
-
-struct khoser_mem_chunk_hdr {
-	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
-	unsigned int order;
-	unsigned int num_elms;
-};
-
-#define KHOSER_BITMAP_SIZE                                   \
-	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
-	 sizeof(struct khoser_mem_bitmap_ptr))
-
-struct khoser_mem_chunk {
-	struct khoser_mem_chunk_hdr hdr;
-	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
-};
-
-static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
-
-static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
-					  unsigned long order)
-{
-	struct khoser_mem_chunk *chunk __free(free_page) = NULL;
-
-	chunk = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!chunk)
-		return ERR_PTR(-ENOMEM);
-
-	if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
-		return ERR_PTR(-EINVAL);
-
-	chunk->hdr.order = order;
-	if (cur_chunk)
-		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
-	return no_free_ptr(chunk);
-}
-
-static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
-{
-	struct khoser_mem_chunk *chunk = first_chunk;
-
-	while (chunk) {
-		struct khoser_mem_chunk *tmp = chunk;
-
-		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
-		free_page((unsigned long)tmp);
-	}
-}
-
-/*
- *  Update memory map property, if old one is found discard it via
- *  kho_mem_ser_free().
- */
-static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
+static int __init kho_radix_walk_tree_memblock_callback(unsigned long radix_key)
 {
-	void *ptr;
-	u64 phys;
-
-	ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL);
-
-	/* Check and discard previous memory map */
-	phys = get_unaligned((u64 *)ptr);
-	if (phys)
-		kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));
-
-	/* Update with the new value */
-	phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
-	put_unaligned(phys, (u64 *)ptr);
-}
-
-static int kho_mem_serialize(struct kho_out *kho_out)
-{
-	struct khoser_mem_chunk *first_chunk = NULL;
-	struct khoser_mem_chunk *chunk = NULL;
-	struct kho_mem_phys *physxa;
-	unsigned long order;
-	int err = -ENOMEM;
+	union kho_page_info info;
+	unsigned int order;
+	unsigned long pa;
+	struct page *page;
+	int sz;
 
-	xa_for_each(&kho_out->track.orders, order, physxa) {
-		struct kho_mem_phys_bits *bits;
-		unsigned long phys;
+	pa = kho_radix_decode_key(radix_key, &order);
 
-		chunk = new_chunk(chunk, order);
-		if (IS_ERR(chunk)) {
-			err = PTR_ERR(chunk);
-			goto err_free;
-		}
-
-		if (!first_chunk)
-			first_chunk = chunk;
-
-		xa_for_each(&physxa->phys_bits, phys, bits) {
-			struct khoser_mem_bitmap_ptr *elm;
-
-			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
-				chunk = new_chunk(chunk, order);
-				if (IS_ERR(chunk)) {
-					err = PTR_ERR(chunk);
-					goto err_free;
-				}
-			}
-
-			elm = &chunk->bitmaps[chunk->hdr.num_elms];
-			chunk->hdr.num_elms++;
-			elm->phys_start = (phys * PRESERVE_BITS)
-					  << (order + PAGE_SHIFT);
-			KHOSER_STORE_PTR(elm->bitmap, bits);
-		}
-	}
+	sz = 1 << (order + PAGE_SHIFT);
+	page = phys_to_page(pa);
 
-	kho_update_memory_map(first_chunk);
+	/* Reserve the memory preserved in KHO radix tree in memblock */
+	memblock_reserve(pa, sz);
+	memblock_reserved_mark_noinit(pa, sz);
+	info.magic = KHO_PAGE_MAGIC;
+	info.order = order;
+	page->private = info.page_private;
 
 	return 0;
-
-err_free:
-	kho_mem_ser_free(first_chunk);
-	return err;
 }
 
-static void __init deserialize_bitmap(unsigned int order,
-				      struct khoser_mem_bitmap_ptr *elm)
-{
-	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
-	unsigned long bit;
-
-	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
-		int sz = 1 << (order + PAGE_SHIFT);
-		phys_addr_t phys =
-			elm->phys_start + (bit << (order + PAGE_SHIFT));
-		struct page *page = phys_to_page(phys);
-		union kho_page_info info;
-
-		memblock_reserve(phys, sz);
-		memblock_reserved_mark_noinit(phys, sz);
-		info.magic = KHO_PAGE_MAGIC;
-		info.order = order;
-		page->private = info.page_private;
-	}
-}
 
-/* Return true if memory was deserizlied */
-static bool __init kho_mem_deserialize(const void *fdt)
-{
-	struct khoser_mem_chunk *chunk;
-	const void *mem_ptr;
-	u64 mem;
-	int len;
-
-	mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
-	if (!mem_ptr || len != sizeof(u64)) {
-		pr_err("failed to get preserved memory bitmaps\n");
-		return false;
-	}
-
-	mem = get_unaligned((const u64 *)mem_ptr);
-	chunk = mem ? phys_to_virt(mem) : NULL;
-
-	/* No preserved physical pages were passed, no deserialization */
-	if (!chunk)
-		return false;
-
-	while (chunk) {
-		unsigned int i;
-
-		for (i = 0; i != chunk->hdr.num_elms; i++)
-			deserialize_bitmap(chunk->hdr.order,
-					   &chunk->bitmaps[i]);
-		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
-	}
-
-	return true;
-}
 
 /*
  * With KHO enabled, memory can become fragmented because KHO regions may
@@ -789,14 +774,22 @@ EXPORT_SYMBOL_GPL(kho_remove_subtree);
  */
 int kho_preserve_folio(struct folio *folio)
 {
+	struct kho_mem_track *track = &kho_out.track;
 	const unsigned long pfn = folio_pfn(folio);
 	const unsigned int order = folio_order(folio);
-	struct kho_mem_track *track = &kho_out.track;
+	int err;
 
 	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
 		return -EINVAL;
 
-	return __kho_preserve_order(track, pfn, order);
+	if (WARN_ON_ONCE(!track->root))
+		return -EINVAL;
+
+	down_write(&track->sem);
+	err = kho_radix_add_page(track->root, pfn, order);
+	up_write(&track->sem);
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(kho_preserve_folio);
 
@@ -810,11 +803,16 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio);
  */
 void kho_unpreserve_folio(struct folio *folio)
 {
+	struct kho_mem_track *track = &kho_out.track;
 	const unsigned long pfn = folio_pfn(folio);
 	const unsigned int order = folio_order(folio);
-	struct kho_mem_track *track = &kho_out.track;
 
-	__kho_unpreserve_order(track, pfn, order);
+	if (WARN_ON_ONCE(!track->root))
+		return;
+
+	down_write(&track->sem);
+	kho_radix_del_page(track->root, pfn, order);
+	up_write(&track->sem);
 }
 EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
 
@@ -842,11 +840,15 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 		return -EINVAL;
 	}
 
+	if (WARN_ON_ONCE(!track->root))
+		return -EINVAL;
+
+	down_write(&track->sem);
 	while (pfn < end_pfn) {
 		const unsigned int order =
 			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
 
-		err = __kho_preserve_order(track, pfn, order);
+		err = kho_radix_add_page(track->root, pfn, order);
 		if (err) {
 			failed_pfn = pfn;
 			break;
@@ -854,9 +856,10 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 
 		pfn += 1 << order;
 	}
+	up_write(&track->sem);
 
 	if (err)
-		__kho_unpreserve(track, start_pfn, failed_pfn);
+		__kho_unpreserve(start_pfn, failed_pfn);
 
 	return err;
 }
@@ -874,11 +877,10 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages);
  */
 void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
 {
-	struct kho_mem_track *track = &kho_out.track;
 	const unsigned long start_pfn = page_to_pfn(page);
 	const unsigned long end_pfn = start_pfn + nr_pages;
 
-	__kho_unpreserve(track, start_pfn, end_pfn);
+	__kho_unpreserve(start_pfn, end_pfn);
 }
 EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
 
@@ -937,14 +939,13 @@ static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur
 static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
 					 unsigned short order)
 {
-	struct kho_mem_track *track = &kho_out.track;
 	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
 
-	__kho_unpreserve(track, pfn, pfn + 1);
+	__kho_unpreserve(pfn, pfn + 1);
 
 	for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
 		pfn = PHYS_PFN(chunk->phys[i]);
-		__kho_unpreserve(track, pfn, pfn + (1 << order));
+		__kho_unpreserve(pfn, pfn + (1 << order));
 	}
 }
 
@@ -1213,25 +1214,12 @@ EXPORT_SYMBOL_GPL(kho_restore_free);
 
 int kho_finalize(void)
 {
-	int ret;
-
-	if (!kho_enable)
-		return -EOPNOTSUPP;
-
-	guard(mutex)(&kho_out.lock);
-	ret = kho_mem_serialize(&kho_out);
-	if (ret)
-		return ret;
-
-	kho_out.finalized = true;
-
 	return 0;
 }
 
 bool kho_finalized(void)
 {
-	guard(mutex)(&kho_out.lock);
-	return kho_out.finalized;
+	return false;
 }
 
 struct kho_in {
@@ -1304,18 +1292,49 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
 }
 EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
 
+/* Return non-zero if error */
+static int __init kho_mem_retrieve(const void *fdt)
+{
+	struct kho_radix_node *tree_root;
+	const phys_addr_t *mem;
+	int len;
+
+	/* Retrieve the KHO radix tree from passed-in FDT. */
+	mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
+
+	if (!mem || len != sizeof(*mem)) {
+		pr_err("failed to get preserved KHO memory tree\n");
+		return -ENOENT;
+	}
+
+	if (!*mem)
+		return -EINVAL;
+
+	tree_root = phys_to_virt(*mem);
+	return kho_radix_walk_tree(tree_root, KHO_TREE_MAX_DEPTH - 1,
+				   0, kho_radix_walk_tree_memblock_callback);
+}
+
 static __init int kho_out_fdt_setup(void)
 {
+	struct kho_mem_track *track = &kho_out.track;
 	void *root = kho_out.fdt;
-	u64 empty_mem_map = 0;
+	u64 preserved_mem_tree_pa;
 	int err;
 
 	err = fdt_create(root, PAGE_SIZE);
 	err |= fdt_finish_reservemap(root);
 	err |= fdt_begin_node(root, "");
 	err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
-	err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map,
-			    sizeof(empty_mem_map));
+
+	down_read(&track->sem);
+	preserved_mem_tree_pa = (u64)virt_to_phys(track->root);
+	up_read(&track->sem);
+
+	err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME,
+			    &preserved_mem_tree_pa,
+			    sizeof(preserved_mem_tree_pa));
+
 	err |= fdt_end_node(root);
 	err |= fdt_finish(root);
 
@@ -1324,16 +1343,26 @@ static __init int kho_out_fdt_setup(void)
 
 static __init int kho_init(void)
 {
+	struct kho_mem_track *track = &kho_out.track;
 	const void *fdt = kho_get_fdt();
 	int err = 0;
 
 	if (!kho_enable)
 		return 0;
 
+	down_write(&track->sem);
+	track->root = (struct kho_radix_node *)
+		kzalloc(PAGE_SIZE, GFP_KERNEL);
+	up_write(&track->sem);
+	if (!track->root) {
+		err = -ENOMEM;
+		goto err_free_scratch;
+	}
+
 	kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
 	if (IS_ERR(kho_out.fdt)) {
 		err = PTR_ERR(kho_out.fdt);
-		goto err_free_scratch;
+		goto err_free_kho_radix_tree_root;
 	}
 
 	err = kho_debugfs_init();
@@ -1379,6 +1408,11 @@ static __init int kho_init(void)
 
 err_free_fdt:
 	kho_unpreserve_free(kho_out.fdt);
+
+err_free_kho_radix_tree_root:
+	kfree(track->root);
+	track->root = NULL;
+
 err_free_scratch:
 	kho_out.fdt = NULL;
 	for (int i = 0; i < kho_scratch_cnt; i++) {
@@ -1422,7 +1456,7 @@ void __init kho_memory_init(void)
 		kho_scratch = phys_to_virt(kho_in.scratch_phys);
 		kho_release_scratch();
 
-		if (!kho_mem_deserialize(kho_get_fdt()))
+		if (kho_mem_retrieve(kho_get_fdt()))
 			kho_in.fdt_phys = 0;
 	} else {
 		kho_reserve_scratch();
-- 
2.52.0.223.gf5cc29aaa4-goog
Re: [PATCH v3 3/4] kho: Adopt radix tree for preserved memory tracking
Posted by Mike Rapoport 1 month ago
On Mon, Dec 08, 2025 at 06:53:15PM -0800, Jason Miu wrote:
> Introduce a radix tree implementation for tracking preserved memory pages
> and switch the KHO memory tracking mechanism to use it. This lays the
> groundwork for a stateless KHO implementation that eliminates the need for
> serialization and the associated "finalize" state.
> 
> This patch introduces the core radix tree data structures and constants to
> the KHO ABI. It adds the radix tree node and leaf structures, along with
> documentation for the radix tree key encoding scheme that combines a page's
> physical address and order.
> 
> To support broader use by other kernel subsystems, such as hugetlb
> preservation, the core radix tree manipulation functions are exported as
> a public API.
> 
> The xarray-based memory tracking is replaced with this new radix tree
> implementation. The core KHO preservation and unpreservation functions are
> wired up to use the radix tree helpers. On boot, the second kernel restores
> the preserved memory map by walking the radix tree whose root physical
> address is passed via the FDT.
> 
> The ABI `compatible` version is bumped to "kho-v2" to reflect the
> structural changes in the preserved memory map and sub-FDT property
> names.
> 
> Signed-off-by: Jason Miu <jasonmiu@google.com>
> ---
>  Documentation/core-api/kho/concepts.rst   |   2 +-
>  Documentation/core-api/kho/fdt.rst        |   7 +
>  Documentation/core-api/kho/index.rst      |   1 +
>  Documentation/core-api/kho/radix_tree.rst |  17 +
>  include/linux/kho/abi/kexec_handover.h    | 124 +++-
>  include/linux/kho_radix_tree.h            |  81 +++
>  kernel/liveupdate/kexec_handover.c        | 658 ++++++++++++----------
>  7 files changed, 568 insertions(+), 322 deletions(-)
>  create mode 100644 Documentation/core-api/kho/radix_tree.rst
>  create mode 100644 include/linux/kho_radix_tree.h
> 
> diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst
> index e96893937286..d38bcaa951e4 100644
> --- a/Documentation/core-api/kho/concepts.rst
> +++ b/Documentation/core-api/kho/concepts.rst
> @@ -71,7 +71,7 @@ in the FDT. That state is called the KHO finalization phase.
>  Public API
>  ==========
>  .. kernel-doc:: kernel/liveupdate/kexec_handover.c
> -   :export:
> +   :identifiers: kho_is_enabled kho_restore_folio kho_restore_pages kho_add_subtree kho_remove_subtree kho_preserve_folio kho_unpreserve_folio kho_preserve_pages kho_unpreserve_pages kho_preserve_vmalloc kho_unpreserve_vmalloc kho_restore_vmalloc kho_alloc_preserve kho_unpreserve_free kho_restore_free is_kho_boot kho_retrieve_subtree

Ouch. This would be unmaintainable :(

>  
>  Internal API
>  ============

...

> diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h
> index 74f4fa67e458..bdda2fe67353 100644
> --- a/include/linux/kho/abi/kexec_handover.h
> +++ b/include/linux/kho/abi/kexec_handover.h
> @@ -10,6 +10,8 @@
>  #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H
>  #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H
>  
> +#include <linux/bits.h>
> +#include <linux/log2.h>
>  #include <linux/types.h>
>  
>  /**
> @@ -35,25 +37,25 @@
>   *   parses this FDT to locate and restore the preserved data.::
>   *
>   *     / {
> - *         compatible = "kho-v1";
> + *         compatible = "kho-v2";
>   *
>   *         preserved-memory-map = <0x...>;
>   *
>   *         <subnode-name-1> {
> - *             fdt = <0x...>;
> + *             preserved-data = <0x...>;

Please extend the paragraph describing "compatible" change in the commit
message to mention that "preserved-data" is a better name than "fdt"
because some subsystems will not use fdt format for their preserved state.

>   *         };
>   *
>   *         <subnode-name-2> {
> - *             fdt = <0x...>;
> + *             preserved-data = <0x...>;
>   *         };
>   *               ... ...
>   *         <subnode-name-N> {
> - *             fdt = <0x...>;
> + *             preserved-data = <0x...>;
>   *         };
>   *     };
>   *
>   *   Root KHO Node (/):
> - *     - compatible: "kho-v1"
> + *     - compatible: "kho-v2"
>   *
>   *       Indentifies the overall KHO ABI version.
>   *
> @@ -68,20 +70,20 @@
>   *     is provided by the subsystem that uses KHO for preserving its
>   *     data.
>   *
> - *     - fdt: u64
> + *     - preserved-data: u64
>   *
> - *       Physical address pointing to a subnode FDT blob that is also
> + *       Physical address pointing to a subnode data blob that is also
>   *       being preserved.
>   */
>  
>  /* The compatible string for the KHO FDT root node. */
> -#define KHO_FDT_COMPATIBLE "kho-v1"
> +#define KHO_FDT_COMPATIBLE "kho-v2"
>  
>  /* The FDT property for the preserved memory map. */
>  #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map"
>  
>  /* The FDT property for sub-FDTs. */
> -#define KHO_FDT_SUB_TREE_PROP_NAME "fdt"
> +#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data"
>  
>  /**
>   * DOC: Kexec Handover ABI for vmalloc Preservation
> @@ -159,4 +161,108 @@ struct kho_vmalloc {
>  	unsigned short order;
>  };
>  
> +/**
> + * DOC: Keep track of memory that is to be preserved across KHO.

Maybe "KHO persistent memory tracker"?

> + *
> + * KHO tracks preserved memory using a radix tree data structure. Each node of
> + * the tree is PAGE_SIZE. The leaf nodes are bitmaps where each set bit

Maybe "Each node of the tree is exactly a single page"?

> + * represents a single preserved page. The intermediate nodes are tables of

And here "a single preserved page" reads to me like a single order-0 page.
I think we should note that each bit can represent pages of different
orders.

> + * physical addresses that point to a lower level node.
> + *
> + * The tree hierarchy is shown below::
> + *
> + *   root
> + *   +-------------------+
> + *   |     Level 5       | (struct kho_radix_node)
> + *   +-------------------+
> + *     |
> + *     v
> + *   +-------------------+
> + *   |     Level 4       | (struct kho_radix_node)
> + *   +-------------------+
> + *     |
> + *     | ... (intermediate levels)
> + *     |
> + *     v
> + *   +-------------------+
> + *   |      Level 0      | (struct kho_radix_leaf)
> + *   +-------------------+
> + *
> + * This is achieved by encoding the page's physical address (pa) and its order

It's not really clear what "this is achieved" refers to.

> + * into a single unsigned long value. This value is a key then used to traverse

                                         This value is then used as a key to ...

> + * the tree. The encoded key value is composed of two parts: the 'order bit' in
> + * the upper part and the 'page offset' in the lower part.::
> + *
> + *   +------------+-----------------------------+--------------------------+
> + *   | Page Order | Order Bit                   | Page Offset              |
> + *   +------------+-----------------------------+--------------------------+
> + *   | 0          | ...000100 ... (at bit 52)   | pa >> (PAGE_SHIFT + 0)   |
> + *   | 1          | ...000010 ... (at bit 51)   | pa >> (PAGE_SHIFT + 1)   |
> + *   | 2          | ...000001 ... (at bit 50)   | pa >> (PAGE_SHIFT + 2)   |
> + *   | ...        | ...                         | ...                      |
> + *   +------------+-----------------------------+--------------------------+
> + *
> + * Page Offset:
> + * The 'page offset' is the physical address normalized for its order. It
> + * effectively represents the page offset for the given order.
> + *
> + * Order Bit:
> + * The 'order bit' encodes the page order by setting a single bit at a
> + * specific position. The position of this bit itself represents the order.
> + *
> + * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
> + * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
> + * offset occupies bits [0-51]. For order 0, the order bit is set at
> + * position 52.
> + *
> + * The following diagram illustrates how the encoded key value is split into
> + * indices for the tree levels, with PAGE_SIZE of 4KB::
> + *
> + *        63:60   59:51    50:42    41:33    32:24    23:15         14:0
> + *   +---------+--------+--------+--------+--------+--------+-----------------+
> + *   |    0    |  Lv 5  |  Lv 4  |  Lv 3  |  Lv 2  |  Lv 1  |  Lv 0 (bitmap)  |
> + *   +---------+--------+--------+--------+--------+--------+-----------------+
> + *
> + * This design stores pages of all sizes (orders) in a single 6-level table.

s/This design/The radix tree/ and s/table/hierarchy/

> + * It efficiently shares lower table levels, especially due to common zero top
> + * address bits, allowing a single, efficient algorithm to manage all pages.
> + * This bitmap approach also offers memory efficiency; for example, a 512KB
> + * bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE = 4KB.
> + *
> + * The data structures defined here are part of the KHO ABI. Any modification
> + * to these structures that breaks backward compatibility must be accompanied by
> + * an update to the "compatible" string. This ensures that a newer kernel can
> + * correctly interpret the data passed by an older kernel.
> + */
> +
> +/*
> + * Defines constants for the KHO radix tree structure, used to track preserved
> + * memory. These constants govern the indexing, sizing, and depth of the tree.
> + */
> +enum kho_radix_consts {
> +	/* The bit position of a 0-order page */

                 ^ this is either the position of the order bits or the length of
the "page offset" for an order-0 page

> +	KHO_ORDER_0_LG2 = 64 - PAGE_SHIFT,

I'd spell out LOG2 rather than LG2 here and below.

> +
> +	/* Size of the table in kho_mem_radix_tree, in lg2 */

We don't have kho_mem_radix_tree anymore, do we?

> +	KHO_TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
> +
> +	/* Number of bits in the kho_bitmap, in lg2 */
> +	KHO_BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
> +
> +	/*
> +	 * The total tree depth is the number of intermediate levels
> +	 * and 1 bitmap level.
> +	 */
> +	KHO_TREE_MAX_DEPTH = DIV_ROUND_UP(KHO_ORDER_0_LG2 - KHO_BITMAP_SIZE_LG2,
> +					  KHO_TABLE_SIZE_LG2) + 1,
> +};
> +
> +struct kho_radix_node {
> +	u64 table[1 << KHO_TABLE_SIZE_LG2];
> +};
> +
> +struct kho_radix_leaf {
> +	DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LG2);
> +};
> +
>  #endif	/* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */
> diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
> new file mode 100644
> index 000000000000..5101a04f6ae6
> --- /dev/null
> +++ b/include/linux/kho_radix_tree.h
> @@ -0,0 +1,81 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _LIVEUPDATE_KEXEC_HANDOVER_RADIX_TREE_H
> +#define _LIVEUPDATE_KEXEC_HANDOVER_RADIX_TREE_H

Please use _LINUX_KHO_ABI prefix

> +
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +#include <linux/types.h>
> +
> +/**
> + * DOC: Kexec Handover Radix Tree
> + *
> + * This is a radix tree implementation for tracking physical memory pages
> + * across kexec transitions. It was developed for the KHO mechanism but is
> + * designed for broader use by any subsystem that needs to preserve pages.
> + *
> + * The radix tree is a multi-level tree where leaf nodes are bitmaps
> + * representing individual pages. To allow pages of different sizes (orders)
> + * to be stored efficiently in a single tree, it uses a unique key encoding
> + * scheme. Each key is an unsigned long that combines a page's physical
> + * address and its order.
> + *
> + * Client code is responsible for allocating the root node of the tree and
> + * managing its lifecycle, and must use the tree data structures defined in
> + * the KHO ABI, `include/linux/kho/abi/kexec_handover.h`.
> + */
> +
> +struct kho_radix_node;
> +
> +typedef int (*kho_radix_tree_walk_callback_t)(unsigned long radix_key);

I don't think radix tree users outside kexec_handover.c should bother with
the key encoding.
The callback here should have physical address and order as parameters.

> +
> +#ifdef CONFIG_KEXEC_HANDOVER
> +
> +unsigned long kho_radix_encode_key(phys_addr_t pa, unsigned int order);
> +
> +phys_addr_t kho_radix_decode_key(unsigned long radix_key,
> +				 unsigned int *order);

These should not be a part of public interface.

> +int kho_radix_add_page(struct kho_radix_node *root, unsigned long pfn,
> +		       unsigned int order);
> +
> +void kho_radix_del_page(struct kho_radix_node *root, unsigned long pfn,
> +			unsigned int order);
> +
> +int kho_radix_walk_tree(struct kho_radix_node *root, unsigned int level,
> +			unsigned long start, kho_radix_tree_walk_callback_t cb);
> +

...

> diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
> index a180b3367e8f..81bac82c8672 100644
> --- a/kernel/liveupdate/kexec_handover.c
> +++ b/kernel/liveupdate/kexec_handover.c
> @@ -66,155 +68,302 @@ static int __init kho_parse_enable(char *p)
>  early_param("kho", kho_parse_enable);

...

>  struct kho_mem_track {
> -	/* Points to kho_mem_phys, each order gets its own bitmap tree */
> -	struct xarray orders;
> +	struct kho_radix_node *root;
> +	struct rw_semaphore sem;

It does not look like we have concurrent readers, why choose rw_semaphore
and not mutex?

>  };
>  
> -struct khoser_mem_chunk;
> -
>  struct kho_out {
> -	void *fdt;
> -	bool finalized;

The next patch removes finalization, so removing the finalized field
should probably be done there.

> -	struct mutex lock; /* protects KHO FDT finalization */
> -
>  	struct kho_mem_track track;
> +	void *fdt;
> +	struct mutex lock; /* protects KHO FDT */

Please don't move the fields around.
And while the update of the comment is correct, it seems to me rather a
part of the next patch.

>  	struct kho_debugfs dbg;
>  };
>  
>  static struct kho_out kho_out = {
> -	.lock = __MUTEX_INITIALIZER(kho_out.lock),
>  	.track = {
> -		.orders = XARRAY_INIT(kho_out.track.orders, 0),
> +		.sem = __RWSEM_INITIALIZER(kho_out.track.sem),
>  	},
> -	.finalized = false,
> +	.lock = __MUTEX_INITIALIZER(kho_out.lock),

Please don't move fields.

>  };
>  
> -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
> +/**
> + * kho_radix_encode_key - Encodes a physical address and order into a radix key.
> + * @pa: The physical address of the page.
> + * @order: The order of the page.
> + *
> + * This function combines a page's physical address and its order into a
> + * single unsigned long, which is used as a key for all radix tree
> + * operations.
> + *
> + * Return: The encoded unsigned long key.
> + */
> +unsigned long kho_radix_encode_key(phys_addr_t pa, unsigned int order)
>  {
> -	void *res = xa_load(xa, index);
> +	/* Order bits part */
> +	unsigned long h = 1UL << (KHO_ORDER_0_LG2 - order);
> +	/* Page offset part */
> +	unsigned long l = pa >> (PAGE_SHIFT + order);
>  
> -	if (res)
> -		return res;
> +	return h | l;
> +}
> +EXPORT_SYMBOL_GPL(kho_radix_encode_key);
>  
> -	void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
> +/**
> + * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
> + * @radix_key: The unsigned long key to decode.
> + * @order: An output parameter, a pointer to an unsigned int where the decoded
> + *         page order will be stored.
> + *
> + * This function reverses the encoding performed by kho_radix_encode_key(),
> + * extracting the original physical address and page order from a given key.
> + *
> + * Return: The decoded physical address.
> + */
> +phys_addr_t kho_radix_decode_key(unsigned long radix_key,
> +				 unsigned int *order)
> +{
> +	unsigned int order_bit = fls64(radix_key);
> +	phys_addr_t pa;
>  
> -	if (!elm)
> -		return ERR_PTR(-ENOMEM);
> +	/* order_bit is numbered starting at 1 from fls64 */
> +	*order = KHO_ORDER_0_LG2 - order_bit + 1;
> +	/* The order is discarded by the shift */
> +	pa = radix_key << (PAGE_SHIFT + *order);
>  
> -	if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
> -		return ERR_PTR(-EINVAL);
> +	return pa;
> +}
> +EXPORT_SYMBOL_GPL(kho_radix_decode_key);

Please make kho_radix_encode_key() and kho_radix_decode_key() static.

> +
> +static unsigned long kho_radix_get_index(unsigned long radix_key,
> +					 unsigned int level)
> +{
> +	int s;
>  
> -	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
> -	if (xa_is_err(res))
> -		return ERR_PTR(xa_err(res));
> -	else if (res)
> -		return res;
> +	if (level == 0)
> +		return radix_key % (1 << KHO_BITMAP_SIZE_LG2);

I'd split this to 

static unsigned long kho_get_radix_bitmap_index(unsigned long key);

>  
> -	return no_free_ptr(elm);
> +	s = ((level - 1) * KHO_TABLE_SIZE_LG2) + KHO_BITMAP_SIZE_LG2;
> +	return (radix_key >> s) % (1 << KHO_TABLE_SIZE_LG2);
>  }
>  
> -static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
> -				   unsigned int order)
> +/**
> + * kho_radix_add_page - Marks a page as preserved in the radix tree.
> + * @root: The root of the radix tree.
> + * @pfn: The page frame number of the page to preserve.
> + * @order: The order of the page.
> + *
> + * This function traverses the radix tree based on the key derived from @pfn
> + * and @order. It sets the corresponding bit in the leaf bitmap to mark the
> + * page for preservation. If intermediate nodes do not exist along the path,
> + * they are allocated and added to the tree.
> + *
> + * Return: 0 on success, or a negative error code on failure.
> + */
> +int kho_radix_add_page(struct kho_radix_node *root,
> +		       unsigned long pfn, unsigned int order)
>  {
> -	struct kho_mem_phys_bits *bits;
> -	struct kho_mem_phys *physxa;
> -	const unsigned long pfn_high = pfn >> order;
> +	phys_addr_t pa = PFN_PHYS(pfn);
> +	unsigned long radix_key = kho_radix_encode_key(pa, order);

pa seems unused elsewhere, you can just put PFN_PHYS() into
kho_radix_encode_key().
And radix_ prefix for the key seems redundant to me.

> +	struct kho_radix_node *node;
> +	struct kho_radix_leaf *leaf;
> +	unsigned int i, idx;
> +	int err = 0;
>  
> -	physxa = xa_load(&track->orders, order);
> -	if (WARN_ON_ONCE(!physxa))
> -		return;
> +	/*
> +	 * This array stores pointers to newly allocated intermediate radix tree
> +	 * nodes along the insertion path. In case of an error during node
> +	 * allocation or insertion, these stored pointers are used to free
> +	 * the partially allocated path, preventing memory leaks.
> +	 */
> +	struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };

Let's try keeping declarations in reverse xmas tree order. This long line
can be the first declaration.
And I don't think this array deserves such a long comment; it's quite
obvious why it's needed.

>  
> -	bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
> -	if (WARN_ON_ONCE(!bits))
> -		return;
> +	might_sleep();
>  
> -	clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
> +	node = root;
> +
> +	/* Go from high levels to low levels */
> +	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
> +		idx = kho_radix_get_index(radix_key, i);
> +
> +		if (node->table[idx]) {
> +			node = phys_to_virt((phys_addr_t)node->table[idx]);

Is casting to phys_addr_t required?
We should have an assert that verifies that phys_addr_t and u64 have the
same size somewhere, otherwise everything falls apart anyway.

> +			continue;
> +		}
> +
> +		/* Next node is empty, create a new node for it */
> +		struct kho_radix_node *new_tree;

Please don't mix declarations and code unless strictly necessary.
And new_node seems a more appropriate name here.

> +
> +		new_tree = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
> +		if (!new_tree) {
> +			err = -ENOMEM;
> +			goto err_free_alloc_nodes;

This reads to me like "on error free and allocate nodes". err_free_nodes
sounds like a better name.

> +		}
> +
> +		node->table[idx] = virt_to_phys(new_tree);
> +		node = new_tree;
> +
> +		intermediate_nodes[i] = new_tree;
> +	}
> +
> +	/* Handle the leaf level bitmap (level 0) */
> +	idx = kho_radix_get_index(radix_key, 0);
> +	leaf = (struct kho_radix_leaf *)node;
> +	__set_bit(idx, leaf->bitmap);
> +
> +	return 0;
> +
> +err_free_alloc_nodes:
> +	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
> +		if (intermediate_nodes[i])
> +			free_page((unsigned long)intermediate_nodes[i]);
> +	}
> +
> +	return err;
>  }
> +EXPORT_SYMBOL_GPL(kho_radix_add_page);
>  
> -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
> -			     unsigned long end_pfn)
> +/**
> + * kho_radix_del_page - Removes a page's preservation status from the radix tree.
> + * @root: The root of the radix tree.
> + * @pfn: The page frame number of the page to unpreserve.
> + * @order: The order of the page.
> + *
> + * This function traverses the radix tree and clears the bit corresponding to
> + * the page, effectively removing its "preserved" status. It does not free
> + * the tree's intermediate nodes, even if they become empty.
> + */
> +void kho_radix_del_page(struct kho_radix_node *root, unsigned long pfn,
> +			unsigned int order)
>  {
> -	unsigned int order;
> +	unsigned long radix_key = kho_radix_encode_key(PFN_PHYS(pfn), order);
> +	unsigned int tree_level = KHO_TREE_MAX_DEPTH - 1;
> +	struct kho_radix_node *node;
> +	struct kho_radix_leaf *leaf;
> +	unsigned int i, idx;
>  
> -	while (pfn < end_pfn) {
> -		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
> +	might_sleep();
>  
> -		__kho_unpreserve_order(track, pfn, order);
> +	node = root;

This can be done at the declaration spot.

>  
> -		pfn += 1 << order;
> +	/* Go from high levels to low levels */
> +	for (i = tree_level; i > 0; i--) {

tree_level seems unnecessary, just use KHO_TREE_MAX_DEPTH - 1.

> +		idx = kho_radix_get_index(radix_key, i);
> +
> +		/*
> +		 * Attempting to delete a page that has not been preserved,
> +		 * return with a warning.
> +		 */
> +		if (WARN_ON(!node->table[idx]))
> +			return;
> +
> +		if (node->table[idx])
> +			node = phys_to_virt((phys_addr_t)node->table[idx]);
>  	}
> +
> +	/* Handle the leaf level bitmap (level 0) */
> +	leaf = (struct kho_radix_leaf *)node;

idx should be updated here for level 0.

> +	__clear_bit(idx, leaf->bitmap);
>  }
> +EXPORT_SYMBOL_GPL(kho_radix_del_page);
  
...

> +
> +/**
> + * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page.
> + * @root: A pointer to the root node of the radix tree to walk.
> + * @level: The starting level for the walk (typically KHO_TREE_MAX_DEPTH - 1).
> + * @start: The initial key prefix for the walk (typically 0).
> + * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
> + *      invoked for each preserved page found in the tree. The callback receives
> + *      the full radix key of the preserved page.
> + *
> + * This function walks the radix tree, searching from the specified top level
> + * (@level) down to the lowest level (level 0). For each preserved page found,
> + * it invokes the provided callback, passing the page's fully reconstructed
> + * radix key.
> + *
> + * Return: 0 if the walk completed the specified subtree, or the non-zero return
> + *         value from the callback that stopped the walk.
> + */
> +int kho_radix_walk_tree(struct kho_radix_node *root, unsigned int level,
> +			unsigned long start, kho_radix_tree_walk_callback_t cb)
> +{
> +	struct kho_radix_node *node;
> +	struct kho_radix_leaf *leaf;
> +	unsigned long radix_key, i;
> +	int err;
>  
> -		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
> -		if (!new_physxa)
> -			return -ENOMEM;
> +	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
> +		if (!root->table[i])
> +			continue;
> +
> +		unsigned int shift;

Please don't mix declarations and code unless strictly necessary.

>  
> -		xa_init(&new_physxa->phys_bits);
> -		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
> -				    GFP_KERNEL);
> +		shift = ((level - 1) * KHO_TABLE_SIZE_LG2) +
> +			KHO_BITMAP_SIZE_LG2;
> +		radix_key = start | (i << shift);
>  
> -		err = xa_err(physxa);
> -		if (err || physxa) {
> -			xa_destroy(&new_physxa->phys_bits);
> -			kfree(new_physxa);
> +		node = phys_to_virt((phys_addr_t)root->table[i]);
>  
> +		if (level > 1) {
> +			err = kho_radix_walk_tree(node, level - 1,
> +						  radix_key, cb);
>  			if (err)
>  				return err;
>  		} else {
> -			physxa = new_physxa;
> +			/*
> +			 * we are at level 1,
> +			 * node is pointing to the level 0 bitmap.
> +			 */
> +			leaf = (struct kho_radix_leaf *)node;
> +			return kho_radix_walk_leaf(leaf, radix_key, cb);

I'd inverse the if:

		if (!level)
			return kho_radix_walk_leaf();

		err  = kho_radix_walk_tree()


>  		}
>  	}
>  
> -	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
> -	if (IS_ERR(bits))
> -		return PTR_ERR(bits);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
> +

Feels like an extra empty line is added here. Please drop it.

>  
> -	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
>  
> -	return 0;
> +static void __kho_unpreserve(unsigned long pfn, unsigned long end_pfn)

The change of __kho_unpreserve() signature does not belong to this patch.
If you feel strongly that this change is justified, make it a preparation
patch before the radix tree changes.

> +{
> +	struct kho_mem_track *track = &kho_out.track;
> +	unsigned int order;
> +
> +	if (WARN_ON_ONCE(!track->root))
> +		return;
> +
> +	down_write(&track->sem);
> +	while (pfn < end_pfn) {
> +		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
> +
> +		kho_radix_del_page(track->root, pfn, order);

If we are going to expose radix tree APIs, it would make sense for them to
take care of the locking internally.

For that we might need something like

struct kho_radix_tree {
	struct kho_radix_node *root;
	struct mutex lock;
}; 

and use the root struct as the parameter to kho_radix APIs.

> +
> +		pfn += 1 << order;
> +	}
> +	up_write(&track->sem);
>  }

...

> -static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
> +static int __init kho_radix_walk_tree_memblock_callback(unsigned long radix_key)

This name is much about being a callback for walking the tree and very
little about what the function does. It should be the other way around.

>  {
> +	union kho_page_info info;
> +	unsigned int order;
> +	unsigned long pa;

In the most places we use 'phys_addr_t phys' for physical addresses.

> +	struct page *page;
> +	int sz;
>  
> +	pa = kho_radix_decode_key(radix_key, &order);
>  
> +	sz = 1 << (order + PAGE_SHIFT);
> +	page = phys_to_page(pa);
>  
> +	/* Reserve the memory preserved in KHO radix tree in memblock */
> +	memblock_reserve(pa, sz);
> +	memblock_reserved_mark_noinit(pa, sz);
> +	info.magic = KHO_PAGE_MAGIC;
> +	info.order = order;
> +	page->private = info.page_private;
>  
>  	return 0;
>  }
>  
>  
>  

Too many empty lines here.

>  /*
>   * With KHO enabled, memory can become fragmented because KHO regions may
> @@ -789,14 +774,22 @@ EXPORT_SYMBOL_GPL(kho_remove_subtree);
>   */
>  int kho_preserve_folio(struct folio *folio)
>  {
> +	struct kho_mem_track *track = &kho_out.track;
>  	const unsigned long pfn = folio_pfn(folio);
>  	const unsigned int order = folio_order(folio);
> -	struct kho_mem_track *track = &kho_out.track;
> +	int err;
>  
>  	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
>  		return -EINVAL;
>  
> -	return __kho_preserve_order(track, pfn, order);
> +	if (WARN_ON_ONCE(!track->root))
> +		return -EINVAL;

Can we move this to kho_radix_add_page() and kho_radix_del_page()?
I see that some preserve/unpreserve methods WARN and some don't.

> +
> +	down_write(&track->sem);
> +	err = kho_radix_add_page(track->root, pfn, order);
> +	up_write(&track->sem);
> +
> +	return err;
>  }
>  EXPORT_SYMBOL_GPL(kho_preserve_folio);

...

> @@ -1213,25 +1214,12 @@ EXPORT_SYMBOL_GPL(kho_restore_free);
>  
>  int kho_finalize(void)
>  {
> -	int ret;
> -
> -	if (!kho_enable)
> -		return -EOPNOTSUPP;
> -
> -	guard(mutex)(&kho_out.lock);
> -	ret = kho_mem_serialize(&kho_out);
> -	if (ret)
> -		return ret;
> -
> -	kho_out.finalized = true;
> -
>  	return 0;
>  }
>  
>  bool kho_finalized(void)
>  {
> -	guard(mutex)(&kho_out.lock);
> -	return kho_out.finalized;
> +	return false;

Most of the finalization changes belong to the next patch IMO.

>  }
>  
>  struct kho_in {
> @@ -1304,18 +1292,49 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
>  }
>  EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
>  
> +/* Return non-zero if error */

That's what 99% of the kernel does, no need to comment about it.

>  static __init int kho_out_fdt_setup(void)
>  {
> +	struct kho_mem_track *track = &kho_out.track;
>  	void *root = kho_out.fdt;
> -	u64 empty_mem_map = 0;
> +	u64 preserved_mem_tree_pa;
>  	int err;
>  
>  	err = fdt_create(root, PAGE_SIZE);
>  	err |= fdt_finish_reservemap(root);
>  	err |= fdt_begin_node(root, "");
>  	err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
> -	err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map,
> -			    sizeof(empty_mem_map));
> +
> +	down_read(&track->sem);
> +	preserved_mem_tree_pa = (u64)virt_to_phys(track->root);
> +	up_read(&track->sem);

It seems to be the only place that uses down_read(). So we actually don't
have concurrent readers. Let's just use a mutex.

> +
> +	err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME,
> +			    &preserved_mem_tree_pa,
> +			    sizeof(preserved_mem_tree_pa));
> +
>  	err |= fdt_end_node(root);
>  	err |= fdt_finish(root);
>  
> @@ -1324,16 +1343,26 @@ static __init int kho_out_fdt_setup(void)
>  
>  static __init int kho_init(void)
>  {
> +	struct kho_mem_track *track = &kho_out.track;
>  	const void *fdt = kho_get_fdt();
>  	int err = 0;
>  
>  	if (!kho_enable)
>  		return 0;
>  
> +	down_write(&track->sem);
> +	track->root = (struct kho_radix_node *)
> +		kzalloc(PAGE_SIZE, GFP_KERNEL);
> +	up_write(&track->sem);
> +	if (!track->root) {
> +		err = -ENOMEM;
> +		goto err_free_scratch;
> +	}
> +
>  	kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
>  	if (IS_ERR(kho_out.fdt)) {
>  		err = PTR_ERR(kho_out.fdt);
> -		goto err_free_scratch;
> +		goto err_free_kho_radix_tree_root;
>  	}
>  
>  	err = kho_debugfs_init();
> @@ -1379,6 +1408,11 @@ static __init int kho_init(void)
>  
>  err_free_fdt:
>  	kho_unpreserve_free(kho_out.fdt);
> +
> +err_free_kho_radix_tree_root:
> +	kfree(track->root);
> +	track->root = NULL;
> +

No need for empty lines around the error handling

>  err_free_scratch:
>  	kho_out.fdt = NULL;
>  	for (int i = 0; i < kho_scratch_cnt; i++) {
> @@ -1422,7 +1456,7 @@ void __init kho_memory_init(void)
>  		kho_scratch = phys_to_virt(kho_in.scratch_phys);
>  		kho_release_scratch();
>  
> -		if (!kho_mem_deserialize(kho_get_fdt()))
> +		if (kho_mem_retrieve(kho_get_fdt()))
>  			kho_in.fdt_phys = 0;
>  	} else {
>  		kho_reserve_scratch();

-- 
Sincerely yours,
Mike.
Re: [PATCH v3 3/4] kho: Adopt radix tree for preserved memory tracking
Posted by Jason Miu 1 month ago
Thank you, Mike, for your comments! I have put my updates into the v4 patch series.

On Mon, Jan 5, 2026 at 1:27 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Mon, Dec 08, 2025 at 06:53:15PM -0800, Jason Miu wrote:
> > Introduce a radix tree implementation for tracking preserved memory pages
> > and switch the KHO memory tracking mechanism to use it. This lays the
> > groundwork for a stateless KHO implementation that eliminates the need for
> > serialization and the associated "finalize" state.
> >
> > This patch introduces the core radix tree data structures and constants to
> > the KHO ABI. It adds the radix tree node and leaf structures, along with
> > documentation for the radix tree key encoding scheme that combines a page's
> > physical address and order.
> >
> > To support broader use by other kernel subsystems, such as hugetlb
> > preservation, the core radix tree manipulation functions are exported as
> > a public API.
> >
> > The xarray-based memory tracking is replaced with this new radix tree
> > implementation. The core KHO preservation and unpreservation functions are
> > wired up to use the radix tree helpers. On boot, the second kernel restores
> > the preserved memory map by walking the radix tree whose root physical
> > address is passed via the FDT.
> >
> > The ABI `compatible` version is bumped to "kho-v2" to reflect the
> > structural changes in the preserved memory map and sub-FDT property
> > names.
> >
> > Signed-off-by: Jason Miu <jasonmiu@google.com>
> > ---
> >  Documentation/core-api/kho/concepts.rst   |   2 +-
> >  Documentation/core-api/kho/fdt.rst        |   7 +
> >  Documentation/core-api/kho/index.rst      |   1 +
> >  Documentation/core-api/kho/radix_tree.rst |  17 +
> >  include/linux/kho/abi/kexec_handover.h    | 124 +++-
> >  include/linux/kho_radix_tree.h            |  81 +++
> >  kernel/liveupdate/kexec_handover.c        | 658 ++++++++++++----------
> >  7 files changed, 568 insertions(+), 322 deletions(-)
> >  create mode 100644 Documentation/core-api/kho/radix_tree.rst
> >  create mode 100644 include/linux/kho_radix_tree.h
> >
> > diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst
> > index e96893937286..d38bcaa951e4 100644
> > --- a/Documentation/core-api/kho/concepts.rst
> > +++ b/Documentation/core-api/kho/concepts.rst
> > @@ -71,7 +71,7 @@ in the FDT. That state is called the KHO finalization phase.
> >  Public API
> >  ==========
> >  .. kernel-doc:: kernel/liveupdate/kexec_handover.c
> > -   :export:
> > +   :identifiers: kho_is_enabled kho_restore_folio kho_restore_pages kho_add_subtree kho_remove_subtree kho_preserve_folio kho_unpreserve_folio kho_preserve_pages kho_unpreserve_pages kho_preserve_vmalloc kho_unpreserve_vmalloc kho_restore_vmalloc kho_alloc_preserve kho_unpreserve_free kho_restore_free is_kho_boot kho_retrieve_subtree
>
> Ouch. This would be unmaintainable :(
>

With the newly merged concepts and FDT doc (ab37f60bc0eb), I also
merged the APIs into the index file so all the info is in the same
place.

> > diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h
> > index 74f4fa67e458..bdda2fe67353 100644
> > --- a/include/linux/kho/abi/kexec_handover.h
> > +++ b/include/linux/kho/abi/kexec_handover.h
> > @@ -10,6 +10,8 @@
> >  #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H
> >  #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H
> >
> > +#include <linux/bits.h>
> > +#include <linux/log2.h>
> >  #include <linux/types.h>
> >
> >  /**
> > @@ -35,25 +37,25 @@
> >   *   parses this FDT to locate and restore the preserved data.::
> >   *
> >   *     / {
> > - *         compatible = "kho-v1";
> > + *         compatible = "kho-v2";
> >   *
> >   *         preserved-memory-map = <0x...>;
> >   *
> >   *         <subnode-name-1> {
> > - *             fdt = <0x...>;
> > + *             preserved-data = <0x...>;
>
> Please extend the paragraph describing "compatible" change in the commit
> message to mention that "preserved-data" is a better name than "fdt"
> because some subsystems will not use fdt format for their preserved state.
>

Sure.

> >   *         };
> >   *
> >   *         <subnode-name-2> {
> > - *             fdt = <0x...>;
> > + *             preserved-data = <0x...>;
> >   *         };
> >   *               ... ...
> >   *         <subnode-name-N> {
> > - *             fdt = <0x...>;
> > + *             preserved-data = <0x...>;
> >   *         };
> >   *     };
> >   *
> >   *   Root KHO Node (/):
> > - *     - compatible: "kho-v1"
> > + *     - compatible: "kho-v2"
> >   *
> >   *       Indentifies the overall KHO ABI version.
> >   *
> > @@ -68,20 +70,20 @@
> >   *     is provided by the subsystem that uses KHO for preserving its
> >   *     data.
> >   *
> > - *     - fdt: u64
> > + *     - preserved-data: u64
> >   *
> > - *       Physical address pointing to a subnode FDT blob that is also
> > + *       Physical address pointing to a subnode data blob that is also
> >   *       being preserved.
> >   */
> >
> >  /* The compatible string for the KHO FDT root node. */
> > -#define KHO_FDT_COMPATIBLE "kho-v1"
> > +#define KHO_FDT_COMPATIBLE "kho-v2"
> >
> >  /* The FDT property for the preserved memory map. */
> >  #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map"
> >
> >  /* The FDT property for sub-FDTs. */
> > -#define KHO_FDT_SUB_TREE_PROP_NAME "fdt"
> > +#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data"
> >
> >  /**
> >   * DOC: Kexec Handover ABI for vmalloc Preservation
> > @@ -159,4 +161,108 @@ struct kho_vmalloc {
> >       unsigned short order;
> >  };
> >
> > +/**
> > + * DOC: Keep track of memory that is to be preserved across KHO.
>
> Maybe "KHO persistent memory tracker"?
>
> > + *
> > + * KHO tracks preserved memory using a radix tree data structure. Each node of
> > + * the tree is PAGE_SIZE. The leaf nodes are bitmaps where each set bit
>
> Maybe "Each node of the tree is exactly a single page"?
>
> > + * represents a single preserved page. The intermediate nodes are tables of
>
> And here "a single preserved page" reads to me like a single order-0 page.
> I think we should note that each bit can represent pages of different
> orders.
>
> > + * physical addresses that point to a lower level node.
> > + *
> > + * The tree hierarchy is shown below::
> > + *
> > + *   root
> > + *   +-------------------+
> > + *   |     Level 5       | (struct kho_radix_node)
> > + *   +-------------------+
> > + *     |
> > + *     v
> > + *   +-------------------+
> > + *   |     Level 4       | (struct kho_radix_node)
> > + *   +-------------------+
> > + *     |
> > + *     | ... (intermediate levels)
> > + *     |
> > + *     v
> > + *   +-------------------+
> > + *   |      Level 0      | (struct kho_radix_leaf)
> > + *   +-------------------+
> > + *
> > + * This is achieved by encoding the page's physical address (pa) and its order
>
> It's not really clear what "this is achieved" refers to.
>
> > + * into a single unsigned long value. This value is a key then used to traverse
>
>                                          This value is then used as a key to ...
>
> > + * the tree. The encoded key value is composed of two parts: the 'order bit' in
> > + * the upper part and the 'page offset' in the lower part.::
> > + *
> > + *   +------------+-----------------------------+--------------------------+
> > + *   | Page Order | Order Bit                   | Page Offset              |
> > + *   +------------+-----------------------------+--------------------------+
> > + *   | 0          | ...000100 ... (at bit 52)   | pa >> (PAGE_SHIFT + 0)   |
> > + *   | 1          | ...000010 ... (at bit 51)   | pa >> (PAGE_SHIFT + 1)   |
> > + *   | 2          | ...000001 ... (at bit 50)   | pa >> (PAGE_SHIFT + 2)   |
> > + *   | ...        | ...                         | ...                      |
> > + *   +------------+-----------------------------+--------------------------+
> > + *
> > + * Page Offset:
> > + * The 'page offset' is the physical address normalized for its order. It
> > + * effectively represents the page offset for the given order.
> > + *
> > + * Order Bit:
> > + * The 'order bit' encodes the page order by setting a single bit at a
> > + * specific position. The position of this bit itself represents the order.
> > + *
> > + * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
> > + * maximum range for a page offset (for order 0) is 52 bits (64 - 12). This
> > + * offset occupies bits [0-51]. For order 0, the order bit is set at
> > + * position 52.
> > + *
> > + * The following diagram illustrates how the encoded key value is split into
> > + * indices for the tree levels, with PAGE_SIZE of 4KB::
> > + *
> > + *        63:60   59:51    50:42    41:33    32:24    23:15         14:0
> > + *   +---------+--------+--------+--------+--------+--------+-----------------+
> > + *   |    0    |  Lv 5  |  Lv 4  |  Lv 3  |  Lv 2  |  Lv 1  |  Lv 0 (bitmap)  |
> > + *   +---------+--------+--------+--------+--------+--------+-----------------+
> > + *
> > + * This design stores pages of all sizes (orders) in a single 6-level table.
>
> s/This design/The radix tree/ and s/table/hierarchy/
>

Yup, the documents above are updated.

> > + * It efficiently shares lower table levels, especially due to common zero top
> > + * address bits, allowing a single, efficient algorithm to manage all pages.
> > + * This bitmap approach also offers memory efficiency; for example, a 512KB
> > + * bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE = 4KB.
> > + *
> > + * The data structures defined here are part of the KHO ABI. Any modification
> > + * to these structures that breaks backward compatibility must be accompanied by
> > + * an update to the "compatible" string. This ensures that a newer kernel can
> > + * correctly interpret the data passed by an older kernel.
> > + */
> > +
> > +/*
> > + * Defines constants for the KHO radix tree structure, used to track preserved
> > + * memory. These constants govern the indexing, sizing, and depth of the tree.
> > + */
> > +enum kho_radix_consts {
> > +     /* The bit position of a 0-order page */
>
>                  ^ this is either position of the order bits or length of
> the "page offset" for an order-0 page
>
> > +     KHO_ORDER_0_LG2 = 64 - PAGE_SHIFT,
>
> I'd spell out LOG2 rather than LG2 here and below.
>
> > +
> > +     /* Size of the table in kho_mem_radix_tree, in lg2 */
>
> We don't have kho_mem_radix_tree anymore, do we?
>
> > +     KHO_TABLE_SIZE_LG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
> > +
> > +     /* Number of bits in the kho_bitmap, in lg2 */
> > +     KHO_BITMAP_SIZE_LG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
> > +
> > +     /*
> > +      * The total tree depth is the number of intermediate levels
> > +      * and 1 bitmap level.
> > +      */
> > +     KHO_TREE_MAX_DEPTH = DIV_ROUND_UP(KHO_ORDER_0_LG2 - KHO_BITMAP_SIZE_LG2,
> > +                                       KHO_TABLE_SIZE_LG2) + 1,
> > +};
> > +
> > +struct kho_radix_node {
> > +     u64 table[1 << KHO_TABLE_SIZE_LG2];
> > +};
> > +
> > +struct kho_radix_leaf {
> > +     DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LG2);
> > +};
> > +
> >  #endif       /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */
> > diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
> > new file mode 100644
> > index 000000000000..5101a04f6ae6
> > --- /dev/null
> > +++ b/kho_radix_tree.h
> > @@ -0,0 +1,81 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +#ifndef _LIVEUPDATE_KEXEC_HANDOVER_RADIX_TREE_H
> > +#define _LIVEUPDATE_KEXEC_HANDOVER_RADIX_TREE_H
>
> Please use _LINUX_KHO_ABI prefix
>

Updated. This is "include/linux/kho_radix_tree.h", the header for the
public KHO radix tree API, but we would like to use the same
_LINUX_KHO_ABI prefix for it as well, correct?

> > +
> > +#include <linux/err.h>
> > +#include <linux/errno.h>
> > +#include <linux/types.h>
> > +
> > +/**
> > + * DOC: Kexec Handover Radix Tree
> > + *
> > + * This is a radix tree implementation for tracking physical memory pages
> > + * across kexec transitions. It was developed for the KHO mechanism but is
> > + * designed for broader use by any subsystem that needs to preserve pages.
> > + *
> > + * The radix tree is a multi-level tree where leaf nodes are bitmaps
> > + * representing individual pages. To allow pages of different sizes (orders)
> > + * to be stored efficiently in a single tree, it uses a unique key encoding
> > + * scheme. Each key is an unsigned long that combines a page's physical
> > + * address and its order.
> > + *
> > + * Client code is responsible for allocating the root node of the tree and
> > + * managing its lifecycle, and must use the tree data structures defined in
> > + * the KHO ABI, `include/linux/kho/abi/kexec_handover.h`.
> > + */
> > +
> > +struct kho_radix_node;
> > +
> > +typedef int (*kho_radix_tree_walk_callback_t)(unsigned long radix_key);
>
> I don't think radix tree users outside kexec_handover.c should bother with
> the key encoding.
> The callback here should have physical address and order as parameters.
>

I have updated the related function signatures and removed the
kho_radix_encode/decode_key() from the public API. I think this makes
the interface cleaner, thanks.

> > +
> > +#ifdef CONFIG_KEXEC_HANDOVER
> > +
> > +unsigned long kho_radix_encode_key(phys_addr_t pa, unsigned int order);
> > +
> > +phys_addr_t kho_radix_decode_key(unsigned long radix_key,
> > +                              unsigned int *order);
>
> These should not be a part of public interface.
>
> > +int kho_radix_add_page(struct kho_radix_node *root, unsigned long pfn,
> > +                    unsigned int order);
> > +
> > +void kho_radix_del_page(struct kho_radix_node *root, unsigned long pfn,
> > +                     unsigned int order);
> > +
> > +int kho_radix_walk_tree(struct kho_radix_node *root, unsigned int level,
> > +                     unsigned long start, kho_radix_tree_walk_callback_t cb);
> > +
>
> ...
>
> > diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
> > index a180b3367e8f..81bac82c8672 100644
> > --- a/kernel/liveupdate/kexec_handover.c
> > +++ b/kernel/liveupdate/kexec_handover.c
> > @@ -66,155 +68,302 @@ static int __init kho_parse_enable(char *p)
> >  early_param("kho", kho_parse_enable);
>
> ...
>
> >  struct kho_mem_track {
> > -     /* Points to kho_mem_phys, each order gets its own bitmap tree */
> > -     struct xarray orders;
> > +     struct kho_radix_node *root;
> > +     struct rw_semaphore sem;
>
> It does not look like we have concurrent readers, why choose rw_semaphore
> and not mutex?
>

Yes, we currently do not have concurrent readers, so I have updated the
code to use a mutex. In the future, when we support parallel tree
access, we can extend this to an rw_semaphore.

> >  };
> >
> > -struct khoser_mem_chunk;
> > -
> >  struct kho_out {
> > -     void *fdt;
> > -     bool finalized;
>
> The next patch removes finalization, probably removing the finalized field
> should be done there.

All finalization related updates are grouped to the next patch.

>
> > -     struct mutex lock; /* protects KHO FDT finalization */
> > -
> >       struct kho_mem_track track;
> > +     void *fdt;
> > +     struct mutex lock; /* protects KHO FDT */
>
> Please don't move the fields around.
> And while the update of the comment is correct, it seems to me rather a
> part of the next patch.
>
> >       struct kho_debugfs dbg;
> >  };
> >
> >  static struct kho_out kho_out = {
> > -     .lock = __MUTEX_INITIALIZER(kho_out.lock),
> >       .track = {
> > -             .orders = XARRAY_INIT(kho_out.track.orders, 0),
> > +             .sem = __RWSEM_INITIALIZER(kho_out.track.sem),
> >       },
> > -     .finalized = false,
> > +     .lock = __MUTEX_INITIALIZER(kho_out.lock),
>
> Please don't to move fields.
>
> >  };
> >
> > -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
> > +/**
> > + * kho_radix_encode_key - Encodes a physical address and order into a radix key.
> > + * @pa: The physical address of the page.
> > + * @order: The order of the page.
> > + *
> > + * This function combines a page's physical address and its order into a
> > + * single unsigned long, which is used as a key for all radix tree
> > + * operations.
> > + *
> > + * Return: The encoded unsigned long key.
> > + */
> > +unsigned long kho_radix_encode_key(phys_addr_t pa, unsigned int order)
> >  {
> > -     void *res = xa_load(xa, index);
> > +     /* Order bits part */
> > +     unsigned long h = 1UL << (KHO_ORDER_0_LG2 - order);
> > +     /* Page offset part */
> > +     unsigned long l = pa >> (PAGE_SHIFT + order);
> >
> > -     if (res)
> > -             return res;
> > +     return h | l;
> > +}
> > +EXPORT_SYMBOL_GPL(kho_radix_encode_key);
> >
> > -     void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
> > +/**
> > + * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
> > + * @radix_key: The unsigned long key to decode.
> > + * @order: An output parameter, a pointer to an unsigned int where the decoded
> > + *         page order will be stored.
> > + *
> > + * This function reverses the encoding performed by kho_radix_encode_key(),
> > + * extracting the original physical address and page order from a given key.
> > + *
> > + * Return: The decoded physical address.
> > + */
> > +phys_addr_t kho_radix_decode_key(unsigned long radix_key,
> > +                              unsigned int *order)
> > +{
> > +     unsigned int order_bit = fls64(radix_key);
> > +     phys_addr_t pa;
> >
> > -     if (!elm)
> > -             return ERR_PTR(-ENOMEM);
> > +     /* order_bit is numbered starting at 1 from fls64 */
> > +     *order = KHO_ORDER_0_LG2 - order_bit + 1;
> > +     /* The order is discarded by the shift */
> > +     pa = radix_key << (PAGE_SHIFT + *order);
> >
> > -     if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
> > -             return ERR_PTR(-EINVAL);
> > +     return pa;
> > +}
> > +EXPORT_SYMBOL_GPL(kho_radix_decode_key);
>
> Please make kho_radix_encode_key() and kho_radix_decode_key() static.
>
> > +
> > +static unsigned long kho_radix_get_index(unsigned long radix_key,
> > +                                      unsigned int level)
> > +{
> > +     int s;
> >
> > -     res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
> > -     if (xa_is_err(res))
> > -             return ERR_PTR(xa_err(res));
> > -     else if (res)
> > -             return res;
> > +     if (level == 0)
> > +             return radix_key % (1 << KHO_BITMAP_SIZE_LG2);
>
> I'd split this to
>
> static unsigned long kho_get_radix_bitmap_index(unsigned long key);

Sure, as we already have a function to handle the leaf level walking.

>
> >
> > -     return no_free_ptr(elm);
> > +     s = ((level - 1) * KHO_TABLE_SIZE_LG2) + KHO_BITMAP_SIZE_LG2;
> > +     return (radix_key >> s) % (1 << KHO_TABLE_SIZE_LG2);
> >  }
> >
> > -static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
> > -                                unsigned int order)
> > +/**
> > + * kho_radix_add_page - Marks a page as preserved in the radix tree.
> > + * @root: The root of the radix tree.
> > + * @pfn: The page frame number of the page to preserve.
> > + * @order: The order of the page.
> > + *
> > + * This function traverses the radix tree based on the key derived from @pfn
> > + * and @order. It sets the corresponding bit in the leaf bitmap to mark the
> > + * page for preservation. If intermediate nodes do not exist along the path,
> > + * they are allocated and added to the tree.
> > + *
> > + * Return: 0 on success, or a negative error code on failure.
> > + */
> > +int kho_radix_add_page(struct kho_radix_node *root,
> > +                    unsigned long pfn, unsigned int order)
> >  {
> > -     struct kho_mem_phys_bits *bits;
> > -     struct kho_mem_phys *physxa;
> > -     const unsigned long pfn_high = pfn >> order;
> > +     phys_addr_t pa = PFN_PHYS(pfn);
> > +     unsigned long radix_key = kho_radix_encode_key(pa, order);
>
> pa seems unused elsewhere, you can just put PFN_PHYS() into
> kho_radix_encode_key().
> And radix_ prefix for the key seems redundant to me.
>
> > +     struct kho_radix_node *node;
> > +     struct kho_radix_leaf *leaf;
> > +     unsigned int i, idx;
> > +     int err = 0;
> >
> > -     physxa = xa_load(&track->orders, order);
> > -     if (WARN_ON_ONCE(!physxa))
> > -             return;
> > +     /*
> > +      * This array stores pointers to newly allocated intermediate radix tree
> > +      * nodes along the insertion path. In case of an error during node
> > +      * allocation or insertion, these stored pointers are used to free
> > +      * the partially allocated path, preventing memory leaks.
> > +      */
> > +     struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };
>
> Let's try keeping declarations in reverse xmas tree order. This long line
> can be the first declaration.
> And I don't think this array deserves such a long comment, it's quite
> obvious why it's needed.
>
> >
> > -     bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
> > -     if (WARN_ON_ONCE(!bits))
> > -             return;
> > +     might_sleep();
> >
> > -     clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
> > +     node = root;
> > +
> > +     /* Go from high levels to low levels */
> > +     for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
> > +             idx = kho_radix_get_index(radix_key, i);
> > +
> > +             if (node->table[idx]) {
> > +                     node = phys_to_virt((phys_addr_t)node->table[idx]);
>
> Is casting to phys_addr_t required?
> We should have an assert that verifies that phys_addr_t and u64 have the
> same size somewhere, otherwise everything falls apart anyway.
>
> > +                     continue;
> > +             }
> > +
> > +             /* Next node is empty, create a new node for it */
> > +             struct kho_radix_node *new_tree;
>
> Please don't mix declarations and code unless strictly necessary.
> And new_node seems a more appropriate name here.
>
> > +
> > +             new_tree = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
> > +             if (!new_tree) {
> > +                     err = -ENOMEM;
> > +                     goto err_free_alloc_nodes;
>
> This reads to me like "on error free and allocate nodes". err_free_nodes
> sounds a better name.
>

I was thinking, "When an error occurs, free the allocated nodes" =).
But I agree err_free_nodes is a better one.

> > +             }
> > +
> > +             node->table[idx] = virt_to_phys(new_tree);
> > +             node = new_tree;
> > +
> > +             intermediate_nodes[i] = new_tree;
> > +     }
> > +
> > +     /* Handle the leaf level bitmap (level 0) */
> > +     idx = kho_radix_get_index(radix_key, 0);
> > +     leaf = (struct kho_radix_leaf *)node;
> > +     __set_bit(idx, leaf->bitmap);
> > +
> > +     return 0;
> > +
> > +err_free_alloc_nodes:
> > +     for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
> > +             if (intermediate_nodes[i])
> > +                     free_page((unsigned long)intermediate_nodes[i]);
> > +     }
> > +
> > +     return err;
> >  }
> > +EXPORT_SYMBOL_GPL(kho_radix_add_page);
> >
> > -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
> > -                          unsigned long end_pfn)
> > +/**
> > + * kho_radix_del_page - Removes a page's preservation status from the radix tree.
> > + * @root: The root of the radix tree.
> > + * @pfn: The page frame number of the page to unpreserve.
> > + * @order: The order of the page.
> > + *
> > + * This function traverses the radix tree and clears the bit corresponding to
> > + * the page, effectively removing its "preserved" status. It does not free
> > + * the tree's intermediate nodes, even if they become empty.
> > + */
> > +void kho_radix_del_page(struct kho_radix_node *root, unsigned long pfn,
> > +                     unsigned int order)
> >  {
> > -     unsigned int order;
> > +     unsigned long radix_key = kho_radix_encode_key(PFN_PHYS(pfn), order);
> > +     unsigned int tree_level = KHO_TREE_MAX_DEPTH - 1;
> > +     struct kho_radix_node *node;
> > +     struct kho_radix_leaf *leaf;
> > +     unsigned int i, idx;
> >
> > -     while (pfn < end_pfn) {
> > -             order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
> > +     might_sleep();
> >
> > -             __kho_unpreserve_order(track, pfn, order);
> > +     node = root;
>
> This can be done at declaration spot.
>
> >
> > -             pfn += 1 << order;
> > +     /* Go from high levels to low levels */
> > +     for (i = tree_level; i > 0; i--) {
>
> tree_level seems unnecessary, just use KHO_TREE_MAX_DEPTH - 1.
>
> > +             idx = kho_radix_get_index(radix_key, i);
> > +
> > +             /*
> > +              * Attempting to delete a page that has not been preserved,
> > +              * return with a warning.
> > +              */
> > +             if (WARN_ON(!node->table[idx]))
> > +                     return;
> > +
> > +             if (node->table[idx])
> > +                     node = phys_to_virt((phys_addr_t)node->table[idx]);
> >       }
> > +
> > +     /* Handle the leaf level bitmap (level 0) */
> > +     leaf = (struct kho_radix_leaf *)node;
>
> idx should be updated here for level 0.

Yes, thanks for catching this.

>
> > +     __clear_bit(idx, leaf->bitmap);
> >  }
> > +EXPORT_SYMBOL_GPL(kho_radix_del_page);
>
> ...
>
> > +
> > +/**
> > + * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page.
> > + * @root: A pointer to the root node of the radix tree to walk.
> > + * @level: The starting level for the walk (typically KHO_TREE_MAX_DEPTH - 1).
> > + * @start: The initial key prefix for the walk (typically 0).
> > + * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
> > + *      invoked for each preserved page found in the tree. The callback receives
> > + *      the full radix key of the preserved page.
> > + *
> > + * This function walks the radix tree, searching from the specified top level
> > + * (@level) down to the lowest level (level 0). For each preserved page found,
> > + * it invokes the provided callback, passing the page's fully reconstructed
> > + * radix key.
> > + *
> > + * Return: 0 if the walk completed the specified subtree, or the non-zero return
> > + *         value from the callback that stopped the walk.
> > + */
> > +int kho_radix_walk_tree(struct kho_radix_node *root, unsigned int level,
> > +                     unsigned long start, kho_radix_tree_walk_callback_t cb)
> > +{
> > +     struct kho_radix_node *node;
> > +     struct kho_radix_leaf *leaf;
> > +     unsigned long radix_key, i;
> > +     int err;
> >
> > -             new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
> > -             if (!new_physxa)
> > -                     return -ENOMEM;
> > +     for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
> > +             if (!root->table[i])
> > +                     continue;
> > +
> > +             unsigned int shift;
>
> Please don't mix declarations and code unless strictly necessary.
>
> >
> > -             xa_init(&new_physxa->phys_bits);
> > -             physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
> > -                                 GFP_KERNEL);
> > +             shift = ((level - 1) * KHO_TABLE_SIZE_LG2) +
> > +                     KHO_BITMAP_SIZE_LG2;
> > +             radix_key = start | (i << shift);
> >
> > -             err = xa_err(physxa);
> > -             if (err || physxa) {
> > -                     xa_destroy(&new_physxa->phys_bits);
> > -                     kfree(new_physxa);
> > +             node = phys_to_virt((phys_addr_t)root->table[i]);
> >
> > +             if (level > 1) {
> > +                     err = kho_radix_walk_tree(node, level - 1,
> > +                                               radix_key, cb);
> >                       if (err)
> >                               return err;
> >               } else {
> > -                     physxa = new_physxa;
> > +                     /*
> > +                      * we are at level 1,
> > +                      * node is pointing to the level 0 bitmap.
> > +                      */
> > +                     leaf = (struct kho_radix_leaf *)node;
> > +                     return kho_radix_walk_leaf(leaf, radix_key, cb);
>
> I'd inverse the if:
>
>                 if (!level)
>                         return kho_radix_walk_leaf();
>
>                 err  = kho_radix_walk_tree()
>

I think we want to check if we are at level 1 so we can just walk the
leaf level (level 0). I have updated the branching order.

>
> >               }
> >       }
> >
> > -     bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
> > -     if (IS_ERR(bits))
> > -             return PTR_ERR(bits);
> > +     return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
> > +
>
> Feels like an extra empty line is added here. Please drop it.
>
> >
> > -     set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
> >
> > -     return 0;
> > +static void __kho_unpreserve(unsigned long pfn, unsigned long end_pfn)
>
> The change of __kho_unpreserve() signature does not belong to this patch.
> If you feel strongly this change is justified make it a preparation patch
> before the radix tree changes.
>

__kho_unpreserve() no longer takes "struct kho_mem_track" because that
structure is replaced by "struct kho_radix_tree"; see below. So this
change will remain part of this patch.

> > +{
> > +     struct kho_mem_track *track = &kho_out.track;
> > +     unsigned int order;
> > +
> > +     if (WARN_ON_ONCE(!track->root))
> > +             return;
> > +
> > +     down_write(&track->sem);
> > +     while (pfn < end_pfn) {
> > +             order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
> > +
> > +             kho_radix_del_page(track->root, pfn, order);
>
> If we are going to expose radix tree APIs, it would make sense for them to
> take care of the locking internally.
>
> For that we might need something like
>
> struct kho_radix_tree {
>         struct kho_radix_node *root;
>         struct mutex lock;
> };
>
> and use the root struct as the parameter to kho_radix APIs.
>

I think having the API handle the lock internally is a good idea. This
means we need to expose this "struct kho_radix_tree" as a part of the
public API. Previously "struct kho_mem_track" did the same thing but
just for kexec_handover.c internal use. I renamed it to "struct
kho_radix_tree" and moved it to the public kho_radix_tree.h header so
the clients can use it.

> > +
> > +             pfn += 1 << order;
> > +     }
> > +     up_write(&track->sem);
> >  }
>
> ...
>
> > -static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
> > +static int __init kho_radix_walk_tree_memblock_callback(unsigned long radix_key)
>
> This name is much about being a callback for walking the tree and very
> little about what the function does. It should be the other way around.

I renamed it to "kho_radix_memblock_reserve()"; I hope this makes more sense.

> >  {
> > +     union kho_page_info info;
> > +     unsigned int order;
> > +     unsigned long pa;
>
> In the most places we use 'phys_addr_t phys' for physical addresses.
>
> > +     struct page *page;
> > +     int sz;
> >
> > +     pa = kho_radix_decode_key(radix_key, &order);
> >
> > +     sz = 1 << (order + PAGE_SHIFT);
> > +     page = phys_to_page(pa);
> >
> > +     /* Reserve the memory preserved in KHO radix tree in memblock */
> > +     memblock_reserve(pa, sz);
> > +     memblock_reserved_mark_noinit(pa, sz);
> > +     info.magic = KHO_PAGE_MAGIC;
> > +     info.order = order;
> > +     page->private = info.page_private;
> >
> >       return 0;
> >  }
> >
> >
> >
>
> Too many empty lines here.
>
> >  /*
> >   * With KHO enabled, memory can become fragmented because KHO regions may
> > @@ -789,14 +774,22 @@ EXPORT_SYMBOL_GPL(kho_remove_subtree);
> >   */
> >  int kho_preserve_folio(struct folio *folio)
> >  {
> > +     struct kho_mem_track *track = &kho_out.track;
> >       const unsigned long pfn = folio_pfn(folio);
> >       const unsigned int order = folio_order(folio);
> > -     struct kho_mem_track *track = &kho_out.track;
> > +     int err;
> >
> >       if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
> >               return -EINVAL;
> >
> > -     return __kho_preserve_order(track, pfn, order);
> > +     if (WARN_ON_ONCE(!track->root))
> > +             return -EINVAL;
>
> Can we move this to kho_radix_add_page() and kho_radix_del_page()?
> I see that some preserve/unpreserve methods WARN and some don't.
>

Yes, the root pointer check and warning are moved into those radix
tree public functions.

> > +
> > +     down_write(&track->sem);
> > +     err = kho_radix_add_page(track->root, pfn, order);
> > +     up_write(&track->sem);
> > +
> > +     return err;
> >  }
> >  EXPORT_SYMBOL_GPL(kho_preserve_folio);
>
> ...
>
> > @@ -1213,25 +1214,12 @@ EXPORT_SYMBOL_GPL(kho_restore_free);
> >
> >  int kho_finalize(void)
> >  {
> > -     int ret;
> > -
> > -     if (!kho_enable)
> > -             return -EOPNOTSUPP;
> > -
> > -     guard(mutex)(&kho_out.lock);
> > -     ret = kho_mem_serialize(&kho_out);
> > -     if (ret)
> > -             return ret;
> > -
> > -     kho_out.finalized = true;
> > -
> >       return 0;
> >  }
> >
> >  bool kho_finalized(void)
> >  {
> > -     guard(mutex)(&kho_out.lock);
> > -     return kho_out.finalized;
> > +     return false;
>
> Most of the finalization changes belong to the next patch IMO.
>
> >  }
> >
> >  struct kho_in {
> > @@ -1304,18 +1292,49 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
> >  }
> >  EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
> >
> > +/* Return non-zero if error */
>
> That's what 99% of the kernel does, no need to comment about it.
>
> >  static __init int kho_out_fdt_setup(void)
> >  {
> > +     struct kho_mem_track *track = &kho_out.track;
> >       void *root = kho_out.fdt;
> > -     u64 empty_mem_map = 0;
> > +     u64 preserved_mem_tree_pa;
> >       int err;
> >
> >       err = fdt_create(root, PAGE_SIZE);
> >       err |= fdt_finish_reservemap(root);
> >       err |= fdt_begin_node(root, "");
> >       err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
> > -     err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map,
> > -                         sizeof(empty_mem_map));
> > +
> > +     down_read(&track->sem);
> > +     preserved_mem_tree_pa = (u64)virt_to_phys(track->root);
> > +     up_read(&track->sem);
>
> It seems to be the only place that uses down_read(). So we actually don't
> have concurrent readers. Let's just use a mutex.
>
> > +
> > +     err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME,
> > +                         &preserved_mem_tree_pa,
> > +                         sizeof(preserved_mem_tree_pa));
> > +
> >       err |= fdt_end_node(root);
> >       err |= fdt_finish(root);
> >
> > @@ -1324,16 +1343,26 @@ static __init int kho_out_fdt_setup(void)
> >
> >  static __init int kho_init(void)
> >  {
> > +     struct kho_mem_track *track = &kho_out.track;
> >       const void *fdt = kho_get_fdt();
> >       int err = 0;
> >
> >       if (!kho_enable)
> >               return 0;
> >
> > +     down_write(&track->sem);
> > +     track->root = (struct kho_radix_node *)
> > +             kzalloc(PAGE_SIZE, GFP_KERNEL);
> > +     up_write(&track->sem);
> > +     if (!track->root) {
> > +             err = -ENOMEM;
> > +             goto err_free_scratch;
> > +     }
> > +
> >       kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
> >       if (IS_ERR(kho_out.fdt)) {
> >               err = PTR_ERR(kho_out.fdt);
> > -             goto err_free_scratch;
> > +             goto err_free_kho_radix_tree_root;
> >       }
> >
> >       err = kho_debugfs_init();
> > @@ -1379,6 +1408,11 @@ static __init int kho_init(void)
> >
> >  err_free_fdt:
> >       kho_unpreserve_free(kho_out.fdt);
> > +
> > +err_free_kho_radix_tree_root:
> > +     kfree(track->root);
> > +     track->root = NULL;
> > +
>
> No need for empty lines around the error handling
>
> >  err_free_scratch:
> >       kho_out.fdt = NULL;
> >       for (int i = 0; i < kho_scratch_cnt; i++) {
> > @@ -1422,7 +1456,7 @@ void __init kho_memory_init(void)
> >               kho_scratch = phys_to_virt(kho_in.scratch_phys);
> >               kho_release_scratch();
> >
> > -             if (!kho_mem_deserialize(kho_get_fdt()))
> > +             if (kho_mem_retrieve(kho_get_fdt()))
> >                       kho_in.fdt_phys = 0;
> >       } else {
> >               kho_reserve_scratch();
>
> --
> Sincerely yours,
> Mike.