[PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion

Posted by Smita Koralahalli 2 months, 3 weeks ago
Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling
until the cxl_mem driver has probed at least one device. This coordination
ensures that DAX registration or fallback handling for soft-reserved
regions is not triggered prematurely.

The worker waits on cxl_wait_queue, which is signaled via
cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory
device probe is confirmed, the worker invokes wait_for_device_probe()
to allow the rest of the CXL device hierarchy to complete initialization.

Additionally, it also handles initialization order issues where
cxl_acpi_probe() may complete before other drivers such as cxl_port or
cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in
and cxl_mem is a loadable module. In such cases, using only
wait_for_device_probe() is insufficient, as it may return before all
relevant probes are registered.

While region creation happens in cxl_port_probe(), waiting on
cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed
after the port hierarchy is in place. Furthermore, since cxl_mem depends
on cxl_pci, this also guarantees that cxl_pci has loaded by the time the
wait completes.

As cxl_mem_active() infrastructure already exists for tracking probe
activity, cxl_acpi can use it without introducing new coordination
mechanisms.

Co-developed-by: Nathan Fontenot <Nathan.Fontenot@amd.com>
Signed-off-by: Nathan Fontenot <Nathan.Fontenot@amd.com>
Co-developed-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
 drivers/cxl/acpi.c             | 18 ++++++++++++++++++
 drivers/cxl/core/probe_state.c |  5 +++++
 drivers/cxl/cxl.h              |  2 ++
 3 files changed, 25 insertions(+)

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index ca06d5acdf8f..3a27289e669b 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -823,6 +823,20 @@ static int pair_cxl_resource(struct device *dev, void *data)
 	return 0;
 }
 
+static void cxl_softreserv_mem_work_fn(struct work_struct *work)
+{
+	if (!wait_event_timeout(cxl_wait_queue, cxl_mem_active(), 30 * HZ))
+		pr_debug("Timeout waiting for cxl_mem probing\n");
+
+	wait_for_device_probe();
+}
+static DECLARE_WORK(cxl_sr_work, cxl_softreserv_mem_work_fn);
+
+static void cxl_softreserv_mem_update(void)
+{
+	schedule_work(&cxl_sr_work);
+}
+
 static int cxl_acpi_probe(struct platform_device *pdev)
 {
 	int rc = 0;
@@ -903,6 +917,9 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	cxl_bus_rescan();
 
 out:
+	/* Update SOFT RESERVE resources that intersect with CXL regions */
+	cxl_softreserv_mem_update();
+
 	return rc;
 }
 
@@ -934,6 +951,7 @@ static int __init cxl_acpi_init(void)
 
 static void __exit cxl_acpi_exit(void)
 {
+	cancel_work_sync(&cxl_sr_work);
 	platform_driver_unregister(&cxl_acpi_driver);
 	cxl_bus_drain();
 }
diff --git a/drivers/cxl/core/probe_state.c b/drivers/cxl/core/probe_state.c
index 5ba4b4de0e33..3089b2698b32 100644
--- a/drivers/cxl/core/probe_state.c
+++ b/drivers/cxl/core/probe_state.c
@@ -2,9 +2,12 @@
 /* Copyright(c) 2022 Intel Corporation. All rights reserved. */
 #include <linux/atomic.h>
 #include <linux/export.h>
+#include <linux/wait.h>
 #include "cxlmem.h"
 
 static atomic_t mem_active;
+DECLARE_WAIT_QUEUE_HEAD(cxl_wait_queue);
+EXPORT_SYMBOL_NS_GPL(cxl_wait_queue, "CXL");
 
 bool cxl_mem_active(void)
 {
@@ -13,10 +16,12 @@ bool cxl_mem_active(void)
 
 	return false;
 }
+EXPORT_SYMBOL_NS_GPL(cxl_mem_active, "CXL");
 
 void cxl_mem_active_inc(void)
 {
 	atomic_inc(&mem_active);
+	wake_up(&cxl_wait_queue);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, "CXL");
 
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 3f1695c96abc..3117136f0208 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -903,6 +903,8 @@ void cxl_coordinates_combine(struct access_coordinate *out,
 
 bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
 
+extern wait_queue_head_t cxl_wait_queue;
+
 /*
  * Unit test builds overrides this to __weak, find the 'strong' version
  * of these symbols in tools/testing/cxl/.
-- 
2.17.1
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by dan.j.williams@intel.com 2 months, 2 weeks ago
Smita Koralahalli wrote:
> Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling
> until the cxl_mem driver has probed at least one device. This coordination
> ensures that DAX registration or fallback handling for soft-reserved
> regions is not triggered prematurely.
> 
> The worker waits on cxl_wait_queue, which is signaled via
> cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory
> device probe is confirmed, the worker invokes wait_for_device_probe()
> to allow the rest of the CXL device hierarchy to complete initialization.
> 
> Additionally, it also handles initialization order issues where
> cxl_acpi_probe() may complete before other drivers such as cxl_port or
> cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in
> and cxl_mem is a loadable module. In such cases, using only
> wait_for_device_probe() is insufficient, as it may return before all
> relevant probes are registered.

Right, but that problem is not solved by this which still leaves the
decision on when to give up on this mechanism, and this mechanism does
not tell you when follow-on probe work is complete.

> While region creation happens in cxl_port_probe(), waiting on
> cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed
> after the port hierarchy is in place. Furthermore, since cxl_mem depends
> on cxl_pci, this also guarantees that cxl_pci has loaded by the time the
> wait completes.
> 
> As cxl_mem_active() infrastructure already exists for tracking probe
> activity, cxl_acpi can use it without introducing new coordination
> mechanisms.

I appreciate the instinct to not add anything new, but the module
loading problem is solvable.

If the goal is: "I want to give device-dax a point at which it can make
a go / no-go decision about whether the CXL subsystem has properly
assembled all CXL regions implied by Soft Reserved intersecting with
CXL Windows." Then that is something like the below, only lightly tested
and likely regresses the non-CXL case.

-- 8< --
From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 22 Jul 2025 16:11:08 -0700
Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration

CXL and dax_hmem fight over "Soft Reserved" (EFI Specific Purpose Memory)
resources are published in the iomem resource tree. The entry blocks some
CXL hotplug flows, and CXL blocks dax_hmem from publishing the memory in
the event that CXL fails to parse the platform configuration.

Towards resolving this conflict: (the non-RFC version
of this patch should split these into separate patches):

1/ Defer publishing "Soft Reserved" entries in the iomem resource tree
   until the consumer, dax_hmem, is ready to use them.

2/ Fix detection of "Soft Reserved" vs "CXL Window" resource overlaps by
   switching from MODULE_SOFTDEP() to request_module() for making sure that
   cxl_acpi has had a chance to publish "CXL Window" resources.

3/ Add cxl_pci to the list of modules that need to have had a chance to
   scan boot devices such that wait_for_device_probe() flushes initial CXL
   topology discovery.

4/ Add a workqueue that delays consideration of "Soft Reserved" that
   overlaps CXL so that the CXL subsystem can complete all of its region
   assembly.

For RFC purposes this only solves the reliability of the DAX_CXL_MODE_DROP
case. DAX_CXL_MODE_REGISTER support can follow to shut down CXL in favor of
vanilla DAX devices as an emergency fallback for platform configuration
quirks and bugs.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/kernel/e820.c    |  2 +-
 drivers/dax/hmem/device.c |  4 +-
 drivers/dax/hmem/hmem.c   | 94 +++++++++++++++++++++++++++++++++------
 include/linux/ioport.h    | 25 +++++++++++
 kernel/resource.c         | 58 +++++++++++++++++++-----
 5 files changed, 156 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c3acbd26408b..aef1ff2cabda 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1153,7 +1153,7 @@ void __init e820__reserve_resources_late(void)
 	res = e820_res;
 	for (i = 0; i < e820_table->nr_entries; i++) {
 		if (!res->parent && res->end)
-			insert_resource_expand_to_fit(&iomem_resource, res);
+			insert_resource_late(res);
 		res++;
 	}
 
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index f9e1a76a04a9..22732b729017 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -83,8 +83,8 @@ static __init int hmem_register_one(struct resource *res, void *data)
 
 static __init int hmem_init(void)
 {
-	walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED,
-			IORESOURCE_MEM, 0, -1, NULL, hmem_register_one);
+	walk_soft_reserve_res_desc(IORES_DESC_SOFT_RESERVED, IORESOURCE_MEM, 0,
+				   -1, NULL, hmem_register_one);
 	return 0;
 }
 
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 5e7c53f18491..0916478e3817 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -59,9 +59,45 @@ static void release_hmem(void *pdev)
 	platform_device_unregister(pdev);
 }
 
+static enum dax_cxl_mode {
+	DAX_CXL_MODE_DEFER,
+	DAX_CXL_MODE_REGISTER,
+	DAX_CXL_MODE_DROP,
+} dax_cxl_mode;
+
+static int handle_deferred_cxl(struct device *host, int target_nid,
+				const struct resource *res)
+{
+	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
+			      IORES_DESC_CXL) != REGION_DISJOINT) {
+		if (dax_cxl_mode == DAX_CXL_MODE_DROP)
+			dev_dbg(host, "dropping CXL range: %pr\n", res);
+	}
+	return 0;
+}
+
+struct dax_defer_work {
+	struct platform_device *pdev;
+	struct work_struct work;
+};
+
+static void process_defer_work(struct work_struct *_work)
+{
+	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
+	struct platform_device *pdev = work->pdev;
+
+	/* relies on cxl_acpi and cxl_pci having had a chance to load */
+	wait_for_device_probe();
+
+	dax_cxl_mode = DAX_CXL_MODE_DROP;
+
+	walk_hmem_resources(&pdev->dev, handle_deferred_cxl);
+}
+
 static int hmem_register_device(struct device *host, int target_nid,
 				const struct resource *res)
 {
+	struct dax_defer_work *work = dev_get_drvdata(host);
 	struct platform_device *pdev;
 	struct memregion_info info;
 	long id;
@@ -70,14 +106,21 @@ static int hmem_register_device(struct device *host, int target_nid,
 	if (IS_ENABLED(CONFIG_CXL_REGION) &&
 	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
 			      IORES_DESC_CXL) != REGION_DISJOINT) {
-		dev_dbg(host, "deferring range to CXL: %pr\n", res);
-		return 0;
+		switch (dax_cxl_mode) {
+		case DAX_CXL_MODE_DEFER:
+			dev_dbg(host, "deferring range to CXL: %pr\n", res);
+			schedule_work(&work->work);
+			return 0;
+		case DAX_CXL_MODE_REGISTER:
+			dev_dbg(host, "registering CXL range: %pr\n", res);
+			break;
+		case DAX_CXL_MODE_DROP:
+			dev_dbg(host, "dropping CXL range: %pr\n", res);
+			return 0;
+		}
 	}
 
-	rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
-			       IORES_DESC_SOFT_RESERVED);
-	if (rc != REGION_INTERSECTS)
-		return 0;
+	/* TODO: insert "Soft Reserved" into iomem here */
 
 	id = memregion_alloc(GFP_KERNEL);
 	if (id < 0) {
@@ -123,8 +166,30 @@ static int hmem_register_device(struct device *host, int target_nid,
 	return rc;
 }
 
+static void kill_defer_work(void *_work)
+{
+	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
+
+	cancel_work_sync(&work->work);
+	kfree(work);
+}
+
 static int dax_hmem_platform_probe(struct platform_device *pdev)
 {
+	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
+	int rc;
+
+	if (!work)
+		return -ENOMEM;
+
+	work->pdev = pdev;
+	INIT_WORK(&work->work, process_defer_work);
+
+	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
+	if (rc)
+		return rc;
+
+	platform_set_drvdata(pdev, work);
 	return walk_hmem_resources(&pdev->dev, hmem_register_device);
 }
 
@@ -139,6 +204,16 @@ static __init int dax_hmem_init(void)
 {
 	int rc;
 
+	/*
+	 * Ensure that cxl_acpi and cxl_pci have a chance to kick off
+	 * CXL topology discovery at least once before scanning the
+	 * iomem resource tree for IORES_DESC_CXL resources.
+	 */
+	if (IS_ENABLED(CONFIG_CXL_REGION)) {
+		request_module("cxl_acpi");
+		request_module("cxl_pci");
+	}
+
 	rc = platform_driver_register(&dax_hmem_platform_driver);
 	if (rc)
 		return rc;
@@ -159,13 +234,6 @@ static __exit void dax_hmem_exit(void)
 module_init(dax_hmem_init);
 module_exit(dax_hmem_exit);
 
-/* Allow for CXL to define its own dax regions */
-#if IS_ENABLED(CONFIG_CXL_REGION)
-#if IS_MODULE(CONFIG_CXL_ACPI)
-MODULE_SOFTDEP("pre: cxl_acpi");
-#endif
-#endif
-
 MODULE_ALIAS("platform:hmem*");
 MODULE_ALIAS("platform:hmem_platform*");
 MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index e8b2d6aa4013..4fc6ab518c24 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -232,6 +232,9 @@ struct resource_constraint {
 /* PC/ISA/whatever - the normal PC address spaces: IO and memory */
 extern struct resource ioport_resource;
 extern struct resource iomem_resource;
+#ifdef CONFIG_EFI_SOFT_RESERVE
+extern struct resource soft_reserve_resource;
+#endif
 
 extern struct resource *request_resource_conflict(struct resource *root, struct resource *new);
 extern int request_resource(struct resource *root, struct resource *new);
@@ -255,6 +258,22 @@ int adjust_resource(struct resource *res, resource_size_t start,
 		    resource_size_t size);
 resource_size_t resource_alignment(struct resource *res);
 
+
+#ifdef CONFIG_EFI_SOFT_RESERVE
+static inline void insert_resource_late(struct resource *new)
+{
+	if (new->desc == IORES_DESC_SOFT_RESERVED)
+		insert_resource_expand_to_fit(&soft_reserve_resource, new);
+	else
+		insert_resource_expand_to_fit(&iomem_resource, new);
+}
+#else
+static inline void insert_resource_late(struct resource *new)
+{
+	insert_resource_expand_to_fit(&iomem_resource, new);
+}
+#endif
+
 /**
  * resource_set_size - Calculate resource end address from size and start
  * @res: Resource descriptor
@@ -409,6 +428,12 @@ walk_system_ram_res_rev(u64 start, u64 end, void *arg,
 extern int
 walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
 		    void *arg, int (*func)(struct resource *, void *));
+int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags,
+			       u64 start, u64 end, void *arg,
+			       int (*func)(struct resource *, void *));
+int region_intersects_soft_reserve(struct resource *root, resource_size_t start,
+				   size_t size, unsigned long flags,
+				   unsigned long desc);
 
 struct resource *devm_request_free_mem_region(struct device *dev,
 		struct resource *base, unsigned long size);
diff --git a/kernel/resource.c b/kernel/resource.c
index 8d3e6ed0bdc1..fd90990c31c6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -321,8 +321,8 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
 }
 
 /**
- * find_next_iomem_res - Finds the lowest iomem resource that covers part of
- *			 [@start..@end].
+ * find_next_res - Finds the lowest resource that covers part of
+ *		   [@start..@end].
  *
  * If a resource is found, returns 0 and @*res is overwritten with the part
  * of the resource that's within [@start..@end]; if none is found, returns
@@ -337,9 +337,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
  * The caller must specify @start, @end, @flags, and @desc
  * (which may be IORES_DESC_NONE).
  */
-static int find_next_iomem_res(resource_size_t start, resource_size_t end,
-			       unsigned long flags, unsigned long desc,
-			       struct resource *res)
+static int find_next_res(struct resource *parent, resource_size_t start,
+			 resource_size_t end, unsigned long flags,
+			 unsigned long desc, struct resource *res)
 {
 	struct resource *p;
 
@@ -351,7 +351,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 
 	read_lock(&resource_lock);
 
-	for_each_resource(&iomem_resource, p, false) {
+	for_each_resource(parent, p, false) {
 		/* If we passed the resource we are looking for, stop */
 		if (p->start > end) {
 			p = NULL;
@@ -382,16 +382,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 	return p ? 0 : -ENODEV;
 }
 
-static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
-				 unsigned long flags, unsigned long desc,
-				 void *arg,
-				 int (*func)(struct resource *, void *))
+static int find_next_iomem_res(resource_size_t start, resource_size_t end,
+			       unsigned long flags, unsigned long desc,
+			       struct resource *res)
+{
+	return find_next_res(&iomem_resource, start, end, flags, desc, res);
+}
+
+static int walk_res_desc(struct resource *parent, resource_size_t start,
+			 resource_size_t end, unsigned long flags,
+			 unsigned long desc, void *arg,
+			 int (*func)(struct resource *, void *))
 {
 	struct resource res;
 	int ret = -EINVAL;
 
 	while (start < end &&
-	       !find_next_iomem_res(start, end, flags, desc, &res)) {
+	       !find_next_res(parent, start, end, flags, desc, &res)) {
 		ret = (*func)(&res, arg);
 		if (ret)
 			break;
@@ -402,6 +409,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
 	return ret;
 }
 
+static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
+				 unsigned long flags, unsigned long desc,
+				 void *arg,
+				 int (*func)(struct resource *, void *))
+{
+	return walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func);
+}
+
+
 /**
  * walk_iomem_res_desc - Walks through iomem resources and calls func()
  *			 with matching resource ranges.
@@ -426,6 +442,26 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
 }
 EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
 
+#ifdef CONFIG_EFI_SOFT_RESERVE
+struct resource soft_reserve_resource = {
+	.name	= "Soft Reserved",
+	.start	= 0,
+	.end	= -1,
+	.desc	= IORES_DESC_SOFT_RESERVED,
+	.flags	= IORESOURCE_MEM,
+};
+EXPORT_SYMBOL_GPL(soft_reserve_resource);
+
+int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags,
+			       u64 start, u64 end, void *arg,
+			       int (*func)(struct resource *, void *))
+{
+	return walk_res_desc(&soft_reserve_resource, start, end, flags, desc,
+			     arg, func);
+}
+EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc);
+#endif
+
 /*
  * This function calls the @func callback against all memory ranges of type
  * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
-- 
2.50.1
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Koralahalli Channabasappa, Smita 2 months, 1 week ago
Hi Dan,

On 7/23/2025 12:31 AM, dan.j.williams@intel.com wrote:
> Smita Koralahalli wrote:
>> Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling
>> until the cxl_mem driver has probed at least one device. This coordination
>> ensures that DAX registration or fallback handling for soft-reserved
>> regions is not triggered prematurely.
>>
>> The worker waits on cxl_wait_queue, which is signaled via
>> cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory
>> device probe is confirmed, the worker invokes wait_for_device_probe()
>> to allow the rest of the CXL device hierarchy to complete initialization.
>>
>> Additionally, it also handles initialization order issues where
>> cxl_acpi_probe() may complete before other drivers such as cxl_port or
>> cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in
>> and cxl_mem is a loadable module. In such cases, using only
>> wait_for_device_probe() is insufficient, as it may return before all
>> relevant probes are registered.
> 
> Right, but that problem is not solved by this which still leaves the
> decision on when to give up on this mechanism, and this mechanism does
> not tell you when follow-on probe work is complete.
> 
>> While region creation happens in cxl_port_probe(), waiting on
>> cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed
>> after the port hierarchy is in place. Furthermore, since cxl_mem depends
>> on cxl_pci, this also guarantees that cxl_pci has loaded by the time the
>> wait completes.
>>
>> As cxl_mem_active() infrastructure already exists for tracking probe
>> activity, cxl_acpi can use it without introducing new coordination
>> mechanisms.
> 
> I appreciate the instinct to not add anything new, but the module
> loading problem is solvable.
> 
> If the goal is: "I want to give device-dax a point at which it can make
> a go / no-go decision about whether the CXL subsystem has properly
> assembled all CXL regions implied by Soft Reserved intersecting with
> CXL Windows." Then that is something like the below, only lightly tested
> and likely regresses the non-CXL case.
> 
> -- 8< --
>  From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001
> From: Dan Williams <dan.j.williams@intel.com>
> Date: Tue, 22 Jul 2025 16:11:08 -0700
> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration
> 
> CXL and dax_hmem fight over how "Soft Reserved" (EFI Specific Purpose
> Memory) resources are published in the iomem resource tree. The entry
> blocks some CXL hotplug flows, and CXL blocks dax_hmem from publishing
> the memory in the event that CXL fails to parse the platform
> configuration.
> 
> Towards resolving this conflict: (the non-RFC version
> of this patch should split these into separate patches):
> 
> 1/ Defer publishing "Soft Reserved" entries in the iomem resource tree
>     until the consumer, dax_hmem, is ready to use them.
> 
> 2/ Fix detection of "Soft Reserved" vs "CXL Window" resource overlaps by
>     switching from MODULE_SOFTDEP() to request_module() for making sure that
>     cxl_acpi has had a chance to publish "CXL Window" resources.
> 
> 3/ Add cxl_pci to the list of modules that need to have had a chance to
>     scan boot devices such that wait_device_probe() flushes initial CXL
>     topology discovery.
> 
> 4/ Add a workqueue that delays consideration of "Soft Reserved" that
>     overlaps CXL so that the CXL subsystem can complete all of its region
>     assembly.
> 
> For RFC purposes this only solves the reliability of the DAX_CXL_MODE_DROP
> case. DAX_CXL_MODE_REGISTER support can follow to shut down CXL in favor of
> vanilla DAX devices as an emergency fallback for platform configuration
> quirks and bugs.
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>   arch/x86/kernel/e820.c    |  2 +-
>   drivers/dax/hmem/device.c |  4 +-
>   drivers/dax/hmem/hmem.c   | 94 +++++++++++++++++++++++++++++++++------
>   include/linux/ioport.h    | 25 +++++++++++
>   kernel/resource.c         | 58 +++++++++++++++++++-----
>   5 files changed, 156 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index c3acbd26408b..aef1ff2cabda 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -1153,7 +1153,7 @@ void __init e820__reserve_resources_late(void)
>   	res = e820_res;
>   	for (i = 0; i < e820_table->nr_entries; i++) {
>   		if (!res->parent && res->end)
> -			insert_resource_expand_to_fit(&iomem_resource, res);
> +			insert_resource_late(res);
>   		res++;
>   	}
>   
> diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
> index f9e1a76a04a9..22732b729017 100644
> --- a/drivers/dax/hmem/device.c
> +++ b/drivers/dax/hmem/device.c
> @@ -83,8 +83,8 @@ static __init int hmem_register_one(struct resource *res, void *data)
>   
>   static __init int hmem_init(void)
>   {
> -	walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED,
> -			IORESOURCE_MEM, 0, -1, NULL, hmem_register_one);
> +	walk_soft_reserve_res_desc(IORES_DESC_SOFT_RESERVED, IORESOURCE_MEM, 0,
> +				   -1, NULL, hmem_register_one);
>   	return 0;
>   }
>   
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 5e7c53f18491..0916478e3817 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -59,9 +59,45 @@ static void release_hmem(void *pdev)
>   	platform_device_unregister(pdev);
>   }
>   
> +static enum dax_cxl_mode {
> +	DAX_CXL_MODE_DEFER,
> +	DAX_CXL_MODE_REGISTER,
> +	DAX_CXL_MODE_DROP,
> +} dax_cxl_mode;
> +
> +static int handle_deferred_cxl(struct device *host, int target_nid,
> +				const struct resource *res)
> +{
> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
> +		if (dax_cxl_mode == DAX_CXL_MODE_DROP)
> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> +	}
> +	return 0;
> +}
> +
> +struct dax_defer_work {
> +	struct platform_device *pdev;
> +	struct work_struct work;
> +};
> +
> +static void process_defer_work(struct work_struct *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +	struct platform_device *pdev = work->pdev;
> +
> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> +	wait_for_device_probe();
> +
> +	dax_cxl_mode = DAX_CXL_MODE_DROP;
> +
> +	walk_hmem_resources(&pdev->dev, handle_deferred_cxl);
> +}
> +
>   static int hmem_register_device(struct device *host, int target_nid,
>   				const struct resource *res)
>   {
> +	struct dax_defer_work *work = dev_get_drvdata(host);
>   	struct platform_device *pdev;
>   	struct memregion_info info;
>   	long id;
> @@ -70,14 +106,21 @@ static int hmem_register_device(struct device *host, int target_nid,
>   	if (IS_ENABLED(CONFIG_CXL_REGION) &&
>   	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>   			      IORES_DESC_CXL) != REGION_DISJOINT) {

I may be wrong here, but could this check fail? While request_module() 
ensures that cxl_acpi and cxl_pci are requested for loading, it does not 
guarantee that either has completed initialization or that region 
enumeration (i.e., add_cxl_resources()) has finished by the time we reach 
this check.

We also haven't called wait_for_device_probe() at this point, which is 
typically used to block until all pending device probes are complete.

Thanks
Smita
> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
> -		return 0;
> +		switch (dax_cxl_mode) {
> +		case DAX_CXL_MODE_DEFER:
> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
> +			schedule_work(&work->work);
> +			return 0;
> +		case DAX_CXL_MODE_REGISTER:
> +			dev_dbg(host, "registering CXL range: %pr\n", res);
> +			break;
> +		case DAX_CXL_MODE_DROP:
> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> +			return 0;
> +		}
>   	}
>   
> -	rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> -			       IORES_DESC_SOFT_RESERVED);
> -	if (rc != REGION_INTERSECTS)
> -		return 0;
> +	/* TODO: insert "Soft Reserved" into iomem here */
>   
>   	id = memregion_alloc(GFP_KERNEL);
>   	if (id < 0) {
> @@ -123,8 +166,30 @@ static int hmem_register_device(struct device *host, int target_nid,
>   	return rc;
>   }
>   
> +static void kill_defer_work(void *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +
> +	cancel_work_sync(&work->work);
> +	kfree(work);
> +}
> +
>   static int dax_hmem_platform_probe(struct platform_device *pdev)
>   {
> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> +	int rc;
> +
> +	if (!work)
> +		return -ENOMEM;
> +
> +	work->pdev = pdev;
> +	INIT_WORK(&work->work, process_defer_work);
> +
> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> +	if (rc)
> +		return rc;
> +
> +	platform_set_drvdata(pdev, work);
>   	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>   }
>   
> @@ -139,6 +204,16 @@ static __init int dax_hmem_init(void)
>   {
>   	int rc;
>   
> +	/*
> +	 * Ensure that cxl_acpi and cxl_pci have a chance to kick off
> +	 * CXL topology discovery at least once before scanning the
> +	 * iomem resource tree for IORES_DESC_CXL resources.
> +	 */
> +	if (IS_ENABLED(CONFIG_CXL_REGION)) {
> +		request_module("cxl_acpi");
> +		request_module("cxl_pci");
> +	}
> +
>   	rc = platform_driver_register(&dax_hmem_platform_driver);
>   	if (rc)
>   		return rc;
> @@ -159,13 +234,6 @@ static __exit void dax_hmem_exit(void)
>   module_init(dax_hmem_init);
>   module_exit(dax_hmem_exit);
>   
> -/* Allow for CXL to define its own dax regions */
> -#if IS_ENABLED(CONFIG_CXL_REGION)
> -#if IS_MODULE(CONFIG_CXL_ACPI)
> -MODULE_SOFTDEP("pre: cxl_acpi");
> -#endif
> -#endif
> -
>   MODULE_ALIAS("platform:hmem*");
>   MODULE_ALIAS("platform:hmem_platform*");
>   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index e8b2d6aa4013..4fc6ab518c24 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -232,6 +232,9 @@ struct resource_constraint {
>   /* PC/ISA/whatever - the normal PC address spaces: IO and memory */
>   extern struct resource ioport_resource;
>   extern struct resource iomem_resource;
> +#ifdef CONFIG_EFI_SOFT_RESERVE
> +extern struct resource soft_reserve_resource;
> +#endif
>   
>   extern struct resource *request_resource_conflict(struct resource *root, struct resource *new);
>   extern int request_resource(struct resource *root, struct resource *new);
> @@ -255,6 +258,22 @@ int adjust_resource(struct resource *res, resource_size_t start,
>   		    resource_size_t size);
>   resource_size_t resource_alignment(struct resource *res);
>   
> +
> +#ifdef CONFIG_EFI_SOFT_RESERVE
> +static inline void insert_resource_late(struct resource *new)
> +{
> +	if (new->desc == IORES_DESC_SOFT_RESERVED)
> +		insert_resource_expand_to_fit(&soft_reserve_resource, new);
> +	else
> +		insert_resource_expand_to_fit(&iomem_resource, new);
> +}
> +#else
> +static inline void insert_resource_late(struct resource *new)
> +{
> +	insert_resource_expand_to_fit(&iomem_resource, new);
> +}
> +#endif
> +
>   /**
>    * resource_set_size - Calculate resource end address from size and start
>    * @res: Resource descriptor
> @@ -409,6 +428,12 @@ walk_system_ram_res_rev(u64 start, u64 end, void *arg,
>   extern int
>   walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
>   		    void *arg, int (*func)(struct resource *, void *));
> +int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags,
> +			       u64 start, u64 end, void *arg,
> +			       int (*func)(struct resource *, void *));
> +int region_intersects_soft_reserve(struct resource *root, resource_size_t start,
> +				   size_t size, unsigned long flags,
> +				   unsigned long desc);
>   
>   struct resource *devm_request_free_mem_region(struct device *dev,
>   		struct resource *base, unsigned long size);
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 8d3e6ed0bdc1..fd90990c31c6 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -321,8 +321,8 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
>   }
>   
>   /**
> - * find_next_iomem_res - Finds the lowest iomem resource that covers part of
> - *			 [@start..@end].
> + * find_next_res - Finds the lowest resource that covers part of
> + *		   [@start..@end].
>    *
>    * If a resource is found, returns 0 and @*res is overwritten with the part
>    * of the resource that's within [@start..@end]; if none is found, returns
> @@ -337,9 +337,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
>    * The caller must specify @start, @end, @flags, and @desc
>    * (which may be IORES_DESC_NONE).
>    */
> -static int find_next_iomem_res(resource_size_t start, resource_size_t end,
> -			       unsigned long flags, unsigned long desc,
> -			       struct resource *res)
> +static int find_next_res(struct resource *parent, resource_size_t start,
> +			 resource_size_t end, unsigned long flags,
> +			 unsigned long desc, struct resource *res)
>   {
>   	struct resource *p;
>   
> @@ -351,7 +351,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
>   
>   	read_lock(&resource_lock);
>   
> -	for_each_resource(&iomem_resource, p, false) {
> +	for_each_resource(parent, p, false) {
>   		/* If we passed the resource we are looking for, stop */
>   		if (p->start > end) {
>   			p = NULL;
> @@ -382,16 +382,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
>   	return p ? 0 : -ENODEV;
>   }
>   
> -static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
> -				 unsigned long flags, unsigned long desc,
> -				 void *arg,
> -				 int (*func)(struct resource *, void *))
> +static int find_next_iomem_res(resource_size_t start, resource_size_t end,
> +			       unsigned long flags, unsigned long desc,
> +			       struct resource *res)
> +{
> +	return find_next_res(&iomem_resource, start, end, flags, desc, res);
> +}
> +
> +static int walk_res_desc(struct resource *parent, resource_size_t start,
> +			 resource_size_t end, unsigned long flags,
> +			 unsigned long desc, void *arg,
> +			 int (*func)(struct resource *, void *))
>   {
>   	struct resource res;
>   	int ret = -EINVAL;
>   
>   	while (start < end &&
> -	       !find_next_iomem_res(start, end, flags, desc, &res)) {
> +	       !find_next_res(parent, start, end, flags, desc, &res)) {
>   		ret = (*func)(&res, arg);
>   		if (ret)
>   			break;
> @@ -402,6 +409,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
>   	return ret;
>   }
>   
> +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
> +				 unsigned long flags, unsigned long desc,
> +				 void *arg,
> +				 int (*func)(struct resource *, void *))
> +{
> +	return walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func);
> +}
> +
> +
>   /**
>    * walk_iomem_res_desc - Walks through iomem resources and calls func()
>    *			 with matching resource ranges.
> @@ -426,6 +442,26 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
>   }
>   EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
>   
> +#ifdef CONFIG_EFI_SOFT_RESERVE
> +struct resource soft_reserve_resource = {
> +	.name	= "Soft Reserved",
> +	.start	= 0,
> +	.end	= -1,
> +	.desc	= IORES_DESC_SOFT_RESERVED,
> +	.flags	= IORESOURCE_MEM,
> +};
> +EXPORT_SYMBOL_GPL(soft_reserve_resource);
> +
> +int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags,
> +			       u64 start, u64 end, void *arg,
> +			       int (*func)(struct resource *, void *))
> +{
> +	return walk_res_desc(&soft_reserve_resource, start, end, flags, desc,
> +			     arg, func);
> +}
> +EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc);
> +#endif
> +
>   /*
>    * This function calls the @func callback against all memory ranges of type
>    * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by dan.j.williams@intel.com 2 months, 1 week ago
Koralahalli Channabasappa, Smita wrote:
[..]
> >   static int hmem_register_device(struct device *host, int target_nid,
> >   				const struct resource *res)
> >   {
> > +	struct dax_defer_work *work = dev_get_drvdata(host);
> >   	struct platform_device *pdev;
> >   	struct memregion_info info;
> >   	long id;
> > @@ -70,14 +106,21 @@ static int hmem_register_device(struct device *host, int target_nid,
> >   	if (IS_ENABLED(CONFIG_CXL_REGION) &&
> >   	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> >   			      IORES_DESC_CXL) != REGION_DISJOINT) {
> 
> I may be wrong here, but could this check fail?

It can fail, but for the case where ACPI0017 is present and CXL windows
exist, the failure cases would only be the extreme ones like OOM killer.

> While request_module() ensures that cxl_acpi and cxl_pci are requested
> for loading, it does not guarantee that either has completed
> initialization or that region enumeration (i.e add_cxl_resources())
> has finished by the time we reach this check.

No, outside of someone doing something silly like passing
"driver_async_probe=cxl_acpi" on the kernel command line,
request_module() will complete synchronously (btw, we should close that
possibility off with PROBE_FORCE_SYNCHRONOUS).
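
For illustration, that would be a one-line opt-out in the driver
definition. Sketch only: the surrounding cxl_acpi_driver fields are
abbreviated from drivers/cxl/acpi.c, and probe_type comes from
include/linux/device/driver.h:

static struct platform_driver cxl_acpi_driver = {
	.probe = cxl_acpi_probe,
	.driver = {
		.name = "cxl_acpi",
		.acpi_match_table = cxl_acpi_ids,
		/* never punt this probe to the async queue, even if requested */
		.probe_type = PROBE_FORCE_SYNCHRONOUS,
	},
};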

When request_module() returns, module_init() for the requested module
will have completed. ACPI devices will have been enumerated by this
point, so cxl_acpi_probe() will have also run by the time module_init()
completes.

> We also haven't called wait_for_device_probe() at this point, which is 
> typically used to block until all pending device probes are complete.

wait_for_device_probe() is only needed for async probing, deferred
probing, and dependent device probing. cxl_acpi is none of those cases.
ACPI devices are always enumerated before userspace is up, so the
initial driver attach can always assume to have completed in module_init
context.

wait_for_device_probe() is needed for cxl_pci attach because cxl_pci
attach is async and it creates dependent devices that fire off their own
module requests.
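
Put differently, the ordering recipe on the dax_hmem side amounts to
something like this (sketch only, hypothetical helper name):

/* flush initial CXL topology discovery before trusting IORES_DESC_CXL */
static void flush_initial_cxl_discovery(void)
{
	/* synchronous: cxl_acpi module_init() and its ACPI probe have run */
	request_module("cxl_acpi");
	/* kicks off asynchronous probing of CXL endpoints */
	request_module("cxl_pci");
	/* wait for that async probe work and its dependent devices */
	wait_for_device_probe();
}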

As I noted in the changelog MODULE_SOFTDEP() is not reliable for
ordering, but request_module() is reliable for ordering. We could go so
far as to have symbol dependencies to require module loading to succeed,
but I don't think that is needed here.

See that approach in the for-6.18/cxl-probe-order RFC branch for cxl_mem
and cxl_port:

https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git/log/?h=for-6.18/cxl-probe-order
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by dan.j.williams@intel.com 2 months, 2 weeks ago
dan.j.williams@ wrote:
[..]
> If the goal is: "I want to give device-dax a point at which it can make
> a go / no-go decision about whether the CXL subsystem has properly
> > assembled all CXL regions implied by Soft Reserved intersecting with
> CXL Windows." Then that is something like the below, only lightly tested
> and likely regresses the non-CXL case.
> 
> -- 8< --
> From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001
> From: Dan Williams <dan.j.williams@intel.com>
> Date: Tue, 22 Jul 2025 16:11:08 -0700
> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration

Likely needs this incremental change to prevent DEV_DAX_HMEM from being
built-in when CXL is not. This still leaves the awkward scenario of CXL
enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that
safely fails in devdax only / fallback mode, but something to
investigate when respinning on top of this.

-- 8< --
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index d656e4c0eb84..3683bb3f2311 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -48,6 +48,8 @@ config DEV_DAX_CXL
 	tristate "CXL DAX: direct access to CXL RAM regions"
 	depends on CXL_BUS && CXL_REGION && DEV_DAX
 	default CXL_REGION && DEV_DAX
+	depends on CXL_ACPI >= DEV_DAX_HMEM
+	depends on CXL_PCI >= DEV_DAX_HMEM
 	help
 	  CXL RAM regions are either mapped by platform-firmware
 	  and published in the initial system-memory map as "System RAM", mapped
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 0916478e3817..8bcd104111a8 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -103,7 +103,7 @@ static int hmem_register_device(struct device *host, int target_nid,
 	long id;
 	int rc;
 
-	if (IS_ENABLED(CONFIG_CXL_REGION) &&
+	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
 	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
 			      IORES_DESC_CXL) != REGION_DISJOINT) {
 		switch (dax_cxl_mode) {
@@ -209,7 +209,7 @@ static __init int dax_hmem_init(void)
 	 * CXL topology discovery at least once before scanning the
 	 * iomem resource tree for IORES_DESC_CXL resources.
 	 */
-	if (IS_ENABLED(CONFIG_CXL_REGION)) {
+	if (IS_ENABLED(CONFIG_DEV_DAX_CXL)) {
 		request_module("cxl_acpi");
 		request_module("cxl_pci");
 	}
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Zhijian Li (Fujitsu) 2 months ago
Hi Dan and Smita,


On 24/07/2025 00:13, dan.j.williams@intel.com wrote:
> dan.j.williams@ wrote:
> [..]
>> If the goal is: "I want to give device-dax a point at which it can make
>> a go / no-go decision about whether the CXL subsystem has properly
>> assembled all CXL regions implied by Soft Reserved instersecting with
>> CXL Windows." Then that is something like the below, only lightly tested
>> and likely regresses the non-CXL case.
>>
>> -- 8< --
>>  From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001
>> From: Dan Williams <dan.j.williams@intel.com>
>> Date: Tue, 22 Jul 2025 16:11:08 -0700
>> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration
> 
> Likely needs this incremental change to prevent DEV_DAX_HMEM from being
> built-in when CXL is not. This still leaves the awkward scenario of CXL
> enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that
> safely fails in devdax only / fallback mode, but something to
> investigate when respinning on top of this.
> 

Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing.


To begin with, I still encountered several issues with your patch (considering the patch is at the RFC stage, I think it is already quite commendable):

1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff.
    
    ```
    200000000-5bffffff : dax6.0
      200000000-5bffffff : System RAM (kmem)
    5c0001128-5c00011b7 : port1
    5d0000000-64ffffff : CXL Window 0
      5d0000000-64ffffff : region0
        5d0000000-64ffffff : dax0.0
          5d0000000-64ffffff : System RAM (kmem)
    680000000-e7ffffff : PCI Bus 0000:00

    [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug
    [    0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic
    [    0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved
    [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved
    [    0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug
    ```

2. Triggers dev_warn and dev_err:
    
    ```
    [root@rdma-server ~]# journalctl -p err -p warning --dmesg
    ...snip...
    Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2
    Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12
    Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12
    Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region
    Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16
    ```

3. When CXL_REGION is disabled, there is a failure to fall back to dax_hmem, in which case only CXL Window X is visible.
    
    On failure:
    
    ```
    100000000-27ffffff : System RAM
    5c0001128-5c00011b7 : port1
    5c0011128-5c00111b7 : port2
    5d0000000-6cffffff : CXL Window 0
    6d0000000-7cffffff : CXL Window 1
    7000000000-700000ffff : PCI Bus 0000:0c
      7000000000-700000ffff : 0000:0c:00.0
        7000001080-70000010d7 : mem1
    ```

    On success:
    
    ```
    5d0000000-7cffffff : dax0.0
      5d0000000-7cffffff : System RAM (kmem)
        5d0000000-6cffffff : CXL Window 0
        6d0000000-7cffffff : CXL Window 1
    ```

In terms of issues 1 and 2, these arise because hmem_register_device() attempts to register the resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this.

```
-   rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
-                          IORES_DESC_SOFT_RESERVED);
-   if (rc != REGION_INTERSECTS)
-       return 0;
+   /* TODO: insert "Soft Reserved" into iomem here */
```
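
For example, the TODO could end up looking roughly like the sketch below
(hmem_publish_soft_reserve() is a made-up name, just to illustrate the
idea of publishing the deferred range before handing it to device-dax):

```
#include <linux/ioport.h>
#include <linux/slab.h>

/* hypothetical helper: move a deferred "Soft Reserved" range into iomem */
static int hmem_publish_soft_reserve(const struct resource *res)
{
	struct resource *sr = kzalloc(sizeof(*sr), GFP_KERNEL);

	if (!sr)
		return -ENOMEM;

	sr->name  = "Soft Reserved";
	sr->start = res->start;
	sr->end   = res->end;
	sr->flags = IORESOURCE_MEM;
	sr->desc  = IORES_DESC_SOFT_RESERVED;

	return insert_resource(&iomem_resource, sr);
}
```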

Regarding issue 3 (which also exists today), this could be because nothing ensures that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled.

I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration upstream during the v6.18 merge window.
Besides the current TODO, you also mentioned that this RFC patch must be further subdivided into several patches, so there remains significant work to be done.
If my understanding is correct, you will personally continue to push this patch forward, right?


Smita,

Do you have any additional thoughts on this proposal from your side?


Thanks
Zhijian




> -- 8< --
> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> index d656e4c0eb84..3683bb3f2311 100644
> --- a/drivers/dax/Kconfig
> +++ b/drivers/dax/Kconfig
> @@ -48,6 +48,8 @@ config DEV_DAX_CXL
>   	tristate "CXL DAX: direct access to CXL RAM regions"
>   	depends on CXL_BUS && CXL_REGION && DEV_DAX
>   	default CXL_REGION && DEV_DAX
> +	depends on CXL_ACPI >= DEV_DAX_HMEM
> +	depends on CXL_PCI >= DEV_DAX_HMEM
>   	help
>   	  CXL RAM regions are either mapped by platform-firmware
>   	  and published in the initial system-memory map as "System RAM", mapped
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 0916478e3817..8bcd104111a8 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -103,7 +103,7 @@ static int hmem_register_device(struct device *host, int target_nid,
>   	long id;
>   	int rc;
>   
> -	if (IS_ENABLED(CONFIG_CXL_REGION) &&
> +	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>   	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>   			      IORES_DESC_CXL) != REGION_DISJOINT) {
>   		switch (dax_cxl_mode) {
> @@ -209,7 +209,7 @@ static __init int dax_hmem_init(void)
>   	 * CXL topology discovery at least once before scanning the
>   	 * iomem resource tree for IORES_DESC_CXL resources.
>   	 */
> -	if (IS_ENABLED(CONFIG_CXL_REGION)) {
> +	if (IS_ENABLED(CONFIG_DEV_DAX_CXL)) {
>   		request_module("cxl_acpi");
>   		request_module("cxl_pci");
>   	}
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Alison Schofield 1 month, 2 weeks ago
On Tue, Aug 05, 2025 at 03:58:41AM +0000, Zhijian Li (Fujitsu) wrote:
> Hi Dan and Smita,
> 
> 
> On 24/07/2025 00:13, dan.j.williams@intel.com wrote:
> > dan.j.williams@ wrote:
> > [..]
> >> If the goal is: "I want to give device-dax a point at which it can make
> >> a go / no-go decision about whether the CXL subsystem has properly
> >> assembled all CXL regions implied by Soft Reserved intersecting with
> >> CXL Windows." Then that is something like the below, only lightly tested
> >> and likely regresses the non-CXL case.
> >>
> >> -- 8< --
> >>  From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001
> >> From: Dan Williams <dan.j.williams@intel.com>
> >> Date: Tue, 22 Jul 2025 16:11:08 -0700
> >> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration
> > 
> > Likely needs this incremental change to prevent DEV_DAX_HMEM from being
> > built-in when CXL is not. This still leaves the awkward scenario of CXL
> > enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that
> > safely fails in devdax only / fallback mode, but something to
> > investigate when respinning on top of this.
> > 
> 
> Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing.
> 
> 
> To begin with, I still encountered several issues with your patch (considering the patch is at the RFC stage, I think it is already quite commendable):

Hi Zhijian,

Like you, I tried this RFC out. It resolved the issue of soft reserved
resources preventing teardown and replacement of a region in place.

I looked at the issues you found, and have some questions and comments
included below.

> 
> 1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff.
>     
>     ```
>     200000000-5bffffff : dax6.0
>       200000000-5bffffff : System RAM (kmem)
>     5c0001128-5c00011b7 : port1
>     5d0000000-64ffffff : CXL Window 0
>       5d0000000-64ffffff : region0
>         5d0000000-64ffffff : dax0.0
>           5d0000000-64ffffff : System RAM (kmem)
>     680000000-e7ffffff : PCI Bus 0000:00
> 
>     [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug
>     [    0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic
>     [    0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved
>     [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved
>     [    0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug
>     ```

Is that range also labelled as soft reserved?  
I ask, because I'm trying to draw a parallel between our test platforms.
I see - 

[] BIOS-e820: [mem 0x0000024080000000-0x000004407fffffff] soft reserved
.
.
[] reserve setup_data: [mem 0x0000024080000000-0x000004407fffffff] soft reserved
.
.
[] ACPI: SRAT: Node 6 PXM 14 [mem 0x24080000000-0x4407fffffff] hotplug

/proc/iomem - as expected
24080000000-5f77fffffff : CXL Window 0
  24080000000-4407fffffff : region0
    24080000000-4407fffffff : dax0.0
      24080000000-4407fffffff : System RAM (kmem)


I'm also seeing this message:
[] resource: Unaddressable device  [mem 0x24080000000-0x4407fffffff] conflicts with [mem 0x24080000000-0x4407fffffff]

> 
> 2. Triggers dev_warn and dev_err:
>     
>     ```
>     [root@rdma-server ~]# journalctl -p err -p warning --dmesg
>     ...snip...
>     Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2
>     Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12
>     Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12
>     Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region
>     Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16

I see the kmem dax messages also. It seems the kmem probe is going after
every range (except hotplug) in the SRAT, and failing.

>     ```
> 
> 3. When CXL_REGION is disabled, there is a failure to fall back to dax_hmem, in which case only CXL Window X is visible.

Haven't tested !CXL_REGION yet.

>     
>     On failure:
>     
>     ```
>     100000000-27ffffff : System RAM
>     5c0001128-5c00011b7 : port1
>     5c0011128-5c00111b7 : port2
>     5d0000000-6cffffff : CXL Window 0
>     6d0000000-7cffffff : CXL Window 1
>     7000000000-700000ffff : PCI Bus 0000:0c
>       7000000000-700000ffff : 0000:0c:00.0
>         7000001080-70000010d7 : mem1
>     ```
> 
>     On success:
>     
>     ```
>     5d0000000-7cffffff : dax0.0
>       5d0000000-7cffffff : System RAM (kmem)
>         5d0000000-6cffffff : CXL Window 0
>         6d0000000-7cffffff : CXL Window 1
>     ```
> 
> In terms of issues 1 and 2, these arise because hmem_register_device() attempts to register the resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this.
> 
> ```
> -   rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> -                          IORES_DESC_SOFT_RESERVED);
> -   if (rc != REGION_INTERSECTS)
> -       return 0;
> +   /* TODO: insert "Soft Reserved" into iomem here */
> ```

Above makes sense.

I'll probably wait for an update from Smita to test again, but if you
or Smita have anything you want me to try out on my hardware in the
meantime, let me know.

-- Alison


> 
> Regarding issue 3 (which also exists today), this could be because nothing ensures that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled.
> 
> I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration upstream during the v6.18 merge window.
> Besides the current TODO, you also mentioned that this RFC patch must be further subdivided into several patches, so there remains significant work to be done.
> If my understanding is correct, you will personally continue to push this patch forward, right?
> 
> 
> Smita,
> 
> Do you have any additional thoughts on this proposal from your side?
> 
> 
> Thanks
> Zhijian
> 
snip
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Zhijian Li (Fujitsu) 1 month, 2 weeks ago

On 21/08/2025 07:14, Alison Schofield wrote:
> On Tue, Aug 05, 2025 at 03:58:41AM +0000, Zhijian Li (Fujitsu) wrote:
>> Hi Dan and Smita,
>>
>>
>> On 24/07/2025 00:13, dan.j.williams@intel.com wrote:
>>> dan.j.williams@ wrote:
>>> [..]
>>>> If the goal is: "I want to give device-dax a point at which it can make
>>>> a go / no-go decision about whether the CXL subsystem has properly
>>>> assembled all CXL regions implied by Soft Reserved intersecting with
>>>> CXL Windows." Then that is something like the below, only lightly tested
>>>> and likely regresses the non-CXL case.
>>>>
>>>> -- 8< --
>>>>   From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001
>>>> From: Dan Williams <dan.j.williams@intel.com>
>>>> Date: Tue, 22 Jul 2025 16:11:08 -0700
>>>> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration
>>>
>>> Likely needs this incremental change to prevent DEV_DAX_HMEM from being
>>> built-in when CXL is not. This still leaves the awkward scenario of CXL
>>> enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that
>>> safely fails in devdax only / fallback mode, but something to
>>> investigate when respinning on top of this.
>>>
>>
>> Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing.
>>
>>
>> To begin with, I still encountered several issues with your patch (considering the patch is at the RFC stage, I think it is already quite commendable):
> 
> Hi Zhijian,
> 
> Like you, I tried this RFC out. It resolved the issue of soft reserved
> resources preventing teardown and replacement of a region in place.
> 
> I looked at the issues you found, and have some questions and comments
> included below.
> 
>>
>> 1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff.
>>      
>>      ```
>>      200000000-5bffffff : dax6.0
>>        200000000-5bffffff : System RAM (kmem)
>>      5c0001128-5c00011b7 : port1
>>      5d0000000-64ffffff : CXL Window 0
>>        5d0000000-64ffffff : region0
>>          5d0000000-64ffffff : dax0.0
>>            5d0000000-64ffffff : System RAM (kmem)
>>      680000000-e7ffffff : PCI Bus 0000:00
>>
>>      [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug
>>      [    0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic
>>      [    0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved
>>      [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved
>>      [    0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug
>>      ```
> 
> Is that range also labelled as soft reserved?
> I ask, because I'm trying to draw a parallel between our test platforms.

No, it's not a soft reserved range. It can simply be simulated with QEMU using the `maxmem=192G` option (see the full QEMU command line below).
In my environment, `0x200000000-0x5bffffff` is roughly [DRAM_END + 1, DRAM_END + maxmem - TOTAL_INSTALLED_DRAM_SIZE], where
DRAM_END is the end of the installed DRAM in Node 3.

This range is reserved for DRAM hot-add. In my case, it gets registered into the 'HMEM devices' resource tree by hmem_register_resource() in HMAT (drivers/acpi/numa/hmat.c):

static void hmat_register_target_devices(struct memory_target *target)
{
	struct resource *res;

	/*
	 * Do not bother creating devices if no driver is available to
	 * consume them.
	 */
	if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
		return;

	for (res = target->memregions.child; res; res = res->sibling) {
		int target_nid = pxm_to_node(target->memory_pxm);

		hmem_register_resource(target_nid, res);
	}
}


$ dmesg | grep -i -e soft -e hotplug -e Node
[    0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan-00026-g1473b9914846-dirty root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 conc
[    0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved
[    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064fffffff] soft reserved
[    0.066332] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0x0009ffff]
[    0.067665] ACPI: SRAT: Node 0 PXM 0 [mem 0x00100000-0x7fffffff]
[    0.068995] ACPI: SRAT: Node 1 PXM 1 [mem 0x100000000-0x17fffffff]
[    0.070359] ACPI: SRAT: Node 2 PXM 2 [mem 0x180000000-0x1bfffffff]
[    0.071723] ACPI: SRAT: Node 3 PXM 3 [mem 0x1c0000000-0x1ffffffff]
[    0.073085] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bfffffff] hotplug
[    0.075689] NUMA: Node 0 [mem 0x00001000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00001000-0x7fffffff]
[    0.077849] NODE_DATA(0) allocated [mem 0x7ffb3e00-0x7ffdefff]
[    0.079149] NODE_DATA(1) allocated [mem 0x17ffd1e00-0x17fffcfff]
[    0.086077] Movable zone start for each node
[    0.087054] Early memory node ranges
[    0.087890]   node   0: [mem 0x0000000000001000-0x000000000009efff]
[    0.089264]   node   0: [mem 0x0000000000100000-0x000000007ffdefff]
[    0.090631]   node   1: [mem 0x0000000100000000-0x000000017fffffff]
[    0.092003] Initmem setup node 0 [mem 0x0000000000001000-0x000000007ffdefff]
[    0.093532] Initmem setup node 1 [mem 0x0000000100000000-0x000000017fffffff]
[    0.095164] Initmem setup node 2 as memoryless
[    0.096281] Initmem setup node 3 as memoryless
[    0.097397] Initmem setup node 4 as memoryless
[    0.098444] On node 0, zone DMA: 1 pages in unavailable ranges
[    0.099866] On node 0, zone DMA: 97 pages in unavailable ranges
[    0.104342] On node 1, zone Normal: 33 pages in unavailable ranges
[    0.126883] CPU topo: Allowing 4 present CPUs plus 0 hotplug CPUs

=================================

Please note that this is a modified QEMU.

/home/lizhijian/qemu/build-hmem/qemu-system-x86_64 -machine q35,accel=kvm,cxl=on,hmat=on \
-name guest-rdma-server -nographic -boot c \
-m size=6G,slots=2,maxmem=19922944k \
-hda /home/lizhijian/images/Fedora-rdma-server.qcow2 \
-object memory-backend-memfd,share=on,size=2G,id=m0 \
-object memory-backend-memfd,share=on,size=2G,id=m1 \
-numa node,nodeid=0,cpus=0-1,memdev=m0 \
-numa node,nodeid=1,cpus=2-3,memdev=m1 \
-smp 4,sockets=2,cores=2 \
-device pcie-root-port,id=pci-root,slot=8,bus=pcie.0,chassis=0 \
-device pxb-cxl,id=pxb-cxl-host-bridge,bus=pcie.0,bus_nr=0x35,hdm_for_passthrough=true \
-device cxl-rp,id=cxl-rp-hb-rp0,bus=pxb-cxl-host-bridge,chassis=0,slot=0,port=0 \
-device cxl-type3,bus=cxl-rp-hb-rp0,volatile-memdev=cxl-vmem0,id=cxl-vmem0,program-hdm-decoder=true \
-object memory-backend-file,id=cxl-vmem0,share=on,mem-path=/home/lizhijian/images/cxltest0.raw,size=2048M \
-M cxl-fmw.0.targets.0=pxb-cxl-host-bridge,cxl-fmw.0.size=2G,cxl-fmw.0.interleave-granularity=8k \
-nic bridge,br=virbr0,model=e1000,mac=52:54:00:c9:76:74 \
-bios /home/lizhijian/seabios/out/bios.bin \
-object memory-backend-memfd,share=on,size=1G,id=m2 \
-object memory-backend-memfd,share=on,size=1G,id=m3 \
-numa node,memdev=m2,nodeid=2 \
-numa node,memdev=m3,nodeid=3 \
-numa dist,src=0,dst=0,val=10 \
-numa dist,src=0,dst=1,val=21 \
-numa dist,src=0,dst=2,val=21 \
-numa dist,src=0,dst=3,val=21 \
-numa dist,src=1,dst=0,val=21 \
-numa dist,src=1,dst=1,val=10 \
-numa dist,src=1,dst=2,val=21 \
-numa dist,src=1,dst=3,val=21 \
-numa dist,src=2,dst=0,val=21 \
-numa dist,src=2,dst=1,val=21 \
-numa dist,src=2,dst=2,val=10 \
-numa dist,src=2,dst=3,val=21 \
-numa dist,src=3,dst=0,val=21 \
-numa dist,src=3,dst=1,val=21 \
-numa dist,src=3,dst=2,val=21 \
-numa dist,src=3,dst=3,val=10 \
-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=110 \
-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \
-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=240 \
-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \
-numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,latency=340 \
-numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \
-numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,latency=440 \
-numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M \
-numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,latency=240 \
-numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \
-numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,latency=110 \
-numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \
-numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,latency=340 \
-numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \
-numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,latency=440 \
-numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M



> I see -
> 
> [] BIOS-e820: [mem 0x0000024080000000-0x000004407fffffff] soft reserved
> .
> .
> [] reserve setup_data: [mem 0x0000024080000000-0x000004407fffffff] soft reserved
> .
> .
> [] ACPI: SRAT: Node 6 PXM 14 [mem 0x24080000000-0x4407fffffff] hotplug
> 
> /proc/iomem - as expected
> 24080000000-5f77fffffff : CXL Window 0
>    24080000000-4407fffffff : region0
>      24080000000-4407fffffff : dax0.0
>        24080000000-4407fffffff : System RAM (kmem)
> 
> 
> I'm also seeing this message:
> [] resource: Unaddressable device  [mem 0x24080000000-0x4407fffffff] conflicts with [mem 0x24080000000-0x4407fffffff]
> 
>>
>> 2. Triggers dev_warn and dev_err:
>>      
>>      ```
>>      [root@rdma-server ~]# journalctl -p err -p warning --dmesg
>>      ...snip...
>>      Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2
>>      Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12
>>      Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12
>>      Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region
>>      Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16
> 
> I see the kmem dax messages also. It seems the kmem probe is going after
> every range (except hotplug) in the SRAT, and failing.

Yes, that's true, because the current RFC removed the code that filters out non-soft-reserved resources. As a result, it tries to register dax/kmem for all of them, while some of them have already been marked busy in iomem_resource.

>> -   rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>> -                          IORES_DESC_SOFT_RESERVED);
>> -   if (rc != REGION_INTERSECTS)
>> -       return 0;


Here is another example from my real *CXL HOST*:
Aug 19 17:59:05  kernel: device-mapper: core: CONFIG_IMA_DISABLE_HTABLE is disabled. Duplicate IMA measuremen>
Aug 19 17:59:09  kernel: power_meter ACPI000D:00: Ignoring unsafe software power cap!
Aug 19 17:59:09  kernel: kmem dax2.0: mapping0: 0x0-0x8fffffff could not reserve region
Aug 19 17:59:09  kernel: kmem dax2.0: probe with driver kmem failed with error -16
Aug 19 17:59:09  kernel: kmem dax3.0: mapping0: 0x100000000-0x86fffffff could not reserve region
Aug 19 17:59:09  kernel: kmem dax3.0: probe with driver kmem failed with error -16
Aug 19 17:59:09  kernel: kmem dax4.0: mapping0: 0x870000000-0x106fffffff could not reserve region
Aug 19 17:59:09  kernel: kmem dax4.0: probe with driver kmem failed with error -16
Aug 19 17:59:19  kernel: nvme nvme0: using unchecked data buffer
Aug 19 18:36:27  kernel: block nvme1n1: No UUID available providing old NGUID
lizhijian@:~$ sudo grep -w -e 106fffffff -e 870000000 -e 8fffffff -e 100000000 /proc/iomem
6fffb000-8fffffff : Reserved
100000000-10000ffff : Reserved
106ccc0000-106fffffff : Reserved


This issue can be resolved by re-introducing a soft-reserved region_intersects(...) check, I guess.



> 
>>      ```
>>
>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible.
> 
> Haven't tested !CXL_REGION yet.
> 
>>      
>>      On failure:
>>      
>>      ```
>>      100000000-27ffffff : System RAM
>>      5c0001128-5c00011b7 : port1
>>      5c0011128-5c00111b7 : port2
>>      5d0000000-6cffffff : CXL Window 0
>>      6d0000000-7cffffff : CXL Window 1
>>      7000000000-700000ffff : PCI Bus 0000:0c
>>        7000000000-700000ffff : 0000:0c:00.0
>>          7000001080-70000010d7 : mem1
>>      ```
>>
>>      On success:
>>      
>>      ```
>>      5d0000000-7cffffff : dax0.0
>>        5d0000000-7cffffff : System RAM (kmem)
>>          5d0000000-6cffffff : CXL Window 0
>>          6d0000000-7cffffff : CXL Window 1
>>      ```
>>
>> In term of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this.
>>
>> ```
>> -   rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>> -                          IORES_DESC_SOFT_RESERVED);
>> -   if (rc != REGION_INTERSECTS)
>> -       return 0;
>> +   /* TODO: insert "Soft Reserved" into iomem here */
>> ```
> 
> Above makes sense.

I think the add_soft_reserved() subroutine in your previous patchset [1] is able to cover this TODO.

> 
> I'll probably wait for an update from Smita to test again, but if you
> or Smita have anything you want me to try out on my hardware in the
> meantime, let me know.
> 

Here is my local fixup based on Dan's RFC; it can resolve issues 1 and 2.


-- 8< --
  commit e7ccd7a01e168e185971da66f4aa13eb451caeaf
Author: Li Zhijian <lizhijian@fujitsu.com>
Date:   Fri Aug 20 11:07:15 2025 +0800

     Fix probe-order TODO
     
     Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>

diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 754115da86cc..965ffc622136 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -93,6 +93,26 @@ static void process_defer_work(struct work_struct *_work)
  	walk_hmem_resources(&pdev->dev, handle_deferred_cxl);
  }
  
+static int add_soft_reserved(resource_size_t start, resource_size_t len,
+			     unsigned long flags)
+{
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+	int rc;
+
+	if (!res)
+		return -ENOMEM;
+
+	*res = DEFINE_RES_NAMED_DESC(start, len, "Soft Reserved",
+				     flags | IORESOURCE_MEM,
+				     IORES_DESC_SOFT_RESERVED);
+
+	rc = insert_resource(&iomem_resource, res);
+	if (rc)
+		kfree(res);
+
+	return rc;
+}
+
  static int hmem_register_device(struct device *host, int target_nid,
  				const struct resource *res)
  {
@@ -102,6 +122,10 @@ static int hmem_register_device(struct device *host, int target_nid,
  	long id;
  	int rc;
  
+	if (soft_reserve_res_intersects(res->start, resource_size(res),
+		      IORESOURCE_MEM, IORES_DESC_NONE) == REGION_DISJOINT)
+		return 0;
+
  	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
  	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
  			      IORES_DESC_CXL) != REGION_DISJOINT) {
@@ -119,7 +143,17 @@ static int hmem_register_device(struct device *host, int target_nid,
  		}
  	}
  
-	/* TODO: insert "Soft Reserved" into iomem here */
+	/*
+	 * This is a verified Soft Reserved region that CXL is not claiming (or
+	 * is being overridden). Add it to the main iomem tree so it can be
+	 * properly reserved by the DAX driver.
+	 */
+	rc = add_soft_reserved(res->start, res->end - res->start + 1, 0);
+	if (rc) {
+		dev_warn(host, "failed to insert soft-reserved resource %pr into iomem: %d\n",
+			 res, rc);
+		return rc;
+	}
  
  	id = memregion_alloc(GFP_KERNEL);
  	if (id < 0) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 349f0d9aad22..eca5956c444b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1069,6 +1069,8 @@ enum {
  int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
  		      unsigned long desc);
  
+int soft_reserve_res_intersects(resource_size_t offset, size_t size, unsigned long flags,
+		      unsigned long desc);
  /* Support for virtually mapped pages */
  struct page *vmalloc_to_page(const void *addr);
  unsigned long vmalloc_to_pfn(const void *addr);
diff --git a/kernel/resource.c b/kernel/resource.c
index b8eac6af2fad..a34b76cf690a 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -461,6 +461,22 @@ int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags,
  			     arg, func);
  }
  EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc);
+
+static int __region_intersects(struct resource *parent, resource_size_t start,
+			       size_t size, unsigned long flags,
+			       unsigned long desc);
+int soft_reserve_res_intersects(resource_size_t start, size_t size, unsigned long flags,
+		      unsigned long desc)
+{
+	int ret;
+
+	read_lock(&resource_lock);
+	ret = __region_intersects(&soft_reserve_resource, start, size, flags, desc);
+	read_unlock(&resource_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(soft_reserve_res_intersects);
  #endif
  
  /*



[1] https://lore.kernel.org/linux-cxl/29312c0765224ae76862d59a17748c8188fb95f1.1692638817.git.alison.schofield@intel.com/


> -- Alison
> 
> 
>>
>> Regarding issue 3 (which exists in the current situation), this could be because it cannot ensure that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled.
>>
>> I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration into the upstream during the v6.18 merge window.
>> Besides the current TODO, you also mentioned that this RFC PATCH must be further subdivided into several patches, so there remains significant work to be done.
>> If my understanding is correct, you would be personally continuing to push forward this patch, right?
>>
>>
>> Smita,
>>
>> Do you have any additional thoughts on this proposal from your side?
>>
>>
>> Thanks
>> Zhijian
>>
> snip
> 
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Koralahalli Channabasappa, Smita 1 month, 2 weeks ago
On 8/20/2025 7:30 PM, Zhijian Li (Fujitsu) wrote:
> 
> 
> On 21/08/2025 07:14, Alison Schofield wrote:
>> On Tue, Aug 05, 2025 at 03:58:41AM +0000, Zhijian Li (Fujitsu) wrote:
>>> Hi Dan and Smita,
>>>
>>>
>>> On 24/07/2025 00:13, dan.j.williams@intel.com wrote:
>>>> dan.j.williams@ wrote:
>>>> [..]
>>>>> If the goal is: "I want to give device-dax a point at which it can make
>>>>> a go / no-go decision about whether the CXL subsystem has properly
>>>>> assembled all CXL regions implied by Soft Reserved instersecting with
>>>>> CXL Windows." Then that is something like the below, only lightly tested
>>>>> and likely regresses the non-CXL case.
>>>>>
>>>>> -- 8< --
>>>>>    From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001
>>>>> From: Dan Williams <dan.j.williams@intel.com>
>>>>> Date: Tue, 22 Jul 2025 16:11:08 -0700
>>>>> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration
>>>>
>>>> Likely needs this incremental change to prevent DEV_DAX_HMEM from being
>>>> built-in when CXL is not. This still leaves the awkward scenario of CXL
>>>> enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that
>>>> safely fails in devdax only / fallback mode, but something to
>>>> investigate when respinning on top of this.
>>>>
>>>
>>> Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing.
>>>
>>>
>>> To begin with, I still encountered several issues with your patch (considering the patch at the RFC stage, I think it is already quite commendable):
>>
>> Hi Zhijian,
>>
>> Like you, I tried this RFC out. It resolved the issue of soft reserved
>> resources preventing teardown and replacement of a region in place.
>>
>> I looked at the issues you found, and have some questions comments
>> included below.
>>
>>>
>>> 1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff.
>>>       
>>>       ```
>>>       200000000-5bffffff : dax6.0
>>>         200000000-5bffffff : System RAM (kmem)
>>>       5c0001128-5c00011b7 : port1
>>>       5d0000000-64ffffff : CXL Window 0
>>>         5d0000000-64ffffff : region0
>>>           5d0000000-64ffffff : dax0.0
>>>             5d0000000-64ffffff : System RAM (kmem)
>>>       680000000-e7ffffff : PCI Bus 0000:00
>>>
>>>       [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug
>>>       [    0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic
>>>       [    0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved
>>>       [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved
>>>       [    0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug
>>>       ```
>>
>> Is that range also labelled as soft reserved?
>> I ask, because I'm trying to draw a parallel between our test platforms.
> 
> No, It's not a soft reserved range. This can simply simulate with QEMU with `maxmem=192G` option(see below full qemu command line).
> In my environment, `0x200000000-0x5bffffff` is something like [DRAM_END + 1, DRAM_END + maxmem - TOTAL_INSTALLED_DRAM_SIZE]
> DRAM_END: end of the installed DRAM in Node 3
> 
> This range is reserved for the DRAM hot-add. In my case, it will be registered into 'HMEM devices' by calling hmem_register_resource in HMAT(drivers/acpi/numa/hmat.c)
> 
>    893 static void hmat_register_target_devices(struct memory_target *target)
>    894 {
>    895         struct resource *res;
>    896
>    897         /*
>    898          * Do not bother creating devices if no driver is available to
>    899          * consume them.
>    900          */
>    901         if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
>    902                 return;
>    903
>    904         for (res = target->memregions.child; res; res = res->sibling) {
>    905                 int target_nid = pxm_to_node(target->memory_pxm);
>    906
>    907                 hmem_register_resource(target_nid, res);
>    908         }
>    909 }
> 
> 
> $ dmesg | grep -i -e soft -e hotplug -e Node
> [    0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan-00026-g1473b9914846-dirty root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 conc
> [    0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved
> [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064fffffff] soft reserved
> [    0.066332] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0x0009ffff]
> [    0.067665] ACPI: SRAT: Node 0 PXM 0 [mem 0x00100000-0x7fffffff]
> [    0.068995] ACPI: SRAT: Node 1 PXM 1 [mem 0x100000000-0x17fffffff]
> [    0.070359] ACPI: SRAT: Node 2 PXM 2 [mem 0x180000000-0x1bfffffff]
> [    0.071723] ACPI: SRAT: Node 3 PXM 3 [mem 0x1c0000000-0x1ffffffff]
> [    0.073085] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bfffffff] hotplug
> [    0.075689] NUMA: Node 0 [mem 0x00001000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00001000-0x7fffffff]
> [    0.077849] NODE_DATA(0) allocated [mem 0x7ffb3e00-0x7ffdefff]
> [    0.079149] NODE_DATA(1) allocated [mem 0x17ffd1e00-0x17fffcfff]
> [    0.086077] Movable zone start for each node
> [    0.087054] Early memory node ranges
> [    0.087890]   node   0: [mem 0x0000000000001000-0x000000000009efff]
> [    0.089264]   node   0: [mem 0x0000000000100000-0x000000007ffdefff]
> [    0.090631]   node   1: [mem 0x0000000100000000-0x000000017fffffff]
> [    0.092003] Initmem setup node 0 [mem 0x0000000000001000-0x000000007ffdefff]
> [    0.093532] Initmem setup node 1 [mem 0x0000000100000000-0x000000017fffffff]
> [    0.095164] Initmem setup node 2 as memoryless
> [    0.096281] Initmem setup node 3 as memoryless
> [    0.097397] Initmem setup node 4 as memoryless
> [    0.098444] On node 0, zone DMA: 1 pages in unavailable ranges
> [    0.099866] On node 0, zone DMA: 97 pages in unavailable ranges
> [    0.104342] On node 1, zone Normal: 33 pages in unavailable ranges
> [    0.126883] CPU topo: Allowing 4 present CPUs plus 0 hotplug CPUs
> 
> =================================
> 
> Please note that this is a modified QEMU.
> 
> /home/lizhijian/qemu/build-hmem/qemu-system-x86_64 -machine q35,accel=kvm,cxl=on,hmat=on \
> -name guest-rdma-server -nographic -boot c \
> -m size=6G,slots=2,maxmem=19922944k \
> -hda /home/lizhijian/images/Fedora-rdma-server.qcow2 \
> -object memory-backend-memfd,share=on,size=2G,id=m0 \
> -object memory-backend-memfd,share=on,size=2G,id=m1 \
> -numa node,nodeid=0,cpus=0-1,memdev=m0 \
> -numa node,nodeid=1,cpus=2-3,memdev=m1 \
> -smp 4,sockets=2,cores=2 \
> -device pcie-root-port,id=pci-root,slot=8,bus=pcie.0,chassis=0 \
> -device pxb-cxl,id=pxb-cxl-host-bridge,bus=pcie.0,bus_nr=0x35,hdm_for_passthrough=true \
> -device cxl-rp,id=cxl-rp-hb-rp0,bus=pxb-cxl-host-bridge,chassis=0,slot=0,port=0 \
> -device cxl-type3,bus=cxl-rp-hb-rp0,volatile-memdev=cxl-vmem0,id=cxl-vmem0,program-hdm-decoder=true \
> -object memory-backend-file,id=cxl-vmem0,share=on,mem-path=/home/lizhijian/images/cxltest0.raw,size=2048M \
> -M cxl-fmw.0.targets.0=pxb-cxl-host-bridge,cxl-fmw.0.size=2G,cxl-fmw.0.interleave-granularity=8k \
> -nic bridge,br=virbr0,model=e1000,mac=52:54:00:c9:76:74 \
> -bios /home/lizhijian/seabios/out/bios.bin \
> -object memory-backend-memfd,share=on,size=1G,id=m2 \
> -object memory-backend-memfd,share=on,size=1G,id=m3 \
> -numa node,memdev=m2,nodeid=2 \
> -numa node,memdev=m3,nodeid=3 \
> -numa dist,src=0,dst=0,val=10 \
> -numa dist,src=0,dst=1,val=21 \
> -numa dist,src=0,dst=2,val=21 \
> -numa dist,src=0,dst=3,val=21 \
> -numa dist,src=1,dst=0,val=21 \
> -numa dist,src=1,dst=1,val=10 \
> -numa dist,src=1,dst=2,val=21 \
> -numa dist,src=1,dst=3,val=21 \
> -numa dist,src=2,dst=0,val=21 \
> -numa dist,src=2,dst=1,val=21 \
> -numa dist,src=2,dst=2,val=10 \
> -numa dist,src=2,dst=3,val=21 \
> -numa dist,src=3,dst=0,val=21 \
> -numa dist,src=3,dst=1,val=21 \
> -numa dist,src=3,dst=2,val=21 \
> -numa dist,src=3,dst=3,val=10 \
> -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=110 \
> -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \
> -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=240 \
> -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \
> -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,latency=340 \
> -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \
> -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,latency=440 \
> -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M \
> -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,latency=240 \
> -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \
> -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,latency=110 \
> -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \
> -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,latency=340 \
> -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \
> -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,latency=440 \
> -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M
> 
> 
> 
>> I see -
>>
>> [] BIOS-e820: [mem 0x0000024080000000-0x000004407fffffff] soft reserved
>> .
>> .
>> [] reserve setup_data: [mem 0x0000024080000000-0x000004407fffffff] soft reserved
>> .
>> .
>> [] ACPI: SRAT: Node 6 PXM 14 [mem 0x24080000000-0x4407fffffff] hotplug
>>
>> /proc/iomem - as expected
>> 24080000000-5f77fffffff : CXL Window 0
>>     24080000000-4407fffffff : region0
>>       24080000000-4407fffffff : dax0.0
>>         24080000000-4407fffffff : System RAM (kmem)
>>
>>
>> I'm also seeing this message:
>> [] resource: Unaddressable device  [mem 0x24080000000-0x4407fffffff] conflicts with [mem 0x24080000000-0x4407fffffff]
>>
>>>
>>> 2. Triggers dev_warn and dev_err:
>>>       
>>>       ```
>>>       [root@rdma-server ~]# journalctl -p err -p warning --dmesg
>>>       ...snip...
>>>       Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2
>>>       Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12
>>>       Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12
>>>       Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region
>>>       Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16
>>
>> I see the kmem dax messages also. It seems the kmem probe is going after
>> every range (except hotplug) in the SRAT, and failing.
> 
> Yes, that's true, because current RFC removed the code that filters out the non-soft-reserverd resource. As a result, it will try to register dax/kmem for all of them while some of them has been marked as busy in the iomem_resource.
> 
>>> -   rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>>> -                          IORES_DESC_SOFT_RESERVED);
>>> -   if (rc != REGION_INTERSECTS)
>>> -       return 0;
> 
> 
> This is another example on my real *CXL HOST*:
> Aug 19 17:59:05  kernel: device-mapper: core: CONFIG_IMA_DISABLE_HTABLE is disabled. Duplicate IMA measuremen>
> Aug 19 17:59:09  kernel: power_meter ACPI000D:00: Ignoring unsafe software power cap!
> Aug 19 17:59:09  kernel: kmem dax2.0: mapping0: 0x0-0x8fffffff could not reserve region
> Aug 19 17:59:09  kernel: kmem dax2.0: probe with driver kmem failed with error -16
> Aug 19 17:59:09  kernel: kmem dax3.0: mapping0: 0x100000000-0x86fffffff could not reserve region
> Aug 19 17:59:09  kernel: kmem dax3.0: probe with driver kmem failed with error -16
> Aug 19 17:59:09  kernel: kmem dax4.0: mapping0: 0x870000000-0x106fffffff could not reserve region
> Aug 19 17:59:09  kernel: kmem dax4.0: probe with driver kmem failed with error -16
> Aug 19 17:59:19  kernel: nvme nvme0: using unchecked data buffer
> Aug 19 18:36:27  kernel: block nvme1n1: No UUID available providing old NGUID
> lizhijian@:~$ sudo grep -w -e 106fffffff -e 870000000 -e 8fffffff -e 100000000 /proc/iomem
> 6fffb000-8fffffff : Reserved
> 100000000-10000ffff : Reserved
> 106ccc0000-106fffffff : Reserved
> 
> 
> This issue can be resolved by re-introducing sort_reserved_region_intersects(...) I guess.
> 
> 
> 
>>
>>>       ```
>>>
>>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible.
>>
>> Haven't tested !CXL_REGION yet.

When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled, so 
dax_hmem should handle it. I was able to fall back to dax_hmem. But let 
me know if I'm missing something.

config DEV_DAX_CXL
         tristate "CXL DAX: direct access to CXL RAM regions"
         depends on CXL_BUS && CXL_REGION && DEV_DAX
..

>>
>>>       
>>>       On failure:
>>>       
>>>       ```
>>>       100000000-27ffffff : System RAM
>>>       5c0001128-5c00011b7 : port1
>>>       5c0011128-5c00111b7 : port2
>>>       5d0000000-6cffffff : CXL Window 0
>>>       6d0000000-7cffffff : CXL Window 1
>>>       7000000000-700000ffff : PCI Bus 0000:0c
>>>         7000000000-700000ffff : 0000:0c:00.0
>>>           7000001080-70000010d7 : mem1
>>>       ```
>>>
>>>       On success:
>>>       
>>>       ```
>>>       5d0000000-7cffffff : dax0.0
>>>         5d0000000-7cffffff : System RAM (kmem)
>>>           5d0000000-6cffffff : CXL Window 0
>>>           6d0000000-7cffffff : CXL Window 1
>>>       ```
>>>
>>> In term of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this.
>>>
>>> ```
>>> -   rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>>> -                          IORES_DESC_SOFT_RESERVED);
>>> -   if (rc != REGION_INTERSECTS)
>>> -       return 0;
>>> +   /* TODO: insert "Soft Reserved" into iomem here */
>>> ```
>>
>> Above makes sense.
> 
> I think the subroutine add_soft_reserved() in your previous patchset[1] are able to cover this TODO
> 
>>
>> I'll probably wait for an update from Smita to test again, but if you
>> or Smita have anything you want me to try out on my hardwware in the
>> meantime, let me know.
>>
> 
> Here is my local fixup based on Dan's RFC, it can resovle issue 1 and 2.

I almost have the same approach :) Sorry, I missed adding your
"Signed-off-by"; will include it in the next revision.

> 
> 
> -- 8< --
>    commit e7ccd7a01e168e185971da66f4aa13eb451caeaf
> Author: Li Zhijian <lizhijian@fujitsu.com>
> Date:   Fri Aug 20 11:07:15 2025 +0800
> 
>       Fix probe-order TODO
>       
>       Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> 
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 754115da86cc..965ffc622136 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -93,6 +93,26 @@ static void process_defer_work(struct work_struct *_work)
>    	walk_hmem_resources(&pdev->dev, handle_deferred_cxl);
>    }
>    
> +static int add_soft_reserved(resource_size_t start, resource_size_t len,
> +			     unsigned long flags)
> +{
> +	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
> +	int rc;
> +
> +	if (!res)
> +		return -ENOMEM;
> +
> +	*res = DEFINE_RES_NAMED_DESC(start, len, "Soft Reserved",
> +				     flags | IORESOURCE_MEM,
> +				     IORES_DESC_SOFT_RESERVED);
> +
> +	rc = insert_resource(&iomem_resource, res);
> +	if (rc)
> +		kfree(res);
> +
> +	return rc;
> +}
> +
>    static int hmem_register_device(struct device *host, int target_nid,
>    				const struct resource *res)
>    {
> @@ -102,6 +122,10 @@ static int hmem_register_device(struct device *host, int target_nid,
>    	long id;
>    	int rc;
> 
> +	if (soft_reserve_res_intersects(res->start, resource_size(res),
> +		      IORESOURCE_MEM, IORES_DESC_NONE) == REGION_DISJOINT)
> +		return 0;
> +

Should also handle the case where CONFIG_EFI_SOFT_RESERVE is not enabled.


Thanks
Smita

>    	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>    	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>    			      IORES_DESC_CXL) != REGION_DISJOINT) {
> @@ -119,7 +143,17 @@ static int hmem_register_device(struct device *host, int target_nid,
>    		}
>    	}
>    
> -	/* TODO: insert "Soft Reserved" into iomem here */
> +	/*
> +	 * This is a verified Soft Reserved region that CXL is not claiming (or
> +	 * is being overridden). Add it to the main iomem tree so it can be
> +	 * properly reserved by the DAX driver.
> +	 */
> +	rc = add_soft_reserved(res->start, res->end - res->start + 1, 0);
> +	if (rc) {
> +		dev_warn(host, "failed to insert soft-reserved resource %pr into iomem: %d\n",
> +			 res, rc);
> +		return rc;
> +	}
>    
>    	id = memregion_alloc(GFP_KERNEL);
>    	if (id < 0) {
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 349f0d9aad22..eca5956c444b 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1069,6 +1069,8 @@ enum {
>    int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
>    		      unsigned long desc);
>    
> +int soft_reserve_res_intersects(resource_size_t offset, size_t size, unsigned long flags,
> +		      unsigned long desc);
>    /* Support for virtually mapped pages */
>    struct page *vmalloc_to_page(const void *addr);
>    unsigned long vmalloc_to_pfn(const void *addr);
> diff --git a/kernel/resource.c b/kernel/resource.c
> index b8eac6af2fad..a34b76cf690a 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -461,6 +461,22 @@ int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags,
>    			     arg, func);
>    }
>    EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc);
> +
> +static int __region_intersects(struct resource *parent, resource_size_t start,
> +			       size_t size, unsigned long flags,
> +			       unsigned long desc);
> +int soft_reserve_res_intersects(resource_size_t start, size_t size, unsigned long flags,
> +		      unsigned long desc)
> +{
> +	int ret;
> +
> +	read_lock(&resource_lock);
> +	ret = __region_intersects(&soft_reserve_resource, start, size, flags, desc);
> +	read_unlock(&resource_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(soft_reserve_res_intersects);
>    #endif
>    
>    /*
> 
> 
> 
> [1] https://lore.kernel.org/linux-cxl/29312c0765224ae76862d59a17748c8188fb95f1.1692638817.git.alison.schofield@intel.com/
> 
> 
>> -- Alison
>>
>>
>>>
>>> Regarding issue 3 (which exists in the current situation), this could be because it cannot ensure that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled.
>>>
>>> I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration into the upstream during the v6.18 merge window.
>>> Besides the current TODO, you also mentioned that this RFC PATCH must be further subdivided into several patches, so there remains significant work to be done.
>>> If my understanding is correct, you would be personally continuing to push forward this patch, right?
>>>
>>>
>>> Smita,
>>>
>>> Do you have any additional thoughts on this proposal from your side?
>>>
>>>
>>> Thanks
>>> Zhijian
>>>
>> snip
>>
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Zhijian Li (Fujitsu) 1 month, 1 week ago

On 22/08/2025 11:56, Koralahalli Channabasappa, Smita wrote:
>>
>>>
>>>>       ```
>>>>
>>>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible.
>>>
>>> Haven't tested !CXL_REGION yet.
> 
> When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled. So dax_hmem should handle it. 

Yes, falling back to dax_hmem/kmem is the result we expect.
I haven't figured out the root cause of the issue yet, but I can tell you that in my QEMU environment
it currently fails to fall back to dax_hmem/kmem with some probability.

On failure, I observed the following warnings and errors (with my local fixup kernel):
[   12.203254] kmem dax0.0: mapping0: 0x5d0000000-0x7cfffffff could not reserve region
[   12.203437] kmem dax0.0: probe with driver kmem failed with error -16



> I was able to fallback to dax_hmem. But let me know if I'm missing something.
> 
> config DEV_DAX_CXL
>          tristate "CXL DAX: direct access to CXL RAM regions"
>          depends on CXL_BUS && CXL_REGION && DEV_DAX
> ..
> 
>>>
>>>>       On failure:
>>>>       ```
>>>>       100000000-27ffffff : System RAM
>>>>       5c0001128-5c00011b7 : port1
>>>>       5c0011128-5c00111b7 : port2
>>>>       5d0000000-6cffffff : CXL Window 0
>>>>       6d0000000-7cffffff : CXL Window 1
>>>>       7000000000-700000ffff : PCI Bus 0000:0c
>>>>         7000000000-700000ffff : 0000:0c:00.0
>>>>           7000001080-70000010d7 : mem1
>>>>       ```
>>>>
>>>>       On success:
>>>>       ```
>>>>       5d0000000-7cffffff : dax0.0
>>>>         5d0000000-7cffffff : System RAM (kmem)
>>>>           5d0000000-6cffffff : CXL Window 0
>>>>           6d0000000-7cffffff : CXL Window 1
>>>>       ```
>>>>
>>>> In term of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this.
>>>>
>>>> ```
>>>> -   rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>>>> -                          IORES_DESC_SOFT_RESERVED);
>>>> -   if (rc != REGION_INTERSECTS)
>>>> -       return 0;
>>>> +   /* TODO: insert "Soft Reserved" into iomem here */
>>>> ```
>>>
>>> Above makes sense.
>>
>> I think the subroutine add_soft_reserved() in your previous patchset[1] are able to cover this TODO
>>
>>>
>>> I'll probably wait for an update from Smita to test again, but if you
>>> or Smita have anything you want me to try out on my hardwware in the
>>> meantime, let me know.
>>>
>>
>> Here is my local fixup based on Dan's RFC, it can resovle issue 1 and 2.
> 
> I almost have the same approach 🙂 Sorry, I missed adding your
> "Signed-off-by".. Will include for next revision..

Never mind.

Glad to see your V6; I will test it and take a look at it soon.




> 
>>
>>
>> -- 8< --
>>    commit e7ccd7a01e168e185971da66f4aa13eb451caeaf
>> Author: Li Zhijian <lizhijian@fujitsu.com>
>> Date:   Fri Aug 20 11:07:15 2025 +0800
>>
>>       Fix probe-order TODO
>>       Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>
>> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
>> index 754115da86cc..965ffc622136 100644
>> --- a/drivers/dax/hmem/hmem.c
>> +++ b/drivers/dax/hmem/hmem.c
>> @@ -93,6 +93,26 @@ static void process_defer_work(struct work_struct *_work)
>>        walk_hmem_resources(&pdev->dev, handle_deferred_cxl);
>>    }
>> +static int add_soft_reserved(resource_size_t start, resource_size_t len,
>> +                 unsigned long flags)
>> +{
>> +    struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
>> +    int rc;
>> +
>> +    if (!res)
>> +        return -ENOMEM;
>> +
>> +    *res = DEFINE_RES_NAMED_DESC(start, len, "Soft Reserved",
>> +                     flags | IORESOURCE_MEM,
>> +                     IORES_DESC_SOFT_RESERVED);
>> +
>> +    rc = insert_resource(&iomem_resource, res);
>> +    if (rc)
>> +        kfree(res);
>> +
>> +    return rc;
>> +}
>> +
>>    static int hmem_register_device(struct device *host, int target_nid,
>>                    const struct resource *res)
>>    {
>> @@ -102,6 +122,10 @@ static int hmem_register_device(struct device *host, int target_nid,
>>        long id;
>>        int rc;
>>
>> +    if (soft_reserve_res_intersects(res->start, resource_size(res),
>> +              IORESOURCE_MEM, IORES_DESC_NONE) == REGION_DISJOINT)
>> +        return 0;
>> +
> 
> Should also handle CONFIG_EFI_SOFT_RESERVE not enabled case..



I think it's unnecessary. With !CONFIG_EFI_SOFT_RESERVE, hmem_register_device() will return directly, because soft_reserve_res_intersects() will always return REGION_DISJOINT.
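
If a stub turns out to be needed so the !CONFIG_EFI_SOFT_RESERVE build still has the symbol, something like the following would keep that behaviour explicit (untested sketch only, not part of my fixup; I'm assuming the declaration stays next to region_intersects() in include/linux/mm.h):

```c
/* include/linux/mm.h -- sketch, not part of the posted fixup */
#ifdef CONFIG_EFI_SOFT_RESERVE
int soft_reserve_res_intersects(resource_size_t offset, size_t size,
				unsigned long flags, unsigned long desc);
#else
static inline int soft_reserve_res_intersects(resource_size_t offset,
					      size_t size,
					      unsigned long flags,
					      unsigned long desc)
{
	/* no Soft Reserved tracking in this config, so nothing intersects */
	return REGION_DISJOINT;
}
#endif
```

Either way, hmem_register_device() sees REGION_DISJOINT and returns 0, so no extra check is needed there.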


Thanks
Zhijian

> 
> 
> Thanks
> Smita
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Zhijian Li (Fujitsu) 1 month, 1 week ago
All,


I have confirmed that in the !CXL_REGION configuration, the same environment may fail to fall back to hmem. (Your new patch cannot resolve this issue.)

In my environment:
- There are two CXL memory devices corresponding to:
   ```
   5d0000000-6cffffff : CXL Window 0
   6d0000000-7cffffff : CXL Window 1
   ```
- E820 table contains a 'soft reserved' entry:
   ```
   [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x00000007cfffffff] soft reserved
   ```

However, since my ACPI SRAT doesn't describe the CXL memory devices (this is the key point), `acpi/hmat.c` won't allocate memory targets for them. This prevents the call chain:
```c
hmat_register_target_devices() // for each SRAT-described target
   -> hmem_register_resource()
     -> insert entry into "HMEM devices" resource
```

Therefore, for a successful fallback to hmem in this environment, `dax_hmem.ko` and `kmem.ko` must request resources BEFORE `cxl_acpi.ko` inserts 'CXL Window X'.

However, the kernel cannot guarantee this initialization order.

When cxl_acpi runs before dax_kmem/kmem:
```
(built-in)                 CXL_REGION=n
driver/dax/hmem/device.c  cxl_acpi.ko      dax_hmem.ko               kmem.ko

(1) Add entry '5d0000000-7cfffffff'
                                          (2) Traverse "HMEM devices"
                                              Insert to iomem:
                                              5d0000000-7cffffff : Soft Reserved

                      (3) Insert CXL Window 0/1
                          /proc/iomem shows:
                          5d0000000-7cffffff : Soft Reserved
                            5d0000000-6cffffff : CXL Window 0
                            6d0000000-7cffffff : CXL Window 1

                                         (4) Create dax device
                                                                 (5) request_mem_region() fails
                                                                   for 5d0000000-7cffffff
                                                                   Reason: Children of 'Soft Reserved'
                                                                   (CXL Windows 0/1) don't cover full range
```

---------------------
In another environment of mine, where the ACPI SRAT has separate entries per CXL device:
1. `acpi/hmat.c` inserts two entries into "HMEM devices":
    - 5d0000000-6cffffff
    - 6d0000000-7cffffff

2. Regardless of module order, dax/kmem requests per-device resources, resulting in:
    ```
    5d0000000-7cffffff : Soft Reserved
        5d0000000-6cffffff : CXL Window 0
            5d0000000-6cffffff : dax0.0
                5d0000000-6cffffff : System RAM (kmem)
        6d0000000-7cffffff : CXL Window 1
            6d0000000-7cffffff : dax1.0
                6d0000000-7cffffff : System RAM (kmem)
    ```

Thanks,
Zhijian


On 25/08/2025 15:50, Li Zhijian wrote:
> 
> 
> On 22/08/2025 11:56, Koralahalli Channabasappa, Smita wrote:
>>>
>>>>
>>>>>       ```
>>>>>
>>>>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible.
>>>>
>>>> Haven't tested !CXL_REGION yet.
>>
>> When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled. So dax_hmem should handle it. 
> 
> Yes, falling back to dax_hmem/kmem is the result we expect.
> I haven't figured out the root cause of the issue yet, but I can tell you that in my QEMU environment,
> there is currently a certain probability that it cannot fall back to dax_hmem/kmem.
> 
> Upon its failure, I observed the following warnings and errors (with my local fixup kernel).
> [   12.203254] kmem dax0.0: mapping0: 0x5d0000000-0x7cfffffff could not reserve region
> [   12.203437] kmem dax0.0: probe with driver kmem failed with error -16
> 
> 
> 
>> I was able to fallback to dax_hmem. But let me know if I'm missing something.
>>
>> config DEV_DAX_CXL
>>          tristate "CXL DAX: direct access to CXL RAM regions"
>>          depends on CXL_BUS && CXL_REGION && DEV_DAX
>> ..
>>
>>>>
>>>>>       On failure:
>>>>>       ```
>>>>>       100000000-27ffffff : System RAM
>>>>>       5c0001128-5c00011b7 : port1
>>>>>       5c0011128-5c00111b7 : port2
>>>>>       5d0000000-6cffffff : CXL Window 0
>>>>>       6d0000000-7cffffff : CXL Window 1
>>>>>       7000000000-700000ffff : PCI Bus 0000:0c
>>>>>         7000000000-700000ffff : 0000:0c:00.0
>>>>>           7000001080-70000010d7 : mem1
>>>>>       ```
>>>>>
>>>>>       On success:
>>>>>       ```
>>>>>       5d0000000-7cffffff : dax0.0
>>>>>         5d0000000-7cffffff : System RAM (kmem)
>>>>>           5d0000000-6cffffff : CXL Window 0
>>>>>           6d0000000-7cffffff : CXL Window 1
>>>>>       ```
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Koralahalli Channabasappa, Smita 1 month, 1 week ago
Hi Zhijian,

On 8/26/2025 11:30 PM, Zhijian Li (Fujitsu) wrote:
> All,
> 
> 
> I have confirmed that in the !CXL_REGION configuration, the same environment may fail to fall back to hmem.(Your new patch cannot resolve this issue)
> 
> In my environment:
> - There are two CXL memory devices corresponding to:
>     ```
>     5d0000000-6cffffff : CXL Window 0
>     6d0000000-7cffffff : CXL Window 1
>     ```
> - E820 table contains a 'soft reserved' entry:
>     ```
>     [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x00000007cfffffff] soft reserved
>     ```
> 
> However, since my ACPI SRAT doesn't describe the CXL memory devices (the point), `acpi/hmat.c` won't allocate memory targets for them. This prevents the call chain:
> ```c
> hmat_register_target_devices() // for each SRAT-described target
>     -> hmem_register_resource()
>       -> insert entry into "HMEM devices" resource
> ```
> 
> Therefore, for successful fallback to hmem in this environment: `dax_hmem.ko` and `kmem.ko` must request resources BEFORE `cxl_acpi.ko` inserts 'CXL Window X'
> 
> However the kernel cannot guarantee this initialization order.
> 
> When cxl_acpi runs before dax_kmem/kmem:
> ```
> (built-in)                 CXL_REGION=n
> driver/dax/hmem/device.c  cxl_acpi.ko      dax_hmem.ko               kmem.ko
> 
> (1) Add entry '15d0000000-7cfffffff'
>                                            (2) Traverse "HMEM devices"
>                                                Insert to iomem:
>                                                5d0000000-7cffffff : Soft Reserved
> 
>                        (3) Insert CXL Window 0/1
>                            /proc/iomem shows:
>                            5d0000000-7cffffff : Soft Reserved
>                              5d0000000-6cffffff : CXL Window 0
>                              6d0000000-7cffffff : CXL Window 1
> 
>                                           (4) Create dax device
>                                                                   (5) request_mem_region() fails
>                                                                     for 5d0000000-7cffffff
>                                                                     Reason: Children of 'Soft Reserved'
>                                                                     (CXL Windows 0/1) don't cover full range
> ```
> 

Thanks for confirming the failure point. I was thinking of two possible 
ways forward here, and I would like to get feedback from others:

[1] Teach dax_hmem to split when the parent claim fails:
If __request_region() fails for the top-level Soft Reserved range 
because IORES_DESC_CXL children already exist, dax_hmem could iterate 
those windows and register each one individually (see the rough sketch 
further down, after [2]). The downside is that it adds some complexity 
and feels a bit like papering over the fact that CXL should eventually 
own all of this memory. As Dan mentioned, the long-term plan is for 
Linux to not need the soft-reserve fallback at all, and simply ignore 
Soft Reserved for CXL Windows because the CXL subsystem will handle it.

[2] Always unconditionally load CXL early..
Call request_module("cxl_acpi"); request_module("cxl_pci"); from 
dax_hmem_init() (without the IS_ENABLED(CONFIG_DEV_DAX_CXL) guard). If 
those are y/m, they’ll be present; if n, it’s a no-op. Then in 
hmem_register_device() drop the IS_ENABLED(CONFIG_DEV_DAX_CXL) gate and do:

if (region_intersects(res->start, resource_size(res),
                       IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT)
	/* defer to CXL */;

and defer to CXL if windows are present. This makes Soft Reserved 
unavailable once CXL Windows have been discovered, even if CXL_REGION is 
disabled. That aligns better with the idea that “CXL should win” 
whenever a window is visible (this also needs to be considered alongside 
patch 6/6 in my series).

With CXL_REGION=n there would be no devdax and no kmem for that range; 
/proc/iomem would show only the windows, something like below:

850000000-284fffffff : CXL Window 0
2850000000-484fffffff : CXL Window 1
4850000000-684fffffff : CXL Window 2

That means the memory is left unclaimed/unavailable (no System RAM, no 
/dev/dax). Is that acceptable when CXL_REGION is disabled?
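
For reference, here is a rough, untested sketch of what [1] could look 
like. dax_hmem_register_one() is just a placeholder for however the 
per-range registration ends up being factored, not an existing symbol:

/*
 * Sketch only: if claiming the whole Soft Reserved range fails, walk the
 * IORES_DESC_CXL children and register each CXL window individually.
 */
static int register_one_cxl_window(struct resource *res, void *arg)
{
	struct device *host = arg;

	/* hand each window back to the normal hmem registration path */
	return dax_hmem_register_one(host, phys_to_target_node(res->start), res);
}

static int hmem_claim_or_split(struct device *host, struct resource *res)
{
	/* whole range claimed; caller proceeds with normal registration */
	if (request_mem_region(res->start, resource_size(res), "Soft Reserved"))
		return 0;

	/* parent claim failed, likely because CXL Windows already exist */
	return walk_iomem_res_desc(IORES_DESC_CXL, IORESOURCE_MEM,
				   res->start, res->end, host,
				   register_one_cxl_window);
}

It keeps the existing flow for the common case and only splits when the 
parent claim is refused.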

Thanks
Smita
> ---------------------
> In my another environment where ACPI SRAT has separate entries per CXL device:
> 1. `acpi/hmat.c` inserts two entries into "HMEM devices":
>      - 5d0000000-6cffffff
>      - 6d0000000-7cffffff
> 
> 2. Regardless of module order, dax/kmem requests per-device resources, resulting in:
>      ```
>      5d0000000-7cffffff : Soft Reserved
>          5d0000000-6cffffff : CXL Window 0
>              5d0000000-6cffffff : dax0.0
>                  5d0000000-6cffffff : System RAM (kmem)
>          6d0000000-7cffffff : CXL Window 1
>              6d0000000-7cffffff : dax1.0
>                  6d0000000-7cffffff : System RAM (kmem)
>      ```
> 
> Thanks,
> Zhijian
> 
> 
> On 25/08/2025 15:50, Li Zhijian wrote:
>>
>>
>> On 22/08/2025 11:56, Koralahalli Channabasappa, Smita wrote:
>>>>
>>>>>
>>>>>>        ```
>>>>>>
>>>>>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible.
>>>>>
>>>>> Haven't tested !CXL_REGION yet.
>>>
>>> When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled. So dax_hmem should handle it.
>>
>> Yes, falling back to dax_hmem/kmem is the result we expect.
>> I haven't figured out the root cause of the issue yet, but I can tell you that in my QEMU environment,
>> there is currently a certain probability that it cannot fall back to dax_hmem/kmem.
>>
>> Upon its failure, I observed the following warnings and errors (with my local fixup kernel).
>> [   12.203254] kmem dax0.0: mapping0: 0x5d0000000-0x7cfffffff could not reserve region
>> [   12.203437] kmem dax0.0: probe with driver kmem failed with error -16
>>
>>
>>
>>> I was able to fallback to dax_hmem. But let me know if I'm missing something.
>>>
>>> config DEV_DAX_CXL
>>>           tristate "CXL DAX: direct access to CXL RAM regions"
>>>           depends on CXL_BUS && CXL_REGION && DEV_DAX
>>> ..
>>>
>>>>>
>>>>>>        On failure:
>>>>>>        ```
>>>>>>        100000000-27ffffff : System RAM
>>>>>>        5c0001128-5c00011b7 : port1
>>>>>>        5c0011128-5c00111b7 : port2
>>>>>>        5d0000000-6cffffff : CXL Window 0
>>>>>>        6d0000000-7cffffff : CXL Window 1
>>>>>>        7000000000-700000ffff : PCI Bus 0000:0c
>>>>>>          7000000000-700000ffff : 0000:0c:00.0
>>>>>>            7000001080-70000010d7 : mem1
>>>>>>        ```
>>>>>>
>>>>>>        On success:
>>>>>>        ```
>>>>>>        5d0000000-7cffffff : dax0.0
>>>>>>          5d0000000-7cffffff : System RAM (kmem)
>>>>>>            5d0000000-6cffffff : CXL Window 0
>>>>>>            6d0000000-7cffffff : CXL Window 1
>>>>>>        ```

Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Zhijian Li (Fujitsu) 1 month ago

On 29/08/2025 07:21, Koralahalli Channabasappa, Smita wrote:
> Hi Zhijian,
> 
> On 8/26/2025 11:30 PM, Zhijian Li (Fujitsu) wrote:
>> All,
>>
>>
>> I have confirmed that in the !CXL_REGION configuration, the same environment may fail to fall back to hmem.(Your new patch cannot resolve this issue)
>>
>> In my environment:
>> - There are two CXL memory devices corresponding to:
>>     ```
>>     5d0000000-6cffffff : CXL Window 0
>>     6d0000000-7cffffff : CXL Window 1
>>     ```
>> - E820 table contains a 'soft reserved' entry:
>>     ```
>>     [    0.000000] BIOS-e820: [mem 0x00000005d0000000-0x00000007cfffffff] soft reserved
>>     ```
>>
>> However, since my ACPI SRAT doesn't describe the CXL memory devices (the point), `acpi/hmat.c` won't allocate memory targets for them. This prevents the call chain:
>> ```c
>> hmat_register_target_devices() // for each SRAT-described target
>>     -> hmem_register_resource()
>>       -> insert entry into "HMEM devices" resource
>> ```
>>
>> Therefore, for successful fallback to hmem in this environment: `dax_hmem.ko` and `kmem.ko` must request resources BEFORE `cxl_acpi.ko` inserts 'CXL Window X'
>>
>> However the kernel cannot guarantee this initialization order.
>>
>> When cxl_acpi runs before dax_kmem/kmem:
>> ```
>> (built-in)                 CXL_REGION=n
>> driver/dax/hmem/device.c  cxl_acpi.ko      dax_hmem.ko               kmem.ko
>>
>> (1) Add entry '15d0000000-7cfffffff'
>>                                            (2) Traverse "HMEM devices"
>>                                                Insert to iomem:
>>                                                5d0000000-7cffffff : Soft Reserved
>>
>>                        (3) Insert CXL Window 0/1
>>                            /proc/iomem shows:
>>                            5d0000000-7cffffff : Soft Reserved
>>                              5d0000000-6cffffff : CXL Window 0
>>                              6d0000000-7cffffff : CXL Window 1
>>
>>                                           (4) Create dax device
>>                                                                   (5) request_mem_region() fails
>>                                                                     for 5d0000000-7cffffff
>>                                                                     Reason: Children of 'Soft Reserved'
>>                                                                     (CXL Windows 0/1) don't cover full range
>> ```
>>
> 
> Thanks for confirming the failure point. I was thinking of two possible ways forward here, and I would like to get feedback from others:
> 
> [1] Teach dax_hmem to split when the parent claim fails:
> If __request_region() fails for the top-level Soft Reserved range because IORES_DESC_CXL children already exist, dax_hmem could iterate those windows and register each one individually. The downside is that it adds some complexity and feels a bit like papering over the fact that CXL should eventually own all of this memory. 

I tried the change below to ensure kmem runs first; it seemed to work.

  static int __init cxl_acpi_init(void)
  {
+       if (!IS_ENABLED(CONFIG_DEV_DAX_CXL) && IS_ENABLED(CONFIG_DEV_DAX_KMEM)) {
+               /* fall back to dax_hmem,kmem */
+               request_module("kmem");
+       }
         return platform_driver_register(&cxl_acpi_driver);
  }


> As Dan mentioned, the long-term plan is for Linux to not need the soft-reserve fallback at all, and simply ignore Soft Reserve for CXL Windows because the CXL subsystem will handle it.

The current CXL_REGION Kconfig help text states:
   Otherwise, platform-firmware managed CXL is enabled by being placed in the system address map and does not need a driver.

I think this implies that a fallback to dax_hmem/kmem is still required for such cases.

Of course, I personally agree with this 'long-term plan'.



> 
> [2] Always unconditionally load CXL early..
> Call request_module("cxl_acpi"); request_module("cxl_pci"); from dax_hmem_init() (without the IS_ENABLED(CONFIG_DEV_DAX_CXL) guard). If those are y/m, they’ll be present; if n, it’s a no-op. Then in hmem_register_device() drop the IS_ENABLED(CONFIG_DEV_DAX_CXL) gate and do:
> 
> if (region_intersects(res->start, resource_size(res),
>                        IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT)
>      /* defer to CXL */;
> 
> and defer to CXL if windows are present. This makes Soft Reserved unavailable once CXL Windows have been discovered, even if CXL_REGION is disabled. That aligns better with the idea that “CXL should win” whenever a window is visible (This also needs to be considered alongside patch 6/6 in my series.)
> 
> With CXL_REGION=n there would be no devdax and no kmem for that range; /proc/iomem would show only the windows, something like below:
> 
> 850000000-284fffffff : CXL Window 0
> 2850000000-484fffffff : CXL Window 1
> 4850000000-684fffffff : CXL Window 2
> 
> That means the memory is left unclaimed/unavailable.. (no System RAM, no /dev/dax). Is that acceptable when CXL_REGION is disabled?
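
For reference, a consolidated sketch of [2] as I understand it (the function names follow drivers/dax/hmem, but the bodies here are illustrative only, not the exact upstream code):

  static int hmem_register_device(struct device *host, int target_nid,
                                  const struct resource *res)
  {
          /* Defer to CXL whenever a CXL Window overlaps this range */
          if (region_intersects(res->start, resource_size(res),
                                IORESOURCE_MEM,
                                IORES_DESC_CXL) != REGION_DISJOINT) {
                  dev_dbg(host, "deferring %pr to the CXL subsystem\n", res);
                  return 0;
          }

          /* ...otherwise create the hmem platform device as today... */
          return 0;
  }

  static __init int dax_hmem_init(void)
  {
          /* Unconditionally give CXL a chance to enumerate its windows first */
          request_module("cxl_acpi");
          request_module("cxl_pci");

          /* ...existing driver registration... */
          return 0;
  }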

Regarding option [2] (unconditionally loading CXL early):
This approach conflicts with the CXL_REGION Kconfig description mentioned above.


---
To refocus on the original issue (the inability to recreate regions after destruction when CXL Windows overlap with Soft Reserved):
I believe your patch series "[PATCH 0/6] dax/hmem, cxl: Coordinate Soft Reserved handling with CXL" effectively addresses this problem.
  
As for the pre-existing issues with !CXL_REGION and the unimplemented DAX_CXL_MODE_REGISTER, I suggest deferring them for now.
They need not be resolved within this patch set, as we should prioritize the initial problem.


Thanks
Zhijian
Re: [PATCH v5 3/7] cxl/acpi: Add background worker to coordinate with cxl_mem probe completion
Posted by Dave Jiang 2 months, 3 weeks ago

On 7/15/25 11:04 AM, Smita Koralahalli wrote:
> Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling
> until the cxl_mem driver has probed at least one device. This coordination
> ensures that DAX registration or fallback handling for soft-reserved
> regions is not triggered prematurely.
> 
> The worker waits on cxl_wait_queue, which is signaled via
> cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory
> device probe is confirmed, the worker invokes wait_for_device_probe()
> to allow the rest of the CXL device hierarchy to complete initialization.
> 
> Additionally, it also handles initialization order issues where
> cxl_acpi_probe() may complete before other drivers such as cxl_port or
> cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in
> and cxl_mem is a loadable module. In such cases, using only
> wait_for_device_probe() is insufficient, as it may return before all
> relevant probes are registered.
> 
> While region creation happens in cxl_port_probe(), waiting on
> cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed
> after the port hierarchy is in place. Furthermore, since cxl_mem depends
> on cxl_pci, this also guarantees that cxl_pci has loaded by the time the
> wait completes.
> 
> As cxl_mem_active() infrastructure already exists for tracking probe
> activity, cxl_acpi can use it without introducing new coordination
> mechanisms.
> 
> Co-developed-by: Nathan Fontenot <Nathan.Fontenot@amd.com>
> Signed-off-by: Nathan Fontenot <Nathan.Fontenot@amd.com>
> Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>  drivers/cxl/acpi.c             | 18 ++++++++++++++++++
>  drivers/cxl/core/probe_state.c |  5 +++++
>  drivers/cxl/cxl.h              |  2 ++
>  3 files changed, 25 insertions(+)
> 
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index ca06d5acdf8f..3a27289e669b 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -823,6 +823,20 @@ static int pair_cxl_resource(struct device *dev, void *data)
>  	return 0;
>  }
>  
> +static void cxl_softreserv_mem_work_fn(struct work_struct *work)
> +{
> +	if (!wait_event_timeout(cxl_wait_queue, cxl_mem_active(), 30 * HZ))
> +		pr_debug("Timeout waiting for cxl_mem probing");
> +
> +	wait_for_device_probe();
> +}
> +static DECLARE_WORK(cxl_sr_work, cxl_softreserv_mem_work_fn);
> +
> +static void cxl_softreserv_mem_update(void)
> +{
> +	schedule_work(&cxl_sr_work);
> +}
> +
>  static int cxl_acpi_probe(struct platform_device *pdev)
>  {
>  	int rc = 0;
> @@ -903,6 +917,9 @@ static int cxl_acpi_probe(struct platform_device *pdev)
>  	cxl_bus_rescan();
>  
>  out:
> +	/* Update SOFT RESERVE resources that intersect with CXL regions */
> +	cxl_softreserv_mem_update();

Can you please squash 1/7 with this patch since both are fairly small? Otherwise it leaves the reviewer wondering what the changes in 1/7 would result in.

DJ

> +
>  	return rc;
>  }
>  
> @@ -934,6 +951,7 @@ static int __init cxl_acpi_init(void)
>  
>  static void __exit cxl_acpi_exit(void)
>  {
> +	cancel_work_sync(&cxl_sr_work);
>  	platform_driver_unregister(&cxl_acpi_driver);
>  	cxl_bus_drain();
>  }
> diff --git a/drivers/cxl/core/probe_state.c b/drivers/cxl/core/probe_state.c
> index 5ba4b4de0e33..3089b2698b32 100644
> --- a/drivers/cxl/core/probe_state.c
> +++ b/drivers/cxl/core/probe_state.c
> @@ -2,9 +2,12 @@
>  /* Copyright(c) 2022 Intel Corporation. All rights reserved. */
>  #include <linux/atomic.h>
>  #include <linux/export.h>
> +#include <linux/wait.h>
>  #include "cxlmem.h"
>  
>  static atomic_t mem_active;
> +DECLARE_WAIT_QUEUE_HEAD(cxl_wait_queue);
> +EXPORT_SYMBOL_NS_GPL(cxl_wait_queue, "CXL");
>  
>  bool cxl_mem_active(void)
>  {
> @@ -13,10 +16,12 @@ bool cxl_mem_active(void)
>  
>  	return false;
>  }
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_active, "CXL");
>  
>  void cxl_mem_active_inc(void)
>  {
>  	atomic_inc(&mem_active);
> +	wake_up(&cxl_wait_queue);
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, "CXL");
>  
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 3f1695c96abc..3117136f0208 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -903,6 +903,8 @@ void cxl_coordinates_combine(struct access_coordinate *out,
>  
>  bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
>  
> +extern wait_queue_head_t cxl_wait_queue;
> +
>  /*
>   * Unit test builds overrides this to __weak, find the 'strong' version
>   * of these symbols in tools/testing/cxl/.