Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling
until the cxl_mem driver has probed at least one device. This coordination
ensures that DAX registration or fallback handling for soft-reserved
regions is not triggered prematurely.
The worker waits on cxl_wait_queue, which is signaled via
cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory
device probe is confirmed, the worker invokes wait_for_device_probe()
to allow the rest of the CXL device hierarchy to complete initialization.
Additionally, it handles initialization order issues where
cxl_acpi_probe() may complete before other drivers such as cxl_port or
cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in
and cxl_mem is a loadable module. In such cases, using only
wait_for_device_probe() is insufficient, as it may return before all
relevant probes are registered.
While region creation happens in cxl_port_probe(), waiting on
cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed
after the port hierarchy is in place. Furthermore, since cxl_mem depends
on cxl_pci, this also guarantees that cxl_pci has loaded by the time the
wait completes.
As cxl_mem_active() infrastructure already exists for tracking probe
activity, cxl_acpi can use it without introducing new coordination
mechanisms.
Co-developed-by: Nathan Fontenot <Nathan.Fontenot@amd.com>
Signed-off-by: Nathan Fontenot <Nathan.Fontenot@amd.com>
Co-developed-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
drivers/cxl/acpi.c | 18 ++++++++++++++++++
drivers/cxl/core/probe_state.c | 5 +++++
drivers/cxl/cxl.h | 2 ++
3 files changed, 25 insertions(+)
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index ca06d5acdf8f..3a27289e669b 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -823,6 +823,20 @@ static int pair_cxl_resource(struct device *dev, void *data)
return 0;
}
+static void cxl_softreserv_mem_work_fn(struct work_struct *work)
+{
+ if (!wait_event_timeout(cxl_wait_queue, cxl_mem_active(), 30 * HZ))
+ pr_debug("Timeout waiting for cxl_mem probing");
+
+ wait_for_device_probe();
+}
+static DECLARE_WORK(cxl_sr_work, cxl_softreserv_mem_work_fn);
+
+static void cxl_softreserv_mem_update(void)
+{
+ schedule_work(&cxl_sr_work);
+}
+
static int cxl_acpi_probe(struct platform_device *pdev)
{
int rc = 0;
@@ -903,6 +917,9 @@ static int cxl_acpi_probe(struct platform_device *pdev)
cxl_bus_rescan();
out:
+ /* Update SOFT RESERVE resources that intersect with CXL regions */
+ cxl_softreserv_mem_update();
+
return rc;
}
@@ -934,6 +951,7 @@ static int __init cxl_acpi_init(void)
static void __exit cxl_acpi_exit(void)
{
+ cancel_work_sync(&cxl_sr_work);
platform_driver_unregister(&cxl_acpi_driver);
cxl_bus_drain();
}
diff --git a/drivers/cxl/core/probe_state.c b/drivers/cxl/core/probe_state.c
index 5ba4b4de0e33..3089b2698b32 100644
--- a/drivers/cxl/core/probe_state.c
+++ b/drivers/cxl/core/probe_state.c
@@ -2,9 +2,12 @@
/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
#include <linux/atomic.h>
#include <linux/export.h>
+#include <linux/wait.h>
#include "cxlmem.h"
static atomic_t mem_active;
+DECLARE_WAIT_QUEUE_HEAD(cxl_wait_queue);
+EXPORT_SYMBOL_NS_GPL(cxl_wait_queue, "CXL");
bool cxl_mem_active(void)
{
@@ -13,10 +16,12 @@ bool cxl_mem_active(void)
return false;
}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_active, "CXL");
void cxl_mem_active_inc(void)
{
atomic_inc(&mem_active);
+ wake_up(&cxl_wait_queue);
}
EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, "CXL");
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 3f1695c96abc..3117136f0208 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -903,6 +903,8 @@ void cxl_coordinates_combine(struct access_coordinate *out,
bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
+extern wait_queue_head_t cxl_wait_queue;
+
/*
* Unit test builds overrides this to __weak, find the 'strong' version
* of these symbols in tools/testing/cxl/.
--
2.17.1
Smita Koralahalli wrote: > Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling > until the cxl_mem driver has probed at least one device. This coordination > ensures that DAX registration or fallback handling for soft-reserved > regions is not triggered prematurely. > > The worker waits on cxl_wait_queue, which is signaled via > cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory > device probe is confirmed, the worker invokes wait_for_device_probe() > to allow the rest of the CXL device hierarchy to complete initialization. > > Additionally, it also handles initialization order issues where > cxl_acpi_probe() may complete before other drivers such as cxl_port or > cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in > and cxl_mem is a loadable module. In such cases, using only > wait_for_device_probe() is insufficient, as it may return before all > relevant probes are registered. Right, but that problem is not solved by this, which still leaves the decision on when to give up on this mechanism, and this mechanism does not tell you when follow-on probe work is complete. > While region creation happens in cxl_port_probe(), waiting on > cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed > after the port hierarchy is in place. Furthermore, since cxl_mem depends > on cxl_pci, this also guarantees that cxl_pci has loaded by the time the > wait completes. > > As cxl_mem_active() infrastructure already exists for tracking probe > activity, cxl_acpi can use it without introducing new coordination > mechanisms. I appreciate the instinct to not add anything new, but the module loading problem is solvable. If the goal is: "I want to give device-dax a point at which it can make a go / no-go decision about whether the CXL subsystem has properly assembled all CXL regions implied by Soft Reserved intersecting with CXL Windows." 
Then that is something like the below, only lightly tested and likely regresses the non-CXL case. -- 8< -- From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001 From: Dan Williams <dan.j.williams@intel.com> Date: Tue, 22 Jul 2025 16:11:08 -0700 Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration CXL and dax_hmem fight over how "Soft Reserved" (EFI Specific Purpose Memory) resources are published in the iomem resource tree. The entry blocks some CXL hotplug flows, and CXL blocks dax_hmem from publishing the memory in the event that CXL fails to parse the platform configuration. Towards resolving this conflict: (the non-RFC version of this patch should split these into separate patches): 1/ Defer publishing "Soft Reserved" entries in the iomem resource tree until the consumer, dax_hmem, is ready to use them. 2/ Fix detection of "Soft Reserved" vs "CXL Window" resource overlaps by switching from MODULE_SOFTDEP() to request_module() for making sure that cxl_acpi has had a chance to publish "CXL Window" resources. 3/ Add cxl_pci to the list of modules that need to have had a chance to scan boot devices such that wait_for_device_probe() flushes initial CXL topology discovery. 4/ Add a workqueue that delays consideration of "Soft Reserved" that overlaps CXL so that the CXL subsystem can complete all of its region assembly. For RFC purposes this only solves the reliability of the DAX_CXL_MODE_DROP case. DAX_CXL_MODE_REGISTER support can follow to shutdown CXL in favor of vanilla DAX devices as an emergency fallback for platform configuration quirks and bugs. 
Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- arch/x86/kernel/e820.c | 2 +- drivers/dax/hmem/device.c | 4 +- drivers/dax/hmem/hmem.c | 94 +++++++++++++++++++++++++++++++++------ include/linux/ioport.h | 25 +++++++++++ kernel/resource.c | 58 +++++++++++++++++++----- 5 files changed, 156 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c3acbd26408b..aef1ff2cabda 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1153,7 +1153,7 @@ void __init e820__reserve_resources_late(void) res = e820_res; for (i = 0; i < e820_table->nr_entries; i++) { if (!res->parent && res->end) - insert_resource_expand_to_fit(&iomem_resource, res); + insert_resource_late(res); res++; } diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c index f9e1a76a04a9..22732b729017 100644 --- a/drivers/dax/hmem/device.c +++ b/drivers/dax/hmem/device.c @@ -83,8 +83,8 @@ static __init int hmem_register_one(struct resource *res, void *data) static __init int hmem_init(void) { - walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, - IORESOURCE_MEM, 0, -1, NULL, hmem_register_one); + walk_soft_reserve_res_desc(IORES_DESC_SOFT_RESERVED, IORESOURCE_MEM, 0, + -1, NULL, hmem_register_one); return 0; } diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 5e7c53f18491..0916478e3817 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -59,9 +59,45 @@ static void release_hmem(void *pdev) platform_device_unregister(pdev); } +static enum dax_cxl_mode { + DAX_CXL_MODE_DEFER, + DAX_CXL_MODE_REGISTER, + DAX_CXL_MODE_DROP, +} dax_cxl_mode; + +static int handle_deferred_cxl(struct device *host, int target_nid, + const struct resource *res) +{ + if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM, + IORES_DESC_CXL) != REGION_DISJOINT) { + if (dax_cxl_mode == DAX_CXL_MODE_DROP) + dev_dbg(host, "dropping CXL range: %pr\n", res); + } + return 0; +} + +struct dax_defer_work { + struct 
platform_device *pdev; + struct work_struct work; +}; + +static void process_defer_work(struct work_struct *_work) +{ + struct dax_defer_work *work = container_of(_work, typeof(*work), work); + struct platform_device *pdev = work->pdev; + + /* relies on cxl_acpi and cxl_pci having had a chance to load */ + wait_for_device_probe(); + + dax_cxl_mode = DAX_CXL_MODE_DROP; + + walk_hmem_resources(&pdev->dev, handle_deferred_cxl); +} + static int hmem_register_device(struct device *host, int target_nid, const struct resource *res) { + struct dax_defer_work *work = dev_get_drvdata(host); struct platform_device *pdev; struct memregion_info info; long id; @@ -70,14 +106,21 @@ static int hmem_register_device(struct device *host, int target_nid, if (IS_ENABLED(CONFIG_CXL_REGION) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { - dev_dbg(host, "deferring range to CXL: %pr\n", res); - return 0; + switch (dax_cxl_mode) { + case DAX_CXL_MODE_DEFER: + dev_dbg(host, "deferring range to CXL: %pr\n", res); + schedule_work(&work->work); + return 0; + case DAX_CXL_MODE_REGISTER: + dev_dbg(host, "registering CXL range: %pr\n", res); + break; + case DAX_CXL_MODE_DROP: + dev_dbg(host, "dropping CXL range: %pr\n", res); + return 0; + } } - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, - IORES_DESC_SOFT_RESERVED); - if (rc != REGION_INTERSECTS) - return 0; + /* TODO: insert "Soft Reserved" into iomem here */ id = memregion_alloc(GFP_KERNEL); if (id < 0) { @@ -123,8 +166,30 @@ static int hmem_register_device(struct device *host, int target_nid, return rc; } +static void kill_defer_work(void *_work) +{ + struct dax_defer_work *work = container_of(_work, typeof(*work), work); + + cancel_work_sync(&work->work); + kfree(work); +} + static int dax_hmem_platform_probe(struct platform_device *pdev) { + struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL); + int rc; + + if (!work) + return -ENOMEM; + + 
work->pdev = pdev; + INIT_WORK(&work->work, process_defer_work); + + rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work); + if (rc) + return rc; + + platform_set_drvdata(pdev, work); return walk_hmem_resources(&pdev->dev, hmem_register_device); } @@ -139,6 +204,16 @@ static __init int dax_hmem_init(void) { int rc; + /* + * Ensure that cxl_acpi and cxl_pci have a chance to kick off + * CXL topology discovery at least once before scanning the + * iomem resource tree for IORES_DESC_CXL resources. + */ + if (IS_ENABLED(CONFIG_CXL_REGION)) { + request_module("cxl_acpi"); + request_module("cxl_pci"); + } + rc = platform_driver_register(&dax_hmem_platform_driver); if (rc) return rc; @@ -159,13 +234,6 @@ static __exit void dax_hmem_exit(void) module_init(dax_hmem_init); module_exit(dax_hmem_exit); -/* Allow for CXL to define its own dax regions */ -#if IS_ENABLED(CONFIG_CXL_REGION) -#if IS_MODULE(CONFIG_CXL_ACPI) -MODULE_SOFTDEP("pre: cxl_acpi"); -#endif -#endif - MODULE_ALIAS("platform:hmem*"); MODULE_ALIAS("platform:hmem_platform*"); MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory"); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index e8b2d6aa4013..4fc6ab518c24 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -232,6 +232,9 @@ struct resource_constraint { /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; +#ifdef CONFIG_EFI_SOFT_RESERVE +extern struct resource soft_reserve_resource; +#endif extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); @@ -255,6 +258,22 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); resource_size_t resource_alignment(struct resource *res); + +#ifdef CONFIG_EFI_SOFT_RESERVE +static inline void insert_resource_late(struct 
resource *new) +{ + if (new->desc == IORES_DESC_SOFT_RESERVED) + insert_resource_expand_to_fit(&soft_reserve_resource, new); + else + insert_resource_expand_to_fit(&iomem_resource, new); +} +#else +static inline void insert_resource_late(struct resource *new) +{ + insert_resource_expand_to_fit(&iomem_resource, new); +} +#endif + /** * resource_set_size - Calculate resource end address from size and start * @res: Resource descriptor @@ -409,6 +428,12 @@ walk_system_ram_res_rev(u64 start, u64 end, void *arg, extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); +int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags, + u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +int region_intersects_soft_reserve(struct resource *root, resource_size_t start, + size_t size, unsigned long flags, + unsigned long desc); struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); diff --git a/kernel/resource.c b/kernel/resource.c index 8d3e6ed0bdc1..fd90990c31c6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -321,8 +321,8 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long } /** - * find_next_iomem_res - Finds the lowest iomem resource that covers part of - * [@start..@end]. + * find_next_res - Finds the lowest resource that covers part of + * [@start..@end]. * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns @@ -337,9 +337,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long * The caller must specify @start, @end, @flags, and @desc * (which may be IORES_DESC_NONE). 
*/ -static int find_next_iomem_res(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - struct resource *res) +static int find_next_res(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, struct resource *res) { struct resource *p; @@ -351,7 +351,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, false) { + for_each_resource(parent, p, false) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -382,16 +382,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, return p ? 0 : -ENODEV; } -static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - void *arg, - int (*func)(struct resource *, void *)) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + struct resource *res) +{ + return find_next_res(&iomem_resource, start, end, flags, desc, res); +} + +static int walk_res_desc(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, void *arg, + int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, &res)) { + !find_next_res(parent, start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -402,6 +409,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, return ret; } +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func); +} + + /** * walk_iomem_res_desc - Walks through iomem resources 
and calls func() * with matching resource ranges. @@ -426,6 +442,26 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); +#ifdef CONFIG_EFI_SOFT_RESERVE +struct resource soft_reserve_resource = { + .name = "Soft Reserved", + .start = 0, + .end = -1, + .desc = IORES_DESC_SOFT_RESERVED, + .flags = IORESOURCE_MEM, +}; +EXPORT_SYMBOL_GPL(soft_reserve_resource); + +int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags, + u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&soft_reserve_resource, start, end, flags, desc, + arg, func); +} +EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc); +#endif + /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. -- 2.50.1
Hi Dan, On 7/23/2025 12:31 AM, dan.j.williams@intel.com wrote: > Smita Koralahalli wrote: >> Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling >> until the cxl_mem driver has probed at least one device. This coordination >> ensures that DAX registration or fallback handling for soft-reserved >> regions is not triggered prematurely. >> >> The worker waits on cxl_wait_queue, which is signaled via >> cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory >> device probe is confirmed, the worker invokes wait_for_device_probe() >> to allow the rest of the CXL device hierarchy to complete initialization. >> >> Additionally, it also handles initialization order issues where >> cxl_acpi_probe() may complete before other drivers such as cxl_port or >> cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in >> and cxl_mem is a loadable module. In such cases, using only >> wait_for_device_probe() is insufficient, as it may return before all >> relevant probes are registered. > > Right, but that problem is not solved by this which still leaves the > decision on when to give up on this mechanism, and this mechanism does > not tell you when follow-on probe work is complete. > >> While region creation happens in cxl_port_probe(), waiting on >> cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed >> after the port hierarchy is in place. Furthermore, since cxl_mem depends >> on cxl_pci, this also guarantees that cxl_pci has loaded by the time the >> wait completes. >> >> As cxl_mem_active() infrastructure already exists for tracking probe >> activity, cxl_acpi can use it without introducing new coordination >> mechanisms. > > In appreciate the instinct to not add anything new, but the module > loading problem is solvable. 
> > If the goal is: "I want to give device-dax a point at which it can make > a go / no-go decision about whether the CXL subsystem has properly > assembled all CXL regions implied by Soft Reserved instersecting with > CXL Windows." Then that is something like the below, only lightly tested > and likely regresses the non-CXL case. > > -- 8< -- > From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001 > From: Dan Williams <dan.j.williams@intel.com> > Date: Tue, 22 Jul 2025 16:11:08 -0700 > Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration > > CXL and dax_hmem fight over "Soft Reserved" (EFI Specific Purpose Memory) > resources are published in the iomem resource tree. The entry blocks some > CXL hotplug flows, and CXL blocks dax_hmem from publishing the memory in > the event that CXL fails to parse the platform configuration. > > Towards resolving this conflict: (the non-RFC version > of this patch should split these into separate patches): > > 1/ Defer publishing "Soft Reserved" entries in the iomem resource tree > until the consumer, dax_hmem, is ready to use them. > > 2/ Fix detection of "Soft Reserved" vs "CXL Window" resource overlaps by > switching from MODULE_SOFTDEP() to request_module() for making sure that > cxl_acpi has had a chance to publish "CXL Window" resources. > > 3/ Add cxl_pci to the list of modules that need to have had a chance to > scan boot devices such that wait_device_probe() flushes initial CXL > topology discovery. > > 4/ Add a workqueue that delays consideration of "Soft Reserved" that > overlaps CXL so that the CXL subsystem can complete all of its region > assembly. > > For RFC purposes this only solves the reliabilty of the DAX_CXL_MODE_DROP > case. DAX_CXL_MODE_REGISTER support can follow to shutdown CXL in favor of > vanilla DAX devices as an emergency fallback for platform configuration > quirks and bugs. 
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com> > --- > arch/x86/kernel/e820.c | 2 +- > drivers/dax/hmem/device.c | 4 +- > drivers/dax/hmem/hmem.c | 94 +++++++++++++++++++++++++++++++++------ > include/linux/ioport.h | 25 +++++++++++ > kernel/resource.c | 58 +++++++++++++++++++----- > 5 files changed, 156 insertions(+), 27 deletions(-) > > diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c > index c3acbd26408b..aef1ff2cabda 100644 > --- a/arch/x86/kernel/e820.c > +++ b/arch/x86/kernel/e820.c > @@ -1153,7 +1153,7 @@ void __init e820__reserve_resources_late(void) > res = e820_res; > for (i = 0; i < e820_table->nr_entries; i++) { > if (!res->parent && res->end) > - insert_resource_expand_to_fit(&iomem_resource, res); > + insert_resource_late(res); > res++; > } > > diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c > index f9e1a76a04a9..22732b729017 100644 > --- a/drivers/dax/hmem/device.c > +++ b/drivers/dax/hmem/device.c > @@ -83,8 +83,8 @@ static __init int hmem_register_one(struct resource *res, void *data) > > static __init int hmem_init(void) > { > - walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, > - IORESOURCE_MEM, 0, -1, NULL, hmem_register_one); > + walk_soft_reserve_res_desc(IORES_DESC_SOFT_RESERVED, IORESOURCE_MEM, 0, > + -1, NULL, hmem_register_one); > return 0; > } > > diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c > index 5e7c53f18491..0916478e3817 100644 > --- a/drivers/dax/hmem/hmem.c > +++ b/drivers/dax/hmem/hmem.c > @@ -59,9 +59,45 @@ static void release_hmem(void *pdev) > platform_device_unregister(pdev); > } > > +static enum dax_cxl_mode { > + DAX_CXL_MODE_DEFER, > + DAX_CXL_MODE_REGISTER, > + DAX_CXL_MODE_DROP, > +} dax_cxl_mode; > + > +static int handle_deferred_cxl(struct device *host, int target_nid, > + const struct resource *res) > +{ > + if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM, > + IORES_DESC_CXL) != REGION_DISJOINT) { > + if (dax_cxl_mode == DAX_CXL_MODE_DROP) > 
+ dev_dbg(host, "dropping CXL range: %pr\n", res); > + } > + return 0; > +} > + > +struct dax_defer_work { > + struct platform_device *pdev; > + struct work_struct work; > +}; > + > +static void process_defer_work(struct work_struct *_work) > +{ > + struct dax_defer_work *work = container_of(_work, typeof(*work), work); > + struct platform_device *pdev = work->pdev; > + > + /* relies on cxl_acpi and cxl_pci having had a chance to load */ > + wait_for_device_probe(); > + > + dax_cxl_mode = DAX_CXL_MODE_DROP; > + > + walk_hmem_resources(&pdev->dev, handle_deferred_cxl); > +} > + > static int hmem_register_device(struct device *host, int target_nid, > const struct resource *res) > { > + struct dax_defer_work *work = dev_get_drvdata(host); > struct platform_device *pdev; > struct memregion_info info; > long id; > @@ -70,14 +106,21 @@ static int hmem_register_device(struct device *host, int target_nid, > if (IS_ENABLED(CONFIG_CXL_REGION) && > region_intersects(res->start, resource_size(res), IORESOURCE_MEM, > IORES_DESC_CXL) != REGION_DISJOINT) { I may be wrong here, but could this check fail? While request_module() ensures that cxl_acpi and cxl_pci are requested for loading, it does not guarantee that either has completed initialization or that region enumeration (i.e add_cxl_resources()) has finished by the time we reach this check. We also haven't called wait_for_device_probe() at this point, which is typically used to block until all pending device probes are complete. 
Thanks Smita > - dev_dbg(host, "deferring range to CXL: %pr\n", res); > - return 0; > + switch (dax_cxl_mode) { > + case DAX_CXL_MODE_DEFER: > + dev_dbg(host, "deferring range to CXL: %pr\n", res); > + schedule_work(&work->work); > + return 0; > + case DAX_CXL_MODE_REGISTER: > + dev_dbg(host, "registering CXL range: %pr\n", res); > + break; > + case DAX_CXL_MODE_DROP: > + dev_dbg(host, "dropping CXL range: %pr\n", res); > + return 0; > + } > } > > - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, > - IORES_DESC_SOFT_RESERVED); > - if (rc != REGION_INTERSECTS) > - return 0; > + /* TODO: insert "Soft Reserved" into iomem here */ > > id = memregion_alloc(GFP_KERNEL); > if (id < 0) { > @@ -123,8 +166,30 @@ static int hmem_register_device(struct device *host, int target_nid, > return rc; > } > > +static void kill_defer_work(void *_work) > +{ > + struct dax_defer_work *work = container_of(_work, typeof(*work), work); > + > + cancel_work_sync(&work->work); > + kfree(work); > +} > + > static int dax_hmem_platform_probe(struct platform_device *pdev) > { > + struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL); > + int rc; > + > + if (!work) > + return -ENOMEM; > + > + work->pdev = pdev; > + INIT_WORK(&work->work, process_defer_work); > + > + rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work); > + if (rc) > + return rc; > + > + platform_set_drvdata(pdev, work); > return walk_hmem_resources(&pdev->dev, hmem_register_device); > } > > @@ -139,6 +204,16 @@ static __init int dax_hmem_init(void) > { > int rc; > > + /* > + * Ensure that cxl_acpi and cxl_pci have a chance to kick off > + * CXL topology discovery at least once before scanning the > + * iomem resource tree for IORES_DESC_CXL resources. 
> + */ > + if (IS_ENABLED(CONFIG_CXL_REGION)) { > + request_module("cxl_acpi"); > + request_module("cxl_pci"); > + } > + > rc = platform_driver_register(&dax_hmem_platform_driver); > if (rc) > return rc; > @@ -159,13 +234,6 @@ static __exit void dax_hmem_exit(void) > module_init(dax_hmem_init); > module_exit(dax_hmem_exit); > > -/* Allow for CXL to define its own dax regions */ > -#if IS_ENABLED(CONFIG_CXL_REGION) > -#if IS_MODULE(CONFIG_CXL_ACPI) > -MODULE_SOFTDEP("pre: cxl_acpi"); > -#endif > -#endif > - > MODULE_ALIAS("platform:hmem*"); > MODULE_ALIAS("platform:hmem_platform*"); > MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory"); > diff --git a/include/linux/ioport.h b/include/linux/ioport.h > index e8b2d6aa4013..4fc6ab518c24 100644 > --- a/include/linux/ioport.h > +++ b/include/linux/ioport.h > @@ -232,6 +232,9 @@ struct resource_constraint { > /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ > extern struct resource ioport_resource; > extern struct resource iomem_resource; > +#ifdef CONFIG_EFI_SOFT_RESERVE > +extern struct resource soft_reserve_resource; > +#endif > > extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); > extern int request_resource(struct resource *root, struct resource *new); > @@ -255,6 +258,22 @@ int adjust_resource(struct resource *res, resource_size_t start, > resource_size_t size); > resource_size_t resource_alignment(struct resource *res); > > + > +#ifdef CONFIG_EFI_SOFT_RESERVE > +static inline void insert_resource_late(struct resource *new) > +{ > + if (new->desc == IORES_DESC_SOFT_RESERVED) > + insert_resource_expand_to_fit(&soft_reserve_resource, new); > + else > + insert_resource_expand_to_fit(&iomem_resource, new); > +} > +#else > +static inline void insert_resource_late(struct resource *new) > +{ > + insert_resource_expand_to_fit(&iomem_resource, new); > +} > +#endif > + > /** > * resource_set_size - Calculate resource end address from size 
and start > * @res: Resource descriptor > @@ -409,6 +428,12 @@ walk_system_ram_res_rev(u64 start, u64 end, void *arg, > extern int > walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, > void *arg, int (*func)(struct resource *, void *)); > +int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags, > + u64 start, u64 end, void *arg, > + int (*func)(struct resource *, void *)); > +int region_intersects_soft_reserve(struct resource *root, resource_size_t start, > + size_t size, unsigned long flags, > + unsigned long desc); > > struct resource *devm_request_free_mem_region(struct device *dev, > struct resource *base, unsigned long size); > diff --git a/kernel/resource.c b/kernel/resource.c > index 8d3e6ed0bdc1..fd90990c31c6 100644 > --- a/kernel/resource.c > +++ b/kernel/resource.c > @@ -321,8 +321,8 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long > } > > /** > - * find_next_iomem_res - Finds the lowest iomem resource that covers part of > - * [@start..@end]. > + * find_next_res - Finds the lowest resource that covers part of > + * [@start..@end]. > * > * If a resource is found, returns 0 and @*res is overwritten with the part > * of the resource that's within [@start..@end]; if none is found, returns > @@ -337,9 +337,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long > * The caller must specify @start, @end, @flags, and @desc > * (which may be IORES_DESC_NONE). 
> */ > -static int find_next_iomem_res(resource_size_t start, resource_size_t end, > - unsigned long flags, unsigned long desc, > - struct resource *res) > +static int find_next_res(struct resource *parent, resource_size_t start, > + resource_size_t end, unsigned long flags, > + unsigned long desc, struct resource *res) > { > struct resource *p; > > @@ -351,7 +351,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, > > read_lock(&resource_lock); > > - for_each_resource(&iomem_resource, p, false) { > + for_each_resource(parent, p, false) { > /* If we passed the resource we are looking for, stop */ > if (p->start > end) { > p = NULL; > @@ -382,16 +382,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, > return p ? 0 : -ENODEV; > } > > -static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, > - unsigned long flags, unsigned long desc, > - void *arg, > - int (*func)(struct resource *, void *)) > +static int find_next_iomem_res(resource_size_t start, resource_size_t end, > + unsigned long flags, unsigned long desc, > + struct resource *res) > +{ > + return find_next_res(&iomem_resource, start, end, flags, desc, res); > +} > + > +static int walk_res_desc(struct resource *parent, resource_size_t start, > + resource_size_t end, unsigned long flags, > + unsigned long desc, void *arg, > + int (*func)(struct resource *, void *)) > { > struct resource res; > int ret = -EINVAL; > > while (start < end && > - !find_next_iomem_res(start, end, flags, desc, &res)) { > + !find_next_res(parent, start, end, flags, desc, &res)) { > ret = (*func)(&res, arg); > if (ret) > break; > @@ -402,6 +409,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, > return ret; > } > > +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, > + unsigned long flags, unsigned long desc, > + void *arg, > + int (*func)(struct resource *, void *)) > +{ > + return 
walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func); > +} > + > + > /** > * walk_iomem_res_desc - Walks through iomem resources and calls func() > * with matching resource ranges. > @@ -426,6 +442,26 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, > } > EXPORT_SYMBOL_GPL(walk_iomem_res_desc); > > +#ifdef CONFIG_EFI_SOFT_RESERVE > +struct resource soft_reserve_resource = { > + .name = "Soft Reserved", > + .start = 0, > + .end = -1, > + .desc = IORES_DESC_SOFT_RESERVED, > + .flags = IORESOURCE_MEM, > +}; > +EXPORT_SYMBOL_GPL(soft_reserve_resource); > + > +int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags, > + u64 start, u64 end, void *arg, > + int (*func)(struct resource *, void *)) > +{ > + return walk_res_desc(&soft_reserve_resource, start, end, flags, desc, > + arg, func); > +} > +EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc); > +#endif > + > /* > * This function calls the @func callback against all memory ranges of type > * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
Koralahalli Channabasappa, Smita wrote: [..] > > static int hmem_register_device(struct device *host, int target_nid, > > const struct resource *res) > > { > > + struct dax_defer_work *work = dev_get_drvdata(host); > > struct platform_device *pdev; > > struct memregion_info info; > > long id; > > @@ -70,14 +106,21 @@ static int hmem_register_device(struct device *host, int target_nid, > > if (IS_ENABLED(CONFIG_CXL_REGION) && > > region_intersects(res->start, resource_size(res), IORESOURCE_MEM, > > IORES_DESC_CXL) != REGION_DISJOINT) { > > I may be wrong here, but could this check fail? It can fail, but for the case where ACPI0017 is present and CXL windows exist, the failure cases would only be the extreme ones like OOM killer. > While request_module() ensures that cxl_acpi and cxl_pci are requested > for loading, it does not guarantee that either has completed > initialization or that region enumeration (i.e add_cxl_resources()) > has finished by the time we reach this check. No, outside of someone doing something silly like passing "driver_async_probe=cxl_acpi" on the kernel command line then request_module() will complete synchronously (btw, should close that possibility off with PROBE_FORCE_SYNCHRONOUS). When request_module() returns module_init() for the requested module will have completed. ACPI devices will have been enumerated by this point, so cxl_acpi_probe() will have also run by the time module_init() completes. > We also haven't called wait_for_device_probe() at this point, which is > typically used to block until all pending device probes are complete. wait_for_device_probe() is only needed for async probing, deferred probing, and dependent device probing. cxl_acpi is none of those cases. ACPI devices are always enumerated before userspace is up, so the initial driver attach can always assume to have completed in module_init context. 
wait_for_device_probe() is needed for cxl_pci attach because cxl_pci attach is async and it creates dependent devices that fire off their own module requests. As I noted in the changelog MODULE_SOFTDEP() is not reliable for ordering, but request_module() is reliable for ordering. We could go so far as to have symbol dependencies to require module loading to succeed, but I don't think that is needed here. See that approach in the for-6.18/cxl-probe-order RFC branch for cxl_mem and cxl_port: https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git/log/?h=for-6.18/cxl-probe-order
dan.j.williams@ wrote: [..] > If the goal is: "I want to give device-dax a point at which it can make > a go / no-go decision about whether the CXL subsystem has properly > assembled all CXL regions implied by Soft Reserved instersecting with > CXL Windows." Then that is something like the below, only lightly tested > and likely regresses the non-CXL case. > > -- 8< -- > From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001 > From: Dan Williams <dan.j.williams@intel.com> > Date: Tue, 22 Jul 2025 16:11:08 -0700 > Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration Likely needs this incremental change to prevent DEV_DAX_HMEM from being built-in when CXL is not. This still leaves the awkward scenario of CXL enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that safely fails in devdax only / fallback mode, but something to investigate when respinning on top of this. -- 8< -- diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index d656e4c0eb84..3683bb3f2311 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -48,6 +48,8 @@ config DEV_DAX_CXL tristate "CXL DAX: direct access to CXL RAM regions" depends on CXL_BUS && CXL_REGION && DEV_DAX default CXL_REGION && DEV_DAX + depends on CXL_ACPI >= DEV_DAX_HMEM + depends on CXL_PCI >= DEV_DAX_HMEM help CXL RAM regions are either mapped by platform-firmware and published in the initial system-memory map as "System RAM", mapped diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 0916478e3817..8bcd104111a8 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -103,7 +103,7 @@ static int hmem_register_device(struct device *host, int target_nid, long id; int rc; - if (IS_ENABLED(CONFIG_CXL_REGION) && + if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { switch (dax_cxl_mode) { @@ -209,7 +209,7 @@ static __init int dax_hmem_init(void) * CXL topology 
discovery at least once before scanning the * iomem resource tree for IORES_DESC_CXL resources. */ - if (IS_ENABLED(CONFIG_CXL_REGION)) { + if (IS_ENABLED(CONFIG_DEV_DAX_CXL)) { request_module("cxl_acpi"); request_module("cxl_pci"); }
Hi Dan and Smita, On 24/07/2025 00:13, dan.j.williams@intel.com wrote: > dan.j.williams@ wrote: > [..] >> If the goal is: "I want to give device-dax a point at which it can make >> a go / no-go decision about whether the CXL subsystem has properly >> assembled all CXL regions implied by Soft Reserved instersecting with >> CXL Windows." Then that is something like the below, only lightly tested >> and likely regresses the non-CXL case. >> >> -- 8< -- >> From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001 >> From: Dan Williams <dan.j.williams@intel.com> >> Date: Tue, 22 Jul 2025 16:11:08 -0700 >> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration > > Likely needs this incremental change to prevent DEV_DAX_HMEM from being > built-in when CXL is not. This still leaves the awkward scenario of CXL > enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that > safely fails in devdax only / fallback mode, but something to > investigate when respinning on top of this. > Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing. To begin with, I still encountered several issues with your patch (considering the patch at the RFC stage, I think it is already quite commendable): 1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff. 
``` 200000000-5bffffff : dax6.0 200000000-5bffffff : System RAM (kmem) 5c0001128-5c00011b7 : port1 5d0000000-64ffffff : CXL Window 0 5d0000000-64ffffff : region0 5d0000000-64ffffff : dax0.0 5d0000000-64ffffff : System RAM (kmem) 680000000-e7ffffff : PCI Bus 0000:00 [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug [ 0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic [ 0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved [ 0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug ``` 2. Triggers dev_warn and dev_err: ``` [root@rdma-server ~]# journalctl -p err -p warning --dmesg ...snip... Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2 Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12 Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12 Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16 ``` 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible. 
On failure: ``` 100000000-27ffffff : System RAM 5c0001128-5c00011b7 : port1 5c0011128-5c00111b7 : port2 5d0000000-6cffffff : CXL Window 0 6d0000000-7cffffff : CXL Window 1 7000000000-700000ffff : PCI Bus 0000:0c 7000000000-700000ffff : 0000:0c:00.0 7000001080-70000010d7 : mem1 ``` On success: ``` 5d0000000-7cffffff : dax0.0 5d0000000-7cffffff : System RAM (kmem) 5d0000000-6cffffff : CXL Window 0 6d0000000-7cffffff : CXL Window 1 ``` In terms of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this. ``` - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, - IORES_DESC_SOFT_RESERVED); - if (rc != REGION_INTERSECTS) - return 0; + /* TODO: insert "Soft Reserved" into iomem here */ ``` Regarding issue 3 (which exists in the current situation), this could be because it cannot ensure that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled. I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration into the upstream during the v6.18 merge window. Besides the current TODO, you also mentioned that this RFC PATCH must be further subdivided into several patches, so there remains significant work to be done. If my understanding is correct, you would be personally continuing to push forward this patch, right? Smita, Do you have any additional thoughts on this proposal from your side? 
Thanks Zhijian > -- 8< -- > diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig > index d656e4c0eb84..3683bb3f2311 100644 > --- a/drivers/dax/Kconfig > +++ b/drivers/dax/Kconfig > @@ -48,6 +48,8 @@ config DEV_DAX_CXL > tristate "CXL DAX: direct access to CXL RAM regions" > depends on CXL_BUS && CXL_REGION && DEV_DAX > default CXL_REGION && DEV_DAX > + depends on CXL_ACPI >= DEV_DAX_HMEM > + depends on CXL_PCI >= DEV_DAX_HMEM > help > CXL RAM regions are either mapped by platform-firmware > and published in the initial system-memory map as "System RAM", mapped > diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c > index 0916478e3817..8bcd104111a8 100644 > --- a/drivers/dax/hmem/hmem.c > +++ b/drivers/dax/hmem/hmem.c > @@ -103,7 +103,7 @@ static int hmem_register_device(struct device *host, int target_nid, > long id; > int rc; > > - if (IS_ENABLED(CONFIG_CXL_REGION) && > + if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && > region_intersects(res->start, resource_size(res), IORESOURCE_MEM, > IORES_DESC_CXL) != REGION_DISJOINT) { > switch (dax_cxl_mode) { > @@ -209,7 +209,7 @@ static __init int dax_hmem_init(void) > * CXL topology discovery at least once before scanning the > * iomem resource tree for IORES_DESC_CXL resources. > */ > - if (IS_ENABLED(CONFIG_CXL_REGION)) { > + if (IS_ENABLED(CONFIG_DEV_DAX_CXL)) { > request_module("cxl_acpi"); > request_module("cxl_pci"); > }
On Tue, Aug 05, 2025 at 03:58:41AM +0000, Zhijian Li (Fujitsu) wrote: > Hi Dan and Smita, > > > On 24/07/2025 00:13, dan.j.williams@intel.com wrote: > > dan.j.williams@ wrote: > > [..] > >> If the goal is: "I want to give device-dax a point at which it can make > >> a go / no-go decision about whether the CXL subsystem has properly > >> assembled all CXL regions implied by Soft Reserved instersecting with > >> CXL Windows." Then that is something like the below, only lightly tested > >> and likely regresses the non-CXL case. > >> > >> -- 8< -- > >> From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001 > >> From: Dan Williams <dan.j.williams@intel.com> > >> Date: Tue, 22 Jul 2025 16:11:08 -0700 > >> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration > > > > Likely needs this incremental change to prevent DEV_DAX_HMEM from being > > built-in when CXL is not. This still leaves the awkward scenario of CXL > > enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that > > safely fails in devdax only / fallback mode, but something to > > investigate when respinning on top of this. > > > > Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing. > > > To begin with, I still encountered several issues with your patch (considering the patch at the RFC stage, I think it is already quite commendable): Hi Zhijian, Like you, I tried this RFC out. It resolved the issue of soft reserved resources preventing teardown and replacement of a region in place. I looked at the issues you found, and have some questions comments included below. > > 1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff. 
> > ``` > 200000000-5bffffff : dax6.0 > 200000000-5bffffff : System RAM (kmem) > 5c0001128-5c00011b7 : port1 > 5d0000000-64ffffff : CXL Window 0 > 5d0000000-64ffffff : region0 > 5d0000000-64ffffff : dax0.0 > 5d0000000-64ffffff : System RAM (kmem) > 680000000-e7ffffff : PCI Bus 0000:00 > > [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug > [ 0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic > [ 0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved > [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved > [ 0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug > ``` Is that range also labelled as soft reserved? I ask, because I'm trying to draw a parallel between our test platforms. I see - [] BIOS-e820: [mem 0x0000024080000000-0x000004407fffffff] soft reserved . . [] reserve setup_data: [mem 0x0000024080000000-0x000004407fffffff] soft reserved . . [] ACPI: SRAT: Node 6 PXM 14 [mem 0x24080000000-0x4407fffffff] hotplug /proc/iomem - as expected 24080000000-5f77fffffff : CXL Window 0 24080000000-4407fffffff : region0 24080000000-4407fffffff : dax0.0 24080000000-4407fffffff : System RAM (kmem) I'm also seeing this message: [] resource: Unaddressable device [mem 0x24080000000-0x4407fffffff] conflicts with [mem 0x24080000000-0x4407fffffff] > > 2. Triggers dev_warn and dev_err: > > ``` > [root@rdma-server ~]# journalctl -p err -p warning --dmesg > ...snip... 
> Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2 > Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12 > Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12 > Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region > Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16 I see the kmem dax messages also. It seems the kmem probe is going after every range (except hotplug) in the SRAT, and failing. > ``` > > 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible. Haven't tested !CXL_REGION yet. > > On failure: > > ``` > 100000000-27ffffff : System RAM > 5c0001128-5c00011b7 : port1 > 5c0011128-5c00111b7 : port2 > 5d0000000-6cffffff : CXL Window 0 > 6d0000000-7cffffff : CXL Window 1 > 7000000000-700000ffff : PCI Bus 0000:0c > 7000000000-700000ffff : 0000:0c:00.0 > 7000001080-70000010d7 : mem1 > ``` > > On success: > > ``` > 5d0000000-7cffffff : dax0.0 > 5d0000000-7cffffff : System RAM (kmem) > 5d0000000-6cffffff : CXL Window 0 > 6d0000000-7cffffff : CXL Window 1 > ``` > > In term of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this. > > ``` > - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, > - IORES_DESC_SOFT_RESERVED); > - if (rc != REGION_INTERSECTS) > - return 0; > + /* TODO: insert "Soft Reserved" into iomem here */ > ``` Above makes sense. I'll probably wait for an update from Smita to test again, but if you or Smita have anything you want me to try out on my hardware in the meantime, let me know. 
-- Alison > > Regarding issue 3 (which exists in the current situation), this could be because it cannot ensure that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled. > > I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration into the upstream during the v6.18 merge window. > Besides the current TODO, you also mentioned that this RFC PATCH must be further subdivided into several patches, so there remains significant work to be done. > If my understanding is correct, you would be personally continuing to push forward this patch, right? > > > Smita, > > Do you have any additional thoughts on this proposal from your side? > > > Thanks > Zhijian > snip
On 21/08/2025 07:14, Alison Schofield wrote: > On Tue, Aug 05, 2025 at 03:58:41AM +0000, Zhijian Li (Fujitsu) wrote: >> Hi Dan and Smita, >> >> >> On 24/07/2025 00:13, dan.j.williams@intel.com wrote: >>> dan.j.williams@ wrote: >>> [..] >>>> If the goal is: "I want to give device-dax a point at which it can make >>>> a go / no-go decision about whether the CXL subsystem has properly >>>> assembled all CXL regions implied by Soft Reserved instersecting with >>>> CXL Windows." Then that is something like the below, only lightly tested >>>> and likely regresses the non-CXL case. >>>> >>>> -- 8< -- >>>> From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001 >>>> From: Dan Williams <dan.j.williams@intel.com> >>>> Date: Tue, 22 Jul 2025 16:11:08 -0700 >>>> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration >>> >>> Likely needs this incremental change to prevent DEV_DAX_HMEM from being >>> built-in when CXL is not. This still leaves the awkward scenario of CXL >>> enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that >>> safely fails in devdax only / fallback mode, but something to >>> investigate when respinning on top of this. >>> >> >> Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing. >> >> >> To begin with, I still encountered several issues with your patch (considering the patch at the RFC stage, I think it is already quite commendable): > > Hi Zhijian, > > Like you, I tried this RFC out. It resolved the issue of soft reserved > resources preventing teardown and replacement of a region in place. > > I looked at the issues you found, and have some questions comments > included below. > >> >> 1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff. 
>> >> ``` >> 200000000-5bffffff : dax6.0 >> 200000000-5bffffff : System RAM (kmem) >> 5c0001128-5c00011b7 : port1 >> 5d0000000-64ffffff : CXL Window 0 >> 5d0000000-64ffffff : region0 >> 5d0000000-64ffffff : dax0.0 >> 5d0000000-64ffffff : System RAM (kmem) >> 680000000-e7ffffff : PCI Bus 0000:00 >> >> [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug >> [ 0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic >> [ 0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved >> [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved >> [ 0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug >> ``` > > Is that range also labelled as soft reserved? > I ask, because I'm trying to draw a parallel between our test platforms. No, It's not a soft reserved range. This can simply simulate with QEMU with `maxmem=192G` option(see below full qemu command line). In my environment, `0x200000000-0x5bffffff` is something like [DRAM_END + 1, DRAM_END + maxmem - TOTAL_INSTALLED_DRAM_SIZE] DRAM_END: end of the installed DRAM in Node 3 This range is reserved for the DRAM hot-add. In my case, it will be registered into 'HMEM devices' by calling hmem_register_resource in HMAT(drivers/acpi/numa/hmat.c) 893 static void hmat_register_target_devices(struct memory_target *target) 894 { 895 struct resource *res; 896 897 /* 898 * Do not bother creating devices if no driver is available to 899 * consume them. 
900 */ 901 if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) 902 return; 903 904 for (res = target->memregions.child; res; res = res->sibling) { 905 int target_nid = pxm_to_node(target->memory_pxm); 906 907 hmem_register_resource(target_nid, res); 908 } 909 } $ dmesg | grep -i -e soft -e hotplug -e Node [ 0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan-00026-g1473b9914846-dirty root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 conc [ 0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064fffffff] soft reserved [ 0.066332] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0x0009ffff] [ 0.067665] ACPI: SRAT: Node 0 PXM 0 [mem 0x00100000-0x7fffffff] [ 0.068995] ACPI: SRAT: Node 1 PXM 1 [mem 0x100000000-0x17fffffff] [ 0.070359] ACPI: SRAT: Node 2 PXM 2 [mem 0x180000000-0x1bfffffff] [ 0.071723] ACPI: SRAT: Node 3 PXM 3 [mem 0x1c0000000-0x1ffffffff] [ 0.073085] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bfffffff] hotplug [ 0.075689] NUMA: Node 0 [mem 0x00001000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00001000-0x7fffffff] [ 0.077849] NODE_DATA(0) allocated [mem 0x7ffb3e00-0x7ffdefff] [ 0.079149] NODE_DATA(1) allocated [mem 0x17ffd1e00-0x17fffcfff] [ 0.086077] Movable zone start for each node [ 0.087054] Early memory node ranges [ 0.087890] node 0: [mem 0x0000000000001000-0x000000000009efff] [ 0.089264] node 0: [mem 0x0000000000100000-0x000000007ffdefff] [ 0.090631] node 1: [mem 0x0000000100000000-0x000000017fffffff] [ 0.092003] Initmem setup node 0 [mem 0x0000000000001000-0x000000007ffdefff] [ 0.093532] Initmem setup node 1 [mem 0x0000000100000000-0x000000017fffffff] [ 0.095164] Initmem setup node 2 as memoryless [ 0.096281] Initmem setup node 3 as memoryless [ 0.097397] Initmem setup node 4 as memoryless [ 0.098444] On node 0, zone DMA: 1 pages in unavailable ranges [ 0.099866] On node 0, zone DMA: 97 
pages in unavailable ranges [ 0.104342] On node 1, zone Normal: 33 pages in unavailable ranges [ 0.126883] CPU topo: Allowing 4 present CPUs plus 0 hotplug CPUs ================================= Please note that this is a modified QEMU. /home/lizhijian/qemu/build-hmem/qemu-system-x86_64 -machine q35,accel=kvm,cxl=on,hmat=on \ -name guest-rdma-server -nographic -boot c \ -m size=6G,slots=2,maxmem=19922944k \ -hda /home/lizhijian/images/Fedora-rdma-server.qcow2 \ -object memory-backend-memfd,share=on,size=2G,id=m0 \ -object memory-backend-memfd,share=on,size=2G,id=m1 \ -numa node,nodeid=0,cpus=0-1,memdev=m0 \ -numa node,nodeid=1,cpus=2-3,memdev=m1 \ -smp 4,sockets=2,cores=2 \ -device pcie-root-port,id=pci-root,slot=8,bus=pcie.0,chassis=0 \ -device pxb-cxl,id=pxb-cxl-host-bridge,bus=pcie.0,bus_nr=0x35,hdm_for_passthrough=true \ -device cxl-rp,id=cxl-rp-hb-rp0,bus=pxb-cxl-host-bridge,chassis=0,slot=0,port=0 \ -device cxl-type3,bus=cxl-rp-hb-rp0,volatile-memdev=cxl-vmem0,id=cxl-vmem0,program-hdm-decoder=true \ -object memory-backend-file,id=cxl-vmem0,share=on,mem-path=/home/lizhijian/images/cxltest0.raw,size=2048M \ -M cxl-fmw.0.targets.0=pxb-cxl-host-bridge,cxl-fmw.0.size=2G,cxl-fmw.0.interleave-granularity=8k \ -nic bridge,br=virbr0,model=e1000,mac=52:54:00:c9:76:74 \ -bios /home/lizhijian/seabios/out/bios.bin \ -object memory-backend-memfd,share=on,size=1G,id=m2 \ -object memory-backend-memfd,share=on,size=1G,id=m3 \ -numa node,memdev=m2,nodeid=2 \ -numa node,memdev=m3,nodeid=3 \ -numa dist,src=0,dst=0,val=10 \ -numa dist,src=0,dst=1,val=21 \ -numa dist,src=0,dst=2,val=21 \ -numa dist,src=0,dst=3,val=21 \ -numa dist,src=1,dst=0,val=21 \ -numa dist,src=1,dst=1,val=10 \ -numa dist,src=1,dst=2,val=21 \ -numa dist,src=1,dst=3,val=21 \ -numa dist,src=2,dst=0,val=21 \ -numa dist,src=2,dst=1,val=21 \ -numa dist,src=2,dst=2,val=10 \ -numa dist,src=2,dst=3,val=21 \ -numa dist,src=3,dst=0,val=21 \ -numa dist,src=3,dst=1,val=21 \ -numa dist,src=3,dst=2,val=21 \ -numa 
dist,src=3,dst=3,val=10 \ -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=110 \ -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \ -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=240 \ -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \ -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,latency=340 \ -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \ -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,latency=440 \ -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M \ -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,latency=240 \ -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \ -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,latency=110 \ -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \ -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,latency=340 \ -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \ -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,latency=440 \ -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M > I see - > > [] BIOS-e820: [mem 0x0000024080000000-0x000004407fffffff] soft reserved > . > . > [] reserve setup_data: [mem 0x0000024080000000-0x000004407fffffff] soft reserved > . > . 
> [] ACPI: SRAT: Node 6 PXM 14 [mem 0x24080000000-0x4407fffffff] hotplug > > /proc/iomem - as expected > 24080000000-5f77fffffff : CXL Window 0 > 24080000000-4407fffffff : region0 > 24080000000-4407fffffff : dax0.0 > 24080000000-4407fffffff : System RAM (kmem) > > > I'm also seeing this message: > [] resource: Unaddressable device [mem 0x24080000000-0x4407fffffff] conflicts with [mem 0x24080000000-0x4407fffffff] > >> >> 2. Triggers dev_warn and dev_err: >> >> ``` >> [root@rdma-server ~]# journalctl -p err -p warning --dmesg >> ...snip... >> Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2 >> Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12 >> Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12 >> Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region >> Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16 > > I see the kmem dax messages also. It seems the kmem probe is going after > every range (except hotplug) in the SRAT, and failing. Yes, that's true, because current RFC removed the code that filters out the non-soft-reserverd resource. As a result, it will try to register dax/kmem for all of them while some of them has been marked as busy in the iomem_resource. >> - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, >> - IORES_DESC_SOFT_RESERVED); >> - if (rc != REGION_INTERSECTS) >> - return 0; This is another example on my real *CXL HOST*: Aug 19 17:59:05 kernel: device-mapper: core: CONFIG_IMA_DISABLE_HTABLE is disabled. Duplicate IMA measuremen> Aug 19 17:59:09 kernel: power_meter ACPI000D:00: Ignoring unsafe software power cap! 
Aug 19 17:59:09 kernel: kmem dax2.0: mapping0: 0x0-0x8fffffff could not reserve region Aug 19 17:59:09 kernel: kmem dax2.0: probe with driver kmem failed with error -16 Aug 19 17:59:09 kernel: kmem dax3.0: mapping0: 0x100000000-0x86fffffff could not reserve region Aug 19 17:59:09 kernel: kmem dax3.0: probe with driver kmem failed with error -16 Aug 19 17:59:09 kernel: kmem dax4.0: mapping0: 0x870000000-0x106fffffff could not reserve region Aug 19 17:59:09 kernel: kmem dax4.0: probe with driver kmem failed with error -16 Aug 19 17:59:19 kernel: nvme nvme0: using unchecked data buffer Aug 19 18:36:27 kernel: block nvme1n1: No UUID available providing old NGUID lizhijian@:~$ sudo grep -w -e 106fffffff -e 870000000 -e 8fffffff -e 100000000 /proc/iomem 6fffb000-8fffffff : Reserved 100000000-10000ffff : Reserved 106ccc0000-106fffffff : Reserved This issue can be resolved by re-introducing soft_reserved_region_intersects(...) I guess. > >> ``` >> >> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible. > > Haven't tested !CXL_REGION yet. > >> >> On failure: >> >> ``` >> 100000000-27ffffff : System RAM >> 5c0001128-5c00011b7 : port1 >> 5c0011128-5c00111b7 : port2 >> 5d0000000-6cffffff : CXL Window 0 >> 6d0000000-7cffffff : CXL Window 1 >> 7000000000-700000ffff : PCI Bus 0000:0c >> 7000000000-700000ffff : 0000:0c:00.0 >> 7000001080-70000010d7 : mem1 >> ``` >> >> On success: >> >> ``` >> 5d0000000-7cffffff : dax0.0 >> 5d0000000-7cffffff : System RAM (kmem) >> 5d0000000-6cffffff : CXL Window 0 >> 6d0000000-7cffffff : CXL Window 1 >> ``` >> >> In term of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this. 
>> >> ``` >> - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, >> - IORES_DESC_SOFT_RESERVED); >> - if (rc != REGION_INTERSECTS) >> - return 0; >> + /* TODO: insert "Soft Reserved" into iomem here */ >> ``` > > Above makes sense. I think the subroutine add_soft_reserved() in your previous patchset[1] is able to cover this TODO. > > I'll probably wait for an update from Smita to test again, but if you > or Smita have anything you want me to try out on my hardwware in the > meantime, let me know. > Here is my local fixup based on Dan's RFC, it can resolve issues 1 and 2. -- 8< -- commit e7ccd7a01e168e185971da66f4aa13eb451caeaf Author: Li Zhijian <lizhijian@fujitsu.com> Date: Fri Aug 20 11:07:15 2025 +0800 Fix probe-order TODO Signed-off-by: Li Zhijian <lizhijian@fujitsu.com> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 754115da86cc..965ffc622136 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -93,6 +93,26 @@ static void process_defer_work(struct work_struct *_work) walk_hmem_resources(&pdev->dev, handle_deferred_cxl); } +static int add_soft_reserved(resource_size_t start, resource_size_t len, + unsigned long flags) +{ + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + int rc; + + if (!res) + return -ENOMEM; + + *res = DEFINE_RES_NAMED_DESC(start, len, "Soft Reserved", + flags | IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + + rc = insert_resource(&iomem_resource, res); + if (rc) + kfree(res); + + return rc; +} + static int hmem_register_device(struct device *host, int target_nid, const struct resource *res) { @@ -102,6 +122,10 @@ static int hmem_register_device(struct device *host, int target_nid, long id; int rc; + if (soft_reserve_res_intersects(res->start, resource_size(res), + IORESOURCE_MEM, IORES_DESC_NONE) == REGION_DISJOINT) + return 0; + if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { @@ 
-119,7 +143,17 @@ static int hmem_register_device(struct device *host, int target_nid, } } - /* TODO: insert "Soft Reserved" into iomem here */ + /* + * This is a verified Soft Reserved region that CXL is not claiming (or + * is being overridden). Add it to the main iomem tree so it can be + * properly reserved by the DAX driver. + */ + rc = add_soft_reserved(res->start, res->end - res->start + 1, 0); + if (rc) { + dev_warn(host, "failed to insert soft-reserved resource %pr into iomem: %d\n", + res, rc); + return rc; + } id = memregion_alloc(GFP_KERNEL); if (id < 0) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 349f0d9aad22..eca5956c444b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1069,6 +1069,8 @@ enum { int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); +int soft_reserve_res_intersects(resource_size_t offset, size_t size, unsigned long flags, + unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); diff --git a/kernel/resource.c b/kernel/resource.c index b8eac6af2fad..a34b76cf690a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -461,6 +461,22 @@ int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags, arg, func); } EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc); + +static int __region_intersects(struct resource *parent, resource_size_t start, + size_t size, unsigned long flags, + unsigned long desc); +int soft_reserve_res_intersects(resource_size_t start, size_t size, unsigned long flags, + unsigned long desc) +{ + int ret; + + read_lock(&resource_lock); + ret = __region_intersects(&soft_reserve_resource, start, size, flags, desc); + read_unlock(&resource_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(soft_reserve_res_intersects); #endif /* [1] https://lore.kernel.org/linux-cxl/29312c0765224ae76862d59a17748c8188fb95f1.1692638817.git.alison.schofield@intel.com/ 
> -- Alison > > >> >> Regarding issue 3 (which exists in the current situation), this could be because it cannot ensure that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled. >> >> I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration into the upstream during the v6.18 merge window. >> Besides the current TODO, you also mentioned that this RFC PATCH must be further subdivided into several patches, so there remains significant work to be done. >> If my understanding is correct, you would be personally continuing to push forward this patch, right? >> >> >> Smita, >> >> Do you have any additional thoughts on this proposal from your side? >> >> >> Thanks >> Zhijian >> > snip >
On 8/20/2025 7:30 PM, Zhijian Li (Fujitsu) wrote: > > > On 21/08/2025 07:14, Alison Schofield wrote: >> On Tue, Aug 05, 2025 at 03:58:41AM +0000, Zhijian Li (Fujitsu) wrote: >>> Hi Dan and Smita, >>> >>> >>> On 24/07/2025 00:13, dan.j.williams@intel.com wrote: >>>> dan.j.williams@ wrote: >>>> [..] >>>>> If the goal is: "I want to give device-dax a point at which it can make >>>>> a go / no-go decision about whether the CXL subsystem has properly >>>>> assembled all CXL regions implied by Soft Reserved instersecting with >>>>> CXL Windows." Then that is something like the below, only lightly tested >>>>> and likely regresses the non-CXL case. >>>>> >>>>> -- 8< -- >>>>> From 48b25461eca050504cf5678afd7837307b2dd14f Mon Sep 17 00:00:00 2001 >>>>> From: Dan Williams <dan.j.williams@intel.com> >>>>> Date: Tue, 22 Jul 2025 16:11:08 -0700 >>>>> Subject: [RFC PATCH] dax/cxl: Defer Soft Reserved registration >>>> >>>> Likely needs this incremental change to prevent DEV_DAX_HMEM from being >>>> built-in when CXL is not. This still leaves the awkward scenario of CXL >>>> enabled, DEV_DAX_CXL disabled, and DEV_DAX_HMEM built-in. I believe that >>>> safely fails in devdax only / fallback mode, but something to >>>> investigate when respinning on top of this. >>>> >>> >>> Thank you for your RFC; I find your proposal remarkably compelling, as it adeptly addresses the issues I am currently facing. >>> >>> >>> To begin with, I still encountered several issues with your patch (considering the patch at the RFC stage, I think it is already quite commendable): >> >> Hi Zhijian, >> >> Like you, I tried this RFC out. It resolved the issue of soft reserved >> resources preventing teardown and replacement of a region in place. >> >> I looked at the issues you found, and have some questions comments >> included below. >> >>> >>> 1. Some resources described by SRAT are wrongly identified as System RAM (kmem), such as the following: 200000000-5bffffff. 
>>> >>> ``` >>> 200000000-5bffffff : dax6.0 >>> 200000000-5bffffff : System RAM (kmem) >>> 5c0001128-5c00011b7 : port1 >>> 5d0000000-64ffffff : CXL Window 0 >>> 5d0000000-64ffffff : region0 >>> 5d0000000-64ffffff : dax0.0 >>> 5d0000000-64ffffff : System RAM (kmem) >>> 680000000-e7ffffff : PCI Bus 0000:00 >>> >>> [root@rdma-server ~]# dmesg | grep -i -e soft -e hotplug >>> [ 0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan+ root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 console=ttyS0,115200n8 softlockup_panic=1 printk.devkmsg=on oops=panic sysrq_always_enabled panic_on_warn ignore_loglevel kasan.fault=panic >>> [ 0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved >>> [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064ffffff] soft reserved >>> [ 0.072114] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bffffff] hotplug >>> ``` >> >> Is that range also labelled as soft reserved? >> I ask, because I'm trying to draw a parallel between our test platforms. > > No, It's not a soft reserved range. This can simply simulate with QEMU with `maxmem=192G` option(see below full qemu command line). > In my environment, `0x200000000-0x5bffffff` is something like [DRAM_END + 1, DRAM_END + maxmem - TOTAL_INSTALLED_DRAM_SIZE] > DRAM_END: end of the installed DRAM in Node 3 > > This range is reserved for the DRAM hot-add. In my case, it will be registered into 'HMEM devices' by calling hmem_register_resource in HMAT(drivers/acpi/numa/hmat.c) > > 893 static void hmat_register_target_devices(struct memory_target *target) > 894 { > 895 struct resource *res; > 896 > 897 /* > 898 * Do not bother creating devices if no driver is available to > 899 * consume them. 
> 900 */ > 901 if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) > 902 return; > 903 > 904 for (res = target->memregions.child; res; res = res->sibling) { > 905 int target_nid = pxm_to_node(target->memory_pxm); > 906 > 907 hmem_register_resource(target_nid, res); > 908 } > 909 } > > > $ dmesg | grep -i -e soft -e hotplug -e Node > [ 0.000000] Command line: BOOT_IMAGE=(hd0,msdos1)/boot/vmlinuz-6.16.0-rc4-lizhijian-Dan-00026-g1473b9914846-dirty root=UUID=386769a3-cfa5-47c8-8797-d5ec58c9cb6c ro earlyprintk=ttyS0 no_timer_check net.ifnames=0 console=tty1 conc > [ 0.000000] BIOS-e820: [mem 0x0000000180000000-0x00000001ffffffff] soft reserved > [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x000000064fffffff] soft reserved > [ 0.066332] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0x0009ffff] > [ 0.067665] ACPI: SRAT: Node 0 PXM 0 [mem 0x00100000-0x7fffffff] > [ 0.068995] ACPI: SRAT: Node 1 PXM 1 [mem 0x100000000-0x17fffffff] > [ 0.070359] ACPI: SRAT: Node 2 PXM 2 [mem 0x180000000-0x1bfffffff] > [ 0.071723] ACPI: SRAT: Node 3 PXM 3 [mem 0x1c0000000-0x1ffffffff] > [ 0.073085] ACPI: SRAT: Node 3 PXM 3 [mem 0x200000000-0x5bfffffff] hotplug > [ 0.075689] NUMA: Node 0 [mem 0x00001000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00001000-0x7fffffff] > [ 0.077849] NODE_DATA(0) allocated [mem 0x7ffb3e00-0x7ffdefff] > [ 0.079149] NODE_DATA(1) allocated [mem 0x17ffd1e00-0x17fffcfff] > [ 0.086077] Movable zone start for each node > [ 0.087054] Early memory node ranges > [ 0.087890] node 0: [mem 0x0000000000001000-0x000000000009efff] > [ 0.089264] node 0: [mem 0x0000000000100000-0x000000007ffdefff] > [ 0.090631] node 1: [mem 0x0000000100000000-0x000000017fffffff] > [ 0.092003] Initmem setup node 0 [mem 0x0000000000001000-0x000000007ffdefff] > [ 0.093532] Initmem setup node 1 [mem 0x0000000100000000-0x000000017fffffff] > [ 0.095164] Initmem setup node 2 as memoryless > [ 0.096281] Initmem setup node 3 as memoryless > [ 0.097397] Initmem setup node 4 as memoryless > [ 0.098444] On node 0, 
zone DMA: 1 pages in unavailable ranges > [ 0.099866] On node 0, zone DMA: 97 pages in unavailable ranges > [ 0.104342] On node 1, zone Normal: 33 pages in unavailable ranges > [ 0.126883] CPU topo: Allowing 4 present CPUs plus 0 hotplug CPUs > > ================================= > > Please note that this is a modified QEMU. > > /home/lizhijian/qemu/build-hmem/qemu-system-x86_64 -machine q35,accel=kvm,cxl=on,hmat=on \ > -name guest-rdma-server -nographic -boot c \ > -m size=6G,slots=2,maxmem=19922944k \ > -hda /home/lizhijian/images/Fedora-rdma-server.qcow2 \ > -object memory-backend-memfd,share=on,size=2G,id=m0 \ > -object memory-backend-memfd,share=on,size=2G,id=m1 \ > -numa node,nodeid=0,cpus=0-1,memdev=m0 \ > -numa node,nodeid=1,cpus=2-3,memdev=m1 \ > -smp 4,sockets=2,cores=2 \ > -device pcie-root-port,id=pci-root,slot=8,bus=pcie.0,chassis=0 \ > -device pxb-cxl,id=pxb-cxl-host-bridge,bus=pcie.0,bus_nr=0x35,hdm_for_passthrough=true \ > -device cxl-rp,id=cxl-rp-hb-rp0,bus=pxb-cxl-host-bridge,chassis=0,slot=0,port=0 \ > -device cxl-type3,bus=cxl-rp-hb-rp0,volatile-memdev=cxl-vmem0,id=cxl-vmem0,program-hdm-decoder=true \ > -object memory-backend-file,id=cxl-vmem0,share=on,mem-path=/home/lizhijian/images/cxltest0.raw,size=2048M \ > -M cxl-fmw.0.targets.0=pxb-cxl-host-bridge,cxl-fmw.0.size=2G,cxl-fmw.0.interleave-granularity=8k \ > -nic bridge,br=virbr0,model=e1000,mac=52:54:00:c9:76:74 \ > -bios /home/lizhijian/seabios/out/bios.bin \ > -object memory-backend-memfd,share=on,size=1G,id=m2 \ > -object memory-backend-memfd,share=on,size=1G,id=m3 \ > -numa node,memdev=m2,nodeid=2 \ > -numa node,memdev=m3,nodeid=3 \ > -numa dist,src=0,dst=0,val=10 \ > -numa dist,src=0,dst=1,val=21 \ > -numa dist,src=0,dst=2,val=21 \ > -numa dist,src=0,dst=3,val=21 \ > -numa dist,src=1,dst=0,val=21 \ > -numa dist,src=1,dst=1,val=10 \ > -numa dist,src=1,dst=2,val=21 \ > -numa dist,src=1,dst=3,val=21 \ > -numa dist,src=2,dst=0,val=21 \ > -numa dist,src=2,dst=1,val=21 \ > -numa 
dist,src=2,dst=2,val=10 \ > -numa dist,src=2,dst=3,val=21 \ > -numa dist,src=3,dst=0,val=21 \ > -numa dist,src=3,dst=1,val=21 \ > -numa dist,src=3,dst=2,val=21 \ > -numa dist,src=3,dst=3,val=10 \ > -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=110 \ > -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \ > -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=240 \ > -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \ > -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,latency=340 \ > -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \ > -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,latency=440 \ > -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M \ > -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,latency=240 \ > -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=40000M \ > -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,latency=110 \ > -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=20000M \ > -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,latency=340 \ > -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,bandwidth=60000M \ > -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,latency=440 \ > -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,bandwidth=80000M > > > >> I see - >> >> [] BIOS-e820: [mem 0x0000024080000000-0x000004407fffffff] soft reserved >> . >> . >> [] reserve setup_data: [mem 0x0000024080000000-0x000004407fffffff] soft reserved >> . >> . 
>> [] ACPI: SRAT: Node 6 PXM 14 [mem 0x24080000000-0x4407fffffff] hotplug >> >> /proc/iomem - as expected >> 24080000000-5f77fffffff : CXL Window 0 >> 24080000000-4407fffffff : region0 >> 24080000000-4407fffffff : dax0.0 >> 24080000000-4407fffffff : System RAM (kmem) >> >> >> I'm also seeing this message: >> [] resource: Unaddressable device [mem 0x24080000000-0x4407fffffff] conflicts with [mem 0x24080000000-0x4407fffffff] >> >>> >>> 2. Triggers dev_warn and dev_err: >>> >>> ``` >>> [root@rdma-server ~]# journalctl -p err -p warning --dmesg >>> ...snip... >>> Jul 29 13:17:36 rdma-server kernel: cxl root0: Extended linear cache calculation failed rc:-2 >>> Jul 29 13:17:36 rdma-server kernel: hmem hmem.1: probe with driver hmem failed with error -12 >>> Jul 29 13:17:36 rdma-server kernel: hmem hmem.2: probe with driver hmem failed with error -12 >>> Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: mapping0: 0x100000000-0x17ffffff could not reserve region >>> Jul 29 13:17:36 rdma-server kernel: kmem dax3.0: probe with driver kmem failed with error -16 >> >> I see the kmem dax messages also. It seems the kmem probe is going after >> every range (except hotplug) in the SRAT, and failing. > > Yes, that's true, because current RFC removed the code that filters out the non-soft-reserverd resource. As a result, it will try to register dax/kmem for all of them while some of them has been marked as busy in the iomem_resource. > >>> - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, >>> - IORES_DESC_SOFT_RESERVED); >>> - if (rc != REGION_INTERSECTS) >>> - return 0; > > > This is another example on my real *CXL HOST*: > Aug 19 17:59:05 kernel: device-mapper: core: CONFIG_IMA_DISABLE_HTABLE is disabled. Duplicate IMA measuremen> > Aug 19 17:59:09 kernel: power_meter ACPI000D:00: Ignoring unsafe software power cap! 
> Aug 19 17:59:09 kernel: kmem dax2.0: mapping0: 0x0-0x8fffffff could not reserve region > Aug 19 17:59:09 kernel: kmem dax2.0: probe with driver kmem failed with error -16 > Aug 19 17:59:09 kernel: kmem dax3.0: mapping0: 0x100000000-0x86fffffff could not reserve region > Aug 19 17:59:09 kernel: kmem dax3.0: probe with driver kmem failed with error -16 > Aug 19 17:59:09 kernel: kmem dax4.0: mapping0: 0x870000000-0x106fffffff could not reserve region > Aug 19 17:59:09 kernel: kmem dax4.0: probe with driver kmem failed with error -16 > Aug 19 17:59:19 kernel: nvme nvme0: using unchecked data buffer > Aug 19 18:36:27 kernel: block nvme1n1: No UUID available providing old NGUID > lizhijian@:~$ sudo grep -w -e 106fffffff -e 870000000 -e 8fffffff -e 100000000 /proc/iomem > 6fffb000-8fffffff : Reserved > 100000000-10000ffff : Reserved > 106ccc0000-106fffffff : Reserved > > > This issue can be resolved by re-introducing soft_reserved_region_intersects(...) I guess. > > > >> >>> ``` >>> >>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible. >> Haven't tested !CXL_REGION yet. When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled. So dax_hmem should handle it. I was able to fallback to dax_hmem. But let me know if I'm missing something. config DEV_DAX_CXL tristate "CXL DAX: direct access to CXL RAM regions" depends on CXL_BUS && CXL_REGION && DEV_DAX .. 
>> >>> >>> On failure: >>> >>> ``` >>> 100000000-27ffffff : System RAM >>> 5c0001128-5c00011b7 : port1 >>> 5c0011128-5c00111b7 : port2 >>> 5d0000000-6cffffff : CXL Window 0 >>> 6d0000000-7cffffff : CXL Window 1 >>> 7000000000-700000ffff : PCI Bus 0000:0c >>> 7000000000-700000ffff : 0000:0c:00.0 >>> 7000001080-70000010d7 : mem1 >>> ``` >>> >>> On success: >>> >>> ``` >>> 5d0000000-7cffffff : dax0.0 >>> 5d0000000-7cffffff : System RAM (kmem) >>> 5d0000000-6cffffff : CXL Window 0 >>> 6d0000000-7cffffff : CXL Window 1 >>> ``` >>> >>> In term of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this. >>> >>> ``` >>> - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, >>> - IORES_DESC_SOFT_RESERVED); >>> - if (rc != REGION_INTERSECTS) >>> - return 0; >>> + /* TODO: insert "Soft Reserved" into iomem here */ >>> ``` >> >> Above makes sense. > > I think the subroutine add_soft_reserved() in your previous patchset[1] are able to cover this TODO > >> >> I'll probably wait for an update from Smita to test again, but if you >> or Smita have anything you want me to try out on my hardwware in the >> meantime, let me know. >> > > Here is my local fixup based on Dan's RFC, it can resovle issue 1 and 2. I almost have the same approach :) Sorry, I missed adding your "Signed-off-by".. Will include for next revision.. 
> > > -- 8< -- > commit e7ccd7a01e168e185971da66f4aa13eb451caeaf > Author: Li Zhijian <lizhijian@fujitsu.com> > Date: Fri Aug 20 11:07:15 2025 +0800 > > Fix probe-order TODO > > Signed-off-by: Li Zhijian <lizhijian@fujitsu.com> > > diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c > index 754115da86cc..965ffc622136 100644 > --- a/drivers/dax/hmem/hmem.c > +++ b/drivers/dax/hmem/hmem.c > @@ -93,6 +93,26 @@ static void process_defer_work(struct work_struct *_work) > walk_hmem_resources(&pdev->dev, handle_deferred_cxl); > } > > +static int add_soft_reserved(resource_size_t start, resource_size_t len, > + unsigned long flags) > +{ > + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); > + int rc; > + > + if (!res) > + return -ENOMEM; > + > + *res = DEFINE_RES_NAMED_DESC(start, len, "Soft Reserved", > + flags | IORESOURCE_MEM, > + IORES_DESC_SOFT_RESERVED); > + > + rc = insert_resource(&iomem_resource, res); > + if (rc) > + kfree(res); > + > + return rc; > +} > + > static int hmem_register_device(struct device *host, int target_nid, > const struct resource *res) > { > @@ -102,6 +122,10 @@ static int hmem_register_device(struct device *host, int target_nid, > long id; > int rc; > > + if (soft_reserve_res_intersects(res->start, resource_size(res), > + IORESOURCE_MEM, IORES_DESC_NONE) == REGION_DISJOINT) > + return 0; > + Should also handle CONFIG_EFI_SOFT_RESERVE not enabled case.. Thanks Smita > if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && > region_intersects(res->start, resource_size(res), IORESOURCE_MEM, > IORES_DESC_CXL) != REGION_DISJOINT) { > @@ -119,7 +143,17 @@ static int hmem_register_device(struct device *host, int target_nid, > } > } > > - /* TODO: insert "Soft Reserved" into iomem here */ > + /* > + * This is a verified Soft Reserved region that CXL is not claiming (or > + * is being overridden). Add it to the main iomem tree so it can be > + * properly reserved by the DAX driver. 
> + */ > + rc = add_soft_reserved(res->start, res->end - res->start + 1, 0); > + if (rc) { > + dev_warn(host, "failed to insert soft-reserved resource %pr into iomem: %d\n", > + res, rc); > + return rc; > + } > > id = memregion_alloc(GFP_KERNEL); > if (id < 0) { > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 349f0d9aad22..eca5956c444b 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1069,6 +1069,8 @@ enum { > int region_intersects(resource_size_t offset, size_t size, unsigned long flags, > unsigned long desc); > > +int soft_reserve_res_intersects(resource_size_t offset, size_t size, unsigned long flags, > + unsigned long desc); > /* Support for virtually mapped pages */ > struct page *vmalloc_to_page(const void *addr); > unsigned long vmalloc_to_pfn(const void *addr); > diff --git a/kernel/resource.c b/kernel/resource.c > index b8eac6af2fad..a34b76cf690a 100644 > --- a/kernel/resource.c > +++ b/kernel/resource.c > @@ -461,6 +461,22 @@ int walk_soft_reserve_res_desc(unsigned long desc, unsigned long flags, > arg, func); > } > EXPORT_SYMBOL_GPL(walk_soft_reserve_res_desc); > + > +static int __region_intersects(struct resource *parent, resource_size_t start, > + size_t size, unsigned long flags, > + unsigned long desc); > +int soft_reserve_res_intersects(resource_size_t start, size_t size, unsigned long flags, > + unsigned long desc) > +{ > + int ret; > + > + read_lock(&resource_lock); > + ret = __region_intersects(&soft_reserve_resource, start, size, flags, desc); > + read_unlock(&resource_lock); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(soft_reserve_res_intersects); > #endif > > /* > > > > [1] https://lore.kernel.org/linux-cxl/29312c0765224ae76862d59a17748c8188fb95f1.1692638817.git.alison.schofield@intel.com/ > > >> -- Alison >> >> >>> >>> Regarding issue 3 (which exists in the current situation), this could be because it cannot ensure that dax_hmem_probe() executes prior to cxl_acpi_probe() when CXL_REGION is disabled. 
>>> >>> I am pleased that you have pushed the patch to the cxl/for-6.18/cxl-probe-order branch, and I'm looking forward to its integration into the upstream during the v6.18 merge window. >>> Besides the current TODO, you also mentioned that this RFC PATCH must be further subdivided into several patches, so there remains significant work to be done. >>> If my understanding is correct, you would be personally continuing to push forward this patch, right? >>> >>> >>> Smita, >>> >>> Do you have any additional thoughts on this proposal from your side? >>> >>> >>> Thanks >>> Zhijian >>> >> snip >>
On 22/08/2025 11:56, Koralahalli Channabasappa, Smita wrote: >> >>> >>>> ``` >>>> >>>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible. >>> >>> Haven't tested !CXL_REGION yet. > > When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled. So dax_hmem should handle it. Yes, falling back to dax_hmem/kmem is the result we expect. I haven't figured out the root cause of the issue yet, but I can tell you that in my QEMU environment, there is currently a certain probability that it cannot fall back to dax_hmem/kmem. Upon its failure, I observed the following warnings and errors (with my local fixup kernel). [ 12.203254] kmem dax0.0: mapping0: 0x5d0000000-0x7cfffffff could not reserve region [ 12.203437] kmem dax0.0: probe with driver kmem failed with error -16 > I was able to fallback to dax_hmem. But let me know if I'm missing something. > > config DEV_DAX_CXL > tristate "CXL DAX: direct access to CXL RAM regions" > depends on CXL_BUS && CXL_REGION && DEV_DAX > .. > >>> >>>> On failure: >>>> ``` >>>> 100000000-27ffffff : System RAM >>>> 5c0001128-5c00011b7 : port1 >>>> 5c0011128-5c00111b7 : port2 >>>> 5d0000000-6cffffff : CXL Window 0 >>>> 6d0000000-7cffffff : CXL Window 1 >>>> 7000000000-700000ffff : PCI Bus 0000:0c >>>> 7000000000-700000ffff : 0000:0c:00.0 >>>> 7000001080-70000010d7 : mem1 >>>> ``` >>>> >>>> On success: >>>> ``` >>>> 5d0000000-7cffffff : dax0.0 >>>> 5d0000000-7cffffff : System RAM (kmem) >>>> 5d0000000-6cffffff : CXL Window 0 >>>> 6d0000000-7cffffff : CXL Window 1 >>>> ``` >>>> >>>> In term of issues 1 and 2, this arises because hmem_register_device() attempts to register resources of all "HMEM devices," whereas we only need to register the IORES_DESC_SOFT_RESERVED resources. I believe resolving the current TODO will address this. 
>>>> >>>> ``` >>>> - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, >>>> - IORES_DESC_SOFT_RESERVED); >>>> - if (rc != REGION_INTERSECTS) >>>> - return 0; >>>> + /* TODO: insert "Soft Reserved" into iomem here */ >>>> ``` >>> >>> Above makes sense. >> >> I think the subroutine add_soft_reserved() in your previous patchset[1] are able to cover this TODO >> >>> >>> I'll probably wait for an update from Smita to test again, but if you >>> or Smita have anything you want me to try out on my hardwware in the >>> meantime, let me know. >>> >> >> Here is my local fixup based on Dan's RFC, it can resovle issue 1 and 2. > > I almost have the same approach 🙂 Sorry, I missed adding your > "Signed-off-by".. Will include for next revision.. Never mind. Glad to see your V6, I will test and take a look at soon > >> >> >> -- 8< -- >> commit e7ccd7a01e168e185971da66f4aa13eb451caeaf >> Author: Li Zhijian <lizhijian@fujitsu.com> >> Date: Fri Aug 20 11:07:15 2025 +0800 >> >> Fix probe-order TODO >> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com> >> >> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c >> index 754115da86cc..965ffc622136 100644 >> --- a/drivers/dax/hmem/hmem.c >> +++ b/drivers/dax/hmem/hmem.c >> @@ -93,6 +93,26 @@ static void process_defer_work(struct work_struct *_work) >> walk_hmem_resources(&pdev->dev, handle_deferred_cxl); >> } >> +static int add_soft_reserved(resource_size_t start, resource_size_t len, >> + unsigned long flags) >> +{ >> + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); >> + int rc; >> + >> + if (!res) >> + return -ENOMEM; >> + >> + *res = DEFINE_RES_NAMED_DESC(start, len, "Soft Reserved", >> + flags | IORESOURCE_MEM, >> + IORES_DESC_SOFT_RESERVED); >> + >> + rc = insert_resource(&iomem_resource, res); >> + if (rc) >> + kfree(res); >> + >> + return rc; >> +} >> + >> static int hmem_register_device(struct device *host, int target_nid, >> const struct resource *res) >> { >> @@ -102,6 +122,10 @@ static 
int hmem_register_device(struct device *host, int target_nid, >> long id; >> int rc; >> > > + if (soft_reserve_res_intersects(res->start, resource_size(res), >> + IORESOURCE_MEM, IORES_DESC_NONE) == REGION_DISJOINT) >> + return 0; >> + > > Should also handle CONFIG_EFI_SOFT_RESERVE not enabled case.. I think it’s unnecessary. For !CONFIG_EFI_SOFT_RESERVE, it will return directly because soft_reserve_res_intersects() will always return REGION_DISJOINT. Thanks Zhijian > > > Thanks > Smita
All, I have confirmed that in the !CXL_REGION configuration, the same environment may fail to fall back to hmem.(Your new patch cannot resolve this issue) In my environment: - There are two CXL memory devices corresponding to: ``` 5d0000000-6cffffff : CXL Window 0 6d0000000-7cffffff : CXL Window 1 ``` - E820 table contains a 'soft reserved' entry: ``` [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x00000007cfffffff] soft reserved ``` However, since my ACPI SRAT doesn't describe the CXL memory devices (the point), `acpi/hmat.c` won't allocate memory targets for them. This prevents the call chain: ```c hmat_register_target_devices() // for each SRAT-described target -> hmem_register_resource() -> insert entry into "HMEM devices" resource ``` Therefore, for successful fallback to hmem in this environment: `dax_hmem.ko` and `kmem.ko` must request resources BEFORE `cxl_acpi.ko` inserts 'CXL Window X' However the kernel cannot guarantee this initialization order. When cxl_acpi runs before dax_kmem/kmem: ``` (built-in) CXL_REGION=n driver/dax/hmem/device.c cxl_acpi.ko dax_hmem.ko kmem.ko (1) Add entry '15d0000000-7cfffffff' (2) Traverse "HMEM devices" Insert to iomem: 5d0000000-7cffffff : Soft Reserved (3) Insert CXL Window 0/1 /proc/iomem shows: 5d0000000-7cffffff : Soft Reserved 5d0000000-6cffffff : CXL Window 0 6d0000000-7cffffff : CXL Window 1 (4) Create dax device (5) request_mem_region() fails for 5d0000000-7cffffff Reason: Children of 'Soft Reserved' (CXL Windows 0/1) don't cover full range ``` --------------------- In my another environment where ACPI SRAT has separate entries per CXL device: 1. `acpi/hmat.c` inserts two entries into "HMEM devices": - 5d0000000-6cffffff - 6d0000000-7cffffff 2. 
Regardless of module order, dax/kmem requests per-device resources, resulting in: ``` 5d0000000-7cffffff : Soft Reserved 5d0000000-6cffffff : CXL Window 0 5d0000000-6cffffff : dax0.0 5d0000000-6cffffff : System RAM (kmem) 6d0000000-7cffffff : CXL Window 1 6d0000000-7cffffff : dax1.0 6d0000000-7cffffff : System RAM (kmem) ``` Thanks, Zhijian On 25/08/2025 15:50, Li Zhijian wrote: > > > On 22/08/2025 11:56, Koralahalli Channabasappa, Smita wrote: >>> >>>> >>>>> ``` >>>>> >>>>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible. >>>> >>>> Haven't tested !CXL_REGION yet. >> >> When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled. So dax_hmem should handle it. > > Yes, falling back to dax_hmem/kmem is the result we expect. > I haven't figured out the root cause of the issue yet, but I can tell you that in my QEMU environment, > there is currently a certain probability that it cannot fall back to dax_hmem/kmem. > > Upon its failure, I observed the following warnings and errors (with my local fixup kernel). > [ 12.203254] kmem dax0.0: mapping0: 0x5d0000000-0x7cfffffff could not reserve region > [ 12.203437] kmem dax0.0: probe with driver kmem failed with error -16 > > > >> I was able to fallback to dax_hmem. But let me know if I'm missing something. >> >> config DEV_DAX_CXL >> tristate "CXL DAX: direct access to CXL RAM regions" >> depends on CXL_BUS && CXL_REGION && DEV_DAX >> .. 
>> >>>> >>>>> On failure: >>>>> ``` >>>>> 100000000-27ffffff : System RAM >>>>> 5c0001128-5c00011b7 : port1 >>>>> 5c0011128-5c00111b7 : port2 >>>>> 5d0000000-6cffffff : CXL Window 0 >>>>> 6d0000000-7cffffff : CXL Window 1 >>>>> 7000000000-700000ffff : PCI Bus 0000:0c >>>>> 7000000000-700000ffff : 0000:0c:00.0 >>>>> 7000001080-70000010d7 : mem1 >>>>> ``` >>>>> >>>>> On success: >>>>> ``` >>>>> 5d0000000-7cffffff : dax0.0 >>>>> 5d0000000-7cffffff : System RAM (kmem) >>>>> 5d0000000-6cffffff : CXL Window 0 >>>>> 6d0000000-7cffffff : CXL Window 1 >>>>> ```
Hi Zhijian, On 8/26/2025 11:30 PM, Zhijian Li (Fujitsu) wrote: > All, > > > I have confirmed that in the !CXL_REGION configuration, the same environment may fail to fall back to hmem.(Your new patch cannot resolve this issue) > > In my environment: > - There are two CXL memory devices corresponding to: > ``` > 5d0000000-6cffffff : CXL Window 0 > 6d0000000-7cffffff : CXL Window 1 > ``` > - E820 table contains a 'soft reserved' entry: > ``` > [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x00000007cfffffff] soft reserved > ``` > > However, since my ACPI SRAT doesn't describe the CXL memory devices (the point), `acpi/hmat.c` won't allocate memory targets for them. This prevents the call chain: > ```c > hmat_register_target_devices() // for each SRAT-described target > -> hmem_register_resource() > -> insert entry into "HMEM devices" resource > ``` > > Therefore, for successful fallback to hmem in this environment: `dax_hmem.ko` and `kmem.ko` must request resources BEFORE `cxl_acpi.ko` inserts 'CXL Window X' > > However the kernel cannot guarantee this initialization order. > > When cxl_acpi runs before dax_kmem/kmem: > ``` > (built-in) CXL_REGION=n > driver/dax/hmem/device.c cxl_acpi.ko dax_hmem.ko kmem.ko > > (1) Add entry '15d0000000-7cfffffff' > (2) Traverse "HMEM devices" > Insert to iomem: > 5d0000000-7cffffff : Soft Reserved > > (3) Insert CXL Window 0/1 > /proc/iomem shows: > 5d0000000-7cffffff : Soft Reserved > 5d0000000-6cffffff : CXL Window 0 > 6d0000000-7cffffff : CXL Window 1 > > (4) Create dax device > (5) request_mem_region() fails > for 5d0000000-7cffffff > Reason: Children of 'Soft Reserved' > (CXL Windows 0/1) don't cover full range > ``` > Thanks for confirming the failure point. 
I was thinking of two possible ways forward here, and I would like to get feedback from others: [1] Teach dax_hmem to split when the parent claim fails: If __request_region() fails for the top-level Soft Reserved range because IORES_DESC_CXL children already exist, dax_hmem could iterate those windows and register each one individually. The downside is that it adds some complexity and feels a bit like papering over the fact that CXL should eventually own all of this memory. As Dan mentioned, the long-term plan is for Linux to not need the soft-reserve fallback at all, and simply ignore Soft Reserve for CXL Windows because the CXL subsystem will handle it. [2] Always unconditionally load CXL early.. Call request_module("cxl_acpi"); request_module("cxl_pci"); from dax_hmem_init() (without the IS_ENABLED(CONFIG_DEV_DAX_CXL) guard). If those are y/m, they’ll be present; if n, it’s a no-op. Then in hmem_register_device() drop the IS_ENABLED(CONFIG_DEV_DAX_CXL) gate and do: if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) !=REGION_DISJOINT) /* defer to CXL */; and defer to CXL if windows are present. This makes Soft Reserved unavailable once CXL Windows have been discovered, even if CXL_REGION is disabled. That aligns better with the idea that “CXL should win” whenever a window is visible (This also needs to be considered alongside patch 6/6 in my series.) With CXL_REGION=n there would be no devdax and no kmem for that range; proc/iomem would show only the windows something like below 850000000-284fffffff : CXL Window 0 2850000000-484fffffff : CXL Window 1 4850000000-684fffffff : CXL Window 2 That means the memory is left unclaimed/unavailable.. (no System RAM, no /dev/dax). Is that acceptable when CXL_REGION is disabled? Thanks Smita > --------------------- > In my another environment where ACPI SRAT has separate entries per CXL device: > 1. 
`acpi/hmat.c` inserts two entries into "HMEM devices": > - 5d0000000-6cffffff > - 6d0000000-7cffffff > > 2. Regardless of module order, dax/kmem requests per-device resources, resulting in: > ``` > 5d0000000-7cffffff : Soft Reserved > 5d0000000-6cffffff : CXL Window 0 > 5d0000000-6cffffff : dax0.0 > 5d0000000-6cffffff : System RAM (kmem) > 6d0000000-7cffffff : CXL Window 1 > 6d0000000-7cffffff : dax1.0 > 6d0000000-7cffffff : System RAM (kmem) > ``` > > Thanks, > Zhijian > > > On 25/08/2025 15:50, Li Zhijian wrote: >> >> >> On 22/08/2025 11:56, Koralahalli Channabasappa, Smita wrote: >>>> >>>>> >>>>>> ``` >>>>>> >>>>>> 3. When CXL_REGION is disabled, there is a failure to fallback to dax_hmem, in which case only CXL Window X is visible. >>>>> >>>>> Haven't tested !CXL_REGION yet. >>> >>> When CXL_REGION is disabled, DEV_DAX_CXL will also be disabled. So dax_hmem should handle it. >> >> Yes, falling back to dax_hmem/kmem is the result we expect. >> I haven't figured out the root cause of the issue yet, but I can tell you that in my QEMU environment, >> there is currently a certain probability that it cannot fall back to dax_hmem/kmem. >> >> Upon its failure, I observed the following warnings and errors (with my local fixup kernel). >> [ 12.203254] kmem dax0.0: mapping0: 0x5d0000000-0x7cfffffff could not reserve region >> [ 12.203437] kmem dax0.0: probe with driver kmem failed with error -16 >> >> >> >>> I was able to fallback to dax_hmem. But let me know if I'm missing something. >>> >>> config DEV_DAX_CXL >>> tristate "CXL DAX: direct access to CXL RAM regions" >>> depends on CXL_BUS && CXL_REGION && DEV_DAX >>> .. 
>>> >>>>> >>>>>> On failure: >>>>>> ``` >>>>>> 100000000-27ffffff : System RAM >>>>>> 5c0001128-5c00011b7 : port1 >>>>>> 5c0011128-5c00111b7 : port2 >>>>>> 5d0000000-6cffffff : CXL Window 0 >>>>>> 6d0000000-7cffffff : CXL Window 1 >>>>>> 7000000000-700000ffff : PCI Bus 0000:0c >>>>>> 7000000000-700000ffff : 0000:0c:00.0 >>>>>> 7000001080-70000010d7 : mem1 >>>>>> ``` >>>>>> >>>>>> On success: >>>>>> ``` >>>>>> 5d0000000-7cffffff : dax0.0 >>>>>> 5d0000000-7cffffff : System RAM (kmem) >>>>>> 5d0000000-6cffffff : CXL Window 0 >>>>>> 6d0000000-7cffffff : CXL Window 1 >>>>>> ```
On 29/08/2025 07:21, Koralahalli Channabasappa, Smita wrote: > Hi Zhijian, > > On 8/26/2025 11:30 PM, Zhijian Li (Fujitsu) wrote: >> All, >> >> >> I have confirmed that in the !CXL_REGION configuration, the same environment may fail to fall back to hmem.(Your new patch cannot resolve this issue) >> >> In my environment: >> - There are two CXL memory devices corresponding to: >> ``` >> 5d0000000-6cffffff : CXL Window 0 >> 6d0000000-7cffffff : CXL Window 1 >> ``` >> - E820 table contains a 'soft reserved' entry: >> ``` >> [ 0.000000] BIOS-e820: [mem 0x00000005d0000000-0x00000007cfffffff] soft reserved >> ``` >> >> However, since my ACPI SRAT doesn't describe the CXL memory devices (the point), `acpi/hmat.c` won't allocate memory targets for them. This prevents the call chain: >> ```c >> hmat_register_target_devices() // for each SRAT-described target >> -> hmem_register_resource() >> -> insert entry into "HMEM devices" resource >> ``` >> >> Therefore, for successful fallback to hmem in this environment: `dax_hmem.ko` and `kmem.ko` must request resources BEFORE `cxl_acpi.ko` inserts 'CXL Window X' >> >> However the kernel cannot guarantee this initialization order. >> >> When cxl_acpi runs before dax_kmem/kmem: >> ``` >> (built-in) CXL_REGION=n >> driver/dax/hmem/device.c cxl_acpi.ko dax_hmem.ko kmem.ko >> >> (1) Add entry '15d0000000-7cfffffff' >> (2) Traverse "HMEM devices" >> Insert to iomem: >> 5d0000000-7cffffff : Soft Reserved >> >> (3) Insert CXL Window 0/1 >> /proc/iomem shows: >> 5d0000000-7cffffff : Soft Reserved >> 5d0000000-6cffffff : CXL Window 0 >> 6d0000000-7cffffff : CXL Window 1 >> >> (4) Create dax device >> (5) request_mem_region() fails >> for 5d0000000-7cffffff >> Reason: Children of 'Soft Reserved' >> (CXL Windows 0/1) don't cover full range >> ``` >> > > Thanks for confirming the failure point. 
I was thinking of two possible ways forward here, and I would like to get feedback from others: > > [1] Teach dax_hmem to split when the parent claim fails: > If __request_region() fails for the top-level Soft Reserved range because IORES_DESC_CXL children already exist, dax_hmem could iterate those windows and register each one individually. The downside is that it adds some complexity and feels a bit like papering over the fact that CXL should eventually own all of this memory. I examined below change to ensure kmem runs first, it seemed to work. static int __init cxl_acpi_init(void) { + if (!IS_ENABLED(CONFIG_DEV_DAX_CXL) && IS_ENABLED(CONFIG_DEV_DAX_KMEM)) { + /* fall back to dax_hmem,kmem */ + request_module("kmem"); + } return platform_driver_register(&cxl_acpi_driver); } > As Dan mentioned, the long-term plan is for Linux to not need the soft-reserve fallback at all, and simply ignore Soft Reserve for CXL Windows because the CXL subsystem will handle it. The current CXL_REGION kconfig states: Otherwise, platform-firmware managed CXL is enabled by being placed in the system address map and does not need a driver. I think this implies that a fallback to dax_hmem/kmem is still required for such cases. Of course, I personally agree with this 'long-term plan'. > > [2] Always unconditionally load CXL early.. > Call request_module("cxl_acpi"); request_module("cxl_pci"); from dax_hmem_init() (without the IS_ENABLED(CONFIG_DEV_DAX_CXL) guard). If those are y/m, they’ll be present; if n, it’s a no-op. Then in hmem_register_device() drop the IS_ENABLED(CONFIG_DEV_DAX_CXL) gate and do: > > if (region_intersects(res->start, resource_size(res), > IORESOURCE_MEM, IORES_DESC_CXL) !=REGION_DISJOINT) > /* defer to CXL */; > > and defer to CXL if windows are present. This makes Soft Reserved unavailable once CXL Windows have been discovered, even if CXL_REGION is disabled. 
That aligns better with the idea that “CXL should win” whenever a window is visible (This also needs to be considered alongside patch 6/6 in my series.) > > With CXL_REGION=n there would be no devdax and no kmem for that range; proc/iomem would show only the windows, something like below > > 850000000-284fffffff : CXL Window 0 > 2850000000-484fffffff : CXL Window 1 > 4850000000-684fffffff : CXL Window 2 > > That means the memory is left unclaimed/unavailable.. (no System RAM, no /dev/dax). Is that acceptable when CXL_REGION is disabled? Regarding option [2] (unconditionally loading CXL early): This approach conflicts with the CXL_REGION Kconfig description mentioned above. --- To refocus on the original issue – the inability to recreate regions after destruction when CXL Windows overlap with Soft Reserved – I believe your patch series "[PATCH 0/6] dax/hmem, cxl: Coordinate Soft Reserved handling with CXL" effectively addresses this problem. As for the pre-existing issues with !CXL_REGION and the unimplemented DAX_CXL_MODE_REGISTER, I suggest deferring them for now. They need not be resolved within this patch set, as we should prioritize the initial problem. Thanks Zhijian
On 7/15/25 11:04 AM, Smita Koralahalli wrote: > Introduce a background worker in cxl_acpi to delay SOFT RESERVE handling > until the cxl_mem driver has probed at least one device. This coordination > ensures that DAX registration or fallback handling for soft-reserved > regions is not triggered prematurely. > > The worker waits on cxl_wait_queue, which is signaled via > cxl_mem_active_inc() during cxl_mem_probe(). Once at least one memory > device probe is confirmed, the worker invokes wait_for_device_probe() > to allow the rest of the CXL device hierarchy to complete initialization. > > Additionally, it also handles initialization order issues where > cxl_acpi_probe() may complete before other drivers such as cxl_port or > cxl_mem have loaded, especially when cxl_acpi and cxl_port are built-in > and cxl_mem is a loadable module. In such cases, using only > wait_for_device_probe() is insufficient, as it may return before all > relevant probes are registered. > > While region creation happens in cxl_port_probe(), waiting on > cxl_mem_active() would be sufficient as cxl_mem_probe() can only succeed > after the port hierarchy is in place. Furthermore, since cxl_mem depends > on cxl_pci, this also guarantees that cxl_pci has loaded by the time the > wait completes. > > As cxl_mem_active() infrastructure already exists for tracking probe > activity, cxl_acpi can use it without introducing new coordination > mechanisms. 
> > Co-developed-by: Nathan Fontenot <Nathan.Fontenot@amd.com> > Signed-off-by: Nathan Fontenot <Nathan.Fontenot@amd.com> > Co-developed-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> > --- > drivers/cxl/acpi.c | 18 ++++++++++++++++++ > drivers/cxl/core/probe_state.c | 5 +++++ > drivers/cxl/cxl.h | 2 ++ > 3 files changed, 25 insertions(+) > > diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c > index ca06d5acdf8f..3a27289e669b 100644 > --- a/drivers/cxl/acpi.c > +++ b/drivers/cxl/acpi.c > @@ -823,6 +823,20 @@ static int pair_cxl_resource(struct device *dev, void *data) > return 0; > } > > +static void cxl_softreserv_mem_work_fn(struct work_struct *work) > +{ > + if (!wait_event_timeout(cxl_wait_queue, cxl_mem_active(), 30 * HZ)) > + pr_debug("Timeout waiting for cxl_mem probing"); > + > + wait_for_device_probe(); > +} > +static DECLARE_WORK(cxl_sr_work, cxl_softreserv_mem_work_fn); > + > +static void cxl_softreserv_mem_update(void) > +{ > + schedule_work(&cxl_sr_work); > +} > + > static int cxl_acpi_probe(struct platform_device *pdev) > { > int rc = 0; > @@ -903,6 +917,9 @@ static int cxl_acpi_probe(struct platform_device *pdev) > cxl_bus_rescan(); > > out: > + /* Update SOFT RESERVE resources that intersect with CXL regions */ > + cxl_softreserv_mem_update(); Can you please squash 1/7 with this patch since both are fairly small? Otherwise it leaves the reviewer wonder what the changes in 1/7 would result in. 
DJ > + > return rc; > } > > @@ -934,6 +951,7 @@ static int __init cxl_acpi_init(void) > > static void __exit cxl_acpi_exit(void) > { > + cancel_work_sync(&cxl_sr_work); > platform_driver_unregister(&cxl_acpi_driver); > cxl_bus_drain(); > } > diff --git a/drivers/cxl/core/probe_state.c b/drivers/cxl/core/probe_state.c > index 5ba4b4de0e33..3089b2698b32 100644 > --- a/drivers/cxl/core/probe_state.c > +++ b/drivers/cxl/core/probe_state.c > @@ -2,9 +2,12 @@ > /* Copyright(c) 2022 Intel Corporation. All rights reserved. */ > #include <linux/atomic.h> > #include <linux/export.h> > +#include <linux/wait.h> > #include "cxlmem.h" > > static atomic_t mem_active; > +DECLARE_WAIT_QUEUE_HEAD(cxl_wait_queue); > +EXPORT_SYMBOL_NS_GPL(cxl_wait_queue, "CXL"); > > bool cxl_mem_active(void) > { > @@ -13,10 +16,12 @@ bool cxl_mem_active(void) > > return false; > } > +EXPORT_SYMBOL_NS_GPL(cxl_mem_active, "CXL"); > > void cxl_mem_active_inc(void) > { > atomic_inc(&mem_active); > + wake_up(&cxl_wait_queue); > } > EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, "CXL"); > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > index 3f1695c96abc..3117136f0208 100644 > --- a/drivers/cxl/cxl.h > +++ b/drivers/cxl/cxl.h > @@ -903,6 +903,8 @@ void cxl_coordinates_combine(struct access_coordinate *out, > > bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); > > +extern wait_queue_head_t cxl_wait_queue; > + > /* > * Unit test builds overrides this to __weak, find the 'strong' version > * of these symbols in tools/testing/cxl/.
© 2016 - 2025 Red Hat, Inc.