[v5] dax/hmem, cxl: Coordinate Soft Reserved handling with CXL and HMEM

[PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Smita Koralahalli 2 weeks, 4 days ago

The current probe time ownership check for Soft Reserved memory based
solely on CXL window intersection is insufficient. dax_hmem probing is not
always guaranteed to run after CXL enumeration and region assembly, which
can lead to incorrect ownership decisions before the CXL stack has
finished publishing windows and assembling committed regions.

Introduce deferred ownership handling for Soft Reserved ranges that
intersect CXL windows at probe time by scheduling deferred work from
dax_hmem and waiting for the CXL stack to complete enumeration and region
assembly before deciding ownership.

Evaluate ownership of Soft Reserved ranges based on CXL region
containment.

   - If all Soft Reserved ranges are fully contained within committed CXL
     regions, DROP handling Soft Reserved ranges from dax_hmem and allow
     dax_cxl to bind.

   - If any Soft Reserved range is not fully claimed by committed CXL
     region, tear down all CXL regions and REGISTER the Soft Reserved
     ranges with dax_hmem instead.

While ownership resolution is pending, gate dax_cxl probing to avoid
binding prematurely.

This enforces a strict ownership. Either CXL fully claims the Soft
Reserved ranges or it relinquishes it entirely.

Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
 drivers/cxl/core/region.c | 25 ++++++++++++
 drivers/cxl/cxl.h         |  2 +
 drivers/dax/cxl.c         |  9 +++++
 drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
 4 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 9827a6dd3187..6c22a2d4abbb 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
 DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
 			 cxl_region_debugfs_poison_clear, "%llx\n");
 
+static int cxl_region_teardown_cb(struct device *dev, void *data)
+{
+	struct cxl_root_decoder *cxlrd;
+	struct cxl_region *cxlr;
+	struct cxl_port *port;
+
+	if (!is_cxl_region(dev))
+		return 0;
+
+	cxlr = to_cxl_region(dev);
+
+	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+	port = cxlrd_to_port(cxlrd);
+
+	devm_release_action(port->uport_dev, unregister_region, cxlr);
+
+	return 0;
+}
+
+void cxl_region_teardown_all(void)
+{
+	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
+}
+EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
+
 static int cxl_region_contains_sr_cb(struct device *dev, void *data)
 {
 	struct resource *res = data;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index b0ff6b65ea0b..1864d35d5f69 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
 struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
 u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
 bool cxl_region_contains_soft_reserve(const struct resource *res);
+void cxl_region_teardown_all(void);
 #else
 static inline bool is_cxl_pmem_region(struct device *dev)
 {
@@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
 {
 	return false;
 }
+static inline void cxl_region_teardown_all(void) { }
 #endif
 
 void cxl_endpoint_parse_cdat(struct cxl_port *port);
diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
index 13cd94d32ff7..b7e90d6dd888 100644
--- a/drivers/dax/cxl.c
+++ b/drivers/dax/cxl.c
@@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
 	struct dax_region *dax_region;
 	struct dev_dax_data data;
 
+	switch (dax_cxl_mode) {
+	case DAX_CXL_MODE_DEFER:
+		return -EPROBE_DEFER;
+	case DAX_CXL_MODE_REGISTER:
+		return -ENODEV;
+	case DAX_CXL_MODE_DROP:
+		break;
+	}
+
 	if (nid == NUMA_NO_NODE)
 		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
 
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 1e3424358490..bcb57d8678d7 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -3,6 +3,7 @@
 #include <linux/memregion.h>
 #include <linux/module.h>
 #include <linux/dax.h>
+#include "../../cxl/cxl.h"
 #include "../bus.h"
 
 static bool region_idle;
@@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
 	platform_device_unregister(pdev);
 }
 
+struct dax_defer_work {
+	struct platform_device *pdev;
+	struct work_struct work;
+};
+
 static int hmem_register_device(struct device *host, int target_nid,
 				const struct resource *res)
 {
+	struct dax_defer_work *work = dev_get_drvdata(host);
 	struct platform_device *pdev;
 	struct memregion_info info;
 	long id;
@@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
 	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
 	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
 			      IORES_DESC_CXL) != REGION_DISJOINT) {
-		dev_dbg(host, "deferring range to CXL: %pr\n", res);
-		return 0;
+		switch (dax_cxl_mode) {
+		case DAX_CXL_MODE_DEFER:
+			dev_dbg(host, "deferring range to CXL: %pr\n", res);
+			schedule_work(&work->work);
+			return 0;
+		case DAX_CXL_MODE_REGISTER:
+			dev_dbg(host, "registering CXL range: %pr\n", res);
+			break;
+		case DAX_CXL_MODE_DROP:
+			dev_dbg(host, "dropping CXL range: %pr\n", res);
+			return 0;
+		}
 	}
 
 	rc = region_intersects_soft_reserve(res->start, resource_size(res));
@@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
 	return rc;
 }
 
+static int cxl_contains_soft_reserve(struct device *host, int target_nid,
+				     const struct resource *res)
+{
+	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
+			      IORES_DESC_CXL) != REGION_DISJOINT) {
+		if (!cxl_region_contains_soft_reserve(res))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void process_defer_work(struct work_struct *_work)
+{
+	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
+	struct platform_device *pdev = work->pdev;
+	int rc;
+
+	/* relies on cxl_acpi and cxl_pci having had a chance to load */
+	wait_for_device_probe();
+
+	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
+
+	if (!rc) {
+		dax_cxl_mode = DAX_CXL_MODE_DROP;
+		rc = bus_rescan_devices(&cxl_bus_type);
+		if (rc)
+			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
+	} else {
+		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
+		cxl_region_teardown_all();
+	}
+
+	walk_hmem_resources(&pdev->dev, hmem_register_device);
+}
+
+static void kill_defer_work(void *_work)
+{
+	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
+
+	cancel_work_sync(&work->work);
+	kfree(work);
+}
+
 static int dax_hmem_platform_probe(struct platform_device *pdev)
 {
+	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
+	int rc;
+
+	if (!work)
+		return -ENOMEM;
+
+	work->pdev = pdev;
+	INIT_WORK(&work->work, process_defer_work);
+
+	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
+	if (rc)
+		return rc;
+
+	platform_set_drvdata(pdev, work);
+
 	return walk_hmem_resources(&pdev->dev, hmem_register_device);
 }
 
@@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
 MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Intel Corporation");
+MODULE_IMPORT_NS("CXL");
-- 
2.17.1

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by dan.j.williams@intel.com 1 week, 5 days ago

Smita Koralahalli wrote:
> The current probe time ownership check for Soft Reserved memory based
> solely on CXL window intersection is insufficient. dax_hmem probing is not
> always guaranteed to run after CXL enumeration and region assembly, which
> can lead to incorrect ownership decisions before the CXL stack has
> finished publishing windows and assembling committed regions.
> 
> Introduce deferred ownership handling for Soft Reserved ranges that
> intersect CXL windows at probe time by scheduling deferred work from
> dax_hmem and waiting for the CXL stack to complete enumeration and region
> assembly before deciding ownership.
> 
> Evaluate ownership of Soft Reserved ranges based on CXL region
> containment.
> 
>    - If all Soft Reserved ranges are fully contained within committed CXL
>      regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>      dax_cxl to bind.
> 
>    - If any Soft Reserved range is not fully claimed by committed CXL
>      region, tear down all CXL regions and REGISTER the Soft Reserved
>      ranges with dax_hmem instead.
> 
> While ownership resolution is pending, gate dax_cxl probing to avoid
> binding prematurely.
> 
> This enforces a strict ownership. Either CXL fully claims the Soft
> Reserved ranges or it relinquishes it entirely.
> 
> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>  drivers/cxl/core/region.c | 25 ++++++++++++
>  drivers/cxl/cxl.h         |  2 +
>  drivers/dax/cxl.c         |  9 +++++
>  drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 115 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 9827a6dd3187..6c22a2d4abbb 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>  DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>  			 cxl_region_debugfs_poison_clear, "%llx\n");
>  
> +static int cxl_region_teardown_cb(struct device *dev, void *data)
> +{
> +	struct cxl_root_decoder *cxlrd;
> +	struct cxl_region *cxlr;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_region(dev))
> +		return 0;
> +
> +	cxlr = to_cxl_region(dev);
> +
> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> +	port = cxlrd_to_port(cxlrd);
> +
> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
> +
> +	return 0;
> +}
> +
> +void cxl_region_teardown_all(void)
> +{
> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
> +}
> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
> +
>  static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>  {
>  	struct resource *res = data;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b0ff6b65ea0b..1864d35d5f69 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>  struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>  u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>  bool cxl_region_contains_soft_reserve(const struct resource *res);
> +void cxl_region_teardown_all(void);
>  #else
>  static inline bool is_cxl_pmem_region(struct device *dev)
>  {
> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>  {
>  	return false;
>  }
> +static inline void cxl_region_teardown_all(void) { }
>  #endif
>  
>  void cxl_endpoint_parse_cdat(struct cxl_port *port);
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..b7e90d6dd888 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	struct dax_region *dax_region;
>  	struct dev_dax_data data;
>  
> +	switch (dax_cxl_mode) {
> +	case DAX_CXL_MODE_DEFER:
> +		return -EPROBE_DEFER;

So, I think this causes a mess because now you have 2 workqueues (driver
core defer-queue and hmem work) competing to disposition this device.
What this seems to want is to only run in the post "soft reserve
dispositioned" world. Something like (untested!)

diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
index 13cd94d32ff7..1162495eb317 100644
--- a/drivers/dax/cxl.c
+++ b/drivers/dax/cxl.c
@@ -14,6 +14,9 @@ static int cxl_dax_region_probe(struct device *dev)
        struct dax_region *dax_region;
        struct dev_dax_data data;
 
+       /* Make sure that dax_cxl_mode is stable, only runs once at boot */
+       flush_hmem_work();
+
        if (nid == NUMA_NO_NODE)
                nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
 
@@ -38,6 +41,7 @@ static struct cxl_driver cxl_dax_region_driver = {
        .id = CXL_DEVICE_DAX_REGION,
        .drv = {
                .suppress_bind_attrs = true,
+               .probe_type = PROBE_PREFER_ASYNCHRONOUS,
        },
 };
 
...where that flush_hmem_work() is something provided by
drivers/dax/bus.c. The asynchronous probe is to make sure that the wait
is always out-of-line of any other synchronous probing.

You could probably drop the work item from being a per hmem_platform
drvdata and just make it a singleton work item in bus.c that hmem.c
queues and cxl.c flushes.

Probably also need to make sure that hmem_init() always runs before
dax_cxl module init with something like this for the built-in case:

diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 5ed5c39857c8..70e996bf1526 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
+obj-y += hmem/
 obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
@@ -10,5 +11,3 @@ dax-y += bus.o
 device_dax-y := device.o
 dax_pmem-y := pmem.o
 dax_cxl-y := cxl.o
-
-obj-y += hmem/

[..]
> +static void process_defer_work(struct work_struct *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +	struct platform_device *pdev = work->pdev;
> +	int rc;
> +
> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> +	wait_for_device_probe();
> +
> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);

Like I said before this probably wants to be named something like
soft_reserve_has_cxl_match() to make it clear what is happening.

> +
> +	if (!rc) {
> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> +		rc = bus_rescan_devices(&cxl_bus_type);
> +		if (rc)
> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> +	} else {
> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
> +		cxl_region_teardown_all();

I was thinking through what Alison asked about what to do later in boot
when other regions are being dynamically created. It made me wonder if
this safety can be achieved more easily by just making sure that the
alloc_dax_region() call fails.

Something like (untested / incomplete, needs cleanup handling!)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index fde29e0ad68b..fd18343e0538 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -10,6 +10,7 @@
 #include "dax-private.h"
 #include "bus.h"
 
+static struct resource dax_regions = DEFINE_RES_MEM_NAMED(0, -1, "DAX Regions");
 static DEFINE_MUTEX(dax_bus_lock);
 
 /*
@@ -661,11 +662,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
        dax_region->dev = parent;
        dax_region->target_node = target_node;
        ida_init(&dax_region->ida);
-       dax_region->res = (struct resource) {
-               .start = range->start,
-               .end = range->end,
-               .flags = IORESOURCE_MEM | flags,
-       };
+       dax_region->res = __request_region(&dax_regions, range->start, range->end, flags);
 
        if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
                kfree(dax_region);

...which will result in enforcing only one of dax_hmem or dax_cxl being
able to register a dax_region.

Yes, this would leave a mess of disabled cxl_dax_region devices lying
around, but it would leave more breadcrumbs for debug, and reduce the
number of races you need to worry about.

In other words, I thought total teardown would be simpler, but as the
feedback keeps coming in, I think that brings a different set of
complexity. So just inject failures for dax_cxl to trip over and then we
can go further later to effect total teardown if that proves to not be
enough.

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Koralahalli Channabasappa, Smita 1 week, 4 days ago

Hi Dan,

On 1/28/2026 3:35 PM, dan.j.williams@intel.com wrote:
> Smita Koralahalli wrote:
>> The current probe time ownership check for Soft Reserved memory based
>> solely on CXL window intersection is insufficient. dax_hmem probing is not
>> always guaranteed to run after CXL enumeration and region assembly, which
>> can lead to incorrect ownership decisions before the CXL stack has
>> finished publishing windows and assembling committed regions.
>>
>> Introduce deferred ownership handling for Soft Reserved ranges that
>> intersect CXL windows at probe time by scheduling deferred work from
>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>> assembly before deciding ownership.
>>
>> Evaluate ownership of Soft Reserved ranges based on CXL region
>> containment.
>>
>>     - If all Soft Reserved ranges are fully contained within committed CXL
>>       regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>       dax_cxl to bind.
>>
>>     - If any Soft Reserved range is not fully claimed by committed CXL
>>       region, tear down all CXL regions and REGISTER the Soft Reserved
>>       ranges with dax_hmem instead.
>>
>> While ownership resolution is pending, gate dax_cxl probing to avoid
>> binding prematurely.
>>
>> This enforces a strict ownership. Either CXL fully claims the Soft
>> Reserved ranges or it relinquishes it entirely.
>>
>> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>>   drivers/cxl/core/region.c | 25 ++++++++++++
>>   drivers/cxl/cxl.h         |  2 +
>>   drivers/dax/cxl.c         |  9 +++++
>>   drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>>   4 files changed, 115 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>> index 9827a6dd3187..6c22a2d4abbb 100644
>> --- a/drivers/cxl/core/region.c
>> +++ b/drivers/cxl/core/region.c
>> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>>   DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>>   			 cxl_region_debugfs_poison_clear, "%llx\n");
>>   
>> +static int cxl_region_teardown_cb(struct device *dev, void *data)
>> +{
>> +	struct cxl_root_decoder *cxlrd;
>> +	struct cxl_region *cxlr;
>> +	struct cxl_port *port;
>> +
>> +	if (!is_cxl_region(dev))
>> +		return 0;
>> +
>> +	cxlr = to_cxl_region(dev);
>> +
>> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
>> +	port = cxlrd_to_port(cxlrd);
>> +
>> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
>> +
>> +	return 0;
>> +}
>> +
>> +void cxl_region_teardown_all(void)
>> +{
>> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
>> +}
>> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
>> +
>>   static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>>   {
>>   	struct resource *res = data;
>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index b0ff6b65ea0b..1864d35d5f69 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>>   struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>>   u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>>   bool cxl_region_contains_soft_reserve(const struct resource *res);
>> +void cxl_region_teardown_all(void);
>>   #else
>>   static inline bool is_cxl_pmem_region(struct device *dev)
>>   {
>> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>>   {
>>   	return false;
>>   }
>> +static inline void cxl_region_teardown_all(void) { }
>>   #endif
>>   
>>   void cxl_endpoint_parse_cdat(struct cxl_port *port);
>> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
>> index 13cd94d32ff7..b7e90d6dd888 100644
>> --- a/drivers/dax/cxl.c
>> +++ b/drivers/dax/cxl.c
>> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>>   	struct dax_region *dax_region;
>>   	struct dev_dax_data data;
>>   
>> +	switch (dax_cxl_mode) {
>> +	case DAX_CXL_MODE_DEFER:
>> +		return -EPROBE_DEFER;
> 
> So, I think this causes a mess because now you have 2 workqueues (driver
> core defer-queue and hmem work) competing to disposition this device.
> What this seems to want is to only run in the post "soft reserve
> dispositioned" world. Something like (untested!)
> 
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..1162495eb317 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,9 @@ static int cxl_dax_region_probe(struct device *dev)
>          struct dax_region *dax_region;
>          struct dev_dax_data data;
>   
> +       /* Make sure that dax_cxl_mode is stable, only runs once at boot */
> +       flush_hmem_work();
> +
>          if (nid == NUMA_NO_NODE)
>                  nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>   
> @@ -38,6 +41,7 @@ static struct cxl_driver cxl_dax_region_driver = {
>          .id = CXL_DEVICE_DAX_REGION,
>          .drv = {
>                  .suppress_bind_attrs = true,
> +               .probe_type = PROBE_PREFER_ASYNCHRONOUS,
>          },
>   };
>   
> ...where that flush_hmem_work() is something provided by
> drivers/dax/bus.c. The asynchronous probe is to make sure that the wait
> is always out-of-line of any other synchronous probing.
> 
> You could probably drop the work item from being a per hmem_platform
> drvdata and just make it a singleton work item in bus.c that hmem.c
> queues and cxl.c flushes.
> 
> Probably also need to make sure that hmem_init() always runs before
> dax_cxl module init with something like this for the built-in case:
> 
> diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
> index 5ed5c39857c8..70e996bf1526 100644
> --- a/drivers/dax/Makefile
> +++ b/drivers/dax/Makefile
> @@ -1,4 +1,5 @@
>   # SPDX-License-Identifier: GPL-2.0
> +obj-y += hmem/
>   obj-$(CONFIG_DAX) += dax.o
>   obj-$(CONFIG_DEV_DAX) += device_dax.o
>   obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
> @@ -10,5 +11,3 @@ dax-y += bus.o
>   device_dax-y := device.o
>   dax_pmem-y := pmem.o
>   dax_cxl-y := cxl.o
> -
> -obj-y += hmem/
> 
> [..]
>> +static void process_defer_work(struct work_struct *_work)
>> +{
>> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
>> +	struct platform_device *pdev = work->pdev;
>> +	int rc;
>> +
>> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
>> +	wait_for_device_probe();
>> +
>> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> 
> Like I said before this probably wants to be named something like
> soft_reserve_has_cxl_match() to make it clear what is happening.
> 
>> +
>> +	if (!rc) {
>> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
>> +		rc = bus_rescan_devices(&cxl_bus_type);
>> +		if (rc)
>> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
>> +	} else {
>> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
>> +		cxl_region_teardown_all();
> 
> I was thinking through what Alison asked about what to do later in boot
> when other regions are being dynamically created. It made me wonder if
> this safety can be achieved more easily by just making sure that the
> alloc_dax_region() call fails.

Agreed with all the points above, including making alloc_dax_region() 
fail as the safety mechanism. This also cleanly avoids the no Soft 
Reserved case Alison pointed out, where dax_cxl_mode can remain stuck in 
DEFER and return -EPROBE_DEFER.

What I’m still trying to understand is the case of “other regions being 
dynamically created.” Once HMEM has claimed the relevant HPA range, any 
later userspace attempts to create regions (via cxl create-region) 
should naturally fail due to the existing HPA allocation. This already 
shows up as an HPA allocation failure currently.

#cxl create-region -d decoder0.0 -m mem2 -w 1 -g256
cxl region: create_region: region0: set_size failed: Numerical result 
out of range
cxl region: cmd_create_region: created 0 regions

And in the dmesg:
[  466.819353] alloc_hpa: cxl region0: HPA allocation error (-34) for 
size:0x0000002000000000 in CXL Window 0 [mem 0x850000000-0x284fffffff 
flags 0x200]

Also, at this point, with the probe-ordering fixes and the use of 
wait_for_device_probe(), region probing should have fully completed.

Am I missing any other scenario where regions could still be created 
dynamically beyond this?

> 
> Something like (untested / incomplete, needs cleanup handling!)
> 
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index fde29e0ad68b..fd18343e0538 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -10,6 +10,7 @@
>   #include "dax-private.h"
>   #include "bus.h"
>   
> +static struct resource dax_regions = DEFINE_RES_MEM_NAMED(0, -1, "DAX Regions");
>   static DEFINE_MUTEX(dax_bus_lock);
>   
>   /*
> @@ -661,11 +662,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
>          dax_region->dev = parent;
>          dax_region->target_node = target_node;
>          ida_init(&dax_region->ida);
> -       dax_region->res = (struct resource) {
> -               .start = range->start,
> -               .end = range->end,
> -               .flags = IORESOURCE_MEM | flags,
> -       };
> +       dax_region->res = __request_region(&dax_regions, range->start, range->end, flags);
>   
>          if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
>                  kfree(dax_region);
> 
> ...which will result in enforcing only one of dax_hmem or dax_cxl being
> able to register a dax_region.
> 
> Yes, this would leave a mess of disabled cxl_dax_region devices lying
> around, but it would leave more breadcrumbs for debug, and reduce the
> number of races you need to worry about.
> 
> In other words, I thought total teardown would be simpler, but as the
> feedback keeps coming in, I think that brings a different set of
> complexity. So just inject failures for dax_cxl to trip over and then we
> can go further later to effect total teardown if that proves to not be
> enough.

One concern with the approach of not tearing down CXL regions is the 
state it leaves behind in /proc/iomem. Soft Reserved ranges are 
REGISTERed to HMEM while CXL regions remain present. The resulting 
nesting (dax under region, region under window and window under SR) 
visually suggests a coherent CXL hierarchy, even though ownership has 
effectively moved to HMEM. When users, then attempt to tear regions down 
and recreate them from userspace, they hit the same HPA allocation 
failures described above.

If we decide not to tear down regions in the REGISTER case, should we 
gate decoder resets during user initiated region teardown? Today, 
decoders are reset when regions are torn down dynamically, and 
subsequent attempts to recreate regions can trigger a large amount of 
mailbox traffic. Much of what shows up as repeated “Reading event logs/ 
Clearing …” messages which ends up interleaved with the HPA allocation 
failure, which can be confusing.

Thanks
Smita

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by dan.j.williams@intel.com 1 week, 4 days ago

Koralahalli Channabasappa, Smita wrote:
[..]
> > I was thinking through what Alison asked about what to do later in boot
> > when other regions are being dynamically created. It made me wonder if
> > this safety can be achieved more easily by just making sure that the
> > alloc_dax_region() call fails.
> 
> Agreed with all the points above, including making alloc_dax_region() 
> fail as the safety mechanism. This also cleanly avoids the no Soft 
> Reserved case Alison pointed out, where dax_cxl_mode can remain stuck in 
> DEFER and return -EPROBE_DEFER.
> 
> What I’m still trying to understand is the case of “other regions being 
> dynamically created.” Once HMEM has claimed the relevant HPA range, any 
> later userspace attempts to create regions (via cxl create-region) 
> should naturally fail due to the existing HPA allocation. This already 
> shows up as an HPA allocation failure currently.
> 
> #cxl create-region -d decoder0.0 -m mem2 -w 1 -g256
> cxl region: create_region: region0: set_size failed: Numerical result 
> out of range
> cxl region: cmd_create_region: created 0 regions
> 
> And in the dmesg:
> [  466.819353] alloc_hpa: cxl region0: HPA allocation error (-34) for 
> size:0x0000002000000000 in CXL Window 0 [mem 0x850000000-0x284fffffff 
> flags 0x200]
> 
> Also, at this point, with the probe-ordering fixes and the use of 
> wait_for_device_probe(), region probing should have fully completed.
> 
> Am I missing any other scenario where regions could still be created 
> dynamically beyond this?

The concern is what to do about regions and memory devices that are
completely innocent. So, for example imagine deviceA is handled by BIOS
and deviceB is ignored by BIOS. If deviceB was ignored by BIOS then it
would be rude to tear down any regions that might be established for
deviceB. So if alloc_dax_region() exclusion and HPA space reservation
prevent future collisions while not disturbing innocent devices, then I
think userspace can pick up the pieces from there.

> > Something like (untested / incomplete, needs cleanup handling!)
> > 
> > diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> > index fde29e0ad68b..fd18343e0538 100644
> > --- a/drivers/dax/bus.c
> > +++ b/drivers/dax/bus.c
> > @@ -10,6 +10,7 @@
> >   #include "dax-private.h"
> >   #include "bus.h"
> >   
> > +static struct resource dax_regions = DEFINE_RES_MEM_NAMED(0, -1, "DAX Regions");
> >   static DEFINE_MUTEX(dax_bus_lock);
> >   
> >   /*
> > @@ -661,11 +662,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
> >          dax_region->dev = parent;
> >          dax_region->target_node = target_node;
> >          ida_init(&dax_region->ida);
> > -       dax_region->res = (struct resource) {
> > -               .start = range->start,
> > -               .end = range->end,
> > -               .flags = IORESOURCE_MEM | flags,
> > -       };
> > +       dax_region->res = __request_region(&dax_regions, range->start, range->end, flags);
> >   
> >          if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
> >                  kfree(dax_region);
> > 
> > ...which will result in enforcing only one of dax_hmem or dax_cxl being
> > able to register a dax_region.
> > 
> > Yes, this would leave a mess of disabled cxl_dax_region devices lying
> > around, but it would leave more breadcrumbs for debug, and reduce the
> > number of races you need to worry about.
> > 
> > In other words, I thought total teardown would be simpler, but as the
> > feedback keeps coming in, I think that brings a different set of
> > complexity. So just inject failures for dax_cxl to trip over and then we
> > can go further later to effect total teardown if that proves to not be
> > enough.
> 
> One concern with the approach of not tearing down CXL regions is the 
> state it leaves behind in /proc/iomem. Soft Reserved ranges are 
> REGISTERed to HMEM while CXL regions remain present. The resulting 
> nesting (dax under region, region under window and window under SR) 
> visually suggests a coherent CXL hierarchy, even though ownership has 
> effectively moved to HMEM. When users, then attempt to tear regions down 
> and recreate them from userspace, they hit the same HPA allocation 
> failures described above.

So this gets back to a question of do we really need "Soft Reserved" to
show up in /proc/iomem? It is an ABI change to stop publishing it
altogether, so at a minimum we need to be prepared to keep publishing it
if it causes someone's working setup to regress.

The current state of the for-7.0/cxl-init branch drops publishing "Soft
Reserved". I am cautiously optimistic no one notices as long as DAX
devices keep appearing, but at the first sign of regression we need a
plan B.

> If we decide not to tear down regions in the REGISTER case, should we 
> gate decoder resets during user initiated region teardown? Today, 
> decoders are reset when regions are torn down dynamically, and 
> subsequent attempts to recreate regions can trigger a large amount of 
> mailbox traffic. Much of what shows up as repeated “Reading event logs/ 
> Clearing …” messages which ends up interleaved with the HPA allocation 
> failure, which can be confusing.

One of the nice side effects of installing the "Soft Reserved" entries
late, when HMEM takes over, is that they are easier to remove.

So the flow would be, if you know what you are doing, is to disable the
HMEM device which uninstalls the "Soft Reserved" entries, before trying
to decommit the region and reclaim the HPA space.

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Tomasz Wolski 5 days, 2 hours ago

>> > I was thinking through what Alison asked about what to do later in boot
>> > when other regions are being dynamically created. It made me wonder if
>> > this safety can be achieved more easily by just making sure that the
>> > alloc_dax_region() call fails.
>> 
>> Agreed with all the points above, including making alloc_dax_region() 
>> fail as the safety mechanism. This also cleanly avoids the no Soft 
>> Reserved case Alison pointed out, where dax_cxl_mode can remain stuck in 
>> DEFER and return -EPROBE_DEFER.
>> 
>> What I’m still trying to understand is the case of “other regions being 
>> dynamically created.” Once HMEM has claimed the relevant HPA range, any 
>> later userspace attempts to create regions (via cxl create-region) 
>> should naturally fail due to the existing HPA allocation. This already 
>> shows up as an HPA allocation failure currently.
>> 
>> #cxl create-region -d decoder0.0 -m mem2 -w 1 -g256
>> cxl region: create_region: region0: set_size failed: Numerical result 
>> out of range
>> cxl region: cmd_create_region: created 0 regions
>> 
>> And in the dmesg:
>> [  466.819353] alloc_hpa: cxl region0: HPA allocation error (-34) for 
>> size:0x0000002000000000 in CXL Window 0 [mem 0x850000000-0x284fffffff 
>> flags 0x200]
>> 
>> Also, at this point, with the probe-ordering fixes and the use of 
>> wait_for_device_probe(), region probing should have fully completed.
>> 
>> Am I missing any other scenario where regions could still be created 
>> dynamically beyond this?
>
>The concern is what to do about regions and memory devices that are
>completely innocent. So, for example imagine deviceA is handled by BIOS
>and deviceB is ignored by BIOS. If deviceB was ignored by BIOS then it
>would be rude to tear down any regions that might be established for
>deviceB. So if alloc_dax_region() exclusion and HPA space reservation
>prevent future collisions while not disturbing innocent devices, then I
>think userspace can pick up the pieces from there.

I'm trying to follow the idea of "deviceB being ignored by BIOS" 
Do you consider hot-plug devices and user creating reqions manually? 
Could you please describe such scenario?

>> > Something like (untested / incomplete, needs cleanup handling!)
>> > 
>> > diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
>> > index fde29e0ad68b..fd18343e0538 100644
>> > --- a/drivers/dax/bus.c
>> > +++ b/drivers/dax/bus.c
>> > @@ -10,6 +10,7 @@
>> >   #include "dax-private.h"
>> >   #include "bus.h"
>> >   
>> > +static struct resource dax_regions = DEFINE_RES_MEM_NAMED(0, -1, "DAX Regions");
>> >   static DEFINE_MUTEX(dax_bus_lock);
>> >   
>> >   /*
>> > @@ -661,11 +662,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
>> >          dax_region->dev = parent;
>> >          dax_region->target_node = target_node;
>> >          ida_init(&dax_region->ida);
>> > -       dax_region->res = (struct resource) {
>> > -               .start = range->start,
>> > -               .end = range->end,
>> > -               .flags = IORESOURCE_MEM | flags,
>> > -       };
>> > +       dax_region->res = __request_region(&dax_regions, range->start, range->end, flags);
>> >   
>> >          if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
>> >                  kfree(dax_region);
>> > 
>> > ...which will result in enforcing only one of dax_hmem or dax_cxl being
>> > able to register a dax_region.
>> > 
>> > Yes, this would leave a mess of disabled cxl_dax_region devices lying
>> > around, but it would leave more breadcrumbs for debug, and reduce the
>> > number of races you need to worry about.
>> > 
>> > In other words, I thought total teardown would be simpler, but as the
>> > feedback keeps coming in, I think that brings a different set of
>> > complexity. So just inject failures for dax_cxl to trip over and then we
>> > can go further later to effect total teardown if that proves to not be
>> > enough.
>> 
>> One concern with the approach of not tearing down CXL regions is the 
>> state it leaves behind in /proc/iomem. Soft Reserved ranges are 
>> REGISTERed to HMEM while CXL regions remain present. The resulting 
>> nesting (dax under region, region under window and window under SR) 
>> visually suggests a coherent CXL hierarchy, even though ownership has 
>> effectively moved to HMEM. When users, then attempt to tear regions down 
>> and recreate them from userspace, they hit the same HPA allocation 
>> failures described above.
>
>So this gets back to a question of do we really need "Soft Reserved" to
>show up in /proc/iomem? It is an ABI change to stop publishing it
>altogether, so at a minimum we need to be prepared to keep publishing it
>if it causes someone's working setup to regress.
>
>The current state of the for-7.0/cxl-init branch drops publishing "Soft
>Reserved". I am cautiously optimistic no one notices as long as DAX
>devices keep appearing, but at the first sign of regression we need a
>plan B.
>
>> If we decide not to tear down regions in the REGISTER case, should we 
>> gate decoder resets during user initiated region teardown? Today, 
>> decoders are reset when regions are torn down dynamically, and 
>> subsequent attempts to recreate regions can trigger a large amount of 
>> mailbox traffic. Much of what shows up as repeated “Reading event logs/ 
>> Clearing …” messages which ends up interleaved with the HPA allocation 
>> failure, which can be confusing.
>
>One of the nice side effects of installing the "Soft Reserved" entries
>late, when HMEM takes over, is that they are easier to remove.
>
>So the flow would be, if you know what you are doing, is to disable the
>HMEM device which uninstalls the "Soft Reserved" entries, before trying
>to decommit the region and reclaim the HPA space.
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by dan.j.williams@intel.com 1 week, 4 days ago

dan.j.williams@ wrote:
> Smita Koralahalli wrote:
> > The current probe time ownership check for Soft Reserved memory based
> > solely on CXL window intersection is insufficient. dax_hmem probing is not
> > always guaranteed to run after CXL enumeration and region assembly, which
> > can lead to incorrect ownership decisions before the CXL stack has
> > finished publishing windows and assembling committed regions.
> > 
> > Introduce deferred ownership handling for Soft Reserved ranges that
> > intersect CXL windows at probe time by scheduling deferred work from
> > dax_hmem and waiting for the CXL stack to complete enumeration and region
> > assembly before deciding ownership.
> > 
> > Evaluate ownership of Soft Reserved ranges based on CXL region
> > containment.
> > 
> >    - If all Soft Reserved ranges are fully contained within committed CXL
> >      regions, DROP handling Soft Reserved ranges from dax_hmem and allow
> >      dax_cxl to bind.
> > 
> >    - If any Soft Reserved range is not fully claimed by committed CXL
> >      region, tear down all CXL regions and REGISTER the Soft Reserved
> >      ranges with dax_hmem instead.
> > 
> > While ownership resolution is pending, gate dax_cxl probing to avoid
> > binding prematurely.
> > 
> > This enforces a strict ownership. Either CXL fully claims the Soft
> > Reserved ranges or it relinquishes it entirely.
> > 
> > Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> > ---
> >  drivers/cxl/core/region.c | 25 ++++++++++++
> >  drivers/cxl/cxl.h         |  2 +
> >  drivers/dax/cxl.c         |  9 +++++
> >  drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
> >  4 files changed, 115 insertions(+), 2 deletions(-)
> > 
[..]
> > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > index 13cd94d32ff7..b7e90d6dd888 100644
> > --- a/drivers/dax/cxl.c
> > +++ b/drivers/dax/cxl.c
> > @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
> >  	struct dax_region *dax_region;
> >  	struct dev_dax_data data;
> >  
> > +	switch (dax_cxl_mode) {
> > +	case DAX_CXL_MODE_DEFER:
> > +		return -EPROBE_DEFER;
> 
> So, I think this causes a mess because now you have 2 workqueues (driver
> core defer-queue and hmem work) competing to disposition this device.
> What this seems to want is to only run in the post "soft reserve
> dispositioned" world. Something like (untested!)
> 
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..1162495eb317 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,9 @@ static int cxl_dax_region_probe(struct device *dev)
>         struct dax_region *dax_region;
>         struct dev_dax_data data;
>  
> +       /* Make sure that dax_cxl_mode is stable, only runs once at boot */
> +       flush_hmem_work();
> +

It occurs to me that this likely insta-hangs because
wait_for_device_probe() waits forever for itself to flush. So it may
need to be a scheme where the cxl_dax_region_driver registration does
something like this (untested!):

diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
index 13cd94d32ff7..6a1a38b4f64b 100644
--- a/drivers/dax/cxl.c
+++ b/drivers/dax/cxl.c
@@ -41,7 +41,32 @@ static struct cxl_driver cxl_dax_region_driver = {
        },
 };
 
-module_cxl_driver(cxl_dax_region_driver);
+static void cxl_dax_region_driver_register(struct work_struct *work)
+{
+       flush_hmem_work();
+       cxl_driver_register(&cxl_dax_region_driver);
+}
+
+static DECLARE_WORK(cxl_dax_region_driver_work, cxl_dax_region_driver_register);
+
+static int __init cxl_dax_region_init(void)
+{
+       /*
+        * Need to resolve a race with dax_hmem wanting to drive regions 
+        * instead of CXL
+        */
+       queue_work(system_long_wq, &cxl_dax_region_driver_work);
+       return 0;
+}
+module_init(cxl_dax_region_init);
+
+static void __exit cxl_dax_region_exit(void)
+{
+       flush_work(&cxl_dax_region_driver_work);
+       cxl_driver_unregister(&cxl_dax_region_driver);
+}
+module_exit(cxl_dax_region_exit);
+
 MODULE_ALIAS_CXL(CXL_DEVICE_DAX_REGION);
 MODULE_DESCRIPTION("CXL DAX: direct access to CXL regions");
 MODULE_LICENSE("GPL");

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alison Schofield 1 week, 6 days ago

On Thu, Jan 22, 2026 at 04:55:42AM +0000, Smita Koralahalli wrote:
> The current probe time ownership check for Soft Reserved memory based
> solely on CXL window intersection is insufficient. dax_hmem probing is not
> always guaranteed to run after CXL enumeration and region assembly, which
> can lead to incorrect ownership decisions before the CXL stack has
> finished publishing windows and assembling committed regions.
> 
> Introduce deferred ownership handling for Soft Reserved ranges that
> intersect CXL windows at probe time by scheduling deferred work from
> dax_hmem and waiting for the CXL stack to complete enumeration and region
> assembly before deciding ownership.
> 
> Evaluate ownership of Soft Reserved ranges based on CXL region
> containment.
> 
>    - If all Soft Reserved ranges are fully contained within committed CXL
>      regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>      dax_cxl to bind.
> 
>    - If any Soft Reserved range is not fully claimed by committed CXL
>      region, tear down all CXL regions and REGISTER the Soft Reserved
>      ranges with dax_hmem instead.
> 
> While ownership resolution is pending, gate dax_cxl probing to avoid
> binding prematurely.
> 
> This enforces a strict ownership. Either CXL fully claims the Soft
> Reserved ranges or it relinquishes it entirely.
> 
> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>  drivers/cxl/core/region.c | 25 ++++++++++++
>  drivers/cxl/cxl.h         |  2 +

Can the region teardown helper be introduced in a separate patch before this
patch...like you did for the contains soft reserved helper?

>  drivers/dax/cxl.c         |  9 +++++
>  drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 115 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 9827a6dd3187..6c22a2d4abbb 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>  DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>  			 cxl_region_debugfs_poison_clear, "%llx\n");
>  
> +static int cxl_region_teardown_cb(struct device *dev, void *data)
> +{
> +	struct cxl_root_decoder *cxlrd;
> +	struct cxl_region *cxlr;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_region(dev))
> +		return 0;
> +
> +	cxlr = to_cxl_region(dev);
> +
> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> +	port = cxlrd_to_port(cxlrd);
> +

How about a dev_dbg() here on each killed region, and a dev_info()
at the call site proclaiming what is happening.


> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
> +
> +	return 0;
> +}
> +
> +void cxl_region_teardown_all(void)
> +{
> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
> +}
> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);

Maybe be cautious with who can access this function:
EXPORT_SYMBOL_FOR_MODULES(cxl_region_teardown_all, "dax_hmem");


> +
>  static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>  {
>  	struct resource *res = data;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b0ff6b65ea0b..1864d35d5f69 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>  struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>  u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>  bool cxl_region_contains_soft_reserve(const struct resource *res);
> +void cxl_region_teardown_all(void);
>  #else
>  static inline bool is_cxl_pmem_region(struct device *dev)
>  {
> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>  {
>  	return false;
>  }
> +static inline void cxl_region_teardown_all(void) { }
>  #endif
>  
>  void cxl_endpoint_parse_cdat(struct cxl_port *port);
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..b7e90d6dd888 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	struct dax_region *dax_region;
>  	struct dev_dax_data data;
>  
> +	switch (dax_cxl_mode) {
> +	case DAX_CXL_MODE_DEFER:
> +		return -EPROBE_DEFER;
> +	case DAX_CXL_MODE_REGISTER:
> +		return -ENODEV;
> +	case DAX_CXL_MODE_DROP:
> +		break;
> +	}
> +
>  	if (nid == NUMA_NO_NODE)
>  		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>  
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 1e3424358490..bcb57d8678d7 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -3,6 +3,7 @@
>  #include <linux/memregion.h>
>  #include <linux/module.h>
>  #include <linux/dax.h>
> +#include "../../cxl/cxl.h"
>  #include "../bus.h"
>  
>  static bool region_idle;
> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>  	platform_device_unregister(pdev);
>  }
>  
> +struct dax_defer_work {
> +	struct platform_device *pdev;
> +	struct work_struct work;
> +};
> +
>  static int hmem_register_device(struct device *host, int target_nid,
>  				const struct resource *res)
>  {
> +	struct dax_defer_work *work = dev_get_drvdata(host);
>  	struct platform_device *pdev;
>  	struct memregion_info info;
>  	long id;
> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>  	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>  			      IORES_DESC_CXL) != REGION_DISJOINT) {
> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
> -		return 0;
> +		switch (dax_cxl_mode) {
> +		case DAX_CXL_MODE_DEFER:
> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
> +			schedule_work(&work->work);
> +			return 0;
> +		case DAX_CXL_MODE_REGISTER:
> +			dev_dbg(host, "registering CXL range: %pr\n", res);
> +			break;
> +		case DAX_CXL_MODE_DROP:
> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> +			return 0;
> +		}
>  	}
>  
>  	rc = region_intersects_soft_reserve(res->start, resource_size(res));
> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	return rc;
>  }
>  
> +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
> +				     const struct resource *res)
> +{
> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
> +		if (!cxl_region_contains_soft_reserve(res))
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_defer_work(struct work_struct *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +	struct platform_device *pdev = work->pdev;
> +	int rc;
> +
> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> +	wait_for_device_probe();
> +
> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> +
> +	if (!rc) {
> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> +		rc = bus_rescan_devices(&cxl_bus_type);
> +		if (rc)
> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> +	} else {
> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;

dev_info or dev_warn that we are doing the teardown.


> +		cxl_region_teardown_all();
> +	}
> +
> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
> +}
> +
> +static void kill_defer_work(void *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +
> +	cancel_work_sync(&work->work);
> +	kfree(work);
> +}
> +
>  static int dax_hmem_platform_probe(struct platform_device *pdev)
>  {
> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> +	int rc;
> +
> +	if (!work)
> +		return -ENOMEM;
> +
> +	work->pdev = pdev;
> +	INIT_WORK(&work->work, process_defer_work);
> +
> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> +	if (rc)
> +		return rc;
> +
> +	platform_set_drvdata(pdev, work);
> +
>  	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>  }
>  
> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>  MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>  MODULE_LICENSE("GPL v2");
>  MODULE_AUTHOR("Intel Corporation");
> +MODULE_IMPORT_NS("CXL");
> -- 
> 2.17.1
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alison Schofield 2 weeks ago

On Thu, Jan 22, 2026 at 04:55:42AM +0000, Smita Koralahalli wrote:
> The current probe time ownership check for Soft Reserved memory based
> solely on CXL window intersection is insufficient. dax_hmem probing is not
> always guaranteed to run after CXL enumeration and region assembly, which
> can lead to incorrect ownership decisions before the CXL stack has
> finished publishing windows and assembling committed regions.
> 
> Introduce deferred ownership handling for Soft Reserved ranges that
> intersect CXL windows at probe time by scheduling deferred work from
> dax_hmem and waiting for the CXL stack to complete enumeration and region
> assembly before deciding ownership.
> 
> Evaluate ownership of Soft Reserved ranges based on CXL region
> containment.
> 
>    - If all Soft Reserved ranges are fully contained within committed CXL
>      regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>      dax_cxl to bind.
> 
>    - If any Soft Reserved range is not fully claimed by committed CXL
>      region, tear down all CXL regions and REGISTER the Soft Reserved
>      ranges with dax_hmem instead.

Question about the teardown below..


> 
> While ownership resolution is pending, gate dax_cxl probing to avoid
> binding prematurely.
> 
> This enforces a strict ownership. Either CXL fully claims the Soft
> Reserved ranges or it relinquishes it entirely.
> 
> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>  drivers/cxl/core/region.c | 25 ++++++++++++
>  drivers/cxl/cxl.h         |  2 +
>  drivers/dax/cxl.c         |  9 +++++
>  drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 115 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 9827a6dd3187..6c22a2d4abbb 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>  DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>  			 cxl_region_debugfs_poison_clear, "%llx\n");
>  
> +static int cxl_region_teardown_cb(struct device *dev, void *data)
> +{
> +	struct cxl_root_decoder *cxlrd;
> +	struct cxl_region *cxlr;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_region(dev))
> +		return 0;
> +
> +	cxlr = to_cxl_region(dev);
> +
> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> +	port = cxlrd_to_port(cxlrd);
> +
> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
> +
> +	return 0;
> +}
> +
> +void cxl_region_teardown_all(void)
> +{
> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
> +}
> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
> +
>  static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>  {
>  	struct resource *res = data;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b0ff6b65ea0b..1864d35d5f69 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>  struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>  u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>  bool cxl_region_contains_soft_reserve(const struct resource *res);
> +void cxl_region_teardown_all(void);
>  #else
>  static inline bool is_cxl_pmem_region(struct device *dev)
>  {
> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>  {
>  	return false;
>  }
> +static inline void cxl_region_teardown_all(void) { }
>  #endif
>  
>  void cxl_endpoint_parse_cdat(struct cxl_port *port);
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..b7e90d6dd888 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	struct dax_region *dax_region;
>  	struct dev_dax_data data;
>  
> +	switch (dax_cxl_mode) {
> +	case DAX_CXL_MODE_DEFER:
> +		return -EPROBE_DEFER;
> +	case DAX_CXL_MODE_REGISTER:
> +		return -ENODEV;
> +	case DAX_CXL_MODE_DROP:
> +		break;
> +	}
> +
>  	if (nid == NUMA_NO_NODE)
>  		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>  
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 1e3424358490..bcb57d8678d7 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -3,6 +3,7 @@
>  #include <linux/memregion.h>
>  #include <linux/module.h>
>  #include <linux/dax.h>
> +#include "../../cxl/cxl.h"
>  #include "../bus.h"
>  
>  static bool region_idle;
> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>  	platform_device_unregister(pdev);
>  }
>  
> +struct dax_defer_work {
> +	struct platform_device *pdev;
> +	struct work_struct work;
> +};
> +
>  static int hmem_register_device(struct device *host, int target_nid,
>  				const struct resource *res)
>  {
> +	struct dax_defer_work *work = dev_get_drvdata(host);
>  	struct platform_device *pdev;
>  	struct memregion_info info;
>  	long id;
> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>  	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>  			      IORES_DESC_CXL) != REGION_DISJOINT) {
> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
> -		return 0;
> +		switch (dax_cxl_mode) {
> +		case DAX_CXL_MODE_DEFER:
> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
> +			schedule_work(&work->work);
> +			return 0;
> +		case DAX_CXL_MODE_REGISTER:
> +			dev_dbg(host, "registering CXL range: %pr\n", res);
> +			break;
> +		case DAX_CXL_MODE_DROP:
> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> +			return 0;
> +		}
>  	}
>  
>  	rc = region_intersects_soft_reserve(res->start, resource_size(res));
> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	return rc;
>  }
>  
> +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
> +				     const struct resource *res)
> +{
> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
> +		if (!cxl_region_contains_soft_reserve(res))
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_defer_work(struct work_struct *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +	struct platform_device *pdev = work->pdev;
> +	int rc;
> +
> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> +	wait_for_device_probe();
> +
> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> +
> +	if (!rc) {
> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> +		rc = bus_rescan_devices(&cxl_bus_type);
> +		if (rc)
> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> +	} else {
> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
> +		cxl_region_teardown_all();

The region teardown appears as a one-shot sweep of existing regions
without considering regions not yet assembled. After this point will
a newly arriving region, be racing with HMEM again to create a DAX
region?


> +	}
> +
> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
> +}
> +
> +static void kill_defer_work(void *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +
> +	cancel_work_sync(&work->work);
> +	kfree(work);
> +}
> +
>  static int dax_hmem_platform_probe(struct platform_device *pdev)
>  {
> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> +	int rc;
> +
> +	if (!work)
> +		return -ENOMEM;
> +
> +	work->pdev = pdev;
> +	INIT_WORK(&work->work, process_defer_work);
> +
> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> +	if (rc)
> +		return rc;
> +
> +	platform_set_drvdata(pdev, work);
> +
>  	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>  }
>  
> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>  MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>  MODULE_LICENSE("GPL v2");
>  MODULE_AUTHOR("Intel Corporation");
> +MODULE_IMPORT_NS("CXL");
> -- 
> 2.17.1
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Koralahalli Channabasappa, Smita 1 week, 5 days ago

On 1/26/2026 5:38 PM, Alison Schofield wrote:

[snip]
..

>> +static void process_defer_work(struct work_struct *_work)
>> +{
>> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
>> +	struct platform_device *pdev = work->pdev;
>> +	int rc;
>> +
>> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
>> +	wait_for_device_probe();
>> +
>> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
>> +
>> +	if (!rc) {
>> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
>> +		rc = bus_rescan_devices(&cxl_bus_type);
>> +		if (rc)
>> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
>> +	} else {
>> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
>> +		cxl_region_teardown_all();
> 
> The region teardown appears as a one-shot sweep of existing regions
> without considering regions not yet assembled. After this point will
> a newly arriving region, be racing with HMEM again to create a DAX
> region?

My understanding is that with the probe ordering patches and 
wait_for_device_probe(), CXL region discovery and assembly should have 
completed before this point.

Thanks
Smita
> 
> 
>> +	}
>> +
>> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
>> +}
>> +
>> +static void kill_defer_work(void *_work)
>> +{
>> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
>> +
>> +	cancel_work_sync(&work->work);
>> +	kfree(work);
>> +}
>> +
>>   static int dax_hmem_platform_probe(struct platform_device *pdev)
>>   {
>> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
>> +	int rc;
>> +
>> +	if (!work)
>> +		return -ENOMEM;
>> +
>> +	work->pdev = pdev;
>> +	INIT_WORK(&work->work, process_defer_work);
>> +
>> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
>> +	if (rc)
>> +		return rc;
>> +
>> +	platform_set_drvdata(pdev, work);
>> +
>>   	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>>   }
>>   
>> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>>   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>>   MODULE_LICENSE("GPL v2");
>>   MODULE_AUTHOR("Intel Corporation");
>> +MODULE_IMPORT_NS("CXL");
>> -- 
>> 2.17.1
>>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alison Schofield 1 week, 5 days ago

On Wed, Jan 28, 2026 at 01:14:52PM -0800, Koralahalli Channabasappa, Smita wrote:
> On 1/26/2026 5:38 PM, Alison Schofield wrote:
> 
> [snip]
> ..
> 
> > > +static void process_defer_work(struct work_struct *_work)
> > > +{
> > > +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> > > +	struct platform_device *pdev = work->pdev;
> > > +	int rc;
> > > +
> > > +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> > > +	wait_for_device_probe();
> > > +
> > > +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> > > +
> > > +	if (!rc) {
> > > +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> > > +		rc = bus_rescan_devices(&cxl_bus_type);
> > > +		if (rc)
> > > +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> > > +	} else {
> > > +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
> > > +		cxl_region_teardown_all();
> > 
> > The region teardown appears as a one-shot sweep of existing regions
> > without considering regions not yet assembled. After this point will
> > a newly arriving region, be racing with HMEM again to create a DAX
> > region?
> 
> My understanding is that with the probe ordering patches and
> wait_for_device_probe(), CXL region discovery and assembly should have
> completed before this point.

OK - my confusion. Thanks for explaining.
-- Alison


> 
> Thanks
> Smita
> > 
> > 
> > > +	}
> > > +
> > > +	walk_hmem_resources(&pdev->dev, hmem_register_device);
> > > +}
> > > +
> > > +static void kill_defer_work(void *_work)
> > > +{
> > > +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> > > +
> > > +	cancel_work_sync(&work->work);
> > > +	kfree(work);
> > > +}
> > > +
> > >   static int dax_hmem_platform_probe(struct platform_device *pdev)
> > >   {
> > > +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> > > +	int rc;
> > > +
> > > +	if (!work)
> > > +		return -ENOMEM;
> > > +
> > > +	work->pdev = pdev;
> > > +	INIT_WORK(&work->work, process_defer_work);
> > > +
> > > +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> > > +	if (rc)
> > > +		return rc;
> > > +
> > > +	platform_set_drvdata(pdev, work);
> > > +
> > >   	return walk_hmem_resources(&pdev->dev, hmem_register_device);
> > >   }
> > > @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
> > >   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
> > >   MODULE_LICENSE("GPL v2");
> > >   MODULE_AUTHOR("Intel Corporation");
> > > +MODULE_IMPORT_NS("CXL");
> > > -- 
> > > 2.17.1
> > > 
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Dave Jiang 2 weeks, 3 days ago


On 1/21/26 9:55 PM, Smita Koralahalli wrote:
> The current probe time ownership check for Soft Reserved memory based
> solely on CXL window intersection is insufficient. dax_hmem probing is not
> always guaranteed to run after CXL enumeration and region assembly, which
> can lead to incorrect ownership decisions before the CXL stack has
> finished publishing windows and assembling committed regions.
> 
> Introduce deferred ownership handling for Soft Reserved ranges that
> intersect CXL windows at probe time by scheduling deferred work from
> dax_hmem and waiting for the CXL stack to complete enumeration and region
> assembly before deciding ownership.
> 
> Evaluate ownership of Soft Reserved ranges based on CXL region
> containment.
> 
>    - If all Soft Reserved ranges are fully contained within committed CXL
>      regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>      dax_cxl to bind.
> 
>    - If any Soft Reserved range is not fully claimed by committed CXL
>      region, tear down all CXL regions and REGISTER the Soft Reserved
>      ranges with dax_hmem instead.
> 
> While ownership resolution is pending, gate dax_cxl probing to avoid
> binding prematurely.
> 
> This enforces a strict ownership. Either CXL fully claims the Soft
> Reserved ranges or it relinquishes it entirely.
> 
> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>  drivers/cxl/core/region.c | 25 ++++++++++++
>  drivers/cxl/cxl.h         |  2 +
>  drivers/dax/cxl.c         |  9 +++++
>  drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 115 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 9827a6dd3187..6c22a2d4abbb 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>  DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>  			 cxl_region_debugfs_poison_clear, "%llx\n");
>  
> +static int cxl_region_teardown_cb(struct device *dev, void *data)
> +{
> +	struct cxl_root_decoder *cxlrd;
> +	struct cxl_region *cxlr;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_region(dev))
> +		return 0;
> +
> +	cxlr = to_cxl_region(dev);
> +
> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> +	port = cxlrd_to_port(cxlrd);
> +
> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
> +
> +	return 0;
> +}
> +
> +void cxl_region_teardown_all(void)
> +{
> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
> +}
> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
> +
>  static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>  {
>  	struct resource *res = data;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b0ff6b65ea0b..1864d35d5f69 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>  struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>  u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>  bool cxl_region_contains_soft_reserve(const struct resource *res);
> +void cxl_region_teardown_all(void);
>  #else
>  static inline bool is_cxl_pmem_region(struct device *dev)
>  {
> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>  {
>  	return false;
>  }
> +static inline void cxl_region_teardown_all(void) { }
>  #endif
>  
>  void cxl_endpoint_parse_cdat(struct cxl_port *port);
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..b7e90d6dd888 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	struct dax_region *dax_region;
>  	struct dev_dax_data data;
>  
> +	switch (dax_cxl_mode) {
> +	case DAX_CXL_MODE_DEFER:
> +		return -EPROBE_DEFER;
> +	case DAX_CXL_MODE_REGISTER:
> +		return -ENODEV;
> +	case DAX_CXL_MODE_DROP:
> +		break;
> +	}
> +
>  	if (nid == NUMA_NO_NODE)
>  		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>  
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 1e3424358490..bcb57d8678d7 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -3,6 +3,7 @@
>  #include <linux/memregion.h>
>  #include <linux/module.h>
>  #include <linux/dax.h>
> +#include "../../cxl/cxl.h"

Would it make sense to move what common definitions you need to include/cxl/cxl.h?

DJ

>  #include "../bus.h"
>  
>  static bool region_idle;
> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>  	platform_device_unregister(pdev);
>  }
>  
> +struct dax_defer_work {
> +	struct platform_device *pdev;
> +	struct work_struct work;
> +};
> +
>  static int hmem_register_device(struct device *host, int target_nid,
>  				const struct resource *res)
>  {
> +	struct dax_defer_work *work = dev_get_drvdata(host);
>  	struct platform_device *pdev;
>  	struct memregion_info info;
>  	long id;
> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>  	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>  			      IORES_DESC_CXL) != REGION_DISJOINT) {
> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
> -		return 0;
> +		switch (dax_cxl_mode) {
> +		case DAX_CXL_MODE_DEFER:
> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
> +			schedule_work(&work->work);
> +			return 0;
> +		case DAX_CXL_MODE_REGISTER:
> +			dev_dbg(host, "registering CXL range: %pr\n", res);
> +			break;
> +		case DAX_CXL_MODE_DROP:
> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> +			return 0;
> +		}
>  	}
>  
>  	rc = region_intersects_soft_reserve(res->start, resource_size(res));
> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	return rc;
>  }
>  
> +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
> +				     const struct resource *res)
> +{
> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
> +		if (!cxl_region_contains_soft_reserve(res))
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_defer_work(struct work_struct *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +	struct platform_device *pdev = work->pdev;
> +	int rc;
> +
> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> +	wait_for_device_probe();
> +
> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> +
> +	if (!rc) {
> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> +		rc = bus_rescan_devices(&cxl_bus_type);
> +		if (rc)
> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> +	} else {
> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
> +		cxl_region_teardown_all();
> +	}
> +
> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
> +}
> +
> +static void kill_defer_work(void *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +
> +	cancel_work_sync(&work->work);
> +	kfree(work);
> +}
> +
>  static int dax_hmem_platform_probe(struct platform_device *pdev)
>  {
> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> +	int rc;
> +
> +	if (!work)
> +		return -ENOMEM;
> +
> +	work->pdev = pdev;
> +	INIT_WORK(&work->work, process_defer_work);
> +
> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> +	if (rc)
> +		return rc;
> +
> +	platform_set_drvdata(pdev, work);
> +
>  	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>  }
>  
> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>  MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>  MODULE_LICENSE("GPL v2");
>  MODULE_AUTHOR("Intel Corporation");
> +MODULE_IMPORT_NS("CXL");

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alejandro Lucero Palau 2 weeks, 3 days ago

On 1/22/26 04:55, Smita Koralahalli wrote:
> The current probe time ownership check for Soft Reserved memory based
> solely on CXL window intersection is insufficient. dax_hmem probing is not
> always guaranteed to run after CXL enumeration and region assembly, which
> can lead to incorrect ownership decisions before the CXL stack has
> finished publishing windows and assembling committed regions.
>
> Introduce deferred ownership handling for Soft Reserved ranges that
> intersect CXL windows at probe time by scheduling deferred work from
> dax_hmem and waiting for the CXL stack to complete enumeration and region
> assembly before deciding ownership.
>
> Evaluate ownership of Soft Reserved ranges based on CXL region
> containment.
>
>     - If all Soft Reserved ranges are fully contained within committed CXL
>       regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>       dax_cxl to bind.
>
>     - If any Soft Reserved range is not fully claimed by committed CXL
>       region, tear down all CXL regions and REGISTER the Soft Reserved
>       ranges with dax_hmem instead.


I was not sure if I was understanding this properly, but after looking 
at the code I think I do ... but then I do not understand the reason 
behind. If I'm right, there could be two devices and therefore different 
soft reserved ranges, with one getting an automatic cxl region for all 
the range and the other without that, and the outcome would be the first 
one getting its region removed and added to hmem. Maybe I'm missing 
something obvious but, why? If there is a good reason, I think it should 
be documented in the commit and somewhere else.


I have also problems understanding the concurrency when handling the 
global dax_cxl_mode variable. It is modified inside process_defer_work() 
which I think can have different instances for different devices 
executed concurrently in different cores/workers (the system_wq used is 
not ordered). If I'm right race conditions are likely.


>
> While ownership resolution is pending, gate dax_cxl probing to avoid
> binding prematurely.
>
> This enforces a strict ownership. Either CXL fully claims the Soft
> Reserved ranges or it relinquishes it entirely.
>
> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>   drivers/cxl/core/region.c | 25 ++++++++++++
>   drivers/cxl/cxl.h         |  2 +
>   drivers/dax/cxl.c         |  9 +++++
>   drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>   4 files changed, 115 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 9827a6dd3187..6c22a2d4abbb 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>   DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>   			 cxl_region_debugfs_poison_clear, "%llx\n");
>   
> +static int cxl_region_teardown_cb(struct device *dev, void *data)
> +{
> +	struct cxl_root_decoder *cxlrd;
> +	struct cxl_region *cxlr;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_region(dev))
> +		return 0;
> +
> +	cxlr = to_cxl_region(dev);
> +
> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> +	port = cxlrd_to_port(cxlrd);
> +
> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
> +
> +	return 0;
> +}
> +
> +void cxl_region_teardown_all(void)
> +{
> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
> +}
> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
> +
>   static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>   {
>   	struct resource *res = data;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b0ff6b65ea0b..1864d35d5f69 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>   struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>   u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>   bool cxl_region_contains_soft_reserve(const struct resource *res);
> +void cxl_region_teardown_all(void);
>   #else
>   static inline bool is_cxl_pmem_region(struct device *dev)
>   {
> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>   {
>   	return false;
>   }
> +static inline void cxl_region_teardown_all(void) { }
>   #endif
>   
>   void cxl_endpoint_parse_cdat(struct cxl_port *port);
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..b7e90d6dd888 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>   	struct dax_region *dax_region;
>   	struct dev_dax_data data;
>   
> +	switch (dax_cxl_mode) {
> +	case DAX_CXL_MODE_DEFER:
> +		return -EPROBE_DEFER;
> +	case DAX_CXL_MODE_REGISTER:
> +		return -ENODEV;
> +	case DAX_CXL_MODE_DROP:
> +		break;
> +	}
> +
>   	if (nid == NUMA_NO_NODE)
>   		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>   
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 1e3424358490..bcb57d8678d7 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -3,6 +3,7 @@
>   #include <linux/memregion.h>
>   #include <linux/module.h>
>   #include <linux/dax.h>
> +#include "../../cxl/cxl.h"
>   #include "../bus.h"
>   
>   static bool region_idle;
> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>   	platform_device_unregister(pdev);
>   }
>   
> +struct dax_defer_work {
> +	struct platform_device *pdev;
> +	struct work_struct work;
> +};
> +
>   static int hmem_register_device(struct device *host, int target_nid,
>   				const struct resource *res)
>   {
> +	struct dax_defer_work *work = dev_get_drvdata(host);
>   	struct platform_device *pdev;
>   	struct memregion_info info;
>   	long id;
> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
>   	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>   	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>   			      IORES_DESC_CXL) != REGION_DISJOINT) {
> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
> -		return 0;
> +		switch (dax_cxl_mode) {
> +		case DAX_CXL_MODE_DEFER:
> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
> +			schedule_work(&work->work);
> +			return 0;
> +		case DAX_CXL_MODE_REGISTER:
> +			dev_dbg(host, "registering CXL range: %pr\n", res);
> +			break;
> +		case DAX_CXL_MODE_DROP:
> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> +			return 0;
> +		}
>   	}
>   
>   	rc = region_intersects_soft_reserve(res->start, resource_size(res));
> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
>   	return rc;
>   }
>   
> +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
> +				     const struct resource *res)
> +{
> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
> +		if (!cxl_region_contains_soft_reserve(res))
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_defer_work(struct work_struct *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +	struct platform_device *pdev = work->pdev;
> +	int rc;
> +
> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> +	wait_for_device_probe();
> +
> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> +
> +	if (!rc) {
> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> +		rc = bus_rescan_devices(&cxl_bus_type);
> +		if (rc)
> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> +	} else {
> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
> +		cxl_region_teardown_all();
> +	}
> +
> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
> +}
> +
> +static void kill_defer_work(void *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +
> +	cancel_work_sync(&work->work);
> +	kfree(work);
> +}
> +
>   static int dax_hmem_platform_probe(struct platform_device *pdev)
>   {
> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> +	int rc;
> +
> +	if (!work)
> +		return -ENOMEM;
> +
> +	work->pdev = pdev;
> +	INIT_WORK(&work->work, process_defer_work);
> +
> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> +	if (rc)
> +		return rc;
> +
> +	platform_set_drvdata(pdev, work);
> +
>   	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>   }
>   
> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>   MODULE_LICENSE("GPL v2");
>   MODULE_AUTHOR("Intel Corporation");
> +MODULE_IMPORT_NS("CXL");

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Koralahalli Channabasappa, Smita 2 weeks, 1 day ago

Hi Alejandro,

On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
> 
> On 1/22/26 04:55, Smita Koralahalli wrote:
>> The current probe time ownership check for Soft Reserved memory based
>> solely on CXL window intersection is insufficient. dax_hmem probing is 
>> not
>> always guaranteed to run after CXL enumeration and region assembly, which
>> can lead to incorrect ownership decisions before the CXL stack has
>> finished publishing windows and assembling committed regions.
>>
>> Introduce deferred ownership handling for Soft Reserved ranges that
>> intersect CXL windows at probe time by scheduling deferred work from
>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>> assembly before deciding ownership.
>>
>> Evaluate ownership of Soft Reserved ranges based on CXL region
>> containment.
>>
>>     - If all Soft Reserved ranges are fully contained within committed 
>> CXL
>>       regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>       dax_cxl to bind.
>>
>>     - If any Soft Reserved range is not fully claimed by committed CXL
>>       region, tear down all CXL regions and REGISTER the Soft Reserved
>>       ranges with dax_hmem instead.
> 
> 
> I was not sure if I was understanding this properly, but after looking 
> at the code I think I do ... but then I do not understand the reason 
> behind. If I'm right, there could be two devices and therefore different 
> soft reserved ranges, with one getting an automatic cxl region for all 
> the range and the other without that, and the outcome would be the first 
> one getting its region removed and added to hmem. Maybe I'm missing 
> something obvious but, why? If there is a good reason, I think it should 
> be documented in the commit and somewhere else.

Yeah, if I understood Dan correctly, that's exactly the intended behavior.

I'm trying to restate the "why" behind this based on Dan's earlier 
guidance. Please correct me if I'm misrepresenting it Dan.

The policy is meant to be coarse: If all SR ranges that intersect CXL 
windows are fully contained by committed CXL regions, then we have high 
confidence that the platform descriptions line up and CXL owns the memory.

If any SR range that intersects a CXL window is not fully covered by 
committed regions then we treat that as unexpected platform shenanigans. 
In that situation the intent is to give up on CXL entirely for those SR 
ranges because partial ownership becomes ambiguous.

This is why the fallback is global and not per range. The goal is to
leave no room for mixed some SR to CXL, some SR to HMEM configurations. 
Any mismatch should push the platform issue back to the vendor to fix 
the description (ideally preserving the simplifying assumption of a 1:1 
correlation between CXL Regions and SR).

Thanks for pointing this out. I will update the why in the next revision.

> 
> 
> I have also problems understanding the concurrency when handling the 
> global dax_cxl_mode variable. It is modified inside process_defer_work() 
> which I think can have different instances for different devices 
> executed concurrently in different cores/workers (the system_wq used is 
> not ordered). If I'm right race conditions are likely.

Yeah, this is something I spent sometime thinking on. My rationale 
behind not having it and where I'm still unsure:

My assumption was that after wait_for_device_probe(), CXL topology 
discovery and region commit are complete and stable. And each deferred 
worker should observe the same CXL state and therefore compute the same 
final policy (either DROP or REGISTER).

Also, I was assuming that even if multiple process_defer_work() 
instances run, the operations they perform are effectively safe to 
repeat.. though I'm not sure on this.

cxl_region_teardown_all(): this ultimately triggers the 
devm_release_action(... unregister_region ...) path. My expectation was 
that these devm actions are single shot per device lifecycle, so 
repeated teardown attempts should become noops. And 
cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(), 
which takes "cxl_rwsem.region". That should serialize decoder detach and 
region teardown.

bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during 
boot are fine as the rescan path will simply rediscover already present 
devices..

walk_hmem_resources(.., hmem_register_device): in the DROP case,I 
thought running the walk multiple times is safe because devm managed 
platform devices and memregion allocations should prevent duplicate 
lifetime issues.

So, even if multiple process_defer_work() instances execute 
concurrently, the CXL operations involved in containment evaluation 
(cxl_region_contains_soft_reserve()) and teardown are already guarded.

But I'm still trying to understand if bus_rescan_devices(&cxl_bus_type) 
is not safe when invoked concurrently?

Or is the primary issue that dax_cxl_mode is a global updated from one 
context and read from others, and should be synchronized even if the 
computed final value will always be the same?

Happy to correct if my understanding is incorrect.

Thanks
Smita

> 
> 
>>
>> While ownership resolution is pending, gate dax_cxl probing to avoid
>> binding prematurely.
>>
>> This enforces a strict ownership. Either CXL fully claims the Soft
>> Reserved ranges or it relinquishes it entirely.
>>
>> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>>   drivers/cxl/core/region.c | 25 ++++++++++++
>>   drivers/cxl/cxl.h         |  2 +
>>   drivers/dax/cxl.c         |  9 +++++
>>   drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>>   4 files changed, 115 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>> index 9827a6dd3187..6c22a2d4abbb 100644
>> --- a/drivers/cxl/core/region.c
>> +++ b/drivers/cxl/core/region.c
>> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void 
>> *data, u64 offset)
>>   DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>>                cxl_region_debugfs_poison_clear, "%llx\n");
>> +static int cxl_region_teardown_cb(struct device *dev, void *data)
>> +{
>> +    struct cxl_root_decoder *cxlrd;
>> +    struct cxl_region *cxlr;
>> +    struct cxl_port *port;
>> +
>> +    if (!is_cxl_region(dev))
>> +        return 0;
>> +
>> +    cxlr = to_cxl_region(dev);
>> +
>> +    cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
>> +    port = cxlrd_to_port(cxlrd);
>> +
>> +    devm_release_action(port->uport_dev, unregister_region, cxlr);
>> +
>> +    return 0;
>> +}
>> +
>> +void cxl_region_teardown_all(void)
>> +{
>> +    bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
>> +}
>> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
>> +
>>   static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>>   {
>>       struct resource *res = data;
>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index b0ff6b65ea0b..1864d35d5f69 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder 
>> *cxled);
>>   struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>>   u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>>   bool cxl_region_contains_soft_reserve(const struct resource *res);
>> +void cxl_region_teardown_all(void);
>>   #else
>>   static inline bool is_cxl_pmem_region(struct device *dev)
>>   {
>> @@ -933,6 +934,7 @@ static inline bool 
>> cxl_region_contains_soft_reserve(const struct resource *res)
>>   {
>>       return false;
>>   }
>> +static inline void cxl_region_teardown_all(void) { }
>>   #endif
>>   void cxl_endpoint_parse_cdat(struct cxl_port *port);
>> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
>> index 13cd94d32ff7..b7e90d6dd888 100644
>> --- a/drivers/dax/cxl.c
>> +++ b/drivers/dax/cxl.c
>> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>>       struct dax_region *dax_region;
>>       struct dev_dax_data data;
>> +    switch (dax_cxl_mode) {
>> +    case DAX_CXL_MODE_DEFER:
>> +        return -EPROBE_DEFER;
>> +    case DAX_CXL_MODE_REGISTER:
>> +        return -ENODEV;
>> +    case DAX_CXL_MODE_DROP:
>> +        break;
>> +    }
>> +
>>       if (nid == NUMA_NO_NODE)
>>           nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
>> index 1e3424358490..bcb57d8678d7 100644
>> --- a/drivers/dax/hmem/hmem.c
>> +++ b/drivers/dax/hmem/hmem.c
>> @@ -3,6 +3,7 @@
>>   #include <linux/memregion.h>
>>   #include <linux/module.h>
>>   #include <linux/dax.h>
>> +#include "../../cxl/cxl.h"
>>   #include "../bus.h"
>>   static bool region_idle;
>> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>>       platform_device_unregister(pdev);
>>   }
>> +struct dax_defer_work {
>> +    struct platform_device *pdev;
>> +    struct work_struct work;
>> +};
>> +
>>   static int hmem_register_device(struct device *host, int target_nid,
>>                   const struct resource *res)
>>   {
>> +    struct dax_defer_work *work = dev_get_drvdata(host);
>>       struct platform_device *pdev;
>>       struct memregion_info info;
>>       long id;
>> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device 
>> *host, int target_nid,
>>       if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>>           region_intersects(res->start, resource_size(res), 
>> IORESOURCE_MEM,
>>                     IORES_DESC_CXL) != REGION_DISJOINT) {
>> -        dev_dbg(host, "deferring range to CXL: %pr\n", res);
>> -        return 0;
>> +        switch (dax_cxl_mode) {
>> +        case DAX_CXL_MODE_DEFER:
>> +            dev_dbg(host, "deferring range to CXL: %pr\n", res);
>> +            schedule_work(&work->work);
>> +            return 0;
>> +        case DAX_CXL_MODE_REGISTER:
>> +            dev_dbg(host, "registering CXL range: %pr\n", res);
>> +            break;
>> +        case DAX_CXL_MODE_DROP:
>> +            dev_dbg(host, "dropping CXL range: %pr\n", res);
>> +            return 0;
>> +        }
>>       }
>>       rc = region_intersects_soft_reserve(res->start, 
>> resource_size(res));
>> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device 
>> *host, int target_nid,
>>       return rc;
>>   }
>> +static int cxl_contains_soft_reserve(struct device *host, int 
>> target_nid,
>> +                     const struct resource *res)
>> +{
>> +    if (region_intersects(res->start, resource_size(res), 
>> IORESOURCE_MEM,
>> +                  IORES_DESC_CXL) != REGION_DISJOINT) {
>> +        if (!cxl_region_contains_soft_reserve(res))
>> +            return 1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void process_defer_work(struct work_struct *_work)
>> +{
>> +    struct dax_defer_work *work = container_of(_work, typeof(*work), 
>> work);
>> +    struct platform_device *pdev = work->pdev;
>> +    int rc;
>> +
>> +    /* relies on cxl_acpi and cxl_pci having had a chance to load */
>> +    wait_for_device_probe();
>> +
>> +    rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
>> +
>> +    if (!rc) {
>> +        dax_cxl_mode = DAX_CXL_MODE_DROP;
>> +        rc = bus_rescan_devices(&cxl_bus_type);
>> +        if (rc)
>> +            dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
>> +    } else {
>> +        dax_cxl_mode = DAX_CXL_MODE_REGISTER;
>> +        cxl_region_teardown_all();
>> +    }
>> +
>> +    walk_hmem_resources(&pdev->dev, hmem_register_device);
>> +}
>> +
>> +static void kill_defer_work(void *_work)
>> +{
>> +    struct dax_defer_work *work = container_of(_work, typeof(*work), 
>> work);
>> +
>> +    cancel_work_sync(&work->work);
>> +    kfree(work);
>> +}
>> +
>>   static int dax_hmem_platform_probe(struct platform_device *pdev)
>>   {
>> +    struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
>> +    int rc;
>> +
>> +    if (!work)
>> +        return -ENOMEM;
>> +
>> +    work->pdev = pdev;
>> +    INIT_WORK(&work->work, process_defer_work);
>> +
>> +    rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
>> +    if (rc)
>> +        return rc;
>> +
>> +    platform_set_drvdata(pdev, work);
>> +
>>       return walk_hmem_resources(&pdev->dev, hmem_register_device);
>>   }
>> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>>   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' 
>> memory");
>>   MODULE_LICENSE("GPL v2");
>>   MODULE_AUTHOR("Intel Corporation");
>> +MODULE_IMPORT_NS("CXL");

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by dan.j.williams@intel.com 2 weeks ago

[responding to the questions raised here before reviewing the patch...]

Koralahalli Channabasappa, Smita wrote:
> Hi Alejandro,
> 
> On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
> > 
> > On 1/22/26 04:55, Smita Koralahalli wrote:
> >> The current probe time ownership check for Soft Reserved memory based
> >> solely on CXL window intersection is insufficient. dax_hmem probing is 
> >> not
> >> always guaranteed to run after CXL enumeration and region assembly, which
> >> can lead to incorrect ownership decisions before the CXL stack has
> >> finished publishing windows and assembling committed regions.
> >>
> >> Introduce deferred ownership handling for Soft Reserved ranges that
> >> intersect CXL windows at probe time by scheduling deferred work from
> >> dax_hmem and waiting for the CXL stack to complete enumeration and region
> >> assembly before deciding ownership.
> >>
> >> Evaluate ownership of Soft Reserved ranges based on CXL region
> >> containment.
> >>
> >>     - If all Soft Reserved ranges are fully contained within committed 
> >> CXL
> >>       regions, DROP handling Soft Reserved ranges from dax_hmem and allow
> >>       dax_cxl to bind.
> >>
> >>     - If any Soft Reserved range is not fully claimed by committed CXL
> >>       region, tear down all CXL regions and REGISTER the Soft Reserved
> >>       ranges with dax_hmem instead.
> > 
> > 
> > I was not sure if I was understanding this properly, but after looking 
> > at the code I think I do ... but then I do not understand the reason 
> > behind. If I'm right, there could be two devices and therefore different 
> > soft reserved ranges, with one getting an automatic cxl region for all 
> > the range and the other without that, and the outcome would be the first 
> > one getting its region removed and added to hmem. Maybe I'm missing 
> > something obvious but, why? If there is a good reason, I think it should 
> > be documented in the commit and somewhere else.
> 
> Yeah, if I understood Dan correctly, that's exactly the intended behavior.
> 
> I'm trying to restate the "why" behind this based on Dan's earlier 
> guidance. Please correct me if I'm misrepresenting it Dan.
> 
> The policy is meant to be coarse: If all SR ranges that intersect CXL 
> windows are fully contained by committed CXL regions, then we have high 
> confidence that the platform descriptions line up and CXL owns the memory.
> 
> If any SR range that intersects a CXL window is not fully covered by 
> committed regions then we treat that as unexpected platform shenanigans. 
> In that situation the intent is to give up on CXL entirely for those SR 
> ranges because partial ownership becomes ambiguous.
> 
> This is why the fallback is global and not per range. The goal is to
> leave no room for mixed some SR to CXL, some SR to HMEM configurations. 
> Any mismatch should push the platform issue back to the vendor to fix 
> the description (ideally preserving the simplifying assumption of a 1:1 
> correlation between CXL Regions and SR).
> 
> Thanks for pointing this out. I will update the why in the next revision.

You have it right. This is mostly a policy to save debug sanity and
share the compatibility pain. You either always get everything the BIOS
put into the memory map, or you get the fully enlightened CXL world.

When accelerator memory enters the mix it does require an opt-in/out of
this scheme. Either the device completely opts out of this HMEM fallback
mechanism by marking the memory as Reserved (the dominant preference),
or it arranges for CXL accelerator drivers to be present at boot if they
want to interoperate with this fallback. Some folks want the fallback:
https://lpc.events/event/19/contributions/2064/

> > I have also problems understanding the concurrency when handling the 
> > global dax_cxl_mode variable. It is modified inside process_defer_work() 
> > which I think can have different instances for different devices 
> > executed concurrently in different cores/workers (the system_wq used is 
> > not ordered). If I'm right race conditions are likely.

It only works as a single queue of regions. One sync point to say "all
collected regions are routed into the dax_hmem or dax_cxl bucket".

> Yeah, this is something I spent sometime thinking on. My rationale 
> behind not having it and where I'm still unsure:
> 
> My assumption was that after wait_for_device_probe(), CXL topology 
> discovery and region commit are complete and stable.

...or more specifically, any CXL region discovery after that point is a
typical runtime dynamic discovery event that is not subject to any
deferral.

> And each deferred worker should observe the same CXL state and
> therefore compute the same final policy (either DROP or REGISTER).

The expectation is one queue, one event that takes the rwsem and
dispositions all present regions relative to initial soft-reserve memory
map.

> Also, I was assuming that even if multiple process_defer_work() 
> instances run, the operations they perform are effectively safe to 
> repeat.. though I'm not sure on this.

I think something is wrong if the workqueue runs more than once. It is
just a place to wait for initial device probe to complete and then fixup
all the regions (allow dax_region registration to proceed) that were
waiting for that.

> cxl_region_teardown_all(): this ultimately triggers the 
> devm_release_action(... unregister_region ...) path. My expectation was 
> that these devm actions are single shot per device lifecycle, so 
> repeated teardown attempts should become noops.

Not noops, right? The definition of a devm_action is that they always
fire at device_del(). There is no facility to device_del() a device
twice.

> cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(), 
> which takes "cxl_rwsem.region". That should serialize decoder detach and 
> region teardown.
> 
> bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during 
> boot are fine as the rescan path will simply rediscover already present 
> devices..

The rescan path likely needs some logic to give up on CXL region
autodiscovery for devices that failed their memmap compatibility check.

> walk_hmem_resources(.., hmem_register_device): in the DROP case,I 
> thought running the walk multiple times is safe because devm managed 
> platform devices and memregion allocations should prevent duplicate 
> lifetime issues.
> 
> So, even if multiple process_defer_work() instances execute 
> concurrently, the CXL operations involved in containment evaluation 
> (cxl_region_contains_soft_reserve()) and teardown are already guarded.
> 
> But I'm still trying to understand if bus_rescan_devices(&cxl_bus_type) 
> is not safe when invoked concurrently?

It already races today between natural bus enumeration and the
cxl_bus_rescan() call from cxl_acpi. So it needs to be ok, it is
naturally synchronized by the region's device_lock and regions' rwsem.

> Or is the primary issue that dax_cxl_mode is a global updated from one 
> context and read from others, and should be synchronized even if the 
> computed final value will always be the same?

There is only one global hmem_platform device, so only one potential
item in this workqueue.

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Koralahalli Channabasappa, Smita 1 week, 6 days ago

Hi Dan,

Thanks for clearing some of my incorrect understandings here.

On 1/26/2026 3:53 PM, dan.j.williams@intel.com wrote:
> [responding to the questions raised here before reviewing the patch...]
> 
> Koralahalli Channabasappa, Smita wrote:
>> Hi Alejandro,
>>
>> On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
>>>
>>> On 1/22/26 04:55, Smita Koralahalli wrote:
>>>> The current probe time ownership check for Soft Reserved memory based
>>>> solely on CXL window intersection is insufficient. dax_hmem probing is
>>>> not
>>>> always guaranteed to run after CXL enumeration and region assembly, which
>>>> can lead to incorrect ownership decisions before the CXL stack has
>>>> finished publishing windows and assembling committed regions.
>>>>
>>>> Introduce deferred ownership handling for Soft Reserved ranges that
>>>> intersect CXL windows at probe time by scheduling deferred work from
>>>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>>>> assembly before deciding ownership.
>>>>
>>>> Evaluate ownership of Soft Reserved ranges based on CXL region
>>>> containment.
>>>>
>>>>      - If all Soft Reserved ranges are fully contained within committed
>>>> CXL
>>>>        regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>>>        dax_cxl to bind.
>>>>
>>>>      - If any Soft Reserved range is not fully claimed by committed CXL
>>>>        region, tear down all CXL regions and REGISTER the Soft Reserved
>>>>        ranges with dax_hmem instead.
>>>
>>>
>>> I was not sure if I was understanding this properly, but after looking
>>> at the code I think I do ... but then I do not understand the reason
>>> behind. If I'm right, there could be two devices and therefore different
>>> soft reserved ranges, with one getting an automatic cxl region for all
>>> the range and the other without that, and the outcome would be the first
>>> one getting its region removed and added to hmem. Maybe I'm missing
>>> something obvious but, why? If there is a good reason, I think it should
>>> be documented in the commit and somewhere else.
>>
>> Yeah, if I understood Dan correctly, that's exactly the intended behavior.
>>
>> I'm trying to restate the "why" behind this based on Dan's earlier
>> guidance. Please correct me if I'm misrepresenting it Dan.
>>
>> The policy is meant to be coarse: If all SR ranges that intersect CXL
>> windows are fully contained by committed CXL regions, then we have high
>> confidence that the platform descriptions line up and CXL owns the memory.
>>
>> If any SR range that intersects a CXL window is not fully covered by
>> committed regions then we treat that as unexpected platform shenanigans.
>> In that situation the intent is to give up on CXL entirely for those SR
>> ranges because partial ownership becomes ambiguous.
>>
>> This is why the fallback is global and not per range. The goal is to
>> leave no room for mixed some SR to CXL, some SR to HMEM configurations.
>> Any mismatch should push the platform issue back to the vendor to fix
>> the description (ideally preserving the simplifying assumption of a 1:1
>> correlation between CXL Regions and SR).
>>
>> Thanks for pointing this out. I will update the why in the next revision.
> 
> You have it right. This is mostly a policy to save debug sanity and
> share the compatibility pain. You either always get everything the BIOS
> put into the memory map, or you get the fully enlightened CXL world.
> 
> When accelerator memory enters the mix it does require an opt-in/out of
> this scheme. Either the device completely opts out of this HMEM fallback
> mechanism by marking the memory as Reserved (the dominant preference),
> or it arranges for CXL accelerator drivers to be present at boot if they
> want to interoperate with this fallback. Some folks want the fallback:
> https://lpc.events/event/19/contributions/2064/
> 
>>> I have also problems understanding the concurrency when handling the
>>> global dax_cxl_mode variable. It is modified inside process_defer_work()
>>> which I think can have different instances for different devices
>>> executed concurrently in different cores/workers (the system_wq used is
>>> not ordered). If I'm right race conditions are likely.
> 
> It only works as a single queue of regions. One sync point to say "all
> collected regions are routed into the dax_hmem or dax_cxl bucket".

Got it. My earlier assumption of multiple executions of the deferred 
work is incorrect. Thank you.

> 
>> Yeah, this is something I spent sometime thinking on. My rationale
>> behind not having it and where I'm still unsure:
>>
>> My assumption was that after wait_for_device_probe(), CXL topology
>> discovery and region commit are complete and stable.
> 
> ...or more specifically, any CXL region discovery after that point is a
> typical runtime dynamic discovery event that is not subject to any
> deferral.
> 
>> And each deferred worker should observe the same CXL state and
>> therefore compute the same final policy (either DROP or REGISTER).
> 
> The expectation is one queue, one event that takes the rwsem and
> dispositions all present regions relative to initial soft-reserve memory
> map.
> 
>> Also, I was assuming that even if multiple process_defer_work()
>> instances run, the operations they perform are effectively safe to
>> repeat.. though I'm not sure on this.
> 
> I think something is wrong if the workqueue runs more than once. It is
> just a place to wait for initial device probe to complete and then fixup
> all the regions (allow dax_region registration to proceed) that were
> waiting for that.

Right.

> 
>> cxl_region_teardown_all(): this ultimately triggers the
>> devm_release_action(... unregister_region ...) path. My expectation was
>> that these devm actions are single shot per device lifecycle, so
>> repeated teardown attempts should become noops.
> 
> Not noops, right? The definition of a devm_action is that they always
> fire at device_del(). There is no facility to device_del() a device
> twice.

Yeah they fire exactly once at device_del().

> 
>> cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(),
>> which takes "cxl_rwsem.region". That should serialize decoder detach and
>> region teardown.
>>
>> bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during
>> boot are fine as the rescan path will simply rediscover already present
>> devices..
> 
> The rescan path likely needs some logic to give up on CXL region
> autodiscovery for devices that failed their memmap compatibility check.
> 
>> walk_hmem_resources(.., hmem_register_device): in the DROP case,I
>> thought running the walk multiple times is safe because devm managed
>> platform devices and memregion allocations should prevent duplicate
>> lifetime issues.
>>
>> So, even if multiple process_defer_work() instances execute
>> concurrently, the CXL operations involved in containment evaluation
>> (cxl_region_contains_soft_reserve()) and teardown are already guarded.
>>
>> But I'm still trying to understand if bus_rescan_devices(&cxl_bus_type)
>> is not safe when invoked concurrently?
> 
> It already races today between natural bus enumeration and the
> cxl_bus_rescan() call from cxl_acpi. So it needs to be ok, it is
> naturally synchronized by the region's device_lock and regions' rwsem.

Thanks for confirming this.

> 
>> Or is the primary issue that dax_cxl_mode is a global updated from one
>> context and read from others, and should be synchronized even if the
>> computed final value will always be the same?
> 
> There is only one global hmem_platform device, so only one potential
> item in this workqueue.

Thanks
Smita

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alejandro Lucero Palau 1 week, 6 days ago

On 1/26/26 23:53, dan.j.williams@intel.com wrote:
> [responding to the questions raised here before reviewing the patch...]
>
> Koralahalli Channabasappa, Smita wrote:
>> Hi Alejandro,
>>
>> On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
>>> On 1/22/26 04:55, Smita Koralahalli wrote:
>>>> The current probe time ownership check for Soft Reserved memory based
>>>> solely on CXL window intersection is insufficient. dax_hmem probing is
>>>> not
>>>> always guaranteed to run after CXL enumeration and region assembly, which
>>>> can lead to incorrect ownership decisions before the CXL stack has
>>>> finished publishing windows and assembling committed regions.
>>>>
>>>> Introduce deferred ownership handling for Soft Reserved ranges that
>>>> intersect CXL windows at probe time by scheduling deferred work from
>>>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>>>> assembly before deciding ownership.
>>>>
>>>> Evaluate ownership of Soft Reserved ranges based on CXL region
>>>> containment.
>>>>
>>>>      - If all Soft Reserved ranges are fully contained within committed
>>>> CXL
>>>>        regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>>>        dax_cxl to bind.
>>>>
>>>>      - If any Soft Reserved range is not fully claimed by committed CXL
>>>>        region, tear down all CXL regions and REGISTER the Soft Reserved
>>>>        ranges with dax_hmem instead.
>>>
>>> I was not sure if I was understanding this properly, but after looking
>>> at the code I think I do ... but then I do not understand the reason
>>> behind. If I'm right, there could be two devices and therefore different
>>> soft reserved ranges, with one getting an automatic cxl region for all
>>> the range and the other without that, and the outcome would be the first
>>> one getting its region removed and added to hmem. Maybe I'm missing
>>> something obvious but, why? If there is a good reason, I think it should
>>> be documented in the commit and somewhere else.
>> Yeah, if I understood Dan correctly, that's exactly the intended behavior.
>>
>> I'm trying to restate the "why" behind this based on Dan's earlier
>> guidance. Please correct me if I'm misrepresenting it Dan.
>>
>> The policy is meant to be coarse: If all SR ranges that intersect CXL
>> windows are fully contained by committed CXL regions, then we have high
>> confidence that the platform descriptions line up and CXL owns the memory.
>>
>> If any SR range that intersects a CXL window is not fully covered by
>> committed regions then we treat that as unexpected platform shenanigans.
>> In that situation the intent is to give up on CXL entirely for those SR
>> ranges because partial ownership becomes ambiguous.
>>
>> This is why the fallback is global and not per range. The goal is to
>> leave no room for mixed some SR to CXL, some SR to HMEM configurations.
>> Any mismatch should push the platform issue back to the vendor to fix
>> the description (ideally preserving the simplifying assumption of a 1:1
>> correlation between CXL Regions and SR).
>>
>> Thanks for pointing this out. I will update the why in the next revision.
> You have it right. This is mostly a policy to save debug sanity and
> share the compatibility pain. You either always get everything the BIOS
> put into the memory map, or you get the fully enlightened CXL world.
>
> When accelerator memory enters the mix it does require an opt-in/out of
> this scheme. Either the device completely opts out of this HMEM fallback
> mechanism by marking the memory as Reserved (the dominant preference),
> or it arranges for CXL accelerator drivers to be present at boot if they
> want to interoperate with this fallback. Some folks want the fallback:
> https://lpc.events/event/19/contributions/2064/


I will take a look at this presentation, but I think there could be 
another option where accelerators information is obtained during pci 
enumeration by the kernel and using this information by this 
functionality skipping those ranges allocated to them. Forcing them to 
be compiled with the kernel would go against what distributions 
currently and widely do with initramfs. Not sure if some current "early" 
stubs could be used for this though but the information needs to be 
recollected before this code does the checks.


>>> I have also problems understanding the concurrency when handling the
>>> global dax_cxl_mode variable. It is modified inside process_defer_work()
>>> which I think can have different instances for different devices
>>> executed concurrently in different cores/workers (the system_wq used is
>>> not ordered). If I'm right race conditions are likely.
> It only works as a single queue of regions. One sync point to say "all
> collected regions are routed into the dax_hmem or dax_cxl bucket".


That is how I think it should work, handling all the soft reserved 
ranges vs regions by one code execution. But that is not the case. More 
later.


>> Yeah, this is something I spent sometime thinking on. My rationale
>> behind not having it and where I'm still unsure:
>>
>> My assumption was that after wait_for_device_probe(), CXL topology
>> discovery and region commit are complete and stable.
> ...or more specifically, any CXL region discovery after that point is a
> typical runtime dynamic discovery event that is not subject to any
> deferral.
>
>> And each deferred worker should observe the same CXL state and
>> therefore compute the same final policy (either DROP or REGISTER).
> The expectation is one queue, one event that takes the rwsem and
> dispositions all present regions relative to initial soft-reserve memory
> map.
>
>> Also, I was assuming that even if multiple process_defer_work()
>> instances run, the operations they perform are effectively safe to
>> repeat.. though I'm not sure on this.
> I think something is wrong if the workqueue runs more than once. It is
> just a place to wait for initial device probe to complete and then fixup
> all the regions (allow dax_region registration to proceed) that were
> waiting for that.
>
>> cxl_region_teardown_all(): this ultimately triggers the
>> devm_release_action(... unregister_region ...) path. My expectation was
>> that these devm actions are single shot per device lifecycle, so
>> repeated teardown attempts should become noops.
> Not noops, right? The definition of a devm_action is that they always
> fire at device_del(). There is no facility to device_del() a device
> twice.
>
>> cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(),
>> which takes "cxl_rwsem.region". That should serialize decoder detach and
>> region teardown.
>>
>> bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during
>> boot are fine as the rescan path will simply rediscover already present
>> devices..
> The rescan path likely needs some logic to give up on CXL region
> autodiscovery for devices that failed their memmap compatibility check.
>
>> walk_hmem_resources(.., hmem_register_device): in the DROP case,I
>> thought running the walk multiple times is safe because devm managed
>> platform devices and memregion allocations should prevent duplicate
>> lifetime issues.
>>
>> So, even if multiple process_defer_work() instances execute
>> concurrently, the CXL operations involved in containment evaluation
>> (cxl_region_contains_soft_reserve()) and teardown are already guarded.
>>
>> But I'm still trying to understand if bus_rescan_devices(&cxl_bus_type)
>> is not safe when invoked concurrently?
> It already races today between natural bus enumeration and the
> cxl_bus_rescan() call from cxl_acpi. So it needs to be ok, it is
> naturally synchronized by the region's device_lock and regions' rwsem.
>
>> Or is the primary issue that dax_cxl_mode is a global updated from one
>> context and read from others, and should be synchronized even if the
>> computed final value will always be the same?
> There is only one global hmem_platform device, so only one potential
> item in this workqueue.


Well, I do not think so.


hmem_init() in drivers/dax/device.c walks IORESOURCE_MEM looking for 
IORES_DESC_SOFT_RESERVED descriptors and calling hmem_register_one for 
each of them. That leads to create an hmem_platform platform device (no 
typo, just emphasizing this is a platform device with name 
hmem_platform) so there will be as many hmem_platform devices as 
descriptors found.


Then each hmem_platform probe() will create an hmem platform device, 
where a work will be schedule passing this specific hmem platform device 
as argument. So, each work will check for the specific ranges of its own 
pdev and not all of them. The check can result in a different value 
assigned to dax_cxl_mode leading to potential race conditions with 
concurrent workers and also potentially leaving soft reserved ranges 
without both, a dax or an hmem device.

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by dan.j.williams@intel.com 1 week, 6 days ago

Alejandro Lucero Palau wrote:
[..]
> I will take a look at this presentation, but I think there could be 
> another option where accelerators information is obtained during pci 
> enumeration by the kernel and using this information by this 
> functionality skipping those ranges allocated to them. Forcing them to 
> be compiled with the kernel would go against what distributions 
> currently and widely do with initramfs. Not sure if some current "early" 
> stubs could be used for this though but the information needs to be 
> recollected before this code does the checks.

The simple path is "do not use EFI_MEMORY_SP for accelerator memory".
However, if the accelerator wants to publish memory as EFI_MEMORY_SP
then it needs to coordinate with the kernel's default behavior somehow.
That means expanding the list of drivers that dax_hmem needs to await
before it can make a determination, or teaching dax_hmem to look for a
secondary indication that it should never fall back to the default
behavior.

Talk to your AMD peers Paul and Rajneesh about their needs. I took it on
faith that the use case was required.

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alejandro Lucero Palau 1 week, 5 days ago

On 1/27/26 23:41, dan.j.williams@intel.com wrote:
> Alejandro Lucero Palau wrote:
> [..]
>> I will take a look at this presentation, but I think there could be
>> another option where accelerators information is obtained during pci
>> enumeration by the kernel and using this information by this
>> functionality skipping those ranges allocated to them. Forcing them to
>> be compiled with the kernel would go against what distributions
>> currently and widely do with initramfs. Not sure if some current "early"
>> stubs could be used for this though but the information needs to be
>> recollected before this code does the checks.
> The simple path is "do not use EFI_MEMORY_SP for accelerator memory".

Sure. That is what I hope our device will end up having ... since 
neither hmem nor dax is an option for us.

> However, if the accelerator wants to publish memory as EFI_MEMORY_SP
> then it needs to coordinate with the kernel's default behavior somehow.

I think some Type2 drivers could be happy with dax and therefore using 
EFI_MEMORY_SP, so yes, that is what I meant: there is another option 
instead of forcing drivers to be present at the time of this decision. 
If someone reading is working on Type2 drivers and see this 
suitable/required, please tell. I'll be interested in doing it or helping.

> That means expanding the list of drivers that dax_hmem needs to await
> before it can make a determination, or teaching dax_hmem to look for a
> secondary indication that it should never fall back to the default
> behavior.

I think waiting could be problematic as some Type2 drivers could not be 
automatically load. It looks like if a CXL region is not backing the 
Type2 CXL.mem completely should not impact dax devices and cxl regions 
maybe being used at Type2 driver probe. Would a warning be enough?

>
> Talk to your AMD peers Paul and Rajneesh about their needs. I took it on
> faith that the use case was required.

After reading that presentation, I think this is a different subject. 
Assuming case 1 there is what you have in mind, and if I understand it 
properly, that could be useful for companies owning the full platform, 
but not sure adding a specific acpi driver per device makes sense for 
less-powerful vendors. Anyway, I will talk with them as the memory 
allocation part which seems to be one thing to do by those acpi drivers 
is interesting.

Thank you

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Tomasz Wolski 4 months, 1 week ago

>> [responding to the questions raised here before reviewing the patch...]
>>
>> Koralahalli Channabasappa, Smita wrote:
>>> Hi Alejandro,
>>>
>>> On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
>>>> On 1/22/26 04:55, Smita Koralahalli wrote:
>>>>> The current probe time ownership check for Soft Reserved memory based
>>>>> solely on CXL window intersection is insufficient. dax_hmem probing is
>>>>> not
>>>>> always guaranteed to run after CXL enumeration and region assembly, which
>>>>> can lead to incorrect ownership decisions before the CXL stack has
>>>>> finished publishing windows and assembling committed regions.
>>>>>
>>>>> Introduce deferred ownership handling for Soft Reserved ranges that
>>>>> intersect CXL windows at probe time by scheduling deferred work from
>>>>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>>>>> assembly before deciding ownership.
>>>>>
>>>>> Evaluate ownership of Soft Reserved ranges based on CXL region
>>>>> containment.
>>>>>
>>>>>      - If all Soft Reserved ranges are fully contained within committed
>>>>> CXL
>>>>>        regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>>>>        dax_cxl to bind.
>>>>>
>>>>>      - If any Soft Reserved range is not fully claimed by committed CXL
>>>>>        region, tear down all CXL regions and REGISTER the Soft Reserved
>>>>>        ranges with dax_hmem instead.
>>>>
>>>> I was not sure if I was understanding this properly, but after looking
>>>> at the code I think I do ... but then I do not understand the reason
>>>> behind. If I'm right, there could be two devices and therefore different
>>>> soft reserved ranges, with one getting an automatic cxl region for all
>>>> the range and the other without that, and the outcome would be the first
>>>> one getting its region removed and added to hmem. Maybe I'm missing
>>>> something obvious but, why? If there is a good reason, I think it should
>>>> be documented in the commit and somewhere else.
>>> Yeah, if I understood Dan correctly, that's exactly the intended behavior.
>>>
>>> I'm trying to restate the "why" behind this based on Dan's earlier
>>> guidance. Please correct me if I'm misrepresenting it Dan.
>>>
>>> The policy is meant to be coarse: If all SR ranges that intersect CXL
>>> windows are fully contained by committed CXL regions, then we have high
>>> confidence that the platform descriptions line up and CXL owns the memory.
>>>
>>> If any SR range that intersects a CXL window is not fully covered by
>>> committed regions then we treat that as unexpected platform shenanigans.
>>> In that situation the intent is to give up on CXL entirely for those SR
>>> ranges because partial ownership becomes ambiguous.
>>>
>>> This is why the fallback is global and not per range. The goal is to
>>> leave no room for mixed some SR to CXL, some SR to HMEM configurations.
>>> Any mismatch should push the platform issue back to the vendor to fix
>>> the description (ideally preserving the simplifying assumption of a 1:1
>>> correlation between CXL Regions and SR).
>>>
>>> Thanks for pointing this out. I will update the why in the next revision.
>> You have it right. This is mostly a policy to save debug sanity and
>> share the compatibility pain. You either always get everything the BIOS
>> put into the memory map, or you get the fully enlightened CXL world.
>>
>> When accelerator memory enters the mix it does require an opt-in/out of
>> this scheme. Either the device completely opts out of this HMEM fallback
>> mechanism by marking the memory as Reserved (the dominant preference),
>> or it arranges for CXL accelerator drivers to be present at boot if they
>> want to interoperate with this fallback. Some folks want the fallback:
>> https://lpc.events/event/19/contributions/2064/
>
>
>I will take a look at this presentation, but I think there could be 
>another option where accelerators information is obtained during pci 
>enumeration by the kernel and using this information by this 
>functionality skipping those ranges allocated to them. Forcing them to 
>be compiled with the kernel would go against what distributions 
>currently and widely do with initramfs. Not sure if some current "early" 
>stubs could be used for this though but the information needs to be 
>recollected before this code does the checks.
>
>
>>>> I have also problems understanding the concurrency when handling the
>>>> global dax_cxl_mode variable. It is modified inside process_defer_work()
>>>> which I think can have different instances for different devices
>>>> executed concurrently in different cores/workers (the system_wq used is
>>>> not ordered). If I'm right race conditions are likely.
>> It only works as a single queue of regions. One sync point to say "all
>> collected regions are routed into the dax_hmem or dax_cxl bucket".
>
>
>That is how I think it should work, handling all the soft reserved 
>ranges vs regions by one code execution. But that is not the case. More 
>later.
>
>
>>> Yeah, this is something I spent sometime thinking on. My rationale
>>> behind not having it and where I'm still unsure:
>>>
>>> My assumption was that after wait_for_device_probe(), CXL topology
>>> discovery and region commit are complete and stable.
>> ...or more specifically, any CXL region discovery after that point is a
>> typical runtime dynamic discovery event that is not subject to any
>> deferral.
>>
>>> And each deferred worker should observe the same CXL state and
>>> therefore compute the same final policy (either DROP or REGISTER).
>> The expectation is one queue, one event that takes the rwsem and
>> dispositions all present regions relative to initial soft-reserve memory
>> map.
>>
>>> Also, I was assuming that even if multiple process_defer_work()
>>> instances run, the operations they perform are effectively safe to
>>> repeat.. though I'm not sure on this.
>> I think something is wrong if the workqueue runs more than once. It is
>> just a place to wait for initial device probe to complete and then fixup
>> all the regions (allow dax_region registration to proceed) that were
>> waiting for that.
>>
>>> cxl_region_teardown_all(): this ultimately triggers the
>>> devm_release_action(... unregister_region ...) path. My expectation was
>>> that these devm actions are single shot per device lifecycle, so
>>> repeated teardown attempts should become noops.
>> Not noops, right? The definition of a devm_action is that they always
>> fire at device_del(). There is no facility to device_del() a device
>> twice.
>>
>>> cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(),
>>> which takes "cxl_rwsem.region". That should serialize decoder detach and
>>> region teardown.
>>>
>>> bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during
>>> boot are fine as the rescan path will simply rediscover already present
>>> devices..
>> The rescan path likely needs some logic to give up on CXL region
>> autodiscovery for devices that failed their memmap compatibility check.
>>
>>> walk_hmem_resources(.., hmem_register_device): in the DROP case,I
>>> thought running the walk multiple times is safe because devm managed
>>> platform devices and memregion allocations should prevent duplicate
>>> lifetime issues.
>>>
>>> So, even if multiple process_defer_work() instances execute
>>> concurrently, the CXL operations involved in containment evaluation
>>> (cxl_region_contains_soft_reserve()) and teardown are already guarded.
>>>
>>> But I'm still trying to understand if bus_rescan_devices(&cxl_bus_type)
>>> is not safe when invoked concurrently?
>> It already races today between natural bus enumeration and the
>> cxl_bus_rescan() call from cxl_acpi. So it needs to be ok, it is
>> naturally synchronized by the region's device_lock and regions' rwsem.
>>
>>> Or is the primary issue that dax_cxl_mode is a global updated from one
>>> context and read from others, and should be synchronized even if the
>>> computed final value will always be the same?
>> There is only one global hmem_platform device, so only one potential
>> item in this workqueue.
>
>
>Well, I do not think so.
>
>
>hmem_init() in drivers/dax/device.c walks IORESOURCE_MEM looking for 
>IORES_DESC_SOFT_RESERVED descriptors and calling hmem_register_one for 
>each of them. That leads to create an hmem_platform platform device (no 
>typo, just emphasizing this is a platform device with name 
>hmem_platform) so there will be as many hmem_platform devices as 
>descriptors found.
>
>
>Then each hmem_platform probe() will create an hmem platform device, 
>where a work will be schedule passing this specific hmem platform device 
>as argument. So, each work will check for the specific ranges of its own 
>pdev and not all of them. The check can result in a different value 
>assigned to dax_cxl_mode leading to potential race conditions with 
>concurrent workers and also potentially leaving soft reserved ranges 
>without both, a dax or an hmem device.

Hi Alejandro,

Isn't below  check in __hmem_register_resource ensuring that only one
hmem_platform device can be created? So first resource would
create platform device and set the platform_initialized bool to true

static void __hmem_register_resource(int target_nid, struct resource *res)
..
	if (platform_initialized)
		return;
..

Thanks,
Tomasz

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alejandro Lucero Palau 1 week, 6 days ago

On 10/1/25 18:15, Tomasz Wolski wrote:
>>> [responding to the questions raised here before reviewing the patch...]
>>>
>>> Koralahalli Channabasappa, Smita wrote:
>>>> Hi Alejandro,
>>>>
>>>> On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
>>>>> On 1/22/26 04:55, Smita Koralahalli wrote:
>>>>>> The current probe time ownership check for Soft Reserved memory based
>>>>>> solely on CXL window intersection is insufficient. dax_hmem probing is
>>>>>> not
>>>>>> always guaranteed to run after CXL enumeration and region assembly, which
>>>>>> can lead to incorrect ownership decisions before the CXL stack has
>>>>>> finished publishing windows and assembling committed regions.
>>>>>>
>>>>>> Introduce deferred ownership handling for Soft Reserved ranges that
>>>>>> intersect CXL windows at probe time by scheduling deferred work from
>>>>>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>>>>>> assembly before deciding ownership.
>>>>>>
>>>>>> Evaluate ownership of Soft Reserved ranges based on CXL region
>>>>>> containment.
>>>>>>
>>>>>>       - If all Soft Reserved ranges are fully contained within committed
>>>>>> CXL
>>>>>>         regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>>>>>         dax_cxl to bind.
>>>>>>
>>>>>>       - If any Soft Reserved range is not fully claimed by committed CXL
>>>>>>         region, tear down all CXL regions and REGISTER the Soft Reserved
>>>>>>         ranges with dax_hmem instead.
>>>>> I was not sure if I was understanding this properly, but after looking
>>>>> at the code I think I do ... but then I do not understand the reason
>>>>> behind. If I'm right, there could be two devices and therefore different
>>>>> soft reserved ranges, with one getting an automatic cxl region for all
>>>>> the range and the other without that, and the outcome would be the first
>>>>> one getting its region removed and added to hmem. Maybe I'm missing
>>>>> something obvious but, why? If there is a good reason, I think it should
>>>>> be documented in the commit and somewhere else.
>>>> Yeah, if I understood Dan correctly, that's exactly the intended behavior.
>>>>
>>>> I'm trying to restate the "why" behind this based on Dan's earlier
>>>> guidance. Please correct me if I'm misrepresenting it Dan.
>>>>
>>>> The policy is meant to be coarse: If all SR ranges that intersect CXL
>>>> windows are fully contained by committed CXL regions, then we have high
>>>> confidence that the platform descriptions line up and CXL owns the memory.
>>>>
>>>> If any SR range that intersects a CXL window is not fully covered by
>>>> committed regions then we treat that as unexpected platform shenanigans.
>>>> In that situation the intent is to give up on CXL entirely for those SR
>>>> ranges because partial ownership becomes ambiguous.
>>>>
>>>> This is why the fallback is global and not per range. The goal is to
>>>> leave no room for mixed some SR to CXL, some SR to HMEM configurations.
>>>> Any mismatch should push the platform issue back to the vendor to fix
>>>> the description (ideally preserving the simplifying assumption of a 1:1
>>>> correlation between CXL Regions and SR).
>>>>
>>>> Thanks for pointing this out. I will update the why in the next revision.
>>> You have it right. This is mostly a policy to save debug sanity and
>>> share the compatibility pain. You either always get everything the BIOS
>>> put into the memory map, or you get the fully enlightened CXL world.
>>>
>>> When accelerator memory enters the mix it does require an opt-in/out of
>>> this scheme. Either the device completely opts out of this HMEM fallback
>>> mechanism by marking the memory as Reserved (the dominant preference),
>>> or it arranges for CXL accelerator drivers to be present at boot if they
>>> want to interoperate with this fallback. Some folks want the fallback:
>>> https://lpc.events/event/19/contributions/2064/
>>
>> I will take a look at this presentation, but I think there could be
>> another option where accelerators information is obtained during pci
>> enumeration by the kernel and using this information by this
>> functionality skipping those ranges allocated to them. Forcing them to
>> be compiled with the kernel would go against what distributions
>> currently and widely do with initramfs. Not sure if some current "early"
>> stubs could be used for this though but the information needs to be
>> recollected before this code does the checks.
>>
>>
>>>>> I have also problems understanding the concurrency when handling the
>>>>> global dax_cxl_mode variable. It is modified inside process_defer_work()
>>>>> which I think can have different instances for different devices
>>>>> executed concurrently in different cores/workers (the system_wq used is
>>>>> not ordered). If I'm right race conditions are likely.
>>> It only works as a single queue of regions. One sync point to say "all
>>> collected regions are routed into the dax_hmem or dax_cxl bucket".
>>
>> That is how I think it should work, handling all the soft reserved
>> ranges vs regions by one code execution. But that is not the case. More
>> later.
>>
>>
>>>> Yeah, this is something I spent sometime thinking on. My rationale
>>>> behind not having it and where I'm still unsure:
>>>>
>>>> My assumption was that after wait_for_device_probe(), CXL topology
>>>> discovery and region commit are complete and stable.
>>> ...or more specifically, any CXL region discovery after that point is a
>>> typical runtime dynamic discovery event that is not subject to any
>>> deferral.
>>>
>>>> And each deferred worker should observe the same CXL state and
>>>> therefore compute the same final policy (either DROP or REGISTER).
>>> The expectation is one queue, one event that takes the rwsem and
>>> dispositions all present regions relative to initial soft-reserve memory
>>> map.
>>>
>>>> Also, I was assuming that even if multiple process_defer_work()
>>>> instances run, the operations they perform are effectively safe to
>>>> repeat.. though I'm not sure on this.
>>> I think something is wrong if the workqueue runs more than once. It is
>>> just a place to wait for initial device probe to complete and then fixup
>>> all the regions (allow dax_region registration to proceed) that were
>>> waiting for that.
>>>
>>>> cxl_region_teardown_all(): this ultimately triggers the
>>>> devm_release_action(... unregister_region ...) path. My expectation was
>>>> that these devm actions are single shot per device lifecycle, so
>>>> repeated teardown attempts should become noops.
>>> Not noops, right? The definition of a devm_action is that they always
>>> fire at device_del(). There is no facility to device_del() a device
>>> twice.
>>>
>>>> cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(),
>>>> which takes "cxl_rwsem.region". That should serialize decoder detach and
>>>> region teardown.
>>>>
>>>> bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during
>>>> boot are fine as the rescan path will simply rediscover already present
>>>> devices..
>>> The rescan path likely needs some logic to give up on CXL region
>>> autodiscovery for devices that failed their memmap compatibility check.
>>>
>>>> walk_hmem_resources(.., hmem_register_device): in the DROP case,I
>>>> thought running the walk multiple times is safe because devm managed
>>>> platform devices and memregion allocations should prevent duplicate
>>>> lifetime issues.
>>>>
>>>> So, even if multiple process_defer_work() instances execute
>>>> concurrently, the CXL operations involved in containment evaluation
>>>> (cxl_region_contains_soft_reserve()) and teardown are already guarded.
>>>>
>>>> But I'm still trying to understand if bus_rescan_devices(&cxl_bus_type)
>>>> is not safe when invoked concurrently?
>>> It already races today between natural bus enumeration and the
>>> cxl_bus_rescan() call from cxl_acpi. So it needs to be ok, it is
>>> naturally synchronized by the region's device_lock and regions' rwsem.
>>>
>>>> Or is the primary issue that dax_cxl_mode is a global updated from one
>>>> context and read from others, and should be synchronized even if the
>>>> computed final value will always be the same?
>>> There is only one global hmem_platform device, so only one potential
>>> item in this workqueue.
>>
>> Well, I do not think so.
>>
>>
>> hmem_init() in drivers/dax/device.c walks IORESOURCE_MEM looking for
>> IORES_DESC_SOFT_RESERVED descriptors and calling hmem_register_one for
>> each of them. That leads to create an hmem_platform platform device (no
>> typo, just emphasizing this is a platform device with name
>> hmem_platform) so there will be as many hmem_platform devices as
>> descriptors found.
>>
>>
>> Then each hmem_platform probe() will create an hmem platform device,
>> where a work will be schedule passing this specific hmem platform device
>> as argument. So, each work will check for the specific ranges of its own
>> pdev and not all of them. The check can result in a different value
>> assigned to dax_cxl_mode leading to potential race conditions with
>> concurrent workers and also potentially leaving soft reserved ranges
>> without both, a dax or an hmem device.
> Hi Alejandro,
>
> Isn't below  check in __hmem_register_resource ensuring that only one
> hmem_platform device can be created? So first resource would
> create platform device and set the platform_initialized bool to true
>
> static void __hmem_register_resource(int target_nid, struct resource *res)
> ..
> 	if (platform_initialized)
> 		return;
> ..


Upps. Silly me. So focused on platform_device_alloc() ...


So, the concurrency problem when updating dax_cxl_mode does not exist as 
all potential concurrent workers use the hmem_platform device, but I 
think, or maybe I'm having a really bad day, the hmem devices are 
"indeed" created based on those soft reserved ranges when walking 
hmem_active ... the work is schedule as many times as hmem devices, and 
maybe there could be a problem with concurrent works invoking 
cxl_region_teardown_all().


Anyways, thank you for pointing out what I could not see.


> Thanks,
> Tomasz
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alejandro Lucero Palau 2 weeks ago

Hi Smita,


On 1/25/26 03:17, Koralahalli Channabasappa, Smita wrote:
> Hi Alejandro,
>
> On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
>>
>> On 1/22/26 04:55, Smita Koralahalli wrote:
>>> The current probe time ownership check for Soft Reserved memory based
>>> solely on CXL window intersection is insufficient. dax_hmem probing 
>>> is not
>>> always guaranteed to run after CXL enumeration and region assembly, 
>>> which
>>> can lead to incorrect ownership decisions before the CXL stack has
>>> finished publishing windows and assembling committed regions.
>>>
>>> Introduce deferred ownership handling for Soft Reserved ranges that
>>> intersect CXL windows at probe time by scheduling deferred work from
>>> dax_hmem and waiting for the CXL stack to complete enumeration and 
>>> region
>>> assembly before deciding ownership.
>>>
>>> Evaluate ownership of Soft Reserved ranges based on CXL region
>>> containment.
>>>
>>>     - If all Soft Reserved ranges are fully contained within 
>>> committed CXL
>>>       regions, DROP handling Soft Reserved ranges from dax_hmem and 
>>> allow
>>>       dax_cxl to bind.
>>>
>>>     - If any Soft Reserved range is not fully claimed by committed CXL
>>>       region, tear down all CXL regions and REGISTER the Soft Reserved
>>>       ranges with dax_hmem instead.
>>
>>
>> I was not sure if I was understanding this properly, but after 
>> looking at the code I think I do ... but then I do not understand the 
>> reason behind. If I'm right, there could be two devices and therefore 
>> different soft reserved ranges, with one getting an automatic cxl 
>> region for all the range and the other without that, and the outcome 
>> would be the first one getting its region removed and added to hmem. 
>> Maybe I'm missing something obvious but, why? If there is a good 
>> reason, I think it should be documented in the commit and somewhere 
>> else.
>
> Yeah, if I understood Dan correctly, that's exactly the intended 
> behavior.


OK


>
> I'm trying to restate the "why" behind this based on Dan's earlier 
> guidance. Please correct me if I'm misrepresenting it Dan.
>
> The policy is meant to be coarse: If all SR ranges that intersect CXL 
> windows are fully contained by committed CXL regions, then we have 
> high confidence that the platform descriptions line up and CXL owns 
> the memory.
>
> If any SR range that intersects a CXL window is not fully covered by 
> committed regions then we treat that as unexpected platform 
> shenanigans. In that situation the intent is to give up on CXL 
> entirely for those SR ranges because partial ownership becomes ambiguous.
>
> This is why the fallback is global and not per range. The goal is to
> leave no room for mixed some SR to CXL, some SR to HMEM 
> configurations. Any mismatch should push the platform issue back to 
> the vendor to fix the description (ideally preserving the simplifying 
> assumption of a 1:1 correlation between CXL Regions and SR).


I guess it is a good policy but my concern is with Type2, and at the 
time this check is done the Type2 driver could have not been probed 
(dynamic module), so a soft reserved range could not have a cxl region 
and that not being an error. I do not think this is a problem for your 
patch but something I should add to mine. I'm not sure how to do so yet 
because I will need to do it using some information from the PCI device 
struct while all this patchset relies on "anonymous" soft reserved range.


>
> Thanks for pointing this out. I will update the why in the next revision.
>
>>
>>
>> I have also problems understanding the concurrency when handling the 
>> global dax_cxl_mode variable. It is modified inside 
>> process_defer_work() which I think can have different instances for 
>> different devices executed concurrently in different cores/workers 
>> (the system_wq used is not ordered). If I'm right race conditions are 
>> likely.
>
> Yeah, this is something I spent sometime thinking on. My rationale 
> behind not having it and where I'm still unsure:
>
> My assumption was that after wait_for_device_probe(), CXL topology 
> discovery and region commit are complete and stable. And each deferred 
> worker should observe the same CXL state and therefore compute the 
> same final policy (either DROP or REGISTER).
>

I think so as well.


> Also, I was assuming that even if multiple process_defer_work() 
> instances run, the operations they perform are effectively safe to 
> repeat.. though I'm not sure on this.
>

No if they run in parallel, what I think it could be the case. If there 
is just one soft reserved range, that is fine, but concurrent ones 
executing the code in process_defer_work() can trigger race conditions 
when updating the global variable. Although the code inside this 
function does the right thing, the walk when calling to 
hmem_register_device() will not.

I would say a global spinlock should be enough.


Thank you,

Alejandro


> cxl_region_teardown_all(): this ultimately triggers the 
> devm_release_action(... unregister_region ...) path. My expectation 
> was that these devm actions are single shot per device lifecycle, so 
> repeated teardown attempts should become noops. And 
> cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(), 
> which takes "cxl_rwsem.region". That should serialize decoder detach 
> and region teardown.
>
> bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during 
> boot are fine as the rescan path will simply rediscover already 
> present devices..
>
> walk_hmem_resources(.., hmem_register_device): in the DROP case,I 
> thought running the walk multiple times is safe because devm managed 
> platform devices and memregion allocations should prevent duplicate 
> lifetime issues.
>
> So, even if multiple process_defer_work() instances execute 
> concurrently, the CXL operations involved in containment evaluation 
> (cxl_region_contains_soft_reserve()) and teardown are already guarded.
>
> But I'm still trying to understand if 
> bus_rescan_devices(&cxl_bus_type) is not safe when invoked concurrently?
>
> Or is the primary issue that dax_cxl_mode is a global updated from one 
> context and read from others, and should be synchronized even if the 
> computed final value will always be the same?
>
> Happy to correct if my understanding is incorrect.
>
> Thanks
> Smita
>
>>
>>
>>>
>>> While ownership resolution is pending, gate dax_cxl probing to avoid
>>> binding prematurely.
>>>
>>> This enforces a strict ownership. Either CXL fully claims the Soft
>>> Reserved ranges or it relinquishes it entirely.
>>>
>>> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>>> Signed-off-by: Smita Koralahalli 
>>> <Smita.KoralahalliChannabasappa@amd.com>
>>> ---
>>>   drivers/cxl/core/region.c | 25 ++++++++++++
>>>   drivers/cxl/cxl.h         |  2 +
>>>   drivers/dax/cxl.c         |  9 +++++
>>>   drivers/dax/hmem/hmem.c   | 81 
>>> ++++++++++++++++++++++++++++++++++++++-
>>>   4 files changed, 115 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>>> index 9827a6dd3187..6c22a2d4abbb 100644
>>> --- a/drivers/cxl/core/region.c
>>> +++ b/drivers/cxl/core/region.c
>>> @@ -3875,6 +3875,31 @@ static int 
>>> cxl_region_debugfs_poison_clear(void *data, u64 offset)
>>>   DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>>>                cxl_region_debugfs_poison_clear, "%llx\n");
>>> +static int cxl_region_teardown_cb(struct device *dev, void *data)
>>> +{
>>> +    struct cxl_root_decoder *cxlrd;
>>> +    struct cxl_region *cxlr;
>>> +    struct cxl_port *port;
>>> +
>>> +    if (!is_cxl_region(dev))
>>> +        return 0;
>>> +
>>> +    cxlr = to_cxl_region(dev);
>>> +
>>> +    cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
>>> +    port = cxlrd_to_port(cxlrd);
>>> +
>>> +    devm_release_action(port->uport_dev, unregister_region, cxlr);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +void cxl_region_teardown_all(void)
>>> +{
>>> +    bus_for_each_dev(&cxl_bus_type, NULL, NULL, 
>>> cxl_region_teardown_cb);
>>> +}
>>> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
>>> +
>>>   static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>>>   {
>>>       struct resource *res = data;
>>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>>> index b0ff6b65ea0b..1864d35d5f69 100644
>>> --- a/drivers/cxl/cxl.h
>>> +++ b/drivers/cxl/cxl.h
>>> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct 
>>> cxl_endpoint_decoder *cxled);
>>>   struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>>>   u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>>>   bool cxl_region_contains_soft_reserve(const struct resource *res);
>>> +void cxl_region_teardown_all(void);
>>>   #else
>>>   static inline bool is_cxl_pmem_region(struct device *dev)
>>>   {
>>> @@ -933,6 +934,7 @@ static inline bool 
>>> cxl_region_contains_soft_reserve(const struct resource *res)
>>>   {
>>>       return false;
>>>   }
>>> +static inline void cxl_region_teardown_all(void) { }
>>>   #endif
>>>   void cxl_endpoint_parse_cdat(struct cxl_port *port);
>>> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
>>> index 13cd94d32ff7..b7e90d6dd888 100644
>>> --- a/drivers/dax/cxl.c
>>> +++ b/drivers/dax/cxl.c
>>> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>>>       struct dax_region *dax_region;
>>>       struct dev_dax_data data;
>>> +    switch (dax_cxl_mode) {
>>> +    case DAX_CXL_MODE_DEFER:
>>> +        return -EPROBE_DEFER;
>>> +    case DAX_CXL_MODE_REGISTER:
>>> +        return -ENODEV;
>>> +    case DAX_CXL_MODE_DROP:
>>> +        break;
>>> +    }
>>> +
>>>       if (nid == NUMA_NO_NODE)
>>>           nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>>> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
>>> index 1e3424358490..bcb57d8678d7 100644
>>> --- a/drivers/dax/hmem/hmem.c
>>> +++ b/drivers/dax/hmem/hmem.c
>>> @@ -3,6 +3,7 @@
>>>   #include <linux/memregion.h>
>>>   #include <linux/module.h>
>>>   #include <linux/dax.h>
>>> +#include "../../cxl/cxl.h"
>>>   #include "../bus.h"
>>>   static bool region_idle;
>>> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>>>       platform_device_unregister(pdev);
>>>   }
>>> +struct dax_defer_work {
>>> +    struct platform_device *pdev;
>>> +    struct work_struct work;
>>> +};
>>> +
>>>   static int hmem_register_device(struct device *host, int target_nid,
>>>                   const struct resource *res)
>>>   {
>>> +    struct dax_defer_work *work = dev_get_drvdata(host);
>>>       struct platform_device *pdev;
>>>       struct memregion_info info;
>>>       long id;
>>> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device 
>>> *host, int target_nid,
>>>       if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>>>           region_intersects(res->start, resource_size(res), 
>>> IORESOURCE_MEM,
>>>                     IORES_DESC_CXL) != REGION_DISJOINT) {
>>> -        dev_dbg(host, "deferring range to CXL: %pr\n", res);
>>> -        return 0;
>>> +        switch (dax_cxl_mode) {
>>> +        case DAX_CXL_MODE_DEFER:
>>> +            dev_dbg(host, "deferring range to CXL: %pr\n", res);
>>> +            schedule_work(&work->work);
>>> +            return 0;
>>> +        case DAX_CXL_MODE_REGISTER:
>>> +            dev_dbg(host, "registering CXL range: %pr\n", res);
>>> +            break;
>>> +        case DAX_CXL_MODE_DROP:
>>> +            dev_dbg(host, "dropping CXL range: %pr\n", res);
>>> +            return 0;
>>> +        }
>>>       }
>>>       rc = region_intersects_soft_reserve(res->start, 
>>> resource_size(res));
>>> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device 
>>> *host, int target_nid,
>>>       return rc;
>>>   }
>>> +static int cxl_contains_soft_reserve(struct device *host, int 
>>> target_nid,
>>> +                     const struct resource *res)
>>> +{
>>> +    if (region_intersects(res->start, resource_size(res), 
>>> IORESOURCE_MEM,
>>> +                  IORES_DESC_CXL) != REGION_DISJOINT) {
>>> +        if (!cxl_region_contains_soft_reserve(res))
>>> +            return 1;
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void process_defer_work(struct work_struct *_work)
>>> +{
>>> +    struct dax_defer_work *work = container_of(_work, 
>>> typeof(*work), work);
>>> +    struct platform_device *pdev = work->pdev;
>>> +    int rc;
>>> +
>>> +    /* relies on cxl_acpi and cxl_pci having had a chance to load */
>>> +    wait_for_device_probe();
>>> +
>>> +    rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
>>> +
>>> +    if (!rc) {
>>> +        dax_cxl_mode = DAX_CXL_MODE_DROP;
>>> +        rc = bus_rescan_devices(&cxl_bus_type);
>>> +        if (rc)
>>> +            dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
>>> +    } else {
>>> +        dax_cxl_mode = DAX_CXL_MODE_REGISTER;
>>> +        cxl_region_teardown_all();
>>> +    }
>>> +
>>> +    walk_hmem_resources(&pdev->dev, hmem_register_device);
>>> +}
>>> +
>>> +static void kill_defer_work(void *_work)
>>> +{
>>> +    struct dax_defer_work *work = container_of(_work, 
>>> typeof(*work), work);
>>> +
>>> +    cancel_work_sync(&work->work);
>>> +    kfree(work);
>>> +}
>>> +
>>>   static int dax_hmem_platform_probe(struct platform_device *pdev)
>>>   {
>>> +    struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
>>> +    int rc;
>>> +
>>> +    if (!work)
>>> +        return -ENOMEM;
>>> +
>>> +    work->pdev = pdev;
>>> +    INIT_WORK(&work->work, process_defer_work);
>>> +
>>> +    rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
>>> +    if (rc)
>>> +        return rc;
>>> +
>>> +    platform_set_drvdata(pdev, work);
>>> +
>>>       return walk_hmem_resources(&pdev->dev, hmem_register_device);
>>>   }
>>> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>>>   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' 
>>> memory");
>>>   MODULE_LICENSE("GPL v2");
>>>   MODULE_AUTHOR("Intel Corporation");
>>> +MODULE_IMPORT_NS("CXL");
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alejandro Lucero Palau 2 weeks ago

On 1/26/26 12:20, Alejandro Lucero Palau wrote:
> Hi Smita,
>
>
> On 1/25/26 03:17, Koralahalli Channabasappa, Smita wrote:
>> Hi Alejandro,
>>
>> On 1/23/2026 3:59 AM, Alejandro Lucero Palau wrote:
>>>
>>> On 1/22/26 04:55, Smita Koralahalli wrote:
>>>> The current probe time ownership check for Soft Reserved memory based
>>>> solely on CXL window intersection is insufficient. dax_hmem probing 
>>>> is not
>>>> always guaranteed to run after CXL enumeration and region assembly, 
>>>> which
>>>> can lead to incorrect ownership decisions before the CXL stack has
>>>> finished publishing windows and assembling committed regions.
>>>>
>>>> Introduce deferred ownership handling for Soft Reserved ranges that
>>>> intersect CXL windows at probe time by scheduling deferred work from
>>>> dax_hmem and waiting for the CXL stack to complete enumeration and 
>>>> region
>>>> assembly before deciding ownership.
>>>>
>>>> Evaluate ownership of Soft Reserved ranges based on CXL region
>>>> containment.
>>>>
>>>>     - If all Soft Reserved ranges are fully contained within 
>>>> committed CXL
>>>>       regions, DROP handling Soft Reserved ranges from dax_hmem and 
>>>> allow
>>>>       dax_cxl to bind.
>>>>
>>>>     - If any Soft Reserved range is not fully claimed by committed CXL
>>>>       region, tear down all CXL regions and REGISTER the Soft Reserved
>>>>       ranges with dax_hmem instead.
>>>
>>>
>>> I was not sure if I was understanding this properly, but after 
>>> looking at the code I think I do ... but then I do not understand 
>>> the reason behind. If I'm right, there could be two devices and 
>>> therefore different soft reserved ranges, with one getting an 
>>> automatic cxl region for all the range and the other without that, 
>>> and the outcome would be the first one getting its region removed 
>>> and added to hmem. Maybe I'm missing something obvious but, why? If 
>>> there is a good reason, I think it should be documented in the 
>>> commit and somewhere else.
>>
>> Yeah, if I understood Dan correctly, that's exactly the intended 
>> behavior.
>
>
> OK
>
>
>>
>> I'm trying to restate the "why" behind this based on Dan's earlier 
>> guidance. Please correct me if I'm misrepresenting it Dan.
>>
>> The policy is meant to be coarse: If all SR ranges that intersect CXL 
>> windows are fully contained by committed CXL regions, then we have 
>> high confidence that the platform descriptions line up and CXL owns 
>> the memory.
>>
>> If any SR range that intersects a CXL window is not fully covered by 
>> committed regions then we treat that as unexpected platform 
>> shenanigans. In that situation the intent is to give up on CXL 
>> entirely for those SR ranges because partial ownership becomes 
>> ambiguous.
>>
>> This is why the fallback is global and not per range. The goal is to
>> leave no room for mixed some SR to CXL, some SR to HMEM 
>> configurations. Any mismatch should push the platform issue back to 
>> the vendor to fix the description (ideally preserving the simplifying 
>> assumption of a 1:1 correlation between CXL Regions and SR).
>
>
> I guess it is a good policy but my concern is with Type2, and at the 
> time this check is done the Type2 driver could have not been probed 
> (dynamic module), so a soft reserved range could not have a cxl region 
> and that not being an error. I do not think this is a problem for your 
> patch but something I should add to mine. I'm not sure how to do so 
> yet because I will need to do it using some information from the PCI 
> device struct while all this patchset relies on "anonymous" soft 
> reserved range.
>
>
>>
>> Thanks for pointing this out. I will update the why in the next 
>> revision.
>>
>>>
>>>
>>> I have also problems understanding the concurrency when handling the 
>>> global dax_cxl_mode variable. It is modified inside 
>>> process_defer_work() which I think can have different instances for 
>>> different devices executed concurrently in different cores/workers 
>>> (the system_wq used is not ordered). If I'm right race conditions 
>>> are likely.
>>
>> Yeah, this is something I spent sometime thinking on. My rationale 
>> behind not having it and where I'm still unsure:
>>
>> My assumption was that after wait_for_device_probe(), CXL topology 
>> discovery and region commit are complete and stable. And each 
>> deferred worker should observe the same CXL state and therefore 
>> compute the same final policy (either DROP or REGISTER).
>>
>
> I think so as well.
>
>
>> Also, I was assuming that even if multiple process_defer_work() 
>> instances run, the operations they perform are effectively safe to 
>> repeat.. though I'm not sure on this.
>>
>
> No if they run in parallel, what I think it could be the case. If 
> there is just one soft reserved range, that is fine, but concurrent 
> ones executing the code in process_defer_work() can trigger race 
> conditions when updating the global variable. Although the code inside 
> this function does the right thing, the walk when calling to 
> hmem_register_device() will not.
>
> I would say a global spinlock should be enough.
>

I went to have lunch and my feeling was something was wrong about my email.


I still think there is a concurrency problem and the spinlock will avoid 
it, but not enough for the functionality desired. First of all, I think 
the code should check if dax_cxl_mode has been set to DROP already, 
because it does not matter anymore if its soft reserved range is fully  
contained in cxl regions. Does it?


But then, even with that change, there is another problem, or at least I 
can see one, so tell me if I am wrong. THe problem is not (only) 
concurrency but order of handling the particular device soft reserved 
range. For example, with a timeline like this:


1) device1 resources checked out and all in cxl regions.

2) device1 discards creation of an hmem platform device.

3) device2 resources checked out and not all contained in cxl regions.

4) all cxl regions removed

5) device2 creates its hmem platform device

...

device1 does not have a dax/cxl region nor an hmem device.


If I am right this requires another approach where just one work will 
handle all the devices ranges, and if all fine, the hmem devices will be 
created or not.


Thank you,

Alejandro




>
> Thank you,
>
> Alejandro
>
>
>> cxl_region_teardown_all(): this ultimately triggers the 
>> devm_release_action(... unregister_region ...) path. My expectation 
>> was that these devm actions are single shot per device lifecycle, so 
>> repeated teardown attempts should become noops. And 
>> cxl_region_teardown_all() ultimately leads to cxl_decoder_detach(), 
>> which takes "cxl_rwsem.region". That should serialize decoder detach 
>> and region teardown.
>>
>> bus_rescan_devices(&cxl_bus_type): I assumed repeated rescans during 
>> boot are fine as the rescan path will simply rediscover already 
>> present devices..
>>
>> walk_hmem_resources(.., hmem_register_device): in the DROP case,I 
>> thought running the walk multiple times is safe because devm managed 
>> platform devices and memregion allocations should prevent duplicate 
>> lifetime issues.
>>
>> So, even if multiple process_defer_work() instances execute 
>> concurrently, the CXL operations involved in containment evaluation 
>> (cxl_region_contains_soft_reserve()) and teardown are already guarded.
>>
>> But I'm still trying to understand if 
>> bus_rescan_devices(&cxl_bus_type) is not safe when invoked concurrently?
>>
>> Or is the primary issue that dax_cxl_mode is a global updated from 
>> one context and read from others, and should be synchronized even if 
>> the computed final value will always be the same?
>>
>> Happy to correct if my understanding is incorrect.
>>
>> Thanks
>> Smita
>>
>>>
>>>
>>>>
>>>> While ownership resolution is pending, gate dax_cxl probing to avoid
>>>> binding prematurely.
>>>>
>>>> This enforces a strict ownership. Either CXL fully claims the Soft
>>>> Reserved ranges or it relinquishes it entirely.
>>>>
>>>> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
>>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>>>> Signed-off-by: Smita Koralahalli 
>>>> <Smita.KoralahalliChannabasappa@amd.com>
>>>> ---
>>>>   drivers/cxl/core/region.c | 25 ++++++++++++
>>>>   drivers/cxl/cxl.h         |  2 +
>>>>   drivers/dax/cxl.c         |  9 +++++
>>>>   drivers/dax/hmem/hmem.c   | 81 
>>>> ++++++++++++++++++++++++++++++++++++++-
>>>>   4 files changed, 115 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>>>> index 9827a6dd3187..6c22a2d4abbb 100644
>>>> --- a/drivers/cxl/core/region.c
>>>> +++ b/drivers/cxl/core/region.c
>>>> @@ -3875,6 +3875,31 @@ static int 
>>>> cxl_region_debugfs_poison_clear(void *data, u64 offset)
>>>>   DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>>>>                cxl_region_debugfs_poison_clear, "%llx\n");
>>>> +static int cxl_region_teardown_cb(struct device *dev, void *data)
>>>> +{
>>>> +    struct cxl_root_decoder *cxlrd;
>>>> +    struct cxl_region *cxlr;
>>>> +    struct cxl_port *port;
>>>> +
>>>> +    if (!is_cxl_region(dev))
>>>> +        return 0;
>>>> +
>>>> +    cxlr = to_cxl_region(dev);
>>>> +
>>>> +    cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
>>>> +    port = cxlrd_to_port(cxlrd);
>>>> +
>>>> +    devm_release_action(port->uport_dev, unregister_region, cxlr);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +void cxl_region_teardown_all(void)
>>>> +{
>>>> +    bus_for_each_dev(&cxl_bus_type, NULL, NULL, 
>>>> cxl_region_teardown_cb);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
>>>> +
>>>>   static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>>>>   {
>>>>       struct resource *res = data;
>>>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>>>> index b0ff6b65ea0b..1864d35d5f69 100644
>>>> --- a/drivers/cxl/cxl.h
>>>> +++ b/drivers/cxl/cxl.h
>>>> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct 
>>>> cxl_endpoint_decoder *cxled);
>>>>   struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>>>>   u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 
>>>> spa);
>>>>   bool cxl_region_contains_soft_reserve(const struct resource *res);
>>>> +void cxl_region_teardown_all(void);
>>>>   #else
>>>>   static inline bool is_cxl_pmem_region(struct device *dev)
>>>>   {
>>>> @@ -933,6 +934,7 @@ static inline bool 
>>>> cxl_region_contains_soft_reserve(const struct resource *res)
>>>>   {
>>>>       return false;
>>>>   }
>>>> +static inline void cxl_region_teardown_all(void) { }
>>>>   #endif
>>>>   void cxl_endpoint_parse_cdat(struct cxl_port *port);
>>>> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
>>>> index 13cd94d32ff7..b7e90d6dd888 100644
>>>> --- a/drivers/dax/cxl.c
>>>> +++ b/drivers/dax/cxl.c
>>>> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>>>>       struct dax_region *dax_region;
>>>>       struct dev_dax_data data;
>>>> +    switch (dax_cxl_mode) {
>>>> +    case DAX_CXL_MODE_DEFER:
>>>> +        return -EPROBE_DEFER;
>>>> +    case DAX_CXL_MODE_REGISTER:
>>>> +        return -ENODEV;
>>>> +    case DAX_CXL_MODE_DROP:
>>>> +        break;
>>>> +    }
>>>> +
>>>>       if (nid == NUMA_NO_NODE)
>>>>           nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>>>> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
>>>> index 1e3424358490..bcb57d8678d7 100644
>>>> --- a/drivers/dax/hmem/hmem.c
>>>> +++ b/drivers/dax/hmem/hmem.c
>>>> @@ -3,6 +3,7 @@
>>>>   #include <linux/memregion.h>
>>>>   #include <linux/module.h>
>>>>   #include <linux/dax.h>
>>>> +#include "../../cxl/cxl.h"
>>>>   #include "../bus.h"
>>>>   static bool region_idle;
>>>> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>>>>       platform_device_unregister(pdev);
>>>>   }
>>>> +struct dax_defer_work {
>>>> +    struct platform_device *pdev;
>>>> +    struct work_struct work;
>>>> +};
>>>> +
>>>>   static int hmem_register_device(struct device *host, int target_nid,
>>>>                   const struct resource *res)
>>>>   {
>>>> +    struct dax_defer_work *work = dev_get_drvdata(host);
>>>>       struct platform_device *pdev;
>>>>       struct memregion_info info;
>>>>       long id;
>>>> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device 
>>>> *host, int target_nid,
>>>>       if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>>>>           region_intersects(res->start, resource_size(res), 
>>>> IORESOURCE_MEM,
>>>>                     IORES_DESC_CXL) != REGION_DISJOINT) {
>>>> -        dev_dbg(host, "deferring range to CXL: %pr\n", res);
>>>> -        return 0;
>>>> +        switch (dax_cxl_mode) {
>>>> +        case DAX_CXL_MODE_DEFER:
>>>> +            dev_dbg(host, "deferring range to CXL: %pr\n", res);
>>>> +            schedule_work(&work->work);
>>>> +            return 0;
>>>> +        case DAX_CXL_MODE_REGISTER:
>>>> +            dev_dbg(host, "registering CXL range: %pr\n", res);
>>>> +            break;
>>>> +        case DAX_CXL_MODE_DROP:
>>>> +            dev_dbg(host, "dropping CXL range: %pr\n", res);
>>>> +            return 0;
>>>> +        }
>>>>       }
>>>>       rc = region_intersects_soft_reserve(res->start, 
>>>> resource_size(res));
>>>> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device 
>>>> *host, int target_nid,
>>>>       return rc;
>>>>   }
>>>> +static int cxl_contains_soft_reserve(struct device *host, int 
>>>> target_nid,
>>>> +                     const struct resource *res)
>>>> +{
>>>> +    if (region_intersects(res->start, resource_size(res), 
>>>> IORESOURCE_MEM,
>>>> +                  IORES_DESC_CXL) != REGION_DISJOINT) {
>>>> +        if (!cxl_region_contains_soft_reserve(res))
>>>> +            return 1;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void process_defer_work(struct work_struct *_work)
>>>> +{
>>>> +    struct dax_defer_work *work = container_of(_work, 
>>>> typeof(*work), work);
>>>> +    struct platform_device *pdev = work->pdev;
>>>> +    int rc;
>>>> +
>>>> +    /* relies on cxl_acpi and cxl_pci having had a chance to load */
>>>> +    wait_for_device_probe();
>>>> +
>>>> +    rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
>>>> +
>>>> +    if (!rc) {
>>>> +        dax_cxl_mode = DAX_CXL_MODE_DROP;
>>>> +        rc = bus_rescan_devices(&cxl_bus_type);
>>>> +        if (rc)
>>>> +            dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
>>>> +    } else {
>>>> +        dax_cxl_mode = DAX_CXL_MODE_REGISTER;
>>>> +        cxl_region_teardown_all();
>>>> +    }
>>>> +
>>>> +    walk_hmem_resources(&pdev->dev, hmem_register_device);
>>>> +}
>>>> +
>>>> +static void kill_defer_work(void *_work)
>>>> +{
>>>> +    struct dax_defer_work *work = container_of(_work, 
>>>> typeof(*work), work);
>>>> +
>>>> +    cancel_work_sync(&work->work);
>>>> +    kfree(work);
>>>> +}
>>>> +
>>>>   static int dax_hmem_platform_probe(struct platform_device *pdev)
>>>>   {
>>>> +    struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
>>>> +    int rc;
>>>> +
>>>> +    if (!work)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    work->pdev = pdev;
>>>> +    INIT_WORK(&work->work, process_defer_work);
>>>> +
>>>> +    rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
>>>> +    if (rc)
>>>> +        return rc;
>>>> +
>>>> +    platform_set_drvdata(pdev, work);
>>>> +
>>>>       return walk_hmem_resources(&pdev->dev, hmem_register_device);
>>>>   }
>>>> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>>>>   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' 
>>>> memory");
>>>>   MODULE_LICENSE("GPL v2");
>>>>   MODULE_AUTHOR("Intel Corporation");
>>>> +MODULE_IMPORT_NS("CXL");
>>
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alison Schofield 2 weeks, 3 days ago

On Thu, Jan 22, 2026 at 04:55:42AM +0000, Smita Koralahalli wrote:
> The current probe time ownership check for Soft Reserved memory based
> solely on CXL window intersection is insufficient. dax_hmem probing is not
> always guaranteed to run after CXL enumeration and region assembly, which
> can lead to incorrect ownership decisions before the CXL stack has
> finished publishing windows and assembling committed regions.
> 
> Introduce deferred ownership handling for Soft Reserved ranges that
> intersect CXL windows at probe time by scheduling deferred work from
> dax_hmem and waiting for the CXL stack to complete enumeration and region
> assembly before deciding ownership.
> 
> Evaluate ownership of Soft Reserved ranges based on CXL region
> containment.
> 
>    - If all Soft Reserved ranges are fully contained within committed CXL
>      regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>      dax_cxl to bind.
> 
>    - If any Soft Reserved range is not fully claimed by committed CXL
>      region, tear down all CXL regions and REGISTER the Soft Reserved
>      ranges with dax_hmem instead.
> 
> While ownership resolution is pending, gate dax_cxl probing to avoid
> binding prematurely.

This patch is the point in the set where I begin to fail creating DAX
regions on my non soft-reserved platforms.

Before this patch, at region probe, devm_cxl_add_dax_region(cxlr) succeeded
without delay, but now those calls result in EPROBE DEFER.

That deferral is wanted for platforms with Soft Reserveds, but for
platforms without, those probes will never resume.

IIUC this will impact platforms without SRs, not just my test setup.
In my testing it's visible during both QEMU and cxl-test region creation.

Can we abandon this whole deferral scheme if there is nothing in the
new soft_reserved resource tree?

Or maybe another way to get the dax probes UN-deferred in this case?

-- Alison

> 
> This enforces a strict ownership. Either CXL fully claims the Soft
> Reserved ranges or it relinquishes it entirely.
> 
> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>  drivers/cxl/core/region.c | 25 ++++++++++++
>  drivers/cxl/cxl.h         |  2 +
>  drivers/dax/cxl.c         |  9 +++++
>  drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 115 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 9827a6dd3187..6c22a2d4abbb 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>  DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>  			 cxl_region_debugfs_poison_clear, "%llx\n");
>  
> +static int cxl_region_teardown_cb(struct device *dev, void *data)
> +{
> +	struct cxl_root_decoder *cxlrd;
> +	struct cxl_region *cxlr;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_region(dev))
> +		return 0;
> +
> +	cxlr = to_cxl_region(dev);
> +
> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> +	port = cxlrd_to_port(cxlrd);
> +
> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
> +
> +	return 0;
> +}
> +
> +void cxl_region_teardown_all(void)
> +{
> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
> +}
> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
> +
>  static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>  {
>  	struct resource *res = data;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b0ff6b65ea0b..1864d35d5f69 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>  struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>  u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>  bool cxl_region_contains_soft_reserve(const struct resource *res);
> +void cxl_region_teardown_all(void);
>  #else
>  static inline bool is_cxl_pmem_region(struct device *dev)
>  {
> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>  {
>  	return false;
>  }
> +static inline void cxl_region_teardown_all(void) { }
>  #endif
>  
>  void cxl_endpoint_parse_cdat(struct cxl_port *port);
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 13cd94d32ff7..b7e90d6dd888 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>  	struct dax_region *dax_region;
>  	struct dev_dax_data data;
>  
> +	switch (dax_cxl_mode) {
> +	case DAX_CXL_MODE_DEFER:
> +		return -EPROBE_DEFER;
> +	case DAX_CXL_MODE_REGISTER:
> +		return -ENODEV;
> +	case DAX_CXL_MODE_DROP:
> +		break;
> +	}
> +
>  	if (nid == NUMA_NO_NODE)
>  		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>  
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 1e3424358490..bcb57d8678d7 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -3,6 +3,7 @@
>  #include <linux/memregion.h>
>  #include <linux/module.h>
>  #include <linux/dax.h>
> +#include "../../cxl/cxl.h"
>  #include "../bus.h"
>  
>  static bool region_idle;
> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>  	platform_device_unregister(pdev);
>  }
>  
> +struct dax_defer_work {
> +	struct platform_device *pdev;
> +	struct work_struct work;
> +};
> +
>  static int hmem_register_device(struct device *host, int target_nid,
>  				const struct resource *res)
>  {
> +	struct dax_defer_work *work = dev_get_drvdata(host);
>  	struct platform_device *pdev;
>  	struct memregion_info info;
>  	long id;
> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>  	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>  			      IORES_DESC_CXL) != REGION_DISJOINT) {
> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
> -		return 0;
> +		switch (dax_cxl_mode) {
> +		case DAX_CXL_MODE_DEFER:
> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
> +			schedule_work(&work->work);
> +			return 0;
> +		case DAX_CXL_MODE_REGISTER:
> +			dev_dbg(host, "registering CXL range: %pr\n", res);
> +			break;
> +		case DAX_CXL_MODE_DROP:
> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> +			return 0;
> +		}
>  	}
>  
>  	rc = region_intersects_soft_reserve(res->start, resource_size(res));
> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	return rc;
>  }
>  
> +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
> +				     const struct resource *res)
> +{
> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
> +		if (!cxl_region_contains_soft_reserve(res))
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void process_defer_work(struct work_struct *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +	struct platform_device *pdev = work->pdev;
> +	int rc;
> +
> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> +	wait_for_device_probe();
> +
> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> +
> +	if (!rc) {
> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> +		rc = bus_rescan_devices(&cxl_bus_type);
> +		if (rc)
> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> +	} else {
> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
> +		cxl_region_teardown_all();
> +	}
> +
> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
> +}
> +
> +static void kill_defer_work(void *_work)
> +{
> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> +
> +	cancel_work_sync(&work->work);
> +	kfree(work);
> +}
> +
>  static int dax_hmem_platform_probe(struct platform_device *pdev)
>  {
> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> +	int rc;
> +
> +	if (!work)
> +		return -ENOMEM;
> +
> +	work->pdev = pdev;
> +	INIT_WORK(&work->work, process_defer_work);
> +
> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> +	if (rc)
> +		return rc;
> +
> +	platform_set_drvdata(pdev, work);
> +
>  	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>  }
>  
> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>  MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>  MODULE_LICENSE("GPL v2");
>  MODULE_AUTHOR("Intel Corporation");
> +MODULE_IMPORT_NS("CXL");
> -- 
> 2.17.1
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Koralahalli Channabasappa, Smita 2 weeks ago

Hi Alison,

On 1/22/2026 10:35 PM, Alison Schofield wrote:
> On Thu, Jan 22, 2026 at 04:55:42AM +0000, Smita Koralahalli wrote:
>> The current probe time ownership check for Soft Reserved memory based
>> solely on CXL window intersection is insufficient. dax_hmem probing is not
>> always guaranteed to run after CXL enumeration and region assembly, which
>> can lead to incorrect ownership decisions before the CXL stack has
>> finished publishing windows and assembling committed regions.
>>
>> Introduce deferred ownership handling for Soft Reserved ranges that
>> intersect CXL windows at probe time by scheduling deferred work from
>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>> assembly before deciding ownership.
>>
>> Evaluate ownership of Soft Reserved ranges based on CXL region
>> containment.
>>
>>     - If all Soft Reserved ranges are fully contained within committed CXL
>>       regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>       dax_cxl to bind.
>>
>>     - If any Soft Reserved range is not fully claimed by committed CXL
>>       region, tear down all CXL regions and REGISTER the Soft Reserved
>>       ranges with dax_hmem instead.
>>
>> While ownership resolution is pending, gate dax_cxl probing to avoid
>> binding prematurely.
> 
> This patch is the point in the set where I begin to fail creating DAX
> regions on my non soft-reserved platforms.
> 
> Before this patch, at region probe, devm_cxl_add_dax_region(cxlr) succeeded
> without delay, but now those calls result in EPROBE DEFER.
> 
> That deferral is wanted for platforms with Soft Reserveds, but for
> platforms without, those probes will never resume.
> 
> IIUC this will impact platforms without SRs, not just my test setup.
> In my testing it's visible during both QEMU and cxl-test region creation.
> 
> Can we abandon this whole deferral scheme if there is nothing in the
> new soft_reserved resource tree?
> 
> Or maybe another way to get the dax probes UN-deferred in this case?

Thanks for pointing this. I didn't think through this.

I was thinking to make the deferral conditional on HMEM actually 
observing a CXL-overlapping range. Rough flow:

One assumption I'm relying on here is that dax_hmem and "initial" 
hmem_register_device() walk happens before dax_cxl probes. If that 
assumption doesn’t hold this approach may not be sufficient.

1. Keep dax_cxl_mode default as DEFER as it is now in dax/bus.c
2. Introduce need_deferral flag initialized to false in dax/bus.c
3. During the initial dax_hmem walk, in hmem_register_device() if HMEM 
observes SR that intersects IORES_DESC_CXL, set a need_deferral flag and 
schedule the deferred work. (case DEFER)
4. In dax_cxl probe: only return -EPROBE_DEFER when dax_cxl_mode == 
DEFER and need_deferral is set, otherwise proceed with cxl_dax.

Please call out if you see issues with this approach (especially around 
the ordering assumption).

Thanks
Smita
> 
> -- Alison
> 
>>
>> This enforces a strict ownership. Either CXL fully claims the Soft
>> Reserved ranges or it relinquishes it entirely.
>>
>> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>>   drivers/cxl/core/region.c | 25 ++++++++++++
>>   drivers/cxl/cxl.h         |  2 +
>>   drivers/dax/cxl.c         |  9 +++++
>>   drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>>   4 files changed, 115 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>> index 9827a6dd3187..6c22a2d4abbb 100644
>> --- a/drivers/cxl/core/region.c
>> +++ b/drivers/cxl/core/region.c
>> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>>   DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>>   			 cxl_region_debugfs_poison_clear, "%llx\n");
>>   
>> +static int cxl_region_teardown_cb(struct device *dev, void *data)
>> +{
>> +	struct cxl_root_decoder *cxlrd;
>> +	struct cxl_region *cxlr;
>> +	struct cxl_port *port;
>> +
>> +	if (!is_cxl_region(dev))
>> +		return 0;
>> +
>> +	cxlr = to_cxl_region(dev);
>> +
>> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
>> +	port = cxlrd_to_port(cxlrd);
>> +
>> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
>> +
>> +	return 0;
>> +}
>> +
>> +void cxl_region_teardown_all(void)
>> +{
>> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
>> +}
>> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
>> +
>>   static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>>   {
>>   	struct resource *res = data;
>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index b0ff6b65ea0b..1864d35d5f69 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>>   struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>>   u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>>   bool cxl_region_contains_soft_reserve(const struct resource *res);
>> +void cxl_region_teardown_all(void);
>>   #else
>>   static inline bool is_cxl_pmem_region(struct device *dev)
>>   {
>> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>>   {
>>   	return false;
>>   }
>> +static inline void cxl_region_teardown_all(void) { }
>>   #endif
>>   
>>   void cxl_endpoint_parse_cdat(struct cxl_port *port);
>> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
>> index 13cd94d32ff7..b7e90d6dd888 100644
>> --- a/drivers/dax/cxl.c
>> +++ b/drivers/dax/cxl.c
>> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>>   	struct dax_region *dax_region;
>>   	struct dev_dax_data data;
>>   
>> +	switch (dax_cxl_mode) {
>> +	case DAX_CXL_MODE_DEFER:
>> +		return -EPROBE_DEFER;
>> +	case DAX_CXL_MODE_REGISTER:
>> +		return -ENODEV;
>> +	case DAX_CXL_MODE_DROP:
>> +		break;
>> +	}
>> +
>>   	if (nid == NUMA_NO_NODE)
>>   		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>>   
>> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
>> index 1e3424358490..bcb57d8678d7 100644
>> --- a/drivers/dax/hmem/hmem.c
>> +++ b/drivers/dax/hmem/hmem.c
>> @@ -3,6 +3,7 @@
>>   #include <linux/memregion.h>
>>   #include <linux/module.h>
>>   #include <linux/dax.h>
>> +#include "../../cxl/cxl.h"
>>   #include "../bus.h"
>>   
>>   static bool region_idle;
>> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>>   	platform_device_unregister(pdev);
>>   }
>>   
>> +struct dax_defer_work {
>> +	struct platform_device *pdev;
>> +	struct work_struct work;
>> +};
>> +
>>   static int hmem_register_device(struct device *host, int target_nid,
>>   				const struct resource *res)
>>   {
>> +	struct dax_defer_work *work = dev_get_drvdata(host);
>>   	struct platform_device *pdev;
>>   	struct memregion_info info;
>>   	long id;
>> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
>>   	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>>   	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>>   			      IORES_DESC_CXL) != REGION_DISJOINT) {
>> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
>> -		return 0;
>> +		switch (dax_cxl_mode) {
>> +		case DAX_CXL_MODE_DEFER:
>> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
>> +			schedule_work(&work->work);
>> +			return 0;
>> +		case DAX_CXL_MODE_REGISTER:
>> +			dev_dbg(host, "registering CXL range: %pr\n", res);
>> +			break;
>> +		case DAX_CXL_MODE_DROP:
>> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
>> +			return 0;
>> +		}
>>   	}
>>   
>>   	rc = region_intersects_soft_reserve(res->start, resource_size(res));
>> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
>>   	return rc;
>>   }
>>   
>> +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
>> +				     const struct resource *res)
>> +{
>> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
>> +		if (!cxl_region_contains_soft_reserve(res))
>> +			return 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void process_defer_work(struct work_struct *_work)
>> +{
>> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
>> +	struct platform_device *pdev = work->pdev;
>> +	int rc;
>> +
>> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
>> +	wait_for_device_probe();
>> +
>> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
>> +
>> +	if (!rc) {
>> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
>> +		rc = bus_rescan_devices(&cxl_bus_type);
>> +		if (rc)
>> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
>> +	} else {
>> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
>> +		cxl_region_teardown_all();
>> +	}
>> +
>> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
>> +}
>> +
>> +static void kill_defer_work(void *_work)
>> +{
>> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
>> +
>> +	cancel_work_sync(&work->work);
>> +	kfree(work);
>> +}
>> +
>>   static int dax_hmem_platform_probe(struct platform_device *pdev)
>>   {
>> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
>> +	int rc;
>> +
>> +	if (!work)
>> +		return -ENOMEM;
>> +
>> +	work->pdev = pdev;
>> +	INIT_WORK(&work->work, process_defer_work);
>> +
>> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
>> +	if (rc)
>> +		return rc;
>> +
>> +	platform_set_drvdata(pdev, work);
>> +
>>   	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>>   }
>>   
>> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>>   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>>   MODULE_LICENSE("GPL v2");
>>   MODULE_AUTHOR("Intel Corporation");
>> +MODULE_IMPORT_NS("CXL");
>> -- 
>> 2.17.1
>>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Alison Schofield 2 weeks ago

On Mon, Jan 26, 2026 at 01:05:47PM -0800, Koralahalli Channabasappa, Smita wrote:
> Hi Alison,
> 
> On 1/22/2026 10:35 PM, Alison Schofield wrote:
> > On Thu, Jan 22, 2026 at 04:55:42AM +0000, Smita Koralahalli wrote:
> > > The current probe time ownership check for Soft Reserved memory based
> > > solely on CXL window intersection is insufficient. dax_hmem probing is not
> > > always guaranteed to run after CXL enumeration and region assembly, which
> > > can lead to incorrect ownership decisions before the CXL stack has
> > > finished publishing windows and assembling committed regions.
> > > 
> > > Introduce deferred ownership handling for Soft Reserved ranges that
> > > intersect CXL windows at probe time by scheduling deferred work from
> > > dax_hmem and waiting for the CXL stack to complete enumeration and region
> > > assembly before deciding ownership.
> > > 
> > > Evaluate ownership of Soft Reserved ranges based on CXL region
> > > containment.
> > > 
> > >     - If all Soft Reserved ranges are fully contained within committed CXL
> > >       regions, DROP handling Soft Reserved ranges from dax_hmem and allow
> > >       dax_cxl to bind.
> > > 
> > >     - If any Soft Reserved range is not fully claimed by committed CXL
> > >       region, tear down all CXL regions and REGISTER the Soft Reserved
> > >       ranges with dax_hmem instead.
> > > 
> > > While ownership resolution is pending, gate dax_cxl probing to avoid
> > > binding prematurely.
> > 
> > This patch is the point in the set where I begin to fail creating DAX
> > regions on my non soft-reserved platforms.
> > 
> > Before this patch, at region probe, devm_cxl_add_dax_region(cxlr) succeeded
> > without delay, but now those calls result in EPROBE DEFER.
> > 
> > That deferral is wanted for platforms with Soft Reserveds, but for
> > platforms without, those probes will never resume.
> > 
> > IIUC this will impact platforms without SRs, not just my test setup.
> > In my testing it's visible during both QEMU and cxl-test region creation.
> > 
> > Can we abandon this whole deferral scheme if there is nothing in the
> > new soft_reserved resource tree?
> > 
> > Or maybe another way to get the dax probes UN-deferred in this case?
> 
> Thanks for pointing this. I didn't think through this.
> 
> I was thinking to make the deferral conditional on HMEM actually observing a
> CXL-overlapping range. Rough flow:
> 
> One assumption I'm relying on here is that dax_hmem and "initial"
> hmem_register_device() walk happens before dax_cxl probes. If that
> assumption doesn’t hold this approach may not be sufficient.
> 
> 1. Keep dax_cxl_mode default as DEFER as it is now in dax/bus.c
> 2. Introduce need_deferral flag initialized to false in dax/bus.c
> 3. During the initial dax_hmem walk, in hmem_register_device() if HMEM
> observes SR that intersects IORES_DESC_CXL, set a need_deferral flag and
> schedule the deferred work. (case DEFER)
> 4. In dax_cxl probe: only return -EPROBE_DEFER when dax_cxl_mode == DEFER
> and need_deferral is set, otherwise proceed with cxl_dax.
> 
> Please call out if you see issues with this approach (especially around the
> ordering assumption).


A quick thought to share - 

Will the 'need_deferral' flag be cleared when all deferred work is
done, so that case 2) below can succeed:

While these changes add sync and fallback for platforms that use Soft
Reserveds, protect against regressing other use cases like:

1) Platforms that don't create SRs but do create auto regions and
expect them to either automatically create dax regions on successful CXL
driver assembly.

2) Plain old user space creation of ram regions where the user expects
the result to be a CXL region and a DAX region. These may occur in
platforms with or without Soft Reserveds. 

> 
> Thanks
> Smita
> > 
> > -- Alison
> > 
> > > 
> > > This enforces a strict ownership. Either CXL fully claims the Soft
> > > Reserved ranges or it relinquishes it entirely.
> > > 
> > > Co-developed-by: Dan Williams <dan.j.williams@intel.com>
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> > > ---
> > >   drivers/cxl/core/region.c | 25 ++++++++++++
> > >   drivers/cxl/cxl.h         |  2 +
> > >   drivers/dax/cxl.c         |  9 +++++
> > >   drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
> > >   4 files changed, 115 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > > index 9827a6dd3187..6c22a2d4abbb 100644
> > > --- a/drivers/cxl/core/region.c
> > > +++ b/drivers/cxl/core/region.c
> > > @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
> > >   DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
> > >   			 cxl_region_debugfs_poison_clear, "%llx\n");
> > > +static int cxl_region_teardown_cb(struct device *dev, void *data)
> > > +{
> > > +	struct cxl_root_decoder *cxlrd;
> > > +	struct cxl_region *cxlr;
> > > +	struct cxl_port *port;
> > > +
> > > +	if (!is_cxl_region(dev))
> > > +		return 0;
> > > +
> > > +	cxlr = to_cxl_region(dev);
> > > +
> > > +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> > > +	port = cxlrd_to_port(cxlrd);
> > > +
> > > +	devm_release_action(port->uport_dev, unregister_region, cxlr);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +void cxl_region_teardown_all(void)
> > > +{
> > > +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
> > > +}
> > > +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
> > > +
> > >   static int cxl_region_contains_sr_cb(struct device *dev, void *data)
> > >   {
> > >   	struct resource *res = data;
> > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > > index b0ff6b65ea0b..1864d35d5f69 100644
> > > --- a/drivers/cxl/cxl.h
> > > +++ b/drivers/cxl/cxl.h
> > > @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
> > >   struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
> > >   u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
> > >   bool cxl_region_contains_soft_reserve(const struct resource *res);
> > > +void cxl_region_teardown_all(void);
> > >   #else
> > >   static inline bool is_cxl_pmem_region(struct device *dev)
> > >   {
> > > @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
> > >   {
> > >   	return false;
> > >   }
> > > +static inline void cxl_region_teardown_all(void) { }
> > >   #endif
> > >   void cxl_endpoint_parse_cdat(struct cxl_port *port);
> > > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > > index 13cd94d32ff7..b7e90d6dd888 100644
> > > --- a/drivers/dax/cxl.c
> > > +++ b/drivers/dax/cxl.c
> > > @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
> > >   	struct dax_region *dax_region;
> > >   	struct dev_dax_data data;
> > > +	switch (dax_cxl_mode) {
> > > +	case DAX_CXL_MODE_DEFER:
> > > +		return -EPROBE_DEFER;
> > > +	case DAX_CXL_MODE_REGISTER:
> > > +		return -ENODEV;
> > > +	case DAX_CXL_MODE_DROP:
> > > +		break;
> > > +	}
> > > +
> > >   	if (nid == NUMA_NO_NODE)
> > >   		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
> > > diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> > > index 1e3424358490..bcb57d8678d7 100644
> > > --- a/drivers/dax/hmem/hmem.c
> > > +++ b/drivers/dax/hmem/hmem.c
> > > @@ -3,6 +3,7 @@
> > >   #include <linux/memregion.h>
> > >   #include <linux/module.h>
> > >   #include <linux/dax.h>
> > > +#include "../../cxl/cxl.h"
> > >   #include "../bus.h"
> > >   static bool region_idle;
> > > @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
> > >   	platform_device_unregister(pdev);
> > >   }
> > > +struct dax_defer_work {
> > > +	struct platform_device *pdev;
> > > +	struct work_struct work;
> > > +};
> > > +
> > >   static int hmem_register_device(struct device *host, int target_nid,
> > >   				const struct resource *res)
> > >   {
> > > +	struct dax_defer_work *work = dev_get_drvdata(host);
> > >   	struct platform_device *pdev;
> > >   	struct memregion_info info;
> > >   	long id;
> > > @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
> > >   	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
> > >   	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> > >   			      IORES_DESC_CXL) != REGION_DISJOINT) {
> > > -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
> > > -		return 0;
> > > +		switch (dax_cxl_mode) {
> > > +		case DAX_CXL_MODE_DEFER:
> > > +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
> > > +			schedule_work(&work->work);
> > > +			return 0;
> > > +		case DAX_CXL_MODE_REGISTER:
> > > +			dev_dbg(host, "registering CXL range: %pr\n", res);
> > > +			break;
> > > +		case DAX_CXL_MODE_DROP:
> > > +			dev_dbg(host, "dropping CXL range: %pr\n", res);
> > > +			return 0;
> > > +		}
> > >   	}
> > >   	rc = region_intersects_soft_reserve(res->start, resource_size(res));
> > > @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
> > >   	return rc;
> > >   }
> > > +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
> > > +				     const struct resource *res)
> > > +{
> > > +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
> > > +			      IORES_DESC_CXL) != REGION_DISJOINT) {
> > > +		if (!cxl_region_contains_soft_reserve(res))
> > > +			return 1;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void process_defer_work(struct work_struct *_work)
> > > +{
> > > +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> > > +	struct platform_device *pdev = work->pdev;
> > > +	int rc;
> > > +
> > > +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
> > > +	wait_for_device_probe();
> > > +
> > > +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
> > > +
> > > +	if (!rc) {
> > > +		dax_cxl_mode = DAX_CXL_MODE_DROP;
> > > +		rc = bus_rescan_devices(&cxl_bus_type);
> > > +		if (rc)
> > > +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
> > > +	} else {
> > > +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
> > > +		cxl_region_teardown_all();
> > > +	}
> > > +
> > > +	walk_hmem_resources(&pdev->dev, hmem_register_device);
> > > +}
> > > +
> > > +static void kill_defer_work(void *_work)
> > > +{
> > > +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
> > > +
> > > +	cancel_work_sync(&work->work);
> > > +	kfree(work);
> > > +}
> > > +
> > >   static int dax_hmem_platform_probe(struct platform_device *pdev)
> > >   {
> > > +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
> > > +	int rc;
> > > +
> > > +	if (!work)
> > > +		return -ENOMEM;
> > > +
> > > +	work->pdev = pdev;
> > > +	INIT_WORK(&work->work, process_defer_work);
> > > +
> > > +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
> > > +	if (rc)
> > > +		return rc;
> > > +
> > > +	platform_set_drvdata(pdev, work);
> > > +
> > >   	return walk_hmem_resources(&pdev->dev, hmem_register_device);
> > >   }
> > > @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
> > >   MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
> > >   MODULE_LICENSE("GPL v2");
> > >   MODULE_AUTHOR("Intel Corporation");
> > > +MODULE_IMPORT_NS("CXL");
> > > -- 
> > > 2.17.1
> > > 
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by Koralahalli Channabasappa, Smita 1 week, 6 days ago

Hi Alison,

On 1/26/2026 2:33 PM, Alison Schofield wrote:
> On Mon, Jan 26, 2026 at 01:05:47PM -0800, Koralahalli Channabasappa, Smita wrote:
>> Hi Alison,
>>
>> On 1/22/2026 10:35 PM, Alison Schofield wrote:
>>> On Thu, Jan 22, 2026 at 04:55:42AM +0000, Smita Koralahalli wrote:
>>>> The current probe time ownership check for Soft Reserved memory based
>>>> solely on CXL window intersection is insufficient. dax_hmem probing is not
>>>> always guaranteed to run after CXL enumeration and region assembly, which
>>>> can lead to incorrect ownership decisions before the CXL stack has
>>>> finished publishing windows and assembling committed regions.
>>>>
>>>> Introduce deferred ownership handling for Soft Reserved ranges that
>>>> intersect CXL windows at probe time by scheduling deferred work from
>>>> dax_hmem and waiting for the CXL stack to complete enumeration and region
>>>> assembly before deciding ownership.
>>>>
>>>> Evaluate ownership of Soft Reserved ranges based on CXL region
>>>> containment.
>>>>
>>>>      - If all Soft Reserved ranges are fully contained within committed CXL
>>>>        regions, DROP handling Soft Reserved ranges from dax_hmem and allow
>>>>        dax_cxl to bind.
>>>>
>>>>      - If any Soft Reserved range is not fully claimed by committed CXL
>>>>        region, tear down all CXL regions and REGISTER the Soft Reserved
>>>>        ranges with dax_hmem instead.
>>>>
>>>> While ownership resolution is pending, gate dax_cxl probing to avoid
>>>> binding prematurely.
>>>
>>> This patch is the point in the set where I begin to fail creating DAX
>>> regions on my non soft-reserved platforms.
>>>
>>> Before this patch, at region probe, devm_cxl_add_dax_region(cxlr) succeeded
>>> without delay, but now those calls result in EPROBE DEFER.
>>>
>>> That deferral is wanted for platforms with Soft Reserveds, but for
>>> platforms without, those probes will never resume.
>>>
>>> IIUC this will impact platforms without SRs, not just my test setup.
>>> In my testing it's visible during both QEMU and cxl-test region creation.
>>>
>>> Can we abandon this whole deferral scheme if there is nothing in the
>>> new soft_reserved resource tree?
>>>
>>> Or maybe another way to get the dax probes UN-deferred in this case?
>>
>> Thanks for pointing this. I didn't think through this.
>>
>> I was thinking to make the deferral conditional on HMEM actually observing a
>> CXL-overlapping range. Rough flow:
>>
>> One assumption I'm relying on here is that dax_hmem and "initial"
>> hmem_register_device() walk happens before dax_cxl probes. If that
>> assumption doesn’t hold this approach may not be sufficient.
>>
>> 1. Keep dax_cxl_mode default as DEFER as it is now in dax/bus.c
>> 2. Introduce need_deferral flag initialized to false in dax/bus.c
>> 3. During the initial dax_hmem walk, in hmem_register_device() if HMEM
>> observes SR that intersects IORES_DESC_CXL, set a need_deferral flag and
>> schedule the deferred work. (case DEFER)
>> 4. In dax_cxl probe: only return -EPROBE_DEFER when dax_cxl_mode == DEFER
>> and need_deferral is set, otherwise proceed with cxl_dax.
>>
>> Please call out if you see issues with this approach (especially around the
>> ordering assumption).
> 
> 
> A quick thought to share -
> 
> Will the 'need_deferral' flag be cleared when all deferred work is
> done, so that case 2) below can succeed:

My thinking was that we don’t strictly need to clear need_deferral as 
long as dax_cxl_mode is the actual gate. need_deferral would only be set 
when HMEM observes an SR range intersecting IORES_DESC_CXL, and after 
the deferred work runs we should always transition dax_cxl_mode from 
DEFER to either DROP or REGISTER. At that point dax_cxl won’t return 
EPROBE_DEFER anymore regardless of the flag value.

I also had a follow-up thought: rather than a separate need_deferral 
flag, we could make this explicit in the mode enum. For example, keep 
DEFER as the default, and when hmem_register_device() first observes a 
SR and CXL intersection, transition the mode from DEFER to something 
like NEEDS_CHANGE. Then dax_cxl would only return -EPROBE_DEFER in the 
NEEDS_CHANGE state, and once the deferred work completes it would move 
the mode to DROP or REGISTER.

Please correct me if I’m missing a case where dax_cxl_mode could remain 
DEFER even after setting the flag.

Thanks
Smita

> 
> While these changes add sync and fallback for platforms that use Soft
> Reserveds, protect against regressing other use cases like:
> 
> 1) Platforms that don't create SRs but do create auto regions and
> expect them to either automatically create dax regions on successful CXL
> driver assembly.
> 
> 2) Plain old user space creation of ram regions where the user expects
> the result to be a CXL region and a DAX region. These may occur in
> platforms with or without Soft Reserveds.
> 
>>
>> Thanks
>> Smita
>>>
>>> -- Alison
>>>
>>>>
>>>> This enforces a strict ownership. Either CXL fully claims the Soft
>>>> Reserved ranges or it relinquishes it entirely.
>>>>
>>>> Co-developed-by: Dan Williams <dan.j.williams@intel.com>
>>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>>>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>>>> ---
>>>>    drivers/cxl/core/region.c | 25 ++++++++++++
>>>>    drivers/cxl/cxl.h         |  2 +
>>>>    drivers/dax/cxl.c         |  9 +++++
>>>>    drivers/dax/hmem/hmem.c   | 81 ++++++++++++++++++++++++++++++++++++++-
>>>>    4 files changed, 115 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>>>> index 9827a6dd3187..6c22a2d4abbb 100644
>>>> --- a/drivers/cxl/core/region.c
>>>> +++ b/drivers/cxl/core/region.c
>>>> @@ -3875,6 +3875,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
>>>>    DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
>>>>    			 cxl_region_debugfs_poison_clear, "%llx\n");
>>>> +static int cxl_region_teardown_cb(struct device *dev, void *data)
>>>> +{
>>>> +	struct cxl_root_decoder *cxlrd;
>>>> +	struct cxl_region *cxlr;
>>>> +	struct cxl_port *port;
>>>> +
>>>> +	if (!is_cxl_region(dev))
>>>> +		return 0;
>>>> +
>>>> +	cxlr = to_cxl_region(dev);
>>>> +
>>>> +	cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
>>>> +	port = cxlrd_to_port(cxlrd);
>>>> +
>>>> +	devm_release_action(port->uport_dev, unregister_region, cxlr);
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +void cxl_region_teardown_all(void)
>>>> +{
>>>> +	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_region_teardown_cb);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(cxl_region_teardown_all);
>>>> +
>>>>    static int cxl_region_contains_sr_cb(struct device *dev, void *data)
>>>>    {
>>>>    	struct resource *res = data;
>>>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>>>> index b0ff6b65ea0b..1864d35d5f69 100644
>>>> --- a/drivers/cxl/cxl.h
>>>> +++ b/drivers/cxl/cxl.h
>>>> @@ -907,6 +907,7 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
>>>>    struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
>>>>    u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
>>>>    bool cxl_region_contains_soft_reserve(const struct resource *res);
>>>> +void cxl_region_teardown_all(void);
>>>>    #else
>>>>    static inline bool is_cxl_pmem_region(struct device *dev)
>>>>    {
>>>> @@ -933,6 +934,7 @@ static inline bool cxl_region_contains_soft_reserve(const struct resource *res)
>>>>    {
>>>>    	return false;
>>>>    }
>>>> +static inline void cxl_region_teardown_all(void) { }
>>>>    #endif
>>>>    void cxl_endpoint_parse_cdat(struct cxl_port *port);
>>>> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
>>>> index 13cd94d32ff7..b7e90d6dd888 100644
>>>> --- a/drivers/dax/cxl.c
>>>> +++ b/drivers/dax/cxl.c
>>>> @@ -14,6 +14,15 @@ static int cxl_dax_region_probe(struct device *dev)
>>>>    	struct dax_region *dax_region;
>>>>    	struct dev_dax_data data;
>>>> +	switch (dax_cxl_mode) {
>>>> +	case DAX_CXL_MODE_DEFER:
>>>> +		return -EPROBE_DEFER;
>>>> +	case DAX_CXL_MODE_REGISTER:
>>>> +		return -ENODEV;
>>>> +	case DAX_CXL_MODE_DROP:
>>>> +		break;
>>>> +	}
>>>> +
>>>>    	if (nid == NUMA_NO_NODE)
>>>>    		nid = memory_add_physaddr_to_nid(cxlr_dax->hpa_range.start);
>>>> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
>>>> index 1e3424358490..bcb57d8678d7 100644
>>>> --- a/drivers/dax/hmem/hmem.c
>>>> +++ b/drivers/dax/hmem/hmem.c
>>>> @@ -3,6 +3,7 @@
>>>>    #include <linux/memregion.h>
>>>>    #include <linux/module.h>
>>>>    #include <linux/dax.h>
>>>> +#include "../../cxl/cxl.h"
>>>>    #include "../bus.h"
>>>>    static bool region_idle;
>>>> @@ -58,9 +59,15 @@ static void release_hmem(void *pdev)
>>>>    	platform_device_unregister(pdev);
>>>>    }
>>>> +struct dax_defer_work {
>>>> +	struct platform_device *pdev;
>>>> +	struct work_struct work;
>>>> +};
>>>> +
>>>>    static int hmem_register_device(struct device *host, int target_nid,
>>>>    				const struct resource *res)
>>>>    {
>>>> +	struct dax_defer_work *work = dev_get_drvdata(host);
>>>>    	struct platform_device *pdev;
>>>>    	struct memregion_info info;
>>>>    	long id;
>>>> @@ -69,8 +76,18 @@ static int hmem_register_device(struct device *host, int target_nid,
>>>>    	if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
>>>>    	    region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>>>>    			      IORES_DESC_CXL) != REGION_DISJOINT) {
>>>> -		dev_dbg(host, "deferring range to CXL: %pr\n", res);
>>>> -		return 0;
>>>> +		switch (dax_cxl_mode) {
>>>> +		case DAX_CXL_MODE_DEFER:
>>>> +			dev_dbg(host, "deferring range to CXL: %pr\n", res);
>>>> +			schedule_work(&work->work);
>>>> +			return 0;
>>>> +		case DAX_CXL_MODE_REGISTER:
>>>> +			dev_dbg(host, "registering CXL range: %pr\n", res);
>>>> +			break;
>>>> +		case DAX_CXL_MODE_DROP:
>>>> +			dev_dbg(host, "dropping CXL range: %pr\n", res);
>>>> +			return 0;
>>>> +		}
>>>>    	}
>>>>    	rc = region_intersects_soft_reserve(res->start, resource_size(res));
>>>> @@ -123,8 +140,67 @@ static int hmem_register_device(struct device *host, int target_nid,
>>>>    	return rc;
>>>>    }
>>>> +static int cxl_contains_soft_reserve(struct device *host, int target_nid,
>>>> +				     const struct resource *res)
>>>> +{
>>>> +	if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
>>>> +			      IORES_DESC_CXL) != REGION_DISJOINT) {
>>>> +		if (!cxl_region_contains_soft_reserve(res))
>>>> +			return 1;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static void process_defer_work(struct work_struct *_work)
>>>> +{
>>>> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
>>>> +	struct platform_device *pdev = work->pdev;
>>>> +	int rc;
>>>> +
>>>> +	/* relies on cxl_acpi and cxl_pci having had a chance to load */
>>>> +	wait_for_device_probe();
>>>> +
>>>> +	rc = walk_hmem_resources(&pdev->dev, cxl_contains_soft_reserve);
>>>> +
>>>> +	if (!rc) {
>>>> +		dax_cxl_mode = DAX_CXL_MODE_DROP;
>>>> +		rc = bus_rescan_devices(&cxl_bus_type);
>>>> +		if (rc)
>>>> +			dev_warn(&pdev->dev, "CXL bus rescan failed: %d\n", rc);
>>>> +	} else {
>>>> +		dax_cxl_mode = DAX_CXL_MODE_REGISTER;
>>>> +		cxl_region_teardown_all();
>>>> +	}
>>>> +
>>>> +	walk_hmem_resources(&pdev->dev, hmem_register_device);
>>>> +}
>>>> +
>>>> +static void kill_defer_work(void *_work)
>>>> +{
>>>> +	struct dax_defer_work *work = container_of(_work, typeof(*work), work);
>>>> +
>>>> +	cancel_work_sync(&work->work);
>>>> +	kfree(work);
>>>> +}
>>>> +
>>>>    static int dax_hmem_platform_probe(struct platform_device *pdev)
>>>>    {
>>>> +	struct dax_defer_work *work = kzalloc(sizeof(*work), GFP_KERNEL);
>>>> +	int rc;
>>>> +
>>>> +	if (!work)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	work->pdev = pdev;
>>>> +	INIT_WORK(&work->work, process_defer_work);
>>>> +
>>>> +	rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, work);
>>>> +	if (rc)
>>>> +		return rc;
>>>> +
>>>> +	platform_set_drvdata(pdev, work);
>>>> +
>>>>    	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>>>>    }
>>>> @@ -174,3 +250,4 @@ MODULE_ALIAS("platform:hmem_platform*");
>>>>    MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory");
>>>>    MODULE_LICENSE("GPL v2");
>>>>    MODULE_AUTHOR("Intel Corporation");
>>>> +MODULE_IMPORT_NS("CXL");
>>>> -- 
>>>> 2.17.1
>>>>
>>
>

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by dan.j.williams@intel.com 1 week, 5 days ago

Koralahalli Channabasappa, Smita wrote:
> Hi Alison,
> 
> On 1/26/2026 2:33 PM, Alison Schofield wrote:
> > On Mon, Jan 26, 2026 at 01:05:47PM -0800, Koralahalli Channabasappa, Smita wrote:
> >> Hi Alison,
> >>
> >> On 1/22/2026 10:35 PM, Alison Schofield wrote:
> >>> On Thu, Jan 22, 2026 at 04:55:42AM +0000, Smita Koralahalli wrote:
> >>>> The current probe time ownership check for Soft Reserved memory based
> >>>> solely on CXL window intersection is insufficient. dax_hmem probing is not
> >>>> always guaranteed to run after CXL enumeration and region assembly, which
> >>>> can lead to incorrect ownership decisions before the CXL stack has
> >>>> finished publishing windows and assembling committed regions.
> >>>>
> >>>> Introduce deferred ownership handling for Soft Reserved ranges that
> >>>> intersect CXL windows at probe time by scheduling deferred work from
> >>>> dax_hmem and waiting for the CXL stack to complete enumeration and region
> >>>> assembly before deciding ownership.
> >>>>
> >>>> Evaluate ownership of Soft Reserved ranges based on CXL region
> >>>> containment.
> >>>>
> >>>>      - If all Soft Reserved ranges are fully contained within committed CXL
> >>>>        regions, DROP handling Soft Reserved ranges from dax_hmem and allow
> >>>>        dax_cxl to bind.
> >>>>
> >>>>      - If any Soft Reserved range is not fully claimed by committed CXL
> >>>>        region, tear down all CXL regions and REGISTER the Soft Reserved
> >>>>        ranges with dax_hmem instead.
> >>>>
> >>>> While ownership resolution is pending, gate dax_cxl probing to avoid
> >>>> binding prematurely.
> >>>
> >>> This patch is the point in the set where I begin to fail creating DAX
> >>> regions on my non soft-reserved platforms.
> >>>
> >>> Before this patch, at region probe, devm_cxl_add_dax_region(cxlr) succeeded
> >>> without delay, but now those calls result in EPROBE DEFER.
> >>>
> >>> That deferral is wanted for platforms with Soft Reserveds, but for
> >>> platforms without, those probes will never resume.
> >>>
> >>> IIUC this will impact platforms without SRs, not just my test setup.
> >>> In my testing it's visible during both QEMU and cxl-test region creation.
> >>>
> >>> Can we abandon this whole deferral scheme if there is nothing in the
> >>> new soft_reserved resource tree?
> >>>
> >>> Or maybe another way to get the dax probes UN-deferred in this case?
> >>
> >> Thanks for pointing this. I didn't think through this.
> >>
> >> I was thinking to make the deferral conditional on HMEM actually observing a
> >> CXL-overlapping range. Rough flow:
> >>
> >> One assumption I'm relying on here is that dax_hmem and "initial"
> >> hmem_register_device() walk happens before dax_cxl probes. If that
> >> assumption doesn’t hold this approach may not be sufficient.
> >>
> >> 1. Keep dax_cxl_mode default as DEFER as it is now in dax/bus.c
> >> 2. Introduce need_deferral flag initialized to false in dax/bus.c
> >> 3. During the initial dax_hmem walk, in hmem_register_device() if HMEM
> >> observes SR that intersects IORES_DESC_CXL, set a need_deferral flag and
> >> schedule the deferred work. (case DEFER)
> >> 4. In dax_cxl probe: only return -EPROBE_DEFER when dax_cxl_mode == DEFER
> >> and need_deferral is set, otherwise proceed with cxl_dax.
> >>
> >> Please call out if you see issues with this approach (especially around the
> >> ordering assumption).
> > 
> > 
> > A quick thought to share -
> > 
> > Will the 'need_deferral' flag be cleared when all deferred work is
> > done, so that case 2) below can succeed:
> 
> My thinking was that we don’t strictly need to clear need_deferral as 
> long as dax_cxl_mode is the actual gate. need_deferral would only be set 
> when HMEM observes an SR range intersecting IORES_DESC_CXL, and after 
> the deferred work runs we should always transition dax_cxl_mode from 
> DEFER to either DROP or REGISTER. At that point dax_cxl won’t return 
> EPROBE_DEFER anymore regardless of the flag value.
> 
> I also had a follow-up thought: rather than a separate need_deferral 
> flag, we could make this explicit in the mode enum. For example, keep 
> DEFER as the default, and when hmem_register_device() first observes a 
> SR and CXL intersection, transition the mode from DEFER to something 
> like NEEDS_CHANGE. Then dax_cxl would only return -EPROBE_DEFER in the 
> NEEDS_CHANGE state, and once the deferred work completes it would move 
> the mode to DROP or REGISTER.
> 
> Please correct me if I’m missing a case where dax_cxl_mode could remain 
> DEFER even after setting the flag.

Recall that DAX_CXL_MODE_DEFER is about what to do about arriving
hmem_register_device() events. Until CXL comes up those all get deferred
to the workqueue. When the workqueue runs it flushes both the
hmem_register_device() events from probing the HMAT and
cxl_region_probe() events from initial PCI discovery.

The deferral never needs to be visible to cxl_dax_region_probe() outside
of knowing that the workqueue has had a chance to flush at least once.

If we go with the alloc_dax_region() observation in my other mail it
means that the HPA space will already be claimed and
cxl_dax_region_probe() will fail. If we can get to that point of "all
HMEM registered, and all CXL regions failing to attach their
cxl_dax_region devices" that is a good stopping point. Then can decide
if a follow-on patch is needed to cleanup that state
(cxl_region_teardown_all()) , or if it can just idle that way in that
messy state and wait for userspace to cleanup if it wants.

Might want an error message to point people to report their failing
system configuration to the linux-cxl@ mailing list.

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by kernel test robot 2 weeks, 3 days ago

Hi Smita,

kernel test robot noticed the following build errors:

[auto build test ERROR on bc62f5b308cbdedf29132fe96e9d591e526527e1]

url:    https://github.com/intel-lab-lkp/linux/commits/Smita-Koralahalli/dax-hmem-Request-cxl_acpi-and-cxl_pci-before-walking-Soft-Reserved-ranges/20260122-130032
base:   bc62f5b308cbdedf29132fe96e9d591e526527e1
patch link:    https://lore.kernel.org/r/20260122045543.218194-7-Smita.KoralahalliChannabasappa%40amd.com
patch subject: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20260123/202601231309.JYIKhk2G-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260123/202601231309.JYIKhk2G-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601231309.JYIKhk2G-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "cxl_bus_type" [drivers/dax/hmem/dax_hmem.ko] undefined!

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Re: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges

Posted by kernel test robot 2 weeks, 4 days ago

Hi Smita,

kernel test robot noticed the following build errors:

[auto build test ERROR on bc62f5b308cbdedf29132fe96e9d591e526527e1]

url:    https://github.com/intel-lab-lkp/linux/commits/Smita-Koralahalli/dax-hmem-Request-cxl_acpi-and-cxl_pci-before-walking-Soft-Reserved-ranges/20260122-130032
base:   bc62f5b308cbdedf29132fe96e9d591e526527e1
patch link:    https://lore.kernel.org/r/20260122045543.218194-7-Smita.KoralahalliChannabasappa%40amd.com
patch subject: [PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20260122/202601221448.OYyjVxEC-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260122/202601221448.OYyjVxEC-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601221448.OYyjVxEC-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "cxl_bus_type" [drivers/dax/hmem/dax_hmem.ko] undefined!

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

[PATCH v5 1/7] dax/hmem: Request cxl_acpi and cxl_pci before walking Soft Reserved ranges
[PATCH v5 2/7] dax/hmem: Gate Soft Reserved deferral on DEV_DAX_CXL
[PATCH v5 3/7] cxl/region: Skip decoder reset on detach for autodiscovered regions
[PATCH v5 4/7] cxl/region: Add helper to check Soft Reserved containment by CXL regions
[PATCH v5 5/7] dax: Introduce dax_cxl_mode for CXL coordination
[PATCH v5 6/7] dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges
[PATCH v5 7/7] dax/hmem: Reintroduce Soft Reserved ranges back into the iomem tree