Add a sysram memctrl that directly hotplugs memory without needing to
route through DAX. This simplifies the sysram usecase considerably.
The sysram memctrl adds new sysfs controls when registered:
region/memctrl/[hotplug, hotunplug, state]
hotplug: controller attempts to hotplug the memory region
hotunplug: controller attempts to offline and hotunplug the memory region
state: [online,online_normal,offline]
online : controller onlines blocks in ZONE_MOVABLE
online_normal: controller onlines blocks in ZONE_NORMAL
offline : controller attempts to offline the memory blocks
Hotplug note - by default the controller will hotplug the blocks, but
leave them offline (unless MHP auto-online in Kconfig is enabled).
Setting state to "online_normal" may prevent future hot-unplug of sysram
regions, and unbinding a memory region with memory online in ZONE_NORMAL
may result in the device being removed but the memory remaining online.
This can result in future management functions failing (such as adding a
new region). This is why "online_normal" is explicit, and the default
online zone is ZONE_MOVABLE.
Cc: David Hildenbrand <david@kernel.org>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
drivers/cxl/core/core.h | 2 +
drivers/cxl/core/memctrl/Makefile | 1 +
drivers/cxl/core/memctrl/memctrl.c | 2 +
drivers/cxl/core/memctrl/sysram_region.c | 358 +++++++++++++++++++++++
drivers/cxl/core/region.c | 5 +
drivers/cxl/cxl.h | 6 +-
6 files changed, 372 insertions(+), 2 deletions(-)
create mode 100644 drivers/cxl/core/memctrl/sysram_region.c
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 1156a4bd0080..18cb84950500 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -31,6 +31,8 @@ int cxl_decoder_detach(struct cxl_region *cxlr,
struct cxl_endpoint_decoder *cxled, int pos,
enum cxl_detach_mode mode);
+int devm_cxl_add_sysram_region(struct cxl_region *cxlr);
+
#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
#define CXL_REGION_TYPE(x) (&cxl_region_type)
#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
diff --git a/drivers/cxl/core/memctrl/Makefile b/drivers/cxl/core/memctrl/Makefile
index 8165aad5a52a..1c52c7d75570 100644
--- a/drivers/cxl/core/memctrl/Makefile
+++ b/drivers/cxl/core/memctrl/Makefile
@@ -2,3 +2,4 @@
cxl_core-$(CONFIG_CXL_REGION) += memctrl/memctrl.o
cxl_core-$(CONFIG_CXL_REGION) += memctrl/dax_region.o
+cxl_core-$(CONFIG_CXL_REGION) += memctrl/sysram_region.o
diff --git a/drivers/cxl/core/memctrl/memctrl.c b/drivers/cxl/core/memctrl/memctrl.c
index 24e0e14b39c7..40ffb59353bb 100644
--- a/drivers/cxl/core/memctrl/memctrl.c
+++ b/drivers/cxl/core/memctrl/memctrl.c
@@ -34,6 +34,8 @@ int cxl_enable_memctrl(struct cxl_region *cxlr)
return devm_cxl_add_dax_region(cxlr);
case CXL_MEMCTRL_DAX:
return devm_cxl_add_dax_region(cxlr);
+ case CXL_MEMCTRL_SYSRAM:
+ return devm_cxl_add_sysram_region(cxlr);
default:
return -EINVAL;
}
diff --git a/drivers/cxl/core/memctrl/sysram_region.c b/drivers/cxl/core/memctrl/sysram_region.c
new file mode 100644
index 000000000000..a7570c8a54e1
--- /dev/null
+++ b/drivers/cxl/core/memctrl/sysram_region.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2026 Meta Inc. All rights reserved. */
+#include <linux/memremap.h>
+#include <linux/memory.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/memory-tiers.h>
+#include <linux/memory_hotplug.h>
+#include <linux/string_helpers.h>
+#include <linux/sched/signal.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "../core.h"
+
+/* If HMAT was unavailable, assign a default distance. */
+#define MEMTIER_DEFAULT_CXL_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5)
+
+static const char *sysram_name = "System RAM (CXL)";
+
+struct cxl_sysram_data {
+ const char *res_name;
+ int mgid;
+ struct resource *res;
+};
+
+static DEFINE_MUTEX(cxl_memory_type_lock);
+static LIST_HEAD(cxl_memory_types);
+
+static struct cxl_region *to_cxl_region(struct device *dev)
+{
+ if (dev->type != &cxl_region_type)
+ return NULL;
+ return container_of(dev, struct cxl_region, dev);
+}
+
+static struct memory_dev_type *cxl_find_alloc_memory_type(int adist)
+{
+ guard(mutex)(&cxl_memory_type_lock);
+ return mt_find_alloc_memory_type(adist, &cxl_memory_types);
+}
+
+static void __maybe_unused cxl_put_memory_types(void)
+{
+ guard(mutex)(&cxl_memory_type_lock);
+ mt_put_memory_types(&cxl_memory_types);
+}
+
+static int cxl_sysram_range(struct cxl_region *cxlr, struct range *r)
+{
+ struct cxl_region_params *p = &cxlr->params;
+
+ if (!p->res)
+ return -ENODEV;
+
+ /* memory-block align the hotplug range */
+ r->start = ALIGN(p->res->start, memory_block_size_bytes());
+ r->end = ALIGN_DOWN(p->res->end + 1, memory_block_size_bytes()) - 1;
+ if (r->start >= r->end) {
+ r->start = p->res->start;
+ r->end = p->res->end;
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static ssize_t hotunplug_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct range range;
+ int rc;
+
+ if (!cxlr)
+ return -ENODEV;
+
+ rc = cxl_sysram_range(cxlr, &range);
+ if (rc)
+ return rc;
+
+ rc = offline_and_remove_memory(range.start, range_len(&range));
+
+ if (rc)
+ return rc;
+
+ return len;
+}
+static DEVICE_ATTR_WO(hotunplug);
+
+struct online_memory_cb_arg {
+ int online_type;
+ int rc;
+};
+
+static int online_memory_block_cb(struct memory_block *mem, void *arg)
+{
+ struct online_memory_cb_arg *cb_arg = arg;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ cond_resched();
+
+ if (mem->state == MEM_ONLINE)
+ return 0;
+
+ mem->online_type = cb_arg->online_type;
+ cb_arg->rc = device_online(&mem->dev);
+
+ return cb_arg->rc;
+}
+
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+ int *rc = arg;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ cond_resched();
+
+ if (mem->state == MEM_OFFLINE)
+ return 0;
+
+ *rc = device_offline(&mem->dev);
+
+ return *rc;
+}
+
+static ssize_t state_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct online_memory_cb_arg cb_arg;
+ struct range range;
+ int rc;
+
+ if (!cxlr)
+ return -ENODEV;
+
+ rc = cxl_sysram_range(cxlr, &range);
+ if (rc)
+ return rc;
+
+ rc = lock_device_hotplug_sysfs();
+ if (rc)
+ return rc;
+
+ if (sysfs_streq(buf, "online")) {
+ cb_arg.online_type = MMOP_ONLINE_MOVABLE;
+ cb_arg.rc = 0;
+ rc = walk_memory_blocks(range.start, range_len(&range),
+ &cb_arg, online_memory_block_cb);
+ if (!rc)
+ rc = cb_arg.rc;
+ } else if (sysfs_streq(buf, "online_normal")) {
+ cb_arg.online_type = MMOP_ONLINE;
+ cb_arg.rc = 0;
+ rc = walk_memory_blocks(range.start, range_len(&range),
+ &cb_arg, online_memory_block_cb);
+ if (!rc)
+ rc = cb_arg.rc;
+ } else if (sysfs_streq(buf, "offline")) {
+ int offline_rc = 0;
+
+ rc = walk_memory_blocks(range.start, range_len(&range),
+ &offline_rc, offline_memory_block_cb);
+ if (!rc)
+ rc = offline_rc;
+ } else {
+ rc = -EINVAL;
+ }
+
+ unlock_device_hotplug();
+
+ if (rc)
+ return rc;
+
+ return len;
+}
+static DEVICE_ATTR_WO(state);
+
+static ssize_t hotplug_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_sysram_data *data;
+ struct range range;
+ int rc;
+
+ if (!cxlr)
+ return -ENODEV;
+
+ data = dev_get_drvdata(dev);
+ if (!data)
+ return -ENODEV;
+
+ rc = cxl_sysram_range(cxlr, &range);
+ if (rc)
+ return rc;
+
+ rc = add_memory_driver_managed(data->mgid, range.start,
+ range_len(&range), sysram_name,
+ MHP_NID_IS_MGID);
+ if (rc)
+ return rc;
+
+ return len;
+}
+static DEVICE_ATTR_WO(hotplug);
+
+static struct attribute *cxl_sysram_region_attrs[] = {
+ &dev_attr_hotunplug.attr,
+ &dev_attr_state.attr,
+ &dev_attr_hotplug.attr,
+ NULL,
+};
+
+static const struct attribute_group cxl_sysram_region_group = {
+ .name = "memctl",
+ .attrs = cxl_sysram_region_attrs,
+};
+
+static void cxl_sysram_unregister(void *_data)
+{
+ struct cxl_sysram_data *data = _data;
+ struct range range = {
+ .start = data->res->start,
+ .end = data->res->end
+ };
+
+ /* We have one shot for removal, otherwise it's stuck til reboot */
+ if (!offline_and_remove_memory(range.start, range_len(&range))) {
+ remove_resource(data->res);
+ kfree(data->res);
+ memory_group_unregister(data->mgid);
+ kfree(data->res_name);
+ kfree(data);
+ return;
+ }
+ pr_err("CXL: %#llx-%#llx cannot be hotremoved until next reboot\n",
+ range.start, range.end);
+}
+
+int devm_cxl_add_sysram_region(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct device *dev = &cxlr->dev;
+ struct cxl_sysram_data *data;
+ struct memory_dev_type *mtype;
+ unsigned long total_len = 0;
+ struct resource *res;
+ struct range range;
+ mhp_t mhp_flags;
+ int numa_node;
+ int adist = MEMTIER_DEFAULT_CXL_ADISTANCE;
+ int rc;
+
+ numa_node = phys_to_target_node(p->res->start);
+ if (numa_node < 0) {
+ dev_warn(dev, "rejecting CXL region with invalid node: %d\n",
+ numa_node);
+ return -EINVAL;
+ }
+
+ rc = cxl_sysram_range(cxlr, &range);
+ if (rc) {
+ dev_info(dev, "range %#llx-%#llx too small after alignment\n",
+ range.start, range.end);
+ return rc;
+ }
+ total_len = range_len(&range);
+
+ if (!total_len) {
+ dev_warn(dev, "rejecting CXL region without any memory after alignment\n");
+ return -EINVAL;
+ }
+
+ mt_calc_adistance(numa_node, &adist);
+ mtype = cxl_find_alloc_memory_type(adist);
+ if (IS_ERR(mtype))
+ return PTR_ERR(mtype);
+
+ init_node_memory_type(numa_node, mtype);
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data) {
+ rc = -ENOMEM;
+ goto err_data;
+ }
+
+ data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
+ if (!data->res_name) {
+ rc = -ENOMEM;
+ goto err_res_name;
+ }
+
+ rc = memory_group_register_static(numa_node, PFN_UP(total_len));
+ if (rc < 0)
+ goto err_reg_mgid;
+ data->mgid = rc;
+
+ /* Region is permanently reserved if hotremove fails when unbinding. */
+ res = request_mem_region(range.start, range_len(&range),
+ data->res_name);
+ if (!res) {
+ dev_warn(dev, "range %#llx-%#llx could not reserve region\n",
+ range.start, range.end);
+ rc = -EBUSY;
+ goto err_request_mem;
+ }
+ data->res = res;
+
+ /*
+ * Setup flags for System RAM. Leave _BUSY clear so add_memory() can add
+ * a child resource. Do not inherit flags from parent since it may set
+ * flags unknown to us that will break add_memory() below.
+ */
+ res->flags = IORESOURCE_SYSTEM_RAM;
+ mhp_flags = MHP_NID_IS_MGID;
+ rc = add_memory_driver_managed(data->mgid, range.start,
+ range_len(&range), sysram_name, mhp_flags);
+ if (rc) {
+ dev_warn(dev, "range %#llx-%#llx memory add failed\n",
+ range.start, range.end);
+ goto err_add_memory;
+ }
+ dev_dbg(dev, "%s: added %llu bytes as System RAM\n", dev_name(dev),
+ (unsigned long long)total_len);
+
+ dev_set_drvdata(dev, data);
+ rc = devm_device_add_group(dev, &cxl_sysram_region_group);
+ if (rc)
+ goto err_add_group;
+
+ return devm_add_action_or_reset(dev, cxl_sysram_unregister, data);
+
+err_add_group:
+ dev_set_drvdata(dev, NULL);
+ /* if this fails, memory cannot be removed from the system until reboot */
+ remove_memory(range.start, range_len(&range));
+err_add_memory:
+ remove_resource(res);
+ kfree(res);
+err_request_mem:
+ memory_group_unregister(data->mgid);
+err_reg_mgid:
+ kfree(data->res_name);
+err_res_name:
+ kfree(data);
+err_data:
+ clear_node_memory_type(numa_node, mtype);
+ return rc;
+}
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 02d7d9ae0252..eeab091f043a 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -639,6 +639,9 @@ static ssize_t ctrl_show(struct device *dev, struct device_attribute *attr,
case CXL_MEMCTRL_DAX:
desc = "dax";
break;
+ case CXL_MEMCTRL_SYSRAM:
+ desc = "sysram";
+ break;
default:
desc = "";
break;
@@ -663,6 +666,8 @@ static ssize_t ctrl_store(struct device *dev, struct device_attribute *attr,
if (sysfs_streq(buf, "dax"))
cxlr->memctrl = CXL_MEMCTRL_DAX;
+ else if (sysfs_streq(buf, "sysram"))
+ cxlr->memctrl = CXL_MEMCTRL_SYSRAM;
else
return -EINVAL;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index b8fabaa77262..bb4f877b4e8f 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -506,13 +506,15 @@ enum cxl_partition_mode {
/*
* Memory Controller modes:
* None - No controller selected
- * Auto - either BIOS-configured as SysRAM, or default to DAX
- * DAX - creates a dax_region controller for the cxl_region
+ * Auto - either BIOS-configured as SysRAM, or default to DAX
+ * DAX - creates a dax_region controller for the cxl_region
+ * SYSRAM - hotplugs the region directly as System RAM
*/
enum cxl_memctrl_mode {
CXL_MEMCTRL_NONE,
CXL_MEMCTRL_AUTO,
CXL_MEMCTRL_DAX,
+ CXL_MEMCTRL_SYSRAM,
};
/*
--
2.52.0
Gregory Price wrote:
> Add a sysram memctrl that directly hotplugs memory without needing to
> route through DAX. This simplifies the sysram usecase considerably.
>
> The sysram memctl adds new sysfs controls when registered:
> region/memctrl/[hotplug, hotunplug, state]
>
> hotplug: controller attempts to hotplug the memory region
> hotunplug: controller attempts to offline and hotunplug the memory region
> state: [online,online_normal,offline]
> online : controller onlines blocks in ZONE_MOVABLE
> online_normal: controller onlines blocks in ZONE_NORMAL
> offline : controller attempts to offline the memory blocks
>
> Hotplug note - by default the controller will hotplug the blocks, but
> leave them offline (unless MHP auto-online in Kconfig is enabled).
>
> Setting state to "online_normal" may prevent future hot-unplug of sysram
> regions, and unbinding a memory region with memory online in ZONE_NORMAL
> may result in the device being removed but the memory remaining online.
>
> This can result in future management functions failing (such as adding a
> new region). This is why "online_normal" is explicit, and the default
> online zone is ZONE_MOVABLE.
David's early feedback aligns with my own with respect to not creating
new "online_*" ABI terms, but I want to go a step further.
Part of the proposal here solves a fundamental problem with the way
dax_kmem operates in terms of fixing the complication of dax_kmem
depending on fine grained / multi-step online control via memblock
sysfs.
If we are going to introduce a new omnibus way to online entire regions
at a time then that goodness should first come to dax_kmem and then
potentially be refactored into a library that CXL can use to skip the
device_dax indirection.
I.e. the end result would be this "hotplug" mechanism that fixes a long
standing dax_kmem problem and then go further to drop the indirection
through device_dax and have a "hotplug" mechanism directly at the
cxl_region level.
> +int devm_cxl_add_sysram_region(struct cxl_region *cxlr)
> +{
[..]
> +err_add_group:
> + dev_set_drvdata(dev, NULL);
> + /* if this fails, memory cannot be removed from the system until reboot */
> + remove_memory(range.start, range_len(&range));
> +err_add_memory:
> + remove_resource(res);
> + kfree(res);
> +err_request_mem:
> + memory_group_unregister(data->mgid);
> +err_reg_mgid:
> + kfree(data->res_name);
> +err_res_name:
> + kfree(data);
> +err_data:
> + clear_node_memory_type(numa_node, mtype);
> + return rc;
...btw, this feels like too many new gotos in the age of
scope-based-cleanup. It also feels like a bunch of duplicated code that
CXL and fixed up dax_kmem can share.
On Mon, Jan 12, 2026 at 01:10:15PM -0800, dan.j.williams@intel.com wrote:
> Gregory Price wrote:
> >
> > This can result in future management functions failing (such as adding a
> > new region). This is why "online_normal" is explicit, and the default
> > online zone is ZONE_MOVABLE.
>
> David's early feedback aligns with my own with respect to not creating
> new "online_*" ABI terms, but I want to go a step further.
>
> Part of the proposal here solves a fundamental problem with the way
> dax_kmem operates in terms of fixing the complication of dax_kmem
> depending on fine grained / multi-step online control via memblock
> sysfs.
>
> If we are going to introduce a new omnibus way to online entire regions
> at a time then that goodness should first come to dax_kmem and then
> potentially be refactored into a library that CXL can use to skip the
> device_dax indirection.
>
I think that probably just looks like sinking some of this into
memory_hotplug.c as bulk-commands and then exposing a similar
dax0.0/hotplug function that shows up if you're bound to dax_kmem.
That should be trivial to sink, can do.
> I.e. the end result would be this "hotplug" mechanism that fixes a long
> standing dax_kmem problem and then go further to drop the indirection
> through device_dax and have a "hotplug" mechanism directly at the
> cxl_region level.
>
The only catch may be auto-online behavior in dax, we may not want to
encode that there and instead improve the hotplug interface to entice
users to use it directly instead.
> > +int devm_cxl_add_sysram_region(struct cxl_region *cxlr)
> > +{
> [..]
> > +err_add_group:
> > + dev_set_drvdata(dev, NULL);
> > + /* if this fails, memory cannot be removed from the system until reboot */
> > + remove_memory(range.start, range_len(&range));
> > +err_add_memory:
> > + remove_resource(res);
> > + kfree(res);
> > +err_request_mem:
> > + memory_group_unregister(data->mgid);
> > +err_reg_mgid:
> > + kfree(data->res_name);
> > +err_res_name:
> > + kfree(data);
> > +err_data:
> > + clear_node_memory_type(numa_node, mtype);
> > + return rc;
>
> ...btw, this feels like too many new gotos in the age of
> scope-based-cleanup. It also feels like a bunch of duplicated code that
> CXL and fixed up dax_kmem can share.
Yeah, I cribbed a bunch of this from dax and hotplug; I expect this to
get significantly cleaner in a version or two.
~Gregory
On 1/12/2026 10:35 AM, Gregory Price wrote:
> Add a sysram memctrl that directly hotplugs memory without needing to
> route through DAX. This simplifies the sysram usecase considerably.
>
> The sysram memctl adds new sysfs controls when registered:
> region/memctrl/[hotplug, hotunplug, state]
>
> hotplug: controller attempts to hotplug the memory region
> hotunplug: controller attempts to offline and hotunplug the memory region
Nit: Would it be better to use hotadd/hotremove here instead of hotplug/hotunplug? The terms
are basically synonymous, but I think hotadd and hotremove are more descriptive.
> state: [online,online_normal,offline]
> online : controller onlines blocks in ZONE_MOVABLE
> online_normal: controller onlines blocks in ZONE_NORMAL
The naming for online states could be improved imo. I understand and agree with the motivation
behind the names, but I could see the use of the word "normal" being confusing to less savvy users.
You could change it to include the zone for both (online_movable/online_normal), but I think it may
be easier to mark which one has drawbacks, i.e. change "online_normal" to something like "online_nonremovable".
That way, anyone who doesn't want to go find the documentation for these can understand the user-visible
impact.
In any case, all of these attributes need ABI documentation as well.
> offline : controller attempts to offline the memory blocks
>
> Hotplug note - by default the controller will hotplug the blocks, but
> leave them offline (unless MHP auto-online in Kconfig is enabled).
>
> Setting state to "online_normal" may prevent future hot-unplug of sysram
> regions, and unbinding a memory region with memory online in ZONE_NORMAL
> may result in the device being removed but the memory remaining online.
>
> This can result in future management functions failing (such as adding a
> new region). This is why "online_normal" is explicit, and the default
> online zone is ZONE_MOVABLE.
>
> Cc: David Hildenbrand <david@kernel.org>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
> drivers/cxl/core/core.h | 2 +
> drivers/cxl/core/memctrl/Makefile | 1 +
> drivers/cxl/core/memctrl/memctrl.c | 2 +
> drivers/cxl/core/memctrl/sysram_region.c | 358 +++++++++++++++++++++++
> drivers/cxl/core/region.c | 5 +
> drivers/cxl/cxl.h | 6 +-
> 6 files changed, 372 insertions(+), 2 deletions(-)
> create mode 100644 drivers/cxl/core/memctrl/sysram_region.c
>
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 1156a4bd0080..18cb84950500 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -31,6 +31,8 @@ int cxl_decoder_detach(struct cxl_region *cxlr,
> struct cxl_endpoint_decoder *cxled, int pos,
> enum cxl_detach_mode mode);
>
> +int devm_cxl_add_sysram_region(struct cxl_region *cxlr);
> +
> #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
> #define CXL_REGION_TYPE(x) (&cxl_region_type)
> #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
> diff --git a/drivers/cxl/core/memctrl/Makefile b/drivers/cxl/core/memctrl/Makefile
> index 8165aad5a52a..1c52c7d75570 100644
> --- a/drivers/cxl/core/memctrl/Makefile
> +++ b/drivers/cxl/core/memctrl/Makefile
> @@ -2,3 +2,4 @@
>
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/memctrl.o
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/dax_region.o
> +cxl_core-$(CONFIG_CXL_REGION) += memctrl/sysram_region.o
> diff --git a/drivers/cxl/core/memctrl/memctrl.c b/drivers/cxl/core/memctrl/memctrl.c
> index 24e0e14b39c7..40ffb59353bb 100644
> --- a/drivers/cxl/core/memctrl/memctrl.c
> +++ b/drivers/cxl/core/memctrl/memctrl.c
> @@ -34,6 +34,8 @@ int cxl_enable_memctrl(struct cxl_region *cxlr)
> return devm_cxl_add_dax_region(cxlr);
> case CXL_MEMCTRL_DAX:
> return devm_cxl_add_dax_region(cxlr);
> + case CXL_MEMCTRL_SYSRAM:
> + return devm_cxl_add_sysram_region(cxlr);
> default:
> return -EINVAL;
> }
> diff --git a/drivers/cxl/core/memctrl/sysram_region.c b/drivers/cxl/core/memctrl/sysram_region.c
> new file mode 100644
> index 000000000000..a7570c8a54e1
> --- /dev/null
> +++ b/drivers/cxl/core/memctrl/sysram_region.c
> @@ -0,0 +1,358 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright(c) 2026 Meta Inc. All rights reserved. */
> +#include <linux/memremap.h>
> +#include <linux/memory.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/mm.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/memory_hotplug.h>
> +#include <linux/string_helpers.h>
> +#include <linux/sched/signal.h>
> +#include <cxlmem.h>
> +#include <cxl.h>
> +#include "../core.h"
> +
> +/* If HMAT was unavailable, assign a default distance. */
> +#define MEMTIER_DEFAULT_CXL_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5)
> +
> +static const char *sysram_name = "System RAM (CXL)";
> +
> +struct cxl_sysram_data {
> + const char *res_name;
> + int mgid;
> + struct resource *res;
> +};
> +
> +static DEFINE_MUTEX(cxl_memory_type_lock);
> +static LIST_HEAD(cxl_memory_types);
> +
> +static struct cxl_region *to_cxl_region(struct device *dev)
> +{
> + if (dev->type != &cxl_region_type)
> + return NULL;
> + return container_of(dev, struct cxl_region, dev);
> +}
What's the reasoning behind redefining this in this file? It's still defined in cxl/core/region.c,
so I would probably just drop the static there and include it through core.h.
> +
> +static struct memory_dev_type *cxl_find_alloc_memory_type(int adist)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + return mt_find_alloc_memory_type(adist, &cxl_memory_types);
> +}
> +
> +static void __maybe_unused cxl_put_memory_types(void)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + mt_put_memory_types(&cxl_memory_types);
> +}
> +
> +static int cxl_sysram_range(struct cxl_region *cxlr, struct range *r)
> +{
> + struct cxl_region_params *p = &cxlr->params;
> +
> + if (!p->res)
> + return -ENODEV;
> +
> + /* memory-block align the hotplug range */
> + r->start = ALIGN(p->res->start, memory_block_size_bytes());
> + r->end = ALIGN_DOWN(p->res->end + 1, memory_block_size_bytes()) - 1;
> + if (r->start >= r->end) {
> + r->start = p->res->start;
> + r->end = p->res->end;
> + return -ENOSPC;
> + }
> + return 0;
> +}
> +
> +static ssize_t hotunplug_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = offline_and_remove_memory(range.start, range_len(&range));
> +
> + if (rc)
Extra blank line above.
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(hotunplug);
> +
> +struct online_memory_cb_arg {
> + int online_type;
> + int rc;
> +};
> +
> +static int online_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + struct online_memory_cb_arg *cb_arg = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_ONLINE)
> + return 0;
> +
> + mem->online_type = cb_arg->online_type;
> + cb_arg->rc = device_online(&mem->dev);
> +
> + return cb_arg->rc;
> +}
> +
> +static int offline_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + int *rc = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_OFFLINE)
> + return 0;
> +
> + *rc = device_offline(&mem->dev);
> +
> + return *rc;
> +}
> +
> +static ssize_t state_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct online_memory_cb_arg cb_arg;
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + return rc;
> +
> + if (sysfs_streq(buf, "online")) {
> + cb_arg.online_type = MMOP_ONLINE_MOVABLE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "online_normal")) {
> + cb_arg.online_type = MMOP_ONLINE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "offline")) {
> + int offline_rc = 0;
> +
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &offline_rc, offline_memory_block_cb);
> + if (!rc)
> + rc = offline_rc;
> + } else {
> + rc = -EINVAL;
> + }
Nit: You can just set rc = -EINVAL before the if statement instead of doing this else clause.
> +
> + unlock_device_hotplug();
> +
> + if (rc)
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(state);
> +
> +static ssize_t hotplug_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct cxl_sysram_data *data;
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + data = dev_get_drvdata(dev);
> + if (!data)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = add_memory_driver_managed(data->mgid, range.start,
> + range_len(&range), sysram_name,
> + MHP_NID_IS_MGID);
> + if (rc)
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(hotplug);
> +
> +static struct attribute *cxl_sysram_region_attrs[] = {
> + &dev_attr_hotunplug.attr,
> + &dev_attr_state.attr,
> + &dev_attr_hotplug.attr,
> + NULL,
> +};
> +
> +static const struct attribute_group cxl_sysram_region_group = {
> + .name = "memctl",
> + .attrs = cxl_sysram_region_attrs,
> +};
> +
> +static void cxl_sysram_unregister(void *_data)
> +{
> + struct cxl_sysram_data *data = _data;
> + struct range range = {
> + .start = data->res->start,
> + .end = data->res->end
> + };
> +
> + /* We have one shot for removal, otherwise it's stuck til reboot */
> + if (!offline_and_remove_memory(range.start, range_len(&range))) {
> + remove_resource(data->res);
> + kfree(data->res);
> + memory_group_unregister(data->mgid);
> + kfree(data->res_name);
> + kfree(data);
> + return;
> + }
> + pr_err("CXL: %#llx-%#llx cannot be hotremoved until next reboot\n",
> + range.start, range.end);
> +}
> +
> +int devm_cxl_add_sysram_region(struct cxl_region *cxlr)
> +{
> + struct cxl_region_params *p = &cxlr->params;
> + struct device *dev = &cxlr->dev;
> + struct cxl_sysram_data *data;
> + struct memory_dev_type *mtype;
> + unsigned long total_len = 0;
> + struct resource *res;
> + struct range range;
> + mhp_t mhp_flags;
> + int numa_node;
> + int adist = MEMTIER_DEFAULT_CXL_ADISTANCE;
> + int rc;
> +
> + numa_node = phys_to_target_node(p->res->start);
> + if (numa_node < 0) {
> + dev_warn(dev, "rejecting CXL region with invalid node: %d\n",
> + numa_node);
> + return -EINVAL;
> + }
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc) {
> + dev_info(dev, "range %#llx-%#llx too small after alignment\n",
> + range.start, range.end);
This should probably be a warning instead. You do it for the next check which is essentially the same
case, so may as well do it here.
> + return rc;
> + }
> + total_len = range_len(&range);
> +
> + if (!total_len) {
> + dev_warn(dev, "rejecting CXL region without any memory after alignment\n");
> + return -EINVAL;
> + }
I don't think this check is needed. cxl_sysram_range() already returns -ENOSPC when range->start >= range->end
(i.e. nothing usable is left after alignment), so the error path above returns before this check is reached.
> +
> + mt_calc_adistance(numa_node, &adist);
> + mtype = cxl_find_alloc_memory_type(adist);
> + if (IS_ERR(mtype))
> + return PTR_ERR(mtype);
> +
> + init_node_memory_type(numa_node, mtype);
> +
> + data = kzalloc(sizeof(*data), GFP_KERNEL);
> + if (!data) {
> + rc = -ENOMEM;
> + goto err_data;
> + }
> +
> + data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> + if (!data->res_name) {
> + rc = -ENOMEM;
> + goto err_res_name;
> + }
> +
> + rc = memory_group_register_static(numa_node, PFN_UP(total_len));
> + if (rc < 0)
> + goto err_reg_mgid;
> + data->mgid = rc;
> +
> + /* Region is permanently reserved if hotremove fails when unbinding. */
> + res = request_mem_region(range.start, range_len(&range),
> + data->res_name);
> + if (!res) {
> + dev_warn(dev, "range %#llx-%#llx could not reserve region\n",
> + range.start, range.end);
> + rc = -EBUSY;
> + goto err_request_mem;
> + }
> + data->res = res;
> +
> + /*
> + * Setup flags for System RAM. Leave _BUSY clear so add_memory() can add
> + * a child resource. Do not inherit flags from parent since it may set
> + * flags unknown to us that will the break add_memory() below.
> + */
> + res->flags = IORESOURCE_SYSTEM_RAM;
> + mhp_flags = MHP_NID_IS_MGID;
> + rc = add_memory_driver_managed(data->mgid, range.start,
> + range_len(&range), sysram_name, mhp_flags);
Looks like mhp_flags is only used once; I'd get rid of it and just use MHP_NID_IS_MGID instead.
> + if (rc) {
> + dev_warn(dev, "range %#llx-%#llx memory add failed\n",
> + range.start, range.end);
> + goto err_add_memory;
> + }
> + dev_dbg(dev, "%s: added %llu bytes as System RAM\n", dev_name(dev),
> + (unsigned long long)total_len);
> +
> + dev_set_drvdata(dev, data);
> + rc = devm_device_add_group(dev, &cxl_sysram_region_group);
> + if (rc)
> + goto err_add_group;
> +
> + return devm_add_action_or_reset(dev, cxl_sysram_unregister, data);
> +
> +err_add_group:
> + dev_set_drvdata(dev, NULL);
> + /* if this fails, memory cannot be removed from the system until reboot */
> + remove_memory(range.start, range_len(&range));
> +err_add_memory:
> + remove_resource(res);
> + kfree(res);
> +err_request_mem:
> + memory_group_unregister(data->mgid);
> +err_reg_mgid:
> + kfree(data->res_name);
> +err_res_name:
> + kfree(data);
> +err_data:
> + clear_node_memory_type(numa_node, mtype);
> + return rc;
> +}
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 02d7d9ae0252..eeab091f043a 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -639,6 +639,9 @@ static ssize_t ctrl_show(struct device *dev, struct device_attribute *attr,
> case CXL_MEMCTRL_DAX:
> desc = "dax";
> break;
> + case CXL_MEMCTRL_SYSRAM:
> + desc = "sysram";
> + break;
> default:
> desc = "";
> break;
> @@ -663,6 +666,8 @@ static ssize_t ctrl_store(struct device *dev, struct device_attribute *attr,
>
> if (sysfs_streq(buf, "dax"))
> cxlr->memctrl = CXL_MEMCTRL_DAX;
> + else if (sysfs_streq(buf, "sysram"))
> + cxlr->memctrl = CXL_MEMCTRL_SYSRAM;
> else
> return -EINVAL;
>
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b8fabaa77262..bb4f877b4e8f 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -506,13 +506,15 @@ enum cxl_partition_mode {
> /*
> * Memory Controller modes:
> * None - No controller selected
> - * Auto - either BIOS-configured as SysRAM, or default to DAX
> - * DAX - creates a dax_region controller for the cxl_region
> + * Auto - either BIOS-configured as SysRAM, or default to DAX
> + * DAX - creates a dax_region controller for the cxl_region
> + * SYSRAM - hotplugs the region directly as System RAM
> */
> enum cxl_memctrl_mode {
> CXL_MEMCTRL_NONE,
> CXL_MEMCTRL_AUTO,
> CXL_MEMCTRL_DAX,
> + CXL_MEMCTRL_SYSRAM,
> };
>
> /*
On Mon, Jan 12, 2026 at 03:10:41PM -0600, Cheatham, Benjamin wrote:
> On 1/12/2026 10:35 AM, Gregory Price wrote:
> > Add a sysram memctrl that directly hotplugs memory without needing to
> > route through DAX. This simplifies the sysram usecase considerably.
> >
> > The sysram memctl adds new sysfs controls when registered:
> > region/memctrl/[hotplug, hotunplug, state]
> >
> > hotplug: controller attempts to hotplug the memory region
> > hotunplug: controller attempts to offline and hotunplug the memory region
>
> Nit: Would it be better to use hotadd/hotremove here instead of hotplug/hotunplug? The terms
> are basically synonymous, but I think hotadd and hotremove are more descriptive.
I will defer to David on this. I think keeping the terminology
consistent is better, but also hotplug is overloaded between physical
and logical. It ultimately means the same thing to be honest.
> > state: [online,online_normal,offline]
> > online : controller onlines blocks in ZONE_MOVABLE
> > online_normal: controller onlines blocks in ZONE_NORMAL
>
> The naming for online states could be improved imo. I understand and agree with the motivation
> behind the names, but I could see the use of the word "normal" being confusing to less savvy users.
> You could change it to include the zone for both (online_movable/online_normal), but I think it may
> be easier to mark which one has drawbacks, i.e. change "online_normal" to something like "online_nonremovable".
> That way, anyone who doesn't want to go find the documentation for these can understand the user-visible
> impact.
>
> In any case, all of these attributes need ABI documentation as well.
>
This is what I was getting at originally. I will consider the other
feedback and spin a v2 with this simplified a bit.
I'm leaning towards agreeing with Dan and David that probably we just
keep online/online_movable since it's consistent with base/memory.c, but
we can continue to have this argument.
I don't think we can reasonably get away from users of this interface
understanding the implications of ZONEs, since whatever they choose to
do dictates what zone the memory gets added to.
> > +static DEFINE_MUTEX(cxl_memory_type_lock);
> > +static LIST_HEAD(cxl_memory_types);
> > +
> > +static struct cxl_region *to_cxl_region(struct device *dev)
> > +{
> > + if (dev->type != &cxl_region_type)
> > + return NULL;
> > + return container_of(dev, struct cxl_region, dev);
> > +}
>
> What's the reasoning behind redefining this in this file? It's still defined in cxl/core/region.c,
> so I would probably just drop the static there and include it through core.h.
>
Just cruft from rapidly moving stuff around. Will fixup.
> > + rc = cxl_sysram_range(cxlr, &range);
> > + if (rc) {
> > + dev_info(dev, "range %#llx-%#llx too small after alignment\n",
> > + range.start, range.end);
>
> This should probably be a warning instead. You do it for the next check which is essentially the same
> case, so may as well do it here.
ack.
> > + if (!total_len) {
> > + dev_warn(dev, "rejecting CXL region without any memory after alignment\n");
> > + return -EINVAL;
> > + }
>
> I don't think this check is needed. cxl_sysram_range() checks if the range->start == range->end (i.e. size == 0)
> and errors out. That should cause the above check to error out before this.
ack
> > + /*
> > + * Setup flags for System RAM. Leave _BUSY clear so add_memory() can add
> > + * a child resource. Do not inherit flags from parent since it may set
> > + * flags unknown to us that will the break add_memory() below.
> > + */
> > + res->flags = IORESOURCE_SYSTEM_RAM;
> > + mhp_flags = MHP_NID_IS_MGID;
> > + rc = add_memory_driver_managed(data->mgid, range.start,
> > + range_len(&range), sysram_name, mhp_flags);
>
> Look like mhp_flags is only used once, I'd get rid of it and just use MHP_NID_IS_MGID instead.
>
ack - yeah this was cribbed from dax.c
Thank you!
~Gregory
On 1/12/2026 4:55 PM, Gregory Price wrote: > On Mon, Jan 12, 2026 at 03:10:41PM -0600, Cheatham, Benjamin wrote: >> On 1/12/2026 10:35 AM, Gregory Price wrote: >>> Add a sysram memctrl that directly hotplugs memory without needing to >>> route through DAX. This simplifies the sysram usecase considerably. >>> >>> The sysram memctl adds new sysfs controls when registered: >>> region/memctrl/[hotplug, hotunplug, state] >>> >>> hotplug: controller attempts to hotplug the memory region >>> hotunplug: controller attempts to offline and hotunplug the memory region >> >> Nit: Would it be better to use hotadd/hotremove here instead of hotplug/hotunplug? The terms >> are basically synonymous, but I think hotadd and hotremove are more descriptive. > > I will defer to David on this. I think keeping the terminology > consistent is better, but also hotplug is overloaded between physical > and logical. It ultimately means the same thing to be honest. I agree, I'm fine with either here. > >>> state: [online,online_normal,offline] >>> online : controller onlines blocks in ZONE_MOVABLE >>> online_normal: controller onlines blocks in ZONE_NORMAL >> >> The naming for online states could be improved imo. I understand and agree with the motivation >> behind the names, but I could see the use of the word "normal" being confusing to less savvy users. >> You could change it to include the zone for both (online_movable/online_normal), but I think it may >> be easier to mark which one has drawbacks, i.e. change "online_normal" to something like "online_nonremovable". >> That way, anyone who doesn't want to go find the documentation for these can understand the user-visible >> impact. >> >> In any case, all of these attributes need ABI documentation as well. >> > > This is what i was getting at originally, I will consider the other > feedback and spin a v2 with this simplified a bit. 
> > I'm leaning towards agreeing with Dan and David that probably we just > keep online/online_movable since it's consistent with base/memory.c, but > we can continue to have this argument. > > I don't think we can reasonable get away from users of this interface > understanding the implications of ZONEs, since whatever they choose to > do dictates what zone the memory gets added to. That sounds reasonable. I was going under the assumption that someone may come along who doesn't know much about zones, which probably isn't very likely. So if we want to ditch that assumption it's fine by me.
On 1/12/26 17:35, Gregory Price wrote:
> Add a sysram memctrl that directly hotplugs memory without needing to
> route through DAX. This simplifies the sysram usecase considerably.
>
> The sysram memctl adds new sysfs controls when registered:
> region/memctrl/[hotplug, hotunplug, state]
>
> hotplug: controller attempts to hotplug the memory region
Why disconnect the hotplug from the online state?
echo online_movable > hotplug ?
Then we can just have something like add_and_online_memory() in the core.
> hotunplug: controller attempts to offline and hotunplug the memory region
> state: [online,online_normal,offline]
> online : controller onlines blocks in ZONE_MOVABLE
I don't like this inconsistency regarding the remainder of common hotplug
toggles.
We should use exactly the same values with exactly the same semantics.
Yes, user-space tooling should be taught to pass in online_movable :)
> online_normal: controller onlines blocks in ZONE_NORMAL
> offline : controller attempts to offline the memory blocks
Why is that required? Ideally we'd start with hotplug vs. hotunplug and
leave manual onlining/offlining out of this interface for now.
>
> Hotplug note - by default the controller will hotplug the blocks, but
> leave them offline (unless MHP auto-online in Kconfig is enabled).
>
> Setting state to "online_normal" may prevent future hot-unplug of sysram
> regions, and unbinding a memory region with memory online in ZONE_NORMAL
> may result in the device being removed but the memory remaining online.
>
> This can result in future management functions failing (such as adding a
> new region). This is why "online_normal" is explicit, and the default
> online zone is ZONE_MOVABLE.
>
> Cc: David Hildenbrand <david@kernel.org>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
> drivers/cxl/core/core.h | 2 +
> drivers/cxl/core/memctrl/Makefile | 1 +
> drivers/cxl/core/memctrl/memctrl.c | 2 +
> drivers/cxl/core/memctrl/sysram_region.c | 358 +++++++++++++++++++++++
> drivers/cxl/core/region.c | 5 +
> drivers/cxl/cxl.h | 6 +-
> 6 files changed, 372 insertions(+), 2 deletions(-)
> create mode 100644 drivers/cxl/core/memctrl/sysram_region.c
>
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 1156a4bd0080..18cb84950500 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -31,6 +31,8 @@ int cxl_decoder_detach(struct cxl_region *cxlr,
> struct cxl_endpoint_decoder *cxled, int pos,
> enum cxl_detach_mode mode);
>
> +int devm_cxl_add_sysram_region(struct cxl_region *cxlr);
> +
> #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
> #define CXL_REGION_TYPE(x) (&cxl_region_type)
> #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
> diff --git a/drivers/cxl/core/memctrl/Makefile b/drivers/cxl/core/memctrl/Makefile
> index 8165aad5a52a..1c52c7d75570 100644
> --- a/drivers/cxl/core/memctrl/Makefile
> +++ b/drivers/cxl/core/memctrl/Makefile
> @@ -2,3 +2,4 @@
>
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/memctrl.o
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/dax_region.o
> +cxl_core-$(CONFIG_CXL_REGION) += memctrl/sysram_region.o
> diff --git a/drivers/cxl/core/memctrl/memctrl.c b/drivers/cxl/core/memctrl/memctrl.c
> index 24e0e14b39c7..40ffb59353bb 100644
> --- a/drivers/cxl/core/memctrl/memctrl.c
> +++ b/drivers/cxl/core/memctrl/memctrl.c
> @@ -34,6 +34,8 @@ int cxl_enable_memctrl(struct cxl_region *cxlr)
> return devm_cxl_add_dax_region(cxlr);
> case CXL_MEMCTRL_DAX:
> return devm_cxl_add_dax_region(cxlr);
> + case CXL_MEMCTRL_SYSRAM:
> + return devm_cxl_add_sysram_region(cxlr);
> default:
> return -EINVAL;
> }
> diff --git a/drivers/cxl/core/memctrl/sysram_region.c b/drivers/cxl/core/memctrl/sysram_region.c
> new file mode 100644
> index 000000000000..a7570c8a54e1
> --- /dev/null
> +++ b/drivers/cxl/core/memctrl/sysram_region.c
> @@ -0,0 +1,358 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright(c) 2026 Meta Inc. All rights reserved. */
> +#include <linux/memremap.h>
> +#include <linux/memory.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/mm.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/memory_hotplug.h>
> +#include <linux/string_helpers.h>
> +#include <linux/sched/signal.h>
> +#include <cxlmem.h>
> +#include <cxl.h>
> +#include "../core.h"
> +
> +/* If HMAT was unavailable, assign a default distance. */
> +#define MEMTIER_DEFAULT_CXL_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5)
> +
> +static const char *sysram_name = "System RAM (CXL)";
> +
> +struct cxl_sysram_data {
> + const char *res_name;
> + int mgid;
> + struct resource *res;
> +};
> +
> +static DEFINE_MUTEX(cxl_memory_type_lock);
> +static LIST_HEAD(cxl_memory_types);
> +
> +static struct cxl_region *to_cxl_region(struct device *dev)
> +{
> + if (dev->type != &cxl_region_type)
> + return NULL;
> + return container_of(dev, struct cxl_region, dev);
> +}
> +
> +static struct memory_dev_type *cxl_find_alloc_memory_type(int adist)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + return mt_find_alloc_memory_type(adist, &cxl_memory_types);
> +}
> +
> +static void __maybe_unused cxl_put_memory_types(void)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + mt_put_memory_types(&cxl_memory_types);
> +}
> +
> +static int cxl_sysram_range(struct cxl_region *cxlr, struct range *r)
> +{
> + struct cxl_region_params *p = &cxlr->params;
> +
> + if (!p->res)
> + return -ENODEV;
> +
> + /* memory-block align the hotplug range */
> + r->start = ALIGN(p->res->start, memory_block_size_bytes());
> + r->end = ALIGN_DOWN(p->res->end + 1, memory_block_size_bytes()) - 1;
> + if (r->start >= r->end) {
> + r->start = p->res->start;
> + r->end = p->res->end;
> + return -ENOSPC;
> + }
> + return 0;
> +}
> +
> +static ssize_t hotunplug_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = offline_and_remove_memory(range.start, range_len(&range));
> +
> + if (rc)
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(hotunplug);
> +
> +struct online_memory_cb_arg {
> + int online_type;
> + int rc;
> +};
> +
> +static int online_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + struct online_memory_cb_arg *cb_arg = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_ONLINE)
> + return 0;
> +
> + mem->online_type = cb_arg->online_type;
> + cb_arg->rc = device_online(&mem->dev);
> +
> + return cb_arg->rc;
> +}
> +
> +static int offline_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + int *rc = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_OFFLINE)
> + return 0;
> +
> + *rc = device_offline(&mem->dev);
> +
> + return *rc;
> +}
> +
> +static ssize_t state_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct online_memory_cb_arg cb_arg;
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + return rc;
> +
> + if (sysfs_streq(buf, "online")) {
> + cb_arg.online_type = MMOP_ONLINE_MOVABLE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "online_normal")) {
> + cb_arg.online_type = MMOP_ONLINE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "offline")) {
> + int offline_rc = 0;
> +
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &offline_rc, offline_memory_block_cb);
> + if (!rc)
> + rc = offline_rc;
Let's expose this functionality through some common-code helpers. I
really don't want more code doing this non-obvious device_offline() etc
dance.
walk_memory_blocks() should become a core-mm helper. Maybe we can also
cleanup drivers/acpi/acpi_memhotplug.c in that regard.
Hopefully we can then also reuse these helpers in ppc code (see
dlpar_add_lmb() and dlpar_remove_lmb() that do something similar, but
grab the device hotplug lock themselves as they want to perform some
additional operations).
--
Cheers
David
On Mon, Jan 12, 2026 at 09:00:54PM +0100, David Hildenbrand (Red Hat) wrote:
> On 1/12/26 17:35, Gregory Price wrote:
> > Add a sysram memctrl that directly hotplugs memory without needing to
> > route through DAX. This simplifies the sysram usecase considerably.
> >
> > The sysram memctl adds new sysfs controls when registered:
> > region/memctrl/[hotplug, hotunplug, state]
> >
> > hotplug: controller attempts to hotplug the memory region
>
> Why disconnect the hotplug from the online state?
>
> echo online_movable > hotplug ?
>
> Then we can just have something like add_and_online_memory() in the core.
>
Mostly I cobbled this together over the weekend to have it for
discussion at the community DAX meeting.
I think just having
[offline,online,online_movable] > hotplug
is probably the better option. There's not much use in a memory_region
control that lets you offline the memory but not remove the blocks.
I mean, I know of *a* use for that, and it's not something we want to
support :]
> > hotunplug: controller attempts to offline and hotunplug the memory region
> > state: [online,online_normal,offline]
> > online : controller onlines blocks in ZONE_MOVABLE
>
> I don't like this incosistency regarding the remainder of common hotplug
> toggles.
>
> We should use exactly the same values with exactly the same semantics. Yes,
> user-space tooling should be thaught to pass in online_movable :)
>
> > online_normal: controller onlines blocks in ZONE_NORMAL
> > offline : controller attempts to offline the memory blocks
>
> Why is that required? ideally we'd start with hotplug vs. hotunplug and
> leave manual onlining/offlining out of this interface for now.
>
That is fair, although I would like a build option to default the online
mode to ZONE_MOVABLE for auto-configured sysram regions w/ the SP bit
set, otherwise that will be forever locked to using the DAX model.
> > + } else if (sysfs_streq(buf, "offline")) {
> > + int offline_rc = 0;
> > +
> > + rc = walk_memory_blocks(range.start, range_len(&range),
> > + &offline_rc, offline_memory_block_cb);
> > + if (!rc)
> > + rc = offline_rc;
>
> Let's expose this functionality through some common-code helpers. I really
> don't want more code doing this non-obvious device_offline() etc dance.
>
> walk_memory_blocks() should become a core-mm helper. Maybe we can also
> cleanup drivers/acpi/acpi_memhotplug.c in that regard.
>
> Hopefully we can then also reuse these helpers in ppc code (see
> dlpar_add_lmb() and dlpar_remove_lmb() that do something similar, but grab
> the device hotplug lock themselves as they want to perform some additional
> operations).
>
I'll take a look.
Thanks!
~Gregory
© 2016 - 2026 Red Hat, Inc.