Provide a new interface for dynamic configuration and deconfiguration of
hotplug memory, allowing for mixed altmap and non-altmap support. It is
a follow-up on the discussion with David:
https://lore.kernel.org/all/ee492da8-74b4-4a97-8b24-73e07257f01d@redhat.com/
As mentioned in the discussion, advantages of the new interface are:
* Users can dynamically specify which memory ranges should have altmap
support, rather than having it statically enabled or disabled for all
hot-plugged memory.
* In the long term, users could specify a memory range, including
multiple blocks, and whether they want altmap support for that range.
This could allow for the altmap block grouping, or even variable-sized
blocks, in the future, i.e. "grouping" memory blocks that share the same
altmap located on the first memory blocks in the group, reducing
fragmentation due to altmap.
To leverage these advantages:
Create a sysfs interface /sys/bus/memory/devices/configure_memory, which
performs runtime (de)configuration of memory with altmap or non-altmap
support. The interface validates the memory ranges against architecture
specific memory configuration and performs add_memory()/remove_memory().
Dynamic (de)configuration of memory is made configurable via config
CONFIG_RUNTIME_MEMORY_CONFIGURATION.
Usage format for the new interface:
echo config_mode,memoryrange,altmap_mode >
/sys/bus/memory/devices/configure_memory
E.g. to configure a range with altmap:
echo 1,0x200000000-0x20fffffff,1 > /sys/bus/memory/devices/configure_memory
This interface could help to make s390 more flexible and more similar to
other architectures (wrt adding hotplug memory in advance). It might also
be possible to provide the dynamically configured altmap support for
others. E.g. instead of directly doing an add_memory() in the ACPI
handler, with the static altmap setting, one could instead defer that to
the new interface which allows dynamic altmap configuration.
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
---
drivers/base/memory.c | 124 +++++++++++++++++++++++++++++++++++++++++
include/linux/memory.h | 6 ++
mm/Kconfig | 16 ++++++
3 files changed, 146 insertions(+)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 67858eeb92ed..f024444b3301 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -631,6 +631,127 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
return 0;
}
+#ifdef CONFIG_RUNTIME_MEMORY_CONFIGURATION
+enum { /* config_mode values accepted by configure_memory_store() */
+ REMOVE_MEMORY = 0, /* "0": remove the given range */
+ ADD_MEMORY, /* "1": add the given range */
+ MAX_CONFIGURE_MODE /* exclusive upper bound used for input validation */
+};
+
+enum { /* altmap_mode values accepted by configure_memory_store() */
+ NOALTMAP = 0, /* "0": no altmap support */
+ ALTMAP, /* "1": altmap support */
+ MAX_ALTMAP_MODE /* exclusive upper bound used for input validation */
+};
+
+/*
+ * Return true when the memory range is valid.
+ *
+ * @start: first byte of the range
+ * @end:   last byte of the range (inclusive)
+ *
+ * Architecture specific code can override the below function and validate the
+ * memory range against its possible memory configurations.
+ *
+ * This weak default rejects every range, so configure_memory_store() fails
+ * with -EINVAL unless the architecture provides its own implementation.
+ */
+bool __weak arch_validate_memory_range(unsigned long long start,
+ unsigned long long end)
+{
+ return false;
+}
+
+/*
+ * configure_memory_store() - add or remove hotplug memory at runtime.
+ *
+ * Format:
+ *   echo config_mode,memoryrange,altmap_mode >
+ *        /sys/bus/memory/devices/configure_memory
+ *
+ * config_mode:
+ *   value: 1 - add_memory, 0 - remove_memory
+ *
+ * range:
+ *   0x<start address>-0x<end address>
+ *   Where start address is aligned to memory block size and end address
+ *   represents last byte in the range.
+ *   example: 0x200000000-0x20fffffff
+ *
+ * altmap_mode:
+ *   value: 1 - altmap support, 0 - no altmap support
+ *   (parsed and range-checked for removal too, but only used when adding)
+ *
+ * The range is processed one memory block at a time while holding the device
+ * hotplug lock. Processing stops at the first block that fails; blocks that
+ * were handled before that point are not rolled back.
+ *
+ * Return: @count on success, otherwise a negative errno:
+ *   -EINVAL  malformed input, unaligned or invalid range, range rejected by
+ *            arch_validate_memory_range(), or a block to remove is not
+ *            configured
+ *   -EEXIST  a block to add is already configured
+ *   -EBUSY   a block to remove is not offline
+ *   or the error returned by lock_device_hotplug_sysfs() / __add_memory().
+ */
+static ssize_t configure_memory_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	/* u64 matches the %llx conversions and the address type of the callees. */
+	u64 start, end, addr;
+	unsigned long block_size;
+	u32 config_mode, altmap_mode;
+	struct memory_block *mem;
+	mhp_t mhp_flags;
+	bool offline;
+	int nid, ret;
+
+	if (sscanf(buf, "%u,0x%llx-0x%llx,%u", &config_mode, &start, &end,
+		   &altmap_mode) != 4)
+		return -EINVAL;
+
+	if (config_mode >= MAX_CONFIGURE_MODE || altmap_mode >= MAX_ALTMAP_MODE)
+		return -EINVAL;
+
+	/*
+	 * Check the bounds before doing any "end + 1" arithmetic. Addresses
+	 * with the top bit set are rejected, which also guarantees that
+	 * end + 1 cannot wrap around.
+	 */
+	if (start >= end || end > (u64)S64_MAX)
+		return -EINVAL;
+
+	block_size = memory_block_size_bytes();
+	if (!IS_ALIGNED(start, block_size) || !IS_ALIGNED(end + 1, block_size))
+		return -EINVAL;
+
+	if (!arch_validate_memory_range(start, end))
+		return -EINVAL;
+
+	/* Keep the hotplug flags in their own mhp_t instead of reusing the u32. */
+	mhp_flags = altmap_mode == ALTMAP ?
+		    MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE;
+
+	ret = lock_device_hotplug_sysfs();
+	if (ret)
+		return ret;
+
+	for (addr = start; addr <= end; addr += block_size) {
+		mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
+
+		if (config_mode == ADD_MEMORY) {
+			if (mem) {
+				pr_info("Memory already configured - (start:0x%llx)\n", addr);
+				put_device(&mem->dev);
+				ret = -EEXIST;
+				break;
+			}
+			nid = memory_add_physaddr_to_nid(addr);
+			ret = __add_memory(nid, addr, block_size, mhp_flags);
+			if (ret) {
+				pr_info("Memory addition failed - (start:0x%llx)\n", addr);
+				break;
+			}
+			continue;
+		}
+
+		/* config_mode == REMOVE_MEMORY */
+		if (!mem) {
+			pr_info("Memory not configured - (start:0x%llx)\n", addr);
+			ret = -EINVAL;
+			break;
+		}
+		offline = mem->state == MEM_OFFLINE;
+		/* drop the ref just got via find_memory_block() */
+		put_device(&mem->dev);
+		if (!offline) {
+			pr_info("Memory removal failed - (start:0x%llx) not offline\n",
+				addr);
+			ret = -EBUSY;
+			break;
+		}
+		__remove_memory(addr, block_size);
+	}
+
+	unlock_device_hotplug();
+	return ret ? ret : count;
+}
+static DEVICE_ATTR_WO(configure_memory);
+#endif /* CONFIG_RUNTIME_MEMORY_CONFIGURATION */
+
/*
* A reference for the returned memory block device is acquired.
*
@@ -941,6 +1062,9 @@ static struct attribute *memory_root_attrs[] = {
&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
&dev_attr_crash_hotplug.attr,
+#endif
+#ifdef CONFIG_RUNTIME_MEMORY_CONFIGURATION
+ &dev_attr_configure_memory.attr,
#endif
NULL
};
diff --git a/include/linux/memory.h b/include/linux/memory.h
index c0afee5d126e..88b2b374bc44 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -85,6 +85,12 @@ struct memory_block {
#endif
};
+#ifdef CONFIG_RUNTIME_MEMORY_CONFIGURATION
+bool arch_validate_memory_range(unsigned long long start, unsigned long long end);
+ssize_t arch_get_memory_max_configurable(void); /* NOTE(review): no definition or caller in this patch - confirm */
+int memory_create_sysfs_max_configurable(void); /* NOTE(review): no definition or caller in this patch - confirm */
+#endif /* CONFIG_RUNTIME_MEMORY_CONFIGURATION */
+
int arch_get_memory_phys_device(unsigned long start_pfn);
unsigned long memory_block_size_bytes(void);
int set_memory_block_size_order(unsigned int order);
diff --git a/mm/Kconfig b/mm/Kconfig
index 84000b016808..2aec2fc3fb25 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -576,6 +576,22 @@ config MHP_MEMMAP_ON_MEMORY
depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+# Runtime add/remove of memory ranges via
+# /sys/bus/memory/devices/configure_memory. Defaults to n.
+config RUNTIME_MEMORY_CONFIGURATION
+	bool "Dynamic configuration and deconfiguration of memory"
+	depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+	help
+	  This option provides support to perform dynamic configuration and
+	  deconfiguration of memory with altmap or non-altmap support
+	  (/sys/bus/memory/devices/configure_memory). The interface validates
+	  the configuration and deconfiguration of memory ranges against
+	  architecture specific configuration and performs add_memory() with
+	  altmap or non-altmap support and remove_memory() respectively.
+
+	  Say Y here if the architecture supports validating dynamically
+	  (de)configured memory against architecture specific memory
+	  configurations.
+
endif # MEMORY_HOTPLUG
config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
--
2.45.2
On 02.12.24 09:27, Sumanth Korikkar wrote: > Provide a new interface for dynamic configuration and deconfiguration of > hotplug memory, allowing for mixed altmap and non-altmap support. It is > a follow-up on the discussion with David: > > https://lore.kernel.org/all/ee492da8-74b4-4a97-8b24-73e07257f01d@redhat.com/ > > As mentioned in the discussion, advantages of the new interface are: > > * Users can dynamically specify which memory ranges should have altmap > support, rather than having it statically enabled or disabled for all > hot-plugged memory. > > * In the long term, user could specify a memory range, including > multiple blocks, and whether user wants altmap support for that range. > This could allow for the altmap block grouping, or even variable-sized > blocks, in the future. i.e. "grouping" memory blocks that share a same > altmap located on the first memory blocks in the group and reduce > fragementation due to altmap. > > To leverage these advantages: > Create a sysfs interface /sys/bus/memory/devices/configure_memory, which > performs runtime (de)configuration of memory with altmap or non-altmap > support. The interface validates the memory ranges against architecture > specific memory configuration and performs add_memory()/remove_memory(). > Dynamic (de)configuration of memory is made configurable via config > CONFIG_RUNTIME_MEMORY_CONFIGURATION. Hi! Not completely what I had in mind, especially not that we need something that generic without any indication of ranges :) In general, the flow is as follows: 1) Driver detects memory and adds it 2) Something auto-onlines that memory (e.g., udev rule) For dax/kmem, 1) can be controlled using devdax, and usually it also tries to take care of 2). s390x standby storage really is the weird thing here, because it does 1) and doesn't want 2). It shouldn't do 1) until a user wants to make use of standby memory. My thinking was that s390x would expose the standby memory ranges somewhere arch specific in sysfs. 
From there, one could simply trigger the adding (maybe specifying e.g, memmap_on_memory) of selected ranges. To disable standby memory, one would first offline the memory to then trigger removal using the arch specific interface. It is very similar to dax/kmem's way of handling offline+removal. Now I wonder if dax/kmem could be (ab)used on s390x for standby storage. Likely a simple sysfs interface could be easier to implement. -- Cheers, David / dhildenb
On Mon, Dec 02, 2024 at 05:55:19PM +0100, David Hildenbrand wrote: > Hi! > > Not completely what I had in mind, especially not that we need something > that generic without any indication of ranges :) > > In general, the flow is as follows: > > 1) Driver detects memory and adds it > 2) Something auto-onlines that memory (e.g., udev rule) > > For dax/kmem, 1) can be controlled using devdax, and usually it also tries > to take care of 2). > > s390x standby storage really is the weird thing here, because it does 1) and > doesn't want 2). It shouldn't do 1) until a user wants to make use of > standby memory. Hi David, The current rfc design doesnt do 1) until user initiates it. The current rfc design considers the fact that there cannot be memory holes, when there is a availability of standby memory. (which holds true for both lpars and zvms) With number of online and standby memory ranges count (max_configurable), prototype lsmem/chmem could determine memory ranges which are not yet configured i.e. (configurable_memory = max_configurable - online ranges from sysfs /sys/devices/system/memory/memory*). 
Example prototype implementation of lsmem/chmem looks like: ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP RANGE SIZE STATE BLOCK ALTMAP 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 0x0000000300000000-0x00000003ffffffff 4G deconfigured 96-127 - # Configure range with altmap ./chmem -c 0x0000000300000000-0x00000003ffffffff -a ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP RANGE SIZE STATE BLOCK ALTMAP 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 0x0000000300000000-0x00000003ffffffff 4G offline 96-127 1 # Online range ./chmem -e 0x0000000300000000-0x00000003ffffffff && ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP RANGE SIZE STATE BLOCK ALTMAP 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 0x0000000300000000-0x00000003ffffffff 4G online 96-127 1 Memory block size: 128M Total online memory: 16G Total offline memory: 0B Total deconfigured: 0B # offline range ./chmem -d 0x0000000300000000-0x00000003ffffffff && ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP RANGE SIZE STATE BLOCK ALTMAP 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 0x0000000300000000-0x00000003ffffffff 4G offline 96-127 1 Memory block size: 128M Total online memory: 12G Total offline memory: 4G Total deconfigured: 0B # Defconfigure range. ./chmem -g 0x0000000300000000-0x00000003ffffffff && ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP RANGE SIZE STATE BLOCK ALTMAP 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 0x0000000300000000-0x00000003ffffffff 4G deconfigured 96-127 - Memory block size: 128M Total online memory: 12G Total offline memory: 0B Total deconfigured: 4G The user can still determine the available memory ranges and make them configurable using tools like lsmem or chmem with this approach atleast on s390 with this approach. > My thinking was that s390x would expose the standby memory ranges somewhere > arch specific in sysfs. From there, one could simply trigger the adding > (maybe specifying e.g, memmap_on_memory) of selected ranges. 
As far as I understand, sysfs interface limits the size of the buffer used in show() to 4kb. When there are huge number of standby memory ranges, wouldnt it be an issue to display everything in one attribute? Or use sysfs binary attributes to overcome the limitation? Please correct me, If I am wrong. Questions: 1. If we go ahead with this sysfs interface approach to list all standby memory ranges, could the list be made available via /sys/devices/system/memory/configurable_memlist? This could be helpful, as /sys/devices/system/memory/configure_memory performs architecture independent checks and could also be useful for other architectures in the future. 2. Whether the new interface should also be compatible with lsmem/chmem? 3. OR can we have a s390 specific path (eg: /sys/firmware/memory/standy_range) to list all standby memory range which are in deconfigured state and also use the current design (max_configurable) to make it easier for lsmem/chmem tool to detect these standby memory ranges? > To disable standby memory, one would first offline the memory to then > trigger removal using the arch specific interface. It is very similar to > dax/kmem's way of handling offline+removal. ok > Now I wonder if dax/kmem could be (ab)used on s390x for standby storage. > Likely a simple sysfs interface could be easier to implement. I havent checked dax/kmem in detail yet. I will look into it. Thank you
On 03.12.24 15:33, Sumanth Korikkar wrote: > On Mon, Dec 02, 2024 at 05:55:19PM +0100, David Hildenbrand wrote: >> Hi! >> >> Not completely what I had in mind, especially not that we need something >> that generic without any indication of ranges :) >> >> In general, the flow is as follows: >> >> 1) Driver detects memory and adds it >> 2) Something auto-onlines that memory (e.g., udev rule) >> >> For dax/kmem, 1) can be controlled using devdax, and usually it also tries >> to take care of 2). >> >> s390x standby storage really is the weird thing here, because it does 1) and >> doesn't want 2). It shouldn't do 1) until a user wants to make use of >> standby memory. > > Hi David, Hi, sorry for the late reply. Cleaning up (some of) my inbox before Christmas, and I realized I skipped this mail. > > The current rfc design doesnt do 1) until user initiates it. > > The current rfc design considers the fact that there cannot be memory > holes, when there is a availability of standby memory. (which holds true > for both lpars and zvms) > > With number of online and standby memory ranges count > (max_configurable), prototype lsmem/chmem could determine memory ranges > which are not yet configured > i.e. (configurable_memory = max_configurable - online ranges from sysfs > /sys/devices/system/memory/memory*). 
> > Example prototype implementation of lsmem/chmem looks like: > ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP > RANGE SIZE STATE BLOCK ALTMAP > 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 > 0x0000000300000000-0x00000003ffffffff 4G deconfigured 96-127 - > > # Configure range with altmap > ./chmem -c 0x0000000300000000-0x00000003ffffffff -a > ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP > RANGE SIZE STATE BLOCK ALTMAP > 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 > 0x0000000300000000-0x00000003ffffffff 4G offline 96-127 1 > > > # Online range > ./chmem -e 0x0000000300000000-0x00000003ffffffff && > ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP > RANGE SIZE STATE BLOCK ALTMAP > 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 > 0x0000000300000000-0x00000003ffffffff 4G online 96-127 1 > > Memory block size: 128M > Total online memory: 16G > Total offline memory: 0B > Total deconfigured: 0B > > # offline range > ./chmem -d 0x0000000300000000-0x00000003ffffffff && > ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP > RANGE SIZE STATE BLOCK ALTMAP > 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 > 0x0000000300000000-0x00000003ffffffff 4G offline 96-127 1 > > Memory block size: 128M > Total online memory: 12G > Total offline memory: 4G > Total deconfigured: 0B > > # Defconfigure range. > ./chmem -g 0x0000000300000000-0x00000003ffffffff && > ./lsmem -o RANGE,SIZE,STATE,BLOCK,ALTMAP > RANGE SIZE STATE BLOCK ALTMAP > 0x0000000000000000-0x00000002ffffffff 12G online 0-95 0 > 0x0000000300000000-0x00000003ffffffff 4G deconfigured 96-127 - > > Memory block size: 128M > Total online memory: 12G > Total offline memory: 0B > Total deconfigured: 4G Maybe "standby memory" might make it clearer. The concept is s390x specific, and it will likely stay s390x specific. I like the idea (frontend/tool interface), all we need is a way for these commands to detect ranges and turn them from standby into usable memory. 
> > The user can still determine the available memory ranges and make them > configurable using tools like lsmem or chmem with this approach atleast > on s390 with this approach. > >> My thinking was that s390x would expose the standby memory ranges somewhere >> arch specific in sysfs. From there, one could simply trigger the adding >> (maybe specifying e.g, memmap_on_memory) of selected ranges. > > As far as I understand, sysfs interface limits the size of the buffer > used in show() to 4kb. sysfs want usually "one value per file". > When there are huge number of standby memory > ranges, wouldnt it be an issue to display everything in one attribute? I was rather wondering about a syfs directory structure that exposes this information. For example, in the granularity of storage increments we can enable/disable. In general, it could be a similar structure as /sys/devices/system/memory/ (one director = one standby storage increment we can enable/disable?), but residing on the s390x specific sysfs area. Or any other way to express ranges that can be enabled/disabled as one unit. I'm not sure if extending /sys/devices/system/memory/ itself would be a good idea, though. It all is very s390x specific. > > Or use sysfs binary attributes to overcome the limitation? > > Please correct me, If I am wrong. > > Questions: > 1. If we go ahead with this sysfs interface approach to list all standby > memory ranges, could the list be made available via > /sys/devices/system/memory/configurable_memlist? This could be helpful, > as /sys/devices/system/memory/configure_memory performs architecture > independent checks and could also be useful for other architectures in > the future. See above, I think we want this s390x specific. > > 2. Whether the new interface should also be compatible with lsmem/chmem? Yes, likely we should allow them to query-configure this s390x specific thing. > > 3. 
OR can we have a s390 specific path (eg: > /sys/firmware/memory/standy_range) to list all standby memory range > which are in deconfigured state and also use the current design > (max_configurable) to make it easier for lsmem/chmem tool to detect > these standby memory ranges? Ah, there it is, yes! > >> To disable standby memory, one would first offline the memory to then >> trigger removal using the arch specific interface. It is very similar to >> dax/kmem's way of handling offline+removal. > > ok > >> Now I wonder if dax/kmem could be (ab)used on s390x for standby storage. >> Likely a simple sysfs interface could be easier to implement. > > I havent checked dax/kmem in detail yet. I will look into it. Probably it's not 100% what you want to achieve, just to give you an example how similar (but different) technologies have solved this problem. -- Cheers, David / dhildenb
> Maybe "standby memory" might make it clearer. The concept is s390x specific,
> and it will likely stay s390x specific.
>
> I like the idea (frontend/tool interface), all we need is a way for these
> commands to detect ranges and turn them from standby into usable memory.
>
> >
> > The user can still determine the available memory ranges and make them
> > configurable using tools like lsmem or chmem with this approach atleast
> > on s390 with this approach.
> >
> > > My thinking was that s390x would expose the standby memory ranges somewhere
> > > arch specific in sysfs. From there, one could simply trigger the adding
> > > (maybe specifying e.g, memmap_on_memory) of selected ranges.
Hi David,
Sorry for the late reply.
Below is a potential design approach for enabling dynamic configuration
and deconfiguration of hotplug memory with support for both altmap and
non-altmap usage.
It introduces flexibility, allowing users to specify at runtime which
memory ranges should utilize altmap, rather than relying on a static
system-wide setting that applies uniformly to all hotplugged memory.
Introduce new interface on s390 with the following attributes:
1) Attribute1:
/sys/firmware/memory/block_size_bytes
2) Attribute2:
/sys/firmware/memory/memoryX/config
echo 0 > /sys/firmware/memory/memoryX/config -> deconfigure memoryX
echo 1 > /sys/firmware/memory/memoryX/config -> configure memoryX
3) Attribute3:
/sys/firmware/memory/memoryX/altmap_required
echo 0 > /sys/firmware/memory/memoryX/altmap_required -> noaltmap
echo 1 > /sys/firmware/memory/memoryX/altmap_required -> altmap
echo N > /sys/firmware/memory/memoryX/altmap_required -> variable size
altmap grouping (possible future requirements),
where N specifies the number of memory blocks for which the current
memory block manages the altmap. There are two possibilities here:
* If the altmap cannot fit entirely within memoryX, it can
extend into memoryX+1, meaning the altmap metadata will span
across multiple memory blocks.
* If the altmap for memory range cannot fit within memoryX,
then config will return -EINVAL.
NOTE: The “altmap_required” attribute must be set before setting the block as
configured via the “config” attribute. (Dependency)
4) Additionally add the patch to check if the memory block is configured
with altmap or not. Similar to [RFC PATCH 2/4] mm/memory_hotplug: Add
memory block altmap sysfs attribute.
Most of the code changes will be s390 specific with this interface.
Request your inputs on the potential interface. Thank you.
Other questions:
1. I’m just wondering how variable-sized altmap grouping will be
structured in the future. Is it organized by grouping the memory blocks
that require altmap, with the first memory block storing the altmap
metadata for all of them? Or is it possible for the altmap metadata to
span across multiple memory blocks?
2. OR, will dedicated memory blocks be used exclusively for altmap
metadata, which the memory blocks requiring altmap would then consume? (To
prevent fragmentation) ?
Thank you,
Sumanth
On 20.05.25 15:06, Sumanth Korikkar wrote: >> Maybe "standby memory" might make it clearer. The concept is s390x specific, >> and it will likely stay s390x specific. >> >> I like the idea (frontend/tool interface), all we need is a way for these >> commands to detect ranges and turn them from standby into usable memory. >> >>> >>> The user can still determine the available memory ranges and make them >>> configurable using tools like lsmem or chmem with this approach atleast >>> on s390 with this approach. >>> >>>> My thinking was that s390x would expose the standby memory ranges somewhere >>>> arch specific in sysfs. From there, one could simply trigger the adding >>>> (maybe specifying e.g, memmap_on_memory) of selected ranges. > > Hi David, Hi! > > Sorry for the late reply. > > Potential design approach for enabling dynamic configuration and > deconfiguration of hotplug memory with support for both altmap and > non-altmap usage. > > Introduces flexibility, allowing users to specify at runtime which > memory ranges should utilize altmap, rather than relying on a static > system-wide setting that applies uniformly to all hotplugged memory. > > Introduce new interface on s390 with the following attributes: > > 1) Attribute1: > /sys/firmware/memory/block_size_bytes I assume this will be the storage increment size. > > 2) Attribute2: > /sys/firmware/memory/memoryX/config > echo 0 > /sys/firmware/memory/memoryX/config -> deconfigure memoryX > echo 1 > /sys/firmware/memory/memoryX/config -> configure memoryX And these would configure individual storage increments, essentially calling add_memory() and (if possible because we could offline the memory) remove_memory(). 
> > 3) Attribute3: > /sys/firmware/memory/memoryX/altmap_required > echo 0 > /sys/firmware/memory/memoryX/altmap_required -> noaltmap > echo 1 > /sys/firmware/memory/memoryX/altmap_required -> altmap > echo N > /sys/firmware/memory/memoryX/altmap_required -> variable size > altmap grouping (possible future requirements), > where N specifies the number of memory blocks that the current > memory block manages altmap. There are two possibilities here: > * If the altmap cannot fit entirely within memoryX, it can > extend into memoryX+1, meaning the altmap metadata will span > across multiple memory blocks. > * If the altmap for memory range cannot fit within memoryX, > then config will return -EINVAL. Do we really still need this when we can configure/deconfigure? I mean, on s390x, the most important use case for memmap-on-memory was not wasting memory for offline memory blocks. But with a configuration interface like this ... the only benefit is being able to more-reliably add memory in low-memory conditions. An unlikely scenario with standby storage IMHO. Note that I dislike exposing "altmap" to the user :) Dax calls it "memmap_on_memory", and it is a device attrivute. As soon as we go down that path we have the complexity of having to group memory blocks etc, and if we can just not go down that path right now it will make things a lot simpler. (especially, as you document above, the semantics become *really* weird) As yet another point, I am not sure if someone really needs a per-memory block control of the memmap-on-memory feature. If we could simplify here, that would be great ... > > NOTE: “altmap_required” attribute must be set before setting the block as > configured via “config” attribute. (Dependancy) > > 4) Additionally add the patch to check if the memory block is configured > with altmap or not. Similar to [RFC PATCH 2/4] mm/memory_hotplug: Add > memory block altmap sysfs attribute. > > Most of the code changes will be s390 specific with this interface. 
> > Request your inputs on the potential interface. Thank you. > > Other questions: > 1. I’m just wondering how variable-sized altmap grouping will be > structured in the future. Is it organized by grouping the memory blocks > that require altmap, with the first memory block storing the altmap > metadata for all of them? Or is it possible for the altmap metadata to > span across multiple memory blocks? That exactly is unclear, which is why we should probably avoid doing that for now. Also, with other developments happening (memdesc), and ongoing effort to shrink "struct page", maybe we will not even need most of this in the future? > > 2. OR, will dedicated memory blocks be used exclusively for altmap > metadata, which the memory blocks requiring altmap would then consume? (To > prevent fragmentation) ? One idea I had was that you would do the add_memory() in bigger granularity. Then, the memory blocks hosting the memmap would have to get onlined first. And offlining of them would fail until all dependent ones were offlined. That would at least limit the impact. Then, the question would be, how could you "group" these memory blocks from your interface to do a single add_memory() etc. But again, maybe we can leave that part out for now ... -- Cheers, David / dhildenb
> > Introduce new interface on s390 with the following attributes:
> >
> > 1) Attribute1:
> > /sys/firmware/memory/block_size_bytes
>
> I assume this will be the storage increment size.
Hi David,
No, this is memory block size.
> > > 2) Attribute2:
> > /sys/firmware/memory/memoryX/config
> > echo 0 > /sys/firmware/memory/memoryX/config -> deconfigure memoryX
> > echo 1 > /sys/firmware/memory/memoryX/config -> configure memoryX
>
> And these would configure individual storage increments, essentially calling
> add_memory() and (if possible because we could offline the memory)
> remove_memory().
These would configure or deconfigure memory in units of entire memory blocks.
As I understand it, add_memory() operates on memory block granularity,
and this is enforced by check_hotplug_memory_range(), which ensures the
requested range aligns with the memory block size.
> > 3) Attribute3:
> > /sys/firmware/memory/memoryX/altmap_required
> > echo 0 > /sys/firmware/memory/memoryX/altmap_required -> noaltmap
> > echo 1 > /sys/firmware/memory/memoryX/altmap_required -> altmap
> > echo N > /sys/firmware/memory/memoryX/altmap_required -> variable size
> > altmap grouping (possible future requirements),
> > where N specifies the number of memory blocks that the current
> > memory block manages altmap. There are two possibilities here:
> > * If the altmap cannot fit entirely within memoryX, it can
> > extend into memoryX+1, meaning the altmap metadata will span
> > across multiple memory blocks.
> > * If the altmap for memory range cannot fit within memoryX,
> > then config will return -EINVAL.
>
> Do we really still need this when we can configure/deconfigure?
>
> I mean, on s390x, the most important use case for memmap-on-memory was not
> wasting memory for offline memory blocks.
>
> But with a configuration interface like this ... the only benefit is being
> able to more-reliably add memory in low-memory conditions. An unlikely
> scenario with standby storage IMHO.
>
> Note that I dislike exposing "altmap" to the user :) Dax calls it
> "memmap_on_memory", and it is a device attrivute.
>
> As soon as we go down that path we have the complexity of having to group
> memory blocks etc, and if we can just not go down that path right now it
> will make things a lot simpler.
>
> (especially, as you document above, the semantics become *really* weird)
>
> As yet another point, I am not sure if someone really needs a per-memory
> block control of the memmap-on-memory feature.
>
> If we could simplify here, that would be great ...
The original motivation for introducing memmap_on_memory on s390 was to
avoid using online memory to store struct page metadata, particularly
for standby memory blocks. This became critical in cases where there was
an imbalance between standby and online memory, potentially leading to
boot failures due to insufficient memory for metadata allocation.
To address this, memmap_on_memory was utilized on s390. However, in its
current form, it adds altmap metadata at the start of each memory block
at the time of addition, and this configuration is static. It cannot be
changed at runtime.
I was wondering about the following practical scenario:
When online memory is nearly full, the user can add a standby memory
block with memmap_on_memory enabled. This allows the system to avoid
consuming already scarce online memory for metadata.
After enabling and bringing that standby memory online, the user now
has enough free online memory to add additional memory blocks without
memmap_on_memory. These later blocks can provide physically contiguous
memory, which is important for workloads or devices requiring continuous
physical address space.
If my interpretation is correct, I see good potential for this to be
useful.
As you pointed out, how about having something similar to
73954d379efd ("dax: add a sysfs knob to control memmap_on_memory behavior")
i.e.
1) To configure/deconfigure a memory block
/sys/firmware/memory/memoryX/config
1 -> configure
0 -> deconfigure
2) Determine whether memory block should have memmap_on_memory or not.
/sys/firmware/memory/memoryX/memmap_on_memory
1 -> with altmap
0 -> without altmap
This attribute must be set before the memoryX is configured. Or else, it
will default to CONFIG_MHP_MEMMAP_ON_MEMORY / memmap_on_memory parameter.
> > NOTE: “altmap_required” attribute must be set before setting the block as
> > configured via “config” attribute. (Dependency)
> >
> > 4) Additionally add the patch to check if the memory block is configured
> > with altmap or not. Similar to [RFC PATCH 2/4] mm/memory_hotplug: Add
> > memory block altmap sysfs attribute.
> >
> > Most of the code changes will be s390 specific with this interface.
> >
> > Request your inputs on the potential interface. Thank you.
> >
> > Other questions:
> > 1. I’m just wondering how variable-sized altmap grouping will be
> > structured in the future. Is it organized by grouping the memory blocks
> > that require altmap, with the first memory block storing the altmap
> > metadata for all of them? Or is it possible for the altmap metadata to
> > span across multiple memory blocks?
>
> That exactly is unclear, which is why we should probably avoid doing that
> for now. Also, with other developments happening (memdesc), and ongoing
> effort to shrink "struct page", maybe we will not even need most of this in
> the future?
>
> >
> > 2. OR, will dedicated memory blocks be used exclusively for altmap
> > metadata, which the memory blocks requiring altmap would then consume? (To
> > prevent fragmentation) ?
>
> One idea I had was that you would do the add_memory() in bigger granularity.
>
> Then, the memory blocks hosting the memmap would have to get onlined first.
> And offlining of them would fail until all dependent ones were offlined.
>
> That would at least limit the impact.
>
> Then, the question would be, how could you "group" these memory blocks from
> your interface to do a single add_memory() etc.
>
> But again, maybe we can leave that part out for now ...
Thank you David for the details. I will ignore/leave variable sized
altmap grouping for now.
On 21.05.25 12:34, Sumanth Korikkar wrote:
>>> Introduce new interface on s390 with the following attributes:
>>>
>>> 1) Attribute1:
>>> /sys/firmware/memory/block_size_bytes
>>
>> I assume this will be the storage increment size.
>
> Hi David,
>
> No, this is memory block size.
So, the same as /sys/devices/system/memory/block_size_bytes ?
In a future where we could have variable sized memory blocks, what would
be the granularity here?
>
>>> 2) Attribute2:
>>> /sys/firmware/memory/memoryX/config
>>> echo 0 > /sys/firmware/memory/memoryX/config -> deconfigure memoryX
>>> echo 1 > /sys/firmware/memory/memoryX/config -> configure memoryX
>>
>> And these would configure individual storage increments, essentially calling
>> add_memory() and (if possible because we could offline the memory)
>> remove_memory().
>
> configure or deconfigure memory in units of entire memory blocks.
I assume, because that is assumed to be the smallest granularity in
which we can add_memory().
And the memory block size is currently always at least the storage
increment size, correct?
>
> As I understand it, add_memory() operates on memory block granularity,
> and this is enforced by check_hotplug_memory_range(), which ensures the
> requested range aligns with the memory block size.
Yes. I was rather wondering, if we could have storage increment size >
memory block size.
>
>>> 3) Attribute3:
>>> /sys/firmware/memory/memoryX/altmap_required
>>> echo 0 > /sys/firmware/memory/memoryX/altmap_required -> noaltmap
>>> echo 1 > /sys/firmware/memory/memoryX/altmap_required -> altmap
>>> echo N > /sys/firmware/memory/memoryX/altmap_required -> variable size
>>> altmap grouping (possible future requirements),
>>> where N specifies the number of memory blocks that the current
>>> memory block manages altmap. There are two possibilities here:
>>> * If the altmap cannot fit entirely within memoryX, it can
>>> extend into memoryX+1, meaning the altmap metadata will span
>>> across multiple memory blocks.
>>> * If the altmap for memory range cannot fit within memoryX,
>>> then config will return -EINVAL.
>>
>> Do we really still need this when we can configure/deconfigure?
>>
>> I mean, on s390x, the most important use case for memmap-on-memory was not
>> wasting memory for offline memory blocks.
>>
>> But with a configuration interface like this ... the only benefit is being
>> able to more-reliably add memory in low-memory conditions. An unlikely
>> scenario with standby storage IMHO.
>>
>> Note that I dislike exposing "altmap" to the user :) Dax calls it
>> "memmap_on_memory", and it is a device attribute.
>>
>> As soon as we go down that path we have the complexity of having to group
>> memory blocks etc, and if we can just not go down that path right now it
>> will make things a lot simpler.
>>
>> (especially, as you document above, the semantics become *really* weird)
>>
>> As yet another point, I am not sure if someone really needs a per-memory
>> block control of the memmap-on-memory feature.
>>
>> If we could simplify here, that would be great ...
>
> The original motivation for introducing memmap_on_memory on s390 was to
> avoid using online memory to store struct page metadata, particularly
> for standby memory blocks.
Right, when they were added but not online (memory not usable).
> This became critical in cases where there was
> an imbalance between standby and online memory, potentially leading to
> boot failures due to insufficient memory for metadata allocation.
Right, too much memory wasted on unused memmaps.
>
> To address this, memmap_on_memory was utilized on s390. However, in its
> current form, it adds altmap metadata at the start of each memory block
> at the time of addition, and this configuration is static. It cannot be
> changed at runtime.
Yes.
>
> I was wondering about the following practical scenario:
>
> When online memory is nearly full, the user can add a standby memory
> block with memmap_on_memory enabled. This allows the system to avoid
> consuming already scarce online memory for metadata.
Right, that's the use case I mentioned. But we're talking about ~ 2/4
MiB on s390x for a single memory block. There are other things we have
to allocate memory for when onlining memory, so there is no guarantee
that it would work with memmap_on_memory either.
It makes it more likely to succeed :)
>
> After enabling and bringing that standby memory online, the user now
> has enough free online memory to add additional memory blocks without
> memmap_on_memory. These later blocks can provide physically contiguous
> memory, which is important for workloads or devices requiring continuous
> physical address space.
>
> If my interpretation is correct, I see good potential for this to be
> useful.
Again, I think only in the case where we don't have 2/4 MiB for the
memmap.
If this is triggered from inside the VM, might just be that the admin
can not even login anymore and trigger this if really close to OOM ...
>
> As you pointed out, how about having something similar to
> 73954d379efd ("dax: add a sysfs knob to control memmap_on_memory behavior")
Right. But here, the use case is usually (a) to add a gigantic amount of
memory using add_memory(), not small blocks like on s390x (b) consume
the memmap from (slow) special-purpose memory as well.
Regarding (a), the memmap could be so big that add_memory() might never
really work (not just because of some temporary low-memory situation).
>
> i.e.
>
> 1) To configure/deconfigure a memory block
> /sys/firmware/memory/memoryX/config
>
> 1 -> configure
> 0 -> deconfigure
>
> 2) Determine whether memory block should have memmap_on_memory or not.
> /sys/firmware/memory/memoryX/memmap_on_memory
> 1 -> with altmap
> 0 -> without altmap
>
> This attribute must be set before the memoryX is configured. Or else, it
> will default to CONFIG_MHP_MEMMAP_ON_MEMORY / memmap_on_memory parameter.
I don't have anything against that option. Just a thought if we really
have to introduce this right now.
--
Cheers,
David / dhildenb
> So, the same as /sys/devices/system/memory/block_size_bytes ?
>
> In a future where we could have variable sized memory blocks, what would be
> the granularity here?
I wasn't aware of variable-sized memory blocks. In that case, should we
either introduce a block_size_bytes attribute inside each memoryX/ directory,
or add it only when variable-sized memory block support is implemented?
> I assume, because that is assumed to be the smallest granularity in which we
> can add_memory().
>
> And the memory block size is currently always at least the storage increment
> size, correct?
>
> >
> > As I understand it, add_memory() operates on memory block granularity,
> > and this is enforced by check_hotplug_memory_range(), which ensures the
> > requested range aligns with the memory block size.
>
> Yes. I was rather wondering, if we could have storage increment size >
> memory block size.
I tried the following:
* Config1 (zvm, 8GB online + 4GB standby)
vmcp q v store
STORAGE = 8320M MAX = 2T INC = 16M STANDBY = 3968M RESERVED = 0
the increment size is 16MB in this case and block size is 128MB.
* Config2 (zvm, 512M online + 512M standby)
vmcp q v storage
STORAGE = 512M MAX = 2T INC = 1M STANDBY = 512M RESERVED = 0
But, memory_block_size_bytes() would return max(increment_size,
MIN_MEMORY_BLOCK_SIZE)
In both cases, therefore, memory block size will be 128MB.
On the other hand, I checked one of the lpars,
the increment size is 2GB, which is greater than MIN_MEMORY_BLOCK_SIZE.
Hence, memory block size is 2GB here.
> > I was wondering about the following practical scenario:
> >
> > When online memory is nearly full, the user can add a standby memory
> > block with memmap_on_memory enabled. This allows the system to avoid
> > consuming already scarce online memory for metadata.
>
> Right, that's the use case I mentioned. But we're talking about ~ 2/4 MiB on
> s390x for a single memory block. There are other things we have to allocate
> memory for when onlining memory, so there is no guarantee that it would work
> with memmap_on_memory either.
>
> It makes it more likely to succeed :)
You're right, I wasn't precise.
> > After enabling and bringing that standby memory online, the user now
> > has enough free online memory to add additional memory blocks without
> > memmap_on_memory. These later blocks can provide physically contiguous
> > memory, which is important for workloads or devices requiring continuous
> > physical address space.
> >
> > If my interpretation is correct, I see good potential for this to be
> > useful.
>
> Again, I think only in the case where we don't have 2/4 MiB for the
> memmap.
I think it is not 2/4 MiB in every use case.
On my LPAR, the increment size is 2GB. This means 32MB struct pages
metadata - per memory block.
> > As you pointed out, how about having something similar to
> > 73954d379efd ("dax: add a sysfs knob to control memmap_on_memory behavior")
>
> Right. But here, the use case is usually (a) to add a gigantic amount of
> memory using add_memory(), not small blocks like on s390x (b) consume the
> memmap from (slow) special-purpose memory as well.
>
> Regarding (a), the memmap could be so big that add_memory() might never
> really work (not just because of some temporary low-memory situation).
Sorry, I didn't understand it correctly.
regarding a): If add_memory() is performed with memmap_on_memory, altmap
metadata should fit into that added memory right?
> > 1) To configure/deconfigure a memory block
> > /sys/firmware/memory/memoryX/config
> >
> > 1 -> configure
> > 0 -> deconfigure
> >
> > 2) Determine whether memory block should have memmap_on_memory or not.
> > /sys/firmware/memory/memoryX/memmap_on_memory
> > 1 -> with altmap
> > 0 -> without altmap
> >
> > This attribute must be set before the memoryX is configured. Or else, it
> > will default to CONFIG_MHP_MEMMAP_ON_MEMORY / memmap_on_memory parameter.
>
> I don't have anything against that option. Just a thought if we really have
> to introduce this right now.
If there are no objections on this design, I'm happy to start exploring
it further. Thank you
On Wed, May 21, 2025 at 02:33:42PM +0200, David Hildenbrand wrote:
> On 21.05.25 12:34, Sumanth Korikkar wrote:
> > As you pointed out, how about having something similar to
> > 73954d379efd ("dax: add a sysfs knob to control memmap_on_memory behavior")
>
> Right. But here, the use case is usually (a) to add a gigantic amount of
> memory using add_memory(), not small blocks like on s390x (b) consume the
> memmap from (slow) special-purpose memory as well.
>
> Regarding (a), the memmap could be so big that add_memory() might never
> really work (not just because of some temporary low-memory situation).
What is "big"? Worst case for s390 with existing machines would be an
increment size (aka memory block size) of 64GB. So more than 1GB for
memmap plus page tables, etc. would be required.
On 21.05.25 16:21, Heiko Carstens wrote:
> On Wed, May 21, 2025 at 02:33:42PM +0200, David Hildenbrand wrote:
>> On 21.05.25 12:34, Sumanth Korikkar wrote:
>>> As you pointed out, how about having something similar to
>>> 73954d379efd ("dax: add a sysfs knob to control memmap_on_memory behavior")
>>
>> Right. But here, the use case is usually (a) to add a gigantic amount of
>> memory using add_memory(), not small blocks like on s390x (b) consume the
>> memmap from (slow) special-purpose memory as well.
>>
>> Regarding (a), the memmap could be so big that add_memory() might never
>> really work (not just because of some temporary low-memory situation).
>
> What is "big"? Worst case for s390 with existing machines would be an
> increment size (aka memory block size) of 64GB.
Oh! I was assuming the increment size would always be around 256MiB or so.
In that case, it can make sense to have this, yes!
--
Cheers,
David / dhildenb
© 2016 - 2026 Red Hat, Inc.