There are at least six consumers of hotplug_memory_notifier that are only
interested in whether a NUMA node changed its memory state, e.g. going from
being memory-aware to becoming memoryless and vice versa.

Implement a dedicated notifier for NUMA node state changes, and have the
consumers that only care about node state changes use it.
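For reference, a converted consumer would look roughly like the sketch below
(illustrative only; the callback name and chosen priority are made up):

	static int example_node_callback(struct notifier_block *self,
					 unsigned long action, void *arg)
	{
		struct node_notify *nn = arg;
		int nid = nn->status_change_nid;

		if (nid == NUMA_NO_NODE)
			return NOTIFY_OK;

		switch (action) {
		case NODE_BECAME_MEM_AWARE:
			/* node just gained its first memory */
			break;
		case NODE_BECAME_MEMORYLESS:
			/* node just lost its last memory */
			break;
		}
		return NOTIFY_OK;
	}

	/* instead of hotplug_memory_notifier(example_node_callback, pri): */
	hotplug_node_notifier(example_node_callback, DEFAULT_CALLBACK_PRI);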
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
drivers/acpi/numa/hmat.c | 6 +-
drivers/base/node.c | 21 +++++++
drivers/cxl/core/region.c | 14 ++---
drivers/cxl/cxl.h | 4 +-
include/linux/memory.h | 38 ++++++++++++-
kernel/cgroup/cpuset.c | 2 +-
mm/memory-tiers.c | 8 +--
mm/memory_hotplug.c | 117 +++++++++++++++++++++-----------------
mm/mempolicy.c | 8 +--
mm/slub.c | 13 ++---
10 files changed, 150 insertions(+), 81 deletions(-)
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 9d9052258e92..9ac82a767daf 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -962,10 +962,10 @@ static int hmat_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
struct memory_target *target;
- struct memory_notify *mnb = arg;
+ struct node_notify *mnb = arg;
int pxm, nid = mnb->status_change_nid;
- if (nid == NUMA_NO_NODE || action != MEM_ONLINE)
+ if (nid == NUMA_NO_NODE || action != NODE_BECAME_MEM_AWARE)
return NOTIFY_OK;
pxm = node_to_pxm(nid);
@@ -1118,7 +1118,7 @@ static __init int hmat_init(void)
hmat_register_targets();
/* Keep the table and structures if the notifier may use them */
- if (hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
+ if (hotplug_node_notifier(hmat_callback, HMAT_CALLBACK_PRI))
goto out_put;
if (!hmat_set_default_dram_perf())
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cd13ef287011..ca635848ee68 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -110,6 +110,27 @@ static const struct attribute_group *node_access_node_groups[] = {
NULL,
};
+#ifdef CONFIG_MEMORY_HOTPLUG
+static BLOCKING_NOTIFIER_HEAD(node_chain);
+
+int register_node_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&node_chain, nb);
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+void unregister_node_notifier(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&node_chain, nb);
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+int node_notify(unsigned long val, void *v)
+{
+ return blocking_notifier_call_chain(&node_chain, val, v);
+}
+#endif
+
static void node_remove_accesses(struct node *node)
{
struct node_access_nodes *c, *cnext;
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index c3f4dc244df7..c43770d6834c 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2432,12 +2432,12 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
unsigned long action, void *arg)
{
struct cxl_region *cxlr = container_of(nb, struct cxl_region,
- memory_notifier);
- struct memory_notify *mnb = arg;
+ node_notifier);
+ struct node_notify *mnb = arg;
int nid = mnb->status_change_nid;
int region_nid;
- if (nid == NUMA_NO_NODE || action != MEM_ONLINE)
+ if (nid == NUMA_NO_NODE || action != NODE_BECAME_MEM_AWARE)
return NOTIFY_DONE;
/*
@@ -3484,7 +3484,7 @@ static void shutdown_notifiers(void *_cxlr)
{
struct cxl_region *cxlr = _cxlr;
- unregister_memory_notifier(&cxlr->memory_notifier);
+ unregister_node_notifier(&cxlr->node_notifier);
unregister_mt_adistance_algorithm(&cxlr->adist_notifier);
}
@@ -3523,9 +3523,9 @@ static int cxl_region_probe(struct device *dev)
if (rc)
return rc;
- cxlr->memory_notifier.notifier_call = cxl_region_perf_attrs_callback;
- cxlr->memory_notifier.priority = CXL_CALLBACK_PRI;
- register_memory_notifier(&cxlr->memory_notifier);
+ cxlr->node_notifier.notifier_call = cxl_region_perf_attrs_callback;
+ cxlr->node_notifier.priority = CXL_CALLBACK_PRI;
+ register_node_notifier(&cxlr->node_notifier);
cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance;
cxlr->adist_notifier.priority = 100;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index be8a7dc77719..c4bf8eb3c952 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -513,7 +513,7 @@ enum cxl_partition_mode {
* @flags: Region state flags
* @params: active + config params for the region
* @coord: QoS access coordinates for the region
- * @memory_notifier: notifier for setting the access coordinates to node
+ * @node_notifier: notifier for setting the access coordinates to node
* @adist_notifier: notifier for calculating the abstract distance of node
*/
struct cxl_region {
@@ -526,7 +526,7 @@ struct cxl_region {
unsigned long flags;
struct cxl_region_params params;
struct access_coordinate coord[ACCESS_COORDINATE_MAX];
- struct notifier_block memory_notifier;
+ struct notifier_block node_notifier;
struct notifier_block adist_notifier;
};
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 5ec4e6d209b9..8c5c88eaffb3 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -99,6 +99,14 @@ int set_memory_block_size_order(unsigned int order);
#define MEM_PREPARE_ONLINE (1<<6)
#define MEM_FINISH_OFFLINE (1<<7)
+/* These states are used for numa node notifiers */
+#define NODE_BECOMING_MEM_AWARE (1<<0)
+#define NODE_BECAME_MEM_AWARE (1<<1)
+#define NODE_BECOMING_MEMORYLESS (1<<2)
+#define NODE_BECAME_MEMORYLESS (1<<3)
+#define NODE_CANCEL_MEM_AWARE (1<<4)
+#define NODE_CANCEL_MEMORYLESS (1<<5)
+
struct memory_notify {
/*
* The altmap_start_pfn and altmap_nr_pages fields are designated for
@@ -109,7 +117,10 @@ struct memory_notify {
unsigned long altmap_nr_pages;
unsigned long start_pfn;
unsigned long nr_pages;
- int status_change_nid_normal;
+ int status_change_nid;
+};
+
+struct node_notify {
int status_change_nid;
};
@@ -157,15 +168,34 @@ static inline unsigned long memory_block_advised_max_size(void)
{
return 0;
}
+
+static inline int register_node_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+static inline void unregister_node_notifier(struct notifier_block *nb)
+{
+}
+static inline int node_notify(unsigned long val, void *v)
+{
+ return 0;
+}
+static inline int hotplug_node_notifier(notifier_fn_t fn, int pri)
+{
+ return 0;
+}
#else /* CONFIG_MEMORY_HOTPLUG */
extern int register_memory_notifier(struct notifier_block *nb);
+extern int register_node_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
+extern void unregister_node_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size,
struct vmem_altmap *altmap,
struct memory_group *group);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
+extern int node_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(unsigned long section_nr);
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
@@ -185,6 +215,12 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
register_memory_notifier(&fn##_mem_nb); \
})
+#define hotplug_node_notifier(fn, pri) ({ \
+ static __meminitdata struct notifier_block fn##_node_nb =\
+ { .notifier_call = fn, .priority = pri };\
+ register_node_notifier(&fn##_node_nb); \
+})
+
#ifdef CONFIG_NUMA
void memory_block_add_nid(struct memory_block *mem, int nid,
enum meminit_context context);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 83639a12883d..66c84024f217 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4013,7 +4013,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index fc14fe53e9b7..dfe6c28c8352 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -872,7 +872,7 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
struct memory_tier *memtier;
- struct memory_notify *arg = _arg;
+ struct node_notify *arg = _arg;
/*
* Only update the node migration order when a node is
@@ -882,13 +882,13 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
return notifier_from_errno(0);
switch (action) {
- case MEM_OFFLINE:
+ case NODE_BECAME_MEMORYLESS:
mutex_lock(&memory_tier_lock);
if (clear_node_memory_tier(arg->status_change_nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
- case MEM_ONLINE:
+ case NODE_BECAME_MEM_AWARE:
mutex_lock(&memory_tier_lock);
memtier = set_node_memory_tier(arg->status_change_nid);
if (!IS_ERR(memtier))
@@ -929,7 +929,7 @@ static int __init memory_tier_init(void)
nodes_and(default_dram_nodes, node_states[N_MEMORY],
node_states[N_CPU]);
- hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
+ hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
}
subsys_initcall(memory_tier_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b1caedbade5b..777f98326908 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -701,24 +701,18 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
+ struct zone *zone, struct node_notify *arg)
{
int nid = zone_to_nid(zone);
arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
- if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
- arg->status_change_nid_normal = nid;
}
-static void node_states_set_node(int node, struct memory_notify *arg)
+static void node_states_set_node(int node, struct node_notify *arg)
{
- if (arg->status_change_nid_normal >= 0)
- node_set_state(node, N_NORMAL_MEMORY);
-
if (arg->status_change_nid >= 0)
node_set_state(node, N_MEMORY);
}
@@ -1177,7 +1171,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
int need_zonelists_rebuild = 0;
const int nid = zone_to_nid(zone);
int ret;
- struct memory_notify arg;
+ struct memory_notify mem_arg;
+ struct node_notify node_arg;
+ bool cancel_mem_notifier_on_err = false, cancel_node_notifier_on_err = false;
/*
* {on,off}lining is constrained to full memory sections (or more
@@ -1194,11 +1190,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
/* associate pfn range with the zone */
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
- arg.start_pfn = pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_online(nr_pages, zone, &arg);
+ mem_arg.start_pfn = pfn;
+ mem_arg.nr_pages = nr_pages;
+ node_states_check_changes_online(nr_pages, zone, &node_arg);
+
+ if (node_arg.status_change_nid >= 0) {
+ /* Node is becoming memory aware. Notify consumers */
+ cancel_node_notifier_on_err = true;
+ ret = node_notify(NODE_BECOMING_MEM_AWARE, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_addition;
+ }
- ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ cancel_mem_notifier_on_err = true;
+ mem_arg.status_change_nid = node_arg.status_change_nid;
+ ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret)
goto failed_addition;
@@ -1224,7 +1231,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- node_states_set_node(nid, &arg);
+ node_states_set_node(nid, &node_arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
@@ -1245,16 +1252,26 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ if (node_arg.status_change_nid >= 0)
+ /*
+ * Node went from memoryless to have memory. Notifiy interested
+ * consumers
+ */
+ node_notify(NODE_BECAME_MEM_AWARE, &node_arg);
+
writeback_set_ratelimit();
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &mem_arg);
return 0;
failed_addition:
pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
+ if (cancel_mem_notifier_on_err)
+ memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+ if (cancel_node_notifier_on_err)
+ node_notify(NODE_CANCEL_MEM_AWARE, &node_arg);
remove_pfn_range_from_zone(zone, pfn, nr_pages);
return ret;
}
@@ -1888,48 +1905,29 @@ early_param("movable_node", cmdline_parse_movable_node);
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
+ struct zone *zone, struct node_notify *arg)
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long present_pages = 0;
enum zone_type zt;
arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
/*
- * Check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is within the range
- * [0..ZONE_NORMAL], and it is the last present memory there,
- * the zones in that range will become empty after the offlining,
- * thus we can determine that we need to clear the node from
- * node_states[N_NORMAL_MEMORY].
+ * Here we count the possible pages within the range [0..ZONE_MOVABLE].
+ * If after having accounted all the pages, we see that the nr_pages to
+ * be offlined is over or equal to the accounted pages, we know that the
+ * node will become empty, ans so, we can clear it for N_MEMORY.
*/
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
+ for (zt = 0; zt <= ZONE_MOVABLE; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
- arg->status_change_nid_normal = zone_to_nid(zone);
-
- /*
- * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
- * does not apply as we don't support 32bit.
- * Here we count the possible pages from ZONE_MOVABLE.
- * If after having accounted all the pages, we see that the nr_pages
- * to be offlined is over or equal to the accounted pages,
- * we know that the node will become empty, and so, we can clear
- * it for N_MEMORY as well.
- */
- present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
if (nr_pages >= present_pages)
arg->status_change_nid = zone_to_nid(zone);
}
-static void node_states_clear_node(int node, struct memory_notify *arg)
+static void node_states_clear_node(int node, struct node_notify *arg)
{
- if (arg->status_change_nid_normal >= 0)
- node_clear_state(node, N_NORMAL_MEMORY);
-
if (arg->status_change_nid >= 0)
node_clear_state(node, N_MEMORY);
}
@@ -1953,7 +1951,9 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
unsigned long pfn, managed_pages, system_ram_pages = 0;
const int node = zone_to_nid(zone);
unsigned long flags;
- struct memory_notify arg;
+ struct memory_notify mem_arg;
+ struct node_notify node_arg;
+ bool cancel_mem_notifier_on_err = false, cancel_node_notifier_on_err = false;
char *reason;
int ret;
@@ -2012,11 +2012,21 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal_pcplists_disabled;
}
- arg.start_pfn = start_pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_offline(nr_pages, zone, &arg);
+ mem_arg.start_pfn = start_pfn;
+ mem_arg.nr_pages = nr_pages;
+ node_states_check_changes_offline(nr_pages, zone, &node_arg);
+
+ if (node_arg.status_change_nid >= 0) {
+ cancel_node_notifier_on_err = true;
+ ret = node_notify(NODE_BECOMING_MEMORYLESS, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_removal_isolated;
+ }
- ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ cancel_mem_notifier_on_err = true;
+ mem_arg.status_change_nid = node_arg.status_change_nid;
+ ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret) {
reason = "notifier failure";
@@ -2096,27 +2106,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Make sure to mark the node as memory-less before rebuilding the zone
* list. Otherwise this node would still appear in the fallback lists.
*/
- node_states_clear_node(node, &arg);
+ node_states_clear_node(node, &node_arg);
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
- if (arg.status_change_nid >= 0) {
+ if (node_arg.status_change_nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node);
+ /* Node went memoryless. Notifiy interested consumers */
+ node_notify(NODE_BECAME_MEMORYLESS, &node_arg);
}
writeback_set_ratelimit();
- memory_notify(MEM_OFFLINE, &arg);
+ memory_notify(MEM_OFFLINE, &mem_arg);
remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
return 0;
failed_removal_isolated:
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ if (cancel_mem_notifier_on_err)
+ memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+ if (cancel_node_notifier_on_err)
+ node_notify(NODE_CANCEL_MEMORYLESS, &node_arg);
failed_removal_pcplists_disabled:
lru_cache_enable();
zone_pcp_enable(zone);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f43951668c41..b3ad63fb3a2b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3591,20 +3591,20 @@ static int wi_node_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
int err;
- struct memory_notify *arg = data;
+ struct node_notify *arg = data;
int nid = arg->status_change_nid;
if (nid < 0)
return NOTIFY_OK;
switch (action) {
- case MEM_ONLINE:
+ case NODE_BECAME_MEM_AWARE:
err = sysfs_wi_node_add(nid);
if (err)
pr_err("failed to add sysfs for node%d during hotplug: %d\n",
nid, err);
break;
- case MEM_OFFLINE:
+ case NODE_BECAME_MEMORYLESS:
sysfs_wi_node_delete(nid);
break;
}
@@ -3639,7 +3639,7 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
}
}
- hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
return 0;
err_cleanup_kobj:
diff --git a/mm/slub.c b/mm/slub.c
index 236d679e6a1d..72515ab4aaf8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -6178,8 +6178,8 @@ static int slab_mem_going_online_callback(void *arg)
{
struct kmem_cache_node *n;
struct kmem_cache *s;
- struct memory_notify *marg = arg;
- int nid = marg->status_change_nid;
+ struct node_notify *narg = arg;
+ int nid = narg->status_change_nid;
int ret = 0;
/*
@@ -6231,15 +6231,12 @@ static int slab_memory_callback(struct notifier_block *self,
int ret = 0;
switch (action) {
- case MEM_GOING_ONLINE:
+ case NODE_BECOMING_MEM_AWARE:
ret = slab_mem_going_online_callback(arg);
break;
- case MEM_GOING_OFFLINE:
+ case NODE_BECOMING_MEMORYLESS:
ret = slab_mem_going_offline_callback(arg);
break;
- case MEM_ONLINE:
- case MEM_CANCEL_OFFLINE:
- break;
}
if (ret)
ret = notifier_from_errno(ret);
@@ -6314,7 +6311,7 @@ void __init kmem_cache_init(void)
sizeof(struct kmem_cache_node),
SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
- hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+ hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
--
2.49.0
On 5/2/25 10:36, Oscar Salvador wrote:
> There are at least six consumers of hotplug_memory_notifier that what they
> really are interested in is whether any numa node changed its state, e.g: going
> from being memory aware to becoming memoryless and vice versa.
>
> Implement a specific notifier for numa nodes when their state gets changed,
> and have those consumers that only care about numa node state changes use it.
>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Some cleanup suggestions below:
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index b1caedbade5b..777f98326908 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -701,24 +701,18 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
>
> /* check which state of node_states will be changed when online memory */
> static void node_states_check_changes_online(unsigned long nr_pages,
> - struct zone *zone, struct memory_notify *arg)
> + struct zone *zone, struct node_notify *arg)
I've noticed the nr_pages parameter is unused.
Also, the function has become rather trivial; you could just move the code
into its only caller now?
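E.g., open-coded in online_pages() it would boil down to something like
(sketch only):

	node_arg.status_change_nid = NUMA_NO_NODE;
	if (!node_state(nid, N_MEMORY))
		node_arg.status_change_nid = nid;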
> {
> int nid = zone_to_nid(zone);
>
> arg->status_change_nid = NUMA_NO_NODE;
> - arg->status_change_nid_normal = NUMA_NO_NODE;
>
> if (!node_state(nid, N_MEMORY))
> arg->status_change_nid = nid;
> - if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
> - arg->status_change_nid_normal = nid;
> }
>
> -static void node_states_set_node(int node, struct memory_notify *arg)
> +static void node_states_set_node(int node, struct node_notify *arg)
This one too.
> {
> - if (arg->status_change_nid_normal >= 0)
> - node_set_state(node, N_NORMAL_MEMORY);
> -
> if (arg->status_change_nid >= 0)
> node_set_state(node, N_MEMORY);
> }
> @@ -1177,7 +1171,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
> int need_zonelists_rebuild = 0;
> const int nid = zone_to_nid(zone);
> int ret;
> - struct memory_notify arg;
> + struct memory_notify mem_arg;
> + struct node_notify node_arg;
> + bool cancel_mem_notifier_on_err = false, cancel_node_notifier_on_err = false;
>
> /*
> * {on,off}lining is constrained to full memory sections (or more
> @@ -1194,11 +1190,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
> /* associate pfn range with the zone */
> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
>
> - arg.start_pfn = pfn;
> - arg.nr_pages = nr_pages;
> - node_states_check_changes_online(nr_pages, zone, &arg);
> + mem_arg.start_pfn = pfn;
> + mem_arg.nr_pages = nr_pages;
You could move these two lines down to where the rest of mem_arg is filled.
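I.e., fill mem_arg in one place right before the notification, roughly:

	mem_arg.start_pfn = pfn;
	mem_arg.nr_pages = nr_pages;
	mem_arg.status_change_nid = node_arg.status_change_nid;
	ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);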
> + node_states_check_changes_online(nr_pages, zone, &node_arg);
> +
> + if (node_arg.status_change_nid >= 0) {
> + /* Node is becoming memory aware. Notify consumers */
> + cancel_node_notifier_on_err = true;
> + ret = node_notify(NODE_BECOMING_MEM_AWARE, &node_arg);
> + ret = notifier_to_errno(ret);
> + if (ret)
> + goto failed_addition;
> + }
>
> - ret = memory_notify(MEM_GOING_ONLINE, &arg);
> + cancel_mem_notifier_on_err = true;
> + mem_arg.status_change_nid = node_arg.status_change_nid;
> + ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
> ret = notifier_to_errno(ret);
> if (ret)
> goto failed_addition;
> - node_states_set_node(nid, &arg);
> + node_states_set_node(nid, &node_arg);
> if (need_zonelists_rebuild)
> build_all_zonelists(NULL);
>
> @@ -1245,16 +1252,26 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
> kswapd_run(nid);
> kcompactd_run(nid);
>
> + if (node_arg.status_change_nid >= 0)
> + /*
> + * Node went from memoryless to have memory. Notifiy interested
typo Notifiy -> Notify
"to have memory" /s "to having memory"
> + * consumers
> + */
> + node_notify(NODE_BECAME_MEM_AWARE, &node_arg);
> +
> writeback_set_ratelimit();
>
> - memory_notify(MEM_ONLINE, &arg);
> + memory_notify(MEM_ONLINE, &mem_arg);
> return 0;
>
> failed_addition:
> pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
> (unsigned long long) pfn << PAGE_SHIFT,
> (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
> - memory_notify(MEM_CANCEL_ONLINE, &arg);
> + if (cancel_mem_notifier_on_err)
> + memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
> + if (cancel_node_notifier_on_err)
> + node_notify(NODE_CANCEL_MEM_AWARE, &node_arg);
> remove_pfn_range_from_zone(zone, pfn, nr_pages);
> return ret;
> }
> @@ -1888,48 +1905,29 @@ early_param("movable_node", cmdline_parse_movable_node);
>
> /* check which state of node_states will be changed when offline memory */
> static void node_states_check_changes_offline(unsigned long nr_pages,
> - struct zone *zone, struct memory_notify *arg)
> + struct zone *zone, struct node_notify *arg)
> {
> struct pglist_data *pgdat = zone->zone_pgdat;
> unsigned long present_pages = 0;
> enum zone_type zt;
>
> arg->status_change_nid = NUMA_NO_NODE;
> - arg->status_change_nid_normal = NUMA_NO_NODE;
>
> /*
> - * Check whether node_states[N_NORMAL_MEMORY] will be changed.
> - * If the memory to be offline is within the range
> - * [0..ZONE_NORMAL], and it is the last present memory there,
> - * the zones in that range will become empty after the offlining,
> - * thus we can determine that we need to clear the node from
> - * node_states[N_NORMAL_MEMORY].
> + * Here we count the possible pages within the range [0..ZONE_MOVABLE].
> + * If after having accounted all the pages, we see that the nr_pages to
> + * be offlined is over or equal to the accounted pages, we know that the
> + * node will become empty, ans so, we can clear it for N_MEMORY.
typo ans -> and
"over or equal to the accounted" -> "greater than or equal to the accounted"
> */
> - for (zt = 0; zt <= ZONE_NORMAL; zt++)
> + for (zt = 0; zt <= ZONE_MOVABLE; zt++)
> present_pages += pgdat->node_zones[zt].present_pages;
> - if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
> - arg->status_change_nid_normal = zone_to_nid(zone);
> -
> - /*
> - * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
> - * does not apply as we don't support 32bit.
> - * Here we count the possible pages from ZONE_MOVABLE.
> - * If after having accounted all the pages, we see that the nr_pages
> - * to be offlined is over or equal to the accounted pages,
> - * we know that the node will become empty, and so, we can clear
> - * it for N_MEMORY as well.
> - */
> - present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
>
> if (nr_pages >= present_pages)
> arg->status_change_nid = zone_to_nid(zone);
> }
>
> -static void node_states_clear_node(int node, struct memory_notify *arg)
> +static void node_states_clear_node(int node, struct node_notify *arg)
> {
> - if (arg->status_change_nid_normal >= 0)
> - node_clear_state(node, N_NORMAL_MEMORY);
> -
> if (arg->status_change_nid >= 0)
> node_clear_state(node, N_MEMORY);
> }
> @@ -1953,7 +1951,9 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
> unsigned long pfn, managed_pages, system_ram_pages = 0;
> const int node = zone_to_nid(zone);
> unsigned long flags;
> - struct memory_notify arg;
> + struct memory_notify mem_arg;
> + struct node_notify node_arg;
> + bool cancel_mem_notifier_on_err = false, cancel_node_notifier_on_err = false;
> char *reason;
> int ret;
>
> @@ -2012,11 +2012,21 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
> goto failed_removal_pcplists_disabled;
> }
>
> - arg.start_pfn = start_pfn;
> - arg.nr_pages = nr_pages;
> - node_states_check_changes_offline(nr_pages, zone, &arg);
> + mem_arg.start_pfn = start_pfn;
> + mem_arg.nr_pages = nr_pages;
> + node_states_check_changes_offline(nr_pages, zone, &node_arg);
> +
> + if (node_arg.status_change_nid >= 0) {
> + cancel_node_notifier_on_err = true;
> + ret = node_notify(NODE_BECOMING_MEMORYLESS, &node_arg);
> + ret = notifier_to_errno(ret);
> + if (ret)
> + goto failed_removal_isolated;
> + }
>
> - ret = memory_notify(MEM_GOING_OFFLINE, &arg);
> + cancel_mem_notifier_on_err = true;
> + mem_arg.status_change_nid = node_arg.status_change_nid;
> + ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
> ret = notifier_to_errno(ret);
> if (ret) {
> reason = "notifier failure";
> @@ -2096,27 +2106,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
> * Make sure to mark the node as memory-less before rebuilding the zone
> * list. Otherwise this node would still appear in the fallback lists.
> */
> - node_states_clear_node(node, &arg);
> + node_states_clear_node(node, &node_arg);
> if (!populated_zone(zone)) {
> zone_pcp_reset(zone);
> build_all_zonelists(NULL);
> }
>
> - if (arg.status_change_nid >= 0) {
> + if (node_arg.status_change_nid >= 0) {
> kcompactd_stop(node);
> kswapd_stop(node);
> + /* Node went memoryless. Notifiy interested consumers */
typo Notifiy -> Notify
> + node_notify(NODE_BECAME_MEMORYLESS, &node_arg);
> }
>
> writeback_set_ratelimit();
>
> - memory_notify(MEM_OFFLINE, &arg);
> + memory_notify(MEM_OFFLINE, &mem_arg);
> remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
> return 0;
>
> failed_removal_isolated:
> /* pushback to free area */
> undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
> - memory_notify(MEM_CANCEL_OFFLINE, &arg);
> + if (cancel_mem_notifier_on_err)
> + memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
> + if (cancel_node_notifier_on_err)
> + node_notify(NODE_CANCEL_MEMORYLESS, &node_arg);
> failed_removal_pcplists_disabled:
> lru_cache_enable();
> zone_pcp_enable(zone);
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index f43951668c41..b3ad63fb3a2b 100644
Thanks,
Alok
On Fri, May 02, 2025 at 10:36:23AM +0200, Oscar Salvador wrote:
> There are at least six consumers of hotplug_memory_notifier that what they
> really are interested in is whether any numa node changed its state, e.g: going
> from being memory aware to becoming memoryless and vice versa.
>
> Implement a specific notifier for numa nodes when their state gets changed,
> and have those consumers that only care about numa node state changes use it.
>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index f43951668c41..b3ad63fb3a2b 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -3591,20 +3591,20 @@ static int wi_node_notifier(struct notifier_block *nb,
> unsigned long action, void *data)
> {
> int err;
> - struct memory_notify *arg = data;
> + struct node_notify *arg = data;
> int nid = arg->status_change_nid;
>
> if (nid < 0)
> return NOTIFY_OK;
>
> switch (action) {
> - case MEM_ONLINE:
> + case NODE_BECAME_MEM_AWARE:
> err = sysfs_wi_node_add(nid);
> if (err)
> pr_err("failed to add sysfs for node%d during hotplug: %d\n",
> nid, err);
> break;
May I suggest rolling this patch in with this change:
https://lore.kernel.org/linux-mm/aAij2oUCP1zmcoPv@stanley.mountain/
It seems to fix the underlying problem, and returning an error now makes
sense given the change.
+cc: Honggyu Kim, Dan Carpenter
> - case MEM_OFFLINE:
> + case NODE_BECAME_MEMORYLESS:
> sysfs_wi_node_delete(nid);
> break;
> }
> @@ -3639,7 +3639,7 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
> }
> }
>
> - hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
> + hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
> return 0;
>
> err_cleanup_kobj:
Hi Gregory and Oscar,
On 5/3/2025 12:28 AM, Gregory Price wrote:
> On Fri, May 02, 2025 at 10:36:23AM +0200, Oscar Salvador wrote:
>> There are at least six consumers of hotplug_memory_notifier that what they
>> really are interested in is whether any numa node changed its state, e.g: going
>> from being memory aware to becoming memoryless and vice versa.
>>
>> Implement a specific notifier for numa nodes when their state gets changed,
>> and have those consumers that only care about numa node state changes use it.
>>
>> Signed-off-by: Oscar Salvador <osalvador@suse.de>
>> Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>>
>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>> index f43951668c41..b3ad63fb3a2b 100644
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -3591,20 +3591,20 @@ static int wi_node_notifier(struct notifier_block *nb,
>> unsigned long action, void *data)
>> {
>> int err;
>> - struct memory_notify *arg = data;
>> + struct node_notify *arg = data;
>> int nid = arg->status_change_nid;
>>
>> if (nid < 0)
>> return NOTIFY_OK;
>>
>> switch (action) {
>> - case MEM_ONLINE:
>> + case NODE_BECAME_MEM_AWARE:
>> err = sysfs_wi_node_add(nid);
>> if (err)
>> pr_err("failed to add sysfs for node%d during hotplug: %d\n",
>> nid, err);
>> break;
>
> May I suggest rolling this patch in with this change:
> https://lore.kernel.org/linux-mm/aAij2oUCP1zmcoPv@stanley.mountain/
>
> seems to fix the underlying problem, and returning an error now makes
> sense given the change.
The 'err' of sysfs_wi_node_add() wasn't propagated to its caller before this
change, as discussed with David at the following link:
https://lore.kernel.org/198f2cbe-b1cb-4239-833e-9aac33d978fa@redhat.com
But as Gregory mentioned, we can now pass 'err' back with this numa node
notifier, so for this hunk, shouldn't we add the following change on top of
this?
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3a7717e09506..3073ebd4e7ee 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3792,7 +3792,7 @@ static int sysfs_wi_node_add(int nid)
static int wi_node_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
- int err;
+ int err = 0;
struct node_notify *arg = data;
int nid = arg->status_change_nid;
@@ -3811,7 +3811,7 @@ static int wi_node_notifier(struct notifier_block *nb,
break;
}
- return NOTIFY_OK;
+ return notifier_from_errno(err);
}
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
>
> +cc: Honggyu Kim, Dan Carpenter
Thanks for cc-ing me into this thread.
Honggyu
>
>> - case MEM_OFFLINE:
>> + case NODE_BECAME_MEMORYLESS:
>> sysfs_wi_node_delete(nid);
>> break;
>> }
>> @@ -3639,7 +3639,7 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
>> }
>> }
>>
>> - hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
>> + hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
>> return 0;
>>
>> err_cleanup_kobj:
On Wed, May 28, 2025 at 11:23:53AM +0900, Honggyu Kim wrote:
> The 'err' of sysfs_wi_node_add() wasn't propagated to its caller before
> this change as discussed with David at the following.
> https://lore.kernel.org/198f2cbe-b1cb-4239-833e-9aac33d978fa@redhat.com
>
> But as Gregory mentioned, we can pass 'err' now with this numa node notifier
> so for this hunk, shouldn't we add the following change on top of this?
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 3a7717e09506..3073ebd4e7ee 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -3792,7 +3792,7 @@ static int sysfs_wi_node_add(int nid)
> static int wi_node_notifier(struct notifier_block *nb,
> unsigned long action, void *data)
> {
> - int err;
> + int err = 0;
> struct node_notify *arg = data;
> int nid = arg->status_change_nid;
>
> @@ -3811,7 +3811,7 @@ static int wi_node_notifier(struct notifier_block *nb,
> break;
> }
>
> - return NOTIFY_OK;
> + return notifier_from_errno(err);
I do not think so.
NODE_BECAME_MEM_AWARE is already too late to react.
We only tell the system that __now__ we are in this state, so there is
no rollback possible from this point forward.
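For context, the relevant flow from the patch in online_pages() is roughly:

	/* only the "becoming" notification can veto the operation */
	ret = node_notify(NODE_BECOMING_MEM_AWARE, &node_arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;
	...
	/* runs after the node has already gained memory; the chain's return
	 * value is not checked, so notifier_from_errno() would have no
	 * effect here */
	node_notify(NODE_BECAME_MEM_AWARE, &node_arg);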
--
Oscar Salvador
SUSE Labs
On Fri, May 02, 2025 at 11:28:02AM -0400, Gregory Price wrote:
> On Fri, May 02, 2025 at 10:36:23AM +0200, Oscar Salvador wrote:
> > There are at least six consumers of hotplug_memory_notifier that what they
> > really are interested in is whether any numa node changed its state, e.g: going
> > from being memory aware to becoming memoryless and vice versa.
> >
> > Implement a specific notifier for numa nodes when their state gets changed,
> > and have those consumers that only care about numa node state changes use it.
> >
> > Signed-off-by: Oscar Salvador <osalvador@suse.de>
> > Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> >
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index f43951668c41..b3ad63fb3a2b 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -3591,20 +3591,20 @@ static int wi_node_notifier(struct notifier_block *nb,
> > unsigned long action, void *data)
> > {
> > int err;
> > - struct memory_notify *arg = data;
> > + struct node_notify *arg = data;
> > int nid = arg->status_change_nid;
> >
> > if (nid < 0)
> > return NOTIFY_OK;
> >
> > switch (action) {
> > - case MEM_ONLINE:
> > + case NODE_BECAME_MEM_AWARE:
> > err = sysfs_wi_node_add(nid);
> > if (err)
> > pr_err("failed to add sysfs for node%d during hotplug: %d\n",
> > nid, err);
> > break;
>
> May I suggest rolling this patch in with this change:
> https://lore.kernel.org/linux-mm/aAij2oUCP1zmcoPv@stanley.mountain/
>
> seems to fix the underlying problem, and returning an error now makes
> sense given the change.
>
> +cc: Honggyu Kim, Dan Carpenter
>
Oops. I sent a v2 of that patch. I imagine that Andrew will fold that
patch into Rakie Kim's patch.
https://lore.kernel.org/all/aBjL7Bwc0QBzgajK@stanley.mountain
regards,
dan carpenter