[PATCH v9 2/2] mm/vmscan: select the closest preferred node in demote_folio_list()

Bing Jiao posted 2 patches 3 weeks, 4 days ago
[PATCH v9 2/2] mm/vmscan: select the closest preferred node in demote_folio_list()
Posted by Bing Jiao 3 weeks, 4 days ago
The preferred demotion node (migration_target_control.nid) should be the
one closest to the source node to minimize migration latency.  Currently,
a discrepancy exists where demote_folio_list() randomly selects an allowed
node if the preferred node from next_demotion_node() is not set in
mems_effective.

To address this, update next_demotion_node() to take the allowed node mask
and select a preferred target from among the allowed nodes.  If none of the
preferred nodes is in mems_effective, next_demotion_node() returns the
closest allowed demotion target instead.

This ensures that the preferred demotion target is consistently the closest
available node to the source node.

Signed-off-by: Bing Jiao <bingjiao@google.com>
---
v7 -> v8:
Fix bugs in v7.
Remove the while loop of getting the preferred node via
next_demotion_node().
Use find_next_best_node() to find the closest demotion target.

v8 -> v9:
Move allowed node checks and identification of the closest demotion
target into next_demotion_node() for better function splitting.

 include/linux/memory-tiers.h |  6 +++---
 mm/memory-tiers.c            | 21 ++++++++++++++++-----
 mm/vmscan.c                  |  5 ++---
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 7a805796fcfd..96987d9d95a8 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -53,11 +53,11 @@ struct memory_dev_type *mt_find_alloc_memory_type(int adist,
 						  struct list_head *memory_types);
 void mt_put_memory_types(struct list_head *memory_types);
 #ifdef CONFIG_MIGRATION
-int next_demotion_node(int node);
+int next_demotion_node(int node, const nodemask_t *allowed_mask);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
 bool node_is_toptier(int node);
 #else
-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, const nodemask_t *allowed_mask)
 {
 	return NUMA_NO_NODE;
 }
@@ -101,7 +101,7 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt

 }

-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, const nodemask_t *allowed_mask)
 {
 	return NUMA_NO_NODE;
 }
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 864811fff409..2d6c3754e6a8 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
 /**
  * next_demotion_node() - Get the next node in the demotion path
  * @node: The starting node to lookup the next node
+ * @allowed_mask: The pointer to allowed node mask
  *
  * Return: node id for next memory node in the demotion path hierarchy
  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
  * @node online or guarantee that it *continues* to be the next demotion
  * target.
  */
-int next_demotion_node(int node)
+int next_demotion_node(int node, const nodemask_t *allowed_mask)
 {
 	struct demotion_nodes *nd;
-	int target;
+	nodemask_t mask;

 	if (!node_demotion)
 		return NUMA_NO_NODE;
@@ -344,6 +345,10 @@ int next_demotion_node(int node)
 	 * node_demotion[] reads need to be consistent.
 	 */
 	rcu_read_lock();
+	/* Filter out nodes that are not in allowed_mask. */
+	nodes_and(mask, nd->preferred, *allowed_mask);
+	rcu_read_unlock();
+
 	/*
 	 * If there are multiple target nodes, just select one
 	 * target node randomly.
@@ -356,10 +361,16 @@ int next_demotion_node(int node)
 	 * caching issue, which seems more complicated. So selecting
 	 * target node randomly seems better until now.
 	 */
-	target = node_random(&nd->preferred);
-	rcu_read_unlock();
+	if (!nodes_empty(mask))
+		return node_random(&mask);

-	return target;
+	/*
+	 * None of the preferred nodes is in allowed_mask. Flip the
+	 * bits of allowed_mask to build the used-node mask, then use
+	 * it to find the closest demotion target.
+	 */
+	nodes_complement(mask, *allowed_mask);
+	return find_next_best_node(node, &mask);
 }

 static void disable_all_demotion_targets(void)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5ea1dd2b8cce..7a631de46064 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1048,12 +1048,11 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 	if (nodes_empty(allowed_mask))
 		return 0;

-	target_nid = next_demotion_node(pgdat->node_id);
+	target_nid = next_demotion_node(pgdat->node_id, &allowed_mask);
 	if (target_nid == NUMA_NO_NODE)
 		/* No lower-tier nodes or nodes were hot-unplugged. */
 		return 0;
-	if (!node_isset(target_nid, allowed_mask))
-		target_nid = node_random(&allowed_mask);
+
 	mtc.nid = target_nid;

 	/* Demotion ignores all cpuset and mempolicy settings */
--
2.52.0.457.g6b5491de43-goog
Re: [PATCH v9 2/2] mm/vmscan: select the closest preferred node in demote_folio_list()
Posted by Shakeel Butt 2 days, 9 hours ago
On Wed, Jan 14, 2026 at 08:53:03PM +0000, Bing Jiao wrote:
> The preferred demotion node (migration_target_control.nid) should be the
> one closest to the source node to minimize migration latency.  Currently,
> a discrepancy exists where demote_folio_list() randomly selects an allowed
> node if the preferred node from next_demotion_node() is not set in
> mems_effective.
> 
> To address it, update next_demotion_node() to select a preferred target
> against allowed nodes; and to return the closest demotion target if all
> preferred nodes are not in mems_effective via next_demotion_node().
> 
> It ensures that the preferred demotion target is consistently the closest
> available node to the source node.
> 
> Signed-off-by: Bing Jiao <bingjiao@google.com>

One nit below:

Acked-by: Shakeel Butt <shakeel.butt@linux.dev>

[...]

> @@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>  /**
>   * next_demotion_node() - Get the next node in the demotion path
>   * @node: The starting node to lookup the next node
> + * @allowed_mask: The pointer to allowed node mask
>   *
>   * Return: node id for next memory node in the demotion path hierarchy
>   * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
>   * @node online or guarantee that it *continues* to be the next demotion
>   * target.
>   */
> -int next_demotion_node(int node)
> +int next_demotion_node(int node, const nodemask_t *allowed_mask)
>  {
>  	struct demotion_nodes *nd;
> -	int target;
> +	nodemask_t mask;
> 
>  	if (!node_demotion)
>  		return NUMA_NO_NODE;
> @@ -344,6 +345,10 @@ int next_demotion_node(int node)
>  	 * node_demotion[] reads need to be consistent.
>  	 */
>  	rcu_read_lock();
> +	/* Filter out nodes that are not in allowed_mask. */
> +	nodes_and(mask, nd->preferred, *allowed_mask);
> +	rcu_read_unlock();
> +
>  	/*
>  	 * If there are multiple target nodes, just select one
>  	 * target node randomly.
> @@ -356,10 +361,16 @@ int next_demotion_node(int node)
>  	 * caching issue, which seems more complicated. So selecting
>  	 * target node randomly seems better until now.
>  	 */
> -	target = node_random(&nd->preferred);
> -	rcu_read_unlock();
> +	if (!nodes_empty(mask))
> +		return node_random(&mask);
> 
> -	return target;
> +	/*
> +	 * Preferred nodes are not in allowed_mask. Filp bits in

Filp -> Flip

> +	 * allowed_mask as used node mask. Then, use it to get the
> +	 * closest demotion target.
> +	 */
> +	nodes_complement(mask, *allowed_mask);
> +	return find_next_best_node(node, &mask);
>  }
>