[RFC PATCH 2/4] mm/damon/paddr: Add DAMOS_INTERLEAVE action

Posted by Bijan Tabatabai 4 months ago
From: Bijan Tabatabai <bijantabatab@micron.com>

This patch adds the DAMOS_INTERLEAVE action, which interleaves the pages
within a given region according to the weights in the iw_table. To reuse
the existing weighted-interleave code, the target nid for each folio is
determined by calling policy_nodemask(); as a result, only folios
belonging to processes using the MPOL_WEIGHTED_INTERLEAVE mempolicy have
their pages migrated.
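
For intuition, weighted interleaving assigns consecutive pages to nodes
in runs proportional to the per-node weights. The standalone program
below only illustrates that arithmetic (the function name and layout are
made up for this example; in the kernel the mapping is resolved by
policy_nodemask() from a folio's interleave index):

  #include <stdio.h>

  /* Toy model: page i goes to the node that owns slot (i % total_weight)
   * in the concatenated weight table. */
  static int weighted_interleave_node(unsigned long ilx,
                                      const unsigned int *weights,
                                      int nr_nodes)
  {
          unsigned int total = 0, slot;
          int nid;

          for (nid = 0; nid < nr_nodes; nid++)
                  total += weights[nid];
          slot = ilx % total;
          for (nid = 0; nid < nr_nodes; nid++) {
                  if (slot < weights[nid])
                          return nid;
                  slot -= weights[nid];
          }
          return -1; /* unreachable when total > 0 */
  }

  int main(void)
  {
          unsigned int weights[] = { 2, 1 }; /* node0:node1 = 2:1 */

          for (unsigned long i = 0; i < 6; i++)
                  printf("page %lu -> node %d\n", i,
                         weighted_interleave_node(i, weights, 2));
          return 0;
  }

With weights 2:1 this prints nodes 0, 0, 1, 0, 0, 1, i.e. two thirds of
the pages land on node 0.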

Below is an example of its usage, where pages are initially interleaved at
a 1:1 ratio and then re-interleaved at a 2:1 ratio. The alloc_data program
simply allocates 1GB of data and then sleeps.
  $ cd /sys/kernel/mm/damon/admin/kdamonds/0
  $ sudo cat ./contexts/0/schemes/0/action
  interleave
  $ echo 1 | sudo tee /sys/kernel/mm/mempolicy/weighted_interleave/node0
  $ echo 1 | sudo tee /sys/kernel/mm/mempolicy/weighted_interleave/node1
  $ numactl -w 0,1 ~/alloc_data 1G &
  $ numastat -c -p alloc_data

  Per-node process memory usage (in MBs) for PID 18473 (alloc_data)
           Node 0 Node 1 Total
           ------ ------ -----
  Huge          0      0     0
  Heap          0      0     0
  Stack         0      0     0
  Private     514    514  1027
  -------  ------ ------ -----
  Total       514    514  1028
  $ echo 2 | sudo tee /sys/kernel/mm/mempolicy/weighted_interleave/node0
  $ numastat -c -p alloc_data

  Per-node process memory usage (in MBs) for PID 18473 (alloc_data)
           Node 0 Node 1 Total
           ------ ------ -----
  Huge          0      0     0
  Heap          0      0     0
  Stack         0      0     0
  Private     684    343  1027
  -------  ------ ------ -----
  Total       684    343  1027
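
  (Sanity check on the arithmetic: with weights 2:1, node 0 should hold
  about 1027 * 2/3 ~= 685 MB and node 1 about 1027 / 3 ~= 342 MB, which
  matches the 684/343 split reported above.)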

Signed-off-by: Bijan Tabatabai <bijantabatab@micron.com>
---
 Documentation/mm/damon/design.rst |   2 +
 include/linux/damon.h             |   2 +
 mm/damon/paddr.c                  | 112 ++++++++++++++++++++++++++++++
 mm/damon/sysfs-schemes.c          |   1 +
 4 files changed, 117 insertions(+)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index ddc50db3afa4..c50d2105cea0 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -455,6 +455,8 @@ that supports each action are as below.
    Supported by ``paddr`` operations set.
  - ``migrate_cold``: Migrate the regions prioritizing colder regions.
    Supported by ``paddr`` operations set.
+ - ``interleave``: Interleave the regions according to the weighted interleave weights.
+   Supported by ``paddr`` operations set.
  - ``stat``: Do nothing but count the statistics.
    Supported by all operations sets.
 
diff --git a/include/linux/damon.h b/include/linux/damon.h
index a4011726cb3b..81d26a203337 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -117,6 +117,7 @@ struct damon_target {
  * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
  * @DAMOS_MIGRATE_HOT:  Migrate the regions prioritizing warmer regions.
  * @DAMOS_MIGRATE_COLD:	Migrate the regions prioritizing colder regions.
+ * @DAMOS_INTERLEAVE: Interleave the regions by the weighted interleave ratio
  * @DAMOS_STAT:		Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
  *
@@ -136,6 +137,7 @@ enum damos_action {
 	DAMOS_LRU_DEPRIO,
 	DAMOS_MIGRATE_HOT,
 	DAMOS_MIGRATE_COLD,
+	DAMOS_INTERLEAVE,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 	NR_DAMOS_ACTIONS,
 };
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 4102a8c5f992..e989464635cd 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -535,6 +535,114 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
 	return applied * PAGE_SIZE;
 }
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_NUMA)
+struct damos_interleave_private {
+	struct list_head *folio_migration_list;
+	bool putback_lru;
+};
+
+static bool damon_pa_interleave_rmap(struct folio *folio, struct vm_area_struct *vma,
+		unsigned long addr, void *arg)
+{
+	struct mempolicy *pol;
+	struct task_struct *task;
+	pgoff_t ilx;
+	int target_nid;
+	struct damos_interleave_private *priv = arg;
+
+	task = rcu_dereference(vma->vm_mm->owner);
+	if (!task)
+		return true;
+
+	pol = get_task_policy(task);
+	if (!pol)
+		return true;
+
+	/* Getting the interleave weights only makes sense with MPOL_WEIGHTED_INTERLEAVE */
+	if (pol->mode != MPOL_WEIGHTED_INTERLEAVE) {
+		mpol_cond_put(pol);
+		return true;
+	}
+
+	ilx = vma->vm_pgoff >> folio_order(folio);
+	ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + folio_order(folio));
+	policy_nodemask(0, pol, ilx, &target_nid);
+
+	if (target_nid != NUMA_NO_NODE && folio_nid(folio) != target_nid) {
+		list_add(&folio->lru, &priv->folio_migration_list[target_nid]);
+		priv->putback_lru = false;
+	}
+
+	mpol_cond_put(pol);
+	return false;
+}
+
+static unsigned long damon_pa_interleave(struct damon_region *r, struct damos *s,
+		unsigned long *sz_filter_passed)
+{
+	struct damos_interleave_private priv;
+	struct rmap_walk_control rwc;
+	unsigned long addr, applied;
+	struct folio *folio;
+
+	priv.folio_migration_list = kmalloc_array(nr_node_ids, sizeof(struct list_head),
+		GFP_KERNEL);
+	if (!priv.folio_migration_list)
+		return 0;
+
+	for (int i = 0; i < nr_node_ids; i++)
+		INIT_LIST_HEAD(&priv.folio_migration_list[i]);
+
+	memset(&rwc, 0, sizeof(struct rmap_walk_control));
+	rwc.rmap_one = damon_pa_interleave_rmap;
+	rwc.arg = &priv;
+
+	addr = r->ar.start;
+	while (addr < r->ar.end) {
+		folio = damon_get_folio(PHYS_PFN(addr));
+
+		if (damon_pa_invalid_damos_folio(folio, s)) {
+			addr += PAGE_SIZE;
+			continue;
+		}
+
+		if (damos_pa_filter_out(s, folio))
+			goto put_folio;
+		else
+			*sz_filter_passed += folio_size(folio);
+
+		if (!folio_isolate_lru(folio))
+			goto put_folio;
+
+		priv.putback_lru = true;
+		rmap_walk(folio, &rwc);
+
+		if (priv.putback_lru)
+			folio_putback_lru(folio);
+
+put_folio:
+		addr += folio_size(folio);
+		folio_put(folio);
+	}
+
+	applied = 0;
+	for (int i = 0; i < nr_node_ids; i++) {
+		applied += damon_pa_migrate_pages(&priv.folio_migration_list[i], i);
+		cond_resched();
+	}
+
+	kfree(priv.folio_migration_list);
+	s->last_applied = folio;
+	return applied * PAGE_SIZE;
+}
+#else
+static unsigned long damon_pa_interleave(struct damon_region *r, struct damos *s,
+		unsigned long *sz_filter_passed)
+{
+	return 0;
+}
+#endif /* defined(CONFIG_MEMCG) && defined(CONFIG_NUMA) */
+
 static bool damon_pa_scheme_has_filter(struct damos *s)
 {
 	struct damos_filter *f;
@@ -584,6 +692,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 	case DAMOS_MIGRATE_HOT:
 	case DAMOS_MIGRATE_COLD:
 		return damon_pa_migrate(r, scheme, sz_filter_passed);
+	case DAMOS_INTERLEAVE:
+		return damon_pa_interleave(r, scheme, sz_filter_passed);
 	case DAMOS_STAT:
 		return damon_pa_stat(r, scheme, sz_filter_passed);
 	default:
@@ -608,6 +718,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
 		return damon_hot_score(context, r, scheme);
 	case DAMOS_MIGRATE_COLD:
 		return damon_cold_score(context, r, scheme);
+	case DAMOS_INTERLEAVE:
+		return damon_hot_score(context, r, scheme);
 	default:
 		break;
 	}
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 0f6c9e1fec0b..12a32e066d06 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1597,6 +1597,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
 	"lru_deprio",
 	"migrate_hot",
 	"migrate_cold",
+	"interleave",
 	"stat",
 };
 
-- 
2.43.5

Re: [RFC PATCH 2/4] mm/damon/paddr: Add DAMOS_INTERLEAVE action
Posted by David Hildenbrand 3 months, 4 weeks ago
On 12.06.25 20:13, Bijan Tabatabai wrote:
> [...]
> +static bool damon_pa_interleave_rmap(struct folio *folio, struct vm_area_struct *vma,
> +		unsigned long addr, void *arg)
> +{
> +	struct mempolicy *pol;
> +	struct task_struct *task;
> +	pgoff_t ilx;
> +	int target_nid;
> +	struct damos_interleave_private *priv = arg;
> +
> +	task = rcu_dereference(vma->vm_mm->owner);
> +	if (!task)
> +		return true;
> +
> +	pol = get_task_policy(task);
> +	if (!pol)
> +		return true;

Why is this not using get_vma_policy(), which will fall back to the task
policy in case there is no per-vma policy?
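
For reference, a simplified sketch of that fallback order (condensed from
mm/mempolicy.c; locking, refcounting and error handling elided):

  struct mempolicy *pol = NULL;

  /* A shared/file mapping may provide its own policy... */
  if (vma->vm_ops && vma->vm_ops->get_policy)
          pol = vma->vm_ops->get_policy(vma, addr, &ilx);
  /* ...otherwise use a policy set on the VMA via mbind()... */
  else if (vma->vm_policy)
          pol = vma->vm_policy;
  /* ...and only then fall back to the task policy. */
  if (!pol)
          pol = get_task_policy(current);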

-- 
Cheers,

David / dhildenb