kmigrated is a per-node kernel thread that migrates the
folios marked for migration in batches. Each kmigrated
thread walks the PFN range spanning its node and checks
for potential migration candidates.
It depends on the fields added to the extended page flags
to determine which pages need to be migrated and their
target NID.
Signed-off-by: Bharata B Rao <bharata@amd.com>
---
include/linux/mmzone.h | 5 +
include/linux/page_ext.h | 17 +++
mm/Makefile | 3 +-
mm/kmigrated.c | 223 +++++++++++++++++++++++++++++++++++++++
mm/mm_init.c | 6 ++
mm/page_ext.c | 11 ++
6 files changed, 264 insertions(+), 1 deletion(-)
create mode 100644 mm/kmigrated.c
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 283913d42d7b..5d7f0b8d3c91 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -853,6 +853,8 @@ enum zone_type {
};
+int kmigrated_add_pfn(unsigned long pfn, int nid);
+
#ifndef __GENERATING_BOUNDS_H
#define ASYNC_AND_SYNC 2
@@ -1049,6 +1051,7 @@ enum pgdat_flags {
* many pages under writeback
*/
PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
+ PGDAT_KMIGRATED_ACTIVATE, /* activates kmigrated */
};
enum zone_flags {
@@ -1493,6 +1496,8 @@ typedef struct pglist_data {
#ifdef CONFIG_MEMORY_FAILURE
struct memory_failure_stats mf_stats;
#endif
+ struct task_struct *kmigrated;
+ wait_queue_head_t kmigrated_wait;
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 76c817162d2f..4300c9dbafec 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -40,8 +40,25 @@ enum page_ext_flags {
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
#endif
+ /*
+ * 32 bits following this are used by the migrator.
+ * The next available bit position is 33.
+ */
+ PAGE_EXT_MIGRATE_READY,
};
+#define PAGE_EXT_MIG_NID_WIDTH 10
+#define PAGE_EXT_MIG_FREQ_WIDTH 3
+#define PAGE_EXT_MIG_TIME_WIDTH 18
+
+#define PAGE_EXT_MIG_NID_SHIFT (PAGE_EXT_MIGRATE_READY + 1)
+#define PAGE_EXT_MIG_FREQ_SHIFT (PAGE_EXT_MIG_NID_SHIFT + PAGE_EXT_MIG_NID_WIDTH)
+#define PAGE_EXT_MIG_TIME_SHIFT (PAGE_EXT_MIG_FREQ_SHIFT + PAGE_EXT_MIG_FREQ_WIDTH)
+
+#define PAGE_EXT_MIG_NID_MASK ((1UL << PAGE_EXT_MIG_NID_SHIFT) - 1)
+#define PAGE_EXT_MIG_FREQ_MASK ((1UL << PAGE_EXT_MIG_FREQ_SHIFT) - 1)
+#define PAGE_EXT_MIG_TIME_MASK ((1UL << PAGE_EXT_MIG_TIME_SHIFT) - 1)
+
/*
* Page Extension can be considered as an extended mem_map.
* A page_ext page is associated with every page descriptor. The
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..5a382f19105f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,8 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o \
+ kmigrated.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/kmigrated.c b/mm/kmigrated.c
new file mode 100644
index 000000000000..3caefe4be0e7
--- /dev/null
+++ b/mm/kmigrated.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kmigrated is a kernel thread that runs for each node that has
+ * memory. It iterates over the node's PFNs and migrates pages
+ * marked for migration into their targeted nodes.
+ *
+ * kmigrated depends on PAGE_EXTENSION to find out the pages that
+ * need to be migrated. In addition to a few fields that could be
+ * used by hot page promotion logic to store and evaluate the page
+ * hotness information, the extended page flags field is extended
+ * to store the target NID for migration.
+ */
+#include <linux/mm.h>
+#include <linux/migrate.h>
+#include <linux/cpuhotplug.h>
+#include <linux/page_ext.h>
+
+#define KMIGRATE_DELAY MSEC_PER_SEC
+#define KMIGRATE_BATCH 512
+
+static int page_ext_xchg_nid(struct page_ext *page_ext, int nid)
+{
+ unsigned long old_flags, flags;
+ int old_nid;
+
+ old_flags = READ_ONCE(page_ext->flags);
+ do {
+ flags = old_flags;
+ old_nid = (flags >> PAGE_EXT_MIG_NID_SHIFT) & PAGE_EXT_MIG_NID_MASK;
+
+ flags &= ~(PAGE_EXT_MIG_NID_MASK << PAGE_EXT_MIG_NID_SHIFT);
+ flags |= (nid & PAGE_EXT_MIG_NID_MASK) << PAGE_EXT_MIG_NID_SHIFT;
+ } while (unlikely(!try_cmpxchg(&page_ext->flags, &old_flags, flags)));
+
+ return old_nid;
+}
+
+/*
+ * Marks the page as ready for migration.
+ *
+ * @pfn: PFN of the page
+ * @nid: Target NID to which the page needs to be migrated
+ *
+ * The request for migration is noted by setting PAGE_EXT_MIGRATE_READY
+ * in the extended page flags which the kmigrated thread would check.
+ */
+int kmigrated_add_pfn(unsigned long pfn, int nid)
+{
+ struct page *page;
+ struct page_ext *page_ext;
+
+ page = pfn_to_page(pfn);
+ if (!page)
+ return -EINVAL;
+
+ page_ext = page_ext_get(page);
+ if (unlikely(!page_ext))
+ return -EINVAL;
+
+ page_ext_xchg_nid(page_ext, nid);
+ test_and_set_bit(PAGE_EXT_MIGRATE_READY, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ set_bit(PGDAT_KMIGRATED_ACTIVATE, &page_pgdat(page)->flags);
+ return 0;
+}
+
+/*
+ * If the page has been marked ready for migration, return
+ * the NID to which it needs to be migrated.
+ *
+ * If not return NUMA_NO_NODE.
+ */
+static int kmigrated_get_nid(struct page *page)
+{
+ struct page_ext *page_ext;
+ int nid = NUMA_NO_NODE;
+
+ page_ext = page_ext_get(page);
+ if (unlikely(!page_ext))
+ return nid;
+
+ if (!test_and_clear_bit(PAGE_EXT_MIGRATE_READY, &page_ext->flags))
+ goto out;
+
+ nid = page_ext_xchg_nid(page_ext, nid);
+out:
+ page_ext_put(page_ext);
+ return nid;
+}
+
+/*
+ * Walks the PFNs of the zone, isolates and migrates them in batches.
+ */
+static void kmigrated_walk_zone(unsigned long start_pfn, unsigned long end_pfn,
+ int src_nid)
+{
+ int nid, cur_nid = NUMA_NO_NODE;
+ LIST_HEAD(migrate_list);
+ int batch_count = 0;
+ struct folio *folio;
+ struct page *page;
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+
+ if (page_to_nid(page) != src_nid)
+ continue;
+
+ /*
+ * TODO: Take care of folio_nr_pages() increment
+ * to pfn count.
+ */
+ folio = page_folio(page);
+ if (!folio_test_lru(folio))
+ continue;
+
+ nid = kmigrated_get_nid(page);
+ if (nid == NUMA_NO_NODE)
+ continue;
+
+ if (page_to_nid(page) == nid)
+ continue;
+
+ if (migrate_misplaced_folio_prepare(folio, NULL, nid))
+ continue;
+
+ if (cur_nid != NUMA_NO_NODE)
+ cur_nid = nid;
+
+ if (++batch_count >= KMIGRATE_BATCH || cur_nid != nid) {
+ migrate_misplaced_folios_batch(&migrate_list, cur_nid);
+ cur_nid = nid;
+ batch_count = 0;
+ cond_resched();
+ }
+ list_add(&folio->lru, &migrate_list);
+ }
+ if (!list_empty(&migrate_list))
+ migrate_misplaced_folios_batch(&migrate_list, cur_nid);
+}
+
+static void kmigrated_do_work(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int zone_idx;
+
+ clear_bit(PGDAT_KMIGRATED_ACTIVATE, &pgdat->flags);
+ for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
+ zone = &pgdat->node_zones[zone_idx];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_is_zone_device(zone))
+ continue;
+
+ kmigrated_walk_zone(zone->zone_start_pfn, zone_end_pfn(zone),
+ pgdat->node_id);
+ }
+}
+
+static inline bool kmigrated_work_requested(pg_data_t *pgdat)
+{
+ return test_bit(PGDAT_KMIGRATED_ACTIVATE, &pgdat->flags);
+}
+
+static void kmigrated_wait_work(pg_data_t *pgdat)
+{
+ long timeout = msecs_to_jiffies(KMIGRATE_DELAY);
+
+ wait_event_timeout(pgdat->kmigrated_wait,
+ kmigrated_work_requested(pgdat), timeout);
+}
+
+/*
+ * Per-node kthread that iterates over its PFNs and migrates the
+ * pages that have been marked for migration.
+ */
+static int kmigrated(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t *)p;
+
+ while (!kthread_should_stop()) {
+ kmigrated_wait_work(pgdat);
+ kmigrated_do_work(pgdat);
+ }
+ return 0;
+}
+
+static void kmigrated_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (pgdat->kmigrated)
+ return;
+
+ pgdat->kmigrated = kthread_create(kmigrated, pgdat, "kmigrated%d", nid);
+ if (IS_ERR(pgdat->kmigrated)) {
+ pr_err("Failed to start kmigrated for node %d\n", nid);
+ pgdat->kmigrated = NULL;
+ } else {
+ wake_up_process(pgdat->kmigrated);
+ }
+}
+
+static int __init kmigrated_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kmigrated_run(nid);
+
+ return 0;
+}
+
+subsys_initcall(kmigrated_init)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f2944748f526..3a9cfd175366 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1398,6 +1398,11 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
+static void pgdat_init_kmigrated(struct pglist_data *pgdat)
+{
+ init_waitqueue_head(&pgdat->kmigrated_wait);
+}
+
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
int i;
@@ -1407,6 +1412,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
+ pgdat_init_kmigrated(pgdat);
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index c351fdfe9e9a..546725fffddb 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -76,6 +76,16 @@ static struct page_ext_operations page_idle_ops __initdata = {
};
#endif
+static bool need_page_mig(void)
+{
+ return true;
+}
+
+static struct page_ext_operations page_mig_ops __initdata = {
+ .need = need_page_mig,
+ .need_shared_flags = true,
+};
+
static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
@@ -89,6 +99,7 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_TABLE_CHECK
&page_table_check_ops,
#endif
+ &page_mig_ops,
};
unsigned long page_ext_size;
--
2.34.1
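Nothing in this patch calls kmigrated_add_pfn() yet; presumably a page
hotness source elsewhere in the series feeds it. The sketch below only
illustrates the intended calling convention - the surrounding detection
logic and the function name are hypothetical, not part of the patch.

#include <linux/mm.h>
#include <linux/mmzone.h>

/*
 * Hypothetical producer: some page-hotness source has decided that the
 * page at @pfn should be migrated/promoted to @target_nid.
 *
 * kmigrated_add_pfn() records @target_nid in the page's extended flags,
 * sets PAGE_EXT_MIGRATE_READY and raises PGDAT_KMIGRATED_ACTIVATE on the
 * page's pgdat; the node's kmigrated thread then picks the page up on
 * its next wakeup (at most KMIGRATE_DELAY, i.e. one second, later).
 */
static void example_request_migration(unsigned long pfn, int target_nid)
{
	if (kmigrated_add_pfn(pfn, target_nid))
		pr_debug("kmigrated: pfn 0x%lx has no page_ext, request dropped\n",
			 pfn);
}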
On Mon, Jun 16, 2025 at 07:09:30PM +0530, Bharata B Rao wrote:
> kmigrated is a per-node kernel thread that migrates the
> folios marked for migration in batches. Each kmigrated
> thread walks the PFN range spanning its node and checks
> for potential migration candidates.
<snip>
> +/*
> + * Walks the PFNs of the zone, isolates and migrates them in batches.
> + */
> +static void kmigrated_walk_zone(unsigned long start_pfn, unsigned long end_pfn,
> +				int src_nid)
> +{
<snip>
> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {

Hi,

Is it feasible to scan all the pages in each zone? I think we should
figure out a better way so as to reduce CPU time for this purpose.

Besides the opinion above, I was thinking to design and implement a
kthread for memory placement between different tiers - I already named
it e.g. kmplaced, rather than relying on kswapd and hinting fault, lol ;)

Now that you've started, I'd like to think about it together and improve
it so that it works better. Please cc me from the next spin.

	Byungchul
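To put the scanning cost in rough numbers (a back-of-the-envelope
estimate; the node size is illustrative and 4 KiB base pages are
assumed):

  1 TiB of node memory / 4 KiB per page = 2^40 / 2^12 = 2^28
                                        = ~268 million PFNs per node

so one full kmigrated pass over such a node performs on the order of
hundreds of millions of pfn_valid()/pfn_to_online_page() checks, and
with KMIGRATE_DELAY of one second that walk can repeat roughly once per
second while migrate-ready pages keep arriving.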
On 07-Jul-25 3:06 PM, Byungchul Park wrote:
> On Mon, Jun 16, 2025 at 07:09:30PM +0530, Bharata B Rao wrote:
>> +/*
>> + * Walks the PFNs of the zone, isolates and migrates them in batches.
>> + */
>> +static void kmigrated_walk_zone(unsigned long start_pfn, unsigned long end_pfn,
>> +				int src_nid)
>> +{
>> +	int nid, cur_nid = NUMA_NO_NODE;
>> +	LIST_HEAD(migrate_list);
>> +	int batch_count = 0;
>> +	struct folio *folio;
>> +	struct page *page;
>> +	unsigned long pfn;
>> +
>> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>
> Hi,
>
> Is it feasible to scan all the pages in each zone? I think we should
> figure out a better way so as to reduce CPU time for this purpose.

I incorporated a per-zone indicator to inform kmigrated whether it can
skip a whole zone when scanning, so that it looks only at those zones
which have migrate-ready pages. CPU time spent is one aspect, but the
other aspect I have observed is the delay in identifying migrate-ready
pages depending on where they exist in the zone. I have been seeing
both best-case and worst-case behaviors, due to which the number of
pages migrated for a given workload can vary from run to run. Hence
scanning all pages without additional smarts to quickly arrive at the
pages of interest may not be ideal. I am working on approaches to
improve this situation.

> Besides the opinion above, I was thinking to design and implement a
> kthread for memory placement between different tiers - I already named
> it e.g. kmplaced, rather than relying on kswapd and hinting fault, lol ;)
>
> Now that you've started, I'd like to think about it together and improve
> it so that it works better. Please cc me from the next spin.

Sure, will do from next post.

Regards,
Bharata.
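A minimal sketch of the per-zone indicator idea described above; the
flag name, its bit value and the helpers are hypothetical and not part
of the posted patch.

#include <linux/mm.h>
#include <linux/mmzone.h>

/*
 * Sketch only: in a real patch ZONE_KMIGRATED_PENDING would be a new
 * entry in enum zone_flags; a standalone example value is used here.
 */
enum { ZONE_KMIGRATED_PENDING = BITS_PER_LONG - 1 };

/* Producer side: set alongside kmigrated_add_pfn()'s pgdat set_bit(). */
static inline void kmigrated_mark_zone_pending(struct page *page)
{
	set_bit(ZONE_KMIGRATED_PENDING, &page_zone(page)->flags);
}

/* Consumer side: lets kmigrated_do_work() skip zones with nothing queued. */
static inline bool kmigrated_zone_pending(struct zone *zone)
{
	return test_and_clear_bit(ZONE_KMIGRATED_PENDING, &zone->flags);
}

With something like this, kmigrated_walk_zone() would only be invoked
for zones where kmigrated_zone_pending() returns true.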
On Mon, Jun 16, 2025 at 07:09:30PM +0530, Bharata B Rao wrote:
> diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
> index 76c817162d2f..4300c9dbafec 100644
> --- a/include/linux/page_ext.h
> +++ b/include/linux/page_ext.h
> @@ -40,8 +40,25 @@ enum page_ext_flags {
>  PAGE_EXT_YOUNG,
>  PAGE_EXT_IDLE,
>  #endif
> + /*
> +  * 32 bits following this are used by the migrator.
> +  * The next available bit position is 33.
> +  */
> + PAGE_EXT_MIGRATE_READY,
>  };
>
> +#define PAGE_EXT_MIG_NID_WIDTH 10
> +#define PAGE_EXT_MIG_FREQ_WIDTH 3
> +#define PAGE_EXT_MIG_TIME_WIDTH 18
> +
> +#define PAGE_EXT_MIG_NID_SHIFT (PAGE_EXT_MIGRATE_READY + 1)
> +#define PAGE_EXT_MIG_FREQ_SHIFT (PAGE_EXT_MIG_NID_SHIFT + PAGE_EXT_MIG_NID_WIDTH)
> +#define PAGE_EXT_MIG_TIME_SHIFT (PAGE_EXT_MIG_FREQ_SHIFT + PAGE_EXT_MIG_FREQ_WIDTH)
> +
> +#define PAGE_EXT_MIG_NID_MASK ((1UL << PAGE_EXT_MIG_NID_SHIFT) - 1)
> +#define PAGE_EXT_MIG_FREQ_MASK ((1UL << PAGE_EXT_MIG_FREQ_SHIFT) - 1)
> +#define PAGE_EXT_MIG_TIME_MASK ((1UL << PAGE_EXT_MIG_TIME_SHIFT) - 1)

OK, so we need to have a conversation about page_ext.  Sorry this is
happening to you.  I've kind of skipped over page_ext when talking
about folios and memdescs up to now, so it's not that you've missed
anything.

As the comment says,

 * Page Extension can be considered as an extended mem_map.

and we need to do this because we don't want to grow struct page beyond
64 bytes.  But memdescs are dynamically allocated, so we don't need
page_ext any more, and all that code can go away.

lib/alloc_tag.c:struct page_ext_operations page_alloc_tagging_ops = {
mm/page_ext.c:static struct page_ext_operations page_idle_ops __initdata = {
mm/page_ext.c:static struct page_ext_operations *page_ext_ops[] __initdata = {
mm/page_owner.c:struct page_ext_operations page_owner_ops = {
mm/page_table_check.c:struct page_ext_operations page_table_check_ops = {

I think all of these are actually per-memdesc thangs and not per-page
things, so we can get rid of them all.  That means I don't want to see
new per-page data being added to page_ext.

So, what's this really used for?  It seems like it's really
per-allocation, not per-page.  Does it need to be preserved across
alloc/free or can it be reset at free time?
On 16.06.25 16:05, Matthew Wilcox wrote:
> On Mon, Jun 16, 2025 at 07:09:30PM +0530, Bharata B Rao wrote:
<snip>
>
> OK, so we need to have a conversation about page_ext.  Sorry this is
> happening to you.  I've kind of skipped over page_ext when talking
> about folios and memdescs up to now, so it's not that you've missed
> anything.
>
> As the comment says,
>
>  * Page Extension can be considered as an extended mem_map.
>
> and we need to do this because we don't want to grow struct page beyond
> 64 bytes.  But memdescs are dynamically allocated, so we don't need
> page_ext any more, and all that code can go away.
>
> lib/alloc_tag.c:struct page_ext_operations page_alloc_tagging_ops = {

In this case, we might not necessarily have an allocated memdesc, for
all allocations, though. Think of memory ballooning allocating "offline"
pages in the future. Of course, the easy solution is to not track these
non-memdesc allocations.

> mm/page_ext.c:static struct page_ext_operations page_idle_ops __initdata = {

That should be per-folio.

> mm/page_ext.c:static struct page_ext_operations *page_ext_ops[] __initdata = {

That's just the lookup table for the others.

> mm/page_owner.c:struct page_ext_operations page_owner_ops = {

Hm, probably like tagging above.

> mm/page_table_check.c:struct page_ext_operations page_table_check_ops = {

That should be per-folio as well IIUC.

-- 
Cheers,

David / dhildenb
On 16-Jun-25 7:35 PM, Matthew Wilcox wrote:
> On Mon, Jun 16, 2025 at 07:09:30PM +0530, Bharata B Rao wrote:
<snip>
>> +#define PAGE_EXT_MIG_NID_MASK ((1UL << PAGE_EXT_MIG_NID_SHIFT) - 1)
>> +#define PAGE_EXT_MIG_FREQ_MASK ((1UL << PAGE_EXT_MIG_FREQ_SHIFT) - 1)
>> +#define PAGE_EXT_MIG_TIME_MASK ((1UL << PAGE_EXT_MIG_TIME_SHIFT) - 1)
>
> OK, so we need to have a conversation about page_ext.  Sorry this is
> happening to you.  I've kind of skipped over page_ext when talking
> about folios and memdescs up to now, so it's not that you've missed
> anything.
<snip>
> I think all of these are actually per-memdesc thangs and not per-page
> things, so we can get rid of them all.  That means I don't want to see
> new per-page data being added to page_ext.

Fair point.

> So, what's this really used for?  It seems like it's really
> per-allocation, not per-page.  Does it need to be preserved across
> alloc/free or can it be reset at free time?

The context here is to track the pages that need to be migrated. Whether
it is for NUMA Balancing or for any other sub-system that would need to
migrate (or promote) pages across nodes, I am trying to come up with a
kernel-thread-based migrator that would migrate the identified pages in
an async and batched manner. For this, the basic information required
for each such ready-to-be-migrated page is the target NID. Since I have
chosen to walk the zones and the PFNs of each zone to iterate over every
page, an additional piece of info that I want per ready-to-be-migrated
page is an indication that the page is indeed ready now to be migrated
by the migrator thread.

In addition to these two things, if we want to carve out a single system
(like the kpromoted approach) that handles inputs from multiple page
hotness sources and maintains heuristics to decide when exactly to
migrate/promote a page, then it would be good to store a few other
pieces of information for such pages (like access frequency, access
timestamp etc).

With that background, I am looking for an optimal place to store this
information. In my earlier approaches, I was maintaining a global list
of such hot pages and realized that such an approach will not scale,
and hence in the current approach I am tying that information to the
page itself. With that, there is no overhead of maintaining such a list,
no synchronizing between producers and the migrator thread, and no
allocation for each maintained page. Hence it appeared to me that
pre-allocated per-page info would be preferable. At this point, page
extension appeared a good place to keep this information.

Sorry for the long reply, but coming to your specific question now: I
really need to maintain such data only for pages that can be migrated.
Pages like most anonymous pages, file-backed pages, pages that are
mapped into user page tables, THP pages etc. are candidates. I wonder
which memdesc type/types would cover all such pages.

Would "folio" as memdesc (https://kernelnewbies.org/MatthewWilcox/FolioAlloc)
be a broad enough type for this?

As you note, it appears to me that it could be per-allocation rather
than per-page, and the information needn't be preserved across
alloc/free.

Regards,
Bharata.