[v2] mm: Hot page tracking and promotion infrastructure

[RFC PATCH v2 8/8] mm: sched: Move hot page promotion from NUMAB=2 to kpromoted

Posted by Bharata B Rao 5 months ago

Currently hot page promotion (NUMA_BALANCING_MEMORY_TIERING
mode of NUMA Balancing) does hot page detection (via hint faults),
hot page classification and eventual promotion, all by itself and
sits within the scheduler.

With the new hot page tracking and promotion mechanism being
available, NUMA Balancing can limit itself to detection of
hot pages (via hint faults) and off-load the rest of the
functionality to the common hot page tracking system.

The pghot_record_access(PGHOT_HINT_FAULT) API is used to feed
the hot page info. In addition, the migration rate limiting and
dynamic threshold logic are moved to kpromoted so that the same
can be used for hot pages reported by other sources too.

Signed-off-by: Bharata B Rao <bharata@amd.com>
---
 include/linux/pghot.h |   2 +
 kernel/sched/fair.c   | 149 ++----------------------------------------
 mm/memory.c           |  32 ++-------
 mm/pghot.c            | 132 +++++++++++++++++++++++++++++++++++--
 4 files changed, 142 insertions(+), 173 deletions(-)

diff --git a/include/linux/pghot.h b/include/linux/pghot.h
index 1443643aab13..98a72e01bdd6 100644
--- a/include/linux/pghot.h
+++ b/include/linux/pghot.h
@@ -47,6 +47,8 @@ enum pghot_src {
 #define PGHOT_HEAP_PCT		25
 
 #define KPROMOTED_MIGRATE_BATCH	1024
+#define KPROMOTED_MIGRATION_ADJUST_STEPS	16
+#define KPROMOTED_PROMOTION_THRESHOLD_WINDOW	60000
 
 /*
  * If target NID isn't available, kpromoted promotes to node 0
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c..54eeddb6ec23 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -125,11 +125,6 @@ int __weak arch_asym_cpu_priority(int cpu)
 static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
-/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
-static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
-#endif
-
 #ifdef CONFIG_SYSCTL
 static const struct ctl_table sched_fair_sysctls[] = {
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -142,16 +137,6 @@ static const struct ctl_table sched_fair_sysctls[] = {
 		.extra1         = SYSCTL_ONE,
 	},
 #endif
-#ifdef CONFIG_NUMA_BALANCING
-	{
-		.procname	= "numa_balancing_promote_rate_limit_MBps",
-		.data		= &sysctl_numa_balancing_promote_rate_limit,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-	},
-#endif /* CONFIG_NUMA_BALANCING */
 };
 
 static int __init sched_fair_sysctl_init(void)
@@ -1800,108 +1785,6 @@ static inline bool cpupid_valid(int cpupid)
 	return cpupid_to_cpu(cpupid) < nr_cpu_ids;
 }
 
-/*
- * For memory tiering mode, if there are enough free pages (more than
- * enough watermark defined here) in fast memory node, to take full
- * advantage of fast memory capacity, all recently accessed slow
- * memory pages will be migrated to fast memory node without
- * considering hot threshold.
- */
-static bool pgdat_free_space_enough(struct pglist_data *pgdat)
-{
-	int z;
-	unsigned long enough_wmark;
-
-	enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
-			   pgdat->node_present_pages >> 4);
-	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
-		struct zone *zone = pgdat->node_zones + z;
-
-		if (!populated_zone(zone))
-			continue;
-
-		if (zone_watermark_ok(zone, 0,
-				      promo_wmark_pages(zone) + enough_wmark,
-				      ZONE_MOVABLE, 0))
-			return true;
-	}
-	return false;
-}
-
-/*
- * For memory tiering mode, when page tables are scanned, the scan
- * time will be recorded in struct page in addition to make page
- * PROT_NONE for slow memory page.  So when the page is accessed, in
- * hint page fault handler, the hint page fault latency is calculated
- * via,
- *
- *	hint page fault latency = hint page fault time - scan time
- *
- * The smaller the hint page fault latency, the higher the possibility
- * for the page to be hot.
- */
-static int numa_hint_fault_latency(struct folio *folio)
-{
-	int last_time, time;
-
-	time = jiffies_to_msecs(jiffies);
-	last_time = folio_xchg_access_time(folio, time);
-
-	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
-}
-
-/*
- * For memory tiering mode, too high promotion/demotion throughput may
- * hurt application latency.  So we provide a mechanism to rate limit
- * the number of pages that are tried to be promoted.
- */
-static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
-				      unsigned long rate_limit, int nr)
-{
-	unsigned long nr_cand;
-	unsigned int now, start;
-
-	now = jiffies_to_msecs(jiffies);
-	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
-	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
-	start = pgdat->nbp_rl_start;
-	if (now - start > MSEC_PER_SEC &&
-	    cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
-		pgdat->nbp_rl_nr_cand = nr_cand;
-	if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
-		return true;
-	return false;
-}
-
-#define NUMA_MIGRATION_ADJUST_STEPS	16
-
-static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
-					    unsigned long rate_limit,
-					    unsigned int ref_th)
-{
-	unsigned int now, start, th_period, unit_th, th;
-	unsigned long nr_cand, ref_cand, diff_cand;
-
-	now = jiffies_to_msecs(jiffies);
-	th_period = sysctl_numa_balancing_scan_period_max;
-	start = pgdat->nbp_th_start;
-	if (now - start > th_period &&
-	    cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
-		ref_cand = rate_limit *
-			sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
-		nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
-		diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
-		unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
-		th = pgdat->nbp_threshold ? : ref_th;
-		if (diff_cand > ref_cand * 11 / 10)
-			th = max(th - unit_th, unit_th);
-		else if (diff_cand < ref_cand * 9 / 10)
-			th = min(th + unit_th, ref_th * 2);
-		pgdat->nbp_th_nr_cand = nr_cand;
-		pgdat->nbp_threshold = th;
-	}
-}
-
 bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 				int src_nid, int dst_cpu)
 {
@@ -1917,33 +1800,11 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 
 	/*
 	 * The pages in slow memory node should be migrated according
-	 * to hot/cold instead of private/shared.
-	 */
-	if (folio_use_access_time(folio)) {
-		struct pglist_data *pgdat;
-		unsigned long rate_limit;
-		unsigned int latency, th, def_th;
-
-		pgdat = NODE_DATA(dst_nid);
-		if (pgdat_free_space_enough(pgdat)) {
-			/* workload changed, reset hot threshold */
-			pgdat->nbp_threshold = 0;
-			return true;
-		}
-
-		def_th = sysctl_numa_balancing_hot_threshold;
-		rate_limit = sysctl_numa_balancing_promote_rate_limit << \
-			(20 - PAGE_SHIFT);
-		numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
-
-		th = pgdat->nbp_threshold ? : def_th;
-		latency = numa_hint_fault_latency(folio);
-		if (latency >= th)
-			return false;
-
-		return !numa_promotion_rate_limit(pgdat, rate_limit,
-						  folio_nr_pages(folio));
-	}
+	 * to hot/cold instead of private/shared. Also the migration
+	 * of such pages is handled by kpromoted.
+	 */
+	if (folio_use_access_time(folio))
+		return true;
 
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
 	last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..eeb34e8d9b8e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -75,6 +75,7 @@
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
+#include <linux/pghot.h>
 
 #include <trace/events/kmem.h>
 
@@ -5864,34 +5865,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 
 	target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
 					writable, &last_cpupid);
+	nid = target_nid;
 	if (target_nid == NUMA_NO_NODE)
 		goto out_map;
-	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
-		flags |= TNF_MIGRATE_FAIL;
-		goto out_map;
-	}
-	/* The folio is isolated and isolation code holds a folio reference. */
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
+
 	writable = false;
 	ignore_writable = true;
-
-	/* Migrate to the requested node */
-	if (!migrate_misplaced_folio(folio, target_nid)) {
-		nid = target_nid;
-		flags |= TNF_MIGRATED;
-		task_numa_fault(last_cpupid, nid, nr_pages, flags);
-		return 0;
-	}
-
-	flags |= TNF_MIGRATE_FAIL;
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-				       vmf->address, &vmf->ptl);
-	if (unlikely(!vmf->pte))
-		return 0;
-	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
-		return 0;
-	}
 out_map:
 	/*
 	 * Make it present again, depending on how arch implements
@@ -5905,8 +5884,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 					    writable);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 
-	if (nid != NUMA_NO_NODE)
+	if (nid != NUMA_NO_NODE) {
+		pghot_record_access(folio_pfn(folio), nid, PGHOT_HINT_FAULT,
+				    jiffies);
 		task_numa_fault(last_cpupid, nid, nr_pages, flags);
+	}
 	return 0;
 }
 
diff --git a/mm/pghot.c b/mm/pghot.c
index 9f7581818b8f..9f5746892bce 100644
--- a/mm/pghot.c
+++ b/mm/pghot.c
@@ -9,6 +9,9 @@
  *
  * kpromoted is a kernel thread that runs on each toptier node and
  * promotes pages from max_heap.
+ *
+ * Migration rate-limiting and dynamic threshold logic implementations
+ * were moved from NUMA Balancing mode 2.
  */
 #include <linux/pghot.h>
 #include <linux/kthread.h>
@@ -34,6 +37,9 @@ static bool kpromoted_started __ro_after_init;
 
 static unsigned int sysctl_pghot_freq_window = KPROMOTED_FREQ_WINDOW;
 
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_pghot_promote_rate_limit = 65536;
+
 #ifdef CONFIG_SYSCTL
 static const struct ctl_table pghot_sysctls[] = {
 	{
@@ -44,8 +50,17 @@ static const struct ctl_table pghot_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 	},
+	{
+		.procname	= "pghot_promote_rate_limit_MBps",
+		.data		= &sysctl_pghot_promote_rate_limit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
 };
 #endif
+
 static bool phi_heap_less(const void *lhs, const void *rhs, void *args)
 {
 	return (*(struct pghot_info **)lhs)->frequency >
@@ -94,11 +109,99 @@ static bool phi_heap_insert(struct max_heap *phi_heap, struct pghot_info *phi)
 	return true;
 }
 
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+	int z;
+	unsigned long enough_wmark;
+
+	enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+			   pgdat->node_present_pages >> 4);
+	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+		struct zone *zone = pgdat->node_zones + z;
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone_watermark_ok(zone, 0,
+				      promo_wmark_pages(zone) + enough_wmark,
+				      ZONE_MOVABLE, 0))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency.  So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool kpromoted_promotion_rate_limit(struct pglist_data *pgdat,
+					   unsigned long rate_limit, int nr,
+					   unsigned long time)
+{
+	unsigned long nr_cand;
+	unsigned int now, start;
+
+	now = jiffies_to_msecs(time);
+	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+	start = pgdat->nbp_rl_start;
+	if (now - start > MSEC_PER_SEC &&
+	    cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+		pgdat->nbp_rl_nr_cand = nr_cand;
+	if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+		return true;
+	return false;
+}
+
+static void kpromoted_promotion_adjust_threshold(struct pglist_data *pgdat,
+						 unsigned long rate_limit,
+						 unsigned int ref_th,
+						 unsigned long now)
+{
+	unsigned int start, th_period, unit_th, th;
+	unsigned long nr_cand, ref_cand, diff_cand;
+
+	now = jiffies_to_msecs(now);
+	th_period = KPROMOTED_PROMOTION_THRESHOLD_WINDOW;
+	start = pgdat->nbp_th_start;
+	if (now - start > th_period &&
+	    cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+		ref_cand = rate_limit *
+			KPROMOTED_PROMOTION_THRESHOLD_WINDOW / MSEC_PER_SEC;
+		nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+		diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+		unit_th = ref_th * 2 / KPROMOTED_MIGRATION_ADJUST_STEPS;
+		th = pgdat->nbp_threshold ? : ref_th;
+		if (diff_cand > ref_cand * 11 / 10)
+			th = max(th - unit_th, unit_th);
+		else if (diff_cand < ref_cand * 9 / 10)
+			th = min(th + unit_th, ref_th * 2);
+		pgdat->nbp_th_nr_cand = nr_cand;
+		pgdat->nbp_threshold = th;
+	}
+}
+
+static inline unsigned int pghot_access_latency(struct pghot_info *phi, u32  now)
+{
+	return (now - phi->last_update);
+}
+
 static bool phi_is_pfn_hot(struct pghot_info *phi)
 {
 	struct page *page = pfn_to_online_page(phi->pfn);
-	unsigned long now = jiffies;
 	struct folio *folio;
+	struct pglist_data *pgdat;
+	unsigned long rate_limit;
+	unsigned int latency, th, def_th;
+	unsigned long now = jiffies;
 
 	if (!page || is_zone_device_page(page))
 		return false;
@@ -113,7 +216,24 @@ static bool phi_is_pfn_hot(struct pghot_info *phi)
 		return false;
 	}
 
-	return true;
+	pgdat = NODE_DATA(phi->nid);
+	if (pgdat_free_space_enough(pgdat)) {
+		/* workload changed, reset hot threshold */
+		pgdat->nbp_threshold = 0;
+		return true;
+	}
+
+	def_th = sysctl_pghot_freq_window;
+	rate_limit = sysctl_pghot_promote_rate_limit << (20 - PAGE_SHIFT);
+	kpromoted_promotion_adjust_threshold(pgdat, rate_limit, def_th, now);
+
+	th = pgdat->nbp_threshold ? : def_th;
+	latency = pghot_access_latency(phi, now & PGHOT_TIME_MASK);
+	if (latency >= th)
+		return false;
+
+	return !kpromoted_promotion_rate_limit(pgdat, rate_limit,
+					       folio_nr_pages(folio), now);
 }
 
 static struct folio *kpromoted_isolate_folio(struct pghot_info *phi)
@@ -351,9 +471,13 @@ int pghot_record_access(u64 pfn, int nid, int src, unsigned long now)
 	/*
 	 * If the previous access was beyond the threshold window
 	 * start frequency tracking afresh.
+	 *
+	 * Bypass the new window logic for NUMA hint fault source
+	 * as it is too slow in reporting accesses.
+	 * TODO: Fix this.
 	 */
-	if (((cur_time - phi->last_update) > msecs_to_jiffies(sysctl_pghot_freq_window)) ||
-	    (nid != NUMA_NO_NODE && phi->nid != nid))
+	if ((((cur_time - phi->last_update) > msecs_to_jiffies(sysctl_pghot_freq_window))
+	    && (src != PGHOT_HINT_FAULT)) || (nid != NUMA_NO_NODE && phi->nid != nid))
 		new_window = true;
 
 	if (new_entry || new_window) {
-- 
2.34.1

Re: [RFC PATCH v2 8/8] mm: sched: Move hot page promotion from NUMAB=2 to kpromoted

Posted by Jonathan Cameron 4 months, 1 week ago

On Wed, 10 Sep 2025 20:16:53 +0530
Bharata B Rao <bharata@amd.com> wrote:

> Currently hot page promotion (NUMA_BALANCING_MEMORY_TIERING
> mode of NUMA Balancing) does hot page detection (via hint faults),
> hot page classification and eventual promotion, all by itself and
> sits within the scheduler.
> 
> With the new hot page tracking and promotion mechanism being
> available, NUMA Balancing can limit itself to detection of
> hot pages (via hint faults) and off-load rest of the
> functionality to the common hot page tracking system.
> 
> pghot_record_access(PGHOT_HINT_FAULT) API is used to feed the
> hot page info. In addition, the migration rate limiting and
> dynamic threshold logic are moved to kpromoted so that the same
> can be used for hot pages reported by other sources too.
> 
> Signed-off-by: Bharata B Rao <bharata@amd.com>

Making a direct replacement without any fallback to previous method
is going to need a lot of data to show there are no important regressions.

So bold move if that's the intent! 

J
> ---
>  include/linux/pghot.h |   2 +
>  kernel/sched/fair.c   | 149 ++----------------------------------------
>  mm/memory.c           |  32 ++-------
>  mm/pghot.c            | 132 +++++++++++++++++++++++++++++++++++--
>  4 files changed, 142 insertions(+), 173 deletions(-)
> 

> diff --git a/mm/pghot.c b/mm/pghot.c
> index 9f7581818b8f..9f5746892bce 100644
> --- a/mm/pghot.c
> +++ b/mm/pghot.c
> @@ -9,6 +9,9 @@
>   *
>   * kpromoted is a kernel thread that runs on each toptier node and
>   * promotes pages from max_heap.
> + *
> + * Migration rate-limiting and dynamic threshold logic implementations
> + * were moved from NUMA Balancing mode 2.
>   */
>  #include <linux/pghot.h>
>  #include <linux/kthread.h>
> @@ -34,6 +37,9 @@ static bool kpromoted_started __ro_after_init;
>  
>  static unsigned int sysctl_pghot_freq_window = KPROMOTED_FREQ_WINDOW;
>  
> +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
> +static unsigned int sysctl_pghot_promote_rate_limit = 65536;

If the comment correlates with the value, this is 64 GiB/s?  That seems
unlikely, though I guess it is possible.

> +
>  #ifdef CONFIG_SYSCTL
>  static const struct ctl_table pghot_sysctls[] = {
>  	{
> @@ -44,8 +50,17 @@ static const struct ctl_table pghot_sysctls[] = {
>  		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= SYSCTL_ZERO,
>  	},
> +	{
> +		.procname	= "pghot_promote_rate_limit_MBps",
> +		.data		= &sysctl_pghot_promote_rate_limit,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= SYSCTL_ZERO,
> +	},
>  };
>  #endif
> +
Put that in earlier patch to reduce noise here.

>  static bool phi_heap_less(const void *lhs, const void *rhs, void *args)
>  {
>  	return (*(struct pghot_info **)lhs)->frequency >
> @@ -94,11 +109,99 @@ static bool phi_heap_insert(struct max_heap *phi_heap, struct pghot_info *phi)
>  	return true;
>  }
>  
> +/*
> + * For memory tiering mode, if there are enough free pages (more than
> + * enough watermark defined here) in fast memory node, to take full

I'd use enough_wmark, just because "more than enough" is a common
English phrase and I at least tripped over that sentence as a result!

> + * advantage of fast memory capacity, all recently accessed slow
> + * memory pages will be migrated to fast memory node without
> + * considering hot threshold.
> + */
> +static bool pgdat_free_space_enough(struct pglist_data *pgdat)
> +{
> +	int z;
> +	unsigned long enough_wmark;
> +
> +	enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
> +			   pgdat->node_present_pages >> 4);
> +	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
> +		struct zone *zone = pgdat->node_zones + z;
> +
> +		if (!populated_zone(zone))
> +			continue;
> +
> +		if (zone_watermark_ok(zone, 0,
> +				      promo_wmark_pages(zone) + enough_wmark,
> +				      ZONE_MOVABLE, 0))
> +			return true;
> +	}
> +	return false;
> +}

> +
> +static void kpromoted_promotion_adjust_threshold(struct pglist_data *pgdat,

Needs documentation of the algorithm and the reasons for various choices.

I see it is a code move though so maybe that's a job for another day.

> +						 unsigned long rate_limit,
> +						 unsigned int ref_th,
> +						 unsigned long now)
> +{
> +	unsigned int start, th_period, unit_th, th;
> +	unsigned long nr_cand, ref_cand, diff_cand;
> +
> +	now = jiffies_to_msecs(now);
> +	th_period = KPROMOTED_PROMOTION_THRESHOLD_WINDOW;
> +	start = pgdat->nbp_th_start;
> +	if (now - start > th_period &&
> +	    cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
> +		ref_cand = rate_limit *
> +			KPROMOTED_PROMOTION_THRESHOLD_WINDOW / MSEC_PER_SEC;
> +		nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
> +		diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
> +		unit_th = ref_th * 2 / KPROMOTED_MIGRATION_ADJUST_STEPS;
> +		th = pgdat->nbp_threshold ? : ref_th;
> +		if (diff_cand > ref_cand * 11 / 10)
> +			th = max(th - unit_th, unit_th);
> +		else if (diff_cand < ref_cand * 9 / 10)
> +			th = min(th + unit_th, ref_th * 2);
> +		pgdat->nbp_th_nr_cand = nr_cand;
> +		pgdat->nbp_threshold = th;
> +	}
> +}
> +
>  static bool phi_is_pfn_hot(struct pghot_info *phi)
>  {
>  	struct page *page = pfn_to_online_page(phi->pfn);
> -	unsigned long now = jiffies;
>  	struct folio *folio;
> +	struct pglist_data *pgdat;
> +	unsigned long rate_limit;
> +	unsigned int latency, th, def_th;
> +	unsigned long now = jiffies;
>  
Avoid the reorder.  Just put it here in first place if you prefer this.

Re: [RFC PATCH v2 8/8] mm: sched: Move hot page promotion from NUMAB=2 to kpromoted

Posted by Bharata B Rao 4 months ago

On 03-Oct-25 6:08 PM, Jonathan Cameron wrote:
> On Wed, 10 Sep 2025 20:16:53 +0530
> Bharata B Rao <bharata@amd.com> wrote:
> 
>> Currently hot page promotion (NUMA_BALANCING_MEMORY_TIERING
>> mode of NUMA Balancing) does hot page detection (via hint faults),
>> hot page classification and eventual promotion, all by itself and
>> sits within the scheduler.
>>
>> With the new hot page tracking and promotion mechanism being
>> available, NUMA Balancing can limit itself to detection of
>> hot pages (via hint faults) and off-load rest of the
>> functionality to the common hot page tracking system.
>>
>> pghot_record_access(PGHOT_HINT_FAULT) API is used to feed the
>> hot page info. In addition, the migration rate limiting and
>> dynamic threshold logic are moved to kpromoted so that the same
>> can be used for hot pages reported by other sources too.
>>
>> Signed-off-by: Bharata B Rao <bharata@amd.com>
> 
> Making a direct replacement without any fallback to previous method
> is going to need a lot of data to show there are no important regressions.
> 
> So bold move if that's the intent! 

Firstly I am only moving the existing hot page heuristics that are part of
NUMAB=2 to kpromoted so that the same can be applied to hot pages being
identified by other sources. So the hint fault mechanism that is inherent
to NUMAB=2 still remains.

In fact, kscand effort started as a potential replacement for the existing
hot page promotion mechanism by getting rid of hint faults and moving the
page table scanning out of process context.

In any case, I will start including numbers from the next post.
>>  
>>  static unsigned int sysctl_pghot_freq_window = KPROMOTED_FREQ_WINDOW;
>>  
>> +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
>> +static unsigned int sysctl_pghot_promote_rate_limit = 65536;
> 
> If the comment correlates with the value, this is 64 GiB/s?  That seems
> unlikely if I guess possible.

IIUC, the existing logic tries to limit promotion rate to 64 GiB/s by
limiting the number of candidate pages that are promoted within the
1s observation interval.

Are you saying that achieving the rate of 64 GiB/s is not possible
or unlikely?

> 
>> +
>>  #ifdef CONFIG_SYSCTL
>>  static const struct ctl_table pghot_sysctls[] = {
>>  	{
>> @@ -44,8 +50,17 @@ static const struct ctl_table pghot_sysctls[] = {
>>  		.proc_handler	= proc_dointvec_minmax,
>>  		.extra1		= SYSCTL_ZERO,
>>  	},
>> +	{
>> +		.procname	= "pghot_promote_rate_limit_MBps",
>> +		.data		= &sysctl_pghot_promote_rate_limit,
>> +		.maxlen		= sizeof(unsigned int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec_minmax,
>> +		.extra1		= SYSCTL_ZERO,
>> +	},
>>  };
>>  #endif
>> +
> Put that in earlier patch to reduce noise here.

This patch moves the hot page heuristics to kpromoted and hence this
related sysctl is also being moved in this patch.

> 
>>  static bool phi_heap_less(const void *lhs, const void *rhs, void *args)
>>  {
>>  	return (*(struct pghot_info **)lhs)->frequency >
>> @@ -94,11 +109,99 @@ static bool phi_heap_insert(struct max_heap *phi_heap, struct pghot_info *phi)
>>  	return true;
>>  }
>>  
>> +/*
>> + * For memory tiering mode, if there are enough free pages (more than
>> + * enough watermark defined here) in fast memory node, to take full
> 
> I'd use enough_wmark   Just because "more than enough" is a common
> English phrase and I at least tripped over that sentence as a result!

Ah I see that, but as you note later, I am currently only doing the
movement.

> 
>> + * advantage of fast memory capacity, all recently accessed slow
>> + * memory pages will be migrated to fast memory node without
>> + * considering hot threshold.
>> + */
>> +static bool pgdat_free_space_enough(struct pglist_data *pgdat)
>> +{
>> +	int z;
>> +	unsigned long enough_wmark;
>> +
>> +	enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
>> +			   pgdat->node_present_pages >> 4);
>> +	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
>> +		struct zone *zone = pgdat->node_zones + z;
>> +
>> +		if (!populated_zone(zone))
>> +			continue;
>> +
>> +		if (zone_watermark_ok(zone, 0,
>> +				      promo_wmark_pages(zone) + enough_wmark,
>> +				      ZONE_MOVABLE, 0))
>> +			return true;
>> +	}
>> +	return false;
>> +}
> 
>> +
>> +static void kpromoted_promotion_adjust_threshold(struct pglist_data *pgdat,
> 
> Needs documentation of the algorithm and the reasons for various choices.
> 
> I see it is a code move though so maybe that's a job for another day.

Sure.

Regards,
Bharata.

Re: [RFC PATCH v2 8/8] mm: sched: Move hot page promotion from NUMAB=2 to kpromoted

Posted by Jonathan Cameron 4 months ago

On Mon, 6 Oct 2025 11:27:21 +0530
Bharata B Rao <bharata@amd.com> wrote:

> On 03-Oct-25 6:08 PM, Jonathan Cameron wrote:
> > On Wed, 10 Sep 2025 20:16:53 +0530
> > Bharata B Rao <bharata@amd.com> wrote:
> >   
> >> Currently hot page promotion (NUMA_BALANCING_MEMORY_TIERING
> >> mode of NUMA Balancing) does hot page detection (via hint faults),
> >> hot page classification and eventual promotion, all by itself and
> >> sits within the scheduler.
> >>
> >> With the new hot page tracking and promotion mechanism being
> >> available, NUMA Balancing can limit itself to detection of
> >> hot pages (via hint faults) and off-load rest of the
> >> functionality to the common hot page tracking system.
> >>
> >> pghot_record_access(PGHOT_HINT_FAULT) API is used to feed the
> >> hot page info. In addition, the migration rate limiting and
> >> dynamic threshold logic are moved to kpromoted so that the same
> >> can be used for hot pages reported by other sources too.
> >>
> >> Signed-off-by: Bharata B Rao <bharata@amd.com>  
> > 
> > Making a direct replacement without any fallback to previous method
> > is going to need a lot of data to show there are no important regressions.
> > 
> > So bold move if that's the intent!   
> 
> Firstly I am only moving the existing hot page heuristics that is part of
> NUMAB=2 to kpromoted so that the same can be applied to hot pages being
> identified by other sources. So the hint fault mechanism that is inherent
> to NUMAB=2 still remains.

That makes sense.

> 
> In fact, kscand effort started as a potential replacement for the existing
> hot page promotion mechanism by getting rid of hint faults and moving the
> page table scanning out of process context.

Understood, and I'm in favor of that approach, but I'm not sure it will be
a fit for all workloads.

> 
> In any case, I will start including numbers from the next post.

Great.

> >>  
> >>  static unsigned int sysctl_pghot_freq_window = KPROMOTED_FREQ_WINDOW;
> >>  
> >> +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
> >> +static unsigned int sysctl_pghot_promote_rate_limit = 65536;  
> > 
> > If the comment correlates with the value, this is 64 GiB/s?  That seems
> > unlikely if I guess possible.  
> 
> IIUC, the existing logic tries to limit promotion rate to 64 GiB/s by
> limiting the number of candidate pages that are promoted within the
> 1s observation interval.
> 
> Are you saying that achieving the rate of 64 GiB/s is not possible
> or unlikely?

Seems rather too high to me, but maybe I just have the wrong mental model
of what we should be moving.
> 
> >   
> >> +
> >>  #ifdef CONFIG_SYSCTL
> >>  static const struct ctl_table pghot_sysctls[] = {
> >>  	{
> >> @@ -44,8 +50,17 @@ static const struct ctl_table pghot_sysctls[] = {
> >>  		.proc_handler	= proc_dointvec_minmax,
> >>  		.extra1		= SYSCTL_ZERO,
> >>  	},
> >> +	{
> >> +		.procname	= "pghot_promote_rate_limit_MBps",
> >> +		.data		= &sysctl_pghot_promote_rate_limit,
> >> +		.maxlen		= sizeof(unsigned int),
> >> +		.mode		= 0644,
> >> +		.proc_handler	= proc_dointvec_minmax,
> >> +		.extra1		= SYSCTL_ZERO,
> >> +	},
> >>  };
> >>  #endif
> >> +  
> > Put that in earlier patch to reduce noise here.  
> 
> This patch moves the hot page heuristics to kpromoted and hence this
> related sysctl is also being moved in this patch.

I just mean the blank line - not the block above.
This is just a patch-set tidy-up comment.

Jonathan

[RFC PATCH v2 1/8] mm: migrate: Allow misplaced migration without VMA too
[RFC PATCH v2 2/8] migrate: implement migrate_misplaced_folios_batch
[RFC PATCH v2 3/8] mm: Hot page tracking and promotion
[RFC PATCH v2 4/8] x86: ibs: In-kernel IBS driver for memory access profiling
[RFC PATCH v2 5/8] x86: ibs: Enable IBS profiling for memory accesses
[RFC PATCH v2 6/8] mm: mglru: generalize page table walk
[RFC PATCH v2 7/8] mm: klruscand: use mglru scanning for page promotion
[RFC PATCH v2 8/8] mm: sched: Move hot page promotion from NUMAB=2 to kpromoted