[RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon

Bharata B Rao posted 4 patches 11 months, 1 week ago
[RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 11 months, 1 week ago
kpromoted is a kernel daemon that accumulates hot page info
from different sources and tries to promote pages from slow
tiers to top tiers. One instance of this thread runs on each
node that has CPUs.

Subsystems that generate hot page access info can report that
to kpromoted via this API:

int kpromoted_record_access(u64 pfn, int nid, int src,
			    unsigned long time)

@pfn: The PFN of the memory accessed
@nid: The accessing NUMA node ID
@src: The temperature source (subsystem) that generated the
      access info
@time: The access time in jiffies

Some temperature sources may not provide the nid from which
the page was accessed. This is true for sources that use
page table scanning for PTE Accessed bit. Currently the toptier
node to which such pages should be promoted to is hard coded.

Also, the access time provided some sources may at best be
considered approximate. This is especially true for hot pages
detected by PTE A bit scanning.

kpromoted currently maintains the hot PFN records in hash lists
hashed by PFN value. Each record stores the following info:

struct page_hotness_info {
	unsigned long pfn;

	/* Time when this record was updated last */
	unsigned long last_update;

	/*
	 * Number of times this page was accessed in the
	 * current window
	 */
	int frequency;

	/* Most recent access time */
	unsigned long recency;

	/* Most recent access from this node */
	int hot_node;

	struct hlist_node hnode;
};

The way in which a page is categorized as hot enough to be
promoted is pretty primitive now.

Signed-off-by: Bharata B Rao <bharata@amd.com>
---
 include/linux/kpromoted.h     |  54 ++++++
 include/linux/mmzone.h        |   4 +
 include/linux/vm_event_item.h |  13 ++
 mm/Kconfig                    |   7 +
 mm/Makefile                   |   1 +
 mm/kpromoted.c                | 305 ++++++++++++++++++++++++++++++++++
 mm/mm_init.c                  |  10 ++
 mm/vmstat.c                   |  13 ++
 8 files changed, 407 insertions(+)
 create mode 100644 include/linux/kpromoted.h
 create mode 100644 mm/kpromoted.c

diff --git a/include/linux/kpromoted.h b/include/linux/kpromoted.h
new file mode 100644
index 000000000000..2bef3d74f03a
--- /dev/null
+++ b/include/linux/kpromoted.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KPROMOTED_H
+#define _LINUX_KPROMOTED_H
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/workqueue_types.h>
+
+/* Page hotness temperature sources */
+enum kpromoted_src {
+	KPROMOTED_HW_HINTS,
+	KPROMOTED_PGTABLE_SCAN,
+};
+
+#ifdef CONFIG_KPROMOTED
+
+#define KPROMOTED_FREQ_WINDOW	(5 * MSEC_PER_SEC)
+
+/* 2 accesses within a window will make the page a promotion candidate */
+#define KPRMOTED_FREQ_THRESHOLD	2
+
+#define KPROMOTED_HASH_ORDER	16
+
+struct page_hotness_info {
+	unsigned long pfn;
+
+	/* Time when this record was updated last */
+	unsigned long last_update;
+
+	/*
+	 * Number of times this page was accessed in the
+	 * current window
+	 */
+	int frequency;
+
+	/* Most recent access time */
+	unsigned long recency;
+
+	/* Most recent access from this node */
+	int hot_node;
+	struct hlist_node hnode;
+};
+
+#define KPROMOTE_DELAY	MSEC_PER_SEC
+
+int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now);
+#else
+static inline int kpromoted_record_access(u64 pfn, int nid, int src,
+					  unsigned long now)
+{
+	return 0;
+}
+#endif /* CONFIG_KPROMOTED */
+#endif /* _LINUX_KPROMOTED_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9540b41894da..a5c4e789aa55 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1459,6 +1459,10 @@ typedef struct pglist_data {
 #ifdef CONFIG_MEMORY_FAILURE
 	struct memory_failure_stats mf_stats;
 #endif
+#ifdef CONFIG_KPROMOTED
+	struct task_struct *kpromoted;
+	wait_queue_head_t kpromoted_wait;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index f70d0958095c..b5823b037883 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -182,6 +182,19 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		KSTACK_REST,
 #endif
 #endif /* CONFIG_DEBUG_STACK_USAGE */
+		KPROMOTED_RECORDED_ACCESSES,
+		KPROMOTED_RECORD_HWHINTS,
+		KPROMOTED_RECORD_PGTSCANS,
+		KPROMOTED_RECORD_TOPTIER,
+		KPROMOTED_RECORD_ADDED,
+		KPROMOTED_RECORD_EXISTS,
+		KPROMOTED_MIG_RIGHT_NODE,
+		KPROMOTED_MIG_NON_LRU,
+		KPROMOTED_MIG_COLD_OLD,
+		KPROMOTED_MIG_COLD_NOT_ACCESSED,
+		KPROMOTED_MIG_CANDIDATE,
+		KPROMOTED_MIG_PROMOTED,
+		KPROMOTED_MIG_DROPPED,
 		NR_VM_EVENT_ITEMS
 };
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 1b501db06417..ceaa462a0ce6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1358,6 +1358,13 @@ config PT_RECLAIM
 
 	  Note: now only empty user PTE page table pages will be reclaimed.
 
+config KPROMOTED
+	bool "Kernel hot page promotion daemon"
+	def_bool y
+	depends on NUMA && MIGRATION && MMU
+	help
+	  Promote hot pages from lower tier to top tier by using the
+	  memory access information provided by various sources.
 
 source "mm/damon/Kconfig"
 
diff --git a/mm/Makefile b/mm/Makefile
index 850386a67b3e..bf4f5f18f1f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
 obj-$(CONFIG_EXECMEM) += execmem.o
 obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
 obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
+obj-$(CONFIG_KPROMOTED) += kpromoted.o
diff --git a/mm/kpromoted.c b/mm/kpromoted.c
new file mode 100644
index 000000000000..2a8b8495b6b3
--- /dev/null
+++ b/mm/kpromoted.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kpromoted is a kernel thread that runs on each node that has CPU i,e.,
+ * on regular nodes.
+ *
+ * Maintains list of hot pages from lower tiers and promotes them.
+ */
+#include <linux/kpromoted.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/mmzone.h>
+#include <linux/migrate.h>
+#include <linux/memory-tiers.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/cpuhotplug.h>
+#include <linux/hashtable.h>
+
+static DEFINE_HASHTABLE(page_hotness_hash, KPROMOTED_HASH_ORDER);
+static struct mutex page_hotness_lock[1UL << KPROMOTED_HASH_ORDER];
+
+static int kpromote_page(struct page_hotness_info *phi)
+{
+	struct page *page = pfn_to_page(phi->pfn);
+	struct folio *folio;
+	int ret;
+
+	if (!page)
+		return 1;
+
+	folio = page_folio(page);
+	ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
+	if (ret)
+		return 1;
+
+	return migrate_misplaced_folio(folio, phi->hot_node);
+}
+
+static int page_should_be_promoted(struct page_hotness_info *phi)
+{
+	struct page *page = pfn_to_online_page(phi->pfn);
+	unsigned long now = jiffies;
+	struct folio *folio;
+
+	if (!page || is_zone_device_page(page))
+		return false;
+
+	folio = page_folio(page);
+	if (!folio_test_lru(folio)) {
+		count_vm_event(KPROMOTED_MIG_NON_LRU);
+		return false;
+	}
+	if (folio_nid(folio) == phi->hot_node) {
+		count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
+		return false;
+	}
+
+	/* If the page was hot a while ago, don't promote */
+	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
+		count_vm_event(KPROMOTED_MIG_COLD_OLD);
+		return false;
+	}
+
+	/* If the page hasn't been accessed enough number of times, don't promote */
+	if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
+		count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Go thro' page hotness information and migrate pages if required.
+ *
+ * Promoted pages are not longer tracked in the hot list.
+ * Cold pages are pruned from the list as well.
+ *
+ * TODO: Batching could be done
+ */
+static void kpromoted_migrate(pg_data_t *pgdat)
+{
+	int nid = pgdat->node_id;
+	struct page_hotness_info *phi;
+	struct hlist_node *tmp;
+	int nr_bkts = HASH_SIZE(page_hotness_hash);
+	int bkt;
+
+	for (bkt = 0; bkt < nr_bkts; bkt++) {
+		mutex_lock(&page_hotness_lock[bkt]);
+		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
+			if (phi->hot_node != nid)
+				continue;
+
+			if (page_should_be_promoted(phi)) {
+				count_vm_event(KPROMOTED_MIG_CANDIDATE);
+				if (!kpromote_page(phi)) {
+					count_vm_event(KPROMOTED_MIG_PROMOTED);
+					hlist_del_init(&phi->hnode);
+					kfree(phi);
+				}
+			} else {
+				/*
+				 * Not a suitable page or cold page, stop tracking it.
+				 * TODO: Identify cold pages and drive demotion?
+				 */
+				count_vm_event(KPROMOTED_MIG_DROPPED);
+				hlist_del_init(&phi->hnode);
+				kfree(phi);
+			}
+		}
+		mutex_unlock(&page_hotness_lock[bkt]);
+	}
+}
+
+static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
+{
+	struct page_hotness_info *phi;
+
+	hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
+		if (phi->pfn == pfn)
+			return phi;
+	}
+	return NULL;
+}
+
+static struct page_hotness_info *kpromoted_lookup(unsigned long pfn, int bkt, unsigned long now)
+{
+	struct page_hotness_info *phi;
+
+	phi = __kpromoted_lookup(pfn, bkt);
+	if (!phi) {
+		phi = kzalloc(sizeof(struct page_hotness_info), GFP_KERNEL);
+		if (!phi)
+			return ERR_PTR(-ENOMEM);
+
+		phi->pfn = pfn;
+		phi->frequency = 1;
+		phi->last_update = now;
+		phi->recency = now;
+		hlist_add_head(&phi->hnode, &page_hotness_hash[bkt]);
+		count_vm_event(KPROMOTED_RECORD_ADDED);
+	} else {
+		count_vm_event(KPROMOTED_RECORD_EXISTS);
+	}
+	return phi;
+}
+
+/*
+ * Called by subsystems that generate page hotness/access information.
+ *
+ * Records the memory access info for futher action by kpromoted.
+ */
+int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
+{
+	struct page_hotness_info *phi;
+	struct page *page;
+	struct folio *folio;
+	int ret, bkt;
+
+	count_vm_event(KPROMOTED_RECORDED_ACCESSES);
+
+	switch (src) {
+	case KPROMOTED_HW_HINTS:
+		count_vm_event(KPROMOTED_RECORD_HWHINTS);
+		break;
+	case KPROMOTED_PGTABLE_SCAN:
+		count_vm_event(KPROMOTED_RECORD_PGTSCANS);
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Record only accesses from lower tiers.
+	 * Assuming node having CPUs as toptier for now.
+	 */
+	if (node_is_toptier(pfn_to_nid(pfn))) {
+		count_vm_event(KPROMOTED_RECORD_TOPTIER);
+		return 0;
+	}
+
+	page = pfn_to_online_page(pfn);
+	if (!page || is_zone_device_page(page))
+		return 0;
+
+	folio = page_folio(page);
+	if (!folio_test_lru(folio))
+		return 0;
+
+	bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
+	mutex_lock(&page_hotness_lock[bkt]);
+	phi = kpromoted_lookup(pfn, bkt, now);
+	if (!phi) {
+		ret = PTR_ERR(phi);
+		goto out;
+	}
+
+	if ((phi->last_update - now) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
+		/* New window */
+		phi->frequency = 1; /* TODO: Factor in the history */
+		phi->last_update = now;
+	} else {
+		phi->frequency++;
+	}
+	phi->recency = now;
+
+	/*
+	 * TODOs:
+	 * 1. Source nid is hard-coded for some temperature sources
+	 * 2. Take action if hot_node changes - may be a shared page?
+	 * 3. Maintain node info for every access within the window?
+	 */
+	phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;
+	mutex_unlock(&page_hotness_lock[bkt]);
+out:
+	return 0;
+}
+
+/*
+ * Go through the accumulated mem_access_info and migrate
+ * pages if required.
+ */
+static void kpromoted_do_work(pg_data_t *pgdat)
+{
+	kpromoted_migrate(pgdat);
+}
+
+static inline bool kpromoted_work_requested(pg_data_t *pgdat)
+{
+	return false;
+}
+
+static int kpromoted(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t *)p;
+	struct task_struct *tsk = current;
+	long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
+
+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(tsk, cpumask);
+
+	while (!kthread_should_stop()) {
+		wait_event_timeout(pgdat->kpromoted_wait,
+				   kpromoted_work_requested(pgdat), timeout);
+		kpromoted_do_work(pgdat);
+	}
+	return 0;
+}
+
+static void kpromoted_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	if (pgdat->kpromoted)
+		return;
+
+	pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
+	if (IS_ERR(pgdat->kpromoted)) {
+		pr_err("Failed to start kpromoted on node %d\n", nid);
+		pgdat->kpromoted = NULL;
+	}
+}
+
+static int kpromoted_cpu_online(unsigned int cpu)
+{
+	int nid;
+
+	for_each_node_state(nid, N_CPU) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		const struct cpumask *mask;
+
+		mask = cpumask_of_node(pgdat->node_id);
+
+		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+			/* One of our CPUs online: restore mask */
+			if (pgdat->kpromoted)
+				set_cpus_allowed_ptr(pgdat->kpromoted, mask);
+	}
+	return 0;
+}
+
+static int __init kpromoted_init(void)
+{
+	int nid, ret, i;
+
+	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					"mm/promotion:online",
+					kpromoted_cpu_online, NULL);
+	if (ret < 0) {
+		pr_err("kpromoted: failed to register hotplug callbacks.\n");
+		return ret;
+	}
+
+	for (i = 0; i < (1UL << KPROMOTED_HASH_ORDER); i++)
+		mutex_init(&page_hotness_lock[i]);
+
+	for_each_node_state(nid, N_CPU)
+		kpromoted_run(nid);
+
+	return 0;
+}
+
+subsys_initcall(kpromoted_init)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2630cc30147e..d212df24f89b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1362,6 +1362,15 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
 #endif
 
+#ifdef CONFIG_KPROMOTED
+static void pgdat_init_kpromoted(struct pglist_data *pgdat)
+{
+	init_waitqueue_head(&pgdat->kpromoted_wait);
+}
+#else
+static void pgdat_init_kpromoted(struct pglist_data *pgdat) {}
+#endif
+
 static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 {
 	int i;
@@ -1371,6 +1380,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 
 	pgdat_init_split_queue(pgdat);
 	pgdat_init_kcompactd(pgdat);
+	pgdat_init_kpromoted(pgdat);
 
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 16bfe1c694dd..618f44bae5c8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1466,6 +1466,19 @@ const char * const vmstat_text[] = {
 	"kstack_rest",
 #endif
 #endif
+	"kpromoted_recorded_accesses",
+	"kpromoted_recorded_hwhints",
+	"kpromoted_recorded_pgtscans",
+	"kpromoted_record_toptier",
+	"kpromoted_record_added",
+	"kpromoted_record_exists",
+	"kpromoted_mig_right_node",
+	"kpromoted_mig_non_lru",
+	"kpromoted_mig_cold_old",
+	"kpromoted_mig_cold_not_accessed",
+	"kpromoted_mig_candidate",
+	"kpromoted_mig_promoted",
+	"kpromoted_mig_dropped",
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
2.34.1
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Gregory Price 10 months, 3 weeks ago
On Thu, Mar 06, 2025 at 11:15:30AM +0530, Bharata B Rao wrote:
> kpromoted is a kernel daemon that accumulates hot page info
> from different sources and tries to promote pages from slow
> tiers to top tiers. One instance of this thread runs on each
> node that has CPUs.
>

Hot take: This sounds more like ktieringd not kpromoted

Is it reasonable to split the tracking a promotion logic into separate
interfaces?  This would let us manage, for example, rate-limiting in the
movement interface cleanly without having to care about the tiering
system(s) associated with it.

    my_tiering_magic():
        ... identify hot things ...
        promote(batch_folios, optional_data);
            -> kick daemon thread to wake up and do the promotion
	... continue async things ...

Optional data could be anything from target nodes or accessor info, but
not hotness information.

Then users at least get a clean interface for things like rate-limiting,
and everyone proposing their own take on tiering can consume it.  This
may also be useful for existing users (TPP, reclaim?, etc).

~Gregory
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 10 months, 3 weeks ago
On 24-Mar-25 7:13 PM, Gregory Price wrote:
> On Thu, Mar 06, 2025 at 11:15:30AM +0530, Bharata B Rao wrote:
>> kpromoted is a kernel daemon that accumulates hot page info
>> from different sources and tries to promote pages from slow
>> tiers to top tiers. One instance of this thread runs on each
>> node that has CPUs.
>>
> 
> Hot take: This sounds more like ktieringd not kpromoted

:-)

> 
> Is it reasonable to split the tracking a promotion logic into separate
> interfaces?  This would let us manage, for example, rate-limiting in the
> movement interface cleanly without having to care about the tiering
> system(s) associated with it.
> 
>      my_tiering_magic():
>          ... identify hot things ...
>          promote(batch_folios, optional_data);
>              -> kick daemon thread to wake up and do the promotion
> 	... continue async things ...
> 
> Optional data could be anything from target nodes or accessor info, but
> not hotness information.
> 
> Then users at least get a clean interface for things like rate-limiting,
> and everyone proposing their own take on tiering can consume it.  This
> may also be useful for existing users (TPP, reclaim?, etc).

Yes, Makes sense to split tracking and promotion logic into separate 
parts. There is no need for the promotion part to work with the hot page 
list that belongs to the tracking part as I have done in this RFC.

Raghu and I already saw that migration part is kind of duplicated in our 
patchsets(kmmscand and this) and were thinking of unifying them. Having 
clean separation as you suggest will be good.

Regards,
Bharata.
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Balbir Singh 10 months, 3 weeks ago
On 3/6/25 16:45, Bharata B Rao wrote:
> kpromoted is a kernel daemon that accumulates hot page info
> from different sources and tries to promote pages from slow
> tiers to top tiers. One instance of this thread runs on each
> node that has CPUs.
> 

Could you please elaborate on what is slow vs top tier? A top tier uses
adist (which is a combination of bandwidth and latency), so I am
not sure the terminology here holds.

> Subsystems that generate hot page access info can report that
> to kpromoted via this API:
> 
> int kpromoted_record_access(u64 pfn, int nid, int src,
> 			    unsigned long time)
> 
> @pfn: The PFN of the memory accessed
> @nid: The accessing NUMA node ID
> @src: The temperature source (subsystem) that generated the
>       access info
> @time: The access time in jiffies
> 
> Some temperature sources may not provide the nid from which

What is a temperature source?

> the page was accessed. This is true for sources that use
> page table scanning for PTE Accessed bit. Currently the toptier
> node to which such pages should be promoted to is hard coded.
> 

What would it take to make this flexible?

> Also, the access time provided some sources may at best be
> considered approximate. This is especially true for hot pages
> detected by PTE A bit scanning.
> 
> kpromoted currently maintains the hot PFN records in hash lists
> hashed by PFN value. Each record stores the following info:
> 
> struct page_hotness_info {
> 	unsigned long pfn;
> 
> 	/* Time when this record was updated last */
> 	unsigned long last_update;
> 
> 	/*
> 	 * Number of times this page was accessed in the
> 	 * current window
> 	 */
> 	int frequency;
> 
> 	/* Most recent access time */
> 	unsigned long recency;
> 
> 	/* Most recent access from this node */
> 	int hot_node;
> 
> 	struct hlist_node hnode;
> };
> 
> The way in which a page is categorized as hot enough to be
> promoted is pretty primitive now.
> 
> Signed-off-by: Bharata B Rao <bharata@amd.com>
> ---
>  include/linux/kpromoted.h     |  54 ++++++
>  include/linux/mmzone.h        |   4 +
>  include/linux/vm_event_item.h |  13 ++
>  mm/Kconfig                    |   7 +
>  mm/Makefile                   |   1 +
>  mm/kpromoted.c                | 305 ++++++++++++++++++++++++++++++++++
>  mm/mm_init.c                  |  10 ++
>  mm/vmstat.c                   |  13 ++
>  8 files changed, 407 insertions(+)
>  create mode 100644 include/linux/kpromoted.h
>  create mode 100644 mm/kpromoted.c
> 
> diff --git a/include/linux/kpromoted.h b/include/linux/kpromoted.h
> new file mode 100644
> index 000000000000..2bef3d74f03a
> --- /dev/null
> +++ b/include/linux/kpromoted.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_KPROMOTED_H
> +#define _LINUX_KPROMOTED_H
> +
> +#include <linux/types.h>
> +#include <linux/init.h>
> +#include <linux/workqueue_types.h>
> +
> +/* Page hotness temperature sources */
> +enum kpromoted_src {
> +	KPROMOTED_HW_HINTS,
> +	KPROMOTED_PGTABLE_SCAN,
> +};
> +
> +#ifdef CONFIG_KPROMOTED
> +
> +#define KPROMOTED_FREQ_WINDOW	(5 * MSEC_PER_SEC)
> +
> +/* 2 accesses within a window will make the page a promotion candidate */
> +#define KPRMOTED_FREQ_THRESHOLD	2
> +

Were these value derived empirically?


> +#define KPROMOTED_HASH_ORDER	16
> +
> +struct page_hotness_info {
> +	unsigned long pfn;
> +
> +	/* Time when this record was updated last */
> +	unsigned long last_update;
> +
> +	/*
> +	 * Number of times this page was accessed in the
> +	 * current window
> +	 */
> +	int frequency;
> +
> +	/* Most recent access time */
> +	unsigned long recency;
> +
> +	/* Most recent access from this node */
> +	int hot_node;
> +	struct hlist_node hnode;
> +};
> +
> +#define KPROMOTE_DELAY	MSEC_PER_SEC
> +
> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now);
> +#else
> +static inline int kpromoted_record_access(u64 pfn, int nid, int src,
> +					  unsigned long now)
> +{
> +	return 0;
> +}
> +#endif /* CONFIG_KPROMOTED */
> +#endif /* _LINUX_KPROMOTED_H */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9540b41894da..a5c4e789aa55 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1459,6 +1459,10 @@ typedef struct pglist_data {
>  #ifdef CONFIG_MEMORY_FAILURE
>  	struct memory_failure_stats mf_stats;
>  #endif
> +#ifdef CONFIG_KPROMOTED
> +	struct task_struct *kpromoted;
> +	wait_queue_head_t kpromoted_wait;
> +#endif
>  } pg_data_t;
>  
>  #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index f70d0958095c..b5823b037883 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -182,6 +182,19 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>  		KSTACK_REST,
>  #endif
>  #endif /* CONFIG_DEBUG_STACK_USAGE */
> +		KPROMOTED_RECORDED_ACCESSES,
> +		KPROMOTED_RECORD_HWHINTS,
> +		KPROMOTED_RECORD_PGTSCANS,
> +		KPROMOTED_RECORD_TOPTIER,
> +		KPROMOTED_RECORD_ADDED,
> +		KPROMOTED_RECORD_EXISTS,
> +		KPROMOTED_MIG_RIGHT_NODE,
> +		KPROMOTED_MIG_NON_LRU,
> +		KPROMOTED_MIG_COLD_OLD,
> +		KPROMOTED_MIG_COLD_NOT_ACCESSED,
> +		KPROMOTED_MIG_CANDIDATE,
> +		KPROMOTED_MIG_PROMOTED,
> +		KPROMOTED_MIG_DROPPED,
>  		NR_VM_EVENT_ITEMS
>  };
>  
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 1b501db06417..ceaa462a0ce6 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1358,6 +1358,13 @@ config PT_RECLAIM
>  
>  	  Note: now only empty user PTE page table pages will be reclaimed.
>  
> +config KPROMOTED
> +	bool "Kernel hot page promotion daemon"
> +	def_bool y
> +	depends on NUMA && MIGRATION && MMU
> +	help
> +	  Promote hot pages from lower tier to top tier by using the
> +	  memory access information provided by various sources.
>  
>  source "mm/damon/Kconfig"
>  
> diff --git a/mm/Makefile b/mm/Makefile
> index 850386a67b3e..bf4f5f18f1f9 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
>  obj-$(CONFIG_EXECMEM) += execmem.o
>  obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
>  obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
> +obj-$(CONFIG_KPROMOTED) += kpromoted.o
> diff --git a/mm/kpromoted.c b/mm/kpromoted.c
> new file mode 100644
> index 000000000000..2a8b8495b6b3
> --- /dev/null
> +++ b/mm/kpromoted.c
> @@ -0,0 +1,305 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * kpromoted is a kernel thread that runs on each node that has CPU i,e.,
> + * on regular nodes.
> + *
> + * Maintains list of hot pages from lower tiers and promotes them.
> + */
> +#include <linux/kpromoted.h>
> +#include <linux/kthread.h>
> +#include <linux/mutex.h>
> +#include <linux/mmzone.h>
> +#include <linux/migrate.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/slab.h>
> +#include <linux/sched.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/hashtable.h>
> +
> +static DEFINE_HASHTABLE(page_hotness_hash, KPROMOTED_HASH_ORDER);
> +static struct mutex page_hotness_lock[1UL << KPROMOTED_HASH_ORDER];
> +
> +static int kpromote_page(struct page_hotness_info *phi)
> +{

Why not just call it kpromote_folio?

> +	struct page *page = pfn_to_page(phi->pfn);
> +	struct folio *folio;
> +	int ret;
> +
> +	if (!page)
> +		return 1;

Do we need to check for is_zone_device_page() here?

> +
> +	folio = page_folio(page);
> +	ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
> +	if (ret)
> +		return 1;
> +
> +	return migrate_misplaced_folio(folio, phi->hot_node);
> +}


Could you please document the assumptions for kpromote_page(), what locks
should be held? Does the ref count need to be incremented?

> +
> +static int page_should_be_promoted(struct page_hotness_info *phi)
> +{
> +	struct page *page = pfn_to_online_page(phi->pfn);
> +	unsigned long now = jiffies;
> +	struct folio *folio;
> +
> +	if (!page || is_zone_device_page(page))
> +		return false;
> +
> +	folio = page_folio(page);
> +	if (!folio_test_lru(folio)) {
> +		count_vm_event(KPROMOTED_MIG_NON_LRU);
> +		return false;
> +	}
> +	if (folio_nid(folio) == phi->hot_node) {
> +		count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
> +		return false;
> +	}
> +
> +	/* If the page was hot a while ago, don't promote */
> +	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> +		count_vm_event(KPROMOTED_MIG_COLD_OLD);

Shouldn't we update phi->last_update here?

> +		return false;
> +	}
> +
> +	/* If the page hasn't been accessed enough number of times, don't promote */
> +	if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
> +		count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
> +		return false;
> +	}
> +	return true;
> +}
> +
> +/*
> + * Go thro' page hotness information and migrate pages if required.
> + *
> + * Promoted pages are not longer tracked in the hot list.
> + * Cold pages are pruned from the list as well.
> + *
> + * TODO: Batching could be done
> + */
> +static void kpromoted_migrate(pg_data_t *pgdat)
> +{
> +	int nid = pgdat->node_id;
> +	struct page_hotness_info *phi;
> +	struct hlist_node *tmp;
> +	int nr_bkts = HASH_SIZE(page_hotness_hash);
> +	int bkt;
> +
> +	for (bkt = 0; bkt < nr_bkts; bkt++) {
> +		mutex_lock(&page_hotness_lock[bkt]);
> +		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
> +			if (phi->hot_node != nid)
> +				continue;
> +
> +			if (page_should_be_promoted(phi)) {
> +				count_vm_event(KPROMOTED_MIG_CANDIDATE);
> +				if (!kpromote_page(phi)) {
> +					count_vm_event(KPROMOTED_MIG_PROMOTED);
> +					hlist_del_init(&phi->hnode);
> +					kfree(phi);
> +				}
> +			} else {
> +				/*
> +				 * Not a suitable page or cold page, stop tracking it.
> +				 * TODO: Identify cold pages and drive demotion?
> +				 */
> +				count_vm_event(KPROMOTED_MIG_DROPPED);
> +				hlist_del_init(&phi->hnode);
> +				kfree(phi);

Won't existing demotion already handle this?

> +			}
> +		}
> +		mutex_unlock(&page_hotness_lock[bkt]);
> +	}
> +}
> +

It sounds like NUMA balancing, promotion and demotion can all act on parallel on
these folios, if not could you clarify their relationship and dependency?


> +static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
> +{
> +	struct page_hotness_info *phi;
> +
> +	hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
> +		if (phi->pfn == pfn)
> +			return phi;
> +	}
> +	return NULL;
> +}
> +
> +static struct page_hotness_info *kpromoted_lookup(unsigned long pfn, int bkt, unsigned long now)
> +{
> +	struct page_hotness_info *phi;
> +
> +	phi = __kpromoted_lookup(pfn, bkt);
> +	if (!phi) {
> +		phi = kzalloc(sizeof(struct page_hotness_info), GFP_KERNEL);
> +		if (!phi)
> +			return ERR_PTR(-ENOMEM);
> +
> +		phi->pfn = pfn;
> +		phi->frequency = 1;
> +		phi->last_update = now;
> +		phi->recency = now;
> +		hlist_add_head(&phi->hnode, &page_hotness_hash[bkt]);
> +		count_vm_event(KPROMOTED_RECORD_ADDED);
> +	} else {
> +		count_vm_event(KPROMOTED_RECORD_EXISTS);
> +	}
> +	return phi;
> +}
> +
> +/*
> + * Called by subsystems that generate page hotness/access information.
> + *
> + * Records the memory access info for futher action by kpromoted.
> + */
> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
> +{
> +	struct page_hotness_info *phi;
> +	struct page *page;
> +	struct folio *folio;
> +	int ret, bkt;
> +
> +	count_vm_event(KPROMOTED_RECORDED_ACCESSES);
> +
> +	switch (src) {
> +	case KPROMOTED_HW_HINTS:
> +		count_vm_event(KPROMOTED_RECORD_HWHINTS);
> +		break;
> +	case KPROMOTED_PGTABLE_SCAN:
> +		count_vm_event(KPROMOTED_RECORD_PGTSCANS);
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	/*
> +	 * Record only accesses from lower tiers.
> +	 * Assuming node having CPUs as toptier for now.
> +	 */
> +	if (node_is_toptier(pfn_to_nid(pfn))) {
> +		count_vm_event(KPROMOTED_RECORD_TOPTIER);
> +		return 0;
> +	}
> +
> +	page = pfn_to_online_page(pfn);
> +	if (!page || is_zone_device_page(page))
> +		return 0;
> +
> +	folio = page_folio(page);
> +	if (!folio_test_lru(folio))
> +		return 0;
> +
> +	bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
> +	mutex_lock(&page_hotness_lock[bkt]);
> +	phi = kpromoted_lookup(pfn, bkt, now);
> +	if (!phi) {
> +		ret = PTR_ERR(phi);
> +		goto out;
> +	}
> +
> +	if ((phi->last_update - now) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> +		/* New window */
> +		phi->frequency = 1; /* TODO: Factor in the history */
> +		phi->last_update = now;
> +	} else {
> +		phi->frequency++;
> +	}
> +	phi->recency = now;
> +
> +	/*
> +	 * TODOs:
> +	 * 1. Source nid is hard-coded for some temperature sources
> +	 * 2. Take action if hot_node changes - may be a shared page?
> +	 * 3. Maintain node info for every access within the window?
> +	 */
> +	phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;

I don't understand why nid needs to be 1 if nid is NUMA_NODE_ID? Does
it mean that it's being promoted to the top tier, the mix of hot_node,
tier and nid is not very clear here.

> +	mutex_unlock(&page_hotness_lock[bkt]);
> +out:
> +	return 0;
> +}
> +
> +/*
> + * Go through the accumulated mem_access_info and migrate
> + * pages if required.
> + */
> +static void kpromoted_do_work(pg_data_t *pgdat)
> +{
> +	kpromoted_migrate(pgdat);
> +}
> +
> +static inline bool kpromoted_work_requested(pg_data_t *pgdat)
> +{
> +	return false;
> +}
> +
> +static int kpromoted(void *p)
> +{
> +	pg_data_t *pgdat = (pg_data_t *)p;
> +	struct task_struct *tsk = current;
> +	long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
> +
> +	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> +
> +	if (!cpumask_empty(cpumask))
> +		set_cpus_allowed_ptr(tsk, cpumask);
> +
> +	while (!kthread_should_stop()) {
> +		wait_event_timeout(pgdat->kpromoted_wait,
> +				   kpromoted_work_requested(pgdat), timeout);
> +		kpromoted_do_work(pgdat);
> +	}
> +	return 0;
> +}
> +
> +static void kpromoted_run(int nid)
> +{
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +
> +	if (pgdat->kpromoted)
> +		return;
> +
> +	pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
> +	if (IS_ERR(pgdat->kpromoted)) {
> +		pr_err("Failed to start kpromoted on node %d\n", nid);
> +		pgdat->kpromoted = NULL;
> +	}
> +}
> +
> +static int kpromoted_cpu_online(unsigned int cpu)
> +{
> +	int nid;
> +
> +	for_each_node_state(nid, N_CPU) {
> +		pg_data_t *pgdat = NODE_DATA(nid);
> +		const struct cpumask *mask;
> +
> +		mask = cpumask_of_node(pgdat->node_id);
> +
> +		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
> +			/* One of our CPUs online: restore mask */
> +			if (pgdat->kpromoted)
> +				set_cpus_allowed_ptr(pgdat->kpromoted, mask);
> +	}
> +	return 0;
> +}
> +
> +static int __init kpromoted_init(void)
> +{
> +	int nid, ret, i;
> +
> +	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> +					"mm/promotion:online",
> +					kpromoted_cpu_online, NULL);
> +	if (ret < 0) {
> +		pr_err("kpromoted: failed to register hotplug callbacks.\n");
> +		return ret;
> +	}
> +
> +	for (i = 0; i < (1UL << KPROMOTED_HASH_ORDER); i++)
> +		mutex_init(&page_hotness_lock[i]);
> +
> +	for_each_node_state(nid, N_CPU)
> +		kpromoted_run(nid);
> +

I think we need a dynamic way to disabling promotion at run time
as well, right?


> +	return 0;
> +}
> +
> +subsys_initcall(kpromoted_init)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 2630cc30147e..d212df24f89b 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1362,6 +1362,15 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
>  static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
>  #endif
>  
> +#ifdef CONFIG_KPROMOTED
> +static void pgdat_init_kpromoted(struct pglist_data *pgdat)
> +{
> +	init_waitqueue_head(&pgdat->kpromoted_wait);
> +}
> +#else
> +static void pgdat_init_kpromoted(struct pglist_data *pgdat) {}
> +#endif
> +
>  static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>  {
>  	int i;
> @@ -1371,6 +1380,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>  
>  	pgdat_init_split_queue(pgdat);
>  	pgdat_init_kcompactd(pgdat);
> +	pgdat_init_kpromoted(pgdat);
>  
>  	init_waitqueue_head(&pgdat->kswapd_wait);
>  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 16bfe1c694dd..618f44bae5c8 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1466,6 +1466,19 @@ const char * const vmstat_text[] = {
>  	"kstack_rest",
>  #endif
>  #endif
> +	"kpromoted_recorded_accesses",
> +	"kpromoted_recorded_hwhints",
> +	"kpromoted_recorded_pgtscans",
> +	"kpromoted_record_toptier",
> +	"kpromoted_record_added",
> +	"kpromoted_record_exists",
> +	"kpromoted_mig_right_node",
> +	"kpromoted_mig_non_lru",
> +	"kpromoted_mig_cold_old",
> +	"kpromoted_mig_cold_not_accessed",
> +	"kpromoted_mig_candidate",
> +	"kpromoted_mig_promoted",
> +	"kpromoted_mig_dropped",
>  #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
>  };
>  #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 10 months, 2 weeks ago
Hi Balbir,

Sorry for the delay in response and thanks for the review...

On 24-Mar-25 9:05 AM, Balbir Singh wrote:
> On 3/6/25 16:45, Bharata B Rao wrote:
>> kpromoted is a kernel daemon that accumulates hot page info
>> from different sources and tries to promote pages from slow
>> tiers to top tiers. One instance of this thread runs on each
>> node that has CPUs.
>>
> 
> Could you please elaborate on what is slow vs top tier? A top tier uses
> adist (which is a combination of bandwidth and latency), so I am
> not sure the terminology here holds.

Slow is used to mean bottom tiers here as determined by the memory 
tiering hierarchy.

> 
>> Subsystems that generate hot page access info can report that
>> to kpromoted via this API:
>>
>> int kpromoted_record_access(u64 pfn, int nid, int src,
>> 			    unsigned long time)
>>
>> @pfn: The PFN of the memory accessed
>> @nid: The accessing NUMA node ID
>> @src: The temperature source (subsystem) that generated the
>>        access info
>> @time: The access time in jiffies
>>
>> Some temperature sources may not provide the nid from which
> 
> What is a temperature source?

Temperature source is a term used to refer to the subsystem that 
generates memory access information. For e.g. LRU subsystem that scans 
page tables for Accessed bit becomes a source.

> 
>> the page was accessed. This is true for sources that use
>> page table scanning for PTE Accessed bit. Currently the toptier
>> node to which such pages should be promoted to is hard coded.
>>
> 
> What would it take to make this flexible?

The context here is that sources that provide access information by 
scanning the PTE A bit wouldn't know from which node the access was 
done. Same is the case for kmmscand approach though Raghu has some 
heuristics to deduce the best possible toptier node to which a given 
page should be promoted. More details at 
https://lore.kernel.org/linux-mm/20250319193028.29514-1-raghavendra.kt@amd.com/

What kpromoted did for such cases is to just promote the pages to a node 
whose nid has been hard-coded for now (like 0 or 1 etc)

> 
>> Also, the access time provided some sources may at best be
>> considered approximate. This is especially true for hot pages
>> detected by PTE A bit scanning.
>>
>> kpromoted currently maintains the hot PFN records in hash lists
>> hashed by PFN value. Each record stores the following info:
>>
>> struct page_hotness_info {
>> 	unsigned long pfn;
>>
>> 	/* Time when this record was updated last */
>> 	unsigned long last_update;
>>
>> 	/*
>> 	 * Number of times this page was accessed in the
>> 	 * current window
>> 	 */
>> 	int frequency;
>>
>> 	/* Most recent access time */
>> 	unsigned long recency;
>>
>> 	/* Most recent access from this node */
>> 	int hot_node;
>>
>> 	struct hlist_node hnode;
>> };
>>
>> The way in which a page is categorized as hot enough to be
>> promoted is pretty primitive now.
>>
>> Signed-off-by: Bharata B Rao <bharata@amd.com>
>> ---
>>   include/linux/kpromoted.h     |  54 ++++++
>>   include/linux/mmzone.h        |   4 +
>>   include/linux/vm_event_item.h |  13 ++
>>   mm/Kconfig                    |   7 +
>>   mm/Makefile                   |   1 +
>>   mm/kpromoted.c                | 305 ++++++++++++++++++++++++++++++++++
>>   mm/mm_init.c                  |  10 ++
>>   mm/vmstat.c                   |  13 ++
>>   8 files changed, 407 insertions(+)
>>   create mode 100644 include/linux/kpromoted.h
>>   create mode 100644 mm/kpromoted.c
>>
>> diff --git a/include/linux/kpromoted.h b/include/linux/kpromoted.h
>> new file mode 100644
>> index 000000000000..2bef3d74f03a
>> --- /dev/null
>> +++ b/include/linux/kpromoted.h
>> @@ -0,0 +1,54 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _LINUX_KPROMOTED_H
>> +#define _LINUX_KPROMOTED_H
>> +
>> +#include <linux/types.h>
>> +#include <linux/init.h>
>> +#include <linux/workqueue_types.h>
>> +
>> +/* Page hotness temperature sources */
>> +enum kpromoted_src {
>> +	KPROMOTED_HW_HINTS,
>> +	KPROMOTED_PGTABLE_SCAN,
>> +};
>> +
>> +#ifdef CONFIG_KPROMOTED
>> +
>> +#define KPROMOTED_FREQ_WINDOW	(5 * MSEC_PER_SEC)
>> +
>> +/* 2 accesses within a window will make the page a promotion candidate */
>> +#define KPRMOTED_FREQ_THRESHOLD	2
>> +
> 
> Were these value derived empirically?

It is something I started with capture the notion of "repeated access".

> 
> 
>> +#define KPROMOTED_HASH_ORDER	16
>> +
>> +struct page_hotness_info {
>> +	unsigned long pfn;
>> +
>> +	/* Time when this record was updated last */
>> +	unsigned long last_update;
>> +
>> +	/*
>> +	 * Number of times this page was accessed in the
>> +	 * current window
>> +	 */
>> +	int frequency;
>> +
>> +	/* Most recent access time */
>> +	unsigned long recency;
>> +
>> +	/* Most recent access from this node */
>> +	int hot_node;
>> +	struct hlist_node hnode;
>> +};
>> +
>> +#define KPROMOTE_DELAY	MSEC_PER_SEC
>> +
>> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now);
>> +#else
>> +static inline int kpromoted_record_access(u64 pfn, int nid, int src,
>> +					  unsigned long now)
>> +{
>> +	return 0;
>> +}
>> +#endif /* CONFIG_KPROMOTED */
>> +#endif /* _LINUX_KPROMOTED_H */
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 9540b41894da..a5c4e789aa55 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -1459,6 +1459,10 @@ typedef struct pglist_data {
>>   #ifdef CONFIG_MEMORY_FAILURE
>>   	struct memory_failure_stats mf_stats;
>>   #endif
>> +#ifdef CONFIG_KPROMOTED
>> +	struct task_struct *kpromoted;
>> +	wait_queue_head_t kpromoted_wait;
>> +#endif
>>   } pg_data_t;
>>   
>>   #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
>> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
>> index f70d0958095c..b5823b037883 100644
>> --- a/include/linux/vm_event_item.h
>> +++ b/include/linux/vm_event_item.h
>> @@ -182,6 +182,19 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>>   		KSTACK_REST,
>>   #endif
>>   #endif /* CONFIG_DEBUG_STACK_USAGE */
>> +		KPROMOTED_RECORDED_ACCESSES,
>> +		KPROMOTED_RECORD_HWHINTS,
>> +		KPROMOTED_RECORD_PGTSCANS,
>> +		KPROMOTED_RECORD_TOPTIER,
>> +		KPROMOTED_RECORD_ADDED,
>> +		KPROMOTED_RECORD_EXISTS,
>> +		KPROMOTED_MIG_RIGHT_NODE,
>> +		KPROMOTED_MIG_NON_LRU,
>> +		KPROMOTED_MIG_COLD_OLD,
>> +		KPROMOTED_MIG_COLD_NOT_ACCESSED,
>> +		KPROMOTED_MIG_CANDIDATE,
>> +		KPROMOTED_MIG_PROMOTED,
>> +		KPROMOTED_MIG_DROPPED,
>>   		NR_VM_EVENT_ITEMS
>>   };
>>   
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 1b501db06417..ceaa462a0ce6 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -1358,6 +1358,13 @@ config PT_RECLAIM
>>   
>>   	  Note: now only empty user PTE page table pages will be reclaimed.
>>   
>> +config KPROMOTED
>> +	bool "Kernel hot page promotion daemon"
>> +	def_bool y
>> +	depends on NUMA && MIGRATION && MMU
>> +	help
>> +	  Promote hot pages from lower tier to top tier by using the
>> +	  memory access information provided by various sources.
>>   
>>   source "mm/damon/Kconfig"
>>   
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 850386a67b3e..bf4f5f18f1f9 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
>>   obj-$(CONFIG_EXECMEM) += execmem.o
>>   obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
>>   obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
>> +obj-$(CONFIG_KPROMOTED) += kpromoted.o
>> diff --git a/mm/kpromoted.c b/mm/kpromoted.c
>> new file mode 100644
>> index 000000000000..2a8b8495b6b3
>> --- /dev/null
>> +++ b/mm/kpromoted.c
>> @@ -0,0 +1,305 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * kpromoted is a kernel thread that runs on each node that has CPU i,e.,
>> + * on regular nodes.
>> + *
>> + * Maintains list of hot pages from lower tiers and promotes them.
>> + */
>> +#include <linux/kpromoted.h>
>> +#include <linux/kthread.h>
>> +#include <linux/mutex.h>
>> +#include <linux/mmzone.h>
>> +#include <linux/migrate.h>
>> +#include <linux/memory-tiers.h>
>> +#include <linux/slab.h>
>> +#include <linux/sched.h>
>> +#include <linux/cpuhotplug.h>
>> +#include <linux/hashtable.h>
>> +
>> +static DEFINE_HASHTABLE(page_hotness_hash, KPROMOTED_HASH_ORDER);
>> +static struct mutex page_hotness_lock[1UL << KPROMOTED_HASH_ORDER];
>> +
>> +static int kpromote_page(struct page_hotness_info *phi)
>> +{
> 
> Why not just call it kpromote_folio?

Yes, can be called so.

> 
>> +	struct page *page = pfn_to_page(phi->pfn);
>> +	struct folio *folio;
>> +	int ret;
>> +
>> +	if (!page)
>> +		return 1;
> 
> Do we need to check for is_zone_device_page() here?

That and other checks are part of page_should_be_promoted() call just 
prior to attempting to promote.

> 
>> +
>> +	folio = page_folio(page);
>> +	ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
>> +	if (ret)
>> +		return 1;
>> +
>> +	return migrate_misplaced_folio(folio, phi->hot_node);
>> +}
> 
> 
> Could you please document the assumptions for kpromote_page(), what locks
> should be held? Does the ref count need to be incremented?

Yes, will document. However it doesn't expect folio refcount to be 
incremented as I am tracking hotpages via PFNs and not by using struct 
folios.

> 
>> +
>> +static int page_should_be_promoted(struct page_hotness_info *phi)
>> +{
>> +	struct page *page = pfn_to_online_page(phi->pfn);
>> +	unsigned long now = jiffies;
>> +	struct folio *folio;
>> +
>> +	if (!page || is_zone_device_page(page))
>> +		return false;
>> +
>> +	folio = page_folio(page);
>> +	if (!folio_test_lru(folio)) {
>> +		count_vm_event(KPROMOTED_MIG_NON_LRU);
>> +		return false;
>> +	}
>> +	if (folio_nid(folio) == phi->hot_node) {
>> +		count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
>> +		return false;
>> +	}
>> +
>> +	/* If the page was hot a while ago, don't promote */
>> +	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>> +		count_vm_event(KPROMOTED_MIG_COLD_OLD);
> 
> Shouldn't we update phi->last_update here?

Hmm I am not sure about updating from here where we are checking for 
migration feasibility. last_update records the time when the page was 
last accesed.

> 
>> +		return false;
>> +	}
>> +
>> +	/* If the page hasn't been accessed enough number of times, don't promote */
>> +	if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
>> +		count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
>> +		return false;
>> +	}
>> +	return true;
>> +}
>> +
>> +/*
>> + * Go thro' page hotness information and migrate pages if required.
>> + *
>> + * Promoted pages are not longer tracked in the hot list.
>> + * Cold pages are pruned from the list as well.
>> + *
>> + * TODO: Batching could be done
>> + */
>> +static void kpromoted_migrate(pg_data_t *pgdat)
>> +{
>> +	int nid = pgdat->node_id;
>> +	struct page_hotness_info *phi;
>> +	struct hlist_node *tmp;
>> +	int nr_bkts = HASH_SIZE(page_hotness_hash);
>> +	int bkt;
>> +
>> +	for (bkt = 0; bkt < nr_bkts; bkt++) {
>> +		mutex_lock(&page_hotness_lock[bkt]);
>> +		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
>> +			if (phi->hot_node != nid)
>> +				continue;
>> +
>> +			if (page_should_be_promoted(phi)) {
>> +				count_vm_event(KPROMOTED_MIG_CANDIDATE);
>> +				if (!kpromote_page(phi)) {
>> +					count_vm_event(KPROMOTED_MIG_PROMOTED);
>> +					hlist_del_init(&phi->hnode);
>> +					kfree(phi);
>> +				}
>> +			} else {
>> +				/*
>> +				 * Not a suitable page or cold page, stop tracking it.
>> +				 * TODO: Identify cold pages and drive demotion?
>> +				 */
>> +				count_vm_event(KPROMOTED_MIG_DROPPED);
>> +				hlist_del_init(&phi->hnode);
>> +				kfree(phi);
> 
> Won't existing demotion already handle this?

Yes it does. I had a note here to check if it makes sense to drive 
demotion of pages that are being dropped off from kpromoted tracking 
presumably becasue they aren't hot any longer.

> 
>> +			}
>> +		}
>> +		mutex_unlock(&page_hotness_lock[bkt]);
>> +	}
>> +}
>> +
> 
> It sounds like NUMA balancing, promotion and demotion can all act on parallel on
> these folios, if not could you clarify their relationship and dependency?

kpromoted tracks the hotness of PFNs. It goes through same steps that 
others use to isolate the pages prior to migration. So it is not 
possible to find a page that kpromoted wants to migrate being parallely 
considered by NUMAB for migration or vmscan for demotion. I don't see 
any obvious dependency here, but I can check in detail.

> 
> 
>> +static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
>> +{
>> +	struct page_hotness_info *phi;
>> +
>> +	hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
>> +		if (phi->pfn == pfn)
>> +			return phi;
>> +	}
>> +	return NULL;
>> +}
>> +
>> +static struct page_hotness_info *kpromoted_lookup(unsigned long pfn, int bkt, unsigned long now)
>> +{
>> +	struct page_hotness_info *phi;
>> +
>> +	phi = __kpromoted_lookup(pfn, bkt);
>> +	if (!phi) {
>> +		phi = kzalloc(sizeof(struct page_hotness_info), GFP_KERNEL);
>> +		if (!phi)
>> +			return ERR_PTR(-ENOMEM);
>> +
>> +		phi->pfn = pfn;
>> +		phi->frequency = 1;
>> +		phi->last_update = now;
>> +		phi->recency = now;
>> +		hlist_add_head(&phi->hnode, &page_hotness_hash[bkt]);
>> +		count_vm_event(KPROMOTED_RECORD_ADDED);
>> +	} else {
>> +		count_vm_event(KPROMOTED_RECORD_EXISTS);
>> +	}
>> +	return phi;
>> +}
>> +
>> +/*
>> + * Called by subsystems that generate page hotness/access information.
>> + *
>> + * Records the memory access info for futher action by kpromoted.
>> + */
>> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
>> +{
>> +	struct page_hotness_info *phi;
>> +	struct page *page;
>> +	struct folio *folio;
>> +	int ret, bkt;
>> +
>> +	count_vm_event(KPROMOTED_RECORDED_ACCESSES);
>> +
>> +	switch (src) {
>> +	case KPROMOTED_HW_HINTS:
>> +		count_vm_event(KPROMOTED_RECORD_HWHINTS);
>> +		break;
>> +	case KPROMOTED_PGTABLE_SCAN:
>> +		count_vm_event(KPROMOTED_RECORD_PGTSCANS);
>> +		break;
>> +	default:
>> +		break;
>> +	}
>> +
>> +	/*
>> +	 * Record only accesses from lower tiers.
>> +	 * Assuming node having CPUs as toptier for now.
>> +	 */
>> +	if (node_is_toptier(pfn_to_nid(pfn))) {
>> +		count_vm_event(KPROMOTED_RECORD_TOPTIER);
>> +		return 0;
>> +	}
>> +
>> +	page = pfn_to_online_page(pfn);
>> +	if (!page || is_zone_device_page(page))
>> +		return 0;
>> +
>> +	folio = page_folio(page);
>> +	if (!folio_test_lru(folio))
>> +		return 0;
>> +
>> +	bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
>> +	mutex_lock(&page_hotness_lock[bkt]);
>> +	phi = kpromoted_lookup(pfn, bkt, now);
>> +	if (!phi) {
>> +		ret = PTR_ERR(phi);
>> +		goto out;
>> +	}
>> +
>> +	if ((phi->last_update - now) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>> +		/* New window */
>> +		phi->frequency = 1; /* TODO: Factor in the history */
>> +		phi->last_update = now;
>> +	} else {
>> +		phi->frequency++;
>> +	}
>> +	phi->recency = now;
>> +
>> +	/*
>> +	 * TODOs:
>> +	 * 1. Source nid is hard-coded for some temperature sources
>> +	 * 2. Take action if hot_node changes - may be a shared page?
>> +	 * 3. Maintain node info for every access within the window?
>> +	 */
>> +	phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;
> 
> I don't understand why nid needs to be 1 if nid is NUMA_NODE_ID? Does
> it mean that it's being promoted to the top tier, the mix of hot_node,
> tier and nid is not very clear here.

As I mentioned earlier, if the access information wasn't accompanied by 
nid (which is specified by NUMA_NO_NODE), it will be promoted to a 
hard-code (currently) toptier node.

> 
>> +	mutex_unlock(&page_hotness_lock[bkt]);
>> +out:
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Go through the accumulated mem_access_info and migrate
>> + * pages if required.
>> + */
>> +static void kpromoted_do_work(pg_data_t *pgdat)
>> +{
>> +	kpromoted_migrate(pgdat);
>> +}
>> +
>> +static inline bool kpromoted_work_requested(pg_data_t *pgdat)
>> +{
>> +	return false;
>> +}
>> +
>> +static int kpromoted(void *p)
>> +{
>> +	pg_data_t *pgdat = (pg_data_t *)p;
>> +	struct task_struct *tsk = current;
>> +	long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
>> +
>> +	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>> +
>> +	if (!cpumask_empty(cpumask))
>> +		set_cpus_allowed_ptr(tsk, cpumask);
>> +
>> +	while (!kthread_should_stop()) {
>> +		wait_event_timeout(pgdat->kpromoted_wait,
>> +				   kpromoted_work_requested(pgdat), timeout);
>> +		kpromoted_do_work(pgdat);
>> +	}
>> +	return 0;
>> +}
>> +
>> +static void kpromoted_run(int nid)
>> +{
>> +	pg_data_t *pgdat = NODE_DATA(nid);
>> +
>> +	if (pgdat->kpromoted)
>> +		return;
>> +
>> +	pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
>> +	if (IS_ERR(pgdat->kpromoted)) {
>> +		pr_err("Failed to start kpromoted on node %d\n", nid);
>> +		pgdat->kpromoted = NULL;
>> +	}
>> +}
>> +
>> +static int kpromoted_cpu_online(unsigned int cpu)
>> +{
>> +	int nid;
>> +
>> +	for_each_node_state(nid, N_CPU) {
>> +		pg_data_t *pgdat = NODE_DATA(nid);
>> +		const struct cpumask *mask;
>> +
>> +		mask = cpumask_of_node(pgdat->node_id);
>> +
>> +		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
>> +			/* One of our CPUs online: restore mask */
>> +			if (pgdat->kpromoted)
>> +				set_cpus_allowed_ptr(pgdat->kpromoted, mask);
>> +	}
>> +	return 0;
>> +}
>> +
>> +static int __init kpromoted_init(void)
>> +{
>> +	int nid, ret, i;
>> +
>> +	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
>> +					"mm/promotion:online",
>> +					kpromoted_cpu_online, NULL);
>> +	if (ret < 0) {
>> +		pr_err("kpromoted: failed to register hotplug callbacks.\n");
>> +		return ret;
>> +	}
>> +
>> +	for (i = 0; i < (1UL << KPROMOTED_HASH_ORDER); i++)
>> +		mutex_init(&page_hotness_lock[i]);
>> +
>> +	for_each_node_state(nid, N_CPU)
>> +		kpromoted_run(nid);
>> +
> 
> I think we need a dynamic way to disabling promotion at run time
> as well, right?

Myabe, but I understand that promotion is an activity that should be 
benefitial in general. What specific scenarios do you think would need 
explicit disabling of promotion?

Regards,
Bharata.
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Jonathan Cameron 11 months ago
On Thu, 6 Mar 2025 11:15:30 +0530
Bharata B Rao <bharata@amd.com> wrote:

> kpromoted is a kernel daemon that accumulates hot page info
> from different sources and tries to promote pages from slow
> tiers to top tiers. One instance of this thread runs on each
> node that has CPUs.
> 

Firstly, nice work. Much easier to discuss things with an
implementation to look at.

I'm looking at this with my hardware hotness unit "hammer" in hand :)

> Subsystems that generate hot page access info can report that
> to kpromoted via this API:
> 
> int kpromoted_record_access(u64 pfn, int nid, int src,
> 			    unsigned long time)

This perhaps works as an interface for aggregating methods
that produce per access events.  Any hardware counter solution
is going to give you data that is closer to what you used for
the promotion decision.

We might need to aggregate at different levels.  So access
counting promotes to a hot list and we can inject other events
at that level.  The data I have from the CXL HMU is typically
after an epoch (period of time) these N pages were accessed more
than M times.  I can sort of map that to the internal storage
you have.

Would be good to evaluate approximate trackers on top of access
counts. I've no idea if sketches or similar would be efficient
enough (they have a bit of a write amplification problem) but
they may give good answers with much lower storage cost at the
risk of occasionally saying something is hot when it's not.

> 
> @pfn: The PFN of the memory accessed
> @nid: The accessing NUMA node ID
> @src: The temperature source (subsystem) that generated the
>       access info
> @time: The access time in jiffies
> 
> Some temperature sources may not provide the nid from which
> the page was accessed. This is true for sources that use
> page table scanning for PTE Accessed bit. Currently the toptier
> node to which such pages should be promoted to is hard coded.

For those cases (CXL HMU included) maybe we need to
consider how to fill in missing node info with at least a vague chance
of getting a reasonable target for migration.  We can always fall
back to random top tier node, or nearest one to where we are coming
from (on basis we maybe landed in this node based on a fallback
list when the top tier was under memory pressure).

From an interface point of view is that a problem for this layer,
or for the underlying tracking mechanism? (maybe with some helpers)
Also, see later discussion of consistency of hotness tracking and
that the best solution for that differs from that to get
potential targets.  The answer to Is this page consistently hot?" can be
approximated with "Was this page once hot and is it not now cold?"

Access time is something some measurement techniques will only
give you wrt to a measurement was in a window (potentially a long
one if you are looking for consistent hotness over minutes).

> 
> Also, the access time provided some sources may at best be
> considered approximate. This is especially true for hot pages
> detected by PTE A bit scanning.
> 
> kpromoted currently maintains the hot PFN records in hash lists
> hashed by PFN value. Each record stores the following info:
> 
> struct page_hotness_info {
> 	unsigned long pfn;
> 
> 	/* Time when this record was updated last */
> 	unsigned long last_update;
> 
> 	/*
> 	 * Number of times this page was accessed in the
> 	 * current window
I'd express here how that window was defined (I read on
to answer the question I had here at first!)

> 	 */
> 	int frequency;
> 
> 	/* Most recent access time */
> 	unsigned long recency;

Put next to the last_update so all the times are together

> 
> 	/* Most recent access from this node */
> 	int hot_node;

Probably want to relax the most recent part.  I'd guess
the ideal here would be if this is the node accessing it the most
'recently'.

> 
> 	struct hlist_node hnode;
> };
> 
> The way in which a page is categorized as hot enough to be
> promoted is pretty primitive now.

That bit is very hard even if we solve everything else and heavily dependent
on workload access pattern stability and migration impact.  Maybe for
'very hot' pages a fairly short consistency of hotness period is
good enough, but it gets much messier if we care about warm pages.
I guess we solve the 'very hot' first though and maybe avoid the phase
transition from an application starting to when it is at steady state
by considering a wait time for any new userspace process before we
consider moving anything?

Also worth noting that the mechanism that makes sense to check if a
detected hot page is 'stable hot' might use entirely different tracking
approach to that used to find it as a candidate.

Whether that requires passing data between hotness trackers is an
interesting question, or whether there is a natural ordering to trackers.



> diff --git a/mm/kpromoted.c b/mm/kpromoted.c
> new file mode 100644
> index 000000000000..2a8b8495b6b3
> --- /dev/null
> +++ b/mm/kpromoted.c

> +static int page_should_be_promoted(struct page_hotness_info *phi)
> +{
> +	struct page *page = pfn_to_online_page(phi->pfn);
> +	unsigned long now = jiffies;
> +	struct folio *folio;
> +
> +	if (!page || is_zone_device_page(page))
> +		return false;
> +
> +	folio = page_folio(page);
> +	if (!folio_test_lru(folio)) {
> +		count_vm_event(KPROMOTED_MIG_NON_LRU);
> +		return false;
> +	}
> +	if (folio_nid(folio) == phi->hot_node) {
> +		count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
> +		return false;
> +	}
> +
> +	/* If the page was hot a while ago, don't promote */

	/* If the known record of hotness is old, don't promote */ ?

Otherwise this says don't move a page just because it was hot a long time
back. Maybe it is still hot and we just don't have an update yet?

> +	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> +		count_vm_event(KPROMOTED_MIG_COLD_OLD);
> +		return false;
> +	}
> +
> +	/* If the page hasn't been accessed enough number of times, don't promote */
> +	if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
> +		count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
> +		return false;
> +	}
> +	return true;
> +}
> +
> +/*
> + * Go thro' page hotness information and migrate pages if required.
> + *
> + * Promoted pages are not longer tracked in the hot list.
> + * Cold pages are pruned from the list as well.

When we say cold here why did we ever see them?

> + *
> + * TODO: Batching could be done
> + */
> +static void kpromoted_migrate(pg_data_t *pgdat)
> +{
> +	int nid = pgdat->node_id;
> +	struct page_hotness_info *phi;
> +	struct hlist_node *tmp;
> +	int nr_bkts = HASH_SIZE(page_hotness_hash);
> +	int bkt;
> +
> +	for (bkt = 0; bkt < nr_bkts; bkt++) {
> +		mutex_lock(&page_hotness_lock[bkt]);
> +		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
> +			if (phi->hot_node != nid)
> +				continue;
> +
> +			if (page_should_be_promoted(phi)) {
> +				count_vm_event(KPROMOTED_MIG_CANDIDATE);
> +				if (!kpromote_page(phi)) {
> +					count_vm_event(KPROMOTED_MIG_PROMOTED);
> +					hlist_del_init(&phi->hnode);
> +					kfree(phi);
> +				}
> +			} else {
> +				/*
> +				 * Not a suitable page or cold page, stop tracking it.
> +				 * TODO: Identify cold pages and drive demotion?

Coldness tracking is really different from hotness as we need to track what we
didn't see to get the really cold pages. Maybe there is some hint to be had
from the exit of this tracker but I'd definitely not try to tackle both ends
with one approach!

> +				 */
> +				count_vm_event(KPROMOTED_MIG_DROPPED);
> +				hlist_del_init(&phi->hnode);
> +				kfree(phi);
> +			}
> +		}
> +		mutex_unlock(&page_hotness_lock[bkt]);
> +	}
> +}


> +/*
> + * Called by subsystems that generate page hotness/access information.
> + *
> + * Records the memory access info for futher action by kpromoted.
> + */
> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
> +{

> +	bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
> +	mutex_lock(&page_hotness_lock[bkt]);
> +	phi = kpromoted_lookup(pfn, bkt, now);
> +	if (!phi) {
> +		ret = PTR_ERR(phi);
> +		goto out;
> +	}
> +
> +	if ((phi->last_update - now) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> +		/* New window */
> +		phi->frequency = 1; /* TODO: Factor in the history */
> +		phi->last_update = now;
> +	} else {
> +		phi->frequency++;
> +	}
> +	phi->recency = now;
> +
> +	/*
> +	 * TODOs:
> +	 * 1. Source nid is hard-coded for some temperature sources

Hard coded rather than unknown? I'm curious, what source has that issue?

> +	 * 2. Take action if hot_node changes - may be a shared page?
> +	 * 3. Maintain node info for every access within the window?

I guess some sort of saturating counter set might not be too bad.

> +	 */
> +	phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;
> +	mutex_unlock(&page_hotness_lock[bkt]);
> +out:
> +	return 0;

why store ret and not return it?

> +}
> +
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 10 months, 3 weeks ago
On 14-Mar-25 8:58 PM, Jonathan Cameron wrote:
> On Thu, 6 Mar 2025 11:15:30 +0530
> Bharata B Rao <bharata@amd.com> wrote:
> 
>> Subsystems that generate hot page access info can report that
>> to kpromoted via this API:
>>
>> int kpromoted_record_access(u64 pfn, int nid, int src,
>> 			    unsigned long time)
> 
> This perhaps works as an interface for aggregating methods
> that produce per access events.  Any hardware counter solution
> is going to give you data that is closer to what you used for
> the promotion decision.

Right.

> 
> We might need to aggregate at different levels.  So access
> counting promotes to a hot list and we can inject other events
> at that level.  The data I have from the CXL HMU is typically
> after an epoch (period of time) these N pages were accessed more
> than M times.  I can sort of map that to the internal storage
> you have.

Even for IBS source, I am aggregating data in per-cpu buffers before 
presenting them one by one to kpromoted. Guess CXL HMU aggregated data 
could be presented in a similar manner.

> 
> Would be good to evaluate approximate trackers on top of access
> counts. I've no idea if sketches or similar would be efficient
> enough (they have a bit of a write amplification problem) but
> they may give good answers with much lower storage cost at the
> risk of occasionally saying something is hot when it's not.

Could me point me to some information about sketches??

> 
>>
>> @pfn: The PFN of the memory accessed
>> @nid: The accessing NUMA node ID
>> @src: The temperature source (subsystem) that generated the
>>        access info
>> @time: The access time in jiffies
>>
>> Some temperature sources may not provide the nid from which
>> the page was accessed. This is true for sources that use
>> page table scanning for PTE Accessed bit. Currently the toptier
>> node to which such pages should be promoted to is hard coded.
> 
> For those cases (CXL HMU included) maybe we need to
> consider how to fill in missing node info with at least a vague chance
> of getting a reasonable target for migration.  We can always fall
> back to random top tier node, or nearest one to where we are coming
> from (on basis we maybe landed in this node based on a fallback
> list when the top tier was under memory pressure).

Yes. For A-bit scanners, Raghu has devised a scheme to obtain the best 
possible list of target nodes for promotion. He should be sharing more 
about it soon.

> 
>  From an interface point of view is that a problem for this layer,
> or for the underlying tracking mechanism? (maybe with some helpers)

It is not a problem from this interface point of view as this interface 
expects a nid(or default value) and would use that for promotion. It is 
up to the underlying tracking mechanism to provide the most appropriate 
target nid.

> Also, see later discussion of consistency of hotness tracking and
> that the best solution for that differs from that to get
> potential targets.  The answer to Is this page consistently hot?" can be
> approximated with "Was this page once hot and is it not now cold?"
> 
> Access time is something some measurement techniques will only
> give you wrt to a measurement was in a window (potentially a long
> one if you are looking for consistent hotness over minutes).
> 
>>
>> Also, the access time provided some sources may at best be
>> considered approximate. This is especially true for hot pages
>> detected by PTE A bit scanning.
>>
>> kpromoted currently maintains the hot PFN records in hash lists
>> hashed by PFN value. Each record stores the following info:
>>
>> struct page_hotness_info {
>> 	unsigned long pfn;
>>
>> 	/* Time when this record was updated last */
>> 	unsigned long last_update;
>>
>> 	/*
>> 	 * Number of times this page was accessed in the
>> 	 * current window
> I'd express here how that window was defined (I read on
> to answer the question I had here at first!)

Currently the number of accesses that occur within an observation window 
of 5s are considered for hotness calculation and access count is reset 
when the window elapses. This needs to factor in history etc.

> 
>> 	 */
>> 	int frequency;
>>
>> 	/* Most recent access time */
>> 	unsigned long recency;
> 
> Put next to the last_update so all the times are together

Sure.

> 
>>
>> 	/* Most recent access from this node */
>> 	int hot_node;
> 
> Probably want to relax the most recent part.  I'd guess
> the ideal here would be if this is the node accessing it the most
> 'recently'.

You mean the node that did most number of accesses in the given 
observation window and not necessarily the last (or most recent) 
accessed node.

> 
>>
>> 	struct hlist_node hnode;
>> };
>>
>> The way in which a page is categorized as hot enough to be
>> promoted is pretty primitive now.
> 
> That bit is very hard even if we solve everything else and heavily dependent
> on workload access pattern stability and migration impact.  Maybe for
> 'very hot' pages a fairly short consistency of hotness period is
> good enough, but it gets much messier if we care about warm pages.
> I guess we solve the 'very hot' first though and maybe avoid the phase
> transition from an application starting to when it is at steady state
> by considering a wait time for any new userspace process before we
> consider moving anything?
> 
> Also worth noting that the mechanism that makes sense to check if a
> detected hot page is 'stable hot' might use entirely different tracking
> approach to that used to find it as a candidate.
> 
> Whether that requires passing data between hotness trackers is an
> interesting question, or whether there is a natural ordering to trackers.

I was envisioning that different hotness trackers would reinforce the 
page hotness by reporting the same to kpromoted and there would be no 
need to again pass data between hotness trackers.

> 
> 
> 
>> diff --git a/mm/kpromoted.c b/mm/kpromoted.c
>> new file mode 100644
>> index 000000000000..2a8b8495b6b3
>> --- /dev/null
>> +++ b/mm/kpromoted.c
> 
>> +static int page_should_be_promoted(struct page_hotness_info *phi)
>> +{
>> +	struct page *page = pfn_to_online_page(phi->pfn);
>> +	unsigned long now = jiffies;
>> +	struct folio *folio;
>> +
>> +	if (!page || is_zone_device_page(page))
>> +		return false;
>> +
>> +	folio = page_folio(page);
>> +	if (!folio_test_lru(folio)) {
>> +		count_vm_event(KPROMOTED_MIG_NON_LRU);
>> +		return false;
>> +	}
>> +	if (folio_nid(folio) == phi->hot_node) {
>> +		count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
>> +		return false;
>> +	}
>> +
>> +	/* If the page was hot a while ago, don't promote */
> 
> 	/* If the known record of hotness is old, don't promote */ ?
> 
> Otherwise this says don't move a page just because it was hot a long time
> back. Maybe it is still hot and we just don't have an update yet?

Agreed.

> 
>> +	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>> +		count_vm_event(KPROMOTED_MIG_COLD_OLD);
>> +		return false;
>> +	}
>> +
>> +	/* If the page hasn't been accessed enough number of times, don't promote */
>> +	if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
>> +		count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
>> +		return false;
>> +	}
>> +	return true;
>> +}
>> +
>> +/*
>> + * Go thro' page hotness information and migrate pages if required.
>> + *
>> + * Promoted pages are not longer tracked in the hot list.
>> + * Cold pages are pruned from the list as well.
> 
> When we say cold here why did we ever see them?

Those hot pages that couldn't be migrated for different reasons are no 
longer tracked by kpromoted and I called such pages as "cold". Guess not 
the right nomenclature to represent them.

> 
>> + *
>> + * TODO: Batching could be done
>> + */
>> +static void kpromoted_migrate(pg_data_t *pgdat)
>> +{
>> +	int nid = pgdat->node_id;
>> +	struct page_hotness_info *phi;
>> +	struct hlist_node *tmp;
>> +	int nr_bkts = HASH_SIZE(page_hotness_hash);
>> +	int bkt;
>> +
>> +	for (bkt = 0; bkt < nr_bkts; bkt++) {
>> +		mutex_lock(&page_hotness_lock[bkt]);
>> +		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
>> +			if (phi->hot_node != nid)
>> +				continue;
>> +
>> +			if (page_should_be_promoted(phi)) {
>> +				count_vm_event(KPROMOTED_MIG_CANDIDATE);
>> +				if (!kpromote_page(phi)) {
>> +					count_vm_event(KPROMOTED_MIG_PROMOTED);
>> +					hlist_del_init(&phi->hnode);
>> +					kfree(phi);
>> +				}
>> +			} else {
>> +				/*
>> +				 * Not a suitable page or cold page, stop tracking it.
>> +				 * TODO: Identify cold pages and drive demotion?
> 
> Coldness tracking is really different from hotness as we need to track what we
> didn't see to get the really cold pages. Maybe there is some hint to be had
> from the exit of this tracker but I'd definitely not try to tackle both ends
> with one approach!

Okay.

> 
>> +				 */
>> +				count_vm_event(KPROMOTED_MIG_DROPPED);
>> +				hlist_del_init(&phi->hnode);
>> +				kfree(phi);
>> +			}
>> +		}
>> +		mutex_unlock(&page_hotness_lock[bkt]);
>> +	}
>> +}
> 
> 
>> +/*
>> + * Called by subsystems that generate page hotness/access information.
>> + *
>> + * Records the memory access info for futher action by kpromoted.
>> + */
>> +int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
>> +{
> 
>> +	bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
>> +	mutex_lock(&page_hotness_lock[bkt]);
>> +	phi = kpromoted_lookup(pfn, bkt, now);
>> +	if (!phi) {
>> +		ret = PTR_ERR(phi);
>> +		goto out;
>> +	}
>> +
>> +	if ((phi->last_update - now) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>> +		/* New window */
>> +		phi->frequency = 1; /* TODO: Factor in the history */
>> +		phi->last_update = now;
>> +	} else {
>> +		phi->frequency++;
>> +	}
>> +	phi->recency = now;
>> +
>> +	/*
>> +	 * TODOs:
>> +	 * 1. Source nid is hard-coded for some temperature sources
> 
> Hard coded rather than unknown? I'm curious, what source has that issue?

I meant that source didn't provide a nid and hence kpromoted ended up 
promoting to a fixed (hard-coded for now) toptier node.

> 
>> +	 * 2. Take action if hot_node changes - may be a shared page?
>> +	 * 3. Maintain node info for every access within the window?
> 
> I guess some sort of saturating counter set might not be too bad.

Yes.

> 
>> +	 */
>> +	phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;
>> +	mutex_unlock(&page_hotness_lock[bkt]);
>> +out:
>> +	return 0;
> 
> why store ret and not return it?

Will fix.

Thanks for your review!

Regards,
Bharata.
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Jonathan Cameron 10 months, 3 weeks ago
On Tue, 18 Mar 2025 09:39:17 +0530
Bharata B Rao <bharata@amd.com> wrote:

> On 14-Mar-25 8:58 PM, Jonathan Cameron wrote:
> > On Thu, 6 Mar 2025 11:15:30 +0530
> > Bharata B Rao <bharata@amd.com> wrote:
> >   
> >> Subsystems that generate hot page access info can report that
> >> to kpromoted via this API:
> >>
> >> int kpromoted_record_access(u64 pfn, int nid, int src,
> >> 			    unsigned long time)  
> > 
> > This perhaps works as an interface for aggregating methods
> > that produce per access events.  Any hardware counter solution
> > is going to give you data that is closer to what you used for
> > the promotion decision.  
> 
> Right.
> 
> > 
> > We might need to aggregate at different levels.  So access
> > counting promotes to a hot list and we can inject other events
> > at that level.  The data I have from the CXL HMU is typically
> > after an epoch (period of time) these N pages were accessed more
> > than M times.  I can sort of map that to the internal storage
> > you have.  
> 
> Even for IBS source, I am aggregating data in per-cpu buffers before 
> presenting them one by one to kpromoted. Guess CXL HMU aggregated data 
> could be presented in a similar manner.

The nature of the data maybe a bit different but certainly should be
able to find somewhere in the stack!

> 
> > 
> > Would be good to evaluate approximate trackers on top of access
> > counts. I've no idea if sketches or similar would be efficient
> > enough (they have a bit of a write amplification problem) but
> > they may give good answers with much lower storage cost at the
> > risk of occasionally saying something is hot when it's not.  
> 
> Could me point me to some information about sketches??

https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch
Is a good starting point but there are lots of related techniques
that trade off good statistical properties against complexity etc.

Roughly speaking you combine a sorted list of the very hottest
with a small number of different hash tables (the sketch) that let you
get an estimate of how hot things are that have dropped off your
hottest list (or not yet gotten hot enough to get into it). 

Looking for literature for top-k algorithms will find you
more references though not all are light weight enough to be
of interest here.

> 
> >   
> >>
> >> @pfn: The PFN of the memory accessed
> >> @nid: The accessing NUMA node ID
> >> @src: The temperature source (subsystem) that generated the
> >>        access info
> >> @time: The access time in jiffies
> >>
> >> Some temperature sources may not provide the nid from which
> >> the page was accessed. This is true for sources that use
> >> page table scanning for PTE Accessed bit. Currently the toptier
> >> node to which such pages should be promoted to is hard coded.  
> > 
> > For those cases (CXL HMU included) maybe we need to
> > consider how to fill in missing node info with at least a vague chance
> > of getting a reasonable target for migration.  We can always fall
> > back to random top tier node, or nearest one to where we are coming
> > from (on basis we maybe landed in this node based on a fallback
> > list when the top tier was under memory pressure).  
> 
> Yes. For A-bit scanners, Raghu has devised a scheme to obtain the best 
> possible list of target nodes for promotion. He should be sharing more 
> about it soon.

Excellent - look forward to seeing that.  Can think of a few possibilities
on how to get that data efficiently so I'm curious what Raghu has chosen.

> 
> > 
> >  From an interface point of view is that a problem for this layer,
> > or for the underlying tracking mechanism? (maybe with some helpers)  
> 
> It is not a problem from this interface point of view as this interface 
> expects a nid(or default value) and would use that for promotion. It is 
> up to the underlying tracking mechanism to provide the most appropriate 
> target nid.

I was wondering if there is some sharing to do, so whether we push Raghu's
means of getting a target node down into the tracker implementation or use
it to fill in messing info at this layer.  Will depend a bit on how
that technique works perhaps.

> 
> > Also, see later discussion of consistency of hotness tracking and
> > that the best solution for that differs from that to get
> > potential targets.  The answer to Is this page consistently hot?" can be
> > approximated with "Was this page once hot and is it not now cold?"
> > 
> > Access time is something some measurement techniques will only
> > give you wrt to a measurement was in a window (potentially a long
> > one if you are looking for consistent hotness over minutes).
> >   
> >>
> >> Also, the access time provided some sources may at best be
> >> considered approximate. This is especially true for hot pages
> >> detected by PTE A bit scanning.
> >>
> >> kpromoted currently maintains the hot PFN records in hash lists
> >> hashed by PFN value. Each record stores the following info:
> >>
> >> struct page_hotness_info {
> >> 	unsigned long pfn;
> >>
> >> 	/* Time when this record was updated last */
> >> 	unsigned long last_update;
> >>
> >> 	/*
> >> 	 * Number of times this page was accessed in the
> >> 	 * current window  
> > I'd express here how that window was defined (I read on
> > to answer the question I had here at first!)  
> 
> Currently the number of accesses that occur within an observation window 
> of 5s are considered for hotness calculation and access count is reset 
> when the window elapses. This needs to factor in history etc.

Just add that to the comment here perhaps.


> 
> >   
> >>
> >> 	/* Most recent access from this node */
> >> 	int hot_node;  
> > 
> > Probably want to relax the most recent part.  I'd guess
> > the ideal here would be if this is the node accessing it the most
> > 'recently'.  
> 
> You mean the node that did most number of accesses in the given 
> observation window and not necessarily the last (or most recent) 
> accessed node.

yes. Though maybe weighted in some fashion for recency?  Something
cheap to do that approximates that such as small saturating counters
with aging..

> 
> >   
> >>
> >> 	struct hlist_node hnode;
> >> };
> >>
> >> The way in which a page is categorized as hot enough to be
> >> promoted is pretty primitive now.  
> > 
> > That bit is very hard even if we solve everything else and heavily dependent
> > on workload access pattern stability and migration impact.  Maybe for
> > 'very hot' pages a fairly short consistency of hotness period is
> > good enough, but it gets much messier if we care about warm pages.
> > I guess we solve the 'very hot' first though and maybe avoid the phase
> > transition from an application starting to when it is at steady state
> > by considering a wait time for any new userspace process before we
> > consider moving anything?
> > 
> > Also worth noting that the mechanism that makes sense to check if a
> > detected hot page is 'stable hot' might use entirely different tracking
> > approach to that used to find it as a candidate.
> > 
> > Whether that requires passing data between hotness trackers is an
> > interesting question, or whether there is a natural ordering to trackers.  
> 
> I was envisioning that different hotness trackers would reinforce the 
> page hotness by reporting the same to kpromoted and there would be no 
> need to again pass data between hotness trackers.

What makes me wonder about that is the question of stability of hotness.
It is a really bad idea to move data based on a short sample - cost is huge
and quite a bit of data is only briefly hot - moving it to fast memory too
early just results in bouncing.  There are probably heuristics we can apply
on process age etc that will help, but generally we can't assume programs
don't have multiple phases with very different access characteristics.

The different tracking approaches have different sweet spots for short vs long
tracking. So it might be a case of one method, e.g. a hotness tracker
is only suitable for monitoring a short time period
(in the simplest sense, counters saturate if you run too long).
Don't read that to generally though as it's not a universal characteristic
and depends on the implementation used, but it is definitely true of
some potential implementations.

Having gotten a list of 1000+ candidate pages that might be worth moving,
we could use access bits to check it's still accessed reasonsably frequently
over the next minute. That can be much lower cost than an access tracker
that is looking for 'hottest'.

Where all these trade offs with timing occur is tricky and workload
dependent.  So figuring out how to autotune will be a challenging.

> 
> > 
> > 
> >   
> >> diff --git a/mm/kpromoted.c b/mm/kpromoted.c
> >> new file mode 100644
> >> index 000000000000..2a8b8495b6b3
> >> --- /dev/null
> >> +++ b/mm/kpromoted.c  


> >> +	bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
> >> +	mutex_lock(&page_hotness_lock[bkt]);
> >> +	phi = kpromoted_lookup(pfn, bkt, now);
> >> +	if (!phi) {
> >> +		ret = PTR_ERR(phi);
> >> +		goto out;
> >> +	}
> >> +
> >> +	if ((phi->last_update - now) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> >> +		/* New window */
> >> +		phi->frequency = 1; /* TODO: Factor in the history */
> >> +		phi->last_update = now;
> >> +	} else {
> >> +		phi->frequency++;
> >> +	}
> >> +	phi->recency = now;
> >> +
> >> +	/*
> >> +	 * TODOs:
> >> +	 * 1. Source nid is hard-coded for some temperature sources  
> > 
> > Hard coded rather than unknown? I'm curious, what source has that issue?  
> 
> I meant that source didn't provide a nid and hence kpromoted ended up 
> promoting to a fixed (hard-coded for now) toptier node.

Sure. Unknown nid makes sense here.


Thanks,

Jonathan
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Davidlohr Bueso 11 months ago
On Thu, 06 Mar 2025, Bharata B Rao wrote:

>+/*
>+ * Go thro' page hotness information and migrate pages if required.
>+ *
>+ * Promoted pages are not longer tracked in the hot list.
>+ * Cold pages are pruned from the list as well.
>+ *
>+ * TODO: Batching could be done
>+ */
>+static void kpromoted_migrate(pg_data_t *pgdat)
>+{
>+	int nid = pgdat->node_id;
>+	struct page_hotness_info *phi;
>+	struct hlist_node *tmp;
>+	int nr_bkts = HASH_SIZE(page_hotness_hash);
>+	int bkt;
>+
>+	for (bkt = 0; bkt < nr_bkts; bkt++) {
>+		mutex_lock(&page_hotness_lock[bkt]);
>+		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
>+			if (phi->hot_node != nid)
>+				continue;
>+
>+			if (page_should_be_promoted(phi)) {
>+				count_vm_event(KPROMOTED_MIG_CANDIDATE);
>+				if (!kpromote_page(phi)) {
>+					count_vm_event(KPROMOTED_MIG_PROMOTED);
>+					hlist_del_init(&phi->hnode);
>+					kfree(phi);
>+				}
>+			} else {
>+				/*
>+				 * Not a suitable page or cold page, stop tracking it.
>+				 * TODO: Identify cold pages and drive demotion?
>+				 */

I don't think kpromoted should drive demotion at all. No one is complaining about migrate
in lieu of discard, and there is also proactive reclaim which users can trigger. All the
in-kernel problems are wrt promotion. The simpler any of these kthreads are the better.

>+				count_vm_event(KPROMOTED_MIG_DROPPED);
>+				hlist_del_init(&phi->hnode);
>+				kfree(phi);
>+			}
>+		}
>+		mutex_unlock(&page_hotness_lock[bkt]);
>+	}
>+}
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 10 months, 4 weeks ago
On 14-Mar-25 2:06 AM, Davidlohr Bueso wrote:
> On Thu, 06 Mar 2025, Bharata B Rao wrote:
> 
>> +/*
>> + * Go thro' page hotness information and migrate pages if required.
>> + *
>> + * Promoted pages are not longer tracked in the hot list.
>> + * Cold pages are pruned from the list as well.
>> + *
>> + * TODO: Batching could be done
>> + */
>> +static void kpromoted_migrate(pg_data_t *pgdat)
>> +{
>> +    int nid = pgdat->node_id;
>> +    struct page_hotness_info *phi;
>> +    struct hlist_node *tmp;
>> +    int nr_bkts = HASH_SIZE(page_hotness_hash);
>> +    int bkt;
>> +
>> +    for (bkt = 0; bkt < nr_bkts; bkt++) {
>> +        mutex_lock(&page_hotness_lock[bkt]);
>> +        hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], 
>> hnode) {
>> +            if (phi->hot_node != nid)
>> +                continue;
>> +
>> +            if (page_should_be_promoted(phi)) {
>> +                count_vm_event(KPROMOTED_MIG_CANDIDATE);
>> +                if (!kpromote_page(phi)) {
>> +                    count_vm_event(KPROMOTED_MIG_PROMOTED);
>> +                    hlist_del_init(&phi->hnode);
>> +                    kfree(phi);
>> +                }
>> +            } else {
>> +                /*
>> +                 * Not a suitable page or cold page, stop tracking it.
>> +                 * TODO: Identify cold pages and drive demotion?
>> +                 */
> 
> I don't think kpromoted should drive demotion at all. No one is 
> complaining about migrate
> in lieu of discard, and there is also proactive reclaim which users can 
> trigger. All the
> in-kernel problems are wrt promotion. The simpler any of these kthreads 
> are the better.

I was testing on default kernel with NUMA balancing mode 2.

The multi-threaded application allocates memory on DRAM and the 
allocation spills over to CXL node. The threads keep accessing allocated 
memory pages in random order.

pgpromote_success 6
pgpromote_candidate 745387
pgdemote_kswapd 51085
pgdemote_direct 10481
pgdemote_khugepaged 0
numa_pte_updates 27249625
numa_huge_pte_updates 0
numa_hint_faults 9660745
numa_hint_faults_local 0
numa_pages_migrated 6
numa_node_full 745438
pgmigrate_success 2225458
pgmigrate_fail 1187349

I hardly see any promotion happening.

In order to check the number of times the toptier node was found to be 
full when attempting to promote, I added numa_node_full counter like below:

diff --git a/mm/migrate.c b/mm/migrate.c
index fb19a18892c8..4d049d896589 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2673,6 +2673,7 @@ int migrate_misplaced_folio_prepare(struct folio 
*folio,
         if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
                 int z;

+               count_vm_event(NUMA_NODE_FULL);
                 if (!(sysctl_numa_balancing_mode & 
NUMA_BALANCING_MEMORY_TIERING))
                         return -EAGAIN;
                 for (z = pgdat->nr_zones - 1; z >= 0; z--) {


As seen above, numa_node_full 745438. This matches pgpromote_candidate 
numbers.

I do see counters reporting kswapd-driven and direct demotion as well 
but does this mean that demotion isn't happening fast enough to cope up 
with promotion requirement in this high toptier memory pressure situation?

Regards,
Bharata.
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Davidlohr Bueso 11 months ago
On Thu, 06 Mar 2025, Bharata B Rao wrote:

>+static int page_should_be_promoted(struct page_hotness_info *phi)
>+{
>+	struct page *page = pfn_to_online_page(phi->pfn);
>+	unsigned long now = jiffies;
>+	struct folio *folio;
>+
>+	if (!page || is_zone_device_page(page))
>+		return false;
>+
>+	folio = page_folio(page);
>+	if (!folio_test_lru(folio)) {
>+		count_vm_event(KPROMOTED_MIG_NON_LRU);
>+		return false;
>+	}
>+	if (folio_nid(folio) == phi->hot_node) {
>+		count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
>+		return false;
>+	}

How about using the LRU age itself:

if (folio_test_active())
    return true;

>+
>+	/* If the page was hot a while ago, don't promote */
>+	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>+		count_vm_event(KPROMOTED_MIG_COLD_OLD);
>+		return false;
>+	}
>+
>+	/* If the page hasn't been accessed enough number of times, don't promote */
>+	if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
>+		count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
>+		return false;
>+	}
>+	return true;
>+}

...

>+static int kpromoted(void *p)
>+{
>+	pg_data_t *pgdat = (pg_data_t *)p;
>+	struct task_struct *tsk = current;
>+	long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
>+
>+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>+
>+	if (!cpumask_empty(cpumask))
>+		set_cpus_allowed_ptr(tsk, cpumask);

Explicit cpumasks are not needed if you use kthread_create_on_node().

See https://web.git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=c6a566f6c1b4d5dff659acd221f95a72923f4085

>+
>+	while (!kthread_should_stop()) {
>+		wait_event_timeout(pgdat->kpromoted_wait,
>+				   kpromoted_work_requested(pgdat), timeout);
>+		kpromoted_do_work(pgdat);
>+	}
>+	return 0;
>+}
>+
>+static void kpromoted_run(int nid)
>+{
>+	pg_data_t *pgdat = NODE_DATA(nid);
>+
>+	if (pgdat->kpromoted)
>+		return;
>+
>+	pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
>+	if (IS_ERR(pgdat->kpromoted)) {
>+		pr_err("Failed to start kpromoted on node %d\n", nid);
>+		pgdat->kpromoted = NULL;
>+	}
>+}
>+
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 10 months, 4 weeks ago
On 13-Mar-25 10:14 PM, Davidlohr Bueso wrote:
> On Thu, 06 Mar 2025, Bharata B Rao wrote:
> 
>> +static int page_should_be_promoted(struct page_hotness_info *phi)
>> +{
>> +    struct page *page = pfn_to_online_page(phi->pfn);
>> +    unsigned long now = jiffies;
>> +    struct folio *folio;
>> +
>> +    if (!page || is_zone_device_page(page))
>> +        return false;
>> +
>> +    folio = page_folio(page);
>> +    if (!folio_test_lru(folio)) {
>> +        count_vm_event(KPROMOTED_MIG_NON_LRU);
>> +        return false;
>> +    }
>> +    if (folio_nid(folio) == phi->hot_node) {
>> +        count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
>> +        return false;
>> +    }
> 
> How about using the LRU age itself:

Sounds like a good check for page hotness.

> 
> if (folio_test_active())
>     return true;

But the numbers I obtained with this check added, didn't really hit this 
condition all that much. I was running a multi-threaded application that 
allocates enough memory such that the allocation spills over from DRAM 
node to the CXL node. Threads keep touching the memory pages in random 
order.

kpromoted_recorded_accesses 960620 /* Number of recorded accesses */
kpromoted_recorded_hwhints 960620  /* Nr accesses via HW hints, IBS in 
this case */
kpromoted_recorded_pgtscans 0
kpromoted_record_toptier 638006 /* Nr toptier accesses */
kpromoted_record_added 321234 /* Nr (CXL) accesses that are tracked */
kpromoted_record_exists 1380
kpromoted_mig_right_node 0
kpromoted_mig_non_lru 226
kpromoted_mig_lru_active 47 /* Number of accesses considered for 
promotion as determined by folio_test_active() check */
kpromoted_mig_cold_old 0
kpromoted_mig_cold_not_accessed 1373
kpromoted_mig_candidate 319635
kpromoted_mig_promoted 319635
kpromoted_mig_dropped 1599

Need to check why is this the case.

> 
>> +
>> +    /* If the page was hot a while ago, don't promote */
>> +    if ((now - phi->last_update) > 2 * 
>> msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>> +        count_vm_event(KPROMOTED_MIG_COLD_OLD);
>> +        return false;
>> +    }
>> +
>> +    /* If the page hasn't been accessed enough number of times, don't 
>> promote */
>> +    if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
>> +        count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
>> +        return false;
>> +    }
>> +    return true;
>> +}
> 
> ...
> 
>> +static int kpromoted(void *p)
>> +{
>> +    pg_data_t *pgdat = (pg_data_t *)p;
>> +    struct task_struct *tsk = current;
>> +    long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
>> +
>> +    const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>> +
>> +    if (!cpumask_empty(cpumask))
>> +        set_cpus_allowed_ptr(tsk, cpumask);
> 
> Explicit cpumasks are not needed if you use kthread_create_on_node().

Thanks, will incorporate.

Regards,
Bharata.
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Gregory Price 10 months, 4 weeks ago
On Mon, Mar 17, 2025 at 09:09:18AM +0530, Bharata B Rao wrote:
> On 13-Mar-25 10:14 PM, Davidlohr Bueso wrote:
> > On Thu, 06 Mar 2025, Bharata B Rao wrote:
> > 
> > > +static int page_should_be_promoted(struct page_hotness_info *phi)
> > > +{
> > > +    struct page *page = pfn_to_online_page(phi->pfn);
> > > +    unsigned long now = jiffies;
> > > +    struct folio *folio;
> > > +
> > > +    if (!page || is_zone_device_page(page))
> > > +        return false;
> > > +
> > > +    folio = page_folio(page);
> > > +    if (!folio_test_lru(folio)) {
> > > +        count_vm_event(KPROMOTED_MIG_NON_LRU);
> > > +        return false;
> > > +    }
> > > +    if (folio_nid(folio) == phi->hot_node) {
> > > +        count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
> > > +        return false;
> > > +    }
> > 
> > How about using the LRU age itself:
> 
> Sounds like a good check for page hotness.
> 
> > 
> > if (folio_test_active())
> >     return true;
> 
> But the numbers I obtained with this check added, didn't really hit this
> condition all that much. I was running a multi-threaded application that
> allocates enough memory such that the allocation spills over from DRAM node
> to the CXL node. Threads keep touching the memory pages in random order.
> 

Is demotion enabled by any chance?

i.e. are you sure it's actually allocating from CXL and not demoting
cold stuff to CXL?

> kpromoted_recorded_accesses 960620 /* Number of recorded accesses */
> kpromoted_recorded_hwhints 960620  /* Nr accesses via HW hints, IBS in this
> case */
> kpromoted_recorded_pgtscans 0
> kpromoted_record_toptier 638006 /* Nr toptier accesses */
> kpromoted_record_added 321234 /* Nr (CXL) accesses that are tracked */
> kpromoted_record_exists 1380
> kpromoted_mig_right_node 0
> kpromoted_mig_non_lru 226
> kpromoted_mig_lru_active 47 /* Number of accesses considered for promotion
> as determined by folio_test_active() check */
> kpromoted_mig_cold_old 0
> kpromoted_mig_cold_not_accessed 1373
> kpromoted_mig_candidate 319635
> kpromoted_mig_promoted 319635
> kpromoted_mig_dropped 1599
> 
> Need to check why is this the case.
> 
> > 
> > > +
> > > +    /* If the page was hot a while ago, don't promote */
> > > +    if ((now - phi->last_update) > 2 *
> > > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
> > > +        count_vm_event(KPROMOTED_MIG_COLD_OLD);
> > > +        return false;
> > > +    }
> > > +
> > > +    /* If the page hasn't been accessed enough number of times,
> > > don't promote */
> > > +    if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
> > > +        count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
> > > +        return false;
> > > +    }
> > > +    return true;
> > > +}
> > 
> > ...
> > 
> > > +static int kpromoted(void *p)
> > > +{
> > > +    pg_data_t *pgdat = (pg_data_t *)p;
> > > +    struct task_struct *tsk = current;
> > > +    long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
> > > +
> > > +    const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> > > +
> > > +    if (!cpumask_empty(cpumask))
> > > +        set_cpus_allowed_ptr(tsk, cpumask);
> > 
> > Explicit cpumasks are not needed if you use kthread_create_on_node().
> 
> Thanks, will incorporate.
> 
> Regards,
> Bharata.
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 10 months, 4 weeks ago
On 17-Mar-25 8:35 PM, Gregory Price wrote:
> On Mon, Mar 17, 2025 at 09:09:18AM +0530, Bharata B Rao wrote:
>> On 13-Mar-25 10:14 PM, Davidlohr Bueso wrote:
>>> On Thu, 06 Mar 2025, Bharata B Rao wrote:
>>>
>>>> +static int page_should_be_promoted(struct page_hotness_info *phi)
>>>> +{
>>>> +    struct page *page = pfn_to_online_page(phi->pfn);
>>>> +    unsigned long now = jiffies;
>>>> +    struct folio *folio;
>>>> +
>>>> +    if (!page || is_zone_device_page(page))
>>>> +        return false;
>>>> +
>>>> +    folio = page_folio(page);
>>>> +    if (!folio_test_lru(folio)) {
>>>> +        count_vm_event(KPROMOTED_MIG_NON_LRU);
>>>> +        return false;
>>>> +    }
>>>> +    if (folio_nid(folio) == phi->hot_node) {
>>>> +        count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
>>>> +        return false;
>>>> +    }
>>>
>>> How about using the LRU age itself:
>>
>> Sounds like a good check for page hotness.
>>
>>>
>>> if (folio_test_active())
>>>      return true;
>>
>> But the numbers I obtained with this check added, didn't really hit this
>> condition all that much. I was running a multi-threaded application that
>> allocates enough memory such that the allocation spills over from DRAM node
>> to the CXL node. Threads keep touching the memory pages in random order.
>>
> 
> Is demotion enabled by any chance?

Yes, I thought enabling demotion is required to create enough room in 
the toptier to handle promotion.

> 
> i.e. are you sure it's actually allocating from CXL and not demoting
> cold stuff to CXL?

But then I realized that spill over was caused by demotion rather than 
initial allocation even when I used MPOL_BIND | MPOL_F_NUMA_BALANCING 
policy with both toptier and CXL node in the nodemask.

> 
>> kpromoted_recorded_accesses 960620 /* Number of recorded accesses */
>> kpromoted_recorded_hwhints 960620  /* Nr accesses via HW hints, IBS in this
>> case */
>> kpromoted_recorded_pgtscans 0
>> kpromoted_record_toptier 638006 /* Nr toptier accesses */
>> kpromoted_record_added 321234 /* Nr (CXL) accesses that are tracked */
>> kpromoted_record_exists 1380
>> kpromoted_mig_right_node 0
>> kpromoted_mig_non_lru 226
>> kpromoted_mig_lru_active 47 /* Number of accesses considered for promotion
>> as determined by folio_test_active() check */

However disabling demotion has no impact on this number (and hence the 
folio_test_active() check)

Regards,
Bharata.
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Gregory Price 10 months, 4 weeks ago
On Mon, Mar 17, 2025 at 09:52:29PM +0530, Bharata B Rao wrote:
> > 
> > > kpromoted_recorded_accesses 960620 /* Number of recorded accesses */
> > > kpromoted_recorded_hwhints 960620  /* Nr accesses via HW hints, IBS in this
> > > case */
> > > kpromoted_recorded_pgtscans 0
> > > kpromoted_record_toptier 638006 /* Nr toptier accesses */
> > > kpromoted_record_added 321234 /* Nr (CXL) accesses that are tracked */
> > > kpromoted_record_exists 1380
> > > kpromoted_mig_right_node 0
> > > kpromoted_mig_non_lru 226
> > > kpromoted_mig_lru_active 47 /* Number of accesses considered for promotion
> > > as determined by folio_test_active() check */
> 
> However disabling demotion has no impact on this number (and hence the
> folio_test_active() check)
>

I've been mulling over what's likely to occur when the Low but not Min
watermark is hit and reclaim is invoked but without demotion enabled.

I'm wonder if kswapd pushes things like r/o pagecache out, only to have
them faulted back into CXL later, while new allocations stick on the
main memory.

You might try MPOL_PREFERRED with CXL node as the target instead of bind
w/ the local node to at least make sure the system is actually
identifying hotness correctly.

~Gregory
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Mike Day 11 months, 1 week ago

On 3/5/25 23:45, Bharata B Rao wrote:
> +static void kpromoted_migrate(pg_data_t *pgdat)
> +{
> +	int nid = pgdat->node_id;
> +	struct page_hotness_info *phi;
> +	struct hlist_node *tmp;
> +	int nr_bkts = HASH_SIZE(page_hotness_hash);
> +	int bkt;
> +
> +	for (bkt = 0; bkt < nr_bkts; bkt++) {
> +		mutex_lock(&page_hotness_lock[bkt]);
> +		hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
> +			if (phi->hot_node != nid)
> +				continue;
> +
> +			if (page_should_be_promoted(phi)) {
> +				count_vm_event(KPROMOTED_MIG_CANDIDATE);
> +				if (!kpromote_page(phi)) {
> +					count_vm_event(KPROMOTED_MIG_PROMOTED);
> +					hlist_del_init(&phi->hnode);
> +					kfree(phi);
> +				}
> +			} else {
> +				/*
> +				 * Not a suitable page or cold page, stop tracking it.
> +				 * TODO: Identify cold pages and drive demotion?
> +				 */
> +				count_vm_event(KPROMOTED_MIG_DROPPED);
> +				hlist_del_init(&phi->hnode);
> +				kfree(phi);
> +			}
> +		}
> +		mutex_unlock(&page_hotness_lock[bkt]);
> +	}
> +}
> +
> +static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
> +{
> +	struct page_hotness_info *phi;
> +
> +	hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {

Should this be hlist_for_each_entry_safe(), given that kpromoted_migrate() may be
running concurrently?

Mike
> +		if (phi->pfn == pfn)
> +			return phi;
> +	}
> +	return NULL;
> +}
Re: [RFC PATCH 2/4] mm: kpromoted: Hot page info collection and promotion daemon
Posted by Bharata B Rao 11 months, 1 week ago
On 06-Mar-25 10:52 PM, Mike Day wrote:
> 
> 
> On 3/5/25 23:45, Bharata B Rao wrote:
>> +static void kpromoted_migrate(pg_data_t *pgdat)
>> +{
>> +    int nid = pgdat->node_id;
>> +    struct page_hotness_info *phi;
>> +    struct hlist_node *tmp;
>> +    int nr_bkts = HASH_SIZE(page_hotness_hash);
>> +    int bkt;
>> +
>> +    for (bkt = 0; bkt < nr_bkts; bkt++) {
>> +        mutex_lock(&page_hotness_lock[bkt]);
>> +        hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], 
>> hnode) {
>> +            if (phi->hot_node != nid)
>> +                continue;
>> +
>> +            if (page_should_be_promoted(phi)) {
>> +                count_vm_event(KPROMOTED_MIG_CANDIDATE);
>> +                if (!kpromote_page(phi)) {
>> +                    count_vm_event(KPROMOTED_MIG_PROMOTED);
>> +                    hlist_del_init(&phi->hnode);
>> +                    kfree(phi);
>> +                }
>> +            } else {
>> +                /*
>> +                 * Not a suitable page or cold page, stop tracking it.
>> +                 * TODO: Identify cold pages and drive demotion?
>> +                 */
>> +                count_vm_event(KPROMOTED_MIG_DROPPED);
>> +                hlist_del_init(&phi->hnode);
>> +                kfree(phi);
>> +            }
>> +        }
>> +        mutex_unlock(&page_hotness_lock[bkt]);
>> +    }
>> +}
>> +
>> +static struct page_hotness_info *__kpromoted_lookup(unsigned long 
>> pfn, int bkt)
>> +{
>> +    struct page_hotness_info *phi;
>> +
>> +    hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
> 
> Should this be hlist_for_each_entry_safe(), given that 
> kpromoted_migrate() may be
> running concurrently?

I don't think so because the migration path can't walk the list 
concurrently as the lists are protected by mutex.

Regards,
Bharata.