Default pghot stores hotness in a 1‑byte record per PFN, limiting
frequency to 2 bits, time to a 5‑bit bucket, and preventing storage
of per‑PFN toptier NID. This restricts time granularity and forces
all promotions to use the global pghot_target_nid.
This patch adds an optional precision mode (CONFIG_PGHOT_PRECISE)
that expands the hotness record to 4 bytes (u32) and provides:
- 10‑bit NID field for per‑PFN promotion target,
- 3‑bit frequency field (freq_threshold range 1–7),
- 14‑bit time field offering finer recency tracking,
- MSB migrate‑ready bit.
Precision mode improves placement accuracy on systems with multiple
toptier nodes and provides higher‑resolution hotness tracking, at
the cost of increasing metadata to 4 bytes per PFN.
Documentation, tunables, and the record layout are updated accordingly.
Signed-off-by: Bharata B Rao <bharata@amd.com>
---
Documentation/admin-guide/mm/pghot.txt | 4 +-
include/linux/mmzone.h | 2 +-
include/linux/pghot.h | 31 ++++++++++
mm/Kconfig | 11 ++++
mm/Makefile | 7 ++-
mm/pghot-precise.c | 81 ++++++++++++++++++++++++++
mm/pghot.c | 13 +++--
7 files changed, 141 insertions(+), 8 deletions(-)
create mode 100644 mm/pghot-precise.c
diff --git a/Documentation/admin-guide/mm/pghot.txt b/Documentation/admin-guide/mm/pghot.txt
index 5f51dd1d4d45..7b84e911afe7 100644
--- a/Documentation/admin-guide/mm/pghot.txt
+++ b/Documentation/admin-guide/mm/pghot.txt
@@ -37,7 +37,7 @@ Path: /sys/kernel/debug/pghot/
3. **freq_threshold**
- Minimum access frequency before a page is marked ready for promotion.
- - Range: 1 to 3
+ - Range: 1 to 3 in default mode, 1 to 7 in precision mode.
- Default: 2
- Example:
# echo 3 > /sys/kernel/debug/pghot/freq_threshold
@@ -59,7 +59,7 @@ Path: /proc/sys/vm/pghot_promote_freq_window_ms
- Controls the time window (in ms) for counting access frequency. A page is
considered hot only when **freq_threshold** number of accesses occur with
this time period.
-- Default: 3000 (3 seconds)
+- Default: 3000 (3 seconds) in default mode and 5000 (5 seconds) in precision mode.
- Example:
# sysctl vm.pghot_promote_freq_window_ms=3000
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d7ed60956543..61fd259d9897 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1938,7 +1938,7 @@ struct mem_section {
#ifdef CONFIG_PGHOT
/*
* Per-PFN hotness data for this section.
- * Array of phi_t (u8 in default mode).
+ * Array of phi_t (u8 in default mode, u32 in precision mode).
* LSB is used as PGHOT_SECTION_HOT_BIT flag.
*/
void *hot_map;
diff --git a/include/linux/pghot.h b/include/linux/pghot.h
index 525d4dd28fc1..2e1742b8caee 100644
--- a/include/linux/pghot.h
+++ b/include/linux/pghot.h
@@ -35,6 +35,36 @@ DECLARE_STATIC_KEY_FALSE(pghot_src_hwhints);
#define PGHOT_DEFAULT_NODE 0
+#if defined(CONFIG_PGHOT_PRECISE)
+/* Default frequency-counting window, in milliseconds, for precision mode. */
+#define PGHOT_DEFAULT_FREQ_WINDOW (5 * MSEC_PER_SEC)
+
+/*
+ * Precision mode record layout (phi_t is u32):
+ *   Bits 0-9:   nid (PGHOT_NID_WIDTH bits)
+ *   Bits 10-12: access frequency (PGHOT_FREQ_WIDTH bits)
+ *   Bits 13-26: access time, truncated to PGHOT_TIME_WIDTH bits
+ *   Bits 27-30: unused now
+ *   Bit 31:     page is ready for migration (PGHOT_MIGRATE_READY)
+ */
+#define PGHOT_MIGRATE_READY 31
+
+#define PGHOT_NID_WIDTH 10
+#define PGHOT_FREQ_WIDTH 3
+/* time is stored in 14 bits which can represent up to 16s with HZ=1000 */
+#define PGHOT_TIME_WIDTH 14
+
+/* Position of the least significant bit of each field within the record. */
+#define PGHOT_NID_SHIFT 0
+#define PGHOT_FREQ_SHIFT (PGHOT_NID_SHIFT + PGHOT_NID_WIDTH)
+#define PGHOT_TIME_SHIFT (PGHOT_FREQ_SHIFT + PGHOT_FREQ_WIDTH)
+
+/*
+ * Field masks are unshifted (they start at bit 0); combine them with the
+ * matching *_SHIFT to address the field inside the record.
+ */
+#define PGHOT_NID_MASK GENMASK(PGHOT_NID_WIDTH - 1, 0)
+#define PGHOT_FREQ_MASK GENMASK(PGHOT_FREQ_WIDTH - 1, 0)
+#define PGHOT_TIME_MASK GENMASK(PGHOT_TIME_WIDTH - 1, 0)
+
+/* Largest value that fits in each field. */
+#define PGHOT_NID_MAX ((1 << PGHOT_NID_WIDTH) - 1)
+#define PGHOT_FREQ_MAX ((1 << PGHOT_FREQ_WIDTH) - 1)
+#define PGHOT_TIME_MAX ((1 << PGHOT_TIME_WIDTH) - 1)
+
+/* Per-PFN hotness record type: 4 bytes in precision mode. */
+typedef u32 phi_t;
+
+#else /* !CONFIG_PGHOT_PRECISE */
#define PGHOT_DEFAULT_FREQ_WINDOW (3 * MSEC_PER_SEC)
/*
@@ -61,6 +91,7 @@ DECLARE_STATIC_KEY_FALSE(pghot_src_hwhints);
#define PGHOT_TIME_MAX ((1 << PGHOT_TIME_WIDTH) - 1)
typedef u8 phi_t;
+#endif /* CONFIG_PGHOT_PRECISE */
#define PGHOT_RECORD_SIZE sizeof(phi_t)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4aeab6aee535..14383bb1d890 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1485,6 +1485,17 @@ config PGHOT
This adds 1 byte of metadata overhead per page in lower-tier
memory nodes.
+config PGHOT_PRECISE
+	bool "Hot page tracking precision mode"
+	default n
+	depends on PGHOT
+	help
+	  Enables precision mode for tracking hot pages with pghot sub-system.
+	  Adds fine-grained access time tracking and explicit toptier target
+	  NID tracking. Precise hot page tracking comes at the cost of using
+	  4 bytes per page against the default one byte per page. Preferable
+	  to enable this on systems with multiple nodes in toptier.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 33014de43acc..dc61f4d955f8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -150,4 +150,9 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o
-obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o pghot-default.o
+obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o
+ifdef CONFIG_PGHOT_PRECISE
+obj-$(CONFIG_PGHOT) += pghot-precise.o
+else
+obj-$(CONFIG_PGHOT) += pghot-default.o
+endif
diff --git a/mm/pghot-precise.c b/mm/pghot-precise.c
new file mode 100644
index 000000000000..9e8007adfff9
--- /dev/null
+++ b/mm/pghot-precise.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * pghot: Precision mode
+ *
+ * 4 byte hotness record per PFN (u32)
+ * NID, time and frequency tracked as part of the record.
+ */
+
+#include <linux/pghot.h>
+#include <linux/jiffies.h>
+
+/**
+ * pghot_nid_valid - check whether a nid can be recorded for a page
+ * @nid: node id reported with the access, or NUMA_NO_NODE
+ *
+ * NUMA_NO_NODE is always accepted. Any other value must be non-negative,
+ * fit in the PGHOT_NID_WIDTH-bit nid field of the hotness record (i.e. be
+ * in the range 0..PGHOT_NID_MAX inclusive) and be below MAX_NUMNODES so
+ * that it can safely be used to index per-node data.
+ *
+ * Return: true if @nid is acceptable, false otherwise.
+ */
+bool pghot_nid_valid(int nid)
+{
+	/*
+	 * TODO: Add node_online() and node_is_toptier() checks?
+	 */
+	if (nid == NUMA_NO_NODE)
+		return true;
+
+	/* PGHOT_NID_MAX itself is representable, so the bound is inclusive. */
+	return nid >= 0 && nid <= PGHOT_NID_MAX && nid < MAX_NUMNODES;
+}
+
+/*
+ * Elapsed time, in milliseconds, between two record timestamps.
+ *
+ * Both timestamps are in jiffies. The difference is reduced modulo
+ * 2^PGHOT_TIME_WIDTH before conversion, so the result is computed from
+ * the low PGHOT_TIME_WIDTH bits of the difference only.
+ */
+unsigned long pghot_access_latency(unsigned long old_time, unsigned long time)
+{
+	unsigned long delta = (time - old_time) & PGHOT_TIME_MASK;
+
+	return jiffies_to_msecs(delta);
+}
+
+/**
+ * pghot_update_record - record one access in a precision-mode hotness record
+ * @phi: hotness record to update
+ * @nid: nid to store in the record; NUMA_NO_NODE selects pghot_target_nid
+ * @now: access time in jiffies
+ *
+ * Rewrites the nid, frequency and time fields of @phi with a
+ * compare-and-exchange loop and sets PGHOT_MIGRATE_READY once the
+ * frequency reaches pghot_freq_threshold.
+ *
+ * Return: true if the value written has PGHOT_MIGRATE_READY set.
+ */
+bool pghot_update_record(phi_t *phi, int nid, unsigned long now)
+{
+ phi_t freq, old_freq, hotness, old_hotness, old_time;
+ /* Only the low PGHOT_TIME_WIDTH bits of the timestamp are stored. */
+ phi_t time = now & PGHOT_TIME_MASK;
+
+ /* No explicit nid given: fall back to the global target nid. */
+ nid = (nid == NUMA_NO_NODE) ? pghot_target_nid : nid;
+ old_hotness = READ_ONCE(*phi);
+
+ do {
+ bool new_window = false;
+
+ hotness = old_hotness;
+ old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
+ old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
+
+ /* Previous access is older than the window: restart counting. */
+ if (pghot_access_latency(old_time, time) > sysctl_pghot_freq_window)
+ new_window = true;
+
+ /* The frequency saturates at PGHOT_FREQ_MAX. */
+ if (new_window)
+ freq = 1;
+ else if (old_freq < PGHOT_FREQ_MAX)
+ freq = old_freq + 1;
+ else
+ freq = old_freq;
+
+ /*
+ * Replace only the three fields. All other bits, including an
+ * already set PGHOT_MIGRATE_READY, are carried over from the
+ * old value.
+ */
+ hotness &= ~(PGHOT_NID_MASK << PGHOT_NID_SHIFT);
+ hotness &= ~(PGHOT_FREQ_MASK << PGHOT_FREQ_SHIFT);
+ hotness &= ~(PGHOT_TIME_MASK << PGHOT_TIME_SHIFT);
+
+ hotness |= (nid & PGHOT_NID_MASK) << PGHOT_NID_SHIFT;
+ hotness |= (freq & PGHOT_FREQ_MASK) << PGHOT_FREQ_SHIFT;
+ hotness |= (time & PGHOT_TIME_MASK) << PGHOT_TIME_SHIFT;
+
+ if (freq >= pghot_freq_threshold)
+ hotness |= BIT(PGHOT_MIGRATE_READY);
+ /*
+ * On failure try_cmpxchg() stores the value it found in old_hotness,
+ * so the next iteration recomputes from the current record.
+ */
+ } while (unlikely(!try_cmpxchg(phi, &old_hotness, hotness)));
+ return !!(hotness & BIT(PGHOT_MIGRATE_READY));
+}
+
+/**
+ * pghot_get_record - consume a migrate-ready hotness record
+ * @phi: hotness record to read and clear
+ * @nid: filled with the nid field of the record
+ * @freq: filled with the frequency field of the record
+ * @time: filled with the time field of the record
+ *
+ * If PGHOT_MIGRATE_READY is set, the whole record is atomically replaced
+ * with zero and the fields of the value that was replaced are returned.
+ * The output arguments are left untouched on failure.
+ *
+ * Return: 0 on success, -EINVAL if the record is not marked migrate-ready.
+ */
+int pghot_get_record(phi_t *phi, int *nid, int *freq, unsigned long *time)
+{
+	phi_t cleared = 0;
+	phi_t snap = READ_ONCE(*phi);
+
+	for (;;) {
+		/* Nothing to hand out unless the page is flagged for migration. */
+		if (!(snap & BIT(PGHOT_MIGRATE_READY)))
+			return -EINVAL;
+
+		/* A failed exchange refreshes snap with the current record. */
+		if (likely(try_cmpxchg(phi, &snap, cleared)))
+			break;
+	}
+
+	*nid = (snap >> PGHOT_NID_SHIFT) & PGHOT_NID_MASK;
+	*freq = (snap >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
+	*time = (snap >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
+	return 0;
+}
diff --git a/mm/pghot.c b/mm/pghot.c
index dac9e6f3b61e..7d7ef0800ae2 100644
--- a/mm/pghot.c
+++ b/mm/pghot.c
@@ -10,6 +10,9 @@
* the frequency of access and last access time. Promotions are done
* to a default toptier NID.
*
+ * In the precision mode, 4 bytes are used to store the frequency
+ * of access, last access time and the accessing NID.
+ *
* A kernel thread named kmigrated is provided to migrate or promote
* the hot pages. kmigrated runs for each lower tier node. It iterates
* over the node's PFNs and migrates pages marked for migration into
@@ -52,13 +55,15 @@ static bool kmigrated_started __ro_after_init;
* for the purpose of tracking page hotness and subsequent promotion.
*
* @pfn: PFN of the page
- * @nid: Unused
+ * @nid: Target NID to where the page needs to be migrated in precision
+ * mode but unused in default mode
* @src: The identifier of the sub-system that reports the access
* @now: Access time in jiffies
*
- * Updates the frequency and time of access and marks the page as
- * ready for migration if the frequency crosses a threshold. The pages
- * marked for migration are migrated by kmigrated kernel thread.
+ * Updates the NID (in precision mode only), frequency and time of access
+ * and marks the page as ready for migration if the frequency crosses a
+ * threshold. The pages marked for migration are migrated by kmigrated
+ * kernel thread.
*
* Return: 0 on success and -EINVAL on failure to record the access.
*/
--
2.34.1
Replies to review from Sashiko
[https://sashiko.dev/#/patchset/20260323095104.238982-1-bharata%40amd.com]
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 4aeab6aee535..14383bb1d890 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -1485,6 +1485,17 @@ config PGHOT
>> This adds 1 byte of metadata overhead per page in lower-tier
>> memory nodes.
>>
>> +config PGHOT_PRECISE
>> + bool "Hot page tracking precision mode"
>> + def_bool n
> This isn't a bug, but having both bool with a prompt and def_bool is
> redundant and can be confusing. Should this use default n instead?
Sure.
>> + depends on PGHOT
>> + help
>> + Enables precision mode for tracking hot pages with pghot sub-system.
>[ ... ]
>> diff --git a/mm/pghot-precise.c b/mm/pghot-precise.c
>> new file mode 100644
>> index 000000000000..9e8007adfff9
>> --- /dev/null
>> +++ b/mm/pghot-precise.c
>> @@ -0,0 +1,81 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * pghot: Precision mode
>> + *
>> + * 4 byte hotness record per PFN (u32)
>> + * NID, time and frequency tracked as part of the record.
>> + */
>> +
>> +#include <linux/pghot.h>
>> +#include <linux/jiffies.h>
>> +
>> +bool pghot_nid_valid(int nid)
>> +{
>> + /*
>> + * TODO: Add node_online() and node_is_toptier() checks?
>> + */
>> + if (nid != NUMA_NO_NODE && (nid < 0 || nid >= PGHOT_NID_MAX))
>> + return false;
> Could this validation allow node IDs that exceed the system's actual
> MAX_NUMNODES?
Unlikely as the nid gets passed down from other sub-systems like NUMA Balancing
etc.
> If PGHOT_NID_MAX is 1023 but the system supports fewer nodes, could passing
> a large node ID bypass this check and cause out-of-bounds accesses in NUMA
> node arrays like node_data or node_states?
> Also, is there an off-by-one regression here? PGHOT_NID_MAX is the maximum
> value that fits in the bitfield, so should this check be nid > PGHOT_NID_MAX
> to allow the maximum node ID?
Yes, will adjust this in the next iteration.
>> +
>> + return true;
>> +}
>> +
>> +unsigned long pghot_access_latency(unsigned long old_time, unsigned long time)
>> +{
>> + return jiffies_to_msecs((time - old_time) & PGHOT_TIME_MASK);
>> +}
> Does this timestamp wraparound allow cold pages to be improperly promoted?
> If a page is accessed exactly at intervals matching the wraparound of the
> 14-bit time mask (e.g., every 16.384 seconds at HZ=1000), the latency would
> evaluate to 0. Would this bypass the reset of the frequency counter and
> falsely trigger migration?
The hotness timestamp is stored in a limited number of bits (14 bits in precise
mode, 5 bucketed bits in default mode), which means it can only represent a
finite window of time before wrapping around — approximately 16 seconds for
precise mode and 4 seconds for default mode at HZ=1000. Latency calculations
rely on modular subtraction of these truncated timestamps. This produces the
correct elapsed time as long as the true interval between two events is shorter
than the wrap period. When the interval exceeds the wrap period, the computed
latency aliases to a smaller value and is therefore only an approximation.
In practice, this should be acceptable because the system has two independent
gates that a page must pass through before migration actually occurs:
1. Access frequency gate (recording time): pghot_update_record() determines
whether consecutive accesses to a page fall within the same frequency-counting
window. A page must be accessed at least pghot_freq_threshold times within
sysctl_pghot_freq_window (default 5 seconds) to be marked migration-ready.
2. Recency gate (migration time): kmigrated_should_migrate_memory() checks
whether the page's last recorded access is recent enough relative to the current
time before actually initiating migration.
>> +
>> +bool pghot_update_record(phi_t *phi, int nid, unsigned long now)
>> +{
>> + phi_t freq, old_freq, hotness, old_hotness, old_time;
>> + phi_t time = now & PGHOT_TIME_MASK;
>> +
>> + nid = (nid == NUMA_NO_NODE) ? pghot_target_nid : nid;
>> + old_hotness = READ_ONCE(*phi);
>> +
>> + do {
>> + bool new_window = false;
>> +
>> + hotness = old_hotness;
>> + old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
>> + old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
>> +
>> + if (pghot_access_latency(old_time, time) > sysctl_pghot_freq_window)
>> + new_window = true;
> Is there a missing upper bound on the sysctl_pghot_freq_window configuration?
> The time is tracked in a 14-bit field, which represents a maximum latency of
> 16,383 jiffies. If a user configures a window larger than this maximum
> representable latency, could this condition permanently evaluate to false,
> causing sparse accesses to accumulate over arbitrarily long periods?
I can add an upper bound check.
Regards,
Bharata.
© 2016 - 2026 Red Hat, Inc.