[RFC PATCH V2 03/13] mm: Scan the mm and create a migration list

Posted by Raghavendra K T 3 months, 2 weeks ago
Since we already have the list of mm_structs in the system, add a scanning
module that walks the VMAs of each mm_struct and scans all the pages mapped
by them.

In the scan path: check for recently accessed pages (folios) belonging to
slow-tier nodes and add all those folios to a list.

Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
 mm/kscand.c | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 318 insertions(+), 1 deletion(-)

diff --git a/mm/kscand.c b/mm/kscand.c
index d5b0d3041b0f..0edec1b7730d 100644
--- a/mm/kscand.c
+++ b/mm/kscand.c
@@ -4,10 +4,18 @@
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/pagewalk.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
+#include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/kscand.h>
+#include <linux/memory-tiers.h>
+#include <linux/mempolicy.h>
 #include <linux/string.h>
 #include <linux/delay.h>
 #include <linux/cleanup.h>
@@ -18,6 +26,11 @@
 
 static struct task_struct *kscand_thread __read_mostly;
 static DEFINE_MUTEX(kscand_mutex);
+/*
+ * Total VMA size to cover during scan.
+ */
+#define KSCAND_SCAN_SIZE	(1 * 1024 * 1024 * 1024UL)
+static unsigned long kscand_scan_size __read_mostly = KSCAND_SCAN_SIZE;
 
 /* How long to pause between two scan cycles */
 static unsigned int kscand_scan_sleep_ms __read_mostly = 20;
@@ -42,6 +55,8 @@ static struct kmem_cache *kscand_slot_cache __read_mostly;
 /* Per mm information collected to control VMA scanning */
 struct kscand_mm_slot {
 	struct mm_slot slot;
+	long address;
+	bool is_scanned;
 };
 
 /* Data structure to keep track of current mm under scan */
@@ -54,6 +69,29 @@ struct kscand_scan kscand_scan = {
 	.mm_head = LIST_HEAD_INIT(kscand_scan.mm_head),
 };
 
+/*
+ * Data structure passed to control scanning and also collect
+ * per memory node information
+ */
+struct kscand_scanctrl {
+	struct list_head scan_list;
+	unsigned long address;
+};
+
+struct kscand_scanctrl kscand_scanctrl;
+/* Per folio information used for migration */
+struct kscand_migrate_info {
+	struct list_head migrate_node;
+	struct folio *folio;
+	unsigned long address;
+};
+
+static bool kscand_eligible_srcnid(int nid)
+{
+	/* Only promotion case is considered */
+	return  !node_is_toptier(nid);
+}
+
 static inline int kscand_has_work(void)
 {
 	return !list_empty(&kscand_scan.mm_head);
@@ -84,11 +122,275 @@ static void kscand_wait_work(void)
 			scan_sleep_jiffies);
 }
 
+static inline bool is_valid_folio(struct folio *folio)
+{
+	if (!folio || folio_test_unevictable(folio) || !folio_mapped(folio) ||
+		folio_is_zone_device(folio) || folio_maybe_mapped_shared(folio))
+		return false;
+
+	return true;
+}
+
+
+static bool folio_idle_clear_pte_refs_one(struct folio *folio,
+					 struct vm_area_struct *vma,
+					 unsigned long addr,
+					 pte_t *ptep)
+{
+	bool referenced = false;
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t *pmd = pmd_off(mm, addr);
+
+	if (ptep) {
+		if (ptep_clear_young_notify(vma, addr, ptep))
+			referenced = true;
+	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+		if (!pmd_present(*pmd))
+			WARN_ON_ONCE(1);
+		if (pmdp_clear_young_notify(vma, addr, pmd))
+			referenced = true;
+	} else {
+		WARN_ON_ONCE(1);
+	}
+
+	if (referenced) {
+		folio_clear_idle(folio);
+		folio_set_young(folio);
+	}
+
+	return true;
+}
+
+static void page_idle_clear_pte_refs(struct page *page, pte_t *pte, struct mm_walk *walk)
+{
+	bool need_lock;
+	struct folio *folio =  page_folio(page);
+	unsigned long address;
+
+	if (!folio_mapped(folio) || !folio_raw_mapping(folio))
+		return;
+
+	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+	if (need_lock && !folio_trylock(folio))
+		return;
+	address = vma_address(walk->vma, page_pgoff(folio, page), compound_nr(page));
+	VM_BUG_ON_VMA(address == -EFAULT, walk->vma);
+	folio_idle_clear_pte_refs_one(folio, walk->vma, address, pte);
+
+	if (need_lock)
+		folio_unlock(folio);
+}
+
+static int hot_vma_idle_pte_entry(pte_t *pte,
+				 unsigned long addr,
+				 unsigned long next,
+				 struct mm_walk *walk)
+{
+	struct page *page;
+	struct folio *folio;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct kscand_migrate_info *info;
+	struct kscand_scanctrl *scanctrl = walk->private;
+	int srcnid;
+
+	scanctrl->address = addr;
+	pte_t pteval = ptep_get(pte);
+
+	if (!pte_present(pteval))
+		return 0;
+
+	if (pte_none(pteval))
+		return 0;
+
+	vma = walk->vma;
+	mm = vma->vm_mm;
+
+	page = pte_page(*pte);
+
+	page_idle_clear_pte_refs(page, pte, walk);
+
+	folio = page_folio(page);
+	folio_get(folio);
+
+	if (!is_valid_folio(folio)) {
+		folio_put(folio);
+		return 0;
+	}
+	srcnid = folio_nid(folio);
+
+
+	if (!folio_test_lru(folio)) {
+		folio_put(folio);
+		return 0;
+	}
+
+	if (!folio_test_idle(folio) || folio_test_young(folio) ||
+			mmu_notifier_test_young(mm, addr) ||
+			folio_test_referenced(folio) || pte_young(pteval)) {
+
+		if (!kscand_eligible_srcnid(srcnid)) {
+			folio_put(folio);
+			return 0;
+		}
+		/* XXX: Leaking memory. TBD: consume info */
+
+		info = kzalloc(sizeof(struct kscand_migrate_info), GFP_NOWAIT);
+		if (info && scanctrl) {
+			info->address = addr;
+			info->folio = folio;
+			list_add_tail(&info->migrate_node, &scanctrl->scan_list);
+		}
+	}
+
+	folio_set_idle(folio);
+	folio_put(folio);
+	return 0;
+}
+
+static const struct mm_walk_ops hot_vma_set_idle_ops = {
+	.pte_entry = hot_vma_idle_pte_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
+static void kscand_walk_page_vma(struct vm_area_struct *vma, struct kscand_scanctrl *scanctrl)
+{
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+		return;
+	}
+	if (!vma->vm_mm ||
+	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		return;
+
+	if (!vma_is_accessible(vma))
+		return;
+
+	walk_page_vma(vma, &hot_vma_set_idle_ops, scanctrl);
+}
+
 static inline int kscand_test_exit(struct mm_struct *mm)
 {
 	return atomic_read(&mm->mm_users) == 0;
 }
 
+static void kscand_collect_mm_slot(struct kscand_mm_slot *mm_slot)
+{
+	struct mm_slot *slot = &mm_slot->slot;
+	struct mm_struct *mm = slot->mm;
+
+	lockdep_assert_held(&kscand_mm_lock);
+
+	if (kscand_test_exit(mm)) {
+		hash_del(&slot->hash);
+		list_del(&slot->mm_node);
+
+		mm_slot_free(kscand_slot_cache, mm_slot);
+		mmdrop(mm);
+	}
+}
+
+static unsigned long kscand_scan_mm_slot(void)
+{
+	bool next_mm = false;
+	bool update_mmslot_info = false;
+
+	unsigned long vma_scanned_size = 0;
+	unsigned long address;
+
+	struct mm_slot *slot;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma = NULL;
+	struct kscand_mm_slot *mm_slot;
+
+
+	spin_lock(&kscand_mm_lock);
+
+	if (kscand_scan.mm_slot) {
+		mm_slot = kscand_scan.mm_slot;
+		slot = &mm_slot->slot;
+		address = mm_slot->address;
+	} else {
+		slot = list_entry(kscand_scan.mm_head.next,
+				     struct mm_slot, mm_node);
+		mm_slot = mm_slot_entry(slot, struct kscand_mm_slot, slot);
+		address = mm_slot->address;
+		kscand_scan.mm_slot = mm_slot;
+	}
+
+	mm = slot->mm;
+	mm_slot->is_scanned = true;
+	spin_unlock(&kscand_mm_lock);
+
+	if (unlikely(!mmap_read_trylock(mm)))
+		goto outerloop_mmap_lock;
+
+	if (unlikely(kscand_test_exit(mm))) {
+		next_mm = true;
+		goto outerloop;
+	}
+
+	VMA_ITERATOR(vmi, mm, address);
+
+	for_each_vma(vmi, vma) {
+		kscand_walk_page_vma(vma, &kscand_scanctrl);
+		vma_scanned_size += vma->vm_end - vma->vm_start;
+
+		if (vma_scanned_size >= kscand_scan_size) {
+			next_mm = true;
+			/* TBD: Add scanned folios to migration list */
+			break;
+		}
+	}
+
+	if (!vma)
+		address = 0;
+	else
+		address = kscand_scanctrl.address + PAGE_SIZE;
+
+	update_mmslot_info = true;
+
+	if (update_mmslot_info)
+		mm_slot->address = address;
+
+outerloop:
+	/* exit_mmap will destroy ptes after this */
+	mmap_read_unlock(mm);
+
+outerloop_mmap_lock:
+	spin_lock(&kscand_mm_lock);
+	WARN_ON(kscand_scan.mm_slot != mm_slot);
+
+	/*
+	 * Release the current mm_slot if this mm is about to die, or
+	 * if we scanned all vmas of this mm.
+	 */
+	if (unlikely(kscand_test_exit(mm)) || !vma || next_mm) {
+		/*
+		 * Make sure that if mm_users is reaching zero while
+		 * kscand runs here, kscand_exit will find
+		 * mm_slot not pointing to the exiting mm.
+		 */
+		if (slot->mm_node.next != &kscand_scan.mm_head) {
+			slot = list_entry(slot->mm_node.next,
+					struct mm_slot, mm_node);
+			kscand_scan.mm_slot =
+				mm_slot_entry(slot, struct kscand_mm_slot, slot);
+
+		} else
+			kscand_scan.mm_slot = NULL;
+
+		if (kscand_test_exit(mm)) {
+			kscand_collect_mm_slot(mm_slot);
+			goto end;
+		}
+	}
+	mm_slot->is_scanned = false;
+end:
+	spin_unlock(&kscand_mm_lock);
+	return 0;
+}
+
 static void kscand_do_scan(void)
 {
 	unsigned long iter = 0, mms_to_scan;
@@ -101,7 +403,7 @@ static void kscand_do_scan(void)
 			break;
 
 		if (kscand_has_work())
-			msleep(100);
+			kscand_scan_mm_slot();
 
 		iter++;
 
@@ -148,6 +450,7 @@ void __kscand_enter(struct mm_struct *mm)
 	if (!kscand_slot)
 		return;
 
+	kscand_slot->address = 0;
 	slot = &kscand_slot->slot;
 
 	spin_lock(&kscand_mm_lock);
@@ -175,6 +478,12 @@ void __kscand_exit(struct mm_struct *mm)
 		hash_del(&slot->hash);
 		list_del(&slot->mm_node);
 		free = 1;
+	} else if (mm_slot && kscand_scan.mm_slot == mm_slot && !mm_slot->is_scanned) {
+		hash_del(&slot->hash);
+		list_del(&slot->mm_node);
+		free = 1;
+		/* TBD: Set the actual next slot */
+		kscand_scan.mm_slot = NULL;
 	}
 
 	spin_unlock(&kscand_mm_lock);
@@ -224,6 +533,12 @@ static int stop_kscand(void)
 	return 0;
 }
 
+static inline void init_list(void)
+{
+	INIT_LIST_HEAD(&kscand_scanctrl.scan_list);
+	init_waitqueue_head(&kscand_wait);
+}
+
 static int __init kscand_init(void)
 {
 	int err;
@@ -234,6 +549,8 @@ static int __init kscand_init(void)
 		pr_err("kscand: kmem_cache error");
 		return -ENOMEM;
 	}
+
+	init_list();
 	err = start_kscand();
 	if (err)
 		goto err_kscand;
-- 
2.34.1
Re: [RFC PATCH V2 03/13] mm: Scan the mm and create a migration list
Posted by Harry Yoo 3 months, 2 weeks ago
On Tue, Jun 24, 2025 at 05:56:07AM +0000, Raghavendra K T wrote:
> Since we already have the list of mm_structs in the system, add a scanning
> module that walks the VMAs of each mm_struct and scans all the pages mapped
> by them.
> 
> In the scan path: check for recently accessed pages (folios) belonging to
> slow-tier nodes and add all those folios to a list.
> 
> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
> ---

Hi, just taking a quick look...

>  mm/kscand.c | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 318 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/kscand.c b/mm/kscand.c
> index d5b0d3041b0f..0edec1b7730d 100644
> --- a/mm/kscand.c
> +++ b/mm/kscand.c
> @@ -42,6 +55,8 @@ static struct kmem_cache *kscand_slot_cache __read_mostly;
> @@ -84,11 +122,275 @@ static void kscand_wait_work(void)
>  			scan_sleep_jiffies);
>  }
>  
> +static inline bool is_valid_folio(struct folio *folio)
> +{
> +	if (!folio || folio_test_unevictable(folio) || !folio_mapped(folio) ||
> +		folio_is_zone_device(folio) || folio_maybe_mapped_shared(folio))
> +		return false;
> +
> +	return true;
> +}

What makes it undesirable to migrate shared folios?

> +static bool folio_idle_clear_pte_refs_one(struct folio *folio,
> +					 struct vm_area_struct *vma,
> +					 unsigned long addr,
> +					 pte_t *ptep)
> +{
> +	bool referenced = false;
> +	struct mm_struct *mm = vma->vm_mm;
> +	pmd_t *pmd = pmd_off(mm, addr);
> +
> +	if (ptep) {
> +		if (ptep_clear_young_notify(vma, addr, ptep))
> +			referenced = true;
> +	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> +		if (!pmd_present(*pmd))
> +			WARN_ON_ONCE(1);
> +		if (pmdp_clear_young_notify(vma, addr, pmd))
> +			referenced = true;
> +	} else {
> +		WARN_ON_ONCE(1);
> +	}

This does not look good.

I think pmd entry handling should be handled in
mm_walk_ops.pmd_entry callback?
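
Something along these lines, perhaps (a rough, untested sketch; the function
name is made up and the locking / migrate-list handling for THPs is left
out), so that the pte path no longer has to derive the pmd via pmd_off():

static int hot_vma_idle_pmd_entry(pmd_t *pmd, unsigned long addr,
                                  unsigned long next, struct mm_walk *walk)
{
        struct folio *folio;

        /* Handle only PMD-mapped THPs here; PTE mappings keep going
         * through the existing ->pte_entry callback. */
        if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
                return 0;

        folio = page_folio(pmd_page(*pmd));
        if (pmdp_clear_young_notify(walk->vma, addr, pmd)) {
                folio_clear_idle(folio);
                folio_set_young(folio);
        }

        /* Skip the pte level for this range so the walker does not
         * try to split the huge PMD. */
        walk->action = ACTION_CONTINUE;
        return 0;
}

static const struct mm_walk_ops hot_vma_set_idle_ops = {
        .pmd_entry      = hot_vma_idle_pmd_entry,
        .pte_entry      = hot_vma_idle_pte_entry,
        .walk_lock      = PGWALK_RDLOCK,
};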

> +
> +	if (referenced) {
> +		folio_clear_idle(folio);
> +		folio_set_young(folio);
> +	}
> +
> +	return true;
> +}
> +
> +static void page_idle_clear_pte_refs(struct page *page, pte_t *pte, struct mm_walk *walk)
> +{
> +	bool need_lock;
> +	struct folio *folio =  page_folio(page);
> +	unsigned long address;
> +
> +	if (!folio_mapped(folio) || !folio_raw_mapping(folio))
> +		return;
> +
> +	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
> +	if (need_lock && !folio_trylock(folio))
> +		return;

Why acquire folio lock here?

And I'm not even sure if it's safe to acquire it?
The locking order is folio_lock -> pte_lock

page walk should have already acquired pte_lock before calling
->pte_entry() callback.

> +	address = vma_address(walk->vma, page_pgoff(folio, page), compound_nr(page));
> +	VM_BUG_ON_VMA(address == -EFAULT, walk->vma);
> +	folio_idle_clear_pte_refs_one(folio, walk->vma, address, pte);
> +
> +	if (need_lock)
> +		folio_unlock(folio);
> +}
> +
> +static const struct mm_walk_ops hot_vma_set_idle_ops = {
> +	.pte_entry = hot_vma_idle_pte_entry,
> +	.walk_lock = PGWALK_RDLOCK,
> +};
> +
> +static void kscand_walk_page_vma(struct vm_area_struct *vma, struct kscand_scanctrl *scanctrl)
> +{
> +	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
> +	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
> +		return;
> +	}
> +	if (!vma->vm_mm ||
> +	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
> +		return;

Why not walk writable file VMAs?

> +	if (!vma_is_accessible(vma))
> +		return;
> +
> +	walk_page_vma(vma, &hot_vma_set_idle_ops, scanctrl);
> +}

-- 
Cheers,
Harry / Hyeonggon
Re: [RFC PATCH V2 03/13] mm: Scan the mm and create a migration list
Posted by Raghavendra K T 3 months, 2 weeks ago
On 6/26/2025 3:37 AM, Harry Yoo wrote:
> On Tue, Jun 24, 2025 at 05:56:07AM +0000, Raghavendra K T wrote:
>> Since we already have the list of mm_structs in the system, add a scanning
>> module that walks the VMAs of each mm_struct and scans all the pages mapped
>> by them.
>>
>> In the scan path: check for recently accessed pages (folios) belonging to
>> slow-tier nodes and add all those folios to a list.
>>
>> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
>> ---
> 
> Hi, just taking a quick look...

Hello Harry,
Thanks for taking a look at the patches.
> 
>>   mm/kscand.c | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 318 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/kscand.c b/mm/kscand.c
>> index d5b0d3041b0f..0edec1b7730d 100644
>> --- a/mm/kscand.c
>> +++ b/mm/kscand.c
>> @@ -42,6 +55,8 @@ static struct kmem_cache *kscand_slot_cache __read_mostly;
>> @@ -84,11 +122,275 @@ static void kscand_wait_work(void)
>>   			scan_sleep_jiffies);
>>   }
>>   
>> +static inline bool is_valid_folio(struct folio *folio)
>> +{
>> +	if (!folio || folio_test_unevictable(folio) || !folio_mapped(folio) ||
>> +		folio_is_zone_device(folio) || folio_maybe_mapped_shared(folio))
>> +		return false;
>> +
>> +	return true;
>> +}
> 
> What makes it undesirable to migrate shared folios?

This was mostly to avoid shared libraries, but yes, it should also have been
accompanied by a check on the EXEC flag to refine it further. It also avoids
moving shared data around. I will experiment more and either add additional
filters or remove the check.
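
Something like the below (untested; the helper name is made up) is roughly
what I had in mind, filtering at the VMA level instead of rejecting every
maybe-shared folio:

/* Sketch: treat executable, non-writable file mappings (shared-library
 * text) as the "shared" case to skip, instead of rejecting every folio
 * that folio_maybe_mapped_shared() flags. */
static bool kscand_vma_is_shared_exec(struct vm_area_struct *vma)
{
        return vma->vm_file && (vma->vm_flags & VM_EXEC) &&
               !(vma->vm_flags & VM_WRITE);
}

kscand_walk_page_vma() would then skip such VMAs, and the
folio_maybe_mapped_shared() check in is_valid_folio() could be relaxed or
dropped.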

>> +static bool folio_idle_clear_pte_refs_one(struct folio *folio,
>> +					 struct vm_area_struct *vma,
>> +					 unsigned long addr,
>> +					 pte_t *ptep)
>> +{
>> +	bool referenced = false;
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	pmd_t *pmd = pmd_off(mm, addr);
>> +
>> +	if (ptep) {
>> +		if (ptep_clear_young_notify(vma, addr, ptep))
>> +			referenced = true;
>> +	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
>> +		if (!pmd_present(*pmd))
>> +			WARN_ON_ONCE(1);
>> +		if (pmdp_clear_young_notify(vma, addr, pmd))
>> +			referenced = true;
>> +	} else {
>> +		WARN_ON_ONCE(1);
>> +	}
> 
> This does not look good.
> 
> I think pmd entry handling should be handled in
> mm_walk_ops.pmd_entry callback?
>

Thanks, let me check on this. Some of this came from referring to the
idle page tracking code.

>> +
>> +	if (referenced) {
>> +		folio_clear_idle(folio);
>> +		folio_set_young(folio);
>> +	}
>> +
>> +	return true;
>> +}
>> +
>> +static void page_idle_clear_pte_refs(struct page *page, pte_t *pte, struct mm_walk *walk)
>> +{
>> +	bool need_lock;
>> +	struct folio *folio =  page_folio(page);
>> +	unsigned long address;
>> +
>> +	if (!folio_mapped(folio) || !folio_raw_mapping(folio))
>> +		return;
>> +
>> +	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
>> +	if (need_lock && !folio_trylock(folio))
>> +		return;
> 
> Why acquire folio lock here?
> 
> And I'm not even sure if it's safe to acquire it?
> The locking order is folio_lock -> pte_lock
> 
> page walk should have already acquired pte_lock before calling
> ->pte_entry() callback.
> 

I saw you clarified later.

>> +	address = vma_address(walk->vma, page_pgoff(folio, page), compound_nr(page));
>> +	VM_BUG_ON_VMA(address == -EFAULT, walk->vma);
>> +	folio_idle_clear_pte_refs_one(folio, walk->vma, address, pte);
>> +
>> +	if (need_lock)
>> +		folio_unlock(folio);
>> +}
>> +
>> +static const struct mm_walk_ops hot_vma_set_idle_ops = {
>> +	.pte_entry = hot_vma_idle_pte_entry,
>> +	.walk_lock = PGWALK_RDLOCK,
>> +};
>> +
>> +static void kscand_walk_page_vma(struct vm_area_struct *vma, struct kscand_scanctrl *scanctrl)
>> +{
>> +	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
>> +	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
>> +		return;
>> +	}
>> +	if (!vma->vm_mm ||
>> +	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
>> +		return;
> 
> Why not walk writable file VMAs?
> 

This mainly follows the NUMA balancing logic:
"Avoid hinting faults in read-only file-backed mappings or the vDSO
 as migrating the pages will be of marginal benefit."

But I have not measured the benefit either way. Let me try it.
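
If the check stays, I can at least carry an adapted version of that comment
over so the intent is documented in kscand.c as well, e.g.:

        /*
         * Avoid scanning read-only file-backed mappings or the vDSO,
         * as migrating their pages will be of marginal benefit.
         */
        if (!vma->vm_mm ||
            (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                return;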

>> +	if (!vma_is_accessible(vma))
>> +		return;
>> +
>> +	walk_page_vma(vma, &hot_vma_set_idle_ops, scanctrl);
>> +}
> 

Regards
- Raghu
Re: [RFC PATCH V2 03/13] mm: Scan the mm and create a migration list
Posted by Raghavendra K T 3 months ago

On 6/26/2025 11:57 AM, Raghavendra K T wrote:
> 
> On 6/26/2025 3:37 AM, Harry Yoo wrote:
>> On Tue, Jun 24, 2025 at 05:56:07AM +0000, Raghavendra K T wrote:
>>> Since we already have the list of mm_structs in the system, add a
>>> scanning module that walks the VMAs of each mm_struct and scans all
>>> the pages mapped by them.
>>>
>>> In the scan path: check for recently accessed pages (folios) belonging
>>> to slow-tier nodes and add all those folios to a list.
>>>
>>> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
>>> ---
>>
>> Hi, just taking a quick look...
> 
> Hello Harry,
> Thanks for taking a look at the patches.
>>
>>>   mm/kscand.c | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 318 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/mm/kscand.c b/mm/kscand.c
>>> index d5b0d3041b0f..0edec1b7730d 100644
>>> --- a/mm/kscand.c
>>> +++ b/mm/kscand.c
>>> @@ -42,6 +55,8 @@ static struct kmem_cache *kscand_slot_cache 
>>> __read_mostly;
>>> @@ -84,11 +122,275 @@ static void kscand_wait_work(void)
>>>               scan_sleep_jiffies);
>>>   }
>>> +static inline bool is_valid_folio(struct folio *folio)
>>> +{
>>> +    if (!folio || folio_test_unevictable(folio) || ! 
>>> folio_mapped(folio) ||
>>> +        folio_is_zone_device(folio) || 
>>> folio_maybe_mapped_shared(folio))
>>> +        return false;
>>> +
>>> +    return true;
>>> +}
>>
>> What makes it undesirable to migrate shared folios?
> 
> This was mostly to avoid shared libraries, but yes this also
> should have accompanied with EXEC flag to refine further.
> This also avoids moving around shared data. I will experiment more
> and add additional filters OR remove the check.
>

At least a microbenchmark experiment shows a positive benefit if I remove
the check for folio_maybe_mapped_shared() folios. Will respin accordingly
(also after checking some real workloads).

- Raghu

Re: [RFC PATCH V2 03/13] mm: Scan the mm and create a migration list
Posted by Harry Yoo 3 months, 2 weeks ago
On Thu, Jun 26, 2025 at 07:07:12AM +0900, Harry Yoo wrote:
> On Tue, Jun 24, 2025 at 05:56:07AM +0000, Raghavendra K T wrote:
> > Since we already have the list of mm_structs in the system, add a scanning
> > module that walks the VMAs of each mm_struct and scans all the pages mapped
> > by them.
> > 
> > In the scan path: check for recently accessed pages (folios) belonging to
> > slow-tier nodes and add all those folios to a list.
> > 
> > Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
> > ---
> 
> Hi, just taking a quick look...
> 
> >  mm/kscand.c | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 318 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/kscand.c b/mm/kscand.c
> > index d5b0d3041b0f..0edec1b7730d 100644
> > --- a/mm/kscand.c
> > +++ b/mm/kscand.c
> > @@ -42,6 +55,8 @@ static struct kmem_cache *kscand_slot_cache __read_mostly;
> > @@ -84,11 +122,275 @@ static void kscand_wait_work(void)
> >  			scan_sleep_jiffies);
> >  }
> >  
> > +static inline bool is_valid_folio(struct folio *folio)
> > +{
> > +	if (!folio || folio_test_unevictable(folio) || !folio_mapped(folio) ||
> > +		folio_is_zone_device(folio) || folio_maybe_mapped_shared(folio))
> > +		return false;
> > +
> > +	return true;
> > +}
> 
> What makes it undesirable to migrate shared folios?
> 
> > +static bool folio_idle_clear_pte_refs_one(struct folio *folio,
> > +					 struct vm_area_struct *vma,
> > +					 unsigned long addr,
> > +					 pte_t *ptep)
> > +{
> > +	bool referenced = false;
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	pmd_t *pmd = pmd_off(mm, addr);
> > +
> > +	if (ptep) {
> > +		if (ptep_clear_young_notify(vma, addr, ptep))
> > +			referenced = true;
> > +	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> > +		if (!pmd_present(*pmd))
> > +			WARN_ON_ONCE(1);
> > +		if (pmdp_clear_young_notify(vma, addr, pmd))
> > +			referenced = true;
> > +	} else {
> > +		WARN_ON_ONCE(1);
> > +	}
> 
> This does not look good.
> 
> I think pmd entry handling should be handled in
> mm_walk_ops.pmd_entry callback?
> 
> > +
> > +	if (referenced) {
> > +		folio_clear_idle(folio);
> > +		folio_set_young(folio);
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static void page_idle_clear_pte_refs(struct page *page, pte_t *pte, struct mm_walk *walk)
> > +{
> > +	bool need_lock;
> > +	struct folio *folio =  page_folio(page);
> > +	unsigned long address;
> > +
> > +	if (!folio_mapped(folio) || !folio_raw_mapping(folio))
> > +		return;
> > +
> > +	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
> > +	if (need_lock && !folio_trylock(folio))
> > +		return;
> 
> Why acquire folio lock here?
>
> And I'm not even sure if it's safe to acquire it?
> The locking order is folio_lock -> pte_lock
>
> page walk should have already acquired pte_lock before calling
> ->pte_entry() callback.

Oops, it's trylock. Nevermind.
Needed more coffee in the morning :)
 
> > +	address = vma_address(walk->vma, page_pgoff(folio, page), compound_nr(page));
> > +	VM_BUG_ON_VMA(address == -EFAULT, walk->vma);
> > +	folio_idle_clear_pte_refs_one(folio, walk->vma, address, pte);
> > +
> > +	if (need_lock)
> > +		folio_unlock(folio);
> > +}
> > +
> > +static const struct mm_walk_ops hot_vma_set_idle_ops = {
> > +	.pte_entry = hot_vma_idle_pte_entry,
> > +	.walk_lock = PGWALK_RDLOCK,
> > +};
> > +
> > +static void kscand_walk_page_vma(struct vm_area_struct *vma, struct kscand_scanctrl *scanctrl)
> > +{
> > +	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
> > +	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
> > +		return;
> > +	}
> > +	if (!vma->vm_mm ||
> > +	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
> > +		return;
> 
> Why not walk writable file VMAs?
> 
> > +	if (!vma_is_accessible(vma))
> > +		return;
> > +
> > +	walk_page_vma(vma, &hot_vma_set_idle_ops, scanctrl);
> > +}
> 
> -- 
> Cheers,
> Harry / Hyeonggon

-- 
Cheers,
Harry / Hyeonggon