[v3 02/11] mm/thp: zone_device awareness in THP handling code

Posted by Balbir Singh 1 month, 3 weeks ago
Make the THP handling code in the mm subsystem aware of zone device
pages. Although the splitting code is written to be generic, it is
designed to work for THP page sizes corresponding to HPAGE_PMD_NR.

Modify page_vma_mapped_walk() to return true when a zone device huge
entry is present, enabling try_to_migrate() and other migration code
paths to process the entry appropriately. page_vma_mapped_walk() returns
true for zone device private large folios only when
PVMW_THP_DEVICE_PRIVATE is passed, so that callers which do not deal
with zone device private pages do not have to add awareness of them. The
key callback that needs this flag is try_to_migrate_one(). The other
callbacks (page idle, DAMON) use the walk for young/dirty bit
harvesting, which is not significant at the PMD level.
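
For illustration, a minimal sketch of how a caller opts in, mirroring
the try_to_migrate_one() change below (locking and the rest of the walk
elided):

	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
			      PVMW_THP_DEVICE_PRIVATE);

	while (page_vma_mapped_walk(&pvmw)) {
		/* PMD-mapped zone device private THP entry */
		if (!pvmw.pte && is_pmd_device_private_entry(*pvmw.pmd)) {
			/* install migration entries or split as needed */
		}
	}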

pmd_pfn() does not work well with zone device entries; derive the pfn
from the swap entry (swp_offset_pfn()) instead when checking and
comparing zone device entries.
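
For example, for a device private PMD the pfn is taken from the swap
entry rather than from pmd_pfn() (condensed from the
try_to_migrate_one() hunk below):

	if (is_pmd_device_private_entry(*pvmw.pmd))
		pfn = swp_offset_pfn(pmd_to_swp_entry(*pvmw.pmd));
	else
		pfn = pmd_pfn(*pvmw.pmd);

	subpage = folio_page(folio, pfn - folio_pfn(folio));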

Support partial unmapping of zone device private entries, which can
happen via munmap(). munmap() causes the device private PMD entry to be
split, but the corresponding folio is not split. Deferred split does not
work for zone device private folios because they need to be split during
fault handling. Teach migrate_vma_collect_pmd() to handle this case by
splitting partially unmapped device private folios.
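
A condensed sketch of the new migrate_vma_collect_pmd() handling (the
fault_folio relocking and error paths in the actual patch are elided):

	folio = page_folio(page);
	if (folio_test_large(folio)) {
		/*
		 * A device private pte backed by a large folio means the
		 * PMD was split (e.g. by munmap()) but the folio was not:
		 * split it so collection sees order-0 pages.
		 */
		pte_unmap_unlock(ptep, ptl);
		folio_get(folio);
		folio_lock(folio);

		if (split_folio(folio)) {
			/* could not split: relock and skip this entry */
			folio_unlock(folio);
			ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
			goto next;
		}

		folio_unlock(folio);
		folio_put(folio);
		/* restart the walk over the now order-0 folios */
		addr = start;
		goto again;
	}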

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
 include/linux/rmap.h    |   2 +
 include/linux/swapops.h |  17 ++++
 lib/test_hmm.c          |   2 +-
 mm/huge_memory.c        | 214 +++++++++++++++++++++++++++++++---------
 mm/migrate_device.c     |  47 +++++++++
 mm/page_vma_mapped.c    |  13 ++-
 mm/pgtable-generic.c    |   6 ++
 mm/rmap.c               |  24 ++++-
 8 files changed, 272 insertions(+), 53 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6cd020eea37a..dfb7aae3d77b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -927,6 +927,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
 #define PVMW_SYNC		(1 << 0)
 /* Look for migration entries rather than present PTEs */
 #define PVMW_MIGRATION		(1 << 1)
+/* Look for device private THP entries */
+#define PVMW_THP_DEVICE_PRIVATE	(1 << 2)
 
 struct page_vma_mapped_walk {
 	unsigned long pfn;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 64ea151a7ae3..2641c01bd5d2 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 {
 	return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
 }
+
 #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
 		struct page *page)
@@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 }
 #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
+}
+
+#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+	return 0;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
 static inline int non_swap_entry(swp_entry_t entry)
 {
 	return swp_type(entry) >= MAX_SWAPFILES;
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 761725bc713c..297f1e034045 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 	 * the mirror but here we use it to hold the page for the simulated
 	 * device memory and that page holds the pointer to the mirror.
 	 */
-	rpage = vmf->page->zone_device_data;
+	rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;
 	dmirror = rpage->zone_device_data;
 
 	/* FIXME demonstrate how we can adjust migrate range */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..2495e3fdbfae 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (unlikely(is_swap_pmd(pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(pmd);
 
-		VM_BUG_ON(!is_pmd_migration_entry(pmd));
-		if (!is_readable_migration_entry(entry)) {
+		VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
+				!is_pmd_device_private_entry(pmd));
+
+		if (is_migration_entry(entry) &&
+			is_writable_migration_entry(entry)) {
 			entry = make_readable_migration_entry(
 							swp_offset(entry));
 			pmd = swp_entry_to_pmd(entry);
@@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				pmd = pmd_swp_mkuffd_wp(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
+
+		if (is_device_private_entry(entry)) {
+			if (is_writable_device_private_entry(entry)) {
+				entry = make_readable_device_private_entry(
+					swp_offset(entry));
+				pmd = swp_entry_to_pmd(entry);
+
+				if (pmd_swp_soft_dirty(*src_pmd))
+					pmd = pmd_swp_mksoft_dirty(pmd);
+				if (pmd_swp_uffd_wp(*src_pmd))
+					pmd = pmd_swp_mkuffd_wp(pmd);
+				set_pmd_at(src_mm, addr, src_pmd, pmd);
+			}
+
+			src_folio = pfn_swap_entry_folio(entry);
+			VM_WARN_ON(!folio_test_large(src_folio));
+
+			folio_get(src_folio);
+			/*
+			 * folio_try_dup_anon_rmap_pmd does not fail for
+			 * device private entries.
+			 */
+			VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
+					  &src_folio->page, dst_vma, src_vma));
+		}
+
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			folio_remove_rmap_pmd(folio, page, vma);
 			WARN_ON_ONCE(folio_mapcount(folio) < 0);
 			VM_BUG_ON_PAGE(!PageHead(page), page);
-		} else if (thp_migration_supported()) {
+		} else if (is_pmd_migration_entry(orig_pmd) ||
+				is_pmd_device_private_entry(orig_pmd)) {
 			swp_entry_t entry;
 
-			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
 			entry = pmd_to_swp_entry(orig_pmd);
 			folio = pfn_swap_entry_folio(entry);
 			flush_needed = 0;
-		} else
-			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+			if (!thp_migration_supported())
+				WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+			if (is_pmd_device_private_entry(orig_pmd)) {
+				folio_remove_rmap_pmd(folio, &folio->page, vma);
+				WARN_ON_ONCE(folio_mapcount(folio) < 0);
+			}
+		}
 
 		if (folio_test_anon(folio)) {
 			zap_deposited_table(tlb->mm, pmd);
@@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 				folio_mark_accessed(folio);
 		}
 
+		/*
+		 * Do a folio put on zone device private pages after
+		 * changes to mm_counter, because the folio_put() will
+		 * clean folio->mapping and the folio_test_anon() check
+		 * will not be usable.
+		 */
+		if (folio_is_device_private(folio))
+			folio_put(folio);
+
 		spin_unlock(ptl);
 		if (flush_needed)
 			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct folio *folio = pfn_swap_entry_folio(entry);
 		pmd_t newpmd;
 
-		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+		VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
+			   !folio_is_device_private(folio));
 		if (is_writable_migration_entry(entry)) {
 			/*
 			 * A protection check is difficult so
@@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			newpmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*pmd))
 				newpmd = pmd_swp_mksoft_dirty(newpmd);
+		} else if (is_writable_device_private_entry(entry)) {
+			entry = make_readable_device_private_entry(
+							swp_offset(entry));
+			newpmd = swp_entry_to_pmd(entry);
 		} else {
 			newpmd = *pmd;
 		}
@@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	struct page *page;
 	pgtable_t pgtable;
 	pmd_t old_pmd, _pmd;
-	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
-	bool anon_exclusive = false, dirty = false;
+	bool young, write, soft_dirty, uffd_wp = false;
+	bool anon_exclusive = false, dirty = false, present = false;
 	unsigned long addr;
 	pte_t *pte;
 	int i;
+	swp_entry_t swp_entry;
 
 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
+
+	VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+			&& !(is_pmd_device_private_entry(*pmd)));
 
 	count_vm_event(THP_SPLIT_PMD);
 
@@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
-	pmd_migration = is_pmd_migration_entry(*pmd);
-	if (unlikely(pmd_migration)) {
-		swp_entry_t entry;
 
+	present = pmd_present(*pmd);
+	if (unlikely(!present)) {
+		swp_entry = pmd_to_swp_entry(*pmd);
 		old_pmd = *pmd;
-		entry = pmd_to_swp_entry(old_pmd);
-		page = pfn_swap_entry_to_page(entry);
-		write = is_writable_migration_entry(entry);
-		if (PageAnon(page))
-			anon_exclusive = is_readable_exclusive_migration_entry(entry);
-		young = is_migration_entry_young(entry);
-		dirty = is_migration_entry_dirty(entry);
+
+		folio = pfn_swap_entry_folio(swp_entry);
+		VM_WARN_ON(!is_migration_entry(swp_entry) &&
+				!is_device_private_entry(swp_entry));
+		page = pfn_swap_entry_to_page(swp_entry);
+
+		if (is_pmd_migration_entry(old_pmd)) {
+			write = is_writable_migration_entry(swp_entry);
+			if (PageAnon(page))
+				anon_exclusive =
+					is_readable_exclusive_migration_entry(
+								swp_entry);
+			young = is_migration_entry_young(swp_entry);
+			dirty = is_migration_entry_dirty(swp_entry);
+		} else if (is_pmd_device_private_entry(old_pmd)) {
+			write = is_writable_device_private_entry(swp_entry);
+			anon_exclusive = PageAnonExclusive(page);
+			if (freeze && anon_exclusive &&
+			    folio_try_share_anon_rmap_pmd(folio, page))
+				freeze = false;
+			if (!freeze) {
+				rmap_t rmap_flags = RMAP_NONE;
+
+				if (anon_exclusive)
+					rmap_flags |= RMAP_EXCLUSIVE;
+
+				folio_ref_add(folio, HPAGE_PMD_NR - 1);
+				if (anon_exclusive)
+					rmap_flags |= RMAP_EXCLUSIVE;
+				folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+						 vma, haddr, rmap_flags);
+			}
+		}
+
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 	} else {
@@ -2996,30 +3076,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	 * Note that NUMA hinting access restrictions are not transferred to
 	 * avoid any possibility of altering permissions across VMAs.
 	 */
-	if (freeze || pmd_migration) {
+	if (freeze || !present) {
 		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
 			pte_t entry;
-			swp_entry_t swp_entry;
-
-			if (write)
-				swp_entry = make_writable_migration_entry(
-							page_to_pfn(page + i));
-			else if (anon_exclusive)
-				swp_entry = make_readable_exclusive_migration_entry(
-							page_to_pfn(page + i));
-			else
-				swp_entry = make_readable_migration_entry(
-							page_to_pfn(page + i));
-			if (young)
-				swp_entry = make_migration_entry_young(swp_entry);
-			if (dirty)
-				swp_entry = make_migration_entry_dirty(swp_entry);
-			entry = swp_entry_to_pte(swp_entry);
-			if (soft_dirty)
-				entry = pte_swp_mksoft_dirty(entry);
-			if (uffd_wp)
-				entry = pte_swp_mkuffd_wp(entry);
-
+			if (freeze || is_migration_entry(swp_entry)) {
+				if (write)
+					swp_entry = make_writable_migration_entry(
+								page_to_pfn(page + i));
+				else if (anon_exclusive)
+					swp_entry = make_readable_exclusive_migration_entry(
+								page_to_pfn(page + i));
+				else
+					swp_entry = make_readable_migration_entry(
+								page_to_pfn(page + i));
+				if (young)
+					swp_entry = make_migration_entry_young(swp_entry);
+				if (dirty)
+					swp_entry = make_migration_entry_dirty(swp_entry);
+				entry = swp_entry_to_pte(swp_entry);
+				if (soft_dirty)
+					entry = pte_swp_mksoft_dirty(entry);
+				if (uffd_wp)
+					entry = pte_swp_mkuffd_wp(entry);
+			} else {
+				/*
+				 * anon_exclusive was already propagated to the relevant
+				 * pages corresponding to the pte entries when freeze
+				 * is false.
+				 */
+				if (write)
+					swp_entry = make_writable_device_private_entry(
+								page_to_pfn(page + i));
+				else
+					swp_entry = make_readable_device_private_entry(
+								page_to_pfn(page + i));
+				/*
+				 * Young and dirty bits are not propagated via swp_entry
+				 */
+				entry = swp_entry_to_pte(swp_entry);
+				if (soft_dirty)
+					entry = pte_swp_mksoft_dirty(entry);
+				if (uffd_wp)
+					entry = pte_swp_mkuffd_wp(entry);
+			}
 			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
 			set_pte_at(mm, addr, pte + i, entry);
 		}
@@ -3046,7 +3145,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 	pte_unmap(pte);
 
-	if (!pmd_migration)
+	if (present)
 		folio_remove_rmap_pmd(folio, page, vma);
 	if (freeze)
 		put_page(page);
@@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
 			   pmd_t *pmd, bool freeze)
 {
+
 	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
-	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
+	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
+			(is_pmd_device_private_entry(*pmd)))
 		__split_huge_pmd_locked(vma, pmd, address, freeze);
 }
 
@@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
 	VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
 	lockdep_assert_held(&lruvec->lru_lock);
 
+	if (folio_is_device_private(folio))
+		return;
+
 	if (list) {
 		/* page reclaim is reclaiming a huge page */
 		VM_WARN_ON(folio_test_lru(folio));
@@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
 			list_add_tail(&new_folio->lru, &folio->lru);
 		folio_set_lru(new_folio);
 	}
+
 }
 
 /* Racy check whether the huge page can be split */
@@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 
 	/* Prevent deferred_split_scan() touching ->_refcount */
 	spin_lock(&ds_queue->split_queue_lock);
-	if (folio_ref_freeze(folio, 1 + extra_pins)) {
+	if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) {
 		struct address_space *swap_cache = NULL;
 		struct lruvec *lruvec;
 		int expected_refs;
@@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	if (nr_shmem_dropped)
 		shmem_uncharge(mapping->host, nr_shmem_dropped);
 
-	if (!ret && is_anon)
+	if (!ret && is_anon && !folio_is_device_private(folio))
 		remap_flags = RMP_USE_SHARED_ZEROPAGE;
+
 	remap_page(folio, 1 << order, remap_flags);
 
 	/*
@@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
 		return 0;
 
 	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
-	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+	if (unlikely(is_pmd_device_private_entry(*pvmw->pmd)))
+		pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);
+	else
+		pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
 
 	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
 	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
@@ -4653,6 +4762,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	entry = pmd_to_swp_entry(*pvmw->pmd);
 	folio_get(folio);
 	pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
+
+	if (folio_is_device_private(folio)) {
+		if (pmd_write(pmde))
+			entry = make_writable_device_private_entry(
+							page_to_pfn(new));
+		else
+			entry = make_readable_device_private_entry(
+							page_to_pfn(new));
+		pmde = swp_entry_to_pmd(entry);
+	}
+
 	if (pmd_swp_soft_dirty(*pvmw->pmd))
 		pmde = pmd_mksoft_dirty(pmde);
 	if (is_writable_migration_entry(entry))
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index e05e14d6eacd..0ed337f94fcd 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			 * page table entry. Other special swap entries are not
 			 * migratable, and we ignore regular swapped page.
 			 */
+			struct folio *folio;
+
 			entry = pte_to_swp_entry(pte);
 			if (!is_device_private_entry(entry))
 				goto next;
@@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			    pgmap->owner != migrate->pgmap_owner)
 				goto next;
 
+			folio = page_folio(page);
+			if (folio_test_large(folio)) {
+				struct folio *new_folio;
+				struct folio *new_fault_folio = NULL;
+
+				/*
+				 * The reason for finding pmd present with a
+				 * device private pte and a large folio for the
+				 * pte is partial unmaps. Split the folio now
+				 * for the migration to be handled correctly
+				 */
+				pte_unmap_unlock(ptep, ptl);
+
+				folio_get(folio);
+				if (folio != fault_folio)
+					folio_lock(folio);
+				if (split_folio(folio)) {
+					if (folio != fault_folio)
+						folio_unlock(folio);
+					ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+					goto next;
+				}
+
+				new_folio = page_folio(page);
+				if (fault_folio)
+					new_fault_folio = page_folio(migrate->fault_page);
+
+				/*
+				 * Ensure the lock is held on the correct
+				 * folio after the split
+				 */
+				if (!new_fault_folio) {
+					folio_unlock(folio);
+					folio_put(folio);
+				} else if (folio != new_fault_folio) {
+					folio_get(new_fault_folio);
+					folio_lock(new_fault_folio);
+					folio_unlock(folio);
+					folio_put(folio);
+				}
+
+				addr = start;
+				goto again;
+			}
+
 			mpfn = migrate_pfn(page_to_pfn(page)) |
 					MIGRATE_PFN_MIGRATE;
 			if (is_writable_device_private_entry(entry))
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e981a1a292d2..246e6c211f34 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			pvmw->ptl = pmd_lock(mm, pvmw->pmd);
 			pmde = *pvmw->pmd;
 			if (!pmd_present(pmde)) {
-				swp_entry_t entry;
+				swp_entry_t entry = pmd_to_swp_entry(pmde);
 
 				if (!thp_migration_supported() ||
 				    !(pvmw->flags & PVMW_MIGRATION))
 					return not_found(pvmw);
-				entry = pmd_to_swp_entry(pmde);
 				if (!is_migration_entry(entry) ||
 				    !check_pmd(swp_offset_pfn(entry), pvmw))
 					return not_found(pvmw);
@@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			 * cannot return prematurely, while zap_huge_pmd() has
 			 * cleared *pmd but not decremented compound_mapcount().
 			 */
+			swp_entry_t entry;
+
+			entry = pmd_to_swp_entry(pmde);
+
+			if (is_device_private_entry(entry) &&
+				(pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) {
+				pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+				return true;
+			}
+
 			if ((pvmw->flags & PVMW_SYNC) &&
 			    thp_vma_suitable_order(vma, pvmw->address,
 						   PMD_ORDER) &&
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..604e8206a2ec 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 		*pmdvalp = pmdval;
 	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
 		goto nomap;
+	if (is_swap_pmd(pmdval)) {
+		swp_entry_t entry = pmd_to_swp_entry(pmdval);
+
+		if (is_device_private_entry(entry))
+			goto nomap;
+	}
 	if (unlikely(pmd_trans_huge(pmdval)))
 		goto nomap;
 	if (unlikely(pmd_bad(pmdval))) {
diff --git a/mm/rmap.c b/mm/rmap.c
index b5837075b6e0..f40e45564295 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		     unsigned long address, void *arg)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
+				PVMW_THP_DEVICE_PRIVATE);
 	bool anon_exclusive, writable, ret = true;
 	pte_t pteval;
 	struct page *subpage;
@@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 	while (page_vma_mapped_walk(&pvmw)) {
 		/* PMD-mapped THP migration entry */
 		if (!pvmw.pte) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+			unsigned long pfn;
+#endif
+
 			if (flags & TTU_SPLIT_HUGE_PMD) {
 				split_huge_pmd_locked(vma, pvmw.address,
 						      pvmw.pmd, true);
@@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 				break;
 			}
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-			subpage = folio_page(folio,
-				pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+			/*
+			 * Zone device private folios do not work well with
+			 * pmd_pfn() on some architectures due to pte
+			 * inversion.
+			 */
+			if (is_pmd_device_private_entry(*pvmw.pmd)) {
+				swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
+
+				pfn = swp_offset_pfn(entry);
+			} else {
+				pfn = pmd_pfn(*pvmw.pmd);
+			}
+
+			subpage = folio_page(folio, pfn - folio_pfn(folio));
+
 			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
 					!folio_test_pmd_mappable(folio), folio);
 
-- 
2.50.1

Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by Matthew Brost 1 month, 1 week ago
On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
> Make THP handling code in the mm subsystem for THP pages aware of zone
> device pages. Although the code is designed to be generic when it comes
> to handling splitting of pages, the code is designed to work for THP
> page sizes corresponding to HPAGE_PMD_NR.
> 
> Modify page_vma_mapped_walk() to return true when a zone device huge
> entry is present, enabling try_to_migrate() and other code migration
> paths to appropriately process the entry. page_vma_mapped_walk() will
> return true for zone device private large folios only when
> PVMW_THP_DEVICE_PRIVATE is passed. This is to prevent locations that are
> not zone device private pages from having to add awareness. The key
> callback that needs this flag is try_to_migrate_one(). The other
> callbacks page idle, damon use it for setting young/dirty bits, which is
> not significant when it comes to pmd level bit harvesting.
> 
> pmd_pfn() does not work well with zone device entries, use
> pfn_pmd_entry_to_swap() for checking and comparison as for zone device
> entries.
> 
> Support partial unmapping of zone device private entries, which happens
> via munmap(). munmap() causes the device private entry pmd to be split,
> but the corresponding folio is not split. Deferred split does not work for
> zone device private folios due to the need to split during fault
> handling. Get migrate_vma_collect_pmd() to handle this case by splitting
> partially unmapped device private folios.
> 
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
> Cc: Rakie Kim <rakie.kim@sk.com>
> Cc: Byungchul Park <byungchul@sk.com>
> Cc: Gregory Price <gourry@gourry.net>
> Cc: Ying Huang <ying.huang@linux.alibaba.com>
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> Cc: Nico Pache <npache@redhat.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Dev Jain <dev.jain@arm.com>
> Cc: Barry Song <baohua@kernel.org>
> Cc: Lyude Paul <lyude@redhat.com>
> Cc: Danilo Krummrich <dakr@kernel.org>
> Cc: David Airlie <airlied@gmail.com>
> Cc: Simona Vetter <simona@ffwll.ch>
> Cc: Ralph Campbell <rcampbell@nvidia.com>
> Cc: Mika Penttilä <mpenttil@redhat.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Francois Dugast <francois.dugast@intel.com>
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
> ---
>  include/linux/rmap.h    |   2 +
>  include/linux/swapops.h |  17 ++++
>  lib/test_hmm.c          |   2 +-
>  mm/huge_memory.c        | 214 +++++++++++++++++++++++++++++++---------
>  mm/migrate_device.c     |  47 +++++++++
>  mm/page_vma_mapped.c    |  13 ++-
>  mm/pgtable-generic.c    |   6 ++
>  mm/rmap.c               |  24 ++++-
>  8 files changed, 272 insertions(+), 53 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 6cd020eea37a..dfb7aae3d77b 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -927,6 +927,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
>  #define PVMW_SYNC		(1 << 0)
>  /* Look for migration entries rather than present PTEs */
>  #define PVMW_MIGRATION		(1 << 1)
> +/* Look for device private THP entries */
> +#define PVMW_THP_DEVICE_PRIVATE	(1 << 2)
>  
>  struct page_vma_mapped_walk {
>  	unsigned long pfn;
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 64ea151a7ae3..2641c01bd5d2 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>  {
>  	return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
>  }
> +
>  #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>  static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>  		struct page *page)
> @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>  }
>  #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>  
> +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
> +
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
> +}
> +
> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +	return 0;
> +}
> +
> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
>  static inline int non_swap_entry(swp_entry_t entry)
>  {
>  	return swp_type(entry) >= MAX_SWAPFILES;
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 761725bc713c..297f1e034045 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>  	 * the mirror but here we use it to hold the page for the simulated
>  	 * device memory and that page holds the pointer to the mirror.
>  	 */
> -	rpage = vmf->page->zone_device_data;
> +	rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;
>  	dmirror = rpage->zone_device_data;
>  
>  	/* FIXME demonstrate how we can adjust migrate range */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9c38a95e9f09..2495e3fdbfae 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  	if (unlikely(is_swap_pmd(pmd))) {
>  		swp_entry_t entry = pmd_to_swp_entry(pmd);
>  
> -		VM_BUG_ON(!is_pmd_migration_entry(pmd));
> -		if (!is_readable_migration_entry(entry)) {
> +		VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
> +				!is_pmd_device_private_entry(pmd));
> +
> +		if (is_migration_entry(entry) &&
> +			is_writable_migration_entry(entry)) {
>  			entry = make_readable_migration_entry(
>  							swp_offset(entry));
>  			pmd = swp_entry_to_pmd(entry);
> @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  				pmd = pmd_swp_mkuffd_wp(pmd);
>  			set_pmd_at(src_mm, addr, src_pmd, pmd);
>  		}
> +
> +		if (is_device_private_entry(entry)) {
> +			if (is_writable_device_private_entry(entry)) {
> +				entry = make_readable_device_private_entry(
> +					swp_offset(entry));
> +				pmd = swp_entry_to_pmd(entry);
> +
> +				if (pmd_swp_soft_dirty(*src_pmd))
> +					pmd = pmd_swp_mksoft_dirty(pmd);
> +				if (pmd_swp_uffd_wp(*src_pmd))
> +					pmd = pmd_swp_mkuffd_wp(pmd);
> +				set_pmd_at(src_mm, addr, src_pmd, pmd);
> +			}
> +
> +			src_folio = pfn_swap_entry_folio(entry);
> +			VM_WARN_ON(!folio_test_large(src_folio));
> +
> +			folio_get(src_folio);
> +			/*
> +			 * folio_try_dup_anon_rmap_pmd does not fail for
> +			 * device private entries.
> +			 */
> +			VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
> +					  &src_folio->page, dst_vma, src_vma));

VM_WARN_ON() compiles out in non-debug builds [1], so the
folio_try_dup_anon_rmap_pmd() call inside it is never made there. I hit
this with a non-debug build while running the fork selftest I shared.

Matt 

[1] https://elixir.bootlin.com/linux/v6.16.3/source/include/linux/mmdebug.h#L112
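
A minimal sketch of one possible fix: perform the call unconditionally
so it is not compiled out when CONFIG_DEBUG_VM is off, and only warn on
the (never expected) failure:

	int ret = folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
					      dst_vma, src_vma);

	/* never expected to fail for device private folios */
	VM_WARN_ON(ret);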

> +		}
> +
>  		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>  		mm_inc_nr_ptes(dst_mm);
>  		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> @@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  			folio_remove_rmap_pmd(folio, page, vma);
>  			WARN_ON_ONCE(folio_mapcount(folio) < 0);
>  			VM_BUG_ON_PAGE(!PageHead(page), page);
> -		} else if (thp_migration_supported()) {
> +		} else if (is_pmd_migration_entry(orig_pmd) ||
> +				is_pmd_device_private_entry(orig_pmd)) {
>  			swp_entry_t entry;
>  
> -			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
>  			entry = pmd_to_swp_entry(orig_pmd);
>  			folio = pfn_swap_entry_folio(entry);
>  			flush_needed = 0;
> -		} else
> -			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +
> +			if (!thp_migration_supported())
> +				WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +
> +			if (is_pmd_device_private_entry(orig_pmd)) {
> +				folio_remove_rmap_pmd(folio, &folio->page, vma);
> +				WARN_ON_ONCE(folio_mapcount(folio) < 0);
> +			}
> +		}
>  
>  		if (folio_test_anon(folio)) {
>  			zap_deposited_table(tlb->mm, pmd);
> @@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  				folio_mark_accessed(folio);
>  		}
>  
> +		/*
> +		 * Do a folio put on zone device private pages after
> +		 * changes to mm_counter, because the folio_put() will
> +		 * clean folio->mapping and the folio_test_anon() check
> +		 * will not be usable.
> +		 */
> +		if (folio_is_device_private(folio))
> +			folio_put(folio);
> +
>  		spin_unlock(ptl);
>  		if (flush_needed)
>  			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
> @@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  		struct folio *folio = pfn_swap_entry_folio(entry);
>  		pmd_t newpmd;
>  
> -		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
> +		VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
> +			   !folio_is_device_private(folio));
>  		if (is_writable_migration_entry(entry)) {
>  			/*
>  			 * A protection check is difficult so
> @@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  			newpmd = swp_entry_to_pmd(entry);
>  			if (pmd_swp_soft_dirty(*pmd))
>  				newpmd = pmd_swp_mksoft_dirty(newpmd);
> +		} else if (is_writable_device_private_entry(entry)) {
> +			entry = make_readable_device_private_entry(
> +							swp_offset(entry));
> +			newpmd = swp_entry_to_pmd(entry);
>  		} else {
>  			newpmd = *pmd;
>  		}
> @@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	struct page *page;
>  	pgtable_t pgtable;
>  	pmd_t old_pmd, _pmd;
> -	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
> -	bool anon_exclusive = false, dirty = false;
> +	bool young, write, soft_dirty, uffd_wp = false;
> +	bool anon_exclusive = false, dirty = false, present = false;
>  	unsigned long addr;
>  	pte_t *pte;
>  	int i;
> +	swp_entry_t swp_entry;
>  
>  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> -	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
> +
> +	VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
> +			&& !(is_pmd_device_private_entry(*pmd)));
>  
>  	count_vm_event(THP_SPLIT_PMD);
>  
> @@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>  	}
>  
> -	pmd_migration = is_pmd_migration_entry(*pmd);
> -	if (unlikely(pmd_migration)) {
> -		swp_entry_t entry;
>  
> +	present = pmd_present(*pmd);
> +	if (unlikely(!present)) {
> +		swp_entry = pmd_to_swp_entry(*pmd);
>  		old_pmd = *pmd;
> -		entry = pmd_to_swp_entry(old_pmd);
> -		page = pfn_swap_entry_to_page(entry);
> -		write = is_writable_migration_entry(entry);
> -		if (PageAnon(page))
> -			anon_exclusive = is_readable_exclusive_migration_entry(entry);
> -		young = is_migration_entry_young(entry);
> -		dirty = is_migration_entry_dirty(entry);
> +
> +		folio = pfn_swap_entry_folio(swp_entry);
> +		VM_WARN_ON(!is_migration_entry(swp_entry) &&
> +				!is_device_private_entry(swp_entry));
> +		page = pfn_swap_entry_to_page(swp_entry);
> +
> +		if (is_pmd_migration_entry(old_pmd)) {
> +			write = is_writable_migration_entry(swp_entry);
> +			if (PageAnon(page))
> +				anon_exclusive =
> +					is_readable_exclusive_migration_entry(
> +								swp_entry);
> +			young = is_migration_entry_young(swp_entry);
> +			dirty = is_migration_entry_dirty(swp_entry);
> +		} else if (is_pmd_device_private_entry(old_pmd)) {
> +			write = is_writable_device_private_entry(swp_entry);
> +			anon_exclusive = PageAnonExclusive(page);
> +			if (freeze && anon_exclusive &&
> +			    folio_try_share_anon_rmap_pmd(folio, page))
> +				freeze = false;
> +			if (!freeze) {
> +				rmap_t rmap_flags = RMAP_NONE;
> +
> +				if (anon_exclusive)
> +					rmap_flags |= RMAP_EXCLUSIVE;
> +
> +				folio_ref_add(folio, HPAGE_PMD_NR - 1);
> +				if (anon_exclusive)
> +					rmap_flags |= RMAP_EXCLUSIVE;
> +				folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
> +						 vma, haddr, rmap_flags);
> +			}
> +		}
> +
>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>  	} else {
> @@ -2996,30 +3076,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	 * Note that NUMA hinting access restrictions are not transferred to
>  	 * avoid any possibility of altering permissions across VMAs.
>  	 */
> -	if (freeze || pmd_migration) {
> +	if (freeze || !present) {
>  		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>  			pte_t entry;
> -			swp_entry_t swp_entry;
> -
> -			if (write)
> -				swp_entry = make_writable_migration_entry(
> -							page_to_pfn(page + i));
> -			else if (anon_exclusive)
> -				swp_entry = make_readable_exclusive_migration_entry(
> -							page_to_pfn(page + i));
> -			else
> -				swp_entry = make_readable_migration_entry(
> -							page_to_pfn(page + i));
> -			if (young)
> -				swp_entry = make_migration_entry_young(swp_entry);
> -			if (dirty)
> -				swp_entry = make_migration_entry_dirty(swp_entry);
> -			entry = swp_entry_to_pte(swp_entry);
> -			if (soft_dirty)
> -				entry = pte_swp_mksoft_dirty(entry);
> -			if (uffd_wp)
> -				entry = pte_swp_mkuffd_wp(entry);
> -
> +			if (freeze || is_migration_entry(swp_entry)) {
> +				if (write)
> +					swp_entry = make_writable_migration_entry(
> +								page_to_pfn(page + i));
> +				else if (anon_exclusive)
> +					swp_entry = make_readable_exclusive_migration_entry(
> +								page_to_pfn(page + i));
> +				else
> +					swp_entry = make_readable_migration_entry(
> +								page_to_pfn(page + i));
> +				if (young)
> +					swp_entry = make_migration_entry_young(swp_entry);
> +				if (dirty)
> +					swp_entry = make_migration_entry_dirty(swp_entry);
> +				entry = swp_entry_to_pte(swp_entry);
> +				if (soft_dirty)
> +					entry = pte_swp_mksoft_dirty(entry);
> +				if (uffd_wp)
> +					entry = pte_swp_mkuffd_wp(entry);
> +			} else {
> +				/*
> +				 * anon_exclusive was already propagated to the relevant
> +				 * pages corresponding to the pte entries when freeze
> +				 * is false.
> +				 */
> +				if (write)
> +					swp_entry = make_writable_device_private_entry(
> +								page_to_pfn(page + i));
> +				else
> +					swp_entry = make_readable_device_private_entry(
> +								page_to_pfn(page + i));
> +				/*
> +				 * Young and dirty bits are not progated via swp_entry
> +				 */
> +				entry = swp_entry_to_pte(swp_entry);
> +				if (soft_dirty)
> +					entry = pte_swp_mksoft_dirty(entry);
> +				if (uffd_wp)
> +					entry = pte_swp_mkuffd_wp(entry);
> +			}
>  			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>  			set_pte_at(mm, addr, pte + i, entry);
>  		}
> @@ -3046,7 +3145,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	}
>  	pte_unmap(pte);
>  
> -	if (!pmd_migration)
> +	if (present)
>  		folio_remove_rmap_pmd(folio, page, vma);
>  	if (freeze)
>  		put_page(page);
> @@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>  			   pmd_t *pmd, bool freeze)
>  {
> +
>  	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
> -	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
> +	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
> +			(is_pmd_device_private_entry(*pmd)))
>  		__split_huge_pmd_locked(vma, pmd, address, freeze);
>  }
>  
> @@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>  	VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>  	lockdep_assert_held(&lruvec->lru_lock);
>  
> +	if (folio_is_device_private(folio))
> +		return;
> +
>  	if (list) {
>  		/* page reclaim is reclaiming a huge page */
>  		VM_WARN_ON(folio_test_lru(folio));
> @@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>  			list_add_tail(&new_folio->lru, &folio->lru);
>  		folio_set_lru(new_folio);
>  	}
> +
>  }
>  
>  /* Racy check whether the huge page can be split */
> @@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>  
>  	/* Prevent deferred_split_scan() touching ->_refcount */
>  	spin_lock(&ds_queue->split_queue_lock);
> -	if (folio_ref_freeze(folio, 1 + extra_pins)) {
> +	if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) {
>  		struct address_space *swap_cache = NULL;
>  		struct lruvec *lruvec;
>  		int expected_refs;
> @@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>  	if (nr_shmem_dropped)
>  		shmem_uncharge(mapping->host, nr_shmem_dropped);
>  
> -	if (!ret && is_anon)
> +	if (!ret && is_anon && !folio_is_device_private(folio))
>  		remap_flags = RMP_USE_SHARED_ZEROPAGE;
> +
>  	remap_page(folio, 1 << order, remap_flags);
>  
>  	/*
> @@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>  		return 0;
>  
>  	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
> -	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
> +	if (unlikely(is_pmd_device_private_entry(*pvmw->pmd)))
> +		pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);
> +	else
> +		pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
>  
>  	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
>  	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
> @@ -4653,6 +4762,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
>  	entry = pmd_to_swp_entry(*pvmw->pmd);
>  	folio_get(folio);
>  	pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
> +
> +	if (folio_is_device_private(folio)) {
> +		if (pmd_write(pmde))
> +			entry = make_writable_device_private_entry(
> +							page_to_pfn(new));
> +		else
> +			entry = make_readable_device_private_entry(
> +							page_to_pfn(new));
> +		pmde = swp_entry_to_pmd(entry);
> +	}
> +
>  	if (pmd_swp_soft_dirty(*pvmw->pmd))
>  		pmde = pmd_mksoft_dirty(pmde);
>  	if (is_writable_migration_entry(entry))
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index e05e14d6eacd..0ed337f94fcd 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  			 * page table entry. Other special swap entries are not
>  			 * migratable, and we ignore regular swapped page.
>  			 */
> +			struct folio *folio;
> +
>  			entry = pte_to_swp_entry(pte);
>  			if (!is_device_private_entry(entry))
>  				goto next;
> @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  			    pgmap->owner != migrate->pgmap_owner)
>  				goto next;
>  
> +			folio = page_folio(page);
> +			if (folio_test_large(folio)) {
> +				struct folio *new_folio;
> +				struct folio *new_fault_folio = NULL;
> +
> +				/*
> +				 * The reason for finding pmd present with a
> +				 * device private pte and a large folio for the
> +				 * pte is partial unmaps. Split the folio now
> +				 * for the migration to be handled correctly
> +				 */
> +				pte_unmap_unlock(ptep, ptl);
> +
> +				folio_get(folio);
> +				if (folio != fault_folio)
> +					folio_lock(folio);
> +				if (split_folio(folio)) {
> +					if (folio != fault_folio)
> +						folio_unlock(folio);
> +					ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> +					goto next;
> +				}
> +
> +				new_folio = page_folio(page);
> +				if (fault_folio)
> +					new_fault_folio = page_folio(migrate->fault_page);
> +
> +				/*
> +				 * Ensure the lock is held on the correct
> +				 * folio after the split
> +				 */
> +				if (!new_fault_folio) {
> +					folio_unlock(folio);
> +					folio_put(folio);
> +				} else if (folio != new_fault_folio) {
> +					folio_get(new_fault_folio);
> +					folio_lock(new_fault_folio);
> +					folio_unlock(folio);
> +					folio_put(folio);
> +				}
> +
> +				addr = start;
> +				goto again;
> +			}
> +
>  			mpfn = migrate_pfn(page_to_pfn(page)) |
>  					MIGRATE_PFN_MIGRATE;
>  			if (is_writable_device_private_entry(entry))
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index e981a1a292d2..246e6c211f34 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  			pvmw->ptl = pmd_lock(mm, pvmw->pmd);
>  			pmde = *pvmw->pmd;
>  			if (!pmd_present(pmde)) {
> -				swp_entry_t entry;
> +				swp_entry_t entry = pmd_to_swp_entry(pmde);
>  
>  				if (!thp_migration_supported() ||
>  				    !(pvmw->flags & PVMW_MIGRATION))
>  					return not_found(pvmw);
> -				entry = pmd_to_swp_entry(pmde);
>  				if (!is_migration_entry(entry) ||
>  				    !check_pmd(swp_offset_pfn(entry), pvmw))
>  					return not_found(pvmw);
> @@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  			 * cannot return prematurely, while zap_huge_pmd() has
>  			 * cleared *pmd but not decremented compound_mapcount().
>  			 */
> +			swp_entry_t entry;
> +
> +			entry = pmd_to_swp_entry(pmde);
> +
> +			if (is_device_private_entry(entry) &&
> +				(pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) {
> +				pvmw->ptl = pmd_lock(mm, pvmw->pmd);
> +				return true;
> +			}
> +
>  			if ((pvmw->flags & PVMW_SYNC) &&
>  			    thp_vma_suitable_order(vma, pvmw->address,
>  						   PMD_ORDER) &&
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 567e2d084071..604e8206a2ec 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>  		*pmdvalp = pmdval;
>  	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
>  		goto nomap;
> +	if (is_swap_pmd(pmdval)) {
> +		swp_entry_t entry = pmd_to_swp_entry(pmdval);
> +
> +		if (is_device_private_entry(entry))
> +			goto nomap;
> +	}
>  	if (unlikely(pmd_trans_huge(pmdval)))
>  		goto nomap;
>  	if (unlikely(pmd_bad(pmdval))) {
> diff --git a/mm/rmap.c b/mm/rmap.c
> index b5837075b6e0..f40e45564295 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>  		     unsigned long address, void *arg)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
> -	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> +	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
> +				PVMW_THP_DEVICE_PRIVATE);
>  	bool anon_exclusive, writable, ret = true;
>  	pte_t pteval;
>  	struct page *subpage;
> @@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>  	while (page_vma_mapped_walk(&pvmw)) {
>  		/* PMD-mapped THP migration entry */
>  		if (!pvmw.pte) {
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +			unsigned long pfn;
> +#endif
> +
>  			if (flags & TTU_SPLIT_HUGE_PMD) {
>  				split_huge_pmd_locked(vma, pvmw.address,
>  						      pvmw.pmd, true);
> @@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>  				break;
>  			}
>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> -			subpage = folio_page(folio,
> -				pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
> +			/*
> +			 * Zone device private folios do not work well with
> +			 * pmd_pfn() on some architectures due to pte
> +			 * inversion.
> +			 */
> +			if (is_pmd_device_private_entry(*pvmw.pmd)) {
> +				swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
> +
> +				pfn = swp_offset_pfn(entry);
> +			} else {
> +				pfn = pmd_pfn(*pvmw.pmd);
> +			}
> +
> +			subpage = folio_page(folio, pfn - folio_pfn(folio));
> +
>  			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
>  					!folio_test_pmd_mappable(folio), folio);
>  
> -- 
> 2.50.1
> 
Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by David Hildenbrand 1 month, 1 week ago
On 28.08.25 22:05, Matthew Brost wrote:
> On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
>> Make THP handling code in the mm subsystem for THP pages aware of zone
>> device pages. Although the code is designed to be generic when it comes
>> to handling splitting of pages, the code is designed to work for THP
>> page sizes corresponding to HPAGE_PMD_NR.
>>
>> Modify page_vma_mapped_walk() to return true when a zone device huge
>> entry is present, enabling try_to_migrate() and other code migration
>> paths to appropriately process the entry. page_vma_mapped_walk() will
>> return true for zone device private large folios only when
>> PVMW_THP_DEVICE_PRIVATE is passed. This is to prevent locations that are
>> not zone device private pages from having to add awareness. The key
>> callback that needs this flag is try_to_migrate_one(). The other
>> callbacks page idle, damon use it for setting young/dirty bits, which is
>> not significant when it comes to pmd level bit harvesting.
>>
>> pmd_pfn() does not work well with zone device entries, use
>> pfn_pmd_entry_to_swap() for checking and comparison as for zone device
>> entries.
>>
>> Support partial unmapping of zone device private entries, which happens
>> via munmap(). munmap() causes the device private entry pmd to be split,
>> but the corresponding folio is not split. Deferred split does not work for
>> zone device private folios due to the need to split during fault
>> handling. Get migrate_vma_collect_pmd() to handle this case by splitting
>> partially unmapped device private folios.
>>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
>> Cc: Rakie Kim <rakie.kim@sk.com>
>> Cc: Byungchul Park <byungchul@sk.com>
>> Cc: Gregory Price <gourry@gourry.net>
>> Cc: Ying Huang <ying.huang@linux.alibaba.com>
>> Cc: Alistair Popple <apopple@nvidia.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
>> Cc: Nico Pache <npache@redhat.com>
>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>> Cc: Dev Jain <dev.jain@arm.com>
>> Cc: Barry Song <baohua@kernel.org>
>> Cc: Lyude Paul <lyude@redhat.com>
>> Cc: Danilo Krummrich <dakr@kernel.org>
>> Cc: David Airlie <airlied@gmail.com>
>> Cc: Simona Vetter <simona@ffwll.ch>
>> Cc: Ralph Campbell <rcampbell@nvidia.com>
>> Cc: Mika Penttilä <mpenttil@redhat.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
>> Cc: Francois Dugast <francois.dugast@intel.com>
>>
>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>> ---
>>   include/linux/rmap.h    |   2 +
>>   include/linux/swapops.h |  17 ++++
>>   lib/test_hmm.c          |   2 +-
>>   mm/huge_memory.c        | 214 +++++++++++++++++++++++++++++++---------
>>   mm/migrate_device.c     |  47 +++++++++
>>   mm/page_vma_mapped.c    |  13 ++-
>>   mm/pgtable-generic.c    |   6 ++
>>   mm/rmap.c               |  24 ++++-
>>   8 files changed, 272 insertions(+), 53 deletions(-)
>>
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index 6cd020eea37a..dfb7aae3d77b 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -927,6 +927,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
>>   #define PVMW_SYNC		(1 << 0)
>>   /* Look for migration entries rather than present PTEs */
>>   #define PVMW_MIGRATION		(1 << 1)
>> +/* Look for device private THP entries */
>> +#define PVMW_THP_DEVICE_PRIVATE	(1 << 2)
>>   
>>   struct page_vma_mapped_walk {
>>   	unsigned long pfn;
>> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
>> index 64ea151a7ae3..2641c01bd5d2 100644
>> --- a/include/linux/swapops.h
>> +++ b/include/linux/swapops.h
>> @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>>   {
>>   	return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
>>   }
>> +
>>   #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>   static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>>   		struct page *page)
>> @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>>   }
>>   #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>   
>> +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
>> +
>> +static inline int is_pmd_device_private_entry(pmd_t pmd)
>> +{
>> +	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
>> +}
>> +
>> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +
>> +static inline int is_pmd_device_private_entry(pmd_t pmd)
>> +{
>> +	return 0;
>> +}
>> +
>> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +
>>   static inline int non_swap_entry(swp_entry_t entry)
>>   {
>>   	return swp_type(entry) >= MAX_SWAPFILES;
>> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
>> index 761725bc713c..297f1e034045 100644
>> --- a/lib/test_hmm.c
>> +++ b/lib/test_hmm.c
>> @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>>   	 * the mirror but here we use it to hold the page for the simulated
>>   	 * device memory and that page holds the pointer to the mirror.
>>   	 */
>> -	rpage = vmf->page->zone_device_data;
>> +	rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;
>>   	dmirror = rpage->zone_device_data;
>>   
>>   	/* FIXME demonstrate how we can adjust migrate range */
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 9c38a95e9f09..2495e3fdbfae 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   	if (unlikely(is_swap_pmd(pmd))) {
>>   		swp_entry_t entry = pmd_to_swp_entry(pmd);
>>   
>> -		VM_BUG_ON(!is_pmd_migration_entry(pmd));
>> -		if (!is_readable_migration_entry(entry)) {
>> +		VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
>> +				!is_pmd_device_private_entry(pmd));
>> +
>> +		if (is_migration_entry(entry) &&
>> +			is_writable_migration_entry(entry)) {
>>   			entry = make_readable_migration_entry(
>>   							swp_offset(entry));
>>   			pmd = swp_entry_to_pmd(entry);
>> @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   				pmd = pmd_swp_mkuffd_wp(pmd);
>>   			set_pmd_at(src_mm, addr, src_pmd, pmd);
>>   		}
>> +
>> +		if (is_device_private_entry(entry)) {
>> +			if (is_writable_device_private_entry(entry)) {
>> +				entry = make_readable_device_private_entry(
>> +					swp_offset(entry));
>> +				pmd = swp_entry_to_pmd(entry);
>> +
>> +				if (pmd_swp_soft_dirty(*src_pmd))
>> +					pmd = pmd_swp_mksoft_dirty(pmd);
>> +				if (pmd_swp_uffd_wp(*src_pmd))
>> +					pmd = pmd_swp_mkuffd_wp(pmd);
>> +				set_pmd_at(src_mm, addr, src_pmd, pmd);
>> +			}
>> +
>> +			src_folio = pfn_swap_entry_folio(entry);
>> +			VM_WARN_ON(!folio_test_large(src_folio));
>> +
>> +			folio_get(src_folio);
>> +			/*
>> +			 * folio_try_dup_anon_rmap_pmd does not fail for
>> +			 * device private entries.
>> +			 */
>> +			VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
>> +					  &src_folio->page, dst_vma, src_vma));
> 
> VM_WARN_ON compiles out in non-debug builds. I hit this running the
> fork selftest I shared with a non-debug build.


folio_try_dup_anon_rmap_pmd() will never fail when
folio_is_device_private(folio) -- unless something is deeply messed up
such that we wouldn't identify this folio as being device-private.

Can you elaborate on what you were able to trigger, and in what kind of
environment?

-- 
Cheers

David / dhildenb

Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by Matthew Brost 1 month, 1 week ago
On Thu, Aug 28, 2025 at 10:12:53PM +0200, David Hildenbrand wrote:
> On 28.08.25 22:05, Matthew Brost wrote:
> > On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
> > > [...]
> > > @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > >   				pmd = pmd_swp_mkuffd_wp(pmd);
> > >   			set_pmd_at(src_mm, addr, src_pmd, pmd);
> > >   		}
> > > +
> > > +		if (is_device_private_entry(entry)) {
> > > +			if (is_writable_device_private_entry(entry)) {
> > > +				entry = make_readable_device_private_entry(
> > > +					swp_offset(entry));
> > > +				pmd = swp_entry_to_pmd(entry);
> > > +
> > > +				if (pmd_swp_soft_dirty(*src_pmd))
> > > +					pmd = pmd_swp_mksoft_dirty(pmd);
> > > +				if (pmd_swp_uffd_wp(*src_pmd))
> > > +					pmd = pmd_swp_mkuffd_wp(pmd);
> > > +				set_pmd_at(src_mm, addr, src_pmd, pmd);
> > > +			}
> > > +
> > > +			src_folio = pfn_swap_entry_folio(entry);
> > > +			VM_WARN_ON(!folio_test_large(src_folio));
> > > +
> > > +			folio_get(src_folio);
> > > +			/*
> > > +			 * folio_try_dup_anon_rmap_pmd does not fail for
> > > +			 * device private entries.
> > > +			 */
> > > +			VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
> > > +					  &src_folio->page, dst_vma, src_vma));
> > 
> > VM_WARN_ON compiles out in non-debug builds. I hit this running the
> > fork selftest I shared with a non-debug build.
> 
> 
> folio_try_dup_anon_rmap_pmd() will never fail for
> folio_is_device_private(folio) -- unless something is deeply messed up that
> we wouldn't identify this folio as being device-private.
> 
> Can you elaborate, what were you able to trigger, and in what kind of
> environment?
> 

Maybe this was bad phrasing. I compiled the kernel with a non-debug
build and fork() broke for THP device pages because the above call to
folio_try_dup_anon_rmap_pmd compiled out (i.e., it wasn't called).

Matt

> -- 
> Cheers
> 
> David / dhildenb
> 
Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by David Hildenbrand 1 month, 1 week ago
On 28.08.25 22:17, Matthew Brost wrote:
> On Thu, Aug 28, 2025 at 10:12:53PM +0200, David Hildenbrand wrote:
>> On 28.08.25 22:05, Matthew Brost wrote:
>>> On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
>>>> [...]
>>>> @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>>    				pmd = pmd_swp_mkuffd_wp(pmd);
>>>>    			set_pmd_at(src_mm, addr, src_pmd, pmd);
>>>>    		}
>>>> +
>>>> +		if (is_device_private_entry(entry)) {
>>>> +			if (is_writable_device_private_entry(entry)) {
>>>> +				entry = make_readable_device_private_entry(
>>>> +					swp_offset(entry));
>>>> +				pmd = swp_entry_to_pmd(entry);
>>>> +
>>>> +				if (pmd_swp_soft_dirty(*src_pmd))
>>>> +					pmd = pmd_swp_mksoft_dirty(pmd);
>>>> +				if (pmd_swp_uffd_wp(*src_pmd))
>>>> +					pmd = pmd_swp_mkuffd_wp(pmd);
>>>> +				set_pmd_at(src_mm, addr, src_pmd, pmd);
>>>> +			}
>>>> +
>>>> +			src_folio = pfn_swap_entry_folio(entry);
>>>> +			VM_WARN_ON(!folio_test_large(src_folio));
>>>> +
>>>> +			folio_get(src_folio);
>>>> +			/*
>>>> +			 * folio_try_dup_anon_rmap_pmd does not fail for
>>>> +			 * device private entries.
>>>> +			 */
>>>> +			VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
>>>> +					  &src_folio->page, dst_vma, src_vma));
>>>
>>> VM_WARN_ON compiles out in non-debug builds. I hit this running the
>>> fork selftest I shared with a non-debug build.
>>
>>
>> folio_try_dup_anon_rmap_pmd() will never fail for
>> folio_is_device_private(folio) -- unless something is deeply messed up that
>> we wouldn't identify this folio as being device-private.
>>
>> Can you elaborate, what were you able to trigger, and in what kind of
>> environment?
>>
> 
> Maybe this was bad phrasing. I compiled the kernel with a non-debug
> build and fork() broke for THP device pages because the above call to
> folio_try_dup_anon_rmap_pmd compiled out (i.e., it wasn't called).

Ah, yes!

As I said in my reply, we should not do any kind of WARN here, like in 
the PTE case.
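
I.e. (rough sketch), mirror what the PTE side does and just make the call
unconditionally, without any WARN wrapper around it:

	folio_get(src_folio);
	/* Cannot fail as these pages cannot get pinned. */
	folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
				    dst_vma, src_vma);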

-- 
Cheers

David / dhildenb

Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by David Hildenbrand 1 month, 1 week ago
On 12.08.25 04:40, Balbir Singh wrote:
> Make THP handling code in the mm subsystem for THP pages aware of zone
> device pages. Although the code is designed to be generic when it comes
> to handling splitting of pages, the code is designed to work for THP
> page sizes corresponding to HPAGE_PMD_NR.
> 
> Modify page_vma_mapped_walk() to return true when a zone device huge
> entry is present, enabling try_to_migrate() and other code migration
> paths to appropriately process the entry. page_vma_mapped_walk() will
> return true for zone device private large folios only when
> PVMW_THP_DEVICE_PRIVATE is passed. This is to prevent locations that are
> not zone device private pages from having to add awareness.

Please don't if avoidable.

We should already have the same problem with small zone-device private
pages, and should have proper folio checks in place, no?


[...]

This thing is huge and hard to review. Given there are subtle changes in here that
are likely problematic, this is a problem. Is there any way to split this
into logical chunks?

Like teaching zap, mprotect, rmap walks .... code separately.

I'm sure you'll find a way to break this down so I don't walk out of a 
review with a headache ;)

>   
>   struct page_vma_mapped_walk {
>   	unsigned long pfn;
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 64ea151a7ae3..2641c01bd5d2 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>   {
>   	return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
>   }
> +

^ unrelated change

>   #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>   static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>   		struct page *page)
> @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>   }
>   #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>   
> +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
> +
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
> +}
> +
> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +	return 0;
> +}
> +
> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
>   static inline int non_swap_entry(swp_entry_t entry)
>   {
>   	return swp_type(entry) >= MAX_SWAPFILES;
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 761725bc713c..297f1e034045 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>   	 * the mirror but here we use it to hold the page for the simulated
>   	 * device memory and that page holds the pointer to the mirror.
>   	 */
> -	rpage = vmf->page->zone_device_data;
> +	rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;

Can we have a wrapper please to give us the zone_device_data for a folio, so
we have something like

rpage = folio_zone_device_data(page_folio(vmf->page));
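
Something like the following could work (sketch; assuming zone_device_data is
only ever set on the head page, which is what the code above relies on):

	static inline void *folio_zone_device_data(struct folio *folio)
	{
		return folio->page.zone_device_data;
	}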

>   	dmirror = rpage->zone_device_data;
>   
>   	/* FIXME demonstrate how we can adjust migrate range */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9c38a95e9f09..2495e3fdbfae 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>   	if (unlikely(is_swap_pmd(pmd))) {
>   		swp_entry_t entry = pmd_to_swp_entry(pmd);
>   
> -		VM_BUG_ON(!is_pmd_migration_entry(pmd));
> -		if (!is_readable_migration_entry(entry)) {
> +		VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
> +				!is_pmd_device_private_entry(pmd));
> +
> +		if (is_migration_entry(entry) &&
> +			is_writable_migration_entry(entry)) {
>   			entry = make_readable_migration_entry(
>   							swp_offset(entry));

Careful: There is is_readable_exclusive_migration_entry(). So don't change
the !is_readable_migration_entry(entry) check to is_writable_migration_entry(entry),
because it's wrong.
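
I.e., if this needs an explicit gate, keep the readable check, something like:

	if (is_migration_entry(entry) &&
	    !is_readable_migration_entry(entry)) {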

>   			pmd = swp_entry_to_pmd(entry);
> @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>   				pmd = pmd_swp_mkuffd_wp(pmd);
>   			set_pmd_at(src_mm, addr, src_pmd, pmd);
>   		}
> +
> +		if (is_device_private_entry(entry)) {

likely you want "else if" here.

> +			if (is_writable_device_private_entry(entry)) {
> +				entry = make_readable_device_private_entry(
> +					swp_offset(entry));
> +				pmd = swp_entry_to_pmd(entry);
> +
> +				if (pmd_swp_soft_dirty(*src_pmd))
> +					pmd = pmd_swp_mksoft_dirty(pmd);
> +				if (pmd_swp_uffd_wp(*src_pmd))
> +					pmd = pmd_swp_mkuffd_wp(pmd);
> +				set_pmd_at(src_mm, addr, src_pmd, pmd);
> +			}
> +
> +			src_folio = pfn_swap_entry_folio(entry);
> +			VM_WARN_ON(!folio_test_large(src_folio));
> +
> +			folio_get(src_folio);
> +			/*
> +			 * folio_try_dup_anon_rmap_pmd does not fail for
> +			 * device private entries.
> +			 */
> +			VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
> +					  &src_folio->page, dst_vma, src_vma));
> +		}

I would appreciate if this code flow here would resemble more what we have in
copy_nonpresent_pte(), at least regarding handling of these two cases.

(e.g., dropping the VM_WARN_ON)
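
Roughly mirroring the PTE side (untested sketch; whether the is_cow_mapping()
check is wanted here as well is a separate question):

	} else if (is_device_private_entry(entry)) {
		src_folio = pfn_swap_entry_folio(entry);

		folio_get(src_folio);
		/* Cannot fail as these pages cannot get pinned. */
		folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
					    dst_vma, src_vma);

		if (is_writable_device_private_entry(entry) &&
		    is_cow_mapping(src_vma->vm_flags)) {
			entry = make_readable_device_private_entry(
						swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
	}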

> +
>   		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>   		mm_inc_nr_ptes(dst_mm);
>   		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> @@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   			folio_remove_rmap_pmd(folio, page, vma);
>   			WARN_ON_ONCE(folio_mapcount(folio) < 0);
>   			VM_BUG_ON_PAGE(!PageHead(page), page);
> -		} else if (thp_migration_supported()) {
> +		} else if (is_pmd_migration_entry(orig_pmd) ||
> +				is_pmd_device_private_entry(orig_pmd)) {
>   			swp_entry_t entry;
>   
> -			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
>   			entry = pmd_to_swp_entry(orig_pmd);
>   			folio = pfn_swap_entry_folio(entry);
>   			flush_needed = 0;
> -		} else
> -			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +
> +			if (!thp_migration_supported())
> +				WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +
> +			if (is_pmd_device_private_entry(orig_pmd)) {
> +				folio_remove_rmap_pmd(folio, &folio->page, vma);
> +				WARN_ON_ONCE(folio_mapcount(folio) < 0);

Can we just move that into the folio_is_device_private() check below.
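
I.e., something like (sketch), in the check further down:

	if (folio_is_device_private(folio)) {
		folio_remove_rmap_pmd(folio, &folio->page, vma);
		WARN_ON_ONCE(folio_mapcount(folio) < 0);
		folio_put(folio);
	}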

> +			}
> +		}
>   
>   		if (folio_test_anon(folio)) {
>   			zap_deposited_table(tlb->mm, pmd);
> @@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   				folio_mark_accessed(folio);
>   		}
>   
> +		/*
> +		 * Do a folio put on zone device private pages after
> +		 * changes to mm_counter, because the folio_put() will
> +		 * clean folio->mapping and the folio_test_anon() check
> +		 * will not be usable.
> +		 */

The comment can be dropped: it's simple, don't use "folio" after
dropping the reference when zapping.

> +		if (folio_is_device_private(folio))
> +			folio_put(folio);

> +
>   		spin_unlock(ptl);
>   		if (flush_needed)
>   			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
> @@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   		struct folio *folio = pfn_swap_entry_folio(entry);
>   		pmd_t newpmd;
>   
> -		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
> +		VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
> +			   !folio_is_device_private(folio));
>   		if (is_writable_migration_entry(entry)) {
>   			/*
>   			 * A protection check is difficult so
> @@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   			newpmd = swp_entry_to_pmd(entry);
>   			if (pmd_swp_soft_dirty(*pmd))
>   				newpmd = pmd_swp_mksoft_dirty(newpmd);
> +		} else if (is_writable_device_private_entry(entry)) {
> +			entry = make_readable_device_private_entry(
> +							swp_offset(entry));
> +			newpmd = swp_entry_to_pmd(entry);
>   		} else {
>   			newpmd = *pmd;
>   		}
> @@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   	struct page *page;
>   	pgtable_t pgtable;
>   	pmd_t old_pmd, _pmd;
> -	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
> -	bool anon_exclusive = false, dirty = false;
> +	bool young, write, soft_dirty, uffd_wp = false;
> +	bool anon_exclusive = false, dirty = false, present = false;
>   	unsigned long addr;
>   	pte_t *pte;
>   	int i;
> +	swp_entry_t swp_entry;
>   
>   	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>   	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>   	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> -	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
> +
> +	VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
> +			&& !(is_pmd_device_private_entry(*pmd)));

VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) &&
	   !(is_pmd_device_private_entry(*pmd)));


>   
>   	count_vm_event(THP_SPLIT_PMD);
>   
> @@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>   	}
>   
> -	pmd_migration = is_pmd_migration_entry(*pmd);
> -	if (unlikely(pmd_migration)) {
> -		swp_entry_t entry;
>   
> +	present = pmd_present(*pmd);
> +	if (unlikely(!present)) {
> +		swp_entry = pmd_to_swp_entry(*pmd);
>   		old_pmd = *pmd;
> -		entry = pmd_to_swp_entry(old_pmd);
> -		page = pfn_swap_entry_to_page(entry);
> -		write = is_writable_migration_entry(entry);
> -		if (PageAnon(page))
> -			anon_exclusive = is_readable_exclusive_migration_entry(entry);
> -		young = is_migration_entry_young(entry);
> -		dirty = is_migration_entry_dirty(entry);
> +
> +		folio = pfn_swap_entry_folio(swp_entry);
> +		VM_WARN_ON(!is_migration_entry(swp_entry) &&
> +				!is_device_private_entry(swp_entry));
> +		page = pfn_swap_entry_to_page(swp_entry);
> +
> +		if (is_pmd_migration_entry(old_pmd)) {
> +			write = is_writable_migration_entry(swp_entry);
> +			if (PageAnon(page))
> +				anon_exclusive =
> +					is_readable_exclusive_migration_entry(
> +								swp_entry);
> +			young = is_migration_entry_young(swp_entry);
> +			dirty = is_migration_entry_dirty(swp_entry);
> +		} else if (is_pmd_device_private_entry(old_pmd)) {
> +			write = is_writable_device_private_entry(swp_entry);
> +			anon_exclusive = PageAnonExclusive(page);
> +			if (freeze && anon_exclusive &&
> +			    folio_try_share_anon_rmap_pmd(folio, page))
> +				freeze = false;
> +			if (!freeze) {
> +				rmap_t rmap_flags = RMAP_NONE;
> +
> +				if (anon_exclusive)
> +					rmap_flags |= RMAP_EXCLUSIVE;
> +
> +				folio_ref_add(folio, HPAGE_PMD_NR - 1);
> +				if (anon_exclusive)
> +					rmap_flags |= RMAP_EXCLUSIVE;
> +				folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
> +						 vma, haddr, rmap_flags);
> +			}
> +		}

This is massive and I'll have to review it with a fresh mind later.

[...]
	put_page(page);
> @@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>   			   pmd_t *pmd, bool freeze)
>   {
> +

^ unrelated

>   	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
> -	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
> +	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
> +			(is_pmd_device_private_entry(*pmd)))

if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
     is_pmd_device_private_entry(*pmd))

>   		__split_huge_pmd_locked(vma, pmd, address, freeze);
>   }
>   
> @@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>   	VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>   	lockdep_assert_held(&lruvec->lru_lock);
>   
> +	if (folio_is_device_private(folio))
> +		return;
> +
>   	if (list) {
>   		/* page reclaim is reclaiming a huge page */
>   		VM_WARN_ON(folio_test_lru(folio));
> @@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>   			list_add_tail(&new_folio->lru, &folio->lru);
>   		folio_set_lru(new_folio);
>   	}
> +

^ unrelated

>   }
>   
>   /* Racy check whether the huge page can be split */
> @@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>   
>   	/* Prevent deferred_split_scan() touching ->_refcount */
>   	spin_lock(&ds_queue->split_queue_lock);
> -	if (folio_ref_freeze(folio, 1 + extra_pins)) {
> +	if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) {

I think I discussed that with Zi Yan and it's tricky. Such a change should go
into a separate cleanup patch.


>   		struct address_space *swap_cache = NULL;
>   		struct lruvec *lruvec;
>   		int expected_refs;
> @@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>   	if (nr_shmem_dropped)
>   		shmem_uncharge(mapping->host, nr_shmem_dropped);
>   
> -	if (!ret && is_anon)
> +	if (!ret && is_anon && !folio_is_device_private(folio))
>   		remap_flags = RMP_USE_SHARED_ZEROPAGE;
> +

^ unrelated

>   	remap_page(folio, 1 << order, remap_flags);
>   
>   	/*
> @@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>   		return 0;
>   
>   	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
> -	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
> +	if (unlikely(is_pmd_device_private_entry(*pvmw->pmd)))

Use pmd_present() instead, please. (just like in the pte code that handles this).

Why do we have to flush? pmd_clear() might be sufficient? In the PTE case we use pte_clear().

[...]

>   		pmde = pmd_mksoft_dirty(pmde);
>   	if (is_writable_migration_entry(entry))
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index e05e14d6eacd..0ed337f94fcd 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>   			 * page table entry. Other special swap entries are not
>   			 * migratable, and we ignore regular swapped page.
>   			 */
> +			struct folio *folio;
> +
>   			entry = pte_to_swp_entry(pte);
>   			if (!is_device_private_entry(entry))
>   				goto next;
> @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>   			    pgmap->owner != migrate->pgmap_owner)
>   				goto next;
>   
> +			folio = page_folio(page);
> +			if (folio_test_large(folio)) {
> +				struct folio *new_folio;
> +				struct folio *new_fault_folio = NULL;
> +
> +				/*
> +				 * The reason for finding pmd present with a
> +				 * device private pte and a large folio for the
> +				 * pte is partial unmaps. Split the folio now
> +				 * for the migration to be handled correctly
> +				 */

There are also other cases, like any VMA splits. Not sure if that makes a difference,
the folio is PTE mapped.

> +				pte_unmap_unlock(ptep, ptl);
> +
> +				folio_get(folio);
> +				if (folio != fault_folio)
> +					folio_lock(folio);
> +				if (split_folio(folio)) {
> +					if (folio != fault_folio)
> +						folio_unlock(folio);
> +					ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> +					goto next;
> +				}
> +
> +				new_folio = page_folio(page);
> +				if (fault_folio)
> +					new_fault_folio = page_folio(migrate->fault_page);
> +
> +				/*
> +				 * Ensure the lock is held on the correct
> +				 * folio after the split
> +				 */
> +				if (!new_fault_folio) {
> +					folio_unlock(folio);
> +					folio_put(folio);
> +				} else if (folio != new_fault_folio) {
> +					folio_get(new_fault_folio);
> +					folio_lock(new_fault_folio);
> +					folio_unlock(folio);
> +					folio_put(folio);
> +				}
> +
> +				addr = start;
> +				goto again;

Another thing to revisit with clean mind.

> +			}
> +
>   			mpfn = migrate_pfn(page_to_pfn(page)) |
>   					MIGRATE_PFN_MIGRATE;
>   			if (is_writable_device_private_entry(entry))
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index e981a1a292d2..246e6c211f34 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>   			pvmw->ptl = pmd_lock(mm, pvmw->pmd);
>   			pmde = *pvmw->pmd;
>   			if (!pmd_present(pmde)) {
> -				swp_entry_t entry;
> +				swp_entry_t entry = pmd_to_swp_entry(pmde);
>   
>   				if (!thp_migration_supported() ||
>   				    !(pvmw->flags & PVMW_MIGRATION))
>   					return not_found(pvmw);
> -				entry = pmd_to_swp_entry(pmde);
>   				if (!is_migration_entry(entry) ||
>   				    !check_pmd(swp_offset_pfn(entry), pvmw))
>   					return not_found(pvmw);
> @@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>   			 * cannot return prematurely, while zap_huge_pmd() has
>   			 * cleared *pmd but not decremented compound_mapcount().
>   			 */
> +			swp_entry_t entry;
> +
> +			entry = pmd_to_swp_entry(pmde);
> +
> +			if (is_device_private_entry(entry) &&
> +				(pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) {
> +				pvmw->ptl = pmd_lock(mm, pvmw->pmd);
> +				return true;
> +			}
> +
>   			if ((pvmw->flags & PVMW_SYNC) &&
>   			    thp_vma_suitable_order(vma, pvmw->address,
>   						   PMD_ORDER) &&
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 567e2d084071..604e8206a2ec 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>   		*pmdvalp = pmdval;
>   	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
>   		goto nomap;
> +	if (is_swap_pmd(pmdval)) {
> +		swp_entry_t entry = pmd_to_swp_entry(pmdval);
> +
> +		if (is_device_private_entry(entry))
> +			goto nomap;
> +	}
>   	if (unlikely(pmd_trans_huge(pmdval)))
>   		goto nomap;
>   	if (unlikely(pmd_bad(pmdval))) {
> diff --git a/mm/rmap.c b/mm/rmap.c
> index b5837075b6e0..f40e45564295 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>   		     unsigned long address, void *arg)
>   {
>   	struct mm_struct *mm = vma->vm_mm;
> -	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> +	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
> +				PVMW_THP_DEVICE_PRIVATE);
>   	bool anon_exclusive, writable, ret = true;
>   	pte_t pteval;
>   	struct page *subpage;
> @@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>   	while (page_vma_mapped_walk(&pvmw)) {
>   		/* PMD-mapped THP migration entry */
>   		if (!pvmw.pte) {
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +			unsigned long pfn;
> +#endif
> +
>   			if (flags & TTU_SPLIT_HUGE_PMD) {
>   				split_huge_pmd_locked(vma, pvmw.address,
>   						      pvmw.pmd, true);
> @@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>   				break;
>   			}
>   #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> -			subpage = folio_page(folio,
> -				pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
> +			/*
> +			 * Zone device private folios do not work well with
> +			 * pmd_pfn() on some architectures due to pte
> +			 * inversion.
> +			 */

Please use the handling for the PTE case as inspiration.

		/*
		 * Handle PFN swap PTEs, such as device-exclusive ones, that
		 * actually map pages.
		 */
		pteval = ptep_get(pvmw.pte);
		if (likely(pte_present(pteval))) {
			pfn = pte_pfn(pteval);
		} else {
			pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
		}


So I would expect here something like

		/*
		 * Handle PFN swap PTEs, such as device-exclusive ones, that
		 * actually map pages.
		 */
		pmdval = pmdp_get(pvmw.pmd);
		if (likely(pmd_present(pmdval)))
			pfn = pmd_pfn(pmdval);
		else
			pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));


> +			if (is_pmd_device_private_entry(*pvmw.pmd)) {
> +				swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
> +
> +				pfn = swp_offset_pfn(entry);
> +			} else {
> +				pfn = pmd_pfn(*pvmw.pmd);
> +			}
> +
> +			subpage = folio_page(folio, pfn - folio_pfn(folio));
> +
>   			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
>   					!folio_test_pmd_mappable(folio), folio);
>   


-- 
Cheers

David / dhildenb
Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by Balbir Singh 1 month, 1 week ago
On 8/27/25 01:19, David Hildenbrand wrote:
> On 12.08.25 04:40, Balbir Singh wrote:
>> Make THP handling code in the mm subsystem for THP pages aware of zone
>> device pages. Although the code is designed to be generic when it comes
>> to handling splitting of pages, the code is designed to work for THP
>> page sizes corresponding to HPAGE_PMD_NR.
>>
>> Modify page_vma_mapped_walk() to return true when a zone device huge
>> entry is present, enabling try_to_migrate() and other code migration
>> paths to appropriately process the entry. page_vma_mapped_walk() will
>> return true for zone device private large folios only when
>> PVMW_THP_DEVICE_PRIVATE is passed. This is to prevent locations that are
>> not zone device private pages from having to add awareness.
> 
> Please don't if avoidable.
> 
> We should already have the same problem with small zone-device private
> pages, and should have proper folio checks in place, no?
> 
> 
> [...]
> 
> This thing is huge and hard to review. Given there are subtle changes in here that
> are likely problematic, this is a problem. Is there any way to split this
> into logical chunks?
> 
> Like teaching zap, mprotect, rmap walks .... code separately.
> 
> I'm sure you'll find a way to break this down so I don't walk out of a
> review with a headache ;)
> 

:) I had smaller chunks earlier, but then ran into the "don't add the change
unless you use the change" problem.

>>     struct page_vma_mapped_walk {
>>       unsigned long pfn;
>> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
>> index 64ea151a7ae3..2641c01bd5d2 100644
>> --- a/include/linux/swapops.h
>> +++ b/include/linux/swapops.h
>> @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>>   {
>>       return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
>>   }
>> +
> 
> ^ unrelated change

Ack

> 
>>   #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>   static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>>           struct page *page)
>> @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>>   }
>>   #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>   +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
>> +
>> +static inline int is_pmd_device_private_entry(pmd_t pmd)
>> +{
>> +    return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
>> +}
>> +
>> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +
>> +static inline int is_pmd_device_private_entry(pmd_t pmd)
>> +{
>> +    return 0;
>> +}
>> +
>> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +
>>   static inline int non_swap_entry(swp_entry_t entry)
>>   {
>>       return swp_type(entry) >= MAX_SWAPFILES;
>> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
>> index 761725bc713c..297f1e034045 100644
>> --- a/lib/test_hmm.c
>> +++ b/lib/test_hmm.c
>> @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>>        * the mirror but here we use it to hold the page for the simulated
>>        * device memory and that page holds the pointer to the mirror.
>>        */
>> -    rpage = vmf->page->zone_device_data;
>> +    rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;
> 
> Can we have a wrapper please to give us the zone_device_data for a folio, so
> we have something like
> 
> rpage = folio_zone_device_data(page_folio(vmf->page));
> 

Yep, will change

>>       dmirror = rpage->zone_device_data;
>>         /* FIXME demonstrate how we can adjust migrate range */
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 9c38a95e9f09..2495e3fdbfae 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>       if (unlikely(is_swap_pmd(pmd))) {
>>           swp_entry_t entry = pmd_to_swp_entry(pmd);
>>   -        VM_BUG_ON(!is_pmd_migration_entry(pmd));
>> -        if (!is_readable_migration_entry(entry)) {
>> +        VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
>> +                !is_pmd_device_private_entry(pmd));
>> +
>> +        if (is_migration_entry(entry) &&
>> +            is_writable_migration_entry(entry)) {
>>               entry = make_readable_migration_entry(
>>                               swp_offset(entry));
> 
> Careful: There is is_readable_exclusive_migration_entry(). So don't
> change the !is_readable_migration_entry(entry) check to is_writable_migration_entry(entry),
> because it's wrong.
> 

Ack, I assume you are referring to potential prot_none entries?

>>               pmd = swp_entry_to_pmd(entry);
>> @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>                   pmd = pmd_swp_mkuffd_wp(pmd);
>>               set_pmd_at(src_mm, addr, src_pmd, pmd);
>>           }
>> +
>> +        if (is_device_private_entry(entry)) {
> 
> likely you want "else if" here.
> 

Ack

>> +            if (is_writable_device_private_entry(entry)) {
>> +                entry = make_readable_device_private_entry(
>> +                    swp_offset(entry));
>> +                pmd = swp_entry_to_pmd(entry);
>> +
>> +                if (pmd_swp_soft_dirty(*src_pmd))
>> +                    pmd = pmd_swp_mksoft_dirty(pmd);
>> +                if (pmd_swp_uffd_wp(*src_pmd))
>> +                    pmd = pmd_swp_mkuffd_wp(pmd);
>> +                set_pmd_at(src_mm, addr, src_pmd, pmd);
>> +            }
>> +
>> +            src_folio = pfn_swap_entry_folio(entry);
>> +            VM_WARN_ON(!folio_test_large(src_folio));
>> +
>> +            folio_get(src_folio);
>> +            /*
>> +             * folio_try_dup_anon_rmap_pmd does not fail for
>> +             * device private entries.
>> +             */
>> +            VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
>> +                      &src_folio->page, dst_vma, src_vma));
>> +        }
> 
> I would appreciate if this code flow here would resemble more what we have in
> copy_nonpresent_pte(), at least regarding handling of these two cases.
> 
> (e.g., dropping the VM_WARN_ON)

Ack

> 
>> +
>>           add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>           mm_inc_nr_ptes(dst_mm);
>>           pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
>> @@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>               folio_remove_rmap_pmd(folio, page, vma);
>>               WARN_ON_ONCE(folio_mapcount(folio) < 0);
>>               VM_BUG_ON_PAGE(!PageHead(page), page);
>> -        } else if (thp_migration_supported()) {
>> +        } else if (is_pmd_migration_entry(orig_pmd) ||
>> +                is_pmd_device_private_entry(orig_pmd)) {
>>               swp_entry_t entry;
>>   -            VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
>>               entry = pmd_to_swp_entry(orig_pmd);
>>               folio = pfn_swap_entry_folio(entry);
>>               flush_needed = 0;
>> -        } else
>> -            WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
>> +
>> +            if (!thp_migration_supported())
>> +                WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
>> +
>> +            if (is_pmd_device_private_entry(orig_pmd)) {
>> +                folio_remove_rmap_pmd(folio, &folio->page, vma);
>> +                WARN_ON_ONCE(folio_mapcount(folio) < 0);
> 
> Can we just move that into the folio_is_device_private() check below.

The check you mean?

> 
>> +            }
>> +        }
>>             if (folio_test_anon(folio)) {
>>               zap_deposited_table(tlb->mm, pmd);
>> @@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>                   folio_mark_accessed(folio);
>>           }
>>   +        /*
>> +         * Do a folio put on zone device private pages after
>> +         * changes to mm_counter, because the folio_put() will
>> +         * clean folio->mapping and the folio_test_anon() check
>> +         * will not be usable.
>> +         */
> 
> The comment can be dropped: it's simple, don't use "folio" after
> dropping the reference when zapping.
> 

Ack

>> +        if (folio_is_device_private(folio))
>> +            folio_put(folio);
> 
>> +
>>           spin_unlock(ptl);
>>           if (flush_needed)
>>               tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
>> @@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>           struct folio *folio = pfn_swap_entry_folio(entry);
>>           pmd_t newpmd;
>>   -        VM_BUG_ON(!is_pmd_migration_entry(*pmd));
>> +        VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
>> +               !folio_is_device_private(folio));
>>           if (is_writable_migration_entry(entry)) {
>>               /*
>>                * A protection check is difficult so
>> @@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>               newpmd = swp_entry_to_pmd(entry);
>>               if (pmd_swp_soft_dirty(*pmd))
>>                   newpmd = pmd_swp_mksoft_dirty(newpmd);
>> +        } else if (is_writable_device_private_entry(entry)) {
>> +            entry = make_readable_device_private_entry(
>> +                            swp_offset(entry));
>> +            newpmd = swp_entry_to_pmd(entry);
>>           } else {
>>               newpmd = *pmd;
>>           }
>> @@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>       struct page *page;
>>       pgtable_t pgtable;
>>       pmd_t old_pmd, _pmd;
>> -    bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
>> -    bool anon_exclusive = false, dirty = false;
>> +    bool young, write, soft_dirty, uffd_wp = false;
>> +    bool anon_exclusive = false, dirty = false, present = false;
>>       unsigned long addr;
>>       pte_t *pte;
>>       int i;
>> +    swp_entry_t swp_entry;
>>         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>>       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>> -    VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
>> +
>> +    VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
>> +            && !(is_pmd_device_private_entry(*pmd)));
> 
> VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) &&
>        !(is_pmd_device_private_entry(*pmd)));
> 
>

Ack
 

>>         count_vm_event(THP_SPLIT_PMD);
>>   @@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>           return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>       }
>>   -    pmd_migration = is_pmd_migration_entry(*pmd);
>> -    if (unlikely(pmd_migration)) {
>> -        swp_entry_t entry;
>>   +    present = pmd_present(*pmd);
>> +    if (unlikely(!present)) {
>> +        swp_entry = pmd_to_swp_entry(*pmd);
>>           old_pmd = *pmd;
>> -        entry = pmd_to_swp_entry(old_pmd);
>> -        page = pfn_swap_entry_to_page(entry);
>> -        write = is_writable_migration_entry(entry);
>> -        if (PageAnon(page))
>> -            anon_exclusive = is_readable_exclusive_migration_entry(entry);
>> -        young = is_migration_entry_young(entry);
>> -        dirty = is_migration_entry_dirty(entry);
>> +
>> +        folio = pfn_swap_entry_folio(swp_entry);
>> +        VM_WARN_ON(!is_migration_entry(swp_entry) &&
>> +                !is_device_private_entry(swp_entry));
>> +        page = pfn_swap_entry_to_page(swp_entry);
>> +
>> +        if (is_pmd_migration_entry(old_pmd)) {
>> +            write = is_writable_migration_entry(swp_entry);
>> +            if (PageAnon(page))
>> +                anon_exclusive =
>> +                    is_readable_exclusive_migration_entry(
>> +                                swp_entry);
>> +            young = is_migration_entry_young(swp_entry);
>> +            dirty = is_migration_entry_dirty(swp_entry);
>> +        } else if (is_pmd_device_private_entry(old_pmd)) {
>> +            write = is_writable_device_private_entry(swp_entry);
>> +            anon_exclusive = PageAnonExclusive(page);
>> +            if (freeze && anon_exclusive &&
>> +                folio_try_share_anon_rmap_pmd(folio, page))
>> +                freeze = false;
>> +            if (!freeze) {
>> +                rmap_t rmap_flags = RMAP_NONE;
>> +
>> +                if (anon_exclusive)
>> +                    rmap_flags |= RMAP_EXCLUSIVE;
>> +
>> +                folio_ref_add(folio, HPAGE_PMD_NR - 1);
>> +                if (anon_exclusive)
>> +                    rmap_flags |= RMAP_EXCLUSIVE;
>> +                folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
>> +                         vma, haddr, rmap_flags);
>> +            }
>> +        }
> 
> This is massive and I'll have to review it with a fresh mind later.

It is similar to what we do for non-device-private folios: when we map/unmap the entire
folio during migration, the new fresh folios/pages should be marked as anon exclusive.
But please do check.

> 
> [...]
>     put_page(page);
>> @@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>   void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>                  pmd_t *pmd, bool freeze)
>>   {
>> +
> 
> ^ unrelated
> 

Ack

>>       VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
>> -    if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
>> +    if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
>> +            (is_pmd_device_private_entry(*pmd)))
> 
> if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
>     is_pmd_device_private_entry(*pmd))
> 
>>           __split_huge_pmd_locked(vma, pmd, address, freeze);
>>   }
>>   @@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>       VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>>       lockdep_assert_held(&lruvec->lru_lock);
>>   +    if (folio_is_device_private(folio))
>> +        return;
>> +
>>       if (list) {
>>           /* page reclaim is reclaiming a huge page */
>>           VM_WARN_ON(folio_test_lru(folio));
>> @@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>               list_add_tail(&new_folio->lru, &folio->lru);
>>           folio_set_lru(new_folio);
>>       }
>> +
> 
> ^ unrelated
> 

Ack

>>   }
>>     /* Racy check whether the huge page can be split */
>> @@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>         /* Prevent deferred_split_scan() touching ->_refcount */
>>       spin_lock(&ds_queue->split_queue_lock);
>> -    if (folio_ref_freeze(folio, 1 + extra_pins)) {
>> +    if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) {
> 
> I think I discussed that with Zi Yan and it's tricky. Such a change should go
> into a separate cleanup patch.
> 

Ack, I'll split it up as needed. The reason this is here is that large folios that
have been partially split (pmd split) have a count of nr_pages + 1 on pmd_split and after
the split the map count falls, but never goes to 0 as the ref_freeze code expects.

> 
>>           struct address_space *swap_cache = NULL;
>>           struct lruvec *lruvec;
>>           int expected_refs;
>> @@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>       if (nr_shmem_dropped)
>>           shmem_uncharge(mapping->host, nr_shmem_dropped);
>>   -    if (!ret && is_anon)
>> +    if (!ret && is_anon && !folio_is_device_private(folio))
>>           remap_flags = RMP_USE_SHARED_ZEROPAGE;
>> +
> 
> ^ unrelated

Ack

> 
>>       remap_page(folio, 1 << order, remap_flags);
>>         /*
>> @@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>>           return 0;
>>         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
>> -    pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
>> +    if (unlikely(is_pmd_device_private_entry(*pvmw->pmd)))
> 
> Use pmd_present() instead, please. (just like in the pte code that handles this).
> 

Ack

> Why do we have to flush? pmd_clear() might be sufficient? In the PTE case we use pte_clear().

Without the flush, other entities will not see the cleared pmd. And isn't pte_clear()
used only when should_defer_flush() is true?

> 
> [...]
> 
>>           pmde = pmd_mksoft_dirty(pmde);
>>       if (is_writable_migration_entry(entry))
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index e05e14d6eacd..0ed337f94fcd 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>                * page table entry. Other special swap entries are not
>>                * migratable, and we ignore regular swapped page.
>>                */
>> +            struct folio *folio;
>> +
>>               entry = pte_to_swp_entry(pte);
>>               if (!is_device_private_entry(entry))
>>                   goto next;
>> @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>                   pgmap->owner != migrate->pgmap_owner)
>>                   goto next;
>>   +            folio = page_folio(page);
>> +            if (folio_test_large(folio)) {
>> +                struct folio *new_folio;
>> +                struct folio *new_fault_folio = NULL;
>> +
>> +                /*
>> +                 * The reason for finding pmd present with a
>> +                 * device private pte and a large folio for the
>> +                 * pte is partial unmaps. Split the folio now
>> +                 * for the migration to be handled correctly
>> +                 */
> 
> There are also other cases, like any VMA splits. Not sure if that makes a difference,
> the folio is PTE mapped.
> 

Ack, I can clarify that the folio is just pte mapped or remove the comment

>> +                pte_unmap_unlock(ptep, ptl);
>> +
>> +                folio_get(folio);
>> +                if (folio != fault_folio)
>> +                    folio_lock(folio);
>> +                if (split_folio(folio)) {
>> +                    if (folio != fault_folio)
>> +                        folio_unlock(folio);
>> +                    ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>> +                    goto next;
>> +                }
>> +
>> +                new_folio = page_folio(page);
>> +                if (fault_folio)
>> +                    new_fault_folio = page_folio(migrate->fault_page);
>> +
>> +                /*
>> +                 * Ensure the lock is held on the correct
>> +                 * folio after the split
>> +                 */
>> +                if (!new_fault_folio) {
>> +                    folio_unlock(folio);
>> +                    folio_put(folio);
>> +                } else if (folio != new_fault_folio) {
>> +                    folio_get(new_fault_folio);
>> +                    folio_lock(new_fault_folio);
>> +                    folio_unlock(folio);
>> +                    folio_put(folio);
>> +                }
>> +
>> +                addr = start;
>> +                goto again;
> 
> Another thing to revisit with clean mind.
> 

Sure

>> +            }
>> +
>>               mpfn = migrate_pfn(page_to_pfn(page)) |
>>                       MIGRATE_PFN_MIGRATE;
>>               if (is_writable_device_private_entry(entry))
>> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
>> index e981a1a292d2..246e6c211f34 100644
>> --- a/mm/page_vma_mapped.c
>> +++ b/mm/page_vma_mapped.c
>> @@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>               pvmw->ptl = pmd_lock(mm, pvmw->pmd);
>>               pmde = *pvmw->pmd;
>>               if (!pmd_present(pmde)) {
>> -                swp_entry_t entry;
>> +                swp_entry_t entry = pmd_to_swp_entry(pmde);
>>                     if (!thp_migration_supported() ||
>>                       !(pvmw->flags & PVMW_MIGRATION))
>>                       return not_found(pvmw);
>> -                entry = pmd_to_swp_entry(pmde);
>>                   if (!is_migration_entry(entry) ||
>>                       !check_pmd(swp_offset_pfn(entry), pvmw))
>>                       return not_found(pvmw);
>> @@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>>                * cannot return prematurely, while zap_huge_pmd() has
>>                * cleared *pmd but not decremented compound_mapcount().
>>                */
>> +            swp_entry_t entry;
>> +
>> +            entry = pmd_to_swp_entry(pmde);
>> +
>> +            if (is_device_private_entry(entry) &&
>> +                (pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) {
>> +                pvmw->ptl = pmd_lock(mm, pvmw->pmd);
>> +                return true;
>> +            }
>> +
>>               if ((pvmw->flags & PVMW_SYNC) &&
>>                   thp_vma_suitable_order(vma, pvmw->address,
>>                              PMD_ORDER) &&
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 567e2d084071..604e8206a2ec 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>>           *pmdvalp = pmdval;
>>       if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
>>           goto nomap;
>> +    if (is_swap_pmd(pmdval)) {
>> +        swp_entry_t entry = pmd_to_swp_entry(pmdval);
>> +
>> +        if (is_device_private_entry(entry))
>> +            goto nomap;
>> +    }
>>       if (unlikely(pmd_trans_huge(pmdval)))
>>           goto nomap;
>>       if (unlikely(pmd_bad(pmdval))) {
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index b5837075b6e0..f40e45564295 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>>                unsigned long address, void *arg)
>>   {
>>       struct mm_struct *mm = vma->vm_mm;
>> -    DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
>> +    DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
>> +                PVMW_THP_DEVICE_PRIVATE);
>>       bool anon_exclusive, writable, ret = true;
>>       pte_t pteval;
>>       struct page *subpage;
>> @@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>>       while (page_vma_mapped_walk(&pvmw)) {
>>           /* PMD-mapped THP migration entry */
>>           if (!pvmw.pte) {
>> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>> +            unsigned long pfn;
>> +#endif
>> +
>>               if (flags & TTU_SPLIT_HUGE_PMD) {
>>                   split_huge_pmd_locked(vma, pvmw.address,
>>                                 pvmw.pmd, true);
>> @@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>>                   break;
>>               }
>>   #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>> -            subpage = folio_page(folio,
>> -                pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
>> +            /*
>> +             * Zone device private folios do not work well with
>> +             * pmd_pfn() on some architectures due to pte
>> +             * inversion.
>> +             */
> 
> Please use the handling for the PTE case as inspiration.
> 
>         /*
>          * Handle PFN swap PTEs, such as device-exclusive ones, that
>          * actually map pages.
>          */
>         pteval = ptep_get(pvmw.pte);
>         if (likely(pte_present(pteval))) {
>             pfn = pte_pfn(pteval);
>         } else {
>             pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
>             VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
>         }
> 
> 
> So I would expect here something like
> 
>         /*
>          * Handle PFN swap PTEs, such as device-exclusive ones, that
>          * actually map pages.
>          */
>         pmdval = pmdp_get(pvmw.pmd);
>         if (likely(pmd_present(pmdval)))
>             pfn = pmd_pfn(pmdval);
>         else
>             pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));
> 
> 

I can switch over to pmd_present for the checks
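
Something along these lines, following the PTE-side pattern you quoted (untested sketch):

			/*
			 * Handle PFN swap PMDs, such as device-private ones,
			 * that actually map pages.
			 */
			pmd_t pmdval = pmdp_get(pvmw.pmd);

			if (likely(pmd_present(pmdval)))
				pfn = pmd_pfn(pmdval);
			else
				pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));

			subpage = folio_page(folio, pfn - folio_pfn(folio));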

>> +            if (is_pmd_device_private_entry(*pvmw.pmd)) {
>> +                swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
>> +
>> +                pfn = swp_offset_pfn(entry);
>> +            } else {
>> +                pfn = pmd_pfn(*pvmw.pmd);
>> +            }
>> +
>> +            subpage = folio_page(folio, pfn - folio_pfn(folio));
>> +
>>               VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
>>                       !folio_test_pmd_mappable(folio), folio);
>>   
> 
> 


Thanks for the review
Balbir
Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by David Hildenbrand 1 month, 1 week ago
>> Like teaching zap, mprotect, rmap walks .... code separately.
>>
>> I'm sure you'll find a way to break this down so I don't walk out of a
>> review with a headache ;)
>>
> 
> :) I had smaller chunks earlier, but then ran into the "don't add the change unless you
> use the change" problem
> 

It's perfectly reasonable to have something like

mm/huge_memory: teach copy_huge_pmd() about huge device-private entries
mm/huge_memory: support splitting device-private folios

...

etc :)

[...]

>> Careful: There is is_readable_exclusive_migration_entry(). So don't
>> change the !is_readable_migration_entry(entry) check to is_writable_migration_entry(entry),
>> because it's wrong.
>>
> 
> Ack, I assume you are referring to potential prot_none entries?

readable_exclusive entries are used to maintain the PageAnonExclusive bit
for migration entries right now, so it's not related to prot_none.
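
For reference, the check in the copy path (copy_huge_pmd()) looks roughly like
this from memory (sketch), which is why it has to stay
!is_readable_migration_entry():

	if (!is_readable_migration_entry(entry)) {
		/*
		 * Writable and readable-exclusive entries both take this
		 * branch and get downgraded to plain readable;
		 * is_writable_migration_entry() alone would miss the
		 * readable-exclusive case.
		 */
		entry = make_readable_migration_entry(swp_offset(entry));
		pmd = swp_entry_to_pmd(entry);
		set_pmd_at(src_mm, addr, src_pmd, pmd);
	}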

[...]

>>> -            WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
>>> +
>>> +            if (!thp_migration_supported())
>>> +                WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
>>> +
>>> +            if (is_pmd_device_private_entry(orig_pmd)) {
>>> +                folio_remove_rmap_pmd(folio, &folio->page, vma);
>>> +                WARN_ON_ONCE(folio_mapcount(folio) < 0);
>>
>> Can we just move that into the folio_is_device_private() check below?
> 
> The check you mean?

The whole thing, something like:

if (...) {
	folio_remove_rmap_pmd(folio, &folio->page, vma);
	WARN_ON_ONCE(folio_mapcount(folio) < 0);
	folio_put(folio);
}


[...]

> 
>> Why do we have to flush? pmd_clear() might be sufficient? In the PTE case we use pte_clear().
> 
> Without the flush, other entities will not see the cleared pmd. And isn't pte_clear() used
> only when should_defer_flush() is true?

It's a non-present page entry, so there should be no TLB entry to flush.
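
I.e. a plain clear should do for that spot (sketch, assuming this is the
non-present device-private branch in try_to_migrate_one()):

		/* Non-present device-private entry: no TLB state to invalidate */
		pmd_clear(pvmw.pmd);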

> 
>>
>> [...]
>>
>>>            pmde = pmd_mksoft_dirty(pmde);
>>>        if (is_writable_migration_entry(entry))
>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>> index e05e14d6eacd..0ed337f94fcd 100644
>>> --- a/mm/migrate_device.c
>>> +++ b/mm/migrate_device.c
>>> @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>>                 * page table entry. Other special swap entries are not
>>>                 * migratable, and we ignore regular swapped page.
>>>                 */
>>> +            struct folio *folio;
>>> +
>>>                entry = pte_to_swp_entry(pte);
>>>                if (!is_device_private_entry(entry))
>>>                    goto next;
>>> @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>>                    pgmap->owner != migrate->pgmap_owner)
>>>                    goto next;
>>>    +            folio = page_folio(page);
>>> +            if (folio_test_large(folio)) {
>>> +                struct folio *new_folio;
>>> +                struct folio *new_fault_folio = NULL;
>>> +
>>> +                /*
>>> +                 * The reason for finding pmd present with a
>>> +                 * device private pte and a large folio for the
>>> +                 * pte is partial unmaps. Split the folio now
>>> +                 * for the migration to be handled correctly
>>> +                 */
>>
>> There are also other cases, like any VMA splits. Not sure if that makes a difference,
>> the folio is PTE mapped.
>>
> 
> Ack, I can clarify that the folio is just pte mapped or remove the comment

Sounds good.


-- 
Cheers

David / dhildenb

Re: [v3 02/11] mm/thp: zone_device awareness in THP handling code
Posted by kernel test robot 1 month, 3 weeks ago
Hi Balbir,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Balbir-Singh/mm-zone_device-support-large-zone-device-private-folios/20250812-105145
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20250812024036.690064-3-balbirs%40nvidia.com
patch subject: [v3 02/11] mm/thp: zone_device awareness in THP handling code
config: x86_64-randconfig-001-20250812 (https://download.01.org/0day-ci/archive/20250812/202508122212.qjsfr5Wf-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250812/202508122212.qjsfr5Wf-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202508122212.qjsfr5Wf-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/migrate_device.c:154:19: warning: variable 'new_folio' set but not used [-Wunused-but-set-variable]
     154 |                                 struct folio *new_folio;
         |                                               ^
   1 warning generated.


vim +/new_folio +154 mm/migrate_device.c

    56	
    57	static int migrate_vma_collect_pmd(pmd_t *pmdp,
    58					   unsigned long start,
    59					   unsigned long end,
    60					   struct mm_walk *walk)
    61	{
    62		struct migrate_vma *migrate = walk->private;
    63		struct folio *fault_folio = migrate->fault_page ?
    64			page_folio(migrate->fault_page) : NULL;
    65		struct vm_area_struct *vma = walk->vma;
    66		struct mm_struct *mm = vma->vm_mm;
    67		unsigned long addr = start, unmapped = 0;
    68		spinlock_t *ptl;
    69		pte_t *ptep;
    70	
    71	again:
    72		if (pmd_none(*pmdp))
    73			return migrate_vma_collect_hole(start, end, -1, walk);
    74	
    75		if (pmd_trans_huge(*pmdp)) {
    76			struct folio *folio;
    77	
    78			ptl = pmd_lock(mm, pmdp);
    79			if (unlikely(!pmd_trans_huge(*pmdp))) {
    80				spin_unlock(ptl);
    81				goto again;
    82			}
    83	
    84			folio = pmd_folio(*pmdp);
    85			if (is_huge_zero_folio(folio)) {
    86				spin_unlock(ptl);
    87				split_huge_pmd(vma, pmdp, addr);
    88			} else {
    89				int ret;
    90	
    91				folio_get(folio);
    92				spin_unlock(ptl);
    93				/* FIXME: we don't expect THP for fault_folio */
    94				if (WARN_ON_ONCE(fault_folio == folio))
    95					return migrate_vma_collect_skip(start, end,
    96									walk);
    97				if (unlikely(!folio_trylock(folio)))
    98					return migrate_vma_collect_skip(start, end,
    99									walk);
   100				ret = split_folio(folio);
   101				if (fault_folio != folio)
   102					folio_unlock(folio);
   103				folio_put(folio);
   104				if (ret)
   105					return migrate_vma_collect_skip(start, end,
   106									walk);
   107			}
   108		}
   109	
   110		ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
   111		if (!ptep)
   112			goto again;
   113		arch_enter_lazy_mmu_mode();
   114	
   115		for (; addr < end; addr += PAGE_SIZE, ptep++) {
   116			struct dev_pagemap *pgmap;
   117			unsigned long mpfn = 0, pfn;
   118			struct folio *folio;
   119			struct page *page;
   120			swp_entry_t entry;
   121			pte_t pte;
   122	
   123			pte = ptep_get(ptep);
   124	
   125			if (pte_none(pte)) {
   126				if (vma_is_anonymous(vma)) {
   127					mpfn = MIGRATE_PFN_MIGRATE;
   128					migrate->cpages++;
   129				}
   130				goto next;
   131			}
   132	
   133			if (!pte_present(pte)) {
   134				/*
   135				 * Only care about unaddressable device page special
   136				 * page table entry. Other special swap entries are not
   137				 * migratable, and we ignore regular swapped page.
   138				 */
   139				struct folio *folio;
   140	
   141				entry = pte_to_swp_entry(pte);
   142				if (!is_device_private_entry(entry))
   143					goto next;
   144	
   145				page = pfn_swap_entry_to_page(entry);
   146				pgmap = page_pgmap(page);
   147				if (!(migrate->flags &
   148					MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
   149				    pgmap->owner != migrate->pgmap_owner)
   150					goto next;
   151	
   152				folio = page_folio(page);
   153				if (folio_test_large(folio)) {
 > 154					struct folio *new_folio;
   155					struct folio *new_fault_folio = NULL;
   156	
   157					/*
   158					 * The reason for finding pmd present with a
   159					 * device private pte and a large folio for the
   160					 * pte is partial unmaps. Split the folio now
   161					 * for the migration to be handled correctly
   162					 */
   163					pte_unmap_unlock(ptep, ptl);
   164	
   165					folio_get(folio);
   166					if (folio != fault_folio)
   167						folio_lock(folio);
   168					if (split_folio(folio)) {
   169						if (folio != fault_folio)
   170							folio_unlock(folio);
   171						ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
   172						goto next;
   173					}
   174	
   175					new_folio = page_folio(page);
   176					if (fault_folio)
   177						new_fault_folio = page_folio(migrate->fault_page);
   178	
   179					/*
   180					 * Ensure the lock is held on the correct
   181					 * folio after the split
   182					 */
   183					if (!new_fault_folio) {
   184						folio_unlock(folio);
   185						folio_put(folio);
   186					} else if (folio != new_fault_folio) {
   187						folio_get(new_fault_folio);
   188						folio_lock(new_fault_folio);
   189						folio_unlock(folio);
   190						folio_put(folio);
   191					}
   192	
   193					addr = start;
   194					goto again;
   195				}
   196	
   197				mpfn = migrate_pfn(page_to_pfn(page)) |
   198						MIGRATE_PFN_MIGRATE;
   199				if (is_writable_device_private_entry(entry))
   200					mpfn |= MIGRATE_PFN_WRITE;
   201			} else {
   202				pfn = pte_pfn(pte);
   203				if (is_zero_pfn(pfn) &&
   204				    (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
   205					mpfn = MIGRATE_PFN_MIGRATE;
   206					migrate->cpages++;
   207					goto next;
   208				}
   209				page = vm_normal_page(migrate->vma, addr, pte);
   210				if (page && !is_zone_device_page(page) &&
   211				    !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
   212					goto next;
   213				} else if (page && is_device_coherent_page(page)) {
   214					pgmap = page_pgmap(page);
   215	
   216					if (!(migrate->flags &
   217						MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
   218						pgmap->owner != migrate->pgmap_owner)
   219						goto next;
   220				}
   221				mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
   222				mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
   223			}
   224	
   225			/* FIXME support THP */
   226			if (!page || !page->mapping || PageTransCompound(page)) {
   227				mpfn = 0;
   228				goto next;
   229			}
   230	
   231			/*
   232			 * By getting a reference on the folio we pin it and that blocks
   233			 * any kind of migration. Side effect is that it "freezes" the
   234			 * pte.
   235			 *
   236			 * We drop this reference after isolating the folio from the lru
   237			 * for non device folio (device folio are not on the lru and thus
   238			 * can't be dropped from it).
   239			 */
   240			folio = page_folio(page);
   241			folio_get(folio);
   242	
   243			/*
   244			 * We rely on folio_trylock() to avoid deadlock between
   245			 * concurrent migrations where each is waiting on the others
   246			 * folio lock. If we can't immediately lock the folio we fail this
   247			 * migration as it is only best effort anyway.
   248			 *
   249			 * If we can lock the folio it's safe to set up a migration entry
   250			 * now. In the common case where the folio is mapped once in a
   251			 * single process setting up the migration entry now is an
   252			 * optimisation to avoid walking the rmap later with
   253			 * try_to_migrate().
   254			 */
   255			if (fault_folio == folio || folio_trylock(folio)) {
   256				bool anon_exclusive;
   257				pte_t swp_pte;
   258	
   259				flush_cache_page(vma, addr, pte_pfn(pte));
   260				anon_exclusive = folio_test_anon(folio) &&
   261						  PageAnonExclusive(page);
   262				if (anon_exclusive) {
   263					pte = ptep_clear_flush(vma, addr, ptep);
   264	
   265					if (folio_try_share_anon_rmap_pte(folio, page)) {
   266						set_pte_at(mm, addr, ptep, pte);
   267						if (fault_folio != folio)
   268							folio_unlock(folio);
   269						folio_put(folio);
   270						mpfn = 0;
   271						goto next;
   272					}
   273				} else {
   274					pte = ptep_get_and_clear(mm, addr, ptep);
   275				}
   276	
   277				migrate->cpages++;
   278	
   279				/* Set the dirty flag on the folio now the pte is gone. */
   280				if (pte_dirty(pte))
   281					folio_mark_dirty(folio);
   282	
   283				/* Setup special migration page table entry */
   284				if (mpfn & MIGRATE_PFN_WRITE)
   285					entry = make_writable_migration_entry(
   286								page_to_pfn(page));
   287				else if (anon_exclusive)
   288					entry = make_readable_exclusive_migration_entry(
   289								page_to_pfn(page));
   290				else
   291					entry = make_readable_migration_entry(
   292								page_to_pfn(page));
   293				if (pte_present(pte)) {
   294					if (pte_young(pte))
   295						entry = make_migration_entry_young(entry);
   296					if (pte_dirty(pte))
   297						entry = make_migration_entry_dirty(entry);
   298				}
   299				swp_pte = swp_entry_to_pte(entry);
   300				if (pte_present(pte)) {
   301					if (pte_soft_dirty(pte))
   302						swp_pte = pte_swp_mksoft_dirty(swp_pte);
   303					if (pte_uffd_wp(pte))
   304						swp_pte = pte_swp_mkuffd_wp(swp_pte);
   305				} else {
   306					if (pte_swp_soft_dirty(pte))
   307						swp_pte = pte_swp_mksoft_dirty(swp_pte);
   308					if (pte_swp_uffd_wp(pte))
   309						swp_pte = pte_swp_mkuffd_wp(swp_pte);
   310				}
   311				set_pte_at(mm, addr, ptep, swp_pte);
   312	
   313				/*
   314				 * This is like regular unmap: we remove the rmap and
   315				 * drop the folio refcount. The folio won't be freed, as
   316				 * we took a reference just above.
   317				 */
   318				folio_remove_rmap_pte(folio, page, vma);
   319				folio_put(folio);
   320	
   321				if (pte_present(pte))
   322					unmapped++;
   323			} else {
   324				folio_put(folio);
   325				mpfn = 0;
   326			}
   327	
   328	next:
   329			migrate->dst[migrate->npages] = 0;
   330			migrate->src[migrate->npages++] = mpfn;
   331		}
   332	
   333		/* Only flush the TLB if we actually modified any entries */
   334		if (unmapped)
   335			flush_tlb_range(walk->vma, start, end);
   336	
   337		arch_leave_lazy_mmu_mode();
   338		pte_unmap_unlock(ptep - 1, ptl);
   339	
   340		return 0;
   341	}
   342	
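A minimal way to silence the warning would be to drop the unused local and its
dead assignment (sketch only; whether new_folio was instead meant to be used in
the "folio != new_fault_folio" comparison is a question for the patch author):

			folio = page_folio(page);
			if (folio_test_large(folio)) {
				struct folio *new_fault_folio = NULL;

				/* Split the PTE-mapped large folio before migrating it */
				pte_unmap_unlock(ptep, ptl);

				folio_get(folio);
				if (folio != fault_folio)
					folio_lock(folio);
				if (split_folio(folio)) {
					if (folio != fault_folio)
						folio_unlock(folio);
					ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
					goto next;
				}

				if (fault_folio)
					new_fault_folio = page_folio(migrate->fault_page);

				/*
				 * Ensure the lock is held on the correct
				 * folio after the split
				 */
				if (!new_fault_folio) {
					folio_unlock(folio);
					folio_put(folio);
				} else if (folio != new_fault_folio) {
					folio_get(new_fault_folio);
					folio_lock(new_fault_folio);
					folio_unlock(folio);
					folio_put(folio);
				}

				addr = start;
				goto again;
			}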

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki