Make the THP handling code in the mm subsystem aware of zone device
pages. Although the splitting code is written to be generic, it is only
expected to work for THP page sizes corresponding to HPAGE_PMD_NR.
Modify page_vma_mapped_walk() to return true when a zone device huge
entry is present, enabling try_to_migrate() and other migration code
paths to process the entry appropriately. page_vma_mapped_walk() returns
true for zone device private large folios only when
PVMW_THP_DEVICE_PRIVATE is passed, so that callers which do not deal
with zone device private pages need not add any awareness of them. The
key callback that needs this flag is try_to_migrate_one(). The other
callers, page idle tracking and DAMON, use the walk to harvest
young/dirty bits, which is not significant at the pmd level for these
entries.
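
A condensed sketch of how the flag is wired up (simplified from the
mm/rmap.c and mm/page_vma_mapped.c hunks below):

	/* try_to_migrate_one() opts in to device private THP entries */
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
			      PVMW_THP_DEVICE_PRIVATE);

	/* in page_vma_mapped_walk(), for a non-present huge pmd */
	swp_entry_t entry = pmd_to_swp_entry(pmde);

	if (is_device_private_entry(entry) &&
	    (pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) {
		pvmw->ptl = pmd_lock(mm, pvmw->pmd);
		return true;	/* caller sees the device private PMD */
	}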
pmd_pfn() does not work well with zone device entries; derive the pfn
from the swap entry (swp_offset_pfn()) instead when checking and
comparing zone device entries.
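
For example, in try_to_migrate_one() (taken from the mm/rmap.c hunk
below):

	if (is_pmd_device_private_entry(*pvmw.pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);

		pfn = swp_offset_pfn(entry);
	} else {
		pfn = pmd_pfn(*pvmw.pmd);
	}

	subpage = folio_page(folio, pfn - folio_pfn(folio));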
Support partial unmapping of zone device private entries, which happens
via munmap(). munmap() causes the device private entry pmd to be split,
but the corresponding folio is not split. Deferred split does not work for
zone device private folios due to the need to split during fault
handling. Get migrate_vma_collect_pmd() to handle this case by splitting
partially unmapped device private folios.
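
A condensed sketch of that split in migrate_vma_collect_pmd() (from the
mm/migrate_device.c hunk below; the fault_folio re-locking details are
omitted):

	folio = page_folio(page);
	if (folio_test_large(folio)) {
		/* partial unmap left a device private pte backed by a
		 * large folio: split before collecting
		 */
		pte_unmap_unlock(ptep, ptl);

		folio_get(folio);
		if (folio != fault_folio)
			folio_lock(folio);
		if (split_folio(folio)) {
			/* split failed: remap the pte and skip it */
			if (folio != fault_folio)
				folio_unlock(folio);
			ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
			goto next;
		}

		/* restart the walk so the split pages are collected
		 * individually
		 */
		addr = start;
		goto again;
	}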
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
include/linux/rmap.h | 2 +
include/linux/swapops.h | 17 ++++
lib/test_hmm.c | 2 +-
mm/huge_memory.c | 214 +++++++++++++++++++++++++++++++---------
mm/migrate_device.c | 47 +++++++++
mm/page_vma_mapped.c | 13 ++-
mm/pgtable-generic.c | 6 ++
mm/rmap.c | 24 ++++-
8 files changed, 272 insertions(+), 53 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6cd020eea37a..dfb7aae3d77b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -927,6 +927,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
#define PVMW_SYNC (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION (1 << 1)
+/* Look for device private THP entries */
+#define PVMW_THP_DEVICE_PRIVATE (1 << 2)
struct page_vma_mapped_walk {
unsigned long pfn;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 64ea151a7ae3..2641c01bd5d2 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
{
return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
}
+
#else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
@@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
}
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+ return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
+}
+
+#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+ return 0;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
static inline int non_swap_entry(swp_entry_t entry)
{
return swp_type(entry) >= MAX_SWAPFILES;
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 761725bc713c..297f1e034045 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
* the mirror but here we use it to hold the page for the simulated
* device memory and that page holds the pointer to the mirror.
*/
- rpage = vmf->page->zone_device_data;
+ rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;
dmirror = rpage->zone_device_data;
/* FIXME demonstrate how we can adjust migrate range */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..2495e3fdbfae 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
- VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (!is_readable_migration_entry(entry)) {
+ VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
+ !is_pmd_device_private_entry(pmd));
+
+ if (is_migration_entry(entry) &&
+ is_writable_migration_entry(entry)) {
entry = make_readable_migration_entry(
swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
@@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
+
+ if (is_device_private_entry(entry)) {
+ if (is_writable_device_private_entry(entry)) {
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
+ pmd = swp_entry_to_pmd(entry);
+
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
+
+ src_folio = pfn_swap_entry_folio(entry);
+ VM_WARN_ON(!folio_test_large(src_folio));
+
+ folio_get(src_folio);
+ /*
+ * folio_try_dup_anon_rmap_pmd does not fail for
+ * device private entries.
+ */
+ VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
+ &src_folio->page, dst_vma, src_vma));
+ }
+
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_remove_rmap_pmd(folio, page, vma);
WARN_ON_ONCE(folio_mapcount(folio) < 0);
VM_BUG_ON_PAGE(!PageHead(page), page);
- } else if (thp_migration_supported()) {
+ } else if (is_pmd_migration_entry(orig_pmd) ||
+ is_pmd_device_private_entry(orig_pmd)) {
swp_entry_t entry;
- VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
folio = pfn_swap_entry_folio(entry);
flush_needed = 0;
- } else
- WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+ if (!thp_migration_supported())
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+ if (is_pmd_device_private_entry(orig_pmd)) {
+ folio_remove_rmap_pmd(folio, &folio->page, vma);
+ WARN_ON_ONCE(folio_mapcount(folio) < 0);
+ }
+ }
if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
@@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_mark_accessed(folio);
}
+ /*
+ * Do a folio put on zone device private pages after
+ * changes to mm_counter, because the folio_put() will
+ * clean folio->mapping and the folio_test_anon() check
+ * will not be usable.
+ */
+ if (folio_is_device_private(folio))
+ folio_put(folio);
+
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct folio *folio = pfn_swap_entry_folio(entry);
pmd_t newpmd;
- VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+ VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
+ !folio_is_device_private(folio));
if (is_writable_migration_entry(entry)) {
/*
* A protection check is difficult so
@@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
+ } else if (is_writable_device_private_entry(entry)) {
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
+ newpmd = swp_entry_to_pmd(entry);
} else {
newpmd = *pmd;
}
@@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
- bool anon_exclusive = false, dirty = false;
+ bool young, write, soft_dirty, uffd_wp = false;
+ bool anon_exclusive = false, dirty = false, present = false;
unsigned long addr;
pte_t *pte;
int i;
+ swp_entry_t swp_entry;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
+
+ VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+ && !(is_pmd_device_private_entry(*pmd)));
count_vm_event(THP_SPLIT_PMD);
@@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- pmd_migration = is_pmd_migration_entry(*pmd);
- if (unlikely(pmd_migration)) {
- swp_entry_t entry;
+ present = pmd_present(*pmd);
+ if (unlikely(!present)) {
+ swp_entry = pmd_to_swp_entry(*pmd);
old_pmd = *pmd;
- entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
- write = is_writable_migration_entry(entry);
- if (PageAnon(page))
- anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = is_migration_entry_young(entry);
- dirty = is_migration_entry_dirty(entry);
+
+ folio = pfn_swap_entry_folio(swp_entry);
+ VM_WARN_ON(!is_migration_entry(swp_entry) &&
+ !is_device_private_entry(swp_entry));
+ page = pfn_swap_entry_to_page(swp_entry);
+
+ if (is_pmd_migration_entry(old_pmd)) {
+ write = is_writable_migration_entry(swp_entry);
+ if (PageAnon(page))
+ anon_exclusive =
+ is_readable_exclusive_migration_entry(
+ swp_entry);
+ young = is_migration_entry_young(swp_entry);
+ dirty = is_migration_entry_dirty(swp_entry);
+ } else if (is_pmd_device_private_entry(old_pmd)) {
+ write = is_writable_device_private_entry(swp_entry);
+ anon_exclusive = PageAnonExclusive(page);
+ if (freeze && anon_exclusive &&
+ folio_try_share_anon_rmap_pmd(folio, page))
+ freeze = false;
+ if (!freeze) {
+ rmap_t rmap_flags = RMAP_NONE;
+
+ if (anon_exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ if (anon_exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+ folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+ vma, haddr, rmap_flags);
+ }
+ }
+
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
@@ -2996,30 +3076,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* Note that NUMA hinting access restrictions are not transferred to
* avoid any possibility of altering permissions across VMAs.
*/
- if (freeze || pmd_migration) {
+ if (freeze || !present) {
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
pte_t entry;
- swp_entry_t swp_entry;
-
- if (write)
- swp_entry = make_writable_migration_entry(
- page_to_pfn(page + i));
- else if (anon_exclusive)
- swp_entry = make_readable_exclusive_migration_entry(
- page_to_pfn(page + i));
- else
- swp_entry = make_readable_migration_entry(
- page_to_pfn(page + i));
- if (young)
- swp_entry = make_migration_entry_young(swp_entry);
- if (dirty)
- swp_entry = make_migration_entry_dirty(swp_entry);
- entry = swp_entry_to_pte(swp_entry);
- if (soft_dirty)
- entry = pte_swp_mksoft_dirty(entry);
- if (uffd_wp)
- entry = pte_swp_mkuffd_wp(entry);
-
+ if (freeze || is_migration_entry(swp_entry)) {
+ if (write)
+ swp_entry = make_writable_migration_entry(
+ page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_migration_entry(
+ page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ } else {
+ /*
+ * anon_exclusive was already propagated to the relevant
+ * pages corresponding to the pte entries when freeze
+ * is false.
+ */
+ if (write)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page + i));
+ /*
+			 * Young and dirty bits are not propagated via swp_entry
+ */
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ }
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
set_pte_at(mm, addr, pte + i, entry);
}
@@ -3046,7 +3145,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
pte_unmap(pte);
- if (!pmd_migration)
+ if (present)
folio_remove_rmap_pmd(folio, page, vma);
if (freeze)
put_page(page);
@@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze)
{
+
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
+ if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
+ (is_pmd_device_private_entry(*pmd)))
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
@@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
+ if (folio_is_device_private(folio))
+ return;
+
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
@@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
list_add_tail(&new_folio->lru, &folio->lru);
folio_set_lru(new_folio);
}
+
}
/* Racy check whether the huge page can be split */
@@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) {
struct address_space *swap_cache = NULL;
struct lruvec *lruvec;
int expected_refs;
@@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
if (nr_shmem_dropped)
shmem_uncharge(mapping->host, nr_shmem_dropped);
- if (!ret && is_anon)
+ if (!ret && is_anon && !folio_is_device_private(folio))
remap_flags = RMP_USE_SHARED_ZEROPAGE;
+
remap_page(folio, 1 << order, remap_flags);
/*
@@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
return 0;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ if (unlikely(is_pmd_device_private_entry(*pvmw->pmd)))
+ pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);
+ else
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
@@ -4653,6 +4762,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
folio_get(folio);
pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
+
+ if (folio_is_device_private(folio)) {
+ if (pmd_write(pmde))
+ entry = make_writable_device_private_entry(
+ page_to_pfn(new));
+ else
+ entry = make_readable_device_private_entry(
+ page_to_pfn(new));
+ pmde = swp_entry_to_pmd(entry);
+ }
+
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index e05e14d6eacd..0ed337f94fcd 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
* page table entry. Other special swap entries are not
* migratable, and we ignore regular swapped page.
*/
+ struct folio *folio;
+
entry = pte_to_swp_entry(pte);
if (!is_device_private_entry(entry))
goto next;
@@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pgmap->owner != migrate->pgmap_owner)
goto next;
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ struct folio *new_folio;
+ struct folio *new_fault_folio = NULL;
+
+ /*
+ * The reason for finding pmd present with a
+ * device private pte and a large folio for the
+ * pte is partial unmaps. Split the folio now
+ * for the migration to be handled correctly
+ */
+ pte_unmap_unlock(ptep, ptl);
+
+ folio_get(folio);
+ if (folio != fault_folio)
+ folio_lock(folio);
+ if (split_folio(folio)) {
+ if (folio != fault_folio)
+ folio_unlock(folio);
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ goto next;
+ }
+
+ new_folio = page_folio(page);
+ if (fault_folio)
+ new_fault_folio = page_folio(migrate->fault_page);
+
+ /*
+ * Ensure the lock is held on the correct
+ * folio after the split
+ */
+ if (!new_fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ } else if (folio != new_fault_folio) {
+ folio_get(new_fault_folio);
+ folio_lock(new_fault_folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ addr = start;
+ goto again;
+ }
+
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
if (is_writable_device_private_entry(entry))
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e981a1a292d2..246e6c211f34 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
if (!pmd_present(pmde)) {
- swp_entry_t entry;
+ swp_entry_t entry = pmd_to_swp_entry(pmde);
if (!thp_migration_supported() ||
!(pvmw->flags & PVMW_MIGRATION))
return not_found(pvmw);
- entry = pmd_to_swp_entry(pmde);
if (!is_migration_entry(entry) ||
!check_pmd(swp_offset_pfn(entry), pvmw))
return not_found(pvmw);
@@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
* cannot return prematurely, while zap_huge_pmd() has
* cleared *pmd but not decremented compound_mapcount().
*/
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(pmde);
+
+ if (is_device_private_entry(entry) &&
+ (pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ return true;
+ }
+
if ((pvmw->flags & PVMW_SYNC) &&
thp_vma_suitable_order(vma, pvmw->address,
PMD_ORDER) &&
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..604e8206a2ec 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
*pmdvalp = pmdval;
if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
goto nomap;
+ if (is_swap_pmd(pmdval)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmdval);
+
+ if (is_device_private_entry(entry))
+ goto nomap;
+ }
if (unlikely(pmd_trans_huge(pmdval)))
goto nomap;
if (unlikely(pmd_bad(pmdval))) {
diff --git a/mm/rmap.c b/mm/rmap.c
index b5837075b6e0..f40e45564295 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
+ PVMW_THP_DEVICE_PRIVATE);
bool anon_exclusive, writable, ret = true;
pte_t pteval;
struct page *subpage;
@@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
while (page_vma_mapped_walk(&pvmw)) {
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ unsigned long pfn;
+#endif
+
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_locked(vma, pvmw.address,
pvmw.pmd, true);
@@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- subpage = folio_page(folio,
- pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+ /*
+ * Zone device private folios do not work well with
+ * pmd_pfn() on some architectures due to pte
+ * inversion.
+ */
+ if (is_pmd_device_private_entry(*pvmw.pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
+
+ pfn = swp_offset_pfn(entry);
+ } else {
+ pfn = pmd_pfn(*pvmw.pmd);
+ }
+
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
+
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
!folio_test_pmd_mappable(folio), folio);
--
2.50.1
On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:

[...]

> +		src_folio = pfn_swap_entry_folio(entry);
> +		VM_WARN_ON(!folio_test_large(src_folio));
> +
> +		folio_get(src_folio);
> +		/*
> +		 * folio_try_dup_anon_rmap_pmd does not fail for
> +		 * device private entries.
> +		 */
> +		VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
> +				&src_folio->page, dst_vma, src_vma));

VM_WARN_ON compiles out in non-debug builds. I hit this running the
fork selftest I shared with a non-debug build.

Matt

[1] https://elixir.bootlin.com/linux/v6.16.3/source/include/linux/mmdebug.h#L112
On 28.08.25 22:05, Matthew Brost wrote:
> On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
>> +		folio_get(src_folio);
>> +		/*
>> +		 * folio_try_dup_anon_rmap_pmd does not fail for
>> +		 * device private entries.
>> +		 */
>> +		VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
>> +				&src_folio->page, dst_vma, src_vma));
>
> VM_WARN_ON compiles out in non-debug builds. I hit this running the
> fork selftest I shared with a non-debug build.

folio_try_dup_anon_rmap_pmd() will never fail for
folio_is_device_private(folio) -- unless something is deeply messed up
that we wouldn't identify this folio as being device-private.

Can you elaborate, what were you able to trigger, and in what kind of
environment?

--
Cheers

David / dhildenb
On Thu, Aug 28, 2025 at 10:12:53PM +0200, David Hildenbrand wrote:
> On 28.08.25 22:05, Matthew Brost wrote:
>> VM_WARN_ON compiles out in non-debug builds. I hit this running the
>> fork selftest I shared with a non-debug build.
>
> folio_try_dup_anon_rmap_pmd() will never fail for
> folio_is_device_private(folio) -- unless something is deeply messed up
> that we wouldn't identify this folio as being device-private.
>
> Can you elaborate, what were you able to trigger, and in what kind of
> environment?

Maybe this was bad phrasing. I compiled the kernel with a non-debug
build and fork() broke for THP device pages because the above call to
folio_try_dup_anon_rmap_pmd compiled out (i.e., it wasn't called).

Matt
On 28.08.25 22:17, Matthew Brost wrote:
> On Thu, Aug 28, 2025 at 10:12:53PM +0200, David Hildenbrand wrote:
>> On 28.08.25 22:05, Matthew Brost wrote:
>>> On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
>>>> [...]
>>>> +			folio_get(src_folio);
>>>> +			/*
>>>> +			 * folio_try_dup_anon_rmap_pmd does not fail for
>>>> +			 * device private entries.
>>>> +			 */
>>>> +			VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
>>>> +					&src_folio->page, dst_vma, src_vma));
>>>
>>> VM_WARN_ON compiles out in non-debug builds. I hit this running the
>>> fork selftest I shared with a non-debug build.
>>
>>
>> folio_try_dup_anon_rmap_pmd() will never fail for
>> folio_is_device_private(folio) -- unless something is deeply messed up that
>> we wouldn't identify this folio as being device-private.
>>
>> Can you elaborate, what were you able to trigger, and in what kind of
>> environment?
>>
>
> Maybe this was bad phrasing. I compiled the kernel with a non-debug
> build and fork() broke for THP device pages because the above call to
> folio_try_dup_anon_rmap_pmd compiled out (i.e., it wasn't called).

Ah, yes! As I said in my reply, we should not do any kind of WARN here,
like in the PTE case.

-- 
Cheers

David / dhildenb
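To make the outcome of this exchange concrete: below is a minimal sketch of
the device-private branch in copy_huge_pmd() with the rmap duplication
called unconditionally, loosely mirroring copy_nonpresent_pte() as the
review asks. The -EBUSY error path and the exact structure are illustrative
assumptions, not code from a later revision of the series.

	} else if (is_device_private_entry(entry)) {
		src_folio = pfn_swap_entry_folio(entry);
		folio_get(src_folio);
		/*
		 * Always call folio_try_dup_anon_rmap_pmd(); wrapping it in
		 * VM_WARN_ON() compiles the call out on !CONFIG_DEBUG_VM
		 * builds. It is not expected to fail for device-private
		 * folios, but handle failure like copy_nonpresent_pte().
		 */
		if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio,
				&src_folio->page, dst_vma, src_vma))) {
			folio_put(src_folio);
			return -EBUSY;
		}

		if (is_writable_device_private_entry(entry)) {
			entry = make_readable_device_private_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
	}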
On 12.08.25 04:40, Balbir Singh wrote: > Make THP handling code in the mm subsystem for THP pages aware of zone > device pages. Although the code is designed to be generic when it comes > to handling splitting of pages, the code is designed to work for THP > page sizes corresponding to HPAGE_PMD_NR. > > Modify page_vma_mapped_walk() to return true when a zone device huge > entry is present, enabling try_to_migrate() and other code migration > paths to appropriately process the entry. page_vma_mapped_walk() will > return true for zone device private large folios only when > PVMW_THP_DEVICE_PRIVATE is passed. This is to prevent locations that are > not zone device private pages from having to add awareness. Please don't if avoidable. We should already have the same problem with small zone-device private pages, and should have proper folio checks in place, no? [...] This thing is huge and hard to review. Given there are subtle changes in here that are likely problematic, this is a problem. Is there any way to split this into logical chunks? Like teaching zap, mprotect, rmap walks .... code separately. I'm, sure you'll find a way to break this down so I don't walk out of a review with an headake ;) > > struct page_vma_mapped_walk { > unsigned long pfn; > diff --git a/include/linux/swapops.h b/include/linux/swapops.h > index 64ea151a7ae3..2641c01bd5d2 100644 > --- a/include/linux/swapops.h > +++ b/include/linux/swapops.h > @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) > { > return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); > } > + ^ unrelated change > #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ > static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, > struct page *page) > @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd) > } > #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ > > +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) > + > +static inline int is_pmd_device_private_entry(pmd_t pmd) > +{ > + return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); > +} > + > +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ > + > +static inline int is_pmd_device_private_entry(pmd_t pmd) > +{ > + return 0; > +} > + > +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ > + > static inline int non_swap_entry(swp_entry_t entry) > { > return swp_type(entry) >= MAX_SWAPFILES; > diff --git a/lib/test_hmm.c b/lib/test_hmm.c > index 761725bc713c..297f1e034045 100644 > --- a/lib/test_hmm.c > +++ b/lib/test_hmm.c > @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) > * the mirror but here we use it to hold the page for the simulated > * device memory and that page holds the pointer to the mirror. 
> */ > - rpage = vmf->page->zone_device_data; > + rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data; Can we have a wrapper please to give us the zone_device_data for a folio, so we have something like rpage = folio_zone_device_data(page_folio(vmf->page)); > dmirror = rpage->zone_device_data; > > /* FIXME demonstrate how we can adjust migrate range */ > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 9c38a95e9f09..2495e3fdbfae 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, > if (unlikely(is_swap_pmd(pmd))) { > swp_entry_t entry = pmd_to_swp_entry(pmd); > > - VM_BUG_ON(!is_pmd_migration_entry(pmd)); > - if (!is_readable_migration_entry(entry)) { > + VM_WARN_ON(!is_pmd_migration_entry(pmd) && > + !is_pmd_device_private_entry(pmd)); > + > + if (is_migration_entry(entry) && > + is_writable_migration_entry(entry)) { > entry = make_readable_migration_entry( > swp_offset(entry)); Careful: There is is_readable_exclusive_migration_entry(). So don't change the !is_readable_migration_entry(entry) to is_writable_migration_entry(entry)(), because it's wrong. > pmd = swp_entry_to_pmd(entry); > @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, > pmd = pmd_swp_mkuffd_wp(pmd); > set_pmd_at(src_mm, addr, src_pmd, pmd); > } > + > + if (is_device_private_entry(entry)) { likely you want "else if" here. > + if (is_writable_device_private_entry(entry)) { > + entry = make_readable_device_private_entry( > + swp_offset(entry)); > + pmd = swp_entry_to_pmd(entry); > + > + if (pmd_swp_soft_dirty(*src_pmd)) > + pmd = pmd_swp_mksoft_dirty(pmd); > + if (pmd_swp_uffd_wp(*src_pmd)) > + pmd = pmd_swp_mkuffd_wp(pmd); > + set_pmd_at(src_mm, addr, src_pmd, pmd); > + } > + > + src_folio = pfn_swap_entry_folio(entry); > + VM_WARN_ON(!folio_test_large(src_folio)); > + > + folio_get(src_folio); > + /* > + * folio_try_dup_anon_rmap_pmd does not fail for > + * device private entries. > + */ > + VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio, > + &src_folio->page, dst_vma, src_vma)); > + } I would appreciate if this code flow here would resemble more what we have in copy_nonpresent_pte(), at least regarding handling of these two cases. (e.g., dropping the VM_WARN_ON) > + > add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); > mm_inc_nr_ptes(dst_mm); > pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); > @@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > folio_remove_rmap_pmd(folio, page, vma); > WARN_ON_ONCE(folio_mapcount(folio) < 0); > VM_BUG_ON_PAGE(!PageHead(page), page); > - } else if (thp_migration_supported()) { > + } else if (is_pmd_migration_entry(orig_pmd) || > + is_pmd_device_private_entry(orig_pmd)) { > swp_entry_t entry; > > - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); > entry = pmd_to_swp_entry(orig_pmd); > folio = pfn_swap_entry_folio(entry); > flush_needed = 0; > - } else > - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); > + > + if (!thp_migration_supported()) > + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); > + > + if (is_pmd_device_private_entry(orig_pmd)) { > + folio_remove_rmap_pmd(folio, &folio->page, vma); > + WARN_ON_ONCE(folio_mapcount(folio) < 0); Can we jsut move that into the folio_is_device_private() check below. 
> + } > + } > > if (folio_test_anon(folio)) { > zap_deposited_table(tlb->mm, pmd); > @@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > folio_mark_accessed(folio); > } > > + /* > + * Do a folio put on zone device private pages after > + * changes to mm_counter, because the folio_put() will > + * clean folio->mapping and the folio_test_anon() check > + * will not be usable. > + */ The comment can be dropped: it's simple, don't use "folio" after dropping the reference when zapping. > + if (folio_is_device_private(folio)) > + folio_put(folio); > + > spin_unlock(ptl); > if (flush_needed) > tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); > @@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > struct folio *folio = pfn_swap_entry_folio(entry); > pmd_t newpmd; > > - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); > + VM_WARN_ON(!is_pmd_migration_entry(*pmd) && > + !folio_is_device_private(folio)); > if (is_writable_migration_entry(entry)) { > /* > * A protection check is difficult so > @@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > newpmd = swp_entry_to_pmd(entry); > if (pmd_swp_soft_dirty(*pmd)) > newpmd = pmd_swp_mksoft_dirty(newpmd); > + } else if (is_writable_device_private_entry(entry)) { > + entry = make_readable_device_private_entry( > + swp_offset(entry)); > + newpmd = swp_entry_to_pmd(entry); > } else { > newpmd = *pmd; > } > @@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > struct page *page; > pgtable_t pgtable; > pmd_t old_pmd, _pmd; > - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; > - bool anon_exclusive = false, dirty = false; > + bool young, write, soft_dirty, uffd_wp = false; > + bool anon_exclusive = false, dirty = false, present = false; > unsigned long addr; > pte_t *pte; > int i; > + swp_entry_t swp_entry; > > VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); > VM_BUG_ON_VMA(vma->vm_start > haddr, vma); > VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); > - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); > + > + VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) > + && !(is_pmd_device_private_entry(*pmd))); VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) && !(is_pmd_device_private_entry(*pmd))); > > count_vm_event(THP_SPLIT_PMD); > > @@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > return __split_huge_zero_page_pmd(vma, haddr, pmd); > } > > - pmd_migration = is_pmd_migration_entry(*pmd); > - if (unlikely(pmd_migration)) { > - swp_entry_t entry; > > + present = pmd_present(*pmd); > + if (unlikely(!present)) { > + swp_entry = pmd_to_swp_entry(*pmd); > old_pmd = *pmd; > - entry = pmd_to_swp_entry(old_pmd); > - page = pfn_swap_entry_to_page(entry); > - write = is_writable_migration_entry(entry); > - if (PageAnon(page)) > - anon_exclusive = is_readable_exclusive_migration_entry(entry); > - young = is_migration_entry_young(entry); > - dirty = is_migration_entry_dirty(entry); > + > + folio = pfn_swap_entry_folio(swp_entry); > + VM_WARN_ON(!is_migration_entry(swp_entry) && > + !is_device_private_entry(swp_entry)); > + page = pfn_swap_entry_to_page(swp_entry); > + > + if (is_pmd_migration_entry(old_pmd)) { > + write = is_writable_migration_entry(swp_entry); > + if (PageAnon(page)) > + anon_exclusive = > + is_readable_exclusive_migration_entry( > + swp_entry); > + young = 
is_migration_entry_young(swp_entry); > + dirty = is_migration_entry_dirty(swp_entry); > + } else if (is_pmd_device_private_entry(old_pmd)) { > + write = is_writable_device_private_entry(swp_entry); > + anon_exclusive = PageAnonExclusive(page); > + if (freeze && anon_exclusive && > + folio_try_share_anon_rmap_pmd(folio, page)) > + freeze = false; > + if (!freeze) { > + rmap_t rmap_flags = RMAP_NONE; > + > + if (anon_exclusive) > + rmap_flags |= RMAP_EXCLUSIVE; > + > + folio_ref_add(folio, HPAGE_PMD_NR - 1); > + if (anon_exclusive) > + rmap_flags |= RMAP_EXCLUSIVE; > + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, > + vma, haddr, rmap_flags); > + } > + } This is massive and I'll have to review it with a fresh mind later. [...] put_page(page); > @@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, > pmd_t *pmd, bool freeze) > { > + ^ unrelated > VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); > - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) > + if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) || > + (is_pmd_device_private_entry(*pmd))) if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) || is_pmd_device_private_entry(*pmd)) > __split_huge_pmd_locked(vma, pmd, address, freeze); > } > > @@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, > VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); > lockdep_assert_held(&lruvec->lru_lock); > > + if (folio_is_device_private(folio)) > + return; > + > if (list) { > /* page reclaim is reclaiming a huge page */ > VM_WARN_ON(folio_test_lru(folio)); > @@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, > list_add_tail(&new_folio->lru, &folio->lru); > folio_set_lru(new_folio); > } > + ^ unrelated > } > > /* Racy check whether the huge page can be split */ > @@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, > > /* Prevent deferred_split_scan() touching ->_refcount */ > spin_lock(&ds_queue->split_queue_lock); > - if (folio_ref_freeze(folio, 1 + extra_pins)) { > + if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) { I think I discussed that with Zi Yan and it's tricky. Such a change should go into a separate cleanup patch. > struct address_space *swap_cache = NULL; > struct lruvec *lruvec; > int expected_refs; > @@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, > if (nr_shmem_dropped) > shmem_uncharge(mapping->host, nr_shmem_dropped); > > - if (!ret && is_anon) > + if (!ret && is_anon && !folio_is_device_private(folio)) > remap_flags = RMP_USE_SHARED_ZEROPAGE; > + ^ unrelated > remap_page(folio, 1 << order, remap_flags); > > /* > @@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, > return 0; > > flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); > - pmdval = pmdp_invalidate(vma, address, pvmw->pmd); > + if (unlikely(is_pmd_device_private_entry(*pvmw->pmd))) Use pmd_present() instead, please. (just like in the pte code that handles this). Why do we have to flush? pmd_clear() might be sufficient? In the PTE case we use pte_clear(). [...] 
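For reference, a minimal sketch of the alternative being suggested here for
set_pmd_migration_entry(): key off pmd_present() and skip the TLB
invalidation for the non-present device-private case, since (as discussed
further down-thread) a non-present entry has no TLB entry to flush. This
illustrates the review comment; it is not the code that was posted.

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	if (likely(pmd_present(*pvmw->pmd))) {
		pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
	} else {
		/* Non-present (device-private) entry: nothing to flush. */
		pmdval = pmdp_get(pvmw->pmd);
		pmd_clear(pvmw->pmd);
	}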
> pmde = pmd_mksoft_dirty(pmde); > if (is_writable_migration_entry(entry)) > diff --git a/mm/migrate_device.c b/mm/migrate_device.c > index e05e14d6eacd..0ed337f94fcd 100644 > --- a/mm/migrate_device.c > +++ b/mm/migrate_device.c > @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > * page table entry. Other special swap entries are not > * migratable, and we ignore regular swapped page. > */ > + struct folio *folio; > + > entry = pte_to_swp_entry(pte); > if (!is_device_private_entry(entry)) > goto next; > @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > pgmap->owner != migrate->pgmap_owner) > goto next; > > + folio = page_folio(page); > + if (folio_test_large(folio)) { > + struct folio *new_folio; > + struct folio *new_fault_folio = NULL; > + > + /* > + * The reason for finding pmd present with a > + * device private pte and a large folio for the > + * pte is partial unmaps. Split the folio now > + * for the migration to be handled correctly > + */ There are also other cases, like any VMA splits. Not sure if that makes a difference, the folio is PTE mapped. > + pte_unmap_unlock(ptep, ptl); > + > + folio_get(folio); > + if (folio != fault_folio) > + folio_lock(folio); > + if (split_folio(folio)) { > + if (folio != fault_folio) > + folio_unlock(folio); > + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); > + goto next; > + } > + > + new_folio = page_folio(page); > + if (fault_folio) > + new_fault_folio = page_folio(migrate->fault_page); > + > + /* > + * Ensure the lock is held on the correct > + * folio after the split > + */ > + if (!new_fault_folio) { > + folio_unlock(folio); > + folio_put(folio); > + } else if (folio != new_fault_folio) { > + folio_get(new_fault_folio); > + folio_lock(new_fault_folio); > + folio_unlock(folio); > + folio_put(folio); > + } > + > + addr = start; > + goto again; Another thing to revisit with clean mind. > + } > + > mpfn = migrate_pfn(page_to_pfn(page)) | > MIGRATE_PFN_MIGRATE; > if (is_writable_device_private_entry(entry)) > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c > index e981a1a292d2..246e6c211f34 100644 > --- a/mm/page_vma_mapped.c > +++ b/mm/page_vma_mapped.c > @@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) > pvmw->ptl = pmd_lock(mm, pvmw->pmd); > pmde = *pvmw->pmd; > if (!pmd_present(pmde)) { > - swp_entry_t entry; > + swp_entry_t entry = pmd_to_swp_entry(pmde); > > if (!thp_migration_supported() || > !(pvmw->flags & PVMW_MIGRATION)) > return not_found(pvmw); > - entry = pmd_to_swp_entry(pmde); > if (!is_migration_entry(entry) || > !check_pmd(swp_offset_pfn(entry), pvmw)) > return not_found(pvmw); > @@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) > * cannot return prematurely, while zap_huge_pmd() has > * cleared *pmd but not decremented compound_mapcount(). 
> */ > + swp_entry_t entry; > + > + entry = pmd_to_swp_entry(pmde); > + > + if (is_device_private_entry(entry) && > + (pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) { > + pvmw->ptl = pmd_lock(mm, pvmw->pmd); > + return true; > + } > + > if ((pvmw->flags & PVMW_SYNC) && > thp_vma_suitable_order(vma, pvmw->address, > PMD_ORDER) && > diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c > index 567e2d084071..604e8206a2ec 100644 > --- a/mm/pgtable-generic.c > +++ b/mm/pgtable-generic.c > @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) > *pmdvalp = pmdval; > if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) > goto nomap; > + if (is_swap_pmd(pmdval)) { > + swp_entry_t entry = pmd_to_swp_entry(pmdval); > + > + if (is_device_private_entry(entry)) > + goto nomap; > + } > if (unlikely(pmd_trans_huge(pmdval))) > goto nomap; > if (unlikely(pmd_bad(pmdval))) { > diff --git a/mm/rmap.c b/mm/rmap.c > index b5837075b6e0..f40e45564295 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, > unsigned long address, void *arg) > { > struct mm_struct *mm = vma->vm_mm; > - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); > + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, > + PVMW_THP_DEVICE_PRIVATE); > bool anon_exclusive, writable, ret = true; > pte_t pteval; > struct page *subpage; > @@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, > while (page_vma_mapped_walk(&pvmw)) { > /* PMD-mapped THP migration entry */ > if (!pvmw.pte) { > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + unsigned long pfn; > +#endif > + > if (flags & TTU_SPLIT_HUGE_PMD) { > split_huge_pmd_locked(vma, pvmw.address, > pvmw.pmd, true); > @@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, > break; > } > #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > - subpage = folio_page(folio, > - pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); > + /* > + * Zone device private folios do not work well with > + * pmd_pfn() on some architectures due to pte > + * inversion. > + */ Please use the handling for the PTE case as inspiration. /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages. */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } So I would expect here something like /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages. */ pmdval = pmdp_get(pvmw.pmd); if (likely(pmd_present(pmdval))) pfn = pmd_pfn(pmdval); else pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); > + if (is_pmd_device_private_entry(*pvmw.pmd)) { > + swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd); > + > + pfn = swp_offset_pfn(entry); > + } else { > + pfn = pmd_pfn(*pvmw.pmd); > + } > + > + subpage = folio_page(folio, pfn - folio_pfn(folio)); > + > VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || > !folio_test_pmd_mappable(folio), folio); > -- Cheers David / dhildenb
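Regarding the folio_zone_device_data() wrapper requested above for the
lib/test_hmm.c hunk, a minimal sketch of what such a helper could look
like; the header it would live in (e.g. include/linux/memremap.h) and the
exact definition are assumptions:

static inline void *folio_zone_device_data(const struct folio *folio)
{
	return folio->page.zone_device_data;
}

which would let the dmirror fault path read:

	rpage = folio_zone_device_data(page_folio(vmf->page));
	dmirror = rpage->zone_device_data;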
On 8/27/25 01:19, David Hildenbrand wrote: > On 12.08.25 04:40, Balbir Singh wrote: >> Make THP handling code in the mm subsystem for THP pages aware of zone >> device pages. Although the code is designed to be generic when it comes >> to handling splitting of pages, the code is designed to work for THP >> page sizes corresponding to HPAGE_PMD_NR. >> >> Modify page_vma_mapped_walk() to return true when a zone device huge >> entry is present, enabling try_to_migrate() and other code migration >> paths to appropriately process the entry. page_vma_mapped_walk() will >> return true for zone device private large folios only when >> PVMW_THP_DEVICE_PRIVATE is passed. This is to prevent locations that are >> not zone device private pages from having to add awareness. > > Please don't if avoidable. > > We should already have the same problem with small zone-device private > pages, and should have proper folio checks in place, no? > > > [...] > > This thing is huge and hard to review. Given there are subtle changes in here that > are likely problematic, this is a problem. Is there any way to split this > into logical chunks? > > Like teaching zap, mprotect, rmap walks .... code separately. > > I'm, sure you'll find a way to break this down so I don't walk out of a > review with an headake ;) > :) I had smaller chunks earlier, but then ran into don't add the change unless you use the change problem >> struct page_vma_mapped_walk { >> unsigned long pfn; >> diff --git a/include/linux/swapops.h b/include/linux/swapops.h >> index 64ea151a7ae3..2641c01bd5d2 100644 >> --- a/include/linux/swapops.h >> +++ b/include/linux/swapops.h >> @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) >> { >> return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); >> } >> + > > ^ unrelated change Ack > >> #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ >> static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, >> struct page *page) >> @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd) >> } >> #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ >> +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) >> + >> +static inline int is_pmd_device_private_entry(pmd_t pmd) >> +{ >> + return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); >> +} >> + >> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ >> + >> +static inline int is_pmd_device_private_entry(pmd_t pmd) >> +{ >> + return 0; >> +} >> + >> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ >> + >> static inline int non_swap_entry(swp_entry_t entry) >> { >> return swp_type(entry) >= MAX_SWAPFILES; >> diff --git a/lib/test_hmm.c b/lib/test_hmm.c >> index 761725bc713c..297f1e034045 100644 >> --- a/lib/test_hmm.c >> +++ b/lib/test_hmm.c >> @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) >> * the mirror but here we use it to hold the page for the simulated >> * device memory and that page holds the pointer to the mirror. 
>> */ >> - rpage = vmf->page->zone_device_data; >> + rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data; > > Can we have a wrapper please to give us the zone_device_data for a folio, so > we have something like > > rpage = folio_zone_device_data(page_folio(vmf->page)); > Yep, will change >> dmirror = rpage->zone_device_data; >> /* FIXME demonstrate how we can adjust migrate range */ >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 9c38a95e9f09..2495e3fdbfae 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, >> if (unlikely(is_swap_pmd(pmd))) { >> swp_entry_t entry = pmd_to_swp_entry(pmd); >> - VM_BUG_ON(!is_pmd_migration_entry(pmd)); >> - if (!is_readable_migration_entry(entry)) { >> + VM_WARN_ON(!is_pmd_migration_entry(pmd) && >> + !is_pmd_device_private_entry(pmd)); >> + >> + if (is_migration_entry(entry) && >> + is_writable_migration_entry(entry)) { >> entry = make_readable_migration_entry( >> swp_offset(entry)); > > Careful: There is is_readable_exclusive_migration_entry(). So don't > change the !is_readable_migration_entry(entry) to is_writable_migration_entry(entry)(), > because it's wrong. > Ack, I assume you are referring to potential prot_none entries? >> pmd = swp_entry_to_pmd(entry); >> @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, >> pmd = pmd_swp_mkuffd_wp(pmd); >> set_pmd_at(src_mm, addr, src_pmd, pmd); >> } >> + >> + if (is_device_private_entry(entry)) { > > likely you want "else if" here. > Ack >> + if (is_writable_device_private_entry(entry)) { >> + entry = make_readable_device_private_entry( >> + swp_offset(entry)); >> + pmd = swp_entry_to_pmd(entry); >> + >> + if (pmd_swp_soft_dirty(*src_pmd)) >> + pmd = pmd_swp_mksoft_dirty(pmd); >> + if (pmd_swp_uffd_wp(*src_pmd)) >> + pmd = pmd_swp_mkuffd_wp(pmd); >> + set_pmd_at(src_mm, addr, src_pmd, pmd); >> + } >> + >> + src_folio = pfn_swap_entry_folio(entry); >> + VM_WARN_ON(!folio_test_large(src_folio)); >> + >> + folio_get(src_folio); >> + /* >> + * folio_try_dup_anon_rmap_pmd does not fail for >> + * device private entries. >> + */ >> + VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio, >> + &src_folio->page, dst_vma, src_vma)); >> + } > > I would appreciate if this code flow here would resemble more what we have in > copy_nonpresent_pte(), at least regarding handling of these two cases. 
> > (e.g., dropping the VM_WARN_ON) Ack > >> + >> add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); >> mm_inc_nr_ptes(dst_mm); >> pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); >> @@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >> folio_remove_rmap_pmd(folio, page, vma); >> WARN_ON_ONCE(folio_mapcount(folio) < 0); >> VM_BUG_ON_PAGE(!PageHead(page), page); >> - } else if (thp_migration_supported()) { >> + } else if (is_pmd_migration_entry(orig_pmd) || >> + is_pmd_device_private_entry(orig_pmd)) { >> swp_entry_t entry; >> - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); >> entry = pmd_to_swp_entry(orig_pmd); >> folio = pfn_swap_entry_folio(entry); >> flush_needed = 0; >> - } else >> - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); >> + >> + if (!thp_migration_supported()) >> + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); >> + >> + if (is_pmd_device_private_entry(orig_pmd)) { >> + folio_remove_rmap_pmd(folio, &folio->page, vma); >> + WARN_ON_ONCE(folio_mapcount(folio) < 0); > > Can we jsut move that into the folio_is_device_private() check below. The check you mean? > >> + } >> + } >> if (folio_test_anon(folio)) { >> zap_deposited_table(tlb->mm, pmd); >> @@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >> folio_mark_accessed(folio); >> } >> + /* >> + * Do a folio put on zone device private pages after >> + * changes to mm_counter, because the folio_put() will >> + * clean folio->mapping and the folio_test_anon() check >> + * will not be usable. >> + */ > > The comment can be dropped: it's simple, don't use "folio" after > dropping the reference when zapping. > Ack >> + if (folio_is_device_private(folio)) >> + folio_put(folio); > >> + >> spin_unlock(ptl); >> if (flush_needed) >> tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); >> @@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >> struct folio *folio = pfn_swap_entry_folio(entry); >> pmd_t newpmd; >> - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); >> + VM_WARN_ON(!is_pmd_migration_entry(*pmd) && >> + !folio_is_device_private(folio)); >> if (is_writable_migration_entry(entry)) { >> /* >> * A protection check is difficult so >> @@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >> newpmd = swp_entry_to_pmd(entry); >> if (pmd_swp_soft_dirty(*pmd)) >> newpmd = pmd_swp_mksoft_dirty(newpmd); >> + } else if (is_writable_device_private_entry(entry)) { >> + entry = make_readable_device_private_entry( >> + swp_offset(entry)); >> + newpmd = swp_entry_to_pmd(entry); >> } else { >> newpmd = *pmd; >> } >> @@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >> struct page *page; >> pgtable_t pgtable; >> pmd_t old_pmd, _pmd; >> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; >> - bool anon_exclusive = false, dirty = false; >> + bool young, write, soft_dirty, uffd_wp = false; >> + bool anon_exclusive = false, dirty = false, present = false; >> unsigned long addr; >> pte_t *pte; >> int i; >> + swp_entry_t swp_entry; >> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); >> VM_BUG_ON_VMA(vma->vm_start > haddr, vma); >> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); >> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); >> + >> + VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) >> + && !(is_pmd_device_private_entry(*pmd))); > > 
VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) && > !(is_pmd_device_private_entry(*pmd))); > > Ack >> count_vm_event(THP_SPLIT_PMD); >> @@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >> return __split_huge_zero_page_pmd(vma, haddr, pmd); >> } >> - pmd_migration = is_pmd_migration_entry(*pmd); >> - if (unlikely(pmd_migration)) { >> - swp_entry_t entry; >> + present = pmd_present(*pmd); >> + if (unlikely(!present)) { >> + swp_entry = pmd_to_swp_entry(*pmd); >> old_pmd = *pmd; >> - entry = pmd_to_swp_entry(old_pmd); >> - page = pfn_swap_entry_to_page(entry); >> - write = is_writable_migration_entry(entry); >> - if (PageAnon(page)) >> - anon_exclusive = is_readable_exclusive_migration_entry(entry); >> - young = is_migration_entry_young(entry); >> - dirty = is_migration_entry_dirty(entry); >> + >> + folio = pfn_swap_entry_folio(swp_entry); >> + VM_WARN_ON(!is_migration_entry(swp_entry) && >> + !is_device_private_entry(swp_entry)); >> + page = pfn_swap_entry_to_page(swp_entry); >> + >> + if (is_pmd_migration_entry(old_pmd)) { >> + write = is_writable_migration_entry(swp_entry); >> + if (PageAnon(page)) >> + anon_exclusive = >> + is_readable_exclusive_migration_entry( >> + swp_entry); >> + young = is_migration_entry_young(swp_entry); >> + dirty = is_migration_entry_dirty(swp_entry); >> + } else if (is_pmd_device_private_entry(old_pmd)) { >> + write = is_writable_device_private_entry(swp_entry); >> + anon_exclusive = PageAnonExclusive(page); >> + if (freeze && anon_exclusive && >> + folio_try_share_anon_rmap_pmd(folio, page)) >> + freeze = false; >> + if (!freeze) { >> + rmap_t rmap_flags = RMAP_NONE; >> + >> + if (anon_exclusive) >> + rmap_flags |= RMAP_EXCLUSIVE; >> + >> + folio_ref_add(folio, HPAGE_PMD_NR - 1); >> + if (anon_exclusive) >> + rmap_flags |= RMAP_EXCLUSIVE; >> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, >> + vma, haddr, rmap_flags); >> + } >> + } > > This is massive and I'll have to review it with a fresh mind later. It is similar to what we do for non device private folios, when we map/unmap the entire folio during migration, the new fresh folios/pages should be marked as anon exclusive. But please do check > > [...] 
> put_page(page); >> @@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >> void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, >> pmd_t *pmd, bool freeze) >> { >> + > > ^ unrelated > Ack >> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); >> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) >> + if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) || >> + (is_pmd_device_private_entry(*pmd))) > > if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) || > is_pmd_device_private_entry(*pmd)) > >> __split_huge_pmd_locked(vma, pmd, address, freeze); >> } >> @@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, >> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); >> lockdep_assert_held(&lruvec->lru_lock); >> + if (folio_is_device_private(folio)) >> + return; >> + >> if (list) { >> /* page reclaim is reclaiming a huge page */ >> VM_WARN_ON(folio_test_lru(folio)); >> @@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, >> list_add_tail(&new_folio->lru, &folio->lru); >> folio_set_lru(new_folio); >> } >> + > > ^ unrelated > Ack >> } >> /* Racy check whether the huge page can be split */ >> @@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, >> /* Prevent deferred_split_scan() touching ->_refcount */ >> spin_lock(&ds_queue->split_queue_lock); >> - if (folio_ref_freeze(folio, 1 + extra_pins)) { >> + if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) { > > I think I discussed that with Zi Yan and it's tricky. Such a change should go > into a separate cleanup patch. > Ack, I'll split it up as needed. The reason why this is here is because large folios that have been partially split (pmd split) have a count of nr_pages + 1 on pmd_split and after the split the map count falls, but never goes to 0 as the ref_freeze code expects. > >> struct address_space *swap_cache = NULL; >> struct lruvec *lruvec; >> int expected_refs; >> @@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, >> if (nr_shmem_dropped) >> shmem_uncharge(mapping->host, nr_shmem_dropped); >> - if (!ret && is_anon) >> + if (!ret && is_anon && !folio_is_device_private(folio)) >> remap_flags = RMP_USE_SHARED_ZEROPAGE; >> + > > ^ unrelated Ack > >> remap_page(folio, 1 << order, remap_flags); >> /* >> @@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, >> return 0; >> flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); >> - pmdval = pmdp_invalidate(vma, address, pvmw->pmd); >> + if (unlikely(is_pmd_device_private_entry(*pvmw->pmd))) > > Use pmd_present() instead, please. (just like in the pte code that handles this). > Ack > Why do we have to flush? pmd_clear() might be sufficient? In the PTE case we use pte_clear(). Without the flush, other entities will not see the cleared pmd and isn't the pte_clear() only when should_defer_flush() is true? > > [...] > >> pmde = pmd_mksoft_dirty(pmde); >> if (is_writable_migration_entry(entry)) >> diff --git a/mm/migrate_device.c b/mm/migrate_device.c >> index e05e14d6eacd..0ed337f94fcd 100644 >> --- a/mm/migrate_device.c >> +++ b/mm/migrate_device.c >> @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, >> * page table entry. Other special swap entries are not >> * migratable, and we ignore regular swapped page. 
>> */ >> + struct folio *folio; >> + >> entry = pte_to_swp_entry(pte); >> if (!is_device_private_entry(entry)) >> goto next; >> @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, >> pgmap->owner != migrate->pgmap_owner) >> goto next; >> + folio = page_folio(page); >> + if (folio_test_large(folio)) { >> + struct folio *new_folio; >> + struct folio *new_fault_folio = NULL; >> + >> + /* >> + * The reason for finding pmd present with a >> + * device private pte and a large folio for the >> + * pte is partial unmaps. Split the folio now >> + * for the migration to be handled correctly >> + */ > > There are also other cases, like any VMA splits. Not sure if that makes a difference, > the folio is PTE mapped. > Ack, I can clarify that the folio is just pte mapped or remove the comment >> + pte_unmap_unlock(ptep, ptl); >> + >> + folio_get(folio); >> + if (folio != fault_folio) >> + folio_lock(folio); >> + if (split_folio(folio)) { >> + if (folio != fault_folio) >> + folio_unlock(folio); >> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); >> + goto next; >> + } >> + >> + new_folio = page_folio(page); >> + if (fault_folio) >> + new_fault_folio = page_folio(migrate->fault_page); >> + >> + /* >> + * Ensure the lock is held on the correct >> + * folio after the split >> + */ >> + if (!new_fault_folio) { >> + folio_unlock(folio); >> + folio_put(folio); >> + } else if (folio != new_fault_folio) { >> + folio_get(new_fault_folio); >> + folio_lock(new_fault_folio); >> + folio_unlock(folio); >> + folio_put(folio); >> + } >> + >> + addr = start; >> + goto again; > > Another thing to revisit with clean mind. > Sure >> + } >> + >> mpfn = migrate_pfn(page_to_pfn(page)) | >> MIGRATE_PFN_MIGRATE; >> if (is_writable_device_private_entry(entry)) >> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c >> index e981a1a292d2..246e6c211f34 100644 >> --- a/mm/page_vma_mapped.c >> +++ b/mm/page_vma_mapped.c >> @@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >> pvmw->ptl = pmd_lock(mm, pvmw->pmd); >> pmde = *pvmw->pmd; >> if (!pmd_present(pmde)) { >> - swp_entry_t entry; >> + swp_entry_t entry = pmd_to_swp_entry(pmde); >> if (!thp_migration_supported() || >> !(pvmw->flags & PVMW_MIGRATION)) >> return not_found(pvmw); >> - entry = pmd_to_swp_entry(pmde); >> if (!is_migration_entry(entry) || >> !check_pmd(swp_offset_pfn(entry), pvmw)) >> return not_found(pvmw); >> @@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) >> * cannot return prematurely, while zap_huge_pmd() has >> * cleared *pmd but not decremented compound_mapcount(). 
>> */ >> + swp_entry_t entry; >> + >> + entry = pmd_to_swp_entry(pmde); >> + >> + if (is_device_private_entry(entry) && >> + (pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) { >> + pvmw->ptl = pmd_lock(mm, pvmw->pmd); >> + return true; >> + } >> + >> if ((pvmw->flags & PVMW_SYNC) && >> thp_vma_suitable_order(vma, pvmw->address, >> PMD_ORDER) && >> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c >> index 567e2d084071..604e8206a2ec 100644 >> --- a/mm/pgtable-generic.c >> +++ b/mm/pgtable-generic.c >> @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) >> *pmdvalp = pmdval; >> if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) >> goto nomap; >> + if (is_swap_pmd(pmdval)) { >> + swp_entry_t entry = pmd_to_swp_entry(pmdval); >> + >> + if (is_device_private_entry(entry)) >> + goto nomap; >> + } >> if (unlikely(pmd_trans_huge(pmdval))) >> goto nomap; >> if (unlikely(pmd_bad(pmdval))) { >> diff --git a/mm/rmap.c b/mm/rmap.c >> index b5837075b6e0..f40e45564295 100644 >> --- a/mm/rmap.c >> +++ b/mm/rmap.c >> @@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, >> unsigned long address, void *arg) >> { >> struct mm_struct *mm = vma->vm_mm; >> - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); >> + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, >> + PVMW_THP_DEVICE_PRIVATE); >> bool anon_exclusive, writable, ret = true; >> pte_t pteval; >> struct page *subpage; >> @@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, >> while (page_vma_mapped_walk(&pvmw)) { >> /* PMD-mapped THP migration entry */ >> if (!pvmw.pte) { >> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION >> + unsigned long pfn; >> +#endif >> + >> if (flags & TTU_SPLIT_HUGE_PMD) { >> split_huge_pmd_locked(vma, pvmw.address, >> pvmw.pmd, true); >> @@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, >> break; >> } >> #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION >> - subpage = folio_page(folio, >> - pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); >> + /* >> + * Zone device private folios do not work well with >> + * pmd_pfn() on some architectures due to pte >> + * inversion. >> + */ > > Please use the handling for the PTE case as inspiration. > > /* > * Handle PFN swap PTEs, such as device-exclusive ones, that > * actually map pages. > */ > pteval = ptep_get(pvmw.pte); > if (likely(pte_present(pteval))) { > pfn = pte_pfn(pteval); > } else { > pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); > VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); > } > > > So I would expect here something like > > /* > * Handle PFN swap PTEs, such as device-exclusive ones, that > * actually map pages. > */ > pmdval = pmdp_get(pvmw.pmd); > if (likely(pmd_present(pmdval))) > pfn = pmd_pfn(pmdval); > else > pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); > > I can switch over to pmd_present for the checks >> + if (is_pmd_device_private_entry(*pvmw.pmd)) { >> + swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd); >> + >> + pfn = swp_offset_pfn(entry); >> + } else { >> + pfn = pmd_pfn(*pvmw.pmd); >> + } >> + >> + subpage = folio_page(folio, pfn - folio_pfn(folio)); >> + >> VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || >> !folio_test_pmd_mappable(folio), folio); >> > > Thanks for the review Balbir
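Folding the last two review comments together, the PMD-mapped path in
try_to_migrate_one() would compute the subpage roughly as below; this is a
sketch built from the snippet quoted above, not the final patch:

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/*
		 * Handle PFN swap PMDs, such as device-private ones, that
		 * actually map pages.
		 */
		pmdval = pmdp_get(pvmw.pmd);
		if (likely(pmd_present(pmdval)))
			pfn = pmd_pfn(pmdval);
		else
			pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));

		subpage = folio_page(folio, pfn - folio_pfn(folio));
#endif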
>> Like teaching zap, mprotect, rmap walks .... code separately. >> >> I'm, sure you'll find a way to break this down so I don't walk out of a >> review with an headake ;) >> > > :) I had smaller chunks earlier, but then ran into don't add the change unless you > use the change problem > It's perfectly reasonable to have something like mm/huge_memory: teach copy_huge_pmd() about huge device-private entries mm/huge_memory: support splitting device-private folios ... etc :) [...] >> Careful: There is is_readable_exclusive_migration_entry(). So don't >> change the !is_readable_migration_entry(entry) to is_writable_migration_entry(entry)(), >> because it's wrong. >> > > Ack, I assume you are referring to potential prot_none entries? readable_exclusive are used to maintain the PageAnonExclusive bit right now for migration entries. So it's not realted to prot_none. [...] >>> - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); >>> + >>> + if (!thp_migration_supported()) >>> + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); >>> + >>> + if (is_pmd_device_private_entry(orig_pmd)) { >>> + folio_remove_rmap_pmd(folio, &folio->page, vma); >>> + WARN_ON_ONCE(folio_mapcount(folio) < 0); >> >> Can we jsut move that into the folio_is_device_private() check below. > > The check you mean? The whole thing like if (...) { folio_remove_rmap_pmd(folio, &folio->page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); folio_put(folio) } [...] > >> Why do we have to flush? pmd_clear() might be sufficient? In the PTE case we use pte_clear(). > > Without the flush, other entities will not see the cleared pmd and isn't the pte_clear() only > when should_defer_flush() is true? It's a non-present page entry, so there should be no TLB entry to flush. > >> >> [...] >> >>> pmde = pmd_mksoft_dirty(pmde); >>> if (is_writable_migration_entry(entry)) >>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c >>> index e05e14d6eacd..0ed337f94fcd 100644 >>> --- a/mm/migrate_device.c >>> +++ b/mm/migrate_device.c >>> @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, >>> * page table entry. Other special swap entries are not >>> * migratable, and we ignore regular swapped page. >>> */ >>> + struct folio *folio; >>> + >>> entry = pte_to_swp_entry(pte); >>> if (!is_device_private_entry(entry)) >>> goto next; >>> @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, >>> pgmap->owner != migrate->pgmap_owner) >>> goto next; >>> + folio = page_folio(page); >>> + if (folio_test_large(folio)) { >>> + struct folio *new_folio; >>> + struct folio *new_fault_folio = NULL; >>> + >>> + /* >>> + * The reason for finding pmd present with a >>> + * device private pte and a large folio for the >>> + * pte is partial unmaps. Split the folio now >>> + * for the migration to be handled correctly >>> + */ >> >> There are also other cases, like any VMA splits. Not sure if that makes a difference, >> the folio is PTE mapped. >> > > Ack, I can clarify that the folio is just pte mapped or remove the comment Sounds good. -- Cheers David / dhildenb
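Spelled out, the zap_huge_pmd() restructuring suggested above would look
roughly like the following; the placement of the WARN_ONCE and of the
device-private block after the existing accounting is assumed for
illustration:

	} else if (is_pmd_migration_entry(orig_pmd) ||
		   is_pmd_device_private_entry(orig_pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(orig_pmd);

		folio = pfn_swap_entry_folio(entry);
		flush_needed = 0;

		if (!thp_migration_supported())
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
	}

	/* ... existing anon/file accounting kept as-is ... */

	if (folio_is_device_private(folio)) {
		folio_remove_rmap_pmd(folio, &folio->page, vma);
		WARN_ON_ONCE(folio_mapcount(folio) < 0);
		folio_put(folio);
	}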
Hi Balbir,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Balbir-Singh/mm-zone_device-support-large-zone-device-private-folios/20250812-105145
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20250812024036.690064-3-balbirs%40nvidia.com
patch subject: [v3 02/11] mm/thp: zone_device awareness in THP handling code
config: x86_64-randconfig-001-20250812 (https://download.01.org/0day-ci/archive/20250812/202508122212.qjsfr5Wf-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250812/202508122212.qjsfr5Wf-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202508122212.qjsfr5Wf-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/migrate_device.c:154:19: warning: variable 'new_folio' set but not used [-Wunused-but-set-variable]
     154 |                 struct folio *new_folio;
         |                               ^
   1 warning generated.


vim +/new_folio +154 mm/migrate_device.c

    56
    57  static int migrate_vma_collect_pmd(pmd_t *pmdp,
    58                                     unsigned long start,
    59                                     unsigned long end,
    60                                     struct mm_walk *walk)
    61  {
    62          struct migrate_vma *migrate = walk->private;
    63          struct folio *fault_folio = migrate->fault_page ?
    64                  page_folio(migrate->fault_page) : NULL;
    65          struct vm_area_struct *vma = walk->vma;
    66          struct mm_struct *mm = vma->vm_mm;
    67          unsigned long addr = start, unmapped = 0;
    68          spinlock_t *ptl;
    69          pte_t *ptep;
    70
    71  again:
    72          if (pmd_none(*pmdp))
    73                  return migrate_vma_collect_hole(start, end, -1, walk);
    74
    75          if (pmd_trans_huge(*pmdp)) {
    76                  struct folio *folio;
    77
    78                  ptl = pmd_lock(mm, pmdp);
    79                  if (unlikely(!pmd_trans_huge(*pmdp))) {
    80                          spin_unlock(ptl);
    81                          goto again;
    82                  }
    83
    84                  folio = pmd_folio(*pmdp);
    85                  if (is_huge_zero_folio(folio)) {
    86                          spin_unlock(ptl);
    87                          split_huge_pmd(vma, pmdp, addr);
    88                  } else {
    89                          int ret;
    90
    91                          folio_get(folio);
    92                          spin_unlock(ptl);
    93                          /* FIXME: we don't expect THP for fault_folio */
    94                          if (WARN_ON_ONCE(fault_folio == folio))
    95                                  return migrate_vma_collect_skip(start, end,
    96                                                                  walk);
    97                          if (unlikely(!folio_trylock(folio)))
    98                                  return migrate_vma_collect_skip(start, end,
    99                                                                  walk);
   100                          ret = split_folio(folio);
   101                          if (fault_folio != folio)
   102                                  folio_unlock(folio);
   103                          folio_put(folio);
   104                          if (ret)
   105                                  return migrate_vma_collect_skip(start, end,
   106                                                                  walk);
   107                  }
   108          }
   109
   110          ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
   111          if (!ptep)
   112                  goto again;
   113          arch_enter_lazy_mmu_mode();
   114
   115          for (; addr < end; addr += PAGE_SIZE, ptep++) {
   116                  struct dev_pagemap *pgmap;
   117                  unsigned long mpfn = 0, pfn;
   118                  struct folio *folio;
   119                  struct page *page;
   120                  swp_entry_t entry;
   121                  pte_t pte;
   122
   123                  pte = ptep_get(ptep);
   124
   125                  if (pte_none(pte)) {
   126                          if (vma_is_anonymous(vma)) {
   127                                  mpfn = MIGRATE_PFN_MIGRATE;
   128                                  migrate->cpages++;
   129                          }
   130                          goto next;
   131                  }
   132
   133                  if (!pte_present(pte)) {
   134                          /*
   135                           * Only care about unaddressable device page special
   136                           * page table entry. Other special swap entries are not
   137                           * migratable, and we ignore regular swapped page.
   138                           */
   139                          struct folio *folio;
   140
   141                          entry = pte_to_swp_entry(pte);
   142                          if (!is_device_private_entry(entry))
   143                                  goto next;
   144
   145                          page = pfn_swap_entry_to_page(entry);
   146                          pgmap = page_pgmap(page);
   147                          if (!(migrate->flags &
   148                                MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
   149                              pgmap->owner != migrate->pgmap_owner)
   150                                  goto next;
   151
   152                          folio = page_folio(page);
   153                          if (folio_test_large(folio)) {
 > 154                                  struct folio *new_folio;
   155                                  struct folio *new_fault_folio = NULL;
   156
   157                                  /*
   158                                   * The reason for finding pmd present with a
   159                                   * device private pte and a large folio for the
   160                                   * pte is partial unmaps. Split the folio now
   161                                   * for the migration to be handled correctly
   162                                   */
   163                                  pte_unmap_unlock(ptep, ptl);
   164
   165                                  folio_get(folio);
   166                                  if (folio != fault_folio)
   167                                          folio_lock(folio);
   168                                  if (split_folio(folio)) {
   169                                          if (folio != fault_folio)
   170                                                  folio_unlock(folio);
   171                                          ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
   172                                          goto next;
   173                                  }
   174
   175                                  new_folio = page_folio(page);
   176                                  if (fault_folio)
   177                                          new_fault_folio = page_folio(migrate->fault_page);
   178
   179                                  /*
   180                                   * Ensure the lock is held on the correct
   181                                   * folio after the split
   182                                   */
   183                                  if (!new_fault_folio) {
   184                                          folio_unlock(folio);
   185                                          folio_put(folio);
   186                                  } else if (folio != new_fault_folio) {
   187                                          folio_get(new_fault_folio);
   188                                          folio_lock(new_fault_folio);
   189                                          folio_unlock(folio);
   190                                          folio_put(folio);
   191                                  }
   192
   193                                  addr = start;
   194                                  goto again;
   195                          }
   196
   197                          mpfn = migrate_pfn(page_to_pfn(page)) |
   198                                  MIGRATE_PFN_MIGRATE;
   199                          if (is_writable_device_private_entry(entry))
   200                                  mpfn |= MIGRATE_PFN_WRITE;
   201                  } else {
   202                          pfn = pte_pfn(pte);
   203                          if (is_zero_pfn(pfn) &&
   204                              (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
   205                                  mpfn = MIGRATE_PFN_MIGRATE;
   206                                  migrate->cpages++;
   207                                  goto next;
   208                          }
   209                          page = vm_normal_page(migrate->vma, addr, pte);
   210                          if (page && !is_zone_device_page(page) &&
   211                              !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
   212                                  goto next;
   213                          } else if (page && is_device_coherent_page(page)) {
   214                                  pgmap = page_pgmap(page);
   215
   216                                  if (!(migrate->flags &
   217                                        MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
   218                                      pgmap->owner != migrate->pgmap_owner)
   219                                          goto next;
   220                          }
   221                          mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
   222                          mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
   223                  }
   224
   225                  /* FIXME support THP */
   226                  if (!page || !page->mapping || PageTransCompound(page)) {
   227                          mpfn = 0;
   228                          goto next;
   229                  }
   230
   231                  /*
   232                   * By getting a reference on the folio we pin it and that blocks
   233                   * any kind of migration. Side effect is that it "freezes" the
   234                   * pte.
   235                   *
   236                   * We drop this reference after isolating the folio from the lru
   237                   * for non device folio (device folio are not on the lru and thus
   238                   * can't be dropped from it).
   239                   */
   240                  folio = page_folio(page);
   241                  folio_get(folio);
   242
   243                  /*
   244                   * We rely on folio_trylock() to avoid deadlock between
   245                   * concurrent migrations where each is waiting on the others
   246                   * folio lock. If we can't immediately lock the folio we fail this
   247                   * migration as it is only best effort anyway.
   248                   *
   249                   * If we can lock the folio it's safe to set up a migration entry
   250                   * now. In the common case where the folio is mapped once in a
   251                   * single process setting up the migration entry now is an
   252                   * optimisation to avoid walking the rmap later with
   253                   * try_to_migrate().
   254                   */
   255                  if (fault_folio == folio || folio_trylock(folio)) {
   256                          bool anon_exclusive;
   257                          pte_t swp_pte;
   258
   259                          flush_cache_page(vma, addr, pte_pfn(pte));
   260                          anon_exclusive = folio_test_anon(folio) &&
   261                                           PageAnonExclusive(page);
   262                          if (anon_exclusive) {
   263                                  pte = ptep_clear_flush(vma, addr, ptep);
   264
   265                                  if (folio_try_share_anon_rmap_pte(folio, page)) {
   266                                          set_pte_at(mm, addr, ptep, pte);
   267                                          if (fault_folio != folio)
   268                                                  folio_unlock(folio);
   269                                          folio_put(folio);
   270                                          mpfn = 0;
   271                                          goto next;
   272                                  }
   273                          } else {
   274                                  pte = ptep_get_and_clear(mm, addr, ptep);
   275                          }
   276
   277                          migrate->cpages++;
   278
   279                          /* Set the dirty flag on the folio now the pte is gone. */
   280                          if (pte_dirty(pte))
   281                                  folio_mark_dirty(folio);
   282
   283                          /* Setup special migration page table entry */
   284                          if (mpfn & MIGRATE_PFN_WRITE)
   285                                  entry = make_writable_migration_entry(
   286                                          page_to_pfn(page));
   287                          else if (anon_exclusive)
   288                                  entry = make_readable_exclusive_migration_entry(
   289                                          page_to_pfn(page));
   290                          else
   291                                  entry = make_readable_migration_entry(
   292                                          page_to_pfn(page));
   293                          if (pte_present(pte)) {
   294                                  if (pte_young(pte))
   295                                          entry = make_migration_entry_young(entry);
   296                                  if (pte_dirty(pte))
   297                                          entry = make_migration_entry_dirty(entry);
   298                          }
   299                          swp_pte = swp_entry_to_pte(entry);
   300                          if (pte_present(pte)) {
   301                                  if (pte_soft_dirty(pte))
   302                                          swp_pte = pte_swp_mksoft_dirty(swp_pte);
   303                                  if (pte_uffd_wp(pte))
   304                                          swp_pte = pte_swp_mkuffd_wp(swp_pte);
   305                          } else {
   306                                  if (pte_swp_soft_dirty(pte))
   307                                          swp_pte = pte_swp_mksoft_dirty(swp_pte);
   308                                  if (pte_swp_uffd_wp(pte))
   309                                          swp_pte = pte_swp_mkuffd_wp(swp_pte);
   310                          }
   311                          set_pte_at(mm, addr, ptep, swp_pte);
   312
   313                          /*
   314                           * This is like regular unmap: we remove the rmap and
   315                           * drop the folio refcount. The folio won't be freed, as
   316                           * we took a reference just above.
   317                           */
   318                          folio_remove_rmap_pte(folio, page, vma);
   319                          folio_put(folio);
   320
   321                          if (pte_present(pte))
   322                                  unmapped++;
   323                  } else {
   324                          folio_put(folio);
   325                          mpfn = 0;
   326                  }
   327
   328  next:
   329                  migrate->dst[migrate->npages] = 0;
   330                  migrate->src[migrate->npages++] = mpfn;
   331          }
   332
   333          /* Only flush the TLB if we actually modified any entries */
   334          if (unmapped)
   335                  flush_tlb_range(walk->vma, start, end);
   336
   337          arch_leave_lazy_mmu_mode();
   338          pte_unmap_unlock(ptep - 1, ptl);
   339
   340          return 0;
   341  }
   342

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
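The warning itself is straightforward to address: new_folio is assigned
after the split but never read. Below is a minimal sketch of the
large-folio branch with the unused local dropped; it is only an
illustration of one possible fix, not the next revision of the patch. It
also puts the folio reference taken before the split on the failed-split
path, which the posted hunk appears to leak.

		folio = page_folio(page);
		if (folio_test_large(folio)) {
			struct folio *new_fault_folio = NULL;

			/*
			 * A PTE-mapped large device-private folio (e.g. after
			 * a partial unmap) must be split before its pages can
			 * be collected individually.
			 */
			pte_unmap_unlock(ptep, ptl);

			folio_get(folio);
			if (folio != fault_folio)
				folio_lock(folio);
			if (split_folio(folio)) {
				if (folio != fault_folio)
					folio_unlock(folio);
				/* Drop the reference taken above. */
				folio_put(folio);
				ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				goto next;
			}

			if (fault_folio)
				new_fault_folio = page_folio(migrate->fault_page);

			/*
			 * After the split, make sure the lock and reference
			 * end up on the folio that now contains the fault
			 * page (if any), and release the pre-split folio.
			 */
			if (!new_fault_folio) {
				folio_unlock(folio);
				folio_put(folio);
			} else if (folio != new_fault_folio) {
				folio_get(new_fault_folio);
				folio_lock(new_fault_folio);
				folio_unlock(folio);
				folio_put(folio);
			}

			addr = start;
			goto again;
		}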