From: Mika Penttilä <mpenttil@redhat.com>
Currently, the way device page faulting and migration work together
is not optimal when you want to do both fault handling and migration
at once.

Being able to migrate non-present pages (or pages mapped with incorrect
permissions, e.g. COW) to the GPU requires doing either of the
following sequences (the first is sketched in the example below the list):

1. hmm_range_fault() - fault in non-present pages with correct permissions, etc.
2. migrate_vma_*() - migrate the pages
Or:
1. migrate_vma_*() - migrate present pages
2. If non-present pages detected by migrate_vma_*():
a) call hmm_range_fault() to fault pages in
b) call migrate_vma_*() again to migrate now present pages
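
For reference, the first sequence maps roughly onto the driver-side
pattern below. This is only a sketch: mmap locking, notifier_seq setup,
-EBUSY retry and error handling are omitted, and do_device_copy() is a
hypothetical placeholder for the driver's allocate-and-copy step.

static int demo_fault_then_migrate(struct mmu_interval_notifier *notifier,
				   struct vm_area_struct *vma,
				   unsigned long start, unsigned long end,
				   void *pgmap_owner, unsigned long *pfns,
				   unsigned long *src, unsigned long *dst)
{
	struct hmm_range range = {
		.notifier		= notifier,
		.start			= start,
		.end			= end,
		.hmm_pfns		= pfns,
		.default_flags		= HMM_PFN_REQ_FAULT,
		.dev_private_owner	= pgmap_owner,
	};
	struct migrate_vma args = {
		.vma			= vma,
		.start			= start,
		.end			= end,
		.src			= src,
		.dst			= dst,
		.pgmap_owner		= pgmap_owner,
		.flags			= MIGRATE_VMA_SELECT_SYSTEM,
	};
	int ret;

	/* First page table walk: fault the pages in. */
	ret = hmm_range_fault(&range);
	if (ret)
		return ret;

	/* Second page table walk: replace PTEs with migration entries. */
	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;

	do_device_copy(&args);		/* driver allocates dst pages and copies */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}
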
The problem with the first sequence is that you always have to do two
page walks, even though most of the time the pages are already present
or are zero-page mappings, so the common case takes a performance hit.

The second sequence is better for the common case, but far worse if
pages aren't present, because now you have to walk the page tables three
times (once to find the page is not present, once so hmm_range_fault()
can find a non-present page to fault in, and once again to set up the
migration). It is also tricky to code correctly.

We should be able to walk the page table once, faulting
pages in as required and replacing them with migration entries if
requested.

Add a new flag to the HMM API, HMM_PFN_REQ_MIGRATE, which requests that
migration also be prepared during fault handling. In addition, for the
migrate_vma_setup() call paths, a new flag, MIGRATE_VMA_FAULT, is added
to request fault handling as part of the migration.
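
With the new flags, the intended fault-path usage looks roughly like the
sketch below; migrate_hmm_range_setup() replaces migrate_vma_setup() in
this flow, do_device_copy() is again a hypothetical placeholder, and
locking/retry handling is omitted. Callers that start from
migrate_vma_setup() instead just set MIGRATE_VMA_FAULT in args->flags.

static int demo_fault_and_migrate(struct mmu_interval_notifier *notifier,
				  unsigned long start, unsigned long end,
				  void *pgmap_owner, unsigned long *pfns,
				  unsigned long *src, unsigned long *dst)
{
	struct migrate_vma args = {
		.src		= src,
		.dst		= dst,
		.pgmap_owner	= pgmap_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	struct hmm_range range = {
		.notifier		= notifier,
		.start			= start,
		.end			= end,
		.hmm_pfns		= pfns,
		.default_flags		= HMM_PFN_REQ_FAULT | HMM_PFN_REQ_MIGRATE,
		.dev_private_owner	= pgmap_owner,
		.migrate		= &args,
	};
	int ret;

	/*
	 * Single page table walk: faults pages in and installs migration
	 * entries. On this path hmm_range_fault() fills in args->vma,
	 * args->start and args->end.
	 */
	ret = hmm_range_fault(&range);
	if (ret)
		return ret;

	/* Convert hmm_pfns[] into migrate src/dst entries and unmap. */
	migrate_hmm_range_setup(&range);

	do_device_copy(&args);		/* driver allocates dst pages and copies */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}
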
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Suggested-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Mika Penttilä <mpenttil@redhat.com>
---
include/linux/hmm.h | 17 +-
include/linux/migrate.h | 6 +-
mm/hmm.c | 657 +++++++++++++++++++++++++++++++++++++---
mm/migrate_device.c | 81 ++++-
4 files changed, 706 insertions(+), 55 deletions(-)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index db75ffc949a7..7b7294ad0f62 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -12,7 +12,7 @@
#include <linux/mm.h>
struct mmu_interval_notifier;
-
+struct migrate_vma;
/*
* On output:
* 0 - The page is faultable and a future call with
@@ -48,15 +48,25 @@ enum hmm_pfn_flags {
HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
- HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
+ /* Migrate request */
+ HMM_PFN_MIGRATE = 1UL << (BITS_PER_LONG - 7),
+ HMM_PFN_COMPOUND = 1UL << (BITS_PER_LONG - 8),
+ HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 13),
/* Input flags */
HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
+ HMM_PFN_REQ_MIGRATE = HMM_PFN_MIGRATE,
HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
/*
* hmm_pfn_to_page() - return struct page pointed to by a device entry
*
@@ -107,6 +117,7 @@ static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn)
* @default_flags: default flags for the range (write, read, ... see hmm doc)
* @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
* @dev_private_owner: owner of device private pages
+ * @migrate: structure for migrating the associated vma
*/
struct hmm_range {
struct mmu_interval_notifier *notifier;
@@ -117,12 +128,14 @@ struct hmm_range {
unsigned long default_flags;
unsigned long pfn_flags_mask;
void *dev_private_owner;
+ struct migrate_vma *migrate;
};
/*
* Please see Documentation/mm/hmm.rst for how to use the range API.
*/
int hmm_range_fault(struct hmm_range *range);
+int hmm_range_migrate_prepare(struct hmm_range *range, struct migrate_vma **pargs);
/*
* HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 26ca00c325d9..0889309a9d21 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -3,6 +3,7 @@
#define _LINUX_MIGRATE_H
#include <linux/mm.h>
+#include <linux/hmm.h>
#include <linux/mempolicy.h>
#include <linux/migrate_mode.h>
#include <linux/hugetlb.h>
@@ -140,11 +141,12 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
}
-enum migrate_vma_direction {
+enum migrate_vma_info {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
+ MIGRATE_VMA_FAULT = 1 << 4,
};
struct migrate_vma {
@@ -192,7 +194,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
unsigned long npages);
void migrate_device_finalize(unsigned long *src_pfns,
unsigned long *dst_pfns, unsigned long npages);
-
+void migrate_hmm_range_setup(struct hmm_range *range);
#endif /* CONFIG_MIGRATION */
#endif /* _LINUX_MIGRATE_H */
diff --git a/mm/hmm.c b/mm/hmm.c
index 4ec74c18bef6..39a07d895043 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/leafops.h>
#include <linux/hugetlb.h>
+#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
@@ -31,8 +32,12 @@
#include "internal.h"
struct hmm_vma_walk {
- struct hmm_range *range;
- unsigned long last;
+ struct mmu_notifier_range mmu_range;
+ struct vm_area_struct *vma;
+ struct hmm_range *range;
+ unsigned long start;
+ unsigned long end;
+ unsigned long last;
};
enum {
@@ -41,21 +46,49 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
-enum {
- /* These flags are carried from input-to-output */
- HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
- HMM_PFN_P2PDMA_BUS,
-};
+static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
+{
+ enum migrate_vma_info minfo;
+
+ minfo = range->migrate ? range->migrate->flags : 0;
+ minfo |= (range->default_flags & HMM_PFN_REQ_MIGRATE) ?
+ MIGRATE_VMA_SELECT_SYSTEM : 0;
+
+ return minfo;
+}
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
- struct hmm_range *range, unsigned long cpu_flags)
+ struct hmm_vma_walk *hmm_vma_walk, unsigned long cpu_flags)
{
+ struct hmm_range *range = hmm_vma_walk->range;
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
+ enum migrate_vma_info minfo;
+ bool migrate = false;
+
+ minfo = hmm_select_migrate(range);
+ if (cpu_flags != HMM_PFN_ERROR) {
+ if (minfo && (vma_is_anonymous(hmm_vma_walk->vma))) {
+ cpu_flags |= (HMM_PFN_VALID | HMM_PFN_MIGRATE);
+ migrate = true;
+ }
+ }
+
+ if (migrate && thp_migration_supported() &&
+ (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
+ IS_ALIGNED(addr, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE)) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags | HMM_PFN_COMPOUND;
+ addr += PAGE_SIZE;
+ i++;
+ cpu_flags = 0;
+ }
for (; addr < end; addr += PAGE_SIZE, i++) {
range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
range->hmm_pfns[i] |= cpu_flags;
}
+
return 0;
}
@@ -171,11 +204,11 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
if (!walk->vma) {
if (required_fault)
return -EFAULT;
- return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
+ return hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
}
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);
- return hmm_pfns_fill(addr, end, range, 0);
+ return hmm_pfns_fill(addr, end, hmm_vma_walk, 0);
}
static inline unsigned long hmm_pfn_flags_order(unsigned long order)
@@ -289,10 +322,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
goto fault;
if (softleaf_is_migration(entry)) {
- pte_unmap(ptep);
- hmm_vma_walk->last = addr;
- migration_entry_wait(walk->mm, pmdp, addr);
- return -EBUSY;
+ if (!hmm_select_migrate(range)) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(walk->mm, pmdp, addr);
+ return -EBUSY;
+ } else
+ goto out;
}
/* Report error for everything else */
@@ -376,7 +412,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
return -EFAULT;
}
- return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
}
#else
static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
@@ -389,10 +425,448 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
- return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
}
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+/**
+ * migrate_vma_split_folio() - Helper function to split a THP folio
+ * @folio: the folio to split
+ * @fault_page: struct page associated with the fault if any
+ *
+ * Returns 0 on success
+ */
+static int migrate_vma_split_folio(struct folio *folio,
+ struct page *fault_page)
+{
+ int ret;
+ struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
+ struct folio *new_fault_folio = NULL;
+
+ if (folio != fault_folio) {
+ folio_get(folio);
+ folio_lock(folio);
+ }
+
+ ret = split_folio(folio);
+ if (ret) {
+ if (folio != fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return ret;
+ }
+
+ new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
+
+ /*
+ * Ensure the lock is held on the correct
+ * folio after the split
+ */
+ if (!new_fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ } else if (folio != new_fault_folio) {
+ if (new_fault_folio != fault_folio) {
+ folio_get(new_fault_folio);
+ folio_lock(new_fault_folio);
+ }
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return 0;
+}
+
+static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
+ pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *hmm_pfn)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct migrate_vma *migrate = range->migrate;
+ struct mm_struct *mm = walk->vma->vm_mm;
+ struct folio *fault_folio = NULL;
+ struct folio *folio;
+ enum migrate_vma_info minfo;
+ spinlock_t *ptl;
+ unsigned long i;
+ int r = 0;
+
+ minfo = hmm_select_migrate(range);
+ if (!minfo)
+ return r;
+
+ fault_folio = (migrate && migrate->fault_page) ?
+ page_folio(migrate->fault_page) : NULL;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (pmd_none(*pmdp)) {
+ spin_unlock(ptl);
+ return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
+ }
+
+ if (pmd_trans_huge(*pmdp)) {
+ if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
+ goto out;
+
+ folio = pmd_folio(*pmdp);
+ if (is_huge_zero_folio(folio)) {
+ spin_unlock(ptl);
+ return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
+ }
+
+ } else if (!pmd_present(*pmdp)) {
+ const softleaf_t entry = softleaf_from_pmd(*pmdp);
+
+ folio = softleaf_to_folio(entry);
+
+ if (!softleaf_is_device_private(entry))
+ goto out;
+
+ if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
+ goto out;
+ if (folio->pgmap->owner != migrate->pgmap_owner)
+ goto out;
+
+ } else {
+ spin_unlock(ptl);
+ return -EBUSY;
+ }
+
+ folio_get(folio);
+
+ if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
+ spin_unlock(ptl);
+ folio_put(folio);
+ return 0;
+ }
+
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+
+ struct page_vma_mapped_walk pvmw = {
+ .ptl = ptl,
+ .address = start,
+ .pmd = pmdp,
+ .vma = walk->vma,
+ };
+
+ hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
+
+ r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
+ if (r) {
+ hmm_pfn[0] &= ~(HMM_PFN_MIGRATE | HMM_PFN_COMPOUND);
+ r = -ENOENT; // fallback
+ goto unlock_out;
+ }
+ for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
+ hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
+
+ } else {
+ r = -ENOENT; // fallback
+ goto unlock_out;
+ }
+
+
+out:
+ spin_unlock(ptl);
+ return r;
+
+unlock_out:
+ if (folio != fault_folio)
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+
+}
+
+/*
+ * Install migration entries if migration requested, either from fault
+ * or migrate paths.
+ *
+ */
+static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
+ pmd_t *pmdp,
+ unsigned long addr,
+ unsigned long *hmm_pfn)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct migrate_vma *migrate = range->migrate;
+ struct mm_struct *mm = walk->vma->vm_mm;
+ struct folio *fault_folio = NULL;
+ enum migrate_vma_info minfo;
+ struct dev_pagemap *pgmap;
+ bool anon_exclusive;
+ struct folio *folio;
+ unsigned long pfn;
+ struct page *page;
+ softleaf_t entry;
+ pte_t pte, swp_pte;
+ spinlock_t *ptl;
+ bool writable = false;
+ pte_t *ptep;
+
+ // Do we want to migrate at all?
+ minfo = hmm_select_migrate(range);
+ if (!minfo)
+ return 0;
+
+ fault_folio = (migrate && migrate->fault_page) ?
+ page_folio(migrate->fault_page) : NULL;
+
+again:
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ return 0;
+
+ pte = ptep_get(ptep);
+
+ if (pte_none(pte)) {
+ // migrate without faulting case
+ if (vma_is_anonymous(walk->vma)) {
+ *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
+ *hmm_pfn |= HMM_PFN_MIGRATE | HMM_PFN_VALID;
+ goto out;
+ }
+ }
+
+ if (!pte_present(pte)) {
+ /*
+ * Only care about unaddressable device page special
+ * page table entry. Other special swap entries are not
+ * migratable, and we ignore regular swapped page.
+ */
+ entry = softleaf_from_pte(pte);
+ if (!softleaf_is_device_private(entry))
+ goto out;
+
+ if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
+ goto out;
+
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+ if (folio->pgmap->owner != migrate->pgmap_owner)
+ goto out;
+
+ if (folio_test_large(folio)) {
+ int ret;
+
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+ if (ret)
+ goto out_unlocked;
+ goto again;
+ }
+
+ pfn = page_to_pfn(page);
+ if (softleaf_is_device_private_write(entry))
+ writable = true;
+ } else {
+ pfn = pte_pfn(pte);
+ if (is_zero_pfn(pfn) &&
+ (minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
+ *hmm_pfn = HMM_PFN_MIGRATE|HMM_PFN_VALID;
+ goto out;
+ }
+ page = vm_normal_page(walk->vma, addr, pte);
+ if (page && !is_zone_device_page(page) &&
+ !(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
+ goto out;
+ } else if (page && is_device_coherent_page(page)) {
+ pgmap = page_pgmap(page);
+
+ if (!(minfo &
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ pgmap->owner != migrate->pgmap_owner)
+ goto out;
+ }
+
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ int ret;
+
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+ if (ret)
+ goto out_unlocked;
+
+ goto again;
+ }
+
+ writable = pte_write(pte);
+ }
+
+ if (!page || !page->mapping)
+ goto out;
+
+ /*
+ * By getting a reference on the folio we pin it and that blocks
+ * any kind of migration. Side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the folio from the lru
+ * for non device folio (device folio are not on the lru and thus
+ * can't be dropped from it).
+ */
+ folio = page_folio(page);
+ folio_get(folio);
+
+ /*
+ * We rely on folio_trylock() to avoid deadlock between
+ * concurrent migrations where each is waiting on the others
+ * folio lock. If we can't immediately lock the folio we fail this
+ * migration as it is only best effort anyway.
+ *
+ * If we can lock the folio it's safe to set up a migration entry
+ * now. In the common case where the folio is mapped once in a
+ * single process setting up the migration entry now is an
+ * optimisation to avoid walking the rmap later with
+ * try_to_migrate().
+ */
+
+ if (fault_folio == folio || folio_trylock(folio)) {
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(page);
+
+ flush_cache_page(walk->vma, addr, pfn);
+
+ if (anon_exclusive) {
+ pte = ptep_clear_flush(walk->vma, addr, ptep);
+
+ if (folio_try_share_anon_rmap_pte(folio, page)) {
+ set_pte_at(mm, addr, ptep, pte);
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+ }
+ } else {
+ pte = ptep_get_and_clear(mm, addr, ptep);
+ }
+
+ /* Setup special migration page table entry */
+ if (writable)
+ entry = make_writable_migration_entry(pfn);
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(pfn);
+ else
+ entry = make_readable_migration_entry(pfn);
+
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
+
+ set_pte_at(mm, addr, ptep, swp_pte);
+ folio_remove_rmap_pte(folio, page, walk->vma);
+ folio_put(folio);
+ *hmm_pfn |= HMM_PFN_MIGRATE;
+
+ if (pte_present(pte))
+ flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
+ } else
+ folio_put(folio);
+out:
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+out_unlocked:
+ return -1;
+
+}
+
+static int hmm_vma_walk_split(pmd_t *pmdp,
+ unsigned long addr,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct migrate_vma *migrate = range->migrate;
+ struct folio *folio, *fault_folio;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ fault_folio = (migrate && migrate->fault_page) ?
+ page_folio(migrate->fault_page) : NULL;
+
+ ptl = pmd_lock(walk->mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ folio = pmd_folio(*pmdp);
+ if (is_huge_zero_folio(folio)) {
+ spin_unlock(ptl);
+ split_huge_pmd(walk->vma, pmdp, addr);
+ } else {
+ folio_get(folio);
+ spin_unlock(ptl);
+
+ if (folio != fault_folio) {
+ if (unlikely(!folio_trylock(folio))) {
+ folio_put(folio);
+ ret = -EBUSY;
+ goto out;
+ }
+ } else
+ folio_put(folio);
+
+ ret = split_folio(folio);
+ if (fault_folio != folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ }
+out:
+ return ret;
+}
+
+static int hmm_vma_capture_migrate_range(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+
+ if (!hmm_select_migrate(range))
+ return 0;
+
+ if (hmm_vma_walk->vma && (hmm_vma_walk->vma != walk->vma))
+ return -ERANGE;
+
+ hmm_vma_walk->vma = walk->vma;
+ hmm_vma_walk->start = start;
+ hmm_vma_walk->end = end;
+
+ if (end - start > range->end - range->start)
+ return -ERANGE;
+
+ if (!hmm_vma_walk->mmu_range.owner) {
+ mmu_notifier_range_init_owner(&hmm_vma_walk->mmu_range, MMU_NOTIFY_MIGRATE, 0,
+ walk->vma->vm_mm, start, end,
+ range->dev_private_owner);
+ mmu_notifier_invalidate_range_start(&hmm_vma_walk->mmu_range);
+ }
+
+ return 0;
+}
+
static int hmm_vma_walk_pmd(pmd_t *pmdp,
unsigned long start,
unsigned long end,
@@ -404,42 +878,90 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
&range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
unsigned long npages = (end - start) >> PAGE_SHIFT;
unsigned long addr = start;
+ enum migrate_vma_info minfo;
+ unsigned long i;
+ spinlock_t *ptl;
pte_t *ptep;
pmd_t pmd;
+ int r;
+ minfo = hmm_select_migrate(range);
again:
+
pmd = pmdp_get_lockless(pmdp);
- if (pmd_none(pmd))
- return hmm_vma_walk_hole(start, end, -1, walk);
+ if (pmd_none(pmd)) {
+ r = hmm_vma_walk_hole(start, end, -1, walk);
+ if (r || !minfo)
+ return r;
+
+ ptl = pmd_lock(walk->mm, pmdp);
+ if (pmd_none(*pmdp)) {
+ // hmm_vma_walk_hole() filled migration needs
+ spin_unlock(ptl);
+ return r;
+ }
+ spin_unlock(ptl);
+ }
if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
- hmm_vma_walk->last = addr;
- pmd_migration_entry_wait(walk->mm, pmdp);
- return -EBUSY;
+ if (!minfo) {
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
+ hmm_vma_walk->last = addr;
+ pmd_migration_entry_wait(walk->mm, pmdp);
+ return -EBUSY;
+ }
}
- return hmm_pfns_fill(start, end, range, 0);
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++)
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+
+ return 0;
}
- if (!pmd_present(pmd))
- return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
- pmd);
+ if (pmd_trans_huge(pmd) || !pmd_present(pmd)) {
+
+ if (!pmd_present(pmd)) {
+ r = hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
+ pmd);
+ if (r || !minfo)
+ return r;
+ } else {
+
+ /*
+ * No need to take pmd_lock here, even if some other thread
+ * is splitting the huge pmd we will get that event through
+ * mmu_notifier callback.
+ *
+ * So just read pmd value and check again it's a transparent
+ * huge or device mapping one and compute corresponding pfn
+ * values.
+ */
+
+ pmd = pmdp_get_lockless(pmdp);
+ if (!pmd_trans_huge(pmd))
+ goto again;
+
+ r = hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
+
+ if (r || !minfo)
+ return r;
+ }
- if (pmd_trans_huge(pmd)) {
- /*
- * No need to take pmd_lock here, even if some other thread
- * is splitting the huge pmd we will get that event through
- * mmu_notifier callback.
- *
- * So just read pmd value and check again it's a transparent
- * huge or device mapping one and compute corresponding pfn
- * values.
- */
- pmd = pmdp_get_lockless(pmdp);
- if (!pmd_trans_huge(pmd))
- goto again;
+ r = hmm_vma_handle_migrate_prepare_pmd(walk, pmdp, start, end, hmm_pfns);
+
+ if (r == -ENOENT) {
+ r = hmm_vma_walk_split(pmdp, addr, walk);
+ if (r) {
+ /* Split not successful, skip */
+ return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
+ }
+
+ /* Split successful or "again", reloop */
+ hmm_vma_walk->last = addr;
+ return -EBUSY;
+ }
+
+ return r;
- return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
}
/*
@@ -451,22 +973,26 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
if (pmd_bad(pmd)) {
if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
- return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
}
ptep = pte_offset_map(pmdp, addr);
if (!ptep)
goto again;
for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
- int r;
r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
if (r) {
/* hmm_vma_handle_pte() did pte_unmap() */
return r;
}
+
+ r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
+ if (r)
+ break;
}
pte_unmap(ptep - 1);
+
return 0;
}
@@ -600,6 +1126,11 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
+ int r;
+
+ r = hmm_vma_capture_migrate_range(start, end, walk);
+ if (r)
+ return r;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
vma->vm_flags & VM_READ)
@@ -622,7 +1153,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
(end - start) >> PAGE_SHIFT, 0))
return -EFAULT;
- hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
/* Skip this vma and continue processing the next vma. */
return 1;
@@ -652,9 +1183,17 @@ static const struct mm_walk_ops hmm_walk_ops = {
* the invalidation to finish.
* -EFAULT: A page was requested to be valid and could not be made valid
* ie it has no backing VMA or it is illegal to access
+ * -ERANGE: The range crosses multiple VMAs, or the hmm_pfns array is
+ * too small.
*
* This is similar to get_user_pages(), except that it can read the page tables
* without mutating them (ie causing faults).
+ *
+ * If you want to migrate after faulting, call hmm_range_fault() with
+ * HMM_PFN_REQ_MIGRATE set and initialize the range.migrate field.
+ * After hmm_range_fault(), call migrate_hmm_range_setup() instead of
+ * migrate_vma_setup() and then follow the normal migrate call path.
+ *
*/
int hmm_range_fault(struct hmm_range *range)
{
@@ -662,16 +1201,28 @@ int hmm_range_fault(struct hmm_range *range)
.range = range,
.last = range->start,
};
- struct mm_struct *mm = range->notifier->mm;
+ bool is_fault_path = !!range->notifier;
+ struct mm_struct *mm;
int ret;
+ /*
+ *
+ * Could be serving a device fault or come from migrate
+ * entry point. For the former we have not resolved the vma
+ * yet, and the latter we don't have a notifier (but have a vma).
+ *
+ */
+ mm = is_fault_path ? range->notifier->mm : range->migrate->vma->vm_mm;
mmap_assert_locked(mm);
do {
/* If range is no longer valid force retry. */
- if (mmu_interval_check_retry(range->notifier,
- range->notifier_seq))
- return -EBUSY;
+ if (is_fault_path && mmu_interval_check_retry(range->notifier,
+ range->notifier_seq)) {
+ ret = -EBUSY;
+ break;
+ }
+
ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
&hmm_walk_ops, &hmm_vma_walk);
/*
@@ -681,6 +1232,18 @@ int hmm_range_fault(struct hmm_range *range)
* output, and all >= are still at their input values.
*/
} while (ret == -EBUSY);
+
+ if (hmm_select_migrate(range) && range->migrate &&
+ hmm_vma_walk.mmu_range.owner) {
+ // The migrate_vma path has the following initialized
+ if (is_fault_path) {
+ range->migrate->vma = hmm_vma_walk.vma;
+ range->migrate->start = range->start;
+ range->migrate->end = hmm_vma_walk.end;
+ }
+ mmu_notifier_invalidate_range_end(&hmm_vma_walk.mmu_range);
+ }
+
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 23379663b1e1..d89efdfca8f6 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -734,7 +734,17 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
*/
int migrate_vma_setup(struct migrate_vma *args)
{
+ int ret;
long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+ struct hmm_range range = {
+ .notifier = NULL,
+ .start = args->start,
+ .end = args->end,
+ .hmm_pfns = args->src,
+ .dev_private_owner = args->pgmap_owner,
+ .migrate = args
+ };
args->start &= PAGE_MASK;
args->end &= PAGE_MASK;
@@ -759,17 +769,19 @@ int migrate_vma_setup(struct migrate_vma *args)
args->cpages = 0;
args->npages = 0;
- migrate_vma_collect(args);
+ if (args->flags & MIGRATE_VMA_FAULT)
+ range.default_flags |= HMM_PFN_REQ_FAULT;
+
+ ret = hmm_range_fault(&range);
- if (args->cpages)
- migrate_vma_unmap(args);
+ migrate_hmm_range_setup(&range);
/*
* At this point pages are locked and unmapped, and thus they have
* stable content and can safely be copied to destination memory that
* is allocated by the drivers.
*/
- return 0;
+ return ret;
}
EXPORT_SYMBOL(migrate_vma_setup);
@@ -1489,3 +1501,64 @@ int migrate_device_coherent_folio(struct folio *folio)
return 0;
return -EBUSY;
}
+
+void migrate_hmm_range_setup(struct hmm_range *range)
+{
+
+ struct migrate_vma *migrate = range->migrate;
+
+ if (!migrate)
+ return;
+
+ migrate->npages = (migrate->end - migrate->start) >> PAGE_SHIFT;
+ migrate->cpages = 0;
+
+ for (unsigned long i = 0; i < migrate->npages; i++) {
+
+ unsigned long pfn = range->hmm_pfns[i];
+
+ pfn &= ~HMM_PFN_INOUT_FLAGS;
+
+ /*
+ *
+ * Don't do migration if valid and migrate flags are not both set.
+ *
+ */
+ if ((pfn & (HMM_PFN_VALID | HMM_PFN_MIGRATE)) !=
+ (HMM_PFN_VALID | HMM_PFN_MIGRATE)) {
+ migrate->src[i] = 0;
+ migrate->dst[i] = 0;
+ continue;
+ }
+
+ migrate->cpages++;
+
+ /*
+ *
+ * The zero page is encoded in a special way, valid and migrate is
+ * set, and pfn part is zero. Encode specially for migrate also.
+ *
+ */
+ if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE)) {
+ migrate->src[i] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[i] = 0;
+ continue;
+ }
+ if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE|HMM_PFN_COMPOUND)) {
+ migrate->src[i] = MIGRATE_PFN_MIGRATE|MIGRATE_PFN_COMPOUND;
+ migrate->dst[i] = 0;
+ continue;
+ }
+
+ migrate->src[i] = migrate_pfn(page_to_pfn(hmm_pfn_to_page(pfn)))
+ | MIGRATE_PFN_MIGRATE;
+ migrate->src[i] |= (pfn & HMM_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0;
+ migrate->src[i] |= (pfn & HMM_PFN_COMPOUND) ? MIGRATE_PFN_COMPOUND : 0;
+ migrate->dst[i] = 0;
+ }
+
+ if (migrate->cpages)
+ migrate_vma_unmap(migrate);
+
+}
+EXPORT_SYMBOL(migrate_hmm_range_setup);
--
2.50.0
On Wed, Jan 14, 2026 at 11:19:21AM +0200, mpenttil@redhat.com wrote:
> From: Mika Penttilä <mpenttil@redhat.com>
>
> Currently, the way device page faulting and migration works
> is not optimal, if you want to do both fault handling and
> migration at once.
>
> Being able to migrate not present pages (or pages mapped with incorrect
> permissions, eg. COW) to the GPU requires doing either of the
> following sequences:
>
> 1. hmm_range_fault() - fault in non-present pages with correct permissions, etc.
> 2. migrate_vma_*() - migrate the pages
>
> Or:
>
> 1. migrate_vma_*() - migrate present pages
> 2. If non-present pages detected by migrate_vma_*():
> a) call hmm_range_fault() to fault pages in
> b) call migrate_vma_*() again to migrate now present pages
>
> The problem with the first sequence is that you always have to do two
> page walks even when most of the time the pages are present or zero page
> mappings so the common case takes a performance hit.
>
> The second sequence is better for the common case, but far worse if
> pages aren't present because now you have to walk the page tables three
> times (once to find the page is not present, once so hmm_range_fault()
> can find a non-present page to fault in and once again to setup the
> migration). It is also tricky to code correctly.
>
> We should be able to walk the page table once, faulting
> pages in as required and replacing them with migration entries if
> requested.
>
> Add a new flag to HMM APIs, HMM_PFN_REQ_MIGRATE,
> which tells to prepare for migration also during fault handling.
> Also, for the migrate_vma_setup() call paths, a flags, MIGRATE_VMA_FAULT,
> is added to tell to add fault handling to migrate.
>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Jason Gunthorpe <jgg@nvidia.com>
> Cc: Leon Romanovsky <leonro@nvidia.com>
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: Balbir Singh <balbirs@nvidia.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
I'll try to test this when I can but I'm horribly behind at the moment.
You can use Intel's CI system to test SVM too. I can get you authorized
to use this. The list to trigger is intel-xe@lists.freedesktop.org and
patches must apply to drm-tip. I'll let you know when you are
authorized.
> Suggested-by: Alistair Popple <apopple@nvidia.com>
> Signed-off-by: Mika Penttilä <mpenttil@redhat.com>
> ---
> include/linux/hmm.h | 17 +-
> include/linux/migrate.h | 6 +-
> mm/hmm.c | 657 +++++++++++++++++++++++++++++++++++++---
> mm/migrate_device.c | 81 ++++-
> 4 files changed, 706 insertions(+), 55 deletions(-)
>
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index db75ffc949a7..7b7294ad0f62 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -12,7 +12,7 @@
> #include <linux/mm.h>
>
> struct mmu_interval_notifier;
> -
> +struct migrate_vma;
> /*
> * On output:
> * 0 - The page is faultable and a future call with
> @@ -48,15 +48,25 @@ enum hmm_pfn_flags {
> HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
> HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
>
> - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
> + /* Migrate request */
> + HMM_PFN_MIGRATE = 1UL << (BITS_PER_LONG - 7),
> + HMM_PFN_COMPOUND = 1UL << (BITS_PER_LONG - 8),
> + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 13),
>
> /* Input flags */
> HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
> HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
> + HMM_PFN_REQ_MIGRATE = HMM_PFN_MIGRATE,
I believe you are missing kernel-doc for HMM_PFN_MIGRATE.
>
> HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
> };
>
> +enum {
> + /* These flags are carried from input-to-output */
> + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
> + HMM_PFN_P2PDMA_BUS,
> +};
> +
> /*
> * hmm_pfn_to_page() - return struct page pointed to by a device entry
> *
> @@ -107,6 +117,7 @@ static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn)
> * @default_flags: default flags for the range (write, read, ... see hmm doc)
> * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
> * @dev_private_owner: owner of device private pages
> + * @migrate: structure for migrating the associated vma
> */
> struct hmm_range {
> struct mmu_interval_notifier *notifier;
> @@ -117,12 +128,14 @@ struct hmm_range {
> unsigned long default_flags;
> unsigned long pfn_flags_mask;
> void *dev_private_owner;
> + struct migrate_vma *migrate;
> };
>
> /*
> * Please see Documentation/mm/hmm.rst for how to use the range API.
> */
> int hmm_range_fault(struct hmm_range *range);
> +int hmm_range_migrate_prepare(struct hmm_range *range, struct migrate_vma **pargs);
>
> /*
> * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 26ca00c325d9..0889309a9d21 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -3,6 +3,7 @@
> #define _LINUX_MIGRATE_H
>
> #include <linux/mm.h>
> +#include <linux/hmm.h>
> #include <linux/mempolicy.h>
> #include <linux/migrate_mode.h>
> #include <linux/hugetlb.h>
> @@ -140,11 +141,12 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
> }
>
> -enum migrate_vma_direction {
> +enum migrate_vma_info {
> MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
> MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
> MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
> MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
> + MIGRATE_VMA_FAULT = 1 << 4,
> };
>
> struct migrate_vma {
> @@ -192,7 +194,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> unsigned long npages);
> void migrate_device_finalize(unsigned long *src_pfns,
> unsigned long *dst_pfns, unsigned long npages);
> -
> +void migrate_hmm_range_setup(struct hmm_range *range);
> #endif /* CONFIG_MIGRATION */
>
> #endif /* _LINUX_MIGRATE_H */
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 4ec74c18bef6..39a07d895043 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -20,6 +20,7 @@
> #include <linux/pagemap.h>
> #include <linux/leafops.h>
> #include <linux/hugetlb.h>
> +#include <linux/migrate.h>
> #include <linux/memremap.h>
> #include <linux/sched/mm.h>
> #include <linux/jump_label.h>
> @@ -31,8 +32,12 @@
> #include "internal.h"
>
> struct hmm_vma_walk {
> - struct hmm_range *range;
> - unsigned long last;
> + struct mmu_notifier_range mmu_range;
> + struct vm_area_struct *vma;
> + struct hmm_range *range;
> + unsigned long start;
> + unsigned long end;
> + unsigned long last;
> };
>
> enum {
> @@ -41,21 +46,49 @@ enum {
> HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
> };
>
> -enum {
> - /* These flags are carried from input-to-output */
> - HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
> - HMM_PFN_P2PDMA_BUS,
> -};
> +static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
> +{
> + enum migrate_vma_info minfo;
> +
> + minfo = range->migrate ? range->migrate->flags : 0;
> + minfo |= (range->default_flags & HMM_PFN_REQ_MIGRATE) ?
> + MIGRATE_VMA_SELECT_SYSTEM : 0;
I'm trying to make sense of HMM_PFN_REQ_MIGRATE and why it sets
MIGRATE_VMA_SELECT_SYSTEM?
Also, just on the general usage - would range->migrate be NULL in the
expected usage?
Maybe an example of how hmm_range_fault would be called with this flag
and subsequent expected migrate calls would clear this up.
> +
> + return minfo;
> +}
>
> static int hmm_pfns_fill(unsigned long addr, unsigned long end,
> - struct hmm_range *range, unsigned long cpu_flags)
> + struct hmm_vma_walk *hmm_vma_walk, unsigned long cpu_flags)
> {
> + struct hmm_range *range = hmm_vma_walk->range;
> unsigned long i = (addr - range->start) >> PAGE_SHIFT;
> + enum migrate_vma_info minfo;
> + bool migrate = false;
> +
> + minfo = hmm_select_migrate(range);
> + if (cpu_flags != HMM_PFN_ERROR) {
> + if (minfo && (vma_is_anonymous(hmm_vma_walk->vma))) {
> + cpu_flags |= (HMM_PFN_VALID | HMM_PFN_MIGRATE);
> + migrate = true;
> + }
> + }
> +
> + if (migrate && thp_migration_supported() &&
> + (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
> + IS_ALIGNED(addr, HPAGE_PMD_SIZE) &&
> + IS_ALIGNED(end, HPAGE_PMD_SIZE)) {
> + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> + range->hmm_pfns[i] |= cpu_flags | HMM_PFN_COMPOUND;
> + addr += PAGE_SIZE;
> + i++;
> + cpu_flags = 0;
> + }
>
> for (; addr < end; addr += PAGE_SIZE, i++) {
> range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> range->hmm_pfns[i] |= cpu_flags;
> }
> +
> return 0;
> }
>
> @@ -171,11 +204,11 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
> if (!walk->vma) {
> if (required_fault)
> return -EFAULT;
> - return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
> + return hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
> }
> if (required_fault)
> return hmm_vma_fault(addr, end, required_fault, walk);
> - return hmm_pfns_fill(addr, end, range, 0);
> + return hmm_pfns_fill(addr, end, hmm_vma_walk, 0);
> }
>
> static inline unsigned long hmm_pfn_flags_order(unsigned long order)
> @@ -289,10 +322,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> goto fault;
>
> if (softleaf_is_migration(entry)) {
> - pte_unmap(ptep);
> - hmm_vma_walk->last = addr;
> - migration_entry_wait(walk->mm, pmdp, addr);
> - return -EBUSY;
> + if (!hmm_select_migrate(range)) {
> + pte_unmap(ptep);
> + hmm_vma_walk->last = addr;
> + migration_entry_wait(walk->mm, pmdp, addr);
> + return -EBUSY;
> + } else
> + goto out;
> }
>
> /* Report error for everything else */
> @@ -376,7 +412,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> return -EFAULT;
> }
>
> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> }
> #else
> static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> @@ -389,10 +425,448 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
>
> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> return -EFAULT;
> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> }
> #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>
> +/**
> + * migrate_vma_split_folio() - Helper function to split a THP folio
> + * @folio: the folio to split
> + * @fault_page: struct page associated with the fault if any
> + *
> + * Returns 0 on success
> + */
> +static int migrate_vma_split_folio(struct folio *folio,
> + struct page *fault_page)
> +{
> + int ret;
> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
> + struct folio *new_fault_folio = NULL;
> +
> + if (folio != fault_folio) {
> + folio_get(folio);
> + folio_lock(folio);
> + }
> +
> + ret = split_folio(folio);
> + if (ret) {
> + if (folio != fault_folio) {
> + folio_unlock(folio);
> + folio_put(folio);
> + }
> + return ret;
> + }
> +
> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
> +
> + /*
> + * Ensure the lock is held on the correct
> + * folio after the split
> + */
> + if (!new_fault_folio) {
> + folio_unlock(folio);
> + folio_put(folio);
> + } else if (folio != new_fault_folio) {
> + if (new_fault_folio != fault_folio) {
> + folio_get(new_fault_folio);
> + folio_lock(new_fault_folio);
> + }
> + folio_unlock(folio);
> + folio_put(folio);
> + }
> +
> + return 0;
> +}
> +
> +static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
> + pmd_t *pmdp,
> + unsigned long start,
> + unsigned long end,
> + unsigned long *hmm_pfn)
> +{
> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> + struct hmm_range *range = hmm_vma_walk->range;
> + struct migrate_vma *migrate = range->migrate;
> + struct mm_struct *mm = walk->vma->vm_mm;
> + struct folio *fault_folio = NULL;
> + struct folio *folio;
> + enum migrate_vma_info minfo;
> + spinlock_t *ptl;
> + unsigned long i;
> + int r = 0;
> +
> + minfo = hmm_select_migrate(range);
> + if (!minfo)
> + return r;
> +
> + fault_folio = (migrate && migrate->fault_page) ?
> + page_folio(migrate->fault_page) : NULL;
> +
> + ptl = pmd_lock(mm, pmdp);
> + if (pmd_none(*pmdp)) {
> + spin_unlock(ptl);
> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
> + }
> +
> + if (pmd_trans_huge(*pmdp)) {
> + if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
> + goto out;
> +
> + folio = pmd_folio(*pmdp);
> + if (is_huge_zero_folio(folio)) {
> + spin_unlock(ptl);
> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
> + }
> +
> + } else if (!pmd_present(*pmdp)) {
> + const softleaf_t entry = softleaf_from_pmd(*pmdp);
> +
> + folio = softleaf_to_folio(entry);
> +
> + if (!softleaf_is_device_private(entry))
> + goto out;
> +
> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
> + goto out;
> + if (folio->pgmap->owner != migrate->pgmap_owner)
> + goto out;
> +
> + } else {
> + spin_unlock(ptl);
> + return -EBUSY;
> + }
> +
> + folio_get(folio);
> +
> + if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
> + spin_unlock(ptl);
> + folio_put(folio);
> + return 0;
> + }
> +
> + if (thp_migration_supported() &&
> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
> +
> + struct page_vma_mapped_walk pvmw = {
> + .ptl = ptl,
> + .address = start,
> + .pmd = pmdp,
> + .vma = walk->vma,
> + };
> +
> + hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
> +
> + r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
> + if (r) {
> + hmm_pfn[0] &= ~(HMM_PFN_MIGRATE | HMM_PFN_COMPOUND);
> + r = -ENOENT; // fallback
> + goto unlock_out;
> + }
> + for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
> + hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
> +
> + } else {
> + r = -ENOENT; // fallback
> + goto unlock_out;
> + }
> +
> +
> +out:
> + spin_unlock(ptl);
> + return r;
> +
> +unlock_out:
> + if (folio != fault_folio)
> + folio_unlock(folio);
> + folio_put(folio);
> + goto out;
> +
> +}
> +
> +/*
> + * Install migration entries if migration requested, either from fault
> + * or migrate paths.
> + *
> + */
> +static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> + pmd_t *pmdp,
> + unsigned long addr,
> + unsigned long *hmm_pfn)
> +{
> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> + struct hmm_range *range = hmm_vma_walk->range;
> + struct migrate_vma *migrate = range->migrate;
> + struct mm_struct *mm = walk->vma->vm_mm;
> + struct folio *fault_folio = NULL;
> + enum migrate_vma_info minfo;
> + struct dev_pagemap *pgmap;
> + bool anon_exclusive;
> + struct folio *folio;
> + unsigned long pfn;
> + struct page *page;
> + softleaf_t entry;
> + pte_t pte, swp_pte;
> + spinlock_t *ptl;
> + bool writable = false;
> + pte_t *ptep;
> +
> + // Do we want to migrate at all?
> + minfo = hmm_select_migrate(range);
> + if (!minfo)
> + return 0;
> +
> + fault_folio = (migrate && migrate->fault_page) ?
> + page_folio(migrate->fault_page) : NULL;
> +
> +again:
> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> + if (!ptep)
> + return 0;
> +
> + pte = ptep_get(ptep);
> +
> + if (pte_none(pte)) {
> + // migrate without faulting case
> + if (vma_is_anonymous(walk->vma)) {
> + *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
> + *hmm_pfn |= HMM_PFN_MIGRATE | HMM_PFN_VALID;
> + goto out;
> + }
> + }
> +
> + if (!pte_present(pte)) {
> + /*
> + * Only care about unaddressable device page special
> + * page table entry. Other special swap entries are not
> + * migratable, and we ignore regular swapped page.
> + */
> + entry = softleaf_from_pte(pte);
> + if (!softleaf_is_device_private(entry))
> + goto out;
> +
> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
> + goto out;
> +
> + page = softleaf_to_page(entry);
> + folio = page_folio(page);
> + if (folio->pgmap->owner != migrate->pgmap_owner)
> + goto out;
> +
> + if (folio_test_large(folio)) {
> + int ret;
> +
> + pte_unmap_unlock(ptep, ptl);
> + ret = migrate_vma_split_folio(folio,
> + migrate->fault_page);
> + if (ret)
> + goto out_unlocked;
> + goto again;
> + }
> +
> + pfn = page_to_pfn(page);
> + if (softleaf_is_device_private_write(entry))
> + writable = true;
> + } else {
> + pfn = pte_pfn(pte);
> + if (is_zero_pfn(pfn) &&
> + (minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
> + *hmm_pfn = HMM_PFN_MIGRATE|HMM_PFN_VALID;
> + goto out;
> + }
> + page = vm_normal_page(walk->vma, addr, pte);
> + if (page && !is_zone_device_page(page) &&
> + !(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
> + goto out;
> + } else if (page && is_device_coherent_page(page)) {
> + pgmap = page_pgmap(page);
> +
> + if (!(minfo &
> + MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
> + pgmap->owner != migrate->pgmap_owner)
> + goto out;
> + }
> +
> + folio = page_folio(page);
> + if (folio_test_large(folio)) {
> + int ret;
> +
> + pte_unmap_unlock(ptep, ptl);
> + ret = migrate_vma_split_folio(folio,
> + migrate->fault_page);
> + if (ret)
> + goto out_unlocked;
> +
> + goto again;
> + }
> +
> + writable = pte_write(pte);
> + }
> +
> + if (!page || !page->mapping)
> + goto out;
> +
> + /*
> + * By getting a reference on the folio we pin it and that blocks
> + * any kind of migration. Side effect is that it "freezes" the
> + * pte.
> + *
> + * We drop this reference after isolating the folio from the lru
> + * for non device folio (device folio are not on the lru and thus
> + * can't be dropped from it).
> + */
> + folio = page_folio(page);
> + folio_get(folio);
> +
> + /*
> + * We rely on folio_trylock() to avoid deadlock between
> + * concurrent migrations where each is waiting on the others
> + * folio lock. If we can't immediately lock the folio we fail this
> + * migration as it is only best effort anyway.
> + *
> + * If we can lock the folio it's safe to set up a migration entry
> + * now. In the common case where the folio is mapped once in a
> + * single process setting up the migration entry now is an
> + * optimisation to avoid walking the rmap later with
> + * try_to_migrate().
> + */
> +
> + if (fault_folio == folio || folio_trylock(folio)) {
> + anon_exclusive = folio_test_anon(folio) &&
> + PageAnonExclusive(page);
> +
> + flush_cache_page(walk->vma, addr, pfn);
> +
> + if (anon_exclusive) {
> + pte = ptep_clear_flush(walk->vma, addr, ptep);
> +
> + if (folio_try_share_anon_rmap_pte(folio, page)) {
> + set_pte_at(mm, addr, ptep, pte);
> + folio_unlock(folio);
> + folio_put(folio);
> + goto out;
> + }
> + } else {
> + pte = ptep_get_and_clear(mm, addr, ptep);
> + }
> +
> + /* Setup special migration page table entry */
> + if (writable)
> + entry = make_writable_migration_entry(pfn);
> + else if (anon_exclusive)
> + entry = make_readable_exclusive_migration_entry(pfn);
> + else
> + entry = make_readable_migration_entry(pfn);
> +
> + swp_pte = swp_entry_to_pte(entry);
> + if (pte_present(pte)) {
> + if (pte_soft_dirty(pte))
> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> + if (pte_uffd_wp(pte))
> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> + } else {
> + if (pte_swp_soft_dirty(pte))
> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> + if (pte_swp_uffd_wp(pte))
> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> + }
> +
> + set_pte_at(mm, addr, ptep, swp_pte);
> + folio_remove_rmap_pte(folio, page, walk->vma);
> + folio_put(folio);
> + *hmm_pfn |= HMM_PFN_MIGRATE;
> +
> + if (pte_present(pte))
> + flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
> + } else
> + folio_put(folio);
> +out:
> + pte_unmap_unlock(ptep, ptl);
> + return 0;
> +out_unlocked:
> + return -1;
> +
> +}
> +
> +static int hmm_vma_walk_split(pmd_t *pmdp,
> + unsigned long addr,
> + struct mm_walk *walk)
> +{
> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> + struct hmm_range *range = hmm_vma_walk->range;
> + struct migrate_vma *migrate = range->migrate;
> + struct folio *folio, *fault_folio;
> + spinlock_t *ptl;
> + int ret = 0;
> +
> + fault_folio = (migrate && migrate->fault_page) ?
> + page_folio(migrate->fault_page) : NULL;
> +
> + ptl = pmd_lock(walk->mm, pmdp);
> + if (unlikely(!pmd_trans_huge(*pmdp))) {
> + spin_unlock(ptl);
> + goto out;
> + }
> +
> + folio = pmd_folio(*pmdp);
> + if (is_huge_zero_folio(folio)) {
> + spin_unlock(ptl);
> + split_huge_pmd(walk->vma, pmdp, addr);
> + } else {
> + folio_get(folio);
> + spin_unlock(ptl);
> +
> + if (folio != fault_folio) {
> + if (unlikely(!folio_trylock(folio))) {
> + folio_put(folio);
> + ret = -EBUSY;
> + goto out;
> + }
> + } else
> + folio_put(folio);
> +
> + ret = split_folio(folio);
> + if (fault_folio != folio) {
> + folio_unlock(folio);
> + folio_put(folio);
> + }
> +
> + }
> +out:
> + return ret;
> +}
> +
> +static int hmm_vma_capture_migrate_range(unsigned long start,
> + unsigned long end,
> + struct mm_walk *walk)
> +{
> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> + struct hmm_range *range = hmm_vma_walk->range;
> +
> + if (!hmm_select_migrate(range))
> + return 0;
> +
> + if (hmm_vma_walk->vma && (hmm_vma_walk->vma != walk->vma))
> + return -ERANGE;
> +
> + hmm_vma_walk->vma = walk->vma;
> + hmm_vma_walk->start = start;
> + hmm_vma_walk->end = end;
> +
> + if (end - start > range->end - range->start)
> + return -ERANGE;
> +
> + if (!hmm_vma_walk->mmu_range.owner) {
> + mmu_notifier_range_init_owner(&hmm_vma_walk->mmu_range, MMU_NOTIFY_MIGRATE, 0,
> + walk->vma->vm_mm, start, end,
> + range->dev_private_owner);
> + mmu_notifier_invalidate_range_start(&hmm_vma_walk->mmu_range);
> + }
> +
> + return 0;
> +}
> +
> static int hmm_vma_walk_pmd(pmd_t *pmdp,
> unsigned long start,
> unsigned long end,
> @@ -404,42 +878,90 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
> unsigned long npages = (end - start) >> PAGE_SHIFT;
> unsigned long addr = start;
> + enum migrate_vma_info minfo;
> + unsigned long i;
> + spinlock_t *ptl;
> pte_t *ptep;
> pmd_t pmd;
> + int r;
>
> + minfo = hmm_select_migrate(range);
> again:
> +
> pmd = pmdp_get_lockless(pmdp);
> - if (pmd_none(pmd))
> - return hmm_vma_walk_hole(start, end, -1, walk);
> + if (pmd_none(pmd)) {
> + r = hmm_vma_walk_hole(start, end, -1, walk);
> + if (r || !minfo)
> + return r;
> +
> + ptl = pmd_lock(walk->mm, pmdp);
> + if (pmd_none(*pmdp)) {
> + // hmm_vma_walk_hole() filled migration needs
> + spin_unlock(ptl);
> + return r;
> + }
> + spin_unlock(ptl);
> + }
>
> if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
> - if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
> - hmm_vma_walk->last = addr;
> - pmd_migration_entry_wait(walk->mm, pmdp);
> - return -EBUSY;
> + if (!minfo) {
> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
> + hmm_vma_walk->last = addr;
> + pmd_migration_entry_wait(walk->mm, pmdp);
> + return -EBUSY;
> + }
> }
> - return hmm_pfns_fill(start, end, range, 0);
> + for (i = 0; addr < end; addr += PAGE_SIZE, i++)
> + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> +
> + return 0;
> }
>
> - if (!pmd_present(pmd))
> - return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
> - pmd);
> + if (pmd_trans_huge(pmd) || !pmd_present(pmd)) {
> +
> + if (!pmd_present(pmd)) {
> + r = hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
> + pmd);
> + if (r || !minfo)
> + return r;
> + } else {
> +
> + /*
> + * No need to take pmd_lock here, even if some other thread
> + * is splitting the huge pmd we will get that event through
> + * mmu_notifier callback.
> + *
> + * So just read pmd value and check again it's a transparent
> + * huge or device mapping one and compute corresponding pfn
> + * values.
> + */
> +
> + pmd = pmdp_get_lockless(pmdp);
> + if (!pmd_trans_huge(pmd))
> + goto again;
> +
> + r = hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
> +
> + if (r || !minfo)
> + return r;
> + }
>
> - if (pmd_trans_huge(pmd)) {
> - /*
> - * No need to take pmd_lock here, even if some other thread
> - * is splitting the huge pmd we will get that event through
> - * mmu_notifier callback.
> - *
> - * So just read pmd value and check again it's a transparent
> - * huge or device mapping one and compute corresponding pfn
> - * values.
> - */
> - pmd = pmdp_get_lockless(pmdp);
> - if (!pmd_trans_huge(pmd))
> - goto again;
> + r = hmm_vma_handle_migrate_prepare_pmd(walk, pmdp, start, end, hmm_pfns);
> +
> + if (r == -ENOENT) {
> + r = hmm_vma_walk_split(pmdp, addr, walk);
> + if (r) {
> + /* Split not successful, skip */
> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> + }
> +
> + /* Split successful or "again", reloop */
> + hmm_vma_walk->last = addr;
> + return -EBUSY;
> + }
> +
> + return r;
>
> - return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
> }
>
> /*
> @@ -451,22 +973,26 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> if (pmd_bad(pmd)) {
> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> return -EFAULT;
> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> }
>
> ptep = pte_offset_map(pmdp, addr);
> if (!ptep)
> goto again;
> for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
> - int r;
>
> r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
> if (r) {
> /* hmm_vma_handle_pte() did pte_unmap() */
> return r;
> }
> +
> + r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
> + if (r)
> + break;
> }
> pte_unmap(ptep - 1);
> +
> return 0;
> }
>
> @@ -600,6 +1126,11 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> struct hmm_range *range = hmm_vma_walk->range;
> struct vm_area_struct *vma = walk->vma;
> + int r;
> +
> + r = hmm_vma_capture_migrate_range(start, end, walk);
> + if (r)
> + return r;
>
> if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
> vma->vm_flags & VM_READ)
> @@ -622,7 +1153,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
> (end - start) >> PAGE_SHIFT, 0))
> return -EFAULT;
>
> - hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>
> /* Skip this vma and continue processing the next vma. */
> return 1;
> @@ -652,9 +1183,17 @@ static const struct mm_walk_ops hmm_walk_ops = {
> * the invalidation to finish.
> * -EFAULT: A page was requested to be valid and could not be made valid
> * ie it has no backing VMA or it is illegal to access
> + * -ERANGE: The range crosses multiple VMAs, or space for hmm_pfns array
> + * is too low.
> *
> * This is similar to get_user_pages(), except that it can read the page tables
> * without mutating them (ie causing faults).
> + *
> + * If want to do migrate after faultin, call hmm_rangem_fault() with
s/faultin/faulting
s/hmm_rangem_fault/hmm_range_fault
> + * HMM_PFN_REQ_MIGRATE and initialize range.migrate field.
I'm not following the HMM_PFN_REQ_MIGRATE usage.
> + * After hmm_range_fault() call migrate_hmm_range_setup() instead of
> + * migrate_vma_setup() and after that follow normal migrate calls path.
> + *
Also, since migrate_vma_setup() calls hmm_range_fault() and then
migrate_hmm_range_setup(), what would the use case for not just calling
migrate_vma_setup() be?
> */
> int hmm_range_fault(struct hmm_range *range)
> {
> @@ -662,16 +1201,28 @@ int hmm_range_fault(struct hmm_range *range)
> .range = range,
> .last = range->start,
> };
> - struct mm_struct *mm = range->notifier->mm;
> + bool is_fault_path = !!range->notifier;
> + struct mm_struct *mm;
> int ret;
>
> + /*
> + *
> + * Could be serving a device fault or come from migrate
> + * entry point. For the former we have not resolved the vma
> + * yet, and the latter we don't have a notifier (but have a vma).
> + *
> + */
> + mm = is_fault_path ? range->notifier->mm : range->migrate->vma->vm_mm;
> mmap_assert_locked(mm);
>
> do {
> /* If range is no longer valid force retry. */
> - if (mmu_interval_check_retry(range->notifier,
> - range->notifier_seq))
> - return -EBUSY;
> + if (is_fault_path && mmu_interval_check_retry(range->notifier,
> + range->notifier_seq)) {
> + ret = -EBUSY;
> + break;
> + }
> +
> ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
> &hmm_walk_ops, &hmm_vma_walk);
> /*
> @@ -681,6 +1232,18 @@ int hmm_range_fault(struct hmm_range *range)
> * output, and all >= are still at their input values.
> */
> } while (ret == -EBUSY);
> +
> + if (hmm_select_migrate(range) && range->migrate &&
> + hmm_vma_walk.mmu_range.owner) {
> + // The migrate_vma path has the following initialized
> + if (is_fault_path) {
> + range->migrate->vma = hmm_vma_walk.vma;
> + range->migrate->start = range->start;
> + range->migrate->end = hmm_vma_walk.end;
> + }
> + mmu_notifier_invalidate_range_end(&hmm_vma_walk.mmu_range);
> + }
> +
> return ret;
> }
> EXPORT_SYMBOL(hmm_range_fault);
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 23379663b1e1..d89efdfca8f6 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -734,7 +734,17 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
> */
> int migrate_vma_setup(struct migrate_vma *args)
> {
> + int ret;
> long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
> + struct hmm_range range = {
> + .notifier = NULL,
> + .start = args->start,
> + .end = args->end,
> + .migrate = args,
> + .hmm_pfns = args->src,
> + .dev_private_owner = args->pgmap_owner,
> + .migrate = args
> + };
>
> args->start &= PAGE_MASK;
> args->end &= PAGE_MASK;
> @@ -759,17 +769,19 @@ int migrate_vma_setup(struct migrate_vma *args)
> args->cpages = 0;
> args->npages = 0;
>
> - migrate_vma_collect(args);
> + if (args->flags & MIGRATE_VMA_FAULT)
> + range.default_flags |= HMM_PFN_REQ_FAULT;
Next level here might be to skip faulting pte_none() in hmm_range_fault
too?
Matt
> +
> + ret = hmm_range_fault(&range);
>
> - if (args->cpages)
> - migrate_vma_unmap(args);
> + migrate_hmm_range_setup(&range);
>
> /*
> * At this point pages are locked and unmapped, and thus they have
> * stable content and can safely be copied to destination memory that
> * is allocated by the drivers.
> */
> - return 0;
> + return ret;
>
> }
> EXPORT_SYMBOL(migrate_vma_setup);
> @@ -1489,3 +1501,64 @@ int migrate_device_coherent_folio(struct folio *folio)
> return 0;
> return -EBUSY;
> }
> +
> +void migrate_hmm_range_setup(struct hmm_range *range)
> +{
> +
> + struct migrate_vma *migrate = range->migrate;
> +
> + if (!migrate)
> + return;
> +
> + migrate->npages = (migrate->end - migrate->start) >> PAGE_SHIFT;
> + migrate->cpages = 0;
> +
> + for (unsigned long i = 0; i < migrate->npages; i++) {
> +
> + unsigned long pfn = range->hmm_pfns[i];
> +
> + pfn &= ~HMM_PFN_INOUT_FLAGS;
> +
> + /*
> + *
> + * Don't do migration if valid and migrate flags are not both set.
> + *
> + */
> + if ((pfn & (HMM_PFN_VALID | HMM_PFN_MIGRATE)) !=
> + (HMM_PFN_VALID | HMM_PFN_MIGRATE)) {
> + migrate->src[i] = 0;
> + migrate->dst[i] = 0;
> + continue;
> + }
> +
> + migrate->cpages++;
> +
> + /*
> + *
> + * The zero page is encoded in a special way, valid and migrate is
> + * set, and pfn part is zero. Encode specially for migrate also.
> + *
> + */
> + if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE)) {
> + migrate->src[i] = MIGRATE_PFN_MIGRATE;
> + migrate->dst[i] = 0;
> + continue;
> + }
> + if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE|HMM_PFN_COMPOUND)) {
> + migrate->src[i] = MIGRATE_PFN_MIGRATE|MIGRATE_PFN_COMPOUND;
> + migrate->dst[i] = 0;
> + continue;
> + }
> +
> + migrate->src[i] = migrate_pfn(page_to_pfn(hmm_pfn_to_page(pfn)))
> + | MIGRATE_PFN_MIGRATE;
> + migrate->src[i] |= (pfn & HMM_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0;
> + migrate->src[i] |= (pfn & HMM_PFN_COMPOUND) ? MIGRATE_PFN_COMPOUND : 0;
> + migrate->dst[i] = 0;
> + }
> +
> + if (migrate->cpages)
> + migrate_vma_unmap(migrate);
> +
> +}
> +EXPORT_SYMBOL(migrate_hmm_range_setup);
> --
> 2.50.0
>
Hi,
On 1/16/26 04:06, Matthew Brost wrote:
> On Wed, Jan 14, 2026 at 11:19:21AM +0200, mpenttil@redhat.com wrote:
>> From: Mika Penttilä <mpenttil@redhat.com>
>>
>> Currently, the way device page faulting and migration works
>> is not optimal, if you want to do both fault handling and
>> migration at once.
>>
>> Being able to migrate not present pages (or pages mapped with incorrect
>> permissions, eg. COW) to the GPU requires doing either of the
>> following sequences:
>>
>> 1. hmm_range_fault() - fault in non-present pages with correct permissions, etc.
>> 2. migrate_vma_*() - migrate the pages
>>
>> Or:
>>
>> 1. migrate_vma_*() - migrate present pages
>> 2. If non-present pages detected by migrate_vma_*():
>> a) call hmm_range_fault() to fault pages in
>> b) call migrate_vma_*() again to migrate now present pages
>>
>> The problem with the first sequence is that you always have to do two
>> page walks even when most of the time the pages are present or zero page
>> mappings so the common case takes a performance hit.
>>
>> The second sequence is better for the common case, but far worse if
>> pages aren't present because now you have to walk the page tables three
>> times (once to find the page is not present, once so hmm_range_fault()
>> can find a non-present page to fault in and once again to setup the
>> migration). It is also tricky to code correctly.
>>
>> We should be able to walk the page table once, faulting
>> pages in as required and replacing them with migration entries if
>> requested.
>>
>> Add a new flag to HMM APIs, HMM_PFN_REQ_MIGRATE,
>> which tells to prepare for migration also during fault handling.
>> Also, for the migrate_vma_setup() call paths, a flags, MIGRATE_VMA_FAULT,
>> is added to tell to add fault handling to migrate.
>>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Jason Gunthorpe <jgg@nvidia.com>
>> Cc: Leon Romanovsky <leonro@nvidia.com>
>> Cc: Alistair Popple <apopple@nvidia.com>
>> Cc: Balbir Singh <balbirs@nvidia.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
> I'll try to test this when I can but horribly behind at the moment.
>
> You can use Intel's CI system to test SVM too. I can get you authorized
> to use this. The list to trigger is intel-xe@lists.freedesktop.org and
> patches must apply to drm-tip. I'll let you know when you are
> authorized.
Thanks, appreciated, I will do that as well!
>> Suggested-by: Alistair Popple <apopple@nvidia.com>
>> Signed-off-by: Mika Penttilä <mpenttil@redhat.com>
>> ---
>> include/linux/hmm.h | 17 +-
>> include/linux/migrate.h | 6 +-
>> mm/hmm.c | 657 +++++++++++++++++++++++++++++++++++++---
>> mm/migrate_device.c | 81 ++++-
>> 4 files changed, 706 insertions(+), 55 deletions(-)
>>
>> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
>> index db75ffc949a7..7b7294ad0f62 100644
>> --- a/include/linux/hmm.h
>> +++ b/include/linux/hmm.h
>> @@ -12,7 +12,7 @@
>> #include <linux/mm.h>
>>
>> struct mmu_interval_notifier;
>> -
>> +struct migrate_vma;
>> /*
>> * On output:
>> * 0 - The page is faultable and a future call with
>> @@ -48,15 +48,25 @@ enum hmm_pfn_flags {
>> HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
>> HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
>>
>> - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
>> + /* Migrate request */
>> + HMM_PFN_MIGRATE = 1UL << (BITS_PER_LONG - 7),
>> + HMM_PFN_COMPOUND = 1UL << (BITS_PER_LONG - 8),
>> + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 13),
>>
>> /* Input flags */
>> HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
>> HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
>> + HMM_PFN_REQ_MIGRATE = HMM_PFN_MIGRATE,
> I believe you are missing kernel for HMM_PFN_MIGRATE.
I explain the HMM_PFN_MIGRATE usage below.
>
>>
>> HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
>> };
>>
>> +enum {
>> + /* These flags are carried from input-to-output */
>> + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
>> + HMM_PFN_P2PDMA_BUS,
>> +};
>> +
>> /*
>> * hmm_pfn_to_page() - return struct page pointed to by a device entry
>> *
>> @@ -107,6 +117,7 @@ static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn)
>> * @default_flags: default flags for the range (write, read, ... see hmm doc)
>> * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
>> * @dev_private_owner: owner of device private pages
>> + * @migrate: structure for migrating the associated vma
>> */
>> struct hmm_range {
>> struct mmu_interval_notifier *notifier;
>> @@ -117,12 +128,14 @@ struct hmm_range {
>> unsigned long default_flags;
>> unsigned long pfn_flags_mask;
>> void *dev_private_owner;
>> + struct migrate_vma *migrate;
>> };
>>
>> /*
>> * Please see Documentation/mm/hmm.rst for how to use the range API.
>> */
>> int hmm_range_fault(struct hmm_range *range);
>> +int hmm_range_migrate_prepare(struct hmm_range *range, struct migrate_vma **pargs);
>>
>> /*
>> * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>> index 26ca00c325d9..0889309a9d21 100644
>> --- a/include/linux/migrate.h
>> +++ b/include/linux/migrate.h
>> @@ -3,6 +3,7 @@
>> #define _LINUX_MIGRATE_H
>>
>> #include <linux/mm.h>
>> +#include <linux/hmm.h>
>> #include <linux/mempolicy.h>
>> #include <linux/migrate_mode.h>
>> #include <linux/hugetlb.h>
>> @@ -140,11 +141,12 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
>> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
>> }
>>
>> -enum migrate_vma_direction {
>> +enum migrate_vma_info {
>> MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
>> MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
>> MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
>> MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
>> + MIGRATE_VMA_FAULT = 1 << 4,
>> };
>>
>> struct migrate_vma {
>> @@ -192,7 +194,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>> unsigned long npages);
>> void migrate_device_finalize(unsigned long *src_pfns,
>> unsigned long *dst_pfns, unsigned long npages);
>> -
>> +void migrate_hmm_range_setup(struct hmm_range *range);
>> #endif /* CONFIG_MIGRATION */
>>
>> #endif /* _LINUX_MIGRATE_H */
>> diff --git a/mm/hmm.c b/mm/hmm.c
>> index 4ec74c18bef6..39a07d895043 100644
>> --- a/mm/hmm.c
>> +++ b/mm/hmm.c
>> @@ -20,6 +20,7 @@
>> #include <linux/pagemap.h>
>> #include <linux/leafops.h>
>> #include <linux/hugetlb.h>
>> +#include <linux/migrate.h>
>> #include <linux/memremap.h>
>> #include <linux/sched/mm.h>
>> #include <linux/jump_label.h>
>> @@ -31,8 +32,12 @@
>> #include "internal.h"
>>
>> struct hmm_vma_walk {
>> - struct hmm_range *range;
>> - unsigned long last;
>> + struct mmu_notifier_range mmu_range;
>> + struct vm_area_struct *vma;
>> + struct hmm_range *range;
>> + unsigned long start;
>> + unsigned long end;
>> + unsigned long last;
>> };
>>
>> enum {
>> @@ -41,21 +46,49 @@ enum {
>> HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
>> };
>>
>> -enum {
>> - /* These flags are carried from input-to-output */
>> - HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
>> - HMM_PFN_P2PDMA_BUS,
>> -};
>> +static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
>> +{
>> + enum migrate_vma_info minfo;
>> +
>> + minfo = range->migrate ? range->migrate->flags : 0;
>> + minfo |= (range->default_flags & HMM_PFN_REQ_MIGRATE) ?
>> + MIGRATE_VMA_SELECT_SYSTEM : 0;
> I'm trying to make sense of HMM_PFN_REQ_MIGRATE and why it sets
> MIGRATE_VMA_SELECT_SYSTEM?
>
> Also the just the general usage - would range->migrate be NULL in the
> expected usage.
>
> Maybe an example of how hmm_range_fault would be called with this flag
> and subsequent expected migrate calls would clear this up.
hmm_select_migrate() figures out the migration type (or none) that's going
to happen based on the caller (of hmm_range_fault() or migrate_vma_setup()).
HMM_PFN_REQ_MIGRATE is the hmm_range_fault() user's way to say it wants
to migrate on fault (with default_flags, so for the whole range), and for that
the direction is system->device.
migrate_vma_setup() users (the current device migration) express their
migration intention with migrate_vma.flags (the direction being whichever the flags select).
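For example, a device fault handler that wants to migrate the faulted range
could set things up roughly like this (an untested sketch; the pfn arrays and
the owner pointer are just placeholders, and the mmap lock / notifier
sequencing is the same as for hmm_range_fault() today):

	struct migrate_vma migrate = {
		.src = src_pfns,	/* driver-allocated arrays */
		.dst = dst_pfns,
		.pgmap_owner = drvdata,
		/* vma/start/end get filled in by hmm_range_fault() */
	};
	struct hmm_range range = {
		.notifier = &notifier,
		.notifier_seq = mmu_interval_read_begin(&notifier),
		.start = start,
		.end = end,
		.hmm_pfns = pfns,
		.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_MIGRATE,
		.dev_private_owner = drvdata,
		.migrate = &migrate,
	};
	int ret;

	ret = hmm_range_fault(&range);		/* faults in + installs migration entries */
	if (!ret)
		migrate_hmm_range_setup(&range);	/* fills migrate.src[]/cpages, unmaps */
	/* then the normal migrate_vma_pages()/migrate_vma_finalize() calls */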
>
>> +
>> + return minfo;
>> +}
>>
>> static int hmm_pfns_fill(unsigned long addr, unsigned long end,
>> - struct hmm_range *range, unsigned long cpu_flags)
>> + struct hmm_vma_walk *hmm_vma_walk, unsigned long cpu_flags)
>> {
>> + struct hmm_range *range = hmm_vma_walk->range;
>> unsigned long i = (addr - range->start) >> PAGE_SHIFT;
>> + enum migrate_vma_info minfo;
>> + bool migrate = false;
>> +
>> + minfo = hmm_select_migrate(range);
>> + if (cpu_flags != HMM_PFN_ERROR) {
>> + if (minfo && (vma_is_anonymous(hmm_vma_walk->vma))) {
>> + cpu_flags |= (HMM_PFN_VALID | HMM_PFN_MIGRATE);
>> + migrate = true;
>> + }
>> + }
>> +
>> + if (migrate && thp_migration_supported() &&
>> + (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
>> + IS_ALIGNED(addr, HPAGE_PMD_SIZE) &&
>> + IS_ALIGNED(end, HPAGE_PMD_SIZE)) {
>> + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
>> + range->hmm_pfns[i] |= cpu_flags | HMM_PFN_COMPOUND;
>> + addr += PAGE_SIZE;
>> + i++;
>> + cpu_flags = 0;
>> + }
>>
>> for (; addr < end; addr += PAGE_SIZE, i++) {
>> range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
>> range->hmm_pfns[i] |= cpu_flags;
>> }
>> +
>> return 0;
>> }
>>
>> @@ -171,11 +204,11 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
>> if (!walk->vma) {
>> if (required_fault)
>> return -EFAULT;
>> - return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
>> + return hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
>> }
>> if (required_fault)
>> return hmm_vma_fault(addr, end, required_fault, walk);
>> - return hmm_pfns_fill(addr, end, range, 0);
>> + return hmm_pfns_fill(addr, end, hmm_vma_walk, 0);
>> }
>>
>> static inline unsigned long hmm_pfn_flags_order(unsigned long order)
>> @@ -289,10 +322,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>> goto fault;
>>
>> if (softleaf_is_migration(entry)) {
>> - pte_unmap(ptep);
>> - hmm_vma_walk->last = addr;
>> - migration_entry_wait(walk->mm, pmdp, addr);
>> - return -EBUSY;
>> + if (!hmm_select_migrate(range)) {
>> + pte_unmap(ptep);
>> + hmm_vma_walk->last = addr;
>> + migration_entry_wait(walk->mm, pmdp, addr);
>> + return -EBUSY;
>> + } else
>> + goto out;
>> }
>>
>> /* Report error for everything else */
>> @@ -376,7 +412,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
>> return -EFAULT;
>> }
>>
>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>> }
>> #else
>> static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
>> @@ -389,10 +425,448 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
>>
>> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>> return -EFAULT;
>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>> }
>> #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>
>> +/**
>> + * migrate_vma_split_folio() - Helper function to split a THP folio
>> + * @folio: the folio to split
>> + * @fault_page: struct page associated with the fault if any
>> + *
>> + * Returns 0 on success
>> + */
>> +static int migrate_vma_split_folio(struct folio *folio,
>> + struct page *fault_page)
>> +{
>> + int ret;
>> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> + struct folio *new_fault_folio = NULL;
>> +
>> + if (folio != fault_folio) {
>> + folio_get(folio);
>> + folio_lock(folio);
>> + }
>> +
>> + ret = split_folio(folio);
>> + if (ret) {
>> + if (folio != fault_folio) {
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + }
>> + return ret;
>> + }
>> +
>> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> +
>> + /*
>> + * Ensure the lock is held on the correct
>> + * folio after the split
>> + */
>> + if (!new_fault_folio) {
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + } else if (folio != new_fault_folio) {
>> + if (new_fault_folio != fault_folio) {
>> + folio_get(new_fault_folio);
>> + folio_lock(new_fault_folio);
>> + }
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
>> + pmd_t *pmdp,
>> + unsigned long start,
>> + unsigned long end,
>> + unsigned long *hmm_pfn)
>> +{
>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>> + struct hmm_range *range = hmm_vma_walk->range;
>> + struct migrate_vma *migrate = range->migrate;
>> + struct mm_struct *mm = walk->vma->vm_mm;
>> + struct folio *fault_folio = NULL;
>> + struct folio *folio;
>> + enum migrate_vma_info minfo;
>> + spinlock_t *ptl;
>> + unsigned long i;
>> + int r = 0;
>> +
>> + minfo = hmm_select_migrate(range);
>> + if (!minfo)
>> + return r;
>> +
>> + fault_folio = (migrate && migrate->fault_page) ?
>> + page_folio(migrate->fault_page) : NULL;
>> +
>> + ptl = pmd_lock(mm, pmdp);
>> + if (pmd_none(*pmdp)) {
>> + spin_unlock(ptl);
>> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
>> + }
>> +
>> + if (pmd_trans_huge(*pmdp)) {
>> + if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
>> + goto out;
>> +
>> + folio = pmd_folio(*pmdp);
>> + if (is_huge_zero_folio(folio)) {
>> + spin_unlock(ptl);
>> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
>> + }
>> +
>> + } else if (!pmd_present(*pmdp)) {
>> + const softleaf_t entry = softleaf_from_pmd(*pmdp);
>> +
>> + folio = softleaf_to_folio(entry);
>> +
>> + if (!softleaf_is_device_private(entry))
>> + goto out;
>> +
>> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
>> + goto out;
>> + if (folio->pgmap->owner != migrate->pgmap_owner)
>> + goto out;
>> +
>> + } else {
>> + spin_unlock(ptl);
>> + return -EBUSY;
>> + }
>> +
>> + folio_get(folio);
>> +
>> + if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
>> + spin_unlock(ptl);
>> + folio_put(folio);
>> + return 0;
>> + }
>> +
>> + if (thp_migration_supported() &&
>> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
>> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
>> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
>> +
>> + struct page_vma_mapped_walk pvmw = {
>> + .ptl = ptl,
>> + .address = start,
>> + .pmd = pmdp,
>> + .vma = walk->vma,
>> + };
>> +
>> + hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
>> +
>> + r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
>> + if (r) {
>> + hmm_pfn[0] &= ~(HMM_PFN_MIGRATE | HMM_PFN_COMPOUND);
>> + r = -ENOENT; // fallback
>> + goto unlock_out;
>> + }
>> + for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
>> + hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
>> +
>> + } else {
>> + r = -ENOENT; // fallback
>> + goto unlock_out;
>> + }
>> +
>> +
>> +out:
>> + spin_unlock(ptl);
>> + return r;
>> +
>> +unlock_out:
>> + if (folio != fault_folio)
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + goto out;
>> +
>> +}
>> +
>> +/*
>> + * Install migration entries if migration requested, either from fault
>> + * or migrate paths.
>> + *
>> + */
>> +static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
>> + pmd_t *pmdp,
>> + unsigned long addr,
>> + unsigned long *hmm_pfn)
>> +{
>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>> + struct hmm_range *range = hmm_vma_walk->range;
>> + struct migrate_vma *migrate = range->migrate;
>> + struct mm_struct *mm = walk->vma->vm_mm;
>> + struct folio *fault_folio = NULL;
>> + enum migrate_vma_info minfo;
>> + struct dev_pagemap *pgmap;
>> + bool anon_exclusive;
>> + struct folio *folio;
>> + unsigned long pfn;
>> + struct page *page;
>> + softleaf_t entry;
>> + pte_t pte, swp_pte;
>> + spinlock_t *ptl;
>> + bool writable = false;
>> + pte_t *ptep;
>> +
>> + // Do we want to migrate at all?
>> + minfo = hmm_select_migrate(range);
>> + if (!minfo)
>> + return 0;
>> +
>> + fault_folio = (migrate && migrate->fault_page) ?
>> + page_folio(migrate->fault_page) : NULL;
>> +
>> +again:
>> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>> + if (!ptep)
>> + return 0;
>> +
>> + pte = ptep_get(ptep);
>> +
>> + if (pte_none(pte)) {
>> + // migrate without faulting case
>> + if (vma_is_anonymous(walk->vma)) {
>> + *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
>> + *hmm_pfn |= HMM_PFN_MIGRATE | HMM_PFN_VALID;
>> + goto out;
>> + }
>> + }
>> +
>> + if (!pte_present(pte)) {
>> + /*
>> + * Only care about unaddressable device page special
>> + * page table entry. Other special swap entries are not
>> + * migratable, and we ignore regular swapped page.
>> + */
>> + entry = softleaf_from_pte(pte);
>> + if (!softleaf_is_device_private(entry))
>> + goto out;
>> +
>> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
>> + goto out;
>> +
>> + page = softleaf_to_page(entry);
>> + folio = page_folio(page);
>> + if (folio->pgmap->owner != migrate->pgmap_owner)
>> + goto out;
>> +
>> + if (folio_test_large(folio)) {
>> + int ret;
>> +
>> + pte_unmap_unlock(ptep, ptl);
>> + ret = migrate_vma_split_folio(folio,
>> + migrate->fault_page);
>> + if (ret)
>> + goto out_unlocked;
>> + goto again;
>> + }
>> +
>> + pfn = page_to_pfn(page);
>> + if (softleaf_is_device_private_write(entry))
>> + writable = true;
>> + } else {
>> + pfn = pte_pfn(pte);
>> + if (is_zero_pfn(pfn) &&
>> + (minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
>> + *hmm_pfn = HMM_PFN_MIGRATE|HMM_PFN_VALID;
>> + goto out;
>> + }
>> + page = vm_normal_page(walk->vma, addr, pte);
>> + if (page && !is_zone_device_page(page) &&
>> + !(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
>> + goto out;
>> + } else if (page && is_device_coherent_page(page)) {
>> + pgmap = page_pgmap(page);
>> +
>> + if (!(minfo &
>> + MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
>> + pgmap->owner != migrate->pgmap_owner)
>> + goto out;
>> + }
>> +
>> + folio = page_folio(page);
>> + if (folio_test_large(folio)) {
>> + int ret;
>> +
>> + pte_unmap_unlock(ptep, ptl);
>> + ret = migrate_vma_split_folio(folio,
>> + migrate->fault_page);
>> + if (ret)
>> + goto out_unlocked;
>> +
>> + goto again;
>> + }
>> +
>> + writable = pte_write(pte);
>> + }
>> +
>> + if (!page || !page->mapping)
>> + goto out;
>> +
>> + /*
>> + * By getting a reference on the folio we pin it and that blocks
>> + * any kind of migration. Side effect is that it "freezes" the
>> + * pte.
>> + *
>> + * We drop this reference after isolating the folio from the lru
>> + * for non device folio (device folio are not on the lru and thus
>> + * can't be dropped from it).
>> + */
>> + folio = page_folio(page);
>> + folio_get(folio);
>> +
>> + /*
>> + * We rely on folio_trylock() to avoid deadlock between
>> + * concurrent migrations where each is waiting on the others
>> + * folio lock. If we can't immediately lock the folio we fail this
>> + * migration as it is only best effort anyway.
>> + *
>> + * If we can lock the folio it's safe to set up a migration entry
>> + * now. In the common case where the folio is mapped once in a
>> + * single process setting up the migration entry now is an
>> + * optimisation to avoid walking the rmap later with
>> + * try_to_migrate().
>> + */
>> +
>> + if (fault_folio == folio || folio_trylock(folio)) {
>> + anon_exclusive = folio_test_anon(folio) &&
>> + PageAnonExclusive(page);
>> +
>> + flush_cache_page(walk->vma, addr, pfn);
>> +
>> + if (anon_exclusive) {
>> + pte = ptep_clear_flush(walk->vma, addr, ptep);
>> +
>> + if (folio_try_share_anon_rmap_pte(folio, page)) {
>> + set_pte_at(mm, addr, ptep, pte);
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + goto out;
>> + }
>> + } else {
>> + pte = ptep_get_and_clear(mm, addr, ptep);
>> + }
>> +
>> + /* Setup special migration page table entry */
>> + if (writable)
>> + entry = make_writable_migration_entry(pfn);
>> + else if (anon_exclusive)
>> + entry = make_readable_exclusive_migration_entry(pfn);
>> + else
>> + entry = make_readable_migration_entry(pfn);
>> +
>> + swp_pte = swp_entry_to_pte(entry);
>> + if (pte_present(pte)) {
>> + if (pte_soft_dirty(pte))
>> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
>> + if (pte_uffd_wp(pte))
>> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
>> + } else {
>> + if (pte_swp_soft_dirty(pte))
>> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
>> + if (pte_swp_uffd_wp(pte))
>> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
>> + }
>> +
>> + set_pte_at(mm, addr, ptep, swp_pte);
>> + folio_remove_rmap_pte(folio, page, walk->vma);
>> + folio_put(folio);
>> + *hmm_pfn |= HMM_PFN_MIGRATE;
>> +
>> + if (pte_present(pte))
>> + flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
>> + } else
>> + folio_put(folio);
>> +out:
>> + pte_unmap_unlock(ptep, ptl);
>> + return 0;
>> +out_unlocked:
>> + return -1;
>> +
>> +}
>> +
>> +static int hmm_vma_walk_split(pmd_t *pmdp,
>> + unsigned long addr,
>> + struct mm_walk *walk)
>> +{
>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>> + struct hmm_range *range = hmm_vma_walk->range;
>> + struct migrate_vma *migrate = range->migrate;
>> + struct folio *folio, *fault_folio;
>> + spinlock_t *ptl;
>> + int ret = 0;
>> +
>> + fault_folio = (migrate && migrate->fault_page) ?
>> + page_folio(migrate->fault_page) : NULL;
>> +
>> + ptl = pmd_lock(walk->mm, pmdp);
>> + if (unlikely(!pmd_trans_huge(*pmdp))) {
>> + spin_unlock(ptl);
>> + goto out;
>> + }
>> +
>> + folio = pmd_folio(*pmdp);
>> + if (is_huge_zero_folio(folio)) {
>> + spin_unlock(ptl);
>> + split_huge_pmd(walk->vma, pmdp, addr);
>> + } else {
>> + folio_get(folio);
>> + spin_unlock(ptl);
>> +
>> + if (folio != fault_folio) {
>> + if (unlikely(!folio_trylock(folio))) {
>> + folio_put(folio);
>> + ret = -EBUSY;
>> + goto out;
>> + }
>> + } else
>> + folio_put(folio);
>> +
>> + ret = split_folio(folio);
>> + if (fault_folio != folio) {
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + }
>> +
>> + }
>> +out:
>> + return ret;
>> +}
>> +
>> +static int hmm_vma_capture_migrate_range(unsigned long start,
>> + unsigned long end,
>> + struct mm_walk *walk)
>> +{
>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>> + struct hmm_range *range = hmm_vma_walk->range;
>> +
>> + if (!hmm_select_migrate(range))
>> + return 0;
>> +
>> + if (hmm_vma_walk->vma && (hmm_vma_walk->vma != walk->vma))
>> + return -ERANGE;
>> +
>> + hmm_vma_walk->vma = walk->vma;
>> + hmm_vma_walk->start = start;
>> + hmm_vma_walk->end = end;
>> +
>> + if (end - start > range->end - range->start)
>> + return -ERANGE;
>> +
>> + if (!hmm_vma_walk->mmu_range.owner) {
>> + mmu_notifier_range_init_owner(&hmm_vma_walk->mmu_range, MMU_NOTIFY_MIGRATE, 0,
>> + walk->vma->vm_mm, start, end,
>> + range->dev_private_owner);
>> + mmu_notifier_invalidate_range_start(&hmm_vma_walk->mmu_range);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> static int hmm_vma_walk_pmd(pmd_t *pmdp,
>> unsigned long start,
>> unsigned long end,
>> @@ -404,42 +878,90 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>> &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
>> unsigned long npages = (end - start) >> PAGE_SHIFT;
>> unsigned long addr = start;
>> + enum migrate_vma_info minfo;
>> + unsigned long i;
>> + spinlock_t *ptl;
>> pte_t *ptep;
>> pmd_t pmd;
>> + int r;
>>
>> + minfo = hmm_select_migrate(range);
>> again:
>> +
>> pmd = pmdp_get_lockless(pmdp);
>> - if (pmd_none(pmd))
>> - return hmm_vma_walk_hole(start, end, -1, walk);
>> + if (pmd_none(pmd)) {
>> + r = hmm_vma_walk_hole(start, end, -1, walk);
>> + if (r || !minfo)
>> + return r;
>> +
>> + ptl = pmd_lock(walk->mm, pmdp);
>> + if (pmd_none(*pmdp)) {
>> + // hmm_vma_walk_hole() filled migration needs
>> + spin_unlock(ptl);
>> + return r;
>> + }
>> + spin_unlock(ptl);
>> + }
>>
>> if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
>> - if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
>> - hmm_vma_walk->last = addr;
>> - pmd_migration_entry_wait(walk->mm, pmdp);
>> - return -EBUSY;
>> + if (!minfo) {
>> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
>> + hmm_vma_walk->last = addr;
>> + pmd_migration_entry_wait(walk->mm, pmdp);
>> + return -EBUSY;
>> + }
>> }
>> - return hmm_pfns_fill(start, end, range, 0);
>> + for (i = 0; addr < end; addr += PAGE_SIZE, i++)
>> + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
>> +
>> + return 0;
>> }
>>
>> - if (!pmd_present(pmd))
>> - return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
>> - pmd);
>> + if (pmd_trans_huge(pmd) || !pmd_present(pmd)) {
>> +
>> + if (!pmd_present(pmd)) {
>> + r = hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
>> + pmd);
>> + if (r || !minfo)
>> + return r;
>> + } else {
>> +
>> + /*
>> + * No need to take pmd_lock here, even if some other thread
>> + * is splitting the huge pmd we will get that event through
>> + * mmu_notifier callback.
>> + *
>> + * So just read pmd value and check again it's a transparent
>> + * huge or device mapping one and compute corresponding pfn
>> + * values.
>> + */
>> +
>> + pmd = pmdp_get_lockless(pmdp);
>> + if (!pmd_trans_huge(pmd))
>> + goto again;
>> +
>> + r = hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
>> +
>> + if (r || !minfo)
>> + return r;
>> + }
>>
>> - if (pmd_trans_huge(pmd)) {
>> - /*
>> - * No need to take pmd_lock here, even if some other thread
>> - * is splitting the huge pmd we will get that event through
>> - * mmu_notifier callback.
>> - *
>> - * So just read pmd value and check again it's a transparent
>> - * huge or device mapping one and compute corresponding pfn
>> - * values.
>> - */
>> - pmd = pmdp_get_lockless(pmdp);
>> - if (!pmd_trans_huge(pmd))
>> - goto again;
>> + r = hmm_vma_handle_migrate_prepare_pmd(walk, pmdp, start, end, hmm_pfns);
>> +
>> + if (r == -ENOENT) {
>> + r = hmm_vma_walk_split(pmdp, addr, walk);
>> + if (r) {
>> + /* Split not successful, skip */
>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>> + }
>> +
>> + /* Split successful or "again", reloop */
>> + hmm_vma_walk->last = addr;
>> + return -EBUSY;
>> + }
>> +
>> + return r;
>>
>> - return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
>> }
>>
>> /*
>> @@ -451,22 +973,26 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>> if (pmd_bad(pmd)) {
>> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>> return -EFAULT;
>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>> }
>>
>> ptep = pte_offset_map(pmdp, addr);
>> if (!ptep)
>> goto again;
>> for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
>> - int r;
>>
>> r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
>> if (r) {
>> /* hmm_vma_handle_pte() did pte_unmap() */
>> return r;
>> }
>> +
>> + r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
>> + if (r)
>> + break;
>> }
>> pte_unmap(ptep - 1);
>> +
>> return 0;
>> }
>>
>> @@ -600,6 +1126,11 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
>> struct hmm_vma_walk *hmm_vma_walk = walk->private;
>> struct hmm_range *range = hmm_vma_walk->range;
>> struct vm_area_struct *vma = walk->vma;
>> + int r;
>> +
>> + r = hmm_vma_capture_migrate_range(start, end, walk);
>> + if (r)
>> + return r;
>>
>> if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
>> vma->vm_flags & VM_READ)
>> @@ -622,7 +1153,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
>> (end - start) >> PAGE_SHIFT, 0))
>> return -EFAULT;
>>
>> - hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>>
>> /* Skip this vma and continue processing the next vma. */
>> return 1;
>> @@ -652,9 +1183,17 @@ static const struct mm_walk_ops hmm_walk_ops = {
>> * the invalidation to finish.
>> * -EFAULT: A page was requested to be valid and could not be made valid
>> * ie it has no backing VMA or it is illegal to access
>> + * -ERANGE: The range crosses multiple VMAs, or space for hmm_pfns array
>> + * is too low.
>> *
>> * This is similar to get_user_pages(), except that it can read the page tables
>> * without mutating them (ie causing faults).
>> + *
>> + * If want to do migrate after faultin, call hmm_rangem_fault() with
> s/faultin/faulting
>
> s/hmm_rangem_fault/hmm_range_fault
Will fix, thanks!
>
>> + * HMM_PFN_REQ_MIGRATE and initialize range.migrate field.
> I'm not following the HMM_PFN_REQ_MIGRATE usage.
>
>> + * After hmm_range_fault() call migrate_hmm_range_setup() instead of
>> + * migrate_vma_setup() and after that follow normal migrate calls path.
>> + *
> Also, since migrate_vma_setup() calls hmm_range_fault() and then
> migrate_hmm_range_setup(), what would the use case be for not just calling
> migrate_vma_setup()?
It's the call context that matters. When resolving device faults, policy decides that
some regions are mirrored and some are migrated; there you start with
call(s) to hmm_range_fault() (optionally with HMM_PFN_REQ_MIGRATE),
and when migrating you explicitly call migrate_hmm_range_setup() plus the other migrate_vma_*() calls.
When only resolving device faults, it's just the hmm_range_fault() part, like before.
For current migration users, migrate_vma_setup() keeps the current behavior, calling
hmm_range_fault() under the hood to install migration ptes,
after optionally bringing in swapped pages (with MIGRATE_VMA_FAULT).
I think we will get a better understanding of how the entry points should look once this is used with a real device.
For now I have tried to keep the current behavior and entry points, refactoring under the hood
for code reuse.
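In other words, from the driver's point of view the two flows are roughly as
follows (sketch only; dst page allocation, copying and error handling are up
to the driver as today):

	/* device fault path: mirror or migrate per policy */
	ret = hmm_range_fault(&range);	/* default_flags may include HMM_PFN_REQ_MIGRATE */
	if (!ret && migrating) {
		migrate_hmm_range_setup(&range);
		migrate_vma_pages(&migrate);
		migrate_vma_finalize(&migrate);
	}

	/* existing migrate path: entry point unchanged for drivers */
	ret = migrate_vma_setup(&args);	/* args.flags may include MIGRATE_VMA_FAULT */
	if (!ret) {
		migrate_vma_pages(&args);
		migrate_vma_finalize(&args);
	}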
>
>> */
>> int hmm_range_fault(struct hmm_range *range)
>> {
>> @@ -662,16 +1201,28 @@ int hmm_range_fault(struct hmm_range *range)
>> .range = range,
>> .last = range->start,
>> };
>> - struct mm_struct *mm = range->notifier->mm;
>> + bool is_fault_path = !!range->notifier;
>> + struct mm_struct *mm;
>> int ret;
>>
>> + /*
>> + *
>> + * Could be serving a device fault or come from migrate
>> + * entry point. For the former we have not resolved the vma
>> + * yet, and the latter we don't have a notifier (but have a vma).
>> + *
>> + */
>> + mm = is_fault_path ? range->notifier->mm : range->migrate->vma->vm_mm;
>> mmap_assert_locked(mm);
>>
>> do {
>> /* If range is no longer valid force retry. */
>> - if (mmu_interval_check_retry(range->notifier,
>> - range->notifier_seq))
>> - return -EBUSY;
>> + if (is_fault_path && mmu_interval_check_retry(range->notifier,
>> + range->notifier_seq)) {
>> + ret = -EBUSY;
>> + break;
>> + }
>> +
>> ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
>> &hmm_walk_ops, &hmm_vma_walk);
>> /*
>> @@ -681,6 +1232,18 @@ int hmm_range_fault(struct hmm_range *range)
>> * output, and all >= are still at their input values.
>> */
>> } while (ret == -EBUSY);
>> +
>> + if (hmm_select_migrate(range) && range->migrate &&
>> + hmm_vma_walk.mmu_range.owner) {
>> + // The migrate_vma path has the following initialized
>> + if (is_fault_path) {
>> + range->migrate->vma = hmm_vma_walk.vma;
>> + range->migrate->start = range->start;
>> + range->migrate->end = hmm_vma_walk.end;
>> + }
>> + mmu_notifier_invalidate_range_end(&hmm_vma_walk.mmu_range);
>> + }
>> +
>> return ret;
>> }
>> EXPORT_SYMBOL(hmm_range_fault);
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index 23379663b1e1..d89efdfca8f6 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -734,7 +734,17 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
>> */
>> int migrate_vma_setup(struct migrate_vma *args)
>> {
>> + int ret;
>> long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
>> + struct hmm_range range = {
>> + .notifier = NULL,
>> + .start = args->start,
>> + .end = args->end,
>> + .migrate = args,
>> + .hmm_pfns = args->src,
>> + .dev_private_owner = args->pgmap_owner,
>> + .migrate = args
>> + };
>>
>> args->start &= PAGE_MASK;
>> args->end &= PAGE_MASK;
>> @@ -759,17 +769,19 @@ int migrate_vma_setup(struct migrate_vma *args)
>> args->cpages = 0;
>> args->npages = 0;
>>
>> - migrate_vma_collect(args);
>> + if (args->flags & MIGRATE_VMA_FAULT)
>> + range.default_flags |= HMM_PFN_REQ_FAULT;
> Next level here might be to skip faulting pte_none() in hmm_range_fault
> too?
That would be a possible optimization!
>
> Matt
Thanks,
Mika
>
>> +
>> + ret = hmm_range_fault(&range);
>>
>> - if (args->cpages)
>> - migrate_vma_unmap(args);
>> + migrate_hmm_range_setup(&range);
>>
>> /*
>> * At this point pages are locked and unmapped, and thus they have
>> * stable content and can safely be copied to destination memory that
>> * is allocated by the drivers.
>> */
>> - return 0;
>> + return ret;
>>
>> }
>> EXPORT_SYMBOL(migrate_vma_setup);
>> @@ -1489,3 +1501,64 @@ int migrate_device_coherent_folio(struct folio *folio)
>> return 0;
>> return -EBUSY;
>> }
>> +
>> +void migrate_hmm_range_setup(struct hmm_range *range)
>> +{
>> +
>> + struct migrate_vma *migrate = range->migrate;
>> +
>> + if (!migrate)
>> + return;
>> +
>> + migrate->npages = (migrate->end - migrate->start) >> PAGE_SHIFT;
>> + migrate->cpages = 0;
>> +
>> + for (unsigned long i = 0; i < migrate->npages; i++) {
>> +
>> + unsigned long pfn = range->hmm_pfns[i];
>> +
>> + pfn &= ~HMM_PFN_INOUT_FLAGS;
>> +
>> + /*
>> + *
>> + * Don't do migration if valid and migrate flags are not both set.
>> + *
>> + */
>> + if ((pfn & (HMM_PFN_VALID | HMM_PFN_MIGRATE)) !=
>> + (HMM_PFN_VALID | HMM_PFN_MIGRATE)) {
>> + migrate->src[i] = 0;
>> + migrate->dst[i] = 0;
>> + continue;
>> + }
>> +
>> + migrate->cpages++;
>> +
>> + /*
>> + *
>> + * The zero page is encoded in a special way, valid and migrate is
>> + * set, and pfn part is zero. Encode specially for migrate also.
>> + *
>> + */
>> + if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE)) {
>> + migrate->src[i] = MIGRATE_PFN_MIGRATE;
>> + migrate->dst[i] = 0;
>> + continue;
>> + }
>> + if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE|HMM_PFN_COMPOUND)) {
>> + migrate->src[i] = MIGRATE_PFN_MIGRATE|MIGRATE_PFN_COMPOUND;
>> + migrate->dst[i] = 0;
>> + continue;
>> + }
>> +
>> + migrate->src[i] = migrate_pfn(page_to_pfn(hmm_pfn_to_page(pfn)))
>> + | MIGRATE_PFN_MIGRATE;
>> + migrate->src[i] |= (pfn & HMM_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0;
>> + migrate->src[i] |= (pfn & HMM_PFN_COMPOUND) ? MIGRATE_PFN_COMPOUND : 0;
>> + migrate->dst[i] = 0;
>> + }
>> +
>> + if (migrate->cpages)
>> + migrate_vma_unmap(migrate);
>> +
>> +}
>> +EXPORT_SYMBOL(migrate_hmm_range_setup);
>> --
>> 2.50.0
>>
On Fri, Jan 16, 2026 at 12:39:30PM +0200, Mika Penttilä wrote:
> Hi,
>
> On 1/16/26 04:06, Matthew Brost wrote:
>
> > On Wed, Jan 14, 2026 at 11:19:21AM +0200, mpenttil@redhat.com wrote:
> >> From: Mika Penttilä <mpenttil@redhat.com>
> >>
> >> Currently, the way device page faulting and migration works
> >> is not optimal, if you want to do both fault handling and
> >> migration at once.
> >>
> >> Being able to migrate not present pages (or pages mapped with incorrect
> >> permissions, eg. COW) to the GPU requires doing either of the
> >> following sequences:
> >>
> >> 1. hmm_range_fault() - fault in non-present pages with correct permissions, etc.
> >> 2. migrate_vma_*() - migrate the pages
> >>
> >> Or:
> >>
> >> 1. migrate_vma_*() - migrate present pages
> >> 2. If non-present pages detected by migrate_vma_*():
> >> a) call hmm_range_fault() to fault pages in
> >> b) call migrate_vma_*() again to migrate now present pages
> >>
> >> The problem with the first sequence is that you always have to do two
> >> page walks even when most of the time the pages are present or zero page
> >> mappings so the common case takes a performance hit.
> >>
> >> The second sequence is better for the common case, but far worse if
> >> pages aren't present because now you have to walk the page tables three
> >> times (once to find the page is not present, once so hmm_range_fault()
> >> can find a non-present page to fault in and once again to setup the
> >> migration). It is also tricky to code correctly.
> >>
> >> We should be able to walk the page table once, faulting
> >> pages in as required and replacing them with migration entries if
> >> requested.
> >>
> >> Add a new flag to HMM APIs, HMM_PFN_REQ_MIGRATE,
> >> which tells to prepare for migration also during fault handling.
> >> Also, for the migrate_vma_setup() call paths, a flags, MIGRATE_VMA_FAULT,
> >> is added to tell to add fault handling to migrate.
> >>
> >> Cc: David Hildenbrand <david@redhat.com>
> >> Cc: Jason Gunthorpe <jgg@nvidia.com>
> >> Cc: Leon Romanovsky <leonro@nvidia.com>
> >> Cc: Alistair Popple <apopple@nvidia.com>
> >> Cc: Balbir Singh <balbirs@nvidia.com>
> >> Cc: Zi Yan <ziy@nvidia.com>
> >> Cc: Matthew Brost <matthew.brost@intel.com>
> > I'll try to test this when I can but horribly behind at the moment.
> >
> > You can use Intel's CI system to test SVM too. I can get you authorized
> > to use this. The list to trigger is intel-xe@lists.freedesktop.org and
> > patches must apply to drm-tip. I'll let you know when you are
> > authorized.
>
> Thanks, appreciated, I will do that as well!
>
Working on enabling this for you in CI.
I did a quick test by running our complete test suite and got a kernel
hang in this section:
xe_exec_system_allocator.threads-shared-vm-many-stride-malloc-prefetch
Stack trace:
[ 182.915763] INFO: task xe_exec_system_:5357 blocked for more than 30 seconds.
[ 182.922866] Tainted: G U W 6.19.0-rc4-xe+ #2549
[ 182.929183] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 182.936912] task:xe_exec_system_ state:D stack:0 pid:5357 tgid:1862 ppid:1853 task_flags:0x400040 flags:0x00080000
[ 182.936916] Call Trace:
[ 182.936918] <TASK>
[ 182.936919] __schedule+0x4df/0xc20
[ 182.936924] schedule+0x22/0xd0
[ 182.936925] io_schedule+0x41/0x60
[ 182.936926] migration_entry_wait_on_locked+0x21c/0x2a0
[ 182.936929] ? __pfx_wake_page_function+0x10/0x10
[ 182.936931] migration_entry_wait+0xad/0xf0
[ 182.936933] hmm_vma_walk_pmd+0xd5f/0x19b0
[ 182.936935] walk_pgd_range+0x51d/0xa60
[ 182.936938] __walk_page_range+0x75/0x1e0
[ 182.936940] walk_page_range_mm_unsafe+0x138/0x1f0
[ 182.936941] hmm_range_fault+0x8f/0x160
[ 182.936945] drm_gpusvm_get_pages+0x1ae/0x8a0 [drm_gpusvm_helper]
[ 182.936949] drm_gpusvm_range_get_pages+0x2d/0x40 [drm_gpusvm_helper]
[ 182.936951] xe_svm_range_get_pages+0x1b/0x50 [xe]
[ 182.936979] xe_vm_bind_ioctl+0x15c3/0x17e0 [xe]
[ 182.937001] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
[ 182.937021] ? drm_ioctl_kernel+0xa3/0x100
[ 182.937024] drm_ioctl_kernel+0xa3/0x100
[ 182.937026] drm_ioctl+0x213/0x440
[ 182.937028] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
[ 182.937061] xe_drm_ioctl+0x5a/0xa0 [xe]
[ 182.937083] __x64_sys_ioctl+0x7f/0xd0
[ 182.937085] do_syscall_64+0x50/0x290
[ 182.937088] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 182.937091] RIP: 0033:0x7ff00f724ded
[ 182.937092] RSP: 002b:00007ff00b9fa640 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 182.937094] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007ff00f724ded
[ 182.937095] RDX: 00007ff00b9fa6d0 RSI: 0000000040886445 RDI: 0000000000000003
[ 182.937096] RBP: 00007ff00b9fa690 R08: 0000000000000000 R09: 0000000000000000
[ 182.937097] R10: 0000000000000001 R11: 0000000000000246 R12: 00007ff00b9fa6d0
[ 182.937098] R13: 0000000040886445 R14: 0000000000000003 R15: 00007ff00f8a9000
[ 182.937099] </TASK>
This section is a racy test with parallel CPU and device access that is
likely causing the migration process to abort and retry. From the stack
trace, it looks like a migration PMD didn’t get properly removed, and a
subsequent call to hmm_range_fault hangs on a migration entry that was
not removed during the migration abort.
IIRC, some of the last bits in Balbir’s large device pages series had a
similar bug, which I sent to Andrew with fixup patches. I suspect you
have a similar bug. If I can find the time, I’ll see if I can track it
down.
> >> Suggested-by: Alistair Popple <apopple@nvidia.com>
> >> Signed-off-by: Mika Penttilä <mpenttil@redhat.com>
> >> ---
> >> include/linux/hmm.h | 17 +-
> >> include/linux/migrate.h | 6 +-
> >> mm/hmm.c | 657 +++++++++++++++++++++++++++++++++++++---
> >> mm/migrate_device.c | 81 ++++-
> >> 4 files changed, 706 insertions(+), 55 deletions(-)
> >>
> >> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> >> index db75ffc949a7..7b7294ad0f62 100644
> >> --- a/include/linux/hmm.h
> >> +++ b/include/linux/hmm.h
> >> @@ -12,7 +12,7 @@
> >> #include <linux/mm.h>
> >>
> >> struct mmu_interval_notifier;
> >> -
> >> +struct migrate_vma;
> >> /*
> >> * On output:
> >> * 0 - The page is faultable and a future call with
> >> @@ -48,15 +48,25 @@ enum hmm_pfn_flags {
> >> HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
> >> HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
> >>
> >> - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
> >> + /* Migrate request */
> >> + HMM_PFN_MIGRATE = 1UL << (BITS_PER_LONG - 7),
> >> + HMM_PFN_COMPOUND = 1UL << (BITS_PER_LONG - 8),
> >> + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 13),
> >>
> >> /* Input flags */
> >> HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
> >> HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
> >> + HMM_PFN_REQ_MIGRATE = HMM_PFN_MIGRATE,
> > I believe you are missing kernel for HMM_PFN_MIGRATE.
>
> I explain the HMM_PFN_MIGRATE usage below.
>
I had a typo here: s/missing kernel/missing doc kernel/
> >
> >>
> >> HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
> >> };
> >>
> >> +enum {
> >> + /* These flags are carried from input-to-output */
> >> + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
> >> + HMM_PFN_P2PDMA_BUS,
> >> +};
> >> +
> >> /*
> >> * hmm_pfn_to_page() - return struct page pointed to by a device entry
> >> *
> >> @@ -107,6 +117,7 @@ static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn)
> >> * @default_flags: default flags for the range (write, read, ... see hmm doc)
> >> * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
> >> * @dev_private_owner: owner of device private pages
> >> + * @migrate: structure for migrating the associated vma
> >> */
> >> struct hmm_range {
> >> struct mmu_interval_notifier *notifier;
> >> @@ -117,12 +128,14 @@ struct hmm_range {
> >> unsigned long default_flags;
> >> unsigned long pfn_flags_mask;
> >> void *dev_private_owner;
> >> + struct migrate_vma *migrate;
> >> };
> >>
> >> /*
> >> * Please see Documentation/mm/hmm.rst for how to use the range API.
> >> */
> >> int hmm_range_fault(struct hmm_range *range);
> >> +int hmm_range_migrate_prepare(struct hmm_range *range, struct migrate_vma **pargs);
> >>
> >> /*
> >> * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
> >> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> >> index 26ca00c325d9..0889309a9d21 100644
> >> --- a/include/linux/migrate.h
> >> +++ b/include/linux/migrate.h
> >> @@ -3,6 +3,7 @@
> >> #define _LINUX_MIGRATE_H
> >>
> >> #include <linux/mm.h>
> >> +#include <linux/hmm.h>
> >> #include <linux/mempolicy.h>
> >> #include <linux/migrate_mode.h>
> >> #include <linux/hugetlb.h>
> >> @@ -140,11 +141,12 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
> >> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
> >> }
> >>
> >> -enum migrate_vma_direction {
> >> +enum migrate_vma_info {
> >> MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
> >> MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
> >> MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
> >> MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
> >> + MIGRATE_VMA_FAULT = 1 << 4,
> >> };
> >>
> >> struct migrate_vma {
> >> @@ -192,7 +194,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> >> unsigned long npages);
> >> void migrate_device_finalize(unsigned long *src_pfns,
> >> unsigned long *dst_pfns, unsigned long npages);
> >> -
> >> +void migrate_hmm_range_setup(struct hmm_range *range);
> >> #endif /* CONFIG_MIGRATION */
> >>
> >> #endif /* _LINUX_MIGRATE_H */
> >> diff --git a/mm/hmm.c b/mm/hmm.c
> >> index 4ec74c18bef6..39a07d895043 100644
> >> --- a/mm/hmm.c
> >> +++ b/mm/hmm.c
> >> @@ -20,6 +20,7 @@
> >> #include <linux/pagemap.h>
> >> #include <linux/leafops.h>
> >> #include <linux/hugetlb.h>
> >> +#include <linux/migrate.h>
> >> #include <linux/memremap.h>
> >> #include <linux/sched/mm.h>
> >> #include <linux/jump_label.h>
> >> @@ -31,8 +32,12 @@
> >> #include "internal.h"
> >>
> >> struct hmm_vma_walk {
> >> - struct hmm_range *range;
> >> - unsigned long last;
> >> + struct mmu_notifier_range mmu_range;
> >> + struct vm_area_struct *vma;
> >> + struct hmm_range *range;
> >> + unsigned long start;
> >> + unsigned long end;
> >> + unsigned long last;
> >> };
> >>
> >> enum {
> >> @@ -41,21 +46,49 @@ enum {
> >> HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
> >> };
> >>
> >> -enum {
> >> - /* These flags are carried from input-to-output */
> >> - HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
> >> - HMM_PFN_P2PDMA_BUS,
> >> -};
> >> +static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
> >> +{
> >> + enum migrate_vma_info minfo;
> >> +
> >> + minfo = range->migrate ? range->migrate->flags : 0;
> >> + minfo |= (range->default_flags & HMM_PFN_REQ_MIGRATE) ?
> >> + MIGRATE_VMA_SELECT_SYSTEM : 0;
> > I'm trying to make sense of HMM_PFN_REQ_MIGRATE and why it sets
> > MIGRATE_VMA_SELECT_SYSTEM?
> >
> > Also the just the general usage - would range->migrate be NULL in the
> > expected usage.
> >
> > Maybe an example of how hmm_range_fault would be called with this flag
> > and subsequent expected migrate calls would clear this up.
>
> hmm_select_migrate() figures out the migration type (or none) that's going
> to happen based on the caller (of hmm_range_fault() or migrate_vma_setup()).
> 
> HMM_PFN_REQ_MIGRATE is the hmm_range_fault() user's way to say it wants
> to migrate on fault (with default_flags, so for the whole range), and for that
> the direction is system->device.
Can’t range->migrate->flags just set MIGRATE_VMA_SELECT_SYSTEM?
As far as I can tell, HMM_PFN_REQ_MIGRATE doesn’t really do anything if
range->migrate is NULL or range->migrate->flags is 0.
> migrate_vma_setup() users (the current device migration) express their
> migration intention with migrate_vma.flags (the direction being whichever the flags select).
>
> >
> >> +
> >> + return minfo;
> >> +}
> >>
> >> static int hmm_pfns_fill(unsigned long addr, unsigned long end,
> >> - struct hmm_range *range, unsigned long cpu_flags)
> >> + struct hmm_vma_walk *hmm_vma_walk, unsigned long cpu_flags)
> >> {
> >> + struct hmm_range *range = hmm_vma_walk->range;
> >> unsigned long i = (addr - range->start) >> PAGE_SHIFT;
> >> + enum migrate_vma_info minfo;
> >> + bool migrate = false;
> >> +
> >> + minfo = hmm_select_migrate(range);
> >> + if (cpu_flags != HMM_PFN_ERROR) {
> >> + if (minfo && (vma_is_anonymous(hmm_vma_walk->vma))) {
> >> + cpu_flags |= (HMM_PFN_VALID | HMM_PFN_MIGRATE);
> >> + migrate = true;
> >> + }
> >> + }
> >> +
> >> + if (migrate && thp_migration_supported() &&
> >> + (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
> >> + IS_ALIGNED(addr, HPAGE_PMD_SIZE) &&
> >> + IS_ALIGNED(end, HPAGE_PMD_SIZE)) {
> >> + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> >> + range->hmm_pfns[i] |= cpu_flags | HMM_PFN_COMPOUND;
> >> + addr += PAGE_SIZE;
> >> + i++;
> >> + cpu_flags = 0;
> >> + }
> >>
> >> for (; addr < end; addr += PAGE_SIZE, i++) {
> >> range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> >> range->hmm_pfns[i] |= cpu_flags;
> >> }
> >> +
> >> return 0;
> >> }
> >>
> >> @@ -171,11 +204,11 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
> >> if (!walk->vma) {
> >> if (required_fault)
> >> return -EFAULT;
> >> - return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
> >> + return hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
> >> }
> >> if (required_fault)
> >> return hmm_vma_fault(addr, end, required_fault, walk);
> >> - return hmm_pfns_fill(addr, end, range, 0);
> >> + return hmm_pfns_fill(addr, end, hmm_vma_walk, 0);
> >> }
> >>
> >> static inline unsigned long hmm_pfn_flags_order(unsigned long order)
> >> @@ -289,10 +322,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> >> goto fault;
> >>
> >> if (softleaf_is_migration(entry)) {
> >> - pte_unmap(ptep);
> >> - hmm_vma_walk->last = addr;
> >> - migration_entry_wait(walk->mm, pmdp, addr);
> >> - return -EBUSY;
> >> + if (!hmm_select_migrate(range)) {
> >> + pte_unmap(ptep);
> >> + hmm_vma_walk->last = addr;
> >> + migration_entry_wait(walk->mm, pmdp, addr);
> >> + return -EBUSY;
> >> + } else
> >> + goto out;
> >> }
> >>
> >> /* Report error for everything else */
> >> @@ -376,7 +412,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> >> return -EFAULT;
> >> }
> >>
> >> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> >> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >> }
> >> #else
> >> static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> >> @@ -389,10 +425,448 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> >>
> >> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> >> return -EFAULT;
> >> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> >> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >> }
> >> #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> >>
> >> +/**
> >> + * migrate_vma_split_folio() - Helper function to split a THP folio
> >> + * @folio: the folio to split
> >> + * @fault_page: struct page associated with the fault if any
> >> + *
> >> + * Returns 0 on success
> >> + */
> >> +static int migrate_vma_split_folio(struct folio *folio,
> >> + struct page *fault_page)
> >> +{
> >> + int ret;
> >> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
> >> + struct folio *new_fault_folio = NULL;
> >> +
> >> + if (folio != fault_folio) {
> >> + folio_get(folio);
> >> + folio_lock(folio);
> >> + }
> >> +
> >> + ret = split_folio(folio);
> >> + if (ret) {
> >> + if (folio != fault_folio) {
> >> + folio_unlock(folio);
> >> + folio_put(folio);
> >> + }
> >> + return ret;
> >> + }
> >> +
> >> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
> >> +
> >> + /*
> >> + * Ensure the lock is held on the correct
> >> + * folio after the split
> >> + */
> >> + if (!new_fault_folio) {
> >> + folio_unlock(folio);
> >> + folio_put(folio);
> >> + } else if (folio != new_fault_folio) {
> >> + if (new_fault_folio != fault_folio) {
> >> + folio_get(new_fault_folio);
> >> + folio_lock(new_fault_folio);
> >> + }
> >> + folio_unlock(folio);
> >> + folio_put(folio);
> >> + }
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
> >> + pmd_t *pmdp,
> >> + unsigned long start,
> >> + unsigned long end,
> >> + unsigned long *hmm_pfn)
> >> +{
> >> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >> + struct hmm_range *range = hmm_vma_walk->range;
> >> + struct migrate_vma *migrate = range->migrate;
> >> + struct mm_struct *mm = walk->vma->vm_mm;
> >> + struct folio *fault_folio = NULL;
> >> + struct folio *folio;
> >> + enum migrate_vma_info minfo;
> >> + spinlock_t *ptl;
> >> + unsigned long i;
> >> + int r = 0;
> >> +
> >> + minfo = hmm_select_migrate(range);
> >> + if (!minfo)
> >> + return r;
> >> +
> >> + fault_folio = (migrate && migrate->fault_page) ?
> >> + page_folio(migrate->fault_page) : NULL;
> >> +
> >> + ptl = pmd_lock(mm, pmdp);
> >> + if (pmd_none(*pmdp)) {
> >> + spin_unlock(ptl);
> >> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
> >> + }
> >> +
> >> + if (pmd_trans_huge(*pmdp)) {
> >> + if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
> >> + goto out;
> >> +
> >> + folio = pmd_folio(*pmdp);
> >> + if (is_huge_zero_folio(folio)) {
> >> + spin_unlock(ptl);
> >> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
> >> + }
> >> +
> >> + } else if (!pmd_present(*pmdp)) {
> >> + const softleaf_t entry = softleaf_from_pmd(*pmdp);
> >> +
> >> + folio = softleaf_to_folio(entry);
> >> +
> >> + if (!softleaf_is_device_private(entry))
> >> + goto out;
> >> +
> >> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
> >> + goto out;
> >> + if (folio->pgmap->owner != migrate->pgmap_owner)
> >> + goto out;
> >> +
> >> + } else {
> >> + spin_unlock(ptl);
> >> + return -EBUSY;
> >> + }
> >> +
> >> + folio_get(folio);
> >> +
> >> + if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
> >> + spin_unlock(ptl);
> >> + folio_put(folio);
> >> + return 0;
> >> + }
> >> +
> >> + if (thp_migration_supported() &&
> >> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
> >> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
> >> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
> >> +
> >> + struct page_vma_mapped_walk pvmw = {
> >> + .ptl = ptl,
> >> + .address = start,
> >> + .pmd = pmdp,
> >> + .vma = walk->vma,
> >> + };
> >> +
> >> + hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
> >> +
> >> + r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
> >> + if (r) {
> >> + hmm_pfn[0] &= ~(HMM_PFN_MIGRATE | HMM_PFN_COMPOUND);
> >> + r = -ENOENT; // fallback
> >> + goto unlock_out;
> >> + }
> >> + for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
> >> + hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
> >> +
> >> + } else {
> >> + r = -ENOENT; // fallback
> >> + goto unlock_out;
> >> + }
> >> +
> >> +
> >> +out:
> >> + spin_unlock(ptl);
> >> + return r;
> >> +
> >> +unlock_out:
> >> + if (folio != fault_folio)
> >> + folio_unlock(folio);
> >> + folio_put(folio);
> >> + goto out;
> >> +
> >> +}
> >> +
> >> +/*
> >> + * Install migration entries if migration requested, either from fault
> >> + * or migrate paths.
> >> + *
> >> + */
> >> +static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> >> + pmd_t *pmdp,
> >> + unsigned long addr,
> >> + unsigned long *hmm_pfn)
> >> +{
> >> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >> + struct hmm_range *range = hmm_vma_walk->range;
> >> + struct migrate_vma *migrate = range->migrate;
> >> + struct mm_struct *mm = walk->vma->vm_mm;
> >> + struct folio *fault_folio = NULL;
> >> + enum migrate_vma_info minfo;
> >> + struct dev_pagemap *pgmap;
> >> + bool anon_exclusive;
> >> + struct folio *folio;
> >> + unsigned long pfn;
> >> + struct page *page;
> >> + softleaf_t entry;
> >> + pte_t pte, swp_pte;
> >> + spinlock_t *ptl;
> >> + bool writable = false;
> >> + pte_t *ptep;
> >> +
> >> + // Do we want to migrate at all?
> >> + minfo = hmm_select_migrate(range);
> >> + if (!minfo)
> >> + return 0;
> >> +
> >> + fault_folio = (migrate && migrate->fault_page) ?
> >> + page_folio(migrate->fault_page) : NULL;
> >> +
> >> +again:
> >> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> >> + if (!ptep)
> >> + return 0;
> >> +
> >> + pte = ptep_get(ptep);
> >> +
> >> + if (pte_none(pte)) {
> >> + // migrate without faulting case
> >> + if (vma_is_anonymous(walk->vma)) {
> >> + *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
> >> + *hmm_pfn |= HMM_PFN_MIGRATE | HMM_PFN_VALID;
> >> + goto out;
> >> + }
> >> + }
> >> +
> >> + if (!pte_present(pte)) {
> >> + /*
> >> + * Only care about unaddressable device page special
> >> + * page table entry. Other special swap entries are not
> >> + * migratable, and we ignore regular swapped page.
> >> + */
> >> + entry = softleaf_from_pte(pte);
> >> + if (!softleaf_is_device_private(entry))
> >> + goto out;
> >> +
> >> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
> >> + goto out;
> >> +
> >> + page = softleaf_to_page(entry);
> >> + folio = page_folio(page);
> >> + if (folio->pgmap->owner != migrate->pgmap_owner)
> >> + goto out;
> >> +
> >> + if (folio_test_large(folio)) {
> >> + int ret;
> >> +
> >> + pte_unmap_unlock(ptep, ptl);
> >> + ret = migrate_vma_split_folio(folio,
> >> + migrate->fault_page);
> >> + if (ret)
> >> + goto out_unlocked;
> >> + goto again;
> >> + }
> >> +
> >> + pfn = page_to_pfn(page);
> >> + if (softleaf_is_device_private_write(entry))
> >> + writable = true;
> >> + } else {
> >> + pfn = pte_pfn(pte);
> >> + if (is_zero_pfn(pfn) &&
> >> + (minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
> >> + *hmm_pfn = HMM_PFN_MIGRATE|HMM_PFN_VALID;
> >> + goto out;
> >> + }
> >> + page = vm_normal_page(walk->vma, addr, pte);
> >> + if (page && !is_zone_device_page(page) &&
> >> + !(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
> >> + goto out;
> >> + } else if (page && is_device_coherent_page(page)) {
> >> + pgmap = page_pgmap(page);
> >> +
> >> + if (!(minfo &
> >> + MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
> >> + pgmap->owner != migrate->pgmap_owner)
> >> + goto out;
> >> + }
> >> +
> >> + folio = page_folio(page);
> >> + if (folio_test_large(folio)) {
> >> + int ret;
> >> +
> >> + pte_unmap_unlock(ptep, ptl);
> >> + ret = migrate_vma_split_folio(folio,
> >> + migrate->fault_page);
> >> + if (ret)
> >> + goto out_unlocked;
> >> +
> >> + goto again;
> >> + }
> >> +
> >> + writable = pte_write(pte);
> >> + }
> >> +
> >> + if (!page || !page->mapping)
> >> + goto out;
> >> +
> >> + /*
> >> + * By getting a reference on the folio we pin it and that blocks
> >> + * any kind of migration. Side effect is that it "freezes" the
> >> + * pte.
> >> + *
> >> + * We drop this reference after isolating the folio from the lru
> >> + * for non device folio (device folio are not on the lru and thus
> >> + * can't be dropped from it).
> >> + */
> >> + folio = page_folio(page);
> >> + folio_get(folio);
> >> +
> >> + /*
> >> + * We rely on folio_trylock() to avoid deadlock between
> >> + * concurrent migrations where each is waiting on the others
> >> + * folio lock. If we can't immediately lock the folio we fail this
> >> + * migration as it is only best effort anyway.
> >> + *
> >> + * If we can lock the folio it's safe to set up a migration entry
> >> + * now. In the common case where the folio is mapped once in a
> >> + * single process setting up the migration entry now is an
> >> + * optimisation to avoid walking the rmap later with
> >> + * try_to_migrate().
> >> + */
> >> +
> >> + if (fault_folio == folio || folio_trylock(folio)) {
> >> + anon_exclusive = folio_test_anon(folio) &&
> >> + PageAnonExclusive(page);
> >> +
> >> + flush_cache_page(walk->vma, addr, pfn);
> >> +
> >> + if (anon_exclusive) {
> >> + pte = ptep_clear_flush(walk->vma, addr, ptep);
> >> +
> >> + if (folio_try_share_anon_rmap_pte(folio, page)) {
> >> + set_pte_at(mm, addr, ptep, pte);
> >> + folio_unlock(folio);
> >> + folio_put(folio);
> >> + goto out;
> >> + }
> >> + } else {
> >> + pte = ptep_get_and_clear(mm, addr, ptep);
> >> + }
> >> +
> >> + /* Setup special migration page table entry */
> >> + if (writable)
> >> + entry = make_writable_migration_entry(pfn);
> >> + else if (anon_exclusive)
> >> + entry = make_readable_exclusive_migration_entry(pfn);
> >> + else
> >> + entry = make_readable_migration_entry(pfn);
> >> +
> >> + swp_pte = swp_entry_to_pte(entry);
> >> + if (pte_present(pte)) {
> >> + if (pte_soft_dirty(pte))
> >> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> >> + if (pte_uffd_wp(pte))
> >> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> >> + } else {
> >> + if (pte_swp_soft_dirty(pte))
> >> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> >> + if (pte_swp_uffd_wp(pte))
> >> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> >> + }
> >> +
> >> + set_pte_at(mm, addr, ptep, swp_pte);
> >> + folio_remove_rmap_pte(folio, page, walk->vma);
> >> + folio_put(folio);
> >> + *hmm_pfn |= HMM_PFN_MIGRATE;
> >> +
> >> + if (pte_present(pte))
> >> + flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
> >> + } else
> >> + folio_put(folio);
> >> +out:
> >> + pte_unmap_unlock(ptep, ptl);
> >> + return 0;
> >> +out_unlocked:
> >> + return -1;
> >> +
> >> +}
> >> +
> >> +static int hmm_vma_walk_split(pmd_t *pmdp,
> >> + unsigned long addr,
> >> + struct mm_walk *walk)
> >> +{
> >> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >> + struct hmm_range *range = hmm_vma_walk->range;
> >> + struct migrate_vma *migrate = range->migrate;
> >> + struct folio *folio, *fault_folio;
> >> + spinlock_t *ptl;
> >> + int ret = 0;
> >> +
> >> + fault_folio = (migrate && migrate->fault_page) ?
> >> + page_folio(migrate->fault_page) : NULL;
> >> +
> >> + ptl = pmd_lock(walk->mm, pmdp);
> >> + if (unlikely(!pmd_trans_huge(*pmdp))) {
> >> + spin_unlock(ptl);
> >> + goto out;
> >> + }
> >> +
> >> + folio = pmd_folio(*pmdp);
> >> + if (is_huge_zero_folio(folio)) {
> >> + spin_unlock(ptl);
> >> + split_huge_pmd(walk->vma, pmdp, addr);
> >> + } else {
> >> + folio_get(folio);
> >> + spin_unlock(ptl);
> >> +
> >> + if (folio != fault_folio) {
> >> + if (unlikely(!folio_trylock(folio))) {
> >> + folio_put(folio);
> >> + ret = -EBUSY;
> >> + goto out;
> >> + }
> >> + } else
> >> + folio_put(folio);
> >> +
> >> + ret = split_folio(folio);
> >> + if (fault_folio != folio) {
> >> + folio_unlock(folio);
> >> + folio_put(folio);
> >> + }
> >> +
> >> + }
> >> +out:
> >> + return ret;
> >> +}
> >> +
> >> +static int hmm_vma_capture_migrate_range(unsigned long start,
> >> + unsigned long end,
> >> + struct mm_walk *walk)
> >> +{
> >> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >> + struct hmm_range *range = hmm_vma_walk->range;
> >> +
> >> + if (!hmm_select_migrate(range))
> >> + return 0;
> >> +
> >> + if (hmm_vma_walk->vma && (hmm_vma_walk->vma != walk->vma))
> >> + return -ERANGE;
> >> +
> >> + hmm_vma_walk->vma = walk->vma;
> >> + hmm_vma_walk->start = start;
> >> + hmm_vma_walk->end = end;
> >> +
> >> + if (end - start > range->end - range->start)
> >> + return -ERANGE;
> >> +
> >> + if (!hmm_vma_walk->mmu_range.owner) {
> >> + mmu_notifier_range_init_owner(&hmm_vma_walk->mmu_range, MMU_NOTIFY_MIGRATE, 0,
> >> + walk->vma->vm_mm, start, end,
> >> + range->dev_private_owner);
> >> + mmu_notifier_invalidate_range_start(&hmm_vma_walk->mmu_range);
> >> + }
> >> +
> >> + return 0;
> >> +}
> >> +
> >> static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >> unsigned long start,
> >> unsigned long end,
> >> @@ -404,42 +878,90 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >> &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
> >> unsigned long npages = (end - start) >> PAGE_SHIFT;
> >> unsigned long addr = start;
> >> + enum migrate_vma_info minfo;
> >> + unsigned long i;
> >> + spinlock_t *ptl;
> >> pte_t *ptep;
> >> pmd_t pmd;
> >> + int r;
> >>
> >> + minfo = hmm_select_migrate(range);
> >> again:
> >> +
> >> pmd = pmdp_get_lockless(pmdp);
> >> - if (pmd_none(pmd))
> >> - return hmm_vma_walk_hole(start, end, -1, walk);
> >> + if (pmd_none(pmd)) {
> >> + r = hmm_vma_walk_hole(start, end, -1, walk);
> >> + if (r || !minfo)
> >> + return r;
> >> +
> >> + ptl = pmd_lock(walk->mm, pmdp);
> >> + if (pmd_none(*pmdp)) {
> >> + // hmm_vma_walk_hole() filled migration needs
> >> + spin_unlock(ptl);
> >> + return r;
> >> + }
> >> + spin_unlock(ptl);
> >> + }
> >>
> >> if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
> >> - if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
> >> - hmm_vma_walk->last = addr;
> >> - pmd_migration_entry_wait(walk->mm, pmdp);
> >> - return -EBUSY;
> >> + if (!minfo) {
> >> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
> >> + hmm_vma_walk->last = addr;
> >> + pmd_migration_entry_wait(walk->mm, pmdp);
> >> + return -EBUSY;
> >> + }
> >> }
> >> - return hmm_pfns_fill(start, end, range, 0);
> >> + for (i = 0; addr < end; addr += PAGE_SIZE, i++)
> >> + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> >> +
> >> + return 0;
> >> }
> >>
> >> - if (!pmd_present(pmd))
> >> - return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
> >> - pmd);
> >> + if (pmd_trans_huge(pmd) || !pmd_present(pmd)) {
> >> +
> >> + if (!pmd_present(pmd)) {
> >> + r = hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
> >> + pmd);
> >> + if (r || !minfo)
> >> + return r;
> >> + } else {
> >> +
> >> + /*
> >> + * No need to take pmd_lock here, even if some other thread
> >> + * is splitting the huge pmd we will get that event through
> >> + * mmu_notifier callback.
> >> + *
> >> + * So just read pmd value and check again it's a transparent
> >> + * huge or device mapping one and compute corresponding pfn
> >> + * values.
> >> + */
> >> +
> >> + pmd = pmdp_get_lockless(pmdp);
> >> + if (!pmd_trans_huge(pmd))
> >> + goto again;
> >> +
> >> + r = hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
> >> +
> >> + if (r || !minfo)
> >> + return r;
> >> + }
> >>
> >> - if (pmd_trans_huge(pmd)) {
> >> - /*
> >> - * No need to take pmd_lock here, even if some other thread
> >> - * is splitting the huge pmd we will get that event through
> >> - * mmu_notifier callback.
> >> - *
> >> - * So just read pmd value and check again it's a transparent
> >> - * huge or device mapping one and compute corresponding pfn
> >> - * values.
> >> - */
> >> - pmd = pmdp_get_lockless(pmdp);
> >> - if (!pmd_trans_huge(pmd))
> >> - goto again;
> >> + r = hmm_vma_handle_migrate_prepare_pmd(walk, pmdp, start, end, hmm_pfns);
> >> +
> >> + if (r == -ENOENT) {
> >> + r = hmm_vma_walk_split(pmdp, addr, walk);
> >> + if (r) {
> >> + /* Split not successful, skip */
> >> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >> + }
> >> +
> >> + /* Split successful or "again", reloop */
> >> + hmm_vma_walk->last = addr;
> >> + return -EBUSY;
> >> + }
> >> +
> >> + return r;
> >>
> >> - return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
> >> }
> >>
> >> /*
> >> @@ -451,22 +973,26 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >> if (pmd_bad(pmd)) {
> >> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> >> return -EFAULT;
> >> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> >> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >> }
> >>
> >> ptep = pte_offset_map(pmdp, addr);
> >> if (!ptep)
> >> goto again;
> >> for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
> >> - int r;
> >>
> >> r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
> >> if (r) {
> >> /* hmm_vma_handle_pte() did pte_unmap() */
> >> return r;
> >> }
> >> +
> >> + r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
> >> + if (r)
> >> + break;
> >> }
> >> pte_unmap(ptep - 1);
> >> +
> >> return 0;
> >> }
> >>
> >> @@ -600,6 +1126,11 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
> >> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >> struct hmm_range *range = hmm_vma_walk->range;
> >> struct vm_area_struct *vma = walk->vma;
> >> + int r;
> >> +
> >> + r = hmm_vma_capture_migrate_range(start, end, walk);
> >> + if (r)
> >> + return r;
> >>
> >> if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
> >> vma->vm_flags & VM_READ)
> >> @@ -622,7 +1153,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
> >> (end - start) >> PAGE_SHIFT, 0))
> >> return -EFAULT;
> >>
> >> - hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> >> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >>
> >> /* Skip this vma and continue processing the next vma. */
> >> return 1;
> >> @@ -652,9 +1183,17 @@ static const struct mm_walk_ops hmm_walk_ops = {
> >> * the invalidation to finish.
> >> * -EFAULT: A page was requested to be valid and could not be made valid
> >> * ie it has no backing VMA or it is illegal to access
> >> + * -ERANGE: The range crosses multiple VMAs, or space for hmm_pfns array
> >> + * is too low.
> >> *
> >> * This is similar to get_user_pages(), except that it can read the page tables
> >> * without mutating them (ie causing faults).
> >> + *
> >> + * If want to do migrate after faultin, call hmm_rangem_fault() with
> > s/faultin/faulting
> >
> > s/hmm_rangem_fault/hmm_range_fault
>
> Will fix, thanks!
>
> >
> >> + * HMM_PFN_REQ_MIGRATE and initialize range.migrate field.
> > I'm not following the HMM_PFN_REQ_MIGRATE usage.
> >
> >> + * After hmm_range_fault() call migrate_hmm_range_setup() instead of
> >> + * migrate_vma_setup() and after that follow normal migrate calls path.
> >> + *
> > Also since migrate_vma_setup calls hmm_range_fault then
> > migrate_hmm_range_setup what would the use case for not just calling
> > migrate_vma_setup be?
>
> It's the call context that matters. When resolving device faults, and based on a policy
> where some regions you mirror and some you migrate, you start with
> call(s) to hmm_range_fault() (optionally with HMM_PFN_REQ_MIGRATE),
> and explicitly call migrate_hmm_range_setup() + the other migrate_vma_*() calls when migrating.
> For resolving device faults it's just the hmm_range_fault() part, like before.
> For current migration users, migrate_vma_setup() keeps the current behavior, calling
> hmm_range_fault() under the hood to install migration ptes
> after optionally bringing in swapped pages (with MIGRATE_VMA_FAULT).
> I think we'll get a better understanding of how the entry points should look when they're used with a real device.
> For now I have tried to keep the current behavior and entry points, refactoring under the hood
> for code reuse.
>
I’m not completely following the above, but I did think of a case where
hmm_range_fault + migrate_hmm_range_setup is better than using
migrate_vma_setup() alone. Previously, before calling
migrate_vma_setup(), we had to look up the VMA. If hmm_range_fault
populates the migrate_vma arguments (including the VMA), that's better
than the current flow.
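
To make that concrete, here's a rough sketch of the fault-path flow as I read this
series (dst allocation, the copy itself and error handling elided; the owner pointer
and pfn arrays are driver-side assumptions):

	struct migrate_vma migrate = {
		.flags = MIGRATE_VMA_SELECT_SYSTEM,
		.pgmap_owner = drv_owner,
		.src = src_pfns,
		.dst = dst_pfns,
	};
	struct hmm_range range = {
		.notifier = &notifier,
		.notifier_seq = seq,
		.start = start,
		.end = end,
		.hmm_pfns = src_pfns,
		.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_MIGRATE,
		.dev_private_owner = drv_owner,
		.migrate = &migrate,
	};

	/* One walk: fault pages in and install migration entries. */
	ret = hmm_range_fault(&range);

	/* Fills migrate.src/cpages, unmaps, and populates migrate.vma/start/end. */
	migrate_hmm_range_setup(&range);

	/* Driver allocates dst pages and copies, then the usual tail: */
	migrate_vma_pages(&migrate);
	migrate_vma_finalize(&migrate);

If that's the intended usage, it does look nicer than resolving the VMA up front.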
> >
> >> */
> >> int hmm_range_fault(struct hmm_range *range)
> >> {
> >> @@ -662,16 +1201,28 @@ int hmm_range_fault(struct hmm_range *range)
> >> .range = range,
> >> .last = range->start,
> >> };
> >> - struct mm_struct *mm = range->notifier->mm;
> >> + bool is_fault_path = !!range->notifier;
> >> + struct mm_struct *mm;
> >> int ret;
> >>
> >> + /*
> >> + *
> >> + * Could be serving a device fault or come from migrate
> >> + * entry point. For the former we have not resolved the vma
> >> + * yet, and the latter we don't have a notifier (but have a vma).
> >> + *
> >> + */
> >> + mm = is_fault_path ? range->notifier->mm : range->migrate->vma->vm_mm;
> >> mmap_assert_locked(mm);
> >>
> >> do {
> >> /* If range is no longer valid force retry. */
> >> - if (mmu_interval_check_retry(range->notifier,
> >> - range->notifier_seq))
> >> - return -EBUSY;
> >> + if (is_fault_path && mmu_interval_check_retry(range->notifier,
> >> + range->notifier_seq)) {
> >> + ret = -EBUSY;
> >> + break;
> >> + }
> >> +
> >> ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
> >> &hmm_walk_ops, &hmm_vma_walk);
> >> /*
> >> @@ -681,6 +1232,18 @@ int hmm_range_fault(struct hmm_range *range)
> >> * output, and all >= are still at their input values.
> >> */
> >> } while (ret == -EBUSY);
> >> +
> >> + if (hmm_select_migrate(range) && range->migrate &&
> >> + hmm_vma_walk.mmu_range.owner) {
> >> + // The migrate_vma path has the following initialized
> >> + if (is_fault_path) {
> >> + range->migrate->vma = hmm_vma_walk.vma;
> >> + range->migrate->start = range->start;
> >> + range->migrate->end = hmm_vma_walk.end;
> >> + }
> >> + mmu_notifier_invalidate_range_end(&hmm_vma_walk.mmu_range);
> >> + }
> >> +
> >> return ret;
> >> }
> >> EXPORT_SYMBOL(hmm_range_fault);
> >> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >> index 23379663b1e1..d89efdfca8f6 100644
> >> --- a/mm/migrate_device.c
> >> +++ b/mm/migrate_device.c
> >> @@ -734,7 +734,17 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
> >> */
> >> int migrate_vma_setup(struct migrate_vma *args)
> >> {
> >> + int ret;
> >> long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
> >> + struct hmm_range range = {
> >> + .notifier = NULL,
> >> + .start = args->start,
> >> + .end = args->end,
> >> + .migrate = args,
> >> + .hmm_pfns = args->src,
> >> + .dev_private_owner = args->pgmap_owner,
> >> + .migrate = args
> >> + };
> >>
> >> args->start &= PAGE_MASK;
> >> args->end &= PAGE_MASK;
> >> @@ -759,17 +769,19 @@ int migrate_vma_setup(struct migrate_vma *args)
> >> args->cpages = 0;
> >> args->npages = 0;
> >>
> >> - migrate_vma_collect(args);
> >> + if (args->flags & MIGRATE_VMA_FAULT)
> >> + range.default_flags |= HMM_PFN_REQ_FAULT;
> > Next level here might be skip faulting pte_none() in hmm_range_fault
> > too?
>
> That would be a possible optimization!
>
Yes, I think that would make this really useful, and we (Xe’s SVM
implementation) would likely always want to set the flag that says
‘fault in !pte_present() && !pte_none’. I’m not 100% sure on this, but
that’s my initial feeling.
Matt
> >
> > Matt
>
> Thanks,
> Mika
>
> >
> >> +
> >> + ret = hmm_range_fault(&range);
> >>
> >> - if (args->cpages)
> >> - migrate_vma_unmap(args);
> >> + migrate_hmm_range_setup(&range);
> >>
> >> /*
> >> * At this point pages are locked and unmapped, and thus they have
> >> * stable content and can safely be copied to destination memory that
> >> * is allocated by the drivers.
> >> */
> >> - return 0;
> >> + return ret;
> >>
> >> }
> >> EXPORT_SYMBOL(migrate_vma_setup);
> >> @@ -1489,3 +1501,64 @@ int migrate_device_coherent_folio(struct folio *folio)
> >> return 0;
> >> return -EBUSY;
> >> }
> >> +
> >> +void migrate_hmm_range_setup(struct hmm_range *range)
> >> +{
> >> +
> >> + struct migrate_vma *migrate = range->migrate;
> >> +
> >> + if (!migrate)
> >> + return;
> >> +
> >> + migrate->npages = (migrate->end - migrate->start) >> PAGE_SHIFT;
> >> + migrate->cpages = 0;
> >> +
> >> + for (unsigned long i = 0; i < migrate->npages; i++) {
> >> +
> >> + unsigned long pfn = range->hmm_pfns[i];
> >> +
> >> + pfn &= ~HMM_PFN_INOUT_FLAGS;
> >> +
> >> + /*
> >> + *
> >> + * Don't do migration if valid and migrate flags are not both set.
> >> + *
> >> + */
> >> + if ((pfn & (HMM_PFN_VALID | HMM_PFN_MIGRATE)) !=
> >> + (HMM_PFN_VALID | HMM_PFN_MIGRATE)) {
> >> + migrate->src[i] = 0;
> >> + migrate->dst[i] = 0;
> >> + continue;
> >> + }
> >> +
> >> + migrate->cpages++;
> >> +
> >> + /*
> >> + *
> >> + * The zero page is encoded in a special way, valid and migrate is
> >> + * set, and pfn part is zero. Encode specially for migrate also.
> >> + *
> >> + */
> >> + if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE)) {
> >> + migrate->src[i] = MIGRATE_PFN_MIGRATE;
> >> + migrate->dst[i] = 0;
> >> + continue;
> >> + }
> >> + if (pfn == (HMM_PFN_VALID|HMM_PFN_MIGRATE|HMM_PFN_COMPOUND)) {
> >> + migrate->src[i] = MIGRATE_PFN_MIGRATE|MIGRATE_PFN_COMPOUND;
> >> + migrate->dst[i] = 0;
> >> + continue;
> >> + }
> >> +
> >> + migrate->src[i] = migrate_pfn(page_to_pfn(hmm_pfn_to_page(pfn)))
> >> + | MIGRATE_PFN_MIGRATE;
> >> + migrate->src[i] |= (pfn & HMM_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0;
> >> + migrate->src[i] |= (pfn & HMM_PFN_COMPOUND) ? MIGRATE_PFN_COMPOUND : 0;
> >> + migrate->dst[i] = 0;
> >> + }
> >> +
> >> + if (migrate->cpages)
> >> + migrate_vma_unmap(migrate);
> >> +
> >> +}
> >> +EXPORT_SYMBOL(migrate_hmm_range_setup);
> >> --
> >> 2.50.0
> >>
>
On 1/16/26 22:05, Matthew Brost wrote:
> On Fri, Jan 16, 2026 at 12:39:30PM +0200, Mika Penttilä wrote:
>> Hi,
>>
>> On 1/16/26 04:06, Matthew Brost wrote:
>>
>>> On Wed, Jan 14, 2026 at 11:19:21AM +0200, mpenttil@redhat.com wrote:
>>>> From: Mika Penttilä <mpenttil@redhat.com>
>>>>
>>>> Currently, the way device page faulting and migration works
>>>> is not optimal, if you want to do both fault handling and
>>>> migration at once.
>>>>
>>>> Being able to migrate not present pages (or pages mapped with incorrect
>>>> permissions, eg. COW) to the GPU requires doing either of the
>>>> following sequences:
>>>>
>>>> 1. hmm_range_fault() - fault in non-present pages with correct permissions, etc.
>>>> 2. migrate_vma_*() - migrate the pages
>>>>
>>>> Or:
>>>>
>>>> 1. migrate_vma_*() - migrate present pages
>>>> 2. If non-present pages detected by migrate_vma_*():
>>>> a) call hmm_range_fault() to fault pages in
>>>> b) call migrate_vma_*() again to migrate now present pages
>>>>
>>>> The problem with the first sequence is that you always have to do two
>>>> page walks even when most of the time the pages are present or zero page
>>>> mappings so the common case takes a performance hit.
>>>>
>>>> The second sequence is better for the common case, but far worse if
>>>> pages aren't present because now you have to walk the page tables three
>>>> times (once to find the page is not present, once so hmm_range_fault()
>>>> can find a non-present page to fault in and once again to setup the
>>>> migration). It is also tricky to code correctly.
>>>>
>>>> We should be able to walk the page table once, faulting
>>>> pages in as required and replacing them with migration entries if
>>>> requested.
>>>>
>>>> Add a new flag to HMM APIs, HMM_PFN_REQ_MIGRATE,
>>>> which tells to prepare for migration also during fault handling.
>>>> Also, for the migrate_vma_setup() call paths, a flags, MIGRATE_VMA_FAULT,
>>>> is added to tell to add fault handling to migrate.
>>>>
>>>> Cc: David Hildenbrand <david@redhat.com>
>>>> Cc: Jason Gunthorpe <jgg@nvidia.com>
>>>> Cc: Leon Romanovsky <leonro@nvidia.com>
>>>> Cc: Alistair Popple <apopple@nvidia.com>
>>>> Cc: Balbir Singh <balbirs@nvidia.com>
>>>> Cc: Zi Yan <ziy@nvidia.com>
>>>> Cc: Matthew Brost <matthew.brost@intel.com>
>>> I'll try to test this when I can but horribly behind at the moment.
>>>
>>> You can use Intel's CI system to test SVM too. I can get you authorized
>>> to use this. The list to trigger is intel-xe@lists.freedesktop.org and
>>> patches must apply to drm-tip. I'll let you know when you are
>>> authorized.
>> Thanks, appreciated, will do that also!
>>
> Working on enabling this for you in CI.
>
> I did a quick test by running our complete test suite and got a kernel
> hang in this section:
>
> xe_exec_system_allocator.threads-shared-vm-many-stride-malloc-prefetch
>
> Stack trace:
>
> [ 182.915763] INFO: task xe_exec_system_:5357 blocked for more than 30 seconds.
> [ 182.922866] Tainted: G U W 6.19.0-rc4-xe+ #2549
> [ 182.929183] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 182.936912] task:xe_exec_system_ state:D stack:0 pid:5357 tgid:1862 ppid:1853 task_flags:0x400040 flags:0x00080000
> [ 182.936916] Call Trace:
> [ 182.936918] <TASK>
> [ 182.936919] __schedule+0x4df/0xc20
> [ 182.936924] schedule+0x22/0xd0
> [ 182.936925] io_schedule+0x41/0x60
> [ 182.936926] migration_entry_wait_on_locked+0x21c/0x2a0
> [ 182.936929] ? __pfx_wake_page_function+0x10/0x10
> [ 182.936931] migration_entry_wait+0xad/0xf0
> [ 182.936933] hmm_vma_walk_pmd+0xd5f/0x19b0
> [ 182.936935] walk_pgd_range+0x51d/0xa60
> [ 182.936938] __walk_page_range+0x75/0x1e0
> [ 182.936940] walk_page_range_mm_unsafe+0x138/0x1f0
> [ 182.936941] hmm_range_fault+0x8f/0x160
> [ 182.936945] drm_gpusvm_get_pages+0x1ae/0x8a0 [drm_gpusvm_helper]
> [ 182.936949] drm_gpusvm_range_get_pages+0x2d/0x40 [drm_gpusvm_helper]
> [ 182.936951] xe_svm_range_get_pages+0x1b/0x50 [xe]
> [ 182.936979] xe_vm_bind_ioctl+0x15c3/0x17e0 [xe]
> [ 182.937001] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
> [ 182.937021] ? drm_ioctl_kernel+0xa3/0x100
> [ 182.937024] drm_ioctl_kernel+0xa3/0x100
> [ 182.937026] drm_ioctl+0x213/0x440
> [ 182.937028] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
> [ 182.937061] xe_drm_ioctl+0x5a/0xa0 [xe]
> [ 182.937083] __x64_sys_ioctl+0x7f/0xd0
> [ 182.937085] do_syscall_64+0x50/0x290
> [ 182.937088] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [ 182.937091] RIP: 0033:0x7ff00f724ded
> [ 182.937092] RSP: 002b:00007ff00b9fa640 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [ 182.937094] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007ff00f724ded
> [ 182.937095] RDX: 00007ff00b9fa6d0 RSI: 0000000040886445 RDI: 0000000000000003
> [ 182.937096] RBP: 00007ff00b9fa690 R08: 0000000000000000 R09: 0000000000000000
> [ 182.937097] R10: 0000000000000001 R11: 0000000000000246 R12: 00007ff00b9fa6d0
> [ 182.937098] R13: 0000000040886445 R14: 0000000000000003 R15: 00007ff00f8a9000
> [ 182.937099] </TASK>
>
> This section is a racy test with parallel CPU and device access that is
> likely causing the migration process to abort and retry. From the stack
> trace, it looks like a migration PMD didn’t get properly removed, and a
> subsequent call to hmm_range_fault hangs on a migration entry that was
> not removed during the migration abort.
>
> IIRC, some of the last bits in Balbir’s large device pages series had a
> similar bug, which I sent to Andrew with fixup patches. I suspect you
> have a similar bug. If I can find the time, I’ll see if I can track it
> down.
Thanks for your efforts, Matthew!
I remember those discussions, and this looks similar. If I recall correctly, it was
about failed splits and how to continue after them. The code bails out of the
pmd loop in this case, but maybe this part is missing; it would be equivalent to
the skip done by the collect path:
diff --git a/mm/hmm.c b/mm/hmm.c
index 39a07d895043..c7a7bb923a37 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -988,8 +988,12 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
}
r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
- if (r)
+ if (r) {
+ /* Split has failed, skip to end. */
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++)
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
break;
+ }
}
pte_unmap(ptep - 1);
With that it should be quite similar to the current flow (in effect).
>
>>>> Suggested-by: Alistair Popple <apopple@nvidia.com>
>>>> Signed-off-by: Mika Penttilä <mpenttil@redhat.com>
>>>> ---
>>>> include/linux/hmm.h | 17 +-
>>>> include/linux/migrate.h | 6 +-
>>>> mm/hmm.c | 657 +++++++++++++++++++++++++++++++++++++---
>>>> mm/migrate_device.c | 81 ++++-
>>>> 4 files changed, 706 insertions(+), 55 deletions(-)
>>>>
>>>> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
>>>> index db75ffc949a7..7b7294ad0f62 100644
>>>> --- a/include/linux/hmm.h
>>>> +++ b/include/linux/hmm.h
>>>> @@ -12,7 +12,7 @@
>>>> #include <linux/mm.h>
>>>>
>>>> struct mmu_interval_notifier;
>>>> -
>>>> +struct migrate_vma;
>>>> /*
>>>> * On output:
>>>> * 0 - The page is faultable and a future call with
>>>> @@ -48,15 +48,25 @@ enum hmm_pfn_flags {
>>>> HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
>>>> HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
>>>>
>>>> - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
>>>> + /* Migrate request */
>>>> + HMM_PFN_MIGRATE = 1UL << (BITS_PER_LONG - 7),
>>>> + HMM_PFN_COMPOUND = 1UL << (BITS_PER_LONG - 8),
>>>> + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 13),
>>>>
>>>> /* Input flags */
>>>> HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
>>>> HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
>>>> + HMM_PFN_REQ_MIGRATE = HMM_PFN_MIGRATE,
>>> I believe you are missing kernel for HMM_PFN_MIGRATE.
>> I explain below the HMM_PFN_MIGRATE usage.
>>
> I had a typo here: s/missing kernel/missing doc kernel/
ack
>
>>>>
>>>> HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
>>>> };
>>>>
>>>> +enum {
>>>> + /* These flags are carried from input-to-output */
>>>> + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
>>>> + HMM_PFN_P2PDMA_BUS,
>>>> +};
>>>> +
>>>> /*
>>>> * hmm_pfn_to_page() - return struct page pointed to by a device entry
>>>> *
>>>> @@ -107,6 +117,7 @@ static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn)
>>>> * @default_flags: default flags for the range (write, read, ... see hmm doc)
>>>> * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
>>>> * @dev_private_owner: owner of device private pages
>>>> + * @migrate: structure for migrating the associated vma
>>>> */
>>>> struct hmm_range {
>>>> struct mmu_interval_notifier *notifier;
>>>> @@ -117,12 +128,14 @@ struct hmm_range {
>>>> unsigned long default_flags;
>>>> unsigned long pfn_flags_mask;
>>>> void *dev_private_owner;
>>>> + struct migrate_vma *migrate;
>>>> };
>>>>
>>>> /*
>>>> * Please see Documentation/mm/hmm.rst for how to use the range API.
>>>> */
>>>> int hmm_range_fault(struct hmm_range *range);
>>>> +int hmm_range_migrate_prepare(struct hmm_range *range, struct migrate_vma **pargs);
>>>>
>>>> /*
>>>> * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>>>> index 26ca00c325d9..0889309a9d21 100644
>>>> --- a/include/linux/migrate.h
>>>> +++ b/include/linux/migrate.h
>>>> @@ -3,6 +3,7 @@
>>>> #define _LINUX_MIGRATE_H
>>>>
>>>> #include <linux/mm.h>
>>>> +#include <linux/hmm.h>
>>>> #include <linux/mempolicy.h>
>>>> #include <linux/migrate_mode.h>
>>>> #include <linux/hugetlb.h>
>>>> @@ -140,11 +141,12 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
>>>> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
>>>> }
>>>>
>>>> -enum migrate_vma_direction {
>>>> +enum migrate_vma_info {
>>>> MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
>>>> MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
>>>> MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
>>>> MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
>>>> + MIGRATE_VMA_FAULT = 1 << 4,
>>>> };
>>>>
>>>> struct migrate_vma {
>>>> @@ -192,7 +194,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>>>> unsigned long npages);
>>>> void migrate_device_finalize(unsigned long *src_pfns,
>>>> unsigned long *dst_pfns, unsigned long npages);
>>>> -
>>>> +void migrate_hmm_range_setup(struct hmm_range *range);
>>>> #endif /* CONFIG_MIGRATION */
>>>>
>>>> #endif /* _LINUX_MIGRATE_H */
>>>> diff --git a/mm/hmm.c b/mm/hmm.c
>>>> index 4ec74c18bef6..39a07d895043 100644
>>>> --- a/mm/hmm.c
>>>> +++ b/mm/hmm.c
>>>> @@ -20,6 +20,7 @@
>>>> #include <linux/pagemap.h>
>>>> #include <linux/leafops.h>
>>>> #include <linux/hugetlb.h>
>>>> +#include <linux/migrate.h>
>>>> #include <linux/memremap.h>
>>>> #include <linux/sched/mm.h>
>>>> #include <linux/jump_label.h>
>>>> @@ -31,8 +32,12 @@
>>>> #include "internal.h"
>>>>
>>>> struct hmm_vma_walk {
>>>> - struct hmm_range *range;
>>>> - unsigned long last;
>>>> + struct mmu_notifier_range mmu_range;
>>>> + struct vm_area_struct *vma;
>>>> + struct hmm_range *range;
>>>> + unsigned long start;
>>>> + unsigned long end;
>>>> + unsigned long last;
>>>> };
>>>>
>>>> enum {
>>>> @@ -41,21 +46,49 @@ enum {
>>>> HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
>>>> };
>>>>
>>>> -enum {
>>>> - /* These flags are carried from input-to-output */
>>>> - HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
>>>> - HMM_PFN_P2PDMA_BUS,
>>>> -};
>>>> +static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
>>>> +{
>>>> + enum migrate_vma_info minfo;
>>>> +
>>>> + minfo = range->migrate ? range->migrate->flags : 0;
>>>> + minfo |= (range->default_flags & HMM_PFN_REQ_MIGRATE) ?
>>>> + MIGRATE_VMA_SELECT_SYSTEM : 0;
>>> I'm trying to make sense of HMM_PFN_REQ_MIGRATE and why it sets
>>> MIGRATE_VMA_SELECT_SYSTEM?
>>>
>>> Also the just the general usage - would range->migrate be NULL in the
>>> expected usage.
>>>
>>> Maybe an example of how hmm_range_fault would be called with this flag
>>> and subsequent expected migrate calls would clear this up.
>> hmm_select_migrate() figures out the migration type (or none) that's going
>> to happen based on the caller (of hmm_range_fault() or migrate_vma_setup()).
>>
>> HMM_PFN_REQ_MIGRATE is the hmm_range_fault() user's way to say it wants
>> migration on fault (with default_flags, for the whole range), and for that the
>> direction is system->device.
> Can’t range->migrate->flags just set MIGRATE_VMA_SELECT_SYSTEM?
>
> As far as I can tell, HMM_PFN_REQ_MIGRATE doesn’t really do anything if
> range->migrate is NULL or range->migrate->flags is 0.
Yes, currently HMM_PFN_REQ_MIGRATE requires range->migrate to be set, but not
range->migrate->flags. But yes, setting range->migrate->flags directly would
also work.
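In other words, either of these (just a sketch, reusing the names from this series)
should end up selecting system->device migration in hmm_select_migrate():

	/* via the HMM request flag; migrate.flags can stay 0 */
	range.migrate = &migrate;
	range.default_flags |= HMM_PFN_REQ_MIGRATE;

	/* or directly via the migrate_vma flags */
	range.migrate = &migrate;
	migrate.flags |= MIGRATE_VMA_SELECT_SYSTEM;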
>
>> migrate_vma_setup() users (the current device migration) express their
>> migration intention with migrate_vma.flags (the direction being whichever they choose).
>>
>>>> +
>>>> + return minfo;
>>>> +}
>>>>
>>>> static int hmm_pfns_fill(unsigned long addr, unsigned long end,
>>>> - struct hmm_range *range, unsigned long cpu_flags)
>>>> + struct hmm_vma_walk *hmm_vma_walk, unsigned long cpu_flags)
>>>> {
>>>> + struct hmm_range *range = hmm_vma_walk->range;
>>>> unsigned long i = (addr - range->start) >> PAGE_SHIFT;
>>>> + enum migrate_vma_info minfo;
>>>> + bool migrate = false;
>>>> +
>>>> + minfo = hmm_select_migrate(range);
>>>> + if (cpu_flags != HMM_PFN_ERROR) {
>>>> + if (minfo && (vma_is_anonymous(hmm_vma_walk->vma))) {
>>>> + cpu_flags |= (HMM_PFN_VALID | HMM_PFN_MIGRATE);
>>>> + migrate = true;
>>>> + }
>>>> + }
>>>> +
>>>> + if (migrate && thp_migration_supported() &&
>>>> + (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
>>>> + IS_ALIGNED(addr, HPAGE_PMD_SIZE) &&
>>>> + IS_ALIGNED(end, HPAGE_PMD_SIZE)) {
>>>> + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
>>>> + range->hmm_pfns[i] |= cpu_flags | HMM_PFN_COMPOUND;
>>>> + addr += PAGE_SIZE;
>>>> + i++;
>>>> + cpu_flags = 0;
>>>> + }
>>>>
>>>> for (; addr < end; addr += PAGE_SIZE, i++) {
>>>> range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
>>>> range->hmm_pfns[i] |= cpu_flags;
>>>> }
>>>> +
>>>> return 0;
>>>> }
>>>>
>>>> @@ -171,11 +204,11 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
>>>> if (!walk->vma) {
>>>> if (required_fault)
>>>> return -EFAULT;
>>>> - return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
>>>> + return hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
>>>> }
>>>> if (required_fault)
>>>> return hmm_vma_fault(addr, end, required_fault, walk);
>>>> - return hmm_pfns_fill(addr, end, range, 0);
>>>> + return hmm_pfns_fill(addr, end, hmm_vma_walk, 0);
>>>> }
>>>>
>>>> static inline unsigned long hmm_pfn_flags_order(unsigned long order)
>>>> @@ -289,10 +322,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>>>> goto fault;
>>>>
>>>> if (softleaf_is_migration(entry)) {
>>>> - pte_unmap(ptep);
>>>> - hmm_vma_walk->last = addr;
>>>> - migration_entry_wait(walk->mm, pmdp, addr);
>>>> - return -EBUSY;
>>>> + if (!hmm_select_migrate(range)) {
>>>> + pte_unmap(ptep);
>>>> + hmm_vma_walk->last = addr;
>>>> + migration_entry_wait(walk->mm, pmdp, addr);
>>>> + return -EBUSY;
>>>> + } else
>>>> + goto out;
>>>> }
>>>>
>>>> /* Report error for everything else */
>>>> @@ -376,7 +412,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
>>>> return -EFAULT;
>>>> }
>>>>
>>>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>>>> }
>>>> #else
>>>> static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
>>>> @@ -389,10 +425,448 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
>>>>
>>>> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>>>> return -EFAULT;
>>>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>>>> }
>>>> #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>>>
>>>> +/**
>>>> + * migrate_vma_split_folio() - Helper function to split a THP folio
>>>> + * @folio: the folio to split
>>>> + * @fault_page: struct page associated with the fault if any
>>>> + *
>>>> + * Returns 0 on success
>>>> + */
>>>> +static int migrate_vma_split_folio(struct folio *folio,
>>>> + struct page *fault_page)
>>>> +{
>>>> + int ret;
>>>> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
>>>> + struct folio *new_fault_folio = NULL;
>>>> +
>>>> + if (folio != fault_folio) {
>>>> + folio_get(folio);
>>>> + folio_lock(folio);
>>>> + }
>>>> +
>>>> + ret = split_folio(folio);
>>>> + if (ret) {
>>>> + if (folio != fault_folio) {
>>>> + folio_unlock(folio);
>>>> + folio_put(folio);
>>>> + }
>>>> + return ret;
>>>> + }
>>>> +
>>>> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
>>>> +
>>>> + /*
>>>> + * Ensure the lock is held on the correct
>>>> + * folio after the split
>>>> + */
>>>> + if (!new_fault_folio) {
>>>> + folio_unlock(folio);
>>>> + folio_put(folio);
>>>> + } else if (folio != new_fault_folio) {
>>>> + if (new_fault_folio != fault_folio) {
>>>> + folio_get(new_fault_folio);
>>>> + folio_lock(new_fault_folio);
>>>> + }
>>>> + folio_unlock(folio);
>>>> + folio_put(folio);
>>>> + }
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
>>>> + pmd_t *pmdp,
>>>> + unsigned long start,
>>>> + unsigned long end,
>>>> + unsigned long *hmm_pfn)
>>>> +{
>>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>>>> + struct hmm_range *range = hmm_vma_walk->range;
>>>> + struct migrate_vma *migrate = range->migrate;
>>>> + struct mm_struct *mm = walk->vma->vm_mm;
>>>> + struct folio *fault_folio = NULL;
>>>> + struct folio *folio;
>>>> + enum migrate_vma_info minfo;
>>>> + spinlock_t *ptl;
>>>> + unsigned long i;
>>>> + int r = 0;
>>>> +
>>>> + minfo = hmm_select_migrate(range);
>>>> + if (!minfo)
>>>> + return r;
>>>> +
>>>> + fault_folio = (migrate && migrate->fault_page) ?
>>>> + page_folio(migrate->fault_page) : NULL;
>>>> +
>>>> + ptl = pmd_lock(mm, pmdp);
>>>> + if (pmd_none(*pmdp)) {
>>>> + spin_unlock(ptl);
>>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
>>>> + }
>>>> +
>>>> + if (pmd_trans_huge(*pmdp)) {
>>>> + if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
>>>> + goto out;
>>>> +
>>>> + folio = pmd_folio(*pmdp);
>>>> + if (is_huge_zero_folio(folio)) {
>>>> + spin_unlock(ptl);
>>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
>>>> + }
>>>> +
>>>> + } else if (!pmd_present(*pmdp)) {
>>>> + const softleaf_t entry = softleaf_from_pmd(*pmdp);
>>>> +
>>>> + folio = softleaf_to_folio(entry);
>>>> +
>>>> + if (!softleaf_is_device_private(entry))
>>>> + goto out;
>>>> +
>>>> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
>>>> + goto out;
>>>> + if (folio->pgmap->owner != migrate->pgmap_owner)
>>>> + goto out;
>>>> +
>>>> + } else {
>>>> + spin_unlock(ptl);
>>>> + return -EBUSY;
>>>> + }
>>>> +
>>>> + folio_get(folio);
>>>> +
>>>> + if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
>>>> + spin_unlock(ptl);
>>>> + folio_put(folio);
>>>> + return 0;
>>>> + }
>>>> +
>>>> + if (thp_migration_supported() &&
>>>> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
>>>> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
>>>> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
>>>> +
>>>> + struct page_vma_mapped_walk pvmw = {
>>>> + .ptl = ptl,
>>>> + .address = start,
>>>> + .pmd = pmdp,
>>>> + .vma = walk->vma,
>>>> + };
>>>> +
>>>> + hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
>>>> +
>>>> + r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
>>>> + if (r) {
>>>> + hmm_pfn[0] &= ~(HMM_PFN_MIGRATE | HMM_PFN_COMPOUND);
>>>> + r = -ENOENT; // fallback
>>>> + goto unlock_out;
>>>> + }
>>>> + for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
>>>> + hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
>>>> +
>>>> + } else {
>>>> + r = -ENOENT; // fallback
>>>> + goto unlock_out;
>>>> + }
>>>> +
>>>> +
>>>> +out:
>>>> + spin_unlock(ptl);
>>>> + return r;
>>>> +
>>>> +unlock_out:
>>>> + if (folio != fault_folio)
>>>> + folio_unlock(folio);
>>>> + folio_put(folio);
>>>> + goto out;
>>>> +
>>>> +}
>>>> +
>>>> +/*
>>>> + * Install migration entries if migration requested, either from fault
>>>> + * or migrate paths.
>>>> + *
>>>> + */
>>>> +static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
>>>> + pmd_t *pmdp,
>>>> + unsigned long addr,
>>>> + unsigned long *hmm_pfn)
>>>> +{
>>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>>>> + struct hmm_range *range = hmm_vma_walk->range;
>>>> + struct migrate_vma *migrate = range->migrate;
>>>> + struct mm_struct *mm = walk->vma->vm_mm;
>>>> + struct folio *fault_folio = NULL;
>>>> + enum migrate_vma_info minfo;
>>>> + struct dev_pagemap *pgmap;
>>>> + bool anon_exclusive;
>>>> + struct folio *folio;
>>>> + unsigned long pfn;
>>>> + struct page *page;
>>>> + softleaf_t entry;
>>>> + pte_t pte, swp_pte;
>>>> + spinlock_t *ptl;
>>>> + bool writable = false;
>>>> + pte_t *ptep;
>>>> +
>>>> + // Do we want to migrate at all?
>>>> + minfo = hmm_select_migrate(range);
>>>> + if (!minfo)
>>>> + return 0;
>>>> +
>>>> + fault_folio = (migrate && migrate->fault_page) ?
>>>> + page_folio(migrate->fault_page) : NULL;
>>>> +
>>>> +again:
>>>> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>>>> + if (!ptep)
>>>> + return 0;
>>>> +
>>>> + pte = ptep_get(ptep);
>>>> +
>>>> + if (pte_none(pte)) {
>>>> + // migrate without faulting case
>>>> + if (vma_is_anonymous(walk->vma)) {
>>>> + *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
>>>> + *hmm_pfn |= HMM_PFN_MIGRATE | HMM_PFN_VALID;
>>>> + goto out;
>>>> + }
>>>> + }
>>>> +
>>>> + if (!pte_present(pte)) {
>>>> + /*
>>>> + * Only care about unaddressable device page special
>>>> + * page table entry. Other special swap entries are not
>>>> + * migratable, and we ignore regular swapped page.
>>>> + */
>>>> + entry = softleaf_from_pte(pte);
>>>> + if (!softleaf_is_device_private(entry))
>>>> + goto out;
>>>> +
>>>> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
>>>> + goto out;
>>>> +
>>>> + page = softleaf_to_page(entry);
>>>> + folio = page_folio(page);
>>>> + if (folio->pgmap->owner != migrate->pgmap_owner)
>>>> + goto out;
>>>> +
>>>> + if (folio_test_large(folio)) {
>>>> + int ret;
>>>> +
>>>> + pte_unmap_unlock(ptep, ptl);
>>>> + ret = migrate_vma_split_folio(folio,
>>>> + migrate->fault_page);
>>>> + if (ret)
>>>> + goto out_unlocked;
>>>> + goto again;
>>>> + }
>>>> +
>>>> + pfn = page_to_pfn(page);
>>>> + if (softleaf_is_device_private_write(entry))
>>>> + writable = true;
>>>> + } else {
>>>> + pfn = pte_pfn(pte);
>>>> + if (is_zero_pfn(pfn) &&
>>>> + (minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
>>>> + *hmm_pfn = HMM_PFN_MIGRATE|HMM_PFN_VALID;
>>>> + goto out;
>>>> + }
>>>> + page = vm_normal_page(walk->vma, addr, pte);
>>>> + if (page && !is_zone_device_page(page) &&
>>>> + !(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
>>>> + goto out;
>>>> + } else if (page && is_device_coherent_page(page)) {
>>>> + pgmap = page_pgmap(page);
>>>> +
>>>> + if (!(minfo &
>>>> + MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
>>>> + pgmap->owner != migrate->pgmap_owner)
>>>> + goto out;
>>>> + }
>>>> +
>>>> + folio = page_folio(page);
>>>> + if (folio_test_large(folio)) {
>>>> + int ret;
>>>> +
>>>> + pte_unmap_unlock(ptep, ptl);
>>>> + ret = migrate_vma_split_folio(folio,
>>>> + migrate->fault_page);
>>>> + if (ret)
>>>> + goto out_unlocked;
>>>> +
>>>> + goto again;
>>>> + }
>>>> +
>>>> + writable = pte_write(pte);
>>>> + }
>>>> +
>>>> + if (!page || !page->mapping)
>>>> + goto out;
>>>> +
>>>> + /*
>>>> + * By getting a reference on the folio we pin it and that blocks
>>>> + * any kind of migration. Side effect is that it "freezes" the
>>>> + * pte.
>>>> + *
>>>> + * We drop this reference after isolating the folio from the lru
>>>> + * for non device folio (device folio are not on the lru and thus
>>>> + * can't be dropped from it).
>>>> + */
>>>> + folio = page_folio(page);
>>>> + folio_get(folio);
>>>> +
>>>> + /*
>>>> + * We rely on folio_trylock() to avoid deadlock between
>>>> + * concurrent migrations where each is waiting on the others
>>>> + * folio lock. If we can't immediately lock the folio we fail this
>>>> + * migration as it is only best effort anyway.
>>>> + *
>>>> + * If we can lock the folio it's safe to set up a migration entry
>>>> + * now. In the common case where the folio is mapped once in a
>>>> + * single process setting up the migration entry now is an
>>>> + * optimisation to avoid walking the rmap later with
>>>> + * try_to_migrate().
>>>> + */
>>>> +
>>>> + if (fault_folio == folio || folio_trylock(folio)) {
>>>> + anon_exclusive = folio_test_anon(folio) &&
>>>> + PageAnonExclusive(page);
>>>> +
>>>> + flush_cache_page(walk->vma, addr, pfn);
>>>> +
>>>> + if (anon_exclusive) {
>>>> + pte = ptep_clear_flush(walk->vma, addr, ptep);
>>>> +
>>>> + if (folio_try_share_anon_rmap_pte(folio, page)) {
>>>> + set_pte_at(mm, addr, ptep, pte);
>>>> + folio_unlock(folio);
>>>> + folio_put(folio);
>>>> + goto out;
>>>> + }
>>>> + } else {
>>>> + pte = ptep_get_and_clear(mm, addr, ptep);
>>>> + }
>>>> +
>>>> + /* Setup special migration page table entry */
>>>> + if (writable)
>>>> + entry = make_writable_migration_entry(pfn);
>>>> + else if (anon_exclusive)
>>>> + entry = make_readable_exclusive_migration_entry(pfn);
>>>> + else
>>>> + entry = make_readable_migration_entry(pfn);
>>>> +
>>>> + swp_pte = swp_entry_to_pte(entry);
>>>> + if (pte_present(pte)) {
>>>> + if (pte_soft_dirty(pte))
>>>> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
>>>> + if (pte_uffd_wp(pte))
>>>> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
>>>> + } else {
>>>> + if (pte_swp_soft_dirty(pte))
>>>> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
>>>> + if (pte_swp_uffd_wp(pte))
>>>> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
>>>> + }
>>>> +
>>>> + set_pte_at(mm, addr, ptep, swp_pte);
>>>> + folio_remove_rmap_pte(folio, page, walk->vma);
>>>> + folio_put(folio);
>>>> + *hmm_pfn |= HMM_PFN_MIGRATE;
>>>> +
>>>> + if (pte_present(pte))
>>>> + flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
>>>> + } else
>>>> + folio_put(folio);
>>>> +out:
>>>> + pte_unmap_unlock(ptep, ptl);
>>>> + return 0;
>>>> +out_unlocked:
>>>> + return -1;
>>>> +
>>>> +}
>>>> +
>>>> +static int hmm_vma_walk_split(pmd_t *pmdp,
>>>> + unsigned long addr,
>>>> + struct mm_walk *walk)
>>>> +{
>>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>>>> + struct hmm_range *range = hmm_vma_walk->range;
>>>> + struct migrate_vma *migrate = range->migrate;
>>>> + struct folio *folio, *fault_folio;
>>>> + spinlock_t *ptl;
>>>> + int ret = 0;
>>>> +
>>>> + fault_folio = (migrate && migrate->fault_page) ?
>>>> + page_folio(migrate->fault_page) : NULL;
>>>> +
>>>> + ptl = pmd_lock(walk->mm, pmdp);
>>>> + if (unlikely(!pmd_trans_huge(*pmdp))) {
>>>> + spin_unlock(ptl);
>>>> + goto out;
>>>> + }
>>>> +
>>>> + folio = pmd_folio(*pmdp);
>>>> + if (is_huge_zero_folio(folio)) {
>>>> + spin_unlock(ptl);
>>>> + split_huge_pmd(walk->vma, pmdp, addr);
>>>> + } else {
>>>> + folio_get(folio);
>>>> + spin_unlock(ptl);
>>>> +
>>>> + if (folio != fault_folio) {
>>>> + if (unlikely(!folio_trylock(folio))) {
>>>> + folio_put(folio);
>>>> + ret = -EBUSY;
>>>> + goto out;
>>>> + }
>>>> + } else
>>>> + folio_put(folio);
>>>> +
>>>> + ret = split_folio(folio);
>>>> + if (fault_folio != folio) {
>>>> + folio_unlock(folio);
>>>> + folio_put(folio);
>>>> + }
>>>> +
>>>> + }
>>>> +out:
>>>> + return ret;
>>>> +}
>>>> +
>>>> +static int hmm_vma_capture_migrate_range(unsigned long start,
>>>> + unsigned long end,
>>>> + struct mm_walk *walk)
>>>> +{
>>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
>>>> + struct hmm_range *range = hmm_vma_walk->range;
>>>> +
>>>> + if (!hmm_select_migrate(range))
>>>> + return 0;
>>>> +
>>>> + if (hmm_vma_walk->vma && (hmm_vma_walk->vma != walk->vma))
>>>> + return -ERANGE;
>>>> +
>>>> + hmm_vma_walk->vma = walk->vma;
>>>> + hmm_vma_walk->start = start;
>>>> + hmm_vma_walk->end = end;
>>>> +
>>>> + if (end - start > range->end - range->start)
>>>> + return -ERANGE;
>>>> +
>>>> + if (!hmm_vma_walk->mmu_range.owner) {
>>>> + mmu_notifier_range_init_owner(&hmm_vma_walk->mmu_range, MMU_NOTIFY_MIGRATE, 0,
>>>> + walk->vma->vm_mm, start, end,
>>>> + range->dev_private_owner);
>>>> + mmu_notifier_invalidate_range_start(&hmm_vma_walk->mmu_range);
>>>> + }
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> static int hmm_vma_walk_pmd(pmd_t *pmdp,
>>>> unsigned long start,
>>>> unsigned long end,
>>>> @@ -404,42 +878,90 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>>>> &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
>>>> unsigned long npages = (end - start) >> PAGE_SHIFT;
>>>> unsigned long addr = start;
>>>> + enum migrate_vma_info minfo;
>>>> + unsigned long i;
>>>> + spinlock_t *ptl;
>>>> pte_t *ptep;
>>>> pmd_t pmd;
>>>> + int r;
>>>>
>>>> + minfo = hmm_select_migrate(range);
>>>> again:
>>>> +
>>>> pmd = pmdp_get_lockless(pmdp);
>>>> - if (pmd_none(pmd))
>>>> - return hmm_vma_walk_hole(start, end, -1, walk);
>>>> + if (pmd_none(pmd)) {
>>>> + r = hmm_vma_walk_hole(start, end, -1, walk);
>>>> + if (r || !minfo)
>>>> + return r;
>>>> +
>>>> + ptl = pmd_lock(walk->mm, pmdp);
>>>> + if (pmd_none(*pmdp)) {
>>>> + // hmm_vma_walk_hole() filled migration needs
>>>> + spin_unlock(ptl);
>>>> + return r;
>>>> + }
>>>> + spin_unlock(ptl);
>>>> + }
>>>>
>>>> if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
>>>> - if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
>>>> - hmm_vma_walk->last = addr;
>>>> - pmd_migration_entry_wait(walk->mm, pmdp);
>>>> - return -EBUSY;
>>>> + if (!minfo) {
>>>> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
>>>> + hmm_vma_walk->last = addr;
>>>> + pmd_migration_entry_wait(walk->mm, pmdp);
>>>> + return -EBUSY;
>>>> + }
>>>> }
>>>> - return hmm_pfns_fill(start, end, range, 0);
>>>> + for (i = 0; addr < end; addr += PAGE_SIZE, i++)
>>>> + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
>>>> +
>>>> + return 0;
>>>> }
>>>>
>>>> - if (!pmd_present(pmd))
>>>> - return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
>>>> - pmd);
>>>> + if (pmd_trans_huge(pmd) || !pmd_present(pmd)) {
>>>> +
>>>> + if (!pmd_present(pmd)) {
>>>> + r = hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
>>>> + pmd);
>>>> + if (r || !minfo)
>>>> + return r;
>>>> + } else {
>>>> +
>>>> + /*
>>>> + * No need to take pmd_lock here, even if some other thread
>>>> + * is splitting the huge pmd we will get that event through
>>>> + * mmu_notifier callback.
>>>> + *
>>>> + * So just read pmd value and check again it's a transparent
>>>> + * huge or device mapping one and compute corresponding pfn
>>>> + * values.
>>>> + */
>>>> +
>>>> + pmd = pmdp_get_lockless(pmdp);
>>>> + if (!pmd_trans_huge(pmd))
>>>> + goto again;
>>>> +
>>>> + r = hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
>>>> +
>>>> + if (r || !minfo)
>>>> + return r;
>>>> + }
>>>>
>>>> - if (pmd_trans_huge(pmd)) {
>>>> - /*
>>>> - * No need to take pmd_lock here, even if some other thread
>>>> - * is splitting the huge pmd we will get that event through
>>>> - * mmu_notifier callback.
>>>> - *
>>>> - * So just read pmd value and check again it's a transparent
>>>> - * huge or device mapping one and compute corresponding pfn
>>>> - * values.
>>>> - */
>>>> - pmd = pmdp_get_lockless(pmdp);
>>>> - if (!pmd_trans_huge(pmd))
>>>> - goto again;
>>>> + r = hmm_vma_handle_migrate_prepare_pmd(walk, pmdp, start, end, hmm_pfns);
>>>> +
>>>> + if (r == -ENOENT) {
>>>> + r = hmm_vma_walk_split(pmdp, addr, walk);
>>>> + if (r) {
>>>> + /* Split not successful, skip */
>>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>>>> + }
>>>> +
>>>> + /* Split successful or "again", reloop */
>>>> + hmm_vma_walk->last = addr;
>>>> + return -EBUSY;
>>>> + }
>>>> +
>>>> + return r;
>>>>
>>>> - return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
>>>> }
>>>>
>>>> /*
>>>> @@ -451,22 +973,26 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>>>> if (pmd_bad(pmd)) {
>>>> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>>>> return -EFAULT;
>>>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>>>> }
>>>>
>>>> ptep = pte_offset_map(pmdp, addr);
>>>> if (!ptep)
>>>> goto again;
>>>> for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
>>>> - int r;
>>>>
>>>> r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
>>>> if (r) {
>>>> /* hmm_vma_handle_pte() did pte_unmap() */
>>>> return r;
>>>> }
>>>> +
>>>> + r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
>>>> + if (r)
>>>> + break;
>>>> }
>>>> pte_unmap(ptep - 1);
>>>> +
>>>> return 0;
>>>> }
>>>>
>>>> @@ -600,6 +1126,11 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
>>>> struct hmm_vma_walk *hmm_vma_walk = walk->private;
>>>> struct hmm_range *range = hmm_vma_walk->range;
>>>> struct vm_area_struct *vma = walk->vma;
>>>> + int r;
>>>> +
>>>> + r = hmm_vma_capture_migrate_range(start, end, walk);
>>>> + if (r)
>>>> + return r;
>>>>
>>>> if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
>>>> vma->vm_flags & VM_READ)
>>>> @@ -622,7 +1153,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
>>>> (end - start) >> PAGE_SHIFT, 0))
>>>> return -EFAULT;
>>>>
>>>> - hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>>>> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
>>>>
>>>> /* Skip this vma and continue processing the next vma. */
>>>> return 1;
>>>> @@ -652,9 +1183,17 @@ static const struct mm_walk_ops hmm_walk_ops = {
>>>> * the invalidation to finish.
>>>> * -EFAULT: A page was requested to be valid and could not be made valid
>>>> * ie it has no backing VMA or it is illegal to access
>>>> + * -ERANGE: The range crosses multiple VMAs, or space for hmm_pfns array
>>>> + * is too low.
>>>> *
>>>> * This is similar to get_user_pages(), except that it can read the page tables
>>>> * without mutating them (ie causing faults).
>>>> + *
>>>> + * If want to do migrate after faultin, call hmm_rangem_fault() with
>>> s/faultin/faulting
>>>
>>> s/hmm_rangem_fault/hmm_range_fault
>> Will fix, thanks!
>>
>>>> + * HMM_PFN_REQ_MIGRATE and initialize range.migrate field.
>>> I'm not following the HMM_PFN_REQ_MIGRATE usage.
>>>
>>>> + * After hmm_range_fault() call migrate_hmm_range_setup() instead of
>>>> + * migrate_vma_setup() and after that follow normal migrate calls path.
>>>> + *
>>> Also since migrate_vma_setup calls hmm_range_fault then
>>> migrate_hmm_range_setup what would the use case for not just calling
>>> migrate_vma_setup be?
>> It's the call context that matters. When resolving device faults, and based on a policy
>> where some regions you mirror and some you migrate, you start with
>> call(s) to hmm_range_fault() (optionally with HMM_PFN_REQ_MIGRATE),
>> and explicitly call migrate_hmm_range_setup() + the other migrate_vma_*() calls when migrating.
>> For resolving device faults it's just the hmm_range_fault() part, like before.
>> For current migration users, migrate_vma_setup() keeps the current behavior, calling
>> hmm_range_fault() under the hood to install migration ptes
>> after optionally bringing in swapped pages (with MIGRATE_VMA_FAULT).
>> I think we'll get a better understanding of how the entry points should look when they're used with a real device.
>> For now I have tried to keep the current behavior and entry points, refactoring under the hood
>> for code reuse.
>>
> I’m not completely following the above, but I did think of a case where
> hmm_range_fault + migrate_hmm_range_setup is better than using
> migrate_vma_setup() alone. Previously, before calling
> migrate_vma_setup(), we had to look up the VMA. If hmm_range_fault
> populates the migrate_vma arguments (including the VMA), that's better
> than the current flow.
Yes, that's true, and it's actually something I forgot to mention in the cover letter:
we now get the VMAs populated.
>
>>>> */
>>>> int hmm_range_fault(struct hmm_range *range)
>>>> {
>>>> @@ -662,16 +1201,28 @@ int hmm_range_fault(struct hmm_range *range)
>>>> .range = range,
>>>> .last = range->start,
>>>> };
>>>> - struct mm_struct *mm = range->notifier->mm;
>>>> + bool is_fault_path = !!range->notifier;
>>>> + struct mm_struct *mm;
>>>> int ret;
>>>>
>>>> + /*
>>>> + *
>>>> + * Could be serving a device fault or come from migrate
>>>> + * entry point. For the former we have not resolved the vma
>>>> + * yet, and the latter we don't have a notifier (but have a vma).
>>>> + *
>>>> + */
>>>> + mm = is_fault_path ? range->notifier->mm : range->migrate->vma->vm_mm;
>>>> mmap_assert_locked(mm);
>>>>
>>>> do {
>>>> /* If range is no longer valid force retry. */
>>>> - if (mmu_interval_check_retry(range->notifier,
>>>> - range->notifier_seq))
>>>> - return -EBUSY;
>>>> + if (is_fault_path && mmu_interval_check_retry(range->notifier,
>>>> + range->notifier_seq)) {
>>>> + ret = -EBUSY;
>>>> + break;
>>>> + }
>>>> +
>>>> ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
>>>> &hmm_walk_ops, &hmm_vma_walk);
>>>> /*
>>>> @@ -681,6 +1232,18 @@ int hmm_range_fault(struct hmm_range *range)
>>>> * output, and all >= are still at their input values.
>>>> */
>>>> } while (ret == -EBUSY);
>>>> +
>>>> + if (hmm_select_migrate(range) && range->migrate &&
>>>> + hmm_vma_walk.mmu_range.owner) {
>>>> + // The migrate_vma path has the following initialized
>>>> + if (is_fault_path) {
>>>> + range->migrate->vma = hmm_vma_walk.vma;
>>>> + range->migrate->start = range->start;
>>>> + range->migrate->end = hmm_vma_walk.end;
>>>> + }
>>>> + mmu_notifier_invalidate_range_end(&hmm_vma_walk.mmu_range);
>>>> + }
>>>> +
>>>> return ret;
>>>> }
>>>> EXPORT_SYMBOL(hmm_range_fault);
>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>> index 23379663b1e1..d89efdfca8f6 100644
>>>> --- a/mm/migrate_device.c
>>>> +++ b/mm/migrate_device.c
>>>> @@ -734,7 +734,17 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
>>>> */
>>>> int migrate_vma_setup(struct migrate_vma *args)
>>>> {
>>>> + int ret;
>>>> long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
>>>> + struct hmm_range range = {
>>>> + .notifier = NULL,
>>>> + .start = args->start,
>>>> + .end = args->end,
>>>> + .migrate = args,
>>>> + .hmm_pfns = args->src,
>>>> + .dev_private_owner = args->pgmap_owner,
>>>> + .migrate = args
>>>> + };
>>>>
>>>> args->start &= PAGE_MASK;
>>>> args->end &= PAGE_MASK;
>>>> @@ -759,17 +769,19 @@ int migrate_vma_setup(struct migrate_vma *args)
>>>> args->cpages = 0;
>>>> args->npages = 0;
>>>>
>>>> - migrate_vma_collect(args);
>>>> + if (args->flags & MIGRATE_VMA_FAULT)
>>>> + range.default_flags |= HMM_PFN_REQ_FAULT;
>>> Next level here might be skip faulting pte_none() in hmm_range_fault
>>> too?
>> That would be a possible optimization!
>>
> Yes, I think that would make this really useful, and we (Xe’s SVM
> implementation) would likely always want to set the flag that says
> ‘fault in !pte_present() && !pte_none’. I’m not 100% sure on this, but
> that’s my initial feeling.
Yes I will look into this.
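Maybe something like this in the pte handling (just a sketch, and
HMM_PFN_REQ_SKIP_NONE is a made-up flag name):

	/* Hypothetical knob: skip faulting plain pte_none() holes even when
	 * HMM_PFN_REQ_FAULT is set, so only swapped-out / wrongly-protected
	 * entries get faulted in. A hole can still be picked up by the
	 * migrate prepare step as "migrate without faulting".
	 */
	if (pte_none(pte) && (range->default_flags & HMM_PFN_REQ_SKIP_NONE)) {
		*hmm_pfn &= HMM_PFN_INOUT_FLAGS;
		return 0;
	}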
> Matt
--Mika
On Sat, Jan 17, 2026 at 09:07:36AM +0200, Mika Penttilä wrote:
> On 1/16/26 22:05, Matthew Brost wrote:
>
> > On Fri, Jan 16, 2026 at 12:39:30PM +0200, Mika Penttilä wrote:
> >> Hi,
> >>
> >> On 1/16/26 04:06, Matthew Brost wrote:
> >>
> >>> On Wed, Jan 14, 2026 at 11:19:21AM +0200, mpenttil@redhat.com wrote:
> >>>> From: Mika Penttilä <mpenttil@redhat.com>
> >>>>
> >>>> Currently, the way device page faulting and migration works
> >>>> is not optimal, if you want to do both fault handling and
> >>>> migration at once.
> >>>>
> >>>> Being able to migrate not present pages (or pages mapped with incorrect
> >>>> permissions, eg. COW) to the GPU requires doing either of the
> >>>> following sequences:
> >>>>
> >>>> 1. hmm_range_fault() - fault in non-present pages with correct permissions, etc.
> >>>> 2. migrate_vma_*() - migrate the pages
> >>>>
> >>>> Or:
> >>>>
> >>>> 1. migrate_vma_*() - migrate present pages
> >>>> 2. If non-present pages detected by migrate_vma_*():
> >>>> a) call hmm_range_fault() to fault pages in
> >>>> b) call migrate_vma_*() again to migrate now present pages
> >>>>
> >>>> The problem with the first sequence is that you always have to do two
> >>>> page walks even when most of the time the pages are present or zero page
> >>>> mappings so the common case takes a performance hit.
> >>>>
> >>>> The second sequence is better for the common case, but far worse if
> >>>> pages aren't present because now you have to walk the page tables three
> >>>> times (once to find the page is not present, once so hmm_range_fault()
> >>>> can find a non-present page to fault in and once again to setup the
> >>>> migration). It is also tricky to code correctly.
> >>>>
> >>>> We should be able to walk the page table once, faulting
> >>>> pages in as required and replacing them with migration entries if
> >>>> requested.
> >>>>
> >>>> Add a new flag to HMM APIs, HMM_PFN_REQ_MIGRATE,
> >>>> which tells to prepare for migration also during fault handling.
> >>>> Also, for the migrate_vma_setup() call paths, a flags, MIGRATE_VMA_FAULT,
> >>>> is added to tell to add fault handling to migrate.
> >>>>
> >>>> Cc: David Hildenbrand <david@redhat.com>
> >>>> Cc: Jason Gunthorpe <jgg@nvidia.com>
> >>>> Cc: Leon Romanovsky <leonro@nvidia.com>
> >>>> Cc: Alistair Popple <apopple@nvidia.com>
> >>>> Cc: Balbir Singh <balbirs@nvidia.com>
> >>>> Cc: Zi Yan <ziy@nvidia.com>
> >>>> Cc: Matthew Brost <matthew.brost@intel.com>
> >>> I'll try to test this when I can but horribly behind at the moment.
> >>>
> >>> You can use Intel's CI system to test SVM too. I can get you authorized
> >>> to use this. The list to trigger is intel-xe@lists.freedesktop.org and
> >>> patches must apply to drm-tip. I'll let you know when you are
> >>> authorized.
> >> Thanks, appreciate, will do that also!
> >>
> > Working on enabling this for you in CI.
> >
> > I did a quick test by running our complete test suite and got a kernel
> > hang in this section:
> >
> > xe_exec_system_allocator.threads-shared-vm-many-stride-malloc-prefetch
> >
> > Stack trace:
> >
> > [ 182.915763] INFO: task xe_exec_system_:5357 blocked for more than 30 seconds.
> > [ 182.922866] Tainted: G U W 6.19.0-rc4-xe+ #2549
> > [ 182.929183] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > [ 182.936912] task:xe_exec_system_ state:D stack:0 pid:5357 tgid:1862 ppid:1853 task_flags:0x400040 flags:0x00080000
> > [ 182.936916] Call Trace:
> > [ 182.936918] <TASK>
> > [ 182.936919] __schedule+0x4df/0xc20
> > [ 182.936924] schedule+0x22/0xd0
> > [ 182.936925] io_schedule+0x41/0x60
> > [ 182.936926] migration_entry_wait_on_locked+0x21c/0x2a0
> > [ 182.936929] ? __pfx_wake_page_function+0x10/0x10
> > [ 182.936931] migration_entry_wait+0xad/0xf0
> > [ 182.936933] hmm_vma_walk_pmd+0xd5f/0x19b0
> > [ 182.936935] walk_pgd_range+0x51d/0xa60
> > [ 182.936938] __walk_page_range+0x75/0x1e0
> > [ 182.936940] walk_page_range_mm_unsafe+0x138/0x1f0
> > [ 182.936941] hmm_range_fault+0x8f/0x160
> > [ 182.936945] drm_gpusvm_get_pages+0x1ae/0x8a0 [drm_gpusvm_helper]
> > [ 182.936949] drm_gpusvm_range_get_pages+0x2d/0x40 [drm_gpusvm_helper]
> > [ 182.936951] xe_svm_range_get_pages+0x1b/0x50 [xe]
> > [ 182.936979] xe_vm_bind_ioctl+0x15c3/0x17e0 [xe]
> > [ 182.937001] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
> > [ 182.937021] ? drm_ioctl_kernel+0xa3/0x100
> > [ 182.937024] drm_ioctl_kernel+0xa3/0x100
> > [ 182.937026] drm_ioctl+0x213/0x440
> > [ 182.937028] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
> > [ 182.937061] xe_drm_ioctl+0x5a/0xa0 [xe]
> > [ 182.937083] __x64_sys_ioctl+0x7f/0xd0
> > [ 182.937085] do_syscall_64+0x50/0x290
> > [ 182.937088] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [ 182.937091] RIP: 0033:0x7ff00f724ded
> > [ 182.937092] RSP: 002b:00007ff00b9fa640 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> > [ 182.937094] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007ff00f724ded
> > [ 182.937095] RDX: 00007ff00b9fa6d0 RSI: 0000000040886445 RDI: 0000000000000003
> > [ 182.937096] RBP: 00007ff00b9fa690 R08: 0000000000000000 R09: 0000000000000000
> > [ 182.937097] R10: 0000000000000001 R11: 0000000000000246 R12: 00007ff00b9fa6d0
> > [ 182.937098] R13: 0000000040886445 R14: 0000000000000003 R15: 00007ff00f8a9000
> > [ 182.937099] </TASK>
> >
> > This section is a racy test with parallel CPU and device access that is
> > likely causing the migration process to abort and retry. From the stack
> > trace, it looks like a migration PMD didn’t get properly removed, and a
> > subsequent call to hmm_range_fault hangs on a migration entry that was
> > not removed during the migration abort.
> >
> > IIRC, some of the last bits in Balbir’s large device pages series had a
> > similar bug, which I sent to Andrew with fixup patches. I suspect you
> > have a similar bug. If I can find the time, I’ll see if I can track it
> > down.
>
> Thanks for your efforts Matthew!
>
Happy to help.
> I remember those discussions, and this looks similar. If I recall correctly,
> it was about failed splits and the continuation after that. The code bails out
> of the pmd loop in this case, but maybe this part is missing; it would be
> equivalent to the collect skip:
>
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 39a07d895043..c7a7bb923a37 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -988,8 +988,12 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> }
>
> r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
> - if (r)
> + if (r) {
> + /* Split has failed, skip to end. */
> + for (i = 0; addr < end; addr += PAGE_SIZE, i++)
> + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> break;
> + }
> }
> pte_unmap(ptep - 1);
>
> With that it should be quite similar to the current flow (in terms of effects).
>
I think the above code is conceptually correct and needed, but I don’t
think it is the actual problem.
I traced the issue to the handle and prepare functions racing with other
migrations. The handle function doesn’t find a valid PTE/PMD and
populates the HMM PFN with zero. The prepare function then finds a valid
PTE/PMD and installs a migration entry. My code aborts the migration
because we only migrate if all pages in the requested range are found.
migrate_vma_pages/migrate_vma_finalize are called, but since some PFNs
are zero, the migration entries installed during the prepare step are
not removed.
This is actually very easy to reproduce on my end if I disable compound
page collection in my migrations and then run a test that migrates 2M
pages with racing CPU/GPU access. I suspect we could write a selftest to
replicate this.
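To spell out the interleaving I think is happening, simplified to a single
4K page:

	/*
	 *  other thread's migration          hmm_range_fault() walk
	 *  ------------------------          ----------------------
	 *                                    handle step: PTE is e.g. a
	 *                                    migration entry -> skipped,
	 *                                    hmm_pfn left at 0
	 *  migration completes, the PTE
	 *  becomes present again
	 *                                    prepare step: PTE now valid ->
	 *                                    installs a migration entry and
	 *                                    sets HMM_PFN_MIGRATE only
	 *
	 * Our driver then aborts because not every page in the range was
	 * collected; migrate_vma_pages()/migrate_vma_finalize() skip the
	 * zero hmm_pfns, so the migration entry installed by the prepare
	 * step is never removed and the next hmm_range_fault() waits on it
	 * forever.
	 */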
I have a hacky patch that fixes this — among other things I spotted
while looking at this — but I don’t think that’s the right approach. I
believe the right approach is to unify the handle and prepare functions
into a single one. If you look at what these two functions do, they are
actually quite similar; e.g., both inspect PTE/PMD state to make
decisions. Of course, if you want a migration you need the PTE/PMD
locks, and those would need to be dropped if you want to do something
like fault in a page. This still seems doable. I’ll let you and the HMM
maintainers duke this out regarding whether this is an acceptable
approach :).
FWIW, here are the changes I had to make to get my code stable. Tested
with compound page collection both on and off.
diff --git a/mm/hmm.c b/mm/hmm.c
index 39a07d895043..5e24cd82b393 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -509,8 +509,10 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
}
if (pmd_trans_huge(*pmdp)) {
- if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
+ if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
+ hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
goto out;
+ }
folio = pmd_folio(*pmdp);
if (is_huge_zero_folio(folio)) {
@@ -523,16 +525,22 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
folio = softleaf_to_folio(entry);
- if (!softleaf_is_device_private(entry))
- goto out;
-
- if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
+ if (!softleaf_is_device_private(entry)) {
+ hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
goto out;
- if (folio->pgmap->owner != migrate->pgmap_owner)
+ }
+ if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE)) {
+ hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
goto out;
+ }
+ if (folio->pgmap->owner != migrate->pgmap_owner) {
+ hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
+ goto out;
+ }
} else {
spin_unlock(ptl);
+ hmm_vma_walk->last = start;
return -EBUSY;
}
@@ -541,6 +549,7 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
spin_unlock(ptl);
folio_put(folio);
+ hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
return 0;
}
@@ -555,8 +564,10 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
.pmd = pmdp,
.vma = walk->vma,
};
+ unsigned long pfn = page_to_pfn(folio_page(folio, 0));
- hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
+ hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND |
+ HMM_PFN_VALID;
r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
if (r) {
@@ -564,6 +575,8 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
r = -ENOENT; // fallback
goto unlock_out;
}
+ hmm_pfn[0] &= HMM_PFN_FLAGS;
+ hmm_pfn[0] |= pfn;
for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
@@ -604,7 +617,7 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
struct dev_pagemap *pgmap;
bool anon_exclusive;
struct folio *folio;
- unsigned long pfn;
+ unsigned long pfn = 0;
struct page *page;
softleaf_t entry;
pte_t pte, swp_pte;
@@ -688,8 +701,8 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
goto out;
}
- folio = page_folio(page);
- if (folio_test_large(folio)) {
+ folio = page ? page_folio(page) : NULL;
+ if (folio && folio_test_large(folio)) {
int ret;
pte_unmap_unlock(ptep, ptl);
@@ -745,12 +758,15 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
set_pte_at(mm, addr, ptep, pte);
folio_unlock(folio);
folio_put(folio);
+ *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
goto out;
}
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
+ /* XXX: Migrate layer calls folio_mark_dirty if pte_dirty */
+
/* Setup special migration page table entry */
if (writable)
entry = make_writable_migration_entry(pfn);
@@ -759,6 +775,8 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
else
entry = make_readable_migration_entry(pfn);
+ /* XXX: Migrate layer makes entry young / dirty based on PTE */
+
swp_pte = swp_entry_to_pte(entry);
if (pte_present(pte)) {
if (pte_soft_dirty(pte))
@@ -775,8 +793,18 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
set_pte_at(mm, addr, ptep, swp_pte);
folio_remove_rmap_pte(folio, page, walk->vma);
folio_put(folio);
- *hmm_pfn |= HMM_PFN_MIGRATE;
+ /*
+ * XXX: It is possible the PTE wasn't present in the first part
+ * of the HMM walk, repopulate it...
+ */
+ *hmm_pfn &= HMM_PFN_FLAGS;
+ *hmm_pfn |= HMM_PFN_MIGRATE | pfn | HMM_PFN_VALID |
+ (writable ? HMM_PFN_WRITE : 0);
+ /*
+ * XXX: Is there a perf impact of calling flush_tlb_range on
+ * each PTE vs. range like migrate_vma layer?
+ */
if (pte_present(pte))
flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
} else
@@ -988,8 +1016,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
}
r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
- if (r)
+ if (r) {
+ hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
break;
+ }
}
pte_unmap(ptep - 1);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index c8f5a0615a5e..58d42d3ab673 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -268,6 +268,12 @@ int migrate_vma_setup(struct migrate_vma *args)
migrate_hmm_range_setup(&range);
+ /* Remove migration PTEs */
+ if (ret) {
+ migrate_vma_pages(args);
+ migrate_vma_finalize(args);
+ }
+
/*
* At this point pages are locked and unmapped, and thus they have
* stable content and can safely be copied to destination memory that
Matt
> >
> >>>> Suggested-by: Alistair Popple <apopple@nvidia.com>
> >>>> Signed-off-by: Mika Penttilä <mpenttil@redhat.com>
> >>>> ---
> >>>> include/linux/hmm.h | 17 +-
> >>>> include/linux/migrate.h | 6 +-
> >>>> mm/hmm.c | 657 +++++++++++++++++++++++++++++++++++++---
> >>>> mm/migrate_device.c | 81 ++++-
> >>>> 4 files changed, 706 insertions(+), 55 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> >>>> index db75ffc949a7..7b7294ad0f62 100644
> >>>> --- a/include/linux/hmm.h
> >>>> +++ b/include/linux/hmm.h
> >>>> @@ -12,7 +12,7 @@
> >>>> #include <linux/mm.h>
> >>>>
> >>>> struct mmu_interval_notifier;
> >>>> -
> >>>> +struct migrate_vma;
> >>>> /*
> >>>> * On output:
> >>>> * 0 - The page is faultable and a future call with
> >>>> @@ -48,15 +48,25 @@ enum hmm_pfn_flags {
> >>>> HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
> >>>> HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
> >>>>
> >>>> - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
> >>>> + /* Migrate request */
> >>>> + HMM_PFN_MIGRATE = 1UL << (BITS_PER_LONG - 7),
> >>>> + HMM_PFN_COMPOUND = 1UL << (BITS_PER_LONG - 8),
> >>>> + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 13),
> >>>>
> >>>> /* Input flags */
> >>>> HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
> >>>> HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
> >>>> + HMM_PFN_REQ_MIGRATE = HMM_PFN_MIGRATE,
> >>> I believe you are missing kernel for HMM_PFN_MIGRATE.
> >> I explain below the HMM_PFN_MIGRATE usage.
> >>
> > I had a typo here: s/missing kernel/missing doc kernel/
>
> ack
>
> >
> >>>>
> >>>> HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
> >>>> };
> >>>>
> >>>> +enum {
> >>>> + /* These flags are carried from input-to-output */
> >>>> + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
> >>>> + HMM_PFN_P2PDMA_BUS,
> >>>> +};
> >>>> +
> >>>> /*
> >>>> * hmm_pfn_to_page() - return struct page pointed to by a device entry
> >>>> *
> >>>> @@ -107,6 +117,7 @@ static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn)
> >>>> * @default_flags: default flags for the range (write, read, ... see hmm doc)
> >>>> * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
> >>>> * @dev_private_owner: owner of device private pages
> >>>> + * @migrate: structure for migrating the associated vma
> >>>> */
> >>>> struct hmm_range {
> >>>> struct mmu_interval_notifier *notifier;
> >>>> @@ -117,12 +128,14 @@ struct hmm_range {
> >>>> unsigned long default_flags;
> >>>> unsigned long pfn_flags_mask;
> >>>> void *dev_private_owner;
> >>>> + struct migrate_vma *migrate;
> >>>> };
> >>>>
> >>>> /*
> >>>> * Please see Documentation/mm/hmm.rst for how to use the range API.
> >>>> */
> >>>> int hmm_range_fault(struct hmm_range *range);
> >>>> +int hmm_range_migrate_prepare(struct hmm_range *range, struct migrate_vma **pargs);
> >>>>
> >>>> /*
> >>>> * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
> >>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> >>>> index 26ca00c325d9..0889309a9d21 100644
> >>>> --- a/include/linux/migrate.h
> >>>> +++ b/include/linux/migrate.h
> >>>> @@ -3,6 +3,7 @@
> >>>> #define _LINUX_MIGRATE_H
> >>>>
> >>>> #include <linux/mm.h>
> >>>> +#include <linux/hmm.h>
> >>>> #include <linux/mempolicy.h>
> >>>> #include <linux/migrate_mode.h>
> >>>> #include <linux/hugetlb.h>
> >>>> @@ -140,11 +141,12 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
> >>>> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
> >>>> }
> >>>>
> >>>> -enum migrate_vma_direction {
> >>>> +enum migrate_vma_info {
> >>>> MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
> >>>> MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
> >>>> MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
> >>>> MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
> >>>> + MIGRATE_VMA_FAULT = 1 << 4,
> >>>> };
> >>>>
> >>>> struct migrate_vma {
> >>>> @@ -192,7 +194,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> >>>> unsigned long npages);
> >>>> void migrate_device_finalize(unsigned long *src_pfns,
> >>>> unsigned long *dst_pfns, unsigned long npages);
> >>>> -
> >>>> +void migrate_hmm_range_setup(struct hmm_range *range);
> >>>> #endif /* CONFIG_MIGRATION */
> >>>>
> >>>> #endif /* _LINUX_MIGRATE_H */
> >>>> diff --git a/mm/hmm.c b/mm/hmm.c
> >>>> index 4ec74c18bef6..39a07d895043 100644
> >>>> --- a/mm/hmm.c
> >>>> +++ b/mm/hmm.c
> >>>> @@ -20,6 +20,7 @@
> >>>> #include <linux/pagemap.h>
> >>>> #include <linux/leafops.h>
> >>>> #include <linux/hugetlb.h>
> >>>> +#include <linux/migrate.h>
> >>>> #include <linux/memremap.h>
> >>>> #include <linux/sched/mm.h>
> >>>> #include <linux/jump_label.h>
> >>>> @@ -31,8 +32,12 @@
> >>>> #include "internal.h"
> >>>>
> >>>> struct hmm_vma_walk {
> >>>> - struct hmm_range *range;
> >>>> - unsigned long last;
> >>>> + struct mmu_notifier_range mmu_range;
> >>>> + struct vm_area_struct *vma;
> >>>> + struct hmm_range *range;
> >>>> + unsigned long start;
> >>>> + unsigned long end;
> >>>> + unsigned long last;
> >>>> };
> >>>>
> >>>> enum {
> >>>> @@ -41,21 +46,49 @@ enum {
> >>>> HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
> >>>> };
> >>>>
> >>>> -enum {
> >>>> - /* These flags are carried from input-to-output */
> >>>> - HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
> >>>> - HMM_PFN_P2PDMA_BUS,
> >>>> -};
> >>>> +static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
> >>>> +{
> >>>> + enum migrate_vma_info minfo;
> >>>> +
> >>>> + minfo = range->migrate ? range->migrate->flags : 0;
> >>>> + minfo |= (range->default_flags & HMM_PFN_REQ_MIGRATE) ?
> >>>> + MIGRATE_VMA_SELECT_SYSTEM : 0;
> >>> I'm trying to make sense of HMM_PFN_REQ_MIGRATE and why it sets
> >>> MIGRATE_VMA_SELECT_SYSTEM?
> >>>
> >>> Also the just the general usage - would range->migrate be NULL in the
> >>> expected usage.
> >>>
> >>> Maybe an example of how hmm_range_fault would be called with this flag
> >>> and subsequent expected migrate calls would clear this up.
> >> hmm_select_migrate() figures out the migration type (or none) that's going
> >> to happen based on the caller (of hmm_range_fault() or migrate_vma_setup()).
> >>
> >> HMM_PFN_REQ_MIGRATE is hmm_range_fault() user's way to say it wants
> >> migrate on fault (with default_flags, the whole range) and for that
> >> direction is system->device.
> > Can’t range->migrate->flags just set MIGRATE_VMA_SELECT_SYSTEM?
> >
> > As far as I can tell, HMM_PFN_REQ_MIGRATE doesn’t really do anything if
> > range->migrate is NULL or range->migrate->flags is 0.
>
> Yes currently HMM_PFN_REQ_MIGRATE requires range->migrate set but not
> range->migrate->flags. But yes, setting range->migrate->flags directly would
> work also.
>
> >
> >> migrate_vma_setup() users (the current device migration) express their
> >> migration intention with migrate_vma.flags (direction being which ever).
> >>
> >>>> +
> >>>> + return minfo;
> >>>> +}
> >>>>
> >>>> static int hmm_pfns_fill(unsigned long addr, unsigned long end,
> >>>> - struct hmm_range *range, unsigned long cpu_flags)
> >>>> + struct hmm_vma_walk *hmm_vma_walk, unsigned long cpu_flags)
> >>>> {
> >>>> + struct hmm_range *range = hmm_vma_walk->range;
> >>>> unsigned long i = (addr - range->start) >> PAGE_SHIFT;
> >>>> + enum migrate_vma_info minfo;
> >>>> + bool migrate = false;
> >>>> +
> >>>> + minfo = hmm_select_migrate(range);
> >>>> + if (cpu_flags != HMM_PFN_ERROR) {
> >>>> + if (minfo && (vma_is_anonymous(hmm_vma_walk->vma))) {
> >>>> + cpu_flags |= (HMM_PFN_VALID | HMM_PFN_MIGRATE);
> >>>> + migrate = true;
> >>>> + }
> >>>> + }
> >>>> +
> >>>> + if (migrate && thp_migration_supported() &&
> >>>> + (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
> >>>> + IS_ALIGNED(addr, HPAGE_PMD_SIZE) &&
> >>>> + IS_ALIGNED(end, HPAGE_PMD_SIZE)) {
> >>>> + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> >>>> + range->hmm_pfns[i] |= cpu_flags | HMM_PFN_COMPOUND;
> >>>> + addr += PAGE_SIZE;
> >>>> + i++;
> >>>> + cpu_flags = 0;
> >>>> + }
> >>>>
> >>>> for (; addr < end; addr += PAGE_SIZE, i++) {
> >>>> range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> >>>> range->hmm_pfns[i] |= cpu_flags;
> >>>> }
> >>>> +
> >>>> return 0;
> >>>> }
> >>>>
> >>>> @@ -171,11 +204,11 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
> >>>> if (!walk->vma) {
> >>>> if (required_fault)
> >>>> return -EFAULT;
> >>>> - return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
> >>>> + return hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
> >>>> }
> >>>> if (required_fault)
> >>>> return hmm_vma_fault(addr, end, required_fault, walk);
> >>>> - return hmm_pfns_fill(addr, end, range, 0);
> >>>> + return hmm_pfns_fill(addr, end, hmm_vma_walk, 0);
> >>>> }
> >>>>
> >>>> static inline unsigned long hmm_pfn_flags_order(unsigned long order)
> >>>> @@ -289,10 +322,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> >>>> goto fault;
> >>>>
> >>>> if (softleaf_is_migration(entry)) {
> >>>> - pte_unmap(ptep);
> >>>> - hmm_vma_walk->last = addr;
> >>>> - migration_entry_wait(walk->mm, pmdp, addr);
> >>>> - return -EBUSY;
> >>>> + if (!hmm_select_migrate(range)) {
> >>>> + pte_unmap(ptep);
> >>>> + hmm_vma_walk->last = addr;
> >>>> + migration_entry_wait(walk->mm, pmdp, addr);
> >>>> + return -EBUSY;
> >>>> + } else
> >>>> + goto out;
> >>>> }
> >>>>
> >>>> /* Report error for everything else */
> >>>> @@ -376,7 +412,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> >>>> return -EFAULT;
> >>>> }
> >>>>
> >>>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> >>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >>>> }
> >>>> #else
> >>>> static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> >>>> @@ -389,10 +425,448 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
> >>>>
> >>>> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> >>>> return -EFAULT;
> >>>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> >>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >>>> }
> >>>> #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> >>>>
> >>>> +/**
> >>>> + * migrate_vma_split_folio() - Helper function to split a THP folio
> >>>> + * @folio: the folio to split
> >>>> + * @fault_page: struct page associated with the fault if any
> >>>> + *
> >>>> + * Returns 0 on success
> >>>> + */
> >>>> +static int migrate_vma_split_folio(struct folio *folio,
> >>>> + struct page *fault_page)
> >>>> +{
> >>>> + int ret;
> >>>> + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
> >>>> + struct folio *new_fault_folio = NULL;
> >>>> +
> >>>> + if (folio != fault_folio) {
> >>>> + folio_get(folio);
> >>>> + folio_lock(folio);
> >>>> + }
> >>>> +
> >>>> + ret = split_folio(folio);
> >>>> + if (ret) {
> >>>> + if (folio != fault_folio) {
> >>>> + folio_unlock(folio);
> >>>> + folio_put(folio);
> >>>> + }
> >>>> + return ret;
> >>>> + }
> >>>> +
> >>>> + new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
> >>>> +
> >>>> + /*
> >>>> + * Ensure the lock is held on the correct
> >>>> + * folio after the split
> >>>> + */
> >>>> + if (!new_fault_folio) {
> >>>> + folio_unlock(folio);
> >>>> + folio_put(folio);
> >>>> + } else if (folio != new_fault_folio) {
> >>>> + if (new_fault_folio != fault_folio) {
> >>>> + folio_get(new_fault_folio);
> >>>> + folio_lock(new_fault_folio);
> >>>> + }
> >>>> + folio_unlock(folio);
> >>>> + folio_put(folio);
> >>>> + }
> >>>> +
> >>>> + return 0;
> >>>> +}
> >>>> +
> >>>> +static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
> >>>> + pmd_t *pmdp,
> >>>> + unsigned long start,
> >>>> + unsigned long end,
> >>>> + unsigned long *hmm_pfn)
> >>>> +{
> >>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >>>> + struct hmm_range *range = hmm_vma_walk->range;
> >>>> + struct migrate_vma *migrate = range->migrate;
> >>>> + struct mm_struct *mm = walk->vma->vm_mm;
> >>>> + struct folio *fault_folio = NULL;
> >>>> + struct folio *folio;
> >>>> + enum migrate_vma_info minfo;
> >>>> + spinlock_t *ptl;
> >>>> + unsigned long i;
> >>>> + int r = 0;
> >>>> +
> >>>> + minfo = hmm_select_migrate(range);
> >>>> + if (!minfo)
> >>>> + return r;
> >>>> +
> >>>> + fault_folio = (migrate && migrate->fault_page) ?
> >>>> + page_folio(migrate->fault_page) : NULL;
> >>>> +
> >>>> + ptl = pmd_lock(mm, pmdp);
> >>>> + if (pmd_none(*pmdp)) {
> >>>> + spin_unlock(ptl);
> >>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
> >>>> + }
> >>>> +
> >>>> + if (pmd_trans_huge(*pmdp)) {
> >>>> + if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
> >>>> + goto out;
> >>>> +
> >>>> + folio = pmd_folio(*pmdp);
> >>>> + if (is_huge_zero_folio(folio)) {
> >>>> + spin_unlock(ptl);
> >>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, 0);
> >>>> + }
> >>>> +
> >>>> + } else if (!pmd_present(*pmdp)) {
> >>>> + const softleaf_t entry = softleaf_from_pmd(*pmdp);
> >>>> +
> >>>> + folio = softleaf_to_folio(entry);
> >>>> +
> >>>> + if (!softleaf_is_device_private(entry))
> >>>> + goto out;
> >>>> +
> >>>> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
> >>>> + goto out;
> >>>> + if (folio->pgmap->owner != migrate->pgmap_owner)
> >>>> + goto out;
> >>>> +
> >>>> + } else {
> >>>> + spin_unlock(ptl);
> >>>> + return -EBUSY;
> >>>> + }
> >>>> +
> >>>> + folio_get(folio);
> >>>> +
> >>>> + if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
> >>>> + spin_unlock(ptl);
> >>>> + folio_put(folio);
> >>>> + return 0;
> >>>> + }
> >>>> +
> >>>> + if (thp_migration_supported() &&
> >>>> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
> >>>> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
> >>>> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
> >>>> +
> >>>> + struct page_vma_mapped_walk pvmw = {
> >>>> + .ptl = ptl,
> >>>> + .address = start,
> >>>> + .pmd = pmdp,
> >>>> + .vma = walk->vma,
> >>>> + };
> >>>> +
> >>>> + hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
> >>>> +
> >>>> + r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
> >>>> + if (r) {
> >>>> + hmm_pfn[0] &= ~(HMM_PFN_MIGRATE | HMM_PFN_COMPOUND);
> >>>> + r = -ENOENT; // fallback
> >>>> + goto unlock_out;
> >>>> + }
> >>>> + for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
> >>>> + hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
> >>>> +
> >>>> + } else {
> >>>> + r = -ENOENT; // fallback
> >>>> + goto unlock_out;
> >>>> + }
> >>>> +
> >>>> +
> >>>> +out:
> >>>> + spin_unlock(ptl);
> >>>> + return r;
> >>>> +
> >>>> +unlock_out:
> >>>> + if (folio != fault_folio)
> >>>> + folio_unlock(folio);
> >>>> + folio_put(folio);
> >>>> + goto out;
> >>>> +
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * Install migration entries if migration requested, either from fault
> >>>> + * or migrate paths.
> >>>> + *
> >>>> + */
> >>>> +static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> >>>> + pmd_t *pmdp,
> >>>> + unsigned long addr,
> >>>> + unsigned long *hmm_pfn)
> >>>> +{
> >>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >>>> + struct hmm_range *range = hmm_vma_walk->range;
> >>>> + struct migrate_vma *migrate = range->migrate;
> >>>> + struct mm_struct *mm = walk->vma->vm_mm;
> >>>> + struct folio *fault_folio = NULL;
> >>>> + enum migrate_vma_info minfo;
> >>>> + struct dev_pagemap *pgmap;
> >>>> + bool anon_exclusive;
> >>>> + struct folio *folio;
> >>>> + unsigned long pfn;
> >>>> + struct page *page;
> >>>> + softleaf_t entry;
> >>>> + pte_t pte, swp_pte;
> >>>> + spinlock_t *ptl;
> >>>> + bool writable = false;
> >>>> + pte_t *ptep;
> >>>> +
> >>>> + // Do we want to migrate at all?
> >>>> + minfo = hmm_select_migrate(range);
> >>>> + if (!minfo)
> >>>> + return 0;
> >>>> +
> >>>> + fault_folio = (migrate && migrate->fault_page) ?
> >>>> + page_folio(migrate->fault_page) : NULL;
> >>>> +
> >>>> +again:
> >>>> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> >>>> + if (!ptep)
> >>>> + return 0;
> >>>> +
> >>>> + pte = ptep_get(ptep);
> >>>> +
> >>>> + if (pte_none(pte)) {
> >>>> + // migrate without faulting case
> >>>> + if (vma_is_anonymous(walk->vma)) {
> >>>> + *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
> >>>> + *hmm_pfn |= HMM_PFN_MIGRATE | HMM_PFN_VALID;
> >>>> + goto out;
> >>>> + }
> >>>> + }
> >>>> +
> >>>> + if (!pte_present(pte)) {
> >>>> + /*
> >>>> + * Only care about unaddressable device page special
> >>>> + * page table entry. Other special swap entries are not
> >>>> + * migratable, and we ignore regular swapped page.
> >>>> + */
> >>>> + entry = softleaf_from_pte(pte);
> >>>> + if (!softleaf_is_device_private(entry))
> >>>> + goto out;
> >>>> +
> >>>> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
> >>>> + goto out;
> >>>> +
> >>>> + page = softleaf_to_page(entry);
> >>>> + folio = page_folio(page);
> >>>> + if (folio->pgmap->owner != migrate->pgmap_owner)
> >>>> + goto out;
> >>>> +
> >>>> + if (folio_test_large(folio)) {
> >>>> + int ret;
> >>>> +
> >>>> + pte_unmap_unlock(ptep, ptl);
> >>>> + ret = migrate_vma_split_folio(folio,
> >>>> + migrate->fault_page);
> >>>> + if (ret)
> >>>> + goto out_unlocked;
> >>>> + goto again;
> >>>> + }
> >>>> +
> >>>> + pfn = page_to_pfn(page);
> >>>> + if (softleaf_is_device_private_write(entry))
> >>>> + writable = true;
> >>>> + } else {
> >>>> + pfn = pte_pfn(pte);
> >>>> + if (is_zero_pfn(pfn) &&
> >>>> + (minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
> >>>> + *hmm_pfn = HMM_PFN_MIGRATE|HMM_PFN_VALID;
> >>>> + goto out;
> >>>> + }
> >>>> + page = vm_normal_page(walk->vma, addr, pte);
> >>>> + if (page && !is_zone_device_page(page) &&
> >>>> + !(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
> >>>> + goto out;
> >>>> + } else if (page && is_device_coherent_page(page)) {
> >>>> + pgmap = page_pgmap(page);
> >>>> +
> >>>> + if (!(minfo &
> >>>> + MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
> >>>> + pgmap->owner != migrate->pgmap_owner)
> >>>> + goto out;
> >>>> + }
> >>>> +
> >>>> + folio = page_folio(page);
> >>>> + if (folio_test_large(folio)) {
> >>>> + int ret;
> >>>> +
> >>>> + pte_unmap_unlock(ptep, ptl);
> >>>> + ret = migrate_vma_split_folio(folio,
> >>>> + migrate->fault_page);
> >>>> + if (ret)
> >>>> + goto out_unlocked;
> >>>> +
> >>>> + goto again;
> >>>> + }
> >>>> +
> >>>> + writable = pte_write(pte);
> >>>> + }
> >>>> +
> >>>> + if (!page || !page->mapping)
> >>>> + goto out;
> >>>> +
> >>>> + /*
> >>>> + * By getting a reference on the folio we pin it and that blocks
> >>>> + * any kind of migration. Side effect is that it "freezes" the
> >>>> + * pte.
> >>>> + *
> >>>> + * We drop this reference after isolating the folio from the lru
> >>>> + * for non device folio (device folio are not on the lru and thus
> >>>> + * can't be dropped from it).
> >>>> + */
> >>>> + folio = page_folio(page);
> >>>> + folio_get(folio);
> >>>> +
> >>>> + /*
> >>>> + * We rely on folio_trylock() to avoid deadlock between
> >>>> + * concurrent migrations where each is waiting on the others
> >>>> + * folio lock. If we can't immediately lock the folio we fail this
> >>>> + * migration as it is only best effort anyway.
> >>>> + *
> >>>> + * If we can lock the folio it's safe to set up a migration entry
> >>>> + * now. In the common case where the folio is mapped once in a
> >>>> + * single process setting up the migration entry now is an
> >>>> + * optimisation to avoid walking the rmap later with
> >>>> + * try_to_migrate().
> >>>> + */
> >>>> +
> >>>> + if (fault_folio == folio || folio_trylock(folio)) {
> >>>> + anon_exclusive = folio_test_anon(folio) &&
> >>>> + PageAnonExclusive(page);
> >>>> +
> >>>> + flush_cache_page(walk->vma, addr, pfn);
> >>>> +
> >>>> + if (anon_exclusive) {
> >>>> + pte = ptep_clear_flush(walk->vma, addr, ptep);
> >>>> +
> >>>> + if (folio_try_share_anon_rmap_pte(folio, page)) {
> >>>> + set_pte_at(mm, addr, ptep, pte);
> >>>> + folio_unlock(folio);
> >>>> + folio_put(folio);
> >>>> + goto out;
> >>>> + }
> >>>> + } else {
> >>>> + pte = ptep_get_and_clear(mm, addr, ptep);
> >>>> + }
> >>>> +
> >>>> + /* Setup special migration page table entry */
> >>>> + if (writable)
> >>>> + entry = make_writable_migration_entry(pfn);
> >>>> + else if (anon_exclusive)
> >>>> + entry = make_readable_exclusive_migration_entry(pfn);
> >>>> + else
> >>>> + entry = make_readable_migration_entry(pfn);
> >>>> +
> >>>> + swp_pte = swp_entry_to_pte(entry);
> >>>> + if (pte_present(pte)) {
> >>>> + if (pte_soft_dirty(pte))
> >>>> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> >>>> + if (pte_uffd_wp(pte))
> >>>> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> >>>> + } else {
> >>>> + if (pte_swp_soft_dirty(pte))
> >>>> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> >>>> + if (pte_swp_uffd_wp(pte))
> >>>> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> >>>> + }
> >>>> +
> >>>> + set_pte_at(mm, addr, ptep, swp_pte);
> >>>> + folio_remove_rmap_pte(folio, page, walk->vma);
> >>>> + folio_put(folio);
> >>>> + *hmm_pfn |= HMM_PFN_MIGRATE;
> >>>> +
> >>>> + if (pte_present(pte))
> >>>> + flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
> >>>> + } else
> >>>> + folio_put(folio);
> >>>> +out:
> >>>> + pte_unmap_unlock(ptep, ptl);
> >>>> + return 0;
> >>>> +out_unlocked:
> >>>> + return -1;
> >>>> +
> >>>> +}
> >>>> +
> >>>> +static int hmm_vma_walk_split(pmd_t *pmdp,
> >>>> + unsigned long addr,
> >>>> + struct mm_walk *walk)
> >>>> +{
> >>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >>>> + struct hmm_range *range = hmm_vma_walk->range;
> >>>> + struct migrate_vma *migrate = range->migrate;
> >>>> + struct folio *folio, *fault_folio;
> >>>> + spinlock_t *ptl;
> >>>> + int ret = 0;
> >>>> +
> >>>> + fault_folio = (migrate && migrate->fault_page) ?
> >>>> + page_folio(migrate->fault_page) : NULL;
> >>>> +
> >>>> + ptl = pmd_lock(walk->mm, pmdp);
> >>>> + if (unlikely(!pmd_trans_huge(*pmdp))) {
> >>>> + spin_unlock(ptl);
> >>>> + goto out;
> >>>> + }
> >>>> +
> >>>> + folio = pmd_folio(*pmdp);
> >>>> + if (is_huge_zero_folio(folio)) {
> >>>> + spin_unlock(ptl);
> >>>> + split_huge_pmd(walk->vma, pmdp, addr);
> >>>> + } else {
> >>>> + folio_get(folio);
> >>>> + spin_unlock(ptl);
> >>>> +
> >>>> + if (folio != fault_folio) {
> >>>> + if (unlikely(!folio_trylock(folio))) {
> >>>> + folio_put(folio);
> >>>> + ret = -EBUSY;
> >>>> + goto out;
> >>>> + }
> >>>> + } else
> >>>> + folio_put(folio);
> >>>> +
> >>>> + ret = split_folio(folio);
> >>>> + if (fault_folio != folio) {
> >>>> + folio_unlock(folio);
> >>>> + folio_put(folio);
> >>>> + }
> >>>> +
> >>>> + }
> >>>> +out:
> >>>> + return ret;
> >>>> +}
> >>>> +
> >>>> +static int hmm_vma_capture_migrate_range(unsigned long start,
> >>>> + unsigned long end,
> >>>> + struct mm_walk *walk)
> >>>> +{
> >>>> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> >>>> + struct hmm_range *range = hmm_vma_walk->range;
> >>>> +
> >>>> + if (!hmm_select_migrate(range))
> >>>> + return 0;
> >>>> +
> >>>> + if (hmm_vma_walk->vma && (hmm_vma_walk->vma != walk->vma))
> >>>> + return -ERANGE;
> >>>> +
> >>>> + hmm_vma_walk->vma = walk->vma;
> >>>> + hmm_vma_walk->start = start;
> >>>> + hmm_vma_walk->end = end;
> >>>> +
> >>>> + if (end - start > range->end - range->start)
> >>>> + return -ERANGE;
> >>>> +
> >>>> + if (!hmm_vma_walk->mmu_range.owner) {
> >>>> + mmu_notifier_range_init_owner(&hmm_vma_walk->mmu_range, MMU_NOTIFY_MIGRATE, 0,
> >>>> + walk->vma->vm_mm, start, end,
> >>>> + range->dev_private_owner);
> >>>> + mmu_notifier_invalidate_range_start(&hmm_vma_walk->mmu_range);
> >>>> + }
> >>>> +
> >>>> + return 0;
> >>>> +}
> >>>> +
> >>>> static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >>>> unsigned long start,
> >>>> unsigned long end,
> >>>> @@ -404,42 +878,90 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >>>> &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
> >>>> unsigned long npages = (end - start) >> PAGE_SHIFT;
> >>>> unsigned long addr = start;
> >>>> + enum migrate_vma_info minfo;
> >>>> + unsigned long i;
> >>>> + spinlock_t *ptl;
> >>>> pte_t *ptep;
> >>>> pmd_t pmd;
> >>>> + int r;
> >>>>
> >>>> + minfo = hmm_select_migrate(range);
> >>>> again:
> >>>> +
> >>>> pmd = pmdp_get_lockless(pmdp);
> >>>> - if (pmd_none(pmd))
> >>>> - return hmm_vma_walk_hole(start, end, -1, walk);
> >>>> + if (pmd_none(pmd)) {
> >>>> + r = hmm_vma_walk_hole(start, end, -1, walk);
> >>>> + if (r || !minfo)
> >>>> + return r;
> >>>> +
> >>>> + ptl = pmd_lock(walk->mm, pmdp);
> >>>> + if (pmd_none(*pmdp)) {
> >>>> + // hmm_vma_walk_hole() filled migration needs
> >>>> + spin_unlock(ptl);
> >>>> + return r;
> >>>> + }
> >>>> + spin_unlock(ptl);
> >>>> + }
> >>>>
> >>>> if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
> >>>> - if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
> >>>> - hmm_vma_walk->last = addr;
> >>>> - pmd_migration_entry_wait(walk->mm, pmdp);
> >>>> - return -EBUSY;
> >>>> + if (!minfo) {
> >>>> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
> >>>> + hmm_vma_walk->last = addr;
> >>>> + pmd_migration_entry_wait(walk->mm, pmdp);
> >>>> + return -EBUSY;
> >>>> + }
> >>>> }
> >>>> - return hmm_pfns_fill(start, end, range, 0);
> >>>> + for (i = 0; addr < end; addr += PAGE_SIZE, i++)
> >>>> + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
> >>>> +
> >>>> + return 0;
> >>>> }
> >>>>
> >>>> - if (!pmd_present(pmd))
> >>>> - return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
> >>>> - pmd);
> >>>> + if (pmd_trans_huge(pmd) || !pmd_present(pmd)) {
> >>>> +
> >>>> + if (!pmd_present(pmd)) {
> >>>> + r = hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
> >>>> + pmd);
> >>>> + if (r || !minfo)
> >>>> + return r;
> >>>> + } else {
> >>>> +
> >>>> + /*
> >>>> + * No need to take pmd_lock here, even if some other thread
> >>>> + * is splitting the huge pmd we will get that event through
> >>>> + * mmu_notifier callback.
> >>>> + *
> >>>> + * So just read pmd value and check again it's a transparent
> >>>> + * huge or device mapping one and compute corresponding pfn
> >>>> + * values.
> >>>> + */
> >>>> +
> >>>> + pmd = pmdp_get_lockless(pmdp);
> >>>> + if (!pmd_trans_huge(pmd))
> >>>> + goto again;
> >>>> +
> >>>> + r = hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
> >>>> +
> >>>> + if (r || !minfo)
> >>>> + return r;
> >>>> + }
> >>>>
> >>>> - if (pmd_trans_huge(pmd)) {
> >>>> - /*
> >>>> - * No need to take pmd_lock here, even if some other thread
> >>>> - * is splitting the huge pmd we will get that event through
> >>>> - * mmu_notifier callback.
> >>>> - *
> >>>> - * So just read pmd value and check again it's a transparent
> >>>> - * huge or device mapping one and compute corresponding pfn
> >>>> - * values.
> >>>> - */
> >>>> - pmd = pmdp_get_lockless(pmdp);
> >>>> - if (!pmd_trans_huge(pmd))
> >>>> - goto again;
> >>>> + r = hmm_vma_handle_migrate_prepare_pmd(walk, pmdp, start, end, hmm_pfns);
> >>>> +
> >>>> + if (r == -ENOENT) {
> >>>> + r = hmm_vma_walk_split(pmdp, addr, walk);
> >>>> + if (r) {
> >>>> + /* Split not successful, skip */
> >>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >>>> + }
> >>>> +
> >>>> + /* Split successful or "again", reloop */
> >>>> + hmm_vma_walk->last = addr;
> >>>> + return -EBUSY;
> >>>> + }
> >>>> +
> >>>> + return r;
> >>>>
> >>>> - return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
> >>>> }
> >>>>
> >>>> /*
> >>>> @@ -451,22 +973,26 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >>>> if (pmd_bad(pmd)) {
> >>>> if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> >>>> return -EFAULT;
> >>>> - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> >>>> + return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> >>>> }
> >>>>
> >>>> ptep = pte_offset_map(pmdp, addr);
> >>>> if (!ptep)
> >>>> goto again;
> >>>> for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
> >>>> - int r;
> >>>>
> >>>> r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
> >>>> if (r) {
> >>>> /* hmm_vma_handle_pte() did pte_unmap() */
> >>>> return r;
> >>>> }
> >>>> +
> >>>> + r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
> >>>> + if (r)
> >>>> + break;
> >>>> }
> >>>> pte_unmap(ptep - 1);
> >>>> +
> >>>> return 0;
> >>>> }
> >>>>
On 1/18/26 06:02, Matthew Brost wrote:
> On Sat, Jan 17, 2026 at 09:07:36AM +0200, Mika Penttilä wrote:
>> On 1/16/26 22:05, Matthew Brost wrote:
>>
>>> On Fri, Jan 16, 2026 at 12:39:30PM +0200, Mika Penttilä wrote:
>>>> Hi,
>>>>
>>>> On 1/16/26 04:06, Matthew Brost wrote:
>>>>
>>>>> On Wed, Jan 14, 2026 at 11:19:21AM +0200, mpenttil@redhat.com wrote:
>>>>>> From: Mika Penttilä <mpenttil@redhat.com>
>>>>>>
>>>>>> Currently, the way device page faulting and migration works
>>>>>> is not optimal, if you want to do both fault handling and
>>>>>> migration at once.
>>>>>>
>>>>>> Being able to migrate not present pages (or pages mapped with incorrect
>>>>>> permissions, eg. COW) to the GPU requires doing either of the
>>>>>> following sequences:
>>>>>>
>>>>>> 1. hmm_range_fault() - fault in non-present pages with correct permissions, etc.
>>>>>> 2. migrate_vma_*() - migrate the pages
>>>>>>
>>>>>> Or:
>>>>>>
>>>>>> 1. migrate_vma_*() - migrate present pages
>>>>>> 2. If non-present pages detected by migrate_vma_*():
>>>>>> a) call hmm_range_fault() to fault pages in
>>>>>> b) call migrate_vma_*() again to migrate now present pages
>>>>>>
>>>>>> The problem with the first sequence is that you always have to do two
>>>>>> page walks even when most of the time the pages are present or zero page
>>>>>> mappings so the common case takes a performance hit.
>>>>>>
>>>>>> The second sequence is better for the common case, but far worse if
>>>>>> pages aren't present because now you have to walk the page tables three
>>>>>> times (once to find the page is not present, once so hmm_range_fault()
>>>>>> can find a non-present page to fault in and once again to setup the
>>>>>> migration). It is also tricky to code correctly.
>>>>>>
>>>>>> We should be able to walk the page table once, faulting
>>>>>> pages in as required and replacing them with migration entries if
>>>>>> requested.
>>>>>>
>>>>>> Add a new flag to HMM APIs, HMM_PFN_REQ_MIGRATE,
>>>>>> which tells to prepare for migration also during fault handling.
>>>>>> Also, for the migrate_vma_setup() call paths, a flags, MIGRATE_VMA_FAULT,
>>>>>> is added to tell to add fault handling to migrate.
>>>>>>
>>>>>> Cc: David Hildenbrand <david@redhat.com>
>>>>>> Cc: Jason Gunthorpe <jgg@nvidia.com>
>>>>>> Cc: Leon Romanovsky <leonro@nvidia.com>
>>>>>> Cc: Alistair Popple <apopple@nvidia.com>
>>>>>> Cc: Balbir Singh <balbirs@nvidia.com>
>>>>>> Cc: Zi Yan <ziy@nvidia.com>
>>>>>> Cc: Matthew Brost <matthew.brost@intel.com>
>>>>> I'll try to test this when I can but horribly behind at the moment.
>>>>>
>>>>> You can use Intel's CI system to test SVM too. I can get you authorized
>>>>> to use this. The list to trigger is intel-xe@lists.freedesktop.org and
>>>>> patches must apply to drm-tip. I'll let you know when you are
>>>>> authorized.
>>>> Thanks, appreciate, will do that also!
>>>>
>>> Working on enabling this for you in CI.
>>>
>>> I did a quick test by running our complete test suite and got a kernel
>>> hang in this section:
>>>
>>> xe_exec_system_allocator.threads-shared-vm-many-stride-malloc-prefetch
>>>
>>> Stack trace:
>>>
>>> [ 182.915763] INFO: task xe_exec_system_:5357 blocked for more than 30 seconds.
>>> [ 182.922866] Tainted: G U W 6.19.0-rc4-xe+ #2549
>>> [ 182.929183] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>> [ 182.936912] task:xe_exec_system_ state:D stack:0 pid:5357 tgid:1862 ppid:1853 task_flags:0x400040 flags:0x00080000
>>> [ 182.936916] Call Trace:
>>> [ 182.936918] <TASK>
>>> [ 182.936919] __schedule+0x4df/0xc20
>>> [ 182.936924] schedule+0x22/0xd0
>>> [ 182.936925] io_schedule+0x41/0x60
>>> [ 182.936926] migration_entry_wait_on_locked+0x21c/0x2a0
>>> [ 182.936929] ? __pfx_wake_page_function+0x10/0x10
>>> [ 182.936931] migration_entry_wait+0xad/0xf0
>>> [ 182.936933] hmm_vma_walk_pmd+0xd5f/0x19b0
>>> [ 182.936935] walk_pgd_range+0x51d/0xa60
>>> [ 182.936938] __walk_page_range+0x75/0x1e0
>>> [ 182.936940] walk_page_range_mm_unsafe+0x138/0x1f0
>>> [ 182.936941] hmm_range_fault+0x8f/0x160
>>> [ 182.936945] drm_gpusvm_get_pages+0x1ae/0x8a0 [drm_gpusvm_helper]
>>> [ 182.936949] drm_gpusvm_range_get_pages+0x2d/0x40 [drm_gpusvm_helper]
>>> [ 182.936951] xe_svm_range_get_pages+0x1b/0x50 [xe]
>>> [ 182.936979] xe_vm_bind_ioctl+0x15c3/0x17e0 [xe]
>>> [ 182.937001] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
>>> [ 182.937021] ? drm_ioctl_kernel+0xa3/0x100
>>> [ 182.937024] drm_ioctl_kernel+0xa3/0x100
>>> [ 182.937026] drm_ioctl+0x213/0x440
>>> [ 182.937028] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
>>> [ 182.937061] xe_drm_ioctl+0x5a/0xa0 [xe]
>>> [ 182.937083] __x64_sys_ioctl+0x7f/0xd0
>>> [ 182.937085] do_syscall_64+0x50/0x290
>>> [ 182.937088] entry_SYSCALL_64_after_hwframe+0x76/0x7e
>>> [ 182.937091] RIP: 0033:0x7ff00f724ded
>>> [ 182.937092] RSP: 002b:00007ff00b9fa640 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
>>> [ 182.937094] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007ff00f724ded
>>> [ 182.937095] RDX: 00007ff00b9fa6d0 RSI: 0000000040886445 RDI: 0000000000000003
>>> [ 182.937096] RBP: 00007ff00b9fa690 R08: 0000000000000000 R09: 0000000000000000
>>> [ 182.937097] R10: 0000000000000001 R11: 0000000000000246 R12: 00007ff00b9fa6d0
>>> [ 182.937098] R13: 0000000040886445 R14: 0000000000000003 R15: 00007ff00f8a9000
>>> [ 182.937099] </TASK>
>>>
>>> This section is a racy test with parallel CPU and device access that is
>>> likely causing the migration process to abort and retry. From the stack
>>> trace, it looks like a migration PMD didn’t get properly removed, and a
>>> subsequent call to hmm_range_fault hangs on a migration entry that was
>>> not removed during the migration abort.
>>>
>>> IIRC, some of the last bits in Balbir’s large device pages series had a
>>> similar bug, which I sent to Andrew with fixup patches. I suspect you
>>> have a similar bug. If I can find the time, I’ll see if I can track it
>>> down.
>> Thanks for your efforts Matthew!
>>
> Happy to help.
>
>> I remember those discussions, and this looks similar. If I recall correctly,
>> it was about failed splits and how to continue after them. The code bails out
>> of the pmd loop in that case, but maybe this part is missing; it would be the
>> equivalent of the skip in the collect path:
>>
>> diff --git a/mm/hmm.c b/mm/hmm.c
>> index 39a07d895043..c7a7bb923a37 100644
>> --- a/mm/hmm.c
>> +++ b/mm/hmm.c
>> @@ -988,8 +988,12 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>> }
>>
>> r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
>> - if (r)
>> + if (r) {
>> + /* Split has failed, skip to end. */
>> + for (i = 0; addr < end; addr += PAGE_SIZE, i++)
>> + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
>> break;
>> + }
>> }
>> pte_unmap(ptep - 1);
>>
>> With that it should be quite similar to the current flow (in its effects).
>>
> I think the above code is conceptually correct and needed, but I don’t
> think it is the actual problem.
>
> I traced the issue to the handle and prepare functions racing with other
> migrations. The handle function doesn’t find a valid PTE/PMD and
> populates the HMM PFN with zero. The prepare function then finds a valid
> PTE/PMD and installs a migration entry. My code aborts the migration
> because we only migrate if all pages in the requested range are found.
> migrate_vma_pages/migrate_vma_finalize are called, but since some PFNs
> are zero, the migration entries installed during the prepare step are
> not removed.
>
> This is actually very easy to reproduce on my end if I disable compound
> page collection in my migrations and then run a test that migrates 2M
> pages with racing CPU/GPU access. I suspect we could write a selftest to
> replicate this.
>
> I have a hacky patch that fixes this — among other things I spotted
> while looking at this — but I don’t think that’s the right approach. I
> believe the right approach is to unify the handle and prepare functions
> into a single one. If you look at what these two functions do, they are
> actually quite similar; e.g., both inspect PTE/PMD state to make
> decisions. Of course, if you want a migration you need the PTE/PMD
> locks, and those would need to be dropped if you want to do something
> like fault in a page. This still seems doable. I’ll let you and the HMM
> maintainers duke this out regarding whether this is an acceptable
> approach :).
>
> FWIW, here are the changes I had to make to get my code stable. Tested
> with compound page collection both on and off.
Thanks a lot, you spotted the root cause! It seems the issue is that,
when migrating, you have to set both HMM_PFN_VALID and HMM_PFN_MIGRATE
while holding the pmd/pte lock (without releasing it in between).
My original idea with the separate handle and prepare steps was that
handle does the faulting and pfn population, i.e. the step that
establishes HMM_PFN_VALID. Prepare then decides about migration,
possibly adding HMM_PFN_MIGRATE.
Currently this isn't race free. One way to fix it is what you do, but I
admit that brings some redundancy between the paths.
I think keeping those two functions separate is good for manageability,
instead of scattering migrate-related fragments across many places. The
migration part brings quite a lot of complexity with the splitting and
so on, and we would likely end up encapsulating some of it in its own
functions anyway.
I am experimenting with modified locking: while migrating, we take the
pte/pmd lock for the walk (and unlock only when handling faults), so
that handle and prepare happen logically within the same locking
region. Let's see how that turns out; I think it should solve the
issues seen here as well.
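Roughly, the shape I am after is something like this (sketch only, not
tested; hmm_pte_needs_fault() and hmm_install_migration_pte() are
made-up placeholder names, not functions from this series):

again:
	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		return 0;
	for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfn++) {
		pte_t pte = ptep_get(ptep);

		if (hmm_pte_needs_fault(pte)) {
			/* Drop the lock only for the fault, then restart. */
			pte_unmap_unlock(ptep, ptl);
			r = hmm_vma_fault(addr, end, required_fault, walk);
			if (r)
				return r;
			goto again;
		}

		/*
		 * Still under ptl: populate the pfn (HMM_PFN_VALID) and, if
		 * migration was requested, install the migration entry and
		 * set HMM_PFN_MIGRATE within the same critical section.
		 */
		*hmm_pfn = pte_pfn(pte) | HMM_PFN_VALID;
		if (minfo)
			hmm_install_migration_pte(walk, pmdp, addr, ptep, hmm_pfn);
	}
	pte_unmap_unlock(ptep - 1, ptl);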
Will send a v2 with this and the other fixes so far.
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 39a07d895043..5e24cd82b393 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -509,8 +509,10 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
> }
>
> if (pmd_trans_huge(*pmdp)) {
> - if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
> + if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> goto out;
> + }
>
> folio = pmd_folio(*pmdp);
> if (is_huge_zero_folio(folio)) {
> @@ -523,16 +525,22 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
>
> folio = softleaf_to_folio(entry);
>
> - if (!softleaf_is_device_private(entry))
> - goto out;
> -
> - if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
> + if (!softleaf_is_device_private(entry)) {
> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> goto out;
> - if (folio->pgmap->owner != migrate->pgmap_owner)
> + }
> + if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE)) {
> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> goto out;
> + }
>
> + if (folio->pgmap->owner != migrate->pgmap_owner) {
> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> + goto out;
> + }
> } else {
> spin_unlock(ptl);
> + hmm_vma_walk->last = start;
> return -EBUSY;
> }
>
> @@ -541,6 +549,7 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
> if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
> spin_unlock(ptl);
> folio_put(folio);
> + hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
> return 0;
> }
>
> @@ -555,8 +564,10 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
> .pmd = pmdp,
> .vma = walk->vma,
> };
> + unsigned long pfn = page_to_pfn(folio_page(folio, 0));
>
> - hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND;
> + hmm_pfn[0] |= HMM_PFN_MIGRATE | HMM_PFN_COMPOUND |
> + HMM_PFN_VALID;
>
> r = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
> if (r) {
> @@ -564,6 +575,8 @@ static int hmm_vma_handle_migrate_prepare_pmd(const struct mm_walk *walk,
> r = -ENOENT; // fallback
> goto unlock_out;
> }
> + hmm_pfn[0] &= HMM_PFN_FLAGS;
> + hmm_pfn[0] |= pfn;
> for (i = 1, start += PAGE_SIZE; start < end; start += PAGE_SIZE, i++)
> hmm_pfn[i] &= HMM_PFN_INOUT_FLAGS;
>
> @@ -604,7 +617,7 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> struct dev_pagemap *pgmap;
> bool anon_exclusive;
> struct folio *folio;
> - unsigned long pfn;
> + unsigned long pfn = 0;
> struct page *page;
> softleaf_t entry;
> pte_t pte, swp_pte;
> @@ -688,8 +701,8 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> goto out;
> }
>
> - folio = page_folio(page);
> - if (folio_test_large(folio)) {
> + folio = page ? page_folio(page) : NULL;
> + if (folio && folio_test_large(folio)) {
> int ret;
>
> pte_unmap_unlock(ptep, ptl);
> @@ -745,12 +758,15 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> set_pte_at(mm, addr, ptep, pte);
> folio_unlock(folio);
> folio_put(folio);
> + *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
> goto out;
> }
> } else {
> pte = ptep_get_and_clear(mm, addr, ptep);
> }
>
> + /* XXX: Migrate layer calls folio_mark_dirty if pte_dirty */
> +
> /* Setup special migration page table entry */
> if (writable)
> entry = make_writable_migration_entry(pfn);
> @@ -759,6 +775,8 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> else
> entry = make_readable_migration_entry(pfn);
>
> + /* XXX: Migrate layer makes entry young / dirty based on PTE */
> +
> swp_pte = swp_entry_to_pte(entry);
> if (pte_present(pte)) {
> if (pte_soft_dirty(pte))
> @@ -775,8 +793,18 @@ static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
> set_pte_at(mm, addr, ptep, swp_pte);
> folio_remove_rmap_pte(folio, page, walk->vma);
> folio_put(folio);
> - *hmm_pfn |= HMM_PFN_MIGRATE;
> + /*
> + * XXX: It is possible the PTE wasn't present in the first part
> + * of the HHM walk, repopulate it...
> + */
> + *hmm_pfn &= HMM_PFN_FLAGS;
> + *hmm_pfn |= HMM_PFN_MIGRATE | pfn | HMM_PFN_VALID |
> + (writable ? HMM_PFN_WRITE : 0);
>
> + /*
> + * XXX: Is there a perf impact of calling flush_tlb_range on
> + * each PTE vs. range like migrate_vma layer?
> + */
> if (pte_present(pte))
> flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
> } else
> @@ -988,8 +1016,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> }
>
> r = hmm_vma_handle_migrate_prepare(walk, pmdp, addr, hmm_pfns);
> - if (r)
> + if (r) {
> + hmm_pfns_fill(addr, end, hmm_vma_walk, HMM_PFN_ERROR);
> break;
> + }
> }
> pte_unmap(ptep - 1);
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index c8f5a0615a5e..58d42d3ab673 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -268,6 +268,12 @@ int migrate_vma_setup(struct migrate_vma *args)
>
> migrate_hmm_range_setup(&range);
>
> + /* Remove migration PTEs */
> + if (ret) {
> + migrate_vma_pages(args);
> + migrate_vma_finalize(args);
> + }
> +
> /*
> * At this point pages are locked and unmapped, and thus they have
> * stable content and can safely be copied to destination memory that
>
> Matt
--Mika
Hi,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-nonmm-unstable]
[also build test WARNING on linus/master v6.19-rc5 next-20260115]
[cannot apply to akpm-mm/mm-everything]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/mpenttil-redhat-com/mm-unified-hmm-fault-and-migrate-device-pagewalk-paths/20260114-172232
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-nonmm-unstable
patch link: https://lore.kernel.org/r/20260114091923.3950465-2-mpenttil%40redhat.com
patch subject: [PATCH 1/3] mm: unified hmm fault and migrate device pagewalk paths
config: x86_64-randconfig-123-20260115 (https://download.01.org/0day-ci/archive/20260116/202601160405.2XZZtwqw-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260116/202601160405.2XZZtwqw-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601160405.2XZZtwqw-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
mm/migrate_device.c:179:25: sparse: sparse: context imbalance in 'migrate_vma_collect_huge_pmd' - unexpected unlock
mm/migrate_device.c:262:27: sparse: sparse: context imbalance in 'migrate_vma_collect_pmd' - different lock contexts for basic block
>> mm/migrate_device.c:743:18: sparse: sparse: Initializer entry defined twice
mm/migrate_device.c:746:18: sparse: also defined here
mm/migrate_device.c:915:16: sparse: sparse: context imbalance in 'migrate_vma_insert_huge_pmd_page' - different lock contexts for basic block
vim +743 mm/migrate_device.c
670
671 /**
672 * migrate_vma_setup() - prepare to migrate a range of memory
673 * @args: contains the vma, start, and pfns arrays for the migration
674 *
675 * Returns: negative errno on failures, 0 when 0 or more pages were migrated
676 * without an error.
677 *
678 * Prepare to migrate a range of memory virtual address range by collecting all
679 * the pages backing each virtual address in the range, saving them inside the
680 * src array. Then lock those pages and unmap them. Once the pages are locked
681 * and unmapped, check whether each page is pinned or not. Pages that aren't
682 * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
683 * corresponding src array entry. Then restores any pages that are pinned, by
684 * remapping and unlocking those pages.
685 *
686 * The caller should then allocate destination memory and copy source memory to
687 * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
688 * flag set). Once these are allocated and copied, the caller must update each
689 * corresponding entry in the dst array with the pfn value of the destination
690 * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
691 * lock_page().
692 *
693 * Note that the caller does not have to migrate all the pages that are marked
694 * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
695 * device memory to system memory. If the caller cannot migrate a device page
696 * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
697 * consequences for the userspace process, so it must be avoided if at all
698 * possible.
699 *
700 * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
701 * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
702 * allowing the caller to allocate device memory for those unbacked virtual
703 * addresses. For this the caller simply has to allocate device memory and
704 * properly set the destination entry like for regular migration. Note that
705 * this can still fail, and thus inside the device driver you must check if the
706 * migration was successful for those entries after calling migrate_vma_pages(),
707 * just like for regular migration.
708 *
709 * After that, the callers must call migrate_vma_pages() to go over each entry
710 * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
711 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
712 * then migrate_vma_pages() to migrate struct page information from the source
713 * struct page to the destination struct page. If it fails to migrate the
714 * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
715 * src array.
716 *
717 * At this point all successfully migrated pages have an entry in the src
718 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
719 * array entry with MIGRATE_PFN_VALID flag set.
720 *
721 * Once migrate_vma_pages() returns the caller may inspect which pages were
722 * successfully migrated, and which were not. Successfully migrated pages will
723 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
724 *
725 * It is safe to update device page table after migrate_vma_pages() because
726 * both destination and source page are still locked, and the mmap_lock is held
727 * in read mode (hence no one can unmap the range being migrated).
728 *
729 * Once the caller is done cleaning up things and updating its page table (if it
730 * chose to do so, this is not an obligation) it finally calls
731 * migrate_vma_finalize() to update the CPU page table to point to new pages
732 * for successfully migrated pages or otherwise restore the CPU page table to
733 * point to the original source pages.
734 */
735 int migrate_vma_setup(struct migrate_vma *args)
736 {
737 int ret;
738 long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
739 struct hmm_range range = {
740 .notifier = NULL,
741 .start = args->start,
742 .end = args->end,
> 743 .migrate = args,
744 .hmm_pfns = args->src,
745 .dev_private_owner = args->pgmap_owner,
746 .migrate = args
747 };
748
749 args->start &= PAGE_MASK;
750 args->end &= PAGE_MASK;
751 if (!args->vma || is_vm_hugetlb_page(args->vma) ||
752 (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
753 return -EINVAL;
754 if (nr_pages <= 0)
755 return -EINVAL;
756 if (args->start < args->vma->vm_start ||
757 args->start >= args->vma->vm_end)
758 return -EINVAL;
759 if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
760 return -EINVAL;
761 if (!args->src || !args->dst)
762 return -EINVAL;
763 if (args->fault_page && !is_device_private_page(args->fault_page))
764 return -EINVAL;
765 if (args->fault_page && !PageLocked(args->fault_page))
766 return -EINVAL;
767
768 memset(args->src, 0, sizeof(*args->src) * nr_pages);
769 args->cpages = 0;
770 args->npages = 0;
771
772 if (args->flags & MIGRATE_VMA_FAULT)
773 range.default_flags |= HMM_PFN_REQ_FAULT;
774
775 ret = hmm_range_fault(&range);
776
777 migrate_hmm_range_setup(&range);
778
779 /*
780 * At this point pages are locked and unmapped, and thus they have
781 * stable content and can safely be copied to destination memory that
782 * is allocated by the drivers.
783 */
784 return ret;
785
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
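For what it is worth, the "Initializer entry defined twice" warning is
the duplicated .migrate = args in the quoted migrate_vma_setup() hunk;
presumably dropping one of the two assignments is all that is needed,
e.g.:

	struct hmm_range range = {
		.notifier = NULL,
		.start = args->start,
		.end = args->end,
		.hmm_pfns = args->src,
		.dev_private_owner = args->pgmap_owner,
		.migrate = args,
	};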
Hi,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-nonmm-unstable]
[also build test ERROR on linus/master v6.19-rc5 next-20260114]
[cannot apply to akpm-mm/mm-everything]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/mpenttil-redhat-com/mm-unified-hmm-fault-and-migrate-device-pagewalk-paths/20260114-172232
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-nonmm-unstable
patch link: https://lore.kernel.org/r/20260114091923.3950465-2-mpenttil%40redhat.com
patch subject: [PATCH 1/3] mm: unified hmm fault and migrate device pagewalk paths
config: hexagon-randconfig-r063-20260114 (https://download.01.org/0day-ci/archive/20260115/202601150107.djcXDbUn-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 9b8addffa70cee5b2acc5454712d9cf78ce45710)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260115/202601150107.djcXDbUn-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601150107.djcXDbUn-lkp@intel.com/
All errors (new ones prefixed by >>):
>> mm/hmm.c:49:30: error: incomplete result type 'enum migrate_vma_info' in function definition
49 | static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
| ^
mm/hmm.c:49:13: note: forward declaration of 'enum migrate_vma_info'
49 | static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
| ^
>> mm/hmm.c:51:24: error: variable has incomplete type 'enum migrate_vma_info'
51 | enum migrate_vma_info minfo;
| ^
mm/hmm.c:49:13: note: forward declaration of 'enum migrate_vma_info'
49 | static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
| ^
>> mm/hmm.c:53:41: error: incomplete definition of type 'struct migrate_vma'
53 | minfo = range->migrate ? range->migrate->flags : 0;
| ~~~~~~~~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
>> mm/hmm.c:55:3: error: use of undeclared identifier 'MIGRATE_VMA_SELECT_SYSTEM'
55 | MIGRATE_VMA_SELECT_SYSTEM : 0;
| ^~~~~~~~~~~~~~~~~~~~~~~~~
mm/hmm.c:65:24: error: variable has incomplete type 'enum migrate_vma_info'
65 | enum migrate_vma_info minfo;
| ^
mm/hmm.c:49:13: note: forward declaration of 'enum migrate_vma_info'
49 | static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
| ^
>> mm/hmm.c:77:15: error: use of undeclared identifier 'MIGRATE_VMA_SELECT_COMPOUND'
77 | (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/hmm.c:423:20: warning: unused variable 'range' [-Wunused-variable]
423 | struct hmm_range *range = hmm_vma_walk->range;
| ^~~~~
mm/hmm.c:493:24: error: variable has incomplete type 'enum migrate_vma_info'
493 | enum migrate_vma_info minfo;
| ^
mm/hmm.c:49:13: note: forward declaration of 'enum migrate_vma_info'
49 | static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
| ^
mm/hmm.c:502:35: error: incomplete definition of type 'struct migrate_vma'
502 | fault_folio = (migrate && migrate->fault_page) ?
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
mm/hmm.c:503:21: error: incomplete definition of type 'struct migrate_vma'
503 | page_folio(migrate->fault_page) : NULL;
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
mm/hmm.c:503:21: error: incomplete definition of type 'struct migrate_vma'
503 | page_folio(migrate->fault_page) : NULL;
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
mm/hmm.c:503:21: error: incomplete definition of type 'struct migrate_vma'
503 | page_folio(migrate->fault_page) : NULL;
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
mm/hmm.c:512:17: error: use of undeclared identifier 'MIGRATE_VMA_SELECT_SYSTEM'
512 | if (!(minfo & MIGRATE_VMA_SELECT_SYSTEM))
| ^~~~~~~~~~~~~~~~~~~~~~~~~
>> mm/hmm.c:529:17: error: use of undeclared identifier 'MIGRATE_VMA_SELECT_DEVICE_PRIVATE'
529 | if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/hmm.c:531:37: error: incomplete definition of type 'struct migrate_vma'
531 | if (folio->pgmap->owner != migrate->pgmap_owner)
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
mm/hmm.c:548:14: error: incomplete definition of type 'struct migrate_vma'
548 | (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
mm/hmm.c:548:24: error: use of undeclared identifier 'MIGRATE_VMA_SELECT_COMPOUND'
548 | (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/hmm.c:603:24: error: variable has incomplete type 'enum migrate_vma_info'
603 | enum migrate_vma_info minfo;
| ^
mm/hmm.c:49:13: note: forward declaration of 'enum migrate_vma_info'
49 | static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
| ^
mm/hmm.c:620:35: error: incomplete definition of type 'struct migrate_vma'
620 | fault_folio = (migrate && migrate->fault_page) ?
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
mm/hmm.c:621:21: error: incomplete definition of type 'struct migrate_vma'
621 | page_folio(migrate->fault_page) : NULL;
| ~~~~~~~^
include/linux/hmm.h:15:8: note: forward declaration of 'struct migrate_vma'
15 | struct migrate_vma;
| ^
fatal error: too many errors emitted, stopping now [-ferror-limit=]
1 warning and 20 errors generated.
vim +49 mm/hmm.c
48
> 49 static enum migrate_vma_info hmm_select_migrate(struct hmm_range *range)
50 {
> 51 enum migrate_vma_info minfo;
52
> 53 minfo = range->migrate ? range->migrate->flags : 0;
54 minfo |= (range->default_flags & HMM_PFN_REQ_MIGRATE) ?
> 55 MIGRATE_VMA_SELECT_SYSTEM : 0;
56
57 return minfo;
58 }
59
60 static int hmm_pfns_fill(unsigned long addr, unsigned long end,
61 struct hmm_vma_walk *hmm_vma_walk, unsigned long cpu_flags)
62 {
63 struct hmm_range *range = hmm_vma_walk->range;
64 unsigned long i = (addr - range->start) >> PAGE_SHIFT;
65 enum migrate_vma_info minfo;
66 bool migrate = false;
67
68 minfo = hmm_select_migrate(range);
69 if (cpu_flags != HMM_PFN_ERROR) {
70 if (minfo && (vma_is_anonymous(hmm_vma_walk->vma))) {
71 cpu_flags |= (HMM_PFN_VALID | HMM_PFN_MIGRATE);
72 migrate = true;
73 }
74 }
75
76 if (migrate && thp_migration_supported() &&
> 77 (minfo & MIGRATE_VMA_SELECT_COMPOUND) &&
78 IS_ALIGNED(addr, HPAGE_PMD_SIZE) &&
79 IS_ALIGNED(end, HPAGE_PMD_SIZE)) {
80 range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
81 range->hmm_pfns[i] |= cpu_flags | HMM_PFN_COMPOUND;
82 addr += PAGE_SIZE;
83 i++;
84 cpu_flags = 0;
85 }
86
87 for (; addr < end; addr += PAGE_SIZE, i++) {
88 range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
89 range->hmm_pfns[i] |= cpu_flags;
90 }
91
92 return 0;
93 }
94
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
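All of the errors above stem from mm/hmm.c referencing enum
migrate_vma_info, struct migrate_vma members and the
MIGRATE_VMA_SELECT_* flags while only the forward declaration from
hmm.h is visible in this config. Assuming the series keeps those
definitions in <linux/migrate.h>, the likely fix is simply making that
header visible to mm/hmm.c:

	/* mm/hmm.c */
	#include <linux/migrate.h>	/* enum migrate_vma_info, MIGRATE_VMA_SELECT_* */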
Hi,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-nonmm-unstable]
[also build test ERROR on linus/master v6.19-rc5 next-20260114]
[cannot apply to akpm-mm/mm-everything]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/mpenttil-redhat-com/mm-unified-hmm-fault-and-migrate-device-pagewalk-paths/20260114-172232
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-nonmm-unstable
patch link: https://lore.kernel.org/r/20260114091923.3950465-2-mpenttil%40redhat.com
patch subject: [PATCH 1/3] mm: unified hmm fault and migrate device pagewalk paths
config: hexagon-allmodconfig (https://download.01.org/0day-ci/archive/20260115/202601150035.taEgRkxK-lkp@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260115/202601150035.taEgRkxK-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601150035.taEgRkxK-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/hmm.c:423:20: warning: unused variable 'range' [-Wunused-variable]
423 | struct hmm_range *range = hmm_vma_walk->range;
| ^~~~~
>> mm/hmm.c:781:4: error: call to undeclared function 'flush_tlb_range'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
781 | flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
| ^
mm/hmm.c:781:4: note: did you mean 'flush_cache_range'?
include/asm-generic/cacheflush.h:35:20: note: 'flush_cache_range' declared here
35 | static inline void flush_cache_range(struct vm_area_struct *vma,
| ^
1 warning and 1 error generated.
vim +/flush_tlb_range +781 mm/hmm.c
587
588 /*
589 * Install migration entries if migration requested, either from fault
590 * or migrate paths.
591 *
592 */
593 static int hmm_vma_handle_migrate_prepare(const struct mm_walk *walk,
594 pmd_t *pmdp,
595 unsigned long addr,
596 unsigned long *hmm_pfn)
597 {
598 struct hmm_vma_walk *hmm_vma_walk = walk->private;
599 struct hmm_range *range = hmm_vma_walk->range;
600 struct migrate_vma *migrate = range->migrate;
601 struct mm_struct *mm = walk->vma->vm_mm;
602 struct folio *fault_folio = NULL;
603 enum migrate_vma_info minfo;
604 struct dev_pagemap *pgmap;
605 bool anon_exclusive;
606 struct folio *folio;
607 unsigned long pfn;
608 struct page *page;
609 softleaf_t entry;
610 pte_t pte, swp_pte;
611 spinlock_t *ptl;
612 bool writable = false;
613 pte_t *ptep;
614
615 // Do we want to migrate at all?
616 minfo = hmm_select_migrate(range);
617 if (!minfo)
618 return 0;
619
620 fault_folio = (migrate && migrate->fault_page) ?
621 page_folio(migrate->fault_page) : NULL;
622
623 again:
624 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
625 if (!ptep)
626 return 0;
627
628 pte = ptep_get(ptep);
629
630 if (pte_none(pte)) {
631 // migrate without faulting case
632 if (vma_is_anonymous(walk->vma)) {
633 *hmm_pfn &= HMM_PFN_INOUT_FLAGS;
634 *hmm_pfn |= HMM_PFN_MIGRATE | HMM_PFN_VALID;
635 goto out;
636 }
637 }
638
639 if (!pte_present(pte)) {
640 /*
641 * Only care about unaddressable device page special
642 * page table entry. Other special swap entries are not
643 * migratable, and we ignore regular swapped page.
644 */
645 entry = softleaf_from_pte(pte);
646 if (!softleaf_is_device_private(entry))
647 goto out;
648
649 if (!(minfo & MIGRATE_VMA_SELECT_DEVICE_PRIVATE))
650 goto out;
651
652 page = softleaf_to_page(entry);
653 folio = page_folio(page);
654 if (folio->pgmap->owner != migrate->pgmap_owner)
655 goto out;
656
657 if (folio_test_large(folio)) {
658 int ret;
659
660 pte_unmap_unlock(ptep, ptl);
661 ret = migrate_vma_split_folio(folio,
662 migrate->fault_page);
663 if (ret)
664 goto out_unlocked;
665 goto again;
666 }
667
668 pfn = page_to_pfn(page);
669 if (softleaf_is_device_private_write(entry))
670 writable = true;
671 } else {
672 pfn = pte_pfn(pte);
673 if (is_zero_pfn(pfn) &&
674 (minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
675 *hmm_pfn = HMM_PFN_MIGRATE|HMM_PFN_VALID;
676 goto out;
677 }
678 page = vm_normal_page(walk->vma, addr, pte);
679 if (page && !is_zone_device_page(page) &&
680 !(minfo & MIGRATE_VMA_SELECT_SYSTEM)) {
681 goto out;
682 } else if (page && is_device_coherent_page(page)) {
683 pgmap = page_pgmap(page);
684
685 if (!(minfo &
686 MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
687 pgmap->owner != migrate->pgmap_owner)
688 goto out;
689 }
690
691 folio = page_folio(page);
692 if (folio_test_large(folio)) {
693 int ret;
694
695 pte_unmap_unlock(ptep, ptl);
696 ret = migrate_vma_split_folio(folio,
697 migrate->fault_page);
698 if (ret)
699 goto out_unlocked;
700
701 goto again;
702 }
703
704 writable = pte_write(pte);
705 }
706
707 if (!page || !page->mapping)
708 goto out;
709
710 /*
711 * By getting a reference on the folio we pin it and that blocks
712 * any kind of migration. Side effect is that it "freezes" the
713 * pte.
714 *
715 * We drop this reference after isolating the folio from the lru
716 * for non device folio (device folio are not on the lru and thus
717 * can't be dropped from it).
718 */
719 folio = page_folio(page);
720 folio_get(folio);
721
722 /*
723 * We rely on folio_trylock() to avoid deadlock between
724 * concurrent migrations where each is waiting on the others
725 * folio lock. If we can't immediately lock the folio we fail this
726 * migration as it is only best effort anyway.
727 *
728 * If we can lock the folio it's safe to set up a migration entry
729 * now. In the common case where the folio is mapped once in a
730 * single process setting up the migration entry now is an
731 * optimisation to avoid walking the rmap later with
732 * try_to_migrate().
733 */
734
735 if (fault_folio == folio || folio_trylock(folio)) {
736 anon_exclusive = folio_test_anon(folio) &&
737 PageAnonExclusive(page);
738
739 flush_cache_page(walk->vma, addr, pfn);
740
741 if (anon_exclusive) {
742 pte = ptep_clear_flush(walk->vma, addr, ptep);
743
744 if (folio_try_share_anon_rmap_pte(folio, page)) {
745 set_pte_at(mm, addr, ptep, pte);
746 folio_unlock(folio);
747 folio_put(folio);
748 goto out;
749 }
750 } else {
751 pte = ptep_get_and_clear(mm, addr, ptep);
752 }
753
754 /* Setup special migration page table entry */
755 if (writable)
756 entry = make_writable_migration_entry(pfn);
757 else if (anon_exclusive)
758 entry = make_readable_exclusive_migration_entry(pfn);
759 else
760 entry = make_readable_migration_entry(pfn);
761
762 swp_pte = swp_entry_to_pte(entry);
763 if (pte_present(pte)) {
764 if (pte_soft_dirty(pte))
765 swp_pte = pte_swp_mksoft_dirty(swp_pte);
766 if (pte_uffd_wp(pte))
767 swp_pte = pte_swp_mkuffd_wp(swp_pte);
768 } else {
769 if (pte_swp_soft_dirty(pte))
770 swp_pte = pte_swp_mksoft_dirty(swp_pte);
771 if (pte_swp_uffd_wp(pte))
772 swp_pte = pte_swp_mkuffd_wp(swp_pte);
773 }
774
775 set_pte_at(mm, addr, ptep, swp_pte);
776 folio_remove_rmap_pte(folio, page, walk->vma);
777 folio_put(folio);
778 *hmm_pfn |= HMM_PFN_MIGRATE;
779
780 if (pte_present(pte))
> 781 flush_tlb_range(walk->vma, addr, addr + PAGE_SIZE);
782 } else
783 folio_put(folio);
784 out:
785 pte_unmap_unlock(ptep, ptl);
786 return 0;
787 out_unlocked:
788 return -1;
789
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
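The hexagon failure looks like a missing include: flush_tlb_range() is
declared in the per-architecture <asm/tlbflush.h>, which
mm/migrate_device.c already pulls in but mm/hmm.c apparently does not,
so something along these lines should be enough:

	/* mm/hmm.c */
	#include <asm/tlbflush.h>	/* flush_tlb_range() */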
Hi,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-nonmm-unstable]
[also build test WARNING on linus/master v6.19-rc5 next-20260114]
[cannot apply to akpm-mm/mm-everything]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/mpenttil-redhat-com/mm-unified-hmm-fault-and-migrate-device-pagewalk-paths/20260114-172232
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-nonmm-unstable
patch link: https://lore.kernel.org/r/20260114091923.3950465-2-mpenttil%40redhat.com
patch subject: [PATCH 1/3] mm: unified hmm fault and migrate device pagewalk paths
config: parisc-allyesconfig (https://download.01.org/0day-ci/archive/20260114/202601142231.LW22PqTJ-lkp@intel.com/config)
compiler: hppa-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260114/202601142231.LW22PqTJ-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601142231.LW22PqTJ-lkp@intel.com/
All warnings (new ones prefixed by >>):
mm/hmm.c: In function 'hmm_vma_handle_absent_pmd':
>> mm/hmm.c:423:27: warning: unused variable 'range' [-Wunused-variable]
423 | struct hmm_range *range = hmm_vma_walk->range;
| ^~~~~
vim +/range +423 mm/hmm.c
53f5c3f489ecddc Jérôme Glisse 2018-04-10 369
10b9feee2d0dc81 Francois Dugast 2025-09-08 370 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
10b9feee2d0dc81 Francois Dugast 2025-09-08 371 static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
10b9feee2d0dc81 Francois Dugast 2025-09-08 372 unsigned long end, unsigned long *hmm_pfns,
10b9feee2d0dc81 Francois Dugast 2025-09-08 373 pmd_t pmd)
10b9feee2d0dc81 Francois Dugast 2025-09-08 374 {
10b9feee2d0dc81 Francois Dugast 2025-09-08 375 struct hmm_vma_walk *hmm_vma_walk = walk->private;
10b9feee2d0dc81 Francois Dugast 2025-09-08 376 struct hmm_range *range = hmm_vma_walk->range;
10b9feee2d0dc81 Francois Dugast 2025-09-08 377 unsigned long npages = (end - start) >> PAGE_SHIFT;
0ac881efe164685 Lorenzo Stoakes 2025-11-10 378 const softleaf_t entry = softleaf_from_pmd(pmd);
10b9feee2d0dc81 Francois Dugast 2025-09-08 379 unsigned long addr = start;
10b9feee2d0dc81 Francois Dugast 2025-09-08 380 unsigned int required_fault;
10b9feee2d0dc81 Francois Dugast 2025-09-08 381
0ac881efe164685 Lorenzo Stoakes 2025-11-10 382 if (softleaf_is_device_private(entry) &&
0ac881efe164685 Lorenzo Stoakes 2025-11-10 383 softleaf_to_folio(entry)->pgmap->owner ==
10b9feee2d0dc81 Francois Dugast 2025-09-08 384 range->dev_private_owner) {
10b9feee2d0dc81 Francois Dugast 2025-09-08 385 unsigned long cpu_flags = HMM_PFN_VALID |
10b9feee2d0dc81 Francois Dugast 2025-09-08 386 hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
0ac881efe164685 Lorenzo Stoakes 2025-11-10 387 unsigned long pfn = softleaf_to_pfn(entry);
10b9feee2d0dc81 Francois Dugast 2025-09-08 388 unsigned long i;
10b9feee2d0dc81 Francois Dugast 2025-09-08 389
0ac881efe164685 Lorenzo Stoakes 2025-11-10 390 if (softleaf_is_device_private_write(entry))
10b9feee2d0dc81 Francois Dugast 2025-09-08 391 cpu_flags |= HMM_PFN_WRITE;
10b9feee2d0dc81 Francois Dugast 2025-09-08 392
10b9feee2d0dc81 Francois Dugast 2025-09-08 393 /*
10b9feee2d0dc81 Francois Dugast 2025-09-08 394 * Fully populate the PFN list though subsequent PFNs could be
10b9feee2d0dc81 Francois Dugast 2025-09-08 395 * inferred, because drivers which are not yet aware of large
10b9feee2d0dc81 Francois Dugast 2025-09-08 396 * folios probably do not support sparsely populated PFN lists.
10b9feee2d0dc81 Francois Dugast 2025-09-08 397 */
10b9feee2d0dc81 Francois Dugast 2025-09-08 398 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
10b9feee2d0dc81 Francois Dugast 2025-09-08 399 hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
10b9feee2d0dc81 Francois Dugast 2025-09-08 400 hmm_pfns[i] |= pfn | cpu_flags;
10b9feee2d0dc81 Francois Dugast 2025-09-08 401 }
10b9feee2d0dc81 Francois Dugast 2025-09-08 402
10b9feee2d0dc81 Francois Dugast 2025-09-08 403 return 0;
10b9feee2d0dc81 Francois Dugast 2025-09-08 404 }
10b9feee2d0dc81 Francois Dugast 2025-09-08 405
10b9feee2d0dc81 Francois Dugast 2025-09-08 406 required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
10b9feee2d0dc81 Francois Dugast 2025-09-08 407 npages, 0);
10b9feee2d0dc81 Francois Dugast 2025-09-08 408 if (required_fault) {
0ac881efe164685 Lorenzo Stoakes 2025-11-10 409 if (softleaf_is_device_private(entry))
10b9feee2d0dc81 Francois Dugast 2025-09-08 410 return hmm_vma_fault(addr, end, required_fault, walk);
10b9feee2d0dc81 Francois Dugast 2025-09-08 411 else
10b9feee2d0dc81 Francois Dugast 2025-09-08 412 return -EFAULT;
10b9feee2d0dc81 Francois Dugast 2025-09-08 413 }
10b9feee2d0dc81 Francois Dugast 2025-09-08 414
70d15e6adfafc3e Mika Penttilä 2026-01-14 415 return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
10b9feee2d0dc81 Francois Dugast 2025-09-08 416 }
10b9feee2d0dc81 Francois Dugast 2025-09-08 417 #else
10b9feee2d0dc81 Francois Dugast 2025-09-08 418 static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
10b9feee2d0dc81 Francois Dugast 2025-09-08 419 unsigned long end, unsigned long *hmm_pfns,
10b9feee2d0dc81 Francois Dugast 2025-09-08 420 pmd_t pmd)
10b9feee2d0dc81 Francois Dugast 2025-09-08 421 {
10b9feee2d0dc81 Francois Dugast 2025-09-08 422 struct hmm_vma_walk *hmm_vma_walk = walk->private;
10b9feee2d0dc81 Francois Dugast 2025-09-08 @423 struct hmm_range *range = hmm_vma_walk->range;
10b9feee2d0dc81 Francois Dugast 2025-09-08 424 unsigned long npages = (end - start) >> PAGE_SHIFT;
10b9feee2d0dc81 Francois Dugast 2025-09-08 425
10b9feee2d0dc81 Francois Dugast 2025-09-08 426 if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
10b9feee2d0dc81 Francois Dugast 2025-09-08 427 return -EFAULT;
70d15e6adfafc3e Mika Penttilä 2026-01-14 428 return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
10b9feee2d0dc81 Francois Dugast 2025-09-08 429 }
10b9feee2d0dc81 Francois Dugast 2025-09-08 430 #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
10b9feee2d0dc81 Francois Dugast 2025-09-08 431
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
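The warning is in the !CONFIG_ARCH_ENABLE_THP_MIGRATION stub quoted
above: once hmm_pfns_fill() takes hmm_vma_walk instead of the range,
the local variable becomes unused and can simply be dropped, e.g.:

	static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
					     unsigned long end, unsigned long *hmm_pfns,
					     pmd_t pmd)
	{
		struct hmm_vma_walk *hmm_vma_walk = walk->private;
		unsigned long npages = (end - start) >> PAGE_SHIFT;

		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, hmm_vma_walk, HMM_PFN_ERROR);
	}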