The migrate_device code paths go through the collect, setup
and finalize phases of migration. The MIGRATE_PFN_COMPOUND flag,
added earlier in this series, is used to mark THP pages in these
paths.
The entries in the src and dst arrays passed to these functions
remain at PAGE_SIZE granularity. When a compound page is passed,
the first entry holds the PFN along with MIGRATE_PFN_COMPOUND
and the other flags (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID); the
remaining HPAGE_PMD_NR - 1 entries are filled with zeros. This
representation allows the compound page to be split into smaller
page sizes.
migrate_vma_collect_hole() and migrate_vma_collect_pmd() are now THP
aware. Two new helper functions, migrate_vma_collect_huge_pmd()
and migrate_vma_insert_huge_pmd_page(), have been added.
migrate_vma_collect_huge_pmd() can collect THP pages; if that fails
for some reason, it falls back to splitting the folio and migrating
it.
migrate_vma_insert_huge_pmd_page() closely follows the logic of
migrate_vma_insert_page().
Support for splitting pages as needed for migration will follow in
later patches in this series.
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Donet Tom <donettom@linux.ibm.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
mm/migrate_device.c | 437 +++++++++++++++++++++++++++++++++++++-------
1 file changed, 376 insertions(+), 61 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index e05e14d6eacd..41d0bd787969 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -14,6 +14,7 @@
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swapops.h>
+#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
if (!vma_is_anonymous(walk->vma))
return migrate_vma_collect_skip(start, end, walk);
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+
+ /*
+ * Record the remaining entries of the range via
+ * migrate_vma_collect_skip(), in case we need to split later
+ */
+ return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -54,57 +72,148 @@ static int migrate_vma_collect_hole(unsigned long start,
return 0;
}
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
+/**
+ * migrate_vma_collect_huge_pmd - collect THP pages without splitting the
+ * folio for device private pages.
+ * @pmdp: pointer to pmd entry
+ * @start: start address of the range for migration
+ * @end: end address of the range for migration
+ * @walk: mm_walk callback structure
+ *
+ * Collect the huge pmd entry at @pmdp for migration and set the
+ * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
+ * migration will occur at HPAGE_PMD granularity
+ */
+static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
+ unsigned long end, struct mm_walk *walk,
+ struct folio *fault_folio)
{
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
struct migrate_vma *migrate = walk->private;
- struct folio *fault_folio = migrate->fault_page ?
- page_folio(migrate->fault_page) : NULL;
- struct vm_area_struct *vma = walk->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
spinlock_t *ptl;
- pte_t *ptep;
+ swp_entry_t entry;
+ int ret;
+ unsigned long write = 0;
-again:
- if (pmd_none(*pmdp))
+ ptl = pmd_lock(mm, pmdp);
+ if (pmd_none(*pmdp)) {
+ spin_unlock(ptl);
return migrate_vma_collect_hole(start, end, -1, walk);
+ }
if (pmd_trans_huge(*pmdp)) {
- struct folio *folio;
-
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_trans_huge(*pmdp))) {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
spin_unlock(ptl);
- goto again;
+ return migrate_vma_collect_skip(start, end, walk);
}
folio = pmd_folio(*pmdp);
if (is_huge_zero_folio(folio)) {
spin_unlock(ptl);
- split_huge_pmd(vma, pmdp, addr);
- } else {
- int ret;
+ return migrate_vma_collect_hole(start, end, -1, walk);
+ }
+ if (pmd_write(*pmdp))
+ write = MIGRATE_PFN_WRITE;
+ } else if (!pmd_present(*pmdp)) {
+ entry = pmd_to_swp_entry(*pmdp);
+ folio = pfn_swap_entry_folio(entry);
+
+ if (!is_device_private_entry(entry) ||
+ !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ (folio->pgmap->owner != migrate->pgmap_owner)) {
+ spin_unlock(ptl);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
- folio_get(folio);
+ if (is_migration_entry(entry)) {
+ migration_entry_wait_on_locked(entry, ptl);
spin_unlock(ptl);
- /* FIXME: we don't expect THP for fault_folio */
- if (WARN_ON_ONCE(fault_folio == folio))
- return migrate_vma_collect_skip(start, end,
- walk);
- if (unlikely(!folio_trylock(folio)))
- return migrate_vma_collect_skip(start, end,
- walk);
- ret = split_folio(folio);
- if (fault_folio != folio)
- folio_unlock(folio);
- folio_put(folio);
- if (ret)
- return migrate_vma_collect_skip(start, end,
- walk);
+ return -EAGAIN;
}
+
+ if (is_writable_device_private_entry(entry))
+ write = MIGRATE_PFN_WRITE;
+ } else {
+ spin_unlock(ptl);
+ return -EAGAIN;
+ }
+
+ folio_get(folio);
+ if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
+ spin_unlock(ptl);
+ folio_put(folio);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
+
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+
+ struct page_vma_mapped_walk pvmw = {
+ .ptl = ptl,
+ .address = start,
+ .pmd = pmdp,
+ .vma = walk->vma,
+ };
+
+ unsigned long pfn = page_to_pfn(folio_page(folio, 0));
+
+ migrate->src[migrate->npages] = migrate_pfn(pfn) | write
+ | MIGRATE_PFN_MIGRATE
+ | MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages++] = 0;
+ migrate->cpages++;
+ ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
+ if (ret) {
+ migrate->npages--;
+ migrate->cpages--;
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ goto fallback;
+ }
+ migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ spin_unlock(ptl);
+ return 0;
+ }
+
+fallback:
+ spin_unlock(ptl);
+ ret = split_folio(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
+ folio_put(folio);
+ if (ret)
+ return migrate_vma_collect_skip(start, end, walk);
+ if (pmd_none(pmdp_get_lockless(pmdp)))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+ return -ENOENT;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
+ pte_t *ptep;
+
+again:
+ if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) {
+ int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio);
+
+ if (ret == -EAGAIN)
+ goto again;
+ if (ret == 0)
+ return 0;
}
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
@@ -175,8 +284,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
- /* FIXME support THP */
- if (!page || !page->mapping || PageTransCompound(page)) {
+ if (!page || !page->mapping) {
mpfn = 0;
goto next;
}
@@ -347,14 +455,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
*/
int extra = 1 + (page == fault_page);
- /*
- * FIXME support THP (transparent huge page), it is bit more complex to
- * check them than regular pages, because they can be mapped with a pmd
- * or with a pte (split pte mapping).
- */
- if (folio_test_large(folio))
- return false;
-
/* Page from ZONE_DEVICE have one extra reference */
if (folio_is_zone_device(folio))
extra++;
@@ -385,17 +485,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
lru_add_drain();
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
+ unsigned int nr = 1;
if (!page) {
if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
unmapped++;
- continue;
+ goto next;
}
folio = page_folio(page);
+ nr = folio_nr_pages(folio);
+
+ if (nr > 1)
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+
+
/* ZONE_DEVICE folios are not on LRU */
if (!folio_is_zone_device(folio)) {
if (!folio_test_lru(folio) && allow_drain) {
@@ -407,7 +514,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
if (!folio_isolate_lru(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
/* Drop the reference we took in collect */
@@ -426,10 +533,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
unmapped++;
+next:
+ i += nr;
}
for (i = 0; i < npages && restore; i++) {
@@ -575,6 +684,146 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
+ * at @addr. folio is already allocated as a part of the migration process with
+ * large page.
+ *
+ * @folio needs to be initialized and setup after it's allocated. The code bits
+ * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does
+ * not support THP zero pages.
+ *
+ * @migrate: migrate_vma arguments
+ * @addr: address where the folio will be inserted
+ * @page: page whose folio is to be inserted at @addr
+ * @src: src pfn which is being migrated
+ * @pmdp: pointer to the pmd
+ */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ struct folio *folio = page_folio(page);
+ int ret;
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ pmd_t entry;
+ bool flush = false;
+ unsigned long i;
+
+ VM_WARN_ON_FOLIO(!folio, folio);
+ VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+
+ if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
+ return -EINVAL;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ return ret;
+
+ folio_set_order(folio, HPAGE_PMD_ORDER);
+ folio_set_large_rmappable(folio);
+
+ if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ ret = -ENOMEM;
+ goto abort;
+ }
+
+ __folio_mark_uptodate(folio);
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable))
+ goto abort;
+
+ if (folio_is_device_private(folio)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pmd(swp_entry);
+ } else {
+ if (folio_is_zone_device(folio) &&
+ !folio_is_device_coherent(folio)) {
+ goto abort;
+ }
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pmd_mkwrite(pmd_mkdirty(entry), vma);
+ }
+
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ if (!pmd_none(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto unlock_abort;
+ flush = true;
+ } else if (!pmd_none(*pmdp))
+ goto unlock_abort;
+
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
+ if (!folio_is_zone_device(folio))
+ folio_add_lru_vma(folio, vma);
+ folio_get(folio);
+
+ if (flush) {
+ pte_free(vma->vm_mm, pgtable);
+ flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, addr, pmdp);
+ } else {
+ pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ set_pmd_at(vma->vm_mm, addr, pmdp, entry);
+ update_mmu_cache_pmd(vma, addr, pmdp);
+
+ spin_unlock(ptl);
+
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+ return 0;
+
+unlock_abort:
+ spin_unlock(ptl);
+abort:
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ src[i] &= ~MIGRATE_PFN_MIGRATE;
+ return 0;
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ return 0;
+}
+#endif
+
/*
* This code closely matches the code in:
* __handle_mm_fault()
@@ -585,9 +834,10 @@ EXPORT_SYMBOL(migrate_vma_setup);
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
- struct page *page,
+ unsigned long *dst,
unsigned long *src)
{
+ struct page *page = migrate_pfn_to_page(*dst);
struct folio *folio = page_folio(page);
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -615,8 +865,25 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp))
- goto abort;
+
+ if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) {
+ int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page,
+ src, pmdp);
+ if (ret)
+ goto abort;
+ return;
+ }
+
+ if (!pmd_none(*pmdp)) {
+ if (pmd_trans_huge(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto abort;
+ folio_get(pmd_folio(*pmdp));
+ split_huge_pmd(vma, pmdp, addr);
+ } else if (pmd_leaf(*pmdp))
+ goto abort;
+ }
+
if (pte_alloc(mm, pmdp))
goto abort;
if (unlikely(anon_vma_prepare(vma)))
@@ -707,23 +974,24 @@ static void __migrate_device_pages(unsigned long *src_pfns,
unsigned long i;
bool notified = false;
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
struct folio *newfolio, *folio;
int r, extra_cnt = 0;
+ unsigned long nr = 1;
if (!newpage) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
if (!page) {
unsigned long addr;
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ goto next;
/*
* The only time there is no vma is when called from
@@ -741,15 +1009,47 @@ static void __migrate_device_pages(unsigned long *src_pfns,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
- migrate_vma_insert_page(migrate, addr, newpage,
+
+ if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
+ nr = HPAGE_PMD_NR;
+ src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ goto next;
+ }
+
+ migrate_vma_insert_page(migrate, addr, &dst_pfns[i],
&src_pfns[i]);
- continue;
+ goto next;
}
newfolio = page_folio(newpage);
folio = page_folio(page);
mapping = folio_mapping(folio);
+ /*
+ * If THP migration is enabled, check if both src and dst
+ * can migrate large pages
+ */
+ if (thp_migration_supported()) {
+ if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+
+ if (!migrate) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+ }
+
+
if (folio_is_device_private(newfolio) ||
folio_is_device_coherent(newfolio)) {
if (mapping) {
@@ -762,7 +1062,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (!folio_test_anon(folio) ||
!folio_free_swap(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
}
} else if (folio_is_zone_device(newfolio)) {
@@ -770,7 +1070,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
* Other types of ZONE_DEVICE page are not supported.
*/
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
BUG_ON(folio_test_writeback(folio));
@@ -782,6 +1082,8 @@ static void __migrate_device_pages(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
+next:
+ i += nr;
}
if (notified)
@@ -943,10 +1245,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn)
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages)
{
- unsigned long i, pfn;
+ unsigned long i, j, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
- for (pfn = start, i = 0; i < npages; pfn++, i++)
src_pfns[i] = migrate_device_pfn_lock(pfn);
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j - 1;
+ pfn += j - 1;
+ }
+ }
migrate_device_unmap(src_pfns, npages, NULL);
--
2.49.0
On Fri, Jul 04, 2025 at 09:35:03AM +1000, Balbir Singh wrote: > migrate_device code paths go through the collect, setup > and finalize phases of migration. Support for MIGRATE_PFN_COMPOUND > was added earlier in the series to mark THP pages as > MIGRATE_PFN_COMPOUND. > > The entries in src and dst arrays passed to these functions still > remain at a PAGE_SIZE granularity. When a compound page is passed, > the first entry has the PFN along with MIGRATE_PFN_COMPOUND > and other flags set (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the > remaining entries (HPAGE_PMD_NR - 1) are filled with 0's. This > representation allows for the compound page to be split into smaller > page sizes. > > migrate_vma_collect_hole(), migrate_vma_collect_pmd() are now THP > page aware. Two new helper functions migrate_vma_collect_huge_pmd() > and migrate_vma_insert_huge_pmd_page() have been added. > > migrate_vma_collect_huge_pmd() can collect THP pages, but if for > some reason this fails, there is fallback support to split the folio > and migrate it. > > migrate_vma_insert_huge_pmd_page() closely follows the logic of > migrate_vma_insert_page() > > Support for splitting pages as needed for migration will follow in > later patches in this series. 
> > Cc: Karol Herbst <kherbst@redhat.com> > Cc: Lyude Paul <lyude@redhat.com> > Cc: Danilo Krummrich <dakr@kernel.org> > Cc: David Airlie <airlied@gmail.com> > Cc: Simona Vetter <simona@ffwll.ch> > Cc: "Jérôme Glisse" <jglisse@redhat.com> > Cc: Shuah Khan <shuah@kernel.org> > Cc: David Hildenbrand <david@redhat.com> > Cc: Barry Song <baohua@kernel.org> > Cc: Baolin Wang <baolin.wang@linux.alibaba.com> > Cc: Ryan Roberts <ryan.roberts@arm.com> > Cc: Matthew Wilcox <willy@infradead.org> > Cc: Peter Xu <peterx@redhat.com> > Cc: Zi Yan <ziy@nvidia.com> > Cc: Kefeng Wang <wangkefeng.wang@huawei.com> > Cc: Jane Chu <jane.chu@oracle.com> > Cc: Alistair Popple <apopple@nvidia.com> > Cc: Donet Tom <donettom@linux.ibm.com> > > Signed-off-by: Balbir Singh <balbirs@nvidia.com> > --- > mm/migrate_device.c | 437 +++++++++++++++++++++++++++++++++++++------- > 1 file changed, 376 insertions(+), 61 deletions(-) > > diff --git a/mm/migrate_device.c b/mm/migrate_device.c > index e05e14d6eacd..41d0bd787969 100644 > --- a/mm/migrate_device.c > +++ b/mm/migrate_device.c > @@ -14,6 +14,7 @@ > #include <linux/pagewalk.h> > #include <linux/rmap.h> > #include <linux/swapops.h> > +#include <asm/pgalloc.h> > #include <asm/tlbflush.h> > #include "internal.h" > > @@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start, > if (!vma_is_anonymous(walk->vma)) > return migrate_vma_collect_skip(start, end, walk); > > + if (thp_migration_supported() && > + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && > + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && > + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { > + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | > + MIGRATE_PFN_COMPOUND; > + migrate->dst[migrate->npages] = 0; > + migrate->npages++; > + migrate->cpages++; > + > + /* > + * Collect the remaining entries as holes, in case we > + * need to split later > + */ > + return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); > + } > + > for (addr = start; addr < end; addr += PAGE_SIZE) { > 
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; > migrate->dst[migrate->npages] = 0; > @@ -54,57 +72,148 @@ static int migrate_vma_collect_hole(unsigned long start, > return 0; > } > > -static int migrate_vma_collect_pmd(pmd_t *pmdp, > - unsigned long start, > - unsigned long end, > - struct mm_walk *walk) > +/** > + * migrate_vma_collect_huge_pmd - collect THP pages without splitting the > + * folio for device private pages. > + * @pmdp: pointer to pmd entry > + * @start: start address of the range for migration > + * @end: end address of the range for migration > + * @walk: mm_walk callback structure > + * > + * Collect the huge pmd entry at @pmdp for migration and set the > + * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that > + * migration will occur at HPAGE_PMD granularity > + */ > +static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, > + unsigned long end, struct mm_walk *walk, > + struct folio *fault_folio) > { > + struct mm_struct *mm = walk->mm; > + struct folio *folio; > struct migrate_vma *migrate = walk->private; > - struct folio *fault_folio = migrate->fault_page ? 
> - page_folio(migrate->fault_page) : NULL; > - struct vm_area_struct *vma = walk->vma; > - struct mm_struct *mm = vma->vm_mm; > - unsigned long addr = start, unmapped = 0; > spinlock_t *ptl; > - pte_t *ptep; > + swp_entry_t entry; > + int ret; > + unsigned long write = 0; > > -again: > - if (pmd_none(*pmdp)) > + ptl = pmd_lock(mm, pmdp); > + if (pmd_none(*pmdp)) { > + spin_unlock(ptl); > return migrate_vma_collect_hole(start, end, -1, walk); > + } > > if (pmd_trans_huge(*pmdp)) { > - struct folio *folio; > - > - ptl = pmd_lock(mm, pmdp); > - if (unlikely(!pmd_trans_huge(*pmdp))) { > + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { > spin_unlock(ptl); > - goto again; > + return migrate_vma_collect_skip(start, end, walk); > } > > folio = pmd_folio(*pmdp); > if (is_huge_zero_folio(folio)) { > spin_unlock(ptl); > - split_huge_pmd(vma, pmdp, addr); > - } else { > - int ret; > + return migrate_vma_collect_hole(start, end, -1, walk); > + } > + if (pmd_write(*pmdp)) > + write = MIGRATE_PFN_WRITE; > + } else if (!pmd_present(*pmdp)) { > + entry = pmd_to_swp_entry(*pmdp); > + folio = pfn_swap_entry_folio(entry); > + > + if (!is_device_private_entry(entry) || > + !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || > + (folio->pgmap->owner != migrate->pgmap_owner)) { > + spin_unlock(ptl); > + return migrate_vma_collect_skip(start, end, walk); > + } > > - folio_get(folio); > + if (is_migration_entry(entry)) { > + migration_entry_wait_on_locked(entry, ptl); > spin_unlock(ptl); > - /* FIXME: we don't expect THP for fault_folio */ > - if (WARN_ON_ONCE(fault_folio == folio)) > - return migrate_vma_collect_skip(start, end, > - walk); > - if (unlikely(!folio_trylock(folio))) > - return migrate_vma_collect_skip(start, end, > - walk); > - ret = split_folio(folio); > - if (fault_folio != folio) > - folio_unlock(folio); > - folio_put(folio); > - if (ret) > - return migrate_vma_collect_skip(start, end, > - walk); > + return -EAGAIN; > } > + > + if 
(is_writable_device_private_entry(entry)) > + write = MIGRATE_PFN_WRITE; > + } else { > + spin_unlock(ptl); > + return -EAGAIN; > + } > + > + folio_get(folio); > + if (folio != fault_folio && unlikely(!folio_trylock(folio))) { > + spin_unlock(ptl); > + folio_put(folio); > + return migrate_vma_collect_skip(start, end, walk); > + } > + > + if (thp_migration_supported() && > + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && > + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && > + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { > + > + struct page_vma_mapped_walk pvmw = { > + .ptl = ptl, > + .address = start, > + .pmd = pmdp, > + .vma = walk->vma, > + }; > + > + unsigned long pfn = page_to_pfn(folio_page(folio, 0)); > + > + migrate->src[migrate->npages] = migrate_pfn(pfn) | write > + | MIGRATE_PFN_MIGRATE > + | MIGRATE_PFN_COMPOUND; > + migrate->dst[migrate->npages++] = 0; > + migrate->cpages++; > + ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0)); > + if (ret) { > + migrate->npages--; > + migrate->cpages--; > + migrate->src[migrate->npages] = 0; > + migrate->dst[migrate->npages] = 0; > + goto fallback; > + } > + migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); > + spin_unlock(ptl); > + return 0; > + } > + > +fallback: > + spin_unlock(ptl); > + ret = split_folio(folio); > + if (fault_folio != folio) > + folio_unlock(folio); > + folio_put(folio); > + if (ret) > + return migrate_vma_collect_skip(start, end, walk); > + if (pmd_none(pmdp_get_lockless(pmdp))) > + return migrate_vma_collect_hole(start, end, -1, walk); > + > + return -ENOENT; > +} > + > +static int migrate_vma_collect_pmd(pmd_t *pmdp, > + unsigned long start, > + unsigned long end, > + struct mm_walk *walk) > +{ > + struct migrate_vma *migrate = walk->private; > + struct vm_area_struct *vma = walk->vma; > + struct mm_struct *mm = vma->vm_mm; > + unsigned long addr = start, unmapped = 0; > + spinlock_t *ptl; > + struct folio *fault_folio = migrate->fault_page ? 
> + page_folio(migrate->fault_page) : NULL; > + pte_t *ptep; > + > +again: > + if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) { > + int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio); > + > + if (ret == -EAGAIN) > + goto again; > + if (ret == 0) > + return 0; > } > > ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); > @@ -175,8 +284,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; > } > > - /* FIXME support THP */ > - if (!page || !page->mapping || PageTransCompound(page)) { > + if (!page || !page->mapping) { > mpfn = 0; > goto next; > } > @@ -347,14 +455,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) > */ > int extra = 1 + (page == fault_page); > > - /* > - * FIXME support THP (transparent huge page), it is bit more complex to > - * check them than regular pages, because they can be mapped with a pmd > - * or with a pte (split pte mapping). > - */ > - if (folio_test_large(folio)) > - return false; > - > /* Page from ZONE_DEVICE have one extra reference */ > if (folio_is_zone_device(folio)) > extra++; > @@ -385,17 +485,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, > > lru_add_drain(); > > - for (i = 0; i < npages; i++) { > + for (i = 0; i < npages; ) { > struct page *page = migrate_pfn_to_page(src_pfns[i]); > struct folio *folio; > + unsigned int nr = 1; > > if (!page) { > if (src_pfns[i] & MIGRATE_PFN_MIGRATE) > unmapped++; > - continue; > + goto next; > } > > folio = page_folio(page); > + nr = folio_nr_pages(folio); > + > + if (nr > 1) > + src_pfns[i] |= MIGRATE_PFN_COMPOUND; > + > + > /* ZONE_DEVICE folios are not on LRU */ > if (!folio_is_zone_device(folio)) { > if (!folio_test_lru(folio) && allow_drain) { > @@ -407,7 +514,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, > if (!folio_isolate_lru(folio)) { > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > restore++; > - continue; > + goto next; > } > > /* 
Drop the reference we took in collect */ > @@ -426,10 +533,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, > > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > restore++; > - continue; > + goto next; > } > > unmapped++; > +next: > + i += nr; > } > > for (i = 0; i < npages && restore; i++) { > @@ -575,6 +684,146 @@ int migrate_vma_setup(struct migrate_vma *args) > } > EXPORT_SYMBOL(migrate_vma_setup); > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > +/** > + * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm > + * at @addr. folio is already allocated as a part of the migration process with > + * large page. > + * > + * @folio needs to be initialized and setup after it's allocated. The code bits > + * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does > + * not support THP zero pages. > + * > + * @migrate: migrate_vma arguments > + * @addr: address where the folio will be inserted > + * @folio: folio to be inserted at @addr > + * @src: src pfn which is being migrated > + * @pmdp: pointer to the pmd > + */ > +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, > + unsigned long addr, > + struct page *page, > + unsigned long *src, > + pmd_t *pmdp) > +{ > + struct vm_area_struct *vma = migrate->vma; > + gfp_t gfp = vma_thp_gfp_mask(vma); > + struct folio *folio = page_folio(page); > + int ret; > + spinlock_t *ptl; > + pgtable_t pgtable; > + pmd_t entry; > + bool flush = false; > + unsigned long i; > + > + VM_WARN_ON_FOLIO(!folio, folio); > + VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); > + > + if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) > + return -EINVAL; > + > + ret = anon_vma_prepare(vma); > + if (ret) > + return ret; > + > + folio_set_order(folio, HPAGE_PMD_ORDER); > + folio_set_large_rmappable(folio); > + > + if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) { > + count_vm_event(THP_FAULT_FALLBACK); > + count_mthp_stat(HPAGE_PMD_ORDER, 
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > + ret = -ENOMEM; > + goto abort; > + } > + > + __folio_mark_uptodate(folio); > + > + pgtable = pte_alloc_one(vma->vm_mm); > + if (unlikely(!pgtable)) > + goto abort; > + > + if (folio_is_device_private(folio)) { > + swp_entry_t swp_entry; > + > + if (vma->vm_flags & VM_WRITE) > + swp_entry = make_writable_device_private_entry( > + page_to_pfn(page)); > + else > + swp_entry = make_readable_device_private_entry( > + page_to_pfn(page)); > + entry = swp_entry_to_pmd(swp_entry); > + } else { > + if (folio_is_zone_device(folio) && > + !folio_is_device_coherent(folio)) { > + goto abort; > + } > + entry = folio_mk_pmd(folio, vma->vm_page_prot); > + if (vma->vm_flags & VM_WRITE) > + entry = pmd_mkwrite(pmd_mkdirty(entry), vma); > + } > + > + ptl = pmd_lock(vma->vm_mm, pmdp); > + ret = check_stable_address_space(vma->vm_mm); > + if (ret) > + goto abort; > + > + /* > + * Check for userfaultfd but do not deliver the fault. Instead, > + * just back off. > + */ > + if (userfaultfd_missing(vma)) > + goto unlock_abort; > + > + if (!pmd_none(*pmdp)) { > + if (!is_huge_zero_pmd(*pmdp)) > + goto unlock_abort; > + flush = true; > + } else if (!pmd_none(*pmdp)) > + goto unlock_abort; > + > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); > + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); > + if (!folio_is_zone_device(folio)) > + folio_add_lru_vma(folio, vma); > + folio_get(folio); > + > + if (flush) { > + pte_free(vma->vm_mm, pgtable); > + flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE); > + pmdp_invalidate(vma, addr, pmdp); > + } else { > + pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable); > + mm_inc_nr_ptes(vma->vm_mm); > + } > + set_pmd_at(vma->vm_mm, addr, pmdp, entry); > + update_mmu_cache_pmd(vma, addr, pmdp); > + > + spin_unlock(ptl); > + > + count_vm_event(THP_FAULT_ALLOC); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); > + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); > + > + return 
0; > + > +unlock_abort: > + spin_unlock(ptl); > +abort: > + for (i = 0; i < HPAGE_PMD_NR; i++) > + src[i] &= ~MIGRATE_PFN_MIGRATE; > + return 0; > +} > +#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ > +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, > + unsigned long addr, > + struct page *page, > + unsigned long *src, > + pmd_t *pmdp) > +{ > + return 0; > +} > +#endif > + > /* > * This code closely matches the code in: > * __handle_mm_fault() > @@ -585,9 +834,10 @@ EXPORT_SYMBOL(migrate_vma_setup); > */ > static void migrate_vma_insert_page(struct migrate_vma *migrate, > unsigned long addr, > - struct page *page, > + unsigned long *dst, > unsigned long *src) > { > + struct page *page = migrate_pfn_to_page(*dst); > struct folio *folio = page_folio(page); > struct vm_area_struct *vma = migrate->vma; > struct mm_struct *mm = vma->vm_mm; > @@ -615,8 +865,25 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, > pmdp = pmd_alloc(mm, pudp, addr); > if (!pmdp) > goto abort; > - if (pmd_trans_huge(*pmdp)) > - goto abort; > + > + if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) { > + int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page, > + src, pmdp); > + if (ret) > + goto abort; > + return; > + } > + > + if (!pmd_none(*pmdp)) { > + if (pmd_trans_huge(*pmdp)) { > + if (!is_huge_zero_pmd(*pmdp)) > + goto abort; > + folio_get(pmd_folio(*pmdp)); > + split_huge_pmd(vma, pmdp, addr); > + } else if (pmd_leaf(*pmdp)) > + goto abort; > + } > + > if (pte_alloc(mm, pmdp)) > goto abort; > if (unlikely(anon_vma_prepare(vma))) > @@ -707,23 +974,24 @@ static void __migrate_device_pages(unsigned long *src_pfns, > unsigned long i; > bool notified = false; > > - for (i = 0; i < npages; i++) { > + for (i = 0; i < npages; ) { > struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); > struct page *page = migrate_pfn_to_page(src_pfns[i]); > struct address_space *mapping; > struct folio *newfolio, *folio; > int r, extra_cnt = 
0; > + unsigned long nr = 1; > > if (!newpage) { > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > - continue; > + goto next; > } > > if (!page) { > unsigned long addr; > > if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) > - continue; > + goto next; > > /* > * The only time there is no vma is when called from > @@ -741,15 +1009,47 @@ static void __migrate_device_pages(unsigned long *src_pfns, > migrate->pgmap_owner); > mmu_notifier_invalidate_range_start(&range); > } > - migrate_vma_insert_page(migrate, addr, newpage, > + > + if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) && > + (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { > + nr = HPAGE_PMD_NR; > + src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; > + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > + goto next; > + } > + > + migrate_vma_insert_page(migrate, addr, &dst_pfns[i], > &src_pfns[i]); > - continue; > + goto next; > } > > newfolio = page_folio(newpage); > folio = page_folio(page); > mapping = folio_mapping(folio); > > + /* > + * If THP migration is enabled, check if both src and dst > + * can migrate large pages > + */ > + if (thp_migration_supported()) { > + if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && > + (src_pfns[i] & MIGRATE_PFN_COMPOUND) && > + !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) { > + > + if (!migrate) { > + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | > + MIGRATE_PFN_COMPOUND); > + goto next; > + } > + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > + } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && > + (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && > + !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { > + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > + } > + } > + > + > if (folio_is_device_private(newfolio) || > folio_is_device_coherent(newfolio)) { > if (mapping) { > @@ -762,7 +1062,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, > if (!folio_test_anon(folio) || > !folio_free_swap(folio)) { > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > - continue; > + goto next; > } > } > } else if (folio_is_zone_device(newfolio)) { > @@ -770,7 +1070,7 @@ static void 
__migrate_device_pages(unsigned long *src_pfns, > * Other types of ZONE_DEVICE page are not supported. > */ > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > - continue; > + goto next; > } > > BUG_ON(folio_test_writeback(folio)); > @@ -782,6 +1082,8 @@ static void __migrate_device_pages(unsigned long *src_pfns, > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > else > folio_migrate_flags(newfolio, folio); > +next: > + i += nr; > } > > if (notified) > @@ -943,10 +1245,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn) > int migrate_device_range(unsigned long *src_pfns, unsigned long start, > unsigned long npages) I think migrate_device_pfns() should be updated too, in a similar way. Here is what I came up with. Again, feel free to include or modify these changes as you see fit. * Similar to migrate_device_range() but supports non-contiguous pre-popluated - * array of device pages to migrate. + * array of device pages to migrate. If a higher-order folio is found, the mpfn + * is OR'ed with MIGRATE_PFN_COMPOUND, and the subsequent mpfns within the range + * of the order are cleared. 
*/ int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) { - unsigned long i; + unsigned long i, j; + + for (i = 0; i < npages; i++) { + struct page *page = pfn_to_page(src_pfns[i]); + struct folio *folio = page_folio(page); + unsigned int nr = 1; - for (i = 0; i < npages; i++) src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); + nr = folio_nr_pages(folio); + if (nr > 1) { + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + for (j = 1; j < nr; j++) + src_pfns[i+j] = 0; + i += j - 1; + } + } Matt > { > - unsigned long i, pfn; > + unsigned long i, j, pfn; > + > + for (pfn = start, i = 0; i < npages; pfn++, i++) { > + struct page *page = pfn_to_page(pfn); > + struct folio *folio = page_folio(page); > + unsigned int nr = 1; > > - for (pfn = start, i = 0; i < npages; pfn++, i++) > src_pfns[i] = migrate_device_pfn_lock(pfn); > + nr = folio_nr_pages(folio); > + if (nr > 1) { > + src_pfns[i] |= MIGRATE_PFN_COMPOUND; > + for (j = 1; j < nr; j++) > + src_pfns[i+j] = 0; > + i += j - 1; > + pfn += j - 1; > + } > + } > > migrate_device_unmap(src_pfns, npages, NULL); > > -- > 2.49.0 >
On Fri, Jul 04, 2025 at 09:35:03AM +1000, Balbir Singh wrote: > migrate_device code paths go through the collect, setup > and finalize phases of migration. Support for MIGRATE_PFN_COMPOUND > was added earlier in the series to mark THP pages as > MIGRATE_PFN_COMPOUND. > > The entries in src and dst arrays passed to these functions still > remain at a PAGE_SIZE granularity. When a compound page is passed, > the first entry has the PFN along with MIGRATE_PFN_COMPOUND > and other flags set (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the > remaining entries (HPAGE_PMD_NR - 1) are filled with 0's. This > representation allows for the compound page to be split into smaller > page sizes. > > migrate_vma_collect_hole(), migrate_vma_collect_pmd() are now THP > page aware. Two new helper functions migrate_vma_collect_huge_pmd() > and migrate_vma_insert_huge_pmd_page() have been added. > > migrate_vma_collect_huge_pmd() can collect THP pages, but if for > some reason this fails, there is fallback support to split the folio > and migrate it. > > migrate_vma_insert_huge_pmd_page() closely follows the logic of > migrate_vma_insert_page() > > Support for splitting pages as needed for migration will follow in > later patches in this series. 
> > Cc: Karol Herbst <kherbst@redhat.com> > Cc: Lyude Paul <lyude@redhat.com> > Cc: Danilo Krummrich <dakr@kernel.org> > Cc: David Airlie <airlied@gmail.com> > Cc: Simona Vetter <simona@ffwll.ch> > Cc: "Jérôme Glisse" <jglisse@redhat.com> > Cc: Shuah Khan <shuah@kernel.org> > Cc: David Hildenbrand <david@redhat.com> > Cc: Barry Song <baohua@kernel.org> > Cc: Baolin Wang <baolin.wang@linux.alibaba.com> > Cc: Ryan Roberts <ryan.roberts@arm.com> > Cc: Matthew Wilcox <willy@infradead.org> > Cc: Peter Xu <peterx@redhat.com> > Cc: Zi Yan <ziy@nvidia.com> > Cc: Kefeng Wang <wangkefeng.wang@huawei.com> > Cc: Jane Chu <jane.chu@oracle.com> > Cc: Alistair Popple <apopple@nvidia.com> > Cc: Donet Tom <donettom@linux.ibm.com> > > Signed-off-by: Balbir Singh <balbirs@nvidia.com> > --- > mm/migrate_device.c | 437 +++++++++++++++++++++++++++++++++++++------- > 1 file changed, 376 insertions(+), 61 deletions(-) > > diff --git a/mm/migrate_device.c b/mm/migrate_device.c > index e05e14d6eacd..41d0bd787969 100644 > --- a/mm/migrate_device.c > +++ b/mm/migrate_device.c > @@ -14,6 +14,7 @@ > #include <linux/pagewalk.h> > #include <linux/rmap.h> > #include <linux/swapops.h> > +#include <asm/pgalloc.h> > #include <asm/tlbflush.h> > #include "internal.h" > > @@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start, > if (!vma_is_anonymous(walk->vma)) > return migrate_vma_collect_skip(start, end, walk); > > + if (thp_migration_supported() && > + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && > + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && > + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { > + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | > + MIGRATE_PFN_COMPOUND; > + migrate->dst[migrate->npages] = 0; > + migrate->npages++; > + migrate->cpages++; It's a bit unclear what cpages and npages actually represent when collecting a THP. In my opinion, they should reflect the total number of minimum sized pages collected—i.e., we should increment by the shifted order (512) here. 
I'm fairly certain the logic in migrate_device.c would break if a 4MB range was requested and a THP was found first, followed by a non-THP. Matt > + > + /* > + * Collect the remaining entries as holes, in case we > + * need to split later > + */ > + return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); > + } > + > for (addr = start; addr < end; addr += PAGE_SIZE) { > migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; > migrate->dst[migrate->npages] = 0; > @@ -54,57 +72,148 @@ static int migrate_vma_collect_hole(unsigned long start, > return 0; > } > > -static int migrate_vma_collect_pmd(pmd_t *pmdp, > - unsigned long start, > - unsigned long end, > - struct mm_walk *walk) > +/** > + * migrate_vma_collect_huge_pmd - collect THP pages without splitting the > + * folio for device private pages. > + * @pmdp: pointer to pmd entry > + * @start: start address of the range for migration > + * @end: end address of the range for migration > + * @walk: mm_walk callback structure > + * > + * Collect the huge pmd entry at @pmdp for migration and set the > + * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that > + * migration will occur at HPAGE_PMD granularity > + */ > +static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, > + unsigned long end, struct mm_walk *walk, > + struct folio *fault_folio) > { > + struct mm_struct *mm = walk->mm; > + struct folio *folio; > struct migrate_vma *migrate = walk->private; > - struct folio *fault_folio = migrate->fault_page ? 
> - page_folio(migrate->fault_page) : NULL; > - struct vm_area_struct *vma = walk->vma; > - struct mm_struct *mm = vma->vm_mm; > - unsigned long addr = start, unmapped = 0; > spinlock_t *ptl; > - pte_t *ptep; > + swp_entry_t entry; > + int ret; > + unsigned long write = 0; > > -again: > - if (pmd_none(*pmdp)) > + ptl = pmd_lock(mm, pmdp); > + if (pmd_none(*pmdp)) { > + spin_unlock(ptl); > return migrate_vma_collect_hole(start, end, -1, walk); > + } > > if (pmd_trans_huge(*pmdp)) { > - struct folio *folio; > - > - ptl = pmd_lock(mm, pmdp); > - if (unlikely(!pmd_trans_huge(*pmdp))) { > + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { > spin_unlock(ptl); > - goto again; > + return migrate_vma_collect_skip(start, end, walk); > } > > folio = pmd_folio(*pmdp); > if (is_huge_zero_folio(folio)) { > spin_unlock(ptl); > - split_huge_pmd(vma, pmdp, addr); > - } else { > - int ret; > + return migrate_vma_collect_hole(start, end, -1, walk); > + } > + if (pmd_write(*pmdp)) > + write = MIGRATE_PFN_WRITE; > + } else if (!pmd_present(*pmdp)) { > + entry = pmd_to_swp_entry(*pmdp); > + folio = pfn_swap_entry_folio(entry); > + > + if (!is_device_private_entry(entry) || > + !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || > + (folio->pgmap->owner != migrate->pgmap_owner)) { > + spin_unlock(ptl); > + return migrate_vma_collect_skip(start, end, walk); > + } > > - folio_get(folio); > + if (is_migration_entry(entry)) { > + migration_entry_wait_on_locked(entry, ptl); > spin_unlock(ptl); > - /* FIXME: we don't expect THP for fault_folio */ > - if (WARN_ON_ONCE(fault_folio == folio)) > - return migrate_vma_collect_skip(start, end, > - walk); > - if (unlikely(!folio_trylock(folio))) > - return migrate_vma_collect_skip(start, end, > - walk); > - ret = split_folio(folio); > - if (fault_folio != folio) > - folio_unlock(folio); > - folio_put(folio); > - if (ret) > - return migrate_vma_collect_skip(start, end, > - walk); > + return -EAGAIN; > } > + > + if 
(is_writable_device_private_entry(entry)) > + write = MIGRATE_PFN_WRITE; > + } else { > + spin_unlock(ptl); > + return -EAGAIN; > + } > + > + folio_get(folio); > + if (folio != fault_folio && unlikely(!folio_trylock(folio))) { > + spin_unlock(ptl); > + folio_put(folio); > + return migrate_vma_collect_skip(start, end, walk); > + } > + > + if (thp_migration_supported() && > + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && > + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && > + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { > + > + struct page_vma_mapped_walk pvmw = { > + .ptl = ptl, > + .address = start, > + .pmd = pmdp, > + .vma = walk->vma, > + }; > + > + unsigned long pfn = page_to_pfn(folio_page(folio, 0)); > + > + migrate->src[migrate->npages] = migrate_pfn(pfn) | write > + | MIGRATE_PFN_MIGRATE > + | MIGRATE_PFN_COMPOUND; > + migrate->dst[migrate->npages++] = 0; > + migrate->cpages++; > + ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0)); > + if (ret) { > + migrate->npages--; > + migrate->cpages--; > + migrate->src[migrate->npages] = 0; > + migrate->dst[migrate->npages] = 0; > + goto fallback; > + } > + migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); > + spin_unlock(ptl); > + return 0; > + } > + > +fallback: > + spin_unlock(ptl); > + ret = split_folio(folio); > + if (fault_folio != folio) > + folio_unlock(folio); > + folio_put(folio); > + if (ret) > + return migrate_vma_collect_skip(start, end, walk); > + if (pmd_none(pmdp_get_lockless(pmdp))) > + return migrate_vma_collect_hole(start, end, -1, walk); > + > + return -ENOENT; > +} > + > +static int migrate_vma_collect_pmd(pmd_t *pmdp, > + unsigned long start, > + unsigned long end, > + struct mm_walk *walk) > +{ > + struct migrate_vma *migrate = walk->private; > + struct vm_area_struct *vma = walk->vma; > + struct mm_struct *mm = vma->vm_mm; > + unsigned long addr = start, unmapped = 0; > + spinlock_t *ptl; > + struct folio *fault_folio = migrate->fault_page ? 
> + page_folio(migrate->fault_page) : NULL; > + pte_t *ptep; > + > +again: > + if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) { > + int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio); > + > + if (ret == -EAGAIN) > + goto again; > + if (ret == 0) > + return 0; > } > > ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); > @@ -175,8 +284,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; > } > > - /* FIXME support THP */ > - if (!page || !page->mapping || PageTransCompound(page)) { > + if (!page || !page->mapping) { > mpfn = 0; > goto next; > } > @@ -347,14 +455,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) > */ > int extra = 1 + (page == fault_page); > > - /* > - * FIXME support THP (transparent huge page), it is bit more complex to > - * check them than regular pages, because they can be mapped with a pmd > - * or with a pte (split pte mapping). > - */ > - if (folio_test_large(folio)) > - return false; > - > /* Page from ZONE_DEVICE have one extra reference */ > if (folio_is_zone_device(folio)) > extra++; > @@ -385,17 +485,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, > > lru_add_drain(); > > - for (i = 0; i < npages; i++) { > + for (i = 0; i < npages; ) { > struct page *page = migrate_pfn_to_page(src_pfns[i]); > struct folio *folio; > + unsigned int nr = 1; > > if (!page) { > if (src_pfns[i] & MIGRATE_PFN_MIGRATE) > unmapped++; > - continue; > + goto next; > } > > folio = page_folio(page); > + nr = folio_nr_pages(folio); > + > + if (nr > 1) > + src_pfns[i] |= MIGRATE_PFN_COMPOUND; > + > + > /* ZONE_DEVICE folios are not on LRU */ > if (!folio_is_zone_device(folio)) { > if (!folio_test_lru(folio) && allow_drain) { > @@ -407,7 +514,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, > if (!folio_isolate_lru(folio)) { > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > restore++; > - continue; > + goto next; > } > > /* 
Drop the reference we took in collect */ > @@ -426,10 +533,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, > > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > restore++; > - continue; > + goto next; > } > > unmapped++; > +next: > + i += nr; > } > > for (i = 0; i < npages && restore; i++) { > @@ -575,6 +684,146 @@ int migrate_vma_setup(struct migrate_vma *args) > } > EXPORT_SYMBOL(migrate_vma_setup); > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > +/** > + * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm > + * at @addr. folio is already allocated as a part of the migration process with > + * large page. > + * > + * @folio needs to be initialized and setup after it's allocated. The code bits > + * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does > + * not support THP zero pages. > + * > + * @migrate: migrate_vma arguments > + * @addr: address where the folio will be inserted > + * @folio: folio to be inserted at @addr > + * @src: src pfn which is being migrated > + * @pmdp: pointer to the pmd > + */ > +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, > + unsigned long addr, > + struct page *page, > + unsigned long *src, > + pmd_t *pmdp) > +{ > + struct vm_area_struct *vma = migrate->vma; > + gfp_t gfp = vma_thp_gfp_mask(vma); > + struct folio *folio = page_folio(page); > + int ret; > + spinlock_t *ptl; > + pgtable_t pgtable; > + pmd_t entry; > + bool flush = false; > + unsigned long i; > + > + VM_WARN_ON_FOLIO(!folio, folio); > + VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); > + > + if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) > + return -EINVAL; > + > + ret = anon_vma_prepare(vma); > + if (ret) > + return ret; > + > + folio_set_order(folio, HPAGE_PMD_ORDER); > + folio_set_large_rmappable(folio); > + > + if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) { > + count_vm_event(THP_FAULT_FALLBACK); > + count_mthp_stat(HPAGE_PMD_ORDER, 
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > + ret = -ENOMEM; > + goto abort; > + } > + > + __folio_mark_uptodate(folio); > + > + pgtable = pte_alloc_one(vma->vm_mm); > + if (unlikely(!pgtable)) > + goto abort; > + > + if (folio_is_device_private(folio)) { > + swp_entry_t swp_entry; > + > + if (vma->vm_flags & VM_WRITE) > + swp_entry = make_writable_device_private_entry( > + page_to_pfn(page)); > + else > + swp_entry = make_readable_device_private_entry( > + page_to_pfn(page)); > + entry = swp_entry_to_pmd(swp_entry); > + } else { > + if (folio_is_zone_device(folio) && > + !folio_is_device_coherent(folio)) { > + goto abort; > + } > + entry = folio_mk_pmd(folio, vma->vm_page_prot); > + if (vma->vm_flags & VM_WRITE) > + entry = pmd_mkwrite(pmd_mkdirty(entry), vma); > + } > + > + ptl = pmd_lock(vma->vm_mm, pmdp); > + ret = check_stable_address_space(vma->vm_mm); > + if (ret) > + goto abort; > + > + /* > + * Check for userfaultfd but do not deliver the fault. Instead, > + * just back off. > + */ > + if (userfaultfd_missing(vma)) > + goto unlock_abort; > + > + if (!pmd_none(*pmdp)) { > + if (!is_huge_zero_pmd(*pmdp)) > + goto unlock_abort; > + flush = true; > + } else if (!pmd_none(*pmdp)) > + goto unlock_abort; > + > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); > + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); > + if (!folio_is_zone_device(folio)) > + folio_add_lru_vma(folio, vma); > + folio_get(folio); > + > + if (flush) { > + pte_free(vma->vm_mm, pgtable); > + flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE); > + pmdp_invalidate(vma, addr, pmdp); > + } else { > + pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable); > + mm_inc_nr_ptes(vma->vm_mm); > + } > + set_pmd_at(vma->vm_mm, addr, pmdp, entry); > + update_mmu_cache_pmd(vma, addr, pmdp); > + > + spin_unlock(ptl); > + > + count_vm_event(THP_FAULT_ALLOC); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); > + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); > + > + return 
0; > + > +unlock_abort: > + spin_unlock(ptl); > +abort: > + for (i = 0; i < HPAGE_PMD_NR; i++) > + src[i] &= ~MIGRATE_PFN_MIGRATE; > + return 0; > +} > +#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ > +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, > + unsigned long addr, > + struct page *page, > + unsigned long *src, > + pmd_t *pmdp) > +{ > + return 0; > +} > +#endif > + > /* > * This code closely matches the code in: > * __handle_mm_fault() > @@ -585,9 +834,10 @@ EXPORT_SYMBOL(migrate_vma_setup); > */ > static void migrate_vma_insert_page(struct migrate_vma *migrate, > unsigned long addr, > - struct page *page, > + unsigned long *dst, > unsigned long *src) > { > + struct page *page = migrate_pfn_to_page(*dst); > struct folio *folio = page_folio(page); > struct vm_area_struct *vma = migrate->vma; > struct mm_struct *mm = vma->vm_mm; > @@ -615,8 +865,25 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, > pmdp = pmd_alloc(mm, pudp, addr); > if (!pmdp) > goto abort; > - if (pmd_trans_huge(*pmdp)) > - goto abort; > + > + if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) { > + int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page, > + src, pmdp); > + if (ret) > + goto abort; > + return; > + } > + > + if (!pmd_none(*pmdp)) { > + if (pmd_trans_huge(*pmdp)) { > + if (!is_huge_zero_pmd(*pmdp)) > + goto abort; > + folio_get(pmd_folio(*pmdp)); > + split_huge_pmd(vma, pmdp, addr); > + } else if (pmd_leaf(*pmdp)) > + goto abort; > + } > + > if (pte_alloc(mm, pmdp)) > goto abort; > if (unlikely(anon_vma_prepare(vma))) > @@ -707,23 +974,24 @@ static void __migrate_device_pages(unsigned long *src_pfns, > unsigned long i; > bool notified = false; > > - for (i = 0; i < npages; i++) { > + for (i = 0; i < npages; ) { > struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); > struct page *page = migrate_pfn_to_page(src_pfns[i]); > struct address_space *mapping; > struct folio *newfolio, *folio; > int r, extra_cnt = 
0; > + unsigned long nr = 1; > > if (!newpage) { > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > - continue; > + goto next; > } > > if (!page) { > unsigned long addr; > > if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) > - continue; > + goto next; > > /* > * The only time there is no vma is when called from > @@ -741,15 +1009,47 @@ static void __migrate_device_pages(unsigned long *src_pfns, > migrate->pgmap_owner); > mmu_notifier_invalidate_range_start(&range); > } > - migrate_vma_insert_page(migrate, addr, newpage, > + > + if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) && > + (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { > + nr = HPAGE_PMD_NR; > + src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; > + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > + goto next; > + } > + > + migrate_vma_insert_page(migrate, addr, &dst_pfns[i], > &src_pfns[i]); > - continue; > + goto next; > } > > newfolio = page_folio(newpage); > folio = page_folio(page); > mapping = folio_mapping(folio); > > + /* > + * If THP migration is enabled, check if both src and dst > + * can migrate large pages > + */ > + if (thp_migration_supported()) { > + if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && > + (src_pfns[i] & MIGRATE_PFN_COMPOUND) && > + !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) { > + > + if (!migrate) { > + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | > + MIGRATE_PFN_COMPOUND); > + goto next; > + } > + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > + } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && > + (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && > + !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { > + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > + } > + } > + > + > if (folio_is_device_private(newfolio) || > folio_is_device_coherent(newfolio)) { > if (mapping) { > @@ -762,7 +1062,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, > if (!folio_test_anon(folio) || > !folio_free_swap(folio)) { > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > - continue; > + goto next; > } > } > } else if (folio_is_zone_device(newfolio)) { > @@ -770,7 +1070,7 @@ static void 
__migrate_device_pages(unsigned long *src_pfns, > * Other types of ZONE_DEVICE page are not supported. > */ > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > - continue; > + goto next; > } > > BUG_ON(folio_test_writeback(folio)); > @@ -782,6 +1082,8 @@ static void __migrate_device_pages(unsigned long *src_pfns, > src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; > else > folio_migrate_flags(newfolio, folio); > +next: > + i += nr; > } > > if (notified) > @@ -943,10 +1245,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn) > int migrate_device_range(unsigned long *src_pfns, unsigned long start, > unsigned long npages) > { > - unsigned long i, pfn; > + unsigned long i, j, pfn; > + > + for (pfn = start, i = 0; i < npages; pfn++, i++) { > + struct page *page = pfn_to_page(pfn); > + struct folio *folio = page_folio(page); > + unsigned int nr = 1; > > - for (pfn = start, i = 0; i < npages; pfn++, i++) > src_pfns[i] = migrate_device_pfn_lock(pfn); > + nr = folio_nr_pages(folio); > + if (nr > 1) { > + src_pfns[i] |= MIGRATE_PFN_COMPOUND; > + for (j = 1; j < nr; j++) > + src_pfns[i+j] = 0; > + i += j - 1; > + pfn += j - 1; > + } > + } > > migrate_device_unmap(src_pfns, npages, NULL); > > -- > 2.49.0 >
On 7/18/25 16:59, Matthew Brost wrote: > On Fri, Jul 04, 2025 at 09:35:03AM +1000, Balbir Singh wrote: >> + if (thp_migration_supported() && >> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && >> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && >> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { >> + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | >> + MIGRATE_PFN_COMPOUND; >> + migrate->dst[migrate->npages] = 0; >> + migrate->npages++; >> + migrate->cpages++; > > It's a bit unclear what cpages and npages actually represent when > collecting a THP. In my opinion, they should reflect the total number of > minimum sized pages collected—i.e., we should increment by the shifted > order (512) here. I'm fairly certain the logic in migrate_device.c would > break if a 4MB range was requested and a THP was found first, followed by a > non-THP. > cpages and npages represent entries in the array, and an entry that is or'ed with MIGRATE_PFN_COMPOUND represents the right number of entries populated. If you have a test that shows the breakage, I'd be keen to see it. We do populate the other entries in 4k size(s) when collecting, to allow for a split of the folio. Thanks for the review, Balbir Singh
On Fri, Jul 18, 2025 at 05:04:39PM +1000, Balbir Singh wrote: > On 7/18/25 16:59, Matthew Brost wrote: > > On Fri, Jul 04, 2025 at 09:35:03AM +1000, Balbir Singh wrote: > >> + if (thp_migration_supported() && > >> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && > >> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && > >> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { > >> + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | > >> + MIGRATE_PFN_COMPOUND; > >> + migrate->dst[migrate->npages] = 0; > >> + migrate->npages++; > >> + migrate->cpages++; > > > > It's a bit unclear what cpages and npages actually represent when > > collecting a THP. In my opinion, they should reflect the total number of > > minimum sized pages collected—i.e., we should increment by the shifted > > order (512) here. I'm fairly certain the logic in migrate_device.c would > > break if a 4MB range was requested and a THP was found first, followed by a > > non-THP. > > > > cpages and npages represent entries in the array and when or'ed with MIGRATE_PFN_COMPOUND > represent the right number of entries populated. If you have a test that shows > the breakage, I'd be keen to see it. We do populate other entries in 4k size(s) when > collecting to allow for a split of the folio. > I don’t have a test case, but let me quickly point out a logic bug. Look at migrate_device_unmap. The variable i is incremented by folio_nr_pages, which seems correct. However, in the earlier code, we populate migrate->src using migrate->npages as the index, then increment it by 1. So, if two THPs are found back to back, they’ll occupy entries 0 and 1, while migrate_device_unmap will access entries 0 and 512. Given that we have no idea what mix of THP vs non-THP we’ll encounter, the only sane approach is to populate the input array at minimum page-entry alignment. 
Similarly, npages and cpages should reflect the number of minimum-sized pages found, with the caller (and migrate_device) understanding that src and dst will be sparsely populated based on each entry’s folio order. Matt > Thanks for the review, > Balbir Singh
On Fri, Jul 18, 2025 at 12:21:36AM -0700, Matthew Brost wrote: > On Fri, Jul 18, 2025 at 05:04:39PM +1000, Balbir Singh wrote: > > On 7/18/25 16:59, Matthew Brost wrote: > > > On Fri, Jul 04, 2025 at 09:35:03AM +1000, Balbir Singh wrote: > > >> + if (thp_migration_supported() && > > >> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && > > >> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && > > >> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { > > >> + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | > > >> + MIGRATE_PFN_COMPOUND; > > >> + migrate->dst[migrate->npages] = 0; > > >> + migrate->npages++; > > >> + migrate->cpages++; > > > > > > It's a bit unclear what cpages and npages actually represent when > > > collecting a THP. In my opinion, they should reflect the total number of > > > minimum sized pages collected—i.e., we should increment by the shifted > > > order (512) here. I'm fairly certain the logic in migrate_device.c would > > > break if a 4MB range was requested and a THP was found first, followed by a > > > non-THP. > > > > > > > cpages and npages represent entries in the array and when or'ed with MIGRATE_PFN_COMPOUND > > represent the right number of entries populated. If you have a test that shows > > the breakage, I'd be keen to see it. We do populate other entries in 4k size(s) when > > collecting to allow for a split of the folio. > > > > I don’t have a test case, but let me quickly point out a logic bug. > > Look at migrate_device_unmap. The variable i is incremented by > folio_nr_pages, which seems correct. However, in the earlier code, we > populate migrate->src using migrate->npages as the index, then increment > it by 1. So, if two THPs are found back to back, they’ll occupy entries > 0 and 1, while migrate_device_unmap will access entries 0 and 512. > > Given that we have no idea what mix of THP vs non-THP we’ll encounter, > the only sane approach is to populate the input array at minimum > page-entry alignment. 
Similarly, npages and cpages should reflect the > number of minimum-sized pages found, with the caller (and > migrate_device) understanding that src and dst will be sparsely > populated based on each entry’s folio order. > I looked into this further and found another case where the logic breaks. In __migrate_device_pages, the call to migrate_vma_split_pages assumes that, based on the folio's order, it can populate subsequent entries upon split. This requires the source array to reflect the folio order upon finding it. Here’s a summary of how I believe the migrate_vma_setup interface should behave, assuming 4K pages and 2M THPs: Example A: 4MB requested, 2 THPs found and unmapped src[0]: folio, order 9, migrate flag set src[1–511]: not present src[512]: folio, order 9, migrate flag set src[513–1023]: not present npages = 1024, cpages = 1024 Example B: 4MB requested, 2 THPs found, first THP unmap fails src[0]: folio, order 9, migrate flag clear src[1–511]: not present src[512]: folio, order 9, migrate flag set src[513–1023]: not present npages = 1024, cpages = 512 Example C: 4MB requested, 512 small pages + 1 THP found, some small pages fail to unmap src[0–7]: folio, order 0, migrate flag clear src[8–511]: folio, order 0, migrate flag set src[512]: folio, order 9, migrate flag set src[513–1023]: not present npages = 1024, cpages = 1016 As I suggested in my previous reply to patch #2, this should be documented—preferably in kernel-doc—so the final behavior is clear to both migrate_device.c (and the structs in migrate.h) and the layers above. I can help take a pass at writing kernel-doc for both, as its behavior is fairly unclear even before your changes. Matt > Matt > > > Thanks for the review, > > Balbir Singh
On Fri, Jul 18, 2025 at 01:22:24AM -0700, Matthew Brost wrote: > On Fri, Jul 18, 2025 at 12:21:36AM -0700, Matthew Brost wrote: > > On Fri, Jul 18, 2025 at 05:04:39PM +1000, Balbir Singh wrote: > > > On 7/18/25 16:59, Matthew Brost wrote: > > > > On Fri, Jul 04, 2025 at 09:35:03AM +1000, Balbir Singh wrote: > > > >> + if (thp_migration_supported() && > > > >> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && > > > >> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && > > > >> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { > > > >> + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | > > > >> + MIGRATE_PFN_COMPOUND; > > > >> + migrate->dst[migrate->npages] = 0; > > > >> + migrate->npages++; > > > >> + migrate->cpages++; > > > > > > > > It's a bit unclear what cpages and npages actually represent when > > > > collecting a THP. In my opinion, they should reflect the total number of > > > > minimum sized pages collected—i.e., we should increment by the shifted > > > > order (512) here. I'm fairly certain the logic in migrate_device.c would > > > > break if a 4MB range was requested and a THP was found first, followed by a > > > > non-THP. > > > > > > > > > > cpages and npages represent entries in the array and when or'ed with MIGRATE_PFN_COMPOUND > > > represent the right number of entries populated. If you have a test that shows > > > the breakage, I'd be keen to see it. We do populate other entries in 4k size(s) when > > > collecting to allow for a split of the folio. > > > > > > > I don’t have a test case, but let me quickly point out a logic bug. > > > > Look at migrate_device_unmap. The variable i is incremented by > > folio_nr_pages, which seems correct. However, in the earlier code, we > > populate migrate->src using migrate->npages as the index, then increment > > it by 1. So, if two THPs are found back to back, they’ll occupy entries > > 0 and 1, while migrate_device_unmap will access entries 0 and 512. > > Ugh, ignore this logic bug explanation — I was wrong. 
I missed that migrate_vma_collect_skip increments npages to create the desired holes in the source array for folio splits or skip-over logic. But my point still stands regarding what cpages should represent — the total number of minimum-sized pages collected and unmapped, in an effort to keep the meaning of npages and cpages consistent. Matt > > Given that we have no idea what mix of THP vs non-THP we’ll encounter, > > the only sane approach is to populate the input array at minimum > > page-entry alignment. Similarly, npages and cpages should reflect the > > number of minimum-sized pages found, with the caller (and > > migrate_device) understanding that src and dst will be sparsely > > populated based on each entry’s folio order. > > > > I looked into this further and found another case where the logic breaks. > > In __migrate_device_pages, the call to migrate_vma_split_pages assumes > that based on folio's order it can populate subsequent entries upon > split. This requires the source array to reflect the folio order upon > finding it. 
> > Here’s a summary of how I believe the migrate_vma_setup interface should > behave, assuming 4K pages and 2M THPs: > > Example A: 4MB requested, 2 THPs found and unmapped > src[0]: folio, order 9, migrate flag set > src[1–511]: not present > src[512]: folio, order 9, migrate flag set > src[513–1023]: not present > npages = 1024, cpages = 1024 > > Example B: 4MB requested, 2 THPs found, first THP unmap fails > src[0]: folio, order 9, migrate flag clear > src[1–511]: not present > src[512]: folio, order 9, migrate flag set > src[513–1023]: not present > npages = 1024, cpages = 512 > > Example C: 4MB requested, 512 small pages + 1 THP found, some small pages fail to unmap > src[0–7]: folio, order 0, migrate flag clear > src[8–511]: folio, order 0, migrate flag set > src[512]: folio, order 9, migrate flag set > src[513–1023]: not present > npages = 1024, cpages = 1016 > > As I suggested in my previous reply to patch #2, this should be > documented—preferably in kernel-doc—so the final behavior is clear to > both migrate_device.c (and the structs in migrate.h) and the layers > above. I can help take a pass at writing kernel-doc for both, as its > behavior is fairly before you changes. > > Matt > > > Matt > > > > > Thanks for the review, > > > Balbir Singh
Hi Balbir, kernel test robot noticed the following build warnings: [auto build test WARNING on akpm-mm/mm-everything] [also build test WARNING on next-20250704] [cannot apply to akpm-mm/mm-nonmm-unstable shuah-kselftest/next shuah-kselftest/fixes linus/master v6.16-rc4] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Balbir-Singh/mm-zone_device-support-large-zone-device-private-folios/20250704-073807 base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything patch link: https://lore.kernel.org/r/20250703233511.2028395-5-balbirs%40nvidia.com patch subject: [v1 resend 04/12] mm/migrate_device: THP migration of zone device pages config: x86_64-randconfig-075-20250704 (https://download.01.org/0day-ci/archive/20250704/202507042336.o2mutGeh-lkp@intel.com/config) compiler: clang version 20.1.7 (https://github.com/llvm/llvm-project 6146a88f60492b520a36f8f8f3231e15f3cc6082) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250704/202507042336.o2mutGeh-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202507042336.o2mutGeh-lkp@intel.com/ All warnings (new ones prefixed by >>): >> Warning: mm/migrate_device.c:89 function parameter 'fault_folio' not described in 'migrate_vma_collect_huge_pmd' >> Warning: mm/migrate_device.c:707 function parameter 'page' not described in 'migrate_vma_insert_huge_pmd_page' Warning: mm/migrate_device.c:707 Excess function parameter 'folio' description in 'migrate_vma_insert_huge_pmd_page' -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
© 2016 - 2025 Red Hat, Inc.