MIGRATE_VMA_SELECT_COMPOUND will be used to select THP pages during
migrate_vma_setup(), and MIGRATE_PFN_COMPOUND will cause device pages
to be migrated as compound pages during device pfn migration.
migrate_device code paths go through the collect, setup
and finalize phases of migration.
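For illustration, here is a minimal sketch of how a driver might drive
these phases with the new selection flag (vma, addr and dev_pgmap_owner
are placeholders for driver-side state, not part of this patch; real
code would also not put the pfn arrays on the stack):

	unsigned long src_pfns[HPAGE_PMD_NR] = { 0 };
	unsigned long dst_pfns[HPAGE_PMD_NR] = { 0 };
	struct migrate_vma args = {
		.vma		= vma,
		.start		= addr,		/* HPAGE_PMD_SIZE aligned */
		.end		= addr + HPAGE_PMD_SIZE,
		.src		= src_pfns,
		.dst		= dst_pfns,
		.pgmap_owner	= dev_pgmap_owner,	/* hypothetical */
		.flags		= MIGRATE_VMA_SELECT_SYSTEM |
				  MIGRATE_VMA_SELECT_COMPOUND,
	};

	if (migrate_vma_setup(&args))		/* collect phase */
		return -EFAULT;
	/* allocate dst pages, copy data, fill dst_pfns via migrate_pfn() */
	migrate_vma_pages(&args);		/* migrate phase */
	migrate_vma_finalize(&args);		/* finalize phase */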
The entries in the src and dst arrays passed to these functions still
remain at a PAGE_SIZE granularity. When a compound page is passed, the
first entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags
set (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID); the remaining
(HPAGE_PMD_NR - 1) entries are filled with zeros. This representation
allows for the compound page to be split into smaller page sizes.
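Concretely, the collect phase produces entries along these lines for a
PMD-sized THP (a sketch of the representation described above;
head_page, nr and i are illustrative names):

	/* migrate_pfn() sets MIGRATE_PFN_VALID */
	src[0] = migrate_pfn(page_to_pfn(head_page)) |
		 MIGRATE_PFN_MIGRATE | MIGRATE_PFN_COMPOUND;
	for (i = 1; i < HPAGE_PMD_NR; i++)
		src[i] = 0;	/* holes, filled in if the folio is split */

	/* consumers check the compound bit before assuming PMD granularity */
	nr = (src[0] & MIGRATE_PFN_COMPOUND) ? HPAGE_PMD_NR : 1;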
migrate_vma_collect_hole() and migrate_vma_collect_pmd() are now THP
aware. Two new helper functions, migrate_vma_collect_huge_pmd() and
migrate_vma_insert_huge_pmd_page(), have been added.
migrate_vma_collect_huge_pmd() can collect THP pages, but if for
some reason this fails, there is fallback support to split the folio
and migrate it.
migrate_vma_insert_huge_pmd_page() closely follows the logic of
migrate_vma_insert_page().
Support for splitting pages as needed for migration will follow in
later patches in this series.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
include/linux/migrate.h | 2 +
mm/migrate_device.c | 456 ++++++++++++++++++++++++++++++++++------
2 files changed, 395 insertions(+), 63 deletions(-)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 1f0ac122c3bf..41b4cc05a450 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node)
#define MIGRATE_PFN_VALID (1UL << 0)
#define MIGRATE_PFN_MIGRATE (1UL << 1)
#define MIGRATE_PFN_WRITE (1UL << 3)
+#define MIGRATE_PFN_COMPOUND (1UL << 4)
#define MIGRATE_PFN_SHIFT 6
static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
@@ -143,6 +144,7 @@ enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
+ MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
};
struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index f45ef182287d..1dfcf4799ea5 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -14,6 +14,7 @@
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swapops.h>
+#include <linux/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
if (!vma_is_anonymous(walk->vma))
return migrate_vma_collect_skip(start, end, walk);
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+
+ /*
+ * Collect the remaining entries as holes, in case we
+ * need to split later
+ */
+ return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -101,57 +119,150 @@ static int migrate_vma_split_folio(struct folio *folio,
return 0;
}
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
+/**
+ * migrate_vma_collect_huge_pmd - collect THP pages without splitting
+ * the folio for device private pages.
+ * @pmdp: pointer to pmd entry
+ * @start: start address of the range for migration
+ * @end: end address of the range for migration
+ * @walk: mm_walk callback structure
+ * @fault_folio: folio associated with migrate->fault_page, if any
+ *
+ * Collect the huge pmd entry at @pmdp for migration and set the
+ * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
+ * migration will occur at HPAGE_PMD_SIZE granularity.
+ */
+static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
+ unsigned long end, struct mm_walk *walk,
+ struct folio *fault_folio)
{
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
struct migrate_vma *migrate = walk->private;
- struct folio *fault_folio = migrate->fault_page ?
- page_folio(migrate->fault_page) : NULL;
- struct vm_area_struct *vma = walk->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
spinlock_t *ptl;
- pte_t *ptep;
+ swp_entry_t entry;
+ int ret;
+ unsigned long write = 0;
-again:
- if (pmd_none(*pmdp))
+ ptl = pmd_lock(mm, pmdp);
+ if (pmd_none(*pmdp)) {
+ spin_unlock(ptl);
return migrate_vma_collect_hole(start, end, -1, walk);
+ }
if (pmd_trans_huge(*pmdp)) {
- struct folio *folio;
-
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_trans_huge(*pmdp))) {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
spin_unlock(ptl);
- goto again;
+ return migrate_vma_collect_skip(start, end, walk);
}
folio = pmd_folio(*pmdp);
if (is_huge_zero_folio(folio)) {
spin_unlock(ptl);
- split_huge_pmd(vma, pmdp, addr);
- } else {
- int ret;
+ return migrate_vma_collect_hole(start, end, -1, walk);
+ }
+ if (pmd_write(*pmdp))
+ write = MIGRATE_PFN_WRITE;
+ } else if (!pmd_present(*pmdp)) {
+ entry = pmd_to_swp_entry(*pmdp);
+ folio = pfn_swap_entry_folio(entry);
+
+ if (!is_device_private_entry(entry) ||
+ !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ (folio->pgmap->owner != migrate->pgmap_owner)) {
+ spin_unlock(ptl);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
- folio_get(folio);
+ if (is_migration_entry(entry)) {
+ migration_entry_wait_on_locked(entry, ptl);
spin_unlock(ptl);
- /* FIXME: we don't expect THP for fault_folio */
- if (WARN_ON_ONCE(fault_folio == folio))
- return migrate_vma_collect_skip(start, end,
- walk);
- if (unlikely(!folio_trylock(folio)))
- return migrate_vma_collect_skip(start, end,
- walk);
- ret = split_folio(folio);
- if (fault_folio != folio)
- folio_unlock(folio);
- folio_put(folio);
- if (ret)
- return migrate_vma_collect_skip(start, end,
- walk);
+ return -EAGAIN;
+ }
+
+ if (is_writable_device_private_entry(entry))
+ write = MIGRATE_PFN_WRITE;
+ } else {
+ spin_unlock(ptl);
+ return -EAGAIN;
+ }
+
+ folio_get(folio);
+ if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
+ spin_unlock(ptl);
+ folio_put(folio);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
+
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+
+ struct page_vma_mapped_walk pvmw = {
+ .ptl = ptl,
+ .address = start,
+ .pmd = pmdp,
+ .vma = walk->vma,
+ };
+
+ unsigned long pfn = page_to_pfn(folio_page(folio, 0));
+
+ migrate->src[migrate->npages] = migrate_pfn(pfn) | write
+ | MIGRATE_PFN_MIGRATE
+ | MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages++] = 0;
+ migrate->cpages++;
+ ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
+ if (ret) {
+ migrate->npages--;
+ migrate->cpages--;
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ goto fallback;
}
+ migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ spin_unlock(ptl);
+ return 0;
+ }
+
+fallback:
+ spin_unlock(ptl);
+ if (!folio_test_large(folio))
+ goto done;
+ ret = split_folio(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
+ folio_put(folio);
+ if (ret)
+ return migrate_vma_collect_skip(start, end, walk);
+ if (pmd_none(pmdp_get_lockless(pmdp)))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+done:
+ return -ENOENT;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
+ pte_t *ptep;
+
+again:
+ if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) {
+ int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio);
+
+ if (ret == -EAGAIN)
+ goto again;
+ if (ret == 0)
+ return 0;
}
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
@@ -269,8 +380,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
- /* FIXME support THP */
- if (!page || !page->mapping || PageTransCompound(page)) {
+ if (!page || !page->mapping) {
mpfn = 0;
goto next;
}
@@ -441,14 +551,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
*/
int extra = 1 + (page == fault_page);
- /*
- * FIXME support THP (transparent huge page), it is bit more complex to
- * check them than regular pages, because they can be mapped with a pmd
- * or with a pte (split pte mapping).
- */
- if (folio_test_large(folio))
- return false;
-
/* Page from ZONE_DEVICE have one extra reference */
if (folio_is_zone_device(folio))
extra++;
@@ -479,17 +581,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
lru_add_drain();
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
+ unsigned int nr = 1;
if (!page) {
if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
unmapped++;
- continue;
+ goto next;
}
folio = page_folio(page);
+ nr = folio_nr_pages(folio);
+
+ if (nr > 1)
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+
/* ZONE_DEVICE folios are not on LRU */
if (!folio_is_zone_device(folio)) {
if (!folio_test_lru(folio) && allow_drain) {
@@ -501,7 +610,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
if (!folio_isolate_lru(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
/* Drop the reference we took in collect */
@@ -520,10 +629,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
unmapped++;
+next:
+ i += nr;
}
for (i = 0; i < npages && restore; i++) {
@@ -669,6 +780,147 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * migrate_vma_insert_huge_pmd_page - insert a huge page into @migrate->vma->vm_mm
+ * at @addr. The page has already been allocated as a large page as part of
+ * the migration process.
+ *
+ * The folio needs to be initialized and set up after it is allocated. The
+ * code here closely follows __do_huge_pmd_anonymous_page(). This API does
+ * not support THP zero pages.
+ *
+ * @migrate: migrate_vma arguments
+ * @addr: address where the page will be inserted
+ * @page: page to be inserted at @addr
+ * @src: src pfn entry which is being migrated
+ * @pmdp: pointer to the pmd
+ */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ struct folio *folio = page_folio(page);
+ int ret;
+ vm_fault_t csa_ret;
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ pmd_t entry;
+ bool flush = false;
+ unsigned long i;
+
+ VM_WARN_ON_FOLIO(!folio, folio);
+ VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+
+ if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
+ return -EINVAL;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ return ret;
+
+ folio_set_order(folio, HPAGE_PMD_ORDER);
+ folio_set_large_rmappable(folio);
+
+ if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ ret = -ENOMEM;
+ goto abort;
+ }
+
+ __folio_mark_uptodate(folio);
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable))
+ goto abort;
+
+ if (folio_is_device_private(folio)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pmd(swp_entry);
+ } else {
+ if (folio_is_zone_device(folio) &&
+ !folio_is_device_coherent(folio)) {
+ goto abort;
+ }
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pmd_mkwrite(pmd_mkdirty(entry), vma);
+ }
+
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+	csa_ret = check_stable_address_space(vma->vm_mm);
+	if (csa_ret)
+		goto unlock_abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+	if (!pmd_none(*pmdp)) {
+		if (!is_huge_zero_pmd(*pmdp))
+			goto unlock_abort;
+		flush = true;
+	}
+
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
+ if (!folio_is_zone_device(folio))
+ folio_add_lru_vma(folio, vma);
+ folio_get(folio);
+
+ if (flush) {
+ pte_free(vma->vm_mm, pgtable);
+		flush_cache_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, addr, pmdp);
+ } else {
+ pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ set_pmd_at(vma->vm_mm, addr, pmdp, entry);
+ update_mmu_cache_pmd(vma, addr, pmdp);
+
+ spin_unlock(ptl);
+
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+ return 0;
+
+unlock_abort:
+ spin_unlock(ptl);
+abort:
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ src[i] &= ~MIGRATE_PFN_MIGRATE;
+ return 0;
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ return 0;
+}
+#endif
+
/*
* This code closely matches the code in:
* __handle_mm_fault()
@@ -679,9 +931,10 @@ EXPORT_SYMBOL(migrate_vma_setup);
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
- struct page *page,
+ unsigned long *dst,
unsigned long *src)
{
+ struct page *page = migrate_pfn_to_page(*dst);
struct folio *folio = page_folio(page);
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -709,8 +962,25 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp))
- goto abort;
+
+ if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) {
+ int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page,
+ src, pmdp);
+ if (ret)
+ goto abort;
+ return;
+ }
+
+ if (!pmd_none(*pmdp)) {
+ if (pmd_trans_huge(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto abort;
+ folio_get(pmd_folio(*pmdp));
+ split_huge_pmd(vma, pmdp, addr);
+ } else if (pmd_leaf(*pmdp))
+ goto abort;
+ }
+
if (pte_alloc(mm, pmdp))
goto abort;
if (unlikely(anon_vma_prepare(vma)))
@@ -801,23 +1071,24 @@ static void __migrate_device_pages(unsigned long *src_pfns,
unsigned long i;
bool notified = false;
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
struct folio *newfolio, *folio;
int r, extra_cnt = 0;
+ unsigned long nr = 1;
if (!newpage) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
if (!page) {
unsigned long addr;
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ goto next;
/*
* The only time there is no vma is when called from
@@ -835,15 +1106,47 @@ static void __migrate_device_pages(unsigned long *src_pfns,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
- migrate_vma_insert_page(migrate, addr, newpage,
+
+ if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
+ nr = HPAGE_PMD_NR;
+ src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ goto next;
+ }
+
+ migrate_vma_insert_page(migrate, addr, &dst_pfns[i],
&src_pfns[i]);
- continue;
+ goto next;
}
newfolio = page_folio(newpage);
folio = page_folio(page);
mapping = folio_mapping(folio);
+ /*
+ * If THP migration is enabled, check if both src and dst
+ * can migrate large pages
+ */
+ if (thp_migration_supported()) {
+ if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+
+ if (!migrate) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+ }
+
if (folio_is_device_private(newfolio) ||
folio_is_device_coherent(newfolio)) {
if (mapping) {
@@ -856,7 +1159,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (!folio_test_anon(folio) ||
!folio_free_swap(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
}
} else if (folio_is_zone_device(newfolio)) {
@@ -864,7 +1167,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
* Other types of ZONE_DEVICE page are not supported.
*/
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
BUG_ON(folio_test_writeback(folio));
@@ -876,6 +1179,8 @@ static void __migrate_device_pages(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
+next:
+ i += nr;
}
if (notified)
@@ -1037,10 +1342,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn)
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages)
{
- unsigned long i, pfn;
+ unsigned long i, j, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
- for (pfn = start, i = 0; i < npages; pfn++, i++)
src_pfns[i] = migrate_device_pfn_lock(pfn);
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j - 1;
+ pfn += j - 1;
+ }
+ }
migrate_device_unmap(src_pfns, npages, NULL);
@@ -1058,10 +1376,22 @@ EXPORT_SYMBOL(migrate_device_range);
*/
int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
{
- unsigned long i;
+ unsigned long i, j;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = pfn_to_page(src_pfns[i]);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
- for (i = 0; i < npages; i++)
src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j - 1;
+ }
+ }
migrate_device_unmap(src_pfns, npages, NULL);
--
2.50.1
sending again for the v5 thread..
On 9/8/25 03:04, Balbir Singh wrote:
> [...]
>
> @@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
> if (!vma_is_anonymous(walk->vma))
> return migrate_vma_collect_skip(start, end, walk);
>
> + if (thp_migration_supported() &&
> + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
> + (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
> + IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
> + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
> + MIGRATE_PFN_COMPOUND;
> + migrate->dst[migrate->npages] = 0;
> + migrate->npages++;
> + migrate->cpages++;
> +
> + /*
> + * Collect the remaining entries as holes, in case we
> + * need to split later
> + */
> + return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
> + }
> +
seems you have to split_huge_pmd() for the huge zero page here in case
of !thp_migration_supported() afaics
> for (addr = start; addr < end; addr += PAGE_SIZE) {
On 9/11/25 21:52, Mika Penttilä wrote:
> sending again for the v5 thread..
>
> On 9/8/25 03:04, Balbir Singh wrote:
>
>> [...]
>
> seems you have to split_huge_pmd() for the huge zero page here in case
> of !thp_migration_supported() afaics
>
Not really, if pfn is 0, we do a vm_insert_page (please see if (!page) line 1107) and
folio handling in migrate_vma_finalize line 1284
Thanks,
Balbir
On 9/12/25 08:04, Balbir Singh wrote:
> On 9/11/25 21:52, Mika Penttilä wrote:
>> sending again for the v5 thread..
>>
>> On 9/8/25 03:04, Balbir Singh wrote:
>>
>>> [...]
>> seems you have to split_huge_pmd() for the huge zero page here in case
>> of !thp_migration_supported() afaics
>>
> Not really, if pfn is 0, we do a vm_insert_page (please see if (!page) line 1107) and
> folio handling in migrate_vma_finalize line 1284
Ok actually seems it is handled by migrate_vma_insert_page() which does
if (!pmd_none(*pmdp)) {
if (pmd_trans_huge(*pmdp)) {
if (!is_huge_zero_pmd(*pmdp))
goto abort;
folio_get(pmd_folio(*pmdp));
split_huge_pmd(vma, pmdp, addr); <----- here
} else if (pmd_leaf(*pmdp))
goto abort;
}
>
> Thanks,
> Balbir
>
--Mika
On 9/12/25 08:28, Mika Penttilä wrote:
> On 9/12/25 08:04, Balbir Singh wrote:
>
>> On 9/11/25 21:52, Mika Penttilä wrote:
>>> sending again for the v5 thread..
>>>
>>> On 9/8/25 03:04, Balbir Singh wrote:
>>>
>>>> [...]
>>> seems you have to split_huge_pmd() for the huge zero page here in case
>>> of !thp_migration_supported() afaics
>>>
>> Not really, if pfn is 0, we do a vm_insert_page (please see if (!page) line 1107) and
>> folio handling in migrate_vma_finalize line 1284
> Ok actually seems it is handled by migrate_vma_insert_page() which does
>
> if (!pmd_none(*pmdp)) {
> if (pmd_trans_huge(*pmdp)) {
> if (!is_huge_zero_pmd(*pmdp))
> goto abort;
> folio_get(pmd_folio(*pmdp));
> split_huge_pmd(vma, pmdp, addr); <----- here
> } else if (pmd_leaf(*pmdp))
> goto abort;
> }
>
While at it, I think the folio_get(pmd_folio(*pmdp)); is wrong here:
we split the pmd for the huge zero page.
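i.e. something like this (untested sketch):

	if (!pmd_none(*pmdp)) {
		if (pmd_trans_huge(*pmdp)) {
			if (!is_huge_zero_pmd(*pmdp))
				goto abort;
			/* huge zero pmd: split without taking an
			 * extra folio reference */
			split_huge_pmd(vma, pmdp, addr);
		} else if (pmd_leaf(*pmdp))
			goto abort;
	}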
>> Thanks,
>> Balbir
>>
> --Mika
>
On 9/12/25 15:38, Mika Penttilä wrote:
>
> On 9/12/25 08:28, Mika Penttilä wrote:
>
>> On 9/12/25 08:04, Balbir Singh wrote:
>>
>>> On 9/11/25 21:52, Mika Penttilä wrote:
>>>> sending again for the v5 thread..
>>>>
>>>> On 9/8/25 03:04, Balbir Singh wrote:
>>>>
>>>>> [...]
>>>> seems you have to split_huge_pmd() for the huge zero page here in case
>>>> of !thp_migration_supported() afaics
>>>>
>>> Not really, if pfn is 0, we do a vm_insert_page (please see if (!page) line 1107) and
>>> folio handling in migrate_vma_finalize line 1284
>> Ok actually seems it is handled by migrate_vma_insert_page() which does
>>
>> if (!pmd_none(*pmdp)) {
>> if (pmd_trans_huge(*pmdp)) {
>> if (!is_huge_zero_pmd(*pmdp))
>> goto abort;
>> folio_get(pmd_folio(*pmdp));
>> split_huge_pmd(vma, pmdp, addr); <----- here
>> } else if (pmd_leaf(*pmdp))
>> goto abort;
>> }
>>
> While at it, I think the folio_get(pmd_folio(*pmdp)); is wrong here:
> we split the pmd for the huge zero page.
>
Ack, will do
Thanks for the review
Balbir