Support splitting pages during THP zone device migration as needed.

The common case is that, after setup, the destination is unable to
allocate MIGRATE_PFN_COMPOUND pages during the migrate phase, so the
source THP has to be split before the copy can proceed.

Add a new routine, migrate_vma_split_pages(), to support splitting of
already isolated pages. The pages being migrated are already unmapped
and marked for migration during setup (via unmap). __folio_split() and
__split_huge_page_to_list_to_order() take an additional "unmapped"
argument so that such folios are not unmapped and remapped again, and
the folio is not unlocked/put at the end of the split.
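
As an illustration of the fallback, a driver's destination-allocation
step might look roughly like the sketch below. This is not part of this
patch; alloc_device_folio() and alloc_device_page() are hypothetical
driver helpers used only for the example:

        if (migrate->src[i] & MIGRATE_PFN_COMPOUND) {
                /* Try to allocate a compound destination folio first. */
                struct folio *dfolio = alloc_device_folio(HPAGE_PMD_ORDER); /* hypothetical */

                if (dfolio) {
                        migrate->dst[i] = migrate_pfn(folio_pfn(dfolio)) |
                                          MIGRATE_PFN_COMPOUND;
                } else {
                        unsigned long j;

                        /*
                         * Fall back to order-0 destination pages and leave
                         * MIGRATE_PFN_COMPOUND clear; __migrate_device_pages()
                         * then splits the source folio via
                         * migrate_vma_split_pages().
                         */
                        for (j = 0; j < HPAGE_PMD_NR; j++) {
                                struct page *dpage = alloc_device_page(); /* hypothetical */

                                if (!dpage)
                                        break;
                                migrate->dst[i + j] = migrate_pfn(page_to_pfn(dpage));
                        }
                }
        }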
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
include/linux/huge_mm.h | 11 +++++--
mm/huge_memory.c | 46 ++++++++++++++-------------
mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++++++-------
3 files changed, 91 insertions(+), 35 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 56fdcaf7604b..19e7e3b7c2b7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -343,9 +343,9 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
vm_flags_t vm_flags);
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
- unsigned int new_order);
int split_device_private_folio(struct folio *folio);
+int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order, bool unmapped);
int min_order_for_split(struct folio *folio);
int split_folio_to_list(struct folio *folio, struct list_head *list);
bool uniform_split_supported(struct folio *folio, unsigned int new_order,
@@ -354,6 +354,13 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
bool warns);
int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
struct list_head *list);
+
+static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
+{
+ return __split_huge_page_to_list_to_order(page, list, new_order, false);
+}
+
/*
* try_folio_split - try to split a @folio at @page using non uniform split.
* @folio: folio to be split
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 75b368e7e33f..1fc1efa219c8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3538,15 +3538,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
new_folio->mapping = folio->mapping;
new_folio->index = folio->index + i;
- /*
- * page->private should not be set in tail pages. Fix up and warn once
- * if private is unexpectedly set.
- */
- if (unlikely(new_folio->private)) {
- VM_WARN_ON_ONCE_PAGE(true, new_head);
- new_folio->private = NULL;
- }
-
if (folio_test_swapcache(folio))
new_folio->swap.val = folio->swap.val + i;
@@ -3775,6 +3766,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
* @lock_at: a page within @folio to be left locked to caller
* @list: after-split folios will be put on it if non NULL
* @uniform_split: perform uniform split or not (non-uniform split)
+ * @unmapped: The pages are already unmapped and their PTEs are migration entries.
*
* It calls __split_unmapped_folio() to perform uniform and non-uniform split.
* It is in charge of checking whether the split is supported or not and
@@ -3790,7 +3782,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
*/
static int __folio_split(struct folio *folio, unsigned int new_order,
struct page *split_at, struct page *lock_at,
- struct list_head *list, bool uniform_split)
+ struct list_head *list, bool uniform_split, bool unmapped)
{
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
@@ -3840,13 +3832,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
* is taken to serialise against parallel split or collapse
* operations.
*/
- anon_vma = folio_get_anon_vma(folio);
- if (!anon_vma) {
- ret = -EBUSY;
- goto out;
+ if (!unmapped) {
+ anon_vma = folio_get_anon_vma(folio);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ anon_vma_lock_write(anon_vma);
}
mapping = NULL;
- anon_vma_lock_write(anon_vma);
} else {
unsigned int min_order;
gfp_t gfp;
@@ -3913,7 +3907,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
goto out_unlock;
}
- unmap_folio(folio);
+ if (!unmapped)
+ unmap_folio(folio);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
@@ -4000,10 +3995,13 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
next = folio_next(new_folio);
+ zone_device_private_split_cb(folio, new_folio);
+
expected_refs = folio_expected_ref_count(new_folio) + 1;
folio_ref_unfreeze(new_folio, expected_refs);
- lru_add_split_folio(folio, new_folio, lruvec, list);
+ if (!unmapped)
+ lru_add_split_folio(folio, new_folio, lruvec, list);
/*
* Anonymous folio with swap cache.
@@ -4037,6 +4035,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
folio_put_refs(new_folio, nr_pages);
}
+ zone_device_private_split_cb(folio, NULL);
/*
* Unfreeze @folio only after all page cache entries, which
* used to point to it, have been updated with new folios.
@@ -4060,11 +4059,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
local_irq_enable();
+ if (unmapped)
+ return ret;
+
if (nr_shmem_dropped)
shmem_uncharge(mapping->host, nr_shmem_dropped);
if (!ret && is_anon)
remap_flags = RMP_USE_SHARED_ZEROPAGE;
+
remap_page(folio, 1 << order, remap_flags);
/*
@@ -4149,12 +4152,13 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
* Returns -EINVAL when trying to split to an order that is incompatible
* with the folio. Splitting to order 0 is compatible with all folios.
*/
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
- unsigned int new_order)
+int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order, bool unmapped)
{
struct folio *folio = page_folio(page);
- return __folio_split(folio, new_order, &folio->page, page, list, true);
+ return __folio_split(folio, new_order, &folio->page, page, list, true,
+ unmapped);
}
/*
@@ -4183,7 +4187,7 @@ int folio_split(struct folio *folio, unsigned int new_order,
struct page *split_at, struct list_head *list)
{
return __folio_split(folio, new_order, split_at, &folio->page, list,
- false);
+ false, false);
}
int min_order_for_split(struct folio *folio)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 4c3334cc3228..49962ea19109 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -816,6 +816,29 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
src[i] &= ~MIGRATE_PFN_MIGRATE;
return 0;
}
+
+static int migrate_vma_split_pages(struct migrate_vma *migrate,
+ unsigned long idx, unsigned long addr,
+ struct folio *folio)
+{
+ unsigned long i;
+ unsigned long pfn;
+ unsigned long flags;
+ int ret = 0;
+
+ folio_get(folio);
+ split_huge_pmd_address(migrate->vma, addr, true);
+ ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL,
+ 0, true);
+ if (ret)
+ return ret;
+ migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
+ flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1);
+ pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT;
+ for (i = 1; i < HPAGE_PMD_NR; i++)
+ migrate->src[i+idx] = migrate_pfn(pfn + i) | flags;
+ return ret;
+}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
unsigned long addr,
@@ -825,6 +848,11 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
{
return 0;
}
+
+static void migrate_vma_split_pages(struct migrate_vma *migrate,
+ unsigned long idx, unsigned long addr,
+ struct folio *folio)
+{}
#endif
/*
@@ -974,8 +1002,9 @@ static void __migrate_device_pages(unsigned long *src_pfns,
struct migrate_vma *migrate)
{
struct mmu_notifier_range range;
- unsigned long i;
+ unsigned long i, j;
bool notified = false;
+ unsigned long addr;
for (i = 0; i < npages; ) {
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
@@ -1017,12 +1046,16 @@ static void __migrate_device_pages(unsigned long *src_pfns,
(!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
nr = HPAGE_PMD_NR;
src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
- src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- goto next;
+ } else {
+ nr = 1;
}
- migrate_vma_insert_page(migrate, addr, &dst_pfns[i],
- &src_pfns[i]);
+ for (j = 0; j < nr && i + j < npages; j++) {
+ src_pfns[i+j] |= MIGRATE_PFN_MIGRATE;
+ migrate_vma_insert_page(migrate,
+ addr + j * PAGE_SIZE,
+ &dst_pfns[i+j], &src_pfns[i+j]);
+ }
goto next;
}
@@ -1044,7 +1077,14 @@ static void __migrate_device_pages(unsigned long *src_pfns,
MIGRATE_PFN_COMPOUND);
goto next;
}
- src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ nr = 1 << folio_order(folio);
+ addr = migrate->start + i * PAGE_SIZE;
+ if (migrate_vma_split_pages(migrate, i, addr,
+ folio)) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
} else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
(dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
!(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
@@ -1079,12 +1119,17 @@ static void __migrate_device_pages(unsigned long *src_pfns,
BUG_ON(folio_test_writeback(folio));
if (migrate && migrate->fault_page == page)
- extra_cnt = 1;
- r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r != MIGRATEPAGE_SUCCESS)
- src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- else
- folio_migrate_flags(newfolio, folio);
+ extra_cnt++;
+ for (j = 0; j < nr && i + j < npages; j++) {
+ folio = page_folio(migrate_pfn_to_page(src_pfns[i+j]));
+ newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j]));
+
+ r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
+ if (r != MIGRATEPAGE_SUCCESS)
+ src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE;
+ else
+ folio_migrate_flags(newfolio, folio);
+ }
next:
i += nr;
}
--
2.50.1
Hi Balbir,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20250731]
[cannot apply to akpm-mm/mm-nonmm-unstable shuah-kselftest/next shuah-kselftest/fixes linus/master v6.16]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Balbir-Singh/mm-zone_device-support-large-zone-device-private-folios/20250730-172600
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20250730092139.3890844-8-balbirs%40nvidia.com
patch subject: [v2 07/11] mm/thp: add split during migration support
config: x86_64-randconfig-071-20250731 (https://download.01.org/0day-ci/archive/20250731/202507311724.mavZerV1-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250731/202507311724.mavZerV1-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507311724.mavZerV1-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/migrate_device.c:1082:5: error: statement requires expression of scalar type ('void' invalid)
    1082 |                 if (migrate_vma_split_pages(migrate, i, addr,
         |                     ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    1083 |                                 folio)) {
         |                                 ~~~~~~
   1 error generated.


vim +1082 mm/migrate_device.c

   999
  1000  static void __migrate_device_pages(unsigned long *src_pfns,
  1001                          unsigned long *dst_pfns, unsigned long npages,
  1002                          struct migrate_vma *migrate)
  1003  {
  1004          struct mmu_notifier_range range;
  1005          unsigned long i, j;
  1006          bool notified = false;
  1007          unsigned long addr;
  1008
  1009          for (i = 0; i < npages; ) {
  1010                  struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
  1011                  struct page *page = migrate_pfn_to_page(src_pfns[i]);
  1012                  struct address_space *mapping;
  1013                  struct folio *newfolio, *folio;
  1014                  int r, extra_cnt = 0;
  1015                  unsigned long nr = 1;
  1016
  1017                  if (!newpage) {
  1018                          src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
  1019                          goto next;
  1020                  }
  1021
  1022                  if (!page) {
  1023                          unsigned long addr;
  1024
  1025                          if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
  1026                                  goto next;
  1027
  1028                          /*
  1029                           * The only time there is no vma is when called from
  1030                           * migrate_device_coherent_folio(). However this isn't
  1031                           * called if the page could not be unmapped.
  1032                           */
  1033                          VM_BUG_ON(!migrate);
  1034                          addr = migrate->start + i*PAGE_SIZE;
  1035                          if (!notified) {
  1036                                  notified = true;
  1037
  1038                                  mmu_notifier_range_init_owner(&range,
  1039                                          MMU_NOTIFY_MIGRATE, 0,
  1040                                          migrate->vma->vm_mm, addr, migrate->end,
  1041                                          migrate->pgmap_owner);
  1042                                  mmu_notifier_invalidate_range_start(&range);
  1043                          }
  1044
  1045                          if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
  1046                                  (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
  1047                                  nr = HPAGE_PMD_NR;
  1048                                  src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
  1049                          } else {
  1050                                  nr = 1;
  1051                          }
  1052
  1053                          for (j = 0; j < nr && i + j < npages; j++) {
  1054                                  src_pfns[i+j] |= MIGRATE_PFN_MIGRATE;
  1055                                  migrate_vma_insert_page(migrate,
  1056                                          addr + j * PAGE_SIZE,
  1057                                          &dst_pfns[i+j], &src_pfns[i+j]);
  1058                          }
  1059                          goto next;
  1060                  }
  1061
  1062                  newfolio = page_folio(newpage);
  1063                  folio = page_folio(page);
  1064                  mapping = folio_mapping(folio);
  1065
  1066                  /*
  1067                   * If THP migration is enabled, check if both src and dst
  1068                   * can migrate large pages
  1069                   */
  1070                  if (thp_migration_supported()) {
  1071                          if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
  1072                                  (src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
  1073                                  !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
  1074
  1075                                  if (!migrate) {
  1076                                          src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
  1077                                                           MIGRATE_PFN_COMPOUND);
  1078                                          goto next;
  1079                                  }
  1080                                  nr = 1 << folio_order(folio);
  1081                                  addr = migrate->start + i * PAGE_SIZE;
> 1082                                  if (migrate_vma_split_pages(migrate, i, addr,
  1083                                                  folio)) {
  1084                                          src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
  1085                                                           MIGRATE_PFN_COMPOUND);
  1086                                          goto next;
  1087                                  }
  1088                          } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
  1089                                  (dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
  1090                                  !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
  1091                                  src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
  1092                          }
  1093                  }
  1094
  1095
  1096                  if (folio_is_device_private(newfolio) ||
  1097                      folio_is_device_coherent(newfolio)) {
  1098                          if (mapping) {
  1099                                  /*
  1100                                   * For now only support anonymous memory migrating to
  1101                                   * device private or coherent memory.
  1102                                   *
  1103                                   * Try to get rid of swap cache if possible.
  1104                                   */
  1105                                  if (!folio_test_anon(folio) ||
  1106                                      !folio_free_swap(folio)) {
  1107                                          src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
  1108                                          goto next;
  1109                                  }
  1110                          }
  1111                  } else if (folio_is_zone_device(newfolio)) {
  1112                          /*
  1113                           * Other types of ZONE_DEVICE page are not supported.
  1114                           */
  1115                          src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
  1116                          goto next;
  1117                  }
  1118
  1119                  BUG_ON(folio_test_writeback(folio));
  1120
  1121                  if (migrate && migrate->fault_page == page)
  1122                          extra_cnt++;
  1123                  for (j = 0; j < nr && i + j < npages; j++) {
  1124                          folio = page_folio(migrate_pfn_to_page(src_pfns[i+j]));
  1125                          newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j]));
  1126
  1127                          r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
  1128                          if (r != MIGRATEPAGE_SUCCESS)
  1129                                  src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE;
  1130                          else
  1131                                  folio_migrate_flags(newfolio, folio);
  1132                  }
  1133  next:
  1134                  i += nr;
  1135          }
  1136
  1137          if (notified)
  1138                  mmu_notifier_invalidate_range_end(&range);
  1139  }
  1140

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
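
The error above comes from the !CONFIG_ARCH_ENABLE_THP_MIGRATION stub added
by this patch: migrate_vma_split_pages() is declared to return void there,
while __migrate_device_pages() tests its return value. A minimal sketch of a
fix (untested, and the choice of return value is only one option) would be to
give the stub an int return type:

        static int migrate_vma_split_pages(struct migrate_vma *migrate,
                                           unsigned long idx, unsigned long addr,
                                           struct folio *folio)
        {
                /*
                 * Not reached in practice: the caller is guarded by
                 * thp_migration_supported(), which is false without
                 * CONFIG_ARCH_ENABLE_THP_MIGRATION. Return an error so the
                 * entry simply loses MIGRATE_PFN_MIGRATE if it ever is.
                 */
                return -EINVAL;
        }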