[PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI

Posted by Lance Yang 1 month ago
From: Lance Yang <lance.yang@linux.dev>

pmdp_collapse_flush() may already send IPIs to flush TLBs, and then
callers send another IPI via tlb_remove_table_sync_one() or
pmdp_get_lockless_sync() to synchronize with concurrent GUP-fast walkers.

However, since GUP-fast runs with IRQs disabled, the TLB flush IPI already
provides the necessary synchronization. We can avoid the redundant second
IPI.
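
Roughly, the GUP-fast side looks like this (simplified sketch; the real
code in mm/gup.c differs in detail):

	local_irq_save(flags);		/* the flush IPI cannot land here */
	pmdval = pmdp_get_lockless(pmdp);
	/* ... walk the PTE table, take folio references ... */
	if (!pmd_same(pmdval, pmdp_get_lockless(pmdp)))
		goto backoff;		/* PMD changed: drop refs, bail out */
	local_irq_restore(flags);	/* a pending IPI is delivered now */

Once the flush IPI has run on every CPU, no GUP-fast walker can still be
inside such a critical section looking at the old page table.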

Introduce pmdp_collapse_flush_sync() which combines flush and sync:

- For architectures using the generic pmdp_collapse_flush() implementation
  (e.g., x86): Use mmu_gather to track IPI sends. If the TLB flush sent
  an IPI, tlb_gather_remove_table_sync_one() will skip the redundant one.

- For architectures with custom pmdp_collapse_flush() (s390, riscv,
  powerpc): Fall back to calling pmdp_collapse_flush() followed by
  tlb_remove_table_sync_one(). No behavior change.

Update khugepaged to use pmdp_collapse_flush_sync() instead of separate
flush and sync calls. Remove the now-unused pmdp_get_lockless_sync() macro.
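
At the call sites this turns the open-coded pair into a single call,
e.g. in try_collapse_pte_mapped_thp():

	/* before */
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();

	/* after */
	pgt_pmd = pmdp_collapse_flush_sync(vma, haddr, pmd);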

Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Lance Yang <lance.yang@linux.dev>
---
 include/linux/pgtable.h | 13 +++++++++----
 mm/khugepaged.c         |  9 +++------
 mm/pgtable-generic.c    | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index eb8aacba3698..69e290dab450 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -755,7 +755,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
 	return pmd;
 }
 #define pmdp_get_lockless pmdp_get_lockless
-#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
 #endif /* CONFIG_PGTABLE_LEVELS > 2 */
 #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
 
@@ -774,9 +773,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
 {
 	return pmdp_get(pmdp);
 }
-static inline void pmdp_get_lockless_sync(void)
-{
-}
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1174,6 +1170,8 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp);
+extern pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp);
 #else
 static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 					unsigned long address,
@@ -1182,6 +1180,13 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 	BUILD_BUG();
 	return *pmdp;
 }
+static inline pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
+					unsigned long address,
+					pmd_t *pmdp)
+{
+	BUILD_BUG();
+	return *pmdp;
+}
 #define pmdp_collapse_flush pmdp_collapse_flush
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9f790ec34400..0a98afc85c50 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1177,10 +1177,9 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * Parallel GUP-fast is fine since GUP-fast will back off when
 	 * it detects PMD is changed.
 	 */
-	_pmd = pmdp_collapse_flush(vma, address, pmd);
+	_pmd = pmdp_collapse_flush_sync(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
-	tlb_remove_table_sync_one();
 
 	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
 	if (pte) {
@@ -1663,8 +1662,7 @@ static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsign
 			}
 		}
 	}
-	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
-	pmdp_get_lockless_sync();
+	pgt_pmd = pmdp_collapse_flush_sync(vma, haddr, pmd);
 	pte_unmap_unlock(start_pte, ptl);
 	if (ptl != pml)
 		spin_unlock(pml);
@@ -1817,8 +1815,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * races against the prior checks.
 		 */
 		if (likely(file_backed_vma_is_retractable(vma))) {
-			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
-			pmdp_get_lockless_sync();
+			pgt_pmd = pmdp_collapse_flush_sync(vma, addr, pmd);
 			success = true;
 		}
 
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index d3aec7a9926a..be2ee82e6fc4 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -233,6 +233,40 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return pmd;
 }
+
+pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned long address,
+			       pmd_t *pmdp)
+{
+	struct mmu_gather tlb;
+	pmd_t pmd;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+	tlb_gather_mmu(&tlb, vma->vm_mm);
+	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+
+	flush_tlb_mm_range(vma->vm_mm, address, address + HPAGE_PMD_SIZE,
+			   PAGE_SHIFT, true, &tlb);
+
+	/*
+	 * Synchronize with GUP-fast. If the flush sent IPIs, skip the
+	 * redundant sync IPI.
+	 */
+	tlb_gather_remove_table_sync_one(&tlb);
+	tlb_finish_mmu(&tlb);
+	return pmd;
+}
+#else
+pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned long address,
+			       pmd_t *pmdp)
+{
+	pmd_t pmd;
+
+	pmd = pmdp_collapse_flush(vma, address, pmdp);
+	tlb_remove_table_sync_one();
+	return pmd;
+}
 #endif
 
 /* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
-- 
2.49.0
Re: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
Posted by kernel test robot 1 month ago
Hi Lance,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20260107]
[cannot apply to tip/x86/core tip/x86/mm linus/master v6.16-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lance-Yang/mm-tlb-skip-redundant-IPI-when-TLB-flush-already-synchronized/20260106-200505
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260106120303.38124-3-lance.yang%40linux.dev
patch subject: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
config: riscv-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20260107/202601071153.9k8Fm05X-lkp@intel.com/config)
compiler: riscv64-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260107/202601071153.9k8Fm05X-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601071153.9k8Fm05X-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/khugepaged.c: In function 'collapse_huge_page':
   mm/khugepaged.c:1180:16: error: implicit declaration of function 'pmdp_collapse_flush_sync'; did you mean 'pmdp_collapse_flush'? [-Wimplicit-function-declaration]
    1180 |         _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
         |                ^~~~~~~~~~~~~~~~~~~~~~~~
         |                pmdp_collapse_flush
>> mm/khugepaged.c:1180:16: error: incompatible types when assigning to type 'pmd_t' from type 'int'
   mm/khugepaged.c: In function 'try_collapse_pte_mapped_thp':
   mm/khugepaged.c:1665:19: error: incompatible types when assigning to type 'pmd_t' from type 'int'
    1665 |         pgt_pmd = pmdp_collapse_flush_sync(vma, haddr, pmd);
         |                   ^~~~~~~~~~~~~~~~~~~~~~~~
   mm/khugepaged.c: In function 'retract_page_tables':
   mm/khugepaged.c:1818:35: error: incompatible types when assigning to type 'pmd_t' from type 'int'
    1818 |                         pgt_pmd = pmdp_collapse_flush_sync(vma, addr, pmd);
         |                                   ^~~~~~~~~~~~~~~~~~~~~~~~


vim +1180 mm/khugepaged.c

  1092	
  1093	static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
  1094						   int referenced, int unmapped,
  1095						   struct collapse_control *cc)
  1096	{
  1097		LIST_HEAD(compound_pagelist);
  1098		pmd_t *pmd, _pmd;
  1099		pte_t *pte;
  1100		pgtable_t pgtable;
  1101		struct folio *folio;
  1102		spinlock_t *pmd_ptl, *pte_ptl;
  1103		enum scan_result result = SCAN_FAIL;
  1104		struct vm_area_struct *vma;
  1105		struct mmu_notifier_range range;
  1106	
  1107		VM_BUG_ON(address & ~HPAGE_PMD_MASK);
  1108	
  1109		/*
  1110		 * Before allocating the hugepage, release the mmap_lock read lock.
  1111		 * The allocation can take potentially a long time if it involves
  1112		 * sync compaction, and we do not need to hold the mmap_lock during
  1113		 * that. We will recheck the vma after taking it again in write mode.
  1114		 */
  1115		mmap_read_unlock(mm);
  1116	
  1117		result = alloc_charge_folio(&folio, mm, cc);
  1118		if (result != SCAN_SUCCEED)
  1119			goto out_nolock;
  1120	
  1121		mmap_read_lock(mm);
  1122		result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
  1123		if (result != SCAN_SUCCEED) {
  1124			mmap_read_unlock(mm);
  1125			goto out_nolock;
  1126		}
  1127	
  1128		result = find_pmd_or_thp_or_none(mm, address, &pmd);
  1129		if (result != SCAN_SUCCEED) {
  1130			mmap_read_unlock(mm);
  1131			goto out_nolock;
  1132		}
  1133	
  1134		if (unmapped) {
  1135			/*
  1136			 * __collapse_huge_page_swapin will return with mmap_lock
  1137			 * released when it fails. So we jump out_nolock directly in
  1138			 * that case.  Continuing to collapse causes inconsistency.
  1139			 */
  1140			result = __collapse_huge_page_swapin(mm, vma, address, pmd,
  1141							     referenced);
  1142			if (result != SCAN_SUCCEED)
  1143				goto out_nolock;
  1144		}
  1145	
  1146		mmap_read_unlock(mm);
  1147		/*
  1148		 * Prevent all access to pagetables with the exception of
  1149		 * gup_fast later handled by the ptep_clear_flush and the VM
  1150		 * handled by the anon_vma lock + PG_lock.
  1151		 *
  1152		 * UFFDIO_MOVE is prevented to race as well thanks to the
  1153		 * mmap_lock.
  1154		 */
  1155		mmap_write_lock(mm);
  1156		result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
  1157		if (result != SCAN_SUCCEED)
  1158			goto out_up_write;
  1159		/* check if the pmd is still valid */
  1160		vma_start_write(vma);
  1161		result = check_pmd_still_valid(mm, address, pmd);
  1162		if (result != SCAN_SUCCEED)
  1163			goto out_up_write;
  1164	
  1165		anon_vma_lock_write(vma->anon_vma);
  1166	
  1167		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
  1168					address + HPAGE_PMD_SIZE);
  1169		mmu_notifier_invalidate_range_start(&range);
  1170	
  1171		pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
  1172		/*
  1173		 * This removes any huge TLB entry from the CPU so we won't allow
  1174		 * huge and small TLB entries for the same virtual address to
  1175		 * avoid the risk of CPU bugs in that area.
  1176		 *
  1177		 * Parallel GUP-fast is fine since GUP-fast will back off when
  1178		 * it detects PMD is changed.
  1179		 */
> 1180		_pmd = pmdp_collapse_flush_sync(vma, address, pmd);
  1181		spin_unlock(pmd_ptl);
  1182		mmu_notifier_invalidate_range_end(&range);
  1183	
  1184		pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
  1185		if (pte) {
  1186			result = __collapse_huge_page_isolate(vma, address, pte, cc,
  1187							      &compound_pagelist);
  1188			spin_unlock(pte_ptl);
  1189		} else {
  1190			result = SCAN_NO_PTE_TABLE;
  1191		}
  1192	
  1193		if (unlikely(result != SCAN_SUCCEED)) {
  1194			if (pte)
  1195				pte_unmap(pte);
  1196			spin_lock(pmd_ptl);
  1197			BUG_ON(!pmd_none(*pmd));
  1198			/*
  1199			 * We can only use set_pmd_at when establishing
  1200			 * hugepmds and never for establishing regular pmds that
  1201			 * points to regular pagetables. Use pmd_populate for that
  1202			 */
  1203			pmd_populate(mm, pmd, pmd_pgtable(_pmd));
  1204			spin_unlock(pmd_ptl);
  1205			anon_vma_unlock_write(vma->anon_vma);
  1206			goto out_up_write;
  1207		}
  1208	
  1209		/*
  1210		 * All pages are isolated and locked so anon_vma rmap
  1211		 * can't run anymore.
  1212		 */
  1213		anon_vma_unlock_write(vma->anon_vma);
  1214	
  1215		result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
  1216						   vma, address, pte_ptl,
  1217						   &compound_pagelist);
  1218		pte_unmap(pte);
  1219		if (unlikely(result != SCAN_SUCCEED))
  1220			goto out_up_write;
  1221	
  1222		/*
  1223		 * The smp_wmb() inside __folio_mark_uptodate() ensures the
  1224		 * copy_huge_page writes become visible before the set_pmd_at()
  1225		 * write.
  1226		 */
  1227		__folio_mark_uptodate(folio);
  1228		pgtable = pmd_pgtable(_pmd);
  1229	
  1230		spin_lock(pmd_ptl);
  1231		BUG_ON(!pmd_none(*pmd));
  1232		pgtable_trans_huge_deposit(mm, pmd, pgtable);
  1233		map_anon_folio_pmd_nopf(folio, pmd, vma, address);
  1234		spin_unlock(pmd_ptl);
  1235	
  1236		folio = NULL;
  1237	
  1238		result = SCAN_SUCCEED;
  1239	out_up_write:
  1240		mmap_write_unlock(mm);
  1241	out_nolock:
  1242		if (folio)
  1243			folio_put(folio);
  1244		trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
  1245		return result;
  1246	}
  1247	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
Posted by kernel test robot 1 month ago
Hi Lance,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20260107]
[cannot apply to tip/x86/core tip/x86/mm arnd-asm-generic/master linus/master v6.19-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lance-Yang/mm-tlb-skip-redundant-IPI-when-TLB-flush-already-synchronized/20260106-200505
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260106120303.38124-3-lance.yang%40linux.dev
patch subject: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
config: s390-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20260107/202601071005.oEsmtf0J-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260107/202601071005.oEsmtf0J-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601071005.oEsmtf0J-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/khugepaged.c: In function 'collapse_huge_page':
>> mm/khugepaged.c:1180:16: error: implicit declaration of function 'pmdp_collapse_flush_sync'; did you mean 'pmdp_collapse_flush'? [-Wimplicit-function-declaration]
    1180 |         _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
         |                ^~~~~~~~~~~~~~~~~~~~~~~~
         |                pmdp_collapse_flush


vim +1180 mm/khugepaged.c

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
Posted by David Hildenbrand (Red Hat) 1 month ago
On 1/6/26 13:03, Lance Yang wrote:
> From: Lance Yang <lance.yang@linux.dev>
> 
> pmdp_collapse_flush() may already send IPIs to flush TLBs, and then
> callers send another IPI via tlb_remove_table_sync_one() or
> pmdp_get_lockless_sync() to synchronize with concurrent GUP-fast walkers.
> 
> However, since GUP-fast runs with IRQs disabled, the TLB flush IPI already
> provides the necessary synchronization. We can avoid the redundant second
> IPI.
> 
> Introduce pmdp_collapse_flush_sync() which combines flush and sync:
> 
> - For architectures using the generic pmdp_collapse_flush() implementation
>    (e.g., x86): Use mmu_gather to track IPI sends. If the TLB flush sent
>    an IPI, tlb_gather_remove_table_sync_one() will skip the redundant one.
> 
> - For architectures with custom pmdp_collapse_flush() (s390, riscv,
>    powerpc): Fall back to calling pmdp_collapse_flush() followed by
>    tlb_remove_table_sync_one(). No behavior change.
> 
> Update khugepaged to use pmdp_collapse_flush_sync() instead of separate
> flush and sync calls. Remove the now-unused pmdp_get_lockless_sync() macro.
> 
> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
> Signed-off-by: Lance Yang <lance.yang@linux.dev>
> ---
>   include/linux/pgtable.h | 13 +++++++++----
>   mm/khugepaged.c         |  9 +++------
>   mm/pgtable-generic.c    | 34 ++++++++++++++++++++++++++++++++++
>   3 files changed, 46 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index eb8aacba3698..69e290dab450 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -755,7 +755,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
>   	return pmd;
>   }
>   #define pmdp_get_lockless pmdp_get_lockless
> -#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
>   #endif /* CONFIG_PGTABLE_LEVELS > 2 */
>   #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
>   
> @@ -774,9 +773,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
>   {
>   	return pmdp_get(pmdp);
>   }
> -static inline void pmdp_get_lockless_sync(void)
> -{
> -}
>   #endif
>   
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> @@ -1174,6 +1170,8 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>   extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>   				 unsigned long address, pmd_t *pmdp);
> +extern pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
> +				 unsigned long address, pmd_t *pmdp);
>   #else
>   static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>   					unsigned long address,
> @@ -1182,6 +1180,13 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>   	BUILD_BUG();
>   	return *pmdp;
>   }
> +static inline pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
> +					unsigned long address,
> +					pmd_t *pmdp)
> +{
> +	BUILD_BUG();
> +	return *pmdp;
> +}
>   #define pmdp_collapse_flush pmdp_collapse_flush
>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>   #endif
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 9f790ec34400..0a98afc85c50 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1177,10 +1177,9 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>   	 * Parallel GUP-fast is fine since GUP-fast will back off when
>   	 * it detects PMD is changed.
>   	 */
> -	_pmd = pmdp_collapse_flush(vma, address, pmd);
> +	_pmd = pmdp_collapse_flush_sync(vma, address, pmd);
>   	spin_unlock(pmd_ptl);
>   	mmu_notifier_invalidate_range_end(&range);
> -	tlb_remove_table_sync_one();

Now you issue the IPI under PTL.

[...]

> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index d3aec7a9926a..be2ee82e6fc4 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -233,6 +233,40 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
>   	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>   	return pmd;
>   }
> +
> +pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned long address,
> +			       pmd_t *pmdp)
> +{
> +	struct mmu_gather tlb;
> +	pmd_t pmd;
> +
> +	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> +	VM_BUG_ON(pmd_trans_huge(*pmdp));
> +
> +	tlb_gather_mmu(&tlb, vma->vm_mm);

Should we be using the new tlb_gather_mmu_vma(), and do we have to set 
the TLB pagesize to PMD?

-- 
Cheers

David
Re: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
Posted by Lance Yang 1 month ago

On 2026/1/6 23:07, David Hildenbrand (Red Hat) wrote:
> On 1/6/26 13:03, Lance Yang wrote:
>> From: Lance Yang <lance.yang@linux.dev>
>>
>> pmdp_collapse_flush() may already send IPIs to flush TLBs, and then
>> callers send another IPI via tlb_remove_table_sync_one() or
>> pmdp_get_lockless_sync() to synchronize with concurrent GUP-fast walkers.
>>
>> However, since GUP-fast runs with IRQs disabled, the TLB flush IPI 
>> already
>> provides the necessary synchronization. We can avoid the redundant second
>> IPI.
>>
>> Introduce pmdp_collapse_flush_sync() which combines flush and sync:
>>
>> - For architectures using the generic pmdp_collapse_flush() 
>> implementation
>>    (e.g., x86): Use mmu_gather to track IPI sends. If the TLB flush sent
>>    an IPI, tlb_gather_remove_table_sync_one() will skip the redundant 
>> one.
>>
>> - For architectures with custom pmdp_collapse_flush() (s390, riscv,
>>    powerpc): Fall back to calling pmdp_collapse_flush() followed by
>>    tlb_remove_table_sync_one(). No behavior change.
>>
>> Update khugepaged to use pmdp_collapse_flush_sync() instead of separate
>> flush and sync calls. Remove the now-unused pmdp_get_lockless_sync() 
>> macro.
>>
>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>> Signed-off-by: Lance Yang <lance.yang@linux.dev>
>> ---
>>   include/linux/pgtable.h | 13 +++++++++----
>>   mm/khugepaged.c         |  9 +++------
>>   mm/pgtable-generic.c    | 34 ++++++++++++++++++++++++++++++++++
>>   3 files changed, 46 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index eb8aacba3698..69e290dab450 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -755,7 +755,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
>>       return pmd;
>>   }
>>   #define pmdp_get_lockless pmdp_get_lockless
>> -#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
>>   #endif /* CONFIG_PGTABLE_LEVELS > 2 */
>>   #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
>> @@ -774,9 +773,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
>>   {
>>       return pmdp_get(pmdp);
>>   }
>> -static inline void pmdp_get_lockless_sync(void)
>> -{
>> -}
>>   #endif
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> @@ -1174,6 +1170,8 @@ static inline void pudp_set_wrprotect(struct 
>> mm_struct *mm,
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>   extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>>                    unsigned long address, pmd_t *pmdp);
>> +extern pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
>> +                 unsigned long address, pmd_t *pmdp);
>>   #else
>>   static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>>                       unsigned long address,
>> @@ -1182,6 +1180,13 @@ static inline pmd_t pmdp_collapse_flush(struct 
>> vm_area_struct *vma,
>>       BUILD_BUG();
>>       return *pmdp;
>>   }
>> +static inline pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
>> +                    unsigned long address,
>> +                    pmd_t *pmdp)
>> +{
>> +    BUILD_BUG();
>> +    return *pmdp;
>> +}
>>   #define pmdp_collapse_flush pmdp_collapse_flush
>>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>   #endif
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 9f790ec34400..0a98afc85c50 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1177,10 +1177,9 @@ static enum scan_result 
>> collapse_huge_page(struct mm_struct *mm, unsigned long a
>>        * Parallel GUP-fast is fine since GUP-fast will back off when
>>        * it detects PMD is changed.
>>        */
>> -    _pmd = pmdp_collapse_flush(vma, address, pmd);
>> +    _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
>>       spin_unlock(pmd_ptl);
>>       mmu_notifier_invalidate_range_end(&range);
>> -    tlb_remove_table_sync_one();
> 
> Now you issue the IPI under PTL.

We already send the TLB flush IPI under the PTL today, e.g. in
try_collapse_pte_mapped_thp():

	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	pmdp_get_lockless_sync();
	pte_unmap_unlock(start_pte, ptl);

But anyway, we can do better by passing the ptl in and unlocking it
before the sync IPI ;)
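
Something like this at the call site (strawman only, untested):

	pgt_pmd = pmdp_collapse_flush_sync(vma, haddr, pmd, ptl);
	/* helper: flush under ptl, spin_unlock(ptl), then sync IPI */
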
> 
> [...]
> 
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index d3aec7a9926a..be2ee82e6fc4 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -233,6 +233,40 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct 
>> *vma, unsigned long address,
>>       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>       return pmd;
>>   }
>> +
>> +pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned 
>> long address,
>> +                   pmd_t *pmdp)
>> +{
>> +    struct mmu_gather tlb;
>> +    pmd_t pmd;
>> +
>> +    VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> +    VM_BUG_ON(pmd_trans_huge(*pmdp));
>> +
>> +    tlb_gather_mmu(&tlb, vma->vm_mm);
> 
> Should we be using the new tlb_gather_mmu_vma(), and do we have to set 
> the TLB pagesize to PMD?

Yes, good point on tlb_gather_mmu_vma()!

So, the sequence will be:

	tlb_gather_mmu_vma(&tlb, vma);
	pmd = pmdp_huge_get_and_clear(...);
	flush_tlb_mm_range(..., &tlb);
	if (ptl)
		spin_unlock(ptl);
	tlb_gather_remove_table_sync_one(&tlb);
	tlb_finish_mmu(&tlb);

Thanks,
Lance