From: Lance Yang <lance.yang@linux.dev>
pmdp_collapse_flush() may already send IPIs to flush TLBs, and then
callers send another IPI via tlb_remove_table_sync_one() or
pmdp_get_lockless_sync() to synchronize with concurrent GUP-fast walkers.

However, since GUP-fast runs with IRQs disabled, the TLB flush IPI already
provides the necessary synchronization. We can avoid the redundant second
IPI.

Introduce pmdp_collapse_flush_sync() which combines flush and sync:

- For architectures using the generic pmdp_collapse_flush() implementation
  (e.g., x86): Use mmu_gather to track IPI sends. If the TLB flush sent
  an IPI, tlb_gather_remove_table_sync_one() will skip the redundant one.

- For architectures with custom pmdp_collapse_flush() (s390, riscv,
  powerpc): Fall back to calling pmdp_collapse_flush() followed by
  tlb_remove_table_sync_one(). No behavior change.

Update khugepaged to use pmdp_collapse_flush_sync() instead of separate
flush and sync calls. Remove the now-unused pmdp_get_lockless_sync() macro.

Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Lance Yang <lance.yang@linux.dev>
---
include/linux/pgtable.h | 13 +++++++++----
mm/khugepaged.c | 9 +++------
mm/pgtable-generic.c | 34 ++++++++++++++++++++++++++++++++++
3 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index eb8aacba3698..69e290dab450 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -755,7 +755,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
-#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
@@ -774,9 +773,6 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
return pmdp_get(pmdp);
}
-static inline void pmdp_get_lockless_sync(void)
-{
-}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1174,6 +1170,8 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
+extern pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
#else
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address,
@@ -1182,6 +1180,13 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
BUILD_BUG();
return *pmdp;
}
+static inline pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ BUILD_BUG();
+ return *pmdp;
+}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9f790ec34400..0a98afc85c50 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1177,10 +1177,9 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
- tlb_remove_table_sync_one();
pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
if (pte) {
@@ -1663,8 +1662,7 @@ static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsign
}
}
}
- pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
- pmdp_get_lockless_sync();
+ pgt_pmd = pmdp_collapse_flush_sync(vma, haddr, pmd);
pte_unmap_unlock(start_pte, ptl);
if (ptl != pml)
spin_unlock(pml);
@@ -1817,8 +1815,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* races against the prior checks.
*/
if (likely(file_backed_vma_is_retractable(vma))) {
- pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
- pmdp_get_lockless_sync();
+ pgt_pmd = pmdp_collapse_flush_sync(vma, addr, pmd);
success = true;
}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index d3aec7a9926a..be2ee82e6fc4 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -233,6 +233,40 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
+
+pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ struct mmu_gather tlb;
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+
+ flush_tlb_mm_range(vma->vm_mm, address, address + HPAGE_PMD_SIZE,
+ PAGE_SHIFT, true, &tlb);
+
+ /*
+ * Synchronize with GUP-fast. If the flush sent IPIs, skip the
+ * redundant sync IPI.
+ */
+ tlb_gather_remove_table_sync_one(&tlb);
+ tlb_finish_mmu(&tlb);
+ return pmd;
+}
+#else
+pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ pmd = pmdp_collapse_flush(vma, address, pmdp);
+ tlb_remove_table_sync_one();
+ return pmd;
+}
#endif
/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
--
2.49.0
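
For readers following along, here is a minimal, self-contained sketch of the
idea behind the generic path described in the commit message above: the
mmu_gather remembers whether the TLB flush already interrupted every other
CPU, and the sync step only issues its own IPI when it did not. Every name
below (gather_sketch, flush_sent_ipi, send_sync_ipi, sync_one_sketch) is
invented for illustration; the real interface is tlb_gather_remove_table_sync_one(),
presumably introduced by patch 1/2 of this series.

/*
 * Illustrative sketch only -- not the kernel's mmu_gather. All names here
 * are hypothetical stand-ins for the interfaces added earlier in the series.
 */
#include <stdbool.h>

struct gather_sketch {
	bool flush_sent_ipi;	/* set if the TLB flush IPI'd every other CPU */
};

/* Stand-in for the explicit GUP-fast synchronization IPI. */
static void send_sync_ipi(void)
{
	/* broadcast a no-op IPI and wait for every CPU to acknowledge it */
}

/*
 * GUP-fast walks page tables with IRQs disabled, so once an IPI has been
 * delivered to and acknowledged by every other CPU, any concurrent GUP-fast
 * walker has either finished or will re-read the now-cleared PMD.
 */
static void sync_one_sketch(struct gather_sketch *tlb)
{
	if (tlb->flush_sent_ipi)
		return;		/* the flush already acted as the barrier */
	send_sync_ipi();	/* otherwise pay for exactly one extra IPI */
}
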
Hi Lance,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20260107]
[cannot apply to tip/x86/core tip/x86/mm linus/master v6.16-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Lance-Yang/mm-tlb-skip-redundant-IPI-when-TLB-flush-already-synchronized/20260106-200505
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20260106120303.38124-3-lance.yang%40linux.dev
patch subject: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
config: riscv-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20260107/202601071153.9k8Fm05X-lkp@intel.com/config)
compiler: riscv64-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260107/202601071153.9k8Fm05X-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601071153.9k8Fm05X-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/khugepaged.c: In function 'collapse_huge_page':
mm/khugepaged.c:1180:16: error: implicit declaration of function 'pmdp_collapse_flush_sync'; did you mean 'pmdp_collapse_flush'? [-Wimplicit-function-declaration]
1180 | _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
| ^~~~~~~~~~~~~~~~~~~~~~~~
| pmdp_collapse_flush
>> mm/khugepaged.c:1180:16: error: incompatible types when assigning to type 'pmd_t' from type 'int'
mm/khugepaged.c: In function 'try_collapse_pte_mapped_thp':
mm/khugepaged.c:1665:19: error: incompatible types when assigning to type 'pmd_t' from type 'int'
1665 | pgt_pmd = pmdp_collapse_flush_sync(vma, haddr, pmd);
| ^~~~~~~~~~~~~~~~~~~~~~~~
mm/khugepaged.c: In function 'retract_page_tables':
mm/khugepaged.c:1818:35: error: incompatible types when assigning to type 'pmd_t' from type 'int'
1818 | pgt_pmd = pmdp_collapse_flush_sync(vma, addr, pmd);
| ^~~~~~~~~~~~~~~~~~~~~~~~
vim +1180 mm/khugepaged.c
1092
1093 static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
1094 int referenced, int unmapped,
1095 struct collapse_control *cc)
1096 {
1097 LIST_HEAD(compound_pagelist);
1098 pmd_t *pmd, _pmd;
1099 pte_t *pte;
1100 pgtable_t pgtable;
1101 struct folio *folio;
1102 spinlock_t *pmd_ptl, *pte_ptl;
1103 enum scan_result result = SCAN_FAIL;
1104 struct vm_area_struct *vma;
1105 struct mmu_notifier_range range;
1106
1107 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1108
1109 /*
1110 * Before allocating the hugepage, release the mmap_lock read lock.
1111 * The allocation can take potentially a long time if it involves
1112 * sync compaction, and we do not need to hold the mmap_lock during
1113 * that. We will recheck the vma after taking it again in write mode.
1114 */
1115 mmap_read_unlock(mm);
1116
1117 result = alloc_charge_folio(&folio, mm, cc);
1118 if (result != SCAN_SUCCEED)
1119 goto out_nolock;
1120
1121 mmap_read_lock(mm);
1122 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
1123 if (result != SCAN_SUCCEED) {
1124 mmap_read_unlock(mm);
1125 goto out_nolock;
1126 }
1127
1128 result = find_pmd_or_thp_or_none(mm, address, &pmd);
1129 if (result != SCAN_SUCCEED) {
1130 mmap_read_unlock(mm);
1131 goto out_nolock;
1132 }
1133
1134 if (unmapped) {
1135 /*
1136 * __collapse_huge_page_swapin will return with mmap_lock
1137 * released when it fails. So we jump out_nolock directly in
1138 * that case. Continuing to collapse causes inconsistency.
1139 */
1140 result = __collapse_huge_page_swapin(mm, vma, address, pmd,
1141 referenced);
1142 if (result != SCAN_SUCCEED)
1143 goto out_nolock;
1144 }
1145
1146 mmap_read_unlock(mm);
1147 /*
1148 * Prevent all access to pagetables with the exception of
1149 * gup_fast later handled by the ptep_clear_flush and the VM
1150 * handled by the anon_vma lock + PG_lock.
1151 *
1152 * UFFDIO_MOVE is prevented to race as well thanks to the
1153 * mmap_lock.
1154 */
1155 mmap_write_lock(mm);
1156 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
1157 if (result != SCAN_SUCCEED)
1158 goto out_up_write;
1159 /* check if the pmd is still valid */
1160 vma_start_write(vma);
1161 result = check_pmd_still_valid(mm, address, pmd);
1162 if (result != SCAN_SUCCEED)
1163 goto out_up_write;
1164
1165 anon_vma_lock_write(vma->anon_vma);
1166
1167 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
1168 address + HPAGE_PMD_SIZE);
1169 mmu_notifier_invalidate_range_start(&range);
1170
1171 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1172 /*
1173 * This removes any huge TLB entry from the CPU so we won't allow
1174 * huge and small TLB entries for the same virtual address to
1175 * avoid the risk of CPU bugs in that area.
1176 *
1177 * Parallel GUP-fast is fine since GUP-fast will back off when
1178 * it detects PMD is changed.
1179 */
> 1180 _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
1181 spin_unlock(pmd_ptl);
1182 mmu_notifier_invalidate_range_end(&range);
1183
1184 pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
1185 if (pte) {
1186 result = __collapse_huge_page_isolate(vma, address, pte, cc,
1187 &compound_pagelist);
1188 spin_unlock(pte_ptl);
1189 } else {
1190 result = SCAN_NO_PTE_TABLE;
1191 }
1192
1193 if (unlikely(result != SCAN_SUCCEED)) {
1194 if (pte)
1195 pte_unmap(pte);
1196 spin_lock(pmd_ptl);
1197 BUG_ON(!pmd_none(*pmd));
1198 /*
1199 * We can only use set_pmd_at when establishing
1200 * hugepmds and never for establishing regular pmds that
1201 * points to regular pagetables. Use pmd_populate for that
1202 */
1203 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1204 spin_unlock(pmd_ptl);
1205 anon_vma_unlock_write(vma->anon_vma);
1206 goto out_up_write;
1207 }
1208
1209 /*
1210 * All pages are isolated and locked so anon_vma rmap
1211 * can't run anymore.
1212 */
1213 anon_vma_unlock_write(vma->anon_vma);
1214
1215 result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
1216 vma, address, pte_ptl,
1217 &compound_pagelist);
1218 pte_unmap(pte);
1219 if (unlikely(result != SCAN_SUCCEED))
1220 goto out_up_write;
1221
1222 /*
1223 * The smp_wmb() inside __folio_mark_uptodate() ensures the
1224 * copy_huge_page writes become visible before the set_pmd_at()
1225 * write.
1226 */
1227 __folio_mark_uptodate(folio);
1228 pgtable = pmd_pgtable(_pmd);
1229
1230 spin_lock(pmd_ptl);
1231 BUG_ON(!pmd_none(*pmd));
1232 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1233 map_anon_folio_pmd_nopf(folio, pmd, vma, address);
1234 spin_unlock(pmd_ptl);
1235
1236 folio = NULL;
1237
1238 result = SCAN_SUCCEED;
1239 out_up_write:
1240 mmap_write_unlock(mm);
1241 out_nolock:
1242 if (folio)
1243 folio_put(folio);
1244 trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
1245 return result;
1246 }
1247
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Lance,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20260107]
[cannot apply to tip/x86/core tip/x86/mm arnd-asm-generic/master linus/master v6.19-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Lance-Yang/mm-tlb-skip-redundant-IPI-when-TLB-flush-already-synchronized/20260106-200505
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20260106120303.38124-3-lance.yang%40linux.dev
patch subject: [PATCH RESEND v3 2/2] mm: introduce pmdp_collapse_flush_sync() to skip redundant IPI
config: s390-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20260107/202601071005.oEsmtf0J-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260107/202601071005.oEsmtf0J-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601071005.oEsmtf0J-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/khugepaged.c: In function 'collapse_huge_page':
>> mm/khugepaged.c:1180:16: error: implicit declaration of function 'pmdp_collapse_flush_sync'; did you mean 'pmdp_collapse_flush'? [-Wimplicit-function-declaration]
1180 | _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
| ^~~~~~~~~~~~~~~~~~~~~~~~
| pmdp_collapse_flush
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
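
Both robot reports appear to share one root cause: s390 and riscv provide
their own pmdp_collapse_flush(), so the generic declaration block in
include/linux/pgtable.h that this patch extends (it sits under the
arch-override guard, __HAVE_ARCH_PMDP_COLLAPSE_FLUSH at the time of writing)
is skipped entirely on those architectures, and the new
pmdp_collapse_flush_sync() prototype is never visible to mm/khugepaged.c.
One possible direction, sketched under that assumption about the guard
layout, is to declare the new helper outside the override guard, since
mm/pgtable-generic.c defines it for both the generic and the fallback case:

/* Sketch only: make the prototype visible on all architectures. */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
				      unsigned long address, pmd_t *pmdp);
#else
static inline pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma,
					     unsigned long address, pmd_t *pmdp)
{
	BUILD_BUG();
	return *pmdp;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
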
On 1/6/26 13:03, Lance Yang wrote:
> From: Lance Yang <lance.yang@linux.dev>
>
> [...]
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 9f790ec34400..0a98afc85c50 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1177,10 +1177,9 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * Parallel GUP-fast is fine since GUP-fast will back off when
> * it detects PMD is changed.
> */
> - _pmd = pmdp_collapse_flush(vma, address, pmd);
> + _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
> spin_unlock(pmd_ptl);
> mmu_notifier_invalidate_range_end(&range);
> - tlb_remove_table_sync_one();
Now you issue the IPI under PTL.
[...]
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index d3aec7a9926a..be2ee82e6fc4 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -233,6 +233,40 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
> flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> return pmd;
> }
> +
> +pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned long address,
> + pmd_t *pmdp)
> +{
> + struct mmu_gather tlb;
> + pmd_t pmd;
> +
> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> + VM_BUG_ON(pmd_trans_huge(*pmdp));
> +
> + tlb_gather_mmu(&tlb, vma->vm_mm);
Should we be using the new tlb_gather_mmu_vma(), and do we have to set
the TLB pagesize to PMD?
--
Cheers
David
On 2026/1/6 23:07, David Hildenbrand (Red Hat) wrote:
> On 1/6/26 13:03, Lance Yang wrote:
>> From: Lance Yang <lance.yang@linux.dev>
>>
>> [...]
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 9f790ec34400..0a98afc85c50 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1177,10 +1177,9 @@ static enum scan_result
>> collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * Parallel GUP-fast is fine since GUP-fast will back off when
>> * it detects PMD is changed.
>> */
>> - _pmd = pmdp_collapse_flush(vma, address, pmd);
>> + _pmd = pmdp_collapse_flush_sync(vma, address, pmd);
>> spin_unlock(pmd_ptl);
>> mmu_notifier_invalidate_range_end(&range);
>> - tlb_remove_table_sync_one();
>
> Now you issue the IPI under PTL.
We already send the TLB flush IPI under the PTL before this patch, e.g. in
try_collapse_pte_mapped_thp():
pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
pmdp_get_lockless_sync();
pte_unmap_unlock(start_pte, ptl);
But anyway, we can do better by passing ptl in and unlocking
before the sync IPI ;)
>
> [...]
>
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index d3aec7a9926a..be2ee82e6fc4 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -233,6 +233,40 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct
>> *vma, unsigned long address,
>> flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>> return pmd;
>> }
>> +
>> +pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned
>> long address,
>> + pmd_t *pmdp)
>> +{
>> + struct mmu_gather tlb;
>> + pmd_t pmd;
>> +
>> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> + VM_BUG_ON(pmd_trans_huge(*pmdp));
>> +
>> + tlb_gather_mmu(&tlb, vma->vm_mm);
>
> Should we be using the new tlb_gather_mmu_vma(), and do we have to set
> the TLB pagesize to PMD?
Yes, good point on tlb_gather_mmu_vma()!
So, the sequence will be:

	tlb_gather_mmu_vma(&tlb, vma);
	pmd = pmdp_huge_get_and_clear(...);
	flush_tlb_mm_range(..., &tlb);
	if (ptl)
		spin_unlock(ptl);
	tlb_gather_remove_table_sync_one(&tlb);
	tlb_finish_mmu(&tlb);

Thanks,
Lance
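
Putting the two review points and the sequence above together, a rough sketch
of the revised helper could look like the following. The ptl parameter is new
here, and the tlb_gather_mmu_vma(), six-argument flush_tlb_mm_range() and
tlb_gather_remove_table_sync_one() interfaces are taken from this thread and
the earlier patch in the series, so treat the exact signatures as assumptions
rather than final code:

/* Sketch of the discussed direction, not a final implementation. */
pmd_t pmdp_collapse_flush_sync(struct vm_area_struct *vma, unsigned long address,
			       pmd_t *pmdp, spinlock_t *ptl)
{
	struct mmu_gather tlb;
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	tlb_gather_mmu_vma(&tlb, vma);		/* per the review feedback */
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
	flush_tlb_mm_range(vma->vm_mm, address, address + HPAGE_PMD_SIZE,
			   PAGE_SHIFT, true, &tlb);

	/* Drop the page table lock before any potential sync IPI. */
	if (ptl)
		spin_unlock(ptl);

	/* Only sends an IPI if the flush above did not already send one. */
	tlb_gather_remove_table_sync_one(&tlb);
	tlb_finish_mmu(&tlb);
	return pmd;
}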