We still mention compound_mapcount() in two comments.
Instead of simply referring to the folio mapcount in both places, let's
factor out the odd-looking PTL sync into sync_with_folio_pmd_zap() and
add centralized documentation explaining why it is required.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
---
mm/internal.h | 19 +++++++++++++++++++
mm/memory.c | 8 +-------
mm/page_vma_mapped.c | 11 ++---------
3 files changed, 22 insertions(+), 16 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index cb0af847d7d9..e0ef192b0be3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -516,6 +516,25 @@ void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);

+/**
+ * sync_with_folio_pmd_zap - sync with concurrent zapping of a folio PMD
+ * @mm: The mm_struct.
+ * @pmdp: Pointer to the pmd that was found to be pmd_none().
+ *
+ * When we stumble over a pmd_none() without holding the PTL while unmapping a
+ * folio that could have been mapped at that PMD, it could be that concurrent
+ * zapping of the PMD is not complete yet. While the PMD might be pmd_none()
+ * already, the folio might still appear to be mapped (folio_mapped()).
+ *
+ * Wait for concurrent zapping to complete by grabbing the PTL.
+ */
+static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp)
+{
+ spinlock_t *ptl = pmd_lock(mm, pmdp);
+
+ spin_unlock(ptl);
+}
+
struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 876bf73959c6..c87d796050ba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2006,13 +2006,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
} else if (details && details->single_folio &&
folio_test_pmd_mappable(details->single_folio) &&
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
- spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
- /*
- * Take and drop THP pmd lock so that we cannot return
- * prematurely, while zap_huge_pmd() has cleared *pmd,
- * but not yet decremented compound_mapcount().
- */
- spin_unlock(ptl);
+ sync_with_folio_pmd_zap(tlb->mm, pmd);
}
if (pmd_none(*pmd)) {
addr = next;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index b38a1d00c971..a4d52fdb3056 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -269,11 +269,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
} else if (!pmd_present(pmde)) {
- /*
- * If PVMW_SYNC, take and drop THP pmd lock so that we
- * cannot return prematurely, while zap_huge_pmd() has
- * cleared *pmd but not decremented compound_mapcount().
- */
const softleaf_t entry = softleaf_from_pmd(pmde);

if (softleaf_is_device_private(entry)) {
@@ -284,11 +279,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if ((pvmw->flags & PVMW_SYNC) &&
thp_vma_suitable_order(vma, pvmw->address,
PMD_ORDER) &&
- (pvmw->nr_pages >= HPAGE_PMD_NR)) {
- spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+ (pvmw->nr_pages >= HPAGE_PMD_NR))
+ sync_with_folio_pmd_zap(mm, pvmw->pmd);

- spin_unlock(ptl);
- }
step_forward(pvmw, PMD_SIZE);
continue;
}
--
2.43.0
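
For readers less familiar with this pattern, the interleaving that
sync_with_folio_pmd_zap() guards against can be modeled in userspace.
The sketch below is purely illustrative (a pthread mutex stands in for
the PTL, and pmd/mapcount are plain atomics rather than kernel state);
none of the names are kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER; /* models the PMD lock */
static atomic_long pmd = 1;      /* non-zero: folio still mapped at this PMD */
static atomic_long mapcount = 1; /* models the folio mapcount */

/* Models zap_huge_pmd(): clears the PMD and updates the mapcount under the PTL. */
static void *zapper(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&ptl);
	atomic_store(&pmd, 0);
	/*
	 * A lockless reader can observe pmd == 0 right here, while the
	 * folio still looks mapped (mapcount > 0).
	 */
	atomic_fetch_sub(&mapcount, 1);
	pthread_mutex_unlock(&ptl);
	return NULL;
}

/* Models the unmap path: sees the PMD cleared without holding the PTL. */
static void *walker(void *arg)
{
	(void)arg;
	while (atomic_load(&pmd) != 0)
		; /* spin until the PMD looks pmd_none() */
	/*
	 * Returning here could misread mapcount as 1. Taking and dropping
	 * the lock waits for the zapper's critical section to complete,
	 * which is what sync_with_folio_pmd_zap() does with the PTL.
	 */
	pthread_mutex_lock(&ptl);
	pthread_mutex_unlock(&ptl);
	printf("mapcount after sync: %ld\n", atomic_load(&mapcount)); /* always 0 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, zapper, NULL);
	pthread_create(&b, NULL, walker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}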
On Mon, Feb 23, 2026 at 05:39:20PM +0100, David Hildenbrand (Arm) wrote:
> void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
>
> +/**
> + * sync_with_folio_pmd_zap - sync with concurrent zapping of a folio PMD
> + * @mm: The mm_struct.
> + * @pmdp: Pointer to the pmd that was found to be pmd_none().
> + *
> + * When we stumble over a pmd_none() without holding the PTL while unmapping a
> + * folio that could have been mapped at that PMD, it could be that concurrent
> + * zapping of the PMD is not complete yet. While the PMD might be pmd_none()
> + * already, the folio might still appear to be mapped (folio_mapped()).
> + *
> + * Wait for concurrent zapping to complete by grabbing the PTL.
> + */
I like this. The one thing we've lost is the name of the function which
does the zapping, which I think was a helpful detail. Perhaps not to
someone who's deep in "how page tables work", but I wouldn't know where
to look for the counterpart to this. So how about:
Option A:
+ * When we stumble over a pmd_none() without holding the PTL while
+ * unmapping a folio that could have been mapped at that PMD,
+ * zap_huge_pmd() may not be complete yet. While the PMD might be pmd_none()
+ * already, the folio might still appear to be mapped (folio_mapped()).
Option B:
+ * When we find a pmd_none() while unmapping a folio without holding
+ * the PTL, zap_huge_pmd() may have cleared the PMD but not yet
+ * modified the folio to indicate that it's unmapped.
(for both options, I'm just changing that one paragraph; the paragraph
starting "Wait", I would leave unchanged)
> +static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp)
> +{
> + spinlock_t *ptl = pmd_lock(mm, pmdp);
> +
> + spin_unlock(ptl);
> +}
> +
> struct zap_details;
> void unmap_page_range(struct mmu_gather *tlb,
> struct vm_area_struct *vma,
> diff --git a/mm/memory.c b/mm/memory.c
> index 876bf73959c6..c87d796050ba 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2006,13 +2006,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
> } else if (details && details->single_folio &&
> folio_test_pmd_mappable(details->single_folio) &&
> next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
> - spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
> - /*
> - * Take and drop THP pmd lock so that we cannot return
> - * prematurely, while zap_huge_pmd() has cleared *pmd,
> - * but not yet decremented compound_mapcount().
> - */
> - spin_unlock(ptl);
> + sync_with_folio_pmd_zap(tlb->mm, pmd);
> }
> if (pmd_none(*pmd)) {
> addr = next;
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index b38a1d00c971..a4d52fdb3056 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -269,11 +269,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> spin_unlock(pvmw->ptl);
> pvmw->ptl = NULL;
> } else if (!pmd_present(pmde)) {
> - /*
> - * If PVMW_SYNC, take and drop THP pmd lock so that we
> - * cannot return prematurely, while zap_huge_pmd() has
> - * cleared *pmd but not decremented compound_mapcount().
> - */
> const softleaf_t entry = softleaf_from_pmd(pmde);
>
> if (softleaf_is_device_private(entry)) {
> @@ -284,11 +279,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> if ((pvmw->flags & PVMW_SYNC) &&
> thp_vma_suitable_order(vma, pvmw->address,
> PMD_ORDER) &&
> - (pvmw->nr_pages >= HPAGE_PMD_NR)) {
> - spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
> + (pvmw->nr_pages >= HPAGE_PMD_NR))
> + sync_with_folio_pmd_zap(mm, pvmw->pmd);
>
> - spin_unlock(ptl);
> - }
> step_forward(pvmw, PMD_SIZE);
> continue;
> }
> --
> 2.43.0
>
>
On 2/23/26 18:58, Matthew Wilcox wrote:
> On Mon, Feb 23, 2026 at 05:39:20PM +0100, David Hildenbrand (Arm) wrote:
>> void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
>>
>> +/**
>> + * sync_with_folio_pmd_zap - sync with concurrent zapping of a folio PMD
>> + * @mm: The mm_struct.
>> + * @pmdp: Pointer to the pmd that was found to be pmd_none().
>> + *
>> + * When we stumble over a pmd_none() without holding the PTL while unmapping a
>> + * folio that could have been mapped at that PMD, it could be that concurrent
>> + * zapping of the PMD is not complete yet. While the PMD might be pmd_none()
>> + * already, the folio might still appear to be mapped (folio_mapped()).
>> + *
>> + * Wait for concurrent zapping to complete by grabbing the PTL.
>> + */
>
> I like this. The one thing we've lost is the name of the function which
> does the zapping, which I think was a helpful detail.

I dropped it as we seem to have a talent to rename functions but to
forget about updating comments. This very patch is an example of that :)

But well, I used folio_mapped() ... so ...

> Perhaps not to
> someone who's deep in "how page tables work", but I wouldn't know where
> to look for the counterpart to this. So how about:
>
> Option A:
> + * When we stumble over a pmd_none() without holding the PTL while
> + * unmapping a folio that could have been mapped at that PMD,
> + * zap_huge_pmd() may not be complete yet. While the PMD might be pmd_none()
> + * already, the folio might still appear to be mapped (folio_mapped()).
>
> Option B:
> + * When we find a pmd_none() while unmapping a folio without holding
> + * the PTL, zap_huge_pmd() may have cleared the PMD but not yet
> + * modified the folio to indicate that it's unmapped.

Let's do a slightly longer B:

 * When we find a pmd_none() while unmapping a folio without holding
 * the PTL, zap_huge_pmd() may have cleared the PMD but not yet
 * modified the folio to indicate that it's unmapped. Skipping the PMD
 * without synchronization could make folio unmapping code assume
 * that unmapping failed.

Good? If so, Andrew can you squash or should I resend?

Thanks for the review!

--
Cheers,

David
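
To make the "assume that unmapping failed" scenario concrete: callers that
pass TTU_SYNC (which makes page_vma_mapped_walk() run with PVMW_SYNC)
recheck the mapcount after the rmap walk. A rough sketch, loosely modeled
on the THP split path and not a verbatim kernel excerpt:

/* Illustrative only; real callers pass additional TTU_* flags. */
static bool unmap_for_split(struct folio *folio)
{
	/* TTU_SYNC asks the rmap walk to wait for concurrent PMD zaps. */
	try_to_unmap(folio, TTU_SYNC);

	/*
	 * If the walk had skipped a PMD that zap_huge_pmd() cleared but
	 * whose mapcount decrement was still in flight, this check could
	 * fail spuriously even though the folio is effectively unmapped.
	 */
	return !folio_mapped(folio);
}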