include/linux/mm.h | 7 ------- mm/pagewalk.c | 18 ++++++++++++++++-- mm/pgtable-generic.c | 21 ++++++++++++++++++++- 3 files changed, 36 insertions(+), 10 deletions(-)
ptdump walks the kernel page tables holding only the init_mm mmap lock
and the memory hotplug lock. Neither of those stops vmalloc or ioremap
from freeing a kernel PTE page underneath the walk. When
vmap_try_huge_pmd() installs a huge mapping it collapses the existing
PTE table and frees it through pmd_free_pte_page(), and on x86 that
happens without the init_mm mmap lock. syzbot caught the resulting
use-after-free in ptdump_pte_entry() reading a page table that had
already been freed.
pagetable_free_kernel() used to free the page immediately on
configurations without CONFIG_ASYNC_KERNEL_PGTABLE_FREE, and on the
async ones it only batched a TLB flush before freeing. In both cases a
lockless walker could still be dereferencing the page.
Defer the free by a grace period instead. pagetable_free_kernel() now
hands every kernel page table to call_rcu(), so the page stays valid
until any walk that may have observed it has finished. The async path
keeps doing its TLB flush first and then queues the RCU free per page.
On the read side, walk_page_range_debug() takes the RCU read lock
around the kernel walk through the new walk_kernel_page_table_range_rcu()
helper. A walker either sees the cleared PMD and skips the page, or
keeps it alive until it drops the lock. The plain
walk_kernel_page_table_range() stays as it is for callers that already
own their range and cannot race a free, such as the arm64 page table
split paths.
Fixes: 5ba2f0a15564 ("mm: introduce deferred freeing for kernel page tables")
Reported-by: syzbot+fd95a72470f5a44e464c@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6a287988.39669fcc.33b062.00a0.GAE@google.com/T/
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: David Carlier <devnexen@gmail.com>
---
v4: defer the free in both the async and non-async configs, not just
the async one. Move the walk under a named
walk_kernel_page_table_range_rcu() helper instead of open-coding
rcu_read_lock() in walk_page_range_debug().
v3: take rcu_read_lock() in the init_mm branch of
walk_page_range_debug() rather than inside the lockless walker,
which the arm64 split paths also use with GFP_PGTABLE_KERNEL and
can sleep.
v2: use call_rcu() instead of synchronize_rcu().
---
include/linux/mm.h | 7 -------
mm/pagewalk.c | 18 ++++++++++++++++--
mm/pgtable-generic.c | 21 ++++++++++++++++++++-
3 files changed, 36 insertions(+), 10 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 485df9c2dbdd..79408a17a1b0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3695,14 +3695,7 @@ static inline void __pagetable_free(struct ptdesc *pt)
__free_pages(page, compound_order(page));
}
-#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
void pagetable_free_kernel(struct ptdesc *pt);
-#else
-static inline void pagetable_free_kernel(struct ptdesc *pt)
-{
- __pagetable_free(pt);
-}
-#endif
/**
* pagetable_free - Free pagetables
* @pt: The page table descriptor
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 3ae2586ff45b..5b5807a88394 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -664,6 +664,19 @@ int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end
return walk_pgd_range(start, end, &walk);
}
+static int walk_kernel_page_table_range_rcu(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd,
+ void *private)
+{
+ int err;
+
+ rcu_read_lock();
+ err = walk_kernel_page_table_range(start, end, ops, pgd, private);
+ rcu_read_unlock();
+
+ return err;
+}
+
/**
* walk_page_range_debug - walk a range of pagetables not backed by a vma
* @mm: mm_struct representing the target process of page table walk
@@ -693,8 +706,9 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
/* For convenience, we allow traversal of kernel mappings. */
if (mm == &init_mm)
- return walk_kernel_page_table_range(start, end, ops,
- pgd, private);
+ return walk_kernel_page_table_range_rcu(start, end, ops, pgd,
+ private);
+
if (start >= end || !walk.mm)
return -EINVAL;
if (!check_ops_safe(ops))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index b91b1a98029c..d45a556b4021 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -410,6 +410,13 @@ pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
goto again;
}
+static void kernel_pgtable_free_rcu(struct rcu_head *head)
+{
+ struct ptdesc *pt = container_of(head, struct ptdesc, pt_rcu_head);
+
+ __pagetable_free(pt);
+}
+
#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
static void kernel_pgtable_work_func(struct work_struct *work);
@@ -434,8 +441,15 @@ static void kernel_pgtable_work_func(struct work_struct *work)
spin_unlock(&kernel_pgtable_work.lock);
iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
+
+ /*
+ * ptdump walks the kernel page tables under rcu_read_lock(), taken in
+ * walk_page_range_debug(), without otherwise pinning the pages it
+ * dereferences. Free them after a grace period so such a walker
+ * cannot still be reading a page we release.
+ */
list_for_each_entry_safe(pt, next, &page_list, pt_list)
- __pagetable_free(pt);
+ call_rcu(&pt->pt_rcu_head, kernel_pgtable_free_rcu);
}
void pagetable_free_kernel(struct ptdesc *pt)
@@ -446,4 +460,9 @@ void pagetable_free_kernel(struct ptdesc *pt)
schedule_work(&kernel_pgtable_work.work);
}
+#else
+void pagetable_free_kernel(struct ptdesc *pt)
+{
+ call_rcu(&pt->pt_rcu_head, kernel_pgtable_free_rcu);
+}
#endif
--
2.53.0
© 2016 - 2026 Red Hat, Inc.