From: Dave Hansen <dave.hansen@linux.intel.com>
Introduce a conditional asynchronous mechanism, enabled by
CONFIG_ASYNC_KERNEL_PGTABLE_FREE, that defers the freeing of pages used
as page tables for kernel address mappings. Instead of being freed
immediately, these pages are now queued to a work struct.

This deferred freeing allows page tables to be batch-freed, providing a
safe context in which a single expensive operation (a TLB flush) can be
performed once for a whole batch of kernel page tables rather than once
per page table.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
mm/Kconfig | 3 +++
include/linux/mm.h | 16 +++++++++++++---
mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++
3 files changed, 53 insertions(+), 3 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 0e26f4fc8717..a83df9934acd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -908,6 +908,9 @@ config PAGE_MAPCOUNT
config PGTABLE_HAS_HUGE_LEAVES
def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE
+config ASYNC_KERNEL_PGTABLE_FREE
+ def_bool n
+
# TODO: Allow to be enabled without THP
config ARCH_SUPPORTS_HUGE_PFNMAP
def_bool n
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52ae551d0eb4..d521abd33164 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3031,6 +3031,14 @@ static inline void __pagetable_free(struct ptdesc *pt)
__free_pages(page, compound_order(page));
}
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+void pagetable_free_kernel(struct ptdesc *pt);
+#else
+static inline void pagetable_free_kernel(struct ptdesc *pt)
+{
+ __pagetable_free(pt);
+}
+#endif
/**
* pagetable_free - Free pagetables
* @pt: The page table descriptor
@@ -3040,10 +3048,12 @@ static inline void __pagetable_free(struct ptdesc *pt)
*/
static inline void pagetable_free(struct ptdesc *pt)
{
- if (ptdesc_test_kernel(pt))
+ if (ptdesc_test_kernel(pt)) {
ptdesc_clear_kernel(pt);
-
- __pagetable_free(pt);
+ pagetable_free_kernel(pt);
+ } else {
+ __pagetable_free(pt);
+ }
}
#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..1c7caa8ef164 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -406,3 +406,40 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
pte_unmap_unlock(pte, ptl);
goto again;
}
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+static void kernel_pgtable_work_func(struct work_struct *work);
+
+static struct {
+ struct list_head list;
+ /* protect above ptdesc lists */
+ spinlock_t lock;
+ struct work_struct work;
+} kernel_pgtable_work = {
+ .list = LIST_HEAD_INIT(kernel_pgtable_work.list),
+ .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
+ .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
+};
+
+static void kernel_pgtable_work_func(struct work_struct *work)
+{
+ struct ptdesc *pt, *next;
+ LIST_HEAD(page_list);
+
+ spin_lock(&kernel_pgtable_work.lock);
+ list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+ spin_unlock(&kernel_pgtable_work.lock);
+
+ list_for_each_entry_safe(pt, next, &page_list, pt_list)
+ __pagetable_free(pt);
+}
+
+void pagetable_free_kernel(struct ptdesc *pt)
+{
+ spin_lock(&kernel_pgtable_work.lock);
+ list_add(&pt->pt_list, &kernel_pgtable_work.list);
+ spin_unlock(&kernel_pgtable_work.lock);
+
+ schedule_work(&kernel_pgtable_work.work);
+}
+#endif
--
2.43.0
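The expensive flush that makes this batching worthwhile is not added by
this patch itself; the work function above only frees the pages. As a
rough sketch of where such a flush could slot in, building on the
definitions added above (flush_kernel_pgtable_caches() is a hypothetical
placeholder for whatever CPU or IOMMU invalidation is required, not an
existing API):

/*
 * Sketch only: where a batched flush could sit in the deferred-free
 * work.  flush_kernel_pgtable_caches() is hypothetical and not part of
 * this patch.
 */
static void kernel_pgtable_work_func(struct work_struct *work)
{
	struct ptdesc *pt, *next;
	LIST_HEAD(page_list);

	spin_lock(&kernel_pgtable_work.lock);
	list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
	spin_unlock(&kernel_pgtable_work.lock);

	/* One expensive flush covers every page table in the batch. */
	if (!list_empty(&page_list))
		flush_kernel_pgtable_caches();

	/* Only then return the pages to the allocator. */
	list_for_each_entry_safe(pt, next, &page_list, pt_list)
		__pagetable_free(pt);
}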
On Wed, Oct 22, 2025 at 04:26:33PM +0800, Lu Baolu wrote:
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
--
Sincerely yours,
Mike.
On 22.10.25 10:26, Lu Baolu wrote:
Acked-by: David Hildenbrand <david@redhat.com>
I was briefly wondering whether the pages can get stuck in there
sufficiently long that we would want to wire up the shrinker to say
"OOM, hold your horses, we can still free something here".
But I'd assume the workqueue will get scheduled in a reasonable
timeframe either so this is not a concern?
--
Cheers
David / dhildenb
On Wed, Oct 22, 2025 at 08:34:53PM +0200, David Hildenbrand wrote:
> Acked-by: David Hildenbrand <david@redhat.com>
>
> I was briefly wondering whether the pages can get stuck in there
> sufficiently long that we would want to wire up the shrinker to say "OOM,
> hold your horses, we can still free something here".
>
> But I'd assume the workqueue will get scheduled in a reasonable timeframe
> either so this is not a concern?
Maybe it should have this set then:
``WQ_MEM_RECLAIM``
All wq which might be used in the memory reclaim paths **MUST**
have this flag set. The wq is guaranteed to have at least one
execution context regardless of memory pressure.
So it can't get locked up and will eventually run and free.
Jason
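As a rough sketch of that suggestion, building on the patch above (the
workqueue name and initcall are assumptions; the posted patch simply
uses schedule_work() on the system workqueue):

static struct workqueue_struct *kernel_pgtable_wq;

static int __init kernel_pgtable_wq_init(void)
{
	/* Rescuer-backed queue: guaranteed an execution context under reclaim. */
	kernel_pgtable_wq = alloc_workqueue("kernel_pgtable_free",
					    WQ_MEM_RECLAIM, 0);
	return kernel_pgtable_wq ? 0 : -ENOMEM;
}
subsys_initcall(kernel_pgtable_wq_init);

void pagetable_free_kernel(struct ptdesc *pt)
{
	spin_lock(&kernel_pgtable_work.lock);
	list_add(&pt->pt_list, &kernel_pgtable_work.list);
	spin_unlock(&kernel_pgtable_work.lock);

	/* Queue on the dedicated WQ_MEM_RECLAIM workqueue. */
	queue_work(kernel_pgtable_wq, &kernel_pgtable_work.work);
}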
On 10/22/25 11:34, David Hildenbrand wrote:
...
> I was briefly wondering whether the pages can get stuck in there
> sufficiently long that we would want to wire up the shrinker to say
> "OOM, hold your horses, we can still free something here".
>
> But I'd assume the workqueue will get scheduled in a reasonable
> timeframe either so this is not a concern?

First, I can't fathom there will ever be more than a couple of pages in
there. If there's an OOM going on, there's probably no shortage of idle
time leading up to and during the OOM as threads plow into mutexes and
wait for I/O. That's when the work will get handled even more quickly
than normal.

I suspect it'll work itself out naturally. It wouldn't be hard to toss a
counter in there for the list length and dump it at OOM, or pr_info() if
it's got more than a few pages on it.
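A minimal sketch of that diagnostic, again building on the patch above
(the counter, threshold, and message are made up for illustration and
are not part of the posted patch):

static unsigned long kernel_pgtable_queued;	/* protected by kernel_pgtable_work.lock */

void pagetable_free_kernel(struct ptdesc *pt)
{
	spin_lock(&kernel_pgtable_work.lock);
	list_add(&pt->pt_list, &kernel_pgtable_work.list);
	/* Purely diagnostic: complain if the backlog ever grows long. */
	if (++kernel_pgtable_queued > 16)
		pr_info_ratelimited("kernel pgtable free backlog: %lu pages\n",
				    kernel_pgtable_queued);
	spin_unlock(&kernel_pgtable_work.lock);

	schedule_work(&kernel_pgtable_work.work);
}

The work function would decrement the counter under the same lock when
it splices the list off for freeing.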