From: Dave Hansen <dave.hansen@linux.intel.com>
This introduces a conditional asynchronous mechanism, enabled by
CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the
freeing of pages that are used as page tables for kernel address mappings.
These pages are now queued to a work struct instead of being freed
immediately.

This deferred freeing allows page tables to be batch-freed, providing a
safe context in which a single expensive operation (a TLB flush) can be
performed once for a whole batch of kernel page tables instead of once
per page table.

On x86, CONFIG_ASYNC_KERNEL_PGTABLE_FREE is selected if CONFIG_IOMMU_SVA
is enabled, because both the Intel and AMD IOMMU architectures may cache
kernel page table entries in their paging-structure caches, regardless
of the permissions.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
arch/x86/Kconfig | 1 +
mm/Kconfig | 3 +++
include/linux/mm.h | 16 +++++++++++++---
mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++
4 files changed, 54 insertions(+), 3 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fa3b616af03a..ded29ee848fd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -279,6 +279,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA
select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
diff --git a/mm/Kconfig b/mm/Kconfig
index 0e26f4fc8717..a83df9934acd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -908,6 +908,9 @@ config PAGE_MAPCOUNT
config PGTABLE_HAS_HUGE_LEAVES
def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE
+config ASYNC_KERNEL_PGTABLE_FREE
+ def_bool n
+
# TODO: Allow to be enabled without THP
config ARCH_SUPPORTS_HUGE_PFNMAP
def_bool n
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb235a9f991e..fe5515725c46 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3031,6 +3031,14 @@ static inline void __pagetable_free(struct ptdesc *pt)
__free_pages(page, compound_order(page));
}
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+void pagetable_free_kernel(struct ptdesc *pt);
+#else
+static inline void pagetable_free_kernel(struct ptdesc *pt)
+{
+ __pagetable_free(pt);
+}
+#endif
/**
* pagetable_free - Free pagetables
* @pt: The page table descriptor
@@ -3040,10 +3048,12 @@ static inline void __pagetable_free(struct ptdesc *pt)
*/
static inline void pagetable_free(struct ptdesc *pt)
{
- if (ptdesc_test_kernel(pt))
+ if (ptdesc_test_kernel(pt)) {
ptdesc_clear_kernel(pt);
-
- __pagetable_free(pt);
+ pagetable_free_kernel(pt);
+ } else {
+ __pagetable_free(pt);
+ }
}
#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..1c7caa8ef164 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -406,3 +406,40 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
pte_unmap_unlock(pte, ptl);
goto again;
}
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+static void kernel_pgtable_work_func(struct work_struct *work);
+
+static struct {
+ struct list_head list;
+ /* protect the ptdesc list above */
+ spinlock_t lock;
+ struct work_struct work;
+} kernel_pgtable_work = {
+ .list = LIST_HEAD_INIT(kernel_pgtable_work.list),
+ .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
+ .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
+};
+
+static void kernel_pgtable_work_func(struct work_struct *work)
+{
+ struct ptdesc *pt, *next;
+ LIST_HEAD(page_list);
+
+ spin_lock(&kernel_pgtable_work.lock);
+ list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+ spin_unlock(&kernel_pgtable_work.lock);
+
+ list_for_each_entry_safe(pt, next, &page_list, pt_list)
+ __pagetable_free(pt);
+}
+
+void pagetable_free_kernel(struct ptdesc *pt)
+{
+ spin_lock(&kernel_pgtable_work.lock);
+ list_add(&pt->pt_list, &kernel_pgtable_work.list);
+ spin_unlock(&kernel_pgtable_work.lock);
+
+ schedule_work(&kernel_pgtable_work.work);
+}
+#endif
--
2.43.0
On 14.10.25 15:04, Lu Baolu wrote:
> From: Dave Hansen <dave.hansen@linux.intel.com>
>
> [...]
>
> On x86, CONFIG_ASYNC_KERNEL_PGTABLE_FREE is selected if CONFIG_IOMMU_SVA
> is enabled, because both the Intel and AMD IOMMU architectures may cache
> kernel page table entries in their paging-structure caches, regardless
> of the permissions.

See below, I assume this is patch #7 material.

> [...]
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index fa3b616af03a..ded29ee848fd 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -279,6 +279,7 @@ config X86
>  	select HAVE_PCI
>  	select HAVE_PERF_REGS
>  	select HAVE_PERF_USER_STACK_DUMP
> +	select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA

That should belong into patch #7, no?

-- 
Cheers

David / dhildenb
On 10/17/25 03:35, David Hildenbrand wrote:
> On 14.10.25 15:04, Lu Baolu wrote:
>> [...]
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index fa3b616af03a..ded29ee848fd 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -279,6 +279,7 @@ config X86
>>  	select HAVE_PCI
>>  	select HAVE_PERF_REGS
>>  	select HAVE_PERF_USER_STACK_DUMP
>> +	select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA
>
> That should belong into patch #7, no?

Yes. Done.

Thanks,
baolu
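
Note for context: the work function added in this patch only drains the
pending list and frees the pages; the single expensive per-batch operation
that the commit message refers to (flushing the CPU TLB and the IOMMU
paging-structure caches) is not part of this patch. The sketch below is
only an illustration of where such a flush would slot into the batched
path; the flush helper named here is hypothetical and does not exist in
the series as posted.

static void kernel_pgtable_work_func(struct work_struct *work)
{
	struct ptdesc *pt, *next;
	LIST_HEAD(page_list);

	/* Detach the whole pending batch under the lock. */
	spin_lock(&kernel_pgtable_work.lock);
	list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
	spin_unlock(&kernel_pgtable_work.lock);

	/*
	 * Hypothetical hook: a single flush here would cover every page
	 * table queued in this batch, instead of one flush per freed table.
	 */
	flush_caches_for_freed_kernel_pgtables();

	list_for_each_entry_safe(pt, next, &page_list, pt_list)
		__pagetable_free(pt);
}

In the posted code, pagetable_free_kernel() only queues the ptdesc and
kicks the work item, so callers never block on any such flush.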