[PATCH v14 13/13] x86/mm: only invalidate final translations with INVLPGB

Rik van Riel posted 13 patches 9 months, 3 weeks ago
There is a newer version of this series
[PATCH v14 13/13] x86/mm: only invalidate final translations with INVLPGB
Posted by Rik van Riel 9 months, 3 weeks ago
Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVPLGB.
This way only leaf mappings get removed from the TLB, leaving intermediate
translations cached.

On the (rare) occasions where we free page tables we do a full flush,
ensuring intermediate translations get flushed from the TLB.

Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Tested-by: Brendan Jackman <jackmanb@google.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
---
 arch/x86/include/asm/tlb.h | 10 ++++++++--
 arch/x86/mm/tlb.c          | 13 +++++++------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index e645884a1877..8d78667a2d1b 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -92,9 +92,15 @@ static inline void __tlbsync(void)
 static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
 						  unsigned long addr,
 						  u16 nr,
-						  bool pmd_stride)
+						  bool pmd_stride,
+						  bool freed_tables)
 {
-	__invlpgb(0, pcid, addr, nr, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+	u8 flags = INVLPGB_PCID | INVLPGB_VA;
+
+	if (!freed_tables)
+		flags |= INVLPGB_FINAL_ONLY;
+
+	__invlpgb(0, pcid, addr, nr, pmd_stride, flags);
 }
 
 /* Flush all mappings for a given PCID, not including globals. */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4d56d22b9893..91680cfd5868 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -497,9 +497,10 @@ static inline void tlbsync(void)
 
 static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
 						unsigned long addr,
-						u16 nr, bool pmd_stride)
+						u16 nr, bool pmd_stride,
+						bool freed_tables)
 {
-	__invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+	__invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride, freed_tables);
 	if (!cpu_need_tlbsync())
 		cpu_write_tlbsync(true);
 }
@@ -542,9 +543,9 @@ static void broadcast_tlb_flush(struct flush_tlb_info *info)
 			nr = clamp_val(nr, 1, invlpgb_count_max);
 		}
 
-		invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd, info->freed_tables);
 		if (static_cpu_has(X86_FEATURE_PTI))
-			invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd, info->freed_tables);
 
 		addr += nr << info->stride_shift;
 	} while (addr < info->end);
@@ -1688,10 +1689,10 @@ void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
 	u16 asid = mm_global_asid(mm);
 
 	if (asid) {
-		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false, false);
 		/* Do any CPUs supporting INVLPGB need PTI? */
 		if (static_cpu_has(X86_FEATURE_PTI))
-			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false, false);
 
 		/*
 		 * Some CPUs might still be using a local ASID for this
-- 
2.47.1
Re: [PATCH v14 13/13] x86/mm: only invalidate final translations with INVLPGB
Posted by Dave Hansen 9 months, 2 weeks ago
On 2/25/25 19:00, Rik van Riel wrote:
>  static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
>  						  unsigned long addr,
>  						  u16 nr,
> -						  bool pmd_stride)
> +						  bool pmd_stride,
> +						  bool freed_tables)
>  {
> -	__invlpgb(0, pcid, addr, nr, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
> +	u8 flags = INVLPGB_PCID | INVLPGB_VA;
> +
> +	if (!freed_tables)
> +		flags |= INVLPGB_FINAL_ONLY;
> +
> +	__invlpgb(0, pcid, addr, nr, pmd_stride, flags);
>  }

I'm not sure this is OK.

Think of a hugetlbfs mapping with shared page tables. Say you had a
1GB-sized and 1GB-aligned mapping. It might zap the one PUD that it
needs, set tlb->cleared_puds=1 but it never sets ->freed_tables because
it didn't actually free the shared page table page.

I'd honestly just throw this patch out of the series for now. All of the
other TLB invalidation that the kernel does implicitly toss out the
mid-level paging structure caches.
Re: [PATCH v14 13/13] x86/mm: only invalidate final translations with INVLPGB
Posted by Borislav Petkov 9 months, 2 weeks ago
On Mon, Mar 03, 2025 at 02:40:49PM -0800, Dave Hansen wrote:
> On 2/25/25 19:00, Rik van Riel wrote:
> >  static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
> >  						  unsigned long addr,
> >  						  u16 nr,
> > -						  bool pmd_stride)
> > +						  bool pmd_stride,
> > +						  bool freed_tables)
> >  {
> > -	__invlpgb(0, pcid, addr, nr, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
> > +	u8 flags = INVLPGB_PCID | INVLPGB_VA;
> > +
> > +	if (!freed_tables)
> > +		flags |= INVLPGB_FINAL_ONLY;
> > +
> > +	__invlpgb(0, pcid, addr, nr, pmd_stride, flags);
> >  }
> 
> I'm not sure this is OK.
> 
> Think of a hugetlbfs mapping with shared page tables. Say you had a
> 1GB-sized and 1GB-aligned mapping. It might zap the one PUD that it
> needs, set tlb->cleared_puds=1 but it never sets ->freed_tables because
> it didn't actually free the shared page table page.
> 
> I'd honestly just throw this patch out of the series for now. All of the
> other TLB invalidation that the kernel does implicitly toss out the
> mid-level paging structure caches.

Right, I guess we can revisit this later, once the dust settles.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette