[PATCH v8 03/12] x86/mm: consolidate full flush threshold decision

Posted by Rik van Riel 14 hours ago
Reduce code duplication by consolidating the decision point
for whether to do individual invalidations or a full flush
inside get_flush_tlb_info.

Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
---
 arch/x86/mm/tlb.c | 52 ++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6cf881a942bb..02e1f5c5bca3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1000,8 +1000,13 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
 #endif
 
-	info->start		= start;
-	info->end		= end;
+	/*
+	 * Round the start and end addresses to the page size specified
+	 * by the stride shift. This ensures partial pages at the end of
+	 * a range get fully invalidated.
+	 */
+	info->start		= round_down(start, 1 << stride_shift);
+	info->end		= round_up(end, 1 << stride_shift);
 	info->mm		= mm;
 	info->stride_shift	= stride_shift;
 	info->freed_tables	= freed_tables;
@@ -1009,6 +1014,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	info->initiating_cpu	= smp_processor_id();
 	info->trim_cpumask	= 0;
 
+	/*
+	 * If the number of flushes is so large that a full flush
+	 * would be faster, do a full flush.
+	 */
+	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+		info->start = 0;
+		info->end = TLB_FLUSH_ALL;
+	}
+
 	return info;
 }
 
@@ -1026,17 +1040,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				bool freed_tables)
 {
 	struct flush_tlb_info *info;
+	int cpu = get_cpu();
 	u64 new_tlb_gen;
-	int cpu;
-
-	cpu = get_cpu();
-
-	/* Should we flush just the requested range? */
-	if ((end == TLB_FLUSH_ALL) ||
-	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
-		start = 0;
-		end = TLB_FLUSH_ALL;
-	}
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1089,22 +1094,19 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	/* Balance as user space task's flush, a bit conservative */
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
-	} else {
-		struct flush_tlb_info *info;
+	struct flush_tlb_info *info;
 
-		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false,
-					  TLB_GENERATION_INVALID);
+	guard(preempt)();
+
+	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+				  TLB_GENERATION_INVALID);
 
+	if (info->end == TLB_FLUSH_ALL)
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else
 		on_each_cpu(do_kernel_range_flush, info, 1);
 
-		put_flush_tlb_info();
-		preempt_enable();
-	}
+	put_flush_tlb_info();
 }
 
 /*
@@ -1276,7 +1278,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
 	int cpu = get_cpu();
 
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, PAGE_SHIFT, false,
 				  TLB_GENERATION_INVALID);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
-- 
2.47.1
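
As a standalone illustration of the logic this patch consolidates (not part of the patch itself), the sketch below mimics what get_flush_tlb_info() now does for every caller: round the range out to the stride size, then fall back to a full flush once the number of pages exceeds the ceiling. round_down() and round_up() are reimplemented locally, and the ceiling is hardcoded to 33, the default value of tlb_single_page_flush_ceiling.

#include <stdio.h>

#define PAGE_SHIFT		12
#define TLB_FLUSH_ALL		(~0UL)

/* Local stand-ins for the kernel's round_down()/round_up() macros. */
#define round_down(x, y)	((x) & ~((unsigned long)(y) - 1))
#define round_up(x, y)		(((x) + (y) - 1) & ~((unsigned long)(y) - 1))

static const unsigned long tlb_single_page_flush_ceiling = 33;

/* Same decision get_flush_tlb_info() now makes for every caller. */
static void decide(unsigned long start, unsigned long end,
		   unsigned int stride_shift)
{
	unsigned long flush_start = round_down(start, 1UL << stride_shift);
	unsigned long flush_end = round_up(end, 1UL << stride_shift);

	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
		flush_start = 0;
		flush_end = TLB_FLUSH_ALL;
	}

	if (flush_end == TLB_FLUSH_ALL)
		printf("[%#lx, %#lx) -> full flush\n", start, end);
	else
		printf("[%#lx, %#lx) -> flush [%#lx, %#lx)\n",
		       start, end, flush_start, flush_end);
}

int main(void)
{
	/* Unaligned range spanning a few pages: rounded out, flushed per page. */
	decide(0x1234, 0x5678, PAGE_SHIFT);

	/* 1 MiB range: 256 pages exceed the ceiling of 33 -> full flush. */
	decide(0x100000, 0x200000, PAGE_SHIFT);

	return 0;
}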

Re: [PATCH v8 03/12] x86/mm: consolidate full flush threshold decision
Posted by Peter Zijlstra 3 hours ago
On Tue, Feb 04, 2025 at 08:39:52PM -0500, Rik van Riel wrote:

> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 6cf881a942bb..02e1f5c5bca3 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1000,8 +1000,13 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
>  	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
>  #endif
>  
> -	info->start		= start;
> -	info->end		= end;
> +	/*
> +	 * Round the start and end addresses to the page size specified
> +	 * by the stride shift. This ensures partial pages at the end of
> +	 * a range get fully invalidated.
> +	 */
> +	info->start		= round_down(start, 1 << stride_shift);
> +	info->end		= round_up(end, 1 << stride_shift);
>  	info->mm		= mm;
>  	info->stride_shift	= stride_shift;
>  	info->freed_tables	= freed_tables;

Rather than doing this; should we not fix whatever dodgy users are
feeding us non-page-aligned addresses for invalidation?

Re: [PATCH v8 03/12] x86/mm: consolidate full flush threshold decision
Posted by Peter Zijlstra 3 hours ago
On Wed, Feb 05, 2025 at 01:20:07PM +0100, Peter Zijlstra wrote:
> On Tue, Feb 04, 2025 at 08:39:52PM -0500, Rik van Riel wrote:
> 
> > diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> > index 6cf881a942bb..02e1f5c5bca3 100644
> > --- a/arch/x86/mm/tlb.c
> > +++ b/arch/x86/mm/tlb.c
> > @@ -1000,8 +1000,13 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> >  	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
> >  #endif
> >  
> > -	info->start		= start;
> > -	info->end		= end;
> > +	/*
> > +	 * Round the start and end addresses to the page size specified
> > +	 * by the stride shift. This ensures partial pages at the end of
> > +	 * a range get fully invalidated.
> > +	 */
> > +	info->start		= round_down(start, 1 << stride_shift);
> > +	info->end		= round_up(end, 1 << stride_shift);
> >  	info->mm		= mm;
> >  	info->stride_shift	= stride_shift;
> >  	info->freed_tables	= freed_tables;
> 
> Rather than doing this; should we not fix whatever dodgy users are
> feeding us non-page-aligned addresses for invalidation?

That is, I feel this is worthy of WARN_ONCE at the very least.

Re: [PATCH v8 03/12] x86/mm: consolidate full flush threshold decision
Posted by Rik van Riel 2 hours ago
On Wed, 2025-02-05 at 13:20 +0100, Peter Zijlstra wrote:
> On Tue, Feb 04, 2025 at 08:39:52PM -0500, Rik van Riel wrote:
> 
> > diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> > index 6cf881a942bb..02e1f5c5bca3 100644
> > --- a/arch/x86/mm/tlb.c
> > +++ b/arch/x86/mm/tlb.c
> > @@ -1000,8 +1000,13 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
> >  	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
> >  #endif
> >  
> > -	info->start		= start;
> > -	info->end		= end;
> > +	/*
> > +	 * Round the start and end addresses to the page size specified
> > +	 * by the stride shift. This ensures partial pages at the end of
> > +	 * a range get fully invalidated.
> > +	 */
> > +	info->start		= round_down(start, 1 << stride_shift);
> > +	info->end		= round_up(end, 1 << stride_shift);
> >  	info->mm		= mm;
> >  	info->stride_shift	= stride_shift;
> >  	info->freed_tables	= freed_tables;
> 
> Rather than doing this; should we not fix whatever dodgy users are
> feeding us non-page-aligned addresses for invalidation?
> 

The best way to do that would probably be by adding
a WARN_ON_ONCE here if the value of either start or
end changed, not by merging code that will trigger
kernel crashes - even if the bug is elsewhere.

I would be happy to add a WARN_ON_ONCE either in a
next version, or in a follow-up patch, whichever is
more convenient for you.

-- 
All Rights Reversed.
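
For reference, the WARN_ON_ONCE discussed above could look roughly like the sketch below, placed in get_flush_tlb_info() right after the new rounding. This is only an illustration of the idea, not code from any posted version of the series; the exact condition, in particular how TLB_FLUSH_ALL is excluded, is an assumption.

	info->start		= round_down(start, 1 << stride_shift);
	info->end		= round_up(end, 1 << stride_shift);

	/*
	 * Callers are expected to pass stride-aligned addresses; if the
	 * rounding changed anything, flag the caller rather than silently
	 * papering over it.
	 */
	WARN_ON_ONCE(info->start != start ||
		     (end != TLB_FLUSH_ALL && info->end != end));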