[PATCH v14 01/13] x86/mm: consolidate full flush threshold decision

Posted by Rik van Riel 9 months, 3 weeks ago
Reduce code duplication by consolidating the decision point
for whether to do individual invalidations or a full flush
inside get_flush_tlb_info.

Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
---
 arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ffc25b348041..dbcb5c968ff9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1000,6 +1000,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
 #endif
 
+	/*
+	 * If the number of flushes is so large that a full flush
+	 * would be faster, do a full flush.
+	 */
+	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
+	}
+
 	info->start		= start;
 	info->end		= end;
 	info->mm		= mm;
@@ -1026,17 +1035,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				bool freed_tables)
 {
 	struct flush_tlb_info *info;
+	int cpu = get_cpu();
 	u64 new_tlb_gen;
-	int cpu;
-
-	cpu = get_cpu();
-
-	/* Should we flush just the requested range? */
-	if ((end == TLB_FLUSH_ALL) ||
-	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
-		start = 0;
-		end = TLB_FLUSH_ALL;
-	}
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1089,22 +1089,19 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	/* Balance as user space task's flush, a bit conservative */
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
-	} else {
-		struct flush_tlb_info *info;
+	struct flush_tlb_info *info;
+
+	guard(preempt)();
 
-		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false,
-					  TLB_GENERATION_INVALID);
+	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+				  TLB_GENERATION_INVALID);
 
+	if (info->end == TLB_FLUSH_ALL)
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else
 		on_each_cpu(do_kernel_range_flush, info, 1);
 
-		put_flush_tlb_info();
-		preempt_enable();
-	}
+	put_flush_tlb_info();
 }
 
 /*
-- 
2.47.1
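
An illustrative aside: the decision that used to live in flush_tlb_mm_range()
and flush_tlb_kernel_range() is now made once in get_flush_tlb_info(). Below is
a minimal userspace C sketch of that policy; the ceiling default of 33 matches
the kernel's (runtime-tunable via
/sys/kernel/debug/x86/tlb_single_page_flush_ceiling), while TLB_FLUSH_ALL's
value and the helper name here are illustrative only.

#include <stdio.h>

#define TLB_FLUSH_ALL	(~0UL)

/* Kernel default; runtime-tunable through debugfs. */
static unsigned long tlb_single_page_flush_ceiling = 33;

/* Sketch of the check get_flush_tlb_info() now owns: if flushing
 * page by page would take more invalidations than the ceiling
 * allows, widen the request to a full flush. */
static void decide_flush_range(unsigned long *start, unsigned long *end,
			       unsigned int stride_shift)
{
	if ((*end - *start) >> stride_shift > tlb_single_page_flush_ceiling) {
		*start = 0;
		*end = TLB_FLUSH_ALL;
	}
}

int main(void)
{
	unsigned long start = 0x1000, end = 0x1000 + (64UL << 12);

	decide_flush_range(&start, &end, 12);	/* 64 4KiB pages > 33 */
	printf(end == TLB_FLUSH_ALL ? "full flush\n" : "ranged flush\n");
	return 0;
}
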
[BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Giovanni Cabiddu 3 months, 2 weeks ago
On Tue, Feb 25, 2025 at 10:00:36PM -0500, Rik van Riel wrote:
> Reduce code duplication by consolidating the decision point
> for whether to do individual invalidations or a full flush
> inside get_flush_tlb_info.
> 
> Signed-off-by: Rik van Riel <riel@surriel.com>
> Suggested-by: Dave Hansen <dave.hansen@intel.com>
> Tested-by: Michael Kelley <mhklinux@outlook.com>
> Acked-by: Dave Hansen <dave.hansen@intel.com>
> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
> ---
After 4a02ed8e1cc3 ("x86/mm: Consolidate full flush threshold
decision"), we've seen data corruption in DMAd buffers when testing SVA.

From our preliminary analysis, it appears that get_flush_tlb_info()
modifies the start and end parameters for full TLB flushes (setting
start=0, end=TLB_FLUSH_ALL). However, the MMU notifier call at the end
of the function still uses the original parameters instead of the
updated info->start and info->end.

The change below appears to solve the problem; however, we are not sure
it is the right way to fix it.

----8<----
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 39f80111e6f1..e66c7662c254 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1459,7 +1459,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	put_flush_tlb_info();
 	put_cpu();
-	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, info->start, info->end);
 }
 
 static void do_flush_tlb_all(void *info)
-- 
2.51.0
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Jann Horn 3 months, 2 weeks ago
On Tue, Sep 2, 2025 at 5:44 PM Giovanni Cabiddu
<giovanni.cabiddu@intel.com> wrote:
> On Tue, Feb 25, 2025 at 10:00:36PM -0500, Rik van Riel wrote:
> > Reduce code duplication by consolidating the decision point
> > for whether to do individual invalidations or a full flush
> > inside get_flush_tlb_info.
> >
> > Signed-off-by: Rik van Riel <riel@surriel.com>
> > Suggested-by: Dave Hansen <dave.hansen@intel.com>
> > Tested-by: Michael Kelley <mhklinux@outlook.com>
> > Acked-by: Dave Hansen <dave.hansen@intel.com>
> > Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
> > ---
> After 4a02ed8e1cc3 ("x86/mm: Consolidate full flush threshold
> decision"), we've seen data corruption in DMAd buffers when testing SVA.

If it's not too much effort, you could try to see whether bumping
/sys/kernel/debug/x86/tlb_single_page_flush_ceiling to some relatively
large value (maybe something like 262144, which is 512*512) causes you
to see the same kind of issue before commit 4a02ed8e1cc3. (Note that
increasing tlb_single_page_flush_ceiling costs performance, and
putting an overly big value in there will probably make the system
completely unresponsive.)
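
For anyone reproducing this, the knob can be set like so (root and a mounted
debugfs assumed; 262144 is the experiment value suggested above, with the
same caveat about responsiveness):

# on a kernel from before 4a02ed8e1cc3:
echo 262144 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
cat /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
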
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Giovanni Cabiddu 3 months, 2 weeks ago
On Tue, Sep 02, 2025 at 06:31:49PM +0200, Jann Horn wrote:
> On Tue, Sep 2, 2025 at 5:44 PM Giovanni Cabiddu
> <giovanni.cabiddu@intel.com> wrote:
> > On Tue, Feb 25, 2025 at 10:00:36PM -0500, Rik van Riel wrote:
> > > Reduce code duplication by consolidating the decision point
> > > for whether to do individual invalidations or a full flush
> > > inside get_flush_tlb_info.
> > >
> > > Signed-off-by: Rik van Riel <riel@surriel.com>
> > > Suggested-by: Dave Hansen <dave.hansen@intel.com>
> > > Tested-by: Michael Kelley <mhklinux@outlook.com>
> > > Acked-by: Dave Hansen <dave.hansen@intel.com>
> > > Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
> > > ---
> > After 4a02ed8e1cc3 ("x86/mm: Consolidate full flush threshold
> > decision"), we've seen data corruption in DMAd buffers when testing SVA.
> 
> If it's not too much effort, you could try to see whether bumping
> /sys/kernel/debug/x86/tlb_single_page_flush_ceiling to some relatively
> large value (maybe something like 262144, which is 512*512) causes you
> to see the same kind of issue before commit 4a02ed8e1cc3. (Note that
> increasing tlb_single_page_flush_ceiling costs performance, and
> putting an overly big value in there will probably make the system
> completely unresponsive.)
Thanks. We will try increasing tlb_single_page_flush_ceiling on a
kernel from before 4a02ed8e1cc3.

I'm not familiar with this code, but based on the commit message,
4a02ed8e1cc3 appears to be a pure refactor. However, that doesn't seem
to be the case: before the commit,
mmu_notifier_arch_invalidate_secondary_tlbs() was getting the modified
values of start and end; after the commit, it appears to use the
original values instead. Was this intentional?

Regards,

-- 
Giovanni
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Jann Horn 3 months, 2 weeks ago
On Tue, Sep 2, 2025 at 5:44 PM Giovanni Cabiddu
<giovanni.cabiddu@intel.com> wrote:
> On Tue, Feb 25, 2025 at 10:00:36PM -0500, Rik van Riel wrote:
> > Reduce code duplication by consolidating the decision point
> > for whether to do individual invalidations or a full flush
> > inside get_flush_tlb_info.
> >
> > Signed-off-by: Rik van Riel <riel@surriel.com>
> > Suggested-by: Dave Hansen <dave.hansen@intel.com>
> > Tested-by: Michael Kelley <mhklinux@outlook.com>
> > Acked-by: Dave Hansen <dave.hansen@intel.com>
> > Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
> > ---
> After 4a02ed8e1cc3 ("x86/mm: Consolidate full flush threshold
> decision"), we've seen data corruption in DMAd buffers when testing SVA.
>
> From our preliminary analysis, it appears that get_flush_tlb_info()
> modifies the start and end parameters for full TLB flushes (setting
> start=0, end=TLB_FLUSH_ALL). However, the MMU notifier call at the end
> of the function still uses the original parameters instead of the
> updated info->start and info->end.
>
> The change below appears to solve the problem, however we are not sure if
> this is the right way to fix the problem.
>
> ----8<----
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 39f80111e6f1..e66c7662c254 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1459,7 +1459,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>
>         put_flush_tlb_info();
>         put_cpu();
> -       mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
> +       mmu_notifier_arch_invalidate_secondary_tlbs(mm, info->start, info->end);
>  }

I don't see why the IOMMU flush should be broadened just because the
CPU flush got broadened.

On x86, IOMMU flushes happen from arch_tlbbatch_add_pending() and
flush_tlb_mm_range(); the IOMMU folks might know better, but as far as
I know, there is nothing that elides IOMMU flushes depending on the
state of X86-internal flush generation tracking or such.

To me this looks like a change that is correct but makes it easier to
hit IOMMU flushing issues in other places.

Are you encountering these issues on an Intel system or an AMD system?
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Nadav Amit 3 months, 2 weeks ago
I just noticed a few things that need clarification

> On 2 Sep 2025, at 19:05, Jann Horn <jannh@google.com> wrote:
> 
> On Tue, Sep 2, 2025 at 5:44 PM Giovanni Cabiddu
> <giovanni.cabiddu@intel.com> wrote:
>> 

[snip]

>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>> index 39f80111e6f1..e66c7662c254 100644
>> --- a/arch/x86/mm/tlb.c
>> +++ b/arch/x86/mm/tlb.c
>> @@ -1459,7 +1459,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>> 
>>        put_flush_tlb_info();
>>        put_cpu();
>> -       mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
>> +       mmu_notifier_arch_invalidate_secondary_tlbs(mm, info->start, info->end);
>> }
> 
> I don't see why the IOMMU flush should be broadened just because the
> CPU flush got broadened.

Agreed (as Rik also indicated now)

> 
> On x86, IOMMU flushes happen from arch_tlbbatch_add_pending() and
> flush_tlb_mm_range(); the IOMMU folks might know better, but as far as
> I know, there is nothing that elides IOMMU flushes depending on the
> state of X86-internal flush generation tracking or such.
> 
> To me this looks like a change that is correct but makes it easier to
> hit IOMMU flushing issues in other places.

This change is not correct. Do not reference info after calling
put_flush_tlb_info().
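
A minimal userspace sketch of that lifetime rule (the slot model and the
names are illustrative, not the kernel's actual per-CPU implementation):
capture the range while info is still live, then call put.

#include <stdio.h>

struct flush_tlb_info { unsigned long start, end; };

static struct flush_tlb_info slot;	/* stands in for the per-CPU slot */

static struct flush_tlb_info *get_flush_tlb_info(unsigned long start,
						 unsigned long end)
{
	slot.start = start;
	slot.end = end;
	return &slot;
}

static void put_flush_tlb_info(void)
{
	/* After put, the slot may be reused; poison it here so a
	 * use-after-put is visible in this sketch. */
	slot.start = slot.end = 0xdead;
}

int main(void)
{
	struct flush_tlb_info *info = get_flush_tlb_info(0x1000, 0x5000);

	/* Capture the (possibly widened) range while info is still live. */
	unsigned long start = info->start, end = info->end;

	put_flush_tlb_info();
	printf("notify range: %#lx-%#lx\n", start, end);	/* not info->start/end */
	return 0;
}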

> 
> Are you encountering these issues on an Intel system or an AMD system?

I would note that on AMD IOMMUs there is some masking functionality
that allows making large yet selective flushes.

This means both that we should not force the IOMMU flush range to be
large just because we decided to do so for the CPU (as you correctly
said), and that there might be an unrelated bug, for instance in the
mask computation.
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Jann Horn 3 months, 2 weeks ago
On Wed, Sep 3, 2025 at 4:18 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> > On 2 Sep 2025, at 19:05, Jann Horn <jannh@google.com> wrote:
> > On x86, IOMMU flushes happen from arch_tlbbatch_add_pending() and
> > flush_tlb_mm_range(); the IOMMU folks might know better, but as far as
> > I know, there is nothing that elides IOMMU flushes depending on the
> > state of X86-internal flush generation tracking or such.
> >
> > To me this looks like a change that is correct but makes it easier to
> > hit IOMMU flushing issues in other places.
>
> This change is not correct. Do not reference info after calling
> put_flush_tlb_info().

To be clear, I worded that very confusingly, I meant that Rik's commit
4a02ed8e1cc3 looks correct to me.
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Jann Horn 3 months, 2 weeks ago
On Tue, Sep 2, 2025 at 6:05 PM Jann Horn <jannh@google.com> wrote:
> To me this looks like a change that is correct but makes it easier to
> hit IOMMU flushing issues in other places.

(This was ambiguous; to be clear, I meant that 4a02ed8e1cc3 looks
correct to me, and that the suggested fix doesn't look to me like it
fixes anything.)
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Dave Hansen 3 months, 2 weeks ago
On 9/2/25 08:44, Giovanni Cabiddu wrote:
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 39f80111e6f1..e66c7662c254 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1459,7 +1459,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>  
>  	put_flush_tlb_info();
>  	put_cpu();
> -	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, info->start, info->end);
>  }

That does look like the right solution.

This is the downside of wrapping everything up in that 'info' struct;
it's not obvious that the canonical source of the start/end information
moved from those variables into the structure.

Rik, is that your read on it too?

In any case, Giovanni, do you want to send that as a "real" patch that
we can apply (changelog, SoB, Fixes, Cc:stable@, etc...)?
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Rik van Riel 3 months, 2 weeks ago
On Tue, 2025-09-02 at 08:50 -0700, Dave Hansen wrote:
> On 9/2/25 08:44, Giovanni Cabiddu wrote:
> > diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> > index 39f80111e6f1..e66c7662c254 100644
> > --- a/arch/x86/mm/tlb.c
> > +++ b/arch/x86/mm/tlb.c
> > @@ -1459,7 +1459,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
> >  
> >  	put_flush_tlb_info();
> >  	put_cpu();
> > -	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
> > +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, info->start, info->end);
> >  }
> 
> That does look like the right solution.
> 
> This is the downside of wrapping everything up in that 'info' struct;
> it's not obvious that the canonical source of the start/end
> information
> moved from those variables into the structure.
> 
> Rik, is that your read on it too?
> 
In flush_tlb_mm_range(), we only need to flush from
start to end. The same should be true for the mmu notifier.

The only reason info->start and info->end can be modified
is because a global TLB flush can be faster than a ranged
one.

I can't think of any reason why that should affect the
range the mmu notifier needs to flush.

-- 
All Rights Reversed.
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Nadav Amit 3 months, 2 weeks ago

> On 2 Sep 2025, at 18:50, Dave Hansen <dave.hansen@intel.com> wrote:
> 
> On 9/2/25 08:44, Giovanni Cabiddu wrote:
>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>> index 39f80111e6f1..e66c7662c254 100644
>> --- a/arch/x86/mm/tlb.c
>> +++ b/arch/x86/mm/tlb.c
>> @@ -1459,7 +1459,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>> 
>> 	put_flush_tlb_info();
>> 	put_cpu();
>> -	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
>> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, info->start, info->end);
>> }
> 
> That does look like the right solution.
> 
> This is the downside of wrapping everything up in that 'info' struct;
> it's not obvious that the canonical source of the start/end information
> moved from those variables into the structure.

Just a reminder (which I am sure you know): the main motivation behind
this info struct is to keep the payload required for the TLB shootdown
within a single cache line.

It would be nice (if possible) if callees like broadcast_tlb_flush()
would constify flush_tlb_info (I’m not encouraging lots of constification,
but maybe it’s appropriate here) to prevent the misuse of flush_tlb_info.
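
A sketch of what that could look like, with an illustrative signature
rather than the kernel's actual prototype: the consumer takes a const
pointer, so producer-style writes are rejected at compile time.

#include <stdio.h>

struct flush_tlb_info { unsigned long start, end; };

/* Consumer side: a read-only view of the producer-filled payload.
 * Any producer-style write through 'info' is now a compile-time
 * error, enforcing the producer/consumer line discussed below. */
static void broadcast_tlb_flush(const struct flush_tlb_info *info)
{
	/* info->end = 0;  <- error: assignment of member 'end' in
	 *		      read-only object */
	printf("flush %#lx-%#lx\n", info->start, info->end);
}

int main(void)
{
	struct flush_tlb_info info = { 0x1000, 0x5000 };

	broadcast_tlb_flush(&info);
	return 0;
}
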
Re: [BUG] x86/mm: regression after 4a02ed8e1cc3
Posted by Dave Hansen 3 months, 2 weeks ago
On 9/2/25 09:08, Nadav Amit wrote:
> It would be nice (if possible) that callees like broadcast_tlb_flush()
> would constify flush_tlb_info (I’m not encouraging lots of consification,
> but maybe it’s appropriate here) to prevent the misuse of flush_tlb_info.

Yeah, agreed. It would be nice if there was a clear and enforced line
between producers and consumers here.
[tip: x86/core] x86/mm: Consolidate full flush threshold decision
Posted by tip-bot2 for Rik van Riel 9 months ago
The following commit has been merged into the x86/core branch of tip:

Commit-ID:     4a02ed8e1cc33acd04d8f5b89751d3bbb6be35d8
Gitweb:        https://git.kernel.org/tip/4a02ed8e1cc33acd04d8f5b89751d3bbb6be35d8
Author:        Rik van Riel <riel@surriel.com>
AuthorDate:    Tue, 25 Feb 2025 22:00:36 -05:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 19 Mar 2025 11:08:07 +01:00

x86/mm: Consolidate full flush threshold decision

Reduce code duplication by consolidating the decision point for whether to do
individual invalidations or a full flush inside get_flush_tlb_info().

Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com
---
 arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ffc25b3..dbcb5c9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1000,6 +1000,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
 #endif
 
+	/*
+	 * If the number of flushes is so large that a full flush
+	 * would be faster, do a full flush.
+	 */
+	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
+	}
+
 	info->start		= start;
 	info->end		= end;
 	info->mm		= mm;
@@ -1026,17 +1035,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				bool freed_tables)
 {
 	struct flush_tlb_info *info;
+	int cpu = get_cpu();
 	u64 new_tlb_gen;
-	int cpu;
-
-	cpu = get_cpu();
-
-	/* Should we flush just the requested range? */
-	if ((end == TLB_FLUSH_ALL) ||
-	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
-		start = 0;
-		end = TLB_FLUSH_ALL;
-	}
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1089,22 +1089,19 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	/* Balance as user space task's flush, a bit conservative */
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
-	} else {
-		struct flush_tlb_info *info;
+	struct flush_tlb_info *info;
+
+	guard(preempt)();
 
-		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false,
-					  TLB_GENERATION_INVALID);
+	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+				  TLB_GENERATION_INVALID);
 
+	if (info->end == TLB_FLUSH_ALL)
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else
 		on_each_cpu(do_kernel_range_flush, info, 1);
 
-		put_flush_tlb_info();
-		preempt_enable();
-	}
+	put_flush_tlb_info();
 }
 
 /*
[tip: x86/mm] x86/mm: Consolidate full flush threshold decision
Posted by tip-bot2 for Rik van Riel 9 months, 2 weeks ago
The following commit has been merged into the x86/mm branch of tip:

Commit-ID:     fd521566cc0cd5e327683fea70cba93b1966ebac
Gitweb:        https://git.kernel.org/tip/fd521566cc0cd5e327683fea70cba93b1966ebac
Author:        Rik van Riel <riel@surriel.com>
AuthorDate:    Tue, 25 Feb 2025 22:00:36 -05:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Tue, 04 Mar 2025 11:39:38 +01:00

x86/mm: Consolidate full flush threshold decision

Reduce code duplication by consolidating the decision point for whether to do
individual invalidations or a full flush inside get_flush_tlb_info().

Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com
---
 arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ffc25b3..dbcb5c9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1000,6 +1000,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
 #endif
 
+	/*
+	 * If the number of flushes is so large that a full flush
+	 * would be faster, do a full flush.
+	 */
+	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
+	}
+
 	info->start		= start;
 	info->end		= end;
 	info->mm		= mm;
@@ -1026,17 +1035,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				bool freed_tables)
 {
 	struct flush_tlb_info *info;
+	int cpu = get_cpu();
 	u64 new_tlb_gen;
-	int cpu;
-
-	cpu = get_cpu();
-
-	/* Should we flush just the requested range? */
-	if ((end == TLB_FLUSH_ALL) ||
-	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
-		start = 0;
-		end = TLB_FLUSH_ALL;
-	}
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1089,22 +1089,19 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	/* Balance as user space task's flush, a bit conservative */
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
-	} else {
-		struct flush_tlb_info *info;
+	struct flush_tlb_info *info;
+
+	guard(preempt)();
 
-		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false,
-					  TLB_GENERATION_INVALID);
+	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+				  TLB_GENERATION_INVALID);
 
+	if (info->end == TLB_FLUSH_ALL)
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else
 		on_each_cpu(do_kernel_range_flush, info, 1);
 
-		put_flush_tlb_info();
-		preempt_enable();
-	}
+	put_flush_tlb_info();
 }
 
 /*