[PATCH v14 04/13] x86/mm: use INVLPGB for kernel TLB flushes

Rik van Riel posted 13 patches 9 months, 3 weeks ago
There is a newer version of this series
[PATCH v14 04/13] x86/mm: use INVLPGB for kernel TLB flushes
Posted by Rik van Riel 9 months, 3 weeks ago
Use broadcast TLB invalidation for kernel addresses when available.

Remove the need to send IPIs for kernel TLB flushes.

Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Tested-by: Brendan Jackman <jackmanb@google.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
---
 arch/x86/mm/tlb.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dbcb5c968ff9..f44a03bca41c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1077,6 +1077,18 @@ void flush_tlb_all(void)
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+	unsigned long addr, nr;
+
+	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+		nr = (info->end - addr) >> PAGE_SHIFT;
+		nr = clamp_val(nr, 1, invlpgb_count_max);
+		invlpgb_flush_addr_nosync(addr, nr);
+	}
+	__tlbsync();
+}
+
 static void do_kernel_range_flush(void *info)
 {
 	struct flush_tlb_info *f = info;
@@ -1087,6 +1099,22 @@ static void do_kernel_range_flush(void *info)
 		flush_tlb_one_kernel(addr);
 }
 
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_flush_all();
+	else
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_kernel_range_flush(info);
+	else
+		on_each_cpu(do_kernel_range_flush, info, 1);
+}
+
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
 	struct flush_tlb_info *info;
@@ -1097,9 +1125,9 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 				  TLB_GENERATION_INVALID);
 
 	if (info->end == TLB_FLUSH_ALL)
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
+		kernel_tlb_flush_all(info);
 	else
-		on_each_cpu(do_kernel_range_flush, info, 1);
+		kernel_tlb_flush_range(info);
 
 	put_flush_tlb_info();
 }
-- 
2.47.1
Re: [PATCH v14 04/13] x86/mm: use INVLPGB for kernel TLB flushes
Posted by Borislav Petkov 9 months, 3 weeks ago
On Tue, Feb 25, 2025 at 10:00:39PM -0500, Rik van Riel wrote:
> Use broadcast TLB invalidation for kernel addresses when available.
> 
> Remove the need to send IPIs for kernel TLB flushes.
> 
> Signed-off-by: Rik van Riel <riel@surriel.com>
> Tested-by: Manali Shukla <Manali.Shukla@amd.com>
> Tested-by: Brendan Jackman <jackmanb@google.com>
> Tested-by: Michael Kelley <mhklinux@outlook.com>
> ---
>  arch/x86/mm/tlb.c | 32 ++++++++++++++++++++++++++++++--
>  1 file changed, 30 insertions(+), 2 deletions(-)

Changes on top:

--- /tmp/current.patch	2025-02-28 22:39:33.236465716 +0100
+++ /tmp/0001-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch	2025-02-28 22:39:59.432310072 +0100
@@ -1,36 +1,43 @@
+From b97ae5e31069cd536b563df185de52d33a565077 Mon Sep 17 00:00:00 2001
 From: Rik van Riel <riel@surriel.com>
 Date: Tue, 25 Feb 2025 22:00:39 -0500
-Subject: x86/mm: Use INVLPGB for kernel TLB flushes
+Subject: [PATCH] x86/mm: Use INVLPGB for kernel TLB flushes
 
 Use broadcast TLB invalidation for kernel addresses when available.
-
 Remove the need to send IPIs for kernel TLB flushes.
 
+   [ bp: Integrate dhansen's comments additions. ]
+
 Signed-off-by: Rik van Riel <riel@surriel.com>
 Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
-Tested-by: Manali Shukla <Manali.Shukla@amd.com>
-Tested-by: Brendan Jackman <jackmanb@google.com>
-Tested-by: Michael Kelley <mhklinux@outlook.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
 Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
 ---
- arch/x86/mm/tlb.c | 32 ++++++++++++++++++++++++++++++--
- 1 file changed, 30 insertions(+), 2 deletions(-)
+ arch/x86/mm/tlb.c | 39 +++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 37 insertions(+), 2 deletions(-)
 
 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
-index dbcb5c968ff9..f44a03bca41c 100644
+index dbcb5c968ff9..5c44b94ad5af 100644
 --- a/arch/x86/mm/tlb.c
 +++ b/arch/x86/mm/tlb.c
-@@ -1077,6 +1077,18 @@ void flush_tlb_all(void)
+@@ -1077,6 +1077,25 @@ void flush_tlb_all(void)
  	on_each_cpu(do_flush_tlb_all, NULL, 1);
  }
  
++/* Flush an arbitrarily large range of memory with INVLPGB. */
 +static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
 +{
 +	unsigned long addr, nr;
 +
 +	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
 +		nr = (info->end - addr) >> PAGE_SHIFT;
++
++		/*
++		 * INVLPGB has a limit on the size of ranges it can
++		 * flush. Break up large flushes.
++		 */
 +		nr = clamp_val(nr, 1, invlpgb_count_max);
++
 +		invlpgb_flush_addr_nosync(addr, nr);
 +	}
 +	__tlbsync();
 

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v14 04/13] x86/mm: use INVLPGB for kernel TLB flushes
Posted by Dave Hansen 9 months, 3 weeks ago
On 2/25/25 19:00, Rik van Riel wrote:
> Use broadcast TLB invalidation for kernel addresses when available.
> 
> Remove the need to send IPIs for kernel TLB flushes.

Nit: the changelog doesn't address the refactoring.

*Ideally*, you'd create the helpers and move the code there in one patch
and then actually "use INVLPGB for kernel TLB flushes" in the next. It's
compact enough here that it's not a deal breaker.

> +static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
> +{
> +	unsigned long addr, nr;
> +
> +	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
> +		nr = (info->end - addr) >> PAGE_SHIFT;
> +		nr = clamp_val(nr, 1, invlpgb_count_max);
> +		invlpgb_flush_addr_nosync(addr, nr);
> +	}
> +	__tlbsync();
> +}

This needs a comment or two, explaining that the function can take large
sizes:

/*
 * Flush an arbitrarily large range of memory with INVLPGB
 */

But it is important to note that the _instruction_ cannot.  This would be
great in the loop just above the clamp:

		/*
		 * INVLPGB has a limit on the size of ranges
		 * it can flush. Break large flushes up.
		 */

>  static void do_kernel_range_flush(void *info)
>  {
>  	struct flush_tlb_info *f = info;
> @@ -1087,6 +1099,22 @@ static void do_kernel_range_flush(void *info)
>  		flush_tlb_one_kernel(addr);
>  }
>  
> +static void kernel_tlb_flush_all(struct flush_tlb_info *info)
> +{
> +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
> +		invlpgb_flush_all();
> +	else
> +		on_each_cpu(do_flush_tlb_all, NULL, 1);
> +}
> +
> +static void kernel_tlb_flush_range(struct flush_tlb_info *info)
> +{
> +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
> +		invlpgb_kernel_range_flush(info);
> +	else
> +		on_each_cpu(do_kernel_range_flush, info, 1);
> +}
> +
>  void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  {
>  	struct flush_tlb_info *info;
> @@ -1097,9 +1125,9 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  				  TLB_GENERATION_INVALID);
>  
>  	if (info->end == TLB_FLUSH_ALL)
> -		on_each_cpu(do_flush_tlb_all, NULL, 1);
> +		kernel_tlb_flush_all(info);
>  	else
> -		on_each_cpu(do_kernel_range_flush, info, 1);
> +		kernel_tlb_flush_range(info);
>  
>  	put_flush_tlb_info();
>  }

But the structure of this code is much better than previous versions.
With the comments fixed:

Acked-by: Dave Hansen <dave.hansen@intel.com>
[tip: x86/core] x86/mm: Use INVLPGB for kernel TLB flushes
Posted by tip-bot2 for Rik van Riel 9 months ago
The following commit has been merged into the x86/core branch of tip:

Commit-ID:     82378c6c2f435dba66145609de16bf44a9de6303
Gitweb:        https://git.kernel.org/tip/82378c6c2f435dba66145609de16bf44a9de6303
Author:        Rik van Riel <riel@surriel.com>
AuthorDate:    Tue, 25 Feb 2025 22:00:39 -05:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 19 Mar 2025 11:12:29 +01:00

x86/mm: Use INVLPGB for kernel TLB flushes

Use broadcast TLB invalidation for kernel addresses when available.
Remove the need to send IPIs for kernel TLB flushes.

   [ bp: Integrate dhansen's comments additions, merge the
     flush_tlb_all() change into this one too. ]

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
---
 arch/x86/mm/tlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dbcb5c9..8cd084b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1064,7 +1064,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
 
-
 static void do_flush_tlb_all(void *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1074,7 +1073,32 @@ static void do_flush_tlb_all(void *info)
 void flush_tlb_all(void)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
-	on_each_cpu(do_flush_tlb_all, NULL, 1);
+
+	/* First try (faster) hardware-assisted TLB invalidation. */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_flush_all();
+	else
+		/* Fall back to the IPI-based invalidation. */
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+/* Flush an arbitrarily large range of memory with INVLPGB. */
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+	unsigned long addr, nr;
+
+	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+		nr = (info->end - addr) >> PAGE_SHIFT;
+
+		/*
+		 * INVLPGB has a limit on the size of ranges it can
+		 * flush. Break up large flushes.
+		 */
+		nr = clamp_val(nr, 1, invlpgb_count_max);
+
+		invlpgb_flush_addr_nosync(addr, nr);
+	}
+	__tlbsync();
 }
 
 static void do_kernel_range_flush(void *info)
@@ -1087,6 +1111,22 @@ static void do_kernel_range_flush(void *info)
 		flush_tlb_one_kernel(addr);
 }
 
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_flush_all();
+	else
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_kernel_range_flush(info);
+	else
+		on_each_cpu(do_kernel_range_flush, info, 1);
+}
+
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
 	struct flush_tlb_info *info;
@@ -1097,9 +1137,9 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 				  TLB_GENERATION_INVALID);
 
 	if (info->end == TLB_FLUSH_ALL)
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
+		kernel_tlb_flush_all(info);
 	else
-		on_each_cpu(do_kernel_range_flush, info, 1);
+		kernel_tlb_flush_range(info);
 
 	put_flush_tlb_info();
 }
[tip: x86/mm] x86/mm: Use INVLPGB for kernel TLB flushes
Posted by tip-bot2 for Rik van Riel 9 months, 2 weeks ago
The following commit has been merged into the x86/mm branch of tip:

Commit-ID:     ccc19c694b0fe063a90dd27470e9f4ba22990ea1
Gitweb:        https://git.kernel.org/tip/ccc19c694b0fe063a90dd27470e9f4ba22990ea1
Author:        Rik van Riel <riel@surriel.com>
AuthorDate:    Tue, 25 Feb 2025 22:00:39 -05:00
Committer:     Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Wed, 05 Mar 2025 17:19:52 +01:00

x86/mm: Use INVLPGB for kernel TLB flushes

Use broadcast TLB invalidation for kernel addresses when available.
Remove the need to send IPIs for kernel TLB flushes.

   [ bp: Integrate dhansen's comments additions, merge the
     flush_tlb_all() change into this one too. ]

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
---
 arch/x86/mm/tlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dbcb5c9..8cd084b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1064,7 +1064,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
 
-
 static void do_flush_tlb_all(void *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1074,7 +1073,32 @@ static void do_flush_tlb_all(void *info)
 void flush_tlb_all(void)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
-	on_each_cpu(do_flush_tlb_all, NULL, 1);
+
+	/* First try (faster) hardware-assisted TLB invalidation. */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_flush_all();
+	else
+		/* Fall back to the IPI-based invalidation. */
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+/* Flush an arbitrarily large range of memory with INVLPGB. */
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+	unsigned long addr, nr;
+
+	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+		nr = (info->end - addr) >> PAGE_SHIFT;
+
+		/*
+		 * INVLPGB has a limit on the size of ranges it can
+		 * flush. Break up large flushes.
+		 */
+		nr = clamp_val(nr, 1, invlpgb_count_max);
+
+		invlpgb_flush_addr_nosync(addr, nr);
+	}
+	__tlbsync();
 }
 
 static void do_kernel_range_flush(void *info)
@@ -1087,6 +1111,22 @@ static void do_kernel_range_flush(void *info)
 		flush_tlb_one_kernel(addr);
 }
 
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_flush_all();
+	else
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_kernel_range_flush(info);
+	else
+		on_each_cpu(do_kernel_range_flush, info, 1);
+}
+
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
 	struct flush_tlb_info *info;
@@ -1097,9 +1137,9 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 				  TLB_GENERATION_INVALID);
 
 	if (info->end == TLB_FLUSH_ALL)
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
+		kernel_tlb_flush_all(info);
 	else
-		on_each_cpu(do_kernel_range_flush, info, 1);
+		kernel_tlb_flush_range(info);
 
 	put_flush_tlb_info();
 }