[PATCH v9 4/7] x86/mm: Simplify clear_page_*

Posted by Ankur Arora 1 week, 3 days ago
clear_page_rep() and clear_page_erms() are thin wrappers around "REP; STOS"
variants. Inline them to get rid of an unnecessary CALL/RET pair (which
isn't free when RETHUNK speculative-execution mitigations are in use).

Fix up and rename clear_page_orig() to __clear_pages_unrolled() to adapt to
the changed calling convention.

Also add a comment from Dave Hansen detailing various clearing mechanisms
used in clear_page().

Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
---

Notes:
    - s/memzero_page_aligned_unrolled/__clear_pages_unrolled/
    - fixup comment to specify x86 insns in the standard ALL CAPS style.

 arch/x86/include/asm/page_32.h |  6 ++++
 arch/x86/include/asm/page_64.h | 50 ++++++++++++++++++++++++----------
 arch/x86/lib/clear_page_64.S   | 39 ++++++--------------------
 3 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 0c623706cb7e..19fddb002cc9 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -17,6 +17,12 @@ extern unsigned long __phys_addr(unsigned long);
 
 #include <linux/string.h>
 
+/**
+ * clear_page() - clear a page using a kernel virtual address.
+ * @page: address of kernel page
+ *
+ * Does absolutely no exception handling.
+ */
 static inline void clear_page(void *page)
 {
 	memset(page, 0, PAGE_SIZE);
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 53f4089333f2..6157bf46590e 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,26 +40,46 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 
 #define __phys_reloc_hide(x)	(x)
 
-void clear_page_orig(void *page);
-void clear_page_rep(void *page);
-void clear_page_erms(void *page);
-KCFI_REFERENCE(clear_page_orig);
-KCFI_REFERENCE(clear_page_rep);
-KCFI_REFERENCE(clear_page_erms);
+void __clear_pages_unrolled(void *page);
+KCFI_REFERENCE(__clear_pages_unrolled);
 
-static inline void clear_page(void *page)
+/**
+ * clear_page() - clear a page using a kernel virtual address.
+ * @addr: address of kernel page
+ *
+ * Switch between three implementations of page clearing based on CPU
+ * capabilities:
+ *
+ *  - __clear_pages_unrolled(): the oldest, slowest and universally
+ *    supported method. Zeroes via 8-byte MOV instructions unrolled 8x
+ *    to write a 64-byte cacheline in each loop iteration.
+ *
+ *  - "REP; STOSQ": really old CPUs had crummy REP implementations.
+ *    Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
+ *    trusted. The instruction writes 8-byte per REP iteration but
+ *    CPUs can internally batch these together and do larger writes.
+ *
+ *  - "REP; STOSB": CPUs that enumerate 'ERMS' have an improved STOS
+ *    implementation that is less picky about alignment and where
+ *    STOSB (1-byte at a time) is actually faster than STOSQ (8-bytes
+ *    at a time.)
+ *
+ * Does absolutely no exception handling.
+ */
+static inline void clear_page(void *addr)
 {
+	u64 len = PAGE_SIZE;
 	/*
 	 * Clean up KMSAN metadata for the page being cleared. The assembly call
-	 * below clobbers @page, so we perform unpoisoning before it.
+	 * below clobbers @addr, so we perform unpoisoning before it.
 	 */
-	kmsan_unpoison_memory(page, PAGE_SIZE);
-	alternative_call_2(clear_page_orig,
-			   clear_page_rep, X86_FEATURE_REP_GOOD,
-			   clear_page_erms, X86_FEATURE_ERMS,
-			   "=D" (page),
-			   "D" (page),
-			   "cc", "memory", "rax", "rcx");
+	kmsan_unpoison_memory(addr, len);
+	asm volatile(ALTERNATIVE_2("call __clear_pages_unrolled",
+				   "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD,
+				   "rep stosb", X86_FEATURE_ERMS)
+			: "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
+			: "a" (0)
+			: "cc", "memory");
 }
 
 void copy_page(void *to, void *from);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index a508e4a8c66a..c245b7fc01cd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -6,30 +6,15 @@
 #include <asm/asm.h>
 
 /*
- * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
- * recommended to use this when possible and we do use them by default.
- * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
- * Otherwise, use original.
+ * Zero page aligned region.
+ * %rdi	- dest
+ * %rcx	- length
  */
-
-/*
- * Zero a page.
- * %rdi	- page
- */
-SYM_TYPED_FUNC_START(clear_page_rep)
-	movl $4096/8,%ecx
-	xorl %eax,%eax
-	rep stosq
-	RET
-SYM_FUNC_END(clear_page_rep)
-EXPORT_SYMBOL_GPL(clear_page_rep)
-
-SYM_TYPED_FUNC_START(clear_page_orig)
-	xorl   %eax,%eax
-	movl   $4096/64,%ecx
+SYM_TYPED_FUNC_START(__clear_pages_unrolled)
+	shrq   $6, %rcx
 	.p2align 4
 .Lloop:
-	decl	%ecx
+	decq	%rcx
 #define PUT(x) movq %rax,x*8(%rdi)
 	movq %rax,(%rdi)
 	PUT(1)
@@ -43,16 +28,8 @@ SYM_TYPED_FUNC_START(clear_page_orig)
 	jnz	.Lloop
 	nop
 	RET
-SYM_FUNC_END(clear_page_orig)
-EXPORT_SYMBOL_GPL(clear_page_orig)
-
-SYM_TYPED_FUNC_START(clear_page_erms)
-	movl $4096,%ecx
-	xorl %eax,%eax
-	rep stosb
-	RET
-SYM_FUNC_END(clear_page_erms)
-EXPORT_SYMBOL_GPL(clear_page_erms)
+SYM_FUNC_END(__clear_pages_unrolled)
+EXPORT_SYMBOL_GPL(__clear_pages_unrolled)
 
 /*
  * Default clear user-space.
-- 
2.31.1
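
For readers who want to compare the three clearing strategies outside the
kernel, the sketch below is a standalone userspace program, not kernel code;
the file and function names (clear_demo.c, clear_page_stosq(), etc.) are made
up for illustration. It assumes an x86-64 GCC/Clang toolchain and relies on
the ABI guarantee that the direction flag is clear on function entry.

/* clear_demo.c -- illustrative only; build: gcc -O2 -o clear_demo clear_demo.c */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096

/* "REP; STOSQ": RCX holds the byte count, shifted down to a qword count. */
static void clear_page_stosq(void *addr)
{
	uint64_t len = PAGE_SIZE;

	asm volatile("shrq $3, %%rcx; rep stosq"
		     : "+c" (len), "+D" (addr)
		     : "a" (0)
		     : "cc", "memory");
}

/* "REP; STOSB": RCX holds the byte count directly. */
static void clear_page_stosb(void *addr)
{
	uint64_t len = PAGE_SIZE;

	asm volatile("rep stosb"
		     : "+c" (len), "+D" (addr)
		     : "a" (0)
		     : "cc", "memory");
}

/* C analogue of __clear_pages_unrolled: 8 qword stores per 64-byte line. */
static void clear_page_unrolled(void *addr)
{
	uint64_t *p = addr;

	for (unsigned int i = 0; i < PAGE_SIZE / 64; i++, p += 8) {
		p[0] = 0; p[1] = 0; p[2] = 0; p[3] = 0;
		p[4] = 0; p[5] = 0; p[6] = 0; p[7] = 0;
	}
}

int main(void)
{
	static unsigned char page[PAGE_SIZE] __attribute__((aligned(4096)));
	void (*variants[])(void *) = {
		clear_page_unrolled, clear_page_stosq, clear_page_stosb
	};

	for (unsigned int v = 0; v < 3; v++) {
		memset(page, 0xa5, sizeof(page));
		variants[v](page);
		for (unsigned int i = 0; i < PAGE_SIZE; i++)
			assert(page[i] == 0);
	}
	return 0;
}

The kernel selects exactly one of these at boot via alternatives patching;
the demo simply runs all three and checks the result.
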
Re: [PATCH v9 4/7] x86/mm: Simplify clear_page_*
Posted by Mateusz Guzik 5 days, 15 hours ago
On Fri, Nov 21, 2025 at 9:24 PM Ankur Arora <ankur.a.arora@oracle.com> wrote:
> + * Switch between three implementations of page clearing based on CPU
> + * capabilities:
> + *
> + *  - __clear_pages_unrolled(): the oldest, slowest and universally
> + *    supported method. Zeroes via 8-byte MOV instructions unrolled 8x
> + *    to write a 64-byte cacheline in each loop iteration.
> + *
> + *  - "REP; STOSQ": really old CPUs had crummy REP implementations.
> + *    Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
> + *    trusted. The instruction writes 8-byte per REP iteration but
> + *    CPUs can internally batch these together and do larger writes.
> + *
> + *  - "REP; STOSB": CPUs that enumerate 'ERMS' have an improved STOS
> + *    implementation that is less picky about alignment and where
> + *    STOSB (1-byte at a time) is actually faster than STOSQ (8-bytes
> + *    at a time.)
> + *

I think this is somewhat odd commentary in this context.

Note about "crummy REP implementations" should be in description of
__clear_pages_unrolled as it justifies its existence (I think the
routine would be best whacked btw, but I'm not going to argue about it
in this thread).
Description of STOSQ notes the CPU can do more than 8 bytes at a time,
while description of STOSB claim does not make such a clarification.
At the same time the note about less picky about alignment makes no
significance in the context of page clearing as they are, well, page
aligned.

There is a fucky real-world problem with ERMS worth noting: there are
hypervisor setups out there which *hide* the bit by default (no
really, see Proxmox for example -- you get a bare bones pre-ERMS
cpuid)

With all this in mind, modulo poor grammar on my end, I would suggest
something like this:

<quote>
There are 3 variants implemented:
- REP; STOSB: used if the CPU supports "Enhanced REP MOVSB/STOSB" (aka
ERMS), which is true for the majority of microarchitectures today
- REP; STOSQ: fallback if the ERMS bit is not present
- __clear_pages_unrolled: code for CPUs which are determined to have
poor REP support; only concerns long-obsolete uarchs.

Warning: some hypervisors are configured to expose a very limited set
of capabilities in the guest, filtering out ERMS even if present. As
such the STOSQ variant is still in active use on some setups even when
the hardware does not need it.
</quote>
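
The hypervisor-filtering situation Mateusz mentions is easy to check from
inside a guest: ERMS is CPUID.(EAX=7,ECX=0):EBX bit 9, and on Linux it also
shows up as the "erms" flag in /proc/cpuinfo. A minimal userspace sketch
(illustrative; the file name is made up):

/* erms_check.c -- gcc -O2 -o erms_check erms_check.c */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Structured extended feature flags: leaf 7, subleaf 0. */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 7 not available");
		return 1;
	}

	/* ERMS ("Enhanced REP MOVSB/STOSB") is EBX bit 9. */
	printf("ERMS: %s\n", (ebx & (1u << 9)) ?
	       "present (REP STOSB path)" :
	       "absent (REP STOSQ or unrolled fallback)");
	return 0;
}

Note that X86_FEATURE_REP_GOOD has no CPUID bit to query; it is a synthetic
flag set by the kernel's vendor CPU setup code.
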
Re: [PATCH v9 4/7] x86/mm: Simplify clear_page_*
Posted by Ankur Arora 4 days, 19 hours ago
Mateusz Guzik <mjguzik@gmail.com> writes:

> On Fri, Nov 21, 2025 at 9:24 PM Ankur Arora <ankur.a.arora@oracle.com> wrote:
>> + * Switch between three implementations of page clearing based on CPU
>> + * capabilities:
>> + *
>> + *  - __clear_pages_unrolled(): the oldest, slowest and universally
>> + *    supported method. Zeroes via 8-byte MOV instructions unrolled 8x
>> + *    to write a 64-byte cacheline in each loop iteration.
>> + *
>> + *  - "REP; STOSQ": really old CPUs had crummy REP implementations.
>> + *    Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
>> + *    trusted. The instruction writes 8-byte per REP iteration but
>> + *    CPUs can internally batch these together and do larger writes.
>> + *
>> + *  - "REP; STOSB": CPUs that enumerate 'ERMS' have an improved STOS
>> + *    implementation that is less picky about alignment and where
>> + *    STOSB (1-byte at a time) is actually faster than STOSQ (8-bytes
>> + *    at a time.)
>> + *
>
> I think this is somewhat odd commentary in this context.
>
> Note about "crummy REP implementations" should be in description of
> __clear_pages_unrolled as it justifies its existence (I think the
> routine would be best whacked btw, but I'm not going to argue about it
> in this thread).
> Description of STOSQ notes the CPU can do more than 8 bytes at a time,
> while description of STOSB claim does not make such a clarification.
> At the same time the note about less picky about alignment makes no
> significance in the context of page clearing as they are, well, page
> aligned.

Good point. I'll rework the comment a little bit to align things better
(maybe reusing some of what you suggest below).

> There is a fucky real-world problem with ERMS worth noting: there are
> hypervisor setups out there which *hide* the bit by default (no
> really, see Proxmox for example -- you get a bare bones pre-ERMS
> cpuid)
>
> With all this in mind, modulo poor grammar on my end, I would suggest
> something like this:
>
> <quote>
> There are 3 variants implemented:
> - REP; STOSB: used if the CPU supports "Enhanced REP MOVSB/STOSB" (aka
> ERMS), which is true for the majority of microarchitectures today
> - REP; STOSQ: fallback if the ERMS bit is not present
> - __clear_pages_unrolled: code for CPUs which are determined to have
> poor REP support; only concerns long-obsolete uarchs.
>
> Warning: some hypervisors are configured to expose a very limited set
> of capabilities in the guest, filtering out ERMS even if present. As
> such the STOSQ variant is still in active use on some setups even when
> the hardware does not need it.
> </quote>

The last bit is useful context though maybe some of it fits better in
the commit message.

Thanks
ankur
Re: [PATCH v9 4/7] x86/mm: Simplify clear_page_*
Posted by Borislav Petkov 6 days, 11 hours ago
On Fri, Nov 21, 2025 at 12:23:49PM -0800, Ankur Arora wrote:
> +/**
> + * clear_page() - clear a page using a kernel virtual address.
> + * @addr: address of kernel page
> + *
> + * Switch between three implementations of page clearing based on CPU
> + * capabilities:
> + *
> + *  - __clear_pages_unrolled(): the oldest, slowest and universally
> + *    supported method. Zeroes via 8-byte MOV instructions unrolled 8x
> + *    to write a 64-byte cacheline in each loop iteration.
> + *
> + *  - "REP; STOSQ": really old CPUs had crummy REP implementations.
> + *    Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
> + *    trusted. The instruction writes 8-byte per REP iteration but
> + *    CPUs can internally batch these together and do larger writes.
> + *
> + *  - "REP; STOSB": CPUs that enumerate 'ERMS' have an improved STOS
> + *    implementation that is less picky about alignment and where
> + *    STOSB (1-byte at a time) is actually faster than STOSQ (8-bytes
> + *    at a time.)

Please put here in BIG RED LETTERS something along the lines of:

"The inline asm has a CALL instruction and usually that is a no-no due to the
compiler not knowing that there's a CALL inside the asm and thus won't track
callee-clobbered registers but in this case, all the callee clobbereds by
__clear_pages_unrolled() are part of the inline asm register specification so
that is fine.

Just don't assume that you can call *any* function from inside asm due to the
above."

> + *
> + * Does absolutely no exception handling.
> + */
> +static inline void clear_page(void *addr)
>  {
> +	u64 len = PAGE_SIZE;
>  	/*
>  	 * Clean up KMSAN metadata for the page being cleared. The assembly call
> -	 * below clobbers @page, so we perform unpoisoning before it.
> +	 * below clobbers @addr, so we perform unpoisoning before it.

s/we //

>  	 */
> -	kmsan_unpoison_memory(page, PAGE_SIZE);
> -	alternative_call_2(clear_page_orig,
> -			   clear_page_rep, X86_FEATURE_REP_GOOD,
> -			   clear_page_erms, X86_FEATURE_ERMS,
> -			   "=D" (page),
> -			   "D" (page),
> -			   "cc", "memory", "rax", "rcx");
> +	kmsan_unpoison_memory(addr, len);
> +	asm volatile(ALTERNATIVE_2("call __clear_pages_unrolled",
> +				   "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD,
> +				   "rep stosb", X86_FEATURE_ERMS)
> +			: "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> +			: "a" (0)
> +			: "cc", "memory");
>  }

With that:

Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
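
The hazard Boris describes can be seen in a standalone sketch (illustrative,
not kernel code; "bump" and the file name are made up, with bump standing in
for a helper like __clear_pages_unrolled): the CALL is invisible to the
compiler, so the asm statement itself must account for everything the callee
could clobber.

/* call_from_asm.c -- gcc -O2 -o call_from_asm call_from_asm.c (x86-64 only) */
#include <stdio.h>

/* A tiny assembly helper with a known clobber footprint: it increments
 * the qword at (%rdi) and clobbers only the flags. */
asm(".globl bump\n"
    "bump:\n"
    "	addq	$1, (%rdi)\n"
    "	ret\n");

static unsigned long counter;

static void bump_counter(void)
{
	/*
	 * The compiler never sees the CALL, so the constraints/clobbers
	 * must cover everything the callee might touch. bump() itself
	 * needs only %rdi, "cc" and "memory", but the clobber list below
	 * shows what calling an arbitrary SysV function would require --
	 * which is Boris's point. (In userspace a CALL from asm can also
	 * stomp the red zone; the kernel builds with -mno-red-zone.)
	 */
	asm volatile("call bump"
		     : /* no outputs */
		     : "D" (&counter)
		     : "rax", "rcx", "rdx", "rsi",
		       "r8", "r9", "r10", "r11", "cc", "memory");
}

int main(void)
{
	bump_counter();
	printf("counter = %lu\n", counter);	/* prints 1 */
	return 0;
}

In the clear_page() case every register __clear_pages_unrolled() touches
(%rax, %rcx, %rdi, flags) already appears in the constraints, which is why
the CALL inside ALTERNATIVE_2 is safe there.
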
Re: [PATCH v9 4/7] x86/mm: Simplify clear_page_*
Posted by Ankur Arora 6 days, 6 hours ago
Borislav Petkov <bp@alien8.de> writes:

> On Fri, Nov 21, 2025 at 12:23:49PM -0800, Ankur Arora wrote:
>> +/**
>> + * clear_page() - clear a page using a kernel virtual address.
>> + * @addr: address of kernel page
>> + *
>> + * Switch between three implementations of page clearing based on CPU
>> + * capabilities:
>> + *
>> + *  - __clear_pages_unrolled(): the oldest, slowest and universally
>> + *    supported method. Zeroes via 8-byte MOV instructions unrolled 8x
>> + *    to write a 64-byte cacheline in each loop iteration.
>> + *
>> + *  - "REP; STOSQ": really old CPUs had crummy REP implementations.
>> + *    Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
>> + *    trusted. The instruction writes 8-byte per REP iteration but
>> + *    CPUs can internally batch these together and do larger writes.
>> + *
>> + *  - "REP; STOSB": CPUs that enumerate 'ERMS' have an improved STOS
>> + *    implementation that is less picky about alignment and where
>> + *    STOSB (1-byte at a time) is actually faster than STOSQ (8-bytes
>> + *    at a time.)
>
> Please put here in BIG RED LETTERS something along the lines of:
>
> "The inline asm has a CALL instruction and usually that is a no-no due to the
> compiler not knowing that there's a CALL inside the asm and thus won't track
> callee-clobbered registers but in this case, all the callee clobbereds by
> __clear_pages_unrolled() are part of the inline asm register specification so
> that is fine.
>
> Just don't assume that you can call *any* function from inside asm due to the
> above."

Will add something clarifying this.

>> + *
>> + * Does absolutely no exception handling.
>> + */
>> +static inline void clear_page(void *addr)
>>  {
>> +	u64 len = PAGE_SIZE;
>>  	/*
>>  	 * Clean up KMSAN metadata for the page being cleared. The assembly call
>> -	 * below clobbers @page, so we perform unpoisoning before it.
>> +	 * below clobbers @addr, so we perform unpoisoning before it.
>
> s/we //
>
>>  	 */
>> -	kmsan_unpoison_memory(page, PAGE_SIZE);
>> -	alternative_call_2(clear_page_orig,
>> -			   clear_page_rep, X86_FEATURE_REP_GOOD,
>> -			   clear_page_erms, X86_FEATURE_ERMS,
>> -			   "=D" (page),
>> -			   "D" (page),
>> -			   "cc", "memory", "rax", "rcx");
>> +	kmsan_unpoison_memory(addr, len);
>> +	asm volatile(ALTERNATIVE_2("call __clear_pages_unrolled",
>> +				   "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD,
>> +				   "rep stosb", X86_FEATURE_ERMS)
>> +			: "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
>> +			: "a" (0)
>> +			: "cc", "memory");
>>  }
>
> With that:
>
> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>

Thanks!

--
ankur