[PATCH] crypto: powerpc/poly1305 - Restore crypto_simd_usable test

Herbert Xu posted 1 patch 7 months, 1 week ago
[PATCH] crypto: powerpc/poly1305 - Restore crypto_simd_usable test
Posted by Herbert Xu 7 months, 1 week ago
On Thu, May 08, 2025 at 05:27:13PM +0530, Venkat Rao Bagalkote wrote:
>
> Yes, its was on the same machine, next-20250506 passed.

OK I found one bug in my patches, I incorrectly removed the simd
tests for powerpc.  Does this patch help?

---8<---
Restore the crypto_simd_usable test as powerpc needs it.

Fixes: 14d31979145d ("crypto: powerpc/poly1305 - Add block-only interface")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
index 7cea0ebcc6bc..154eced0bf9e 100644
--- a/arch/powerpc/lib/crypto/poly1305-p10-glue.c
+++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
@@ -6,6 +6,7 @@
  */
 #include <asm/switch_to.h>
 #include <crypto/internal/poly1305.h>
+#include <crypto/internal/simd.h>
 #include <linux/cpufeature.h>
 #include <linux/jump_label.h>
 #include <linux/kernel.h>
@@ -51,7 +52,7 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
 	if (!static_key_enabled(&have_p10))
 		return poly1305_blocks_generic(state, src, len, padbit);
 	vsx_begin();
-	if (len >= POLY1305_BLOCK_SIZE * 4) {
+	if (crypto_simd_usable() && len >= POLY1305_BLOCK_SIZE * 4) {
 		poly1305_p10le_4blocks(state, src, len);
 		src += len - (len % (POLY1305_BLOCK_SIZE * 4));
 		len %= POLY1305_BLOCK_SIZE * 4;
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [PATCH] crypto: powerpc/poly1305 - Restore crypto_simd_usable test
Posted by Herbert Xu 7 months, 1 week ago
On Thu, May 08, 2025 at 08:23:17PM +0800, Herbert Xu wrote:
>
> @@ -51,7 +52,7 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
>  	if (!static_key_enabled(&have_p10))
>  		return poly1305_blocks_generic(state, src, len, padbit);
>  	vsx_begin();
> -	if (len >= POLY1305_BLOCK_SIZE * 4) {
> +	if (crypto_simd_usable() && len >= POLY1305_BLOCK_SIZE * 4) {

This patch is obviously broken.  However, I think this code was
always broken in the SIMD-fallback case.  AFAICS the fallback
uses vector instructions so it can't be used in softirqs either.

A proper fallback would have to convert the state to the format
used by the generic poly1305 implementation, call that, and then
convert it back.

Of course it would be a lot easier if ppc could make VSX usable
in softirq context.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [PATCH] crypto: powerpc/poly1305 - Restore crypto_simd_usable test
Posted by Venkat Rao Bagalkote 7 months, 1 week ago
On 08/05/25 5:53 pm, Herbert Xu wrote:
> On Thu, May 08, 2025 at 05:27:13PM +0530, Venkat Rao Bagalkote wrote:
>> Yes, its was on the same machine, next-20250506 passed.
> OK I found one bug in my patches, I incorrectly removed the simd
> tests for powerpc.  Does this patch help?
>
> ---8<---
> Restore the crypto_simd_usable test as powerpc needs it.
>
> Fixes: 14d31979145d ("crypto: powerpc/poly1305 - Add block-only interface")
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
> index 7cea0ebcc6bc..154eced0bf9e 100644
> --- a/arch/powerpc/lib/crypto/poly1305-p10-glue.c
> +++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
> @@ -6,6 +6,7 @@
>    */
>   #include <asm/switch_to.h>
>   #include <crypto/internal/poly1305.h>
> +#include <crypto/internal/simd.h>
>   #include <linux/cpufeature.h>
>   #include <linux/jump_label.h>
>   #include <linux/kernel.h>
> @@ -51,7 +52,7 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
>   	if (!static_key_enabled(&have_p10))
>   		return poly1305_blocks_generic(state, src, len, padbit);
>   	vsx_begin();
> -	if (len >= POLY1305_BLOCK_SIZE * 4) {
> +	if (crypto_simd_usable() && len >= POLY1305_BLOCK_SIZE * 4) {
>   		poly1305_p10le_4blocks(state, src, len);
>   		src += len - (len % (POLY1305_BLOCK_SIZE * 4));
>   		len %= POLY1305_BLOCK_SIZE * 4;


Unfortunately, above patch dosent fix the boot warning.


Regards,

Venkat.
[PATCH] crypto: powerpc/poly1305 - Fix input mixup in poly1305_emit_arch
Posted by Herbert Xu 7 months, 1 week ago
On Thu, May 08, 2025 at 08:35:48PM +0530, Venkat Rao Bagalkote wrote:
>
> Unfortunately, above patch dosent fix the boot warning.

This works for me:

---8<---
Swap the order of the arguments in poly1305_emit_arch to match
the prototype.

Fixes: 14d31979145d ("crypto: powerpc/poly1305 - Add block-only interface")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/arch/powerpc/lib/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
index 2ba2911b8038..5b368baf96d2 100644
--- a/arch/powerpc/lib/crypto/poly1305-p10le_64.S
+++ b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
@@ -1027,7 +1027,7 @@ Out_no_poly1305_64:
 SYM_FUNC_END(poly1305_64s)
 
 #
-# Input: r3 = h, r4 = s, r5 = mac
+# Input: r3 = h, r4 = mac, r5 = s
 # mac = h + s
 #
 SYM_FUNC_START(poly1305_emit_arch)
@@ -1051,14 +1051,14 @@ SYM_FUNC_START(poly1305_emit_arch)
 	mr	12, 8
 
 Skip_h64:
-	ld	6, 0(4)
-	ld	7, 8(4)
+	ld	6, 0(5)
+	ld	7, 8(5)
 	addc	10, 10, 6
 	adde	11, 11, 7
 	addze	12, 12
 
-	std	10, 0(5)
-	std	11, 8(5)
+	std	10, 0(4)
+	std	11, 8(4)
 	blr
 SYM_FUNC_END(poly1305_emit_arch)
 
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [PATCH] crypto: powerpc/poly1305 - Fix input mixup in poly1305_emit_arch
Posted by Eric Biggers 7 months, 1 week ago
On Fri, May 09, 2025 at 08:29:00PM +0800, Herbert Xu wrote:
> On Thu, May 08, 2025 at 08:35:48PM +0530, Venkat Rao Bagalkote wrote:
> >
> > Unfortunately, above patch dosent fix the boot warning.
> 
> This works for me:
> 
> ---8<---
> Swap the order of the arguments in poly1305_emit_arch to match
> the prototype.
> 
> Fixes: 14d31979145d ("crypto: powerpc/poly1305 - Add block-only interface")
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

This fixes "-cpu Power10", but older CPUs (e.g. "-cpu POWER9") are still
failing.

- Eric
[v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Herbert Xu 7 months, 1 week ago
On Fri, May 09, 2025 at 09:44:50PM -0700, Eric Biggers wrote:
>
> This fixes "-cpu Power10", but older CPUs (e.g. "-cpu POWER9") are still
> failing.

You're right.  I'll revert this and apply the following patch
instead.

BTW this thing is still hopelessly broken if it's called from
softirq context because there is no SIMD fallback.  Yes I removed
the SIMD check but it was already broken before that as it simply
switched from the 4-block version to the 1-block version if SIMD
is not available rather than actually doing something that is
safe in softirq context.

Perhaps we should just remove this altogether until it's fixed.

---8<---
Add poly1305_emit_arch with fallback instead of calling assembly
directly.  This is because the state format differs between p10
and that of the generic implementation.

Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Reported-by: Eric Biggers <ebiggers@google.com>
Fixes: 14d31979145d ("crypto: powerpc/poly1305 - Add block-only interface")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
index 7cea0ebcc6bc..3f1664a724b6 100644
--- a/arch/powerpc/lib/crypto/poly1305-p10-glue.c
+++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
@@ -14,10 +14,7 @@
 
 asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
 asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
-				   u8 digest[POLY1305_DIGEST_SIZE],
-				   const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
 
@@ -65,6 +62,16 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
 }
 EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
 
+void poly1305_emit_arch(const struct poly1305_state *state,
+			u8 digest[POLY1305_DIGEST_SIZE],
+			const u32 nonce[4])
+{
+	if (!static_key_enabled(&have_p10))
+		return poly1305_emit_generic(state, digest, nonce);
+	poly1305_emit_64(state, nonce, digest);
+}
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
 bool poly1305_is_arch_optimized(void)
 {
 	return static_key_enabled(&have_p10);
diff --git a/arch/powerpc/lib/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
index 2ba2911b8038..a3c1987f1ecd 100644
--- a/arch/powerpc/lib/crypto/poly1305-p10le_64.S
+++ b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
@@ -1030,7 +1030,7 @@ SYM_FUNC_END(poly1305_64s)
 # Input: r3 = h, r4 = s, r5 = mac
 # mac = h + s
 #
-SYM_FUNC_START(poly1305_emit_arch)
+SYM_FUNC_START(poly1305_emit_64)
 	ld	10, 0(3)
 	ld	11, 8(3)
 	ld	12, 16(3)
@@ -1060,7 +1060,7 @@ Skip_h64:
 	std	10, 0(5)
 	std	11, 8(5)
 	blr
-SYM_FUNC_END(poly1305_emit_arch)
+SYM_FUNC_END(poly1305_emit_64)
 
 SYM_DATA_START_LOCAL(RMASK)
 .align 5
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Venkat Rao Bagalkote 7 months, 1 week ago
On 10/05/25 10:40 am, Herbert Xu wrote:
> On Fri, May 09, 2025 at 09:44:50PM -0700, Eric Biggers wrote:
>> This fixes "-cpu Power10", but older CPUs (e.g. "-cpu POWER9") are still
>> failing.
> You're right.  I'll revert this and apply the following patch
> instead.
>
> BTW this thing is still hopelessly broken if it's called from
> softirq context because there is no SIMD fallback.  Yes I removed
> the SIMD check but it was already broken before that as it simply
> switched from the 4-block version to the 1-block version if SIMD
> is not available rather than actually doing something that is
> safe in softirq context.
>
> Perhaps we should just remove this altogether until it's fixed.
>
> ---8<---
> Add poly1305_emit_arch with fallback instead of calling assembly
> directly.  This is because the state format differs between p10
> and that of the generic implementation.
>
> Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
> Reported-by: Eric Biggers <ebiggers@google.com>
> Fixes: 14d31979145d ("crypto: powerpc/poly1305 - Add block-only interface")
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
> index 7cea0ebcc6bc..3f1664a724b6 100644
> --- a/arch/powerpc/lib/crypto/poly1305-p10-glue.c
> +++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
> @@ -14,10 +14,7 @@
>   
>   asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
>   asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
> -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
> -				   u8 digest[POLY1305_DIGEST_SIZE],
> -				   const u32 nonce[4]);
> -EXPORT_SYMBOL_GPL(poly1305_emit_arch);
> +asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
>   
>   static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
>   
> @@ -65,6 +62,16 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
>   }
>   EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
>   
> +void poly1305_emit_arch(const struct poly1305_state *state,
> +			u8 digest[POLY1305_DIGEST_SIZE],
> +			const u32 nonce[4])
> +{
> +	if (!static_key_enabled(&have_p10))
> +		return poly1305_emit_generic(state, digest, nonce);
> +	poly1305_emit_64(state, nonce, digest);
> +}
> +EXPORT_SYMBOL_GPL(poly1305_emit_arch);
> +
>   bool poly1305_is_arch_optimized(void)
>   {
>   	return static_key_enabled(&have_p10);
> diff --git a/arch/powerpc/lib/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
> index 2ba2911b8038..a3c1987f1ecd 100644
> --- a/arch/powerpc/lib/crypto/poly1305-p10le_64.S
> +++ b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
> @@ -1030,7 +1030,7 @@ SYM_FUNC_END(poly1305_64s)
>   # Input: r3 = h, r4 = s, r5 = mac
>   # mac = h + s
>   #
> -SYM_FUNC_START(poly1305_emit_arch)
> +SYM_FUNC_START(poly1305_emit_64)
>   	ld	10, 0(3)
>   	ld	11, 8(3)
>   	ld	12, 16(3)
> @@ -1060,7 +1060,7 @@ Skip_h64:
>   	std	10, 0(5)
>   	std	11, 8(5)
>   	blr
> -SYM_FUNC_END(poly1305_emit_arch)
> +SYM_FUNC_END(poly1305_emit_64)
>   
>   SYM_DATA_START_LOCAL(RMASK)
>   .align 5


Tested this patch, by applying on top of next-20250508 on IBM Power9 
system and it fixes the reported boot warnings. Hence,


Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>



Regards,

Venkat.
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Eric Biggers 7 months, 1 week ago
On Sat, May 10, 2025 at 01:10:22PM +0800, Herbert Xu wrote:
> On Fri, May 09, 2025 at 09:44:50PM -0700, Eric Biggers wrote:
> >
> > This fixes "-cpu Power10", but older CPUs (e.g. "-cpu POWER9") are still
> > failing.
> 
> You're right.  I'll revert this and apply the following patch
> instead.
> 
> BTW this thing is still hopelessly broken if it's called from
> softirq context because there is no SIMD fallback.  Yes I removed
> the SIMD check but it was already broken before that as it simply
> switched from the 4-block version to the 1-block version if SIMD
> is not available rather than actually doing something that is
> safe in softirq context.
> 
> Perhaps we should just remove this altogether until it's fixed.

Yes, the PowerPC Poly1305 code incorrectly uses VSX without first checking
crypto_simd_usable().  And PowerPC also doesn't support VSX in softirqs, or at
least it doesn't claim to (it doesn't override may_use_simd(), so it gets the
default from include/asm-generic/simd.h which returns false in softirq context).
Maybe add 'depends on BROKEN' to CRYPTO_POLY1305_P10 for now, and give the
PowerPC folks (Cc'ed) a chance to fix this before removing the code.

- Eric
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Segher Boessenkool 7 months, 1 week ago
Hi!

On Fri, May 09, 2025 at 10:33:08PM -0700, Eric Biggers wrote:
> On Sat, May 10, 2025 at 01:10:22PM +0800, Herbert Xu wrote:
> > On Fri, May 09, 2025 at 09:44:50PM -0700, Eric Biggers wrote:
> > >
> > > This fixes "-cpu Power10", but older CPUs (e.g. "-cpu POWER9") are still
> > > failing.
> > 
> > You're right.  I'll revert this and apply the following patch
> > instead.
> > 
> > BTW this thing is still hopelessly broken if it's called from
> > softirq context because there is no SIMD fallback.  Yes I removed
> > the SIMD check but it was already broken before that as it simply
> > switched from the 4-block version to the 1-block version if SIMD
> > is not available rather than actually doing something that is
> > safe in softirq context.
> > 
> > Perhaps we should just remove this altogether until it's fixed.
> 
> Yes, the PowerPC Poly1305 code incorrectly uses VSX without first checking
> crypto_simd_usable().  And PowerPC also doesn't support VSX in softirqs, or at
> least it doesn't claim to (it doesn't override may_use_simd(), so it gets the
> default from include/asm-generic/simd.h which returns false in softirq context).
> Maybe add 'depends on BROKEN' to CRYPTO_POLY1305_P10 for now, and give the
> PowerPC folks (Cc'ed) a chance to fix this before removing the code.

What doe "may_use_simd" even *mean*?  At its declaration site it says
"whether it is allowable at this time to issue SIMD instructions or
access the SIMD register file", but that is 100% meaningless, you can do
SIMD in GPRs.

On PowerPC we have two separate register files dedicated to SIMD-like
stuff, the VMX and the VSX register files.  Which of those is this
function supposed to care about?

It looks like the whole "may_use_simd" thing is a misguided abstraction
unfortunately :-(


Segher
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Herbert Xu 7 months, 1 week ago
On Sat, May 10, 2025 at 05:34:01PM -0500, Segher Boessenkool wrote:
>
> What doe "may_use_simd" even *mean*?  At its declaration site it says
> "whether it is allowable at this time to issue SIMD instructions or
> access the SIMD register file", but that is 100% meaningless, you can do
> SIMD in GPRs.
> 
> On PowerPC we have two separate register files dedicated to SIMD-like
> stuff, the VMX and the VSX register files.  Which of those is this
> function supposed to care about?
> 
> It looks like the whole "may_use_simd" thing is a misguided abstraction
> unfortunately :-(

While we may debate the name of this function, the question is
simply whether you need to save state or not when you get an
interrupt.

If you don't need to save state, then may_use_simd doesn't apply
to you.  If you need to manually save state when you get an IRQ,
then you must obey the rules.

So even if VMX and VSX registers are separate, you must assume
that in an IRQ either could be in use already and therefore you
must not use any of them without saving the state.

The ideal solution is to save the state (if necessary) in softirqs,
or simply disable softirqs when these instructions are in use.
Then the fallback path can be removed, for softirqs at least.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Eric Biggers 7 months, 1 week ago
On Sat, May 10, 2025 at 05:34:01PM -0500, Segher Boessenkool wrote:
> Hi!
> 
> On Fri, May 09, 2025 at 10:33:08PM -0700, Eric Biggers wrote:
> > On Sat, May 10, 2025 at 01:10:22PM +0800, Herbert Xu wrote:
> > > On Fri, May 09, 2025 at 09:44:50PM -0700, Eric Biggers wrote:
> > > >
> > > > This fixes "-cpu Power10", but older CPUs (e.g. "-cpu POWER9") are still
> > > > failing.
> > > 
> > > You're right.  I'll revert this and apply the following patch
> > > instead.
> > > 
> > > BTW this thing is still hopelessly broken if it's called from
> > > softirq context because there is no SIMD fallback.  Yes I removed
> > > the SIMD check but it was already broken before that as it simply
> > > switched from the 4-block version to the 1-block version if SIMD
> > > is not available rather than actually doing something that is
> > > safe in softirq context.
> > > 
> > > Perhaps we should just remove this altogether until it's fixed.
> > 
> > Yes, the PowerPC Poly1305 code incorrectly uses VSX without first checking
> > crypto_simd_usable().  And PowerPC also doesn't support VSX in softirqs, or at
> > least it doesn't claim to (it doesn't override may_use_simd(), so it gets the
> > default from include/asm-generic/simd.h which returns false in softirq context).
> > Maybe add 'depends on BROKEN' to CRYPTO_POLY1305_P10 for now, and give the
> > PowerPC folks (Cc'ed) a chance to fix this before removing the code.
> 
> What doe "may_use_simd" even *mean*?  At its declaration site it says
> "whether it is allowable at this time to issue SIMD instructions or
> access the SIMD register file", but that is 100% meaningless, you can do
> SIMD in GPRs.
> 
> On PowerPC we have two separate register files dedicated to SIMD-like
> stuff, the VMX and the VSX register files.  Which of those is this
> function supposed to care about?
> 
> It looks like the whole "may_use_simd" thing is a misguided abstraction
> unfortunately :-(

may_use_simd() a.k.a crypto_simd_usable() is supposed to check whether vector /
SIMD registers can be used in the current context, provided that the appropriate
architecture-specific functions like kernel_fpu_begin() and kernel_fpu_end() are
used.  In the case of architectures that support the use of multiple sets of
vector / SIMD registers in kernel mode, it would have to check for the
intersection of the calling context requirements for all of them, since it
doesn't specify a particular set.

The reason that may_use_simd() a.k.a. crypto_simd_usable() got pulled out into
an abstraction shared across all architectures is that it's used by
non-architecture-specific code, such as crypto/simd.c, and also the crypto
self-tests which inject 'false' return values to test the no-SIMD code paths.

I think the users other than the self-tests are on the way out, though.  Most of
the users of crypto/simd.c just got removed, with CRYPTO_AES_GCM_P10 being the
last one.  A new non-architecture-specific user of crypto_simd_usable() just got
added in include/crypto/internal/sha2.h for some reason (despite me nacking the
patch), but that should be reverted.

So if it's really the case that VMX and VSX are both supported for kernel-mode
use but have different requirements on the calling context, you could make the
PowerPC crypto code use more precise checks like may_use_vsx().  Just the crypto
self-tests won't be able to test the no-SIMD code paths that way, unfortunately.

- Eric
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Herbert Xu 7 months, 1 week ago
On Fri, May 09, 2025 at 10:33:08PM -0700, Eric Biggers wrote:
>
> Yes, the PowerPC Poly1305 code incorrectly uses VSX without first checking
> crypto_simd_usable().  And PowerPC also doesn't support VSX in softirqs, or at
> least it doesn't claim to (it doesn't override may_use_simd(), so it gets the
> default from include/asm-generic/simd.h which returns false in softirq context).
> Maybe add 'depends on BROKEN' to CRYPTO_POLY1305_P10 for now, and give the
> PowerPC folks (Cc'ed) a chance to fix this before removing the code.

I just noticed something weird with this code, running a speed
test using "modprobe tcrypt mode=217" shows that the p10 version
of poly1305 is way slower than the generic:

qemu P9 CPU:

May 10 13:36:46 test-p10 kernel: [   59.585264][  T374] tcrypt: testing speed of multibuffer rfc7539esp(chacha20,poly1305) (rfc7539esp(chacha20-generic,poly1305-generic)) encryption
May 10 13:36:46 test-p10 kernel: [   59.586011][  T374] tcrypt: test 0 (288 bit key, 16 byte blocks): 1 operation in 1374 cycles (16 bytes)
May 10 13:36:46 test-p10 kernel: [   59.587446][  T374] tcrypt: test 1 (288 bit key, 64 byte blocks): 1 operation in 1359 cycles (64 bytes)
May 10 13:36:46 test-p10 kernel: [   59.588025][  T374] tcrypt: test 2 (288 bit key, 256 byte blocks): 1 operation in 1778 cycles (256 bytes)
May 10 13:36:46 test-p10 kernel: [   59.588639][  T374] tcrypt: test 3 (288 bit key, 512 byte blocks): 1 operation in 2323 cycles (512 bytes)
May 10 13:36:46 test-p10 kernel: [   59.589342][  T374] tcrypt: test 4 (288 bit key, 1024 byte blocks): 1 operation in 31624 cycles (1024 bytes)
May 10 13:36:46 test-p10 kernel: [   59.594178][  T374] tcrypt: test 5 (288 bit key, 1420 byte blocks): 1 operation in 4408 cycles (1420 bytes)
May 10 13:36:46 test-p10 kernel: [   59.595317][  T374] tcrypt: test 6 (288 bit key, 4096 byte blocks): 1 operation in 9719 cycles (4096 bytes)
May 10 13:36:46 test-p10 kernel: [   59.597512][  T374] tcrypt: test 7 (288 bit key, 8192 byte blocks): 1 operation in 20168 cycles (8192 bytes)
May 10 13:36:46 test-p10 kernel: [   59.604616][  T374] tcrypt: testing speed of multibuffer rfc7539esp(chacha20,poly1305) (rfc7539esp(chacha20-generic,poly1305-generic)) decryption
May 10 13:36:46 test-p10 kernel: [   59.604916][  T374] tcrypt: test 0 (288 bit key, 16 byte blocks): 1 operation in 1356 cycles (16 bytes)
May 10 13:36:46 test-p10 kernel: [   59.605564][  T374] tcrypt: test 1 (288 bit key, 64 byte blocks): 1 operation in 1393 cycles (64 bytes)
May 10 13:36:46 test-p10 kernel: [   59.608308][  T374] tcrypt: test 2 (288 bit key, 256 byte blocks): 1 operation in 1845 cycles (256 bytes)
May 10 13:36:46 test-p10 kernel: [   59.609002][  T374] tcrypt: test 3 (288 bit key, 512 byte blocks): 1 operation in 2392 cycles (512 bytes)
May 10 13:36:46 test-p10 kernel: [   59.612109][  T374] tcrypt: test 4 (288 bit key, 1024 byte blocks): 1 operation in 3349 cycles (1024 bytes)
May 10 13:36:46 test-p10 kernel: [   59.613289][  T374] tcrypt: test 5 (288 bit key, 1420 byte blocks): 1 operation in 4418 cycles (1420 bytes)
May 10 13:36:46 test-p10 kernel: [   59.616233][  T374] tcrypt: test 6 (288 bit key, 4096 byte blocks): 1 operation in 21600 cycles (4096 bytes)
May 10 13:36:46 test-p10 kernel: [   59.620221][  T374] tcrypt: test 7 (288 bit key, 8192 byte blocks): 1 operation in 20013 cycles (8192 bytes)

qemu P10 CPU:

May 10 13:40:56 test-p10 kernel: [   91.672877][  T392] tcrypt: testing speed of multibuffer rfc7539esp(chacha20,poly1305) (rfc7539esp(chacha20-powerpc,poly1305-generic)) encryption
May 10 13:40:56 test-p10 kernel: [   91.674615][  T392] tcrypt: test 0 (288 bit key, 16 byte blocks): 1 operation in 1471 cycles (16 bytes)
May 10 13:40:56 test-p10 kernel: [   91.680240][  T392] tcrypt: test 1 (288 bit key, 64 byte blocks): 1 operation in 1733 cycles (64 bytes)
May 10 13:40:56 test-p10 kernel: [   91.682975][  T392] tcrypt: test 2 (288 bit key, 256 byte blocks): 1 operation in 3248 cycles (256 bytes)
May 10 13:40:56 test-p10 kernel: [   91.684445][  T392] tcrypt: test 3 (288 bit key, 512 byte blocks): 1 operation in 15211 cycles (512 bytes)
May 10 13:40:56 test-p10 kernel: [   91.687603][  T392] tcrypt: test 4 (288 bit key, 1024 byte blocks): 1 operation in 20500 cycles (1024 bytes)
May 10 13:40:56 test-p10 kernel: [   91.690926][  T392] tcrypt: test 5 (288 bit key, 1420 byte blocks): 1 operation in 10159 cycles (1420 bytes)
May 10 13:40:56 test-p10 kernel: [   91.695009][  T392] tcrypt: test 6 (288 bit key, 4096 byte blocks): 1 operation in 25917 cycles (4096 bytes)
May 10 13:40:56 test-p10 kernel: [   91.701320][  T392] tcrypt: test 7 (288 bit key, 8192 byte blocks): 1 operation in 63352 cycles (8192 bytes)
May 10 13:40:56 test-p10 kernel: [   91.713863][  T392] tcrypt: testing speed of multibuffer rfc7539esp(chacha20,poly1305) (rfc7539esp(chacha20-powerpc,poly1305-generic)) decryption
May 10 13:40:56 test-p10 kernel: [   91.714182][  T392] tcrypt: test 0 (288 bit key, 16 byte blocks): 1 operation in 1502 cycles (16 bytes)
May 10 13:40:56 test-p10 kernel: [   91.714871][  T392] tcrypt: test 1 (288 bit key, 64 byte blocks): 1 operation in 1778 cycles (64 bytes)
May 10 13:40:56 test-p10 kernel: [   91.715508][  T392] tcrypt: test 2 (288 bit key, 256 byte blocks): 1 operation in 3322 cycles (256 bytes)
May 10 13:40:56 test-p10 kernel: [   91.716463][  T392] tcrypt: test 3 (288 bit key, 512 byte blocks): 1 operation in 20980 cycles (512 bytes)
May 10 13:40:56 test-p10 kernel: [   91.720775][  T392] tcrypt: test 4 (288 bit key, 1024 byte blocks): 1 operation in 8000 cycles (1024 bytes)
May 10 13:40:56 test-p10 kernel: [   91.724348][  T392] tcrypt: test 5 (288 bit key, 1420 byte blocks): 1 operation in 10155 cycles (1420 bytes)
May 10 13:40:56 test-p10 kernel: [   91.727952][  T392] tcrypt: test 6 (288 bit key, 4096 byte blocks): 1 operation in 27711 cycles (4096 bytes)
May 10 13:40:56 test-p10 kernel: [   91.735306][  T392] tcrypt: test 7 (288 bit key, 8192 byte blocks): 1 operation in 52874 cycles (8192 bytes)

Did I do something wrong?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Herbert Xu 7 months, 1 week ago
On Sat, May 10, 2025 at 01:49:13PM +0800, Herbert Xu wrote:
>
> Did I do something wrong?

OK perhaps it's just that the qemu emulation being slow.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [v2 PATCH] crypto: powerpc/poly1305 - Add poly1305_emit_arch wrapper
Posted by Eric Biggers 7 months, 1 week ago
On Sat, May 10, 2025 at 01:50:02PM +0800, Herbert Xu wrote:
> On Sat, May 10, 2025 at 01:49:13PM +0800, Herbert Xu wrote:
> >
> > Did I do something wrong?
> 
> OK perhaps it's just that the qemu emulation being slow.

Yes, non-native QEMU usually isn't any good for benchmarking the
architecture-optimized code, due to the instructions it uses having to be
emulated.  Just to give another random example, in (non-native) QEMU the RISC-V
CRC code is much slower than the generic CRC code.  But when run on an actual
RISC-V processor it's much faster.

- Eric
[PATCH] crypto: powerpc/poly1305 - Add SIMD fallback
Posted by Herbert Xu 7 months, 1 week ago
On Fri, May 09, 2025 at 10:33:08PM -0700, Eric Biggers wrote:
>
> Yes, the PowerPC Poly1305 code incorrectly uses VSX without first checking
> crypto_simd_usable().  And PowerPC also doesn't support VSX in softirqs, or at
> least it doesn't claim to (it doesn't override may_use_simd(), so it gets the
> default from include/asm-generic/simd.h which returns false in softirq context).
> Maybe add 'depends on BROKEN' to CRYPTO_POLY1305_P10 for now, and give the
> PowerPC folks (Cc'ed) a chance to fix this before removing the code.

OK this patch works for me:

---8<---
Add a SIMD fallback path for poly1305-p10 by converting the 2^64
based hash state into a 2^44 base.  In order to ensure that the
generic fallback is actually 2^44, add ARCH_SUPPORTS_INT128 to
powerpc and make poly1305-p10 depend on it.

Fixes: ba8f8624fde2 ("crypto: poly1305-p10 - Glue code for optmized Poly1305 implementation for ppc64le")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6722625a406a..651e0c32957a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -173,6 +173,7 @@ config PPC
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC_BOOK3S || PPC_8xx
+	select ARCH_SUPPORTS_INT128		if PPC64 && CC_HAS_INT128
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_USE_MEMTEST
diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig
index ffa541ad6d5d..6761fdb6193c 100644
--- a/arch/powerpc/lib/crypto/Kconfig
+++ b/arch/powerpc/lib/crypto/Kconfig
@@ -9,7 +9,7 @@ config CRYPTO_CHACHA20_P10
 
 config CRYPTO_POLY1305_P10
 	tristate
-	depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+	depends on PPC64 && CPU_LITTLE_ENDIAN && VSX && ARCH_SUPPORTS_INT128
 	default CRYPTO_LIB_POLY1305
 	select CRYPTO_ARCH_HAVE_LIB_POLY1305
 	select CRYPTO_LIB_POLY1305_GENERIC
diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
index 3f1664a724b6..280c10c48c53 100644
--- a/arch/powerpc/lib/crypto/poly1305-p10-glue.c
+++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
@@ -6,6 +6,7 @@
  */
 #include <asm/switch_to.h>
 #include <crypto/internal/poly1305.h>
+#include <crypto/internal/simd.h>
 #include <linux/cpufeature.h>
 #include <linux/jump_label.h>
 #include <linux/kernel.h>
@@ -18,6 +19,11 @@ asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 n
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
 
+static inline bool is_state_base64(struct poly1305_block_state *state)
+{
+	return state->core_r.precomputed_s.r64[2];
+}
+
 static void vsx_begin(void)
 {
 	preempt_disable();
@@ -30,12 +36,35 @@ static void vsx_end(void)
 	preempt_enable();
 }
 
+static void convert_to_base2_44(struct poly1305_block_state *state)
+{
+	u8 raw_key[POLY1305_BLOCK_SIZE];
+	u64 h0, h1, h2;
+
+	if (!is_state_base64(state))
+		return;
+
+	state->core_r.precomputed_s.r64[2] = 0;
+	put_unaligned_le64(state->core_r.key.r64[0], raw_key + 0);
+	put_unaligned_le64(state->core_r.key.r64[1], raw_key + 8);
+	poly1305_core_setkey(&state->core_r, raw_key);
+
+	h0 = state->h.h64[0];
+	h1 = state->h.h64[1];
+	h2 = state->h.h64[2];
+	state->h.h64[0] = h0 & 0xfffffffffffULL;
+	state->h.h64[1] = h0 >> 44 | (h1 & 0xffffffULL) << 20;
+	state->h.h64[2] = h1 >> 24 | h2 << 40;
+}
+
 void poly1305_block_init_arch(struct poly1305_block_state *dctx,
 			      const u8 raw_key[POLY1305_BLOCK_SIZE])
 {
-	if (!static_key_enabled(&have_p10))
+	dctx->core_r.precomputed_s.r64[2] = 0;
+	if (!static_key_enabled(&have_p10) || !crypto_simd_usable())
 		return poly1305_block_init_generic(dctx, raw_key);
 
+	dctx->core_r.precomputed_s.r64[2] = 1;
 	dctx->h = (struct poly1305_state){};
 	dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
 	dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
@@ -45,8 +74,11 @@ EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
 void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
 			  unsigned int len, u32 padbit)
 {
-	if (!static_key_enabled(&have_p10))
+	if (!static_key_enabled(&have_p10) || !is_state_base64(state) ||
+	    !crypto_simd_usable()) {
+		convert_to_base2_44(state);
 		return poly1305_blocks_generic(state, src, len, padbit);
+	}
 	vsx_begin();
 	if (len >= POLY1305_BLOCK_SIZE * 4) {
 		poly1305_p10le_4blocks(state, src, len);
@@ -66,7 +98,10 @@ void poly1305_emit_arch(const struct poly1305_state *state,
 			u8 digest[POLY1305_DIGEST_SIZE],
 			const u32 nonce[4])
 {
-	if (!static_key_enabled(&have_p10))
+	struct poly1305_block_state *dctx =
+		container_of(state, struct poly1305_block_state, h);
+
+	if (!static_key_enabled(&have_p10) || !is_state_base64(dctx))
 		return poly1305_emit_generic(state, digest, nonce);
 	poly1305_emit_64(state, nonce, digest);
 }
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt