[PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code

Eric Biggers posted 7 patches 2 months ago
[PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Eric Biggers 2 months ago
MD5 is insecure, is no longer commonly used, and has never been
optimized for the most common architectures in the kernel.  Only mips,
powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
only the powerpc one is actually testable in QEMU.  The mips one works
only on Cavium Octeon SoCs.

Taken together, it's clear that it's time to retire these additional MD5
implementations, and focus maintenance on the MD5 generic C code.

This commit removes the PowerPC optimized MD5 code.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/powerpc/configs/powernv_defconfig |   1 -
 arch/powerpc/configs/ppc64_defconfig   |   1 -
 arch/powerpc/crypto/Kconfig            |   8 -
 arch/powerpc/crypto/Makefile           |   2 -
 arch/powerpc/crypto/md5-asm.S          | 235 -------------------------
 arch/powerpc/crypto/md5-glue.c         |  99 -----------
 6 files changed, 346 deletions(-)
 delete mode 100644 arch/powerpc/crypto/md5-asm.S
 delete mode 100644 arch/powerpc/crypto/md5-glue.c

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index d06388b0f66e3..bd4685612de6d 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -318,11 +318,10 @@ CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
 CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index ce34597e9f3e1..2d92c11eea7e4 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -385,11 +385,10 @@ CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_LZO=m
-CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_AES_GCM_P10=m
 CONFIG_CRYPTO_DEV_NX=y
 CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
 CONFIG_CRYPTO_DEV_VMX=y
 CONFIG_SYSTEM_TRUSTED_KEYRING=y
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index cfe39fc221cf8..f4b779c7352de 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -13,18 +13,10 @@ config CRYPTO_CURVE25519_PPC64
 	  Curve25519 algorithm
 
 	  Architecture: PowerPC64
 	  - Little-endian
 
-config CRYPTO_MD5_PPC
-	tristate "Digests: MD5"
-	select CRYPTO_HASH
-	help
-	  MD5 message digest algorithm (RFC1321)
-
-	  Architecture: powerpc
-
 config CRYPTO_AES_PPC_SPE
 	tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
 	depends on SPE
 	select CRYPTO_SKCIPHER
 	help
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index bc8fd27344b8b..9eb59dce67f36 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -4,17 +4,15 @@
 #
 # Arch-specific CryptoAPI modules.
 #
 
 obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
-obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
 obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
-md5-ppc-y := md5-asm.o md5-glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
 curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S
deleted file mode 100644
index fa6bc440cf4ac..0000000000000
--- a/arch/powerpc/crypto/md5-asm.S
+++ /dev/null
@@ -1,235 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Fast MD5 implementation for PPC
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-
-#define rHP	r3
-#define rWP	r4
-
-#define rH0	r0
-#define rH1	r6
-#define rH2	r7
-#define rH3	r5
-
-#define rW00	r8
-#define rW01	r9
-#define rW02	r10
-#define rW03	r11
-#define rW04	r12
-#define rW05	r14
-#define rW06	r15
-#define rW07	r16
-#define rW08	r17
-#define rW09	r18
-#define rW10	r19
-#define rW11	r20
-#define rW12	r21
-#define rW13	r22
-#define rW14	r23
-#define rW15	r24
-
-#define rT0	r25
-#define rT1	r26
-
-#define INITIALIZE \
-	PPC_STLU r1,-INT_FRAME_SIZE(r1); \
-	SAVE_GPRS(14, 26, r1)		/* push registers onto stack	*/
-
-#define FINALIZE \
-	REST_GPRS(14, 26, r1);		/* pop registers from stack	*/ \
-	addi	r1,r1,INT_FRAME_SIZE
-
-#ifdef __BIG_ENDIAN__
-#define LOAD_DATA(reg, off) \
-	lwbrx		reg,0,rWP;	/* load data			*/
-#define INC_PTR \
-	addi		rWP,rWP,4;	/* increment per word		*/
-#define NEXT_BLOCK			/* nothing to do		*/
-#else
-#define LOAD_DATA(reg, off) \
-	lwz		reg,off(rWP);	/* load data			*/
-#define INC_PTR				/* nothing to do		*/
-#define NEXT_BLOCK \
-	addi		rWP,rWP,64;	/* increment per block		*/
-#endif
-
-#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \
-	LOAD_DATA(w0, off)		/*    W				*/ \
-	and		rT0,b,c;	/* 1: f = b and c		*/ \
-	INC_PTR				/*    ptr++			*/ \
-	andc		rT1,d,b;	/* 1: f' = ~b and d		*/ \
-	LOAD_DATA(w1, off+4)		/*    W				*/ \
-	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
-	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
-	add		a,a,rT0;	/* 1: a = a + f			*/ \
-	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
-	addis		w1,w1,k1h;	/* 2: wk = w + k		*/ \
-	add		a,a,w0;		/* 1: a = a + wk		*/ \
-	addi		w1,w1,k1l;	/* 2: wk = w + k'		*/ \
-	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
-	add		d,d,w1;		/* 2: a = a + wk		*/ \
-	add		a,a,b;		/* 1: a = a + b			*/ \
-	and		rT0,a,b;	/* 2: f = b and c		*/ \
-	andc		rT1,c,a;	/* 2: f' = ~b and d		*/ \
-	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
-	add		d,d,rT0;	/* 2: a = a + f			*/ \
-	INC_PTR				/*    ptr++			*/ \
-	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
-	add		d,d,a;		/* 2: a = a + b			*/
-
-#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
-	andc		rT0,c,d;	/* 1: f = c and ~d		*/ \
-	and		rT1,b,d;	/* 1: f' = b and d		*/ \
-	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
-	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
-	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
-	add		a,a,rT0;	/* 1: a = a + f			*/ \
-	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
-	add		a,a,w0;		/* 1: a = a + wk		*/ \
-	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
-	andc		rT0,b,c;	/* 2: f = c and ~d		*/ \
-	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
-	add		a,a,b;		/* 1: a = a + b			*/ \
-	add		d,d,w1;		/* 2: a = a + wk		*/ \
-	and		rT1,a,c;	/* 2: f' = b and d		*/ \
-	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
-	add		d,d,rT0;	/* 2: a = a + f			*/ \
-	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
-	add		d,d,a;		/* 2: a = a +b			*/
-
-#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
-	xor		rT0,b,c;	/* 1: f' = b xor c		*/ \
-	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
-	xor		rT1,rT0,d;	/* 1: f = f xor f'		*/ \
-	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
-	add		a,a,rT1;	/* 1: a = a + f			*/ \
-	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
-	add		a,a,w0;		/* 1: a = a + wk		*/ \
-	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
-	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
-	add		d,d,w1;		/* 2: a = a + wk		*/ \
-	add		a,a,b;		/* 1: a = a + b			*/ \
-	xor		rT1,rT0,a;	/* 2: f = b xor f'		*/ \
-	add		d,d,rT1;	/* 2: a = a + f			*/ \
-	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
-	add		d,d,a;		/* 2: a = a + b			*/
-
-#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
-	addi		w0,w0,k0l;	/* 1: w = w + k			*/ \
-	orc		rT0,b,d;	/* 1: f = b or ~d		*/ \
-	addis		w0,w0,k0h;	/* 1: w = w + k'		*/ \
-	xor		rT0,rT0,c;	/* 1: f = f xor c		*/ \
-	add		a,a,w0;		/* 1: a = a + wk		*/ \
-	addi		w1,w1,k1l;	/* 2: w = w + k			*/ \
-	add		a,a,rT0;	/* 1: a = a + f			*/ \
-	addis		w1,w1,k1h;	/* 2: w = w + k'		*/ \
-	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
-	add		a,a,b;		/* 1: a = a + b			*/ \
-	orc		rT0,a,c;	/* 2: f = b or ~d		*/ \
-	add		d,d,w1;		/* 2: a = a + wk		*/ \
-	xor		rT0,rT0,b;	/* 2: f = f xor c		*/ \
-	add		d,d,rT0;	/* 2: a = a + f			*/ \
-	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
-	add		d,d,a;		/* 2: a = a + b			*/
-
-_GLOBAL(ppc_md5_transform)
-	INITIALIZE
-
-	mtctr		r5
-	lwz		rH0,0(rHP)
-	lwz		rH1,4(rHP)
-	lwz		rH2,8(rHP)
-	lwz		rH3,12(rHP)
-
-ppc_md5_main:
-	R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0,
-		0xd76b, -23432, 0xe8c8, -18602)
-	R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8,
-		0x2420, 0x70db, 0xc1be, -12562)
-	R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16,
-		0xf57c, 0x0faf, 0x4788, -14806)
-	R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24,
-		0xa830, 0x4613, 0xfd47, -27391)
-	R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32,
-		0x6981, -26408, 0x8b45,  -2129)
-	R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40,
-		0xffff, 0x5bb1, 0x895d, -10306)
-	R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48,
-		0x6b90, 0x1122, 0xfd98, 0x7193)
-	R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56,
-		0xa679, 0x438e, 0x49b4, 0x0821)
-
-	R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23,
-		0x0d56, 0x6e0c, 0x1810, 0x6d2d)
-	R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12,
-		0x9d02, -32109, 0x124c, 0x2332)
-	R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23,
-		0x8ea7, 0x4a33, 0x0245, -18270)
-	R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12,
-		0x8eee,  -8608, 0xf258,  -5095)
-	R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23,
-		0x969d, -10697, 0x1cbe, -15288)
-	R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12,
-		0x3317, 0x3e99, 0xdbd9, 0x7c15)
-	R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23,
-		0xac4b, 0x7772, 0xd8cf, 0x331d)
-	R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12,
-		0x6a28, 0x6dd8, 0x219a, 0x3b68)
-
-	R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21,
-		0x29cb, 0x28e5, 0x4218,  -7788)
-	R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16,  9,
-		0x473f, 0x06d1, 0x3aae, 0x3036)
-	R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21,
-		0xaea1, -15134, 0x640b, -11295)
-	R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16,  9,
-		0x8f4c, 0x4887, 0xbc7c, -22499)
-	R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21,
-		0x7eb8, -27199, 0x00ea, 0x6050)
-	R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16,  9,
-		0xe01a, 0x22fe, 0x4447, 0x69c5)
-	R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21,
-		0xb7f3, 0x0253, 0x59b1, 0x4d5b)
-	R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16,  9,
-		0x4701, -27017, 0xc7bd, -19859)
-
-	R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22,
-		0x0988,  -1462, 0x4c70, -19401)
-	R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11,
-		0xadaf,  -5221, 0xfc99, 0x66f7)
-	R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22,
-		0x7e80, -16418, 0xba1e, -25587)
-	R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11,
-		0x4130, 0x380d, 0xe0c5, 0x738d)
-	lwz		rW00,0(rHP)
-	R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22,
-		0xe837, -30770, 0xde8a, 0x69e8)
-	lwz		rW14,4(rHP)
-	R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11,
-		0x9e79, 0x260f, 0x256d, -27941)
-	lwz		rW12,8(rHP)
-	R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22,
-		0xab75, -20775, 0x4f9e, -28397)
-	lwz		rW10,12(rHP)
-	R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11,
-		0x662b, 0x7c56, 0x11b2, 0x0358)
-
-	add		rH0,rH0,rW00
-	stw		rH0,0(rHP)
-	add		rH1,rH1,rW14
-	stw		rH1,4(rHP)
-	add		rH2,rH2,rW12
-	stw		rH2,8(rHP)
-	add		rH3,rH3,rW10
-	stw		rH3,12(rHP)
-	NEXT_BLOCK
-
-	bdnz		ppc_md5_main
-
-	FINALIZE
-	blr
diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
deleted file mode 100644
index 204440a90cd84..0000000000000
--- a/arch/powerpc/crypto/md5-glue.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for MD5 implementation for PPC assembler
- *
- * Based on generic implementation.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/md5.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks);
-
-static int ppc_md5_init(struct shash_desc *desc)
-{
-	struct md5_state *sctx = shash_desc_ctx(desc);
-
-	sctx->hash[0] = MD5_H0;
-	sctx->hash[1] = MD5_H1;
-	sctx->hash[2] = MD5_H2;
-	sctx->hash[3] =	MD5_H3;
-	sctx->byte_count = 0;
-
-	return 0;
-}
-
-static int ppc_md5_update(struct shash_desc *desc, const u8 *data,
-			unsigned int len)
-{
-	struct md5_state *sctx = shash_desc_ctx(desc);
-
-	sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE);
-	ppc_md5_transform(sctx->hash, data, len >> 6);
-	return len - round_down(len, MD5_HMAC_BLOCK_SIZE);
-}
-
-static int ppc_md5_finup(struct shash_desc *desc, const u8 *src,
-			 unsigned int offset, u8 *out)
-{
-	struct md5_state *sctx = shash_desc_ctx(desc);
-	__le64 block[MD5_BLOCK_WORDS] = {};
-	u8 *p = memcpy(block, src, offset);
-	__le32 *dst = (__le32 *)out;
-	__le64 *pbits;
-
-	src = p;
-	p += offset;
-	*p++ = 0x80;
-	sctx->byte_count += offset;
-	pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1];
-	*pbits = cpu_to_le64(sctx->byte_count << 3);
-	ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8);
-	memzero_explicit(block, sizeof(block));
-
-	dst[0] = cpu_to_le32(sctx->hash[0]);
-	dst[1] = cpu_to_le32(sctx->hash[1]);
-	dst[2] = cpu_to_le32(sctx->hash[2]);
-	dst[3] = cpu_to_le32(sctx->hash[3]);
-	return 0;
-}
-
-static struct shash_alg alg = {
-	.digestsize	=	MD5_DIGEST_SIZE,
-	.init		=	ppc_md5_init,
-	.update		=	ppc_md5_update,
-	.finup		=	ppc_md5_finup,
-	.descsize	=	MD5_STATE_SIZE,
-	.base		=	{
-		.cra_name	=	"md5",
-		.cra_driver_name=	"md5-ppc",
-		.cra_priority	=	200,
-		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize	=	MD5_HMAC_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static int __init ppc_md5_mod_init(void)
-{
-	return crypto_register_shash(&alg);
-}
-
-static void __exit ppc_md5_mod_fini(void)
-{
-	crypto_unregister_shash(&alg);
-}
-
-module_init(ppc_md5_mod_init);
-module_exit(ppc_md5_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, PPC assembler");
-
-MODULE_ALIAS_CRYPTO("md5");
-MODULE_ALIAS_CRYPTO("md5-ppc");
-- 
2.50.1
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Segher Boessenkool 2 months ago
On Sun, Aug 03, 2025 at 01:44:29PM -0700, Eric Biggers wrote:
> MD5 is insecure,

Really?  Have you found an attack?  Can you explain it to the rest of
the world?

MD5 is not recommended for future cryptographic purposes, there have
been some collission "attacks" on it (quotes because such a thing is
never an attack at all, merely an indication that not all is well with
it, somewhere in the future an actual vulnerability might be found).

Since there are newer, better, *cheaper* alternatives available, of
course you should not use MD5 for anything new anymore.  But claiming it
is insecure is FUD.

> This commit removes the PowerPC optimized MD5 code.

Why?  It would help to have real arguments for it!

If you just want to remove code you have no interest in, there are much
bigger targets available...


Segher
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Eric Biggers 2 months ago
On Sun, Aug 03, 2025 at 05:07:10PM -0500, Segher Boessenkool wrote:
> On Sun, Aug 03, 2025 at 01:44:29PM -0700, Eric Biggers wrote:
> > MD5 is insecure,
> 
> Really?  Have you found an attack?  Can you explain it to the rest of
> the world?
> 
> MD5 is not recommended for future cryptographic purposes, there have
> been some collission "attacks" on it (quotes because such a thing is
> never an attack at all, merely an indication that not all is well with
> it, somewhere in the future an actual vulnerability might be found).
> 
> Since there are newer, better, *cheaper* alternatives available, of
> course you should not use MD5 for anything new anymore.  But claiming it
> is insecure is FUD.

Many attacks, including practical attacks, have been found on MD5 over
the past few decades.  Check out https://en.wikipedia.org/wiki/MD5

> > This commit removes the PowerPC optimized MD5 code.
> 
> Why?  It would help to have real arguments for it!

Sure, check out the commit message which mentioned multiple reasons why
maintaining this code is not worthwhile.

- Eric
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Segher Boessenkool 2 months ago
On Sun, Aug 03, 2025 at 03:14:38PM -0700, Eric Biggers wrote:
> On Sun, Aug 03, 2025 at 05:07:10PM -0500, Segher Boessenkool wrote:
> > On Sun, Aug 03, 2025 at 01:44:29PM -0700, Eric Biggers wrote:
> > > MD5 is insecure,
> > 
> > Really?  Have you found an attack?  Can you explain it to the rest of
> > the world?
> > 
> > MD5 is not recommended for future cryptographic purposes, there have
> > been some collission "attacks" on it (quotes because such a thing is
> > never an attack at all, merely an indication that not all is well with
> > it, somewhere in the future an actual vulnerability might be found).
> > 
> > Since there are newer, better, *cheaper* alternatives available, of
> > course you should not use MD5 for anything new anymore.  But claiming it
> > is insecure is FUD.
> 
> Many attacks, including practical attacks, have been found on MD5 over
> the past few decades.  Check out https://en.wikipedia.org/wiki/MD5

There is no new information on that page.  There are no practical
attacks mentioned there, either, just some collission things (which
never can be practical attacks for most applications).

> > > This commit removes the PowerPC optimized MD5 code.
> > 
> > Why?  It would help to have real arguments for it!
> 
> Sure, check out the commit message which mentioned multiple reasons why
> maintaining this code is not worthwhile.

Of course I have read that, but that information went missing, if you
intended to provide it :-(

You are replacing a known-working target implementation by a lower
performance generic implementation.  But is that one known-working at
all?  Does it come with tests?  Was it tested to have the same outputs
as the existing thing, maybe?  Just on a few inputs maybe.

We were not told anything like that.


Segher
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Eric Biggers 2 months ago
On Sun, Aug 03, 2025 at 05:27:01PM -0500, Segher Boessenkool wrote:
> You are replacing a known-working target implementation by a lower
> performance generic implementation.

That's probably correct, though FWIW there have been quite a few cases
where optimized assembly code in the kernel actually turned out to be
slower than the C code.  (That primarily happens when the assembly code
doesn't take advantage of any special CPU features, which was the case
for this PowerPC code.)  I don't have PowerPC hardware to check the
exact performance differential here, but IMO even if there was a
slowdown the factors still weigh strongly in favor of retiring this.

> But is that one known-working at all?  Does it come with tests?  Was
> it tested to have the same outputs as the existing thing, maybe?  Just
> on a few inputs maybe.

Of course.  Patch 7 adds a KUnit test suite for MD5, and there are still
the older tests in crypto/testmgr.c.  And of course generic code is much
easier to test than arch-specific code.  So not only is it tested, but
the test coverage is much better than it was before.

- Eric
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Christophe Leroy 2 months ago

Le 03/08/2025 à 22:44, Eric Biggers a écrit :
> MD5 is insecure, is no longer commonly used, and has never been
> optimized for the most common architectures in the kernel.  Only mips,
> powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
> only the powerpc one is actually testable in QEMU.  The mips one works
> only on Cavium Octeon SoCs.
> 
> Taken together, it's clear that it's time to retire these additional MD5
> implementations, and focus maintenance on the MD5 generic C code.

Sorry, for me it is not that clear. Even if MD5 is depracated we still 
have several applications that use MD5 for various reasons on our boards.

I ran the test on kernel v6.16 with following file:

# ls -l avion.au
-rw-------    1 root     root      12130159 Jan  1  1970 avion.au

With CONFIG_CRYPTO_MD5_PPC:

# time md5sum avion.au
6513851d6109d42477b20cd56bf57f28  avion.au
real    0m 1.02s
user    0m 0.01s
sys     0m 1.01s

Without CONFIG_CRYPTO_MD5_PPC:

# time md5sum avion.au
6513851d6109d42477b20cd56bf57f28  avion.au
real    0m 1.35s
user    0m 0.01s
sys     0m 1.34s

I think the difference is big enough to consider keeping optimised MD5 code.

Christophe



> 
> This commit removes the PowerPC optimized MD5 code.
> 
> Signed-off-by: Eric Biggers <ebiggers@kernel.org>
> ---
>   arch/powerpc/configs/powernv_defconfig |   1 -
>   arch/powerpc/configs/ppc64_defconfig   |   1 -
>   arch/powerpc/crypto/Kconfig            |   8 -
>   arch/powerpc/crypto/Makefile           |   2 -
>   arch/powerpc/crypto/md5-asm.S          | 235 -------------------------
>   arch/powerpc/crypto/md5-glue.c         |  99 -----------
>   6 files changed, 346 deletions(-)
>   delete mode 100644 arch/powerpc/crypto/md5-asm.S
>   delete mode 100644 arch/powerpc/crypto/md5-glue.c
> 
> diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
> index d06388b0f66e3..bd4685612de6d 100644
> --- a/arch/powerpc/configs/powernv_defconfig
> +++ b/arch/powerpc/configs/powernv_defconfig
> @@ -318,11 +318,10 @@ CONFIG_FTR_FIXUP_SELFTEST=y
>   CONFIG_MSI_BITMAP_SELFTEST=y
>   CONFIG_XMON=y
>   CONFIG_CRYPTO_BENCHMARK=m
>   CONFIG_CRYPTO_PCBC=m
>   CONFIG_CRYPTO_HMAC=y
> -CONFIG_CRYPTO_MD5_PPC=m
>   CONFIG_CRYPTO_MICHAEL_MIC=m
>   CONFIG_CRYPTO_SHA256=y
>   CONFIG_CRYPTO_WP512=m
>   CONFIG_CRYPTO_ANUBIS=m
>   CONFIG_CRYPTO_BLOWFISH=m
> diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
> index ce34597e9f3e1..2d92c11eea7e4 100644
> --- a/arch/powerpc/configs/ppc64_defconfig
> +++ b/arch/powerpc/configs/ppc64_defconfig
> @@ -385,11 +385,10 @@ CONFIG_CRYPTO_TWOFISH=m
>   CONFIG_CRYPTO_PCBC=m
>   CONFIG_CRYPTO_MICHAEL_MIC=m
>   CONFIG_CRYPTO_SHA256=y
>   CONFIG_CRYPTO_WP512=m
>   CONFIG_CRYPTO_LZO=m
> -CONFIG_CRYPTO_MD5_PPC=m
>   CONFIG_CRYPTO_AES_GCM_P10=m
>   CONFIG_CRYPTO_DEV_NX=y
>   CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
>   CONFIG_CRYPTO_DEV_VMX=y
>   CONFIG_SYSTEM_TRUSTED_KEYRING=y
> diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
> index cfe39fc221cf8..f4b779c7352de 100644
> --- a/arch/powerpc/crypto/Kconfig
> +++ b/arch/powerpc/crypto/Kconfig
> @@ -13,18 +13,10 @@ config CRYPTO_CURVE25519_PPC64
>   	  Curve25519 algorithm
>   
>   	  Architecture: PowerPC64
>   	  - Little-endian
>   
> -config CRYPTO_MD5_PPC
> -	tristate "Digests: MD5"
> -	select CRYPTO_HASH
> -	help
> -	  MD5 message digest algorithm (RFC1321)
> -
> -	  Architecture: powerpc
> -
>   config CRYPTO_AES_PPC_SPE
>   	tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
>   	depends on SPE
>   	select CRYPTO_SKCIPHER
>   	help
> diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
> index bc8fd27344b8b..9eb59dce67f36 100644
> --- a/arch/powerpc/crypto/Makefile
> +++ b/arch/powerpc/crypto/Makefile
> @@ -4,17 +4,15 @@
>   #
>   # Arch-specific CryptoAPI modules.
>   #
>   
>   obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
> -obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
>   obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
>   obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
>   obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
>   
>   aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
> -md5-ppc-y := md5-asm.o md5-glue.o
>   aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
>   vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
>   curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
>   
>   ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
> diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S
> deleted file mode 100644
> index fa6bc440cf4ac..0000000000000
> --- a/arch/powerpc/crypto/md5-asm.S
> +++ /dev/null
> @@ -1,235 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-or-later */
> -/*
> - * Fast MD5 implementation for PPC
> - *
> - * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
> - */
> -#include <asm/ppc_asm.h>
> -#include <asm/asm-offsets.h>
> -#include <asm/asm-compat.h>
> -
> -#define rHP	r3
> -#define rWP	r4
> -
> -#define rH0	r0
> -#define rH1	r6
> -#define rH2	r7
> -#define rH3	r5
> -
> -#define rW00	r8
> -#define rW01	r9
> -#define rW02	r10
> -#define rW03	r11
> -#define rW04	r12
> -#define rW05	r14
> -#define rW06	r15
> -#define rW07	r16
> -#define rW08	r17
> -#define rW09	r18
> -#define rW10	r19
> -#define rW11	r20
> -#define rW12	r21
> -#define rW13	r22
> -#define rW14	r23
> -#define rW15	r24
> -
> -#define rT0	r25
> -#define rT1	r26
> -
> -#define INITIALIZE \
> -	PPC_STLU r1,-INT_FRAME_SIZE(r1); \
> -	SAVE_GPRS(14, 26, r1)		/* push registers onto stack	*/
> -
> -#define FINALIZE \
> -	REST_GPRS(14, 26, r1);		/* pop registers from stack	*/ \
> -	addi	r1,r1,INT_FRAME_SIZE
> -
> -#ifdef __BIG_ENDIAN__
> -#define LOAD_DATA(reg, off) \
> -	lwbrx		reg,0,rWP;	/* load data			*/
> -#define INC_PTR \
> -	addi		rWP,rWP,4;	/* increment per word		*/
> -#define NEXT_BLOCK			/* nothing to do		*/
> -#else
> -#define LOAD_DATA(reg, off) \
> -	lwz		reg,off(rWP);	/* load data			*/
> -#define INC_PTR				/* nothing to do		*/
> -#define NEXT_BLOCK \
> -	addi		rWP,rWP,64;	/* increment per block		*/
> -#endif
> -
> -#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \
> -	LOAD_DATA(w0, off)		/*    W				*/ \
> -	and		rT0,b,c;	/* 1: f = b and c		*/ \
> -	INC_PTR				/*    ptr++			*/ \
> -	andc		rT1,d,b;	/* 1: f' = ~b and d		*/ \
> -	LOAD_DATA(w1, off+4)		/*    W				*/ \
> -	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
> -	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
> -	add		a,a,rT0;	/* 1: a = a + f			*/ \
> -	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
> -	addis		w1,w1,k1h;	/* 2: wk = w + k		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addi		w1,w1,k1l;	/* 2: wk = w + k'		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	and		rT0,a,b;	/* 2: f = b and c		*/ \
> -	andc		rT1,c,a;	/* 2: f' = ~b and d		*/ \
> -	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
> -	add		d,d,rT0;	/* 2: a = a + f			*/ \
> -	INC_PTR				/*    ptr++			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a + b			*/
> -
> -#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
> -	andc		rT0,c,d;	/* 1: f = c and ~d		*/ \
> -	and		rT1,b,d;	/* 1: f' = b and d		*/ \
> -	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
> -	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
> -	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
> -	add		a,a,rT0;	/* 1: a = a + f			*/ \
> -	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
> -	andc		rT0,b,c;	/* 2: f = c and ~d		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	and		rT1,a,c;	/* 2: f' = b and d		*/ \
> -	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
> -	add		d,d,rT0;	/* 2: a = a + f			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a +b			*/
> -
> -#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
> -	xor		rT0,b,c;	/* 1: f' = b xor c		*/ \
> -	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
> -	xor		rT1,rT0,d;	/* 1: f = f xor f'		*/ \
> -	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
> -	add		a,a,rT1;	/* 1: a = a + f			*/ \
> -	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	xor		rT1,rT0,a;	/* 2: f = b xor f'		*/ \
> -	add		d,d,rT1;	/* 2: a = a + f			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a + b			*/
> -
> -#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
> -	addi		w0,w0,k0l;	/* 1: w = w + k			*/ \
> -	orc		rT0,b,d;	/* 1: f = b or ~d		*/ \
> -	addis		w0,w0,k0h;	/* 1: w = w + k'		*/ \
> -	xor		rT0,rT0,c;	/* 1: f = f xor c		*/ \
> -	add		a,a,w0;		/* 1: a = a + wk		*/ \
> -	addi		w1,w1,k1l;	/* 2: w = w + k			*/ \
> -	add		a,a,rT0;	/* 1: a = a + f			*/ \
> -	addis		w1,w1,k1h;	/* 2: w = w + k'		*/ \
> -	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
> -	add		a,a,b;		/* 1: a = a + b			*/ \
> -	orc		rT0,a,c;	/* 2: f = b or ~d		*/ \
> -	add		d,d,w1;		/* 2: a = a + wk		*/ \
> -	xor		rT0,rT0,b;	/* 2: f = f xor c		*/ \
> -	add		d,d,rT0;	/* 2: a = a + f			*/ \
> -	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
> -	add		d,d,a;		/* 2: a = a + b			*/
> -
> -_GLOBAL(ppc_md5_transform)
> -	INITIALIZE
> -
> -	mtctr		r5
> -	lwz		rH0,0(rHP)
> -	lwz		rH1,4(rHP)
> -	lwz		rH2,8(rHP)
> -	lwz		rH3,12(rHP)
> -
> -ppc_md5_main:
> -	R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0,
> -		0xd76b, -23432, 0xe8c8, -18602)
> -	R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8,
> -		0x2420, 0x70db, 0xc1be, -12562)
> -	R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16,
> -		0xf57c, 0x0faf, 0x4788, -14806)
> -	R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24,
> -		0xa830, 0x4613, 0xfd47, -27391)
> -	R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32,
> -		0x6981, -26408, 0x8b45,  -2129)
> -	R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40,
> -		0xffff, 0x5bb1, 0x895d, -10306)
> -	R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48,
> -		0x6b90, 0x1122, 0xfd98, 0x7193)
> -	R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56,
> -		0xa679, 0x438e, 0x49b4, 0x0821)
> -
> -	R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23,
> -		0x0d56, 0x6e0c, 0x1810, 0x6d2d)
> -	R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12,
> -		0x9d02, -32109, 0x124c, 0x2332)
> -	R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23,
> -		0x8ea7, 0x4a33, 0x0245, -18270)
> -	R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12,
> -		0x8eee,  -8608, 0xf258,  -5095)
> -	R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23,
> -		0x969d, -10697, 0x1cbe, -15288)
> -	R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12,
> -		0x3317, 0x3e99, 0xdbd9, 0x7c15)
> -	R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23,
> -		0xac4b, 0x7772, 0xd8cf, 0x331d)
> -	R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12,
> -		0x6a28, 0x6dd8, 0x219a, 0x3b68)
> -
> -	R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21,
> -		0x29cb, 0x28e5, 0x4218,  -7788)
> -	R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16,  9,
> -		0x473f, 0x06d1, 0x3aae, 0x3036)
> -	R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21,
> -		0xaea1, -15134, 0x640b, -11295)
> -	R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16,  9,
> -		0x8f4c, 0x4887, 0xbc7c, -22499)
> -	R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21,
> -		0x7eb8, -27199, 0x00ea, 0x6050)
> -	R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16,  9,
> -		0xe01a, 0x22fe, 0x4447, 0x69c5)
> -	R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21,
> -		0xb7f3, 0x0253, 0x59b1, 0x4d5b)
> -	R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16,  9,
> -		0x4701, -27017, 0xc7bd, -19859)
> -
> -	R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22,
> -		0x0988,  -1462, 0x4c70, -19401)
> -	R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11,
> -		0xadaf,  -5221, 0xfc99, 0x66f7)
> -	R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22,
> -		0x7e80, -16418, 0xba1e, -25587)
> -	R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11,
> -		0x4130, 0x380d, 0xe0c5, 0x738d)
> -	lwz		rW00,0(rHP)
> -	R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22,
> -		0xe837, -30770, 0xde8a, 0x69e8)
> -	lwz		rW14,4(rHP)
> -	R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11,
> -		0x9e79, 0x260f, 0x256d, -27941)
> -	lwz		rW12,8(rHP)
> -	R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22,
> -		0xab75, -20775, 0x4f9e, -28397)
> -	lwz		rW10,12(rHP)
> -	R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11,
> -		0x662b, 0x7c56, 0x11b2, 0x0358)
> -
> -	add		rH0,rH0,rW00
> -	stw		rH0,0(rHP)
> -	add		rH1,rH1,rW14
> -	stw		rH1,4(rHP)
> -	add		rH2,rH2,rW12
> -	stw		rH2,8(rHP)
> -	add		rH3,rH3,rW10
> -	stw		rH3,12(rHP)
> -	NEXT_BLOCK
> -
> -	bdnz		ppc_md5_main
> -
> -	FINALIZE
> -	blr
> diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
> deleted file mode 100644
> index 204440a90cd84..0000000000000
> --- a/arch/powerpc/crypto/md5-glue.c
> +++ /dev/null
> @@ -1,99 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0-or-later
> -/*
> - * Glue code for MD5 implementation for PPC assembler
> - *
> - * Based on generic implementation.
> - *
> - * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
> - */
> -
> -#include <crypto/internal/hash.h>
> -#include <crypto/md5.h>
> -#include <linux/kernel.h>
> -#include <linux/module.h>
> -#include <linux/string.h>
> -
> -extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks);
> -
> -static int ppc_md5_init(struct shash_desc *desc)
> -{
> -	struct md5_state *sctx = shash_desc_ctx(desc);
> -
> -	sctx->hash[0] = MD5_H0;
> -	sctx->hash[1] = MD5_H1;
> -	sctx->hash[2] = MD5_H2;
> -	sctx->hash[3] =	MD5_H3;
> -	sctx->byte_count = 0;
> -
> -	return 0;
> -}
> -
> -static int ppc_md5_update(struct shash_desc *desc, const u8 *data,
> -			unsigned int len)
> -{
> -	struct md5_state *sctx = shash_desc_ctx(desc);
> -
> -	sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE);
> -	ppc_md5_transform(sctx->hash, data, len >> 6);
> -	return len - round_down(len, MD5_HMAC_BLOCK_SIZE);
> -}
> -
> -static int ppc_md5_finup(struct shash_desc *desc, const u8 *src,
> -			 unsigned int offset, u8 *out)
> -{
> -	struct md5_state *sctx = shash_desc_ctx(desc);
> -	__le64 block[MD5_BLOCK_WORDS] = {};
> -	u8 *p = memcpy(block, src, offset);
> -	__le32 *dst = (__le32 *)out;
> -	__le64 *pbits;
> -
> -	src = p;
> -	p += offset;
> -	*p++ = 0x80;
> -	sctx->byte_count += offset;
> -	pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1];
> -	*pbits = cpu_to_le64(sctx->byte_count << 3);
> -	ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8);
> -	memzero_explicit(block, sizeof(block));
> -
> -	dst[0] = cpu_to_le32(sctx->hash[0]);
> -	dst[1] = cpu_to_le32(sctx->hash[1]);
> -	dst[2] = cpu_to_le32(sctx->hash[2]);
> -	dst[3] = cpu_to_le32(sctx->hash[3]);
> -	return 0;
> -}
> -
> -static struct shash_alg alg = {
> -	.digestsize	=	MD5_DIGEST_SIZE,
> -	.init		=	ppc_md5_init,
> -	.update		=	ppc_md5_update,
> -	.finup		=	ppc_md5_finup,
> -	.descsize	=	MD5_STATE_SIZE,
> -	.base		=	{
> -		.cra_name	=	"md5",
> -		.cra_driver_name=	"md5-ppc",
> -		.cra_priority	=	200,
> -		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
> -		.cra_blocksize	=	MD5_HMAC_BLOCK_SIZE,
> -		.cra_module	=	THIS_MODULE,
> -	}
> -};
> -
> -static int __init ppc_md5_mod_init(void)
> -{
> -	return crypto_register_shash(&alg);
> -}
> -
> -static void __exit ppc_md5_mod_fini(void)
> -{
> -	crypto_unregister_shash(&alg);
> -}
> -
> -module_init(ppc_md5_mod_init);
> -module_exit(ppc_md5_mod_fini);
> -
> -MODULE_LICENSE("GPL");
> -MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, PPC assembler");
> -
> -MODULE_ALIAS_CRYPTO("md5");
> -MODULE_ALIAS_CRYPTO("md5-ppc");

Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Eric Biggers 2 months ago
On Mon, Aug 04, 2025 at 07:42:15PM +0200, Christophe Leroy wrote:
> 
> 
> Le 03/08/2025 à 22:44, Eric Biggers a écrit :
> > MD5 is insecure, is no longer commonly used, and has never been
> > optimized for the most common architectures in the kernel.  Only mips,
> > powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
> > only the powerpc one is actually testable in QEMU.  The mips one works
> > only on Cavium Octeon SoCs.
> > 
> > Taken together, it's clear that it's time to retire these additional MD5
> > implementations, and focus maintenance on the MD5 generic C code.
> 
> Sorry, for me it is not that clear. Even if MD5 is depracated we still have
> several applications that use MD5 for various reasons on our boards.
> 
> I ran the test on kernel v6.16 with following file:
> 
> # ls -l avion.au
> -rw-------    1 root     root      12130159 Jan  1  1970 avion.au
> 
> With CONFIG_CRYPTO_MD5_PPC:
> 
> # time md5sum avion.au
> 6513851d6109d42477b20cd56bf57f28  avion.au
> real    0m 1.02s
> user    0m 0.01s
> sys     0m 1.01s
> 
> Without CONFIG_CRYPTO_MD5_PPC:
> 
> # time md5sum avion.au
> 6513851d6109d42477b20cd56bf57f28  avion.au
> real    0m 1.35s
> user    0m 0.01s
> sys     0m 1.34s
> 
> I think the difference is big enough to consider keeping optimised MD5 code.

But md5sum doesn't use the kernel's MD5 code.  So it's implausible that
it has any effect on md5sum.  The difference you saw must be due to an
unrelated reason like I/O caching, CPU frequency, etc.  Try running your
test multiple times to eliminate other sources of variation.

- Eric
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Christophe Leroy 2 months ago

Le 04/08/2025 à 20:09, Eric Biggers a écrit :
> On Mon, Aug 04, 2025 at 07:42:15PM +0200, Christophe Leroy wrote:
>>
>>
>> Le 03/08/2025 à 22:44, Eric Biggers a écrit :
>>> MD5 is insecure, is no longer commonly used, and has never been
>>> optimized for the most common architectures in the kernel.  Only mips,
>>> powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
>>> only the powerpc one is actually testable in QEMU.  The mips one works
>>> only on Cavium Octeon SoCs.
>>>
>>> Taken together, it's clear that it's time to retire these additional MD5
>>> implementations, and focus maintenance on the MD5 generic C code.
>>
>> Sorry, for me it is not that clear. Even if MD5 is depracated we still have
>> several applications that use MD5 for various reasons on our boards.
>>
>> I ran the test on kernel v6.16 with following file:
>>
>> # ls -l avion.au
>> -rw-------    1 root     root      12130159 Jan  1  1970 avion.au
>>
>> With CONFIG_CRYPTO_MD5_PPC:
>>
>> # time md5sum avion.au
>> 6513851d6109d42477b20cd56bf57f28  avion.au
>> real    0m 1.02s
>> user    0m 0.01s
>> sys     0m 1.01s
>>
>> Without CONFIG_CRYPTO_MD5_PPC:
>>
>> # time md5sum avion.au
>> 6513851d6109d42477b20cd56bf57f28  avion.au
>> real    0m 1.35s
>> user    0m 0.01s
>> sys     0m 1.34s
>>
>> I think the difference is big enough to consider keeping optimised MD5 code.
> 
> But md5sum doesn't use the kernel's MD5 code.  So it's implausible that
> it has any effect on md5sum.  The difference you saw must be due to an
> unrelated reason like I/O caching, CPU frequency, etc.  Try running your
> test multiple times to eliminate other sources of variation.

md5sum uses the kernel's MD5 code:

# ldd `which md5sum`
         linux-vdso32.so.1 (0x77b90000)
         libkcapi.so.1 => /usr/lib/libkcapi.so.1 (0x6ffa0000)    <==
         libcrypt.so.1 => /lib/libcrypt.so.1 (0x6ff50000)
         libc.so.6 => /lib/libc.so.6 (0x6fd10000)
         /lib/ld.so.1 => //lib/ld.so.1 (0x77ba0000)

Previous test was on an mpc8xx.

I now did the test on mpc832x and the difference is even bigger:

With CONFIG_CRYPTO_MD5_PPC:

# time md5sum avion.au
6513851d6109d42477b20cd56bf57f28  avion.au
real    0m 0.41s
user    0m 0.00s
sys     0m 0.34s

Without CONFIG_CRYPTO_MD5_PPC:

# time md5sum avion.au
6513851d6109d42477b20cd56bf57f28  avion.au
real    0m 0.58s
user    0m 0.00s
sys     0m 0.47s

Christophe
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Eric Biggers 2 months ago
On Mon, Aug 04, 2025 at 09:02:27PM +0200, Christophe Leroy wrote:
> 
> 
> Le 04/08/2025 à 20:09, Eric Biggers a écrit :
> > On Mon, Aug 04, 2025 at 07:42:15PM +0200, Christophe Leroy wrote:
> > > 
> > > 
> > > Le 03/08/2025 à 22:44, Eric Biggers a écrit :
> > > > MD5 is insecure, is no longer commonly used, and has never been
> > > > optimized for the most common architectures in the kernel.  Only mips,
> > > > powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
> > > > only the powerpc one is actually testable in QEMU.  The mips one works
> > > > only on Cavium Octeon SoCs.
> > > > 
> > > > Taken together, it's clear that it's time to retire these additional MD5
> > > > implementations, and focus maintenance on the MD5 generic C code.
> > > 
> > > Sorry, for me it is not that clear. Even if MD5 is depracated we still have
> > > several applications that use MD5 for various reasons on our boards.
> > > 
> > > I ran the test on kernel v6.16 with following file:
> > > 
> > > # ls -l avion.au
> > > -rw-------    1 root     root      12130159 Jan  1  1970 avion.au
> > > 
> > > With CONFIG_CRYPTO_MD5_PPC:
> > > 
> > > # time md5sum avion.au
> > > 6513851d6109d42477b20cd56bf57f28  avion.au
> > > real    0m 1.02s
> > > user    0m 0.01s
> > > sys     0m 1.01s
> > > 
> > > Without CONFIG_CRYPTO_MD5_PPC:
> > > 
> > > # time md5sum avion.au
> > > 6513851d6109d42477b20cd56bf57f28  avion.au
> > > real    0m 1.35s
> > > user    0m 0.01s
> > > sys     0m 1.34s
> > > 
> > > I think the difference is big enough to consider keeping optimised MD5 code.
> > 
> > But md5sum doesn't use the kernel's MD5 code.  So it's implausible that
> > it has any effect on md5sum.  The difference you saw must be due to an
> > unrelated reason like I/O caching, CPU frequency, etc.  Try running your
> > test multiple times to eliminate other sources of variation.
> 
> md5sum uses the kernel's MD5 code:

What?  That's crazy.  Userspace MD5 code would be faster and more
reliable.  No need to make syscalls, transfer data to and from the
kernel, have an external dependency, etc.  Is this the coreutils md5sum?
We need to get this reported and fixed.

- Eric
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Eric Biggers 2 months ago
On Mon, Aug 04, 2025 at 10:59:01PM +0000, Eric Biggers wrote:
> On Mon, Aug 04, 2025 at 09:02:27PM +0200, Christophe Leroy wrote:
> > 
> > 
> > Le 04/08/2025 à 20:09, Eric Biggers a écrit :
> > > On Mon, Aug 04, 2025 at 07:42:15PM +0200, Christophe Leroy wrote:
> > > > 
> > > > 
> > > > Le 03/08/2025 à 22:44, Eric Biggers a écrit :
> > > > > MD5 is insecure, is no longer commonly used, and has never been
> > > > > optimized for the most common architectures in the kernel.  Only mips,
> > > > > powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
> > > > > only the powerpc one is actually testable in QEMU.  The mips one works
> > > > > only on Cavium Octeon SoCs.
> > > > > 
> > > > > Taken together, it's clear that it's time to retire these additional MD5
> > > > > implementations, and focus maintenance on the MD5 generic C code.
> > > > 
> > > > Sorry, for me it is not that clear. Even if MD5 is depracated we still have
> > > > several applications that use MD5 for various reasons on our boards.
> > > > 
> > > > I ran the test on kernel v6.16 with following file:
> > > > 
> > > > # ls -l avion.au
> > > > -rw-------    1 root     root      12130159 Jan  1  1970 avion.au
> > > > 
> > > > With CONFIG_CRYPTO_MD5_PPC:
> > > > 
> > > > # time md5sum avion.au
> > > > 6513851d6109d42477b20cd56bf57f28  avion.au
> > > > real    0m 1.02s
> > > > user    0m 0.01s
> > > > sys     0m 1.01s
> > > > 
> > > > Without CONFIG_CRYPTO_MD5_PPC:
> > > > 
> > > > # time md5sum avion.au
> > > > 6513851d6109d42477b20cd56bf57f28  avion.au
> > > > real    0m 1.35s
> > > > user    0m 0.01s
> > > > sys     0m 1.34s
> > > > 
> > > > I think the difference is big enough to consider keeping optimised MD5 code.
> > > 
> > > But md5sum doesn't use the kernel's MD5 code.  So it's implausible that
> > > it has any effect on md5sum.  The difference you saw must be due to an
> > > unrelated reason like I/O caching, CPU frequency, etc.  Try running your
> > > test multiple times to eliminate other sources of variation.
> > 
> > md5sum uses the kernel's MD5 code:
> >
> > libkcapi.so.1 => /usr/lib/libkcapi.so.1 (0x6ffa0000)    <==

Oh, I think you used the obscure implementation of md5sum from
libkcapi-tools, instead of the normal md5sum.  Why?  Did you check how
the normal md5sum performs too?  Just doing the calculation in userspace
is much more efficient.

- Eric
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Christophe Leroy 2 months ago

Le 05/08/2025 à 01:09, Eric Biggers a écrit :
> On Mon, Aug 04, 2025 at 10:59:01PM +0000, Eric Biggers wrote:
>> On Mon, Aug 04, 2025 at 09:02:27PM +0200, Christophe Leroy wrote:
>>>
>>>
>>> Le 04/08/2025 à 20:09, Eric Biggers a écrit :
>>>> On Mon, Aug 04, 2025 at 07:42:15PM +0200, Christophe Leroy wrote:
>>>>>
>>>>>
>>>>> Le 03/08/2025 à 22:44, Eric Biggers a écrit :
>>>>>> MD5 is insecure, is no longer commonly used, and has never been
>>>>>> optimized for the most common architectures in the kernel.  Only mips,
>>>>>> powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
>>>>>> only the powerpc one is actually testable in QEMU.  The mips one works
>>>>>> only on Cavium Octeon SoCs.
>>>>>>
>>>>>> Taken together, it's clear that it's time to retire these additional MD5
>>>>>> implementations, and focus maintenance on the MD5 generic C code.
>>>>>
>>>>> Sorry, for me it is not that clear. Even if MD5 is depracated we still have
>>>>> several applications that use MD5 for various reasons on our boards.
>>>>>
>>>>> I ran the test on kernel v6.16 with following file:
>>>>>
>>>>> # ls -l avion.au
>>>>> -rw-------    1 root     root      12130159 Jan  1  1970 avion.au
>>>>>
>>>>> With CONFIG_CRYPTO_MD5_PPC:
>>>>>
>>>>> # time md5sum avion.au
>>>>> 6513851d6109d42477b20cd56bf57f28  avion.au
>>>>> real    0m 1.02s
>>>>> user    0m 0.01s
>>>>> sys     0m 1.01s
>>>>>
>>>>> Without CONFIG_CRYPTO_MD5_PPC:
>>>>>
>>>>> # time md5sum avion.au
>>>>> 6513851d6109d42477b20cd56bf57f28  avion.au
>>>>> real    0m 1.35s
>>>>> user    0m 0.01s
>>>>> sys     0m 1.34s
>>>>>
>>>>> I think the difference is big enough to consider keeping optimised MD5 code.
>>>>
>>>> But md5sum doesn't use the kernel's MD5 code.  So it's implausible that
>>>> it has any effect on md5sum.  The difference you saw must be due to an
>>>> unrelated reason like I/O caching, CPU frequency, etc.  Try running your
>>>> test multiple times to eliminate other sources of variation.
>>>
>>> md5sum uses the kernel's MD5 code:
>>>
>>> libkcapi.so.1 => /usr/lib/libkcapi.so.1 (0x6ffa0000)    <==
> 
> Oh, I think you used the obscure implementation of md5sum from
> libkcapi-tools, instead of the normal md5sum.  Why?  Did you check how
> the normal md5sum performs too?  Just doing the calculation in userspace
> is much more efficient.

Calculation in userspace is less efficient on my board:

# time md5sum avion.au
6513851d6109d42477b20cd56bf57f28  avion.au
real    0m 1.87s
user    0m 1.51s
sys     0m 0.35s

Christophe
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Christophe Leroy 2 months ago

Le 05/08/2025 à 00:59, Eric Biggers a écrit :
> On Mon, Aug 04, 2025 at 09:02:27PM +0200, Christophe Leroy wrote:
>>
>>
>> Le 04/08/2025 à 20:09, Eric Biggers a écrit :
>>> On Mon, Aug 04, 2025 at 07:42:15PM +0200, Christophe Leroy wrote:
>>>>
>>>>
>>>> Le 03/08/2025 à 22:44, Eric Biggers a écrit :
>>>>> MD5 is insecure, is no longer commonly used, and has never been
>>>>> optimized for the most common architectures in the kernel.  Only mips,
>>>>> powerpc, and sparc have optimized MD5 code in the kernel.  Of these,
>>>>> only the powerpc one is actually testable in QEMU.  The mips one works
>>>>> only on Cavium Octeon SoCs.
>>>>>
>>>>> Taken together, it's clear that it's time to retire these additional MD5
>>>>> implementations, and focus maintenance on the MD5 generic C code.
>>>>
>>>> Sorry, for me it is not that clear. Even if MD5 is depracated we still have
>>>> several applications that use MD5 for various reasons on our boards.
>>>>
>>>> I ran the test on kernel v6.16 with following file:
>>>>
>>>> # ls -l avion.au
>>>> -rw-------    1 root     root      12130159 Jan  1  1970 avion.au
>>>>
>>>> With CONFIG_CRYPTO_MD5_PPC:
>>>>
>>>> # time md5sum avion.au
>>>> 6513851d6109d42477b20cd56bf57f28  avion.au
>>>> real    0m 1.02s
>>>> user    0m 0.01s
>>>> sys     0m 1.01s
>>>>
>>>> Without CONFIG_CRYPTO_MD5_PPC:
>>>>
>>>> # time md5sum avion.au
>>>> 6513851d6109d42477b20cd56bf57f28  avion.au
>>>> real    0m 1.35s
>>>> user    0m 0.01s
>>>> sys     0m 1.34s
>>>>
>>>> I think the difference is big enough to consider keeping optimised MD5 code.
>>>
>>> But md5sum doesn't use the kernel's MD5 code.  So it's implausible that
>>> it has any effect on md5sum.  The difference you saw must be due to an
>>> unrelated reason like I/O caching, CPU frequency, etc.  Try running your
>>> test multiple times to eliminate other sources of variation.
>>
>> md5sum uses the kernel's MD5 code:
> 
> What?  That's crazy.  Userspace MD5 code would be faster and more
> reliable.  No need to make syscalls, transfer data to and from the
> kernel, have an external dependency, etc.  Is this the coreutils md5sum?
> We need to get this reported and fixed.


Content of files is already buffered inside the kernel. likcapi doesn't 
tranfer data, it uses splice().

As far as I know, coreutil is not able to use the TALITOS Security 
engine we have on the mpc885 and mpc8321 microcontroleurs. We primarily 
use libkcapi for that.

In order to keep things consistant, we use the same userspace on boards 
which don't have a security engine, ie the mpc866, we rely on the kernel 
providing an optimised software implementation fallback.

Christophe
Re: [PATCH 3/7] crypto: powerpc/md5 - Remove PowerPC optimized MD5 code
Posted by Eric Biggers 2 months ago
On Tue, Aug 05, 2025 at 08:27:14AM +0200, Christophe Leroy wrote:
> > What?  That's crazy.  Userspace MD5 code would be faster and more
> > reliable.  No need to make syscalls, transfer data to and from the
> > kernel, have an external dependency, etc.  Is this the coreutils md5sum?
> > We need to get this reported and fixed.
> 
> Content of files is already buffered inside the kernel. likcapi doesn't
> tranfer data, it uses splice().
> 
> As far as I know, coreutil is not able to use the TALITOS Security engine we
> have on the mpc885 and mpc8321 microcontroleurs. We primarily use libkcapi
> for that.
> 
> In order to keep things consistant, we use the same userspace on boards
> which don't have a security engine, ie the mpc866, we rely on the kernel
> providing an optimised software implementation fallback.

Even if the data is transferred in a zero-copy manner, the AF_ALG
version still has a lot of overhead from system calls.  Running the
exact same PPC optimized code in userspace would of course be faster.

Now, it sounds like you don't have the PPC optimized MD5 code in
userspace, and that's why you're measuring the AF_ALG based md5sum to be
faster.  But that's just the wrong design, and it does not give optimal
performance.  The fact that the kernel allows access to software code
via AF_ALG is basically a bug.

For now we'll keep the PPC optimized MD5 around in the kernel to
accomodate userspace code that is depending on this bug, but for future
reference we don't add new software implementations purely for use by
userspace.  (And BTW this is nothing new; this has already been the
policy for many years.)  Please work to bring the optimized code you
need into userspace.  For example contribute it to OpenSSL.

- Eric
Crypto use cases (was: Remove PowerPC optimized MD5 code)
Posted by Simon Richter 2 months ago
Hi,

On 8/5/25 07:59, Eric Biggers wrote:

>> md5sum uses the kernel's MD5 code:

> What?  That's crazy.  Userspace MD5 code would be faster and more
> reliable.  No need to make syscalls, transfer data to and from the
> kernel, have an external dependency, etc.  Is this the coreutils md5sum?
> We need to get this reported and fixed.

The userspace API allows zero-copy transfers from userspace, and AFAIK 
also directly operating on files without ever transferring the data to 
userspace (so we save one copy).

Userspace requests are also where the asynchronous hardware offload 
units get to chomp on large blocks of data while the CPU is doing 
something else:

$ time dd if=test.bin of=/dev/zero bs=1G     # warm up caches
real    0m1.541s
user    0m0.000s
sys     0m0.732s

$ time gzip -9 <test.bin >test.bin.gz        # compress with the CPU
real    2m57.789s
user    2m55.986s
sys     0m1.508s

$ time ./gzfht_test test.bin                 # compress with NEST unit
real    0m3.207s
user    0m0.584s
sys     0m2.487s

$ time gzip -d <test.bin.nx.gz >test.bin.nx  # decompress with CPU
real    1m0.103s
user    0m57.990s
sys     0m1.878s

$ time ./gunz_test test.bin.gz               # decompress with NEST unit
real    0m2.722s
user    0m0.200s
sys     0m1.872s

That's why I'm objecting to measuring the general usefulness of hardware 
crypto units by the standards of fscrypt, which has an artificial 
limitation of never submitting blocks larger than 4kB: there are other 
use cases that don't have that limitation, and where the overhead is 
negligible because it is incurred only once for a few gigabytes of data.

That's why I suggested changing from a priority field to "speed" and 
"overhead" fields, and calculate priority for each application as 
(size/speed+overhead) -- smallest number wins, size is what the 
application expects to use as the typical request size (which for 
fscrypt and IPsec is on the small side, so it would always select the 
CPU unless there was a low-overhead offload engine available)

This probably needs some adjustment to allow selecting a low-power 
implementation (e.g. on mobile, I'd want to use offloading for fscrypt 
even if it is slower), and model request batching which reduces the 
overhead in a busy system, but it should be a good start.

    Simon
Re: Crypto use cases (was: Remove PowerPC optimized MD5 code)
Posted by Eric Biggers 2 months ago
On Tue, Aug 05, 2025 at 01:49:31PM +0900, Simon Richter wrote:
> Hi,
> 
> On 8/5/25 07:59, Eric Biggers wrote:
> 
> > > md5sum uses the kernel's MD5 code:
> 
> > What?  That's crazy.  Userspace MD5 code would be faster and more
> > reliable.  No need to make syscalls, transfer data to and from the
> > kernel, have an external dependency, etc.  Is this the coreutils md5sum?
> > We need to get this reported and fixed.
> 
> The userspace API allows zero-copy transfers from userspace, and AFAIK also
> directly operating on files without ever transferring the data to userspace
> (so we save one copy).
> 
> Userspace requests are also where the asynchronous hardware offload units
> get to chomp on large blocks of data while the CPU is doing something else:
> 
> $ time dd if=test.bin of=/dev/zero bs=1G     # warm up caches
> real    0m1.541s
> user    0m0.000s
> sys     0m0.732s
> 
> $ time gzip -9 <test.bin >test.bin.gz        # compress with the CPU
> real    2m57.789s
> user    2m55.986s
> sys     0m1.508s
> 
> $ time ./gzfht_test test.bin                 # compress with NEST unit
> real    0m3.207s
> user    0m0.584s
> sys     0m2.487s
> 
> $ time gzip -d <test.bin.nx.gz >test.bin.nx  # decompress with CPU
> real    1m0.103s
> user    0m57.990s
> sys     0m1.878s
> 
> $ time ./gunz_test test.bin.gz               # decompress with NEST unit
> real    0m2.722s
> user    0m0.200s
> sys     0m1.872s
> 
> That's why I'm objecting to measuring the general usefulness of hardware
> crypto units by the standards of fscrypt, which has an artificial limitation
> of never submitting blocks larger than 4kB: there are other use cases that
> don't have that limitation, and where the overhead is negligible because it
> is incurred only once for a few gigabytes of data.
> 
> That's why I suggested changing from a priority field to "speed" and
> "overhead" fields, and calculate priority for each application as
> (size/speed+overhead) -- smallest number wins, size is what the application
> expects to use as the typical request size (which for fscrypt and IPsec is
> on the small side, so it would always select the CPU unless there was a
> low-overhead offload engine available)
> 
> This probably needs some adjustment to allow selecting a low-power
> implementation (e.g. on mobile, I'd want to use offloading for fscrypt even
> if it is slower), and model request batching which reduces the overhead in a
> busy system, but it should be a good start.

What does this have to do with this thread, which is about the PowerPC
optimized MD5 code?

- Eric
Re: Crypto use cases
Posted by Simon Richter 2 months ago
Hi,

On 8/5/25 13:58, Eric Biggers wrote:

> What does this have to do with this thread, which is about the PowerPC
> optimized MD5 code?

Hence the new subject. It is still related to removal of code, but asks 
about the bigger picture.

The code removal changes you've been pushing lately absolutely make 
sense in the context of "the crypto subsystem is for internal use by the 
kernel, and all known users will only ever submit small work items."

However, there is also the userspace API, and hardware accelerators also 
register with the crypto subsystem, so it is *also* the way for 
userspace to use specialized hardware.

If these are separate, then it makes sense to acknowledge that the 
kernel will never use asynchronous transforms for anything, simplify the 
internal API, and move all the hardware support to a separate registry 
that is for userspace applications only.

However if we don't want separate registries, then it makes no sense for 
the kernel to set policy for userspace here; it should offer all the 
alternatives. The kernel can, and should, define policy for itself, and 
I wholeheartedly agree that offloading small requests does not make 
sense, unless we're on battery power.

I'm also not convinced that fscrypt cannot ever learn to submit a single 
large request or a large batch of small requests if it is asked to 
decrypt a large file, so in my opinion the common registry makes more sense.

Making sure that hardware support actually works and is tested is the 
responsibility of the driver and port maintainers. We understand that 
subsystem maintainers do not have all the hardware available, but the 
same goes for all the other subsystems -- the DRM maintainers routinely 
merge drivers for hardware they do not own, because the changes come 
from people who *do* own the hardware, and have tested the changes.

The latter is a project management issue, mostly: if there is a lack of 
working relationships with driver and port maintainers, then that needs 
to be fixed, not assumed to be an unchangeable part of the environment 
that technical decisions are made in.

    Simon
Re: Crypto use cases
Posted by Eric Biggers 2 months ago
On Tue, Aug 05, 2025 at 04:17:49PM +0900, Simon Richter wrote:
> Hi,
> 
> On 8/5/25 13:58, Eric Biggers wrote:
> 
> > What does this have to do with this thread, which is about the PowerPC
> > optimized MD5 code?
> 
> Hence the new subject. It is still related to removal of code, but asks
> about the bigger picture.
> 
> The code removal changes you've been pushing lately absolutely make sense in
> the context of "the crypto subsystem is for internal use by the kernel, and
> all known users will only ever submit small work items."
> 
> However, there is also the userspace API, and hardware accelerators also
> register with the crypto subsystem, so it is *also* the way for userspace to
> use specialized hardware.
> 
> If these are separate, then it makes sense to acknowledge that the kernel
> will never use asynchronous transforms for anything, simplify the internal
> API, and move all the hardware support to a separate registry that is for
> userspace applications only.

I think you're grouping together different things that aren't actually
very related.  For example this patch series proposed removing some
software code, not async drivers.  The only async driver I removed
recently is one of the crc32c ones.

Doesn't lib/crypto/ largely accomplish what you're thinking of as
"separate registries"?  As we migrate more in-kernel users to
lib/crypto/, the old-school crypto API becomes more focused just on
accomodating AF_ALG users.  However, that is, and will continue to be, a
long process.  In the mean time we still have many in-kernel users of
the old-school crypto API to decide what to do with.

> I'm also not convinced that fscrypt cannot ever learn to submit a single
> large request or a large batch of small requests if it is asked to decrypt a
> large file, so in my opinion the common registry makes more sense.

It's certainly never going to be the entire file or a single batch.
That's just not how filesystems work.

As for offloading entire I/O requests, fscrypt already supports that,
just for inline crypto engines instead of the old-school separate ones.
Inline encryption is fundamentally much more efficient.  The same
efficiency can't be achieved with a separate engine, even with a large
batch.  If people would like to continue to explore that approach
anyway, they're free to do so, but it's not a promising approach, at
least not on any of the platforms I examined.  (And pointing to
irrelevant data, like for compression, is not helpful.)

> Making sure that hardware support actually works and is tested is the
> responsibility of the driver and port maintainers. We understand that
> subsystem maintainers do not have all the hardware available, but the same
> goes for all the other subsystems -- the DRM maintainers routinely merge
> drivers for hardware they do not own, because the changes come from people
> who *do* own the hardware, and have tested the changes.
> 
> The latter is a project management issue, mostly: if there is a lack of
> working relationships with driver and port maintainers, then that needs to
> be fixed, not assumed to be an unchangeable part of the environment that
> technical decisions are made in.

This significantly understates the challenge that exists with a large
number of drivers, including orphaned drivers and architectures, as well
as the high standard of correctness that cryptography code needs to have
vs. more everyday device drivers.  And also the fact that async offload
drivers are fundamentally much harder to get correct than software code.
The odds are really stacked against everyone here, and I think calling
it a "project management issue" largely misses the point.

- Eric