[PATCH v3 2/2] crypto: Add Zhaoxin PadLock Hash Engine support for SHA384/SHA512

Tony W Wang-oc posted 2 patches 11 months ago
[PATCH v3 2/2] crypto: Add Zhaoxin PadLock Hash Engine support for SHA384/SHA512
Posted by Tony W Wang-oc 11 months ago
Zhaoxin CPUs implement SHA (Secure Hash Algorithm) as CPU instructions,
including SHA1, SHA256, SHA384 and SHA512, which conform to the Secure
Hash Algorithms specified by FIPS 180-3.

Zhaoxin CPU's SHA1/SHA256 implementation is compatible with VIA's
SHA1/SHA256, so add Zhaoxin CPU's SHA384/SHA512 support in padlock-sha.c.

With SHA implemented in hardware instead of software, applications can
be developed with higher performance, more security and more
flexibility.

The table below summarizes tcrypt test results for the different crypto
algorithm drivers on the Zhaoxin KH-40000 platform:
---------------------------------------------------------------------------
tcrypt     driver   16*    64      256     1024    2048    4096    8192
---------------------------------------------------------------------------
           PadLock** 442.80 1309.21 3257.53 5221.56 5813.45 6136.39 6264.50***
403:SHA1   generic** 341.44 813.27  1458.98 1818.03 1896.60 1940.71 1939.06
           ratio    1.30   1.61    2.23    2.87    3.07    3.16    3.23
---------------------------------------------------------------------------
           PadLock  451.70 1313.65 2958.71 4658.55 5109.16 5359.08 5459.13
404:SHA256 generic  202.62 463.55  845.01  1070.50 1117.51 1144.79 1155.68
           ratio    2.23   2.83    3.50    4.35    4.57    4.68    4.72
---------------------------------------------------------------------------
           PadLock  350.90 1406.42 3166.16 5736.39 6627.77 7182.01 7429.18
405:SHA384 generic  161.76 654.88  979.06  1350.56 1423.08 1496.57 1513.12
           ratio    2.17   2.15    3.23    4.25    4.66    4.80    4.91
---------------------------------------------------------------------------
           PadLock  334.49 1394.71 3159.93 5728.86 6625.33 7169.23 7407.80
406:SHA512 generic  161.80 653.84  979.42  1351.41 1444.14 1495.35 1518.43
           ratio    2.07   2.13    3.23    4.24    4.59    4.79    4.88
---------------------------------------------------------------------------
*: The length of each data block to be processed by one complete SHA
   sequence, namely one INIT, multiple UPDATEs and one FINAL.
**: Crypto algorithm driver used by tcrypt; "PadLock" represents padlock-sha
   while "generic" represents the generic software SHA driver.
***: The speed at which each crypto algorithm driver processes data blocks
   of different lengths; the unit is Mb/s.

The ratios in the table show that SHA implemented by the padlock-sha
driver performs much better than the generic software implementations
of sha1/sha256/sha384/sha512.

Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
---
 drivers/crypto/Kconfig       |  10 +-
 drivers/crypto/padlock-sha.c | 200 ++++++++++++++++++++++++++++++++++-
 2 files changed, 202 insertions(+), 8 deletions(-)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 19ab145f912e..0e97be36e037 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -39,15 +39,19 @@ config CRYPTO_DEV_PADLOCK_AES
 	  called padlock-aes.
 
 config CRYPTO_DEV_PADLOCK_SHA
-	tristate "PadLock driver for SHA1 and SHA256 algorithms"
+	tristate "PadLock driver for SHA1/SHA256/SHA384/SHA512 algorithms"
+	depends on X86 && !UML
 	depends on CRYPTO_DEV_PADLOCK
 	select CRYPTO_HASH
 	select CRYPTO_SHA1
 	select CRYPTO_SHA256
+	select CRYPTO_SHA512
 	help
-	  Use VIA PadLock for SHA1/SHA256 algorithms.
+	  Use PadLock for SHA1/SHA256 algorithms.
+	  Available in VIA C7 and newer processors and in Zhaoxin processors.
 
-	  Available in VIA C7 and newer processors.
+	  Use PadLock for SHA384/SHA512 algorithms.
+	  Available in Zhaoxin processors.
 
 	  If unsure say M. The compiled module will be
 	  called padlock-sha.
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index 6865c7f1fc1a..80af906184e2 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -5,6 +5,10 @@
  * Support for VIA PadLock hardware crypto engine.
  *
  * Copyright (c) 2006  Michal Ludvig <michal@logix.cz>
+ *
+ * Add SHA384/SHA512 support for Zhaoxin processors.
+ *
+ * Copyright (c) 2025  George Xue <georgexue@zhaoxin.com>
  */
 
 #include <crypto/internal/hash.h>
@@ -434,6 +438,123 @@ static int padlock_sha256_final_nano(struct shash_desc *desc, u8 *out)
 	return 0;
 }
 
+static inline void padlock_output_block_512(uint64_t *src, uint64_t *dst, size_t count)
+{
+	for (; count; count--, src++, dst++)	/* copy @count 64-bit words, byte-swapping each */
+		*dst = swab64(*src);
+}
+
+static int padlock_sha384_init(struct shash_desc *desc)
+{
+	/* SHA-384 initial hash values; every member not named here is zero. */
+	static const struct sha512_state initial = {
+		.state = { SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
+			   SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7 },
+	};
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = initial;
+	return 0;
+}
+
+static int padlock_sha512_init(struct shash_desc *desc)
+{
+	/* SHA-512 initial hash values; every member not named here is zero. */
+	static const struct sha512_state initial = {
+		.state = { SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+			   SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7 },
+	};
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = initial;
+	return 0;
+}
+
+/* Hash @len bytes at @data: whole blocks go to the SHA instruction, the tail into sctx->buf. */
+static int padlock_sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial, done;
+	const u8 *src;
+	u8 buf[SHA512_BLOCK_SIZE];	/* NOTE(review): confirm the insn has no alignment needs */
+	u8 *dst = &buf[0];
+
+	partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+	/* 128-bit byte counter: carry into count[1] when count[0] wraps. */
+	sctx->count[0] += len;
+	if (sctx->count[0] < len)
+		sctx->count[1]++;
+
+	done = 0;
+	src = data;
+	/* Work on a stack copy of the hash state; it is copied back below. */
+	memcpy(dst, sctx->state, SHA512_DIGEST_SIZE);
+
+	if ((partial + len) >= SHA512_BLOCK_SIZE) {
+		/* Append the bytes in state's buffer to a block to handle */
+		if (partial) {
+			/* Unsigned wrap is intended: done + BLOCK_SIZE == bytes still missing. */
+			done = -partial;
+			memcpy(sctx->buf + partial, data, done + SHA512_BLOCK_SIZE);
+			src = sctx->buf;
+			/* "memory": *src and *dst are accessed without being asm operands. */
+			asm volatile(".byte 0xf3, 0x0f, 0xa6, 0xe0" : "+S"(src), "+D"(dst)
+				     : "c"(1UL)
+				     : "memory");
+			done += SHA512_BLOCK_SIZE;
+			src = data + done;
+		}
+
+		/* Process the left bytes from input data */
+		if (len - done >= SHA512_BLOCK_SIZE) {
+			asm volatile(".byte 0xf3, 0x0f, 0xa6, 0xe0" : "+S"(src), "+D"(dst)
+				     : "c"((unsigned long)((len - done) / SHA512_BLOCK_SIZE))
+				     : "memory");
+			done += ((len - done) - (len - done) % SHA512_BLOCK_SIZE);
+			src = data + done;
+		}
+		partial = 0;
+	}
+
+	memcpy(sctx->state, dst, SHA512_DIGEST_SIZE);
+	memcpy(sctx->buf + partial, src, len - done);
+
+	return 0;
+}
+
+/* Pad the message, append its bit length and write the byte-swapped digest to @out. */
+static int padlock_sha512_final(struct shash_desc *desc, u8 *out)
+{
+	const int bit_offset = SHA512_BLOCK_SIZE - sizeof(__be64[2]);
+	struct sha512_state *state = shash_desc_ctx(desc);
+	unsigned int partial = state->count[0] % SHA512_BLOCK_SIZE, padlen;
+	/* Shared by SHA384 and SHA512; only the number of digest bytes differs. */
+	int dgst_size = crypto_shash_digestsize(desc->tfm);
+	/*
+	 * 0x80 then zeroes. Kept on the stack: a static buffer would be
+	 * rewritten here while a concurrent final() is still hashing from it.
+	 */
+	u8 padding[SHA512_BLOCK_SIZE] = { 0x80 };
+	__be64 bits[2];
+
+	/* Snapshot the bit count (big endian) before update() advances count[]. */
+	bits[0] = cpu_to_be64(state->count[1] << 3 | state->count[0] >> 61);
+	bits[1] = cpu_to_be64(state->count[0] << 3);
+
+	/* Pad up to the last 16 bytes of a block, which hold the length field. */
+	padlen = (partial < bit_offset) ? (bit_offset - partial) :
+					  ((SHA512_BLOCK_SIZE + bit_offset) - partial);
+	padlock_sha512_update(desc, padding, padlen);
+
+	/* Append length field bytes */
+	padlock_sha512_update(desc, (const u8 *)bits, sizeof(__be64[2]));
+
+	/* Swap to output */
+	padlock_output_block_512(state->state, (uint64_t *)out, dgst_size / sizeof(uint64_t));
+
+	return 0;
+}
+
 static int padlock_sha_export_nano(struct shash_desc *desc,
 				void *out)
 {
@@ -490,6 +611,42 @@ static struct shash_alg sha256_alg_nano = {
 	}
 };
 
+/* "sha384" shash descriptor: SHA-384 init with the shared SHA-512 update/final handlers. */
+static struct shash_alg sha384_alg = {
+	.base.cra_name        = "sha384",
+	.base.cra_driver_name = "sha384-padlock-zhaoxin",
+	.base.cra_priority    = PADLOCK_CRA_PRIORITY,
+	.base.cra_blocksize   = SHA384_BLOCK_SIZE,
+	.base.cra_module      = THIS_MODULE,
+	.digestsize           = SHA384_DIGEST_SIZE,
+	.descsize             = sizeof(struct sha512_state),
+	.statesize            = sizeof(struct sha512_state),
+	.init                 = padlock_sha384_init,
+	.update               = padlock_sha512_update,
+	.final                = padlock_sha512_final,
+	.export               = padlock_sha_export_nano,
+	.import               = padlock_sha_import_nano,
+};
+
+
+/* "sha512" shash descriptor: SHA-512 init with the shared SHA-512 update/final handlers. */
+static struct shash_alg sha512_alg = {
+	.base.cra_name        = "sha512",
+	.base.cra_driver_name = "sha512-padlock-zhaoxin",
+	.base.cra_priority    = PADLOCK_CRA_PRIORITY,
+	.base.cra_blocksize   = SHA512_BLOCK_SIZE,
+	.base.cra_module      = THIS_MODULE,
+	.digestsize           = SHA512_DIGEST_SIZE,
+	.descsize             = sizeof(struct sha512_state),
+	.statesize            = sizeof(struct sha512_state),
+	.init                 = padlock_sha512_init,
+	.update               = padlock_sha512_update,
+	.final                = padlock_sha512_final,
+	.export               = padlock_sha_export_nano,
+	.import               = padlock_sha_import_nano,
+};
+
+
 static const struct x86_cpu_id padlock_sha_ids[] = {
 	X86_MATCH_FEATURE(X86_FEATURE_PHE, NULL),
 	{}
@@ -502,12 +659,16 @@ static int __init padlock_init(void)
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	struct shash_alg *sha1;
 	struct shash_alg *sha256;
+	struct shash_alg *sha384;
+	struct shash_alg *sha512;
 
 	if (!x86_match_cpu(padlock_sha_ids) || !boot_cpu_has(X86_FEATURE_PHE_EN))
 		return -ENODEV;
 
-	/* Register the newly added algorithm module if on *
-	* VIA Nano processor, or else just do as before */
+	/*
+	 * Register the newly added algorithm module if on
+	 * Zhaoxin/VIA Nano processor, or else just do as before
+	 */
 	if (c->x86_model < 0x0f) {
 		sha1 = &sha1_alg;
 		sha256 = &sha256_alg;
@@ -524,15 +685,34 @@ static int __init padlock_init(void)
 	if (rc)
 		goto out_unreg1;
 
-	printk(KERN_NOTICE PFX "Using VIA PadLock ACE for SHA1/SHA256 algorithms.\n");
+	printk(KERN_NOTICE PFX "Using PadLock ACE for SHA1/SHA256 algorithms.\n");
+
+	if (boot_cpu_has(X86_FEATURE_PHE2_EN)) {
+		sha384 = &sha384_alg;
+		sha512 = &sha512_alg;
+
+		rc = crypto_register_shash(sha384);
+		if (rc)
+			goto out_unreg2;
+
+		rc = crypto_register_shash(sha512);
+		if (rc)
+			goto out_unreg3;
+
+		printk(KERN_NOTICE PFX "Using PadLock ACE for SHA384/SHA512 algorithms.\n");
+	}
 
 	return 0;
 
+out_unreg3:
+	crypto_unregister_shash(sha384);
+out_unreg2:
+	crypto_unregister_shash(sha256);
 out_unreg1:
 	crypto_unregister_shash(sha1);
 
 out:
-	printk(KERN_ERR PFX "VIA PadLock SHA1/SHA256 initialization failed.\n");
+	printk(KERN_ERR PFX "PadLock SHA1/SHA256/SHA384/SHA512 initialization failed.\n");
 	return rc;
 }
 
@@ -543,6 +723,11 @@ static void __exit padlock_fini(void)
 	if (c->x86_model >= 0x0f) {
 		crypto_unregister_shash(&sha1_alg_nano);
 		crypto_unregister_shash(&sha256_alg_nano);
+
+		if (boot_cpu_has(X86_FEATURE_PHE2_EN)) {
+			crypto_unregister_shash(&sha384_alg);
+			crypto_unregister_shash(&sha512_alg);
+		}
 	} else {
 		crypto_unregister_shash(&sha1_alg);
 		crypto_unregister_shash(&sha256_alg);
@@ -552,11 +737,16 @@ static void __exit padlock_fini(void)
 module_init(padlock_init);
 module_exit(padlock_fini);
 
-MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support.");
+MODULE_DESCRIPTION("PadLock SHA1/SHA256/SHA384/SHA512 algorithms support.");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Michal Ludvig");
+MODULE_AUTHOR("George Xue <georgexue@zhaoxin.com>");
 
 MODULE_ALIAS_CRYPTO("sha1-all");
 MODULE_ALIAS_CRYPTO("sha256-all");
+MODULE_ALIAS_CRYPTO("sha384-all");
+MODULE_ALIAS_CRYPTO("sha512-all");
 MODULE_ALIAS_CRYPTO("sha1-padlock");
 MODULE_ALIAS_CRYPTO("sha256-padlock");
+MODULE_ALIAS_CRYPTO("sha384-padlock");
+MODULE_ALIAS_CRYPTO("sha512-padlock");
-- 
2.25.1