VAES+AVX2 optimized implementation of AES-GCM

[PATCH 7/8] crypto: x86/aes-gcm - optimize AVX512 precomputation of H^2 from H^1

Posted by Eric Biggers 4 months, 1 week ago

Squaring in GF(2^128) requires fewer instructions than a generic
multiplication in GF(2^128).  Take advantage of this when computing H^2
from H^1 in aes_gcm_precompute_vaes_avx512().

Note that aes_gcm_precompute_vaes_avx2() already uses this optimization.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/x86/crypto/aes-gcm-vaes-avx512.S | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/crypto/aes-gcm-vaes-avx512.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S
index 3cf0945a25170..5c8301d275c66 100644
--- a/arch/x86/crypto/aes-gcm-vaes-avx512.S
+++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S
@@ -258,10 +258,23 @@
 	vpclmulqdq	$0x01, \mi, \gfpoly, \t0
 	vpshufd		$0x4e, \mi, \mi
 	vpternlogd	$0x96, \t0, \mi, \hi
 .endm
 
+// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
+// squares \a.  It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
+.macro	_ghash_square	a, dst, gfpoly, t0, t1
+	vpclmulqdq	$0x00, \a, \a, \t0	  // LO = a_L * a_L
+	vpclmulqdq	$0x11, \a, \a, \dst	  // HI = a_H * a_H
+	vpclmulqdq	$0x01, \t0, \gfpoly, \t1  // LO_L*(x^63 + x^62 + x^57)
+	vpshufd		$0x4e, \t0, \t0		  // Swap halves of LO
+	vpxord		\t0, \t1, \t1		  // Fold LO into MI
+	vpclmulqdq	$0x01, \t1, \gfpoly, \t0  // MI_L*(x^63 + x^62 + x^57)
+	vpshufd		$0x4e, \t1, \t1		  // Swap halves of MI
+	vpternlogd	$0x96, \t0, \t1, \dst	  // Fold MI into HI
+.endm
+
 // void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
 //
 // Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
 // initialize |key->h_powers| and |key->padding|.
 SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
@@ -335,12 +348,11 @@ SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
 	// Note that as with H^1, all higher key powers also need an extra
 	// factor of x^-1 (or x using the natural interpretation).  Nothing
 	// special needs to be done to make this happen, though: H^1 * H^1 would
 	// end up with two factors of x^-1, but the multiplication consumes one.
 	// So the product H^2 ends up with the desired one factor of x^-1.
-	_ghash_mul	H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \
-			%xmm0, %xmm1, %xmm2
+	_ghash_square	H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1
 
 	// Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
 	vinserti128	$1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
 	vinserti128	$1, H_INC_XMM, H_INC_YMM, H_INC_YMM
 
-- 
2.51.0

[PATCH 1/8] crypto: x86/aes-gcm - add VAES+AVX2 optimized code
[PATCH 2/8] crypto: x86/aes-gcm - remove VAES+AVX10/256 optimized code
[PATCH 3/8] crypto: x86/aes-gcm - rename avx10 and avx10_512 to avx512
[PATCH 4/8] crypto: x86/aes-gcm - clean up AVX512 code to assume 512-bit vectors
[PATCH 5/8] crypto: x86/aes-gcm - reorder AVX512 precompute and aad_update functions
[PATCH 6/8] crypto: x86/aes-gcm - revise some comments in AVX512 code
[PATCH 7/8] crypto: x86/aes-gcm - optimize AVX512 precomputation of H^2 from H^1
[PATCH 8/8] crypto: x86/aes-gcm - optimize long AAD processing with AVX512