Optimize the AES library with x86 AES-NI instructions.

The relevant existing assembly functions, aesni_set_key(), aesni_enc(),
and aesni_dec(), are a bit difficult to extract into the library:

- They're coupled to the code for the AES modes.
- They operate on struct crypto_aes_ctx, whereas the AES library now
  uses different structs.
- They assume the key is 16-byte aligned. The AES library only
  *prefers* 16-byte alignment; it doesn't require it.

Moreover, they're not all that great in the first place:

- They use unrolled loops, which isn't a great choice on x86.
- They use the 'aeskeygenassist' instruction, which is unnecessary, is
  slow on Intel CPUs, and forces the loop to be unrolled.
- They have special code for AES-192 key expansion, despite that being
  of little use; AES-128 and AES-256 are the key sizes used in practice.

These are small functions anyway. Therefore, I opted to just write
replacements for them in the library. They address all the above
issues.
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
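For reference, here is a rough C model of one AES-128 key expansion step
(FIPS 197), which is what the new _gen_round_key macro computes with
pshufb + aesenclast + a prefix XOR-sum. It's illustrative only and not
part of the patch; aes128_next_round_key() and subw() are made-up names,
and subw() is just a placeholder for applying the AES S-box to each byte
of a 32-bit word, not a real helper:

#include <linux/bitops.h>	/* ror32() */
#include <linux/types.h>

u32 subw(u32 w);	/* placeholder: apply the AES S-box to each byte */

static void aes128_next_round_key(u32 next[4], const u32 prev[4], u32 rcon)
{
	/* rcon ^ SubWord(RotWord(last word of the previous round key)) */
	u32 t = subw(ror32(prev[3], 8)) ^ rcon;

	/*
	 * Each output word is t XORed with a prefix XOR-sum of the
	 * previous round key's words, hence the broadcast done with
	 * pshufb + aesenclast, followed by _prefix_sum, in the assembly.
	 */
	next[0] = prev[0] ^ t;
	next[1] = prev[0] ^ prev[1] ^ t;
	next[2] = prev[0] ^ prev[1] ^ prev[2] ^ t;
	next[3] = prev[0] ^ prev[1] ^ prev[2] ^ prev[3] ^ t;
}
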
lib/crypto/Kconfig | 1 +
lib/crypto/Makefile | 1 +
lib/crypto/x86/aes-aesni.S | 261 +++++++++++++++++++++++++++++++++++++
lib/crypto/x86/aes.h | 85 ++++++++++++
4 files changed, 348 insertions(+)
create mode 100644 lib/crypto/x86/aes-aesni.S
create mode 100644 lib/crypto/x86/aes.h

diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 222887c04240..e3ee31217988 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -19,10 +19,11 @@ config CRYPTO_LIB_AES_ARCH
default y if PPC && (SPE || (PPC64 && VSX))
default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if SPARC64
+ default y if X86

config CRYPTO_LIB_AESCFB
tristate
select CRYPTO_LIB_AES
select CRYPTO_LIB_UTILS
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 761d52d91f92..725eef05b758 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -50,10 +50,11 @@ OBJECT_FILES_NON_STANDARD_powerpc/aesp8-ppc.o := y
endif # !CONFIG_SPE
endif # CONFIG_PPC

libaes-$(CONFIG_RISCV) += riscv/aes-riscv64-zvkned.o
libaes-$(CONFIG_SPARC) += sparc/aes_asm.o
+libaes-$(CONFIG_X86) += x86/aes-aesni.o
endif # CONFIG_CRYPTO_LIB_AES_ARCH

################################################################################

obj-$(CONFIG_CRYPTO_LIB_AESCFB) += libaescfb.o
diff --git a/lib/crypto/x86/aes-aesni.S b/lib/crypto/x86/aes-aesni.S
new file mode 100644
index 000000000000..b8c3e104a3be
--- /dev/null
+++ b/lib/crypto/x86/aes-aesni.S
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// AES block cipher using AES-NI instructions
+//
+// Copyright 2026 Google LLC
+//
+// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
+// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
+#include <linux/linkage.h>
+
+.section .rodata
+#ifdef __x86_64__
+#define RODATA(label) label(%rip)
+#else
+#define RODATA(label) label
+#endif
+
+ // A mask for pshufb that extracts the last dword, rotates it right by 8
+ // bits, and copies the result to all four dwords.
+.p2align 4
+.Lmask:
+ .byte 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12
+
+ // The AES round constants, used during key expansion
+.Lrcon:
+ .long 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
+
+.text
+
+// Transform four dwords [a0, a1, a2, a3] in \a into
+// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]. \tmp is a temporary xmm register.
+//
+// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
+// if the temporary register were zero-initialized ahead of time. We instead do
+// it in an easier-to-understand way that doesn't require zero-initialization
+// and avoids the unusual shufps instruction. movdqa is usually "free" anyway.
+.macro _prefix_sum a, tmp
+ movdqa \a, \tmp // [a0, a1, a2, a3]
+ pslldq $4, \a // [0, a0, a1, a2]
+ pxor \tmp, \a // [a0, a0^a1, a1^a2, a2^a3]
+ movdqa \a, \tmp
+ pslldq $8, \a // [0, 0, a0, a0^a1]
+ pxor \tmp, \a // [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
+.endm
+
+.macro _gen_round_key a, b
+ // Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
+ // the last dword of the previous round key (given in \b).
+ //
+ // 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
+ // It is used here solely for the SubBytes and the XOR. The ShiftRows
+ // is a no-op because all four columns are the same here.
+ //
+ // Don't use the 'aeskeygenassist' instruction, since:
+ // - On most Intel CPUs it is microcoded, making it have a much higher
+ // latency and use more execution ports than 'aesenclast'.
+ // - It cannot be used in a loop, since it requires an immediate.
+ // - It doesn't do much more than 'aesenclast' in the first place.
+ movdqa \b, %xmm2
+ pshufb MASK, %xmm2
+ aesenclast RCON, %xmm2
+
+ // XOR in the prefix sum of the four dwords of \a, which is the
+ // previous round key (AES-128) or the first round key in the previous
+ // pair of round keys (AES-256). The result is the next round key.
+ _prefix_sum \a, tmp=%xmm3
+ pxor %xmm2, \a
+
+ // Store the next round key to memory. Also leave it in \a.
+ movdqu \a, (RNDKEYS)
+.endm
+
+.macro _aes_expandkey_aesni is_aes128
+#ifdef __x86_64__
+ // Arguments
+ .set RNDKEYS, %rdi
+ .set INV_RNDKEYS, %rsi
+ .set IN_KEY, %rdx
+
+ // Other local variables
+ .set RCON_PTR, %rcx
+ .set COUNTER, %eax
+#else
+ // Arguments, assuming -mregparm=3
+ .set RNDKEYS, %eax
+ .set INV_RNDKEYS, %edx
+ .set IN_KEY, %ecx
+
+ // Other local variables
+ .set RCON_PTR, %ebx
+ .set COUNTER, %esi
+#endif
+ .set RCON, %xmm6
+ .set MASK, %xmm7
+
+#ifdef __i386__
+ push %ebx
+ push %esi
+#endif
+
+.if \is_aes128
+ // AES-128: the first round key is simply a copy of the raw key.
+ movdqu (IN_KEY), %xmm0
+ movdqu %xmm0, (RNDKEYS)
+.else
+ // AES-256: the first two round keys are simply a copy of the raw key.
+ movdqu (IN_KEY), %xmm0
+ movdqu %xmm0, (RNDKEYS)
+ movdqu 16(IN_KEY), %xmm1
+ movdqu %xmm1, 16(RNDKEYS)
+ add $32, RNDKEYS
+.endif
+
+ // Generate the remaining round keys.
+ movdqa RODATA(.Lmask), MASK
+.if \is_aes128
+ lea RODATA(.Lrcon), RCON_PTR
+ mov $10, COUNTER
+.Lgen_next_aes128_round_key:
+ add $16, RNDKEYS
+ movd (RCON_PTR), RCON
+ pshufd $0x00, RCON, RCON
+ add $4, RCON_PTR
+ _gen_round_key %xmm0, %xmm0
+ dec COUNTER
+ jnz .Lgen_next_aes128_round_key
+.else
+ // AES-256: only the first 7 round constants are needed, so instead of
+ // loading each one from memory, just start by loading [1, 1, 1, 1] and
+ // then generate the rest by doubling.
+ pshufd $0x00, RODATA(.Lrcon), RCON
+ pxor %xmm5, %xmm5 // All-zeroes
+ mov $7, COUNTER
+.Lgen_next_aes256_round_key_pair:
+ // Generate the next AES-256 round key: either the first of a pair of
+ // two, or the last one.
+ _gen_round_key %xmm0, %xmm1
+
+ dec COUNTER
+ jz .Lgen_aes256_round_keys_done
+
+ // Generate the second AES-256 round key of the pair. Compared to the
+ // first, there's no rotation and no XOR of a round constant.
+ pshufd $0xff, %xmm0, %xmm2 // Get four copies of last dword
+ aesenclast %xmm5, %xmm2 // Just does SubBytes
+ _prefix_sum %xmm1, tmp=%xmm3
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 16(RNDKEYS)
+ add $32, RNDKEYS
+ paddd RCON, RCON // RCON <<= 1
+ jmp .Lgen_next_aes256_round_key_pair
+.Lgen_aes256_round_keys_done:
+.endif
+
+ // If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
+ // Inverse Cipher to it. To do that, reverse the standard round keys,
+ // and apply aesimc (InvMixColumn) to each except the first and last.
+ test INV_RNDKEYS, INV_RNDKEYS
+ jz .Ldone\@
+ movdqu (RNDKEYS), %xmm0 // Last standard round key
+ movdqu %xmm0, (INV_RNDKEYS) // => First inverse round key
+.if \is_aes128
+ mov $9, COUNTER
+.else
+ mov $13, COUNTER
+.endif
+.Lgen_next_inv_round_key\@:
+ sub $16, RNDKEYS
+ add $16, INV_RNDKEYS
+ movdqu (RNDKEYS), %xmm0
+ aesimc %xmm0, %xmm0
+ movdqu %xmm0, (INV_RNDKEYS)
+ dec COUNTER
+ jnz .Lgen_next_inv_round_key\@
+ movdqu -16(RNDKEYS), %xmm0 // First standard round key
+ movdqu %xmm0, 16(INV_RNDKEYS) // => Last inverse round key
+
+.Ldone\@:
+#ifdef __i386__
+ pop %esi
+ pop %ebx
+#endif
+ RET
+.endm
+
+// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+// const u8 in_key[AES_KEYSIZE_128]);
+SYM_FUNC_START(aes128_expandkey_aesni)
+ _aes_expandkey_aesni 1
+SYM_FUNC_END(aes128_expandkey_aesni)
+
+// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+// const u8 in_key[AES_KEYSIZE_256]);
+SYM_FUNC_START(aes256_expandkey_aesni)
+ _aes_expandkey_aesni 0
+SYM_FUNC_END(aes256_expandkey_aesni)
+
+.macro _aes_crypt_aesni enc
+#ifdef __x86_64__
+ .set RNDKEYS, %rdi
+ .set NROUNDS, %esi
+ .set OUT, %rdx
+ .set IN, %rcx
+#else
+ // Assuming -mregparm=3
+ .set RNDKEYS, %eax
+ .set NROUNDS, %edx
+ .set OUT, %ecx
+ .set IN, %ebx // Passed on stack
+#endif
+
+#ifdef __i386__
+ push %ebx
+ mov 8(%esp), %ebx
+#endif
+
+ // Zero-th round
+ movdqu (IN), %xmm0
+ movdqu (RNDKEYS), %xmm1
+ pxor %xmm1, %xmm0
+
+ // Normal rounds
+ add $16, RNDKEYS
+ dec NROUNDS
+.Lnext_round\@:
+ movdqu (RNDKEYS), %xmm1
+.if \enc
+ aesenc %xmm1, %xmm0
+.else
+ aesdec %xmm1, %xmm0
+.endif
+ add $16, RNDKEYS
+ dec NROUNDS
+ jne .Lnext_round\@
+
+ // Last round
+ movdqu (RNDKEYS), %xmm1
+.if \enc
+ aesenclast %xmm1, %xmm0
+.else
+ aesdeclast %xmm1, %xmm0
+.endif
+ movdqu %xmm0, (OUT)
+
+#ifdef __i386__
+ pop %ebx
+#endif
+ RET
+.endm
+
+// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
+// u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+SYM_FUNC_START(aes_encrypt_aesni)
+ _aes_crypt_aesni 1
+SYM_FUNC_END(aes_encrypt_aesni)
+
+// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
+// u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+SYM_FUNC_START(aes_decrypt_aesni)
+ _aes_crypt_aesni 0
+SYM_FUNC_END(aes_decrypt_aesni)
diff --git a/lib/crypto/x86/aes.h b/lib/crypto/x86/aes.h
new file mode 100644
index 000000000000..b047dee94f57
--- /dev/null
+++ b/lib/crypto/x86/aes.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES block cipher using AES-NI instructions
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <asm/fpu/api.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes);
+
+void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+ const u8 in_key[AES_KEYSIZE_128]);
+void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+ const u8 in_key[AES_KEYSIZE_256]);
+void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
+ u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
+ u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+
+/*
+ * Expand an AES key using AES-NI if supported and usable, or generic code
+ * otherwise. The expanded key format is compatible between the two cases. The
+ * outputs are @k->rndkeys (required) and @inv_k->inv_rndkeys (optional).
+ *
+ * We could just always use the generic key expansion code. AES key expansion
+ * is usually less performance-critical than AES en/decryption. However,
+ * there's still *some* value in speed here, as well as in the key-independent
+ * execution time that AES-NI provides. So, do use AES-NI to expand AES-128
+ * and AES-256 keys. (Don't bother with AES-192, as it's almost never used.)
+ */
+static void aes_preparekey_arch(union aes_enckey_arch *k,
+ union aes_invkey_arch *inv_k,
+ const u8 *in_key, int key_len, int nrounds)
+{
+ u32 *rndkeys = k->rndkeys;
+ u32 *inv_rndkeys = inv_k ? inv_k->inv_rndkeys : NULL;
+
+ if (static_branch_likely(&have_aes) && key_len != AES_KEYSIZE_192 &&
+ irq_fpu_usable()) {
+ kernel_fpu_begin();
+ if (key_len == AES_KEYSIZE_128)
+ aes128_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
+ else
+ aes256_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
+ kernel_fpu_end();
+ } else {
+ aes_expandkey_generic(rndkeys, inv_rndkeys, in_key, key_len);
+ }
+}
+
+static void aes_encrypt_arch(const struct aes_enckey *key,
+ u8 out[AES_BLOCK_SIZE],
+ const u8 in[AES_BLOCK_SIZE])
+{
+ if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ aes_encrypt_aesni(key->k.rndkeys, key->nrounds, out, in);
+ kernel_fpu_end();
+ } else {
+ aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
+ }
+}
+
+static void aes_decrypt_arch(const struct aes_key *key,
+ u8 out[AES_BLOCK_SIZE],
+ const u8 in[AES_BLOCK_SIZE])
+{
+ if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ aes_decrypt_aesni(key->inv_k.inv_rndkeys, key->nrounds,
+ out, in);
+ kernel_fpu_end();
+ } else {
+ aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
+ out, in);
+ }
+}
+
+#define aes_mod_init_arch aes_mod_init_arch
+static void aes_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_AES))
+ static_branch_enable(&have_aes);
+}
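
Also for reference, the inverse key schedule ("Equivalent Inverse
Cipher") generated at the end of _aes_expandkey_aesni corresponds to the
following rough C model: the decryption round keys are the encryption
round keys in reverse order, with InvMixColumns applied to all but the
two outermost ones. This is a sketch, not part of the patch;
aes_invert_key_model() and inv_mix_columns() are made-up names, and
inv_mix_columns() just stands in for what aesimc computes. nrounds is 10
for AES-128 and 14 for AES-256, matching the loop counts of 9 and 13 in
the assembly:

#include <linux/string.h>	/* memcpy() */
#include <linux/types.h>

void inv_mix_columns(u32 out[4], const u32 in[4]);	/* placeholder */

static void aes_invert_key_model(u32 inv_rndkeys[], const u32 rndkeys[],
				 int nrounds)
{
	int i;

	/* First inverse round key = last standard round key, unchanged */
	memcpy(&inv_rndkeys[0], &rndkeys[4 * nrounds], 16);

	/* Middle round keys: reversed and passed through InvMixColumns */
	for (i = 1; i < nrounds; i++)
		inv_mix_columns(&inv_rndkeys[4 * i],
				&rndkeys[4 * (nrounds - i)]);

	/* Last inverse round key = first standard round key, unchanged */
	memcpy(&inv_rndkeys[4 * nrounds], &rndkeys[0], 16);
}
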
--
2.52.0