From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Ard Biesheuvel,
    Josh Poimboeuf, Peter Zijlstra
Subject: [PATCH 1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
Date: Sun, 13 Oct 2024 21:24:45 -0700
Message-ID: <20241014042447.50197-2-ebiggers@kernel.org>
In-Reply-To: <20241014042447.50197-1-ebiggers@kernel.org>
References: <20241014042447.50197-1-ebiggers@kernel.org>

From: Eric Biggers

The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for
lengths >= 512, due to the overhead of saving and restoring FPU state.
Therefore, it is unnecessary for this code to be excessively "optimized"
for lengths < 200.  Eliminate the excessive unrolling of this part of
the code and use a more straightforward qword-at-a-time loop.

Note: the part of the code in question is not entirely redundant, as it
is still used to process any remainder mod 24, as well as any remaining
data when fewer than 200 bytes remain after at least one 3072-byte
chunk.
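For illustration, the new remainder handling corresponds to roughly the
following C sketch (not part of the patch; it uses the SSE4.2 crc32
intrinsics, the helper name is hypothetical, and it must be built with
-msse4.2):

	#include <stddef.h>
	#include <stdint.h>
	#include <nmmintrin.h>	/* SSE4.2 crc32 intrinsics */

	/* Qword-at-a-time loop, then mop up a dword, word, and byte
	 * as indicated by the low three bits of len. */
	static uint32_t crc32c_tail(uint32_t crc, const uint8_t *p, size_t len)
	{
		for (; len >= 8; p += 8, len -= 8) {
			uint64_t v;
			__builtin_memcpy(&v, p, 8);
			crc = (uint32_t)_mm_crc32_u64(crc, v);
		}
		if (len & 4) {
			uint32_t v;
			__builtin_memcpy(&v, p, 4);
			crc = _mm_crc32_u32(crc, v);
			p += 4;
		}
		if (len & 2) {
			uint16_t v;
			__builtin_memcpy(&v, p, 2);
			crc = _mm_crc32_u16(crc, v);
			p += 2;
		}
		if (len & 1)
			crc = _mm_crc32_u8(crc, *p);
		return crc;
	}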
Signed-off-by: Eric Biggers
Reviewed-by: Ard Biesheuvel
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++----------------
 1 file changed, 33 insertions(+), 83 deletions(-)

diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index bbcff1fb78cb2..466cea4943963 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -54,24 +54,14 @@
 
 .macro JMPTBL_ENTRY i
 .quad .Lcrc_\i
 .endm
 
-.macro JNC_LESS_THAN j
-	jnc	.Lless_than_\j
-.endm
-
-# Define threshold where buffers are considered "small" and routed to more
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
-# SMALL_SIZE can be no larger than 255.
-
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
 #define SMALL_SIZE 200
 
-.if (SMALL_SIZE > 255)
-.error "SMALL_ SIZE must be < 256"
-.endif
-
 # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
 
 .text
 SYM_FUNC_START(crc_pcl)
 #define	bufp		rdi
@@ -98,29 +88,22 @@ SYM_FUNC_START(crc_pcl)
 	pushq	%rsi
 
 	## Move crc_init for Linux to a different
 	mov	crc_init_arg, crc_init
 
+	mov	%bufp, bufptmp		# rdi = *buf
+	cmp	$SMALL_SIZE, len
+	jb	.Lsmall
+
 	################################################################
 	## 1) ALIGN:
 	################################################################
-
-	mov	%bufp, bufptmp		# rdi = *buf
 	neg	%bufp
 	and	$7, %bufp		# calculate the unalignment amount of
 					# the address
 	je	.Lproc_block		# Skip if aligned
 
-	## If len is less than 8 and we're unaligned, we need to jump
-	## to special code to avoid reading beyond the end of the buffer
-	cmp	$8, len
-	jae	.Ldo_align
-	# less_than_8 expects length in upper 3 bits of len_dw
-	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
-	shl	$32-3+1, len_dw
-	jmp	.Lless_than_8_post_shl1
-
 .Ldo_align:
 	#### Calculate CRC of unaligned bytes of the buffer (if any)
 	movq	(bufptmp), tmp		# load a quadward from the buffer
 	add	%bufp, bufptmp		# align buffer pointer for quadword
 					# processing
@@ -142,13 +125,10 @@ SYM_FUNC_START(crc_pcl)
 
 	cmpq	$128*24, len
 	jae	.Lfull_block
 
 .Lcontinue_block:
-	cmpq	$SMALL_SIZE, len
-	jb	.Lsmall
-
 	## len < 128*24
 	movq	$2731, %rax		# 2731 = ceil(2^16 / 24)
 	mul	len_dw
 	shrq	$16, %rax
 
@@ -241,72 +221,42 @@ LABEL crc_ %i
 LABEL crc_ 0
 	ENDBR
 	mov	tmp, len
 	cmp	$128*24, tmp
 	jae	.Lfull_block
-	cmp	$24, tmp
+	cmp	$SMALL_SIZE, tmp
 	jae	.Lcontinue_block
 
-.Lless_than_24:
-	shl	$32-4, len_dw		# less_than_16 expects length
-					# in upper 4 bits of len_dw
-	jnc	.Lless_than_16
-	crc32q	(bufptmp), crc_init
-	crc32q	8(bufptmp), crc_init
-	jz	.Ldo_return
-	add	$16, bufptmp
-	# len is less than 8 if we got here
-	# less_than_8 expects length in upper 3 bits of len_dw
-	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
-	shl	$2, len_dw
-	jmp	.Lless_than_8_post_shl1
-
 	#######################################################################
-	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+	## 6) Process any remainder without interleaving:
 	#######################################################################
 .Lsmall:
-	shl	$32-8, len_dw		# Prepare len_dw for less_than_256
-	j=256
-.rept 5					# j = {256, 128, 64, 32, 16}
-.altmacro
-LABEL less_than_ %j			# less_than_j: Length should be in
-					# upper lg(j) bits of len_dw
-	j=(j/2)
-	shl	$1, len_dw		# Get next MSB
-	JNC_LESS_THAN %j
-.noaltmacro
-	i=0
-.rept (j/8)
-	crc32q	i(bufptmp), crc_init	# Compute crc32 of 8-byte data
-	i=i+8
-.endr
-	jz	.Ldo_return		# Return if remaining length is zero
-	add	$j, bufptmp		# Advance buf
-.endr
-
-.Lless_than_8:				# Length should be stored in
-					# upper 3 bits of len_dw
-	shl	$1, len_dw
-.Lless_than_8_post_shl1:
-	jnc	.Lless_than_4
-	crc32l	(bufptmp), crc_init_dw	# CRC of 4 bytes
-	jz	.Ldo_return		# return if remaining data is zero
-	add	$4, bufptmp
-.Lless_than_4:				# Length should be stored in
-					# upper 2 bits of len_dw
-	shl	$1, len_dw
-	jnc	.Lless_than_2
-	crc32w	(bufptmp), crc_init_dw	# CRC of 2 bytes
-	jz	.Ldo_return		# return if remaining data is zero
-	add	$2, bufptmp
-.Lless_than_2:				# Length should be stored in the MSB
-					# of len_dw
-	shl	$1, len_dw
-	jnc	.Lless_than_1
-	crc32b	(bufptmp), crc_init_dw	# CRC of 1 byte
-.Lless_than_1:				# Length should be zero
-.Ldo_return:
+	test	len, len
+	jz	.Ldone
+	mov	len_dw, %eax
+	shr	$3, %eax
+	jz	.Ldo_dword
+.Ldo_qwords:
+	crc32q	(bufptmp), crc_init
+	add	$8, bufptmp
+	dec	%eax
+	jnz	.Ldo_qwords
+.Ldo_dword:
+	test	$4, len_dw
+	jz	.Ldo_word
+	crc32l	(bufptmp), crc_init_dw
+	add	$4, bufptmp
+.Ldo_word:
+	test	$2, len_dw
+	jz	.Ldo_byte
+	crc32w	(bufptmp), crc_init_dw
+	add	$2, bufptmp
+.Ldo_byte:
+	test	$1, len_dw
+	jz	.Ldone
+	crc32b	(bufptmp), crc_init_dw
+.Ldone:
 	movq	crc_init, %rax
 	popq	%rsi
 	popq	%rdi
 	popq	%rbx
 	RET
-- 
2.47.0


From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Ard Biesheuvel,
    Josh Poimboeuf, Peter Zijlstra
Subject: [PATCH 2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit
Date: Sun, 13 Oct 2024 21:24:46 -0700
Message-ID: <20241014042447.50197-3-ebiggers@kernel.org>
In-Reply-To: <20241014042447.50197-1-ebiggers@kernel.org>
References: <20241014042447.50197-1-ebiggers@kernel.org>

From: Eric Biggers

Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit
values instead of 64-bit, since the upper bits of the corresponding
64-bit registers are not guaranteed to be zero.  Also update the type of
the length argument to be unsigned int rather than int, as the assembly
code treats it as unsigned.

Note: there haven't been any reports of this bug actually causing
incorrect behavior.  Neither gcc nor clang guarantee zero-extension to
64 bits, but zero-extension is likely to happen in practice because most
instructions that operate on 32-bit registers zero-extend to 64 bits.
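The zero-extension property mentioned above can be demonstrated
directly.  The following stand-alone C program is an illustrative
sketch, not part of the patch; it assumes GCC/clang inline asm and the
x86 "%k" operand modifier, which prints the 32-bit name of a register:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t r = 0xdeadbeef00c0ffeeULL;

		/* A 32-bit register write ("movl %exx, %exx")
		 * zero-extends into the full 64-bit register. */
		asm("movl %k0, %k0" : "+r"(r));

		/* Prints 0x0000000000c0ffee: bits 63:32 are cleared. */
		printf("0x%016llx\n", (unsigned long long)r);
		return 0;
	}

This is why callers happen to zero-extend 32-bit arguments in practice,
even though the ABI does not require it -- which is exactly what this
patch stops relying on.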
Signed-off-by: Eric Biggers
Reviewed-by: Ard Biesheuvel
---
 arch/x86/crypto/crc32c-intel_glue.c       |  2 +-
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------
 2 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index feccb5254c7e5..52c5d47ef5a14 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -39,11 +39,11 @@
  * size is >= 512 to account
  * for fpu state save/restore overhead.
  */
 #define CRC32C_PCL_BREAKEVEN	512
 
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
				unsigned int crc_init);
 #endif /* CONFIG_X86_64 */
 
 static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
 {
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 466cea4943963..bbf860e90951d 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -58,11 +58,11 @@
 
 # Define threshold below which buffers are considered "small" and routed to
 # regular CRC code that does not interleave the CRC instructions.
 #define SMALL_SIZE 200
 
-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
 
 .text
 SYM_FUNC_START(crc_pcl)
 #define	bufp		rdi
 #define	bufp_dw		%edi
@@ -70,18 +70,15 @@ SYM_FUNC_START(crc_pcl)
 #define	bufp_b		%dil
 #define	bufptmp		%rcx
 #define	block_0		%rcx
 #define	block_1		%rdx
 #define	block_2		%r11
-#define	len		%rsi
-#define	len_dw		%esi
-#define	len_w		%si
-#define	len_b		%sil
-#define	crc_init_arg	%rdx
+#define	len		%esi
+#define	crc_init_arg	%edx
 #define	tmp		%rbx
-#define	crc_init	%r8
-#define	crc_init_dw	%r8d
+#define	crc_init	%r8d
+#define	crc_init_q	%r8
 #define	crc1		%r9
 #define	crc2		%r10
 
 	pushq	%rbx
 	pushq	%rdi
@@ -105,13 +102,13 @@ SYM_FUNC_START(crc_pcl)
 .Ldo_align:
 	#### Calculate CRC of unaligned bytes of the buffer (if any)
 	movq	(bufptmp), tmp		# load a quadward from the buffer
 	add	%bufp, bufptmp		# align buffer pointer for quadword
 					# processing
-	sub	%bufp, len		# update buffer length
+	sub	bufp_dw, len		# update buffer length
 .Lalign_loop:
-	crc32b	%bl, crc_init_dw	# compute crc32 of 1-byte
+	crc32b	%bl, crc_init		# compute crc32 of 1-byte
 	shr	$8, tmp			# get next byte
 	dec	%bufp
 	jne	.Lalign_loop
 
 .Lproc_block:
@@ -119,19 +116,18 @@ SYM_FUNC_START(crc_pcl)
 	################################################################
 	## 2) PROCESS BLOCKS:
 	################################################################
 
 	## compute num of bytes to be processed
-	movq	len, tmp		# save num bytes in tmp
 
-	cmpq	$128*24, len
+	cmp	$128*24, len
 	jae	.Lfull_block
 
 .Lcontinue_block:
 	## len < 128*24
 	movq	$2731, %rax		# 2731 = ceil(2^16 / 24)
-	mul	len_dw
+	mul	len
 	shrq	$16, %rax
 
 	## eax contains floor(bytes / 24) = num 24-byte chunks to do
 
 	## process rax 24-byte chunks (128 >= rax >= 0)
@@ -174,21 +170,21 @@ SYM_FUNC_START(crc_pcl)
 .rept 128-1
 .altmacro
 LABEL crc_ %i
 .noaltmacro
 	ENDBR
-	crc32q	-i*8(block_0), crc_init
+	crc32q	-i*8(block_0), crc_init_q
 	crc32q	-i*8(block_1), crc1
 	crc32q	-i*8(block_2), crc2
 	i=(i-1)
 .endr
 
 .altmacro
LABEL crc_ %i
 .noaltmacro
 	ENDBR
-	crc32q	-i*8(block_0), crc_init
+	crc32q	-i*8(block_0), crc_init_q
 	crc32q	-i*8(block_1), crc1
 # SKIP	crc32	-i*8(block_2), crc2	; Don't do this one yet
 
 	mov	block_2, block_0
 
@@ -198,66 +194,65 @@ LABEL crc_ %i
 
 	lea	(K_table-8)(%rip), %bufp	# first entry is for idx 1
 	shlq	$3, %rax			# rax *= 8
 	pmovzxdq (%bufp,%rax), %xmm0		# 2 consts: K1:K2
 	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
-	subq	%rax, tmp			# tmp -= rax*24
+	sub	%eax, len			# len -= rax*24
 
-	movq	crc_init, %xmm1			# CRC for block 1
+	movq	crc_init_q, %xmm1		# CRC for block 1
 	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
 
 	movq	crc1, %xmm2			# CRC for block 2
 	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
 
 	pxor	%xmm2,%xmm1
 	movq	%xmm1, %rax
 	xor	-i*8(block_2), %rax
-	mov	crc2, crc_init
-	crc32	%rax, crc_init
+	mov	crc2, crc_init_q
+	crc32	%rax, crc_init_q
 
 	################################################################
 	## 5) Check for end:
 	################################################################
 
 LABEL crc_ 0
 	ENDBR
-	mov	tmp, len
-	cmp	$128*24, tmp
+	cmp	$128*24, len
 	jae	.Lfull_block
-	cmp	$SMALL_SIZE, tmp
+	cmp	$SMALL_SIZE, len
 	jae	.Lcontinue_block
 
 	#######################################################################
 	## 6) Process any remainder without interleaving:
 	#######################################################################
 .Lsmall:
 	test	len, len
 	jz	.Ldone
-	mov	len_dw, %eax
+	mov	len, %eax
 	shr	$3, %eax
 	jz	.Ldo_dword
 .Ldo_qwords:
-	crc32q	(bufptmp), crc_init
+	crc32q	(bufptmp), crc_init_q
 	add	$8, bufptmp
 	dec	%eax
 	jnz	.Ldo_qwords
 .Ldo_dword:
-	test	$4, len_dw
+	test	$4, len
 	jz	.Ldo_word
-	crc32l	(bufptmp), crc_init_dw
+	crc32l	(bufptmp), crc_init
 	add	$4, bufptmp
 .Ldo_word:
-	test	$2, len_dw
+	test	$2, len
 	jz	.Ldo_byte
-	crc32w	(bufptmp), crc_init_dw
+	crc32w	(bufptmp), crc_init
 	add	$2, bufptmp
 .Ldo_byte:
-	test	$1, len_dw
+	test	$1, len
 	jz	.Ldone
-	crc32b	(bufptmp), crc_init_dw
+	crc32b	(bufptmp), crc_init
 .Ldone:
-	movq	crc_init, %rax
+	mov	crc_init, %eax
 	popq	%rsi
 	popq	%rdi
 	popq	%rbx
 	RET
 SYM_FUNC_END(crc_pcl)
-- 
2.47.0


From: Eric Biggers
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Ard Biesheuvel,
    Josh Poimboeuf, Peter Zijlstra
Subject: [PATCH 3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling
Date: Sun, 13 Oct 2024 21:24:47 -0700
Message-ID: <20241014042447.50197-4-ebiggers@kernel.org>
In-Reply-To: <20241014042447.50197-1-ebiggers@kernel.org>
References: <20241014042447.50197-1-ebiggers@kernel.org>

From: Eric Biggers

crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
unrolled and uses a jump table to jump into the correct location.
This optimization is misguided, as it bloats the binary code size and
introduces an indirect call.  x86_64 CPUs can predict loops well, so it
is fine to just use a loop instead.  Loop bookkeeping instructions can
compete with the crc instructions for the ALUs, but this is easily
mitigated by unrolling the loop by a smaller amount, such as 4 times.

Therefore, re-roll the loop and make related tweaks to the code.

This reduces the binary code size of crc_pcl() from 4546 bytes to 418
bytes, a 91% reduction.  In general it also makes the code faster, with
some large improvements seen when retpoline is enabled.

More detailed performance results are shown below.  They are given as
percent improvement in throughput (negative means regressed) for CPU
microarchitecture vs. input length in bytes.  E.g. an improvement from
40 GB/s to 50 GB/s would be listed as 25%.

Table 1: Results with retpoline enabled (the default):

                       | 512   | 833   | 1024  | 2000  | 3173  | 4096  |
  ---------------------+-------+-------+-------+-------+-------+-------+
  Intel Haswell        | 35.0% | 20.7% | 17.8% |  9.7% | -0.2% |  4.4% |
  Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% |  0.0% |  5.4% |
  AMD Zen 2            | 29.5% | 17.2% | 13.5% |  8.6% | -0.5% |  2.8% |

Table 2: Results with retpoline disabled:

                       | 512   | 833   | 1024  | 2000  | 3173  | 4096  |
  ---------------------+-------+-------+-------+-------+-------+-------+
  Intel Haswell        |  3.3% |  4.8% |  4.5% |  0.9% | -2.9% |  0.3% |
  Intel Emerald Rapids |  7.5% |  6.4% |  5.2% |  2.3% | -0.0% |  0.6% |
  AMD Zen 2            | 11.8% |  1.4% |  0.2% |  1.3% | -0.9% | -0.2% |
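The structure of the rerolled main loop corresponds to roughly the
following C sketch (illustrative only, not part of the patch; the
helper name is hypothetical, the pclmulqdq-based recombination of the
three partial CRCs is omitted, and it must be built with -msse4.2):

	#include <stddef.h>
	#include <stdint.h>
	#include <nmmintrin.h>	/* SSE4.2 crc32 intrinsics */

	/* CRC three equal-sized "lanes" of the buffer in one loop.  The
	 * three independent dependency chains keep the CPU's CRC unit
	 * busy, and a plain loop replaces the 128x-unrolled body and
	 * jump table.  (The asm additionally unrolls this loop 4x.) */
	static void crc32c_3lanes(const uint8_t *buf, size_t chunk_bytes,
				  uint64_t crcs[3])
	{
		size_t n = chunk_bytes / 8;

		for (size_t i = 0; i < n; i++) {
			uint64_t q0, q1, q2;

			__builtin_memcpy(&q0, buf + 8*i, 8);
			__builtin_memcpy(&q1, buf + chunk_bytes + 8*i, 8);
			__builtin_memcpy(&q2, buf + 2*chunk_bytes + 8*i, 8);
			crcs[0] = _mm_crc32_u64(crcs[0], q0);
			crcs[1] = _mm_crc32_u64(crcs[1], q1);
			crcs[2] = _mm_crc32_u64(crcs[2], q2);
		}
		/* The three partial CRCs are then merged using carryless
		 * multiplications (pclmulqdq) by constants from K_table;
		 * that step is not shown here. */
	}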
Signed-off-by: Eric Biggers
Reviewed-by: Ard Biesheuvel
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++-------------
 1 file changed, 92 insertions(+), 141 deletions(-)

diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index bbf860e90951d..752812bc4991d 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -5,10 +5,11 @@
  * downloaded from:
  * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
  * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
  *
  * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
  *
  * Authors:
  *	Wajdi Feghali
  *	James Guilford
  *	David Cote
@@ -42,186 +43,153 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #include <linux/linkage.h>
-#include <asm/nospec-branch.h>
 
 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
 
-.macro LABEL prefix n
-.L\prefix\n\():
-.endm
-
-.macro JMPTBL_ENTRY i
-.quad .Lcrc_\i
-.endm
-
 # Define threshold below which buffers are considered "small" and routed to
 # regular CRC code that does not interleave the CRC instructions.
 #define SMALL_SIZE 200
 
 # unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
 
 .text
 SYM_FUNC_START(crc_pcl)
-#define	bufp		rdi
-#define	bufp_dw		%edi
-#define	bufp_w		%di
-#define	bufp_b		%dil
-#define	bufptmp		%rcx
-#define	block_0		%rcx
-#define	block_1		%rdx
-#define	block_2		%r11
-#define	len		%esi
-#define	crc_init_arg	%edx
-#define	tmp		%rbx
-#define	crc_init	%r8d
-#define	crc_init_q	%r8
-#define	crc1		%r9
-#define	crc2		%r10
-
-	pushq	%rbx
-	pushq	%rdi
-	pushq	%rsi
+#define	bufp		%rdi
+#define	bufp_d		%edi
+#define	len		%esi
+#define	crc_init	%edx
+#define	crc_init_q	%rdx
+#define	n_misaligned	%ecx /* overlaps chunk_bytes! */
+#define	n_misaligned_q	%rcx
+#define	chunk_bytes	%ecx /* overlaps n_misaligned! */
+#define	chunk_bytes_q	%rcx
+#define	crc1		%r8
+#define	crc2		%r9
 
-	## Move crc_init for Linux to a different
-	mov	crc_init_arg, crc_init
-
-	mov	%bufp, bufptmp		# rdi = *buf
 	cmp	$SMALL_SIZE, len
 	jb	.Lsmall
 
 	################################################################
 	## 1) ALIGN:
 	################################################################
-	neg	%bufp
-	and	$7, %bufp		# calculate the unalignment amount of
+	mov	bufp_d, n_misaligned
+	neg	n_misaligned
+	and	$7, n_misaligned	# calculate the misalignment amount of
 					# the address
-	je	.Lproc_block		# Skip if aligned
+	je	.Laligned		# Skip if aligned
 
+	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
+	# the remaining data to an 8-byte boundary.
 .Ldo_align:
-	#### Calculate CRC of unaligned bytes of the buffer (if any)
-	movq	(bufptmp), tmp		# load a quadward from the buffer
-	add	%bufp, bufptmp		# align buffer pointer for quadword
-					# processing
-	sub	bufp_dw, len		# update buffer length
+	movq	(bufp), %rax
+	add	n_misaligned_q, bufp
+	sub	n_misaligned, len
 .Lalign_loop:
-	crc32b	%bl, crc_init		# compute crc32 of 1-byte
-	shr	$8, tmp			# get next byte
-	dec	%bufp
+	crc32b	%al, crc_init		# compute crc32 of 1-byte
+	shr	$8, %rax		# get next byte
+	dec	n_misaligned
 	jne	.Lalign_loop
-
-.Lproc_block:
+.Laligned:
 
 	################################################################
-	## 2) PROCESS BLOCKS:
+	## 2) PROCESS BLOCK:
 	################################################################
 
-	## compute num of bytes to be processed
-
 	cmp	$128*24, len
 	jae	.Lfull_block
 
-.Lcontinue_block:
-	## len < 128*24
-	movq	$2731, %rax		# 2731 = ceil(2^16 / 24)
-	mul	len
-	shrq	$16, %rax
-
-	## eax contains floor(bytes / 24) = num 24-byte chunks to do
-
-	## process rax 24-byte chunks (128 >= rax >= 0)
-
-	## compute end address of each block
-	## block 0 (base addr + RAX * 8)
-	## block 1 (base addr + RAX * 16)
-	## block 2 (base addr + RAX * 24)
-	lea	(bufptmp, %rax, 8), block_0
-	lea	(block_0, %rax, 8), block_1
-	lea	(block_1, %rax, 8), block_2
-
-	xor	crc1, crc1
-	xor	crc2, crc2
-
-	## branch into array
-	leaq	jump_table(%rip), %bufp
-	mov	(%bufp,%rax,8), %bufp
-	JMP_NOSPEC bufp
+.Lpartial_block:
+	# Compute floor(len / 24) to get num qwords to process from each lane.
+	imul	$2731, len, %eax	# 2731 = ceil(2^16 / 24)
+	shr	$16, %eax
+	jmp	.Lcrc_3lanes
 
-	################################################################
-	## 2a) PROCESS FULL BLOCKS:
-	################################################################
 .Lfull_block:
-	movl	$128,%eax
-	lea	128*8*2(block_0), block_1
-	lea	128*8*3(block_0), block_2
-	add	$128*8*1, block_0
-
-	xor	crc1,crc1
-	xor	crc2,crc2
-
-	# Fall through into top of crc array (crc_128)
+	# Processing 128 qwords from each lane.
+	mov	$128, %eax
 
 	################################################################
-	## 3) CRC Array:
+	## 3) CRC each of three lanes:
 	################################################################
 
-	i=128
-.rept 128-1
-.altmacro
-LABEL crc_ %i
-.noaltmacro
-	ENDBR
-	crc32q	-i*8(block_0), crc_init_q
-	crc32q	-i*8(block_1), crc1
-	crc32q	-i*8(block_2), crc2
-	i=(i-1)
-.endr
-
-.altmacro
-LABEL crc_ %i
-.noaltmacro
-	ENDBR
-	crc32q	-i*8(block_0), crc_init_q
-	crc32q	-i*8(block_1), crc1
-# SKIP	crc32	-i*8(block_2), crc2	; Don't do this one yet
+.Lcrc_3lanes:
+	xor	crc1,crc1
+	xor	crc2,crc2
+	mov	%eax, chunk_bytes
+	shl	$3, chunk_bytes		# num bytes to process from each lane
+	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
+	jl	.Lcrc_3lanes_4x_done
+
+	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
+	# bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+	crc32q	(bufp), crc_init_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	crc32q	8(bufp), crc_init_q
+	crc32q	8(bufp,chunk_bytes_q), crc1
+	crc32q	8(bufp,chunk_bytes_q,2), crc2
+	crc32q	16(bufp), crc_init_q
+	crc32q	16(bufp,chunk_bytes_q), crc1
+	crc32q	16(bufp,chunk_bytes_q,2), crc2
+	crc32q	24(bufp), crc_init_q
+	crc32q	24(bufp,chunk_bytes_q), crc1
+	crc32q	24(bufp,chunk_bytes_q,2), crc2
+	add	$32, bufp
+	sub	$4, %eax
+	jge	.Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+	add	$4, %eax
+	jz	.Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+	crc32q	(bufp), crc_init_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	add	$8, bufp
+	dec	%eax
+	jnz	.Lcrc_3lanes_1x_loop
 
-	mov	block_2, block_0
+.Lcrc_3lanes_last_qword:
+	crc32q	(bufp), crc_init_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+# SKIP	crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet
 
 	################################################################
 	## 4) Combine three results:
 	################################################################
 
-	lea	(K_table-8)(%rip), %bufp	# first entry is for idx 1
-	shlq	$3, %rax			# rax *= 8
-	pmovzxdq (%bufp,%rax), %xmm0		# 2 consts: K1:K2
-	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
-	sub	%eax, len			# len -= rax*24
+	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
+	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
+	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+	sub	%eax, len			# len -= chunk_bytes * 3
 
 	movq	crc_init_q, %xmm1		# CRC for block 1
 	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
 
 	movq	crc1, %xmm2			# CRC for block 2
 	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
 
 	pxor	%xmm2,%xmm1
 	movq	%xmm1, %rax
-	xor	-i*8(block_2), %rax
+	xor	(bufp,chunk_bytes_q,2), %rax
 	mov	crc2, crc_init_q
 	crc32	%rax, crc_init_q
+	lea	8(bufp,chunk_bytes_q,2), bufp
 
 	################################################################
-	## 5) Check for end:
+	## 5) If more blocks remain, goto (2):
 	################################################################
 
-LABEL crc_ 0
-	ENDBR
 	cmp	$128*24, len
-	jae	.Lfull_block
+	jae	.Lfull_block
 	cmp	$SMALL_SIZE, len
-	jae	.Lcontinue_block
+	jae	.Lpartial_block
 
 	#######################################################################
 	## 6) Process any remainder without interleaving:
 	#######################################################################
 .Lsmall:
@@ -229,51 +197,34 @@ LABEL crc_ 0
 	jz	.Ldone
 	mov	len, %eax
 	shr	$3, %eax
 	jz	.Ldo_dword
 .Ldo_qwords:
-	crc32q	(bufptmp), crc_init_q
-	add	$8, bufptmp
+	crc32q	(bufp), crc_init_q
+	add	$8, bufp
 	dec	%eax
 	jnz	.Ldo_qwords
 .Ldo_dword:
 	test	$4, len
 	jz	.Ldo_word
-	crc32l	(bufptmp), crc_init
-	add	$4, bufptmp
+	crc32l	(bufp), crc_init
+	add	$4, bufp
 .Ldo_word:
 	test	$2, len
 	jz	.Ldo_byte
-	crc32w	(bufptmp), crc_init
-	add	$2, bufptmp
+	crc32w	(bufp), crc_init
+	add	$2, bufp
 .Ldo_byte:
 	test	$1, len
 	jz	.Ldone
-	crc32b	(bufptmp), crc_init
+	crc32b	(bufp), crc_init
 .Ldone:
 	mov	crc_init, %eax
-	popq	%rsi
-	popq	%rdi
-	popq	%rbx
 	RET
 SYM_FUNC_END(crc_pcl)
 
 .section	.rodata, "a", @progbits
-	################################################################
-	## jump table	Table is 129 entries x 2 bytes each
-	################################################################
-.align 4
-jump_table:
-	i=0
-.rept 129
-.altmacro
-JMPTBL_ENTRY %i
-.noaltmacro
-	i=i+1
-.endr
-
 	################################################################
 	## PCLMULQDQ tables
 	## Table is 128 entries x 2 words (8 bytes) each
 	################################################################
 .align 8
-- 
2.47.0