From nobody Mon Oct  6 17:01:58 2025
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id B4EE520766E;
	Fri, 18 Jul 2025 19:20:02 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1752866402; cv=none;
 b=hU6q/4WAPPxaZ3/6Yc4bQ4wtv7ieDlSNPgGFSIpVZoUksxXL6azjZB3OtwT6aHjwaxZRHjyuwMcFSL+smegGb2tYUJzGaB6LAGCFZ+l7k6fGm6L1A7y7rKSV4BDqEihUbv5XZKl8h2ZKd570q2cDFzosNGKc0+wnGvWIjHNHBoE=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1752866402; c=relaxed/simple;
	bh=dS9Fky7xKxNSurHar2x8+SnKQ+ldHvnIXl92Nxn4y4s=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=BLDNLswowdd6qGGN1iaKdkNLeiW0TCJB8bOnZnhGXVS+H2q4lbHnN7H50P9BD+iaPfs3xuj7IH8kkbwoU3oS0v3+M0wuHf7e5xPMP1LJOGvNz+Swh95Gyc6CwBn09h1QIGmQ6gflE0P8Vok1GxixTRg2vIzlFIWHDS3VNBObSeo=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=iFTpLzz1; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="iFTpLzz1"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id E466EC4CEF5;
	Fri, 18 Jul 2025 19:20:01 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1752866402;
	bh=dS9Fky7xKxNSurHar2x8+SnKQ+ldHvnIXl92Nxn4y4s=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=iFTpLzz1/jvaqInCdtnCjIDSF8PDXyQP48rKDBMnal+HMZy/rHR1670icf1SF30gj
	 /d//UrQKxPIrpODbm1HW2E7bCcxIFMuv0x0WpU3pdA4c2IAa1LIf9zViBh3gvT9L+z
	 eC9OMxLLDxaSq2uocqd8OoA4k+HIy6VSScqvxzgN5m3g+sXheG7gBLI/jOi6PSbwK0
	 qRdackw6v8bfF7VmCNk1ggVBlc7VoNJvy8s1/QCCy2LPDQIVuKvKBqY9AckHCmtTv0
	 FJXp3ownEh869C4vr1Lxw6w6SEh+obZcriLHbDM9IFwTSeamHdECkMu5XULdo+9trB
	 aUCqJSEq1GnEg==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 1/2] lib/crypto: x86/sha1-ni: Minor optimizations and cleanup
Date: Fri, 18 Jul 2025 12:18:59 -0700
Message-ID: <20250718191900.42877-2-ebiggers@kernel.org>
X-Mailer: git-send-email 2.50.1
In-Reply-To: <20250718191900.42877-1-ebiggers@kernel.org>
References: <20250718191900.42877-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

- Store the previous state in %xmm8-%xmm9 instead of spilling it to the
  stack.  There are plenty of unused XMM registers here, so there is no
  reason to spill to the stack.  (While 32-bit code is limited to
  %xmm0-%xmm7, this is 64-bit code, so it's free to use %xmm8-%xmm15.)

- Remove the unnecessary check for nblocks =3D=3D 0.  sha1_ni_transform() is
  always passed a positive nblocks.

- To get an XMM register with 'e' in the high dword and the rest zeroes,
  just zeroize the register using pxor, then load 'e'.  Previously the
  code loaded 'e', then zeroized the lower dwords by AND-ing with a
  constant, which was slightly less efficient.

- Instead of computing &DATA_PTR[NUM_BLKS << 6] and stopping when
  DATA_PTR reaches that value, just decrement NUM_BLKS on each
  iteration and stop when it reaches 0.  This takes fewer instructions.

- Rename DIGEST_PTR to STATE_PTR.  It points to the SHA-1 internal
  state, not a SHA-1 digest value.

This commit shrinks the code size of sha1_ni_transform() from 624 bytes
to 589 bytes and also shrinks rodata by 16 bytes.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
---
 lib/crypto/x86/sha1-ni-asm.S | 68 +++++++++++++-----------------------
 1 file changed, 25 insertions(+), 43 deletions(-)

diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S
index 3989b0642ff5f..1d08b2f364ce7 100644
--- a/lib/crypto/x86/sha1-ni-asm.S
+++ b/lib/crypto/x86/sha1-ni-asm.S
@@ -53,65 +53,56 @@
  *
  */
=20
 #include <linux/linkage.h>
=20
-#define DIGEST_PTR	%rdi	/* 1st arg */
+#define STATE_PTR	%rdi	/* 1st arg */
 #define DATA_PTR	%rsi	/* 2nd arg */
 #define NUM_BLKS	%rdx	/* 3rd arg */
=20
-/* gcc conversion */
-#define FRAME_SIZE	32	/* space for 2x16 bytes */
-
 #define ABCD		%xmm0
 #define E0		%xmm1	/* Need two E's b/c they ping pong */
 #define E1		%xmm2
 #define MSG0		%xmm3
 #define MSG1		%xmm4
 #define MSG2		%xmm5
 #define MSG3		%xmm6
 #define SHUF_MASK	%xmm7
-
+#define ABCD_SAVED	%xmm8
+#define E0_SAVED	%xmm9
=20
 /*
  * Intel SHA Extensions optimized implementation of a SHA-1 block function
  *
  * This function takes a pointer to the current SHA-1 state, a pointer to =
the
- * input data, and the number of 64-byte blocks to process.  Once all bloc=
ks
- * have been processed, the state is updated with the new state.  This fun=
ction
- * only processes complete blocks.  State initialization, buffering of par=
tial
+ * input data, and the number of 64-byte blocks to process.  The number of
+ * blocks to process is assumed to be nonzero.  Once all blocks have been
+ * processed, the state is updated with the new state.  This function only
+ * processes complete blocks.  State initialization, buffering of partial
  * blocks, and digest finalization are expected to be handled elsewhere.
  *
  * The indented lines in the loop are instructions related to rounds proce=
ssing.
  * The non-indented lines are instructions related to the message schedule.
  *
  * void sha1_ni_transform(struct sha1_block_state *state,
  *			  const u8 *data, size_t nblocks)
  */
 .text
 SYM_FUNC_START(sha1_ni_transform)
-	push		%rbp
-	mov		%rsp, %rbp
-	sub		$FRAME_SIZE, %rsp
-	and		$~0xF, %rsp
-
-	shl		$6, NUM_BLKS		/* convert to bytes */
-	jz		.Ldone_hash
-	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
-
-	/* load initial hash values */
-	pinsrd		$3, 1*16(DIGEST_PTR), E0
-	movdqu		0*16(DIGEST_PTR), ABCD
-	pand		UPPER_WORD_MASK(%rip), E0
+
+	/* Load the initial state from STATE_PTR. */
+	pxor		E0, E0
+	pinsrd		$3, 16(STATE_PTR), E0
+	movdqu		(STATE_PTR), ABCD
 	pshufd		$0x1B, ABCD, ABCD
=20
 	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
=20
-.Lloop0:
-	/* Save hash values for addition after rounds */
-	movdqa		E0, (0*16)(%rsp)
-	movdqa		ABCD, (1*16)(%rsp)
+.Lnext_block:
+	/* Save the state for addition after the rounds. */
+	movdqa		E0, E0_SAVED
+	movdqa		ABCD, ABCD_SAVED
=20
 	/* Rounds 0-3 */
 	movdqu		0*16(DATA_PTR), MSG0
 	pshufb		SHUF_MASK, MSG0
 		paddd		MSG0, E0
@@ -265,35 +256,26 @@ SYM_FUNC_START(sha1_ni_transform)
 	/* Rounds 76-79 */
 		sha1nexte	MSG3, E1
 		movdqa		ABCD, E0
 		sha1rnds4	$3, E1, ABCD
=20
-	/* Add current hash values with previously saved */
-	sha1nexte	(0*16)(%rsp), E0
-	paddd		(1*16)(%rsp), ABCD
+	/* Add the previous state (before the rounds) to the current state. */
+	sha1nexte	E0_SAVED, E0
+	paddd		ABCD_SAVED, ABCD
=20
-	/* Increment data pointer and loop if more to process */
+	/* Advance to the next block, or break if there are no more blocks. */
 	add		$64, DATA_PTR
-	cmp		NUM_BLKS, DATA_PTR
-	jne		.Lloop0
+	dec		NUM_BLKS
+	jnz		.Lnext_block
=20
-	/* Write hash values back in the correct order */
+	/* Store the new state to STATE_PTR. */
+	pextrd		$3, E0, 16(STATE_PTR)
 	pshufd		$0x1B, ABCD, ABCD
-	movdqu		ABCD, 0*16(DIGEST_PTR)
-	pextrd		$3, E0, 1*16(DIGEST_PTR)
-
-.Ldone_hash:
-	mov		%rbp, %rsp
-	pop		%rbp
+	movdqu		ABCD, (STATE_PTR)
=20
 	RET
 SYM_FUNC_END(sha1_ni_transform)
=20
 .section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
 .align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x000102030405060708090a0b0c0d0e0f
-
-.section	.rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
-.align 16
-UPPER_WORD_MASK:
-	.octa 0xFFFFFFFF000000000000000000000000
--=20
2.50.1
From nobody Mon Oct  6 17:01:58 2025
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2274520F098;
	Fri, 18 Jul 2025 19:20:02 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1752866403; cv=none;
 b=gDWyQPRyhr6xtQxE75c6NoKUSTCJmQ5AxgLwTPgPtJpnou+ezHWT6iiHyQhe2Mk8fJULH3+3PxMDtJ/zYNHklDU8fwNftA1KAi9OsWH+TxzBXbnbJWK5xgaWOlx03gRStIifAQ87QZkHseZBmv1zig/8UMU4ogJZv7306sPKO+A=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1752866403; c=relaxed/simple;
	bh=tqbHNuzT5yP389d31CenPRZuzKHAZqST8zpA2sincno=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=TjQNuYLQTPXLy15Ryo/rzM1ys/CpZfEG5/BDCHQE4W0ZoAGg+lh0G13d42tC7L9LLEW3zugsAql3KL9LVD2VXbrtt+cdGvHstbsr61y4GDlIUHYQFIxzKQaHgXfjVxw+QSyay7XkeHVqkMTAmdEq4/Vq2TLj5MT0qpTfQEz1l7E=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=MLctTnl/; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="MLctTnl/"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 5BB33C4CEF6;
	Fri, 18 Jul 2025 19:20:02 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1752866402;
	bh=tqbHNuzT5yP389d31CenPRZuzKHAZqST8zpA2sincno=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=MLctTnl/VJIxu7ShtQqpiAm0MNC0yItzgToWhzuoF966otVQc0YNRa/BNKEsiP+gq
	 8MoEnfvs/oF8bC9QbmyqjAjg31WKoqeRlHt7e2yNQoXixDWGKGFeVRq5m8ij5hLzOG
	 mxNXCQ2NEuDys38xZtnmavu0LcgnFf2kkgWw2Uqsz9kgFofXVWsxcfsXtwN1QY5MtU
	 iv+jro7pejYmaaBJHVmByZ0mpe1EILeZ7EaYeYWiGaOFqx0HRKYAa6DvpNYiyulEBN
	 ZUEXBIywOLbJYZdA+bnj7XKHnjtj+mUjIjXFaY51RDQkilpC4mD9Ys7gb8KC39oLaj
	 eMaeWXm89/Adg==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 2/2] lib/crypto: x86/sha1-ni: Convert to use rounds macros
Date: Fri, 18 Jul 2025 12:19:00 -0700
Message-ID: <20250718191900.42877-3-ebiggers@kernel.org>
X-Mailer: git-send-email 2.50.1
In-Reply-To: <20250718191900.42877-1-ebiggers@kernel.org>
References: <20250718191900.42877-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

The assembly code that does all 80 rounds of SHA-1 is highly repetitive.
Replace it with 20 expansions of a macro that does 4 rounds, using the
macro arguments and .if directives to handle the slight variations
between rounds.  This reduces the length of sha1-ni-asm.S by 129 lines
while still producing the exact same object file.  This mirrors
sha256-ni-asm.S, which uses the same strategy.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
---
 lib/crypto/x86/sha1-ni-asm.S | 187 ++++++-----------------------------
 1 file changed, 29 insertions(+), 158 deletions(-)

diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S
index 1d08b2f364ce7..428f9b9605943 100644
--- a/lib/crypto/x86/sha1-ni-asm.S
+++ b/lib/crypto/x86/sha1-ni-asm.S
@@ -68,23 +68,43 @@
 #define MSG3		%xmm6
 #define SHUF_MASK	%xmm7
 #define ABCD_SAVED	%xmm8
 #define E0_SAVED	%xmm9
=20
+.macro do_4rounds	i, m0, m1, m2, m3, e0, e1
+.if \i < 16
+	movdqu		\i*4(DATA_PTR), \m0
+	pshufb		SHUF_MASK, \m0
+.endif
+.if \i =3D=3D 0
+	paddd		\m0, \e0
+.else
+	sha1nexte	\m0, \e0
+.endif
+	movdqa		ABCD, \e1
+.if \i >=3D 12 && \i < 76
+	sha1msg2	\m0, \m1
+.endif
+	sha1rnds4	$\i / 20, \e0, ABCD
+.if \i >=3D 4 && \i < 68
+	sha1msg1	\m0, \m3
+.endif
+.if \i >=3D 8 && \i < 72
+	pxor		\m0, \m2
+.endif
+.endm
+
 /*
  * Intel SHA Extensions optimized implementation of a SHA-1 block function
  *
  * This function takes a pointer to the current SHA-1 state, a pointer to =
the
  * input data, and the number of 64-byte blocks to process.  The number of
  * blocks to process is assumed to be nonzero.  Once all blocks have been
  * processed, the state is updated with the new state.  This function only
  * processes complete blocks.  State initialization, buffering of partial
  * blocks, and digest finalization are expected to be handled elsewhere.
  *
- * The indented lines in the loop are instructions related to rounds proce=
ssing.
- * The non-indented lines are instructions related to the message schedule.
- *
  * void sha1_ni_transform(struct sha1_block_state *state,
  *			  const u8 *data, size_t nblocks)
  */
 .text
 SYM_FUNC_START(sha1_ni_transform)
@@ -100,165 +120,16 @@ SYM_FUNC_START(sha1_ni_transform)
 .Lnext_block:
 	/* Save the state for addition after the rounds. */
 	movdqa		E0, E0_SAVED
 	movdqa		ABCD, ABCD_SAVED
=20
-	/* Rounds 0-3 */
-	movdqu		0*16(DATA_PTR), MSG0
-	pshufb		SHUF_MASK, MSG0
-		paddd		MSG0, E0
-		movdqa		ABCD, E1
-		sha1rnds4	$0, E0, ABCD
-
-	/* Rounds 4-7 */
-	movdqu		1*16(DATA_PTR), MSG1
-	pshufb		SHUF_MASK, MSG1
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-		sha1rnds4	$0, E1, ABCD
-	sha1msg1	MSG1, MSG0
-
-	/* Rounds 8-11 */
-	movdqu		2*16(DATA_PTR), MSG2
-	pshufb		SHUF_MASK, MSG2
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-		sha1rnds4	$0, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 12-15 */
-	movdqu		3*16(DATA_PTR), MSG3
-	pshufb		SHUF_MASK, MSG3
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$0, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 16-19 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$0, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 20-23 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$1, E1, ABCD
-	sha1msg1	MSG1, MSG0
-	pxor		MSG1, MSG3
-
-	/* Rounds 24-27 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$1, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 28-31 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$1, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 32-35 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$1, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 36-39 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$1, E1, ABCD
-	sha1msg1	MSG1, MSG0
-	pxor		MSG1, MSG3
-
-	/* Rounds 40-43 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$2, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 44-47 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$2, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 48-51 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$2, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 52-55 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$2, E1, ABCD
-	sha1msg1	MSG1, MSG0
-	pxor		MSG1, MSG3
-
-	/* Rounds 56-59 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$2, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 60-63 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$3, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 64-67 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$3, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 68-71 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$3, E1, ABCD
-	pxor		MSG1, MSG3
-
-	/* Rounds 72-75 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$3, E0, ABCD
-
-	/* Rounds 76-79 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-		sha1rnds4	$3, E1, ABCD
+.irp i, 0, 16, 32, 48, 64
+	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3, E0, E1
+	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0, E1, E0
+	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1, E0, E1
+	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2, E1, E0
+.endr
=20
 	/* Add the previous state (before the rounds) to the current state. */
 	sha1nexte	E0_SAVED, E0
 	paddd		ABCD_SAVED, ABCD
=20
--=20
2.50.1