From nobody Mon Jun  8 04:15:29 2026
Received: from mail-wm1-f41.google.com (mail-wm1-f41.google.com
 [209.85.128.41])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 53F38330B07
	for <linux-kernel@vger.kernel.org>; Sun,  7 Jun 2026 11:25:38 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.85.128.41
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1780831540; cv=none;
 b=HTaI876HkkVlFa4t16wOnepS9lnDeEnvi40cE6Yp/bxYNT3lOFh9Krt+hNqJdIR/m7N/1sK2JT0x+mUxQWw0p+9JweYYXw3+TW8gY0vI/FedRwMkeipaIRCir+EUc1VgTSg0gGnXKVTJKOjsTOYcdVOWVqSm3RHFg2qXYrNmBTU=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1780831540; c=relaxed/simple;
	bh=WK8dNOBYpsLHJqBG9EtuIZvlHcO5fFHa4uh22QQA2Wg=;
	h=From:To:Cc:Subject:Date:Message-ID:MIME-Version;
 b=r1l8hFUsD+kbNQ8v5FtJj9R8tqMZ8maClE5ek9Pu81Zt06mtymgN9o4P2892V77w1573pt01rWESX4w/N21NojNjJL05U1GG4fHuMA2OVJM2Vg8SZ+vuigG8GBbgA8wXmSmXDc/Mfr9IpO91dujP5LlgVvYKjTRbEgmA0Mzt8K8=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com;
 spf=pass smtp.mailfrom=gmail.com;
 dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b=BjQfyJQY; arc=none smtp.client-ip=209.85.128.41
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b="BjQfyJQY"
Received: by mail-wm1-f41.google.com with SMTP id
 5b1f17b1804b1-490c0c92cffso20869075e9.2
        for <linux-kernel@vger.kernel.org>;
 Sun, 07 Jun 2026 04:25:38 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20251104; t=1780831537; x=1781436337;
 darn=vger.kernel.org;
        h=content-transfer-encoding:mime-version:message-id:date:subject:cc
         :to:from:from:to:cc:subject:date:message-id:reply-to;
        bh=m9nxHZo/Ea/9cV2py7u1jKrjrNWsS0AoJHAz+pUd7Ps=;
        b=BjQfyJQYvfOF7r+OjqPJYt5laIp9ccLuAj9tRGbLwuwQtclKIpeUDxfRkM+vfRA5CD
         EhqGJaDs7JV40zw60Urtoj9+bJvkDpnXcmaDq/ZCpVJZiY7Vu//Muyuw371WyadY82uX
         Mmek6DIE0Nky4Jl1zmvWOmKdmAxzvHC85A2V7U7A+S0BQ6bYsF82LZZbTG+OVUY9FxgA
         zpnYMMn7bw410qqbWZthTfQGaRN5nrM+HD1wbvuAzP4WflXLPuAKNBFjIJUi99OD/wUq
         uW3G81EJB6XMMq/9UptG+o9QRhTdk/imirAWf7w7aSkplN/7+L8G5hScBty2I6BW6hMN
         tLfg==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20251104; t=1780831537; x=1781436337;
        h=content-transfer-encoding:mime-version:message-id:date:subject:cc
         :to:from:x-gm-gg:x-gm-message-state:from:to:cc:subject:date
         :message-id:reply-to;
        bh=m9nxHZo/Ea/9cV2py7u1jKrjrNWsS0AoJHAz+pUd7Ps=;
        b=sNe/z+GR7YXSqgV7Icr+PMJE48sWTKcLyb1tXuQJnrnC63E58xN5zX8Tra3jWohZ6e
         icnX7PEKTRdEQzDTNJgO3HXgYhGonYeyOn5ffRqA4Im+F/O0HudKor9dmNOd7xwmJUam
         bCxkMcdLA6LRug5QOBiLtx9qMdn4bnJvGArteRZRDe/kq6O0RcYkDjmCBistMb8csQAV
         h7YD8kfmI6i24xuO0ttwpdgMLQIGWEeQIauadPuQmq/blbpj3BKq+5ifnyk+UsoXj//m
         RwTzolZt7Vdm0H54vxyK18Bf3nXcPurEdanTFooRr0fWi4BA5gh2bB+QdTZJPbgruNCG
         Fbxw==
X-Forwarded-Encrypted: i=1;
 AFNElJ9TtCWKrdfv7dlKZ1LbVPYFikVReog7/9/BtYPI3YOOZfF/qAMngt1kum0ca2aJkwLuANF5A1uX5OekEZk=@vger.kernel.org
X-Gm-Message-State: AOJu0YzqMD59BW3bu+lhVkwZs+o9V7HtSdABK2UYPsAQnVkFYF7y5mc8
	1hgWC11TOxzryJBAS8S/7YAhlmPg5M2baVxsqOn/Lm5JD9yQfZcHQg8f
X-Gm-Gg: Acq92OGJ58YYJODyGImdgBOEGmzXQMFDhMBd2dy3UFkq3HbF6j0YY4U8jumUFRYSQLO
	QjAsn1Kzz1kMw+V3UpEnLRBUQ3Id/RgaKNVX6J76FWyctwiuUS9jQd2J/bRtQ/6/XcO03Ye4ba+
	dLssvEnm648srq3l7lBOh2u1lyK13zFYaZMweFP811H145TNfDcYFhlYk6ctFqoVlIErp6oYKl5
	n9KN0so+XmDv6+vvVTExhdAd1+hyIKsTrC8OYqpXrg1hY4fiAxA0ork0zshHG4CC3O2RNkFCl6Q
	XSAHDrz+VhvyJ1ZZj64V0hPB/pbkAViAY6Sz7hHGdUsnSbtsalAcATDvniDQhRw7Pqz7CCPl2RV
	Hr81uoamYbgn2LzcZ+BBKdqdcASduBOuh9tgXhX0bX+xsiKg26Mfpo9od2pGQSAlbea3KdVmM8u
	KvtlQw7utJL2xvWfMS80f3otZlPSpJpzg74T8uvvF0vgxb0abNy/vG7iFYZzLHGFaJ4JSNGw==
X-Received: by 2002:a05:600c:81c9:b0:490:b724:507d with SMTP id
 5b1f17b1804b1-490c259f6e2mr153245535e9.11.1780831536484;
        Sun, 07 Jun 2026 04:25:36 -0700 (PDT)
Received: from localhost.localdomain
 ([2a02:1210:8e0c:3300:bec7:46ff:fea1:47ad])
        by smtp.gmail.com with ESMTPSA id
 5b1f17b1804b1-490c2d37edbsm211286635e9.2.2026.06.07.04.25.34
        (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
        Sun, 07 Jun 2026 04:25:35 -0700 (PDT)
From: Fabian Blatter <fabianblatter09@gmail.com>
To: lukas@wunner.de,
	ignat@linux.win,
	herbert@gondor.apana.org.au,
	davem@davemloft.net
Cc: stefanb@linux.ibm.com,
	linux-crypto@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	Fabian Blatter <fabianblatter09@gmail.com>
Subject: [PATCH] crypto: ecc - Optimize vli additive operations using compiler
 builtins
Date: Sun,  7 Jun 2026 13:24:35 +0200
Message-ID: <20260607112435.42804-1-fabianblatter09@gmail.com>
X-Mailer: git-send-email 2.54.0
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Replace the software carry flag emulation with compiler builtins.

Even the newest compilers struggle with taking advantage of the
hardware carry flag. Compiler builtins allow the compiler to
much more easily achieve this while still remaining constant-time.

This yields an approximately 6-7% performance improvement
on the ecc_gen_privkey, ecc_make_pub_key and crypto_ecdh_shared_secret
functions on x86_64 on all curve sizes.

Additionally, the code becomes much more readable.

Signed-off-by: Fabian Blatter <fabianblatter09@gmail.com>
---

Hi,

I'd like to expand on the benchmarks, compare the generated assembly,
and clarify some things.


Use of compiler builtins:

This patch uses __builtin_addcll and __builtin_subcll when available,
and otherwise __builtin_uaddll_overflow and __builtin_usubll_overflow.
The latter have existed since ancient gcc versions, so no third
fallback is needed.

I have put the add_carry and sub_borrow inline functions with the
preprocessor logic for builtin selection directly in crypto/ecc.c.
Please let me know if you would like them to be somewhere else.

They do not emit data-dependent branches, and so remain constant-time.


Benchmarks:

All benchmarks were run single-threaded on my AMD 7700X CPU limited to
5.6Ghz. I have measured both nanoseconds and clock cycles, since their
combination can hint at downclocking issues and allows calculation of
the clock speed during the benchmark.

I have omitted the raw output from the benchmarking code, as it far
exceeds the 72-character line limit.

I have calculated the percent differences, included clock speed
calculations and relevant summaries.


Macro benchmarks:

These were run in a virtualized environment using virtme-ng on a
Linux kernel image compiled with default flags.

(The first value is the original time per operation, the second the
patched one. The percentages are relative to the patched value, i.e.
(patched - original) / patched. cc is short for clock cycles.)

Curve keypair generation (ecc_gen_privkey + ecc_make_pub_key):

P256:
 - 646963ns/op -> 600632ns/op =3D -7.71%
 - 2911300cc/op -> 2702854cc/op =3D -7.71%
 - 4.4999Ghz -> 4.5000Ghz =3D no difference

P384:
 - 1239160ns/op -> 1153940ns/op =3D -7.38%
 - 5576250cc/op -> 5192749cc/op =3D -7.38%
 - 4.5000Ghz -> 4.5000Ghz =3D no difference

Shared secret generation (crypto_ecdh_shared_secret):

P256:
 - 320114ns/op -> 297548ns/op =3D -7.58%
 - 1440521cc/op -> 1338972cc/op =3D -7.58%
 - 4.5000Ghz -> 4.5000Ghz =3D no difference

P384:
 - 620768ns/op -> 582560ns/op =3D -6.55%
 - 2793467cc/op -> 2621529cc/op =3D -6.55%
 - 4.5000Ghz -> 4.5000Ghz =3D no difference

The benchmarks clearly indicate a roughly 6-7% performance increase on
the public API functions. It also appears that virtme-ng limited the
clock speed to 4.5Ghz


Micro benchmarks:

Since the vli additive functions only rely on u64 being defined, these
were run without virtualization and with varying compilers and
compiler flags.

The microbenchmarks show much more mixed results, depending
heavily on the compiler and optimization level used.

For instance, with gcc at -O2, the vli_add present in the
patch is actually 25.3% slower than the original one. I have tracked
this down to gcc restoring the carry flag in an unusual way after
each iteration, which lengthens the dependency chain and prevents
the CPU from executing multiple instructions in parallel.

This is quite interesting, since, as far as I know, the kernel is built
with gcc at -O2 by default, yet the macro-level benchmarks still show a
performance increase. The effect seems to be reversed when crypto/ecc.c
is compiled as part of the kernel, or maybe the Linux kernel uses some
additional optimization flags; I am unsure.

However, most of the time, the patched version outperforms the original
one by a wide margin:
 - On clang -O2 or -O3, vli_add and vli_uadd show a 4.074x and 5.384x
   speedup.
 - On gcc, vli_uadd shows a 74% performance increase at O2,=20
   and a 2.07x speedup at O3.

The performance profile of vli_sub and vli_usub is almost identical to
that of vli_add and vli_uadd.


Assembly comparison:

I have put together a piece of code on Compiler explorer, to make sure
it compiles on old gcc versions, view instructions and play around with
compiler settings.

If you would like, you can play around yourself here:
https://godbolt.org/z/1jT5zesz8

When using clang 22.1 at -O3 -march=3Dlunarlake, the difference between
the patched and original version is particularly clear. The patched
version produces this assembly in the unrolled vli_add loop:

mov     rax, qword ptr [rsi + 8*rcx + 16]
adc     rax, qword ptr [rdx + 8*rcx + 16]
mov     qword ptr [rdi + 8*rcx + 16], rax
mov     rax, qword ptr [rsi + 8*rcx + 24]
adc     rax, qword ptr [rdx + 8*rcx + 24]
mov     qword ptr [rdi + 8*rcx + 24], rax
mov     rax, qword ptr [rsi + 8*rcx + 32]
adc     rax, qword ptr [rdx + 8*rcx + 32]
mov     qword ptr [rdi + 8*rcx + 32], rax
mov     rax, qword ptr [rsi + 8*rcx + 40]
adc     rax, qword ptr [rdx + 8*rcx + 40]
mov     qword ptr [rdi + 8*rcx + 40], rax
mov     rax, qword ptr [rsi + 8*rcx + 48]
adc     rax, qword ptr [rdx + 8*rcx + 48]

This is basically optimal for an inner loop. It's pure adc and mov
instructions. The loop counting part is still nowhere near perfect,
and still uses setc instructions. But it is still better than what
the original version produces with the same compiler and flags:

mov     r10, qword ptr [rsi + 8*rcx]
lea     r11, [r10 + rax]
add     r11, qword ptr [rdx + 8*rcx]
xor     ebx, ebx
cmp     r11, r10
setb    bl
cmove   rbx, rax
mov     qword ptr [rdi + 8*rcx], r11
mov     rax, qword ptr [rsi + 8*rcx + 8]
lea     r10, [rax + rbx]
add     r10, qword ptr [rdx + 8*rcx + 8]
xor     r11d, r11d
cmp     r10, rax
setb    r11b
cmove   r11, rbx
mov     qword ptr [rdi + 8*rcx + 8], r10
mov     rax, qword ptr [rsi + 8*rcx + 16]
lea     r10, [rax + r11]
add     r10, qword ptr [rdx + 8*rcx + 16]
xor     ebx, ebx
cmp     r10, rax
setb    bl
cmove   rbx, r11
mov     qword ptr [rdi + 8*rcx + 16], r10
mov     rax, qword ptr [rsi + 8*rcx + 24]
lea     r10, [rax + rbx]
add     r10, qword ptr [rdx + 8*rcx + 24]
xor     r11d, r11d
cmp     r10, rax
setb    r11b
cmove   r11, rbx

This is downright horrendous. That entire block processes only 4
limbs; that's 8 instructions per limb! The add instructions
are also not adc instructions, showing that the carry flag is
being fully emulated. This demonstrates that even the newest
compilers, at the highest optimization level, still cannot
generate hardware carry chains without explicit use of builtins.

I should note that it is not just clang 22.1.0 with -O3
-march=3Dlunarlake that does this. Gcc and clang show this behaviour on
every version I have tested, regardless of target architecture.

I am not very familiar with ARM or RISC-V assembly, but looking at
compiler explorer, the effect clearly persists, and in the case of
RISC-V actually gets much worse.

This affects all architectures across all compilers and compiler
flags.


If you have gotten this far, thank you for reading this and I am looking
forward to any feedback! If you would like any changes to this patch,
I am very happy to send a v2.

 crypto/ecc.c | 98 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 38 deletions(-)

diff --git a/crypto/ecc.c b/crypto/ecc.c
index 43b0def3a225..4f7bb6f424d8 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -279,6 +279,48 @@ static void vli_rshift1(u64 *vli, unsigned int ndigits)
 	}
 }
=20
+#ifdef __has_builtin
+#if __has_builtin(__builtin_addcll)
+#define USE_BUILTIN_ADDC
+#endif
+#endif
+
+/* Computes result =3D left + right + carry_in and updates carry_out */
+static inline void add_carry(u64 left, u64 right, u64 *result, u64 carry_i=
n,
+			     u64 *carry_out)
+{
+#ifdef USE_BUILTIN_ADDC
+	*result =3D __builtin_addcll(left, right, carry_in, carry_out);
+#else
+	u64 sum1, sum2;
+	u64 c1 =3D __builtin_uaddll_overflow(left, right, &sum1);
+	u64 c2 =3D __builtin_uaddll_overflow(sum1, carry_in, &sum2);
+	*result =3D sum2;
+	*carry_out =3D c1 | c2;
+#endif
+}
+
+#ifdef __has_builtin
+#if __has_builtin(__builtin_subcll)
+#define USE_BUILTIN_SUBC
+#endif
+#endif
+
+/* Computes result =3D left - right - borrow_in and updates borrow_out */
+static inline void sub_borrow(u64 left, u64 right, u64 *result, u64 borrow=
_in,
+			      u64 *borrow_out)
+{
+#ifdef USE_BUILTIN_SUBC
+	*result =3D __builtin_subcll(left, right, borrow_in, borrow_out);
+#else
+	u64 diff1, diff2;
+	u64 b1 =3D __builtin_usubll_overflow(left, right, &diff1);
+	u64 b2 =3D __builtin_usubll_overflow(diff1, borrow_in, &diff2);
+	*result =3D diff2;
+	*borrow_out =3D b1 | b2;
+#endif
+}
+
 /* Computes result =3D left + right, returning carry. Can modify in place.=
 */
 static u64 vli_add(u64 *result, const u64 *left, const u64 *right,
 		   unsigned int ndigits)
@@ -286,15 +328,8 @@ static u64 vli_add(u64 *result, const u64 *left, const=
 u64 *right,
 	u64 carry =3D 0;
 	int i;
=20
-	for (i =3D 0; i < ndigits; i++) {
-		u64 sum;
-
-		sum =3D left[i] + right[i] + carry;
-		if (sum !=3D left[i])
-			carry =3D (sum < left[i]);
-
-		result[i] =3D sum;
-	}
+	for (i =3D 0; i < ndigits; i++)
+		add_carry(left[i], right[i], &result[i], carry, &carry);
=20
 	return carry;
 }
@@ -303,40 +338,29 @@ static u64 vli_add(u64 *result, const u64 *left, cons=
t u64 *right,
 static u64 vli_uadd(u64 *result, const u64 *left, u64 right,
 		    unsigned int ndigits)
 {
-	u64 carry =3D right;
+	u64 carry;
 	int i;
=20
-	for (i =3D 0; i < ndigits; i++) {
-		u64 sum;
+	if (ndigits =3D=3D 0)
+		return right;
=20
-		sum =3D left[i] + carry;
-		if (sum !=3D left[i])
-			carry =3D (sum < left[i]);
-		else
-			carry =3D !!carry;
+	carry =3D __builtin_uaddll_overflow(left[0], right, &result[0]);
=20
-		result[i] =3D sum;
-	}
+	for (i =3D 1; i < ndigits; i++)
+		carry =3D __builtin_uaddll_overflow(left[i], carry, &result[i]);
=20
 	return carry;
 }
=20
 /* Computes result =3D left - right, returning borrow. Can modify in place=
. */
 u64 vli_sub(u64 *result, const u64 *left, const u64 *right,
-		   unsigned int ndigits)
+	    unsigned int ndigits)
 {
 	u64 borrow =3D 0;
 	int i;
=20
-	for (i =3D 0; i < ndigits; i++) {
-		u64 diff;
-
-		diff =3D left[i] - right[i] - borrow;
-		if (diff !=3D left[i])
-			borrow =3D (diff > left[i]);
-
-		result[i] =3D diff;
-	}
+	for (i =3D 0; i < ndigits; i++)
+		sub_borrow(left[i], right[i], &result[i], borrow, &borrow);
=20
 	return borrow;
 }
@@ -344,20 +368,18 @@ EXPORT_SYMBOL(vli_sub);
=20
 /* Computes result =3D left - right, returning borrow. Can modify in place=
. */
 static u64 vli_usub(u64 *result, const u64 *left, u64 right,
-	     unsigned int ndigits)
+		    unsigned int ndigits)
 {
-	u64 borrow =3D right;
+	u64 borrow;
 	int i;
=20
-	for (i =3D 0; i < ndigits; i++) {
-		u64 diff;
+	if (ndigits =3D=3D 0)
+		return right;
=20
-		diff =3D left[i] - borrow;
-		if (diff !=3D left[i])
-			borrow =3D (diff > left[i]);
+	borrow =3D __builtin_usubll_overflow(left[0], right, &result[0]);
=20
-		result[i] =3D diff;
-	}
+	for (i =3D 1; i < ndigits; i++)
+		borrow =3D __builtin_usubll_overflow(left[i], borrow, &result[i]);
=20
 	return borrow;
 }
--=20
2.54.0