tools/nolibc: Optimise and common up number to ascii functions

[PATCH next] tools/nolibc: Optimise and common up number to ascii functions

Posted by david.laight.linux@gmail.com 4 days ago

From: David Laight <david.laight.linux@gmail.com>

Implement u[64]to[ah]_r() using a common function that uses multiply
by reciprocal to generate the least significant digit first and then
reverses the string.

On 32bit this is five multiplies (with 64bit product) for each output
digit. I think the old utoa_r() always did 36 multiplies and a lot
of subtracts - so this is likely faster even for 32bit values.
Definitely better for 64bit values (especially small ones).

Clearly shifts are faster for base 16, but reversing the output buffer
makes a big difference.

Sharing the code reduces the footprint (unless gcc decides to constant
fold the functions).
Definitely helps vfprintf() where the constants get loaded and a single
call is down.
Also makes it cheap to add octal support to vfprintf for completeness.

Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
 tools/include/nolibc/stdlib.h | 145 ++++++++++++++--------------------
 1 file changed, 58 insertions(+), 87 deletions(-)

diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h
index f184e108ed0a..8a2e86f60ae9 100644
--- a/tools/include/nolibc/stdlib.h
+++ b/tools/include/nolibc/stdlib.h
@@ -188,36 +188,67 @@ void *realloc(void *old_ptr, size_t new_size)
 	return ret;
 }
 
-/* Converts the unsigned long integer <in> to its hex representation into
+/* Converts the unsigned 64bit integer <in> to base <base> ascii into
  * buffer <buffer>, which must be long enough to store the number and the
- * trailing zero (17 bytes for "ffffffffffffffff" or 9 for "ffffffff"). The
- * buffer is filled from the first byte, and the number of characters emitted
- * (not counting the trailing zero) is returned. The function is constructed
- * in a way to optimize the code size and avoid any divide that could add a
- * dependency on large external functions.
+ * trailing zero. The buffer is filled from the first byte, and the number
+ * of characters emitted (not counting the trailing zero) is returned.
+ * The function uses 'multiply be reciprocal' for the divisions and
+ * requires the caller pass the correct compile-time constant.
  */
-static __attribute__((unused))
-int utoh_r(unsigned long in, char *buffer)
+#define __U64TOA_RECIP(base) ((base) & 1 ? ~0ull / (base) : (1ull << 63) / ((base) / 2))
+static __attribute__((unused, noinline))
+int __u64toa_base(uint64_t in, char *buffer, unsigned int base, uint64_t recip)
 {
-	signed char pos = (~0UL > 0xfffffffful) ? 60 : 28;
-	int digits = 0;
-	int dig;
+	unsigned int digits = 0;
+	unsigned int dig;
+	uint64_t q;
+	char *p;
 
+	/* Generate least significant digit first */
 	do {
-		dig = in >> pos;
-		in -= (uint64_t)dig << pos;
-		pos -= 4;
-		if (dig || digits || pos < 0) {
-			if (dig > 9)
-				dig += 'a' - '0' - 10;
-			buffer[digits++] = '0' + dig;
+#if defined(__SIZEOF_INT128__) && !defined(__mips__)
+		q = ((unsigned __int128)in * recip) >> 64;
+#else
+		uint64_t p = (uint32_t)in * (recip >> 32);
+		q = (in >> 32) * (recip >> 32) + (p >> 32);
+		p = (uint32_t)p + (in >> 32) * (uint32_t)recip;
+		q += p >> 32;
+#endif
+		dig = in - q * base;
+		/* Correct for any rounding errors (eg from low*low multiply) */
+		while (dig >= base) {
+			dig -= base;
+			q++;
 		}
-	} while (pos >= 0);
+		if (dig > 9)
+			dig += 'a' - '0' - 10;
+		buffer[digits++] = '0' + dig;
+	} while ((in = q));
 
 	buffer[digits] = 0;
+
+	/* Order reverse to result */
+	for (p = buffer + digits - 1; p > buffer; buffer++, p--) {
+		dig = *buffer;
+		*buffer = *p;
+		*p = dig;
+	}
+
 	return digits;
 }
 
+/* Converts the unsigned long integer <in> to its hex representation into
+ * buffer <buffer>, which must be long enough to store the number and the
+ * trailing zero (17 bytes for "ffffffffffffffff" or 9 for "ffffffff"). The
+ * buffer is filled from the first byte, and the number of characters emitted
+ * (not counting the trailing zero) is returned.
+ */
+static __inline__ __attribute__((unused))
+int utoh_r(unsigned long in, char *buffer)
+{
+	return __u64toa_base(in, buffer, 16, __U64TOA_RECIP(16));
+}
+
 /* converts unsigned long <in> to an hex string using the static itoa_buffer
  * and returns the pointer to that string.
  */
@@ -233,30 +264,11 @@ char *utoh(unsigned long in)
  * trailing zero (21 bytes for 18446744073709551615 in 64-bit, 11 for
  * 4294967295 in 32-bit). The buffer is filled from the first byte, and the
  * number of characters emitted (not counting the trailing zero) is returned.
- * The function is constructed in a way to optimize the code size and avoid
- * any divide that could add a dependency on large external functions.
  */
-static __attribute__((unused))
+static __inline__ __attribute__((unused))
 int utoa_r(unsigned long in, char *buffer)
 {
-	unsigned long lim;
-	int digits = 0;
-	int pos = (~0UL > 0xfffffffful) ? 19 : 9;
-	int dig;
-
-	do {
-		for (dig = 0, lim = 1; dig < pos; dig++)
-			lim *= 10;
-
-		if (digits || in >= lim || !pos) {
-			for (dig = 0; in >= lim; dig++)
-				in -= lim;
-			buffer[digits++] = '0' + dig;
-		}
-	} while (pos--);
-
-	buffer[digits] = 0;
-	return digits;
+	return __u64toa_base(in, buffer, 10, __U64TOA_RECIP(10));
 }
 
 /* Converts the signed long integer <in> to its string representation into
@@ -324,34 +336,12 @@ char *utoa(unsigned long in)
  * buffer <buffer>, which must be long enough to store the number and the
  * trailing zero (17 bytes for "ffffffffffffffff"). The buffer is filled from
  * the first byte, and the number of characters emitted (not counting the
- * trailing zero) is returned. The function is constructed in a way to optimize
- * the code size and avoid any divide that could add a dependency on large
- * external functions.
+ * trailing zero) is returned.
  */
-static __attribute__((unused))
+static __inline__ __attribute__((unused))
 int u64toh_r(uint64_t in, char *buffer)
 {
-	signed char pos = 60;
-	int digits = 0;
-	int dig;
-
-	do {
-		if (sizeof(long) >= 8) {
-			dig = (in >> pos) & 0xF;
-		} else {
-			/* 32-bit platforms: avoid a 64-bit shift */
-			uint32_t d = (pos >= 32) ? (in >> 32) : in;
-			dig = (d >> (pos & 31)) & 0xF;
-		}
-		if (dig > 9)
-			dig += 'a' - '0' - 10;
-		pos -= 4;
-		if (dig || digits || pos < 0)
-			buffer[digits++] = '0' + dig;
-	} while (pos >= 0);
-
-	buffer[digits] = 0;
-	return digits;
+	return __u64toa_base(in, buffer, 16, __U64TOA_RECIP(16));
 }
 
 /* converts uint64_t <in> to an hex string using the static itoa_buffer and
@@ -368,31 +358,12 @@ char *u64toh(uint64_t in)
  * buffer <buffer>, which must be long enough to store the number and the
  * trailing zero (21 bytes for 18446744073709551615). The buffer is filled from
  * the first byte, and the number of characters emitted (not counting the
- * trailing zero) is returned. The function is constructed in a way to optimize
- * the code size and avoid any divide that could add a dependency on large
- * external functions.
+ * trailing zero) is returned.
  */
-static __attribute__((unused))
+static __inline__ __attribute__((unused))
 int u64toa_r(uint64_t in, char *buffer)
 {
-	unsigned long long lim;
-	int digits = 0;
-	int pos = 19; /* start with the highest possible digit */
-	int dig;
-
-	do {
-		for (dig = 0, lim = 1; dig < pos; dig++)
-			lim *= 10;
-
-		if (digits || in >= lim || !pos) {
-			for (dig = 0; in >= lim; dig++)
-				in -= lim;
-			buffer[digits++] = '0' + dig;
-		}
-	} while (pos--);
-
-	buffer[digits] = 0;
-	return digits;
+	return __u64toa_base(in, buffer, 10, __U64TOA_RECIP(10));
 }
 
 /* Converts the signed 64-bit integer <in> to its string representation into
-- 
2.39.5

Re: [PATCH next] tools/nolibc: Optimise and common up number to ascii functions

Posted by Willy Tarreau 9 minutes ago

Hi David,

On Tue, Feb 03, 2026 at 03:13:15PM +0000, david.laight.linux@gmail.com wrote:
> From: David Laight <david.laight.linux@gmail.com>
> 
> Implement u[64]to[ah]_r() using a common function that uses multiply
> by reciprocal to generate the least significant digit first and then
> reverses the string.
> 
> On 32bit this is five multiplies (with 64bit product) for each output
> digit. I think the old utoa_r() always did 36 multiplies and a lot
> of subtracts - so this is likely faster even for 32bit values.
> Definitely better for 64bit values (especially small ones).
> 
> Clearly shifts are faster for base 16, but reversing the output buffer
> makes a big difference.
> 
> Sharing the code reduces the footprint (unless gcc decides to constant
> fold the functions).
> Definitely helps vfprintf() where the constants get loaded and a single
> call is down.
> Also makes it cheap to add octal support to vfprintf for completeness.
> 
> Signed-off-by: David Laight <david.laight.linux@gmail.com>

OK, I had a long series of tests on it, including with older compilers
going back to gcc-4.7 and on various archs. Except for code that would
previously only use utoh(), the new code is slightly smaller in the vast
majority of cases. And this combined with the added flexibility looks
like a good addition. The code is not trivial (as every time we're
dealing with number representation) but it's well documented, so I'm
personally fine with the change.

I'm just having a few comments below:

> -static __attribute__((unused))
> -int utoh_r(unsigned long in, char *buffer)
> +#define __U64TOA_RECIP(base) ((base) & 1 ? ~0ull / (base) : (1ull << 63) / ((base) / 2))

Please rename this macro to have _NOBLIC_ as a prefix.

> +#if defined(__SIZEOF_INT128__) && !defined(__mips__)

Out of curiosity, why !mips ? I tried with -mabi=64 and the function size
dropped from 0x120 to 0xc0 (lost 1/3 of its size).

> +		q = ((unsigned __int128)in * recip) >> 64;
> +#else
(...)

Once the macro is renamed, feel free to add:

Acked-by: Willy Tarreau <w@1wt.eu>

Thanks!
Willy