The existing mul_u64_u64_div_u64() rounds down; a 'rounding up' variant
needs 'divisor - 1' added in between the multiply and divide, so it
cannot easily be done by a caller.
Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
and implement the 'round down' and 'round up' using it.
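For callers the only difference is the choice of addend; a minimal usage
sketch (the variable names below are purely illustrative):

	u64 lo, hi, near;

	lo   = mul_u64_u64_div_u64(ticks, mult, div);         /* (ticks * mult) / div, rounded down */
	hi   = mul_u64_u64_div_u64_roundup(ticks, mult, div); /* (ticks * mult + div - 1) / div */
	near = mul_u64_add_u64_div_u64(ticks, mult, div / 2, div); /* round to nearest */

All three keep the full 128bit intermediate product.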
Update the x86-64 asm to optimise for 'c' being a constant zero.
For architectures that support u128, check for a 64bit product after
the multiply (the check will be cheap).
Leave in the early check for other architectures (mostly 32bit) when
'c' is zero to avoid the multi-part multiply.
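(Why the early check is safe, spelled out as a stand-alone helper purely
for illustration; 'mul_fits_u64' is not a real kernel function:)

	/*
	 * a < 2^(ilog2(a) + 1) and b < 2^(ilog2(b) + 1), so when
	 * ilog2(a) + ilog2(b) <= 62 the product a * b is below 2^64
	 * and fits in a u64.
	 */
	static inline bool mul_fits_u64(u64 a, u64 b)
	{
		return ilog2(a) + ilog2(b) <= 62;
	}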
Note that the cost of the 128bit divide will dwarf the rest of the code.
This function is very slow on everything except x86-64 (very very slow
on 32bit).
Add kernel-doc definitions for all three functions.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
arch/x86/include/asm/div64.h | 19 +++++++++++-----
include/linux/math64.h | 44 +++++++++++++++++++++++++++++++++++-
lib/math/div64.c | 41 ++++++++++++++++++---------------
3 files changed, 79 insertions(+), 25 deletions(-)
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9931e4c7d73f..9322a35f6a39 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -84,21 +84,28 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
* Will generate an #DE when the result doesn't fit u64, could fix with an
* __ex_table[] entry when it becomes an issue.
*/
-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
+static inline u64 mul_u64_add_u64_div_u64(u64 a, u64 mul, u64 add, u64 div)
{
u64 q;
- asm ("mulq %2; divq %3" : "=a" (q)
- : "a" (a), "rm" (mul), "rm" (div)
- : "rdx");
+ if (statically_true(!add)) {
+ asm ("mulq %2; divq %3" : "=a" (q)
+ : "a" (a), "rm" (mul), "rm" (div)
+ : "rdx");
+ } else {
+ asm ("mulq %2; addq %4,%%rax; adcq $0,%%rdx; divq %3"
+ : "=a" (q)
+ : "a" (a), "rm" (mul), "rm" (div), "rm" (add)
+ : "rdx");
+ }
return q;
}
-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
{
- return mul_u64_u64_div_u64(a, mul, div);
+ return mul_u64_add_u64_div_u64(a, mul, 0, div);
}
#define mul_u64_u32_div mul_u64_u32_div
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6aaccc1626ab..e958170e64ab 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -282,7 +282,49 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
-u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+/**
+ * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @c: unsigned 64bit addend
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * add a third value and then divide by a fourth.
+ * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
+ *
+ * Return: (@a * @b + @c) / @d
+ */
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
+
+/**
+ * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide by a third value.
+ * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
+ *
+ * Return: @a * @b / @d
+ */
+#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d)
+
+/**
+ * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide and round up.
+ * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
+ *
+ * Return: (@a * @b + @d - 1) / @d
+ */
+#define mul_u64_u64_div_u64_roundup(a, b, d) \
+ ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); })
/**
* DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 5faa29208bdb..50e025174495 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -183,26 +183,28 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
}
EXPORT_SYMBOL(iter_div_u64_rem);
-#ifndef mul_u64_u64_div_u64
-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+#if !defined(mul_u64_add_u64_div_u64)
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
- if (ilog2(a) + ilog2(b) <= 62)
- return div64_u64(a * b, c);
-
#if defined(__SIZEOF_INT128__)
/* native 64x64=128 bits multiplication */
- u128 prod = (u128)a * b;
+ u128 prod = (u128)a * b + c;
u64 n_lo = prod, n_hi = prod >> 64;
#else
+ if (!c && ilog2(a) + ilog2(b) <= 62)
+ return div64_u64(a * b, d);
+
/* perform a 64x64=128 bits multiplication manually */
u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
u64 x, y, z;
- x = (u64)a_lo * b_lo;
+ /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
+ x = (u64)a_lo * b_lo + (u32)c;
y = (u64)a_lo * b_hi + (u32)(x >> 32);
+ y += (u32)(c >> 32);
z = (u64)a_hi * b_hi + (u32)(y >> 32);
y = (u64)a_hi * b_lo + (u32)y;
z += (u32)(y >> 32);
@@ -212,36 +214,39 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
#endif
- /* make sure c is not zero, trigger exception otherwise */
+ if (!n_hi)
+ return div64_u64(n_lo, d);
+
+ /* make sure d is not zero, trigger exception otherwise */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdiv-by-zero"
- if (unlikely(c == 0))
+ if (unlikely(d == 0))
return 1/0;
#pragma GCC diagnostic pop
- int shift = __builtin_ctzll(c);
+ int shift = __builtin_ctzll(d);
/* try reducing the fraction in case the dividend becomes <= 64 bits */
if ((n_hi >> shift) == 0) {
u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo;
- return div64_u64(n, c >> shift);
+ return div64_u64(n, d >> shift);
/*
* The remainder value if needed would be:
- * res = div64_u64_rem(n, c >> shift, &rem);
+ * res = div64_u64_rem(n, d >> shift, &rem);
* rem = (rem << shift) + (n_lo - (n << shift));
*/
}
- if (n_hi >= c) {
+ if (n_hi >= d) {
/* overflow: result is unrepresentable in a u64 */
return -1;
}
/* Do the full 128 by 64 bits division */
- shift = __builtin_clzll(c);
- c <<= shift;
+ shift = __builtin_clzll(d);
+ d <<= shift;
int p = 64 + shift;
u64 res = 0;
@@ -256,8 +261,8 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
n_hi <<= shift;
n_hi |= n_lo >> (64 - shift);
n_lo <<= shift;
- if (carry || (n_hi >= c)) {
- n_hi -= c;
+ if (carry || (n_hi >= d)) {
+ n_hi -= d;
res |= 1ULL << p;
}
} while (n_hi);
@@ -265,5 +270,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
return res;
}
-EXPORT_SYMBOL(mul_u64_u64_div_u64);
+EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
#endif
--
2.39.5
On Sat, 5 Apr 2025, David Laight wrote:
> The existing mul_u64_u64_div_u64() rounds down, a 'rounding up'
> variant needs 'divisor - 1' adding in between the multiply and
> divide so cannot easily be done by a caller.
>
> Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
> and implement the 'round down' and 'round up' using it.
>
> Update the x86-64 asm to optimise for 'c' being a constant zero.
>
> For architectures that support u128 check for a 64bit product after
> the multiply (will be cheap).
> Leave in the early check for other architectures (mostly 32bit) when
> 'c' is zero to avoid the multi-part multiply.
>
> Note that the cost of the 128bit divide will dwarf the rest of the code.
> This function is very slow on everything except x86-64 (very very slow
> on 32bit).
>
> Add kerndoc definitions for all three functions.
>
> Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Sidenote: The 128-bits division cost is proportional to the number of
bits in the final result. So if the result is 0x0080000000000000 then
the loop will execute only once and exit early.
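(A concrete pair of calls, with arbitrarily chosen operands, both of
which reach the full 128-by-64 division loop:)

	/* quotient is 1ULL << 55: a single set bit, one pass of the loop */
	u64 q1 = mul_u64_u64_div_u64(1ULL << 55, (1ULL << 60) + 1, (1ULL << 60) + 1);

	/* quotient is ~0ULL: 64 set bits, so roughly 64 passes */
	u64 q2 = mul_u64_u64_div_u64(~0ULL, ~0ULL, ~0ULL);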

On Sat, 5 Apr 2025 21:46:25 -0400 (EDT)
Nicolas Pitre <npitre@baylibre.com> wrote:

> On Sat, 5 Apr 2025, David Laight wrote:
> [...]
> Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
>
> Sidenote: The 128-bits division cost is proportional to the number of
> bits in the final result. So if the result is 0x0080000000000000 then
> the loop will execute only once and exit early.

Some performance measurements for the test cases:
 0: ok   50   25   19   19   19   19   19   19   19   19 mul_u64_u64_div_u64
 1: ok    2    0    0    0    0    0    0    0    0    0 mul_u64_u64_div_u64
 2: ok    4    4    4    4    4    4    4    4    4    4 mul_u64_u64_div_u64
 3: ok    4    4    4    4    4    4    4    4    4    4 mul_u64_u64_div_u64
 4: ok    4    4    4    4    4    4    4    4    4    4 mul_u64_u64_div_u64
 5: ok   15    8    4    4    4    4    4    4    4    4 mul_u64_u64_div_u64
 6: ok  275  225  223  223  223  223  223  224  224  223 mul_u64_u64_div_u64
 7: ok    9    6    4    4    4    4    5    5    4    4 mul_u64_u64_div_u64
 8: ok  241  192  187  187  187  187  187  188  187  187 mul_u64_u64_div_u64
 9: ok  212  172  169  169  169  169  169  169  169  169 mul_u64_u64_div_u64
10: ok   12    6    4    4    4    4    4    4    4    4 mul_u64_u64_div_u64
11: ok    9    5    4    4    4    4    4    4    4    4 mul_u64_u64_div_u64
12: ok    6    4    4    4    4    4    4    4    4    4 mul_u64_u64_div_u64
13: ok    6    5    5    4    4    4    4    4    4    4 mul_u64_u64_div_u64
14: ok    4    4    5    5    4    4    4    4    4    5 mul_u64_u64_div_u64
15: ok   18   12    8    8    8    8    8    8    8    8 mul_u64_u64_div_u64
16: ok   18   11    6    6    6    6    6    6    6    6 mul_u64_u64_div_u64
17: ok   22   16   11   11   11   11   11   11   11   11 mul_u64_u64_div_u64
18: ok   25   18    9    9    9    9    9   10    9   10 mul_u64_u64_div_u64
19: ok  272  231  227  227  226  227  227  227  227  226 mul_u64_u64_div_u64
20: ok  198  188  185  185  185  186  185  185  186  186 mul_u64_u64_div_u64
21: ok  207  198  196  196  196  196  196  196  196  196 mul_u64_u64_div_u64
22: ok  201  189  190  189  190  189  190  189  190  189 mul_u64_u64_div_u64
23: ok  224  184  181  181  181  181  180  180  181  181 mul_u64_u64_div_u64
24: ok  238  185  179  179  179  179  179  179  179  179 mul_u64_u64_div_u64
25: ok  208  178  177  177  177  177  177  177  177  177 mul_u64_u64_div_u64
26: ok  170  146  139  139  139  139  139  139  139  139 mul_u64_u64_div_u64
27: ok  256  204  196  196  196  196  196  196  196  196 mul_u64_u64_div_u64
28: ok  227  195  194  195  194  195  194  195  194  195 mul_u64_u64_div_u64

Entry 0 is an extra test and is the test overhead - subtracted from the others.
Each column is clocks for one run of the test, but for this set I'm running
the actual test 16 times and later dividing the clock count by 16.
It doesn't make much difference though, so the cost of the
	mfence; rdpmc; mfence; nop_test; mfence; rdpmc; mfence
really is about 190 clocks (between the rdpmc results).

As soon as you hit a non-trivial case the number of clocks increases
dramatically.

This is on a zen5 in 64bit mode ignoring the u128 path.
(I don't have the packages installed to compile a 32bit binary).

Maybe I can compile it for arm32, it'll need the mfence and rdpmc changing.
But maybe something simple will be ok on a pi-5.

(oh and yes, I didn't need to include autoconf.h)

	David
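(For reference, the measurement bracket described above is roughly the
following; only a sketch, the real harness differs, and it assumes the
performance counter selected by rdpmc(0) counts core clocks:)

	static inline u64 rdpmc(u32 counter)
	{
		u32 lo, hi;

		/* read the counter selected by ECX into EDX:EAX */
		asm volatile ("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
		return ((u64)hi << 32) | lo;
	}

	asm volatile ("mfence" ::: "memory");
	t1 = rdpmc(0);
	asm volatile ("mfence" ::: "memory");
	result = mul_u64_u64_div_u64(a, b, d);	/* or the nop_test for entry 0 */
	asm volatile ("mfence" ::: "memory");
	t2 = rdpmc(0);
	asm volatile ("mfence" ::: "memory");
	clocks = t2 - t1 - overhead;		/* 'overhead' comes from entry 0 */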

On Sun, 6 Apr 2025 10:35:16 +0100
David Laight <david.laight.linux@gmail.com> wrote:

> On Sat, 5 Apr 2025 21:46:25 -0400 (EDT)
> Nicolas Pitre <npitre@baylibre.com> wrote:
> [...]
> Some performance measurements for the test cases:
> [...]
> Entry 0 is an extra test and is the test overhead - subtracted from the others.

Except I'd failed to add the 'if (!a) return' so it was doing all the multiplies.

> It doesn't make much difference though, so the cost of the
>	mfence; rdpmc; mfence; nop_test; mfence; rdpmc; mfence
> really is about 190 clocks (between the rdpmc results).
>
> As soon as you hit a non-trivial case the number of clocks increases
> dramatically.
>
> This is on a zen5 in 64bit mode ignoring the u128 path.
> (I don't have the packages installed to compile a 32bit binary).

I had a brain wave.
If I make the mul_div depend on the result of the first rdpmc and make the
second rdpmc depend on the mul_div all the mfence can be removed
(easily done by + (value & volatile_zero)).
(I need to do the same 'trick' for my 'rep movsb' measurements.)

I then get (for 10 single calls of each mul_div):
 0: ok  109   62   27   26   26   26   26   26   26   26 mul_u64_u64_div_u64
 1: ok  100   48   47   26   25   25   25   25   25   25 mul_u64_u64_div_u64
 2: ok  306   31   31   31   31   31   31   31   31   31 mul_u64_u64_div_u64
 3: ok   32   32   32   32   32   32   32   32   32   32 mul_u64_u64_div_u64
 4: ok  329   31   31   31   31   31   31   31   31   31 mul_u64_u64_div_u64
 5: ok  108   61   59   34   34   34   34   34   34   34 mul_u64_u64_div_u64
 6: ok  892  462  290  245  243  243  243  243  243  243 mul_u64_u64_div_u64
 7: ok   95   80   34   34   34   34   34   34   34   34 mul_u64_u64_div_u64
 8: ok  598  500  217  218  216  214  213  218  216  218 mul_u64_u64_div_u64
 9: ok  532  461  228  186  187  189  189  189  189  189 mul_u64_u64_div_u64
10: ok   57   53   31   31   31   31   31   31   31   31 mul_u64_u64_div_u64
11: ok   71   41   41   27   27   27   27   27   27   27 mul_u64_u64_div_u64
12: ok   54   27   27   27   27   27   27   27   27   27 mul_u64_u64_div_u64
13: ok   60   34   34   34   34   34   34   34   34   34 mul_u64_u64_div_u64
14: ok   34   34   34   34   34   34   34   34   34   34 mul_u64_u64_div_u64
15: ok   74   94   24   24   24   24   24   24   24   24 mul_u64_u64_div_u64
16: ok  124   46   45   20   20   20   20   20   20   20 mul_u64_u64_div_u64
17: ok  140   79   26   24   25   24   25   24   25   24 mul_u64_u64_div_u64
18: ok  144  106   24   23   23   23   23   23   23   23 mul_u64_u64_div_u64
19: ok  569  357  204  203  204  203  204  203  204  203 mul_u64_u64_div_u64
20: ok  263  279  240  208  208  208  208  208  208  208 mul_u64_u64_div_u64
21: ok  351  580  397  216  215  215  215  215  215  215 mul_u64_u64_div_u64
22: ok  293  267  267  208  209  207  208  207  208  207 mul_u64_u64_div_u64
23: ok  536  319  236  236  236  236  236  236  236  236 mul_u64_u64_div_u64
24: ok  984  334  194  194  197  193  193  193  193  193 mul_u64_u64_div_u64
25: ok  630  360  195  199  199  199  199  199  199  199 mul_u64_u64_div_u64
26: ok  558  239  150  149  148  151  149  151  149  151 mul_u64_u64_div_u64
27: ok  997  414  215  219  214  219  214  219  214  219 mul_u64_u64_div_u64
28: ok  679  398  216  216  213  215  217  216  216  213 mul_u64_u64_div_u64

which now includes the cost of the divide when the product is 64bit.
The first few results on each row are affected by things like cache reads
and branch prediction. But the later ones are pretty stable.

	David
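(Sketch of that dependency trick, reusing the rdpmc() helper above;
illustrative only, where 'volatile_zero' is a volatile u64 that always
holds zero but that the compiler cannot assume is zero:)

	static volatile u64 volatile_zero;

	t1 = rdpmc(0);
	/* the inputs now depend on t1, so the mul_div cannot start early */
	result = mul_u64_u64_div_u64(a + (t1 & volatile_zero), b, d);
	/* and the second counter read cannot issue until 'result' is known */
	t2 = rdpmc(0 + (result & volatile_zero));
	clocks = t2 - t1;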

On Sat, 5 Apr 2025, Nicolas Pitre wrote:

> On Sat, 5 Apr 2025, David Laight wrote:
> [...]
>
> Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
>
> Sidenote: The 128-bits division cost is proportional to the number of
> bits in the final result. So if the result is 0x0080000000000000 then
> the loop will execute only once and exit early.

Just to clarify what I said: the 128-bits division cost is proportional
to the number of _set_ bits in the final result.

Nicolas