Replace the bit-by-bit algorithm with one that generates 16 bits
per iteration on 32bit architectures and 32 bits on 64bit ones.
On my Zen 5 this reduces the time for the tests (using the generic
code) from ~3350ns to ~1000ns.
Running the 32bit algorithm on 64bit x86 takes ~1500ns.
It'll be slightly slower on a real 32bit system, mostly due
to register pressure.
The savings for 32bit x86 are much higher (tested in userspace).
The worst case (lots of bits in the quotient) drops from ~900 clocks
to ~130 (pretty much independent of the arguments).
Other 32bit architectures may see better savings.
It is possible to optimise for divisors that span less than
__LONG_WIDTH__/2 bits. However, I suspect they don't happen that often,
and it doesn't remove any of the slow cpu divide instructions, which
dominate the result.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
new patch for v3.
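
As a side note on the userspace testing mentioned in the commit message: a
minimal sketch of that kind of cross-check, assuming the patched
mul_u64_add_u64_div_u64() is compiled into a userspace program (with the
kernel-only bits stubbed out) and using a native __int128 divide as the
reference, might look like the following. It is only illustrative and is not
part of the patch.

/*
 * Sketch only: cross-check the new routine against a native 128-bit divide.
 * Assumes the patched div64.c has been built into this program.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint64_t u64;
typedef unsigned __int128 u128;

u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);   /* from the patch */

static u64 rand64(void)
{
        return (u64)rand() << 40 ^ (u64)rand() << 20 ^ rand();
}

int main(void)
{
        for (long i = 0; i < 10000000; i++) {
                u64 a = rand64(), b = rand64(), c = rand64(), d = rand64();
                u128 n = (u128)a * b + c;

                /* only quotients that fit in 64 bits are defined */
                if (!d || n / d > ~(u64)0)
                        continue;
                if (mul_u64_add_u64_div_u64(a, b, c, d) != (u64)(n / d)) {
                        printf("mismatch: %#llx * %#llx + %#llx / %#llx\n",
                               (unsigned long long)a, (unsigned long long)b,
                               (unsigned long long)c, (unsigned long long)d);
                        return 1;
                }
        }
        return 0;
}
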
lib/math/div64.c | 124 +++++++++++++++++++++++++++++++++--------------
1 file changed, 87 insertions(+), 37 deletions(-)
diff --git a/lib/math/div64.c b/lib/math/div64.c
index fb77fd9d999d..bb318ff2ad87 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -221,11 +221,37 @@ static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
#endif
+#ifndef BITS_PER_ITER
+#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16)
+#endif
+
+#if BITS_PER_ITER == 32
+#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c)
+#define add_u64_long(a, b) ((a) + (b))
+
+#else
+static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c)
+{
+ u64 n_lo = mul_add(a, b, c);
+ u64 n_med = mul_add(a >> 32, b, c >> 32);
+
+ n_med = add_u64_u32(n_med, n_lo >> 32);
+ *p_lo = n_med << 32 | (u32)n_lo;
+ return n_med >> 32;
+}
+
+#define add_u64_long(a, b) add_u64_u32(a, b)
+
+#endif
+
u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
- u64 n_lo, n_hi;
+ unsigned long d_msig, q_digit;
+ unsigned int reps, d_z_hi;
+ u64 quotient, n_lo, n_hi;
+ u32 overflow;
if (WARN_ONCE(!d, "%s: division of (%#llx * %#llx + %#llx) by zero, returning 0",
__func__, a, b, c )) {
/*
* Return 0 (rather than ~(u64)0) because it is less likely to
@@ -243,46 +269,70 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
__func__, a, b, c, n_hi, n_lo, d))
return ~(u64)0;
- int shift = __builtin_ctzll(d);
-
- /* try reducing the fraction in case the dividend becomes <= 64 bits */
- if ((n_hi >> shift) == 0) {
- u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo;
-
- return div64_u64(n, d >> shift);
- /*
- * The remainder value if needed would be:
- * res = div64_u64_rem(n, d >> shift, &rem);
- * rem = (rem << shift) + (n_lo - (n << shift));
- */
+ /* Left align the divisor, shifting the dividend to match */
+ d_z_hi = __builtin_clzll(d);
+ if (d_z_hi) {
+ d <<= d_z_hi;
+ n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi);
+ n_lo <<= d_z_hi;
}
- /* Do the full 128 by 64 bits division */
-
- shift = __builtin_clzll(d);
- d <<= shift;
-
- int p = 64 + shift;
- u64 res = 0;
- bool carry;
+ reps = 64 / BITS_PER_ITER;
+ /* Optimise loop count for small dividends */
+ if (!(u32)(n_hi >> 32)) {
+ reps -= 32 / BITS_PER_ITER;
+ n_hi = n_hi << 32 | n_lo >> 32;
+ n_lo <<= 32;
+ }
+#if BITS_PER_ITER == 16
+ if (!(u32)(n_hi >> 48)) {
+ reps--;
+ n_hi = add_u64_u32(n_hi << 16, n_lo >> 48);
+ n_lo <<= 16;
+ }
+#endif
- do {
- carry = n_hi >> 63;
- shift = carry ? 1 : __builtin_clzll(n_hi);
- if (p < shift)
- break;
- p -= shift;
- n_hi <<= shift;
- n_hi |= n_lo >> (64 - shift);
- n_lo <<= shift;
- if (carry || (n_hi >= d)) {
- n_hi -= d;
- res |= 1ULL << p;
+ /* Invert the dividend so we can use add instead of subtract. */
+ n_lo = ~n_lo;
+ n_hi = ~n_hi;
+
+ /*
+ * Get the most significant BITS_PER_ITER bits of the divisor.
+ * This is used to get a low 'guestimate' of the quotient digit.
+ */
+ d_msig = (d >> (64 - BITS_PER_ITER)) + 1;
+
+ /*
+ * Now do a 'long division' with BITS_PER_ITER bit 'digits'.
+ * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits.
+ * The worst case is dividing ~0 by 0x8000 which requires two subtracts.
+ */
+ quotient = 0;
+ while (reps--) {
+ q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig;
+ /* Shift 'n' left to align with the product q_digit * d */
+ overflow = n_hi >> (64 - BITS_PER_ITER);
+ n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER));
+ n_lo <<= BITS_PER_ITER;
+ /* Add product to negated divisor */
+ overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi);
+ /* Adjust for the q_digit 'guestimate' being low */
+ while (overflow < 0xffffffff >> (32 - BITS_PER_ITER)) {
+ q_digit++;
+ n_hi += d;
+ overflow += n_hi < d;
}
- } while (n_hi);
- /* The remainder value if needed would be n_hi << p */
+ quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit);
+ }
- return res;
+ /*
+ * The above only ensures the remainder doesn't overflow,
+ * it can still be possible to add (aka subtract) another copy
+ * of the divisor.
+ */
+ if ((n_hi + d) > n_hi)
+ quotient++;
+ return quotient;
}
#if !defined(test_mul_u64_add_u64_div_u64)
EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
--
2.39.5
On Sat, 14 Jun 2025 10:53:45 +0100
David Laight <david.laight.linux@gmail.com> wrote:

> Replace the bit by bit algorithm with one that generates 16 bits
> per iteration on 32bit architectures and 32 bits on 64bit ones.

I've spent far too long doing some clock counting exercises on this code.
This is the latest version with some conditional compiles and comments
explaining the various optimisations.
I think the 'best' version is with -DMULDIV_OPT=0xc3

#ifndef MULDIV_OPT
#define MULDIV_OPT 0
#endif

#ifndef BITS_PER_ITER
#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16)
#endif

#define unlikely(x) __builtin_expect((x), 0)
#define likely(x) __builtin_expect(!!(x), 1)

#if __LONG_WIDTH__ >= 64

/* gcc generates sane code for 64bit. */
static unsigned int tzcntll(unsigned long long x)
{
        return __builtin_ctzll(x);
}

static unsigned int lzcntll(unsigned long long x)
{
        return __builtin_clzll(x);
}

#else

/*
 * Assuming that bsf/bsr don't change the output register
 * when the input is zero (should be true now that 486 aren't
 * supported) these simple conditional (and cmov) free functions
 * can be used to count trailing/leading zeros.
 */
static inline unsigned int tzcnt_z(u32 x, unsigned int if_z)
{
        asm("bsfl %1,%0" : "+r" (if_z) : "r" (x));
        return if_z;
}

static inline unsigned int tzcntll(unsigned long long x)
{
        return tzcnt_z(x, 32 + tzcnt_z(x >> 32, 32));
}

static inline unsigned int bsr_z(u32 x, unsigned int if_z)
{
        asm("bsrl %1,%0" : "+r" (if_z) : "r" (x));
        return if_z;
}

static inline unsigned int bsrll(unsigned long long x)
{
        return 32 + bsr_z(x >> 32, bsr_z(x, -1) - 32);
}

static inline unsigned int lzcntll(unsigned long long x)
{
        return 63 - bsrll(x);
}

#endif

/*
 * gcc (but not clang) makes a pigs-breakfast of mixed
 * 32/64 bit maths.
 */
#if !defined(__i386__) || defined(__clang__)
static u64 add_u64_u32(u64 a, u32 b)
{
        return a + b;
}

static inline u64 mul_u32_u32(u32 a, u32 b)
{
        return (u64)a * b;
}

#else
static u64 add_u64_u32(u64 a, u32 b)
{
        u32 hi = a >> 32, lo = a;

        asm ("addl %[b], %[lo]; adcl $0, %[hi]"
                : [lo] "+r" (lo), [hi] "+r" (hi)
                : [b] "rm" (b) );
        return (u64)hi << 32 | lo;
}

static inline u64 mul_u32_u32(u32 a, u32 b)
{
        u32 high, low;

        asm ("mull %[b]" : "=a" (low), "=d" (high)
                : [a] "a" (a), [b] "rm" (b) );
        return low | ((u64)high) << 32;
}
#endif

static inline u64 mul_add(u32 a, u32 b, u32 c)
{
        return add_u64_u32(mul_u32_u32(a, b), c);
}

#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 u128;

static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
{
        /* native 64x64=128 bits multiplication */
        u128 prod = (u128)a * b + c;

        *p_lo = prod;
        return prod >> 64;
}

#else

static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
{
        /* perform a 64x64=128 bits multiplication in 32bit chunks */
        u64 x, y, z;

        /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
        x = mul_add(a, b, c);
        y = mul_add(a, b >> 32, c >> 32);
        y = add_u64_u32(y, x >> 32);
        z = mul_add(a >> 32, b >> 32, y >> 32);
        y = mul_add(a >> 32, b, y);
        *p_lo = (y << 32) + (u32)x;
        return add_u64_u32(z, y >> 32);
}

#endif

#if BITS_PER_ITER == 32
#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c)
#define add_u64_long(a, b) ((a) + (b))

#else
static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c)
{
        u64 n_lo = mul_add(a, b, c);
        u64 n_med = mul_add(a >> 32, b, c >> 32);

        n_med = add_u64_u32(n_med, n_lo >> 32);
        *p_lo = n_med << 32 | (u32)n_lo;
        return n_med >> 32;
}

#define add_u64_long(a, b) add_u64_u32(a, b)

#endif

#if MULDIV_OPT & 0x40
/*
 * If the divisor has BITS_PER_ITER or fewer bits then a simple
 * long division can be done.
 */
#if BITS_PER_ITER == 16
static u64 div_u80_u16(u32 n_hi, u64 n_lo, u32 d)
{
        u64 q = 0;

        if (n_hi) {
                n_hi = n_hi << 16 | (u32)(n_lo >> 48);
                q = (n_hi / d) << 16;
                n_hi = (n_hi % d) << 16 | (u16)(n_lo >> 32);
        } else {
                n_hi = n_lo >> 32;
                if (!n_hi)
                        return (u32)n_lo / d;
        }
        q |= n_hi / d;
        q <<= 32;
        n_hi = (n_hi % d) << 16 | ((u32)n_lo >> 16);
        q |= (n_hi / d) << 16;
        n_hi = (n_hi % d) << 16 | (u16)n_lo;
        q |= n_hi / d;
        return q;
}
#else
static u64 div_u96_u32(u64 n_hi, u64 n_lo, u32 d)
{
        u64 q;

        if (!n_hi)
                return n_lo / d;
        n_hi = n_hi << 32 | n_lo >> 32;
        q = (n_hi / d) << 32;
        n_hi = (n_hi % d) << 32 | (u32)n_lo;
        return q | n_hi / d;
}
#endif
#endif

u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
        unsigned long d_msig, n_long, q_digit;
        unsigned int reps, d_z_hi;
        u64 quotient, n_lo, n_hi;
        u32 overflow;

        n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c);
        if (unlikely(n_hi >= d)) {
                if (!d)                 // Quotient infinity or NaN
                        return 0;
                // Quotient larger than 64 bits
                return ~(u64)0;
        }

#if !(MULDIV_OPT & 0x80)
        // OPT: Small divisors can be optimised here.
        // OPT: But the test is measurable and a lot of the cases get
        // OPT: picked up by later tests - especially 1, 2 and 0x40.
        // OPT: For 32bit a full 64bit divide will also be non-trivial.
        if (unlikely(!n_hi))
                return div64_u64(n_lo, d);
#endif

        d_z_hi = lzcntll(d);

#if MULDIV_OPT & 0x40
        // Optimise for divisors with less than BITS_PER_ITER significant bits.
        // OPT: A much simpler 'long division' can be done.
        // OPT: The test could be reworked to avoid the tzcntll() when d_z_hi
        // OPT: is large enough - but the code starts looking horrid.
        // OPT: This picks up the same divisions as OPT 8, with a faster algorithm.
        u32 d_z_lo = tzcntll(d);

        if (d_z_hi + d_z_lo >= 64 - BITS_PER_ITER) {
                if (d_z_hi < 64 - BITS_PER_ITER) {
                        n_lo = n_lo >> d_z_lo | n_hi << (64 - d_z_lo);
                        n_hi >>= d_z_lo;
                        d >>= d_z_lo;
                }
#if BITS_PER_ITER == 16
                return div_u80_u16(n_hi, n_lo, d);
#else
                return div_u96_u32(n_hi, n_lo, d);
#endif
        }
#endif

        /* Left align the divisor, shifting the dividend to match */
#if MULDIV_OPT & 0x10
        // OPT: Replacing the 'pretty much always taken' branch
        // OPT: with an extra shift (one clock - should be noise)
        // OPT: feels like it ought to be a gain (for 64bit).
        // OPT: Most of the test cases have 64bit divisors - so lose,
        // OPT: but even some with a smaller divisor are hit for a few clocks.
        // OPT: Might be generating a register spill to stack.
        d <<= d_z_hi;
        n_hi = n_hi << d_z_hi | (n_lo >> (63 - d_z_hi) >> 1);
        n_lo <<= d_z_hi;
#else
        if (d_z_hi) {
                d <<= d_z_hi;
                n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi);
                n_lo <<= d_z_hi;
        }
#endif

        reps = 64 / BITS_PER_ITER;
        /* Optimise loop count for small dividends */
#if MULDIV_OPT & 1
        // OPT: Products with lots of leading zeros are almost certainly
        // OPT: very common.
        // OPT: The gain from removing the loop iterations is significant.
        // OPT: Especially on 32bit where two iterations can be removed
        // OPT: with a simple shift and conditional jump.
        if (!(u32)(n_hi >> 32)) {
                reps -= 32 / BITS_PER_ITER;
                n_hi = n_hi << 32 | n_lo >> 32;
                n_lo <<= 32;
        }
#endif
#if MULDIV_OPT & 2 && BITS_PER_ITER == 16
        if (!(u32)(n_hi >> 48)) {
                reps--;
                n_hi = add_u64_u32(n_hi << 16, n_lo >> 48);
                n_lo <<= 16;
        }
#endif

        /*
         * Get the most significant BITS_PER_ITER bits of the divisor.
         * This is used to get a low 'guestimate' of the quotient digit.
         */
        d_msig = (d >> (64 - BITS_PER_ITER));
#if MULDIV_OPT & 8
        // OPT: d_msig only needs rounding up - so can be unchanged if
        // OPT: all its low bits are zero.
        // OPT: However the test itself causes register pressure on x86-32.
        // OPT: The earlier check (0x40) optimises the same cases.
        // OPT: The code it generates is a lot better.
        d_msig += !!(d << BITS_PER_ITER);
#else
        d_msig += 1;
#endif

        /* Invert the dividend so we can use add instead of subtract. */
        n_lo = ~n_lo;
        n_hi = ~n_hi;

        /*
         * Now do a 'long division' with BITS_PER_ITER bit 'digits'.
         * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits.
         * The worst case is dividing ~0 by 0x8000 which requires two subtracts.
         */
        quotient = 0;
        while (reps--) {
                n_long = ~n_hi >> (64 - 2 * BITS_PER_ITER);
#if !(MULDIV_OPT & 0x20)
                // OPT: If the cpu divide instruction has a long latency and other
                // OPT: instructions can execute while the divide is pending then
                // OPT: you want the divide as early as possible.
                // OPT: I've seen delays if it moved below the shifts, but I suspect
                // OPT: the ivy bridge cpu spreads the u-ops between the execution
                // OPT: units so you don't get the full latency to play with.
                // OPT: gcc doesn't put the divide as early as it might, attempting to
                // OPT: do so by hand failed - and I'm not playing with custom asm.
                q_digit = n_long / d_msig;
#endif
                /* Shift 'n' left to align with the product q_digit * d */
                overflow = n_hi >> (64 - BITS_PER_ITER);
                n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER));
                n_lo <<= BITS_PER_ITER;
                quotient <<= BITS_PER_ITER;
#if MULDIV_OPT & 4
                // OPT: This optimises for zero digits.
                // OPT: With the compiler/cpu I'm using today (gcc 13.3 and Sandy bridge)
                // OPT: it needs the divide moved below the conditional.
                // OPT: For x86-64 0x24 and 0x03 are actually pretty similar,
                // OPT: but x86-32 is definitely slower all the time, and the outer
                // OPT: check removes two loop iterations at once.
                if (unlikely(n_long < d_msig)) {
                        // OPT: Without something here the 'unlikely' still generates
                        // OPT: a conditional backwards branch which some cpus will
                        // OPT: statically predict taken.
                        // asm( "nop");
                        continue;
                }
#endif
#if MULDIV_OPT & 0x20
                q_digit = n_long / d_msig;
#endif
                /* Add product to negated divisor */
                overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi);
                /* Adjust for the q_digit 'guestimate' being low */
                while (unlikely(overflow < 0xffffffff >> (32 - BITS_PER_ITER))) {
                        q_digit++;
                        n_hi += d;
                        overflow += n_hi < d;
                }
                quotient = add_u64_long(quotient, q_digit);
        }

        /*
         * The above only ensures the remainder doesn't overflow,
         * it can still be possible to add (aka subtract) another copy
         * of the divisor.
         */
        if ((n_hi + d) > n_hi)
                quotient++;
        return quotient;
}

Some measurements on an ivy bridge.
These are the test vectors from the test module with a few extra values on the
end that pick different paths through this implementation.
The numbers are 'performance counter' deltas for 10 consecutive calls with the
same values.
So the latter values are with the branch predictor 'trained' to the test case.
The first few (larger) values show the cost of mispredicted branches.

Apologies for the very long lines.
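
div_perf.c itself isn't reproduced here. As a rough sketch of the kind of
measurement loop behind the numbers below - assuming a plain TSC read rather
than the performance counter the real harness programs - each test case is
timed along these lines:

/*
 * Sketch only: time 10 back-to-back calls with identical arguments so the
 * later iterations show the branch-predictor 'trained' cost.
 */
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

typedef uint64_t u64;

u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);

static void time_one(u64 a, u64 b, u64 c, u64 d)
{
        u64 q = 0;

        for (int i = 0; i < 10; i++) {
                u64 start = __rdtsc();

                q = mul_u64_add_u64_div_u64(a, b, c, d);
                asm volatile("" : "+r" (q));    /* keep the call live */
                printf(" %4llu", (unsigned long long)(__rdtsc() - start));
        }
        printf("  %llx*%llx+%llx/%llx = %llx\n",
               (unsigned long long)a, (unsigned long long)b,
               (unsigned long long)c, (unsigned long long)d,
               (unsigned long long)q);
}

int main(void)
{
        time_one(0xb, 7, 0, 3);         /* b*7/3, the first test vector (c assumed 0) */
        time_one(~(u64)0, ~(u64)0 - 1, 0, ~(u64)0);     /* 64bit divisor case from the table */
        return 0;
}
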
$ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0xc3 && sudo ./div_perf 0: ok 162 134 78 78 78 78 78 80 80 80 mul_u64_u64_div_u64_new b*7/3 = 19 1: ok 91 91 91 91 91 91 91 91 91 91 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 2: ok 75 77 75 77 77 77 77 77 77 77 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 3: ok 89 91 91 91 91 91 89 90 91 91 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 147 147 128 128 128 128 128 128 128 128 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 5: ok 128 128 128 128 128 128 128 128 128 128 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 121 121 121 121 121 121 121 121 121 121 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 274 234 146 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 177 148 148 149 149 149 149 149 149 149 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 138 90 118 91 91 91 91 92 92 92 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 10: ok 113 114 86 86 84 86 86 84 87 87 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 11: ok 87 88 88 86 88 88 88 88 90 90 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 12: ok 82 86 84 86 83 86 83 86 83 87 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 82 86 84 86 83 86 83 86 83 86 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 189 187 138 132 132 132 131 131 131 131 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 221 175 159 131 131 131 131 131 131 131 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 134 132 134 134 134 135 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 172 134 137 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 182 182 129 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 130 129 130 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 130 129 129 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 130 129 129 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 206 140 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 174 140 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 135 137 137 137 137 137 137 137 137 137 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 134 136 136 136 136 136 136 136 136 136 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 136 134 134 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 139 138 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new 
eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 130 143 95 95 96 96 96 96 96 96 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 169 158 158 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 178 164 144 147 147 147 147 147 147 147 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 163 128 128 128 128 128 128 128 128 128 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 163 184 137 136 136 138 138 138 138 138 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x03 && sudo ./div_perf 0: ok 125 78 78 79 79 79 79 79 79 79 mul_u64_u64_div_u64_new b*7/3 = 19 1: ok 88 89 89 88 89 89 89 89 89 89 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 2: ok 75 76 76 76 76 76 74 76 76 76 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 3: ok 87 89 89 89 89 89 89 88 88 88 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 305 221 148 144 147 147 147 147 147 147 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 5: ok 179 178 141 141 141 141 141 141 141 141 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 148 200 143 145 145 145 145 145 145 145 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 201 186 140 135 135 135 135 135 135 135 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 227 154 145 141 141 141 141 141 141 141 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 111 111 89 89 89 89 89 89 89 89 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 10: ok 149 156 124 90 90 90 90 90 90 90 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 11: ok 91 91 90 90 90 90 90 90 90 90 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 12: ok 197 197 138 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 260 136 136 135 135 135 135 135 135 135 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 186 187 164 130 127 127 127 127 127 127 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 171 172 173 158 160 128 125 127 125 127 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 157 164 129 130 130 130 130 130 130 130 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 191 158 130 132 132 130 130 130 130 130 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 197 214 163 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 196 196 135 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 191 216 176 140 138 138 138 138 138 138 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 232 157 156 145 145 145 145 145 145 145 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 159 192 133 134 134 134 134 134 134 134 
mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 133 134 134 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 134 131 131 131 131 134 131 131 131 131 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 133 130 130 130 130 130 130 130 130 130 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 133 130 130 130 130 130 130 130 130 131 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 133 134 134 134 134 134 134 134 134 133 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 151 93 119 93 93 93 93 93 93 93 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 193 137 134 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 194 151 150 137 137 137 137 137 137 137 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 137 173 172 137 138 138 138 138 138 138 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 160 149 131 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x24 && sudo ./div_perf 0: ok 130 106 79 79 78 78 78 78 81 81 mul_u64_u64_div_u64_new b*7/3 = 19 1: ok 88 92 92 89 92 92 92 92 91 91 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 2: ok 74 78 78 78 78 78 75 79 78 78 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 3: ok 87 92 92 92 92 92 92 92 92 92 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 330 275 181 145 147 148 148 148 148 148 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 5: ok 225 175 141 146 146 146 146 146 146 146 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 187 194 193 194 178 144 148 148 148 148 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 202 189 178 139 140 139 140 139 140 139 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 228 168 150 143 143 143 143 143 143 143 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 112 112 92 89 92 92 92 92 92 87 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 10: ok 153 184 92 93 95 95 95 95 95 95 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 11: ok 158 93 93 93 95 92 95 95 95 95 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 12: ok 206 178 146 139 139 140 139 140 139 140 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 187 146 140 139 140 139 140 139 140 139 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 167 173 136 136 102 97 97 97 97 97 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 166 150 105 98 98 98 97 99 97 99 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 209 197 139 136 136 136 136 136 136 136 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 170 142 170 137 136 135 136 135 136 
135 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 238 197 172 140 140 140 139 141 140 139 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 185 206 139 141 142 142 142 142 142 142 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 207 226 146 142 140 140 140 140 140 140 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 161 153 148 148 148 148 148 148 148 146 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 172 199 141 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 172 137 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 135 138 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 136 137 137 137 137 137 137 137 137 137 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 136 136 136 136 136 136 136 136 136 136 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 139 140 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 132 94 97 96 96 96 96 96 96 96 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 139 140 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 159 141 141 141 141 141 141 141 141 141 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 244 186 154 141 139 141 140 139 141 140 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 165 140 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0xc3 -m32 && sudo ./div_perf 0: ok 336 131 104 104 102 102 103 103 103 105 mul_u64_u64_div_u64_new b*7/3 = 19 1: ok 195 195 171 171 171 171 171 171 171 171 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 2: ok 165 162 162 162 162 162 162 162 162 162 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 3: ok 165 173 173 173 173 173 173 173 173 173 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 220 216 202 206 202 206 202 206 202 206 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 5: ok 203 205 208 205 209 205 209 205 209 205 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 203 203 199 203 199 203 199 203 199 203 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 574 421 251 242 246 254 246 254 246 254 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 370 317 263 262 254 259 258 262 254 259 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 214 199 177 177 177 177 177 177 177 177 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 10: ok 163 150 128 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 11: ok 135 133 135 135 135 135 135 135 135 135 mul_u64_u64_div_u64_new 
ffffffffffffffff*2/c000000000000000 = 2 12: ok 206 208 208 180 183 183 183 183 183 183 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 205 183 183 184 183 183 183 183 183 183 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 331 296 238 229 232 232 232 232 232 232 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 324 311 238 239 239 239 239 239 239 242 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 300 262 265 233 238 234 238 234 238 234 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 295 282 244 244 244 244 244 244 244 244 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 245 247 235 222 221 222 219 222 221 222 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 235 221 222 221 222 219 222 221 222 219 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 220 222 221 222 219 222 219 222 221 222 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 225 221 222 219 221 219 221 219 221 219 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 309 274 250 238 237 244 239 241 237 244 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 284 284 250 247 243 247 247 243 247 247 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 239 239 243 240 239 239 239 239 239 239 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 255 255 255 255 247 243 247 247 243 247 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 327 274 240 242 240 242 240 242 240 242 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 461 313 340 259 257 259 257 259 257 259 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 291 219 180 174 170 173 171 174 171 174 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 280 319 258 257 259 257 259 257 259 257 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 379 352 247 239 220 225 225 229 228 229 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 235 219 221 219 221 219 221 219 221 219 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 305 263 257 257 258 258 263 257 257 257 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x03 -m32 && sudo ./div_perf 0: ok 292 127 129 125 123 128 125 123 125 121 mul_u64_u64_div_u64_new b*7/3 = 19 1: ok 190 196 151 149 151 149 151 149 151 149 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 2: ok 141 139 139 139 139 139 139 139 139 139 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 3: ok 152 149 151 149 151 149 151 149 151 149 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 667 453 276 270 271 271 271 267 274 272 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = 
fffffffe80000000 5: ok 366 319 373 337 278 278 278 278 278 278 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 380 349 268 268 271 265 265 265 265 265 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 340 277 255 251 249 255 251 249 255 251 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 377 302 253 252 256 256 256 256 256 256 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 181 184 157 155 157 155 157 155 157 155 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 10: ok 304 223 142 139 139 141 139 139 139 139 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 11: ok 153 143 148 142 143 142 143 142 143 142 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 12: ok 428 323 292 246 257 248 253 250 248 253 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 289 256 260 257 255 257 255 257 255 257 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 334 246 234 234 227 233 229 233 229 233 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 324 302 273 236 236 236 236 236 236 236 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 269 328 285 259 232 230 236 232 232 236 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 307 329 330 244 247 246 245 245 245 245 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 359 361 324 258 258 258 258 258 258 258 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 347 325 295 258 260 253 251 251 251 251 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 339 312 261 260 255 255 255 255 255 255 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 411 349 333 276 272 272 272 272 272 272 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 297 330 290 266 239 236 238 237 238 237 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 299 311 250 247 250 245 247 250 245 247 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 274 245 238 237 237 237 237 237 237 247 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 247 247 245 247 250 245 247 250 245 247 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 341 354 288 239 242 240 239 242 240 239 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 408 375 288 312 257 260 259 259 259 259 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 289 259 199 198 201 170 170 174 173 173 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 334 257 260 257 260 259 259 259 259 259 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 341 267 244 233 229 233 229 233 229 233 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 323 323 297 297 264 268 267 268 267 268 
mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 284 262 251 251 253 251 251 251 251 251 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x24 -m32 && sudo ./div_perf 0: ok 362 127 125 122 123 122 125 129 126 126 mul_u64_u64_div_u64_new b*7/3 = 19 1: ok 190 175 149 150 154 150 154 150 154 150 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 2: ok 144 139 139 139 139 139 139 139 139 139 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 3: ok 146 150 154 154 150 154 150 154 150 154 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 747 554 319 318 316 319 318 328 314 324 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 5: ok 426 315 312 315 315 315 315 315 315 315 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 352 391 317 316 323 327 323 327 323 324 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 369 328 298 292 298 292 298 292 298 292 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 436 348 298 299 298 300 307 297 297 301 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 180 183 151 151 151 151 151 151 151 153 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 10: ok 286 251 188 169 174 170 178 170 178 170 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 11: ok 230 177 172 177 172 177 172 177 172 177 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 12: ok 494 412 371 331 296 298 305 290 296 298 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 330 300 304 305 290 305 294 302 290 296 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 284 241 175 172 176 175 175 172 176 175 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 283 263 171 176 175 175 175 175 175 175 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 346 247 258 208 202 205 208 202 205 208 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 242 272 208 213 209 213 209 213 209 213 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 494 337 306 309 309 309 309 309 309 309 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 392 394 329 305 299 302 294 302 292 305 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 372 307 306 310 308 310 308 310 308 310 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 525 388 310 320 314 313 314 315 312 315 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 400 293 289 354 284 283 284 284 284 284 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 320 324 289 290 289 289 290 289 289 290 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 279 289 290 285 285 285 285 285 285 285 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 288 290 289 289 290 289 289 290 289 289 
mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 361 372 351 325 288 293 286 293 294 293 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 483 349 302 302 305 300 307 304 307 304 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 339 328 234 233 213 214 213 208 215 208 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 406 300 303 300 307 304 307 304 307 304 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 404 421 335 268 265 271 267 271 267 272 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 484 350 310 309 306 309 309 306 309 309 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 368 306 301 307 304 307 304 307 304 307 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 David
On Wed, 9 Jul 2025 15:24:20 +0100 David Laight <david.laight.linux@gmail.com> wrote: > On Sat, 14 Jun 2025 10:53:45 +0100 > David Laight <david.laight.linux@gmail.com> wrote: > > > Replace the bit by bit algorithm with one that generates 16 bits > > per iteration on 32bit architectures and 32 bits on 64bit ones. > > I've spent far too long doing some clock counting exercises on this code. > This is the latest version with some conditional compiles and comments > explaining the various optimisation. > I think the 'best' version is with -DMULDIV_OPT=0xc3 > >.... > Some measurements on an ivy bridge. > These are the test vectors from the test module with a few extra values on the > end that pick different paths through this implementatoin. > The numbers are 'performance counter' deltas for 10 consecutive calls with the > same values. > So the latter values are with the branch predictor 'trained' to the test case. > The first few (larger) values show the cost of mispredicted branches. I should have included the values for the sames tests with the existing code. Added here, but this is includes the change to test for overflow and divide by zero after the multiply (no ilog2 calls - which make the 32bit code worse). About the only cases where the old code it better are when the quotient only has two bits set. > Apologies for the very long lines. > > $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0xc3 && sudo ./div_perf > 0: ok 162 134 78 78 78 78 78 80 80 80 mul_u64_u64_div_u64_new b*7/3 = 19 > 1: ok 91 91 91 91 91 91 91 91 91 91 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 > 2: ok 75 77 75 77 77 77 77 77 77 77 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 > 3: ok 89 91 91 91 91 91 89 90 91 91 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 > 4: ok 147 147 128 128 128 128 128 128 128 128 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 > 5: ok 128 128 128 128 128 128 128 128 128 128 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab > 6: ok 121 121 121 121 121 121 121 121 121 121 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 > 7: ok 274 234 146 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff > 8: ok 177 148 148 149 149 149 149 149 149 149 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 > 9: ok 138 90 118 91 91 91 91 92 92 92 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 > 10: ok 113 114 86 86 84 86 86 84 87 87 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 > 11: ok 87 88 88 86 88 88 88 88 90 90 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 > 12: ok 82 86 84 86 83 86 83 86 83 87 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 > 13: ok 82 86 84 86 83 86 83 86 83 86 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 > 14: ok 189 187 138 132 132 132 131 131 131 131 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 > 15: ok 221 175 159 131 131 131 131 131 131 131 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 > 16: ok 134 132 134 134 134 135 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 > 17: ok 172 134 137 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new 
ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 > 18: ok 182 182 129 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 > 19: ok 130 129 130 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca > 20: ok 130 129 129 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b > 21: ok 130 129 129 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 > 22: ok 206 140 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe > 23: ok 174 140 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 > 24: ok 135 137 137 137 137 137 137 137 137 137 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 > 25: ok 134 136 136 136 136 136 136 136 136 136 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f > 26: ok 136 134 134 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 > 27: ok 139 138 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 28: ok 130 143 95 95 96 96 96 96 96 96 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e > 29: ok 169 158 158 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 30: ok 178 164 144 147 147 147 147 147 147 147 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef > 31: ok 163 128 128 128 128 128 128 128 128 128 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d > 32: ok 163 184 137 136 136 138 138 138 138 138 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > > $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x03 && sudo ./div_perf > 0: ok 125 78 78 79 79 79 79 79 79 79 mul_u64_u64_div_u64_new b*7/3 = 19 > 1: ok 88 89 89 88 89 89 89 89 89 89 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 > 2: ok 75 76 76 76 76 76 74 76 76 76 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 > 3: ok 87 89 89 89 89 89 89 88 88 88 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 > 4: ok 305 221 148 144 147 147 147 147 147 147 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 > 5: ok 179 178 141 141 141 141 141 141 141 141 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab > 6: ok 148 200 143 145 145 145 145 145 145 145 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 > 7: ok 201 186 140 135 135 135 135 135 135 135 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff > 8: ok 227 154 145 141 141 141 141 141 141 141 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 > 9: ok 111 111 89 89 89 89 89 89 89 89 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 > 10: ok 149 156 124 90 90 90 90 90 90 90 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 > 11: ok 91 91 90 90 90 90 90 90 90 90 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 
2 > 12: ok 197 197 138 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 > 13: ok 260 136 136 135 135 135 135 135 135 135 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 > 14: ok 186 187 164 130 127 127 127 127 127 127 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 > 15: ok 171 172 173 158 160 128 125 127 125 127 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 > 16: ok 157 164 129 130 130 130 130 130 130 130 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 > 17: ok 191 158 130 132 132 130 130 130 130 130 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 > 18: ok 197 214 163 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 > 19: ok 196 196 135 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca > 20: ok 191 216 176 140 138 138 138 138 138 138 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b > 21: ok 232 157 156 145 145 145 145 145 145 145 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 > 22: ok 159 192 133 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe > 23: ok 133 134 134 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 > 24: ok 134 131 131 131 131 134 131 131 131 131 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 > 25: ok 133 130 130 130 130 130 130 130 130 130 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f > 26: ok 133 130 130 130 130 130 130 130 130 131 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 > 27: ok 133 134 134 134 134 134 134 134 134 133 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 28: ok 151 93 119 93 93 93 93 93 93 93 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e > 29: ok 193 137 134 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 30: ok 194 151 150 137 137 137 137 137 137 137 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef > 31: ok 137 173 172 137 138 138 138 138 138 138 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d > 32: ok 160 149 131 134 134 134 134 134 134 134 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > > $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x24 && sudo ./div_perf > 0: ok 130 106 79 79 78 78 78 78 81 81 mul_u64_u64_div_u64_new b*7/3 = 19 > 1: ok 88 92 92 89 92 92 92 92 91 91 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 > 2: ok 74 78 78 78 78 78 75 79 78 78 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 > 3: ok 87 92 92 92 92 92 92 92 92 92 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 > 4: ok 330 275 181 145 147 148 148 148 148 148 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 > 5: ok 225 175 141 146 146 
146 146 146 146 146 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab > 6: ok 187 194 193 194 178 144 148 148 148 148 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 > 7: ok 202 189 178 139 140 139 140 139 140 139 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff > 8: ok 228 168 150 143 143 143 143 143 143 143 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 > 9: ok 112 112 92 89 92 92 92 92 92 87 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 > 10: ok 153 184 92 93 95 95 95 95 95 95 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 > 11: ok 158 93 93 93 95 92 95 95 95 95 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 > 12: ok 206 178 146 139 139 140 139 140 139 140 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 > 13: ok 187 146 140 139 140 139 140 139 140 139 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 > 14: ok 167 173 136 136 102 97 97 97 97 97 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 > 15: ok 166 150 105 98 98 98 97 99 97 99 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 > 16: ok 209 197 139 136 136 136 136 136 136 136 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 > 17: ok 170 142 170 137 136 135 136 135 136 135 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 > 18: ok 238 197 172 140 140 140 139 141 140 139 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 > 19: ok 185 206 139 141 142 142 142 142 142 142 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca > 20: ok 207 226 146 142 140 140 140 140 140 140 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b > 21: ok 161 153 148 148 148 148 148 148 148 146 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 > 22: ok 172 199 141 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe > 23: ok 172 137 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 > 24: ok 135 138 138 138 138 138 138 138 138 138 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 > 25: ok 136 137 137 137 137 137 137 137 137 137 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f > 26: ok 136 136 136 136 136 136 136 136 136 136 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 > 27: ok 139 140 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 28: ok 132 94 97 96 96 96 96 96 96 96 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e > 29: ok 139 140 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 30: ok 159 141 141 141 141 141 141 141 141 141 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef > 31: ok 244 186 154 141 139 141 140 139 141 140 mul_u64_u64_div_u64_new 
eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d > 32: ok 165 140 140 140 140 140 140 140 140 140 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 Existing code, 64bit x86 $ cc -O2 -o div_perf div_perf.c && sudo ./div_perf 0: ok 171 117 88 89 88 87 87 87 90 88 mul_u64_u64_div_u64 b*7/3 = 19 1: ok 98 97 97 97 101 101 101 101 101 101 mul_u64_u64_div_u64 ffff0000*ffff0000/f = 1110eeef00000000 2: ok 85 85 84 84 84 87 87 87 87 87 mul_u64_u64_div_u64 ffffffff*ffffffff/1 = fffffffe00000001 3: ok 99 101 101 101 101 99 101 101 101 101 mul_u64_u64_div_u64 ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 162 145 90 90 90 90 90 90 90 90 mul_u64_u64_div_u64 1ffffffff*ffffffff/2 = fffffffe80000000 5: ok 809 675 582 462 450 446 442 446 446 450 mul_u64_u64_div_u64 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 136 90 90 90 90 90 90 90 90 90 mul_u64_u64_div_u64 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 559 449 391 370 374 370 394 377 375 384 mul_u64_u64_div_u64 ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 587 515 386 333 334 334 334 334 334 334 mul_u64_u64_div_u64 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 124 123 99 99 101 101 101 101 101 101 mul_u64_u64_div_u64 7fffffffffffffff*2/3 = 5555555555555554 10: ok 143 119 93 90 90 90 90 90 90 90 mul_u64_u64_div_u64 ffffffffffffffff*2/8000000000000000 = 3 11: ok 136 93 93 95 93 93 93 93 93 93 mul_u64_u64_div_u64 ffffffffffffffff*2/c000000000000000 = 2 12: ok 88 90 90 90 90 90 93 90 90 90 mul_u64_u64_div_u64 ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 88 90 90 90 90 90 90 90 90 90 mul_u64_u64_div_u64 ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 138 144 102 122 101 71 73 74 74 74 mul_u64_u64_div_u64 ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 101 98 95 98 93 66 69 69 69 69 mul_u64_u64_div_u64 fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 207 169 124 99 76 75 76 76 76 76 mul_u64_u64_div_u64 ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 177 143 107 108 76 74 74 74 74 74 mul_u64_u64_div_u64 ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 565 444 412 399 401 413 399 412 392 415 mul_u64_u64_div_u64 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 411 447 435 392 417 398 422 394 402 394 mul_u64_u64_div_u64 ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 467 452 450 427 405 428 418 413 421 405 mul_u64_u64_div_u64 ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 443 395 398 417 405 418 398 404 404 404 mul_u64_u64_div_u64 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 441 400 363 361 347 348 345 345 345 343 mul_u64_u64_div_u64 ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 895 811 423 348 352 352 352 352 352 352 mul_u64_u64_div_u64 e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 928 638 445 339 344 344 344 344 344 344 mul_u64_u64_div_u64 f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 606 457 277 279 276 276 276 276 276 276 mul_u64_u64_div_u64 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 955 683 415 377 377 377 377 377 377 377 mul_u64_u64_div_u64 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 700 599 
375 382 382 382 382 382 382 382 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 406 346 238 238 217 214 214 214 214 214 mul_u64_u64_div_u64 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 486 380 378 382 382 382 382 382 382 382 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 538 407 297 262 244 244 244 244 244 244 mul_u64_u64_div_u64 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 564 539 446 451 417 418 424 425 417 425 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 416 382 387 382 382 382 382 382 382 382 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > > $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0xc3 -m32 && sudo ./div_perf > 0: ok 336 131 104 104 102 102 103 103 103 105 mul_u64_u64_div_u64_new b*7/3 = 19 > 1: ok 195 195 171 171 171 171 171 171 171 171 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 > 2: ok 165 162 162 162 162 162 162 162 162 162 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 > 3: ok 165 173 173 173 173 173 173 173 173 173 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 > 4: ok 220 216 202 206 202 206 202 206 202 206 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 > 5: ok 203 205 208 205 209 205 209 205 209 205 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab > 6: ok 203 203 199 203 199 203 199 203 199 203 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 > 7: ok 574 421 251 242 246 254 246 254 246 254 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff > 8: ok 370 317 263 262 254 259 258 262 254 259 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 > 9: ok 214 199 177 177 177 177 177 177 177 177 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 > 10: ok 163 150 128 129 129 129 129 129 129 129 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 > 11: ok 135 133 135 135 135 135 135 135 135 135 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 > 12: ok 206 208 208 180 183 183 183 183 183 183 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 > 13: ok 205 183 183 184 183 183 183 183 183 183 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 > 14: ok 331 296 238 229 232 232 232 232 232 232 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 > 15: ok 324 311 238 239 239 239 239 239 239 242 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 > 16: ok 300 262 265 233 238 234 238 234 238 234 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 > 17: ok 295 282 244 244 244 244 244 244 244 244 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 > 18: ok 245 247 235 222 221 222 219 222 221 222 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 > 19: ok 235 221 222 221 222 219 222 221 222 219 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca > 20: ok 220 222 221 222 219 222 219 222 221 222 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b > 21: ok 225 221 222 219 221 219 221 
219 221 219 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 > 22: ok 309 274 250 238 237 244 239 241 237 244 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe > 23: ok 284 284 250 247 243 247 247 243 247 247 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 > 24: ok 239 239 243 240 239 239 239 239 239 239 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 > 25: ok 255 255 255 255 247 243 247 247 243 247 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f > 26: ok 327 274 240 242 240 242 240 242 240 242 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 > 27: ok 461 313 340 259 257 259 257 259 257 259 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 28: ok 291 219 180 174 170 173 171 174 171 174 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e > 29: ok 280 319 258 257 259 257 259 257 259 257 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 30: ok 379 352 247 239 220 225 225 229 228 229 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef > 31: ok 235 219 221 219 221 219 221 219 221 219 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d > 32: ok 305 263 257 257 258 258 263 257 257 257 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > > $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x03 -m32 && sudo ./div_perf > 0: ok 292 127 129 125 123 128 125 123 125 121 mul_u64_u64_div_u64_new b*7/3 = 19 > 1: ok 190 196 151 149 151 149 151 149 151 149 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 > 2: ok 141 139 139 139 139 139 139 139 139 139 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 > 3: ok 152 149 151 149 151 149 151 149 151 149 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 > 4: ok 667 453 276 270 271 271 271 267 274 272 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 > 5: ok 366 319 373 337 278 278 278 278 278 278 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab > 6: ok 380 349 268 268 271 265 265 265 265 265 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 > 7: ok 340 277 255 251 249 255 251 249 255 251 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff > 8: ok 377 302 253 252 256 256 256 256 256 256 mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 > 9: ok 181 184 157 155 157 155 157 155 157 155 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 > 10: ok 304 223 142 139 139 141 139 139 139 139 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 > 11: ok 153 143 148 142 143 142 143 142 143 142 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 > 12: ok 428 323 292 246 257 248 253 250 248 253 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 > 13: ok 289 256 260 257 255 257 255 257 255 257 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 > 14: ok 334 246 234 234 227 233 229 233 229 233 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 > 15: ok 324 302 273 236 
236 236 236 236 236 236 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 > 16: ok 269 328 285 259 232 230 236 232 232 236 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 > 17: ok 307 329 330 244 247 246 245 245 245 245 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 > 18: ok 359 361 324 258 258 258 258 258 258 258 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 > 19: ok 347 325 295 258 260 253 251 251 251 251 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca > 20: ok 339 312 261 260 255 255 255 255 255 255 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b > 21: ok 411 349 333 276 272 272 272 272 272 272 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 > 22: ok 297 330 290 266 239 236 238 237 238 237 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe > 23: ok 299 311 250 247 250 245 247 250 245 247 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 > 24: ok 274 245 238 237 237 237 237 237 237 247 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 > 25: ok 247 247 245 247 250 245 247 250 245 247 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f > 26: ok 341 354 288 239 242 240 239 242 240 239 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 > 27: ok 408 375 288 312 257 260 259 259 259 259 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 28: ok 289 259 199 198 201 170 170 174 173 173 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e > 29: ok 334 257 260 257 260 259 259 259 259 259 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 30: ok 341 267 244 233 229 233 229 233 229 233 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef > 31: ok 323 323 297 297 264 268 267 268 267 268 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d > 32: ok 284 262 251 251 253 251 251 251 251 251 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > > $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x24 -m32 && sudo ./div_perf > 0: ok 362 127 125 122 123 122 125 129 126 126 mul_u64_u64_div_u64_new b*7/3 = 19 > 1: ok 190 175 149 150 154 150 154 150 154 150 mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 > 2: ok 144 139 139 139 139 139 139 139 139 139 mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 > 3: ok 146 150 154 154 150 154 150 154 150 154 mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 > 4: ok 747 554 319 318 316 319 318 328 314 324 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 > 5: ok 426 315 312 315 315 315 315 315 315 315 mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab > 6: ok 352 391 317 316 323 327 323 327 323 324 mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 > 7: ok 369 328 298 292 298 292 298 292 298 292 mul_u64_u64_div_u64_new ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff > 8: ok 436 348 298 299 298 300 307 297 297 301 
mul_u64_u64_div_u64_new 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 > 9: ok 180 183 151 151 151 151 151 151 151 153 mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 > 10: ok 286 251 188 169 174 170 178 170 178 170 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 > 11: ok 230 177 172 177 172 177 172 177 172 177 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 > 12: ok 494 412 371 331 296 298 305 290 296 298 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 > 13: ok 330 300 304 305 290 305 294 302 290 296 mul_u64_u64_div_u64_new ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 > 14: ok 284 241 175 172 176 175 175 172 176 175 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 > 15: ok 283 263 171 176 175 175 175 175 175 175 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 > 16: ok 346 247 258 208 202 205 208 202 205 208 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 > 17: ok 242 272 208 213 209 213 209 213 209 213 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 > 18: ok 494 337 306 309 309 309 309 309 309 309 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 > 19: ok 392 394 329 305 299 302 294 302 292 305 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca > 20: ok 372 307 306 310 308 310 308 310 308 310 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b > 21: ok 525 388 310 320 314 313 314 315 312 315 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 > 22: ok 400 293 289 354 284 283 284 284 284 284 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe > 23: ok 320 324 289 290 289 289 290 289 289 290 mul_u64_u64_div_u64_new e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 > 24: ok 279 289 290 285 285 285 285 285 285 285 mul_u64_u64_div_u64_new f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 > 25: ok 288 290 289 289 290 289 289 290 289 289 mul_u64_u64_div_u64_new 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f > 26: ok 361 372 351 325 288 293 286 293 294 293 mul_u64_u64_div_u64_new 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 > 27: ok 483 349 302 302 305 300 307 304 307 304 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 28: ok 339 328 234 233 213 214 213 208 215 208 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e > 29: ok 406 300 303 300 307 304 307 304 307 304 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > 30: ok 404 421 335 268 265 271 267 271 267 272 mul_u64_u64_div_u64_new 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef > 31: ok 484 350 310 309 306 309 309 306 309 309 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d > 32: ok 368 306 301 307 304 307 304 307 304 307 mul_u64_u64_div_u64_new eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 > > David # old code x86 32bit $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0x03 -m32 && sudo ./div_perf 0: ok 450 149 114 114 113 115 119 114 114 115 
mul_u64_u64_div_u64 b*7/3 = 19 1: ok 169 166 141 136 140 136 140 136 140 136 mul_u64_u64_div_u64 ffff0000*ffff0000/f = 1110eeef00000000 2: ok 136 136 133 134 130 134 130 134 130 134 mul_u64_u64_div_u64 ffffffff*ffffffff/1 = fffffffe00000001 3: ok 139 138 140 136 140 136 140 136 138 136 mul_u64_u64_div_u64 ffffffff*ffffffff/2 = 7fffffff00000000 4: ok 258 245 161 159 161 159 161 159 161 159 mul_u64_u64_div_u64 1ffffffff*ffffffff/2 = fffffffe80000000 5: ok 1806 1474 1210 1148 1204 1146 1147 1149 1147 1149 mul_u64_u64_div_u64 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab 6: ok 243 192 192 161 162 162 162 162 162 162 mul_u64_u64_div_u64 1ffffffff*1ffffffff/4 = ffffffff00000000 7: ok 965 907 902 833 835 842 858 846 806 805 mul_u64_u64_div_u64 ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff 8: ok 1171 962 860 922 860 860 860 860 860 860 mul_u64_u64_div_u64 3333333333333333*3333333333333333/5555555555555555 = 1eb851eb851eb851 9: ok 169 176 147 146 142 146 142 146 141 146 mul_u64_u64_div_u64 7fffffffffffffff*2/3 = 5555555555555554 10: ok 205 198 142 136 141 139 141 139 141 139 mul_u64_u64_div_u64 ffffffffffffffff*2/8000000000000000 = 3 11: ok 145 143 143 143 143 143 143 144 143 144 mul_u64_u64_div_u64 ffffffffffffffff*2/c000000000000000 = 2 12: ok 186 188 188 163 161 163 161 163 161 163 mul_u64_u64_div_u64 ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 13: ok 158 160 156 156 156 156 156 156 156 156 mul_u64_u64_div_u64 ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 14: ok 271 261 166 139 138 138 138 138 138 138 mul_u64_u64_div_u64 ffffffffffffffff*8000000000000001/ffffffffffffffff = 8000000000000001 15: ok 242 190 170 171 155 127 124 125 124 125 mul_u64_u64_div_u64 fffffffffffffffe*8000000000000001/ffffffffffffffff = 8000000000000000 16: ok 210 175 176 176 215 146 149 148 149 149 mul_u64_u64_div_u64 ffffffffffffffff*8000000000000001/fffffffffffffffe = 8000000000000001 17: ok 235 224 188 139 140 136 139 140 136 139 mul_u64_u64_div_u64 ffffffffffffffff*8000000000000001/fffffffffffffffd = 8000000000000002 18: ok 1124 1035 960 960 960 960 960 960 960 960 mul_u64_u64_div_u64 7fffffffffffffff*ffffffffffffffff/c000000000000000 = aaaaaaaaaaaaaaa8 19: ok 1106 1092 1028 1027 1010 1009 1011 1010 1010 1010 mul_u64_u64_div_u64 ffffffffffffffff*7fffffffffffffff/a000000000000000 = ccccccccccccccca 20: ok 1123 1015 1017 1017 1003 1002 1002 1002 1002 1002 mul_u64_u64_div_u64 ffffffffffffffff*7fffffffffffffff/9000000000000000 = e38e38e38e38e38b 21: ok 997 973 974 975 974 975 974 975 974 975 mul_u64_u64_div_u64 7fffffffffffffff*7fffffffffffffff/5000000000000000 = ccccccccccccccc9 22: ok 892 834 808 802 832 786 780 784 783 773 mul_u64_u64_div_u64 ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = fffffffffffffffe 23: ok 1495 1162 1028 896 898 898 898 898 898 898 mul_u64_u64_div_u64 e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = 78f8bf8cc86c6e18 24: ok 1493 1220 962 880 880 880 880 880 880 880 mul_u64_u64_div_u64 f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = 42687f79d8998d35 25: ok 1079 960 830 709 711 708 708 708 708 708 mul_u64_u64_div_u64 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f 26: ok 1551 1257 1043 956 957 957 957 957 957 957 mul_u64_u64_div_u64 374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 27: ok 1473 1193 995 995 995 995 995 995 995 995 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 28: ok 797 735 579 535 533 534 534 534 534 534 
mul_u64_u64_div_u64 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e 29: ok 1122 993 995 995 995 995 995 995 995 995 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 30: ok 931 789 674 604 605 606 605 606 605 606 mul_u64_u64_div_u64 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef 31: ok 1536 1247 1068 1034 1034 1034 1034 1094 1032 1035 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = dc5f7e8b334db07d 32: ok 1068 1019 995 995 995 995 995 995 995 995 mul_u64_u64_div_u64 eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = dc5f5cc9e270d216 David
On Fri, 11 Jul 2025, David Laight wrote:

> On Wed, 9 Jul 2025 15:24:20 +0100
> David Laight <david.laight.linux@gmail.com> wrote:
>
> > On Sat, 14 Jun 2025 10:53:45 +0100
> > David Laight <david.laight.linux@gmail.com> wrote:
> >
> > > Replace the bit by bit algorithm with one that generates 16 bits
> > > per iteration on 32bit architectures and 32 bits on 64bit ones.
> >
> > I've spent far too long doing some clock counting exercises on this code.
> > This is the latest version with some conditional compiles and comments
> > explaining the various optimisations.
> > I think the 'best' version is with -DMULDIV_OPT=0xc3
> >
> >....
> > Some measurements on an ivy bridge.
> > These are the test vectors from the test module with a few extra values on the
> > end that pick different paths through this implementation.
> > The numbers are 'performance counter' deltas for 10 consecutive calls with the
> > same values.
> > So the latter values are with the branch predictor 'trained' to the test case.
> > The first few (larger) values show the cost of mispredicted branches.
>
> I should have included the values for the same tests with the existing code.
> Added here, but this includes the change to test for overflow and divide
> by zero after the multiply (no ilog2 calls - which makes the 32bit code worse).
> About the only cases where the old code is better are when the quotient
> only has two bits set.

Kudos to you.

I don't think it is necessary to go deeper than this without going crazy.

Given you have your brain wrapped around all the variants at this point,
I'm deferring to you to create a version for the kernel representing the best
compromise between performance and maintainability.

Please consider including the following at the start of your next patch series
version as this is headed for mainline already:

https://lkml.kernel.org/r/q246p466-1453-qon9-29so-37105116009q@onlyvoer.pbz

Nicolas
On Fri, 11 Jul 2025 17:40:57 -0400 (EDT)
Nicolas Pitre <nico@fluxnic.net> wrote:
...
> Kudos to you.
>
> I don't think it is necessary to go deeper than this without going crazy.

I'm crazy already :-)

David
> $ cc -O2 -o div_perf div_perf.c -DMULDIV_OPT=0xc3 && sudo ./div_perf > 0: ok 162 134 78 78 78 78 78 80 80 80 > mul_u64_u64_div_u64_new b*7/3 = 19 > 1: ok 91 91 91 91 91 91 91 91 91 91 > mul_u64_u64_div_u64_new ffff0000*ffff0000/f = 1110eeef00000000 > 2: ok 75 77 75 77 77 77 77 77 77 77 > mul_u64_u64_div_u64_new ffffffff*ffffffff/1 = fffffffe00000001 > 3: ok 89 91 91 91 91 91 89 90 91 91 > mul_u64_u64_div_u64_new ffffffff*ffffffff/2 = 7fffffff00000000 > 4: ok 147 147 128 128 128 128 128 128 128 128 > mul_u64_u64_div_u64_new 1ffffffff*ffffffff/2 = fffffffe80000000 > 5: ok 128 128 128 128 128 128 128 128 128 128 > mul_u64_u64_div_u64_new 1ffffffff*ffffffff/3 = aaaaaaa9aaaaaaab > 6: ok 121 121 121 121 121 121 121 121 121 121 > mul_u64_u64_div_u64_new 1ffffffff*1ffffffff/4 = ffffffff00000000 > 7: ok 274 234 146 138 138 138 138 138 138 138 > mul_u64_u64_div_u64_new > ffff000000000000*ffff000000000000/ffff000000000001 = fffeffffffffffff > 8: ok 177 148 148 149 149 149 149 149 149 149 > mul_u64_u64_div_u64_new > 3333333333333333*3333333333333333/5555555555555555 = > 1eb851eb851eb851 > 9: ok 138 90 118 91 91 91 91 92 92 92 > mul_u64_u64_div_u64_new 7fffffffffffffff*2/3 = 5555555555555554 > 10: ok 113 114 86 86 84 86 86 84 87 > 87 mul_u64_u64_div_u64_new ffffffffffffffff*2/8000000000000000 = 3 > 11: ok 87 88 88 86 88 88 88 88 90 > 90 mul_u64_u64_div_u64_new ffffffffffffffff*2/c000000000000000 = 2 > 12: ok 82 86 84 86 83 86 83 86 83 > 87 mul_u64_u64_div_u64_new > ffffffffffffffff*4000000000000004/8000000000000000 = 8000000000000007 > 13: ok 82 86 84 86 83 86 83 86 83 > 86 mul_u64_u64_div_u64_new > ffffffffffffffff*4000000000000001/8000000000000000 = 8000000000000001 > 14: ok 189 187 138 132 132 132 131 131 131 > 131 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/ffffffffffffffff = > 8000000000000001 > 15: ok 221 175 159 131 131 131 131 131 131 > 131 mul_u64_u64_div_u64_new fffffffffffffffe*8000000000000001/ffffffffffffffff > = 8000000000000000 > 16: ok 134 132 134 134 134 135 134 134 134 > 134 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffe > = 8000000000000001 > 17: ok 172 134 137 134 134 134 134 134 134 > 134 mul_u64_u64_div_u64_new ffffffffffffffff*8000000000000001/fffffffffffffffd > = 8000000000000002 > 18: ok 182 182 129 129 129 129 129 129 129 > 129 mul_u64_u64_div_u64_new 7fffffffffffffff*ffffffffffffffff/c000000000000000 > = aaaaaaaaaaaaaaa8 > 19: ok 130 129 130 129 129 129 129 129 129 > 129 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/a000000000000000 > = ccccccccccccccca > 20: ok 130 129 129 129 129 129 129 129 129 > 129 mul_u64_u64_div_u64_new ffffffffffffffff*7fffffffffffffff/9000000000000000 > = e38e38e38e38e38b > 21: ok 130 129 129 129 129 129 129 129 129 > 129 mul_u64_u64_div_u64_new 7fffffffffffffff*7fffffffffffffff/5000000000000000 > = ccccccccccccccc9 > 22: ok 206 140 138 138 138 138 138 138 138 > 138 mul_u64_u64_div_u64_new ffffffffffffffff*fffffffffffffffe/ffffffffffffffff = > fffffffffffffffe > 23: ok 174 140 138 138 138 138 138 138 138 > 138 mul_u64_u64_div_u64_new > e6102d256d7ea3ae*70a77d0be4c31201/d63ec35ab3220357 = > 78f8bf8cc86c6e18 > 24: ok 135 137 137 137 137 137 137 137 137 > 137 mul_u64_u64_div_u64_new > f53bae05cb86c6e1*3847b32d2f8d32e0/cfd4f55a647f403c = > 42687f79d8998d35 > 25: ok 134 136 136 136 136 136 136 136 136 > 136 mul_u64_u64_div_u64_new > 9951c5498f941092*1f8c8bfdf287a251/a3c8dc5f81ea3fe2 = 1d887cb25900091f > 26: ok 136 134 134 134 134 134 134 134 134 > 134 mul_u64_u64_div_u64_new > 
374fee9daa1bb2bb*d0bfbff7b8ae3ef/c169337bd42d5179 = 3bb2dbaffcbb961 > 27: ok 139 138 138 138 138 138 138 138 138 > 138 mul_u64_u64_div_u64_new > eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = > dc5f5cc9e270d216 > 28: ok 130 143 95 95 96 96 96 96 96 > 96 mul_u64_u64_div_u64_new > 2d256d7ea3ae*7d0be4c31201/d63ec35ab3220357 = 1a599d6e > 29: ok 169 158 158 138 138 138 138 138 138 > 138 mul_u64_u64_div_u64_new > eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = > dc5f5cc9e270d216 > 30: ok 178 164 144 147 147 147 147 147 147 > 147 mul_u64_u64_div_u64_new > 2d256d7ea3ae*7d0be4c31201/63ec35ab3220357 = 387f55cef > 31: ok 163 128 128 128 128 128 128 128 128 > 128 mul_u64_u64_div_u64_new > eac0d03ac10eeaf0*89be05dfa162ed9b/92bb000000000000 = > dc5f7e8b334db07d > 32: ok 163 184 137 136 136 138 138 138 138 > 138 mul_u64_u64_div_u64_new > eac0d03ac10eeaf0*89be05dfa162ed9b/92bb1679a41f0e4b = > dc5f5cc9e270d216 > Nice work! Is it necessary to add an exception test case, such as a case for division result overflowing 64bit? Thanks -Li
On Thu, 10 Jul 2025 09:39:51 +0000
"Li,Rongqing" <lirongqing@baidu.com> wrote:
...
> Nice work!
>
> Is it necessary to add an exception test case, such as a case for division result overflowing 64bit?

Not if the code traps :-)

David
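As an aside on the test-case question above: inputs whose quotient does not fit in 64 bits are easy to construct in userspace with __int128. The sketch below only classifies inputs; the expected behaviour (WARN and return ~0 for an overflowing quotient, 0 for a zero divisor) is taken from the checks quoted earlier in the thread, and the helper name would_overflow() is made up here.

	#include <stdint.h>
	#include <stdio.h>

	/* Does (a * b + c) / d fit in 64 bits?  (d == 0 counts as "no".) */
	static int would_overflow(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
	{
		unsigned __int128 n = (unsigned __int128)a * b + c;

		return d == 0 || n / d > UINT64_MAX;
	}

	int main(void)
	{
		/* (2^64 - 1) * 2 / 1 does not fit in 64 bits: expect ~0 and a WARN */
		printf("overflow: %d\n", would_overflow(~0ull, 2, 0, 1));
		/* the largest quotient that still fits: expect a normal result */
		printf("overflow: %d\n", would_overflow(~0ull, 1, 0, 1));
		return 0;
	}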
On Sat, 14 Jun 2025, David Laight wrote: > Replace the bit by bit algorithm with one that generates 16 bits > per iteration on 32bit architectures and 32 bits on 64bit ones. > > On my zen 5 this reduces the time for the tests (using the generic > code) from ~3350ns to ~1000ns. > > Running the 32bit algorithm on 64bit x86 takes ~1500ns. > It'll be slightly slower on a real 32bit system, mostly due > to register pressure. > > The savings for 32bit x86 are much higher (tested in userspace). > The worst case (lots of bits in the quotient) drops from ~900 clocks > to ~130 (pretty much independant of the arguments). > Other 32bit architectures may see better savings. > > It is possibly to optimise for divisors that span less than > __LONG_WIDTH__/2 bits. However I suspect they don't happen that often > and it doesn't remove any slow cpu divide instructions which dominate > the result. > > Signed-off-by: David Laight <david.laight.linux@gmail.com> Nice work. I had to be fully awake to review this one. Some suggestions below. > + reps = 64 / BITS_PER_ITER; > + /* Optimise loop count for small dividends */ > + if (!(u32)(n_hi >> 32)) { > + reps -= 32 / BITS_PER_ITER; > + n_hi = n_hi << 32 | n_lo >> 32; > + n_lo <<= 32; > + } > +#if BITS_PER_ITER == 16 > + if (!(u32)(n_hi >> 48)) { > + reps--; > + n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); > + n_lo <<= 16; > + } > +#endif I think it would be more beneficial to integrate this within the loop itself instead. It is often the case that the dividend is initially big, and becomes zero or too small for the division in the middle of the loop. Something like: unsigned long n; ... reps = 64 / BITS_PER_ITER; while (reps--) { quotient <<= BITS_PER_ITER; n = ~n_hi >> (64 - 2 * BITS_PER_ITER); if (n < d_msig) { n_hi = (n_hi << BITS_PER_ITER | (n_lo >> (64 - BITS_PER_ITER))); n_lo <<= BITS_PER_ITER; continue; } q_digit = n / d_msig; ... This way small dividends are optimized as well as dividends with holes in them. And this allows for the number of loops to become constant which opens some unrolling optimization possibilities. > + /* > + * Get the most significant BITS_PER_ITER bits of the divisor. > + * This is used to get a low 'guestimate' of the quotient digit. > + */ > + d_msig = (d >> (64 - BITS_PER_ITER)) + 1; Here the unconditional +1 is pessimizing d_msig too much. You should do this instead: d_msig = (d >> (64 - BITS_PER_ITER)) + !!(d << BITS_PER_ITER); In other words, you want to round up the value, not an unconditional +1 causing several unnecessary overflow fixup loops. Nicolas
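To make the digit-estimate scheme being discussed easier to follow, here is a reduced-size model: a 32-bit dividend divided by a 16-bit divisor, generating the quotient 8 bits per iteration. It mirrors the normalise / estimate-from-d_msig / fix-up structure of the patch, but it is not the kernel code; the names (toy_div, D_BITS) and the test constants are made up for illustration, and it keeps the remainder in a native 64-bit variable, which the real 128-bit code cannot do.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define D_BITS 8	/* quotient bits produced per iteration */

	/* 32-bit dividend / 16-bit divisor; caller guarantees (n >> 16) < d */
	static uint16_t toy_div(uint32_t n, uint16_t d)
	{
		/* Left align the divisor, shifting the dividend to match */
		int z = __builtin_clz(d) - 16;
		uint32_t dn = (uint32_t)d << z;
		uint64_t r = (uint64_t)n << z;

		/*
		 * Rounded-up top D_BITS bits of the divisor: a digit estimate
		 * derived from it is never too high, at worst slightly low.
		 */
		uint32_t d_msig = (dn >> D_BITS) + !!(dn & ((1u << D_BITS) - 1));

		uint16_t q = 0;
		for (int pos = 16 - D_BITS; pos >= 0; pos -= D_BITS) {
			uint64_t d_aligned = (uint64_t)dn << pos;
			/* estimate the digit from the top bits of what is left */
			uint32_t q_digit = (uint32_t)(r >> (D_BITS + pos)) / d_msig;

			r -= (uint64_t)q_digit * d_aligned;
			/* fix up the (at most slightly) low estimate */
			while (r >= d_aligned) {
				r -= d_aligned;
				q_digit++;
			}
			q = (q << D_BITS) + q_digit;
		}
		return q;	/* the remainder would be r >> z */
	}

	int main(void)
	{
		for (uint32_t d = 1; d < 0x10000; d += 137) {
			for (uint64_t n = 0; n >> 32 == 0; n += 99991) {
				if ((uint32_t)n >> 16 >= d)
					continue;	/* quotient would not fit in 16 bits */
				assert(toy_div((uint32_t)n, (uint16_t)d) == (uint32_t)n / d);
			}
		}
		printf("toy digit-estimate division matches native division\n");
		return 0;
	}

The same two ingredients carry over to the 64-bit case: the per-digit estimate never overshoots because d_msig is rounded up, and the fix-up loop only has to add the divisor back a small, bounded number of times.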
On Tue, 17 Jun 2025, Nicolas Pitre wrote: > On Sat, 14 Jun 2025, David Laight wrote: > > > Replace the bit by bit algorithm with one that generates 16 bits > > per iteration on 32bit architectures and 32 bits on 64bit ones. > > > > On my zen 5 this reduces the time for the tests (using the generic > > code) from ~3350ns to ~1000ns. > > > > Running the 32bit algorithm on 64bit x86 takes ~1500ns. > > It'll be slightly slower on a real 32bit system, mostly due > > to register pressure. > > > > The savings for 32bit x86 are much higher (tested in userspace). > > The worst case (lots of bits in the quotient) drops from ~900 clocks > > to ~130 (pretty much independant of the arguments). > > Other 32bit architectures may see better savings. > > > > It is possibly to optimise for divisors that span less than > > __LONG_WIDTH__/2 bits. However I suspect they don't happen that often > > and it doesn't remove any slow cpu divide instructions which dominate > > the result. > > > > Signed-off-by: David Laight <david.laight.linux@gmail.com> > > Nice work. I had to be fully awake to review this one. > Some suggestions below. Here's a patch with my suggestions applied to make it easier to figure them out. The added "inline" is necessary to fix compilation on ARM32. The "likely()" makes for better assembly and this part is pretty much likely anyway. I've explained the rest previously, although this is a better implementation. commit 99ea338401f03efe5dbebe57e62bd7c588409c5c Author: Nicolas Pitre <nico@fluxnic.net> Date: Tue Jun 17 14:42:34 2025 -0400 fixup! lib: mul_u64_u64_div_u64() Optimise the divide code diff --git a/lib/math/div64.c b/lib/math/div64.c index 3c9fe878ce68..740e59a58530 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL(iter_div_u64_rem); #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) -static u64 mul_add(u32 a, u32 b, u32 c) +static inline u64 mul_add(u32 a, u32 b, u32 c) { return add_u64_u32(mul_u32_u32(a, b), c); } @@ -246,7 +246,7 @@ static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) { - unsigned long d_msig, q_digit; + unsigned long n_long, d_msig, q_digit; unsigned int reps, d_z_hi; u64 quotient, n_lo, n_hi; u32 overflow; @@ -271,36 +271,21 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) /* Left align the divisor, shifting the dividend to match */ d_z_hi = __builtin_clzll(d); - if (d_z_hi) { + if (likely(d_z_hi)) { d <<= d_z_hi; n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); n_lo <<= d_z_hi; } - reps = 64 / BITS_PER_ITER; - /* Optimise loop count for small dividends */ - if (!(u32)(n_hi >> 32)) { - reps -= 32 / BITS_PER_ITER; - n_hi = n_hi << 32 | n_lo >> 32; - n_lo <<= 32; - } -#if BITS_PER_ITER == 16 - if (!(u32)(n_hi >> 48)) { - reps--; - n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); - n_lo <<= 16; - } -#endif - /* Invert the dividend so we can use add instead of subtract. */ n_lo = ~n_lo; n_hi = ~n_hi; /* - * Get the most significant BITS_PER_ITER bits of the divisor. + * Get the rounded-up most significant BITS_PER_ITER bits of the divisor. * This is used to get a low 'guestimate' of the quotient digit. */ - d_msig = (d >> (64 - BITS_PER_ITER)) + 1; + d_msig = (d >> (64 - BITS_PER_ITER)) + !!(d << BITS_PER_ITER); /* * Now do a 'long division' with BITS_PER_ITER bit 'digits'. @@ -308,12 +293,17 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) * The worst case is dividing ~0 by 0x8000 which requires two subtracts. 
*/ quotient = 0; - while (reps--) { - q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; + for (reps = 64 / BITS_PER_ITER; reps; reps--) { + quotient <<= BITS_PER_ITER; + n_long = ~n_hi >> (64 - 2 * BITS_PER_ITER); /* Shift 'n' left to align with the product q_digit * d */ overflow = n_hi >> (64 - BITS_PER_ITER); n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); n_lo <<= BITS_PER_ITER; + /* cut it short if q_digit would be 0 */ + if (n_long < d_msig) + continue; + q_digit = n_long / d_msig; /* Add product to negated divisor */ overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); /* Adjust for the q_digit 'guestimate' being low */ @@ -322,7 +312,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) n_hi += d; overflow += n_hi < d; } - quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit); + quotient = add_u64_long(quotient, q_digit); } /*
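For what it's worth, the effect of the rounded-up d_msig is easy to see with a couple of hand-picked, already left-aligned divisors (BITS_PER_ITER == 32 assumed in this sketch): when the divisor's low half is zero, the unconditional '+1' needlessly lowers every digit estimate by one, which is what triggers the extra fix-up passes.

	#include <stdint.h>
	#include <stdio.h>

	#define BITS_PER_ITER 32

	int main(void)
	{
		/* arbitrary example divisors, both with the top bit set */
		uint64_t divisors[] = { 0x8000000000000000ull, 0x8000000000000001ull };

		for (int i = 0; i < 2; i++) {
			uint64_t d = divisors[i];
			uint64_t plus_one   = (d >> (64 - BITS_PER_ITER)) + 1;
			uint64_t rounded_up = (d >> (64 - BITS_PER_ITER)) +
					      !!(d << BITS_PER_ITER);

			printf("d=%016llx  +1: %llx  rounded up: %llx\n",
			       (unsigned long long)d,
			       (unsigned long long)plus_one,
			       (unsigned long long)rounded_up);
		}
		return 0;
	}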
On Tue, 17 Jun 2025 21:33:23 -0400 (EDT) Nicolas Pitre <nico@fluxnic.net> wrote: > On Tue, 17 Jun 2025, Nicolas Pitre wrote: > > > On Sat, 14 Jun 2025, David Laight wrote: > > > > > Replace the bit by bit algorithm with one that generates 16 bits > > > per iteration on 32bit architectures and 32 bits on 64bit ones. > > > > > > On my zen 5 this reduces the time for the tests (using the generic > > > code) from ~3350ns to ~1000ns. > > > > > > Running the 32bit algorithm on 64bit x86 takes ~1500ns. > > > It'll be slightly slower on a real 32bit system, mostly due > > > to register pressure. > > > > > > The savings for 32bit x86 are much higher (tested in userspace). > > > The worst case (lots of bits in the quotient) drops from ~900 clocks > > > to ~130 (pretty much independant of the arguments). > > > Other 32bit architectures may see better savings. > > > > > > It is possibly to optimise for divisors that span less than > > > __LONG_WIDTH__/2 bits. However I suspect they don't happen that often > > > and it doesn't remove any slow cpu divide instructions which dominate > > > the result. > > > > > > Signed-off-by: David Laight <david.laight.linux@gmail.com> > > > > Nice work. I had to be fully awake to review this one. > > Some suggestions below. > > Here's a patch with my suggestions applied to make it easier to figure > them out. The added "inline" is necessary to fix compilation on ARM32. > The "likely()" makes for better assembly and this part is pretty much > likely anyway. I've explained the rest previously, although this is a > better implementation. I've been trying to find time to do some more performance measurements. I did a few last night and managed to realise at least some of what is happening. I've dropped the responses in here since there is a copy of the code. The 'TLDR' is that the full divide takes my zen5 about 80 clocks (including some test red tape) provided the branch-predictor is predicting correctly. I get that consistently for repeats of the same division, and moving onto the next one - provided they take the same path. (eg for the last few 'random' value tests.) However if I include something that takes a different path (eg divisor doesn't have the top bit set, or the product has enough high zero bits to take the 'optimised' path then the first two or three repeats of the next division are at least 20 clocks slower. In the kernel these calls aren't going to be common enough (and with similar inputs) to train the branch predictor. So the mispredicted branches really start to dominate. This is similar to the issue that Linus keeps mentioning about kernel code being mainly 'cold cache'. > > commit 99ea338401f03efe5dbebe57e62bd7c588409c5c > Author: Nicolas Pitre <nico@fluxnic.net> > Date: Tue Jun 17 14:42:34 2025 -0400 > > fixup! lib: mul_u64_u64_div_u64() Optimise the divide code > > diff --git a/lib/math/div64.c b/lib/math/div64.c > index 3c9fe878ce68..740e59a58530 100644 > --- a/lib/math/div64.c > +++ b/lib/math/div64.c > @@ -188,7 +188,7 @@ EXPORT_SYMBOL(iter_div_u64_rem); > > #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) > > -static u64 mul_add(u32 a, u32 b, u32 c) > +static inline u64 mul_add(u32 a, u32 b, u32 c) If inline is needed, it need to be always_inline. 
> { > return add_u64_u32(mul_u32_u32(a, b), c); > } > @@ -246,7 +246,7 @@ static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) > > u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > { > - unsigned long d_msig, q_digit; > + unsigned long n_long, d_msig, q_digit; > unsigned int reps, d_z_hi; > u64 quotient, n_lo, n_hi; > u32 overflow; > @@ -271,36 +271,21 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > /* Left align the divisor, shifting the dividend to match */ > d_z_hi = __builtin_clzll(d); > - if (d_z_hi) { > + if (likely(d_z_hi)) { I don't think likely() helps much. But for 64bit this can be unconditional... > d <<= d_z_hi; > n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); ... replace 'n_lo >> (64 - d_z_hi)' with 'n_lo >> (63 - d_z_hi) >> 1'. The extra shift probably costs too much on 32bit though. > n_lo <<= d_z_hi; > } > > - reps = 64 / BITS_PER_ITER; > - /* Optimise loop count for small dividends */ > - if (!(u32)(n_hi >> 32)) { > - reps -= 32 / BITS_PER_ITER; > - n_hi = n_hi << 32 | n_lo >> 32; > - n_lo <<= 32; > - } > -#if BITS_PER_ITER == 16 > - if (!(u32)(n_hi >> 48)) { > - reps--; > - n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); > - n_lo <<= 16; > - } > -#endif You don't want the branch in the loop. Once predicted correctly is makes little difference where you put the test - but that won't be 'normal' Unless it gets trained odd-even it'll get mispredicted at least once. For 64bit (zen5) the test outside the loop was less bad (IIRC dropped to 65 clocks - so probably worth it). I need to check an older cpu (got a few Intel ones). > - > /* Invert the dividend so we can use add instead of subtract. */ > n_lo = ~n_lo; > n_hi = ~n_hi; > > /* > - * Get the most significant BITS_PER_ITER bits of the divisor. > + * Get the rounded-up most significant BITS_PER_ITER bits of the divisor. > * This is used to get a low 'guestimate' of the quotient digit. > */ > - d_msig = (d >> (64 - BITS_PER_ITER)) + 1; > + d_msig = (d >> (64 - BITS_PER_ITER)) + !!(d << BITS_PER_ITER); For 64bit x64 that is pretty much zero cost (gcc manages to use the 'Z' flag from the '<< 32' in a 'setne' to get a 1 I didn't look hard enough to find where the zero register came from since 'setne' of changes the low 8 bits). But it is horrid for 32bit - added quite a few clocks. I'm not at all sure it is worth it though. > > /* > * Now do a 'long division' with BITS_PER_ITER bit 'digits'. > @@ -308,12 +293,17 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > * The worst case is dividing ~0 by 0x8000 which requires two subtracts. > */ > quotient = 0; > - while (reps--) { > - q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; > + for (reps = 64 / BITS_PER_ITER; reps; reps--) { > + quotient <<= BITS_PER_ITER; > + n_long = ~n_hi >> (64 - 2 * BITS_PER_ITER); > /* Shift 'n' left to align with the product q_digit * d */ > overflow = n_hi >> (64 - BITS_PER_ITER); > n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); > n_lo <<= BITS_PER_ITER; > + /* cut it short if q_digit would be 0 */ > + if (n_long < d_msig) > + continue; As I said above that gets misprediced too often > + q_digit = n_long / d_msig; I fiddled with the order of the lines of code - changes the way the object code is laid out and change the clock count 'randomly' by one or two. Noise compared to mispredicted branches. I've not tested on an older system with a slower divide. I suspect older (Haswell and Broadwell) may benefit from the divide being scheduled earlier. 
If anyone wants a copy of the test program I can send you a copy.

David

> /* Add product to negated divisor */
> overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi);
> /* Adjust for the q_digit 'guestimate' being low */
> @@ -322,7 +312,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
> n_hi += d;
> overflow += n_hi < d;
> }
> - quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit);
> + quotient = add_u64_long(quotient, q_digit);
> }
>
> /*
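A side note on the '(63 - d_z_hi) >> 1' form mentioned a few lines up: splitting the shift in two gives 0 when d_z_hi is 0 without ever shifting a 64-bit value by 64 (which is undefined), so the normalisation no longer needs the surrounding if. A quick userspace check of the equivalence (function names are mine):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t conditional(uint64_t lo, unsigned int z)
	{
		return z ? lo >> (64 - z) : 0;
	}

	static uint64_t two_shifts(uint64_t lo, unsigned int z)
	{
		return lo >> (63 - z) >> 1;
	}

	int main(void)
	{
		uint64_t vals[] = { 0, 1, 0x0123456789abcdefull, ~0ull };

		for (unsigned int i = 0; i < 4; i++)
			for (unsigned int z = 0; z < 64; z++)
				assert(conditional(vals[i], z) == two_shifts(vals[i], z));
		printf("the two forms agree for every shift count\n");
		return 0;
	}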
On Thu, 26 Jun 2025, David Laight wrote: > On Tue, 17 Jun 2025 21:33:23 -0400 (EDT) > Nicolas Pitre <nico@fluxnic.net> wrote: > > > On Tue, 17 Jun 2025, Nicolas Pitre wrote: > > > > > On Sat, 14 Jun 2025, David Laight wrote: > > > > > > > Replace the bit by bit algorithm with one that generates 16 bits > > > > per iteration on 32bit architectures and 32 bits on 64bit ones. > > > > > > > > On my zen 5 this reduces the time for the tests (using the generic > > > > code) from ~3350ns to ~1000ns. > > > > > > > > Running the 32bit algorithm on 64bit x86 takes ~1500ns. > > > > It'll be slightly slower on a real 32bit system, mostly due > > > > to register pressure. > > > > > > > > The savings for 32bit x86 are much higher (tested in userspace). > > > > The worst case (lots of bits in the quotient) drops from ~900 clocks > > > > to ~130 (pretty much independant of the arguments). > > > > Other 32bit architectures may see better savings. > > > > > > > > It is possibly to optimise for divisors that span less than > > > > __LONG_WIDTH__/2 bits. However I suspect they don't happen that often > > > > and it doesn't remove any slow cpu divide instructions which dominate > > > > the result. > > > > > > > > Signed-off-by: David Laight <david.laight.linux@gmail.com> > > > > > > Nice work. I had to be fully awake to review this one. > > > Some suggestions below. > > > > Here's a patch with my suggestions applied to make it easier to figure > > them out. The added "inline" is necessary to fix compilation on ARM32. > > The "likely()" makes for better assembly and this part is pretty much > > likely anyway. I've explained the rest previously, although this is a > > better implementation. > > I've been trying to find time to do some more performance measurements. > I did a few last night and managed to realise at least some of what > is happening. > > I've dropped the responses in here since there is a copy of the code. > > The 'TLDR' is that the full divide takes my zen5 about 80 clocks > (including some test red tape) provided the branch-predictor is > predicting correctly. > I get that consistently for repeats of the same division, and moving > onto the next one - provided they take the same path. > (eg for the last few 'random' value tests.) > However if I include something that takes a different path (eg divisor > doesn't have the top bit set, or the product has enough high zero bits > to take the 'optimised' path then the first two or three repeats of the > next division are at least 20 clocks slower. > In the kernel these calls aren't going to be common enough (and with > similar inputs) to train the branch predictor. > So the mispredicted branches really start to dominate. > This is similar to the issue that Linus keeps mentioning about kernel > code being mainly 'cold cache'. > > > > > commit 99ea338401f03efe5dbebe57e62bd7c588409c5c > > Author: Nicolas Pitre <nico@fluxnic.net> > > Date: Tue Jun 17 14:42:34 2025 -0400 > > > > fixup! lib: mul_u64_u64_div_u64() Optimise the divide code > > > > diff --git a/lib/math/div64.c b/lib/math/div64.c > > index 3c9fe878ce68..740e59a58530 100644 > > --- a/lib/math/div64.c > > +++ b/lib/math/div64.c > > @@ -188,7 +188,7 @@ EXPORT_SYMBOL(iter_div_u64_rem); > > > > #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) > > > > -static u64 mul_add(u32 a, u32 b, u32 c) > > +static inline u64 mul_add(u32 a, u32 b, u32 c) > > If inline is needed, it need to be always_inline. 
Here "inline" was needed only because I got a "defined but unused" complaint from the compiler in some cases. The "always_inline" is not absolutely necessary. > > { > > return add_u64_u32(mul_u32_u32(a, b), c); > > } > > @@ -246,7 +246,7 @@ static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) > > > > u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > { > > - unsigned long d_msig, q_digit; > > + unsigned long n_long, d_msig, q_digit; > > unsigned int reps, d_z_hi; > > u64 quotient, n_lo, n_hi; > > u32 overflow; > > @@ -271,36 +271,21 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > > > /* Left align the divisor, shifting the dividend to match */ > > d_z_hi = __builtin_clzll(d); > > - if (d_z_hi) { > > + if (likely(d_z_hi)) { > > I don't think likely() helps much. > But for 64bit this can be unconditional... It helps if you look at how gcc orders the code. Without the likely, the normalization code is moved out of line and a conditional forward branch (predicted untaken by default) is used to reach it even though this code is quite likely. With likely() gcc inserts the normalization code in line and a conditional forward branch (predicted untaken by default) is used to skip over that code. Clearly we want the later. > > d <<= d_z_hi; > > n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); > > ... replace 'n_lo >> (64 - d_z_hi)' with 'n_lo >> (63 - d_z_hi) >> 1'. Why? I don't see the point. > The extra shift probably costs too much on 32bit though. > > > > n_lo <<= d_z_hi; > > } > > > > - reps = 64 / BITS_PER_ITER; > > - /* Optimise loop count for small dividends */ > > - if (!(u32)(n_hi >> 32)) { > > - reps -= 32 / BITS_PER_ITER; > > - n_hi = n_hi << 32 | n_lo >> 32; > > - n_lo <<= 32; > > - } > > -#if BITS_PER_ITER == 16 > > - if (!(u32)(n_hi >> 48)) { > > - reps--; > > - n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); > > - n_lo <<= 16; > > - } > > -#endif > > You don't want the branch in the loop. > Once predicted correctly is makes little difference where you put > the test - but that won't be 'normal' > Unless it gets trained odd-even it'll get mispredicted at least once. > For 64bit (zen5) the test outside the loop was less bad > (IIRC dropped to 65 clocks - so probably worth it). > I need to check an older cpu (got a few Intel ones). > > > > - > > /* Invert the dividend so we can use add instead of subtract. */ > > n_lo = ~n_lo; > > n_hi = ~n_hi; > > > > /* > > - * Get the most significant BITS_PER_ITER bits of the divisor. > > + * Get the rounded-up most significant BITS_PER_ITER bits of the divisor. > > * This is used to get a low 'guestimate' of the quotient digit. > > */ > > - d_msig = (d >> (64 - BITS_PER_ITER)) + 1; > > + d_msig = (d >> (64 - BITS_PER_ITER)) + !!(d << BITS_PER_ITER); > > For 64bit x64 that is pretty much zero cost > (gcc manages to use the 'Z' flag from the '<< 32' in a 'setne' to get a 1 > I didn't look hard enough to find where the zero register came from > since 'setne' of changes the low 8 bits). > But it is horrid for 32bit - added quite a few clocks. > I'm not at all sure it is worth it though. If you do an hardcoded +1 when it is not necessary, in most of those cases you end up with one or two extra overflow adjustment loops. Those loops are unnecessary when d_msig is not increased by 1. > > > > /* > > * Now do a 'long division' with BITS_PER_ITER bit 'digits'. > > @@ -308,12 +293,17 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > * The worst case is dividing ~0 by 0x8000 which requires two subtracts. 
> > */ > > quotient = 0; > > - while (reps--) { > > - q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; > > + for (reps = 64 / BITS_PER_ITER; reps; reps--) { > > + quotient <<= BITS_PER_ITER; > > + n_long = ~n_hi >> (64 - 2 * BITS_PER_ITER); > > /* Shift 'n' left to align with the product q_digit * d */ > > overflow = n_hi >> (64 - BITS_PER_ITER); > > n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); > > n_lo <<= BITS_PER_ITER; > > + /* cut it short if q_digit would be 0 */ > > + if (n_long < d_msig) > > + continue; > > As I said above that gets misprediced too often > > > + q_digit = n_long / d_msig; > > I fiddled with the order of the lines of code - changes the way the object > code is laid out and change the clock count 'randomly' by one or two. > Noise compared to mispredicted branches. > > I've not tested on an older system with a slower divide. > I suspect older (Haswell and Broadwell) may benefit from the divide > being scheduled earlier. > > If anyone wants of copy of the test program I can send you a copy. > > David > > > /* Add product to negated divisor */ > > overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); > > /* Adjust for the q_digit 'guestimate' being low */ > > @@ -322,7 +312,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > n_hi += d; > > overflow += n_hi < d; > > } > > - quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit); > > + quotient = add_u64_long(quotient, q_digit); > > } > > > > /* > >
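For anyone replaying this in a userspace harness of the div_perf kind, the layout effect described above comes from __builtin_expect; a shim along the usual kernel lines is enough to experiment with it outside the kernel. Treat the example function below as illustrative only (the name is made up):

	#include <stdio.h>

	/* the usual kernel-style annotations, for userspace experiments */
	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

	static int divisor_is_normalised(unsigned long long d)
	{
		/* gcc keeps the likely() path in the fall-through position */
		if (likely(d >> 63))
			return 1;
		return 0;
	}

	int main(void)
	{
		printf("%d %d\n", divisor_is_normalised(1ULL << 63),
		       divisor_is_normalised(1));
		return 0;
	}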
On Tue, 17 Jun 2025 21:33:23 -0400 (EDT) Nicolas Pitre <nico@fluxnic.net> wrote: > On Tue, 17 Jun 2025, Nicolas Pitre wrote: > > > On Sat, 14 Jun 2025, David Laight wrote: > > > > > Replace the bit by bit algorithm with one that generates 16 bits > > > per iteration on 32bit architectures and 32 bits on 64bit ones. > > > > > > On my zen 5 this reduces the time for the tests (using the generic > > > code) from ~3350ns to ~1000ns. > > > > > > Running the 32bit algorithm on 64bit x86 takes ~1500ns. > > > It'll be slightly slower on a real 32bit system, mostly due > > > to register pressure. > > > > > > The savings for 32bit x86 are much higher (tested in userspace). > > > The worst case (lots of bits in the quotient) drops from ~900 clocks > > > to ~130 (pretty much independant of the arguments). > > > Other 32bit architectures may see better savings. > > > > > > It is possibly to optimise for divisors that span less than > > > __LONG_WIDTH__/2 bits. However I suspect they don't happen that often > > > and it doesn't remove any slow cpu divide instructions which dominate > > > the result. > > > > > > Signed-off-by: David Laight <david.laight.linux@gmail.com> > > > > Nice work. I had to be fully awake to review this one. > > Some suggestions below. > > Here's a patch with my suggestions applied to make it easier to figure > them out. The added "inline" is necessary to fix compilation on ARM32. > The "likely()" makes for better assembly and this part is pretty much > likely anyway. I've explained the rest previously, although this is a > better implementation. > > commit 99ea338401f03efe5dbebe57e62bd7c588409c5c > Author: Nicolas Pitre <nico@fluxnic.net> > Date: Tue Jun 17 14:42:34 2025 -0400 > > fixup! lib: mul_u64_u64_div_u64() Optimise the divide code > > diff --git a/lib/math/div64.c b/lib/math/div64.c > index 3c9fe878ce68..740e59a58530 100644 > --- a/lib/math/div64.c > +++ b/lib/math/div64.c > @@ -188,7 +188,7 @@ EXPORT_SYMBOL(iter_div_u64_rem); > > #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) > > -static u64 mul_add(u32 a, u32 b, u32 c) > +static inline u64 mul_add(u32 a, u32 b, u32 c) > { > return add_u64_u32(mul_u32_u32(a, b), c); > } > @@ -246,7 +246,7 @@ static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) > > u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > { > - unsigned long d_msig, q_digit; > + unsigned long n_long, d_msig, q_digit; > unsigned int reps, d_z_hi; > u64 quotient, n_lo, n_hi; > u32 overflow; > @@ -271,36 +271,21 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > /* Left align the divisor, shifting the dividend to match */ > d_z_hi = __builtin_clzll(d); > - if (d_z_hi) { > + if (likely(d_z_hi)) { > d <<= d_z_hi; > n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); > n_lo <<= d_z_hi; > } > > - reps = 64 / BITS_PER_ITER; > - /* Optimise loop count for small dividends */ > - if (!(u32)(n_hi >> 32)) { > - reps -= 32 / BITS_PER_ITER; > - n_hi = n_hi << 32 | n_lo >> 32; > - n_lo <<= 32; > - } > -#if BITS_PER_ITER == 16 > - if (!(u32)(n_hi >> 48)) { > - reps--; > - n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); > - n_lo <<= 16; > - } > -#endif > - > /* Invert the dividend so we can use add instead of subtract. */ > n_lo = ~n_lo; > n_hi = ~n_hi; > > /* > - * Get the most significant BITS_PER_ITER bits of the divisor. > + * Get the rounded-up most significant BITS_PER_ITER bits of the divisor. > * This is used to get a low 'guestimate' of the quotient digit. 
> */ > - d_msig = (d >> (64 - BITS_PER_ITER)) + 1; > + d_msig = (d >> (64 - BITS_PER_ITER)) + !!(d << BITS_PER_ITER); If the low divisor bits are zero an alternative simpler divide can be used (you want to detect it before the left align). I deleted that because it complicates things and probably doesn't happen often enough outside the tests cases. > > /* > * Now do a 'long division' with BITS_PER_ITER bit 'digits'. > @@ -308,12 +293,17 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > * The worst case is dividing ~0 by 0x8000 which requires two subtracts. > */ > quotient = 0; > - while (reps--) { > - q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; > + for (reps = 64 / BITS_PER_ITER; reps; reps--) { > + quotient <<= BITS_PER_ITER; > + n_long = ~n_hi >> (64 - 2 * BITS_PER_ITER); > /* Shift 'n' left to align with the product q_digit * d */ > overflow = n_hi >> (64 - BITS_PER_ITER); > n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); > n_lo <<= BITS_PER_ITER; > + /* cut it short if q_digit would be 0 */ > + if (n_long < d_msig) > + continue; I don't think that is right. d_msig is an overestimate so you can only skip the divide and mul_add(). Could be something like: if (n_long < d_msig) { if (!n_long) continue; q_digit = 0; } else { q_digit = n_long / d_msig; /* Add product to negated divisor */ overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); } but that starts looking like branch prediction hell. > + q_digit = n_long / d_msig; I think you want to do the divide right at the top - maybe even if the result isn't used! All the shifts then happen while the divide instruction is in progress (even without out-of-order execution). It is also quite likely that any cpu divide instruction takes a lot less clocks when the dividend or quotient is small. So if the quotient would be zero there isn't a stall waiting for the divide to complete. As I said before it is a trade off between saving a few clocks for specific cases against adding clocks to all the others. Leading zero bits on the dividend are very likely, quotients with the low 16bits zero much less so (except for test cases). > /* Add product to negated divisor */ > overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); > /* Adjust for the q_digit 'guestimate' being low */ > @@ -322,7 +312,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > n_hi += d; > overflow += n_hi < d; > } > - quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit); > + quotient = add_u64_long(quotient, q_digit); Moving the shift to the top (after the divide) may help instruction scheduling (esp. without aggressive out-of-order execution). David > } > > /*
On Wed, 18 Jun 2025, David Laight wrote: > On Tue, 17 Jun 2025 21:33:23 -0400 (EDT) > Nicolas Pitre <nico@fluxnic.net> wrote: > > > On Tue, 17 Jun 2025, Nicolas Pitre wrote: > > > > > On Sat, 14 Jun 2025, David Laight wrote: > > > > > > > Replace the bit by bit algorithm with one that generates 16 bits > > > > per iteration on 32bit architectures and 32 bits on 64bit ones. > > > > > > > > On my zen 5 this reduces the time for the tests (using the generic > > > > code) from ~3350ns to ~1000ns. > > > > > > > > Running the 32bit algorithm on 64bit x86 takes ~1500ns. > > > > It'll be slightly slower on a real 32bit system, mostly due > > > > to register pressure. > > > > > > > > The savings for 32bit x86 are much higher (tested in userspace). > > > > The worst case (lots of bits in the quotient) drops from ~900 clocks > > > > to ~130 (pretty much independant of the arguments). > > > > Other 32bit architectures may see better savings. > > > > > > > > It is possibly to optimise for divisors that span less than > > > > __LONG_WIDTH__/2 bits. However I suspect they don't happen that often > > > > and it doesn't remove any slow cpu divide instructions which dominate > > > > the result. > > > > > > > > Signed-off-by: David Laight <david.laight.linux@gmail.com> > > > > > > Nice work. I had to be fully awake to review this one. > > > Some suggestions below. > > > > Here's a patch with my suggestions applied to make it easier to figure > > them out. The added "inline" is necessary to fix compilation on ARM32. > > The "likely()" makes for better assembly and this part is pretty much > > likely anyway. I've explained the rest previously, although this is a > > better implementation. > > > > commit 99ea338401f03efe5dbebe57e62bd7c588409c5c > > Author: Nicolas Pitre <nico@fluxnic.net> > > Date: Tue Jun 17 14:42:34 2025 -0400 > > > > fixup! lib: mul_u64_u64_div_u64() Optimise the divide code > > > > diff --git a/lib/math/div64.c b/lib/math/div64.c > > index 3c9fe878ce68..740e59a58530 100644 > > --- a/lib/math/div64.c > > +++ b/lib/math/div64.c > > @@ -188,7 +188,7 @@ EXPORT_SYMBOL(iter_div_u64_rem); > > > > #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) > > > > -static u64 mul_add(u32 a, u32 b, u32 c) > > +static inline u64 mul_add(u32 a, u32 b, u32 c) > > { > > return add_u64_u32(mul_u32_u32(a, b), c); > > } > > @@ -246,7 +246,7 @@ static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) > > > > u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > { > > - unsigned long d_msig, q_digit; > > + unsigned long n_long, d_msig, q_digit; > > unsigned int reps, d_z_hi; > > u64 quotient, n_lo, n_hi; > > u32 overflow; > > @@ -271,36 +271,21 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > > > /* Left align the divisor, shifting the dividend to match */ > > d_z_hi = __builtin_clzll(d); > > - if (d_z_hi) { > > + if (likely(d_z_hi)) { > > d <<= d_z_hi; > > n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); > > n_lo <<= d_z_hi; > > } > > > > - reps = 64 / BITS_PER_ITER; > > - /* Optimise loop count for small dividends */ > > - if (!(u32)(n_hi >> 32)) { > > - reps -= 32 / BITS_PER_ITER; > > - n_hi = n_hi << 32 | n_lo >> 32; > > - n_lo <<= 32; > > - } > > -#if BITS_PER_ITER == 16 > > - if (!(u32)(n_hi >> 48)) { > > - reps--; > > - n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); > > - n_lo <<= 16; > > - } > > -#endif > > - > > /* Invert the dividend so we can use add instead of subtract. 
*/ > > n_lo = ~n_lo; > > n_hi = ~n_hi; > > > > /* > > - * Get the most significant BITS_PER_ITER bits of the divisor. > > + * Get the rounded-up most significant BITS_PER_ITER bits of the divisor. > > * This is used to get a low 'guestimate' of the quotient digit. > > */ > > - d_msig = (d >> (64 - BITS_PER_ITER)) + 1; > > + d_msig = (d >> (64 - BITS_PER_ITER)) + !!(d << BITS_PER_ITER); > > If the low divisor bits are zero an alternative simpler divide > can be used (you want to detect it before the left align). > I deleted that because it complicates things and probably doesn't > happen often enough outside the tests cases. Depends. On 32-bit systems that might be worth it. Something like: if (unlikely(sizeof(long) == 4 && !(u32)d)) return div_u64(n_hi, d >> 32); > > /* > > * Now do a 'long division' with BITS_PER_ITER bit 'digits'. > > @@ -308,12 +293,17 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > * The worst case is dividing ~0 by 0x8000 which requires two subtracts. > > */ > > quotient = 0; > > - while (reps--) { > > - q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; > > + for (reps = 64 / BITS_PER_ITER; reps; reps--) { > > + quotient <<= BITS_PER_ITER; > > + n_long = ~n_hi >> (64 - 2 * BITS_PER_ITER); > > /* Shift 'n' left to align with the product q_digit * d */ > > overflow = n_hi >> (64 - BITS_PER_ITER); > > n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); > > n_lo <<= BITS_PER_ITER; > > + /* cut it short if q_digit would be 0 */ > > + if (n_long < d_msig) > > + continue; > > I don't think that is right. > d_msig is an overestimate so you can only skip the divide and mul_add(). That's what I thought initially. But `overflow` was always 0xffff in that case. I'd like to prove it mathematically, but empirically this seems to be true with all edge cases I could think of. I also have a test rig using random numbers validating against the native x86 128/64 div and it has been running for an hour. > Could be something like: > if (n_long < d_msig) { > if (!n_long) > continue; > q_digit = 0; > } else { > q_digit = n_long / d_msig; > /* Add product to negated divisor */ > overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); > } > but that starts looking like branch prediction hell. ... and so far unnecessary (see above). > > > + q_digit = n_long / d_msig; > > I think you want to do the divide right at the top - maybe even if the > result isn't used! > All the shifts then happen while the divide instruction is in progress > (even without out-of-order execution). Only if there is an actual divide instruction available. If it is a library call then you're screwed. And the compiler ought to be able to do that kind of shuffling on its own when there's a benefit. > It is also quite likely that any cpu divide instruction takes a lot > less clocks when the dividend or quotient is small. > So if the quotient would be zero there isn't a stall waiting for the > divide to complete. > > As I said before it is a trade off between saving a few clocks for > specific cases against adding clocks to all the others. > Leading zero bits on the dividend are very likely, quotients with > the low 16bits zero much less so (except for test cases). Given what I said above wrt overflow I think this is a good tradeoff. We just need to measure it. Nicolas
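For reference, the random-value rig mentioned above can be as simple as the sketch below: build the generic routine in user space and compare it against a native 128/64 division via unsigned __int128 (the names and details here are placeholders, not the actual test program):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	typedef uint64_t u64;

	u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);	/* generic code under test */

	static u64 rand64(void)
	{
		/* random() only returns 31 bits per call */
		return (u64)random() << 62 ^ (u64)random() << 31 ^ (u64)random();
	}

	int main(void)
	{
		for (unsigned long i = 1; ; i++) {
			u64 a = rand64(), b = rand64(), c = rand64(), d = rand64();
			unsigned __int128 n = (unsigned __int128)a * b + c;

			/* skip the divide-by-zero and quotient-overflow paths */
			if (!d || n / d > ~(u64)0)
				continue;

			u64 want = n / d;			/* native 128/64 divide */
			u64 got = mul_u64_add_u64_div_u64(a, b, c, d);

			if (got != want) {
				printf("mismatch: %#llx * %#llx + %#llx / %#llx\n",
				       (unsigned long long)a, (unsigned long long)b,
				       (unsigned long long)c, (unsigned long long)d);
				return 1;
			}
			if (!(i % 100000000))
				printf("%lu values checked\n", i);
		}
	}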
On Wed, 18 Jun 2025 11:39:20 -0400 (EDT) Nicolas Pitre <nico@fluxnic.net> wrote: > On Wed, 18 Jun 2025, David Laight wrote: > > > On Tue, 17 Jun 2025 21:33:23 -0400 (EDT) > > Nicolas Pitre <nico@fluxnic.net> wrote: > > > > > On Tue, 17 Jun 2025, Nicolas Pitre wrote: > > > > > > > On Sat, 14 Jun 2025, David Laight wrote: > > > > > > > > > Replace the bit by bit algorithm with one that generates 16 bits > > > > > per iteration on 32bit architectures and 32 bits on 64bit ones. > > > > > > > > > > On my zen 5 this reduces the time for the tests (using the generic > > > > > code) from ~3350ns to ~1000ns. > > > > > > > > > > Running the 32bit algorithm on 64bit x86 takes ~1500ns. > > > > > It'll be slightly slower on a real 32bit system, mostly due > > > > > to register pressure. > > > > > > > > > > The savings for 32bit x86 are much higher (tested in userspace). > > > > > The worst case (lots of bits in the quotient) drops from ~900 clocks > > > > > to ~130 (pretty much independant of the arguments). > > > > > Other 32bit architectures may see better savings. > > > > > > > > > > It is possibly to optimise for divisors that span less than > > > > > __LONG_WIDTH__/2 bits. However I suspect they don't happen that often > > > > > and it doesn't remove any slow cpu divide instructions which dominate > > > > > the result. > > > > > > > > > > Signed-off-by: David Laight <david.laight.linux@gmail.com> > > > > > > > > Nice work. I had to be fully awake to review this one. > > > > Some suggestions below. > > > > > > Here's a patch with my suggestions applied to make it easier to figure > > > them out. The added "inline" is necessary to fix compilation on ARM32. > > > The "likely()" makes for better assembly and this part is pretty much > > > likely anyway. I've explained the rest previously, although this is a > > > better implementation. > > > > > > commit 99ea338401f03efe5dbebe57e62bd7c588409c5c > > > Author: Nicolas Pitre <nico@fluxnic.net> > > > Date: Tue Jun 17 14:42:34 2025 -0400 > > > > > > fixup! 
lib: mul_u64_u64_div_u64() Optimise the divide code > > > > > > diff --git a/lib/math/div64.c b/lib/math/div64.c > > > index 3c9fe878ce68..740e59a58530 100644 > > > --- a/lib/math/div64.c > > > +++ b/lib/math/div64.c > > > @@ -188,7 +188,7 @@ EXPORT_SYMBOL(iter_div_u64_rem); > > > > > > #if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) > > > > > > -static u64 mul_add(u32 a, u32 b, u32 c) > > > +static inline u64 mul_add(u32 a, u32 b, u32 c) > > > { > > > return add_u64_u32(mul_u32_u32(a, b), c); > > > } > > > @@ -246,7 +246,7 @@ static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) > > > > > > u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > > { > > > - unsigned long d_msig, q_digit; > > > + unsigned long n_long, d_msig, q_digit; > > > unsigned int reps, d_z_hi; > > > u64 quotient, n_lo, n_hi; > > > u32 overflow; > > > @@ -271,36 +271,21 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > > > > > /* Left align the divisor, shifting the dividend to match */ > > > d_z_hi = __builtin_clzll(d); > > > - if (d_z_hi) { > > > + if (likely(d_z_hi)) { > > > d <<= d_z_hi; > > > n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); > > > n_lo <<= d_z_hi; > > > } > > > > > > - reps = 64 / BITS_PER_ITER; > > > - /* Optimise loop count for small dividends */ > > > - if (!(u32)(n_hi >> 32)) { > > > - reps -= 32 / BITS_PER_ITER; > > > - n_hi = n_hi << 32 | n_lo >> 32; > > > - n_lo <<= 32; > > > - } > > > -#if BITS_PER_ITER == 16 > > > - if (!(u32)(n_hi >> 48)) { > > > - reps--; > > > - n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); > > > - n_lo <<= 16; > > > - } > > > -#endif > > > - > > > /* Invert the dividend so we can use add instead of subtract. */ > > > n_lo = ~n_lo; > > > n_hi = ~n_hi; > > > > > > /* > > > - * Get the most significant BITS_PER_ITER bits of the divisor. > > > + * Get the rounded-up most significant BITS_PER_ITER bits of the divisor. > > > * This is used to get a low 'guestimate' of the quotient digit. > > > */ > > > - d_msig = (d >> (64 - BITS_PER_ITER)) + 1; > > > + d_msig = (d >> (64 - BITS_PER_ITER)) + !!(d << BITS_PER_ITER); > > > > If the low divisor bits are zero an alternative simpler divide > > can be used (you want to detect it before the left align). > > I deleted that because it complicates things and probably doesn't > > happen often enough outside the tests cases. > > Depends. On 32-bit systems that might be worth it. Something like: > > if (unlikely(sizeof(long) == 4 && !(u32)d)) > return div_u64(n_hi, d >> 32); You need a bigger divide than that (96 bits by 32). It is also possible this code is better than div_u64() ! My initial version optimised for divisors with max 16 bits using: d_z_hi = __builtin_clzll(d); d_z_lo = __builtin_ffsll(d) - 1; if (d_z_hi + d_z_lo >= 48) { // Max 16 bits in divisor, shift right if (d_z_hi < 48) { n_lo = n_lo >> d_z_lo | n_hi << (64 - d_z_lo); n_hi >>= d_z_lo; d >>= d_z_lo; } return div_me_16(n_hi, n_lo >> 32, n_lo, d); } // Followed by the 'align left' code with the 80 by 16 divide: static u64 div_me_16(u32 acc, u32 n1, u32 n0, u32 d) { u64 q = 0; if (acc) { acc = acc << 16 | n1 >> 16; q = (acc / d) << 16; acc = (acc % d) << 16 | (n1 & 0xffff); } else { acc = n1; if (!acc) return n0 / d; } q |= acc / d; q <<= 32; acc = (acc % d) << 16 | (n0 >> 16); q |= (acc / d) << 16; acc = (acc % d) << 16 | (n0 & 0xffff); q |= acc / d; return q; } In the end I decided the cost of the 64bit ffsll() on 32bit was too much. 
(I could cut it down to 3 'cmov' instead of 4 - and remember they have to be conditional jumps in the kernel.) > > > > /* > > > * Now do a 'long division' with BITS_PER_ITER bit 'digits'. > > > @@ -308,12 +293,17 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) > > > * The worst case is dividing ~0 by 0x8000 which requires two subtracts. > > > */ > > > quotient = 0; > > > - while (reps--) { > > > - q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; > > > + for (reps = 64 / BITS_PER_ITER; reps; reps--) { > > > + quotient <<= BITS_PER_ITER; > > > + n_long = ~n_hi >> (64 - 2 * BITS_PER_ITER); > > > /* Shift 'n' left to align with the product q_digit * d */ > > > overflow = n_hi >> (64 - BITS_PER_ITER); > > > n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); > > > n_lo <<= BITS_PER_ITER; > > > + /* cut it short if q_digit would be 0 */ > > > + if (n_long < d_msig) > > > + continue; > > > > I don't think that is right. > > d_msig is an overestimate so you can only skip the divide and mul_add(). > > That's what I thought initially. But `overflow` was always 0xffff in > that case. I'd like to prove it mathematically, but empirically this > seems to be true with all edge cases I could think of. You are right :-( It all gets 'saved' because the next q_digit can be 17 bits. > I also have a test rig using random numbers validating against the > native x86 128/64 div and it has been running for an hour. I doubt random values will hit many nasty edge cases. ... > > > > > + q_digit = n_long / d_msig; > > > > I think you want to do the divide right at the top - maybe even if the > > result isn't used! > > All the shifts then happen while the divide instruction is in progress > > (even without out-of-order execution). > > Only if there is an actual divide instruction available. If it is a > library call then you're screwed. The library call may well short circuit it anyway. > And the compiler ought to be able to do that kind of shuffling on its > own when there's a benefit. In my experience it doesn't do a very good job - best to give it help. In this case I'd bet it even moves it later on (especially if you add a conditional path that doesn't need it). Try getting (a + b) + (c + d) compiled that way rather than as (a + (b + (c + d))) which has a longer register dependency chain. > > > It is also quite likely that any cpu divide instruction takes a lot > > less clocks when the dividend or quotient is small. > > So if the quotient would be zero there isn't a stall waiting for the > > divide to complete. > > > > As I said before it is a trade off between saving a few clocks for > > specific cases against adding clocks to all the others. > > Leading zero bits on the dividend are very likely, quotients with > > the low 16bits zero much less so (except for test cases). > > Given what I said above wrt overflow I think this is a good tradeoff. > We just need to measure it. Can you do accurate timings for arm64 or arm32? I've found a 2004 Arm book that includes several I-cache busting divide algorithms. But I'm sure this pi-5 has hardware divide. My suspicion is that provided the cpu has reasonable multiply instructions this code will be reasonable with a 32 by 32 software divide. According to Agner's tables all AMD Zen cpu have fast 64bit divide. The same isn't true for Intel though, Skylake is 26 clocks for r32 but 35-88 for r64. You have to get to IceLake (xx-10) to get a fast r64 divide. 
Probably not enough to avoid for the 128 by 64 case, but there may be cases where it is worth it. I need to get my test farm set up - and source a zen1/2 system. David > > > Nicolas
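Spelling out the '(a + b) + (c + d)' point above: the balanced form has two independent adds feeding one final add (a dependency chain of two), while the re-associated form chains all three adds. Whether the compiler keeps either shape is exactly the argument; the snippet below only shows what the two shapes look like.

	typedef unsigned long long u64;

	u64 sum_balanced(u64 a, u64 b, u64 c, u64 d)
	{
		return (a + b) + (c + d);	/* inner adds can issue in parallel */
	}

	u64 sum_chained(u64 a, u64 b, u64 c, u64 d)
	{
		return a + (b + (c + d));	/* each add waits for the previous one */
	}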
On Wed, 18 Jun 2025, David Laight wrote: > On Wed, 18 Jun 2025 11:39:20 -0400 (EDT) > Nicolas Pitre <nico@fluxnic.net> wrote: > > > > > + q_digit = n_long / d_msig; > > > > > > I think you want to do the divide right at the top - maybe even if the > > > result isn't used! > > > All the shifts then happen while the divide instruction is in progress > > > (even without out-of-order execution). Well.... testing on my old Intel Core i7-4770R doesn't show a gain. With your proposed patch as is: ~34ns per call With my proposed changes: ~31ns per call With my changes but leaving the divide at the top of the loop: ~32ns per call > Can you do accurate timings for arm64 or arm32? On a Broadcom BCM2712 (ARM Cortex-A76): With your proposed patch as is: ~20 ns per call With my proposed changes: ~19 ns per call With my changes but leaving the divide at the top of the loop: ~19 ns per call Both CPUs have the same max CPU clock rate (2.4 GHz). These are obtained with clock_gettime(CLOCK_MONOTONIC) over 56000 calls. There is some noise in the results over multiple runs though but still. I could get cycle measurements on the RPi5 but that requires a kernel recompile. > I've found a 2004 Arm book that includes several I-cache busting > divide algorithms. > But I'm sure this pi-5 has hardware divide. It does. Nicolas
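The wall-clock measurement described above boils down to something like the following (the cases[] table is a stand-in for the values copied from the test module, and the repetition count is arbitrary):

	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>

	typedef uint64_t u64;

	u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);	/* code under test */

	/* stand-in values - the real runs used the test module's cases */
	static const struct { u64 a, b, c, d; } cases[] = {
		{ ~0ull, ~0ull, 0, ~0ull },
		{ 0xdeadbeefull, 0xcafef00dull, 0x1234, 3 },
	};
	#define NCASES (sizeof(cases) / sizeof(cases[0]))

	int main(void)
	{
		struct timespec t0, t1;
		volatile u64 sink = 0;
		unsigned long calls = 0;

		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (int rep = 0; rep < 28000; rep++) {
			for (unsigned int i = 0; i < NCASES; i++, calls++)
				sink += mul_u64_add_u64_div_u64(cases[i].a, cases[i].b,
								cases[i].c, cases[i].d);
		}
		clock_gettime(CLOCK_MONOTONIC, &t1);

		double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
		printf("%.1f ns per call over %lu calls\n", ns / calls, calls);
		return 0;
	}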
On Wed, 18 Jun 2025 16:12:49 -0400 (EDT) Nicolas Pitre <nico@fluxnic.net> wrote: > On Wed, 18 Jun 2025, David Laight wrote: > > > On Wed, 18 Jun 2025 11:39:20 -0400 (EDT) > > Nicolas Pitre <nico@fluxnic.net> wrote: > > > > > > > + q_digit = n_long / d_msig; > > > > > > > > I think you want to do the divide right at the top - maybe even if the > > > > result isn't used! > > > > All the shifts then happen while the divide instruction is in progress > > > > (even without out-of-order execution). > > Well.... testing on my old Intel Core i7-4770R doesn't show a gain. > > With your proposed patch as is: ~34ns per call > > With my proposed changes: ~31ns per call > > With my changes but leaving the divide at the top of the loop: ~32ns per call Wonders what makes the difference... Is that random 64bit values (where you don't expect zero digits) or values where there are likely to be small divisors and/or zero digits? On x86 you can use the PERF_COUNT_HW_CPU_CYCLES counter to get pretty accurate counts for a single call. The 'trick' is to use syscall(___NR_perf_event_open,...) and pc = mmap() to get the register number pc->index - 1. Then you want: static inline unsigned int rdpmc(unsigned int counter) { unsigned int low, high; asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter)); return low; } and do: unsigned int start = rdpmc(pc->index - 1); unsigned int zero = 0; OPTIMISER_HIDE_VAR(zero); q = mul_u64_add_u64_div_u64(a + (start & zero), b, c, d); elapsed = rdpmc(pc->index - 1 + (q & zero)) - start; That carefully forces the rdpmc include the code being tested without the massive penalty of lfence/mfence. Do 10 calls and the last 8 will be pretty similar. Lets you time cold-cache and branch mis-prediction effects. > > Can you do accurate timings for arm64 or arm32? > > On a Broadcom BCM2712 (ARM Cortex-A76): > > With your proposed patch as is: ~20 ns per call > > With my proposed changes: ~19 ns per call > > With my changes but leaving the divide at the top of the loop: ~19 ns per call Pretty much no difference. Is that 64bit or 32bit (or the 16 bits per iteration on 64bit) ? The shifts get more expensive on 32bit. Have you timed the original code? > > Both CPUs have the same max CPU clock rate (2.4 GHz). These are obtained > with clock_gettime(CLOCK_MONOTONIC) over 56000 calls. There is some > noise in the results over multiple runs though but still. That many loops definitely trains the branch predictor and ignores any effects of loading the I-cache. As Linus keeps saying, the kernel tends to be 'cold cache', code size matters. That also means that branches are 50% likely to be mis-predicted. (Although working out what cpu actually do is hard.) > > I could get cycle measurements on the RPi5 but that requires a kernel > recompile. Or a loadable module - shame there isn't a sysctl. > > > I've found a 2004 Arm book that includes several I-cache busting > > divide algorithms. > > But I'm sure this pi-5 has hardware divide. > > It does. > > > Nicolas
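For completeness, the perf_event_open()/mmap() setup that the rdpmc() trick above relies on looks roughly like this (x86-64 user space, minimal error handling; pc->index reads as 0 when user-space rdpmc isn't available, and the exact permission knobs vary by kernel):

	#include <linux/perf_event.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static struct perf_event_mmap_page *pc;

	int open_cycle_counter(void)
	{
		struct perf_event_attr attr = {
			.type = PERF_TYPE_HARDWARE,
			.size = sizeof(attr),
			.config = PERF_COUNT_HW_CPU_CYCLES,
			.exclude_kernel = 1,
		};
		int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

		if (fd < 0)
			return -1;
		pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
		if (pc == MAP_FAILED)
			return -1;
		/* the register number to pass to rdpmc() is then pc->index - 1 */
		return pc->index ? 0 : -1;
	}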
On Wed, 18 Jun 2025, David Laight wrote: > On Wed, 18 Jun 2025 16:12:49 -0400 (EDT) > Nicolas Pitre <nico@fluxnic.net> wrote: > > > On Wed, 18 Jun 2025, David Laight wrote: > > > > > On Wed, 18 Jun 2025 11:39:20 -0400 (EDT) > > > Nicolas Pitre <nico@fluxnic.net> wrote: > > > > > > > > > + q_digit = n_long / d_msig; > > > > > > > > > > I think you want to do the divide right at the top - maybe even if the > > > > > result isn't used! > > > > > All the shifts then happen while the divide instruction is in progress > > > > > (even without out-of-order execution). > > > > Well.... testing on my old Intel Core i7-4770R doesn't show a gain. > > > > With your proposed patch as is: ~34ns per call > > > > With my proposed changes: ~31ns per call > > > > With my changes but leaving the divide at the top of the loop: ~32ns per call > > Wonders what makes the difference... > Is that random 64bit values (where you don't expect zero digits) > or values where there are likely to be small divisors and/or zero digits? Those are values from the test module. I just copied it into a user space program. > On x86 you can use the PERF_COUNT_HW_CPU_CYCLES counter to get pretty accurate > counts for a single call. > The 'trick' is to use syscall(___NR_perf_event_open,...) and pc = mmap() to get > the register number pc->index - 1. I'm not acquainted enough with x86 to fully make sense of the above. This is more your territory than mine. ;-) > > > Can you do accurate timings for arm64 or arm32? > > > > On a Broadcom BCM2712 (ARM Cortex-A76): > > > > With your proposed patch as is: ~20 ns per call > > > > With my proposed changes: ~19 ns per call > > > > With my changes but leaving the divide at the top of the loop: ~19 ns per call > > Pretty much no difference. > Is that 64bit or 32bit (or the 16 bits per iteration on 64bit) ? 64bit executable with 32 bits per iteration. > Have you timed the original code? The "your proposed patch as is" is the original code per the first email in this thread. > > Both CPUs have the same max CPU clock rate (2.4 GHz). These are obtained > > with clock_gettime(CLOCK_MONOTONIC) over 56000 calls. There is some > > noise in the results over multiple runs though but still. > > That many loops definitely trains the branch predictor and ignores > any effects of loading the I-cache. > As Linus keeps saying, the kernel tends to be 'cold cache', code size > matters. My proposed changes shrink the code especially on 32-bit systems due to the pre-loop special cases removal. > That also means that branches are 50% likely to be mis-predicted. We can tag it as unlikely. In practice this isn't taken very often. > (Although working out what cpu actually do is hard.) > > > > > I could get cycle measurements on the RPi5 but that requires a kernel > > recompile. > > Or a loadable module - shame there isn't a sysctl. Not sure. I've not investigated how the RPi kernel is configured yet. I suspect this is about granting user space direct access to PMU regs. Nicolas
On Wed, 18 Jun 2025 22:43:47 -0400 (EDT) Nicolas Pitre <nico@fluxnic.net> wrote: > On Wed, 18 Jun 2025, David Laight wrote: > > > On Wed, 18 Jun 2025 16:12:49 -0400 (EDT) > > Nicolas Pitre <nico@fluxnic.net> wrote: > > > > > On Wed, 18 Jun 2025, David Laight wrote: > > > > > > > On Wed, 18 Jun 2025 11:39:20 -0400 (EDT) > > > > Nicolas Pitre <nico@fluxnic.net> wrote: > > > > > > > > > > > + q_digit = n_long / d_msig; > > > > > > > > > > > > I think you want to do the divide right at the top - maybe even if the > > > > > > result isn't used! > > > > > > All the shifts then happen while the divide instruction is in progress > > > > > > (even without out-of-order execution). > > > > > > Well.... testing on my old Intel Core i7-4770R doesn't show a gain. > > > > > > With your proposed patch as is: ~34ns per call > > > > > > With my proposed changes: ~31ns per call > > > > > > With my changes but leaving the divide at the top of the loop: ~32ns per call > > > > Wonders what makes the difference... > > Is that random 64bit values (where you don't expect zero digits) > > or values where there are likely to be small divisors and/or zero digits? > > Those are values from the test module. I just copied it into a user > space program. Ah, those tests are heavily biased towards values with all bits set. I added the 'pre-loop' check to speed up the few that have leading zeros (and don't escape into the div_u64() path). I've been timing the divisions separately. I will look at whether it is worth just checking for the top 32bits being zero on 32bit - where the conditional code is just register moves. ... > My proposed changes shrink the code especially on 32-bit systems due to > the pre-loop special cases removal. > > > That also means that branches are 50% likely to be mis-predicted. > > We can tag it as unlikely. In practice this isn't taken very often. I suspect 'unlikely' is over-rated :-) I had 'fun and games' a few years back trying to minimise the worst-case path for some code running on a simple embedded cpu. Firstly gcc seems to ignore 'unlikely' unless there is code in the 'likely' path - an asm comment will do nicely. The there is is cpu itself, the x86 prefetch logic is likely to assume non-taken (probably actually not-branch), but the prediction logic itself uses whatever is in the selected logic (effectively an array - assume hashed) so if it isn't 'trained' on the code being execute is 50% taken. (Only the P6 (late 90's) had prefix for unlikely/likely.) > > > (Although working out what cpu actually do is hard.) > > > > > > > > I could get cycle measurements on the RPi5 but that requires a kernel > > > recompile. > > > > Or a loadable module - shame there isn't a sysctl. > > Not sure. I've not investigated how the RPi kernel is configured yet. > I suspect this is about granting user space direct access to PMU regs. Something like that - you don't get the TSC by default. Access is denied to (try to) stop timing attacks - but doesn't really help. Just makes it all too hard for everyone else. I'm not sure it also stops all the 'time' functions being implemented in the vdso without a system call - and that hurts performance. David > > > Nicolas
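The 'asm comment' trick mentioned above, shown on a hypothetical version of the q_digit test: the asm() body is only an assembler comment, so no instructions are emitted, but the likely branch is no longer empty, which per the observation above is what lets gcc honour the hint.

	#define unlikely(x)	__builtin_expect(!!(x), 0)

	/* hypothetical helper, only to show the branch shape */
	unsigned long estimate_digit(unsigned long n_long, unsigned long d_msig)
	{
		unsigned long q_digit;

		if (unlikely(n_long < d_msig)) {
			q_digit = 0;
		} else {
			asm("# likely path");	/* assembler comment, no code */
			q_digit = n_long / d_msig;
		}
		return q_digit;
	}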
On Wed, 18 Jun 2025, Nicolas Pitre wrote: > On Wed, 18 Jun 2025, David Laight wrote: > > > If the low divisor bits are zero an alternative simpler divide > > can be used (you want to detect it before the left align). > > I deleted that because it complicates things and probably doesn't > > happen often enough outside the tests cases. > > Depends. On 32-bit systems that might be worth it. Something like: > > if (unlikely(sizeof(long) == 4 && !(u32)d)) > return div_u64(n_hi, d >> 32); Well scratch that. Won't work obviously. Nicolas