[PATCH v2 1/2] mul_u64_u64_div_u64: make it precise always

Posted by Nicolas Pitre 1 year, 5 months ago
From: Nicolas Pitre <npitre@baylibre.com>

Library facilities must always return exact results. If the caller is
content with approximations then it should do the approximation on
its own.

In this particular case the comment in the code says "the algorithm
... below might lose some precision". Well, if you try it with e.g.:

	a = 18446462598732840960
	b = 18446462598732840960
	c = 18446462598732840961

then the produced answer is 0 whereas the exact answer should be
18446462598732840959. This is _some_ precision lost indeed!
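
For the record, the exact result is easy to double-check from userspace
with 128-bit arithmetic (standalone sketch, not part of this patch,
assuming a compiler that provides __int128):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long a = 18446462598732840960ULL;
		unsigned long long b = 18446462598732840960ULL;
		unsigned long long c = 18446462598732840961ULL;
		/* exact 128-bit product, then the division */
		unsigned __int128 prod = (unsigned __int128)a * b;

		/* prints 18446462598732840959 */
		printf("%llu\n", (unsigned long long)(prod / c));
		return 0;
	}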

Let's reimplement this function so it always produces the exact result
regardless of its inputs while preserving existing fast paths
when possible.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 lib/math/div64.c | 123 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 43 deletions(-)

diff --git a/lib/math/div64.c b/lib/math/div64.c
index 191761b1b6..dd461b3973 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -186,55 +186,92 @@ EXPORT_SYMBOL(iter_div_u64_rem);
 #ifndef mul_u64_u64_div_u64
 u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
 {
-	u64 res = 0, div, rem;
-	int shift;
+	if (ilog2(a) + ilog2(b) <= 62)
+		return div64_u64(a * b, c);
 
-	/* can a * b overflow ? */
-	if (ilog2(a) + ilog2(b) > 62) {
-		/*
-		 * Note that the algorithm after the if block below might lose
-		 * some precision and the result is more exact for b > a. So
-		 * exchange a and b if a is bigger than b.
-		 *
-		 * For example with a = 43980465100800, b = 100000000, c = 1000000000
-		 * the below calculation doesn't modify b at all because div == 0
-		 * and then shift becomes 45 + 26 - 62 = 9 and so the result
-		 * becomes 4398035251080. However with a and b swapped the exact
-		 * result is calculated (i.e. 4398046510080).
-		 */
-		if (a > b)
-			swap(a, b);
+#if defined(__SIZEOF_INT128__)
+
+	/* native 64x64=128 bits multiplication */
+	u128 prod = (u128)a * b;
+	u64 n_lo = prod, n_hi = prod >> 64;
+
+#else
+
+	/* perform a 64x64=128 bits multiplication manually */
+	union {
+		u64 v;
+		struct {
+#if defined(CONFIG_CPU_LITTLE_ENDIAN)
+			u32 l;
+			u32 h;
+#elif defined(CONFIG_CPU_BIG_ENDIAN)
+			u32 h;
+			u32 l;
+#else
+#error "unknown endianness"
+#endif
+		};
+	} A, B, X, Y, Z;
+
+	A.v = a;
+	B.v = b;
+
+	X.v = (u64)A.l * B.l;
+	Y.v = (u64)A.l * B.h + X.h;
+	Z.v = (u64)A.h * B.h + Y.h;
+	Y.v = (u64)A.h * B.l + Y.l;
+	X.h = Y.l;
+	Z.v += Y.h;
+
+	u64 n_lo = X.v, n_hi = Z.v;
+
+#endif
 
+	int shift = __builtin_ctzll(c);
+
+	/* try reducing the fraction in case the dividend becomes <= 64 bits */
+	if ((n_hi >> shift) == 0) {
+		u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo;
+
+		return div64_u64(n, c >> shift);
 		/*
-		 * (b * a) / c is equal to
-		 *
-		 *      (b / c) * a +
-		 *      (b % c) * a / c
-		 *
-		 * if nothing overflows. Can the 1st multiplication
-		 * overflow? Yes, but we do not care: this can only
-		 * happen if the end result can't fit in u64 anyway.
-		 *
-		 * So the code below does
-		 *
-		 *      res = (b / c) * a;
-		 *      b = b % c;
+		 * The remainder value if needed would be:
+		 *   res = div64_u64_rem(n, c >> shift, &rem);
+		 *   rem = (rem << shift) + (n_lo - (n << shift));
 		 */
-		div = div64_u64_rem(b, c, &rem);
-		res = div * a;
-		b = rem;
-
-		shift = ilog2(a) + ilog2(b) - 62;
-		if (shift > 0) {
-			/* drop precision */
-			b >>= shift;
-			c >>= shift;
-			if (!c)
-				return res;
-		}
 	}
 
-	return res + div64_u64(a * b, c);
+	if (n_hi >= c) {
+		/* overflow: result is unrepresentable in a u64 */
+		return -1;
+	}
+
+	/* Do the full 128 by 64 bits division */
+
+	shift = __builtin_clzll(c);
+	c <<= shift;
+
+	int p = 64 + shift;
+	u64 res = 0;
+	bool carry;
+
+	do {
+		carry = n_hi >> 63;
+		shift = carry ? 1 : __builtin_clzll(n_hi);
+		if (p < shift)
+			break;
+		p -= shift;
+		n_hi <<= shift;
+		n_hi |= n_lo >> (64 - shift);
+		n_lo <<= shift;
+		if (carry || (n_hi >= c)) {
+			n_hi -= c;
+			res |= 1ULL << p;
+		}
+	} while (n_hi);
+	/* The remainder value if needed would be n_hi << p */
+
+	return res;
 }
 EXPORT_SYMBOL(mul_u64_u64_div_u64);
 #endif
-- 
2.45.2
Re: [PATCH v2 1/2] mul_u64_u64_div_u64: make it precise always
Posted by Uwe Kleine-König 1 year, 5 months ago
Hello Nico,

On Tue, Jul 02, 2024 at 11:34:08PM -0400, Nicolas Pitre wrote:
> diff --git a/lib/math/div64.c b/lib/math/div64.c
> index 191761b1b6..dd461b3973 100644
> --- a/lib/math/div64.c
> +++ b/lib/math/div64.c
> @@ -186,55 +186,92 @@ EXPORT_SYMBOL(iter_div_u64_rem);
>  #ifndef mul_u64_u64_div_u64
>  u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
>  {
> -	u64 res = 0, div, rem;
> -	int shift;
> +	if (ilog2(a) + ilog2(b) <= 62)
> +		return div64_u64(a * b, c);
>  
> -	/* can a * b overflow ? */
> -	if (ilog2(a) + ilog2(b) > 62) {
> -		/*
> -		 * Note that the algorithm after the if block below might lose
> -		 * some precision and the result is more exact for b > a. So
> -		 * exchange a and b if a is bigger than b.
> -		 *
> -		 * For example with a = 43980465100800, b = 100000000, c = 1000000000
> -		 * the below calculation doesn't modify b at all because div == 0
> -		 * and then shift becomes 45 + 26 - 62 = 9 and so the result
> -		 * becomes 4398035251080. However with a and b swapped the exact
> -		 * result is calculated (i.e. 4398046510080).
> -		 */
> -		if (a > b)
> -			swap(a, b);
> +#if defined(__SIZEOF_INT128__)
> +
> +	/* native 64x64=128 bits multiplication */
> +	u128 prod = (u128)a * b;
> +	u64 n_lo = prod, n_hi = prod >> 64;
> +
> +#else
> +
> +	/* perform a 64x64=128 bits multiplication manually */
> +	union {
> +		u64 v;
> +		struct {
> +#if defined(CONFIG_CPU_LITTLE_ENDIAN)
> +			u32 l;
> +			u32 h;
> +#elif defined(CONFIG_CPU_BIG_ENDIAN)
> +			u32 h;
> +			u32 l;
> +#else
> +#error "unknown endianness"
> +#endif
> +		};
> +	} A, B, X, Y, Z;
> +
> +	A.v = a;
> +	B.v = b;
> +
> +	X.v = (u64)A.l * B.l;
> +	Y.v = (u64)A.l * B.h + X.h;
> +	Z.v = (u64)A.h * B.h + Y.h;
> +	Y.v = (u64)A.h * B.l + Y.l;
> +	X.h = Y.l;
> +	Z.v += Y.h;
> +
> +	u64 n_lo = X.v, n_hi = Z.v;

I tried to understand your patch. This part could really benefit from
some comments. With pen and paper I worked out your idea:

	The goal is:

		A * B == Z << 64 + X

	With A = A.h << 32 + A.l and similar identities for B, we have:

	A * B = (A.h << 32 + A.l) * (B.h << 32 + B.l)
	      = (A.h * B.h << 64 + (A.h * B.l + A.l * B.h) << 32 + A.l * B.l

	The operations done here are only 32 bit multiplications and
	additions, and with U32_MAX = 0xffffffff we have:
	U32_MAX * U32_MAX + U32_MAX = (U32_MAX + 1) * U32_MAX =
	0xffffffff00000000 which fits into an u64.  Even when adding
	another U32_MAX (which happens with Z.v += Y.h) it still fits
	into u64, and so the operations won't overflow.
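
To double-check this decomposition, here is a userspace sketch of the
same schoolbook multiply (hypothetical helper name, assuming __int128
is available) compared against a native 128-bit product:

	#include <assert.h>
	#include <stdint.h>

	/* mirrors the X/Y/Z steps of the patch, hi:lo == a * b */
	static void mul_64x64_128(uint64_t a, uint64_t b,
				  uint64_t *hi, uint64_t *lo)
	{
		uint64_t x, y, z;

		x = (uint64_t)(uint32_t)a * (uint32_t)b;	   /* A.l*B.l */
		y = (uint64_t)(uint32_t)a * (b >> 32) + (x >> 32); /* A.l*B.h + X.h */
		z = (a >> 32) * (b >> 32) + (y >> 32);		   /* A.h*B.h + Y.h */
		y = (a >> 32) * (uint32_t)b + (uint32_t)y;	   /* A.h*B.l + Y.l */
		*hi = z + (y >> 32);
		*lo = (y << 32) | (uint32_t)x;
	}

	int main(void)
	{
		uint64_t a = 0xffffffff00000000ULL;
		uint64_t b = 0xffffffffffffffffULL;
		uint64_t hi, lo;
		unsigned __int128 ref = (unsigned __int128)a * b;

		mul_64x64_128(a, b, &hi, &lo);
		assert(hi == (uint64_t)(ref >> 64) && lo == (uint64_t)ref);
		return 0;
	}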

> +
> +#endif

So starting here we have

	n_hi << 64 + n_lo == a * b

> +	int shift = __builtin_ctzll(c);
> +
> +	/* try reducing the fraction in case the dividend becomes <= 64 bits */
> +	if ((n_hi >> shift) == 0) {

The idea here is: c = c_ << shift, and so

	a * b / c == ((a * b) >> shift) / c_

In this if-body we're handling (a * b) >> shift fitting into an u64.
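
A toy example of that reduction (sketch): with c = 24 = 3 << 3 we get
shift = 3, and 144 / 24 == (144 >> 3) / 3:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t c = 24;			/* == 3 << 3 */
		int shift = __builtin_ctzll(c);		/* 3 */
		uint64_t n = 144 >> shift;		/* 18 */

		assert(n / (c >> shift) == 144 / 24);	/* both are 6 */
		return 0;
	}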

> +		u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo;
> +
> +		return div64_u64(n, c >> shift);
>  		/*
> -		 * (b * a) / c is equal to
> -		 *
> -		 *      (b / c) * a +
> -		 *      (b % c) * a / c
> -		 *
> -		 * if nothing overflows. Can the 1st multiplication
> -		 * overflow? Yes, but we do not care: this can only
> -		 * happen if the end result can't fit in u64 anyway.
> -		 *
> -		 * So the code below does
> -		 *
> -		 *      res = (b / c) * a;
> -		 *      b = b % c;
> +		 * The remainder value if needed would be:
> +		 *   res = div64_u64_rem(n, c >> shift, &rem);
> +		 *   rem = (rem << shift) + (n_lo - (n << shift));
>  		 */
> -		div = div64_u64_rem(b, c, &rem);
> -		res = div * a;
> -		b = rem;
> -
> -		shift = ilog2(a) + ilog2(b) - 62;
> -		if (shift > 0) {
> -			/* drop precision */
> -			b >>= shift;
> -			c >>= shift;
> -			if (!c)
> -				return res;
> -		}
>  	}
>  
> -	return res + div64_u64(a * b, c);
> +	if (n_hi >= c) {
> +		/* overflow: result is unrepresentable in a u64 */
> +		return -1;
> +	}
> +
> +	/* Do the full 128 by 64 bits division */

Here is the code location where I stop understanding your code :-)

Maybe stating the loop invariant in a comment would be helpful?

> +	shift = __builtin_clzll(c);
> +	c <<= shift;
> +
> +	int p = 64 + shift;
> +	u64 res = 0;
> +	bool carry;
> +
> +	do {
> +		carry = n_hi >> 63;
> +		shift = carry ? 1 : __builtin_clzll(n_hi);
> +		if (p < shift)
> +			break;
> +		p -= shift;
> +		n_hi <<= shift;
> +		n_hi |= n_lo >> (64 - shift);
> +		n_lo <<= shift;
> +		if (carry || (n_hi >= c)) {
> +			n_hi -= c;
> +			res |= 1ULL << p;
> +		}
> +	} while (n_hi);
> +	/* The remainder value if needed would be n_hi << p */

I indeed need a variant of this function that rounds up. So maybe
creating a function

	u64 mul_u64_u64_div_u64_rem(u64 a, u64 b, u64 c, u64 *rem)

with the sophistication of your mul_u64_u64_div_u64 and then:

	u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
	{
		u64 rem, ret;

		ret = mul_u64_u64_div_u64_rem(a, b, c, &rem);
		return ret;
	}

(In the hope that the compiler optimizes out the calculation for the
remainder) and:

	u64 mul_u64_u64_div_u64_roundup(u64 a, u64 b, u64 c)
	{
		u64 rem, ret;

		ret = mul_u64_u64_div_u64_rem(a, b, c, &rem);

		if (rem)
			ret += 1;

		return ret;
	}

would be nice. (This could be done in a follow up patch though.)

> +	return res;
>  }
>  EXPORT_SYMBOL(mul_u64_u64_div_u64);
>  #endif

Best regards
Uwe
Re: [PATCH v2 1/2] mul_u64_u64_div_u64: make it precise always
Posted by Nicolas Pitre 1 year, 5 months ago
On Thu, 4 Jul 2024, Uwe Kleine-König wrote:

> Hello Nico,
> 
> On Tue, Jul 02, 2024 at 11:34:08PM -0400, Nicolas Pitre wrote:
> > +	A.v = a;
> > +	B.v = b;
> > +
> > +	X.v = (u64)A.l * B.l;
> > +	Y.v = (u64)A.l * B.h + X.h;
> > +	Z.v = (u64)A.h * B.h + Y.h;
> > +	Y.v = (u64)A.h * B.l + Y.l;
> > +	X.h = Y.l;
> > +	Z.v += Y.h;
> > +
> > +	u64 n_lo = X.v, n_hi = Z.v;
> 
> I tried to understand your patch. This part could really benefit from
> some comments. With pen and paper I worked out your idea:
> 
> 	The goal is:
> 
> 		A * B == Z << 64 + X
> 
> 	With A = A.h << 32 + A.l and similar identities for B, we have:
> 
> 	A * B = (A.h << 32 + A.l) * (B.h << 32 + B.l)
> 	      = (A.h * B.h << 64 + (A.h * B.l + A.l * B.h) << 32 + A.l * B.l
                          ^ missing )

> 	The operations done here are only 32 bit multiplications and
> 	additions, and with U32_MAX = 0xffffffff we have:
> 	U32_MAX * U32_MAX + U32_MAX = (U32_MAX + 1) * U32_MAX =
> 	0xffffffff00000000 which fits into an u64.  Even when adding
> 	another U32_MAX (which happens with Z.v += Y.h) it still fits
> 	into u64, and so the operations won't overflow.

Exactly, that's the idea.

I was about to reproduce the code I wrote for a similar purpose about 18 
years ago that currently lives in __arch_xprod_64() 
(include/asm-generic/div64.h) when I realized that, with some 
reordering, all the overflow handling could be avoided entirely.

So I'm about to submit a nice simplification of my old optimized 
__div64_const32() based on this realisation.

> > +	/* Do the full 128 by 64 bits division */
> 
> Here is the code location where I stop understanding your code :-)

Here's how it goes:

To do a binary division, you have to align the numbers, find how many
times the divisor fits and subtract, just like we learned in primary
school. Except that we have binary numbers instead of base 10 numbers,
making the "how many times" either 0 or 1.

To align the numbers, let's simply shift them left until the top set
bit reaches the leftmost bit position.

Let's suppose a divisor c of 0x00ffffff00000000

        shift = __builtin_clzll(c);
        c <<= shift;

so shift = 8 and c becomes 0xffffff0000000000

We also need to track the actual divisor power. Given that we start working 
on n_hi not n_lo, this means the power is initially 64. But we just 
shifted c which increased its power:

        p = 64 + shift;

Then, remember that n_hi < original c. That's ensured by the overflow 
test earlier. So shifting n_hi leftwards requires at least as great a
shift as the one we applied to c, meaning that p is down to 63 or less
by the time a subtraction first occurs.

Let's suppose n_hi = 0x000ffffffff00000 and n_lo = 0

Then enter the loop:

                carry = n_hi >> 63;

Top bit of n_hi is unset so no carry.

                shift = carry ? 1 : __builtin_clzll(n_hi);

If n_hi's top bit was set we'd have a shift of 1 with a carry. But here 
there is no carry and aligning n_hi to the top left bit requires a shift 
of 12.

                n_hi <<= shift;
                n_hi |= n_lo >> (64 - shift);
                n_lo <<= shift;

So n_hi is now 0xffffffff00000000

                p -= shift;

Shifting left the dividend reduces the divisor's power.
So p is now 64 + 8 - 12 = 60

Then, the crux of the operation:

                if (carry || (n_hi >= c)) {
                        n_hi -= c;
                        res |= 1ULL << p;
                }

So... if the divisor fits then we add a 1 to the result and subtract it.
n_hi = 0xffffffff00000000 - 0xffffff0000000000 = 0x000000ff00000000
res |= 1 << 60

Let's loop again:

                carry = n_hi >> 63;
                shift = carry ? 1 : __builtin_clzll(n_hi);
                ...

No carry, shift becomes 24, p becomes 60 - 24 = 36 and
n_hi becomes 0xff00000000000000.

                if (carry || (n_hi >= c)) { ...

No carry and n_hi is smaller than c so loop again.

                carry = n_hi >> 63;
                shift = carry ? 1 : __builtin_clzll(n_hi);

This time we have a carry as the top bit of n_hi is set and we're about 
to shift it by 1. p becomes 35 and n_hi becomes 0xfe00000000000000. In 
reality it is like having 0x1fe00000000000000 (a 65-bit value) which is 
obviously bigger than 0xffffff0000000000. So we can augment the result 
and subtract. Thanks to two's complement, we have:

n_hi = 0xfe00000000000000 - 0xffffff0000000000 = 0xfe00010000000000

and 

res = 1 << 60 | 1 << 35

And so on until either n_hi becomes 0 or p would go negative, which 
might happen quite quickly in some cases.
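
If you want to play with it, here is the same loop extracted into a
standalone userspace sketch (made-up name; it assumes 0 < n_hi < c as
the earlier code paths guarantee, and checks itself against __int128):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* 128-by-64 bit division as described above; needs 0 < n_hi < c */
	static uint64_t div128_64(uint64_t n_hi, uint64_t n_lo, uint64_t c)
	{
		int shift = __builtin_clzll(c);
		int p = 64 + shift;
		uint64_t res = 0;
		bool carry;

		c <<= shift;			/* align the divisor */
		do {
			carry = n_hi >> 63;
			shift = carry ? 1 : __builtin_clzll(n_hi);
			if (p < shift)
				break;
			p -= shift;		/* align the dividend */
			n_hi = (n_hi << shift) | (n_lo >> (64 - shift));
			n_lo <<= shift;
			if (carry || n_hi >= c) {
				n_hi -= c;	/* the divisor fits once */
				res |= 1ULL << p;
			}
		} while (n_hi);
		return res;
	}

	int main(void)
	{
		uint64_t c = 0x00ffffff00000000ULL;
		uint64_t n_hi = 0x000ffffffff00000ULL, n_lo = 0;
		unsigned __int128 n = (unsigned __int128)n_hi << 64 | n_lo;

		assert(div128_64(n_hi, n_lo, c) == (uint64_t)(n / c));
		return 0;
	}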

> > +	/* The remainder value if needed would be n_hi << p */
> 
> I indeed need a variant of this function that rounds up. So maybe
> creating a function
> 
> 	u64 mul_u64_u64_div_u64_rem(u64 a, u64 b, u64 c, u64 *rem)
> 
> with the sophistication of your mul_u64_u64_div_u64 and then:
> 
> 	u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> 	{
> 		u64 rem, ret;
> 
> 		ret = mul_u64_u64_div_u64_rem(a, b, c, &rem);
> 		return ret;
> 	}
> 
> (In the hope that the compiler optimizes out the calculation for the
> remainder) 

It probably won't unless the core function is a static inline.

It might be more efficient to do this:

u64 mul_u64_u64_div_u64_rem(u64 a, u64 b, u64 c, u64 *rem)
{
	u64 res = mul_u64_u64_div_u64(a, b, c);

	/* those multiplications will overflow but it doesn't matter */
	*rem = a * b - c * res;

	return res;
}

This way the core code doesn't get duplicated.
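
The truncated multiplications work out because a * b - c * res equals
the true remainder modulo 2^64, and that remainder is less than c so
it fits in a u64. A quick userspace check of the identity (sketch,
assuming __int128), reusing the values from the commit log:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t a = 18446462598732840960ULL, b = a;
		uint64_t c = 18446462598732840961ULL;
		unsigned __int128 prod = (unsigned __int128)a * b;
		uint64_t res = prod / c;
		/* overflows on purpose, exact modulo 2^64 */
		uint64_t rem = a * b - c * res;

		assert(rem == (uint64_t)(prod % c));	/* rem == 1 here */
		return 0;
	}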

> and:
> 
> 	u64 mul_u64_u64_div_u64_roundup(u64 a, u64 b, u64 c)
> 	{
> 		u64 rem, ret;
> 
> 		ret = mul_u64_u64_div_u64_rem(a, b, c, &rem);
> 
> 		if (rem)
> 			ret += 1;
> 
> 		return ret;
> 	}
> 
> would be nice. (This could be done in a follow up patch though.)

You're welcome to it.  ;-)


Nicolas
Re: [PATCH v2 1/2] mul_u64_u64_div_u64: make it precise always
Posted by Uwe Kleine-König 1 year, 5 months ago
On Thu, Jul 04, 2024 at 05:16:37PM -0400, Nicolas Pitre wrote:
> On Thu, 4 Jul 2024, Uwe Kleine-König wrote:
> 
> > Hello Nico,
> > 
> > On Tue, Jul 02, 2024 at 11:34:08PM -0400, Nicolas Pitre wrote:
> > > +	A.v = a;
> > > +	B.v = b;
> > > +
> > > +	X.v = (u64)A.l * B.l;
> > > +	Y.v = (u64)A.l * B.h + X.h;
> > > +	Z.v = (u64)A.h * B.h + Y.h;
> > > +	Y.v = (u64)A.h * B.l + Y.l;
> > > +	X.h = Y.l;
> > > +	Z.v += Y.h;
> > > +
> > > +	u64 n_lo = X.v, n_hi = Z.v;
> > 
> > I tried to understand your patch. This part could really benefit from
> > some comments. With pen and paper I worked out your idea:
> > 
> > 	The goal is:
> > 
> > 		A * B == Z << 64 + X
> > 
> > 	With A = A.h << 32 + A.l and similar identities for B, we have:
> > 
> > 	A * B = (A.h << 32 + A.l) * (B.h << 32 + B.l)
> > 	      = (A.h * B.h << 64 + (A.h * B.l + A.l * B.h) << 32 + A.l * B.l
>                           ^ missing )

Ack, alternatively the opening ( can be dropped.

> > 	The operations done here are only 32 bit multiplications and
> > 	additions, and with U32_MAX = 0xffffffff we have:
> > 	U32_MAX * U32_MAX + U32_MAX = (U32_MAX + 1) * U32_MAX =
> > 	0xffffffff00000000 which fits into an u64.  Even when adding
> > 	another U32_MAX (which happens with Z.v += Y.h) it still fits
> > 	into u64, and so the operations won't overflow.
> 
> Exactly, that's the idea.
> 
> I was about to reproduce the code I wrote for a similar purpose about 18 
> years ago that currently lives in __arch_xprod_64() 
> (include/asm-generic/div64.h) when I realized that, with some 
> reordering, all the overflow handling could be avoided entirely.
> 
> So I'm about to submit a nice simplification of my old optimized 
> __div64_const32() based on this realisation.
> 
> > > +	/* Do the full 128 by 64 bits division */
> > 
> > Here is the code location where I stop understanding your code :-)
> 
> Here's how it goes:
> 
> To do a binary division, you have to align the numbers, find how many
> times the divisor fits and subtract, just like we learned in primary
> school. Except that we have binary numbers instead of base 10 numbers,
> making the "how many times" either 0 or 1.
> 
> To align the numbers, let's simply shift them left until the top set
> bit reaches the leftmost bit position.
> 
> Let's suppose a divisor c of 0x00ffffff00000000
> 
>         shift = __builtin_clzll(c);
>         c <<= shift;
> 
> so shift = 8 and c becomes 0xffffff0000000000
> 
> We also need to track the actual divisor power. Given that we start working 
> on n_hi not n_lo, this means the power is initially 64. But we just 
> shifted c which increased its power:
> 
>         p = 64 + shift;
> 
> Then, remember that n_hi < original c. That's ensured by the overflow 
> test earlier. So shifting n_hi leftwards requires at least as great a
> shift as the one we applied to c, meaning that p is down to 63 or less
> by the time a subtraction first occurs.
> 
> Let's suppose n_hi = 0x000ffffffff00000 and n_lo = 0
> 
> Then enter the loop:
> 
>                 carry = n_hi >> 63;
> 
> Top bit of n_hi is unset so no carry.
> 
>                 shift = carry ? 1 : __builtin_clzll(n_hi);
> 
> If n_hi's top bit was set we'd have a shift of 1 with a carry. But here 
> there is no carry and aligning n_hi to the top left bit requires a shift 
> of 12.
> 
>                 n_hi <<= shift;
>                 n_hi |= n_lo >> (64 - shift);
>                 n_lo <<= shift;
> 
> So n_hi is now 0xffffffff00000000
> 
>                 p -= shift;
> 
> Shifting left the dividend reduces the divisor's power.
> So p is now 64 + 8 - 12 = 60
> 
> Then, the crux of the operation:
> 
>                 if (carry || (n_hi >= c)) {
>                         n_hi -= c;
>                         res |= 1ULL << p;
>                 }
> 
> So... if the divisor fits then we add a 1 to the result and subtract it.
> n_hi = 0xffffffff00000000 - 0xffffff0000000000 = 0x000000ff00000000
> res |= 1 << 60
> 
> Let's loop again:
> 
>                 carry = n_hi >> 63;
>                 shift = carry ? 1 : __builtin_clzll(n_hi);
>                 ...
> 
> No carry, shift becomes 24, p becomes 60 - 24 = 36 and
> n_hi becomes 0xff00000000000000.
> 
>                 if (carry || (n_hi >= c)) { ...
> 
> No carry and n_hi is smaller than c so loop again.
> 
>                 carry = n_hi >> 63;
>                 shift = carry ? 1 : __builtin_clzll(n_hi);
> 
> This time we have a carry as the top bit of n_hi is set and we're about 
> to shift it by 1. p becomes 35 and n_hi becomes 0xfe00000000000000. In 
> reality it is like having 0x1fe00000000000000 (a 65-bit value) which is 
> obviously bigger than 0xffffff0000000000. So we can augment the result 
> and subtract. Thanks to two's complement, we have:
> 
> n_hi = 0xfe00000000000000 - 0xffffff0000000000 = 0xfe00010000000000
> 
> and 
> 
> res = 1 << 60 | 1 << 35

Oh wow, that part is clever. Before your mail I wondered for a while why
the right thing happens if carry=1 but n_hi < c.

> And so on until either n_hi becomes 0 or p would go negative, which 
> might happen quite quickly in some cases.

OK, so the loop invariant at the end of each iteration is:

	final result = res + (n_hi#n_lo << p) / c

(with n_hi#n_lo = n_hi << 64 | n_lo), right?

> > > +	/* The remainder value if needed would be n_hi << p */
> > 
> > I indeed need a variant of this function that rounds up. So maybe
> > creating a function
> > 
> > 	u64 mul_u64_u64_div_u64_rem(u64 a, u64 b, u64 c, u64 *rem)
> > 
> > with the sophistication of your mul_u64_u64_div_u64 and then:
> > 
> > 	u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> > 	{
> > 		u64 rem, ret;
> > 
> > 		ret = mul_u64_u64_div_u64_rem(a, b, c, &rem);
> > 		return ret;
> > 	}
> > 
> > (In the hope that the compiler optimizes out the calculation for the
> > remainder) 
> 
> It probably won't unless the core function is a static inline.
> 
> It might be more efficient to do this:
> 
> u64 mul_u64_u64_div_u64_rem(u64 a, u64 b, u64 c, u64 *rem)
> {
> 	u64 res = mul_u64_u64_div_u64(a, b, c);
> 
> 	/* those multiplications will overflow but it doesn't matter */
> 	*rem = a * b - c * res;
> 
> 	return res;
> }
> 
> This way the core code doesn't get duplicated.

Good idea. I'll check that after the discussion about this patch.

Best regards
Uwe
Re: [PATCH v2 1/2] mul_u64_u64_div_u64: make it precise always
Posted by Nicolas Pitre 1 year, 5 months ago
On Fri, 5 Jul 2024, Uwe Kleine-König wrote:

> On Thu, Jul 04, 2024 at 05:16:37PM -0400, Nicolas Pitre wrote:
> > Then enter the loop:
> > 
> >                 carry = n_hi >> 63;
> > 
> > Top bit of n_hi is unset so no carry.
> > 
> >                 shift = carry ? 1 : __builtin_clzll(n_hi);
> > 
> > If n_hi's top bit was set we'd have a shift of 1 with a carry. But here 
> > there is no carry and aligning n_hi to the top left bit requires a shift 
> > of 12.
> > 
> >                 n_hi <<= shift;
> >                 n_hi |= n_lo >> (64 - shift);
> >                 n_lo <<= shift;
> > 
> > So n_hi is now 0xffffffff00000000
> > 
> >                 p -= shift;
> > 
> > Shifting left the dividend reduces the divisor's power.
> > So p is now 64 + 8 - 12 = 60
> > 
> > Then, the crux of the operation:
> > 
> >                 if (carry || (n_hi >= c)) {
> >                         n_hi -= c;
> >                         res |= 1ULL << p;
> >                 }
> > 
> > So... if the divisor fits then we add a 1 to the result and subtract it.
> > n_hi = 0xffffffff00000000 - 0xffffff0000000000 = 0x000000ff00000000
> > res |= 1 << 60
> > 
> > Let's loop again:
> > 
> >                 carry = n_hi >> 63;
> >                 shift = carry ? 1 : __builtin_clzll(n_hi);
> >                 ...
> > 
> > No carry, shift becomes 24, p becomes 60 - 24 = 36 and
> > n_hi becomes 0xff00000000000000.
> > 
> >                 if (carry || (n_hi >= c)) { ...
> > 
> > No carry and n_hi is smaller than c so loop again.
> > 
> >                 carry = n_hi >> 63;
> >                 shift = carry ? 1 : __builtin_clzll(n_hi);
> > 
> > This time we have a carry as the top bit of n_hi is set and we're about 
> > to shift it by 1. p becomes 35 and n_hi becomes 0xfe00000000000000. In 
> > reality it is like having 0x1fe00000000000000 (a 65-bit value) which is 
> > obviously bigger than 0xffffff0000000000. So we can augment the result 
> > and subtract. Thanks to two's complement, we have:
> > 
> > n_hi = 0xfe00000000000000 - 0xffffff0000000000 = 0xfe00010000000000
> > 
> > and 
> > 
> > res = 1 << 60 | 1 << 35
> 
> Oh wow, that part is clever. Before your mail I wondered for a while why
> the right thing happens if carry=1 but n_hi < c.
> 
> > And so on until either n_hi becomes 0 or p would go negative, which 
> > might happen quite quickly in some cases.
> 
> OK, so the loop invariant at the end of each iteration is:
> 
> 	final result = res + (n_hi#n_lo << p) / c
> 
> (with n_hi#n_lo = n_hi << 64 | n_lo), right?

Hmmm maybe. I hardly think in such terms so I can't say for sure if this 
is right.


Nicolas