The existing mul_u64_u64_div_u64() rounds down; a 'rounding up'
variant needs 'divisor - 1' added in between the multiply and
divide, so it cannot easily be done by a caller.
Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
and implement the 'round down' and 'round up' using it.
Update the x86-64 asm to optimise for 'c' being a constant zero.
For architectures that support u128, check for a 64bit product after
the multiply (the check is cheap).
Leave in the early check for other architectures (mostly 32bit) when
'c' is zero to avoid the multi-part multiply.
Note that the cost of the 128bit divide will dwarf the rest of the code.
This function is very slow on everything except x86-64 (very very slow
on 32bit).
Add kernel-doc definitions for all three functions.
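As an illustration (not part of this patch), a hypothetical caller that
scales nanoseconds to clock ticks and must never undershoot can use the
round-up variant directly instead of needing the remainder of the
128bit product:

	/* Hypothetical helper: ticks needed to cover @ns at @rate_hz, rounded up */
	static u64 ns_to_ticks_roundup(u64 ns, u64 rate_hz)
	{
		return mul_u64_u64_div_u64_roundup(ns, rate_hz, NSEC_PER_SEC);
	}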
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
Changes for v2 (formerly patch 1/3):
- Reinstate the early call to div64_u64() on 32bit when 'c' is zero.
Although I'm not convinced the path is common enough to be worth
the two ilog2() calls.
arch/x86/include/asm/div64.h | 19 ++++++++++-----
include/linux/math64.h | 45 +++++++++++++++++++++++++++++++++++-
lib/math/div64.c | 21 ++++++++++-------
3 files changed, 70 insertions(+), 15 deletions(-)
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9931e4c7d73f..7a0a916a2d7d 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -84,21 +84,28 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
* Will generate an #DE when the result doesn't fit u64, could fix with an
* __ex_table[] entry when it becomes an issue.
*/
-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
+static inline u64 mul_u64_add_u64_div_u64(u64 a, u64 mul, u64 add, u64 div)
{
u64 q;
- asm ("mulq %2; divq %3" : "=a" (q)
- : "a" (a), "rm" (mul), "rm" (div)
- : "rdx");
+ if (statically_true(!add)) {
+ asm ("mulq %2; divq %3" : "=a" (q)
+ : "a" (a), "rm" (mul), "rm" (div)
+ : "rdx");
+ } else {
+ asm ("mulq %2; addq %3, %%rax; adcq $0, %%rdx; divq %4"
+ : "=a" (q)
+ : "a" (a), "rm" (mul), "rm" (add), "rm" (div)
+ : "rdx");
+ }
return q;
}
-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
{
- return mul_u64_u64_div_u64(a, mul, div);
+ return mul_u64_add_u64_div_u64(a, mul, 0, div);
}
#define mul_u64_u32_div mul_u64_u32_div
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6aaccc1626ab..e1c2e3642cec 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -282,7 +282,50 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
-u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+/**
+ * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @c: unsigned 64bit addend
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * add a third value and then divide by a fourth.
+ * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
+ *
+ * Return: (@a * @b + @c) / @d
+ */
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
+
+/**
+ * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide by a third value.
+ * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
+ *
+ * Return: @a * @b / @d
+ */
+#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d)
+
+/**
+ * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up
+ * @a: first unsigned 64bit multiplicand
+ * @b: second unsigned 64bit multiplicand
+ * @d: unsigned 64bit divisor
+ *
+ * Multiply two 64bit values together to generate a 128bit product
+ * and then divide and round up.
+ * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
+ *
+ * Return: (@a * @b + @d - 1) / @d
+ */
+#define mul_u64_u64_div_u64_roundup(a, b, d) \
+ ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); })
+
/**
* DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
diff --git a/lib/math/div64.c b/lib/math/div64.c
index c426fa0660bc..66bfb6159f02 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -183,29 +183,31 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
}
EXPORT_SYMBOL(iter_div_u64_rem);
-#ifndef mul_u64_u64_div_u64
-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
+#ifndef mul_u64_add_u64_div_u64
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
/* Trigger exception if divisor is zero */
BUG_ON(!d);
- if (ilog2(a) + ilog2(b) <= 62)
- return div64_u64(a * b, d);
-
#if defined(__SIZEOF_INT128__)
/* native 64x64=128 bits multiplication */
- u128 prod = (u128)a * b;
+ u128 prod = (u128)a * b + c;
u64 n_lo = prod, n_hi = prod >> 64;
#else
+ if (!c && ilog2(a) + ilog2(b) <= 62)
+ return div64_u64(a * b, d);
+
/* perform a 64x64=128 bits multiplication manually */
u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
u64 x, y, z;
- x = (u64)a_lo * b_lo;
+ /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
+ x = (u64)a_lo * b_lo + (u32)c;
y = (u64)a_lo * b_hi + (u32)(x >> 32);
+ y += (u32)(c >> 32);
z = (u64)a_hi * b_hi + (u32)(y >> 32);
y = (u64)a_hi * b_lo + (u32)y;
z += (u32)(y >> 32);
@@ -215,6 +217,9 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
#endif
+ if (!n_hi)
+ return div64_u64(n_lo, d);
+
int shift = __builtin_ctzll(d);
/* try reducing the fraction in case the dividend becomes <= 64 bits */
@@ -261,5 +266,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
return res;
}
-EXPORT_SYMBOL(mul_u64_u64_div_u64);
+EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
#endif
--
2.39.5
On Sun, 18 May 2025, David Laight wrote:
> The existing mul_u64_u64_div_u64() rounds down, a 'rounding up'
> variant needs 'divisor - 1' adding in between the multiply and
> divide so cannot easily be done by a caller.
>
> Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
> and implement the 'round down' and 'round up' using it.
>
> Update the x86-64 asm to optimise for 'c' being a constant zero.
>
> For architectures that support u128 check for a 64bit product after
> the multiply (will be cheap).
> Leave in the early check for other architectures (mostly 32bit) when
> 'c' is zero to avoid the multi-part multiply.
I agree with this, except for the "'c' is zero" part. More below.
> Note that the cost of the 128bit divide will dwarf the rest of the code.
> This function is very slow on everything except x86-64 (very very slow
> on 32bit).
>
> Add kerndoc definitions for all three functions.
>
> Signed-off-by: David Laight <david.laight.linux@gmail.com>
> ---
> Changes for v2 (formally patch 1/3):
> - Reinstate the early call to div64_u64() on 32bit when 'c' is zero.
> Although I'm not convinced the path is common enough to be worth
> the two ilog2() calls.
>
> arch/x86/include/asm/div64.h | 19 ++++++++++-----
> include/linux/math64.h | 45 +++++++++++++++++++++++++++++++++++-
> lib/math/div64.c | 21 ++++++++++-------
> 3 files changed, 70 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
> index 9931e4c7d73f..7a0a916a2d7d 100644
> --- a/arch/x86/include/asm/div64.h
> +++ b/arch/x86/include/asm/div64.h
> @@ -84,21 +84,28 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> * Will generate an #DE when the result doesn't fit u64, could fix with an
> * __ex_table[] entry when it becomes an issue.
> */
> -static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
> +static inline u64 mul_u64_add_u64_div_u64(u64 a, u64 mul, u64 add, u64 div)
> {
> u64 q;
>
> - asm ("mulq %2; divq %3" : "=a" (q)
> - : "a" (a), "rm" (mul), "rm" (div)
> - : "rdx");
> + if (statically_true(!add)) {
> + asm ("mulq %2; divq %3" : "=a" (q)
> + : "a" (a), "rm" (mul), "rm" (div)
> + : "rdx");
> + } else {
> + asm ("mulq %2; addq %3, %%rax; adcq $0, %%rdx; divq %4"
> + : "=a" (q)
> + : "a" (a), "rm" (mul), "rm" (add), "rm" (div)
> + : "rdx");
> + }
>
> return q;
> }
> -#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
> +#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
>
> static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
> {
> - return mul_u64_u64_div_u64(a, mul, div);
> + return mul_u64_add_u64_div_u64(a, mul, 0, div);
> }
> #define mul_u64_u32_div mul_u64_u32_div
>
> diff --git a/include/linux/math64.h b/include/linux/math64.h
> index 6aaccc1626ab..e1c2e3642cec 100644
> --- a/include/linux/math64.h
> +++ b/include/linux/math64.h
> @@ -282,7 +282,50 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
> }
> #endif /* mul_u64_u32_div */
>
> -u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
> +/**
> + * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
> + * @a: first unsigned 64bit multiplicand
> + * @b: second unsigned 64bit multiplicand
> + * @c: unsigned 64bit addend
> + * @d: unsigned 64bit divisor
> + *
> + * Multiply two 64bit values together to generate a 128bit product
> + * add a third value and then divide by a fourth.
> + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
> + *
> + * Return: (@a * @b + @c) / @d
> + */
> +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
> +
> +/**
> + * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
> + * @a: first unsigned 64bit multiplicand
> + * @b: second unsigned 64bit multiplicand
> + * @d: unsigned 64bit divisor
> + *
> + * Multiply two 64bit values together to generate a 128bit product
> + * and then divide by a third value.
> + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
If the quotient exceeds 64 bits, the optimized x86 version truncates the
value to the low 64 bits. The C version returns a saturated value i.e.
UINT64_MAX (implemented with a -1). Nothing actually traps in that case.
> + *
> + * Return: @a * @b / @d
> + */
> +#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d)
> +
> +/**
> + * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up
> + * @a: first unsigned 64bit multiplicand
> + * @b: second unsigned 64bit multiplicand
> + * @d: unsigned 64bit divisor
> + *
> + * Multiply two 64bit values together to generate a 128bit product
> + * and then divide and round up.
> + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
> + *
> + * Return: (@a * @b + @d - 1) / @d
> + */
> +#define mul_u64_u64_div_u64_roundup(a, b, d) \
> + ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); })
> +
>
> /**
> * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
> diff --git a/lib/math/div64.c b/lib/math/div64.c
> index c426fa0660bc..66bfb6159f02 100644
> --- a/lib/math/div64.c
> +++ b/lib/math/div64.c
> @@ -183,29 +183,31 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
> }
> EXPORT_SYMBOL(iter_div_u64_rem);
>
> -#ifndef mul_u64_u64_div_u64
> -u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
> +#ifndef mul_u64_add_u64_div_u64
> +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
> {
> /* Trigger exception if divisor is zero */
> BUG_ON(!d);
>
> - if (ilog2(a) + ilog2(b) <= 62)
> - return div64_u64(a * b, d);
> -
> #if defined(__SIZEOF_INT128__)
>
> /* native 64x64=128 bits multiplication */
> - u128 prod = (u128)a * b;
> + u128 prod = (u128)a * b + c;
> u64 n_lo = prod, n_hi = prod >> 64;
>
> #else
>
> + if (!c && ilog2(a) + ilog2(b) <= 62)
> + return div64_u64(a * b, d);
> +
Here you should do:
if (ilog2(a) + ilog2(b) <= 62) {
u64 ab = a * b;
u64 abc = ab + c;
if (ab <= abc)
return div64_u64(abc, d);
}
This is cheap and won't unconditionally discard the faster path when c != 0.
> /* perform a 64x64=128 bits multiplication manually */
> u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
> u64 x, y, z;
>
> - x = (u64)a_lo * b_lo;
> + /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
> + x = (u64)a_lo * b_lo + (u32)c;
> y = (u64)a_lo * b_hi + (u32)(x >> 32);
> + y += (u32)(c >> 32);
> z = (u64)a_hi * b_hi + (u32)(y >> 32);
> y = (u64)a_hi * b_lo + (u32)y;
> z += (u32)(y >> 32);
> @@ -215,6 +217,9 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
>
> #endif
>
> + if (!n_hi)
> + return div64_u64(n_lo, d);
> +
> int shift = __builtin_ctzll(d);
>
> /* try reducing the fraction in case the dividend becomes <= 64 bits */
> @@ -261,5 +266,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
>
> return res;
> }
> -EXPORT_SYMBOL(mul_u64_u64_div_u64);
> +EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
> #endif
> --
> 2.39.5
>
>
On Mon, 19 May 2025 23:03:21 -0400 (EDT)
Nicolas Pitre <npitre@baylibre.com> wrote:
> On Sun, 18 May 2025, David Laight wrote:
>
> > The existing mul_u64_u64_div_u64() rounds down, a 'rounding up'
> > variant needs 'divisor - 1' adding in between the multiply and
> > divide so cannot easily be done by a caller.
> >
> > Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
> > and implement the 'round down' and 'round up' using it.
> >
> > Update the x86-64 asm to optimise for 'c' being a constant zero.
> >
> > For architectures that support u128 check for a 64bit product after
> > the multiply (will be cheap).
> > Leave in the early check for other architectures (mostly 32bit) when
> > 'c' is zero to avoid the multi-part multiply.
>
> I agree with this, except for the "'c' is zero" part. More below.
>
> > Note that the cost of the 128bit divide will dwarf the rest of the code.
> > This function is very slow on everything except x86-64 (very very slow
> > on 32bit).
> >
> > Add kerndoc definitions for all three functions.
> >
> > Signed-off-by: David Laight <david.laight.linux@gmail.com>
> > ---
> > Changes for v2 (formally patch 1/3):
> > - Reinstate the early call to div64_u64() on 32bit when 'c' is zero.
> > Although I'm not convinced the path is common enough to be worth
> > the two ilog2() calls.
> >
> > arch/x86/include/asm/div64.h | 19 ++++++++++-----
> > include/linux/math64.h | 45 +++++++++++++++++++++++++++++++++++-
> > lib/math/div64.c | 21 ++++++++++-------
> > 3 files changed, 70 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
> > index 9931e4c7d73f..7a0a916a2d7d 100644
> > --- a/arch/x86/include/asm/div64.h
> > +++ b/arch/x86/include/asm/div64.h
> > @@ -84,21 +84,28 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> > * Will generate an #DE when the result doesn't fit u64, could fix with an
> > * __ex_table[] entry when it becomes an issue.
> > */
> > -static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
> > +static inline u64 mul_u64_add_u64_div_u64(u64 a, u64 mul, u64 add, u64 div)
> > {
> > u64 q;
> >
> > - asm ("mulq %2; divq %3" : "=a" (q)
> > - : "a" (a), "rm" (mul), "rm" (div)
> > - : "rdx");
> > + if (statically_true(!add)) {
> > + asm ("mulq %2; divq %3" : "=a" (q)
> > + : "a" (a), "rm" (mul), "rm" (div)
> > + : "rdx");
> > + } else {
> > + asm ("mulq %2; addq %3, %%rax; adcq $0, %%rdx; divq %4"
> > + : "=a" (q)
> > + : "a" (a), "rm" (mul), "rm" (add), "rm" (div)
> > + : "rdx");
> > + }
> >
> > return q;
> > }
> > -#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
> > +#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
> >
> > static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
> > {
> > - return mul_u64_u64_div_u64(a, mul, div);
> > + return mul_u64_add_u64_div_u64(a, mul, 0, div);
> > }
> > #define mul_u64_u32_div mul_u64_u32_div
> >
> > diff --git a/include/linux/math64.h b/include/linux/math64.h
> > index 6aaccc1626ab..e1c2e3642cec 100644
> > --- a/include/linux/math64.h
> > +++ b/include/linux/math64.h
> > @@ -282,7 +282,50 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
> > }
> > #endif /* mul_u64_u32_div */
> >
> > -u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
> > +/**
> > + * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
> > + * @a: first unsigned 64bit multiplicand
> > + * @b: second unsigned 64bit multiplicand
> > + * @c: unsigned 64bit addend
> > + * @d: unsigned 64bit divisor
> > + *
> > + * Multiply two 64bit values together to generate a 128bit product
> > + * add a third value and then divide by a fourth.
> > + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
> > + *
> > + * Return: (@a * @b + @c) / @d
> > + */
> > +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
> > +
> > +/**
> > + * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
> > + * @a: first unsigned 64bit multiplicand
> > + * @b: second unsigned 64bit multiplicand
> > + * @d: unsigned 64bit divisor
> > + *
> > + * Multiply two 64bit values together to generate a 128bit product
> > + * and then divide by a third value.
> > + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
>
> If the quotient exceeds 64 bits, the optimized x86 version truncates the
> value to the low 64 bits. The C version returns a saturated value i.e.
> UINT64_MAX (implemented with a -1). Nothing actually traps in that case.
Nope. I've only got the iAPX 286 and 80386 reference manuals to hand.
Both say that 'interrupt 0' happens on overflow.
I don't expect the later documentation is any different.
If the kernel code is going to have an explicit instruction to trap
(rather than the code 'just trapping') it really is best to use BUG().
If nothing else it guarantees a trap regardless of the architecture
and compiler.
>
> > + *
> > + * Return: @a * @b / @d
> > + */
> > +#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d)
> > +
> > +/**
> > + * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up
> > + * @a: first unsigned 64bit multiplicand
> > + * @b: second unsigned 64bit multiplicand
> > + * @d: unsigned 64bit divisor
> > + *
> > + * Multiply two 64bit values together to generate a 128bit product
> > + * and then divide and round up.
> > + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
> > + *
> > + * Return: (@a * @b + @d - 1) / @d
> > + */
> > +#define mul_u64_u64_div_u64_roundup(a, b, d) \
> > + ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); })
> > +
> >
> > /**
> > * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up
> > diff --git a/lib/math/div64.c b/lib/math/div64.c
> > index c426fa0660bc..66bfb6159f02 100644
> > --- a/lib/math/div64.c
> > +++ b/lib/math/div64.c
> > @@ -183,29 +183,31 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
> > }
> > EXPORT_SYMBOL(iter_div_u64_rem);
> >
> > -#ifndef mul_u64_u64_div_u64
> > -u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
> > +#ifndef mul_u64_add_u64_div_u64
> > +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
> > {
> > /* Trigger exception if divisor is zero */
> > BUG_ON(!d);
> >
> > - if (ilog2(a) + ilog2(b) <= 62)
> > - return div64_u64(a * b, d);
> > -
> > #if defined(__SIZEOF_INT128__)
> >
> > /* native 64x64=128 bits multiplication */
> > - u128 prod = (u128)a * b;
> > + u128 prod = (u128)a * b + c;
> > u64 n_lo = prod, n_hi = prod >> 64;
> >
> > #else
> >
> > + if (!c && ilog2(a) + ilog2(b) <= 62)
> > + return div64_u64(a * b, d);
> > +
>
> Here you should do:
>
> if (ilog2(a) + ilog2(b) <= 62) {
> u64 ab = a * b;
> u64 abc = ab + c;
> if (ab <= abc)
> return div64_u64(abc, d);
> }
>
> This is cheap and won't unconditionally discard the faster path when c != 0;
That isn't really cheap.
ilog2() is likely to be a similar cost to a multiply
(my brain remembers them both as 'latency 3' on x86).
My actual preference is to completely delete that test and rely
on the post-multiply check.
The 64 by 64 multiply code is actually fairly cheap.
On x86-64 it is only a few clocks slower than the u128 version
(and that is (much) the same code that should be generated for 32bit).
>
> > /* perform a 64x64=128 bits multiplication manually */
> > u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
> > u64 x, y, z;
> >
> > - x = (u64)a_lo * b_lo;
> > + /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
> > + x = (u64)a_lo * b_lo + (u32)c;
> > y = (u64)a_lo * b_hi + (u32)(x >> 32);
> > + y += (u32)(c >> 32);
Those two adds to y should be swapped - I need to do a v3 and will swap them.
It might save one clock - my timing code is accurate, but not THAT accurate.
> > z = (u64)a_hi * b_hi + (u32)(y >> 32);
> > y = (u64)a_hi * b_lo + (u32)y;
> > z += (u32)(y >> 32);
> > @@ -215,6 +217,9 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
If we assume the compiler is sane (gcc isn't), a/b_hi/lo are in registers
and mul has a latency of 3 (and add 1) the code above can execute as:
clock 0: x_hi/x_lo = a_lo * b_lo
clock 1: y_hi/y_lo = a_lo * b_hi
clock 2: y1_hi/y1_lo = a_hi * b_lo
clock 3: z_hi/z_lo = a_hi * b_hi; x_lo += c_lo
clock 4: x_hi += carry; y_lo += c_hi
clock 5: y_hi += carry; y_lo += x_hi
clock 6: y_hi += carry; y1_lo += y_lo
clock 7: y1_hi += carry; z_lo += y_hi
clock 8: z_hi += carry; z_lo += y1_hi
clock 9: z_hi += carry
I don't think any more instructions can run in parallel.
But it really isn't that long at all.
Your 'fast path' test will be nearly that long even ignoring
mis-predicted branches.
For my updated version I've managed to stop gcc spilling zero words
to stack!
David
> >
> > #endif
> >
> > + if (!n_hi)
> > + return div64_u64(n_lo, d);
> > +
> > int shift = __builtin_ctzll(d);
> >
> > /* try reducing the fraction in case the dividend becomes <= 64 bits */
> > @@ -261,5 +266,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 d)
> >
> > return res;
> > }
> > -EXPORT_SYMBOL(mul_u64_u64_div_u64);
> > +EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
> > #endif
> > --
> > 2.39.5
> >
> >
On Tue, 20 May 2025, David Laight wrote:
> On Mon, 19 May 2025 23:03:21 -0400 (EDT)
> Nicolas Pitre <npitre@baylibre.com> wrote:
>
> > On Sun, 18 May 2025, David Laight wrote:
> >
> > > The existing mul_u64_u64_div_u64() rounds down, a 'rounding up'
> > > variant needs 'divisor - 1' adding in between the multiply and
> > > divide so cannot easily be done by a caller.
> > >
> > > Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d
> > > and implement the 'round down' and 'round up' using it.
> > >
> > > Update the x86-64 asm to optimise for 'c' being a constant zero.
> > >
> > > For architectures that support u128 check for a 64bit product after
> > > the multiply (will be cheap).
> > > Leave in the early check for other architectures (mostly 32bit) when
> > > 'c' is zero to avoid the multi-part multiply.
> >
> > I agree with this, except for the "'c' is zero" part. More below.
> >
> > > Note that the cost of the 128bit divide will dwarf the rest of the code.
> > > This function is very slow on everything except x86-64 (very very slow
> > > on 32bit).
> > >
> > > Add kerndoc definitions for all three functions.
> > >
> > > Signed-off-by: David Laight <david.laight.linux@gmail.com>
> > > ---
> > > Changes for v2 (formally patch 1/3):
> > > - Reinstate the early call to div64_u64() on 32bit when 'c' is zero.
> > > Although I'm not convinced the path is common enough to be worth
> > > the two ilog2() calls.
> > >
> > > arch/x86/include/asm/div64.h | 19 ++++++++++-----
> > > include/linux/math64.h | 45 +++++++++++++++++++++++++++++++++++-
> > > lib/math/div64.c | 21 ++++++++++-------
> > > 3 files changed, 70 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
> > > index 9931e4c7d73f..7a0a916a2d7d 100644
> > > --- a/arch/x86/include/asm/div64.h
> > > +++ b/arch/x86/include/asm/div64.h
> > > @@ -84,21 +84,28 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
> > > * Will generate an #DE when the result doesn't fit u64, could fix with an
> > > * __ex_table[] entry when it becomes an issue.
> > > */
> > > -static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
> > > +static inline u64 mul_u64_add_u64_div_u64(u64 a, u64 mul, u64 add, u64 div)
> > > {
> > > u64 q;
> > >
> > > - asm ("mulq %2; divq %3" : "=a" (q)
> > > - : "a" (a), "rm" (mul), "rm" (div)
> > > - : "rdx");
> > > + if (statically_true(!add)) {
> > > + asm ("mulq %2; divq %3" : "=a" (q)
> > > + : "a" (a), "rm" (mul), "rm" (div)
> > > + : "rdx");
> > > + } else {
> > > + asm ("mulq %2; addq %3, %%rax; adcq $0, %%rdx; divq %4"
> > > + : "=a" (q)
> > > + : "a" (a), "rm" (mul), "rm" (add), "rm" (div)
> > > + : "rdx");
> > > + }
> > >
> > > return q;
> > > }
> > > -#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
> > > +#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
> > >
> > > static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
> > > {
> > > - return mul_u64_u64_div_u64(a, mul, div);
> > > + return mul_u64_add_u64_div_u64(a, mul, 0, div);
> > > }
> > > #define mul_u64_u32_div mul_u64_u32_div
> > >
> > > diff --git a/include/linux/math64.h b/include/linux/math64.h
> > > index 6aaccc1626ab..e1c2e3642cec 100644
> > > --- a/include/linux/math64.h
> > > +++ b/include/linux/math64.h
> > > @@ -282,7 +282,50 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
> > > }
> > > #endif /* mul_u64_u32_div */
> > >
> > > -u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
> > > +/**
> > > + * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide
> > > + * @a: first unsigned 64bit multiplicand
> > > + * @b: second unsigned 64bit multiplicand
> > > + * @c: unsigned 64bit addend
> > > + * @d: unsigned 64bit divisor
> > > + *
> > > + * Multiply two 64bit values together to generate a 128bit product
> > > + * add a third value and then divide by a fourth.
> > > + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
> > > + *
> > > + * Return: (@a * @b + @c) / @d
> > > + */
> > > +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
> > > +
> > > +/**
> > > + * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide
> > > + * @a: first unsigned 64bit multiplicand
> > > + * @b: second unsigned 64bit multiplicand
> > > + * @d: unsigned 64bit divisor
> > > + *
> > > + * Multiply two 64bit values together to generate a 128bit product
> > > + * and then divide by a third value.
> > > + * May BUG()/trap if @d is zero or the quotient exceeds 64 bits.
> >
> > If the quotient exceeds 64 bits, the optimized x86 version truncates the
> > value to the low 64 bits. The C version returns a saturated value i.e.
> > UINT64_MAX (implemented with a -1). Nothing actually traps in that case.
>
> Nope. I've only got the iAPX 286 and 80386 reference manuals to hand.
> Both say that 'interrupt 0' happens on overflow.
> I don't expect the later documentation is any different.
Hmmm... OK, you're right. I must have botched my test code initially.
> If the kernel code is going to have an explicit instruction to trap
> (rather then the code 'just trapping') it really is best to use BUG().
> If nothing else it guarantees a trap regardless of the architecture
> and compiler.
OK in the overflow case.
However in the div-by-0 case it is best if, for a given architecture,
the behavior is coherent across all division operations.
> > > + if (!c && ilog2(a) + ilog2(b) <= 62)
> > > + return div64_u64(a * b, d);
> > > +
> >
> > Here you should do:
> >
> > if (ilog2(a) + ilog2(b) <= 62) {
> > u64 ab = a * b;
> > u64 abc = ab + c;
> > if (ab <= abc)
> > return div64_u64(abc, d);
> > }
> >
> > This is cheap and won't unconditionally discard the faster path when c != 0;
>
> That isn't really cheap.
> ilog2() is likely to be a similar cost to a multiply
> (my brain remembers them both as 'latency 3' on x86).
I'm not discussing the ilog2() usage though. I'm just against limiting
the test to !c. My suggestion is about supporting all values of c.
> My actual preference is to completely delete that test and rely
> on the post-multiply check.
>
> The 64 by 64 multiply code is actually fairly cheap.
> On x86-64 it is only a few clocks slower than the u128 version
> (and that is (much) the same code that should be generated for 32bit).
Of course x86-64 is not the primary target here as it has its own
optimized version.
Nicolas
On Tue, 20 May 2025 18:24:58 -0400 (EDT)
Nicolas Pitre <npitre@baylibre.com> wrote:
> On Tue, 20 May 2025, David Laight wrote:
>
> > On Mon, 19 May 2025 23:03:21 -0400 (EDT)
> > Nicolas Pitre <npitre@baylibre.com> wrote:
> >
...
> > > Here you should do:
> > >
> > > if (ilog2(a) + ilog2(b) <= 62) {
> > > u64 ab = a * b;
> > > u64 abc = ab + c;
> > > if (ab <= abc)
> > > return div64_u64(abc, d);
> > > }
> > >
> > > This is cheap and won't unconditionally discard the faster path when c != 0;
> >
> > That isn't really cheap.
> > ilog2() is likely to be a similar cost to a multiply
> > (my brain remembers them both as 'latency 3' on x86).
>
> I'm not discussing the ilog2() usage though. I'm just against limiting
> the test to !c. My suggestion is about supporting all values of c.
I've had further thoughts on that test.
Most (but not all - and I've forgotten which) 64bit cpus have a 64x64=>128
multiply and support u128.
So the 'multiply in parts' code is mostly for 32bit.
That means that the 'a * b' for the call to div64_u64() has to be three
32x32=>64 multiplies with all the extra 'add' and 'adc $0' to generate
a correct 64bit result.
This is (well should be) much the same as the multiply coded in the
function - except it is generated by the compiler itself.
The only parts it can ignore are those that set 'z' and 'y_hi'.
If my clock sequence (in the other email) is correct it saves 3 of 10
clocks - so test to avoid the multiply has to be better than that.
That probably means the only worthwhile check is for a and b being 32bit
so a single multiply can be used.
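Something along these lines (an untested sketch, not what is in this
patch) is what I have in mind:

	/* Sketch: only take a fast path when both inputs fit in 32 bits */
	if (!(a >> 32 | b >> 32)) {
		u64 ab = a * b;		/* single 32x32=>64 multiply, cannot overflow */
		u64 abc = ab + c;

		if (abc >= ab)		/* adding 'c' didn't carry out of 64 bits */
			return div64_u64(abc, d);
	}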
The generated code for 32bit x86 isn't as good as one might hope,
partially due to only having 7 (6 if %bp is a stack frame) registers.
clang makes a reasonable job of it, gcc doesn't.
There is already a mul_u32_u32() wrapper in arch/x86/include/asm/div64.h.
There needs to be a similar add_u64_u32() (contains add %s,%d_lo, adc $0,%d_hi).
Without them gcc spills a lot of values to stack - including constant zeros.
I might add those and use them in v3 (which I need to send anyway).
They'll match what my 'pending' faster code does.
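Roughly this shape (sketch only, the names and constraint letters are
still to be pinned down):

	static inline u64 add_u64_u32(u64 a, u32 b)
	{
		u32 lo = a, hi = a >> 32;

		/* add a 32bit value to a 64bit one without gcc
		 * materialising a zero for the high word */
		asm ("addl %[b], %[lo]; adcl $0, %[hi]"
		     : [lo] "+r" (lo), [hi] "+r" (hi)
		     : [b] "rm" (b));
		return (u64)hi << 32 | lo;
	}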
David
On Wed, 21 May 2025, David Laight wrote:
> On Tue, 20 May 2025 18:24:58 -0400 (EDT)
> Nicolas Pitre <npitre@baylibre.com> wrote:
>
> > On Tue, 20 May 2025, David Laight wrote:
> >
> > > On Mon, 19 May 2025 23:03:21 -0400 (EDT)
> > > Nicolas Pitre <npitre@baylibre.com> wrote:
> > >
> ...
> > > > Here you should do:
> > > >
> > > > if (ilog2(a) + ilog2(b) <= 62) {
> > > > u64 ab = a * b;
> > > > u64 abc = ab + c;
> > > > if (ab <= abc)
> > > > return div64_u64(abc, d);
> > > > }
> > > >
> > > > This is cheap and won't unconditionally discard the faster path when c != 0;
> > >
> > > That isn't really cheap.
> > > ilog2() is likely to be a similar cost to a multiply
> > > (my brain remembers them both as 'latency 3' on x86).
> >
> > I'm not discussing the ilog2() usage though. I'm just against limiting
> > the test to !c. My suggestion is about supporting all values of c.
>
> I've had further thoughts on that test.
> Most (but not all - and I've forgotten which) 64bit cpu have a 64x64=>128
> multiple and support u128.
Looks like X86-64, ARM64 and RV64 have it. That's probably 99% of the market.
> So the 'multiply in parts' code is mostly for 32bit.
Exact.
> That means that the 'a * b' for the call to div64_u64() has to be three
> 32x32=>64 multiplies with all the extra 'add' and 'adc $0' to generate
> a correct 64bit result.
4 multiplies to be precise.
> This is (well should be) much the same as the multiply coded in the
> function - except it is generated by the compiler itself.
I don't follow you here. What is the same as what?
> The only parts it can ignore are the those that set 'z' and 'y_hi'.
> If my clock sequence (in the other email) is correct it saves 3 of 10
> clocks - so test to avoid the multiply has to be better than that.
> That probably means the only worthwhile check is for a and b being 32bit
> so a single multiply can be used.
Depends how costly the ilog2 is. On ARM the clz instruction is about 1
cycle. If you need to figure out the MSB manually then it might be best
to skip those ilog2's.
> The generated code for 32bit x86 isn't as good as one might hope.
> partially due to only having 7 (6 if %bp is a stack frame) registers.
> clang makes a reasonable job of it, gcc doesn't.
> There is already a mul_u32_u32() wrapper in arch/x86/include/asm/div64.h.
> There needs to be a similar add_u64_u32() (contains add %s,%d_lo, adc $0,%d_hi).
> Without them gcc spills a lot of values to stack - including constant zeros.
I mainly looked at ARM32 and both gcc and clang do a good job here. ARM
registers are plentiful of course.
> I might add those and use them in v3 (which I need to send anyway).
> They'll match what my 'pending' faster code does.
>
> David
>
>
On Wed, 21 May 2025 09:50:28 -0400 (EDT)
Nicolas Pitre <npitre@baylibre.com> wrote:
> On Wed, 21 May 2025, David Laight wrote:
>...
>
> Depends how costly the ilog2 is. On ARM the clz instruction is about 1
> cycle. If you need to figure out the MSB manually then it might be best
> to skip those ilog2's.
I was going to measure it.
But I've pulled chunks from the kernel headers into a userspace build.
This is the x86-32 code for the 'if (ilog2(a) + ilog2(b) < 62)' test:
1b2b: 0f bd c6 bsr %esi,%eax
1b2e: 75 05 jne 1b35 <mul_u64_add_u64_div_u64_new+0x75>
1b30: b8 ff ff ff ff mov $0xffffffff,%eax
1b35: 85 c9 test %ecx,%ecx
1b37: 74 0d je 1b46 <mul_u64_add_u64_div_u64_new+0x86>
1b39: 0f bd c1 bsr %ecx,%eax
1b3c: 75 05 jne 1b43 <mul_u64_add_u64_div_u64_new+0x83>
1b3e: b8 ff ff ff ff mov $0xffffffff,%eax
1b43: 83 c0 20 add $0x20,%eax
1b46: 0f bd d5 bsr %ebp,%edx
1b49: 75 05 jne 1b50 <mul_u64_add_u64_div_u64_new+0x90>
1b4b: ba ff ff ff ff mov $0xffffffff,%edx
1b50: 85 ff test %edi,%edi
1b52: 74 0d je 1b61 <mul_u64_add_u64_div_u64_new+0xa1>
1b54: 0f bd d7 bsr %edi,%edx
1b57: 75 05 jne 1b5e <mul_u64_add_u64_div_u64_new+0x9e>
1b59: ba ff ff ff ff mov $0xffffffff,%edx
1b5e: 83 c2 20 add $0x20,%edx
1b61: 8d 1c 02 lea (%edx,%eax,1),%ebx
1b64: 83 fb 3e cmp $0x3e,%ebx
1b67: 0f 8e 0b 03 00 00 jle 1e78 <mul_u64_add_u64_div_u64_new+0x3b8>
If 'cmov' is enabled (not by default even after the current plan to remove 486 support) it is:
1b2b: ba ff ff ff ff mov $0xffffffff,%edx
1b30: 85 c9 test %ecx,%ecx
1b32: 0f 85 98 03 00 00 jne 1ed0 <mul_u64_add_u64_div_u64_new+0x410>
1b38: 0f bd c6 bsr %esi,%eax
1b3b: 0f 44 c2 cmove %edx,%eax
1b3e: bb ff ff ff ff mov $0xffffffff,%ebx
1b43: 85 ff test %edi,%edi
1b45: 0f 85 75 03 00 00 jne 1ec0 <mul_u64_add_u64_div_u64_new+0x400>
1b4b: 0f bd d5 bsr %ebp,%edx
1b4e: 0f 44 d3 cmove %ebx,%edx
1b51: 8d 1c 02 lea (%edx,%eax,1),%ebx
1b54: 83 fb 3e cmp $0x3e,%ebx
1b57: 0f 8e 0b 03 00 00 jle 1e68 <mul_u64_add_u64_div_u64_new+0x3a8>
with:
1ec0: 0f bd d7 bsr %edi,%edx
1ec3: 0f 44 d3 cmove %ebx,%edx
1ec6: 83 c2 20 add $0x20,%edx
1ec9: e9 83 fc ff ff jmp 1b51 <mul_u64_add_u64_div_u64_new+0x91>
1ed0: 0f bd c1 bsr %ecx,%eax
1ed3: 0f 44 c2 cmove %edx,%eax
1ed6: 83 c0 20 add $0x20,%eax
1ed9: e9 60 fc ff ff jmp 1b3e <mul_u64_add_u64_div_u64_new+0x7e>
Neither is pretty...
Some of the 'damage' is because the x86 'bsr' (bit scan reverse) sets 'Z' for a zero input
and leaves the output undefined (unchanged on later cpus).
For reference I can get the multiply down to:
1b5d: 89 f0 mov %esi,%eax
1b5f: f7 e5 mul %ebp
1b61: 03 44 24 38 add 0x38(%esp),%eax
1b65: 83 d2 00 adc $0x0,%edx
1b68: 89 d3 mov %edx,%ebx
1b6a: 89 44 24 08 mov %eax,0x8(%esp)
1b6e: 89 f0 mov %esi,%eax
1b70: f7 e7 mul %edi
1b72: 03 44 24 3c add 0x3c(%esp),%eax
1b76: 83 d2 00 adc $0x0,%edx
1b79: 01 d8 add %ebx,%eax
1b7b: 83 d2 00 adc $0x0,%edx
1b7e: 89 d6 mov %edx,%esi
1b80: 89 c3 mov %eax,%ebx
1b82: 89 c8 mov %ecx,%eax
1b84: f7 e7 mul %edi
1b86: 89 c7 mov %eax,%edi
1b88: 89 c8 mov %ecx,%eax
1b8a: 01 f7 add %esi,%edi
1b8c: 83 d2 00 adc $0x0,%edx
1b8f: 89 d6 mov %edx,%esi
1b91: f7 e5 mul %ebp
1b93: 89 c1 mov %eax,%ecx
1b95: 8b 44 24 08 mov 0x8(%esp),%eax
1b99: 89 d5 mov %edx,%ebp
1b9b: 01 d9 add %ebx,%ecx
1b9d: 83 d5 00 adc $0x0,%ebp
1ba0: 89 44 24 28 mov %eax,0x28(%esp)
1ba4: 01 ef add %ebp,%edi
1ba6: 83 d6 00 adc $0x0,%esi
1ba9: 89 74 24 1c mov %esi,0x1c(%esp)
1bad: 8b 5c 24 1c mov 0x1c(%esp),%ebx
1bb1: 89 7c 24 18 mov %edi,0x18(%esp)
1bb5: 8b 44 24 18 mov 0x18(%esp),%eax
1bb9: 89 4c 24 2c mov %ecx,0x2c(%esp)
1bbd: 09 c3 or %eax,%ebx
1bbf: 0f 84 1b 03 00 00 je 1ee0 <mul_u64_add_u64_div_u64_new+0x420>
If you follow the register dependency chain it won't be as long as it looks.
(Although the last few instructions are terrible! - I've tried a few things
and they won't go away.)
David