Change the #if in div64.c so that test_mul_u64_u64_div_u64.c
can compile and test the generic version (including the 'long multiply')
on architectures (eg amd64) that define their own copy.
Test the kernel version and the locally compiled version on all arch.
Output the time taken (in ns) on the 'test completed' trace.
For reference, on my zen 5, the optimised version takes ~220ns and the
generic version ~3350ns.
Using the native multiply saves ~200ns and adding back the ilog2() 'optimisation'
test adds ~50ns.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
New patch for v3, replacing changes in v1 that were removed for v2.
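
A note on the mechanism, since the macro dance in the test file looks odd at
first sight: div64.c guards each fallback with a preprocessor check, so the
test module pre-defines the names it does not want (each macro defined to
itself, which makes the corresponding guard skip that definition) and
redirects the one it does want to a test_ prefixed name before re-including
the file. A minimal sketch of the pattern with made-up names (my_helper,
my_op, test_my_op and generic_impl.c are illustrative stand-ins, not the real
identifiers):

/* generic_impl.c -- stands in for lib/math/div64.c */
#ifndef my_helper			/* plain guard: skipped once the name is a macro */
unsigned long long my_helper(unsigned long long x)
{
	return x + 1;
}
#endif

#if !defined(my_op) || defined(test_my_op)
unsigned long long my_op(unsigned long long a, unsigned long long b)
{
	return a / b;			/* generic fallback body */
}
#endif

/* test.c -- stands in for test_mul_u64_u64_div_u64.c */
#define my_helper my_helper		/* defined to itself: no second copy emitted */
#undef my_op				/* drop any arch override macro */
#define my_op test_my_op		/* the generic body compiles as test_my_op() */
#define test_my_op test_my_op		/* makes the guard's second clause take the branch */
#include "generic_impl.c"

In the real files, div64_u64/div64_s64/iter_div_u64_rem play the role of
my_helper, and mul_u64_add_u64_div_u64/test_mul_u64_add_u64_div_u64 play the
role of my_op/test_my_op.
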
lib/math/div64.c | 8 +++--
lib/math/test_mul_u64_u64_div_u64.c | 48 ++++++++++++++++++++++++-----
2 files changed, 47 insertions(+), 9 deletions(-)
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 7850cc0a7596..22433e5565c4 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -178,13 +178,15 @@ EXPORT_SYMBOL(div64_s64);
* Iterative div/mod for use when dividend is not expected to be much
* bigger than divisor.
*/
+#ifndef iter_div_u64_rem
u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
return __iter_div_u64_rem(dividend, divisor, remainder);
}
EXPORT_SYMBOL(iter_div_u64_rem);
+#endif
-#ifndef mul_u64_add_u64_div_u64
+#if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64)
u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
if (WARN_ONCE(!d, "%s: division of (%#llx * %#llx + %#llx) by zero, returning 0",
@@ -196,7 +198,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
return 0;
}
-#if defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
/* native 64x64=128 bits multiplication */
u128 prod = (u128)a * b + c;
@@ -270,5 +272,7 @@ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
return res;
}
+#if !defined(test_mul_u64_add_u64_div_u64)
EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
#endif
+#endif
diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c
index ea5b703cccff..f0134f25cb0d 100644
--- a/lib/math/test_mul_u64_u64_div_u64.c
+++ b/lib/math/test_mul_u64_u64_div_u64.c
@@ -73,21 +73,34 @@ done
*/
-static int __init test_init(void)
+static u64 test_mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d);
+
+static int __init test_run(unsigned int fn_no, const char *fn_name)
{
+ u64 start_time;
int errors = 0;
int tests = 0;
int i;
- pr_info("Starting mul_u64_u64_div_u64() test\n");
+ start_time = ktime_get_ns();
for (i = 0; i < ARRAY_SIZE(test_values); i++) {
u64 a = test_values[i].a;
u64 b = test_values[i].b;
u64 d = test_values[i].d;
u64 expected_result = test_values[i].result;
- u64 result = mul_u64_u64_div_u64(a, b, d);
- u64 result_up = mul_u64_u64_div_u64_roundup(a, b, d);
+ u64 result, result_up;
+
+ switch (fn_no) {
+ default:
+ result = mul_u64_u64_div_u64(a, b, d);
+ result_up = mul_u64_u64_div_u64_roundup(a, b, d);
+ break;
+ case 1:
+ result = test_mul_u64_add_u64_div_u64(a, b, 0, d);
+ result_up = test_mul_u64_add_u64_div_u64(a, b, d - 1, d);
+ break;
+ }
tests += 2;
@@ -106,15 +119,36 @@ static int __init test_init(void)
}
}
- pr_info("Completed mul_u64_u64_div_u64() test, %d tests, %d errors\n",
- tests, errors);
- return errors ? -EINVAL : 0;
+ pr_info("Completed %s() test, %d tests, %d errors, %llu ns\n",
+ fn_name, tests, errors, ktime_get_ns() - start_time);
+ return errors;
+}
+
+static int __init test_init(void)
+{
+ pr_info("Starting mul_u64_u64_div_u64() test\n");
+ if (test_run(0, "mul_u64_u64_div_u64"))
+ return -EINVAL;
+ if (test_run(1, "test_mul_u64_u64_div_u64"))
+ return -EINVAL;
+ return 0;
}
static void __exit test_exit(void)
{
}
+/* Compile the generic mul_u64_add_u64_div_u64() code */
+#define div64_u64 div64_u64
+#define div64_s64 div64_s64
+#define iter_div_u64_rem iter_div_u64_rem
+
+#undef mul_u64_add_u64_div_u64
+#define mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64
+#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64
+
+#include "div64.c"
+
module_init(test_init);
module_exit(test_exit);
--
2.39.5
On Sat, 14 Jun 2025, David Laight wrote:

> Change the #if in div64.c so that test_mul_u64_u64_div_u64.c
> can compile and test the generic version (including the 'long multiply')
> on architectures (eg amd64) that define their own copy.

Please also include some explanation for iter_div_u64_rem.

> Test the kernel version and the locally compiled version on all arch.
> Output the time taken (in ns) on the 'test completed' trace.
>
> For reference, on my zen 5, the optimised version takes ~220ns and the
> generic version ~3350ns.
> Using the native multiply saves ~200ns and adding back the ilog2() 'optimisation'
> test adds ~50ns.
>
> Signed-off-by: David Laight <david.laight.linux@gmail.com>

Reviewed-by: Nicolas Pitre <npitre@baylibre.com>

> [...]
On Sat, 14 Jun 2025, Nicolas Pitre wrote:

> On Sat, 14 Jun 2025, David Laight wrote:
>
> > Change the #if in div64.c so that test_mul_u64_u64_div_u64.c
> > can compile and test the generic version (including the 'long multiply')
> > on architectures (eg amd64) that define their own copy.
> [...]
>
> Reviewed-by: Nicolas Pitre <npitre@baylibre.com>

In fact this doesn't compile on ARM32. The following is needed to fix that:

commit 271a7224634699721b6383ba28f37b23f901319e
Author: Nicolas Pitre <nico@fluxnic.net>
Date:   Tue Jun 17 17:14:05 2025 -0400

    fixup! lib: test_mul_u64_u64_div_u64: Test both generic and arch versions

diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c
index 88316e68512c..44df9aa39406 100644
--- a/lib/math/test_mul_u64_u64_div_u64.c
+++ b/lib/math/test_mul_u64_u64_div_u64.c
@@ -153,7 +153,10 @@ static void __exit test_exit(void)
 }
 
 /* Compile the generic mul_u64_add_u64_div_u64() code */
+#define __div64_32 __div64_32
+#define div_s64_rem div_s64_rem
 #define div64_u64 div64_u64
+#define div64_u64_rem div64_u64_rem
 #define div64_s64 div64_s64
 #define iter_div_u64_rem iter_div_u64_rem
 
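
For reference, with this fixup folded in, the pre-define block in the test
file ahead of the #include "div64.c" would read as below. The three names
added here only matter on BITS_PER_LONG == 32 builds, where div64.c also
compiles those fallbacks and they would otherwise be emitted a second time:

/* Compile the generic mul_u64_add_u64_div_u64() code */
#define __div64_32 __div64_32			/* added by this fixup */
#define div_s64_rem div_s64_rem			/* added by this fixup */
#define div64_u64 div64_u64
#define div64_u64_rem div64_u64_rem		/* added by this fixup */
#define div64_s64 div64_s64
#define iter_div_u64_rem iter_div_u64_rem

#undef mul_u64_add_u64_div_u64
#define mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64
#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64

#include "div64.c"
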