drivers/char/random.c | 22 ++++++++++++++++++++++ include/linux/random.h | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+)
bcachefs needs this, for sampling devices to read from based on squared
device latencies.
This uses the same algorithm as get_random_u32_below: since the multiply
uses the top and bottom halves separately, it works out fairly well.
Cc: "Theodore Ts'o" <tytso@mit.edu> (maintainer:RANDOM NUMBER DRIVER)
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com> (maintainer:RANDOM NUMBER DRIVER)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
drivers/char/random.c | 22 ++++++++++++++++++++++
include/linux/random.h | 22 ++++++++++++++++++++++
2 files changed, 44 insertions(+)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2581186fa61b..84808300044c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -588,6 +588,28 @@ u32 __get_random_u32_below(u32 ceil)
}
EXPORT_SYMBOL(__get_random_u32_below);
+u64 __get_random_u64_below(u64 ceil)
+{
+ if (unlikely(!ceil))
+ return get_random_u64();
+ if (ceil <= U32_MAX)
+ return __get_random_u32_below(ceil);
+
+ u64 rand = get_random_u64();
+ u64 mult = ceil * rand;
+
+ if (unlikely(mult < ceil)) {
+ u64 bound = -ceil % ceil;
+ while (unlikely(mult < bound)) {
+ rand = get_random_u64();
+ mult = ceil * rand;
+ }
+ }
+
+ return mul_u64_u64_shr(ceil, rand, 64);
+}
+EXPORT_SYMBOL(__get_random_u64_below);
+
#ifdef CONFIG_SMP
/*
* This function is called when the CPU is coming up, with entry
diff --git a/include/linux/random.h b/include/linux/random.h
index 333cecfca93f..b025bf3d8f27 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -6,6 +6,7 @@
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/math64.h>
#include <uapi/linux/random.h>
@@ -90,6 +91,27 @@ static inline u32 get_random_u32_below(u32 ceil)
}
}
+u64 __get_random_u64_below(u64 ceil);
+
+static inline u64 get_random_u64_below(u64 ceil)
+{
+ if (!__builtin_constant_p(ceil))
+ return __get_random_u64_below(ceil);
+
+ BUILD_BUG_ON_MSG(!ceil, "get_random_u64_below() must take ceil > 0");
+ if (ceil <= 1)
+ return 0;
+ if (ceil <= U32_MAX)
+ return get_random_u32_below(ceil);
+
+ for (;;) {
+ u64 rand = get_random_u64();
+ u64 mult = ceil * rand;
+ if (likely(mult >= -ceil % ceil))
+ return mul_u64_u64_shr(ceil, rand, 64);
+ }
+}
+
/*
* Returns a random integer in the interval (floor, U32_MAX], with uniform
* distribution, suitable for all uses. Fastest when floor is a constant, but
--
2.47.2
On Thu, 13 Mar 2025 12:38:10 -0400
Kent Overstreet <kent.overstreet@linux.dev> wrote:
> bcachefs needs this, for sampling devices to read from based on squared
> device latencies.
>
> this uses the same algorithm as get_random_u32_below: since the multiply
> uses the top and bottom halves separately, it works out fairly well.
Adding two separate copies of much the same code is silly.
Given what the code is doing, does it ever make any sense to inline it.
Inlining the original get_random_u32_below(ceil) that did
(random_u32() * ((1ull << 32) / ceil)) >> 32
(for constant ceil) made sense.
While good enough for most purposes it was replaced by the much more
expensive function that guarantees that all the output values are
equally likely - rather than just evenly distributed.
David
>
> Cc: "Theodore Ts'o" <tytso@mit.edu> (maintainer:RANDOM NUMBER DRIVER)
> Cc: "Jason A. Donenfeld" <Jason@zx2c4.com> (maintainer:RANDOM NUMBER DRIVER)
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
> drivers/char/random.c | 22 ++++++++++++++++++++++
> include/linux/random.h | 22 ++++++++++++++++++++++
> 2 files changed, 44 insertions(+)
>
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index 2581186fa61b..84808300044c 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -588,6 +588,28 @@ u32 __get_random_u32_below(u32 ceil)
> }
> EXPORT_SYMBOL(__get_random_u32_below);
>
> +u64 __get_random_u64_below(u64 ceil)
> +{
> + if (unlikely(!ceil))
> + return get_random_u64();
> + if (ceil <= U32_MAX)
> + return __get_random_u32_below(ceil);
> +
> + u64 rand = get_random_u64();
> + u64 mult = ceil * rand;
> +
> + if (unlikely(mult < ceil)) {
> + u64 bound = -ceil % ceil;
> + while (unlikely(mult < bound)) {
> + rand = get_random_u64();
> + mult = ceil * rand;
> + }
> + }
> +
> + return mul_u64_u64_shr(ceil, rand, 64);
> +}
> +EXPORT_SYMBOL(__get_random_u64_below);
> +
> #ifdef CONFIG_SMP
> /*
> * This function is called when the CPU is coming up, with entry
> diff --git a/include/linux/random.h b/include/linux/random.h
> index 333cecfca93f..b025bf3d8f27 100644
> --- a/include/linux/random.h
> +++ b/include/linux/random.h
> @@ -6,6 +6,7 @@
> #include <linux/bug.h>
> #include <linux/kernel.h>
> #include <linux/list.h>
> +#include <linux/math64.h>
>
> #include <uapi/linux/random.h>
>
> @@ -90,6 +91,27 @@ static inline u32 get_random_u32_below(u32 ceil)
> }
> }
>
> +u64 __get_random_u64_below(u64 ceil);
> +
> +static inline u64 get_random_u64_below(u32 ceil)
> +{
> + if (!__builtin_constant_p(ceil))
> + return __get_random_u64_below(ceil);
> +
> + BUILD_BUG_ON_MSG(!ceil, "get_random_u64_below() must take ceil > 0");
> + if (ceil <= 1)
> + return 0;
> + if (ceil <= U32_MAX)
> + return get_random_u32_below(ceil);
> +
> + for (;;) {
> + u64 rand = get_random_u64();
> + u64 mult = ceil * rand;
> + if (likely(mult >= -ceil % ceil))
> + return mul_u64_u64_shr(ceil, rand, 64);
> + }
> +}
> +
> /*
> * Returns a random integer in the interval (floor, U32_MAX], with uniform
> * distribution, suitable for all uses. Fastest when floor is a constant, but
On Sat, Mar 15, 2025 at 01:52:34PM +0000, David Laight wrote: > On Thu, 13 Mar 2025 12:38:10 -0400 > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > bcachefs needs this, for sampling devices to read from based on squared > > device latencies. > > > > this uses the same algorithm as get_random_u32_below: since the multiply > > uses the top and bottom halves separately, it works out fairly well. > > Adding two separate copies of much the same code is silly. > Given what the code is doing, does it ever make any sense to inline it. > > Inlining the original get_random_u32_below(ceil) that did > (random_u32() * ((1ull << 32) / ceil) >> 32 > (for constant ceil) made sense. > While good enough for most purposes it was replaced by the much more > expensive function that guarantees that all the output values are > equally likely - rather than just evenly distributed. Expensive!? It adds a multiply. That % gets constant folded, in the inlined case, and in the non-inline case it's hit only a small fraction of the, time, for typical ceil.
On Sat, 15 Mar 2025 14:20:46 -0400 Kent Overstreet <kent.overstreet@linux.dev> wrote: > On Sat, Mar 15, 2025 at 01:52:34PM +0000, David Laight wrote: > > On Thu, 13 Mar 2025 12:38:10 -0400 > > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > > > bcachefs needs this, for sampling devices to read from based on squared > > > device latencies. > > > > > > this uses the same algorithm as get_random_u32_below: since the multiply > > > uses the top and bottom halves separately, it works out fairly well. > > > > Adding two separate copies of much the same code is silly. > > Given what the code is doing, does it ever make any sense to inline it. > > > > Inlining the original get_random_u32_below(ceil) that did > > (random_u32() * ((1ull << 32) / ceil) >> 32 > > (for constant ceil) made sense. > > While good enough for most purposes it was replaced by the much more > > expensive function that guarantees that all the output values are > > equally likely - rather than just evenly distributed. > > Expensive!? It adds a multiply. I make it two multiplies and a loop. Have you looked at what happens on 32bit systems? > > That % gets constant folded, in the inlined case, and in the non-inline > case it's hit only a small fraction of the, time, for typical ceil. If the % is only a small fraction on the cost for the non-inline case (and it is over 100 clocks on many cpu that people still use) then why inline anything? A quick look shows divide being 'only moderately slow' on zen3 and coffee lake. What you might want to do is pass -ceil % ceil through to a real function (especially if constant). Oh I guess you haven't actually tested the version you submitted. Time to play 'spot the silly error'. David
On Sat, Mar 15, 2025 at 08:55:32PM +0000, David Laight wrote: > On Sat, 15 Mar 2025 14:20:46 -0400 > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > On Sat, Mar 15, 2025 at 01:52:34PM +0000, David Laight wrote: > > > On Thu, 13 Mar 2025 12:38:10 -0400 > > > Kent Overstreet <kent.overstreet@linux.dev> wrote: > > > > > > > bcachefs needs this, for sampling devices to read from based on squared > > > > device latencies. > > > > > > > > this uses the same algorithm as get_random_u32_below: since the multiply > > > > uses the top and bottom halves separately, it works out fairly well. > > > > > > Adding two separate copies of much the same code is silly. > > > Given what the code is doing, does it ever make any sense to inline it. > > > > > > Inlining the original get_random_u32_below(ceil) that did > > > (random_u32() * ((1ull << 32) / ceil) >> 32 > > > (for constant ceil) made sense. > > > While good enough for most purposes it was replaced by the much more > > > expensive function that guarantees that all the output values are > > > equally likely - rather than just evenly distributed. > > > > Expensive!? It adds a multiply. > > I make it two multiplies and a loop. > Have you looked at what happens on 32bit systems? Not many people run 32 bit anymore. Have you looked at get_random_bytes()? > > That % gets constant folded, in the inlined case, and in the non-inline > > case it's hit only a small fraction of the, time, for typical ceil. > > If the % is only a small fraction on the cost for the non-inline case > (and it is over 100 clocks on many cpu that people still use) then > why inline anything? > A quick look shows divide being 'only moderately slow' on zen3 and coffee lake. > > What you might want to do is pass -ceil % ceil through to a real function > (especially if constant). David, the way get_random_u32_below is optimized now is just fine. 
I'm not going to completely redo the inlining just for this, nor am I going to do get_random_u64_below() differently from get_random_u32_below(). > Oh I guess you haven't actually tested the version you submitted. > Time to play 'spot the silly error'. Please do share what you think you've found.
On Sat, 15 Mar 2025 17:32:38 -0400 Kent Overstreet <kent.overstreet@linux.dev> wrote: > On Sat, Mar 15, 2025 at 08:55:32PM +0000, David Laight wrote: > ... > > Oh I guess you haven't actually tested the version you submitted. > > Time to play 'spot the silly error'. > > Please do share what you think you've found. +static inline u64 get_random_u64_below(u32 ceil) David
On Sat, 15 Mar 2025 13:52:34 +0000
David Laight <david.laight.linux@gmail.com> wrote:
> On Thu, 13 Mar 2025 12:38:10 -0400
> Kent Overstreet <kent.overstreet@linux.dev> wrote:
>
> > bcachefs needs this, for sampling devices to read from based on squared
> > device latencies.
> >
> > this uses the same algorithm as get_random_u32_below: since the multiply
> > uses the top and bottom halves separately, it works out fairly well.
>
> Adding two separate copies of much the same code is silly.
> Given what the code is doing, does it ever make any sense to inline it.
>
> Inlining the original get_random_u32_below(ceil) that did
> (random_u32() * ((1ull << 32) / ceil) >> 32
Brain fade, it is just (random_u32() * (u64)ceil) >> 32
> (for constant ceil) made sense.
> While good enough for most purposes it was replaced by the much more
> expensive function that guarantees that all the output values are
> equally likely - rather than just evenly distributed.
>
> David
>
> >
> > Cc: "Theodore Ts'o" <tytso@mit.edu> (maintainer:RANDOM NUMBER DRIVER)
> > Cc: "Jason A. Donenfeld" <Jason@zx2c4.com> (maintainer:RANDOM NUMBER DRIVER)
> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > ---
> > drivers/char/random.c | 22 ++++++++++++++++++++++
> > include/linux/random.h | 22 ++++++++++++++++++++++
> > 2 files changed, 44 insertions(+)
> >
> > diff --git a/drivers/char/random.c b/drivers/char/random.c
> > index 2581186fa61b..84808300044c 100644
> > --- a/drivers/char/random.c
> > +++ b/drivers/char/random.c
> > @@ -588,6 +588,28 @@ u32 __get_random_u32_below(u32 ceil)
> > }
> > EXPORT_SYMBOL(__get_random_u32_below);
> >
> > +u64 __get_random_u64_below(u64 ceil)
> > +{
> > + if (unlikely(!ceil))
> > + return get_random_u64();
> > + if (ceil <= U32_MAX)
> > + return __get_random_u32_below(ceil);
> > +
> > + u64 rand = get_random_u64();
> > + u64 mult = ceil * rand;
> > +
> > + if (unlikely(mult < ceil)) {
> > + u64 bound = -ceil % ceil;
> > + while (unlikely(mult < bound)) {
> > + rand = get_random_u64();
> > + mult = ceil * rand;
> > + }
> > + }
> > +
> > + return mul_u64_u64_shr(ceil, rand, 64);
> > +}
> > +EXPORT_SYMBOL(__get_random_u64_below);
> > +
> > #ifdef CONFIG_SMP
> > /*
> > * This function is called when the CPU is coming up, with entry
> > diff --git a/include/linux/random.h b/include/linux/random.h
> > index 333cecfca93f..b025bf3d8f27 100644
> > --- a/include/linux/random.h
> > +++ b/include/linux/random.h
> > @@ -6,6 +6,7 @@
> > #include <linux/bug.h>
> > #include <linux/kernel.h>
> > #include <linux/list.h>
> > +#include <linux/math64.h>
> >
> > #include <uapi/linux/random.h>
> >
> > @@ -90,6 +91,27 @@ static inline u32 get_random_u32_below(u32 ceil)
> > }
> > }
> >
> > +u64 __get_random_u64_below(u64 ceil);
> > +
> > +static inline u64 get_random_u64_below(u32 ceil)
> > +{
> > + if (!__builtin_constant_p(ceil))
> > + return __get_random_u64_below(ceil);
> > +
> > + BUILD_BUG_ON_MSG(!ceil, "get_random_u64_below() must take ceil > 0");
> > + if (ceil <= 1)
> > + return 0;
> > + if (ceil <= U32_MAX)
> > + return get_random_u32_below(ceil);
> > +
> > + for (;;) {
> > + u64 rand = get_random_u64();
> > + u64 mult = ceil * rand;
> > + if (likely(mult >= -ceil % ceil))
> > + return mul_u64_u64_shr(ceil, rand, 64);
> > + }
> > +}
> > +
> > /*
> > * Returns a random integer in the interval (floor, U32_MAX], with uniform
> > * distribution, suitable for all uses. Fastest when floor is a constant, but
>
© 2016 - 2025 Red Hat, Inc.