We will shortly use prandom_u32_state() to implement kstack offset
randomization and some arches need to call it from non-instrumentable
context. Given the function is just a handful of operations and doesn't
call out to any other functions, let's take the easy path and make it
__always_inline.
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
include/linux/prandom.h | 19 ++++++++++++++++++-
lib/random32.c | 19 -------------------
2 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index ff7dcc3fa105..e797b3709f5c 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -17,7 +17,24 @@ struct rnd_state {
__u32 s1, s2, s3, s4;
};
-u32 prandom_u32_state(struct rnd_state *state);
+/**
+ * prandom_u32_state - seeded pseudo-random number generator.
+ * @state: pointer to state structure holding seeded state.
+ *
+ * This is used for pseudo-randomness with no outside seeding.
+ * For more random results, use get_random_u32().
+ */
+static __always_inline u32 prandom_u32_state(struct rnd_state *state)
+{
+#define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b)
+ state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U);
+ state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U);
+ state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U);
+ state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U);
+
+ return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4);
+}
+
void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes);
void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);
diff --git a/lib/random32.c b/lib/random32.c
index 24e7acd9343f..d57baf489d4a 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -42,25 +42,6 @@
#include <linux/slab.h>
#include <linux/unaligned.h>
-/**
- * prandom_u32_state - seeded pseudo-random number generator.
- * @state: pointer to state structure holding seeded state.
- *
- * This is used for pseudo-randomness with no outside seeding.
- * For more random results, use get_random_u32().
- */
-u32 prandom_u32_state(struct rnd_state *state)
-{
-#define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b)
- state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U);
- state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U);
- state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U);
- state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U);
-
- return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4);
-}
-EXPORT_SYMBOL(prandom_u32_state);
-
/**
* prandom_bytes_state - get the requested number of pseudo-random bytes
*
--
2.43.0
On Fri, Jan 02, 2026 at 01:11:53PM +0000, Ryan Roberts wrote:
> We will shortly use prandom_u32_state() to implement kstack offset
> randomization and some arches need to call it from non-instrumentable
> context. Given the function is just a handful of operations and doesn't
> call out to any other functions, let's take the easy path and make it
> __always_inline.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
I see there were some comments about keeping an out-of-line wrapper.
With or without that, this looks good to me, and either way:
Acked-by: Mark Rutland <mark.rutland@arm.com>
Mark.
> ---
> include/linux/prandom.h | 19 ++++++++++++++++++-
> lib/random32.c | 19 -------------------
> 2 files changed, 18 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/prandom.h b/include/linux/prandom.h
> index ff7dcc3fa105..e797b3709f5c 100644
> --- a/include/linux/prandom.h
> +++ b/include/linux/prandom.h
> @@ -17,7 +17,24 @@ struct rnd_state {
> __u32 s1, s2, s3, s4;
> };
>
> -u32 prandom_u32_state(struct rnd_state *state);
> +/**
> + * prandom_u32_state - seeded pseudo-random number generator.
> + * @state: pointer to state structure holding seeded state.
> + *
> + * This is used for pseudo-randomness with no outside seeding.
> + * For more random results, use get_random_u32().
> + */
> +static __always_inline u32 prandom_u32_state(struct rnd_state *state)
> +{
> +#define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b)
> + state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U);
> + state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U);
> + state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U);
> + state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U);
> +
> + return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4);
> +}
> +
> void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes);
> void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);
>
> diff --git a/lib/random32.c b/lib/random32.c
> index 24e7acd9343f..d57baf489d4a 100644
> --- a/lib/random32.c
> +++ b/lib/random32.c
> @@ -42,25 +42,6 @@
> #include <linux/slab.h>
> #include <linux/unaligned.h>
>
> -/**
> - * prandom_u32_state - seeded pseudo-random number generator.
> - * @state: pointer to state structure holding seeded state.
> - *
> - * This is used for pseudo-randomness with no outside seeding.
> - * For more random results, use get_random_u32().
> - */
> -u32 prandom_u32_state(struct rnd_state *state)
> -{
> -#define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b)
> - state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U);
> - state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U);
> - state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U);
> - state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U);
> -
> - return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4);
> -}
> -EXPORT_SYMBOL(prandom_u32_state);
> -
> /**
> * prandom_bytes_state - get the requested number of pseudo-random bytes
> *
> --
> 2.43.0
>
Hi Ryan, On Fri, Jan 2, 2026 at 2:12 PM Ryan Roberts <ryan.roberts@arm.com> wrote: > context. Given the function is just a handful of operations and doesn't How many? What's this looking like in terms of assembly? It'd also be nice to have some brief analysis of other call sites to have confirmation this isn't blowing up other users. > +static __always_inline u32 prandom_u32_state(struct rnd_state *state) Why not just normal `inline`? Is gcc disagreeing with the inlinability of this function? Jason
On Fri, 2 Jan 2026 14:39:21 +0100 "Jason A. Donenfeld" <Jason@zx2c4.com> wrote: > Hi Ryan, ... > > +static __always_inline u32 prandom_u32_state(struct rnd_state *state) > > Why not just normal `inline`? Is gcc disagreeing with the inlinability > of this function? gcc has a mind of its own when it comes to inlining. If there weren't some massive functions marked 'inline' that should never really be inlined then making 'inline' '__always_inline' would make sense. But first an audit would be needed. (This has come up several times in the past.) But if you need a function to be inlined (for any reason) it needs to be always_inline. Whether there should be a non-inlined 'option' here is another matter. There could be a normal function that calls the inlined version. David > > Jason >
On 02/01/2026 13:39, Jason A. Donenfeld wrote: > Hi Ryan, > > On Fri, Jan 2, 2026 at 2:12 PM Ryan Roberts <ryan.roberts@arm.com> wrote: >> context. Given the function is just a handful of operations and doesn't > > How many? What's this looking like in terms of assembly? 25 instructions on arm64: 0000000000000000 <prandom_u32_state>: 0: 29401403 ldp w3, w5, [x0] 4: aa0003e1 mov x1, x0 8: 29410002 ldp w2, w0, [x0, #8] c: 531e74a4 lsl w4, w5, #2 10: 530e3468 lsl w8, w3, #18 14: 4a0400a5 eor w5, w5, w4 18: 4a031863 eor w3, w3, w3, lsl #6 1c: 53196047 lsl w7, w2, #7 20: 53134806 lsl w6, w0, #13 24: 4a023442 eor w2, w2, w2, lsl #13 28: 4a000c00 eor w0, w0, w0, lsl #3 2c: 121b6884 and w4, w4, #0xffffffe0 30: 120d3108 and w8, w8, #0xfff80000 34: 121550e7 and w7, w7, #0xfffff800 38: 120c2cc6 and w6, w6, #0xfff00000 3c: 2a456c85 orr w5, w4, w5, lsr #27 40: 2a433504 orr w4, w8, w3, lsr #13 44: 2a4254e3 orr w3, w7, w2, lsr #21 48: 2a4030c2 orr w2, w6, w0, lsr #12 4c: 4a020066 eor w6, w3, w2 50: 4a050080 eor w0, w4, w5 54: 4a0000c0 eor w0, w6, w0 58: 29001424 stp w4, w5, [x1] 5c: 29010823 stp w3, w2, [x1, #8] 60: d65f03c0 ret > It'd also be > nice to have some brief analysis of other call sites to have > confirmation this isn't blowing up other users. I compiled defconfig before and after this patch on arm64 and compared the text sizes: $ ./scripts/bloat-o-meter -t vmlinux.before vmlinux.after add/remove: 3/4 grow/shrink: 4/1 up/down: 836/-128 (708) Function old new delta prandom_seed_full_state 364 932 +568 pick_next_task_fair 1940 2036 +96 bpf_user_rnd_u32 104 196 +92 prandom_bytes_state 204 260 +56 e843419@0f2b_00012d69_e34 - 8 +8 e843419@0db7_00010ec3_23ec - 8 +8 e843419@02cb_00003767_25c - 8 +8 bpf_prog_select_runtime 448 444 -4 e843419@0aa3_0000cfd1_1580 8 - -8 e843419@0aa2_0000cfba_147c 8 - -8 e843419@075f_00008d8c_184 8 - -8 prandom_u32_state 100 - -100 Total: Before=19078072, After=19078780, chg +0.00% So 708 bytes more after inlining. 
The main cost is prandom_seed_full_state(), which calls prandom_u32_state() 10 times (via prandom_warmup()). I expect we could turn that into a loop to reduce ~450 bytes overall. I'm not really sure if 708 is good or bad... > >> +static __always_inline u32 prandom_u32_state(struct rnd_state *state) > > Why not just normal `inline`? Is gcc disagreeing with the inlinability > of this function? Given this needs to be called from a noinstr function, I didn't want to give the compiler the opportunity to decide not to inline it, since in that case, some instrumentation might end up being applied to the function body which would blow up when called in the noinstr context. I think the other 2 options are to keep prandom_u32_state() in the c file but mark it noinstr or rearrange all the users so that they don't call it until instrumentation is allowable. The latter is something I was trying to avoid. There is some previous discussion of this at [1]. [1] https://lore.kernel.org/all/aS65LFUfdgRPKv1l@J2N7QTR9R3/ Perhaps keeping prandom_u32_state() in the c file and making it noinstr is the best compromise? Thanks, Ryan > > Jason
On Fri, 2 Jan 2026 14:09:26 +0000
Ryan Roberts <ryan.roberts@arm.com> wrote:
> On 02/01/2026 13:39, Jason A. Donenfeld wrote:
> > Hi Ryan,
> >
> > On Fri, Jan 2, 2026 at 2:12 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >> context. Given the function is just a handful of operations and doesn't
> >
> > How many? What's this looking like in terms of assembly?
>
> 25 instructions on arm64:
>
> 0000000000000000 <prandom_u32_state>:
> 0: 29401403 ldp w3, w5, [x0]
> 4: aa0003e1 mov x1, x0
> 8: 29410002 ldp w2, w0, [x0, #8]
> c: 531e74a4 lsl w4, w5, #2
> 10: 530e3468 lsl w8, w3, #18
> 14: 4a0400a5 eor w5, w5, w4
> 18: 4a031863 eor w3, w3, w3, lsl #6
> 1c: 53196047 lsl w7, w2, #7
> 20: 53134806 lsl w6, w0, #13
> 24: 4a023442 eor w2, w2, w2, lsl #13
> 28: 4a000c00 eor w0, w0, w0, lsl #3
> 2c: 121b6884 and w4, w4, #0xffffffe0
> 30: 120d3108 and w8, w8, #0xfff80000
> 34: 121550e7 and w7, w7, #0xfffff800
> 38: 120c2cc6 and w6, w6, #0xfff00000
> 3c: 2a456c85 orr w5, w4, w5, lsr #27
> 40: 2a433504 orr w4, w8, w3, lsr #13
> 44: 2a4254e3 orr w3, w7, w2, lsr #21
> 48: 2a4030c2 orr w2, w6, w0, lsr #12
> 4c: 4a020066 eor w6, w3, w2
> 50: 4a050080 eor w0, w4, w5
> 54: 4a0000c0 eor w0, w6, w0
> 58: 29001424 stp w4, w5, [x1]
> 5c: 29010823 stp w3, w2, [x1, #8]
> 60: d65f03c0 ret
That is gcc, clang seems to generate something horrid (from godbolt).
I'm not sure what it has tried to do (and maybe it can't in kernel)
but it clearly doesn't help!
.LCPI0_0:
.word 18
.word 2
.word 7
.word 13
.LCPI0_1:
.word 6
.word 2
.word 13
.word 3
.LCPI0_2:
.word 4294443008
.word 4294967264
.word 4294965248
.word 4293918720
.LCPI0_3:
.word 4294967283
.word 4294967269
.word 4294967275
.word 4294967284
prandom_u32_state:
adrp x9, .LCPI0_1
ldr q0, [x0]
adrp x10, .LCPI0_3
ldr q1, [x9, :lo12:.LCPI0_1]
adrp x9, .LCPI0_0
ldr q3, [x10, :lo12:.LCPI0_3]
ldr q2, [x9, :lo12:.LCPI0_0]
adrp x9, .LCPI0_2
mov x8, x0
ushl v1.4s, v0.4s, v1.4s
ushl v2.4s, v0.4s, v2.4s
eor v0.16b, v1.16b, v0.16b
ldr q1, [x9, :lo12:.LCPI0_2]
and v1.16b, v2.16b, v1.16b
ushl v0.4s, v0.4s, v3.4s
orr v0.16b, v0.16b, v1.16b
ext v1.16b, v0.16b, v0.16b, #8
str q0, [x8]
eor v1.8b, v0.8b, v1.8b
fmov x9, d1
lsr x10, x9, #32
eor w0, w9, w10
ret
The x86 versions are a little longer (arm's barrel shifter helps a lot).
>
> > It'd also be
> > nice to have some brief analysis of other call sites to have
> > confirmation this isn't blowing up other users.
>
> I compiled defconfig before and after this patch on arm64 and compared the text
> sizes:
>
> $ ./scripts/bloat-o-meter -t vmlinux.before vmlinux.after
> add/remove: 3/4 grow/shrink: 4/1 up/down: 836/-128 (708)
> Function old new delta
> prandom_seed_full_state 364 932 +568
> pick_next_task_fair 1940 2036 +96
> bpf_user_rnd_u32 104 196 +92
> prandom_bytes_state 204 260 +56
> e843419@0f2b_00012d69_e34 - 8 +8
> e843419@0db7_00010ec3_23ec - 8 +8
> e843419@02cb_00003767_25c - 8 +8
> bpf_prog_select_runtime 448 444 -4
> e843419@0aa3_0000cfd1_1580 8 - -8
> e843419@0aa2_0000cfba_147c 8 - -8
> e843419@075f_00008d8c_184 8 - -8
> prandom_u32_state 100 - -100
> Total: Before=19078072, After=19078780, chg +0.00%
>
> So 708 bytes more after inlining.
Doesn't look like there are many calls.
> The main cost is prandom_seed_full_state(),
> which calls prandom_u32_state() 10 times (via prandom_warmup()). I expect we
> could turn that into a loop to reduce ~450 bytes overall.
That would always have helped the code size.
And I suspect the other costs of that code make unrolling the loop pointless.
>
> I'm not really sure if 708 is good or bad...
>
> >
> >> +static __always_inline u32 prandom_u32_state(struct rnd_state *state)
> >
> > Why not just normal `inline`? Is gcc disagreeing with the inlinability
> > of this function?
>
> Given this needs to be called from a noinstr function, I didn't want to give the
> compiler the opportunity to decide not to inline it, since in that case, some
> instrumentation might end up being applied to the function body which would blow
> up when called in the noinstr context.
>
> I think the other 2 options are to keep prandom_u32_state() in the c file but
> mark it noinstr or rearrange all the users so that thay don't call it until
> instrumentation is allowable. The latter is something I was trying to avoid.
>
> There is some previous discussion of this at [1].
>
> [1] https://lore.kernel.org/all/aS65LFUfdgRPKv1l@J2N7QTR9R3/
>
> Perhaps keeping prandom_u32_state() in the c file and making it noinstr is the
> best compromise?
Or define prandom_u32_state_inline() as always_inline and have the
real function:
u32 prandom_u32_state(struct rnd_state *state)
{
return prandom_u32_state_inline(state);
}
So that the callers can pick the inline version if it really matters.
David
>
> Thanks,
> Ryan
>
> >
> > Jason
>
>
On 03/01/2026 10:46, David Laight wrote:
> On Fri, 2 Jan 2026 14:09:26 +0000
> Ryan Roberts <ryan.roberts@arm.com> wrote:
>
>> On 02/01/2026 13:39, Jason A. Donenfeld wrote:
>>> Hi Ryan,
>>>
>>> On Fri, Jan 2, 2026 at 2:12 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>> context. Given the function is just a handful of operations and doesn't
>>>
>>> How many? What's this looking like in terms of assembly?
>>
>> 25 instructions on arm64:
>>
>> 0000000000000000 <prandom_u32_state>:
>> 0: 29401403 ldp w3, w5, [x0]
>> 4: aa0003e1 mov x1, x0
>> 8: 29410002 ldp w2, w0, [x0, #8]
>> c: 531e74a4 lsl w4, w5, #2
>> 10: 530e3468 lsl w8, w3, #18
>> 14: 4a0400a5 eor w5, w5, w4
>> 18: 4a031863 eor w3, w3, w3, lsl #6
>> 1c: 53196047 lsl w7, w2, #7
>> 20: 53134806 lsl w6, w0, #13
>> 24: 4a023442 eor w2, w2, w2, lsl #13
>> 28: 4a000c00 eor w0, w0, w0, lsl #3
>> 2c: 121b6884 and w4, w4, #0xffffffe0
>> 30: 120d3108 and w8, w8, #0xfff80000
>> 34: 121550e7 and w7, w7, #0xfffff800
>> 38: 120c2cc6 and w6, w6, #0xfff00000
>> 3c: 2a456c85 orr w5, w4, w5, lsr #27
>> 40: 2a433504 orr w4, w8, w3, lsr #13
>> 44: 2a4254e3 orr w3, w7, w2, lsr #21
>> 48: 2a4030c2 orr w2, w6, w0, lsr #12
>> 4c: 4a020066 eor w6, w3, w2
>> 50: 4a050080 eor w0, w4, w5
>> 54: 4a0000c0 eor w0, w6, w0
>> 58: 29001424 stp w4, w5, [x1]
>> 5c: 29010823 stp w3, w2, [x1, #8]
>> 60: d65f03c0 ret
>
> That is gcc, clang seems to generate something horrid (from godbolt).
> I'm not sure what it has tried to do (and maybe it can't in kernel)
> but it clearly doesn't help!
> .LCPI0_0:
> .word 18
> .word 2
> .word 7
> .word 13
> .LCPI0_1:
> .word 6
> .word 2
> .word 13
> .word 3
> .LCPI0_2:
> .word 4294443008
> .word 4294967264
> .word 4294965248
> .word 4293918720
> .LCPI0_3:
> .word 4294967283
> .word 4294967269
> .word 4294967275
> .word 4294967284
> prandom_u32_state:
> adrp x9, .LCPI0_1
> ldr q0, [x0]
> adrp x10, .LCPI0_3
> ldr q1, [x9, :lo12:.LCPI0_1]
> adrp x9, .LCPI0_0
> ldr q3, [x10, :lo12:.LCPI0_3]
> ldr q2, [x9, :lo12:.LCPI0_0]
> adrp x9, .LCPI0_2
> mov x8, x0
> ushl v1.4s, v0.4s, v1.4s
> ushl v2.4s, v0.4s, v2.4s
> eor v0.16b, v1.16b, v0.16b
> ldr q1, [x9, :lo12:.LCPI0_2]
> and v1.16b, v2.16b, v1.16b
> ushl v0.4s, v0.4s, v3.4s
> orr v0.16b, v0.16b, v1.16b
> ext v1.16b, v0.16b, v0.16b, #8
> str q0, [x8]
> eor v1.8b, v0.8b, v1.8b
> fmov x9, d1
> lsr x10, x9, #32
> eor w0, w9, w10
> ret
>
> The x86 versions are a little longer (arm's barrel shifter helps a lot).
>
>>
>>> It'd also be
>>> nice to have some brief analysis of other call sites to have
>>> confirmation this isn't blowing up other users.
>>
>> I compiled defconfig before and after this patch on arm64 and compared the text
>> sizes:
>>
>> $ ./scripts/bloat-o-meter -t vmlinux.before vmlinux.after
>> add/remove: 3/4 grow/shrink: 4/1 up/down: 836/-128 (708)
>> Function old new delta
>> prandom_seed_full_state 364 932 +568
>> pick_next_task_fair 1940 2036 +96
>> bpf_user_rnd_u32 104 196 +92
>> prandom_bytes_state 204 260 +56
>> e843419@0f2b_00012d69_e34 - 8 +8
>> e843419@0db7_00010ec3_23ec - 8 +8
>> e843419@02cb_00003767_25c - 8 +8
>> bpf_prog_select_runtime 448 444 -4
>> e843419@0aa3_0000cfd1_1580 8 - -8
>> e843419@0aa2_0000cfba_147c 8 - -8
>> e843419@075f_00008d8c_184 8 - -8
>> prandom_u32_state 100 - -100
>> Total: Before=19078072, After=19078780, chg +0.00%
>>
>> So 708 bytes more after inlining.
>
> Doesn't look like there are many calls.
>
>> The main cost is prandom_seed_full_state(),
>> which calls prandom_u32_state() 10 times (via prandom_warmup()). I expect we
>> could turn that into a loop to reduce ~450 bytes overall.
>
> That would always have helped the code size.
> And I suspect the other costs of that code make unrolling the loop pointless.
>
>>
>> I'm not really sure if 708 is good or bad...
>>
>>>
>>>> +static __always_inline u32 prandom_u32_state(struct rnd_state *state)
>>>
>>> Why not just normal `inline`? Is gcc disagreeing with the inlinability
>>> of this function?
>>
>> Given this needs to be called from a noinstr function, I didn't want to give the
>> compiler the opportunity to decide not to inline it, since in that case, some
>> instrumentation might end up being applied to the function body which would blow
>> up when called in the noinstr context.
>>
>> I think the other 2 options are to keep prandom_u32_state() in the c file but
>> mark it noinstr or rearrange all the users so that thay don't call it until
>> instrumentation is allowable. The latter is something I was trying to avoid.
>>
>> There is some previous discussion of this at [1].
>>
>> [1] https://lore.kernel.org/all/aS65LFUfdgRPKv1l@J2N7QTR9R3/
>>
>> Perhaps keeping prandom_u32_state() in the c file and making it noinstr is the
>> best compromise?
>
> Or define prandom_u32_state_inline() as always_inline and have the
> real function:
> u32 prandom_u32_state(struct rnd_state *state)
> {
> return prandom_u32_state_inline(state);
> }
>
> So that the callers can pick the inline version if it really matters.
Ahh yes, that sounds like the simplest/best idea to me. I'll take this approach
for the next version assuming Jason is ok with it?
Thanks,
Ryan
>
> David
>
>>
>> Thanks,
>> Ryan
>>
>>>
>>> Jason
>>
>>
>
Le 02/01/2026 à 15:09, Ryan Roberts a écrit :
> On 02/01/2026 13:39, Jason A. Donenfeld wrote:
>> Hi Ryan,
>>
>> On Fri, Jan 2, 2026 at 2:12 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>> context. Given the function is just a handful of operations and doesn't
>>
>> How many? What's this looking like in terms of assembly?
>
> 25 instructions on arm64:
31 instructions on powerpc:
00000000 <prandom_u32_state>:
0: 7c 69 1b 78 mr r9,r3
4: 80 63 00 00 lwz r3,0(r3)
8: 80 89 00 08 lwz r4,8(r9)
c: 81 69 00 04 lwz r11,4(r9)
10: 80 a9 00 0c lwz r5,12(r9)
14: 54 67 30 32 slwi r7,r3,6
18: 7c e7 1a 78 xor r7,r7,r3
1c: 55 66 10 3a slwi r6,r11,2
20: 54 88 68 24 slwi r8,r4,13
24: 54 63 90 18 rlwinm r3,r3,18,0,12
28: 7d 6b 32 78 xor r11,r11,r6
2c: 7d 08 22 78 xor r8,r8,r4
30: 54 aa 18 38 slwi r10,r5,3
34: 54 e7 9b 7e srwi r7,r7,13
38: 7c e7 1a 78 xor r7,r7,r3
3c: 51 66 2e fe rlwimi r6,r11,5,27,31
40: 54 84 38 28 rlwinm r4,r4,7,0,20
44: 7d 4a 2a 78 xor r10,r10,r5
48: 55 08 5d 7e srwi r8,r8,21
4c: 7d 08 22 78 xor r8,r8,r4
50: 7c e3 32 78 xor r3,r7,r6
54: 54 a5 68 16 rlwinm r5,r5,13,0,11
58: 55 4a a3 3e srwi r10,r10,12
5c: 7d 4a 2a 78 xor r10,r10,r5
60: 7c 63 42 78 xor r3,r3,r8
64: 90 e9 00 00 stw r7,0(r9)
68: 90 c9 00 04 stw r6,4(r9)
6c: 91 09 00 08 stw r8,8(r9)
70: 91 49 00 0c stw r10,12(r9)
74: 7c 63 52 78 xor r3,r3,r10
78: 4e 80 00 20 blr
Among those, 8 instructions are for reading/writing the state in stack.
They of course disappear when inlining.
>
>> It'd also be
>> nice to have some brief analysis of other call sites to have
>> confirmation this isn't blowing up other users.
>
> I compiled defconfig before and after this patch on arm64 and compared the text
> sizes:
>
> $ ./scripts/bloat-o-meter -t vmlinux.before vmlinux.after
> add/remove: 3/4 grow/shrink: 4/1 up/down: 836/-128 (708)
> Function old new delta
> prandom_seed_full_state 364 932 +568
> pick_next_task_fair 1940 2036 +96
> bpf_user_rnd_u32 104 196 +92
> prandom_bytes_state 204 260 +56
> e843419@0f2b_00012d69_e34 - 8 +8
> e843419@0db7_00010ec3_23ec - 8 +8
> e843419@02cb_00003767_25c - 8 +8
> bpf_prog_select_runtime 448 444 -4
> e843419@0aa3_0000cfd1_1580 8 - -8
> e843419@0aa2_0000cfba_147c 8 - -8
> e843419@075f_00008d8c_184 8 - -8
> prandom_u32_state 100 - -100
> Total: Before=19078072, After=19078780, chg +0.00%
>
> So 708 bytes more after inlining. The main cost is prandom_seed_full_state(),
> which calls prandom_u32_state() 10 times (via prandom_warmup()). I expect we
> could turn that into a loop to reduce ~450 bytes overall.
>
With the following change the increase of prandom_seed_full_state() remains
reasonable, and performance-wise it is a lot better as it avoids the
read/write of the state via the stack
diff --git a/lib/random32.c b/lib/random32.c
index 24e7acd9343f6..28a5b109c9018 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -94,17 +94,11 @@ EXPORT_SYMBOL(prandom_bytes_state);
static void prandom_warmup(struct rnd_state *state)
{
+ int i;
+
/* Calling RNG ten times to satisfy recurrence condition */
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
- prandom_u32_state(state);
+ for (i = 0; i < 10; i++)
+ prandom_u32_state(state);
}
void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
The loop is:
248: 38 e0 00 0a li r7,10
24c: 7c e9 03 a6 mtctr r7
250: 55 05 30 32 slwi r5,r8,6
254: 55 46 68 24 slwi r6,r10,13
258: 55 27 18 38 slwi r7,r9,3
25c: 7c a5 42 78 xor r5,r5,r8
260: 7c c6 52 78 xor r6,r6,r10
264: 7c e7 4a 78 xor r7,r7,r9
268: 54 8b 10 3a slwi r11,r4,2
26c: 7d 60 22 78 xor r0,r11,r4
270: 54 a5 9b 7e srwi r5,r5,13
274: 55 08 90 18 rlwinm r8,r8,18,0,12
278: 54 c6 5d 7e srwi r6,r6,21
27c: 55 4a 38 28 rlwinm r10,r10,7,0,20
280: 54 e7 a3 3e srwi r7,r7,12
284: 55 29 68 16 rlwinm r9,r9,13,0,11
288: 7d 64 5b 78 mr r4,r11
28c: 7c a8 42 78 xor r8,r5,r8
290: 7c ca 52 78 xor r10,r6,r10
294: 7c e9 4a 78 xor r9,r7,r9
298: 50 04 2e fe rlwimi r4,r0,5,27,31
29c: 42 00 ff b4 bdnz 250 <prandom_seed_full_state+0x7c>
Which replaces the 10 calls to prandom_u32_state()
fc: 91 3f 00 0c stw r9,12(r31)
100: 7f e3 fb 78 mr r3,r31
104: 48 00 00 01 bl 104 <prandom_seed_full_state+0x88>
104: R_PPC_REL24 prandom_u32_state
108: 7f e3 fb 78 mr r3,r31
10c: 48 00 00 01 bl 10c <prandom_seed_full_state+0x90>
10c: R_PPC_REL24 prandom_u32_state
110: 7f e3 fb 78 mr r3,r31
114: 48 00 00 01 bl 114 <prandom_seed_full_state+0x98>
114: R_PPC_REL24 prandom_u32_state
118: 7f e3 fb 78 mr r3,r31
11c: 48 00 00 01 bl 11c <prandom_seed_full_state+0xa0>
11c: R_PPC_REL24 prandom_u32_state
120: 7f e3 fb 78 mr r3,r31
124: 48 00 00 01 bl 124 <prandom_seed_full_state+0xa8>
124: R_PPC_REL24 prandom_u32_state
128: 7f e3 fb 78 mr r3,r31
12c: 48 00 00 01 bl 12c <prandom_seed_full_state+0xb0>
12c: R_PPC_REL24 prandom_u32_state
130: 7f e3 fb 78 mr r3,r31
134: 48 00 00 01 bl 134 <prandom_seed_full_state+0xb8>
134: R_PPC_REL24 prandom_u32_state
138: 7f e3 fb 78 mr r3,r31
13c: 48 00 00 01 bl 13c <prandom_seed_full_state+0xc0>
13c: R_PPC_REL24 prandom_u32_state
140: 7f e3 fb 78 mr r3,r31
144: 48 00 00 01 bl 144 <prandom_seed_full_state+0xc8>
144: R_PPC_REL24 prandom_u32_state
148: 80 01 00 24 lwz r0,36(r1)
14c: 7f e3 fb 78 mr r3,r31
150: 83 e1 00 1c lwz r31,28(r1)
154: 7c 08 03 a6 mtlr r0
158: 38 21 00 20 addi r1,r1,32
15c: 48 00 00 00 b 15c <prandom_seed_full_state+0xe0>
15c: R_PPC_REL24 prandom_u32_state
So approximately the same number of instructions in size, with better performance.
> I'm not really sure if 708 is good or bad...
That's in the noise compared to the overall size of vmlinux, but if we
change it to a loop we also reduce pressure on the cache.
Christophe
On 03/01/2026 08:00, Christophe Leroy (CS GROUP) wrote:
>
>
> Le 02/01/2026 à 15:09, Ryan Roberts a écrit :
>> On 02/01/2026 13:39, Jason A. Donenfeld wrote:
>>> Hi Ryan,
>>>
>>> On Fri, Jan 2, 2026 at 2:12 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>> context. Given the function is just a handful of operations and doesn't
>>>
>>> How many? What's this looking like in terms of assembly?
>>
>> 25 instructions on arm64:
>
> 31 instructions on powerpc:
>
> 00000000 <prandom_u32_state>:
> 0: 7c 69 1b 78 mr r9,r3
> 4: 80 63 00 00 lwz r3,0(r3)
> 8: 80 89 00 08 lwz r4,8(r9)
> c: 81 69 00 04 lwz r11,4(r9)
> 10: 80 a9 00 0c lwz r5,12(r9)
> 14: 54 67 30 32 slwi r7,r3,6
> 18: 7c e7 1a 78 xor r7,r7,r3
> 1c: 55 66 10 3a slwi r6,r11,2
> 20: 54 88 68 24 slwi r8,r4,13
> 24: 54 63 90 18 rlwinm r3,r3,18,0,12
> 28: 7d 6b 32 78 xor r11,r11,r6
> 2c: 7d 08 22 78 xor r8,r8,r4
> 30: 54 aa 18 38 slwi r10,r5,3
> 34: 54 e7 9b 7e srwi r7,r7,13
> 38: 7c e7 1a 78 xor r7,r7,r3
> 3c: 51 66 2e fe rlwimi r6,r11,5,27,31
> 40: 54 84 38 28 rlwinm r4,r4,7,0,20
> 44: 7d 4a 2a 78 xor r10,r10,r5
> 48: 55 08 5d 7e srwi r8,r8,21
> 4c: 7d 08 22 78 xor r8,r8,r4
> 50: 7c e3 32 78 xor r3,r7,r6
> 54: 54 a5 68 16 rlwinm r5,r5,13,0,11
> 58: 55 4a a3 3e srwi r10,r10,12
> 5c: 7d 4a 2a 78 xor r10,r10,r5
> 60: 7c 63 42 78 xor r3,r3,r8
> 64: 90 e9 00 00 stw r7,0(r9)
> 68: 90 c9 00 04 stw r6,4(r9)
> 6c: 91 09 00 08 stw r8,8(r9)
> 70: 91 49 00 0c stw r10,12(r9)
> 74: 7c 63 52 78 xor r3,r3,r10
> 78: 4e 80 00 20 blr
>
> Among those, 8 instructions are for reading/writing the state in stack. They of
> course disappear when inlining.
>
>>
>>> It'd also be
>>> nice to have some brief analysis of other call sites to have
>>> confirmation this isn't blowing up other users.
>>
>> I compiled defconfig before and after this patch on arm64 and compared the text
>> sizes:
>>
>> $ ./scripts/bloat-o-meter -t vmlinux.before vmlinux.after
>> add/remove: 3/4 grow/shrink: 4/1 up/down: 836/-128 (708)
>> Function old new delta
>> prandom_seed_full_state 364 932 +568
>> pick_next_task_fair 1940 2036 +96
>> bpf_user_rnd_u32 104 196 +92
>> prandom_bytes_state 204 260 +56
>> e843419@0f2b_00012d69_e34 - 8 +8
>> e843419@0db7_00010ec3_23ec - 8 +8
>> e843419@02cb_00003767_25c - 8 +8
>> bpf_prog_select_runtime 448 444 -4
>> e843419@0aa3_0000cfd1_1580 8 - -8
>> e843419@0aa2_0000cfba_147c 8 - -8
>> e843419@075f_00008d8c_184 8 - -8
>> prandom_u32_state 100 - -100
>> Total: Before=19078072, After=19078780, chg +0.00%
>>
>> So 708 bytes more after inlining. The main cost is prandom_seed_full_state(),
>> which calls prandom_u32_state() 10 times (via prandom_warmup()). I expect we
>> could turn that into a loop to reduce ~450 bytes overall.
>>
> With following change the increase of prandom_seed_full_state() remains
> reasonnable and performance wise it is a lot better as it avoids the read/write
> of the state via the stack
>
> diff --git a/lib/random32.c b/lib/random32.c
> index 24e7acd9343f6..28a5b109c9018 100644
> --- a/lib/random32.c
> +++ b/lib/random32.c
> @@ -94,17 +94,11 @@ EXPORT_SYMBOL(prandom_bytes_state);
>
> static void prandom_warmup(struct rnd_state *state)
> {
> + int i;
> +
> /* Calling RNG ten times to satisfy recurrence condition */
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> - prandom_u32_state(state);
> + for (i = 0; i < 10; i++)
> + prandom_u32_state(state);
> }
>
> void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
>
> The loop is:
>
> 248: 38 e0 00 0a li r7,10
> 24c: 7c e9 03 a6 mtctr r7
> 250: 55 05 30 32 slwi r5,r8,6
> 254: 55 46 68 24 slwi r6,r10,13
> 258: 55 27 18 38 slwi r7,r9,3
> 25c: 7c a5 42 78 xor r5,r5,r8
> 260: 7c c6 52 78 xor r6,r6,r10
> 264: 7c e7 4a 78 xor r7,r7,r9
> 268: 54 8b 10 3a slwi r11,r4,2
> 26c: 7d 60 22 78 xor r0,r11,r4
> 270: 54 a5 9b 7e srwi r5,r5,13
> 274: 55 08 90 18 rlwinm r8,r8,18,0,12
> 278: 54 c6 5d 7e srwi r6,r6,21
> 27c: 55 4a 38 28 rlwinm r10,r10,7,0,20
> 280: 54 e7 a3 3e srwi r7,r7,12
> 284: 55 29 68 16 rlwinm r9,r9,13,0,11
> 288: 7d 64 5b 78 mr r4,r11
> 28c: 7c a8 42 78 xor r8,r5,r8
> 290: 7c ca 52 78 xor r10,r6,r10
> 294: 7c e9 4a 78 xor r9,r7,r9
> 298: 50 04 2e fe rlwimi r4,r0,5,27,31
> 29c: 42 00 ff b4 bdnz 250 <prandom_seed_full_state+0x7c>
>
> Which replaces the 10 calls to prandom_u32_state()
>
> fc: 91 3f 00 0c stw r9,12(r31)
> 100: 7f e3 fb 78 mr r3,r31
> 104: 48 00 00 01 bl 104 <prandom_seed_full_state+0x88>
> 104: R_PPC_REL24 prandom_u32_state
> 108: 7f e3 fb 78 mr r3,r31
> 10c: 48 00 00 01 bl 10c <prandom_seed_full_state+0x90>
> 10c: R_PPC_REL24 prandom_u32_state
> 110: 7f e3 fb 78 mr r3,r31
> 114: 48 00 00 01 bl 114 <prandom_seed_full_state+0x98>
> 114: R_PPC_REL24 prandom_u32_state
> 118: 7f e3 fb 78 mr r3,r31
> 11c: 48 00 00 01 bl 11c <prandom_seed_full_state+0xa0>
> 11c: R_PPC_REL24 prandom_u32_state
> 120: 7f e3 fb 78 mr r3,r31
> 124: 48 00 00 01 bl 124 <prandom_seed_full_state+0xa8>
> 124: R_PPC_REL24 prandom_u32_state
> 128: 7f e3 fb 78 mr r3,r31
> 12c: 48 00 00 01 bl 12c <prandom_seed_full_state+0xb0>
> 12c: R_PPC_REL24 prandom_u32_state
> 130: 7f e3 fb 78 mr r3,r31
> 134: 48 00 00 01 bl 134 <prandom_seed_full_state+0xb8>
> 134: R_PPC_REL24 prandom_u32_state
> 138: 7f e3 fb 78 mr r3,r31
> 13c: 48 00 00 01 bl 13c <prandom_seed_full_state+0xc0>
> 13c: R_PPC_REL24 prandom_u32_state
> 140: 7f e3 fb 78 mr r3,r31
> 144: 48 00 00 01 bl 144 <prandom_seed_full_state+0xc8>
> 144: R_PPC_REL24 prandom_u32_state
> 148: 80 01 00 24 lwz r0,36(r1)
> 14c: 7f e3 fb 78 mr r3,r31
> 150: 83 e1 00 1c lwz r31,28(r1)
> 154: 7c 08 03 a6 mtlr r0
> 158: 38 21 00 20 addi r1,r1,32
> 15c: 48 00 00 00 b 15c <prandom_seed_full_state+0xe0>
> 15c: R_PPC_REL24 prandom_u32_state
>
>
> So approx the same number of instructions in size, while better performance.
>
>> I'm not really sure if 708 is good or bad...
>
> That's in the noise compared to the overall size of vmlinux, but if we change it
> to a loop we also reduce pressure on the cache.
Thanks for the analysis; I'm going to follow David's suggestion and refactor
this into both an __always_inline and an out-of-line version. That way the
existing callsites can continue to use the out-of-line version and we will only
use the inline version for the kstack offset randomization.
Thanks,
Ryan
>
> Christophe
© 2016 - 2026 Red Hat, Inc.