kernel/sched/isolation.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-)
When testing a linux-next kernel with commit 59bd1d914bb5 ("memblock:
warn when freeing reserved memory before memory map is initialized"),
the following warning was hit when there was a "nohz_full" kernel boot
parameter.
Cannot free reserved memory because of deferred initialization of the memory map
WARNING: mm/memblock.c:904 at __free_reserved_area+0xde/0xf0, CPU#0: swapper/0/0
:
Call Trace:
<TASK>
memblock_phys_free+0xcb/0x100
housekeeping_init+0x14c/0x170
start_kernel+0x207/0x450
x86_64_start_reservations+0x24/0x30
x86_64_start_kernel+0xda/0xe0
common_startup_64+0x13e/0x141
</TASK>
IOW, we shouldn't free memblock-allocated memory so early
in the boot process, before the memory map has been fully initialized by
deferred_init_memmap(). Fix it by adding a new housekeeping_late_init()
helper to defer the re-allocation of the housekeeping cpumasks until
initcalls are being processed. Also replace rcu_dereference() with
rcu_dereference_check() to prevent a spurious RCU lockdep splat, as RCU
will be active in this later boot stage.
This commit also depends on the presence of commit 7c2eee9c1367
("memblock: don't touch memblock arrays when memblock_free() is called
late") to prevent a KASAN UAF bug report [1].
[1] https://lore.kernel.org/lkml/20260505051821.1107133-1-longman@redhat.com/
Fixes: 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers")
Signed-off-by: Waiman Long <longman@redhat.com>
---
kernel/sched/isolation.c | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index ef152d401fe2..a947d75b43f1 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -164,8 +164,6 @@ int housekeeping_update(struct cpumask *isol_mask)
void __init housekeeping_init(void)
{
- enum hk_type type;
-
if (!housekeeping.flags)
return;
@@ -173,17 +171,27 @@ void __init housekeeping_init(void)
if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
+}
+
+static int __init housekeeping_late_init(void)
+{
+ enum hk_type type;
+
+ if (!housekeeping.flags)
+ return 0;
+
/*
* Realloc with a proper allocator so that any cpumask update
- * can indifferently free the old version with kfree().
+ * can indifferently free the old version with kfree(). This
+ * should be done after the completion of deferred_init_memmap().
*/
for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
if (WARN_ON_ONCE(!nmask))
- return;
+ return 0;
- omask = rcu_dereference(housekeeping.cpumasks[type]);
+ omask = rcu_dereference_check(housekeeping.cpumasks[type], 1);
/* We need at least one CPU to handle housekeeping work */
WARN_ON_ONCE(cpumask_empty(omask));
@@ -191,7 +199,9 @@ void __init housekeeping_init(void)
RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
memblock_free(omask, cpumask_size());
}
+ return 0;
}
+pure_initcall(housekeeping_late_init);
static void __init housekeeping_setup_type(enum hk_type type,
cpumask_var_t housekeeping_staging)
--
2.54.0
Le Tue, Jun 02, 2026 at 10:39:51AM -0400, Waiman Long a écrit :
> When testing a linux-next kernel with commit 59bd1d914bb5 ("memblock:
> warn when freeing reserved memory before memory map is initialized"),
> the following warning was hit when there was a "nohz_full" kernel boot
> parameter.
>
> Cannot free reserved memory because of deferred initialization of the memory map
> WARNING: mm/memblock.c:904 at __free_reserved_area+0xde/0xf0, CPU#0: swapper/0/0
> :
> Call Trace:
> <TASK>
> memblock_phys_free+0xcb/0x100
> housekeeping_init+0x14c/0x170
> start_kernel+0x207/0x450
> x86_64_start_reservations+0x24/0x30
> x86_64_start_kernel+0xda/0xe0
> common_startup_64+0x13e/0x141
> </TASK>
>
> IOW, we shouldn't free memblock allocated memory so early
> in the boot process when memory map isn't fully initialized in
> deferred_init_memmap(). Fix it by adding a new housekeeping_late_init()
> helper to defer the re-allocation of the housekeeping cpumasks to
> when initcall's are being processed. Also change rcu_reference() by
> rcu_reference_check() to prevent incorrect RCU lockdep splat as RCU
> will be active in this later boot stage.
>
> This commit also depends on the presence of commit 7c2eee9c1367
> ("memblock: don't touch memblock arrays when memblock_free() is called
> late") to prevent a KASAN UAF bug report [1].
>
> [1] https://lore.kernel.org/lkml/20260505051821.1107133-1-longman@redhat.com/
>
> Fixes: 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers")
> Signed-off-by: Waiman Long <longman@redhat.com>
Thanks for fixing it. Just some points below:
> ---
> kernel/sched/isolation.c | 20 +++++++++++++++-----
> 1 file changed, 15 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> index ef152d401fe2..a947d75b43f1 100644
> --- a/kernel/sched/isolation.c
> +++ b/kernel/sched/isolation.c
> @@ -164,8 +164,6 @@ int housekeeping_update(struct cpumask *isol_mask)
>
> void __init housekeeping_init(void)
> {
> - enum hk_type type;
> -
> if (!housekeeping.flags)
> return;
>
> @@ -173,17 +171,27 @@ void __init housekeeping_init(void)
>
> if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
> sched_tick_offload_init();
> +}
> +
> +static int __init housekeeping_late_init(void)
> +{
> + enum hk_type type;
> +
> + if (!housekeeping.flags)
> + return 0;
> +
> /*
> * Realloc with a proper allocator so that any cpumask update
> - * can indifferently free the old version with kfree().
> + * can indifferently free the old version with kfree(). This
> + * should be done after the completion of deferred_init_memmap().
> */
> for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
> struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
>
> if (WARN_ON_ONCE(!nmask))
> - return;
> + return 0;
>
> - omask = rcu_dereference(housekeeping.cpumasks[type]);
> + omask = rcu_dereference_check(housekeeping.cpumasks[type], 1);
>
> /* We need at least one CPU to handle housekeeping work */
> WARN_ON_ONCE(cpumask_empty(omask));
> @@ -191,7 +199,9 @@ void __init housekeeping_init(void)
> RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
> memblock_free(omask, cpumask_size());
> }
> + return 0;
> }
> +pure_initcall(housekeeping_late_init);
Now that it has become an initcall, what prevents other CPUs from accessing
the housekeeping cpumasks concurrently and then dereferencing the freed memory?
And even as a pre-SMP initcall, nothing prevents interrupts or preempting tasks
from accessing them either.
From which point on can memblock_free() be called? I see very early calls such
as print_unknown_bootoptions()...
Thanks.
>
> static void __init housekeeping_setup_type(enum hk_type type,
> cpumask_var_t housekeeping_staging)
> --
> 2.54.0
>
--
Frederic Weisbecker
SUSE Labs
On 6/4/26 11:24 AM, Frederic Weisbecker wrote:
> Le Tue, Jun 02, 2026 at 10:39:51AM -0400, Waiman Long a écrit :
>> When testing a linux-next kernel with commit 59bd1d914bb5 ("memblock:
>> warn when freeing reserved memory before memory map is initialized"),
>> the following warning was hit when there was a "nohz_full" kernel boot
>> parameter.
>>
>> Cannot free reserved memory because of deferred initialization of the memory map
>> WARNING: mm/memblock.c:904 at __free_reserved_area+0xde/0xf0, CPU#0: swapper/0/0
>> :
>> Call Trace:
>> <TASK>
>> memblock_phys_free+0xcb/0x100
>> housekeeping_init+0x14c/0x170
>> start_kernel+0x207/0x450
>> x86_64_start_reservations+0x24/0x30
>> x86_64_start_kernel+0xda/0xe0
>> common_startup_64+0x13e/0x141
>> </TASK>
>>
>> IOW, we shouldn't free memblock allocated memory so early
>> in the boot process when memory map isn't fully initialized in
>> deferred_init_memmap(). Fix it by adding a new housekeeping_late_init()
>> helper to defer the re-allocation of the housekeeping cpumasks to
>> when initcall's are being processed. Also change rcu_reference() by
>> rcu_reference_check() to prevent incorrect RCU lockdep splat as RCU
>> will be active in this later boot stage.
>>
>> This commit also depends on the presence of commit 7c2eee9c1367
>> ("memblock: don't touch memblock arrays when memblock_free() is called
>> late") to prevent a KASAN UAF bug report [1].
>>
>> [1] https://lore.kernel.org/lkml/20260505051821.1107133-1-longman@redhat.com/
>>
>> Fixes: 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers")
>> Signed-off-by: Waiman Long <longman@redhat.com>
> Thanks for fixing it. Just some points below:
>
>> ---
>> kernel/sched/isolation.c | 20 +++++++++++++++-----
>> 1 file changed, 15 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
>> index ef152d401fe2..a947d75b43f1 100644
>> --- a/kernel/sched/isolation.c
>> +++ b/kernel/sched/isolation.c
>> @@ -164,8 +164,6 @@ int housekeeping_update(struct cpumask *isol_mask)
>>
>> void __init housekeeping_init(void)
>> {
>> - enum hk_type type;
>> -
>> if (!housekeeping.flags)
>> return;
>>
>> @@ -173,17 +171,27 @@ void __init housekeeping_init(void)
>>
>> if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
>> sched_tick_offload_init();
>> +}
>> +
>> +static int __init housekeeping_late_init(void)
>> +{
>> + enum hk_type type;
>> +
>> + if (!housekeeping.flags)
>> + return 0;
>> +
>> /*
>> * Realloc with a proper allocator so that any cpumask update
>> - * can indifferently free the old version with kfree().
>> + * can indifferently free the old version with kfree(). This
>> + * should be done after the completion of deferred_init_memmap().
>> */
>> for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
>> struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
>>
>> if (WARN_ON_ONCE(!nmask))
>> - return;
>> + return 0;
>>
>> - omask = rcu_dereference(housekeeping.cpumasks[type]);
>> + omask = rcu_dereference_check(housekeeping.cpumasks[type], 1);
>>
>> /* We need at least one CPU to handle housekeeping work */
>> WARN_ON_ONCE(cpumask_empty(omask));
>> @@ -191,7 +199,9 @@ void __init housekeeping_init(void)
>> RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
>> memblock_free(omask, cpumask_size());
>> }
>> + return 0;
>> }
>> +pure_initcall(housekeeping_late_init);
> Now that it has become an initcall, what prevents other CPUs from accessing
> the housekeeping cpumasks concurrently and then dereference the free'ed memory?
>
> And even as a pre-smp initcall, nothing prevents interrupts or preempting tasks
> from accessing them either.
>
> From which point on can memblock_free() be called? I see very early calls such
> as print_unknown_bootoptions()...
You are right. I should have just deferred the memblock_free() call.
Will send a v3 to fix that.
Thanks,
Longman
© 2016 - 2026 Red Hat, Inc.