kernel/time/tick-common.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-)
After the recent commit 5097cbcb38e6 ("sched/isolation: Prevent boot
crash when the boot CPU is nohz_full") the kernel no longer crashes, but
there is another problem.
In this case tick_setup_device() calls tick_take_do_timer_from_boot() to
update tick_do_timer_cpu and this triggers the WARN_ON_ONCE(irqs_disabled)
in smp_call_function_single().
Kill tick_take_do_timer_from_boot() and just use WRITE_ONCE(), the new
comment tries to explain why this is safe (thanks Thomas!).
Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
Link: https://lore.kernel.org/all/20240522151742.GA10400@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/time/tick-common.c | 39 +++++++++++++--------------------------
1 file changed, 13 insertions(+), 26 deletions(-)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d88b13076b79..27d0018c8b05 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -178,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
}
}
-#ifdef CONFIG_NO_HZ_FULL
-static void giveup_do_timer(void *info)
-{
- int cpu = *(unsigned int *)info;
-
- WARN_ON(tick_do_timer_cpu != smp_processor_id());
-
- tick_do_timer_cpu = cpu;
-}
-
-static void tick_take_do_timer_from_boot(void)
-{
- int cpu = smp_processor_id();
- int from = tick_do_timer_boot_cpu;
-
- if (from >= 0 && from != cpu)
- smp_call_function_single(from, giveup_do_timer, &cpu, 1);
-}
-#endif
-
/*
* Setup the tick device
*/
@@ -221,19 +201,26 @@ static void tick_setup_device(struct tick_device *td,
tick_next_period = ktime_get();
#ifdef CONFIG_NO_HZ_FULL
/*
- * The boot CPU may be nohz_full, in which case set
- * tick_do_timer_boot_cpu so the first housekeeping
- * secondary that comes up will take do_timer from
- * us.
+ * The boot CPU may be nohz_full, in which case the
+ * first housekeeping secondary will take do_timer()
+ * from us.
*/
if (tick_nohz_full_cpu(cpu))
tick_do_timer_boot_cpu = cpu;
} else if (tick_do_timer_boot_cpu != -1 &&
!tick_nohz_full_cpu(cpu)) {
- tick_take_do_timer_from_boot();
tick_do_timer_boot_cpu = -1;
- WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu);
+ /*
+ * The boot CPU will stay in periodic (NOHZ disabled)
+ * mode until clocksource_done_booting() called after
+ * smp_init() selects a high resolution clocksource and
+ * timekeeping_notify() kicks the NOHZ stuff alive.
+ *
+ * So this WRITE_ONCE can only race with the READ_ONCE
+ * check in tick_periodic() but this race is harmless.
+ */
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
#endif
}
--
2.25.1.362.g51ebf55
Thomas, et al,
should I redo and/or resend this patch (and the next one on top of it) ?
Or what else can I do to fix this problem?
Thanks,
Oleg.
On 05/28, Oleg Nesterov wrote:
>
> After the recent commit 5097cbcb38e6 ("sched/isolation: Prevent boot
> crash when the boot CPU is nohz_full") the kernel no longer crashes, but
> there is another problem.
>
> In this case tick_setup_device() calls tick_take_do_timer_from_boot() to
> update tick_do_timer_cpu and this triggers the WARN_ON_ONCE(irqs_disabled)
> in smp_call_function_single().
>
> Kill tick_take_do_timer_from_boot() and just use WRITE_ONCE(), the new
> comment tries to explain why this is safe (thanks Thomas!).
>
> Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
> Link: https://lore.kernel.org/all/20240522151742.GA10400@redhat.com
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> kernel/time/tick-common.c | 39 +++++++++++++--------------------------
> 1 file changed, 13 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index d88b13076b79..27d0018c8b05 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -178,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
> }
> }
>
> -#ifdef CONFIG_NO_HZ_FULL
> -static void giveup_do_timer(void *info)
> -{
> - int cpu = *(unsigned int *)info;
> -
> - WARN_ON(tick_do_timer_cpu != smp_processor_id());
> -
> - tick_do_timer_cpu = cpu;
> -}
> -
> -static void tick_take_do_timer_from_boot(void)
> -{
> - int cpu = smp_processor_id();
> - int from = tick_do_timer_boot_cpu;
> -
> - if (from >= 0 && from != cpu)
> - smp_call_function_single(from, giveup_do_timer, &cpu, 1);
> -}
> -#endif
> -
> /*
> * Setup the tick device
> */
> @@ -221,19 +201,26 @@ static void tick_setup_device(struct tick_device *td,
> tick_next_period = ktime_get();
> #ifdef CONFIG_NO_HZ_FULL
> /*
> - * The boot CPU may be nohz_full, in which case set
> - * tick_do_timer_boot_cpu so the first housekeeping
> - * secondary that comes up will take do_timer from
> - * us.
> + * The boot CPU may be nohz_full, in which case the
> + * first housekeeping secondary will take do_timer()
> + * from us.
> */
> if (tick_nohz_full_cpu(cpu))
> tick_do_timer_boot_cpu = cpu;
>
> } else if (tick_do_timer_boot_cpu != -1 &&
> !tick_nohz_full_cpu(cpu)) {
> - tick_take_do_timer_from_boot();
> tick_do_timer_boot_cpu = -1;
> - WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu);
> + /*
> + * The boot CPU will stay in periodic (NOHZ disabled)
> + * mode until clocksource_done_booting() called after
> + * smp_init() selects a high resolution clocksource and
> + * timekeeping_notify() kicks the NOHZ stuff alive.
> + *
> + * So this WRITE_ONCE can only race with the READ_ONCE
> + * check in tick_periodic() but this race is harmless.
> + */
> + WRITE_ONCE(tick_do_timer_cpu, cpu);
> #endif
> }
>
> --
> 2.25.1.362.g51ebf55
>
On Mon, Jun 10 2024 at 17:55, Oleg Nesterov wrote:
> should I redo and/or resend this patch (and the next one on top of it) ?
I lost track of the maze in this discussion. As the patch is obviously
correct, I just apply it as is and any cosmetic changes can go on top.
Thanks,
tglx
Le Tue, May 28, 2024 at 02:20:19PM +0200, Oleg Nesterov a écrit :
> After the recent commit 5097cbcb38e6 ("sched/isolation: Prevent boot
> crash when the boot CPU is nohz_full") the kernel no longer crashes, but
> there is another problem.
>
> In this case tick_setup_device() calls tick_take_do_timer_from_boot() to
> update tick_do_timer_cpu and this triggers the WARN_ON_ONCE(irqs_disabled)
> in smp_call_function_single().
>
> Kill tick_take_do_timer_from_boot() and just use WRITE_ONCE(), the new
> comment tries to explain why this is safe (thanks Thomas!).
>
> Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
> Link: https://lore.kernel.org/all/20240522151742.GA10400@redhat.com
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> kernel/time/tick-common.c | 39 +++++++++++++--------------------------
> 1 file changed, 13 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index d88b13076b79..27d0018c8b05 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -178,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
> }
> }
>
> -#ifdef CONFIG_NO_HZ_FULL
> -static void giveup_do_timer(void *info)
> -{
> - int cpu = *(unsigned int *)info;
> -
> - WARN_ON(tick_do_timer_cpu != smp_processor_id());
> -
> - tick_do_timer_cpu = cpu;
> -}
> -
> -static void tick_take_do_timer_from_boot(void)
> -{
> - int cpu = smp_processor_id();
> - int from = tick_do_timer_boot_cpu;
> -
> - if (from >= 0 && from != cpu)
> - smp_call_function_single(from, giveup_do_timer, &cpu, 1);
> -}
> -#endif
> -
> /*
> * Setup the tick device
> */
> @@ -221,19 +201,26 @@ static void tick_setup_device(struct tick_device *td,
> tick_next_period = ktime_get();
> #ifdef CONFIG_NO_HZ_FULL
> /*
> - * The boot CPU may be nohz_full, in which case set
> - * tick_do_timer_boot_cpu so the first housekeeping
> - * secondary that comes up will take do_timer from
> - * us.
> + * The boot CPU may be nohz_full, in which case the
> + * first housekeeping secondary will take do_timer()
> + * from us.
> */
> if (tick_nohz_full_cpu(cpu))
> tick_do_timer_boot_cpu = cpu;
>
> } else if (tick_do_timer_boot_cpu != -1 &&
> !tick_nohz_full_cpu(cpu)) {
> - tick_take_do_timer_from_boot();
> tick_do_timer_boot_cpu = -1;
> - WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu);
> + /*
> + * The boot CPU will stay in periodic (NOHZ disabled)
> + * mode until clocksource_done_booting() called after
> + * smp_init() selects a high resolution clocksource and
> + * timekeeping_notify() kicks the NOHZ stuff alive.
> + *
> + * So this WRITE_ONCE can only race with the READ_ONCE
> + * check in tick_periodic() but this race is harmless.
> + */
> + WRITE_ONCE(tick_do_timer_cpu, cpu);
Looks good, but can we have a WARN_ONCE(tick_do_timer_cpu != tick_do_timer_boot_cpu)
right before that, just to make sure our assumptions above are right forever and
the boot CPU hasn't stopped the tick up to that point?
And after all, pushing a bit further your subsequent patch, can we get rid of
tick_do_timer_boot_cpu and ifdefery altogether? Such as:
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index fb0fdec8719a..63a7bd405de7 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -48,14 +48,6 @@ ktime_t tick_next_period;
* procedure also covers cpu hotplug.
*/
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
-#ifdef CONFIG_NO_HZ_FULL
-/*
- * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
- * tick_do_timer_cpu and it should be taken over by an eligible secondary
- * when one comes online.
- */
-static int tick_do_timer_boot_cpu __read_mostly = -1;
-#endif
/*
* Debugging: see timer_list.c
@@ -177,26 +169,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
}
}
-#ifdef CONFIG_NO_HZ_FULL
-static void giveup_do_timer(void *info)
-{
- int cpu = *(unsigned int *)info;
-
- WARN_ON(tick_do_timer_cpu != smp_processor_id());
-
- tick_do_timer_cpu = cpu;
-}
-
-static void tick_take_do_timer_from_boot(void)
-{
- int cpu = smp_processor_id();
- int from = tick_do_timer_boot_cpu;
-
- if (from >= 0 && from != cpu)
- smp_call_function_single(from, giveup_do_timer, &cpu, 1);
-}
-#endif
-
/*
* Setup the tick device
*/
@@ -211,29 +183,28 @@ static void tick_setup_device(struct tick_device *td,
* First device setup ?
*/
if (!td->evtdev) {
+ int timekeeper = READ_ONCE(tick_do_timer_cpu);
/*
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
- if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ if (timekeeper == TICK_DO_TIMER_BOOT) {
tick_do_timer_cpu = cpu;
tick_next_period = ktime_get();
-#ifdef CONFIG_NO_HZ_FULL
+ } else if (timekeeper == TICK_DO_TIMER_NONE) {
+ if (WARN_ON_ONCE(tick_nohz_full_enabled()))
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
+ } else if (tick_nohz_full_cpu(timekeeper) && !tick_nohz_full_cpu(cpu)) {
/*
- * The boot CPU may be nohz_full, in which case set
- * tick_do_timer_boot_cpu so the first housekeeping
- * secondary that comes up will take do_timer from
- * us.
+ * The boot CPU will stay in periodic (NOHZ disabled)
+ * mode until clocksource_done_booting() called after
+ * smp_init() selects a high resolution clocksource and
+ * timekeeping_notify() kicks the NOHZ stuff alive.
+ *
+ * So this WRITE_ONCE can only race with the READ_ONCE
+ * check in tick_periodic() but this race is harmless.
*/
- if (tick_nohz_full_cpu(cpu))
- tick_do_timer_boot_cpu = cpu;
-
- } else if (tick_do_timer_boot_cpu != -1 &&
- !tick_nohz_full_cpu(cpu)) {
- tick_take_do_timer_from_boot();
- tick_do_timer_boot_cpu = -1;
- WARN_ON(tick_do_timer_cpu != cpu);
-#endif
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
}
/*
Hi Frederic,
First of all, can we please make the additional changes you suggest on top of
this patch? I'd prefer to keep it as simple as possible, I will need to backport
it and I'd like to simplify the internal review.
On 05/30, Frederic Weisbecker wrote:
>
> And after all, pushing a bit further your subsequent patch, can we get rid of
> tick_do_timer_boot_cpu and ifdefery altogether? Such as:
Sure, I thought about this from the very beginning, see
https://lore.kernel.org/all/20240525135120.GA24152@redhat.com/
and the changelog in
[PATCH] tick/nohz_full: turn tick_do_timer_boot_cpu into boot_cpu_is_nohz_full
https://lore.kernel.org/all/20240530124032.GA26833@redhat.com/
on top of this patch.
And yes, in this case it is better to check that tick_do_timer_cpu != _NONE to
ensure that tick_nohz_full_cpu(tick_cpu) can't crash.
So I considered the change which is very close to yours, except
> + } else if (timekeeper == TICK_DO_TIMER_NONE) {
> + if (WARN_ON_ONCE(tick_nohz_full_enabled()))
> + WRITE_ONCE(tick_do_timer_cpu, cpu);
I don't think we need to change tick_do_timer_cpu in this case.
And I am not sure we need to check tick_nohz_full_enabled() here.
IOW, I was thinking about
if (!td->evtdev) {
int tick_cpu = READ_ONCE(tick_do_timer_cpu);
/*
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
if (tick_cpu == TICK_DO_TIMER_BOOT) {
WRITE_ONCE(tick_do_timer_cpu, cpu);
tick_next_period = ktime_get();
/*
* The boot CPU may be nohz_full, in which case the
* first housekeeping secondary will take do_timer()
* from us.
*/
} else if (!WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) &&
tick_nohz_full_cpu(tick_cpu) &&
!tick_nohz_full_cpu(cpu)) {
/*
* The boot CPU will stay in periodic (NOHZ disabled)
* mode until clocksource_done_booting() called after
* smp_init() selects a high resolution clocksource and
* timekeeping_notify() kicks the NOHZ stuff alive.
*
* So this WRITE_ONCE can only race with the READ_ONCE
* check in tick_periodic() but this race is harmless.
*/
WRITE_ONCE(tick_do_timer_cpu, cpu);
}
But you know, somehow I like
[PATCH] tick/nohz_full: turn tick_do_timer_boot_cpu into boot_cpu_is_nohz_full
https://lore.kernel.org/all/20240530124032.GA26833@redhat.com/
a bit more, to me the code looks more understandable this way.
Note that this patch doesn't really need to keep #ifdef CONFIG_NO_HZ_FULL,
if (!td->evtdev) {
static bool boot_cpu_is_nohz_full;
/*
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) {
WRITE_ONCE(tick_do_timer_cpu, cpu);
tick_next_period = ktime_get();
/*
* The boot CPU may be nohz_full, in which case the
* first housekeeping secondary will take do_timer()
* from us.
*/
boot_cpu_is_nohz_full = tick_nohz_full_cpu(cpu);
} else if (boot_cpu_is_nohz_full && !tick_nohz_full_cpu(cpu)) {
boot_cpu_is_nohz_full = false;
/*
* The boot CPU will stay in periodic (NOHZ disabled)
* mode until clocksource_done_booting() called after
* smp_init() selects a high resolution clocksource and
* timekeeping_notify() kicks the NOHZ stuff alive.
*
* So this WRITE_ONCE can only race with the READ_ONCE
* check in tick_periodic() but this race is harmless.
*/
WRITE_ONCE(tick_do_timer_cpu, cpu);
}
should work without #ifdef.
In this case I don't think we need the _NONE check, tick_sched_do_timer() will
complain.
But I won't argue. I will be happy to make V2 which follows your recommendations
but again, can I do this on top of this patch?
Oleg.
Le Sat, Jun 01, 2024 at 04:03:22PM +0200, Oleg Nesterov a écrit :
> Hi Frederic,
>
> First of all, can we please make the additional changes you suggest on top of
> this patch? I'd prefer to keep it as simple as possible, I will need to backport
> it and I'd like to simplify the internal review.
Sure!
>
> On 05/30, Frederic Weisbecker wrote:
> >
> > And after all, pushing a bit further your subsequent patch, can we get rid of
> > tick_do_timer_boot_cpu and ifdefery altogether? Such as:
>
> Sure, I thought about this from the very beginning, see
> https://lore.kernel.org/all/20240525135120.GA24152@redhat.com/
> and the changelog in
> [PATCH] tick/nohz_full: turn tick_do_timer_boot_cpu into boot_cpu_is_nohz_full
> https://lore.kernel.org/all/20240530124032.GA26833@redhat.com/
> on top of this patch.
>
> And yes, in this case it is better to check that tick_do_timer_cpu != _NONE to
> ensure that tick_nohz_full_cpu(tick_cpu) can't crash.
>
> So I considered the change which is very close to yours, except
>
> > + } else if (timekeeper == TICK_DO_TIMER_NONE) {
> > + if (WARN_ON_ONCE(tick_nohz_full_enabled()))
> > + WRITE_ONCE(tick_do_timer_cpu, cpu);
>
> I don't think we need to change tick_do_timer_cpu in this case.
> And I am not sure we need to check tick_nohz_full_enabled() here.
> IOW, I was thinking about
Hmm, in case of cpu-hotplug operations (that is after boot), we may be
past nohz enablement and therefore it might be TICK_DO_TIMER_NONE.
>
> if (!td->evtdev) {
> int tick_cpu = READ_ONCE(tick_do_timer_cpu);
> /*
> * If no cpu took the do_timer update, assign it to
> * this cpu:
> */
> if (tick_cpu == TICK_DO_TIMER_BOOT) {
> WRITE_ONCE(tick_do_timer_cpu, cpu);
> tick_next_period = ktime_get();
> /*
> * The boot CPU may be nohz_full, in which case the
> * first housekeeping secondary will take do_timer()
> * from us.
> */
> } else if (!WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) &&
> tick_nohz_full_cpu(tick_cpu) &&
> !tick_nohz_full_cpu(cpu)) {
> /*
> * The boot CPU will stay in periodic (NOHZ disabled)
> * mode until clocksource_done_booting() called after
> * smp_init() selects a high resolution clocksource and
> * timekeeping_notify() kicks the NOHZ stuff alive.
> *
> * So this WRITE_ONCE can only race with the READ_ONCE
> * check in tick_periodic() but this race is harmless.
> */
> WRITE_ONCE(tick_do_timer_cpu, cpu);
> }
>
> But you know, somehow I like
> [PATCH] tick/nohz_full: turn tick_do_timer_boot_cpu into boot_cpu_is_nohz_full
> https://lore.kernel.org/all/20240530124032.GA26833@redhat.com/
> a bit more, to me the code looks more understandable this way.
>
> Note that this patch doesn't really need to keep #ifdef CONFIG_NO_HZ_FULL,
>
> if (!td->evtdev) {
> static bool boot_cpu_is_nohz_full;
> /*
> * If no cpu took the do_timer update, assign it to
> * this cpu:
> */
> if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) {
> WRITE_ONCE(tick_do_timer_cpu, cpu);
> tick_next_period = ktime_get();
> /*
> * The boot CPU may be nohz_full, in which case the
> * first housekeeping secondary will take do_timer()
> * from us.
> */
> boot_cpu_is_nohz_full = tick_nohz_full_cpu(cpu);
> } else if (boot_cpu_is_nohz_full && !tick_nohz_full_cpu(cpu)) {
> boot_cpu_is_nohz_full = false;
> /*
> * The boot CPU will stay in periodic (NOHZ disabled)
> * mode until clocksource_done_booting() called after
> * smp_init() selects a high resolution clocksource and
> * timekeeping_notify() kicks the NOHZ stuff alive.
> *
> * So this WRITE_ONCE can only race with the READ_ONCE
> * check in tick_periodic() but this race is harmless.
> */
> WRITE_ONCE(tick_do_timer_cpu, cpu);
> }
>
> should work without #ifdef.
>
> In this case I don't think we need the _NONE check, tick_sched_do_timer() will
> complain.
Right...
>
> But I won't argue. I will be happy to make V2 which follows your recommendations
> but again, can I do this on top of this patch?
I guess the static version above should work to remove the ifdef. And yes on top is fine.
Thanks!
On 06/02, Frederic Weisbecker wrote: > > I guess the static version above should work to remove the ifdef. And yes on top is fine. OK, I've sent v2. But again, I won't argue if you prefer to keep tick_do_timer_boot_cpu and add WARN_ON_ONCE(tick_cpu != tick_do_timer_boot_cpu) before WRITE_ONCE(tick_do_timer_cpu). In this case another patch makes no sense, I'll update this one. Just tell me what you like more. Sorry for the chaotic emails. Oleg.
Le Mon, Jun 03, 2024 at 05:41:33PM +0200, Oleg Nesterov a écrit : > On 06/02, Frederic Weisbecker wrote: > > > > I guess the static version above should work to remove the ifdef. And yes on top is fine. > > OK, I've sent v2. > > But again, I won't argue if you prefer to keep tick_do_timer_boot_cpu and add > WARN_ON_ONCE(tick_cpu != tick_do_timer_boot_cpu) before WRITE_ONCE(tick_do_timer_cpu). > In this case another patch makes no sense, I'll update this one. > > Just tell me what you like more. Sorry for the chaotic emails. I'm fine with the last one posted :-) Thanks.
On 05/30, Frederic Weisbecker wrote: > > Looks good, but can we have a WARN_ONCE(tick_do_timer_cpu != tick_do_timer_boot_cpu) > right before that, just to make sure our assumptions above are right forever and > the boot CPU hasn't stopped the tick up to that point? Sure, I thought about the additional sanity checks too. Although I had something different in mind. Frederic, et al, I am on private trip again without my working laptop, can't read the code. I'll reply on Saturday, OK? Thanks for review! Oleg.
Frederic,
Thanks for review.
On 05/30, Frederic Weisbecker wrote:
>
> Looks good, but can we have a WARN_ONCE(tick_do_timer_cpu != tick_do_timer_boot_cpu)
> right before that, just to make sure our assumptions above are right forever and
> the boot CPU hasn't stopped the tick up to that point?
Sure, I thought about the additional sanity checks too. Although I had something
different in mind.
Frederic, et al, I am on private trip again without my working laptop, can't read
the code. I'll reply on Saturday, OK?
Oleg.
>
> And after all, pushing a bit further your subsequent patch, can we get rid of
> tick_do_timer_boot_cpu and ifdefery altogether? Such as:
>
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index fb0fdec8719a..63a7bd405de7 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -48,14 +48,6 @@ ktime_t tick_next_period;
> * procedure also covers cpu hotplug.
> */
> int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> -#ifdef CONFIG_NO_HZ_FULL
> -/*
> - * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> - * tick_do_timer_cpu and it should be taken over by an eligible secondary
> - * when one comes online.
> - */
> -static int tick_do_timer_boot_cpu __read_mostly = -1;
> -#endif
>
> /*
> * Debugging: see timer_list.c
> @@ -177,26 +169,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
> }
> }
>
> -#ifdef CONFIG_NO_HZ_FULL
> -static void giveup_do_timer(void *info)
> -{
> - int cpu = *(unsigned int *)info;
> -
> - WARN_ON(tick_do_timer_cpu != smp_processor_id());
> -
> - tick_do_timer_cpu = cpu;
> -}
> -
> -static void tick_take_do_timer_from_boot(void)
> -{
> - int cpu = smp_processor_id();
> - int from = tick_do_timer_boot_cpu;
> -
> - if (from >= 0 && from != cpu)
> - smp_call_function_single(from, giveup_do_timer, &cpu, 1);
> -}
> -#endif
> -
> /*
> * Setup the tick device
> */
> @@ -211,29 +183,28 @@ static void tick_setup_device(struct tick_device *td,
> * First device setup ?
> */
> if (!td->evtdev) {
> + int timekeeper = READ_ONCE(tick_do_timer_cpu);
> /*
> * If no cpu took the do_timer update, assign it to
> * this cpu:
> */
> - if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
> + if (timekeeper == TICK_DO_TIMER_BOOT) {
> tick_do_timer_cpu = cpu;
> tick_next_period = ktime_get();
> -#ifdef CONFIG_NO_HZ_FULL
> + } else if (timekeeper == TICK_DO_TIMER_NONE) {
> + if (WARN_ON_ONCE(tick_nohz_full_enabled()))
> + WRITE_ONCE(tick_do_timer_cpu, cpu);
> + } else if (tick_nohz_full_cpu(timekeeper) && !tick_nohz_full_cpu(cpu)) {
> /*
> - * The boot CPU may be nohz_full, in which case set
> - * tick_do_timer_boot_cpu so the first housekeeping
> - * secondary that comes up will take do_timer from
> - * us.
> + * The boot CPU will stay in periodic (NOHZ disabled)
> + * mode until clocksource_done_booting() called after
> + * smp_init() selects a high resolution clocksource and
> + * timekeeping_notify() kicks the NOHZ stuff alive.
> + *
> + * So this WRITE_ONCE can only race with the READ_ONCE
> + * check in tick_periodic() but this race is harmless.
> */
> - if (tick_nohz_full_cpu(cpu))
> - tick_do_timer_boot_cpu = cpu;
> -
> - } else if (tick_do_timer_boot_cpu != -1 &&
> - !tick_nohz_full_cpu(cpu)) {
> - tick_take_do_timer_from_boot();
> - tick_do_timer_boot_cpu = -1;
> - WARN_ON(tick_do_timer_cpu != cpu);
> -#endif
> + WRITE_ONCE(tick_do_timer_cpu, cpu);
> }
>
> /*
>
If this patch/changelog/comment is fine, I'll send another trivial
one which turns tick_do_timer_boot_cpu into "bool boot_cpu_is_nohz_full".
On 05/28, Oleg Nesterov wrote:
>
> After the recent commit 5097cbcb38e6 ("sched/isolation: Prevent boot
> crash when the boot CPU is nohz_full") the kernel no longer crashes, but
> there is another problem.
>
> In this case tick_setup_device() calls tick_take_do_timer_from_boot() to
> update tick_do_timer_cpu and this triggers the WARN_ON_ONCE(irqs_disabled)
> in smp_call_function_single().
>
> Kill tick_take_do_timer_from_boot() and just use WRITE_ONCE(), the new
> comment tries to explain why this is safe (thanks Thomas!).
>
> Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
> Link: https://lore.kernel.org/all/20240522151742.GA10400@redhat.com
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> kernel/time/tick-common.c | 39 +++++++++++++--------------------------
> 1 file changed, 13 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index d88b13076b79..27d0018c8b05 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -178,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
> }
> }
>
> -#ifdef CONFIG_NO_HZ_FULL
> -static void giveup_do_timer(void *info)
> -{
> - int cpu = *(unsigned int *)info;
> -
> - WARN_ON(tick_do_timer_cpu != smp_processor_id());
> -
> - tick_do_timer_cpu = cpu;
> -}
> -
> -static void tick_take_do_timer_from_boot(void)
> -{
> - int cpu = smp_processor_id();
> - int from = tick_do_timer_boot_cpu;
> -
> - if (from >= 0 && from != cpu)
> - smp_call_function_single(from, giveup_do_timer, &cpu, 1);
> -}
> -#endif
> -
> /*
> * Setup the tick device
> */
> @@ -221,19 +201,26 @@ static void tick_setup_device(struct tick_device *td,
> tick_next_period = ktime_get();
> #ifdef CONFIG_NO_HZ_FULL
> /*
> - * The boot CPU may be nohz_full, in which case set
> - * tick_do_timer_boot_cpu so the first housekeeping
> - * secondary that comes up will take do_timer from
> - * us.
> + * The boot CPU may be nohz_full, in which case the
> + * first housekeeping secondary will take do_timer()
> + * from us.
> */
> if (tick_nohz_full_cpu(cpu))
> tick_do_timer_boot_cpu = cpu;
>
> } else if (tick_do_timer_boot_cpu != -1 &&
> !tick_nohz_full_cpu(cpu)) {
> - tick_take_do_timer_from_boot();
> tick_do_timer_boot_cpu = -1;
> - WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu);
> + /*
> + * The boot CPU will stay in periodic (NOHZ disabled)
> + * mode until clocksource_done_booting() called after
> + * smp_init() selects a high resolution clocksource and
> + * timekeeping_notify() kicks the NOHZ stuff alive.
> + *
> + * So this WRITE_ONCE can only race with the READ_ONCE
> + * check in tick_periodic() but this race is harmless.
> + */
> + WRITE_ONCE(tick_do_timer_cpu, cpu);
> #endif
> }
>
> --
> 2.25.1.362.g51ebf55
>
The following commit has been merged into the timers/urgent branch of tip:
Commit-ID: 07c54cc5988f19c9642fd463c2dbdac7fc52f777
Gitweb: https://git.kernel.org/tip/07c54cc5988f19c9642fd463c2dbdac7fc52f777
Author: Oleg Nesterov <oleg@redhat.com>
AuthorDate: Tue, 28 May 2024 14:20:19 +02:00
Committer: Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Mon, 10 Jun 2024 20:18:13 +02:00
tick/nohz_full: Don't abuse smp_call_function_single() in tick_setup_device()
After the recent commit 5097cbcb38e6 ("sched/isolation: Prevent boot crash
when the boot CPU is nohz_full") the kernel no longer crashes, but there is
another problem.
In this case tick_setup_device() calls tick_take_do_timer_from_boot() to
update tick_do_timer_cpu and this triggers the WARN_ON_ONCE(irqs_disabled)
in smp_call_function_single().
Kill tick_take_do_timer_from_boot() and just use WRITE_ONCE(), the new
comment explains why this is safe (thanks Thomas!).
Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240528122019.GA28794@redhat.com
Link: https://lore.kernel.org/all/20240522151742.GA10400@redhat.com
---
kernel/time/tick-common.c | 42 ++++++++++++--------------------------
1 file changed, 14 insertions(+), 28 deletions(-)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d88b130..a47bcf7 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -178,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
}
}
-#ifdef CONFIG_NO_HZ_FULL
-static void giveup_do_timer(void *info)
-{
- int cpu = *(unsigned int *)info;
-
- WARN_ON(tick_do_timer_cpu != smp_processor_id());
-
- tick_do_timer_cpu = cpu;
-}
-
-static void tick_take_do_timer_from_boot(void)
-{
- int cpu = smp_processor_id();
- int from = tick_do_timer_boot_cpu;
-
- if (from >= 0 && from != cpu)
- smp_call_function_single(from, giveup_do_timer, &cpu, 1);
-}
-#endif
-
/*
* Setup the tick device
*/
@@ -221,19 +201,25 @@ static void tick_setup_device(struct tick_device *td,
tick_next_period = ktime_get();
#ifdef CONFIG_NO_HZ_FULL
/*
- * The boot CPU may be nohz_full, in which case set
- * tick_do_timer_boot_cpu so the first housekeeping
- * secondary that comes up will take do_timer from
- * us.
+ * The boot CPU may be nohz_full, in which case the
+ * first housekeeping secondary will take do_timer()
+ * from it.
*/
if (tick_nohz_full_cpu(cpu))
tick_do_timer_boot_cpu = cpu;
- } else if (tick_do_timer_boot_cpu != -1 &&
- !tick_nohz_full_cpu(cpu)) {
- tick_take_do_timer_from_boot();
+ } else if (tick_do_timer_boot_cpu != -1 && !tick_nohz_full_cpu(cpu)) {
tick_do_timer_boot_cpu = -1;
- WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu);
+ /*
+ * The boot CPU will stay in periodic (NOHZ disabled)
+ * mode until clocksource_done_booting() called after
+ * smp_init() selects a high resolution clocksource and
+ * timekeeping_notify() kicks the NOHZ stuff alive.
+ *
+ * So this WRITE_ONCE can only race with the READ_ONCE
+ * check in tick_periodic() but this race is harmless.
+ */
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
#endif
}
Le Mon, Jun 10, 2024 at 06:26:21PM -0000, tip-bot2 for Oleg Nesterov a écrit :
> The following commit has been merged into the timers/urgent branch of tip:
>
> Commit-ID: 07c54cc5988f19c9642fd463c2dbdac7fc52f777
> Gitweb: https://git.kernel.org/tip/07c54cc5988f19c9642fd463c2dbdac7fc52f777
> Author: Oleg Nesterov <oleg@redhat.com>
> AuthorDate: Tue, 28 May 2024 14:20:19 +02:00
> Committer: Thomas Gleixner <tglx@linutronix.de>
> CommitterDate: Mon, 10 Jun 2024 20:18:13 +02:00
>
> tick/nohz_full: Don't abuse smp_call_function_single() in tick_setup_device()
>
> After the recent commit 5097cbcb38e6 ("sched/isolation: Prevent boot crash
> when the boot CPU is nohz_full") the kernel no longer crashes, but there is
> another problem.
>
> In this case tick_setup_device() calls tick_take_do_timer_from_boot() to
> update tick_do_timer_cpu and this triggers the WARN_ON_ONCE(irqs_disabled)
> in smp_call_function_single().
>
> Kill tick_take_do_timer_from_boot() and just use WRITE_ONCE(), the new
> comment explains why this is safe (thanks Thomas!).
>
> Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: stable@vger.kernel.org
> Link: https://lore.kernel.org/r/20240528122019.GA28794@redhat.com
> Link: https://lore.kernel.org/all/20240522151742.GA10400@redhat.com
I think we agreed on that version actually:
https://lore.kernel.org/all/20240603153557.GA8311@redhat.com/
Thanks.
We don't need to record the cpu number of the nohz_full boot CPU, a simple
boolean is enough.
We could even kill it along with #ifdef, the "else if" branch could check
tick_nohz_full_cpu(tick_do_timer_cpu) && !tick_nohz_full_cpu(cpu).
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/time/tick-common.c | 24 ++++++++----------------
1 file changed, 8 insertions(+), 16 deletions(-)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 27d0018c8b05..9829a2cda072 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,15 +49,6 @@ ktime_t tick_next_period;
* procedure also covers cpu hotplug.
*/
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
-#ifdef CONFIG_NO_HZ_FULL
-/*
- * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
- * tick_do_timer_cpu and it should be taken over by an eligible secondary
- * when one comes online.
- */
-static int tick_do_timer_boot_cpu __read_mostly = -1;
-#endif
-
/*
* Debugging: see timer_list.c
*/
@@ -178,9 +169,13 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
}
}
+#ifdef CONFIG_NO_HZ_FULL
/*
- * Setup the tick device
+ * Iindicates that the boot CPU temporarily owns tick_do_timer_cpu.
*/
+static bool boot_cpu_is_nohz_full __read_mostly;
+#endif
+
static void tick_setup_device(struct tick_device *td,
struct clock_event_device *newdev, int cpu,
const struct cpumask *cpumask)
@@ -205,12 +200,9 @@ static void tick_setup_device(struct tick_device *td,
* first housekeeping secondary will take do_timer()
* from us.
*/
- if (tick_nohz_full_cpu(cpu))
- tick_do_timer_boot_cpu = cpu;
-
- } else if (tick_do_timer_boot_cpu != -1 &&
- !tick_nohz_full_cpu(cpu)) {
- tick_do_timer_boot_cpu = -1;
+ boot_cpu_is_nohz_full = tick_nohz_full_cpu(cpu);
+ } else if (boot_cpu_is_nohz_full && !tick_nohz_full_cpu(cpu)) {
+ boot_cpu_is_nohz_full = false;
/*
* The boot CPU will stay in periodic (NOHZ disabled)
* mode until clocksource_done_booting() called after
--
2.25.1.362.g51ebf55
We don't need to record the cpu number of the nohz_full boot CPU, a simple
boolean is enough. We could even kill it, the "else if" branch could check
tick_nohz_full_cpu(tick_do_timer_cpu) && !tick_nohz_full_cpu(cpu).
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/time/tick-common.c | 21 ++++-----------------
1 file changed, 4 insertions(+), 17 deletions(-)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 27d0018c8b05..82c1b2206631 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,15 +49,6 @@ ktime_t tick_next_period;
* procedure also covers cpu hotplug.
*/
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
-#ifdef CONFIG_NO_HZ_FULL
-/*
- * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
- * tick_do_timer_cpu and it should be taken over by an eligible secondary
- * when one comes online.
- */
-static int tick_do_timer_boot_cpu __read_mostly = -1;
-#endif
-
/*
* Debugging: see timer_list.c
*/
@@ -192,6 +183,7 @@ static void tick_setup_device(struct tick_device *td,
* First device setup ?
*/
if (!td->evtdev) {
+ static bool boot_cpu_is_nohz_full __read_mostly;
/*
* If no cpu took the do_timer update, assign it to
* this cpu:
@@ -199,18 +191,14 @@ static void tick_setup_device(struct tick_device *td,
if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) {
WRITE_ONCE(tick_do_timer_cpu, cpu);
tick_next_period = ktime_get();
-#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case the
* first housekeeping secondary will take do_timer()
* from us.
*/
- if (tick_nohz_full_cpu(cpu))
- tick_do_timer_boot_cpu = cpu;
-
- } else if (tick_do_timer_boot_cpu != -1 &&
- !tick_nohz_full_cpu(cpu)) {
- tick_do_timer_boot_cpu = -1;
+ boot_cpu_is_nohz_full = tick_nohz_full_cpu(cpu);
+ } else if (boot_cpu_is_nohz_full && !tick_nohz_full_cpu(cpu)) {
+ boot_cpu_is_nohz_full = false;
/*
* The boot CPU will stay in periodic (NOHZ disabled)
* mode until clocksource_done_booting() called after
@@ -221,7 +209,6 @@ static void tick_setup_device(struct tick_device *td,
* check in tick_periodic() but this race is harmless.
*/
WRITE_ONCE(tick_do_timer_cpu, cpu);
-#endif
}
/*
--
2.25.1.362.g51ebf55
On Tue Jun 4, 2024 at 1:35 AM AEST, Oleg Nesterov wrote:
> We don't need to record the cpu number of the nohz_full boot CPU, a simple
> boolean is enough. We could even kill it, the "else if" branch could check
> tick_nohz_full_cpu(tick_do_timer_cpu) && !tick_nohz_full_cpu(cpu).
>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> kernel/time/tick-common.c | 21 ++++-----------------
> 1 file changed, 4 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 27d0018c8b05..82c1b2206631 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -49,15 +49,6 @@ ktime_t tick_next_period;
> * procedure also covers cpu hotplug.
> */
> int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> -#ifdef CONFIG_NO_HZ_FULL
> -/*
> - * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
> - * tick_do_timer_cpu and it should be taken over by an eligible secondary
> - * when one comes online.
> - */
> -static int tick_do_timer_boot_cpu __read_mostly = -1;
> -#endif
> -
> /*
> * Debugging: see timer_list.c
> */
> @@ -192,6 +183,7 @@ static void tick_setup_device(struct tick_device *td,
> * First device setup ?
> */
> if (!td->evtdev) {
> + static bool boot_cpu_is_nohz_full __read_mostly;
> /*
> * If no cpu took the do_timer update, assign it to
> * this cpu:
> @@ -199,18 +191,14 @@ static void tick_setup_device(struct tick_device *td,
> if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) {
> WRITE_ONCE(tick_do_timer_cpu, cpu);
> tick_next_period = ktime_get();
> -#ifdef CONFIG_NO_HZ_FULL
> /*
> * The boot CPU may be nohz_full, in which case the
> * first housekeeping secondary will take do_timer()
> * from us.
> */
> - if (tick_nohz_full_cpu(cpu))
> - tick_do_timer_boot_cpu = cpu;
> -
> - } else if (tick_do_timer_boot_cpu != -1 &&
> - !tick_nohz_full_cpu(cpu)) {
> - tick_do_timer_boot_cpu = -1;
> + boot_cpu_is_nohz_full = tick_nohz_full_cpu(cpu);
> + } else if (boot_cpu_is_nohz_full && !tick_nohz_full_cpu(cpu)) {
> + boot_cpu_is_nohz_full = false;
> /*
> * The boot CPU will stay in periodic (NOHZ disabled)
> * mode until clocksource_done_booting() called after
> @@ -221,7 +209,6 @@ static void tick_setup_device(struct tick_device *td,
> * check in tick_periodic() but this race is harmless.
> */
> WRITE_ONCE(tick_do_timer_cpu, cpu);
> -#endif
> }
>
> /*
Le Mon, Jun 03, 2024 at 05:35:57PM +0200, Oleg Nesterov a écrit : > We don't need to record the cpu number of the nohz_full boot CPU, a simple > boolean is enough. We could even kill it, the "else if" branch could check > tick_nohz_full_cpu(tick_do_timer_cpu) && !tick_nohz_full_cpu(cpu). > > Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Frederic Weisbecker <frederic@kernel.org>
© 2016 - 2025 Red Hat, Inc.