hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress.
However wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those can again result in bandwidth timers
being armed. Depending on several considerations (crystal ball
power management based election, earliest timer already enqueued, timer
migration enabled or not), the target may eventually be the current
CPU even if offline. If that happens, the timer is eventually ignored.
The most notable example is RCU, which had to deal with each and every one
of those wakeups by deferring them to an online CPU, along with related
workarounds:
_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
The problem isn't confined to RCU, though, as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wakeup that
eventually arms the deadline server timer:
WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
Call Trace:
<TASK>
? hrtimer_start_range_ns
start_dl_timer
enqueue_dl_entity
dl_server_start
enqueue_task_fair
enqueue_task
ttwu_do_activate
try_to_wake_up
complete
cpu_stopper_thread
smpboot_thread_fn
kthread
ret_from_fork
ret_from_fork_asm
</TASK>
Instead of providing yet another bandaid to work around the situation,
fix it in the hrtimers infrastructure instead: always migrate away a
timer to an online target whenever it is enqueued from an offline CPU.
This will also allow reverting all the above RCU disgraceful hacks.
Reported-by: Vlad Poenaru <vlad.wing@gmail.com>
Reported-by: Usama Arif <usamaarif642@gmail.com>
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Closes: 20241213203739.1519801-1-usamaarif642@gmail.com
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
include/linux/hrtimer_defs.h | 1 +
kernel/time/hrtimer.c | 92 +++++++++++++++++++++++++++++-------
2 files changed, 75 insertions(+), 18 deletions(-)
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
index c3b4b7ed7c16..84a5045f80f3 100644
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 030426c8c944..082d4b687fa1 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -183,27 +194,58 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool
+hrtimer_suitable_target(struct hrtimer *timer,
+ struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will be issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -254,8 +296,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -264,8 +306,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -716,8 +758,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1206,6 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1217,9 +1258,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
+ /*
+ * Don't force local queuing if this enqueue happens on an unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
/*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
@@ -1249,8 +1296,17 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
--
2.46.0
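To follow the new enqueue policy outside the diff context: an offline base
never elects itself as target, and when the migrated timer becomes the first
expiring one on the remote base, the offline CPU kicks that base to reprogram
its clockevent. Below is a minimal userspace sketch of that flow; every type
and helper in it is a simplified stand-in assumed for illustration only, not
the kernel's real hrtimer API (the authoritative code is the diff above).

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct cpu_base {
	int cpu;
	bool online;
	long long expires_next;		/* earliest queued expiry */
};

static struct cpu_base bases[NR_CPUS];

/* stand-in for cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)) */
static struct cpu_base *any_online_base(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		if (bases[i].online)
			return &bases[i];
	return NULL;			/* unreachable: one CPU always stays online */
}

/* stand-in for get_target_base(): an offline base never elects itself */
static struct cpu_base *get_target_base(struct cpu_base *local)
{
	if (!local->online)
		return any_online_base();
	return local;			/* power-aware election elided */
}

/* stand-in for the enqueue tail of __hrtimer_start_range_ns() */
static void start_timer(struct cpu_base *local, long long expires)
{
	struct cpu_base *target = get_target_base(local);
	bool first = expires < target->expires_next;

	if (first)
		target->expires_next = expires;

	/*
	 * An offline CPU cannot reprogram the remote clockevent itself,
	 * so model the smp_call_function_single_async() kick that runs
	 * retrigger_next_event() on the target.
	 */
	if (!local->online && target != local && first)
		printf("CPU%d offline: IPI CPU%d to reprogram for %lld\n",
		       local->cpu, target->cpu, expires);
}

int main(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		bases[i] = (struct cpu_base){ .cpu = i, .online = true, .expires_next = 1000 };

	bases[3].online = false;	/* CPU3 is past CPUHP_AP_HRTIMERS_DYING */
	start_timer(&bases[3], 500);	/* migrated to CPU0, IPI sent: new first timer */
	start_timer(&bases[3], 2000);	/* migrated, but no IPI: not the first expiring */
	return 0;
}

Only the first call prints the IPI line, mirroring the "if (first)" kick
added at the tail of __hrtimer_start_range_ns().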
On Sat, Jan 18, 2025 at 12:24:33AM +0100, Frederic Weisbecker wrote:
> hrtimers are migrated away from the dying CPU to any online target at
> the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
> handling tasks involved in the CPU hotplug forward progress.
[...]
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
This passes over-holiday testing rcutorture, so, perhaps redundantly:
Tested-by: Paul E. McKenney <paulmck@kernel.org>
On Tue, 2025-01-21 at 09:08 -0800, Paul E. McKenney wrote:
> On Sat, Jan 18, 2025 at 12:24:33AM +0100, Frederic Weisbecker wrote:
> > hrtimers are migrated away from the dying CPU to any online target at
> > the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
> > handling tasks involved in the CPU hotplug forward progress.
[...]
> This passes over-holiday testing rcutorture, so, perhaps redundantly:
>
> Tested-by: Paul E. McKenney <paulmck@kernel.org>
Hi,
I encountered the same issue even after applying this patch.
Below are the details of the warning and call trace.
migration/3: ------------[ cut here ]------------
migration/3: WARNING: CPU: 3 PID: 42 at kernel/time/hrtimer.c:1125 enqueue_hrtimer+0x7c/0xec
migration/3: CPU: 3 UID: 0 PID: 42 Comm: migration/3 Tainted: G OE 6.12.18-android16-0-g59cb5a849beb-4k #1 0b440e43fa7b24aaa3b7e6e5d2b938948e0cacdb
migration/3: Stopper: multi_cpu_stop+0x0/0x184 <- stop_machine_cpuslocked+0xc0/0x15c
migration/3: Call trace:
migration/3: enqueue_hrtimer+0x7c/0xec
migration/3: hrtimer_start_range_ns+0x54c/0x7b4
migration/3: start_dl_timer+0x110/0x188
migration/3: enqueue_dl_entity+0x478/0x80c
migration/3: dl_server_start+0xf8/0x194
migration/3: enqueue_task_fair+0x6c4/0x924
migration/3: enqueue_task+0x70/0x3a4
migration/3: do_activate_task+0xcc/0x178
migration/3: ttwu_do_activate+0xac/0x2e0
migration/3: try_to_wake_up+0x73c/0xaf4
migration/3: wake_up_process+0x18/0x28
migration/3: kick_pool+0xc4/0x170
migration/3: __queue_work+0x44c/0x698
migration/3: delayed_work_timer_fn+0x20/0x30
migration/3: call_timer_fn+0x4c/0x1d0
migration/3: __run_timer_base+0x278/0x350
migration/3: run_timer_softirq+0x78/0x9c
migration/3: handle_softirqs+0x154/0x42c
migration/3: __do_softirq+0x14/0x20
migration/3: ____do_softirq+0x10/0x20
migration/3: call_on_irq_stack+0x3c/0x74
migration/3: do_softirq_own_stack+0x1c/0x2c
migration/3: __irq_exit_rcu+0x5c/0xd4
migration/3: irq_exit_rcu+0x10/0x1c
migration/3: el1_interrupt+0x38/0x58
migration/3: el1h_64_irq_handler+0x18/0x24
migration/3: el1h_64_irq+0x68/0x6c
migration/3: multi_cpu_stop+0x15c/0x184
migration/3: cpu_stopper_thread+0xf8/0x1b0
migration/3: smpboot_thread_fn+0x204/0x304
migration/3: kthread+0x110/0x1a4
migration/3: ret_from_fork+0x10/0x20
migration/3: ---[ end trace 0000000000000000 ]---
kworker/1:1: psci: CPU0 killed (polled 0 ms)
Hi Walter Chang,
On Wed, Mar 26, 2025 at 05:46:38AM +0000, Walter Chang (張維哲) wrote:
> I encountered the same issue even after applying this patch.
> Below are the details of the warning and call trace.
>
> migration/3: ------------[ cut here ]------------
> migration/3: WARNING: CPU: 3 PID: 42 at kernel/time/hrtimer.c:1125 enqueue_hrtimer+0x7c/0xec
> migration/3: CPU: 3 UID: 0 PID: 42 Comm: migration/3 Tainted: G OE 6.12.18-android16-0-g59cb5a849beb-4k #1 0b440e43fa7b24aaa3b7e6e5d2b938948e0cacdb
> migration/3: Stopper: multi_cpu_stop+0x0/0x184 <- stop_machine_cpuslocked+0xc0/0x15c
It's not the first time I've received such a report on an out-of-tree
kernel. The problem is I don't know whether the tainted modules are
involved. But something is probably making an offline CPU visible within
the domain hierarchy in get_nohz_timer_target(), and the new warning made
that visible.
Can you try this and tell us if the warning fires?
Thanks.
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 6d67e9a5af6b..f49512628269 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -9,6 +9,7 @@
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern int get_nohz_timer_target(void);
+extern void assert_domain_online(void);
#else
static inline void nohz_balance_enter_idle(int cpu) { }
#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 07455d25329c..98c8f8408403 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,6 +13,7 @@
#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
+#include <linux/sched/nohz.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
@@ -1277,6 +1278,7 @@ static int take_cpu_down(void *_param)
if (err < 0)
return err;
+ assert_domain_online();
/*
* Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
* down, that the current state is CPUHP_TEARDOWN_CPU - 1.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 175a5a7ac107..88157b1645cc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1163,6 +1163,20 @@ void resched_cpu(int cpu)
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
+void assert_domain_online(void)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ guard(rcu)();
+
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ WARN_ON_ONCE(cpu_is_offline(i));
+ }
+ }
+}
/*
* In the semi idle case, use the nearest busy CPU for migrating timers
* from an idle CPU. This is good for power-savings.
On Wed, 2025-03-26 at 17:44 +0100, Frederic Weisbecker wrote:
> Hi Walter Chang,
>
> It's not the first time I've received such a report on an out-of-tree
> kernel. The problem is I don't know whether the tainted modules are
> involved. But something is probably making an offline CPU visible within
> the domain hierarchy in get_nohz_timer_target(), and the new warning made
> that visible.
Hi,
By reviewing get_nohz_timer_target(), it looks like it can make an offline
CPU visible among the timer candidates. Maybe this patch could fix it?
[PATCH] sched/core: Exclude offline CPUs from the timer candidates
The timer target is chosen from the HK_TYPE_KERNEL_NOISE housekeeping mask.
However, the candidate may be an offline CPU, so exclude offline CPUs and
choose only from online CPUs.
Signed-off-by: kuyo chang <kuyo.chang@mediatek.com>
---
kernel/sched/core.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cfaca3040b2f..efcc2576e622 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1182,7 +1182,7 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE) && cpu_online(cpu)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
@@ -1197,13 +1197,16 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
- if (!idle_cpu(i))
+ if (!idle_cpu(i) && cpu_online(i))
return i;
}
}
- if (default_cpu == -1)
+ if (default_cpu == -1) {
default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
+ if (!cpu_online(default_cpu))
+ default_cpu = cpumask_any(cpu_online_mask);
+ }
return default_cpu;
}
On Wed, Apr 02, 2025 at 06:53:24AM +0000, Kuyo Chang (張建文) wrote:
> Hi,
>
> By review the get_nohz_timer_target(), it's probably making an offline
> CPU visible at timer candidates, maybe this patch could fix it?
>
>
> [PATCH] sched/core: Exclude offline CPUs from the timer candidates
>
> The timer target is chosen from the HK_TYPE_KERNEL_NOISE housekeeping mask.
> However, the candidate may be an offline CPU, so exclude offline CPUs and
> choose only from online CPUs.
>
> Signed-off-by: kuyo chang <kuyo.chang@mediatek.com>
> ---
> kernel/sched/core.c | 9 ++++++---
> 1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index cfaca3040b2f..efcc2576e622 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1182,7 +1182,7 @@ int get_nohz_timer_target(void)
> struct sched_domain *sd;
> const struct cpumask *hk_mask;
>
> - if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
> + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE) && cpu_online(cpu)) {
> if (!idle_cpu(cpu))
> return cpu;
> default_cpu = cpu;
It can't choose the current CPU because get_target_base() prevents that:
if (!hrtimer_base_is_online(base)) {
int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
return &per_cpu(hrtimer_bases, cpu);
}
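Seen in isolation, that fallback elects a CPU from the intersection of the
online mask and the HK_TYPE_TIMER housekeeping mask, so a dying CPU filters
itself out by construction. A standalone sketch of those semantics, with a
plain bitmask standing in for struct cpumask (an assumption for illustration;
the kernel may return any set bit, not necessarily the first):

#include <stdio.h>

/* stand-in for cpumask_any_and(): first set bit of the AND, or -1 if empty */
static int any_and(unsigned long a, unsigned long b)
{
	unsigned long both = a & b;

	return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
	unsigned long online   = 0x7;	/* CPUs 0-2 online, CPU3 dying */
	unsigned long hk_timer = 0xa;	/* CPUs 1 and 3 handle timer housekeeping */

	/* CPU3 is filtered out by the online mask, so only CPU1 can be elected */
	printf("target = CPU%d\n", any_and(online, hk_timer));
	return 0;
}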
> @@ -1197,13 +1197,16 @@ int get_nohz_timer_target(void)
> if (cpu == i)
> continue;
>
> - if (!idle_cpu(i))
> + if (!idle_cpu(i) && cpu_online(i))
> return i;
CPUs within the domain hierarchy are guaranteed to be online.
sched_cpu_deactivate() -> cpuset_cpu_inactive(cpu) is supposed to take
care of that. Unless there is another bug lurking here, which is my
suspicion. But it's hard to know as we are dealing with a kernel with
out-of-tree patches.
> }
> }
>
> - if (default_cpu == -1)
> + if (default_cpu == -1) {
> default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
> + if (!cpu_online(default_cpu))
> + default_cpu = cpumask_any(cpu_online_mask);
housekeeping_any_cpu() only returns online CPUs.
Thanks.
On Wed, 2025-03-26 at 17:44 +0100, Frederic Weisbecker wrote:
>
> It's not the first time I've received such a report on an out-of-tree
> kernel. The problem is I don't know whether the tainted modules are
> involved. But something is probably making an offline CPU visible within
> the domain hierarchy in get_nohz_timer_target(), and the new warning made
> that visible.
>
> Can you try this and tell us if the warning fires?
Hi Frederic,
Thank you for providing the patch to debug the hrtimer warning issue.
I applied the patch and ran stress testing over the weekend, and the
warning added by the patch did not fire during that period.
Additionally, after a thorough review of our internal tainted modules,
I can confirm your assessment: with our tainted modules,
get_nohz_timer_target() may indeed return an offline CPU, leading to
the hrtimer warning. We are working on fixing this within those
modules.
Thanks again for your help in debugging this issue.
Best regards,
Walter Chang
The following commit has been merged into the timers/urgent branch of tip:
Commit-ID: 53dac345395c0d2493cbc2f4c85fe38aef5b63f5
Gitweb: https://git.kernel.org/tip/53dac345395c0d2493cbc2f4c85fe38aef5b63f5
Author: Frederic Weisbecker <frederic@kernel.org>
AuthorDate: Sat, 18 Jan 2025 00:24:33 +01:00
Committer: Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Thu, 23 Jan 2025 20:06:35 +01:00
hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING
hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress.
However wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those can again result in bandwidth timers being
armed. Depending on several considerations (crystal ball power management
based election, earliest timer already enqueued, timer migration enabled or
not), the target may eventually be the current CPU even if offline. If that
happens, the timer is eventually ignored.
The most notable example is RCU, which had to deal with each and every one
of those wakeups by deferring them to an online CPU, along with related
workarounds:
_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
The problem isn't confined to RCU, though, as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wakeup that
eventually arms the deadline server timer:
WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
Call Trace:
<TASK>
start_dl_timer
enqueue_dl_entity
dl_server_start
enqueue_task_fair
enqueue_task
ttwu_do_activate
try_to_wake_up
complete
cpu_stopper_thread
Instead of providing yet another bandaid to work around the situation, fix
it in the hrtimers infrastructure instead: always migrate away a timer to
an online target whenever it is enqueued from an offline CPU.
This will also allow reverting all the above RCU disgraceful hacks.
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Vlad Poenaru <vlad.wing@gmail.com>
Reported-by: Usama Arif <usamaarif642@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org
Closes: 20241213203739.1519801-1-usamaarif642@gmail.com
---
include/linux/hrtimer_defs.h | 1 +-
kernel/time/hrtimer.c | 103 +++++++++++++++++++++++++++-------
2 files changed, 83 insertions(+), 21 deletions(-)
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
index c3b4b7e..84a5045 100644
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4fb81f8..deb1aa3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -178,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will be issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -249,8 +287,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -259,8 +297,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -706,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1195,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1206,10 +1242,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on an unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1238,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
The following commit has been merged into the timers/urgent branch of tip:
Commit-ID: b7a110336261147ccb373f4100cc88271c54bd91
Gitweb: https://git.kernel.org/tip/b7a110336261147ccb373f4100cc88271c54bd91
Author: Frederic Weisbecker <frederic@kernel.org>
AuthorDate: Sat, 18 Jan 2025 00:24:33 +01:00
Committer: Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Thu, 23 Jan 2025 11:47:23 +01:00
hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING
hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress.
However wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those can again result in bandwidth timers being
armed. Depending on several considerations (crystal ball power management
based election, earliest timer already enqueued, timer migration enabled or
not), the target may eventually be the current CPU even if offline. If that
happens, the timer is eventually ignored.
The most notable example is RCU, which had to deal with each and every one
of those wakeups by deferring them to an online CPU, along with related
workarounds:
_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
The problem isn't confined to RCU, though, as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wakeup that
eventually arms the deadline server timer:
WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
Call Trace:
<TASK>
start_dl_timer
enqueue_dl_entity
dl_server_start
enqueue_task_fair
enqueue_task
ttwu_do_activate
try_to_wake_up
complete
cpu_stopper_thread
Instead of providing yet another bandaid to work around the situation, fix
it in the hrtimers infrastructure instead: always migrate away a timer to
an online target whenever it is enqueued from an offline CPU.
This will also allow reverting all the above RCU disgraceful hacks.
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Vlad Poenaru <vlad.wing@gmail.com>
Reported-by: Usama Arif <usamaarif642@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org
Closes: 20241213203739.1519801-1-usamaarif642@gmail.com
---
include/linux/hrtimer_defs.h | 1 +-
kernel/time/hrtimer.c | 103 +++++++++++++++++++++++++++-------
2 files changed, 83 insertions(+), 21 deletions(-)
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
index c3b4b7e..84a5045 100644
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 14bd09c..0feb38b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -183,27 +194,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will be issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -254,8 +292,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -264,8 +302,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -716,8 +753,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1205,6 +1240,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1216,10 +1252,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on an unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1248,8 +1290,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid