When the watchdog determines that the current soft lockup is due
to an interrupt storm based on CPU utilization, reporting the
top three most time-consuming hardirqs can be quite useful for
further troubleshooting.
Below is an example of an interrupt storm. The call tree does not
provide useful information, but we can analyze which interrupt
caused the soft lockup by using the hardirq time statistics.
[ 67.714044] watchdog: BUG: soft lockup - CPU#9 stuck for 28s! [swapper/9:0]
[ 67.714548] CPU#9 Utilization every 4s during lockup:
[ 67.714549] #1: 0.0% system, 0.0% softirq, 95.0% hardirq, 0.0% idle
[ 67.714551] #2: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
[ 67.714553] #3: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
[ 67.714555] #4: 0.0% system, 0.0% softirq, 95.0% hardirq, 0.0% idle
[ 67.714556] #5: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
[ 67.714570] CPU#9 Detect HardIRQ Time exceeds 50% since 45s. Most time-consuming HardIRQs:
[ 67.714571] #1: 99.9% irq#7(IPI)
[ 67.714575] #2: 0.0% irq#10(arch_timer)
[ 67.714578] #3: 0.0% irq#2(IPI)
...
[ 67.714654] Call trace:
[ 67.714656] __do_softirq+0xa8/0x364
Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
---
include/linux/irq.h | 9 ++
include/linux/irqdesc.h | 2 +
kernel/irq/irqdesc.c | 9 +-
kernel/watchdog.c | 186 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 205 insertions(+), 1 deletion(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 90081afa10ce..3a3e8b1e227e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -251,6 +251,15 @@ enum {
IRQD_RESEND_WHEN_IN_PROGRESS = BIT(30),
};
+
+/*
+ * struct per_irqtime - per irq stats for interrupt storm detection
+ */
+struct per_irqtime {
+ u64 start_time;
+ u64 delta;
+};
+
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index d9451d456a73..c5407cdcf2d6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -29,6 +29,7 @@ struct pt_regs;
* @wake_depth: enable depth, for multiple irq_set_irq_wake() callers
* @tot_count: stats field for non-percpu irqs
* @irq_count: stats field to detect stalled irqs
+ * @irq_time: stats field to detect interrupt storm
* @last_unhandled: aging timer for unhandled count
* @irqs_unhandled: stats field for spurious unhandled interrupts
* @threads_handled: stats field for deferred spurious detection of threaded handlers
@@ -64,6 +65,7 @@ struct irq_desc {
unsigned int wake_depth; /* nested wake enables */
unsigned int tot_count;
unsigned int irq_count; /* For detecting broken IRQs */
+ struct per_irqtime __percpu *irq_time;
unsigned long last_unhandled; /* Aging timer for unhandled count */
unsigned int irqs_unhandled;
atomic_t threads_handled;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 27ca1c866f29..a4d7c324cba2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -422,9 +422,12 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
desc->kstat_irqs = alloc_percpu(unsigned int);
if (!desc->kstat_irqs)
goto err_desc;
+ desc->irq_time = alloc_percpu(struct per_irqtime);
+ if (!desc->irq_time)
+ goto err_kstat;
if (alloc_masks(desc, node))
- goto err_kstat;
+ goto err_irqtime;
raw_spin_lock_init(&desc->lock);
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
@@ -439,6 +442,8 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
return desc;
+err_irqtime:
+ free_percpu(desc->irq_time);
err_kstat:
free_percpu(desc->kstat_irqs);
err_desc:
@@ -451,6 +456,7 @@ static void irq_kobj_release(struct kobject *kobj)
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
free_masks(desc);
+ free_percpu(desc->irq_time);
free_percpu(desc->kstat_irqs);
kfree(desc);
}
@@ -594,6 +600,7 @@ int __init early_irq_init(void)
for (i = 0; i < count; i++) {
desc[i].kstat_irqs = alloc_percpu(unsigned int);
+ desc[i].irq_time = alloc_percpu(struct per_irqtime);
alloc_masks(&desc[i], node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9fad10e0a147..f347c5d8c5c1 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,12 @@
#include <linux/stop_machine.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
+#include <linux/tracepoint.h>
+#include <linux/irq.h>
+#include <linux/bitops.h>
+#include <trace/events/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/timekeeping.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -431,11 +437,15 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}
+static void set_potential_softlockup(unsigned long now, unsigned long touch_ts);
+
static int is_softlockup(unsigned long touch_ts,
unsigned long period_ts,
unsigned long now)
{
if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
+ /* Softlockup may occur in the current period */
+ set_potential_softlockup(now, period_ts);
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
@@ -448,6 +458,8 @@ static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
static DEFINE_PER_CPU(u64, cpustat_diff[5][NR_STATS]);
static DEFINE_PER_CPU(int, cpustat_tail);
+static void print_hardirq_time(void);
+
static void update_cpustat(void)
{
u64 *old = this_cpu_ptr(cpustat_old);
@@ -489,10 +501,179 @@ static void print_cpustat(void)
a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
}
+ print_hardirq_time();
+}
+
+static DEFINE_PER_CPU(u64, hardirq_start_time);
+
+#define SOFTLOCKUP_HARDIRQ 0
+static DEFINE_PER_CPU(long, softlockup_flags);
+
+static void find_irqtime_top3(u64 *irqtime, int *irq, u64 perirq_time, int perirq_id)
+{
+ if (perirq_time > irqtime[0] || (perirq_time == irqtime[0] && perirq_id < irq[0])) {
+ irqtime[2] = irqtime[1];
+ irq[2] = irq[1];
+ irqtime[1] = irqtime[0];
+ irq[1] = irq[0];
+ irqtime[0] = perirq_time;
+ irq[0] = perirq_id;
+ } else if (perirq_time > irqtime[1] || (perirq_time == irqtime[1] && perirq_id < irq[1])) {
+ irqtime[2] = irqtime[1];
+ irq[2] = irq[1];
+ irqtime[1] = perirq_time;
+ irq[1] = perirq_id;
+ } else if (perirq_time > irqtime[2] || (perirq_time == irqtime[2] && perirq_id < irq[2])) {
+ irqtime[2] = perirq_time;
+ irq[2] = perirq_id;
+ }
+}
+
+/*
+ * If the proportion of time spent handling irq exceeds 50% during a sampling period,
+ * then it is necessary to tally the handling time of each irq.
+ */
+static inline bool need_trace_irqtime(int type)
+{
+ int tail = this_cpu_read(cpustat_tail);
+
+ if (--tail == -1)
+ tail = 4;
+ return this_cpu_read(cpustat_diff[tail][type]) > sample_period/2;
+}
+
+static void irq_handler_entry_callback(void *data, int irq, struct irqaction *action)
+{
+ this_cpu_ptr(irq_to_desc(irq)->irq_time)->start_time = local_clock();
+}
+
+static void irq_handler_exit_callback(void *data, int irq, struct irqaction *action, int ret)
+{
+ u64 delta;
+ struct per_irqtime *irq_time;
+
+ if (test_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags))) {
+ irq_time = this_cpu_ptr(irq_to_desc(irq)->irq_time);
+ delta = local_clock() - irq_time->start_time;
+ irq_time->delta += delta;
+ }
+}
+/*
+ * Mark softlockup as potentially caused by hardirq
+ */
+static void set_potential_softlockup_hardirq(void)
+{
+ if (!need_trace_irqtime(CPUTIME_IRQ))
+ return;
+
+ if (test_and_set_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags)))
+ return;
+
+ __this_cpu_write(hardirq_start_time, local_clock());
+}
+
+static void clear_potential_softlockup_hardirq(void)
+{
+ unsigned int i;
+ unsigned long flags;
+ struct irq_desc *desc;
+
+ local_irq_save(flags);
+ clear_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags));
+ for_each_irq_desc(i, desc) {
+ if (!desc)
+ continue;
+ memset(this_cpu_ptr(desc->irq_time), 0, sizeof(struct per_irqtime));
+ }
+ local_irq_restore(flags);
+}
+
+static void hook_hardirq_events(void)
+{
+ int ret;
+
+ ret = register_trace_irq_handler_entry(irq_handler_entry_callback, NULL);
+ if (ret)
+ goto out_err;
+ ret = register_trace_irq_handler_exit(irq_handler_exit_callback, NULL);
+ if (ret)
+ goto out_unreg_entry;
+ return;
+
+out_unreg_entry:
+ unregister_trace_irq_handler_entry(irq_handler_entry_callback, NULL);
+out_err:
+ return;
+}
+
+static void unhook_hardirq_events(void)
+{
+ unregister_trace_irq_handler_entry(irq_handler_entry_callback, NULL);
+ unregister_trace_irq_handler_exit(irq_handler_exit_callback, NULL);
}
+
+/*
+ * Mark that softlockup may occur
+ */
+static void set_potential_softlockup(unsigned long now, unsigned long period_ts)
+{
+ if (time_after_eq(now, period_ts + get_softlockup_thresh()/5))
+ set_potential_softlockup_hardirq();
+}
+
+static void clear_potential_softlockup(void)
+{
+ clear_potential_softlockup_hardirq();
+}
+
+static void print_hardirq_time(void)
+{
+ struct per_irqtime *irq_time;
+ struct irq_desc *desc;
+ u64 hardirqtime_top3[3] = {0, 0, 0};
+ int hardirq_top3[3] = {-1, -1, -1};
+ u64 start_time, now, a;
+ u32 period_us, i, b;
+
+ if (test_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags))) {
+ start_time = __this_cpu_read(hardirq_start_time);
+ now = local_clock();
+ period_us = (now - start_time)/1000;
+ /* Find the top three most time-consuming interrupts */
+ for_each_irq_desc(i, desc) {
+ if (!desc)
+ continue;
+ irq_time = this_cpu_ptr(desc->irq_time);
+ find_irqtime_top3(hardirqtime_top3, hardirq_top3, irq_time->delta, i);
+ }
+ printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds 50%% since %llds. Most time-consuming HardIRQs:\n",
+ smp_processor_id(), ns_to_timespec64(start_time - sample_period).tv_sec);
+ for (i = 0; i < 3; i++) {
+ if (hardirq_top3[i] == -1)
+ break;
+ a = 100 * (hardirqtime_top3[i] / 1000);
+ b = 10 * do_div(a, period_us);
+ do_div(b, period_us);
+ desc = irq_to_desc(hardirq_top3[i]);
+ if (desc && desc->action)
+ printk(KERN_CRIT "\t#%u: %llu.%u%% irq#%d(%s)\n",
+ i+1, a, b, hardirq_top3[i], desc->action->name);
+ else
+ printk(KERN_CRIT "\t#%u: %llu.%u%% irq#%d\n",
+ i+1, a, b, hardirq_top3[i]);
+ }
+ if (!need_trace_irqtime(CPUTIME_IRQ))
+ clear_potential_softlockup_hardirq();
+ }
+}
+
#else
static inline void update_cpustat(void) { }
static inline void print_cpustat(void) { }
+static inline void set_potential_softlockup(unsigned long now, unsigned long period_ts) { }
+static inline void clear_potential_softlockup(void) { }
+static inline void hook_hardirq_events(void) { }
+static inline void unhook_hardirq_events(void) { }
#endif
/* watchdog detector functions */
@@ -510,6 +691,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
static int softlockup_fn(void *data)
{
update_touch_ts();
+ clear_potential_softlockup();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
@@ -674,6 +856,8 @@ static void softlockup_stop_all(void)
if (!softlockup_initialized)
return;
+ unhook_hardirq_events();
+
for_each_cpu(cpu, &watchdog_allowed_mask)
smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
@@ -690,6 +874,8 @@ static void softlockup_start_all(void)
{
int cpu;
+ hook_hardirq_events();
+
cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
for_each_cpu(cpu, &watchdog_allowed_mask)
smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
--
2.37.1 (Apple Git-137.1)
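As an aside for readers decoding the "99.9%" / "0.0%" figures in the example
output: print_hardirq_time() computes the percentages with integer-only
fixed-point math. The small standalone C program below sketches that
arithmetic with invented sample values; plain '/' and '%' stand in for the
kernel's do_div(), which divides in place and returns the remainder.

#include <stdio.h>
#include <stdint.h>

/*
 * Same math as print_hardirq_time(): given the time one hardirq consumed
 * (in ns) and the observation window (in us), print a percentage with one
 * decimal digit without using floating point.
 */
static void irq_percentage(uint64_t irq_time_ns, uint32_t period_us)
{
	uint64_t a = 100 * (irq_time_ns / 1000);	/* ns -> us, scaled by 100 */
	unsigned int whole = a / period_us;		/* integer part of the percentage */
	unsigned int frac = (10 * (a % period_us)) / period_us; /* first decimal digit */

	printf("%u.%u%%\n", whole, frac);
}

int main(void)
{
	/* ~3.996 s of hardirq time within a 4 s window -> 99.9% */
	irq_percentage(3996000000ULL, 4000000);
	/* 2 ms within the same window -> 0.0% */
	irq_percentage(2000000ULL, 4000000);
	return 0;
}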
On Tue, Jan 23 2024 at 20:12, Bitao Hu wrote:
> +/*
> + * If the proportion of time spent handling irq exceeds 50% during a sampling period,
> + * then it is necessary to tally the handling time of each irq.
> + */
> +static inline bool need_trace_irqtime(int type)
> +{
> + int tail = this_cpu_read(cpustat_tail);
> +
> + if (--tail == -1)
> + tail = 4;
> + return this_cpu_read(cpustat_diff[tail][type]) > sample_period/2;
> +}
> +
> +static void irq_handler_entry_callback(void *data, int irq, struct irqaction *action)
> +{
> + this_cpu_ptr(irq_to_desc(irq)->irq_time)->start_time = local_clock();
> +}
> +
> +static void irq_handler_exit_callback(void *data, int irq, struct irqaction *action, int ret)
> +{
> + u64 delta;
> + struct per_irqtime *irq_time;
> +
> + if (test_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags))) {
> + irq_time = this_cpu_ptr(irq_to_desc(irq)->irq_time);
> + delta = local_clock() - irq_time->start_time;
> + irq_time->delta += delta;
> + }
> +}
I can understand what you are trying to achieve, but you inflict the
overhead of two tracepoints to every interrupt unconditionally.
For the vast majority of usage scenarios that's just pointless overhead
for no value. That's not acceptable at all.
Instrumentation and debugging has to find the least intrusive solution
and not just go for the perfect picture. Remember: perfect is the enemy
of good.
You really have to think hard about what you really need to achieve for
a particular problem case.
In this case it's completely sufficient to know the number of interrupts
which happened since softlockup detection took a snapshot and the actual
analysis.
That's especially true when interrupt time accounting is active because
then the only interesting information is which interrupts fired during
the observation period.
Even if that's not available it is a reasonable assumption that the
number of interrupts during the observation period gives a pretty
conclusive hint about the potential cause of the problem.
This is not meant to find the oddball issue of an interrupt handler
which consumes a lot of time per invocation. This is about storm
detection where the handler runs briefly and actually returns
IRQ_HANDLED so that the spurious detection does not catch it.
This can be implemented with exactly zero overhead for the good case,
keeping it self contained to the interrupt core code and providing
sensible interfaces for the watchdog code.
See the uncompiled patch for illustration below.
As a side note: While C does not allow proper encapsulation it's a non
starter to fiddle with the interrupt descriptor internals in random code
just because the compiler allows you to do so. While not enforced there
are clear boundaries and we went a long way to encapsulate this.
Thanks,
tglx
---
include/linux/irqdesc.h | 9 +++++++--
kernel/irq/internals.h | 2 +-
kernel/irq/irqdesc.c | 32 ++++++++++++++++++++++++++------
kernel/irq/proc.c | 9 +++------
4 files changed, 37 insertions(+), 15 deletions(-)
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -17,6 +17,11 @@ struct irq_desc;
struct irq_domain;
struct pt_regs;
+struct irqstat {
+ unsigned int cnt;
+ unsigned int ref;
+};
+
/**
* struct irq_desc - interrupt descriptor
* @irq_common_data: per irq and chip data passed down to chip functions
@@ -55,7 +60,7 @@ struct pt_regs;
struct irq_desc {
struct irq_common_data irq_common_data;
struct irq_data irq_data;
- unsigned int __percpu *kstat_irqs;
+ struct irqstat __percpu *kstat_irqs;
irq_flow_handler_t handle_irq;
struct irqaction *action; /* IRQ action list */
unsigned int status_use_accessors;
@@ -119,7 +124,7 @@ extern struct irq_desc irq_desc[NR_IRQS]
static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc,
unsigned int cpu)
{
- return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+ return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -258,7 +258,7 @@ static inline void irq_state_set_masked(
static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
- __this_cpu_inc(*desc->kstat_irqs);
+ __this_cpu_inc(desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
}
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -122,7 +122,7 @@ static void desc_set_defaults(unsigned i
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
- *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
desc_smp_init(desc, node, affinity);
}
@@ -418,8 +418,8 @@ static struct irq_desc *alloc_desc(int i
desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
if (!desc)
return NULL;
- /* allocate based on nr_cpu_ids */
- desc->kstat_irqs = alloc_percpu(unsigned int);
+
+ desc->kstat_irqs = alloc_percpu(struct irqstat);
if (!desc->kstat_irqs)
goto err_desc;
@@ -952,8 +952,7 @@ unsigned int kstat_irqs_cpu(unsigned int
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc && desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+ return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
static bool irq_is_nmi(struct irq_desc *desc)
@@ -975,10 +974,31 @@ static unsigned int kstat_irqs(unsigned
return data_race(desc->tot_count);
for_each_possible_cpu(cpu)
- sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
+ sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu));
return sum;
}
+void kstat_snapshot_irqs(void)
+{
+ struct irq_desc *desc;
+ unsigned int irq;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc->kstat_irqs)
+ continue;
+ this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt));
+ }
+}
+
+unsigned int kstat_get_irq_since_snapshot(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref);
+}
+
/**
* kstat_irqs_usr - Get the statistics for an interrupt from thread context
* @irq: The interrupt number
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -488,18 +488,15 @@ int show_interrupts(struct seq_file *p,
if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
- if (desc->kstat_irqs) {
- for_each_online_cpu(j)
- any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
- }
+ if (desc->kstat_irqs)
+ any_count = data_race(desc->tot_count);
if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
goto outsparse;
seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
- seq_printf(p, "%10u ", desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, j) : 0);
+ seq_printf(p, "%10u ", desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0);
raw_spin_lock_irqsave(&desc->lock, flags);
if (desc->irq_data.chip) {
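For illustration, a rough, uncompiled sketch of how the watchdog side might
consume these interfaces: kstat_snapshot_irqs() would be called on the CPU
when the watchdog first suspects a hardirq-dominated period, and something
like the helper below when the lockup is actually reported. Only
kstat_snapshot_irqs() and kstat_get_irq_since_snapshot() come from the patch
above; NUM_HARDIRQ_REPORT, struct irq_counts, tabulate_irq_count() and
print_irq_counts() are made-up names for the sketch, not an actual API.

/* Hypothetical watchdog-side consumer of the snapshot interfaces above. */
#define NUM_HARDIRQ_REPORT	3

struct irq_counts {
	int irq;
	u32 counts;
};

/* Keep the small table sorted so slot 0 always holds the busiest irq. */
static void tabulate_irq_count(struct irq_counts *table, int irq, u32 counts)
{
	int i;

	if (counts <= table[NUM_HARDIRQ_REPORT - 1].counts)
		return;

	for (i = NUM_HARDIRQ_REPORT - 1; i > 0; i--) {
		if (counts <= table[i - 1].counts)
			break;
		table[i] = table[i - 1];
	}
	table[i] = (struct irq_counts){ .irq = irq, .counts = counts };
}

/* Called from the softlockup report path on the stuck CPU. */
static void print_irq_counts(void)
{
	struct irq_counts top[NUM_HARDIRQ_REPORT] = {
		{ -1, 0 }, { -1, 0 }, { -1, 0 },
	};
	int i;

	/* kstat_get_irq_since_snapshot() returns 0 for unused irq numbers. */
	for (i = 0; i < nr_irqs; i++)
		tabulate_irq_count(top, i, kstat_get_irq_since_snapshot(i));

	for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
		if (top[i].irq == -1)
			break;
		printk(KERN_CRIT "\t#%d: %-10u irq#%d\n",
		       i + 1, top[i].counts, top[i].irq);
	}
}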
Hi,
On Wed, Feb 14, 2024 at 3:36 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Tue, Jan 23 2024 at 20:12, Bitao Hu wrote:
> > +/*
> > + * If the proportion of time spent handling irq exceeds 50% during a sampling period,
> > + * then it is necessary to tally the handling time of each irq.
> > + */
> > +static inline bool need_trace_irqtime(int type)
> > +{
> > + int tail = this_cpu_read(cpustat_tail);
> > +
> > + if (--tail == -1)
> > + tail = 4;
> > + return this_cpu_read(cpustat_diff[tail][type]) > sample_period/2;
> > +}
> > +
> > +static void irq_handler_entry_callback(void *data, int irq, struct irqaction *action)
> > +{
> > + this_cpu_ptr(irq_to_desc(irq)->irq_time)->start_time = local_clock();
> > +}
> > +
> > +static void irq_handler_exit_callback(void *data, int irq, struct irqaction *action, int ret)
> > +{
> > + u64 delta;
> > + struct per_irqtime *irq_time;
> > +
> > + if (test_bit(SOFTLOCKUP_HARDIRQ, this_cpu_ptr(&softlockup_flags))) {
> > + irq_time = this_cpu_ptr(irq_to_desc(irq)->irq_time);
> > + delta = local_clock() - irq_time->start_time;
> > + irq_time->delta += delta;
> > + }
> > +}
>
> I can understand what you are trying to achieve, but you inflict the
> overhead of two tracepoints to every interrupt unconditionally.
>
> For the vast majority of usage scenarios that's just pointless overhead
> for no value. That's not acceptable at all.
>
> Instrumentation and debugging has to find the least intrusive solution
> and not just go for the perfect picture. Remember: perfect is the enemy
> of good.
>
> You really have to think hard about what you really need to achieve for
> a particular problem case.
>
> In this case it's completely sufficient to know the number of interrupts
> which happened since softlockup detection took a snapshot and the actual
> analysis.
>
> That's especially true when interrupt time accounting is active because
> then the only interesting information is which interrupts fired during
> the observation period.
>
> Even if that's not available it is a reasonable assumption that the
> number of interrupts during the observation period gives a pretty
> conclusive hint about the potential cause of the problem.
>
> This is not meant to find the oddball issue of an interrupt handler
> which consumes a lot of time per invocation. This is about storm
> detection where the handler runs briefly and actually returns
> IRQ_HANDLED so that the spurious detection does not catch it.
>
> This can be implemented with exactly zero overhead for the good case,
> keeping it self contained to the interrupt core code and providing
> sensible interfaces for the watchdog code.
>
> See the uncompiled patch for illustration below.
>
> As a side note: While C does not allow proper encapsulation it's a non
> starter to fiddle with the interrupt descriptor internals in random code
> just because the compiler allows you to do so. While not enforced there
> are clear boundaries and we went a long way to encapsulate this.
I think you must have gotten dropped from all the future versions of
this patch series when Bitao took my advice and started using
interrupt counts instead of tracing. For what it's worth, the latest
version can be found at:
https://lore.kernel.org/r/20240214021430.87471-1-yaoma@linux.alibaba.com
On Wed, Feb 14 2024 at 15:39, Doug Anderson wrote:
> On Wed, Feb 14, 2024 at 3:36 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>> On Tue, Jan 23 2024 at 20:12, Bitao Hu wrote:
>> As a side note: While C does not allow proper encapsulation it's a non
>> starter to fiddle with the interrupt descriptor internals in random code
>> just because the compiler allows you to do so. While not enforced there
>> are clear boundaries and we went a long way to encapsulate this.
>
> I think you must have gotten dropped from all the future versions of
> this patch series when Bitao took my advice and started using
> interrupt counts instead of tracing. For what it's worth, the latest
> version can be found at:
>
> https://lore.kernel.org/r/20240214021430.87471-1-yaoma@linux.alibaba.com
Yes. I'm not on CC and for some stupid reason it evaded my LKML filters.
Let me find that thread and stare at it.
Thanks,
tglx
Hi,

On Tue, Jan 23, 2024 at 4:12 AM Bitao Hu <yaoma@linux.alibaba.com> wrote:
>
> When the watchdog determines that the current soft lockup is due
> to an interrupt storm based on CPU utilization, reporting the
> top three most time-consuming hardirqs can be quite useful for
> further troubleshooting.
> Below is an example of an interrupt storm. The call tree does not
> provide useful information, but we can analyze which interrupt
> caused the soft lockup by using the hardirq time statistics.
>
> [ 67.714044] watchdog: BUG: soft lockup - CPU#9 stuck for 28s! [swapper/9:0]
> [ 67.714548] CPU#9 Utilization every 4s during lockup:
> [ 67.714549] #1: 0.0% system, 0.0% softirq, 95.0% hardirq, 0.0% idle
> [ 67.714551] #2: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
> [ 67.714553] #3: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
> [ 67.714555] #4: 0.0% system, 0.0% softirq, 95.0% hardirq, 0.0% idle
> [ 67.714556] #5: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
> [ 67.714570] CPU#9 Detect HardIRQ Time exceeds 50% since 45s. Most time-consuming HardIRQs:
> [ 67.714571] #1: 99.9% irq#7(IPI)
> [ 67.714575] #2: 0.0% irq#10(arch_timer)
> [ 67.714578] #3: 0.0% irq#2(IPI)
> ...
> [ 67.714654] Call trace:
> [ 67.714656] __do_softirq+0xa8/0x364
>
> Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
> ---
>  include/linux/irq.h     |   9 ++
>  include/linux/irqdesc.h |   2 +
>  kernel/irq/irqdesc.c    |   9 +-
>  kernel/watchdog.c       | 186 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 205 insertions(+), 1 deletion(-)

Just like on patch #1, I'd overall like to say that I'm very excited
about this patch and think it'll help track down a lot of issues, so
please continue moving forward with it!

That being said, my gut says that this can be a lot simpler / lower
overhead if we just use the existing IRQ counts that the framework is
already keeping track of for us. We could look for the IRQs that had
the most counts on the soft locked CPU over the period of time.

It might not be quite as good/accurate compared to what you have here
but it seems like it would be "good enough". Normally when there's an
IRQ storm that's so bad that it's causing a softlockup then each IRQ
isn't actually that slow but they're going off nonstop. It would catch
that case perfectly.

I guess maybe (?) the case it wouldn't catch so well would be if you
had a loop that looked like:

* CPU from "storming" device takes ~10 ms and finishes.
* After the "storming" device finishes, we somehow manage to service a
  whole pile of normal interrupts from non-storming devices.
* After the non-storming interrupts finished, then we went back to
  servicing the slow storming interrupt.

I haven't dug into the bowels of the Linux IRQ handling well enough to
know if that would be possible. However, even if it was possible it
doesn't feel likely to happen. If those "normal" interrupts aren't
storming then it seems unlikely we'd see more than one of each of them
between the "storming" interrupts. Thus it feels like the "storming"
interrupt would still have the highest (or tied for the highest) count
on the locked up CPU. Maybe you could print the top 4 or 5 (instead of
the top 3) and you'd be pretty certain to catch even this case?

In any case, I'll wait before doing a more thorough review for now and
hope the above sounds right to you.

-Doug
On 2024/1/25 08:19, Doug Anderson wrote:
>
> I guess maybe (?) the case it wouldn't catch so well would be if you
> had a loop that looked like:
>
> * CPU from "storming" device takes ~10 ms and finishes.
> * After the "storming" device finishes, we somehow manage to service a
>   whole pile of normal interrupts from non-storming devices.
> * After the non-storming interrupts finished, then we went back to
>   servicing the slow storming interrupt.
>
> I haven't dug into the bowels of the Linux IRQ handling well enough to
> know if that would be possible. However, even if it was possible it
> doesn't feel likely to happen. If those "normal" interrupts aren't
> storming then it seems unlikely we'd see more than one of each of them
> between the "storming" interrupts. Thus it feels like the "storming"
> interrupt would still have the highest (or tied for the highest) count
> on the locked up CPU. Maybe you could print the top 4 or 5 (instead of
> the top 3) and you'd be pretty certain to catch even this case?
>

Hi,

I now understand your suggestion. Using interrupt time as a criterion
for an interrupt storm is suitable for scenarios where we don't know
whether an interrupt storm has happened. However, in the context of
this softlockup, we have in fact already determined that an interrupt
storm might have occurred through the "hardirq time", so there's no
need for a redundant determination. Thus, I agree that using IRQ
counts seems good enough to find "storming" interrupts.

> In any case, I'll wait before doing a more thorough review for now and
> hope the above sounds right to you.
>
> -Doug
On 2024/1/25 08:19, Doug Anderson wrote:
> Hi,
>
> On Tue, Jan 23, 2024 at 4:12 AM Bitao Hu <yaoma@linux.alibaba.com> wrote:
>>
>> When the watchdog determines that the current soft lockup is due
>> to an interrupt storm based on CPU utilization, reporting the
>> top three most time-consuming hardirqs can be quite useful for
>> further troubleshooting.
>> Below is an example of an interrupt storm. The call tree does not
>> provide useful information, but we can analyze which interrupt
>> caused the soft lockup by using the hardirq time statistics.
>>
>> [ 67.714044] watchdog: BUG: soft lockup - CPU#9 stuck for 28s! [swapper/9:0]
>> [ 67.714548] CPU#9 Utilization every 4s during lockup:
>> [ 67.714549] #1: 0.0% system, 0.0% softirq, 95.0% hardirq, 0.0% idle
>> [ 67.714551] #2: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
>> [ 67.714553] #3: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
>> [ 67.714555] #4: 0.0% system, 0.0% softirq, 95.0% hardirq, 0.0% idle
>> [ 67.714556] #5: 0.0% system, 0.0% softirq, 90.0% hardirq, 0.0% idle
>> [ 67.714570] CPU#9 Detect HardIRQ Time exceeds 50% since 45s. Most time-consuming HardIRQs:
>> [ 67.714571] #1: 99.9% irq#7(IPI)
>> [ 67.714575] #2: 0.0% irq#10(arch_timer)
>> [ 67.714578] #3: 0.0% irq#2(IPI)
>> ...
>> [ 67.714654] Call trace:
>> [ 67.714656] __do_softirq+0xa8/0x364
>>
>> Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
>> ---
>>  include/linux/irq.h     |   9 ++
>>  include/linux/irqdesc.h |   2 +
>>  kernel/irq/irqdesc.c    |   9 +-
>>  kernel/watchdog.c       | 186 ++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 205 insertions(+), 1 deletion(-)
>
> Just like on patch #1, I'd overall like to say that I'm very excited
> about this patch and think it'll help track down a lot of issues, so
> please continue moving forward with it!

Sure, I will continue moving forward with it.

> That being said, my gut says that this can be a lot simpler / lower
> overhead if we just use the existing IRQ counts that the framework is
> already keeping track of for us. We could look for the IRQs that had
> the most counts on the soft locked CPU over the period of time.
>
> It might not be quite as good/accurate compared to what you have here
> but it seems like it would be "good enough". Normally when there's an
> IRQ storm that's so bad that it's causing a softlockup then each IRQ
> isn't actually that slow but they're going off nonstop. It would catch
> that case perfectly.
>
> I guess maybe (?) the case it wouldn't catch so well would be if you
> had a loop that looked like:
>
> * CPU from "storming" device takes ~10 ms and finishes.
> * After the "storming" device finishes, we somehow manage to service a
>   whole pile of normal interrupts from non-storming devices.
> * After the non-storming interrupts finished, then we went back to
>   servicing the slow storming interrupt.

I think there are two issues with using IRQ counts to determine an
interrupt storm:

1. We might not know the normal frequency of each interrupt, which
makes the IRQ counts inconvenient to compare. For instance, in your
example, suppose the slow interrupt takes 10ms at a frequency of
100Hz; the normal interrupt takes 10us at a frequency of 500Hz. If we
use IRQ counts, we might incorrectly conclude that it is the normal
interrupt causing the softlockup.

2. During an interrupt storm, the IRQ counts recorded by the kernel
do not necessarily increase. The softlockup caused by an NVMe
interrupt storm that I encountered is a case in point. Around the
time of the softlockup, the NVMe interrupt frequency actually
decreased from ~300 kHz to ~17 kHz. However, the average time taken
per NVMe interrupt increased from ~2us to ~60us. This is because the
nvme_irq handler is capable of polling and processing multiple
interrupts reported by the NVMe controller, which are not captured
in the kernel's interrupt statistics. The frequency at which the NVMe
driver actually handles interrupts is ~660 kHz.

Using interrupt time as the criterion for an interrupt storm avoids
both of the above issues.

> I haven't dug into the bowels of the Linux IRQ handling well enough to
> know if that would be possible. However, even if it was possible it
> doesn't feel likely to happen. If those "normal" interrupts aren't
> storming then it seems unlikely we'd see more than one of each of them
> between the "storming" interrupts. Thus it feels like the "storming"
> interrupt would still have the highest (or tied for the highest) count
> on the locked up CPU. Maybe you could print the top 4 or 5 (instead of
> the top 3) and you'd be pretty certain to catch even this case?
>
> In any case, I'll wait before doing a more thorough review for now and
> hope the above sounds right to you.
>
> -Doug
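For illustration of the polled-completion point above, here is a
hand-written sketch (not the actual NVMe code; the queue type and helper
functions are hypothetical) of a completion-queue handler pattern in which
one recorded hardirq drains many completions, so the per-IRQ count stays low
while the time spent per interrupt grows.

/*
 * Hypothetical completion-queue handler; 'struct example_queue',
 * queue_has_completion() and process_one_completion() are invented
 * names for this sketch.
 */
static irqreturn_t example_cq_irq(int irq, void *data)
{
	struct example_queue *q = data;
	int handled = 0;

	/*
	 * Drain every completion the device has posted, including ones
	 * that arrived while this CPU was busy. kstat counts this as a
	 * single interrupt no matter how many completions are processed.
	 */
	while (queue_has_completion(q)) {
		process_one_completion(q);
		handled++;
	}

	return handled ? IRQ_HANDLED : IRQ_NONE;
}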