The following softlockup is caused by an interrupt storm, but it cannot
be identified from the call tree, because the call tree is just a
snapshot and doesn't fully capture the behavior of the CPU during the
soft lockup.
watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
...
Call trace:
__do_softirq+0xa0/0x37c
__irq_exit_rcu+0x108/0x140
irq_exit+0x14/0x20
__handle_domain_irq+0x84/0xe0
gic_handle_irq+0x80/0x108
el0_irq_naked+0x50/0x58
Therefore, I think it is necessary to report CPU utilization during the
softlockup_thresh period (one report every sample_period, for a total
of 5 reports), like this:
watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
CPU#28 Utilization every 4s during lockup:
#1: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
#2: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
#3: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
#4: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
#5: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
...
This would be helpful in determining whether an interrupt storm has
occurred or in identifying the cause of the softlockup. The criteria for
determination are as follows:
a. If the hardirq utilization is high, an interrupt storm should be
suspected, and the root cause cannot be determined from the call tree.
b. If the softirq utilization is high, we can analyze the call tree,
but it may not reflect the root cause.
c. If the system utilization is high, we can analyze the root cause
from the call tree.
Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
---
kernel/watchdog.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 81a8862295d6..9fad10e0a147 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,8 @@
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/stop_machine.h>
+#include <linux/kernel_stat.h>
+#include <linux/math64.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -441,6 +443,58 @@ static int is_softlockup(unsigned long touch_ts,
return 0;
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
+static DEFINE_PER_CPU(u64, cpustat_diff[5][NR_STATS]);
+static DEFINE_PER_CPU(int, cpustat_tail);
+
+static void update_cpustat(void)
+{
+ u64 *old = this_cpu_ptr(cpustat_old);
+ u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
+ int tail = this_cpu_read(cpustat_tail), i;
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
+ for (i = 0; i < NR_STATS; i++) {
+ diff[tail][i] = cpustat[i] - old[i];
+ old[i] = cpustat[i];
+ }
+ this_cpu_write(cpustat_tail, (tail + 1) % 5);
+}
+
+static void print_cpustat(void)
+{
+ int i, j, k;
+ u64 a[5][NR_STATS], b[5][NR_STATS];
+ u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
+ int tail = this_cpu_read(cpustat_tail);
+ u32 period_us = sample_period / 1000;
+
+ for (i = 0; i < 5; i++) {
+ for (j = 0; j < NR_STATS; j++) {
+ a[i][j] = 100 * (diff[i][j] / 1000);
+ b[i][j] = 10 * do_div(a[i][j], period_us);
+ do_div(b[i][j], period_us);
+ }
+ }
+ printk(KERN_CRIT "CPU#%d Utilization every %us during lockup:\n",
+ smp_processor_id(), period_us/1000000);
+ for (k = 0, i = tail; k < 5; k++, i = (i + 1) % 5) {
+ printk(KERN_CRIT "\t#%d: %llu.%llu%% system,\t%llu.%llu%% softirq,\t"
+ "%llu.%llu%% hardirq,\t%llu.%llu%% idle\n", k+1,
+ a[i][CPUTIME_SYSTEM], b[i][CPUTIME_SYSTEM],
+ a[i][CPUTIME_SOFTIRQ], b[i][CPUTIME_SOFTIRQ],
+ a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
+ a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
+ }
+}
+#else
+static inline void update_cpustat(void) { }
+static inline void print_cpustat(void) { }
+#endif
+
/* watchdog detector functions */
static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
@@ -504,6 +558,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
+ /* update cpu usage stat */
+ update_cpustat();
+
/* Reset the interval when touched by known problematic code. */
if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -539,6 +596,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ print_cpustat();
print_modules();
print_irqtrace_events(current);
if (regs)
--
2.37.1 (Apple Git-137.1)
Hi Bitao,
kernel test robot noticed the following build errors:
[auto build test ERROR on tip/irq/core]
[also build test ERROR on linus/master v6.8-rc1 next-20240125]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Bitao-Hu/watchdog-softlockup-low-overhead-detection-of-interrupt-storm/20240123-201509
base: tip/irq/core
patch link: https://lore.kernel.org/r/20240123121223.22318-2-yaoma%40linux.alibaba.com
patch subject: [PATCH 1/3] watchdog/softlockup: low-overhead detection of interrupt storm
config: i386-randconfig-013-20240126 (https://download.01.org/0day-ci/archive/20240126/202401261310.Ixxefn7o-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240126/202401261310.Ixxefn7o-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401261310.Ixxefn7o-lkp@intel.com/
All errors (new ones prefixed by >>):
ld: kernel/watchdog.o: in function `print_cpustat':
>> kernel/watchdog.c:453: undefined reference to `__udivdi3'
>> ld: kernel/watchdog.c:457: undefined reference to `__udivdi3'
vim +453 kernel/watchdog.c
446
447 static void print_cpustat(void)
448 {
449 int i, j, k;
450 u64 a[5][NR_STATS], b[5][NR_STATS];
451 u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
452 int tail = this_cpu_read(cpustat_tail);
> 453 u32 period_us = sample_period / 1000;
454
455 for (i = 0; i < 5; i++) {
456 for (j = 0; j < NR_STATS; j++) {
> 457 a[i][j] = 100 * (diff[i][j] / 1000);
458 b[i][j] = 10 * do_div(a[i][j], period_us);
459 do_div(b[i][j], period_us);
460 }
461 }
462 printk(KERN_CRIT "CPU#%d Utilization every %us during lockup:\n",
463 smp_processor_id(), period_us/1000000);
464 for (k = 0, i = tail; k < 5; k++, i = (i + 1) % 5) {
465 printk(KERN_CRIT "\t#%d: %llu.%llu%% system,\t%llu.%llu%% softirq,\t"
466 "%llu.%llu%% hardirq,\t%llu.%llu%% idle\n", k+1,
467 a[i][CPUTIME_SYSTEM], b[i][CPUTIME_SYSTEM],
468 a[i][CPUTIME_SOFTIRQ], b[i][CPUTIME_SOFTIRQ],
469 a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
470 a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
471 }
472 }
473 #else
474 static inline void update_cpustat(void) { }
475 static inline void print_cpustat(void) { }
476 #endif
477
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
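[For reference, the __udivdi3 references come from the plain 64-bit
divisions in print_cpustat() ("sample_period / 1000" and
"diff[i][j] / 1000"): on 32-bit targets the kernel does not link
libgcc's 64-bit division helpers, so u64 divisions have to go through
do_div()/div_u64(). Below is a minimal, untested sketch of the
arithmetic rewritten with those helpers; the cpustat_pct() helper name
is assumed, not part of the patch.]

#include <linux/math64.h>	/* div_u64() */
#include <linux/time64.h>	/* NSEC_PER_USEC */

/*
 * Convert one cputime delta (nanoseconds) into a whole and a tenths
 * percentage of the sample period, avoiding raw u64 division so that
 * no __udivdi3 reference is emitted on 32-bit builds.
 */
static void cpustat_pct(u64 delta_ns, u32 period_us, u64 *whole, u64 *tenths)
{
	/* 100 * delta in microseconds; div_u64() does the u64-by-u32 divide */
	u64 tmp = 100 * div_u64(delta_ns, NSEC_PER_USEC);

	/* do_div() divides in place and returns the remainder */
	*tenths = 10 * do_div(tmp, period_us);
	do_div(*tenths, period_us);
	*whole = tmp;
}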
Hi,
On Tue, Jan 23, 2024 at 4:12 AM Bitao Hu <yaoma@linux.alibaba.com> wrote:
>
> The following softlockup is caused by an interrupt storm, but it cannot
> be identified from the call tree, because the call tree is just a
> snapshot and doesn't fully capture the behavior of the CPU during the
> soft lockup.
> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
> ...
> Call trace:
> __do_softirq+0xa0/0x37c
> __irq_exit_rcu+0x108/0x140
> irq_exit+0x14/0x20
> __handle_domain_irq+0x84/0xe0
> gic_handle_irq+0x80/0x108
> el0_irq_naked+0x50/0x58
>
> Therefore, I think it is necessary to report CPU utilization during the
> softlockup_thresh period (one report every sample_period, for a total
> of 5 reports), like this:
> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
> CPU#28 Utilization every 4s during lockup:
> #1: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #2: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #3: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #4: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #5: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> ...
>
> This would be helpful in determining whether an interrupt storm has
> occurred or in identifying the cause of the softlockup. The criteria for
> determination are as follows:
> a. If the hardirq utilization is high, an interrupt storm should be
> suspected, and the root cause cannot be determined from the call tree.
> b. If the softirq utilization is high, we can analyze the call tree,
> but it may not reflect the root cause.
> c. If the system utilization is high, we can analyze the root cause
> from the call tree.
Just to set the tone: this sounds like a great idea! I've found myself
wanting something like this for a long time but I've never sat down to
try to code up a mechanism. I have review comments below, but mostly
it's just details. IMO the idea is definitely something we want to
land.
> Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
> ---
> kernel/watchdog.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 58 insertions(+)
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 81a8862295d6..9fad10e0a147 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -23,6 +23,8 @@
> #include <linux/sched/debug.h>
> #include <linux/sched/isolation.h>
> #include <linux/stop_machine.h>
> +#include <linux/kernel_stat.h>
> +#include <linux/math64.h>
>
> #include <asm/irq_regs.h>
> #include <linux/kvm_para.h>
> @@ -441,6 +443,58 @@ static int is_softlockup(unsigned long touch_ts,
> return 0;
> }
>
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
> +static DEFINE_PER_CPU(u64, cpustat_diff[5][NR_STATS]);
> +static DEFINE_PER_CPU(int, cpustat_tail);
The above is potentially a lot of memory. On some configs we allow up
to 8192 CPUs and you're storing (NR_STATS * 6) u64s per CPU plus
another int per CPU. NR_STATS can be up to 11 currently, so if I did
my math right then that's
8 * 8192 * 11 * 6 + 4 * 8192 = 4,358,144 bytes (!) That's probably not OK.
In theory you only need 4 stats (plus the total) instead of 11. If you
do that plus the "old" then that would give you (8 * 8192 * 5 * 6 + 4
* 8192) = 1,998,848 bytes. That's still a lot, but at least less.
...so at the very least you should only store the fields you need.
Given the amount of potential space at stake, it's probably worth
optimizing this to something smaller than u64 as well. It feels like
it would be easy to use u32. You only need diffs here and it doesn't
seem like it would be hard to figure out how to use 32-bits, even if
you have to drop a bit of granularity.
It feels like you could even go down to 8-bits per stat, though. You
don't _really_ need such accurate percentages, right? Does it really
matter if you have 99.4%? It seems like just saying 99% would be fine.
Just do the math here and store the integral percentage per stat you
care about. The numbers 0 - 100 can be stored in 8 bits.
Combining that all together, I guess you could do this (untested):
1. Get the current stats and right shift them all by 10 and convert
them to 32-bit. This gives you (roughly) microseconds which will roll
over roughly every ~4000 seconds. That seems about right, though I
guess it's possible to do more and see if we can squeeze into 16 bits.
2. Sum up all 11 of the stats to get a total. This should still fit
into 32-bits.
3. Do the math to get the integral percentages and store those in the tail slot.
4. Store the 4 stats you care about plus the total (all 32-bits) in
"cpustat_old".
If I've got that right, your definitions should be able to be:
#define NUM_OLD_STATS_GROUPS 5
#define NUM_STATS_PER_GROUP 4
static DEFINE_PER_CPU(u32, cpustat_old[NUM_STATS_PER_GROUP + 1]);
static DEFINE_PER_CPU(u8,
cpustat_utilization[NUM_OLD_STATS_GROUPS][NUM_STATS_PER_GROUP]);
static DEFINE_PER_CPU(u8, cpustat_tail);
With the maximum number of CPUs, that's now this, if I got my math right.
4 * 8192 * 5 + 1 * 8192 * 5 * 4 + 1 * 8192 = 335,872 bytes.
That's a lot less, but still a lot. I'd be interested to hear other
opinions, but it's probably worth a Kconfig knob.
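For concreteness, here is a rough, untested sketch of steps 1-4 using
the definitions above; the idx[] table and the loop structure are
assumed, not a worked-out implementation:

static void update_cpustat(void)
{
	int i;
	u8 tail = this_cpu_read(cpustat_tail);
	u32 *old = this_cpu_ptr(cpustat_old);
	u8 (*util)[NUM_STATS_PER_GROUP] = this_cpu_ptr(cpustat_utilization);
	u32 now[NUM_STATS_PER_GROUP + 1] = { 0 };
	static const int idx[NUM_STATS_PER_GROUP] = {
		CPUTIME_SYSTEM, CPUTIME_SOFTIRQ, CPUTIME_IRQ, CPUTIME_IDLE,
	};
	struct kernel_cpustat kcpustat;
	u64 *cpustat = kcpustat.cpustat;

	kcpustat_cpu_fetch(&kcpustat, smp_processor_id());

	/* Steps 1+2: >>10 gives roughly microseconds and fits 32 bits;
	 * the extra slot accumulates the total over all NR_STATS fields. */
	for (i = 0; i < NR_STATS; i++)
		now[NUM_STATS_PER_GROUP] += (u32)(cpustat[i] >> 10);
	for (i = 0; i < NUM_STATS_PER_GROUP; i++)
		now[i] = (u32)(cpustat[idx[i]] >> 10);

	/* Step 3: integral percentages go into the tail slot as u8 */
	for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
		u32 total = now[NUM_STATS_PER_GROUP] - old[NUM_STATS_PER_GROUP];

		util[tail][i] = total ? 100 * (now[i] - old[i]) / total : 0;
	}

	/* Step 4: remember the four stats plus the total for next time */
	for (i = 0; i <= NUM_STATS_PER_GROUP; i++)
		old[i] = now[i];

	this_cpu_write(cpustat_tail, (tail + 1) % NUM_OLD_STATS_GROUPS);
}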
> +static void update_cpustat(void)
> +{
> + u64 *old = this_cpu_ptr(cpustat_old);
> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
> + int tail = this_cpu_read(cpustat_tail), i;
nit: Please define "i" on its own line. It looks weird the way you have it here.
> + struct kernel_cpustat kcpustat;
> + u64 *cpustat = kcpustat.cpustat;
> +
> + kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
> + for (i = 0; i < NR_STATS; i++) {
> + diff[tail][i] = cpustat[i] - old[i];
> + old[i] = cpustat[i];
> + }
> + this_cpu_write(cpustat_tail, (tail + 1) % 5);
> +}
> +
> +static void print_cpustat(void)
> +{
> + int i, j, k;
> + u64 a[5][NR_STATS], b[5][NR_STATS];
> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
> + int tail = this_cpu_read(cpustat_tail);
> + u32 period_us = sample_period / 1000;
> +
> + for (i = 0; i < 5; i++) {
> + for (j = 0; j < NR_STATS; j++) {
> + a[i][j] = 100 * (diff[i][j] / 1000);
> + b[i][j] = 10 * do_div(a[i][j], period_us);
> + do_div(b[i][j], period_us);
> + }
> + }
> + printk(KERN_CRIT "CPU#%d Utilization every %us during lockup:\n",
> + smp_processor_id(), period_us/1000000);
> + for (k = 0, i = tail; k < 5; k++, i = (i + 1) % 5) {
> + printk(KERN_CRIT "\t#%d: %llu.%llu%% system,\t%llu.%llu%% softirq,\t"
> + "%llu.%llu%% hardirq,\t%llu.%llu%% idle\n", k+1,
> + a[i][CPUTIME_SYSTEM], b[i][CPUTIME_SYSTEM],
> + a[i][CPUTIME_SOFTIRQ], b[i][CPUTIME_SOFTIRQ],
> + a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
> + a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
As per your response to Liu Song, I understand why you're using
"printk" instead of pr_crit(), but it deserves a comment.
On 2024/1/25 08:19, Doug Anderson wrote:
> Hi,
>
> On Tue, Jan 23, 2024 at 4:12 AM Bitao Hu <yaoma@linux.alibaba.com> wrote:
>>
>> The following softlockup is caused by an interrupt storm, but it cannot
>> be identified from the call tree, because the call tree is just a
>> snapshot and doesn't fully capture the behavior of the CPU during the
>> soft lockup.
>> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
>> ...
>> Call trace:
>> __do_softirq+0xa0/0x37c
>> __irq_exit_rcu+0x108/0x140
>> irq_exit+0x14/0x20
>> __handle_domain_irq+0x84/0xe0
>> gic_handle_irq+0x80/0x108
>> el0_irq_naked+0x50/0x58
>>
>> Therefore, I think it is necessary to report CPU utilization during the
>> softlockup_thresh period (one report every sample_period, for a total
>> of 5 reports), like this:
>> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
>> CPU#28 Utilization every 4s during lockup:
>> #1: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #2: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #3: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #4: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #5: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> ...
>>
>> This would be helpful in determining whether an interrupt storm has
>> occurred or in identifying the cause of the softlockup. The criteria for
>> determination are as follows:
>> a. If the hardirq utilization is high, an interrupt storm should be
>> suspected, and the root cause cannot be determined from the call tree.
>> b. If the softirq utilization is high, we can analyze the call tree,
>> but it may not reflect the root cause.
>> c. If the system utilization is high, we can analyze the root cause
>> from the call tree.
>
> Just to set the tone: this sounds like a great idea! I've found myself
> wanting something like this for a long time but I've never sat down to
> try to code up a mechanism. I have review comments below, but mostly
> it's just details. IMO the idea is definitely something we want to
> land.
Thanks for your support of this idea.
>
>
>> Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
>> ---
>> kernel/watchdog.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 58 insertions(+)
>>
>> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
>> index 81a8862295d6..9fad10e0a147 100644
>> --- a/kernel/watchdog.c
>> +++ b/kernel/watchdog.c
>> @@ -23,6 +23,8 @@
>> #include <linux/sched/debug.h>
>> #include <linux/sched/isolation.h>
>> #include <linux/stop_machine.h>
>> +#include <linux/kernel_stat.h>
>> +#include <linux/math64.h>
>>
>> #include <asm/irq_regs.h>
>> #include <linux/kvm_para.h>
>> @@ -441,6 +443,58 @@ static int is_softlockup(unsigned long touch_ts,
>> return 0;
>> }
>>
>> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
>> +static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
>> +static DEFINE_PER_CPU(u64, cpustat_diff[5][NR_STATS]);
>> +static DEFINE_PER_CPU(int, cpustat_tail);
>
> The above is potentially a lot of memory. On some configs we allow up
> to 8192 CPUs and you're storing (NR_STATS * 6) u64s per CPU plus
> another int per CPU. NR_STATS can be up to 11 currently, so if I did
> my math right then that's
>
> 8 * 8192 * 11 * 6 + 4 * 8192 = 4,358,144 bytes (!) That's probably not OK.
>
> In theory you only need 4 stats (plus the total) instead of 11. If you
> do that plus the "old" then that would give you (8 * 8192 * 5 * 6 + 4
> * 8192) = 1,998,848 bytes. That's still a lot, but at least less.
> ...so at the very least you should only store the fields you need.
>
> Given the amount of potential space at stake, it's probably worth
> optimizing this to something smaller than u64 as well. It feels like
> it would be easy to use u32. You only need diffs here and it doesn't
> seem like it would be hard to figure out how to use 32-bits, even if
> you have to drop a bit of granularity.
>
> It feels like you could even go down to 8-bits per stat, though. You
> don't _really_ need such accurate percentages, right? Does it really
> matter if you have 99.4%? It seems like just saying 99% would be fine.
> Just do the math here and store the integral percentage per stat you
> care about. The numbers 0 - 100 can be stored in 8 bits.
>
Thanks for your detailed analysis and optimization of memory
consumption. I indeed hadn't considered this point, and I will
update it in v2.
> Combining that all together, I guess you could do this (untested):
>
> 1. Get the current stats and right shift them all by 10 and convert
> them to 32-bit. This gives you (roughly) microseconds which will roll
> over roughly every ~4000 seconds. That seems about right, though I
> guess it's possible to do more and see if we can squeeze into 16 bits.
>
> 2. Sum up all 11 of the stats to get a total. This should still fit
> into 32-bits.
>
> 3. Do the math to get the integral percentages and store those in the tail slot.
>
I'm a bit confused, why is there a need to sum up all 11 of the stats to
get a total? I calculate the percentage using sample_period.
> 4. Store the 4 stats you care about plus the total (all 32-bits) in
> "cpustat_old".
>
> If I've got that right, your definitions should be able to be:
>
> #define NUM_OLD_STATS_GROUPS 5
> #define NUM_STATS_PER_GROUP 4
> static DEFINE_PER_CPU(u32, cpustat_old[NUM_STATS_PER_GROUP + 1]);
> static DEFINE_PER_CPU(u8,
> cpustat_utilization[NUM_OLD_STATS_GROUPS][NUM_STATS_PER_GROUP]);
> static DEFINE_PER_CPU(u8, cpustat_tail);
>
> With the maximum number of CPUs, that's now this, if I got my math right.
>
> 4 * 8192 * 5 + 1 * 8192 * 5 * 4 + 1 * 8192 = 335,872 bytes.
>
> That's a lot less, but still a lot. I'd be interested to hear other
> opinions, but it's probably worth a Kconfig knob.
>
>
>> +static void update_cpustat(void)
>> +{
>> + u64 *old = this_cpu_ptr(cpustat_old);
>> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
>> + int tail = this_cpu_read(cpustat_tail), i;
>
> nit: Please define "i" on its own line. It looks weird the way you have it here.
Sure.
>
>> + struct kernel_cpustat kcpustat;
>> + u64 *cpustat = kcpustat.cpustat;
>> +
>> + kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
>> + for (i = 0; i < NR_STATS; i++) {
>> + diff[tail][i] = cpustat[i] - old[i];
>> + old[i] = cpustat[i];
>> + }
>> + this_cpu_write(cpustat_tail, (tail + 1) % 5);
>> +}
>> +
>> +static void print_cpustat(void)
>> +{
>> + int i, j, k;
>> + u64 a[5][NR_STATS], b[5][NR_STATS];
>> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
>> + int tail = this_cpu_read(cpustat_tail);
>> + u32 period_us = sample_period / 1000;
>> +
>> + for (i = 0; i < 5; i++) {
>> + for (j = 0; j < NR_STATS; j++) {
>> + a[i][j] = 100 * (diff[i][j] / 1000);
>> + b[i][j] = 10 * do_div(a[i][j], period_us);
>> + do_div(b[i][j], period_us);
>> + }
>> + }
>> + printk(KERN_CRIT "CPU#%d Utilization every %us during lockup:\n",
>> + smp_processor_id(), period_us/1000000);
>> + for (k = 0, i = tail; k < 5; k++, i = (i + 1) % 5) {
>> + printk(KERN_CRIT "\t#%d: %llu.%llu%% system,\t%llu.%llu%% softirq,\t"
>> + "%llu.%llu%% hardirq,\t%llu.%llu%% idle\n", k+1,
>> + a[i][CPUTIME_SYSTEM], b[i][CPUTIME_SYSTEM],
>> + a[i][CPUTIME_SOFTIRQ], b[i][CPUTIME_SOFTIRQ],
>> + a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
>> + a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
>
> As per your response to Liu Song, I understand why you're using
> "printk" instead of pr_crit(), but it deserves a comment.
Sure, I will add a comment here.
Hi,

On Thu, Jan 25, 2024 at 12:31 AM Bitao Hu <yaoma@linux.alibaba.com> wrote:
>
> > 2. Sum up all 11 of the stats to get a total. This should still fit
> > into 32-bits.
> >
> > 3. Do the math to get the integral percentages and store those in the tail slot.
> >
> I'm a bit confused, why is there a need to sum up all 11 of the stats to
> get a total? I calculate the percentage using sample_period.

Oh, you're right! I saw you looping over NR_STATS in print_cpustat()
and assumed that was because you were totaling them. Yeah, using
sample_period should work.

-Doug
On 2024/1/23 20:12, Bitao Hu wrote:
> The following softlockup is caused by an interrupt storm, but it cannot
> be identified from the call tree, because the call tree is just a
> snapshot and doesn't fully capture the behavior of the CPU during the
> soft lockup.
> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
> ...
> Call trace:
> __do_softirq+0xa0/0x37c
> __irq_exit_rcu+0x108/0x140
> irq_exit+0x14/0x20
> __handle_domain_irq+0x84/0xe0
> gic_handle_irq+0x80/0x108
> el0_irq_naked+0x50/0x58
>
> Therefore, I think it is necessary to report CPU utilization during the
> softlockup_thresh period (one report every sample_period, for a total
> of 5 reports), like this:
> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
> CPU#28 Utilization every 4s during lockup:
> #1: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #2: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #3: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #4: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> #5: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
> ...
>
> This would be helpful in determining whether an interrupt storm has
> occurred or in identifying the cause of the softlockup. The criteria for
> determination are as follows:
> a. If the hardirq utilization is high, an interrupt storm should be
> suspected, and the root cause cannot be determined from the call tree.
> b. If the softirq utilization is high, we can analyze the call tree,
> but it may not reflect the root cause.
> c. If the system utilization is high, we can analyze the root cause
> from the call tree.
>
> Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
> ---
> kernel/watchdog.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 58 insertions(+)
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 81a8862295d6..9fad10e0a147 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -23,6 +23,8 @@
> #include <linux/sched/debug.h>
> #include <linux/sched/isolation.h>
> #include <linux/stop_machine.h>
> +#include <linux/kernel_stat.h>
> +#include <linux/math64.h>
>
> #include <asm/irq_regs.h>
> #include <linux/kvm_para.h>
> @@ -441,6 +443,58 @@ static int is_softlockup(unsigned long touch_ts,
> return 0;
> }
>
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
> +static DEFINE_PER_CPU(u64, cpustat_diff[5][NR_STATS]);
> +static DEFINE_PER_CPU(int, cpustat_tail);
> +
> +static void update_cpustat(void)
> +{
> + u64 *old = this_cpu_ptr(cpustat_old);
> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
> + int tail = this_cpu_read(cpustat_tail), i;
> + struct kernel_cpustat kcpustat;
> + u64 *cpustat = kcpustat.cpustat;
> +
> + kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
> + for (i = 0; i < NR_STATS; i++) {
> + diff[tail][i] = cpustat[i] - old[i];
> + old[i] = cpustat[i];
> + }
> + this_cpu_write(cpustat_tail, (tail + 1) % 5);
The number 5 here is related to the 5 in cpustat_diff[5], and it is
recommended to use a macro definition instead of using the number 5
directly.
> +}
> +
> +static void print_cpustat(void)
> +{
> + int i, j, k;
> + u64 a[5][NR_STATS], b[5][NR_STATS];
Use a #define instead of the literal number 5.
> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
> + int tail = this_cpu_read(cpustat_tail);
> + u32 period_us = sample_period / 1000;
Use NSEC_PER_USEC
> +
> + for (i = 0; i < 5; i++) {
> + for (j = 0; j < NR_STATS; j++) {
> + a[i][j] = 100 * (diff[i][j] / 1000);
> + b[i][j] = 10 * do_div(a[i][j], period_us);
> + do_div(b[i][j], period_us);
> + }
> + }
> + printk(KERN_CRIT "CPU#%d Utilization every %us during lockup:\n",
better use "pr_crit", and was the original intent here microseconds (us)
or milliseconds (ms)?
> + smp_processor_id(), period_us/1000000);
better use "period_us /NSEC_PER_MSEC"?
> + for (k = 0, i = tail; k < 5; k++, i = (i + 1) % 5) {
It seems that only i and j are necessary, k is not essential.
> + printk(KERN_CRIT "\t#%d: %llu.%llu%% system,\t%llu.%llu%% softirq,\t"
> + "%llu.%llu%% hardirq,\t%llu.%llu%% idle\n", k+1,
> + a[i][CPUTIME_SYSTEM], b[i][CPUTIME_SYSTEM],
> + a[i][CPUTIME_SOFTIRQ], b[i][CPUTIME_SOFTIRQ],
> + a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
> + a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
> + }
> +}
> +#else
> +static inline void update_cpustat(void) { }
> +static inline void print_cpustat(void) { }
> +#endif
> +
> /* watchdog detector functions */
> static DEFINE_PER_CPU(struct completion, softlockup_completion);
> static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
> @@ -504,6 +558,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
> */
> period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
>
> + /* update cpu usage stat */
The function name already indicates that it updates CPU usage
statistics, so the comment here appears superfluous.
If a comment is absolutely necessary, please provide more detailed
information.
> + update_cpustat();
> +
> /* Reset the interval when touched by known problematic code. */
> if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
> if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
> @@ -539,6 +596,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
> pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
> smp_processor_id(), duration,
> current->comm, task_pid_nr(current));
> + print_cpustat();
> print_modules();
> print_irqtrace_events(current);
> if (regs)
On 2024/1/24 09:43, Liu Song wrote:
>
> On 2024/1/23 20:12, Bitao Hu wrote:
>> The following softlockup is caused by an interrupt storm, but it cannot
>> be identified from the call tree, because the call tree is just a
>> snapshot and doesn't fully capture the behavior of the CPU during the
>> soft lockup.
>> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
>> ...
>> Call trace:
>> __do_softirq+0xa0/0x37c
>> __irq_exit_rcu+0x108/0x140
>> irq_exit+0x14/0x20
>> __handle_domain_irq+0x84/0xe0
>> gic_handle_irq+0x80/0x108
>> el0_irq_naked+0x50/0x58
>>
>> Therefore, I think it is necessary to report CPU utilization during the
>> softlockup_thresh period (one report every sample_period, for a total
>> of 5 reports), like this:
>> watchdog: BUG: soft lockup - CPU#28 stuck for 23s! [fio:83921]
>> CPU#28 Utilization every 4s during lockup:
>> #1: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #2: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #3: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #4: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> #5: 0.0% system, 0.0% softirq, 100.0% hardirq, 0.0% idle
>> ...
>>
>> This would be helpful in determining whether an interrupt storm has
>> occurred or in identifying the cause of the softlockup. The criteria for
>> determination are as follows:
>> a. If the hardirq utilization is high, an interrupt storm should be
>> suspected, and the root cause cannot be determined from the call tree.
>> b. If the softirq utilization is high, we can analyze the call tree,
>> but it may not reflect the root cause.
>> c. If the system utilization is high, we can analyze the root cause
>> from the call tree.
>>
>> Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
>> ---
>> kernel/watchdog.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 58 insertions(+)
>>
>> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
>> index 81a8862295d6..9fad10e0a147 100644
>> --- a/kernel/watchdog.c
>> +++ b/kernel/watchdog.c
>> @@ -23,6 +23,8 @@
>> #include <linux/sched/debug.h>
>> #include <linux/sched/isolation.h>
>> #include <linux/stop_machine.h>
>> +#include <linux/kernel_stat.h>
>> +#include <linux/math64.h>
>> #include <asm/irq_regs.h>
>> #include <linux/kvm_para.h>
>> @@ -441,6 +443,58 @@ static int is_softlockup(unsigned long touch_ts,
>> return 0;
>> }
>> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
>> +static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
>> +static DEFINE_PER_CPU(u64, cpustat_diff[5][NR_STATS]);
>> +static DEFINE_PER_CPU(int, cpustat_tail);
>> +
>> +static void update_cpustat(void)
>> +{
>> + u64 *old = this_cpu_ptr(cpustat_old);
>> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
>> + int tail = this_cpu_read(cpustat_tail), i;
>> + struct kernel_cpustat kcpustat;
>> + u64 *cpustat = kcpustat.cpustat;
>> +
>> + kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
>> + for (i = 0; i < NR_STATS; i++) {
>> + diff[tail][i] = cpustat[i] - old[i];
>> + old[i] = cpustat[i];
>> + }
>> + this_cpu_write(cpustat_tail, (tail + 1) % 5);
> The number 5 here is related to the 5 in cpustat_diff[5], and it is
> recommended to use a macro definition instead of using the number 5
> directly.
In the "set_sample_period" function, the "sample_period" is hardcoded to
be 1/5 of the "softlockup_thresh", therefore I define the length of the
array here as 5.
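For v2, a minimal sketch of the suggested macro; NUM_SAMPLE_PERIODS is
an assumed name, tied to the 1/5 ratio hardcoded in set_sample_period():

/* sample_period is softlockup_thresh / 5, so keep 5 periods of history */
#define NUM_SAMPLE_PERIODS 5

static DEFINE_PER_CPU(u64, cpustat_old[NR_STATS]);
static DEFINE_PER_CPU(u64, cpustat_diff[NUM_SAMPLE_PERIODS][NR_STATS]);
static DEFINE_PER_CPU(int, cpustat_tail);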
>> +}
>> +
>> +static void print_cpustat(void)
>> +{
>> + int i, j, k;
>> + u64 a[5][NR_STATS], b[5][NR_STATS];
> Use a #define instead of the literal number 5.
Same as above.
>> + u64 (*diff)[NR_STATS] = this_cpu_ptr(cpustat_diff);
>> + int tail = this_cpu_read(cpustat_tail);
>> + u32 period_us = sample_period / 1000;
> Use NSEC_PER_USEC
Sure.
>> +
>> + for (i = 0; i < 5; i++) {
>> + for (j = 0; j < NR_STATS; j++) {
>> + a[i][j] = 100 * (diff[i][j] / 1000);
>> + b[i][j] = 10 * do_div(a[i][j], period_us);
>> + do_div(b[i][j], period_us);
>> + }
>> + }
>> + printk(KERN_CRIT "CPU#%d Utilization every %us during lockup:\n",
> better use "pr_crit", and was the original intent here microseconds (us)
> or milliseconds (ms)?
Using "pr_crit", each line will have a "watchdog:" prefix, which I think
might not look very neat.
The intended unit here is seconds (s).
>> + smp_processor_id(), period_us/1000000);
> better use "period_us /NSEC_PER_MSEC"?
Sure, I will use USEC_PER_SEC here.
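Putting both of those answers together, the call site in v2 might look
like this (the comment wording is assumed):

	/*
	 * Use printk() with an explicit KERN_CRIT level instead of
	 * pr_crit(): pr_fmt would prepend "watchdog: " to every line
	 * of the table and make it harder to read.
	 */
	printk(KERN_CRIT "CPU#%d Utilization every %us during lockup:\n",
	       smp_processor_id(), period_us / (u32)USEC_PER_SEC);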
>> + for (k = 0, i = tail; k < 5; k++, i = (i + 1) % 5) {
>
> It seems that only i and j are necessary, k is not essential.
Sure, I will remove the variable 'k'.
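With 'k' removed, a small sketch of the reindexed loop, reusing the
NUM_SAMPLE_PERIODS macro assumed earlier (the 'group' name is assumed):

	for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
		int group = (tail + i) % NUM_SAMPLE_PERIODS;

		printk(KERN_CRIT "\t#%d: %llu.%llu%% system,\t%llu.%llu%% softirq,\t"
			"%llu.%llu%% hardirq,\t%llu.%llu%% idle\n", i + 1,
			a[group][CPUTIME_SYSTEM], b[group][CPUTIME_SYSTEM],
			a[group][CPUTIME_SOFTIRQ], b[group][CPUTIME_SOFTIRQ],
			a[group][CPUTIME_IRQ], b[group][CPUTIME_IRQ],
			a[group][CPUTIME_IDLE], b[group][CPUTIME_IDLE]);
	}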
>
>> + printk(KERN_CRIT "\t#%d: %llu.%llu%% system,\t%llu.%llu%%
>> softirq,\t"
>> + "%llu.%llu%% hardirq,\t%llu.%llu%% idle\n", k+1,
>> + a[i][CPUTIME_SYSTEM], b[i][CPUTIME_SYSTEM],
>> + a[i][CPUTIME_SOFTIRQ], b[i][CPUTIME_SOFTIRQ],
>> + a[i][CPUTIME_IRQ], b[i][CPUTIME_IRQ],
>> + a[i][CPUTIME_IDLE], b[i][CPUTIME_IDLE]);
>> + }
>> +}
>> +#else
>> +static inline void update_cpustat(void) { }
>> +static inline void print_cpustat(void) { }
>> +#endif
>> +
>> /* watchdog detector functions */
>> static DEFINE_PER_CPU(struct completion, softlockup_completion);
>> static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
>> @@ -504,6 +558,9 @@ static enum hrtimer_restart
>> watchdog_timer_fn(struct hrtimer *hrtimer)
>> */
>> period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
>> + /* update cpu usage stat */
> The function name already indicates that it updates CPU usage
> statistics, so the comment here appears superfluous.
> If a comment is absolutely necessary, please provide more detailed
> information.
Sure.
>> + update_cpustat();
>> +
>> /* Reset the interval when touched by known problematic code. */
>> if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
>> if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
>> @@ -539,6 +596,7 @@ static enum hrtimer_restart
>> watchdog_timer_fn(struct hrtimer *hrtimer)
>> pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
>> smp_processor_id(), duration,
>> current->comm, task_pid_nr(current));
>> + print_cpustat();
>> print_modules();
>> print_irqtrace_events(current);
>> if (regs)