[RFC PATCH bpf-next 1/2] Enhance BPF execution timing by excluding IRQ time

Jianlin Lv posted 2 patches 8 months ago
[RFC PATCH bpf-next 1/2] Enhance BPF execution timing by excluding IRQ time
Posted by Jianlin Lv 8 months ago
From: Jianlin Lv <iecedge@gmail.com>

This commit excludes IRQ time from the total execution duration of BPF
programs. When CONFIG_IRQ_TIME_ACCOUNTING is enabled, IRQ time is
accounted for separately, offering a more accurate assessment of CPU
usage for BPF programs.

Signed-off-by: Jianlin Lv <iecedge@gmail.com>
---
 include/linux/filter.h | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5cf4d35d83e..3e0f975176a6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -703,12 +703,32 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 	cant_migrate();
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 		struct bpf_prog_stats *stats;
-		u64 duration, start = sched_clock();
+		u64 duration, start, start_time, end_time, irq_delta;
 		unsigned long flags;
+		unsigned int cpu;
 
-		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+		if (in_task()) {
+			cpu = get_cpu();
+			put_cpu();
+			start_time = irq_time_read(cpu);
+		}
+		#endif
 
+		start = sched_clock();
+		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 		duration = sched_clock() - start;
+
+		#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+		if (in_task()) {
+			end_time = irq_time_read(cpu);
+			if (end_time > start_time) {
+				irq_delta = end_time - start_time;
+				duration -= irq_delta;
+			}
+		}
+		#endif
+
 		stats = this_cpu_ptr(prog->stats);
 		flags = u64_stats_update_begin_irqsave(&stats->syncp);
 		u64_stats_inc(&stats->cnt);
-- 
2.34.1
Re: [RFC PATCH bpf-next 1/2] Enhance BPF execution timing by excluding IRQ time
Posted by Alexei Starovoitov 8 months ago
On Tue, Apr 22, 2025 at 6:47 AM Jianlin Lv <iecedge@gmail.com> wrote:
>
> From: Jianlin Lv <iecedge@gmail.com>
>
> This commit excludes IRQ time from the total execution duration of BPF
> programs. When CONFIG_IRQ_TIME_ACCOUNTING is enabled, IRQ time is
> accounted for separately, offering a more accurate assessment of CPU
> usage for BPF programs.
>
> Signed-off-by: Jianlin Lv <iecedge@gmail.com>
> ---
>  include/linux/filter.h | 24 ++++++++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index f5cf4d35d83e..3e0f975176a6 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -703,12 +703,32 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>         cant_migrate();
>         if (static_branch_unlikely(&bpf_stats_enabled_key)) {
>                 struct bpf_prog_stats *stats;
> -               u64 duration, start = sched_clock();
> +               u64 duration, start, start_time, end_time, irq_delta;
>                 unsigned long flags;
> +               unsigned int cpu;
>
> -               ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> +               #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +               if (in_task()) {
> +                       cpu = get_cpu();
> +                       put_cpu();
> +                       start_time = irq_time_read(cpu);
> +               }
> +               #endif
>
> +               start = sched_clock();
> +               ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
>                 duration = sched_clock() - start;
> +
> +               #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +               if (in_task()) {
> +                       end_time = irq_time_read(cpu);
> +                       if (end_time > start_time) {
> +                               irq_delta = end_time - start_time;
> +                               duration -= irq_delta;
> +                       }
> +               }
> +               #endif
> +

This is way too much overhead.
This timing loop is optimized to measure bpf prog runtime.
See commit ce09cbdd9888 ("bpf: Improve program stats run-time calculation")
IRQ can happen and distort the numbers, but you shouldn't
be running with bpf_stats_enabled for a long time.
You need to sample it instead.
Every couple minutes turn it on for a second, capture the stats
and aggregate over time. Filter out outliers due to IRQ or whatever.
Re: [RFC PATCH bpf-next 1/2] Enhance BPF execution timing by excluding IRQ time
Posted by Peter Zijlstra 8 months ago
On Tue, Apr 22, 2025 at 09:47:26PM +0800, Jianlin Lv wrote:
> From: Jianlin Lv <iecedge@gmail.com>
> 
> This commit excludes IRQ time from the total execution duration of BPF
> programs. When CONFIG_IRQ_TIME_ACCOUNTING is enabled, IRQ time is
> accounted for separately, offering a more accurate assessment of CPU
> usage for BPF programs.
> 
> Signed-off-by: Jianlin Lv <iecedge@gmail.com>
> ---
>  include/linux/filter.h | 24 ++++++++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index f5cf4d35d83e..3e0f975176a6 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -703,12 +703,32 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>  	cant_migrate();
>  	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
>  		struct bpf_prog_stats *stats;
> -		u64 duration, start = sched_clock();
> +		u64 duration, start, start_time, end_time, irq_delta;
>  		unsigned long flags;
> +		unsigned int cpu;
>  
> -		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> +		#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +		if (in_task()) {
> +			cpu = get_cpu();
> +			put_cpu();
> +			start_time = irq_time_read(cpu);

This is all sorts of daft.. you don't need get_cpu()/put_cpu().

> +		}
> +		#endif
>  
> +		start = sched_clock();
> +		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
>  		duration = sched_clock() - start;
> +
> +		#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +		if (in_task()) {
> +			end_time = irq_time_read(cpu);
> +			if (end_time > start_time) {
> +				irq_delta = end_time - start_time;
> +				duration -= irq_delta;
> +			}
> +		}
> +		#endif

This is really dodgy coding style. Please keep the preprocessor
directives at column 0.

What do you think about steal-time, do you want to remove that from your
BPF runtime too?

If so, perhaps expose the scheduler's clock_task, which does both things
already?