[PATCH RESEND] sched/stats: Optimize /proc/schedstat printing

Dmitry Ilvokhin posted 1 patch 3 months, 1 week ago
kernel/sched/stats.c | 86 ++++++++++++++++++++++++++------------------
1 file changed, 52 insertions(+), 34 deletions(-)
[PATCH RESEND] sched/stats: Optimize /proc/schedstat printing
Posted by Dmitry Ilvokhin 3 months, 1 week ago
The seq_printf function supports rich format strings for printing
decimals, but /proc/schedstat has no need for them, since the majority
of the data is space-separated decimals. Use seq_put_decimal_ull instead
as a faster alternative.

Performance counter stats (truncated) for sh -c 'cat /proc/schedstat >
/dev/null' before and after applying the patch, from a machine with 72
CPUs, are below.

Before:

      2.94 msec task-clock               #    0.820 CPUs utilized
         1      context-switches         #  340.551 /sec
         0      cpu-migrations           #    0.000 /sec
       340      page-faults              #  115.787 K/sec
10,327,200      instructions             #    1.89  insn per cycle
                                         #    0.10  stalled cycles per insn
 5,458,307      cycles                   #    1.859 GHz
 1,052,733      stalled-cycles-frontend  #   19.29% frontend cycles idle
 2,066,321      branches                 #  703.687 M/sec
    25,621      branch-misses            #    1.24% of all branches

0.00357974 +- 0.00000209 seconds time elapsed  ( +-  0.06% )

After:

      2.50 msec task-clock              #    0.785 CPUs utilized
         1      context-switches        #  399.780 /sec
         0      cpu-migrations          #    0.000 /sec
       340      page-faults             #  135.925 K/sec
 7,371,867      instructions            #    1.59  insn per cycle
                                        #    0.13  stalled cycles per insn
 4,647,053      cycles                  #    1.858 GHz
   986,487      stalled-cycles-frontend #   21.23% frontend cycles idle
 1,591,374      branches                #  636.199 M/sec
    28,973      branch-misses           #    1.82% of all branches

0.00318461 +- 0.00000295 seconds time elapsed  ( +-  0.09% )

This is a ~11% (relative) improvement in elapsed time.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 kernel/sched/stats.c | 86 ++++++++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index d1c9429a4ac5..b304f821e8ff 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -98,6 +98,56 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
 	}
 }
 
+static void show_runqueue_stats(struct seq_file *seq, int cpu, struct rq *rq)
+{
+	seq_printf(seq, "cpu%d", cpu);
+	seq_put_decimal_ull(seq, " ", rq->yld_count);
+	seq_put_decimal_ull(seq, " ", 0);
+	seq_put_decimal_ull(seq, " ", rq->sched_count);
+	seq_put_decimal_ull(seq, " ", rq->sched_goidle);
+	seq_put_decimal_ull(seq, " ", rq->ttwu_count);
+	seq_put_decimal_ull(seq, " ", rq->ttwu_local);
+	seq_put_decimal_ull(seq, " ", rq->rq_cpu_time);
+	seq_put_decimal_ull(seq, " ", rq->rq_sched_info.run_delay);
+	seq_put_decimal_ull(seq, " ", rq->rq_sched_info.pcount);
+	seq_putc(seq, '\n');
+}
+
+static void show_domain_stats(struct seq_file *seq, int dcount,
+			      struct sched_domain *sd)
+{
+	enum cpu_idle_type itype;
+
+	seq_printf(seq, "domain%d %s %*pb", dcount, sd->name,
+		   cpumask_pr_args(sched_domain_span(sd)));
+	for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
+		seq_put_decimal_ull(seq, " ", sd->lb_count[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_balanced[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_failed[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_load[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_util[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_task[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_misfit[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_gained[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_hot_gained[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_nobusyq[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_nobusyg[itype]);
+	}
+	seq_put_decimal_ull(seq, " ", sd->alb_count);
+	seq_put_decimal_ull(seq, " ", sd->alb_failed);
+	seq_put_decimal_ull(seq, " ", sd->alb_pushed);
+	seq_put_decimal_ull(seq, " ", sd->sbe_count);
+	seq_put_decimal_ull(seq, " ", sd->sbe_balanced);
+	seq_put_decimal_ull(seq, " ", sd->sbe_pushed);
+	seq_put_decimal_ull(seq, " ", sd->sbf_count);
+	seq_put_decimal_ull(seq, " ", sd->sbf_balanced);
+	seq_put_decimal_ull(seq, " ", sd->sbf_pushed);
+	seq_put_decimal_ull(seq, " ", sd->ttwu_wake_remote);
+	seq_put_decimal_ull(seq, " ", sd->ttwu_move_affine);
+	seq_put_decimal_ull(seq, " ", sd->ttwu_move_balance);
+	seq_putc(seq, '\n');
+}
+
 /*
  * Current schedstat API version.
  *
@@ -121,44 +171,12 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		rq = cpu_rq(cpu);
 
 		/* runqueue-specific stats */
-		seq_printf(seq,
-		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
-		    cpu, rq->yld_count,
-		    rq->sched_count, rq->sched_goidle,
-		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_cpu_time,
-		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
-
-		seq_printf(seq, "\n");
+		show_runqueue_stats(seq, cpu, rq);
 
 		/* domain-specific stats */
 		rcu_read_lock();
 		for_each_domain(cpu, sd) {
-			enum cpu_idle_type itype;
-
-			seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
-				   cpumask_pr_args(sched_domain_span(sd)));
-			for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
-				seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
-				    sd->lb_count[itype],
-				    sd->lb_balanced[itype],
-				    sd->lb_failed[itype],
-				    sd->lb_imbalance_load[itype],
-				    sd->lb_imbalance_util[itype],
-				    sd->lb_imbalance_task[itype],
-				    sd->lb_imbalance_misfit[itype],
-				    sd->lb_gained[itype],
-				    sd->lb_hot_gained[itype],
-				    sd->lb_nobusyq[itype],
-				    sd->lb_nobusyg[itype]);
-			}
-			seq_printf(seq,
-				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
-			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
-			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
-			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
-			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
-			    sd->ttwu_move_balance);
+			show_domain_stats(seq, dcount++, sd);
 		}
 		rcu_read_unlock();
 	}
-- 
2.47.3
Re: [PATCH RESEND] sched/stats: Optimize /proc/schedstat printing
Posted by Peter Zijlstra 3 months, 1 week ago
On Wed, Oct 29, 2025 at 01:07:15PM +0000, Dmitry Ilvokhin wrote:
> Function seq_printf supports rich format string for decimals printing,
> but there is no need for it in /proc/schedstat, since majority of the
> data is space separared decimals. Use seq_put_decimal_ull instead as
> faster alternative.
> 
> Performance counter stats (truncated) for sh -c 'cat /proc/schedstat >
> /dev/null' before and after applying the patch from machine with 72 CPUs
> are below.
> 
> Before:
> 
>       2.94 msec task-clock               #    0.820 CPUs utilized
>          1      context-switches         #  340.551 /sec
>          0      cpu-migrations           #    0.000 /sec
>        340      page-faults              #  115.787 K/sec
> 10,327,200      instructions             #    1.89  insn per cycle
>                                          #    0.10  stalled cycles per insn
>  5,458,307      cycles                   #    1.859 GHz
>  1,052,733      stalled-cycles-frontend  #   19.29% frontend cycles idle
>  2,066,321      branches                 #  703.687 M/sec
>     25,621      branch-misses            #    1.24% of all branches
> 
> 0.00357974 +- 0.00000209 seconds time elapsed  ( +-  0.06% )
> 
> After:
> 
>       2.50 msec task-clock              #    0.785 CPUs utilized
>          1      context-switches        #  399.780 /sec
>          0      cpu-migrations          #    0.000 /sec
>        340      page-faults             #  135.925 K/sec
>  7,371,867      instructions            #    1.59  insn per cycle
>                                         #    0.13  stalled cycles per insn
>  4,647,053      cycles                  #    1.858 GHz
>    986,487      stalled-cycles-frontend #   21.23% frontend cycles idle
>  1,591,374      branches                #  636.199 M/sec
>     28,973      branch-misses           #    1.82% of all branches
> 
> 0.00318461 +- 0.00000295 seconds time elapsed  ( +-  0.09% )
> 
> This is ~11% (relative) improvement in time elapsed.

Yeah, but who cares? Why do we want less obvious code for a silly stats
file?
Re: [PATCH RESEND] sched/stats: Optimize /proc/schedstat printing
Posted by Dmitry Ilvokhin 3 months, 1 week ago
On Wed, Oct 29, 2025 at 03:07:55PM +0100, Peter Zijlstra wrote:
> On Wed, Oct 29, 2025 at 01:07:15PM +0000, Dmitry Ilvokhin wrote:
> > Function seq_printf supports rich format string for decimals printing,
> > but there is no need for it in /proc/schedstat, since majority of the
> > data is space separared decimals. Use seq_put_decimal_ull instead as
> > faster alternative.
> > 
> > Performance counter stats (truncated) for sh -c 'cat /proc/schedstat >
> > /dev/null' before and after applying the patch from machine with 72 CPUs
> > are below.
> > 
> > Before:
> > 
> >       2.94 msec task-clock               #    0.820 CPUs utilized
> >          1      context-switches         #  340.551 /sec
> >          0      cpu-migrations           #    0.000 /sec
> >        340      page-faults              #  115.787 K/sec
> > 10,327,200      instructions             #    1.89  insn per cycle
> >                                          #    0.10  stalled cycles per insn
> >  5,458,307      cycles                   #    1.859 GHz
> >  1,052,733      stalled-cycles-frontend  #   19.29% frontend cycles idle
> >  2,066,321      branches                 #  703.687 M/sec
> >     25,621      branch-misses            #    1.24% of all branches
> > 
> > 0.00357974 +- 0.00000209 seconds time elapsed  ( +-  0.06% )
> > 
> > After:
> > 
> >       2.50 msec task-clock              #    0.785 CPUs utilized
> >          1      context-switches        #  399.780 /sec
> >          0      cpu-migrations          #    0.000 /sec
> >        340      page-faults             #  135.925 K/sec
> >  7,371,867      instructions            #    1.59  insn per cycle
> >                                         #    0.13  stalled cycles per insn
> >  4,647,053      cycles                  #    1.858 GHz
> >    986,487      stalled-cycles-frontend #   21.23% frontend cycles idle
> >  1,591,374      branches                #  636.199 M/sec
> >     28,973      branch-misses           #    1.82% of all branches
> > 
> > 0.00318461 +- 0.00000295 seconds time elapsed  ( +-  0.09% )
> > 
> > This is ~11% (relative) improvement in time elapsed.
> 
> Yeah, but who cares? Why do we want less obvious code for a silly stats
> file?

Thanks for the feedback, Peter.

Fair point that /proc/schedstat isn’t a hot path in the kernel itself,
but it is a hot path for monitoring software (Prometheus for example).
In large fleets, these files are polled periodically (often every few
seconds) on every machine. The cumulative overhead adds up quickly
across thousands of nodes, so reducing the cost of generating these
stats does have a measurable operational impact. With the ongoing trend
toward higher core counts per machine, this cost becomes even more
noticeable over time.

I've tried to keep the code as readable as possible, but I understand if
you think an ~11% improvement isn't worth the added complexity. If you
have suggestions for making the code cleaner or the intent clearer, I’d
be happy to rework it.
Re: [PATCH RESEND] sched/stats: Optimize /proc/schedstat printing
Posted by Peter Zijlstra 3 months, 1 week ago
On Wed, Oct 29, 2025 at 02:46:33PM +0000, Dmitry Ilvokhin wrote:
> On Wed, Oct 29, 2025 at 03:07:55PM +0100, Peter Zijlstra wrote:
> > On Wed, Oct 29, 2025 at 01:07:15PM +0000, Dmitry Ilvokhin wrote:
> > > Function seq_printf supports rich format string for decimals printing,
> > > but there is no need for it in /proc/schedstat, since majority of the
> > > data is space separared decimals. Use seq_put_decimal_ull instead as
> > > faster alternative.
> > > 
> > > Performance counter stats (truncated) for sh -c 'cat /proc/schedstat >
> > > /dev/null' before and after applying the patch from machine with 72 CPUs
> > > are below.
> > > 
> > > Before:
> > > 
> > >       2.94 msec task-clock               #    0.820 CPUs utilized
> > >          1      context-switches         #  340.551 /sec
> > >          0      cpu-migrations           #    0.000 /sec
> > >        340      page-faults              #  115.787 K/sec
> > > 10,327,200      instructions             #    1.89  insn per cycle
> > >                                          #    0.10  stalled cycles per insn
> > >  5,458,307      cycles                   #    1.859 GHz
> > >  1,052,733      stalled-cycles-frontend  #   19.29% frontend cycles idle
> > >  2,066,321      branches                 #  703.687 M/sec
> > >     25,621      branch-misses            #    1.24% of all branches
> > > 
> > > 0.00357974 +- 0.00000209 seconds time elapsed  ( +-  0.06% )
> > > 
> > > After:
> > > 
> > >       2.50 msec task-clock              #    0.785 CPUs utilized
> > >          1      context-switches        #  399.780 /sec
> > >          0      cpu-migrations          #    0.000 /sec
> > >        340      page-faults             #  135.925 K/sec
> > >  7,371,867      instructions            #    1.59  insn per cycle
> > >                                         #    0.13  stalled cycles per insn
> > >  4,647,053      cycles                  #    1.858 GHz
> > >    986,487      stalled-cycles-frontend #   21.23% frontend cycles idle
> > >  1,591,374      branches                #  636.199 M/sec
> > >     28,973      branch-misses           #    1.82% of all branches
> > > 
> > > 0.00318461 +- 0.00000295 seconds time elapsed  ( +-  0.09% )
> > > 
> > > This is ~11% (relative) improvement in time elapsed.
> > 
> > Yeah, but who cares? Why do we want less obvious code for a silly stats
> > file?
> 
> Thanks for the feedback, Peter.
> 
> Fair point that /proc/schedstat isn’t a hot path in the kernel itself,
> but it is a hot path for monitoring software (Prometheus for example).

Aliens! I like Xenomorphs :-) But I doubt that's what you're talking
about.

> In large fleets, these files are polled periodically (often every few
> seconds) on every machine. The cumulative overhead adds up quickly
> across thousands of nodes, so reducing the cost of generating these
> stats does have a measurable operational impact. With the ongoing trend
> toward higher core counts per machine, this cost becomes even more
> noticeable over time.
> 
> I've tried to keep the code as readable as possible, but I understand if
> you think an ~11% improvement isn't worth the added complexity. If you
> have suggestions for making the code cleaner or the intent clearer, I’d
> be happy to rework it.

What are they doing this for? I would much rather rework all this such
that all the schedstat crap becomes tracepoints and all the existing
cruft optional consumers of that.

Like I argued here:

  https://lkml.kernel.org/r/20250703141800.GX1613200@noisy.programming.kicks-ass.net

Then people can consume them however makes most sense, ideally with a
binary interface if it is high bandwidth.
Re: [PATCH RESEND] sched/stats: Optimize /proc/schedstat printing
Posted by Dmitry Ilvokhin 3 months, 1 week ago
On Wed, Oct 29, 2025 at 03:55:13PM +0100, Peter Zijlstra wrote:
> On Wed, Oct 29, 2025 at 02:46:33PM +0000, Dmitry Ilvokhin wrote:
> > On Wed, Oct 29, 2025 at 03:07:55PM +0100, Peter Zijlstra wrote:
> > > On Wed, Oct 29, 2025 at 01:07:15PM +0000, Dmitry Ilvokhin wrote:
> > > > Function seq_printf supports rich format string for decimals printing,
> > > > but there is no need for it in /proc/schedstat, since majority of the
> > > > data is space separared decimals. Use seq_put_decimal_ull instead as
> > > > faster alternative.
> > > > 
> > > > Performance counter stats (truncated) for sh -c 'cat /proc/schedstat >
> > > > /dev/null' before and after applying the patch from machine with 72 CPUs
> > > > are below.
> > > > 
> > > > Before:
> > > > 
> > > >       2.94 msec task-clock               #    0.820 CPUs utilized
> > > >          1      context-switches         #  340.551 /sec
> > > >          0      cpu-migrations           #    0.000 /sec
> > > >        340      page-faults              #  115.787 K/sec
> > > > 10,327,200      instructions             #    1.89  insn per cycle
> > > >                                          #    0.10  stalled cycles per insn
> > > >  5,458,307      cycles                   #    1.859 GHz
> > > >  1,052,733      stalled-cycles-frontend  #   19.29% frontend cycles idle
> > > >  2,066,321      branches                 #  703.687 M/sec
> > > >     25,621      branch-misses            #    1.24% of all branches
> > > > 
> > > > 0.00357974 +- 0.00000209 seconds time elapsed  ( +-  0.06% )
> > > > 
> > > > After:
> > > > 
> > > >       2.50 msec task-clock              #    0.785 CPUs utilized
> > > >          1      context-switches        #  399.780 /sec
> > > >          0      cpu-migrations          #    0.000 /sec
> > > >        340      page-faults             #  135.925 K/sec
> > > >  7,371,867      instructions            #    1.59  insn per cycle
> > > >                                         #    0.13  stalled cycles per insn
> > > >  4,647,053      cycles                  #    1.858 GHz
> > > >    986,487      stalled-cycles-frontend #   21.23% frontend cycles idle
> > > >  1,591,374      branches                #  636.199 M/sec
> > > >     28,973      branch-misses           #    1.82% of all branches
> > > > 
> > > > 0.00318461 +- 0.00000295 seconds time elapsed  ( +-  0.09% )
> > > > 
> > > > This is ~11% (relative) improvement in time elapsed.
> > > 
> > > Yeah, but who cares? Why do we want less obvious code for a silly stats
> > > file?
> > 
> > Thanks for the feedback, Peter.
> > 
> > Fair point that /proc/schedstat isn’t a hot path in the kernel itself,
> > but it is a hot path for monitoring software (Prometheus for example).
> 
> Aliens! I like Xenomorphs :-) But I doubt that's what you're talking
> about.
> 
> > In large fleets, these files are polled periodically (often every few
> > seconds) on every machine. The cumulative overhead adds up quickly
> > across thousands of nodes, so reducing the cost of generating these
> > stats does have a measurable operational impact. With the ongoing trend
> > toward higher core counts per machine, this cost becomes even more
> > noticeable over time.
> > 
> > I've tried to keep the code as readable as possible, but I understand if
> > you think an ~11% improvement isn't worth the added complexity. If you
> > have suggestions for making the code cleaner or the intent clearer, I’d
> > be happy to rework it.
> 
> What are they doing this for? I would much rather rework all this such
> that all the schedstat crap becomes tracepoints and all the existing
> cruft optional consumers of that.
> 

One common use case for /proc/schedstat that I'm aware of is post-mortem
analysis of scheduler behavior, for example, debugging latency,
fairness, or throughput issues after they have occurred. Continuous
polling is often done to preserve historical data, since it’s often
unclear in advance which metrics will be useful for future
investigation. I doubt historical data from /proc/schedstat is something
average users monitor daily, but kernel developers or performance
engineers are likely to use it for more in-depth analysis.

> Like I argued here:
> 
>   https://lkml.kernel.org/r/20250703141800.GX1613200@noisy.programming.kicks-ass.net
> 
> Then people can consume them however makes most sense, ideally with a
> binary interface if it is high bandwidth.

I also agree that a binary interface would be a better long-term
approach, not only because the text interface has formatting costs on
the kernel side, but also due to parsing overhead in userspace. However,
implementing a full binary interface is a larger project: other files
like /proc/interrupts could benefit as well. I chose to start with a
smaller-scale change because the /proc interface is unlikely to
disappear soon, and even with better solutions available, existing
software will continue to use it for some time.
Re: [PATCH RESEND] sched/stats: Optimize /proc/schedstat printing
Posted by Dmitry Ilvokhin 3 months ago
On Wed, Oct 29, 2025 at 03:49:43PM +0000, Dmitry Ilvokhin wrote:
> On Wed, Oct 29, 2025 at 03:55:13PM +0100, Peter Zijlstra wrote:
> > On Wed, Oct 29, 2025 at 02:46:33PM +0000, Dmitry Ilvokhin wrote:
> > > On Wed, Oct 29, 2025 at 03:07:55PM +0100, Peter Zijlstra wrote:
> > > > On Wed, Oct 29, 2025 at 01:07:15PM +0000, Dmitry Ilvokhin wrote:
> > > > > Function seq_printf supports rich format string for decimals printing,
> > > > > but there is no need for it in /proc/schedstat, since majority of the
> > > > > data is space separared decimals. Use seq_put_decimal_ull instead as
> > > > > faster alternative.
> > > > > 
> > > > > Performance counter stats (truncated) for sh -c 'cat /proc/schedstat >
> > > > > /dev/null' before and after applying the patch from machine with 72 CPUs
> > > > > are below.
> > > > > 
> > > > > Before:
> > > > > 
> > > > >       2.94 msec task-clock               #    0.820 CPUs utilized
> > > > >          1      context-switches         #  340.551 /sec
> > > > >          0      cpu-migrations           #    0.000 /sec
> > > > >        340      page-faults              #  115.787 K/sec
> > > > > 10,327,200      instructions             #    1.89  insn per cycle
> > > > >                                          #    0.10  stalled cycles per insn
> > > > >  5,458,307      cycles                   #    1.859 GHz
> > > > >  1,052,733      stalled-cycles-frontend  #   19.29% frontend cycles idle
> > > > >  2,066,321      branches                 #  703.687 M/sec
> > > > >     25,621      branch-misses            #    1.24% of all branches
> > > > > 
> > > > > 0.00357974 +- 0.00000209 seconds time elapsed  ( +-  0.06% )
> > > > > 
> > > > > After:
> > > > > 
> > > > >       2.50 msec task-clock              #    0.785 CPUs utilized
> > > > >          1      context-switches        #  399.780 /sec
> > > > >          0      cpu-migrations          #    0.000 /sec
> > > > >        340      page-faults             #  135.925 K/sec
> > > > >  7,371,867      instructions            #    1.59  insn per cycle
> > > > >                                         #    0.13  stalled cycles per insn
> > > > >  4,647,053      cycles                  #    1.858 GHz
> > > > >    986,487      stalled-cycles-frontend #   21.23% frontend cycles idle
> > > > >  1,591,374      branches                #  636.199 M/sec
> > > > >     28,973      branch-misses           #    1.82% of all branches
> > > > > 
> > > > > 0.00318461 +- 0.00000295 seconds time elapsed  ( +-  0.09% )
> > > > > 
> > > > > This is ~11% (relative) improvement in time elapsed.
> > > > 
> > > > Yeah, but who cares? Why do we want less obvious code for a silly stats
> > > > file?
> > > 
> > > Thanks for the feedback, Peter.
> > > 
> > > Fair point that /proc/schedstat isn’t a hot path in the kernel itself,
> > > but it is a hot path for monitoring software (Prometheus for example).
> > 
> > Aliens! I like Xenomorphs :-) But I doubt that's what you're talking
> > about.
> > 
> > > In large fleets, these files are polled periodically (often every few
> > > seconds) on every machine. The cumulative overhead adds up quickly
> > > across thousands of nodes, so reducing the cost of generating these
> > > stats does have a measurable operational impact. With the ongoing trend
> > > toward higher core counts per machine, this cost becomes even more
> > > noticeable over time.
> > > 
> > > I've tried to keep the code as readable as possible, but I understand if
> > > you think an ~11% improvement isn't worth the added complexity. If you
> > > have suggestions for making the code cleaner or the intent clearer, I’d
> > > be happy to rework it.
> > 
> > What are they doing this for? I would much rather rework all this such
> > that all the schedstat crap becomes tracepoints and all the existing
> > cruft optional consumers of that.
> > 
> 
> One common use case for /proc/schedstat that I'm aware of is post-mortem
> analysis of scheduler behavior, for example, debugging latency,
> fairness, or throughput issues after they have occurred. Continuous
> polling is often done to preserve historical data, since it’s often
> unclear in advance which metrics will be useful for future
> investigation. I doubt historical data from /proc/schedstat is something
> average users monitor daily, but kernel developers or performance
> engineers are likely to use it for more in-depth analysis.
> 
> > Like I argued here:
> > 
> >   https://lkml.kernel.org/r/20250703141800.GX1613200@noisy.programming.kicks-ass.net
> > 
> > Then people can consume them however makes most sense, ideally with a
> > binary interface if it is high bandwidth.
> 
> I also agree that a binary interface would be a better long-term
> approach, not only because the text interface has formatting costs on
> the kernel side, but also due to parsing overhead in userspace. However,
> implementing a full binary interface is a larger project: other files
> like /proc/interrupts could benefit as well. I chose to start with a
> smaller-scale change because the /proc interface is unlikely to
> disappear soon, and even with better solutions available, existing
> software will continue to use it for some time.

Hi Peter,

I wanted to follow up to make sure I fully understand your position and
to confirm whether there's any path forward for this change, whether I
should explore the binary interface path instead, or if you'd prefer to
leave things as they are.

To briefly recap, my motivation was practical: while I agree that a
binary interface or tracepoints would be a better long-term solution,
the current text-based /proc/schedstat is still widely used in existing
tools. In large-scale environments, even small improvements in the
efficiency of generating these stats can have a measurable operational
impact, especially as core counts continue to rise.

I tried to keep the change minimal and focused, aiming for a
straightforward improvement without adding significant complexity. 

I also want to make sure that the arguments for this change haven't
fallen between the cracks, and that the practical reasons for making the
text version faster are fully considered. My goal is to close the loop
on this topic, so if you feel this patch isn't justified, I'm happy to
defer to your judgment. 

Thanks again for your time and feedback.