[PATCH] sched/stats: Optimize /proc/schedstat printing

Dmitry Ilvokhin posted 1 patch 4 months, 1 week ago
There is a newer version of this series
kernel/sched/stats.c | 86 ++++++++++++++++++++++++++------------------
1 file changed, 52 insertions(+), 34 deletions(-)
[PATCH] sched/stats: Optimize /proc/schedstat printing
Posted by Dmitry Ilvokhin 4 months, 1 week ago
Function seq_printf supports a rich format string for printing decimals,
but there is no need for it in /proc/schedstat, since the majority of the
data is space-separated decimals. Use seq_put_decimal_ull instead as a
faster alternative.

Performance counter stats (truncated) for sh -c 'cat /proc/schedstat >
/dev/null' before and after applying the patch from machine with 72 CPUs
are below.

Before:

      2.94 msec task-clock               #    0.820 CPUs utilized
         1      context-switches         #  340.551 /sec
         0      cpu-migrations           #    0.000 /sec
       340      page-faults              #  115.787 K/sec
10,327,200      instructions             #    1.89  insn per cycle
                                         #    0.10  stalled cycles per insn
 5,458,307      cycles                   #    1.859 GHz
 1,052,733      stalled-cycles-frontend  #   19.29% frontend cycles idle
 2,066,321      branches                 #  703.687 M/sec
    25,621      branch-misses            #    1.24% of all branches

0.00357974 +- 0.00000209 seconds time elapsed  ( +-  0.06% )

After:

      2.50 msec task-clock              #    0.785 CPUs utilized
         1      context-switches        #  399.780 /sec
         0      cpu-migrations          #    0.000 /sec
       340      page-faults             #  135.925 K/sec
 7,371,867      instructions            #    1.59  insn per cycle
                                        #    0.13  stalled cycles per insn
 4,647,053      cycles                  #    1.858 GHz
   986,487      stalled-cycles-frontend #   21.23% frontend cycles idle
 1,591,374      branches                #  636.199 M/sec
    28,973      branch-misses           #    1.82% of all branches

0.00318461 +- 0.00000295 seconds time elapsed  ( +-  0.09% )

This is a ~11% (relative) improvement in time elapsed.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 kernel/sched/stats.c | 86 ++++++++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index d1c9429a4ac5..b304f821e8ff 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -98,6 +98,56 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
 	}
 }
 
+static void show_runqueue_stats(struct seq_file *seq, int cpu, struct rq *rq)
+{
+	seq_printf(seq, "cpu%d", cpu);
+	seq_put_decimal_ull(seq, " ", rq->yld_count);
+	seq_put_decimal_ull(seq, " ", 0);
+	seq_put_decimal_ull(seq, " ", rq->sched_count);
+	seq_put_decimal_ull(seq, " ", rq->sched_goidle);
+	seq_put_decimal_ull(seq, " ", rq->ttwu_count);
+	seq_put_decimal_ull(seq, " ", rq->ttwu_local);
+	seq_put_decimal_ull(seq, " ", rq->rq_cpu_time);
+	seq_put_decimal_ull(seq, " ", rq->rq_sched_info.run_delay);
+	seq_put_decimal_ull(seq, " ", rq->rq_sched_info.pcount);
+	seq_putc(seq, '\n');
+}
+
+static void show_domain_stats(struct seq_file *seq, int dcount,
+			      struct sched_domain *sd)
+{
+	enum cpu_idle_type itype;
+
+	seq_printf(seq, "domain%d %s %*pb", dcount, sd->name,
+		   cpumask_pr_args(sched_domain_span(sd)));
+	for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
+		seq_put_decimal_ull(seq, " ", sd->lb_count[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_balanced[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_failed[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_load[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_util[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_task[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_imbalance_misfit[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_gained[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_hot_gained[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_nobusyq[itype]);
+		seq_put_decimal_ull(seq, " ", sd->lb_nobusyg[itype]);
+	}
+	seq_put_decimal_ull(seq, " ", sd->alb_count);
+	seq_put_decimal_ull(seq, " ", sd->alb_failed);
+	seq_put_decimal_ull(seq, " ", sd->alb_pushed);
+	seq_put_decimal_ull(seq, " ", sd->sbe_count);
+	seq_put_decimal_ull(seq, " ", sd->sbe_balanced);
+	seq_put_decimal_ull(seq, " ", sd->sbe_pushed);
+	seq_put_decimal_ull(seq, " ", sd->sbf_count);
+	seq_put_decimal_ull(seq, " ", sd->sbf_balanced);
+	seq_put_decimal_ull(seq, " ", sd->sbf_pushed);
+	seq_put_decimal_ull(seq, " ", sd->ttwu_wake_remote);
+	seq_put_decimal_ull(seq, " ", sd->ttwu_move_affine);
+	seq_put_decimal_ull(seq, " ", sd->ttwu_move_balance);
+	seq_putc(seq, '\n');
+}
+
 /*
  * Current schedstat API version.
  *
@@ -121,44 +171,12 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		rq = cpu_rq(cpu);
 
 		/* runqueue-specific stats */
-		seq_printf(seq,
-		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
-		    cpu, rq->yld_count,
-		    rq->sched_count, rq->sched_goidle,
-		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_cpu_time,
-		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
-
-		seq_printf(seq, "\n");
+		show_runqueue_stats(seq, cpu, rq);
 
 		/* domain-specific stats */
 		rcu_read_lock();
 		for_each_domain(cpu, sd) {
-			enum cpu_idle_type itype;
-
-			seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
-				   cpumask_pr_args(sched_domain_span(sd)));
-			for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
-				seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
-				    sd->lb_count[itype],
-				    sd->lb_balanced[itype],
-				    sd->lb_failed[itype],
-				    sd->lb_imbalance_load[itype],
-				    sd->lb_imbalance_util[itype],
-				    sd->lb_imbalance_task[itype],
-				    sd->lb_imbalance_misfit[itype],
-				    sd->lb_gained[itype],
-				    sd->lb_hot_gained[itype],
-				    sd->lb_nobusyq[itype],
-				    sd->lb_nobusyg[itype]);
-			}
-			seq_printf(seq,
-				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
-			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
-			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
-			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
-			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
-			    sd->ttwu_move_balance);
+			show_domain_stats(seq, dcount++, sd);
 		}
 		rcu_read_unlock();
 	}
-- 
2.47.3
Re: [PATCH] sched/stats: Optimize /proc/schedstat printing
Posted by Dmitry Ilvokhin 3 months, 3 weeks ago
Hello everyone,

I wanted to follow up on this patch and see if the proposed changes look
reasonable to you. I'd greatly appreciate any feedback you may have, and
thank you in advance for taking the time to review it.

Best regards,
Dmitry