[PATCH v2] Reorder some fields in struct rq.

Posted by Blake Jones 2 months ago
This colocates some hot fields in "struct rq" to be on the same cache line
as others that are often accessed at the same time or in similar ways.

Using data from a Google-internal fleet-scale profiler, I found three
distinct groups of hot fields in struct rq:

- (1) The runqueue lock: __lock.

- (2) Those accessed from hot code in pick_next_task_fair():
      nr_running, nr_numa_running, nr_preferred_running,
      ttwu_pending, cpu_capacity, curr, idle.

- (3) Those accessed from some other hot codepaths, e.g.
      update_curr(), update_rq_clock(), and scheduler_tick():
      clock_task, clock_pelt, clock, lost_idle_time,
      clock_update_flags, clock_pelt_idle, clock_idle.

The cycles spent on accessing these different groups of fields broke down
roughly as follows:

- 50% on group (1) (the runqueue lock, always read-write)
- 39% on group (2) (load:store ratio around 38:1)
-  8% on group (3) (load:store ratio around 5:1)
-  3% on all the other fields

Most of the fields in group (3) are already in a cache line grouping; this
patch just adds "clock" and "clock_update_flags" to that group. The fields
in group (2) are scattered across several cache lines; the main effect of
this patch is to group them together, on a single line at the beginning of
the structure. A few other less performance-critical fields (nr_switches,
numa_migrate_on, has_blocked_load, nohz_csd, last_blocked_load_update_tick)
were also reordered to reduce holes in the data structure.

Since the runqueue lock is acquired from so many different contexts, and is
basically always accessed using an atomic operation, putting it on either
of the cache lines for groups (2) or (3) would slow down accesses to those
fields dramatically, since accesses to those groups are read-mostly.
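
As a rough userspace illustration of that effect (this is not kernel code;
the struct and field names below are made-up stand-ins for the rq fields),
the sketch below has one thread hammer a "lock" word with atomic writes
while another thread repeatedly loads a neighboring read-mostly field,
first with both on the same cache line and then with the lock isolated.
The reader typically gets noticeably more loads done in the isolated
layout:

/* rq_lock_sharing.c: gcc -O2 -pthread rq_lock_sharing.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define CACHELINE 64

/* Layout A: the contended word shares a cache line with a read-mostly field. */
struct shared_line {
        _Atomic unsigned long   hot;            /* stands in for rq->__lock     */
        unsigned long           read_mostly;    /* stands in for rq->nr_running */
} __attribute__((aligned(CACHELINE)));

/* Layout B: the contended word gets a cache line to itself. */
struct split_line {
        _Atomic unsigned long   hot __attribute__((aligned(CACHELINE)));
        unsigned long           read_mostly __attribute__((aligned(CACHELINE)));
};

static _Atomic int stop;

static void *writer(void *arg)          /* hammer the "lock" word */
{
        _Atomic unsigned long *hot = arg;

        while (!atomic_load_explicit(&stop, memory_order_relaxed))
                atomic_fetch_add_explicit(hot, 1, memory_order_relaxed);
        return NULL;
}

static void *timer(void *arg)           /* stop both loops after one second */
{
        (void)arg;
        sleep(1);
        atomic_store(&stop, 1);
        return NULL;
}

static unsigned long run(_Atomic unsigned long *hot, const unsigned long *field)
{
        pthread_t wtid, ttid;
        unsigned long loads = 0, sink = 0;

        atomic_store(&stop, 0);
        pthread_create(&ttid, NULL, timer, NULL);
        pthread_create(&wtid, NULL, writer, (void *)hot);
        while (!atomic_load_explicit(&stop, memory_order_relaxed)) {
                sink += *(const volatile unsigned long *)field;
                loads++;
        }
        pthread_join(wtid, NULL);
        pthread_join(ttid, NULL);
        return loads | (sink & 1);      /* keep the loads from being optimized out */
}

int main(void)
{
        static struct shared_line a;
        static struct split_line b;

        printf("reader loads in ~1s, lock sharing the line: %lu\n",
            run(&a.hot, &a.read_mostly));
        printf("reader loads in ~1s, lock on its own line:  %lu\n",
            run(&b.hot, &b.read_mostly));
        return 0;
}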

This patch does not change the size of "struct rq" on machines with 64-byte
cache lines. The additional "____cacheline_aligned" to put the runqueue
lock on the next cache line will add an additional 64 bytes of padding on
machines with 128-byte cache lines; although this is unfortunate, it seemed
more likely to lead to stably good performance than e.g. just putting
the runqueue lock somewhere in the middle of the structure and hoping it
wasn't on an otherwise busy cache line.
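
The padding mechanics can be seen in a small userspace sketch (again a
made-up stand-in, not the real struct rq, with ____cacheline_aligned
spelled as a plain aligned attribute): comparing a build with the default
64-byte line size against one built with -DCACHELINE=128, the printed
offsets show how much padding the annotation inserts in front of the
aligned member.

/* toy_rq_layout.c:
 *   gcc -O2 toy_rq_layout.c                  # 64-byte cache lines (default)
 *   gcc -O2 -DCACHELINE=128 toy_rq_layout.c  # 128-byte cache lines
 */
#include <stddef.h>
#include <stdio.h>

#ifndef CACHELINE
#define CACHELINE 64
#endif
#define toy_cacheline_aligned __attribute__((aligned(CACHELINE)))

struct toy_rq {
        /* hot, read-mostly fields, meant to share the first cache line */
        unsigned int    nr_running;
        unsigned int    ttwu_pending;
        unsigned long   cpu_capacity;
        void            *curr;
        void            *idle;

        /* force the next group (the lock etc.) onto a fresh cache line */
        unsigned long long      nr_switches toy_cacheline_aligned;
        long                    lock_word;      /* stand-in for raw_spinlock_t */
};

int main(void)
{
        printf("CACHELINE             = %d\n", CACHELINE);
        printf("offsetof(nr_switches) = %zu\n", offsetof(struct toy_rq, nr_switches));
        printf("offsetof(lock_word)   = %zu\n", offsetof(struct toy_rq, lock_word));
        printf("sizeof(struct toy_rq) = %zu\n", sizeof(struct toy_rq));
        return 0;
}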

I ran "hackbench" to test this change, but it didn't show very conclusive
results.  Looking at a profile of the hackbench run, it was spending 95% of
its cycles inside __alloc_skb(), __kfree_skb(), or kmem_cache_free() -
almost all of which was spent updating memcg counters or contending on the
list_lock in kmem_cache_node. In contrast, it spent less than 0.5% of its
cycles inside either schedule() or try_to_wake_up(). So it's not surprising
that it didn't show useful results here.

Instead, to test this, I wrote a focused load test that would put load on
the pick_next_task_fair() path. A parent process would fork many child
processes, and each child would nanosleep() for 1 msec many times in a
loop. The load test was monitored with "perf", and I looked at the number
of cycles that were spent with sched_balance_rq() on the stack. The test
reliably showed 15-25% of all cycles were spent there. I ran it 80 times on
a pair of 2-socket Intel GNR machines (480 vCPUs per machine) - one running
6.16-rc7, the other running this change - using 4800 child processes and
8000 1-msec sleeps per child.  The mean cycle count dropped from 405.4B to
387.6B, or a 4.4% decrease.

More importantly, given that this change reduces cache misses in a very hot
kernel codepath, there's likely to be additional application performance
improvement due to reduced cache conflicts from kernel data structures.

v1 -> v2 changes:
- Resynced to tip/sched/core tree, which has removed CONFIG_SMP.

Signed-off-by: Blake Jones <blakejones@google.com>
---
 kernel/sched/sched.h | 65 +++++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d3f33d10c58c..e399d480d680 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1092,26 +1092,46 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
  * acquire operations must be ordered by ascending &runqueue.
  */
 struct rq {
-	/* runqueue lock: */
-	raw_spinlock_t		__lock;
-
+	/*
+	 * The following members are loaded together from pick_next_task(),
+	 * and should be on an isolated cache line to avoid cache pollution.
+	 */
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
 	unsigned int		nr_preferred_running;
-	unsigned int		numa_migrate_on;
 #endif
+	unsigned int		ttwu_pending;
+	unsigned long		cpu_capacity;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+	struct task_struct __rcu	*donor;  /* Scheduling context */
+	struct task_struct __rcu	*curr;   /* Execution context */
+#else
+	union {
+		struct task_struct __rcu *donor; /* Scheduler context */
+		struct task_struct __rcu *curr;  /* Execution context */
+	};
+#endif
+	struct task_struct	*idle;
+	/* padding left here deliberately */
+
+	/*
+	 * The next cacheline holds the (hot) runqueue lock, as well as
+	 * some other less performance-critical fields.
+	 */
+	u64			nr_switches	____cacheline_aligned;
+
+	/* runqueue lock: */
+	raw_spinlock_t		__lock;
+
 #ifdef CONFIG_NO_HZ_COMMON
-	unsigned long		last_blocked_load_update_tick;
-	unsigned int		has_blocked_load;
-	call_single_data_t	nohz_csd;
 	unsigned int		nohz_tick_stopped;
 	atomic_t		nohz_flags;
+	unsigned int		has_blocked_load;
+	unsigned long		last_blocked_load_update_tick;
+	call_single_data_t	nohz_csd;
 #endif /* CONFIG_NO_HZ_COMMON */
 
-	unsigned int		ttwu_pending;
-	u64			nr_switches;
-
 #ifdef CONFIG_UCLAMP_TASK
 	/* Utilization clamp values based on CPU's RUNNABLE tasks */
 	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
@@ -1134,6 +1154,9 @@ struct rq {
 	struct list_head	*tmp_alone_branch;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_NUMA_BALANCING
+	unsigned int		numa_migrate_on;
+#endif
 	/*
 	 * This is part of a global counter where only the total sum
 	 * over all CPUs matters. A task can increase this counter on
@@ -1142,29 +1165,23 @@ struct rq {
 	 */
 	unsigned long 		nr_uninterruptible;
 
-#ifdef CONFIG_SCHED_PROXY_EXEC
-	struct task_struct __rcu	*donor;  /* Scheduling context */
-	struct task_struct __rcu	*curr;   /* Execution context */
-#else
-	union {
-		struct task_struct __rcu *donor; /* Scheduler context */
-		struct task_struct __rcu *curr;  /* Execution context */
-	};
-#endif
 	struct sched_dl_entity	*dl_server;
-	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
 	struct mm_struct	*prev_mm;
 
-	unsigned int		clock_update_flags;
-	u64			clock;
-	/* Ensure that all clocks are in the same cache line */
+	/*
+	 * The following fields of clock data are frequently referenced
+	 * and updated together, and should go on their own cache line.
+	 */
 	u64			clock_task ____cacheline_aligned;
 	u64			clock_pelt;
+	u64			clock;
 	unsigned long		lost_idle_time;
+	unsigned int		clock_update_flags;
 	u64			clock_pelt_idle;
 	u64			clock_idle;
+
 #ifndef CONFIG_64BIT
 	u64			clock_pelt_idle_copy;
 	u64			clock_idle_copy;
@@ -1182,8 +1199,6 @@ struct rq {
 	struct root_domain		*rd;
 	struct sched_domain __rcu	*sd;
 
-	unsigned long		cpu_capacity;
-
 	struct balance_callback *balance_callback;
 
 	unsigned char		nohz_idle_balance;
-- 
2.50.1.552.g942d659e1b-goog
Re: [PATCH v2] Reorder some fields in struct rq.
Posted by Josh Don 1 month, 2 weeks ago
Hi Blake,

On Wed, Jul 30, 2025 at 1:56 PM Blake Jones <blakejones@google.com> wrote:
>
> This colocates some hot fields in "struct rq" to be on the same cache line
> as others that are often accessed at the same time or in similar ways.
>

Thanks for the analysis and this patch.

I was going to suggest ____cacheline_aligned_in_smp, but it'll behave
nearly identically in practice, so it doesn't matter (would save 64 bytes
on a 128 byte cacheline UP system).

Peter, any thoughts on this patch?

Best,
Josh
[Ping] Re: [PATCH v2] Reorder some fields in struct rq.
Posted by Blake Jones 3 weeks, 5 days ago
Hi Josh,

On Wed, Aug 20, 2025 at 5:11 PM Josh Don <joshdon@google.com> wrote:
> On Wed, Jul 30, 2025 at 1:56 PM Blake Jones <blakejones@google.com> wrote:
> > This colocates some hot fields in "struct rq" to be on the same cache line
> > as others that are often accessed at the same time or in similar ways.
>
> Thanks for the analysis and this patch.
>
> I was going to suggest ____cacheline_aligned_in_smp, but it'll behave
> nearly identically in practice, so it doesn't matter (would save 64 bytes
> on a 128 byte cacheline UP system).

Thanks for your comments. I suspect there aren't a lot of
128-byte-cacheline UP systems out there, so I'm not going to worry
about this.

> Peter, any thoughts on this patch?

Echoing this - any thoughts from the maintainers?

Blake
Re: [PATCH v2] Reorder some fields in struct rq.
Posted by Madadi Vineeth Reddy 1 month, 3 weeks ago
Hi Blake,

On 31/07/25 02:26, Blake Jones wrote:
> This colocates some hot fields in "struct rq" to be on the same cache line
> as others that are often accessed at the same time or in similar ways.
> 

[..snip..]

> 
> This patch does not change the size of "struct rq" on machines with 64-byte
> cache lines. The additional "____cacheline_aligned" to put the runqueue
> lock on the next cache line will add an additional 64 bytes of padding on
> machines with 128-byte cache lines; although this is unfortunate, it seemed
> more likely to lead to stably good performance than e.g. just putting
> the runqueue lock somewhere in the middle of the structure and hoping it
> wasn't on an otherwise busy cache line.

On Power11, which has 128-byte cache lines, this change introduced an
88-byte hole due to __lock moving to a different cache line, which led to
the structure occupying one more cache line than before.

Tested with your custom test case (thanks for sharing) and observed around
a 5% decrease in the number of cycles, along with a slight increase in user
time; both are positive indicators.

Also ran ebizzy, which doesn’t seem to be impacted. I think it would be good
to run a set of standard benchmarks like schbench, ebizzy, hackbench, and
stress-ng, along with a real-life workload, to ensure there’s no negative
impact. I saw that hackbench was tried, but including those numbers would
be helpful.

Reviewed-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>

Thanks,
Madadi Vineeth Reddy

> 
> I ran "hackbench" to test this change, but it didn't show very conclusive
> results.  Looking at a profile of the hackbench run, it was spending 95% of
> its cycles inside __alloc_skb(), __kfree_skb(), or kmem_cache_free() -
> almost all of which was spent updating memcg counters or contending on the
> list_lock in kmem_cache_node. In contrast, it spent less than 0.5% of its
> cycles inside either schedule() or try_to_wake_up(). So it's not surprising
> that it didn't show useful results here.
> 

[..snip..]

> @@ -1182,8 +1199,6 @@ struct rq {
>  	struct root_domain		*rd;
>  	struct sched_domain __rcu	*sd;
>  
> -	unsigned long		cpu_capacity;
> -
>  	struct balance_callback *balance_callback;
>  
>  	unsigned char		nohz_idle_balance;

Re: [PATCH v2] Reorder some fields in struct rq.
Posted by Blake Jones 1 month, 3 weeks ago
Hi Madadi,

Thanks so much for reviewing this, and I'm glad to hear that the
change seemed to work well for you.

On Wed, Aug 13, 2025 at 12:30 AM Madadi Vineeth Reddy
<vineethr@linux.ibm.com> wrote:
> Also ran ebizzy, which doesn’t seem to be impacted. I think it would be good
> to run a set of standard benchmarks like schbench, ebizzy, hackbench, and
> stress-ng, along with a real-life workload, to ensure there’s no negative
> impact. I saw that hackbench was tried, but including those numbers would
> be helpful.

I agree that it would be interesting to have such a set of benchmarks,
though I'm not sure what they should be. Part of why I mentioned
my experiment with hackbench is that I'd seen it cited several times
as a standard scheduling benchmark, and my conclusion from running it
and profiling it was that I wouldn't recommend including it in a
canonical set of scheduling benchmarks. I don't have my actual numbers
from running it, but my recollection was that any delta in performance
that it showed was well within the margin of error of the test.

I did run this change on a fraction of the Google server fleet, as an
experiment along with a few other related data structure tweaks, and
saw a meaningful improvement in application performance. But I only
got results for the experiment as a whole, so I can't be sure how much
of the improvement was due to this change vs. the others.

Blake

> Reviewed-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Re: [PATCH v2] Reorder some fields in struct rq.
Posted by Blake Jones 2 months ago
Below is the source for the load test that I used. Although it
performs some timing calculations, the actual metric I used to
evaluate the change was "average amount of CPU time spent in
sched_balance_rq()", as described above.

------------------------------------------------------------------------

#include <poll.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

struct {
        /* "-m": Number of milliseconds to sleep for. */
        int sleep_ms;

        /* "-l": (base 2) Log of number of sleeps to do. */
        int log_loops;

        /* "-c": Number of children. */
        int children;
} params = {
        .sleep_ms = 1,
        .log_loops = 13,
        .children = 1000,
};

/* ------------------------------------------------------------------ */

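/*
 * Rendezvous state shared between the parent and all forked children via a
 * MAP_SHARED page: a start gate ("go") and an exit gate ("stop"), protected
 * by a process-shared mutex and condition variables.
 */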
typedef struct {
        pthread_mutex_t        mutex;
        pthread_cond_t        cv;
        pthread_cond_t        parent_cv;
        int                nthreads;
        int                nthreads_total;
        int                go;
        int                stop;
} thread_group_t;

void
thread_data_init(thread_group_t *tg)
{
        pthread_mutexattr_t mattr;
        pthread_mutexattr_init(&mattr);
        pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
        pthread_mutex_init(&tg->mutex, &mattr);
        pthread_mutexattr_destroy(&mattr);

        pthread_condattr_t cattr;
        pthread_condattr_init(&cattr);
        pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
        pthread_cond_init(&tg->cv, &cattr);
        pthread_cond_init(&tg->parent_cv, &cattr);
        pthread_condattr_destroy(&cattr);

        tg->nthreads = 0;
        tg->nthreads_total = params.children;
        tg->go = 0;
        tg->stop = 0;
}

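/*
 * Parent: wait for every child to check in, release them all at once, then
 * wait for them all to finish their sleep loops before letting them exit.
 */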
void
parent_thread(thread_group_t *tg)
{
        pthread_mutex_lock(&tg->mutex);
        while (tg->nthreads != tg->nthreads_total) {
                pthread_cond_wait(&tg->parent_cv, &tg->mutex);
        }
        tg->go = 1;
        pthread_cond_broadcast(&tg->cv);
        pthread_mutex_unlock(&tg->mutex);

        pthread_mutex_lock(&tg->mutex);
        while (tg->nthreads != 0) {
                pthread_cond_wait(&tg->parent_cv, &tg->mutex);
        }
        tg->stop = 1;
        pthread_cond_broadcast(&tg->cv);
        pthread_mutex_unlock(&tg->mutex);
}

void
loop(unsigned long long loops)
{
        struct timespec ts;
        ts.tv_sec = 0;
        ts.tv_nsec = params.sleep_ms * 1000000;

        for (unsigned long long i = 0; i < loops; i++) {
                nanosleep(&ts, NULL);
        }
}

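/*
 * Child: check in, wait for the start signal, run the nanosleep() loop, then
 * check out and wait for the final release before exiting.
 */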
void *
child_thread(thread_group_t *tg)
{
        pthread_mutex_lock(&tg->mutex);
        ++tg->nthreads;
        if (tg->nthreads == tg->nthreads_total) {
                pthread_cond_signal(&tg->parent_cv);
        }
        while (!tg->go) {
                pthread_cond_wait(&tg->cv, &tg->mutex);
        }
        pthread_mutex_unlock(&tg->mutex);

        loop(1ULL << params.log_loops);

        pthread_mutex_lock(&tg->mutex);
        --tg->nthreads;
        if (tg->nthreads == 0) {
                pthread_cond_signal(&tg->parent_cv);
        }
        while (!tg->stop) {
                pthread_cond_wait(&tg->cv, &tg->mutex);
        }
        pthread_mutex_unlock(&tg->mutex);
        return NULL;
}

void
thread_data_fini(thread_group_t *tg)
{
        pthread_mutex_destroy(&tg->mutex);
        pthread_cond_destroy(&tg->cv);
        pthread_cond_destroy(&tg->parent_cv);
}

/* ------------------------------------------------------------------ */

void *
spawn_procs(thread_group_t *tg)
{
        pid_t *pids = malloc(params.children * sizeof (pid_t));
        for (int c = 0; c < params.children; c++) {
                pid_t pid;
                switch (pid = fork()) {
                case 0:
                        child_thread(tg);
                        _exit(0);
                        break;
                case -1:
                        perror("fork() failed");
                        return NULL;
                default:
                        pids[c] = pid;
                        break;
                }
        }
        return pids;
}

void
await_procs(void *data)
{
        pid_t *pids = (pid_t *)data;
        for (int c = 0; c < params.children; c++) {
                int rv = waitpid(pids[c], NULL, 0);
                if (rv != pids[c]) {
                        char msg[256];
                        snprintf(msg, sizeof(msg),
                            "waitpid(%d) = %d", pids[c], rv);
                        perror(msg);
                }
        }
        free(pids);
}

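/*
 * Set up the shared rendezvous page, fork the children, and measure the
 * wall-clock time of the parent_thread() rendezvous, i.e. roughly the time
 * for all children to start, run their sleep loops, and finish.
 */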
double
main_loop(void)
{
        void *mem = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                         MAP_ANONYMOUS | MAP_SHARED, -1, 0);
        if (mem == MAP_FAILED) {
                perror("mmap() failed");
                return 0;
        }
        thread_group_t *tg = mem;
        thread_data_init(tg);

        void *data = spawn_procs(tg);

        struct timespec ts1, ts2;
        if (clock_gettime(CLOCK_MONOTONIC, &ts1) == -1) {
                perror("clock_gettime(ts1)");
                return 1;
        }
        parent_thread(tg);
        if (clock_gettime(CLOCK_MONOTONIC, &ts2) == -1) {
                perror("clock_gettime(ts2)");
                return 1;
        }

        await_procs(data);
        thread_data_fini(tg);
        munmap(mem, 4096);

        double t1 = (double)ts1.tv_sec + (double)ts1.tv_nsec / 1e9;
        double t2 = (double)ts2.tv_sec + (double)ts2.tv_nsec / 1e9;
        return t2 - t1;
}

int
main(int argc, char **argv)
{
        int opt;
        while ((opt = getopt(argc, argv, "m:l:c:")) != -1) {
                switch (opt) {
                case 'm':
                        params.sleep_ms = atoi(optarg);
                        break;
                case 'l':
                        params.log_loops = atoi(optarg);
                        break;
                case 'c':
                        params.children = atoi(optarg);
                        break;
                default:
                        fprintf(stderr, "Usage: "
                            "%s [-m <ms> -l <log> -c <children>]\n", argv[0]);
                        return 1;
                }
        }

        printf("Running: "
                "%5d children, 1<<%d loops, %d ms sleep/loop\n",
                params.children, params.log_loops, params.sleep_ms);

        printf("%.5lf\n", main_loop());

        return 0;
}