Introduce cpus_share_l2c to allow querying whether two logical CPUs
share a common L2 cache.
Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
cache has a latency of 4-5 cycles, the L2 cache has a latency of at
least 14ns, whereas the L3 cache has a latency of 50ns [1]. By
comparison, I measured RAM accesses at a latency of around 120ns on my
system [2]. So L3 really is only 2.4x faster than RAM accesses.
Therefore, with this relatively slow access speed compared to L2, the
scheduler will benefit from only considering CPUs sharing an L2 cache
for the purpose of using remote runqueue locking rather than queued
wakeups.
Link: https://en.wikichip.org/wiki/amd/microarchitectures/zen_4 [1]
Link: https://github.com/ChipsandCheese/MemoryLatencyTest [2]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Swapnil Sapkal <Swapnil.Sapkal@amd.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Julien Desfossez <jdesfossez@digitalocean.com>
Cc: x86@kernel.org
---
Changes since v1:
- Fix l2c id for configurations where an L2 cache has a single logical CPU:
use TOPOLOGY_CLUSTER_SYSFS to find out whether topology cluster is
implemented or if LLC should be used as fallback.
Changes since v2:
- Reverse order of cpu_get_l2c_info() l2c_id and l2c_size output
arguments to match the caller.
---
include/linux/sched/topology.h | 6 ++++++
kernel/sched/core.c | 8 ++++++++
kernel/sched/sched.h | 2 ++
kernel/sched/topology.c | 32 +++++++++++++++++++++++++++++---
4 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f9331f71260..c5fdee188bea 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -178,6 +178,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+bool cpus_share_l2c(int this_cpu, int that_cpu);
bool cpus_share_llc(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
@@ -227,6 +228,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{
}
+static inline bool cpus_share_l2c(int this_cpu, int that_cpu)
+{
+ return true;
+}
+
static inline bool cpus_share_llc(int this_cpu, int that_cpu)
{
return true;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d096ce815099..11e60a69ae31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3904,6 +3904,14 @@ void wake_up_if_idle(int cpu)
rcu_read_unlock();
}
+bool cpus_share_l2c(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_l2c_id, this_cpu) == per_cpu(sd_l2c_id, that_cpu);
+}
+
bool cpus_share_llc(int this_cpu, int that_cpu)
{
if (this_cpu == that_cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 81ac605b9cd5..d93543db214c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1828,6 +1828,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
return sd;
}
+DECLARE_PER_CPU(int, sd_l2c_size);
+DECLARE_PER_CPU(int, sd_l2c_id);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1ae2a0a1115a..fadb66edcf5e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -661,8 +661,11 @@ static void destroy_sched_domains(struct sched_domain *sd)
*
* Also keep a unique ID per domain (we use the first CPU number in
* the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_llc().
+ * two CPUs are in the same cache domain, see cpus_share_l2c() and
+ * cpus_share_llc().
*/
+DEFINE_PER_CPU(int, sd_l2c_size);
+DEFINE_PER_CPU(int, sd_l2c_id);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
@@ -672,12 +675,27 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+#ifdef TOPOLOGY_CLUSTER_SYSFS
+static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
+{
+ const struct cpumask *cluster_mask = topology_cluster_cpumask(cpu);
+
+ *l2c_size = cpumask_weight(cluster_mask);
+ *l2c_id = cpumask_first(cluster_mask);
+ return 0;
+}
+#else
+static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
+{
+ return -1;
+}
+#endif
+
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
struct sched_domain *sd;
- int id = cpu;
- int size = 1;
+ int id = cpu, size = 1, l2c_id, l2c_size;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
if (sd) {
@@ -686,6 +704,14 @@ static void update_top_cache_domain(int cpu)
sds = sd->shared;
}
+ if (cpu_get_l2c_info(cpu, &l2c_size, &l2c_id)) {
+ /* Fallback on using LLC. */
+ l2c_size = size;
+ l2c_id = id;
+ }
+ per_cpu(sd_l2c_size, cpu) = l2c_size;
+ per_cpu(sd_l2c_id, cpu) = l2c_id;
+
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
--
2.39.2
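The patch above only introduces the cpus_share_l2c() helper; based on the
commit message, the intended user is the ttwu queueing decision, presumably
switched from cpus_share_llc() to cpus_share_l2c() in a later patch of the
series. A hedged sketch of what such a caller could look like (the function
name, hook point and surrounding conditions here are assumptions, not the
actual follow-up patch):

/*
 * Illustrative sketch only: prefer a queued (wakelist) wakeup whenever the
 * waking CPU and the target CPU do not share an L2 cache, instead of only
 * when they do not share an LLC.
 */
static inline bool ttwu_queue_l2c_cond(struct task_struct *p, int cpu)
{
	/*
	 * Outside the L2 domain, queue the wakeup on the remote CPU's
	 * wakelist rather than grabbing its runqueue lock from here.
	 */
	if (!cpus_share_l2c(smp_processor_id(), cpu))
		return true;

	/*
	 * ... the other existing conditions (rq busy, CPU going offline,
	 * SMT considerations, ...) would remain unchanged ...
	 */
	return false;
}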
On 8/22/23 07:31, Mathieu Desnoyers wrote:
> Introduce cpus_share_l2c to allow querying whether two logical CPUs
> share a common L2 cache.
>
> Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
> cache has a latency of 4-5 cycles, the L2 cache has a latency of at
> least 14ns, whereas the L3 cache has a latency of 50ns [1]. Compared to
> this, I measured the RAM accesses to a latency around 120ns on my
> system [2]. So L3 really is only 2.4x faster than RAM accesses.
> Therefore, with this relatively slow access speed compared to L2, the
> scheduler will benefit from only considering CPUs sharing an L2 cache
> for the purpose of using remote runqueue locking rather than queued
> wakeups.
So I did some more benchmarking to figure out whether the reason for
this speedup is the latency delta between L2 and L3, or is due to the
number of hw threads contending on the rq locks.
I tried to force grouping of those "skip ttwu queue" groups by a subset
of the LLC id, basically by taking the LLC id and adding the cpu number
modulo N, where N is chosen based on my machine topology.
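To make the grouping concrete, here is a sketch of how such an experimental
group membership test could be computed (illustrative only; this is not the
exact instrumentation used for the measurements below, and the helper name
is made up):

/*
 * Split each LLC into nr_subgroups buckets by CPU number. Two CPUs skip the
 * queued ttwu (and use direct rq locking) only when they land in the same
 * bucket of the same LLC, so the "wakeup group" size is the LLC size
 * divided by nr_subgroups.
 */
static inline bool cpus_share_wakeup_group(int this_cpu, int that_cpu,
					   int nr_subgroups)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu) &&
	       (this_cpu % nr_subgroups) == (that_cpu % nr_subgroups);
}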
The end result is that I have similar numbers for groups of 1, 2, 4 HW
threads (which use rq locks and skip queued ttwu within the group).
Starting with group of size 8, the performance starts to degrade.
So I wonder: do machines with more than 4 HW threads per L2 cache exist?
If that's the case, then we should think about grouping not only by L2
cache, but also sub-dividing this group so the number of hw threads per
group is at most 4.
Here are my results with the hackbench test-case:
Group cpus by 16 hw threads:
Time: 49s
- group cpus by 8 hw threads: (llc_id + cpu modulo 2)
Time: 39s
- group cpus by 4 hw threads: (llc_id + cpu modulo 4)
Time: 34s
- group cpus by 2 hw threads: (llc_id + cpu modulo 8)
(expect same as L2 grouping on this machine)
Time: 34s
- group cpus by 1 hw threads: (cpu)
Time: 33s
Thanks,
Mathieu
>
> Link: https://en.wikichip.org/wiki/amd/microarchitectures/zen_4 [1]
> Link: https://github.com/ChipsandCheese/MemoryLatencyTest [2]
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Valentin Schneider <vschneid@redhat.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Ben Segall <bsegall@google.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Juri Lelli <juri.lelli@redhat.com>
> Cc: Swapnil Sapkal <Swapnil.Sapkal@amd.com>
> Cc: Aaron Lu <aaron.lu@intel.com>
> Cc: Julien Desfossez <jdesfossez@digitalocean.com>
> Cc: x86@kernel.org
> ---
> Changes since v1:
> - Fix l2c id for configurations where L2 have a single logical CPU:
> use TOPOLOGY_CLUSTER_SYSFS to find out whether topology cluster is
> implemented or if LLC should be used as fallback.
>
> Changes since v2:
> - Reverse order of cpu_get_l2c_info() l2c_id and l2c_size output
> arguments to match the caller.
> ---
> include/linux/sched/topology.h | 6 ++++++
> kernel/sched/core.c | 8 ++++++++
> kernel/sched/sched.h | 2 ++
> kernel/sched/topology.c | 32 +++++++++++++++++++++++++++++---
> 4 files changed, 45 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 7f9331f71260..c5fdee188bea 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -178,6 +178,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
> cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
> void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
>
> +bool cpus_share_l2c(int this_cpu, int that_cpu);
> bool cpus_share_llc(int this_cpu, int that_cpu);
>
> typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
> @@ -227,6 +228,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
> {
> }
>
> +static inline bool cpus_share_l2c(int this_cpu, int that_cpu)
> +{
> + return true;
> +}
> +
> static inline bool cpus_share_llc(int this_cpu, int that_cpu)
> {
> return true;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d096ce815099..11e60a69ae31 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3904,6 +3904,14 @@ void wake_up_if_idle(int cpu)
> rcu_read_unlock();
> }
>
> +bool cpus_share_l2c(int this_cpu, int that_cpu)
> +{
> + if (this_cpu == that_cpu)
> + return true;
> +
> + return per_cpu(sd_l2c_id, this_cpu) == per_cpu(sd_l2c_id, that_cpu);
> +}
> +
> bool cpus_share_llc(int this_cpu, int that_cpu)
> {
> if (this_cpu == that_cpu)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 81ac605b9cd5..d93543db214c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1828,6 +1828,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
> return sd;
> }
>
> +DECLARE_PER_CPU(int, sd_l2c_size);
> +DECLARE_PER_CPU(int, sd_l2c_id);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DECLARE_PER_CPU(int, sd_llc_size);
> DECLARE_PER_CPU(int, sd_llc_id);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 1ae2a0a1115a..fadb66edcf5e 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -661,8 +661,11 @@ static void destroy_sched_domains(struct sched_domain *sd)
> *
> * Also keep a unique ID per domain (we use the first CPU number in
> * the cpumask of the domain), this allows us to quickly tell if
> - * two CPUs are in the same cache domain, see cpus_share_llc().
> + * two CPUs are in the same cache domain, see cpus_share_l2c() and
> + * cpus_share_llc().
> */
> +DEFINE_PER_CPU(int, sd_l2c_size);
> +DEFINE_PER_CPU(int, sd_l2c_id);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> DEFINE_PER_CPU(int, sd_llc_id);
> @@ -672,12 +675,27 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>
> +#ifdef TOPOLOGY_CLUSTER_SYSFS
> +static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
> +{
> + const struct cpumask *cluster_mask = topology_cluster_cpumask(cpu);
> +
> + *l2c_size = cpumask_weight(cluster_mask);
> + *l2c_id = cpumask_first(cluster_mask);
> + return 0;
> +}
> +#else
> +static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
> +{
> + return -1;
> +}
> +#endif
> +
> static void update_top_cache_domain(int cpu)
> {
> struct sched_domain_shared *sds = NULL;
> struct sched_domain *sd;
> - int id = cpu;
> - int size = 1;
> + int id = cpu, size = 1, l2c_id, l2c_size;
>
> sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> if (sd) {
> @@ -686,6 +704,14 @@ static void update_top_cache_domain(int cpu)
> sds = sd->shared;
> }
>
> + if (cpu_get_l2c_info(cpu, &l2c_size, &l2c_id)) {
> + /* Fallback on using LLC. */
> + l2c_size = size;
> + l2c_id = id;
> + }
> + per_cpu(sd_l2c_size, cpu) = l2c_size;
> + per_cpu(sd_l2c_id, cpu) = l2c_id;
> +
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> per_cpu(sd_llc_id, cpu) = id;
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On 8/23/23 11:26, Mathieu Desnoyers wrote:
> On 8/22/23 07:31, Mathieu Desnoyers wrote:
>> Introduce cpus_share_l2c to allow querying whether two logical CPUs
>> share a common L2 cache.
>>
>> Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
>> cache has a latency of 4-5 cycles, the L2 cache has a latency of at
>> least 14ns, whereas the L3 cache has a latency of 50ns [1]. Compared to
>> this, I measured the RAM accesses to a latency around 120ns on my
>> system [2]. So L3 really is only 2.4x faster than RAM accesses.
>> Therefore, with this relatively slow access speed compared to L2, the
>> scheduler will benefit from only considering CPUs sharing an L2 cache
>> for the purpose of using remote runqueue locking rather than queued
>> wakeups.
>
> So I did some more benchmarking to figure out whether the reason for
> this speedup is the latency delta between L2 and L3, or is due to the
> number of hw threads contending on the rq locks.
>
> I tried to force grouping of those "skip ttwu queue" groups by a subset
> of the LLC id, basically by taking the LLC id and adding the cpu number
> modulo N, where N is chosen based on my machine topology.
>
> The end result is that I have similar numbers for groups of 1, 2, 4 HW
> threads (which use rq locks and skip queued ttwu within the group).
> Starting with group of size 8, the performance starts to degrade.
>
> So I wonder: do machines with more than 4 HW threads per L2 cache exist?
> If it's the case, there we should think about grouping not only by L2
> cache, but also sub-divide this group so the number of hw threads per
> group is at most 4.
>
> Here are my results with the hackbench test-case:
>
> Group cpus by 16 hw threads:
>
> Time: 49s
>
> - group cpus by 8 hw threads: (llc_id + cpu modulo 2)
>
> Time: 39s
>
> - group cpus by 4 hw threads: (llc_id + cpu modulo 4)
>
> Time: 34s
>
> - group cpus by 2 hw threads: (llc_id + cpu modulo 8)
> (expect same as L2 grouping on this machine)
>
> Time: 34s
>
> - group cpus by 1 hw threads: (cpu)
>
> Time: 33s

One more interesting data point: I tried modifying the grouping so that
I would explicitly group by hw threads which sit in different L3, and
even on different NUMA nodes for some (group id = cpu_id % 192). This is
expected to generate really _bad_ cache locality for the runqueue locks
within a group.

The result for these groups of 3 HW threads is about 33s with the
hackbench benchmark, which seems to confirm that the cause of the
speedup is reduction of the contention on the rq locks by making the
groups smaller, and therefore reducing the likelihood of contention for
the rq locks, rather than by improving cache locality from L3 to L2.

So grouping by shared L2 only happens to make the group size OK, but
this benchmark does not significantly benefit from having all runqueue
locks on the same L2.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Wed, Aug 23, 2023 at 02:52:17PM -0400, Mathieu Desnoyers wrote:
> On 8/23/23 11:26, Mathieu Desnoyers wrote:
> > On 8/22/23 07:31, Mathieu Desnoyers wrote:
> > > Introduce cpus_share_l2c to allow querying whether two logical CPUs
> > > share a common L2 cache.
> > >
> > > Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
> > > cache has a latency of 4-5 cycles, the L2 cache has a latency of at
> > > least 14ns, whereas the L3 cache has a latency of 50ns [1]. Compared to
> > > this, I measured the RAM accesses to a latency around 120ns on my
> > > system [2]. So L3 really is only 2.4x faster than RAM accesses.
> > > Therefore, with this relatively slow access speed compared to L2, the
> > > scheduler will benefit from only considering CPUs sharing an L2 cache
> > > for the purpose of using remote runqueue locking rather than queued
> > > wakeups.
> >
> > So I did some more benchmarking to figure out whether the reason for
> > this speedup is the latency delta between L2 and L3, or is due to the
> > number of hw threads contending on the rq locks.
> >
> > I tried to force grouping of those "skip ttwu queue" groups by a subset
> > of the LLC id, basically by taking the LLC id and adding the cpu number
> > modulo N, where N is chosen based on my machine topology.
> >
> > The end result is that I have similar numbers for groups of 1, 2, 4 HW
> > threads (which use rq locks and skip queued ttwu within the group).
> > Starting with group of size 8, the performance starts to degrade.
> >
> > So I wonder: do machines with more than 4 HW threads per L2 cache exist?
> > If it's the case, there we should think about grouping not only by L2
> > cache, but also sub-divide this group so the number of hw threads per
> > group is at most 4.
> >
> > Here are my results with the hackbench test-case:
> >
> > Group cpus by 16 hw threads:
> >
> > Time: 49s
> >
> > - group cpus by 8 hw threads: (llc_id + cpu modulo 2)
> >
> > Time: 39s
> >
> > - group cpus by 4 hw threads: (llc_id + cpu modulo 4)
> >
> > Time: 34s
> >
> > - group cpus by 2 hw threads: (llc_id + cpu modulo 8)
> > (expect same as L2 grouping on this machine)
> >
> > Time: 34s
> >
> > - group cpus by 1 hw threads: (cpu)
> >
> > Time: 33s
>
> One more interesting data point: I tried modifying the grouping
> so that I would explicitly group by hw threads which sit in different
> L3, and even on different NUMA nodes for some
> (group id = cpu_id % 192). This is expected to generate really _bad_
> cache locality for the runqueue locks within a group.
>
> The result for these groups of 3 HW threads is about 33s with the
> hackbench benchmark, which seems to confirm that the cause of the
> speedup is reduction of the contention on the rq locks by making the
> groups smaller, and therefore reducing the likelihood of contention for the
> rq locks, rather than by improving cache locality from L3 to L2.
In addition to reduced rq lock contention, I think another reason this
improves performance is that it reduced task migration. Not sure if
it is the case on your test system, but on my test machine (Intel SPR),
the task migration number dropped.
Hackbench on Intel SPR(2sockets/112cores/224threads) test summary:
- performance improved for all three cases; the more tasks(groups), the
more performance gain;
- task migrations dropped with this series for nr_group=20 and 32
according to 'perf stat'. The migration number didn't drop for nr_group=10,
but the two update functions' cost dropped, which implies fewer accesses to
tg->load_avg and thus fewer task migrations. This is contradictory
and I cannot explain it yet;
- rq lock contention dropped for all three cases and it dropped the most
under more overloaded case: nr_group=32.
It's not clear to me why this series can reduce task migrations. I suspect
it has something to do with more wakelist style wakeups, because on this
test machine only a single core with two SMT threads shares an L2, so more
wakeups go through the wakelist. In a wakelist style wakeup, the target rq's
ttwu_pending is set, and that makes the target cpu appear as !idle_cpu();
this is faster than grabbing the target rq's lock and then increasing the
target rq's nr_running or setting the target rq's curr to something other
than idle. So a wakelist style wakeup can make the target cpu appear as
non-idle faster, but I can't connect this with reduced migration yet; I
just feel this might be the reason why task migration reduced.
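For reference, the idleness test being discussed boils down to roughly the
following shape (a simplified paraphrase of idle_cpu(); the exact kernel
implementation may differ across versions):

/*
 * Simplified paraphrase: a CPU stops looking idle as soon as a queued
 * (wakelist) wakeup sets rq->ttwu_pending, i.e. before the remote rq lock
 * is ever taken and before nr_running/curr are updated.
 */
static int cpu_looks_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (rq->curr != rq->idle)
		return 0;
	if (rq->nr_running)
		return 0;
	if (rq->ttwu_pending)		/* set early by queued wakeups */
		return 0;
	return 1;
}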
Below are detailed test data.
Base: 6.5-rc1.
rq_spin%: The percent of raw_spin_rq_lock_nested() as reported by
perf/events=cycles:pp
migration: cpu-migrations reported by "perf stat -a -- sleep 5"
The cmdline used is:
hackbench -g $nr_group -f 20 --pipe --threads -l 480000 -s 100
nr_group=10:
time rq_spin% update_cfs_group% update_load_avg% migration
base 46s 1.32% 20.06% 10.78% 10.227 K/sec
this_series 37s 0.57% 15.08% 7.05% 10.722 K/sec
nr_group=20:
time rq_spin% update_cfs_group% update_load_avg% migration
base 69s 2.57% 19.68% 10.74% 12.098 K/sec
this_series 41s 0.62% 12.11% 5.78% 8.617 K/sec
nr_group=32:
time rq_spin% update_cfs_group% update_load_avg% migration
base 192s±25% 15.12% 25.83% 9.33% 12.141 K/sec
this_series 71s 0.47% 10.98% 4.58% 8.198 K/sec
I also tested applying my "ratelimit update to tg->load_avg" patch and
the test summary is:
- performance improved noticeably for nr_group=20 and slightly for
nr_group=10 case; nr_group=32's performance is roughly the same.
- task migrations dropped for all three cases; nr_group=20 saw the
biggest drop.
- rq lock contention dropped for all three cases and again, nr_group=32
saw the biggest drop.
Below are detailed data.
Base: peter's sched/core branch with my "ratelimit" patch.
this_series: apply this patchset on top of base.
nr_group=10:
time rq_spin% update_cfs_group% update_load_avg% migration
base 36s 0.55% 0.46% 1.43% 15.034 K/sec
this_series 35s 0.56% 0.52% 1.53% 13.751 K/sec
nr_group=20:
time rq_spin% update_cfs_group% update_load_avg% migration
base 47s 1.28% 0.73% 2.33% 21.217 K/sec
this_series 42s 0.60% 0.69% 1.69% 14.130 K/sec
nr_group=32:
time rq_spin% update_cfs_group% update_load_avg% migration
base 70s 2.38% 0.60% 2.19% 17.855 K/sec
this_series 70s 0.58% 0.63% 1.77% 12.331 K/sec
Thanks,
Aaron
On 8/24/23 03:52, Aaron Lu wrote:
> On Wed, Aug 23, 2023 at 02:52:17PM -0400, Mathieu Desnoyers wrote:
>> On 8/23/23 11:26, Mathieu Desnoyers wrote:
>>> On 8/22/23 07:31, Mathieu Desnoyers wrote:
>>>> [...]
>>>
>>> [...]
>>
>> [...]
>>
>> The result for these groups of 3 HW threads is about 33s with the
>> hackbench benchmark, which seems to confirm that the cause of the
>> speedup is reduction of the contention on the rq locks by making the
>> groups smaller, and therefore reducing the likelihood of contention
>> for the rq locks, rather than by improving cache locality from L3 to L2.
>
> In addition to reduced rq lock contention, I think another reason this
> improves performance is because it reduced task migration. Not sure if
> it is the case on your test system, but on my test machine(Intel SPR),
> task migration number dropped.

Yes, it's indeed the case on my system as well. It cuts migrations by
half (9.2K/sec down to 5.0K/sec).

> Hackbench on Intel SPR(2sockets/112cores/224threads) test summary:
> - performance improved for all three cases; the more tasks(groups), the
> more performance gain;

Interesting!
> - task migrations dropped with this series for nr_group=20 and 32
> according to 'perf stat'. migration number didn't drop for nr_group=10
> but the two update functions' cost dropped which means fewer access to
> tg->load_avg and thus, fewer task migrations. This is contradictory
> and I can not explain yet;

Neither can I.

> - rq lock contention dropped for all three cases and it dropped the most
> under more overloaded case: nr_group=32.

The fact that you observed rq lock contention dropping is a good sign
that doing more queued wakeups is a good thing compared to allowing
non-queued wakeups across cpus sharing a whole LLC.

At this point I'm not sure if the reduction in rq lock contention is
mostly due to using queued wakeups rather than grabbing remote rq locks,
or a side-effect of doing a queued wakeup rather than immediately doing
the wakeup, which would open a window where the target rq is still
considered idle by the various code paths within select_task_rq_fair
which don't care about rq->ttwu_pending.

> It's not clear to me why this series can reduce task migrations. I doubt
> it has something to do with more wakelist style wakeup becasue for this
> test machine, only a single core with two SMT threads share L2 so more
> wakeups are through wakelist. In wakelist style wakeup, the target rq's
> ttwu_pending is set and that will make the target cpu as !idle_cpu();
> This is faster than grabbing the target rq's lock and then increase
> target rq's nr_running or set target rq's curr to something else than
> idle. So wakelist style wakeup can make target cpu appear as non idle
> faster, but I can't connect this with reduced migration yet, I just feel
> this might be the reason why task migration reduced.

Many code paths in select_task_rq_fair don't seem to care about
rq->ttwu_pending.

In wake_affine_idle, for sync wakeups, if nr_running is 1 on the waker,
we choose the waker cpu as target.

In wake_affine_idle, if none of waker or prev wakee cpus are idle, then
it uses wake_affine_weight to find out which of the waker/prev wakee
cpus are targets based on their respective load.

If wake_affine_idle cannot find an idle waker/prev wakee cpu, and if
wake_affine_weight finds that the prev wakee cpu had a lower load, then
wake_affine returns the prev wakee cpu as target. This happens even if
the prev wakee cpu is not idle.

This "target" cpu is then passed to select_idle_sibling. It expects the
available_idle_cpu(target) to check again to see whether the target cpu
is idle. However, it also uses "sched_idle_cpu(target)" which _only_
considers nr_running (not the ttwu_pending flag). Likewise for the other
similar idleness checks below in select_idle_sibling for prev and
recent_used_cpu. The same happens for the case where a per-cpu kthread
stacks with the wakee.

I've tried adding checks for rq->ttwu_pending in those code paths on top
of my patch and I'm still observing the reduction in number of
migrations, so it's unclear to me how doing more queued wakeups can
reduce migrations the way it does.

I'm starting to think we may want to explore explicitly rate limiting
task migrations as well.

For instance, we could do something like this:

Within a 1ms window, if a task is migrated more than once, the following
wakeups would consider that the prev runqueue should be considered in
priority (as if it was completely idle) as long as its load is below a
given threshold.

So every 1ms tasks have a chance to be migrated to the idlest runqueues,
but we would then eliminate those frequent migration patterns which end
up being bad for cache locality.

Thoughts ?

Thanks,

Mathieu

> [...]
>
> Thanks,
> Aaron

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Thu, Aug 24, 2023 at 10:40:45AM -0400, Mathieu Desnoyers wrote:
> On 8/24/23 03:52, Aaron Lu wrote:
> > On Wed, Aug 23, 2023 at 02:52:17PM -0400, Mathieu Desnoyers wrote:
> > > [...]
>
> [...]
>
> This "target" cpu is then passed to select_idle_sibling. It expects the
> available_idle_cpu(target) to check again to see whether the target cpu
> is idle. However, it also uses "sched_idle_cpu(target)" which _only_
> considers nr_running (not ttwu_pending flag). Likewise for the other
> similar idleness checks below in select_idle_sibling for prev and
> recent_used_cpu. The same happens for the case where a per-cpu kthread
> stacks with the wakee.

sched_idle_cpu() mainly concerns with idle policy tasks and if the rq
does not have any idle policy tasks, it will not return true. Since our
tests do not use idle policy tasks, it should never return true.

> I've tried adding checks for rq->ttwu_pending in those code paths on
> top of my patch and I'm still observing the reduction in number of
> migrations, so it's unclear to me how doing more queued wakeups can
> reduce migrations the way it does.

An interesting puzzle.

> I'm starting to think may want to explore explicitly rate limiting task
> migrations as well.
>
> For instance, we could do something like this:
>
> Within a 1ms window, if a task is migrated more than once, the following
> wakeups would consider that the prev runqueue should be considered in
> priority (as if it was completely idle) as long as its load is below a
> given threshold.
>
> So every 1ms tasks have a chance to be migrated to the idlest runqueues,
> but we would then eliminate those frequent migration patterns which end
> up being bad for cache locality.
>
> Thoughts ?

Not sure if this is a good idea. I had a feeling it could hurt latency..

Thanks,
Aaron

> [...]
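A minimal sketch of the per-task migration rate limiting proposed above
(the names, the per-task state and the tunables are hypothetical; nothing
like this exists in the patch set under discussion):

/* Hypothetical per-task state, e.g. embedded in struct task_struct. */
struct migration_ratelimit {
	u64	window_start;		/* start of the current window, ns */
	int	nr_migrations;		/* migrations seen in this window  */
};

#define MIG_WINDOW_NS	1000000ULL	/* 1ms window, per the proposal    */

/*
 * Called from the wakeup placement path. Returns true when recent
 * migrations exceed the budget and prev_cpu is lightly loaded enough to
 * be treated as the preferred target (as if it were idle).
 */
static bool prefer_prev_cpu(struct migration_ratelimit *mr, u64 now,
			    unsigned long prev_load, unsigned long load_thresh)
{
	if (now - mr->window_start > MIG_WINDOW_NS) {
		mr->window_start = now;
		mr->nr_migrations = 0;
		return false;
	}
	/* "migrated more than once" within the window triggers the limit */
	return mr->nr_migrations > 1 && prev_load < load_thresh;
}

/* Would be called from the migration path to account each migration. */
static void task_migrated(struct migration_ratelimit *mr)
{
	mr->nr_migrations++;
}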
On 8/25/23 02:49, Aaron Lu wrote:
> On Thu, Aug 24, 2023 at 10:40:45AM -0400, Mathieu Desnoyers wrote:
[...]
>> I've tried adding checks for rq->ttwu_pending in those code paths on
>> top of my patch and I'm still observing the reduction in number of
>> migrations, so it's unclear to me how doing more queued wakeups can
>> reduce migrations the way it does.
>
> An interesting puzzle.

One metric that can help understand the impact of my patch: comparing
hackbench from a baseline where only your load_avg patch is applied
to a kernel with my l2c patch applied, I notice that the goidle
schedstat is cut in half. For a given CPU (they are pretty much alike),
it goes from 650456 to 353487.

So could it be that by doing queued wakeups, we end up batching
execution of the woken up tasks for a given CPU, rather than going
back and forth between idle and non-idle ? One important thing that
this changes is to reduce the number of newidle balance triggered.

Thoughts ?

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Fri, Aug 25, 2023 at 09:51:19AM -0400, Mathieu Desnoyers wrote:
> On 8/25/23 02:49, Aaron Lu wrote:
> > [...]
>
> One metric that can help understand the impact of my patch: comparing
> hackbench from a baseline where only your load_avg patch is applied
> to a kernel with my l2c patch applied, I notice that the goidle
> schedstat is cut in half. For a given CPU (they are pretty much alike),
> it goes from 650456 to 353487.
>
> So could it be that by doing queued wakeups, we end up batching
> execution of the woken up tasks for a given CPU, rather than going
> back and forth between idle and non-idle ? One important thing that
> this changes is to reduce the number of newidle balance triggered.

I noticed the majority(>99%) migrations are from wakeup path on this
Intel SPR when running hackbench: ttwu() -> set_task_cpu() ->
migrate_task_rq_fair(), so while I think it's a good finding that
newidle balance dropped, it's probably not the reason why migration
number dropped...

Thanks,
Aaron
On Mon, Aug 28, 2023 at 07:19:45PM +0800, Aaron Lu wrote:
> On Fri, Aug 25, 2023 at 09:51:19AM -0400, Mathieu Desnoyers wrote:
> > [...]
>
> I noticed the majority(>99%) migrations are from wakeup path on this
> Intel SPR when running hackbench: ttwu() -> set_task_cpu() ->
> migrate_task_rq_fair(), so while I think it's a good finding that
> newidle balance dropped, it's probably not the reason why migration
> number dropped...

I profiled select_idle_sibling() and found that with this series,
select_idle_cpu() tends to fail more and select_idle_sibling() fallbacks
to use target in the end, which equals to prev_cpu very often.

Initially I think the reason why select_idle_cpu() failed more with this
series is because "wake_list style enqueue" can make the target cpu
appear as busy earlier and thus, it will be harder for select_idle_cpu()
to find an idle cpu overall. But I also suspect SIS_UTIL makes a
difference here: in vanilla kernel, the idle% is 8% and with this series,
the idle% is only 2% and SIS_UTIL may simply skip doing any search for
idle cpu. Anyway, I think I'll also need to profile select_idle_cpu() to
see what's going on there too.

The above profile was done with below workload on a 2 sockets Intel SPR:
hackbench -g 20 -f 20 --pipe --threads -l 480000 -s 100

Thanks,
Aaron
On Fri, Sep 01, 2023 at 09:45:28PM +0800, Aaron Lu wrote:
> On Mon, Aug 28, 2023 at 07:19:45PM +0800, Aaron Lu wrote:
> > On Fri, Aug 25, 2023 at 09:51:19AM -0400, Mathieu Desnoyers wrote:
> > > On 8/25/23 02:49, Aaron Lu wrote:
> > > > On Thu, Aug 24, 2023 at 10:40:45AM -0400, Mathieu Desnoyers wrote:
> > > [...]
> > > > > > - task migrations dropped with this series for nr_group=20 and 32
> > > > > > according to 'perf stat'. migration number didn't drop for nr_group=10
> > > > > > but the two update functions' cost dropped which means fewer access to
> > > > > > tg->load_avg and thus, fewer task migrations. This is contradictory
> > > > > > and I can not explain yet;
> > > > >
> > > > > Neither can I.
> > > > >
> > >
> > > [...]
> > >
> > > > >
> > > > > > It's not clear to me why this series can reduce task migrations. I doubt
> > > > > > it has something to do with more wakelist style wakeup becasue for this
> > > > > > test machine, only a single core with two SMT threads share L2 so more
> > > > > > wakeups are through wakelist. In wakelist style wakeup, the target rq's
> > > > > > ttwu_pending is set and that will make the target cpu as !idle_cpu();
> > > > > > This is faster than grabbing the target rq's lock and then increase
> > > > > > target rq's nr_running or set target rq's curr to something else than
> > > > > > idle. So wakelist style wakeup can make target cpu appear as non idle
> > > > > > faster, but I can't connect this with reduced migration yet, I just feel
> > > > > > this might be the reason why task migration reduced.
> > > > >
> > > >
> > > [...]
> > > > > I've tried adding checks for rq->ttwu_pending in those code paths on top of
> > > > > my patch and I'm still observing the reduction in number of migrations, so
> > > > > it's unclear to me how doing more queued wakeups can reduce migrations the
> > > > > way it does.
> > > >
> > > > An interesting puzzle.
> > >
> > > One metric that can help understand the impact of my patch: comparing
> > > hackbench from a baseline where only your load_avg patch is applied
> > > to a kernel with my l2c patch applied, I notice that the goidle
> > > schedstat is cut in half. For a given CPU (they are pretty much alike),
> > > it goes from 650456 to 353487.
> > >
> > > So could it be that by doing queued wakeups, we end up batching
> > > execution of the woken up tasks for a given CPU, rather than going
> > > back and forth between idle and non-idle ? One important thing that
> > > this changes is to reduce the number of newidle balance triggered.
> >
> > I noticed the majority(>99%) migrations are from wakeup path on this
> > Intel SPR when running hackbench: ttwu() -> set_task_cpu() ->
> > migrate_task_rq_fair(), so while I think it's a good finding that
> > newidle balance dropped, it's probably not the reason why migration
> > number dropped...
>
> I profiled select_idle_sibling() and found that with this series,
> select_idle_cpu() tends to fail more and select_idle_sibling() fallbacks
> to use target in the end, which equals to prev_cpu very often.
>
> Initially I think the reason why select_idle_cpu() failed more with this
> series is because "wake_list style enqueue" can make the target cpu appear
> as busy earlier and thus, it will be harder for select_idle_cpu() to
> find an idle cpu overall. But I also suspect SIS_UTIL makes a difference
> here: in vanilla kernel, the idle% is 8% and with this series, the idle%
> is only 2% and SIS_UTIL may simply skip doing any search for idle cpu.
> Anyway, I think I'll also need to profile select_idle_cpu() to see
> what's going on there too.
Looks like the reduction in task migration is due to SIS_UTIL, i.e.
select_idle_cpu() aborts a lot more after applying this series because
system utilization increased.
Here are some numbers:
@sis @sic @migrate_idle_cpu @abort
vanilla: 24640640 15883958 11913588 4148649
this_series: 22345434 18597564 4294995 14319284
note:
- @sis: number of times select_idle_sibling() called;
- @sic: number of times select_idle_cpu() called;
- @migrate_idle_cpu: number of times task migrated due to
select_idle_cpu() found an idle cpu that is different from prev_cpu;
- @abort: number of times select_idle_cpu() aborts the search due to
SIS_UTIL.
All numbers are captured during a 5s window while running the below
workload on a 2 sockets Intel SPR(56 cores, 112 threads per socket):
hackbench -g 20 -f 20 --pipe --threads -l 480000 -s 100
So for this workload, I think this series is doing something good: it
increased system utilization and due to SIS_UTIL, it also reduced task
migration where task migration isn't very useful since system is already
overloaded.
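For context, the SIS_UTIL cutoff mentioned here has essentially the
following shape (a simplified sketch of the corresponding check in
select_idle_cpu(); field and feature names follow upstream, but details
vary by kernel version):

/*
 * Sketch: the idle-CPU scan budget shrinks as LLC utilization grows; once
 * the LLC looks overloaded the search is skipped entirely, so the wakee
 * stays on prev_cpu and no migration happens.
 */
static int sis_util_scan_budget(struct sched_domain_shared *sd_share)
{
	int nr = READ_ONCE(sd_share->nr_idle_scan) + 1;

	if (nr == 1)
		return -1;	/* overloaded LLC: abort the idle search */

	return nr;		/* otherwise scan at most nr CPUs */
}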
Thanks,
Aaron
On 9/5/23 03:21, Aaron Lu wrote:
> On Fri, Sep 01, 2023 at 09:45:28PM +0800, Aaron Lu wrote:
>> [...]
>
> Looks like the reduction in task migration is due to SIS_UTIL, i.e.
> select_idle_cpu() aborts a lot more after applying this series because
> system utilization increased.
>
> [...]
>
> So for this workload, I think this series is doing something good: it
> increased system utilization and due to SIS_UTIL, it also reduced task
> migration where task migration isn't very useful since system is already
> overloaded.

This is interesting. Did you also profile the impact of the patches on
wake_affine(), especially wake_affine_idle() ? Its behavior did change
very significantly in my tests, and this impacts the target cpu number
received by select_idle_sibling(). But independently of what
wake_affine() returns as target (waker cpu or prev_cpu), if
select_idle_cpu() is trigger-happy and finds idle cores near that
target, this will cause lots of migrations.

Based on your metrics, the ttwu-queued-l2 approach (in addition to
reducing lock contention) appears to decrease the SIS_UTIL idleness
level of the cpus enough to completely change the runqueue selection
and migration behavior.

I fear that we hide a bad scheduler behavior under the rug by changing
the idleness level of a specific workload pattern, while leaving the
underlying root cause unfixed.

I'm currently working on a different approach: rate limit migrations.
Basically, the idea is to detect when a task is migrated too often for
its own good, and prevent the scheduler from migrating it for a short
while. I get about 30% performance improvement with this approach as
well (limit migration to 1 per 2ms window per task). I'll finish
polishing my commit messages and send a series as RFC soon.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Tue, Sep 05, 2023 at 08:46:42AM -0400, Mathieu Desnoyers wrote:
> On 9/5/23 03:21, Aaron Lu wrote:
> > [...]
>
> This is interesting. Did you also profile the impact of the patches on
> wake_affine(), especially wake_affine_idle() ? Its behavior did change
> very significantly in my tests, and this impacts the target cpu number
> received by select_idle_sibling(). But independently of what
> wake_affine() returns as target (waker cpu or prev_cpu), if
> select_idle_cpu() is trigger-happy and finds idle cores near that
> target, this will cause lots of migrations.

For the group=20 case, wake_affine() and wake_affine_idle() don't appear
to change much on this Intel machine, in that the target received by
sis() is mostly prev_cpu instead of the waker (this) cpu for both
kernels.

But I do notice that for the group=32 case, in the vanilla kernel the
chance of the target received by sis() becoming the waker cpu increased
a lot, while with this series the target remains mostly prev_cpu, and
that is the reason why migration dropped with this series for the
group=32 case: when sis() falls back to using target, this series has a
higher chance of not migrating the task.

And my profile shows that for the vanilla kernel, when it chooses the
waker cpu as target, it's mostly due to wake_affine_weight(), not
wake_affine_idle().

Thanks,
Aaron

> [...]
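For readers following this exchange, here is a condensed paraphrase of the
wake_affine() decision being discussed (reconstructed for illustration; the
authoritative logic lives in kernel/sched/fair.c and differs in detail):

/*
 * Condensed paraphrase, not verbatim kernel code: wake_affine_idle()
 * prefers an idle waker/prev CPU (and the waker CPU for sync wakeups);
 * when it finds nothing, wake_affine_weight() compares the load of the
 * two candidates and may return the waker CPU even though it is not idle,
 * which is the nr_group=32 behavior Aaron describes above.
 */
static int wake_affine_idle_sketch(int this_cpu, int prev_cpu, int sync)
{
	if (available_idle_cpu(this_cpu) && cpus_share_llc(this_cpu, prev_cpu))
		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;

	if (sync && cpu_rq(this_cpu)->nr_running == 1)
		return this_cpu;

	if (available_idle_cpu(prev_cpu))
		return prev_cpu;

	return nr_cpumask_bits;	/* no idle candidate: fall back to load compare */
}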