When running a multi-instance FFmpeg workload on an HCC (high core
count) system, significant contention is observed on the bitmap of
`cpupri_vec->cpumask`.
The SUT is a 2-socket machine with 240 physical cores and 480 logical
CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical cores
(8 logical CPUs) for transcoding tasks. Sub-threads use RT priority 99
with FIFO scheduling. FPS is used as the score.
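
For reference, launching one such pinned RT instance might look like the
following (an illustrative sketch, not the exact command line used; the
CPU list and transcode options are placeholders):

  # pin to 4 physical cores (8 logical CPUs), run threads as SCHED_FIFO prio 99
  taskset -c 0-3,240-243 chrt -f 99 ffmpeg -i in.mp4 -c:v libx264 out.mp4
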
The perf c2c tool reveals the following about the cpumask (bitmap)
cache line of `cpupri_vec->mask`:
- bits are loaded during cpupri_find
- bits are stored during cpupri_set
- cycles per load: ~2.2K to 8.7K
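
The above can be reproduced with something along these lines (a
representative perf c2c invocation, not the exact commands from this
report):

  # sample system-wide cache-to-cache traffic while the workload runs,
  # then report contended cache lines with per-node details
  perf c2c record -a -- sleep 30
  perf c2c report -NN --stdio
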
This change splits `cpupri_vec->cpumask` into per-NUMA-node data to
mitigate false sharing.
As a result:
- FPS improves by ~3.8%
- Kernel cycles% drops from ~20% to ~18.7%
- Cache line contention is mitigated; perf c2c shows cycles per load
  dropping from ~2.2K-8.7K to ~0.5K-2.2K
Note: the CONFIG_CPUMASK_OFFSTACK=n case remains unchanged.
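
Conceptually, after this change the effective cpumask of a vector is
rebuilt on demand by folding the per-node masks (a simplified excerpt of
the __cpupri_find hunk below; `dst` stands in for the caller's
lowest_mask):

  /*
   * vec->masks[nid] holds this node's priority bits, with the bits of
   * all other nodes pre-set, so ANDing the per-node masks yields the
   * effective global mask.
   */
  cpumask_copy(dst, vec->masks[0]);
  for (nid = 1; nid < nr_node_ids; nid++)
          cpumask_and(dst, dst, vec->masks[nid]);
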
Appendix:
1. Perf c2c report of `cpupri_vec->mask` bitmap cache line:
-------  -------  -------  ------  ------  -------  ----------------------
    Rmt      Lcl    Store    Data    Load    Total  Symbol
  Hitm%    Hitm%  L1 Hit%  offset  cycles  records
-------  -------  -------  ------  ------  -------  ----------------------
    155       39       39  0xff14d52c4682d800
-------  -------  -------  ------  ------  -------  ----------------------
43.23% 43.59% 0.00% 0x0 3489 415 _find_first_and_bit
3.23% 5.13% 0.00% 0x0 3478 107 __bitmap_and
3.23% 0.00% 0.00% 0x0 2712 33 _find_first_and_bit
1.94% 0.00% 7.69% 0x0 5992 33 cpupri_set
0.00% 0.00% 5.13% 0x0 3733 19 cpupri_set
12.90% 12.82% 0.00% 0x8 3452 297 _find_first_and_bit
1.29% 2.56% 0.00% 0x8 3007 117 __bitmap_and
0.00% 5.13% 0.00% 0x8 3041 20 _find_first_and_bit
0.00% 2.56% 2.56% 0x8 2374 22 cpupri_set
0.00% 0.00% 7.69% 0x8 4194 38 cpupri_set
8.39% 2.56% 0.00% 0x10 3336 264 _find_first_and_bit
3.23% 0.00% 0.00% 0x10 3023 46 _find_first_and_bit
2.58% 0.00% 0.00% 0x10 3040 130 __bitmap_and
1.29% 0.00% 12.82% 0x10 4075 34 cpupri_set
0.00% 0.00% 2.56% 0x10 2197 19 cpupri_set
0.00% 2.56% 7.69% 0x18 4085 27 cpupri_set
0.00% 2.56% 0.00% 0x18 3128 220 _find_first_and_bit
0.00% 0.00% 5.13% 0x18 3028 20 cpupri_set
2.58% 2.56% 0.00% 0x20 3089 198 _find_first_and_bit
1.29% 0.00% 5.13% 0x20 5114 29 cpupri_set
0.65% 2.56% 0.00% 0x20 3224 96 __bitmap_and
0.65% 0.00% 7.69% 0x20 4392 31 cpupri_set
2.58% 0.00% 0.00% 0x28 3327 214 _find_first_and_bit
0.65% 2.56% 5.13% 0x28 5252 31 cpupri_set
0.65% 0.00% 7.69% 0x28 8755 25 cpupri_set
0.65% 0.00% 0.00% 0x28 4414 14 _find_first_and_bit
1.29% 2.56% 0.00% 0x30 3139 171 _find_first_and_bit
0.65% 0.00% 7.69% 0x30 2185 18 cpupri_set
0.65% 0.00% 0.00% 0x30 3404 108 __bitmap_and
0.00% 0.00% 2.56% 0x30 5542 21 cpupri_set
3.23% 5.13% 0.00% 0x38 3493 190 _find_first_and_bit
3.23% 2.56% 0.00% 0x38 3171 108 __bitmap_and
0.00% 2.56% 7.69% 0x38 3285 14 cpupri_set
0.00% 0.00% 5.13% 0x38 4035 27 cpupri_set
Signed-off-by: Pan Deng <pan.deng@intel.com>
Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
---
kernel/sched/cpupri.c | 200 ++++++++++++++++++++++++++++++++++++++----
kernel/sched/cpupri.h | 4 +
2 files changed, 186 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 42c40cfdf836..306b6baff4cd 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -64,6 +64,143 @@ static int convert_prio(int prio)
return cpupri;
}
+#ifdef CONFIG_CPUMASK_OFFSTACK
+static inline int alloc_vec_masks(struct cpupri_vec *vec)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ if (!zalloc_cpumask_var_node(&vec->masks[i], GFP_KERNEL, i))
+ goto cleanup;
+
+ /* Clear this node's bits, set all other nodes' bits */
+ bitmap_complement(cpumask_bits(vec->masks[i]),
+ cpumask_bits(cpumask_of_node(i)), small_cpumask_bits);
+ }
+ return 0;
+
+cleanup:
+ while (i--)
+ free_cpumask_var(vec->masks[i]);
+ return -ENOMEM;
+}
+
+static inline void free_vec_masks(struct cpupri_vec *vec)
+{
+ for (int i = 0; i < nr_node_ids; i++)
+ free_cpumask_var(vec->masks[i]);
+}
+
+static inline int setup_vec_mask_var_ts(struct cpupri *cp)
+{
+ int i;
+
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+ vec->masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
+ if (!vec->masks)
+ goto cleanup;
+ }
+ return 0;
+
+cleanup:
+ /* Free any already allocated masks */
+ while (i--) {
+ kfree(cp->pri_to_cpu[i].masks);
+ cp->pri_to_cpu[i].masks = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+static inline void free_vec_mask_var_ts(struct cpupri *cp)
+{
+ for (int i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ kfree(cp->pri_to_cpu[i].masks);
+ cp->pri_to_cpu[i].masks = NULL;
+ }
+}
+
+static inline int
+available_cpu_in_nodes(struct task_struct *p, struct cpupri_vec *vec)
+{
+ int cur_node = numa_node_id();
+
+ for (int i = 0; i < nr_node_ids; i++) {
+ int nid = (cur_node + i) % nr_node_ids;
+
+ if (cpumask_first_and_and(&p->cpus_mask, vec->masks[nid],
+ cpumask_of_node(nid)) < nr_cpu_ids)
+ return 1;
+ }
+
+ return 0;
+}
+
+#define available_cpu_in_vec available_cpu_in_nodes
+
+#else /* !CONFIG_CPUMASK_OFFSTACK */
+
+static inline int alloc_vec_masks(struct cpupri_vec *vec)
+{
+ if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static inline void free_vec_masks(struct cpupri_vec *vec)
+{
+ free_cpumask_var(vec->mask);
+}
+
+static inline int setup_vec_mask_var_ts(struct cpupri *cp)
+{
+ return 0;
+}
+
+static inline void free_vec_mask_var_ts(struct cpupri *cp)
+{
+}
+
+static inline int
+available_cpu_in_vec(struct task_struct *p, struct cpupri_vec *vec)
+{
+ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
+ return 0;
+
+ return 1;
+}
+#endif
+
+static inline int alloc_all_masks(struct cpupri *cp)
+{
+ int i;
+
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ if (alloc_vec_masks(&cp->pri_to_cpu[i]))
+ goto cleanup;
+ }
+
+ return 0;
+
+cleanup:
+ while (i--)
+ free_vec_masks(&cp->pri_to_cpu[i]);
+
+ return -ENOMEM;
+}
+
+static inline void setup_vec_counts(struct cpupri *cp)
+{
+ for (int i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+ atomic_set(&vec->count, 0);
+ }
+}
+
static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask, int idx)
{
@@ -96,11 +233,24 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
return 0;
- if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
+ if (!available_cpu_in_vec(p, vec))
return 0;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ struct cpumask *cpupri_mask = lowest_mask;
+
+ /* Vector has available CPUs; fold the per-node masks when lowest_mask is set */
+ if (lowest_mask) {
+ cpumask_copy(cpupri_mask, vec->masks[0]);
+ for (int nid = 1; nid < nr_node_ids; nid++)
+ cpumask_and(cpupri_mask, cpupri_mask, vec->masks[nid]);
+ }
+#else
+ struct cpumask *cpupri_mask = vec->mask;
+#endif
+
if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+ cpumask_and(lowest_mask, &p->cpus_mask, cpupri_mask);
cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/*
@@ -229,7 +379,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
if (likely(newpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ cpumask_set_cpu(cpu, vec->masks[cpu_to_node(cpu)]);
+#else
cpumask_set_cpu(cpu, vec->mask);
+#endif
/*
* When adding a new vector, we update the mask first,
* do a write memory barrier, and then update the count, to
@@ -263,7 +417,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
*/
atomic_dec(&(vec)->count);
smp_mb__after_atomic();
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ cpumask_clear_cpu(cpu, vec->masks[cpu_to_node(cpu)]);
+#else
cpumask_clear_cpu(cpu, vec->mask);
+#endif
}
*currpri = newpri;
@@ -279,26 +437,31 @@ int cpupri_init(struct cpupri *cp)
{
int i;
- for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[i];
-
- atomic_set(&vec->count, 0);
- if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
- goto cleanup;
- }
-
+ /* Allocate the cpu_to_pri array */
cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
if (!cp->cpu_to_pri)
- goto cleanup;
+ return -ENOMEM;
+ /* Initialize all CPUs to invalid priority */
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+ /* Setup priority vectors */
+ setup_vec_counts(cp);
+ if (setup_vec_mask_var_ts(cp))
+ goto fail_setup_vectors;
+
+ /* Allocate masks for each priority vector */
+ if (alloc_all_masks(cp))
+ goto fail_alloc_masks;
+
return 0;
-cleanup:
- for (i--; i >= 0; i--)
- free_cpumask_var(cp->pri_to_cpu[i].mask);
+fail_alloc_masks:
+ free_vec_mask_var_ts(cp);
+
+fail_setup_vectors:
+ kfree(cp->cpu_to_pri);
return -ENOMEM;
}
@@ -308,9 +471,10 @@ int cpupri_init(struct cpupri *cp)
*/
void cpupri_cleanup(struct cpupri *cp)
{
- int i;
-
kfree(cp->cpu_to_pri);
- for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
- free_cpumask_var(cp->pri_to_cpu[i].mask);
+
+ for (int i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+ free_vec_masks(&cp->pri_to_cpu[i]);
+
+ free_vec_mask_var_ts(cp);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 245b0fa626be..c53f1f4dad86 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -9,7 +9,11 @@
struct cpupri_vec {
atomic_t count;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ cpumask_var_t *masks ____cacheline_aligned;
+#else
cpumask_var_t mask ____cacheline_aligned;
+#endif
};
struct cpupri {
--
2.43.5
On 7/7/2025 10:35 AM, Pan Deng wrote:
> When running a multi-instance FFmpeg workload on an HCC (high core
> count) system, significant contention is observed on the bitmap of
> `cpupri_vec->cpumask`.
>
> The SUT is a 2-socket machine with 240 physical cores and 480 logical
> CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical cores
> (8 logical CPUs) for transcoding tasks. Sub-threads use RT priority 99
> with FIFO scheduling. FPS is used as the score.
>
> The perf c2c tool reveals the following about the cpumask (bitmap)
> cache line of `cpupri_vec->mask`:
> - bits are loaded during cpupri_find
> - bits are stored during cpupri_set
> - cycles per load: ~2.2K to 8.7K
>
> This change splits `cpupri_vec->cpumask` into per-NUMA-node data to
> mitigate false sharing.
>
> As a result:
> - FPS improves by ~3.8%
> - Kernel cycles% drops from ~20% to ~18.7%
> - Cache line contention is mitigated; perf c2c shows cycles per load
>   dropping from ~2.2K-8.7K to ~0.5K-2.2K
>
> Note: the CONFIG_CPUMASK_OFFSTACK=n case remains unchanged.

This brings a noticeable improvement for the RT workload, and it would
be even more convincing if we could also try normal task workloads and
confirm there is no regression (schbench/hackbench, etc.).

thanks,
Chenyu
> -----Original Message-----
> From: Chen, Yu C <yu.c.chen@intel.com>
> Sent: Monday, July 21, 2025 7:24 PM
> To: Deng, Pan <pan.deng@intel.com>
> Cc: linux-kernel@vger.kernel.org; Li, Tianyou <tianyou.li@intel.com>;
> tim.c.chen@linux.intel.com; peterz@infradead.org; mingo@kernel.org
> Subject: Re: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA
> node to reduce contention
>
> On 7/7/2025 10:35 AM, Pan Deng wrote:
> > When running a multi-instance FFmpeg workload on an HCC (high core
> > count) system, significant contention is observed on the bitmap of
> > `cpupri_vec->cpumask`.
> > [...]
> > Note: the CONFIG_CPUMASK_OFFSTACK=n case remains unchanged.
>
> This brings a noticeable improvement for the RT workload, and it would
> be even more convincing if we could also try normal task workloads and
> confirm there is no regression (schbench/hackbench, etc.).
>
> thanks,
> Chenyu

Thanks Yu, hackbench and schbench data will be provided later.
> -----Original Message-----
> From: Deng, Pan
> Sent: Tuesday, July 22, 2025 10:47 PM
> To: Chen, Yu C <yu.c.chen@intel.com>
> Cc: linux-kernel@vger.kernel.org; Li, Tianyou <tianyou.li@intel.com>;
> tim.c.chen@linux.intel.com; peterz@infradead.org; mingo@kernel.org
> Subject: RE: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA
> node to reduce contention
>
> > On 7/7/2025 10:35 AM, Pan Deng wrote:
> > > When running a multi-instance FFmpeg workload on an HCC (high core
> > > count) system, significant contention is observed on the bitmap of
> > > `cpupri_vec->cpumask`.
> > > [...]
> >
> > This brings a noticeable improvement for the RT workload, and it
> > would be even more convincing if we could also try normal task
> > workloads and confirm there is no regression (schbench/hackbench,
> > etc.).
>
> Thanks Yu, hackbench and schbench data will be provided later.

TLDR;
====
Hackbench and both the old and new versions of schbench were evaluated
on the SUT (2 sockets, 6 NUMA nodes, 240 physical cores, 480 logical
CPUs). No regressions were detected for patches 1-4. In addition,
symbol-level analysis of the `perf record -a` profiling data indicates
that the changes introduced in patches 1-4 are unlikely to cause
regressions in hackbench or schbench.

Details
=======

Hackbench
=========
The workload is run by the test framework
https://github.com/yu-chen-surf/schedtests, with the following
procedure:
1. Reboot the system to run a workload.
2. Run 5 iterations of the 1st configuration with a 30s cool-down
   period.
3. Run 5 iterations of the 2nd configuration...
...

The test results are as follows; regressions exceeding -10% are marked
with ** at the end of the line. However, when re-running those tests
with either the test framework or the vanilla workload, the regressions
could not be reproduced.

Notes: 15/30/45 are the fd# as well as the process/thread pairs in one
group; see the example invocation below.
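
For reference, a configuration such as "process-pipe-15, 8-groups" maps
to roughly the following invocation (illustrative; option spellings vary
across hackbench versions):

  # 8 groups x 15 fd pairs, process mode, pipes instead of sockets
  hackbench --pipe --process --groups 8 --fds 15
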
Patch 1
case                    load        baseline(std%)  patch1%( std%)
process-pipe-15         1-groups    1.00 ( 14.03)    -8.81 (  6.53)
process-pipe-15         2-groups    1.00 (  3.46)    +1.82 (  2.59)
process-pipe-15         4-groups    1.00 (  6.20)    +8.60 (  5.59)
process-pipe-15         8-groups    1.00 (  2.41)    -0.21 (  3.22)
process-pipe-30         1-groups    1.00 (  2.51)    +2.24 (  3.12)
process-pipe-30         2-groups    1.00 (  3.86)    -0.58 (  2.46)
process-pipe-30         4-groups    1.00 (  2.19)    -1.81 (  1.05)
process-pipe-30         8-groups    1.00 (  1.69)    +0.52 (  3.01)
process-pipe-45         1-groups    1.00 (  1.63)    +1.63 (  1.23)
process-pipe-45         2-groups    1.00 (  0.79)    +0.08 (  1.82)
process-pipe-45         4-groups    1.00 (  1.62)    -0.06 (  0.63)
process-pipe-45         8-groups    1.00 (  1.66)    -4.12 (  3.27)
process-sockets-15      1-groups    1.00 (  3.57)    +2.36 (  5.15)
process-sockets-15      2-groups    1.00 (  3.59)    -1.33 (  6.86)
process-sockets-15      4-groups    1.00 (  7.10)    +5.44 (  6.97)
process-sockets-15      8-groups    1.00 (  2.63)    -3.05 (  1.94)
process-sockets-30      1-groups    1.00 (  3.73)    -2.69 (  4.89)
process-sockets-30      2-groups    1.00 (  3.90)    -4.25 (  3.94)
process-sockets-30      4-groups    1.00 (  1.03)    -1.58 (  1.51)
process-sockets-30      8-groups    1.00 (  0.48)    +1.09 (  0.68)
process-sockets-45      1-groups    1.00 (  0.62)    -2.25 (  0.57)
process-sockets-45      2-groups    1.00 (  2.56)    -0.61 (  0.63)
process-sockets-45      4-groups    1.00 (  0.57)    -0.51 (  0.79)
process-sockets-45      8-groups    1.00 (  0.18)    -5.23 (  2.18)
threads-pipe-15         1-groups    1.00 (  5.30)    -1.47 (  5.38)
threads-pipe-15         2-groups    1.00 (  7.97)    -1.31 (  8.61)
threads-pipe-15         4-groups    1.00 (  4.94)    -3.31 (  5.48)
threads-pipe-15         8-groups    1.00 (  1.69)    +7.28 (  5.54)
threads-pipe-30         1-groups    1.00 (  5.12)    -1.58 (  4.82)
threads-pipe-30         2-groups    1.00 (  1.63)    +3.29 (  1.72)
threads-pipe-30         4-groups    1.00 (  3.41)    +3.05 (  3.22)
threads-pipe-30         8-groups    1.00 (  2.85)    +1.58 (  4.05)
threads-pipe-45         1-groups    1.00 (  5.13)    -0.78 (  6.78)
threads-pipe-45         2-groups    1.00 (  1.92)    -2.87 (  1.27)
threads-pipe-45         4-groups    1.00 (  2.41)    -4.37 (  1.23)
threads-pipe-45         8-groups    1.00 (  1.81)    +1.85 (  1.54)
threads-sockets-15      1-groups    1.00 (  4.72)    -0.73 (  2.75)
threads-sockets-15      2-groups    1.00 (  3.05)    +3.09 (  3.39)
threads-sockets-15      4-groups    1.00 (  5.92)    +0.87 (  2.25)
threads-sockets-15      8-groups    1.00 (  3.75)    -7.24 (  3.34)
threads-sockets-30      1-groups    1.00 (  5.96)    -6.27 (  3.35)
threads-sockets-30      2-groups    1.00 (  1.68)    -1.78 (  3.60)
threads-sockets-30      4-groups    1.00 (  5.02)    -0.95 (  3.60)
threads-sockets-30      8-groups    1.00 (  0.41)    -3.09 (  2.03)
threads-sockets-45      1-groups    1.00 (  2.55)    -1.32 (  1.37)
threads-sockets-45      2-groups    1.00 (  3.53)    -0.46 (  3.99)
threads-sockets-45      4-groups    1.00 (  0.51)    +0.67 (  0.74)
threads-sockets-45      8-groups    1.00 (  3.01)   -16.85 (  2.13) **

Patch 2
case                    load        baseline(std%)  patch2%( std%)
process-pipe-15         1-groups    1.00 ( 14.03)    -3.32 ( 11.34)
process-pipe-15         2-groups    1.00 (  3.46)    +2.19 (  7.27)
process-pipe-15         4-groups    1.00 (  6.20)    +2.01 (  2.83)
process-pipe-15         8-groups    1.00 (  2.41)    +1.65 (  4.39)
process-pipe-30         1-groups    1.00 (  2.51)    -0.88 (  3.26)
process-pipe-30         2-groups    1.00 (  3.86)    +2.25 (  3.21)
process-pipe-30         4-groups    1.00 (  2.19)    +0.20 (  1.72)
process-pipe-30         8-groups    1.00 (  1.69)    +0.85 (  0.61)
process-pipe-45         1-groups    1.00 (  1.63)    +3.10 (  4.01)
process-pipe-45         2-groups    1.00 (  0.79)    -1.00 (  0.69)
process-pipe-45         4-groups    1.00 (  1.62)    +0.07 (  0.63)
process-pipe-45         8-groups    1.00 (  1.66)    +0.20 (  1.47)
process-sockets-15      1-groups    1.00 (  3.57)    -5.44 (  3.45)
process-sockets-15      2-groups    1.00 (  3.59)    +1.00 (  4.35)
process-sockets-15      4-groups    1.00 (  7.10)    +0.46 (  4.45)
process-sockets-15      8-groups    1.00 (  2.63)    -1.48 (  4.56)
process-sockets-30      1-groups    1.00 (  3.73)    -0.17 (  3.57)
process-sockets-30      2-groups    1.00 (  3.90)    +3.83 (  7.54)
process-sockets-30      4-groups    1.00 (  1.03)    -2.35 (  6.11)
process-sockets-30      8-groups    1.00 (  0.48)    -0.43 (  0.79)
process-sockets-45      1-groups    1.00 (  0.62)    -2.24 (  1.63)
process-sockets-45      2-groups    1.00 (  2.56)    -1.41 (  3.17)
process-sockets-45      4-groups    1.00 (  0.57)    -0.29 (  0.33)
process-sockets-45      8-groups    1.00 (  0.18)    -6.05 (  1.55)
threads-pipe-15         1-groups    1.00 (  5.30)    -5.83 (  7.96)
threads-pipe-15         2-groups    1.00 (  7.97)    -3.74 (  4.22)
threads-pipe-15         4-groups    1.00 (  4.94)    -2.23 (  5.75)
threads-pipe-15         8-groups    1.00 (  1.69)    +0.21 (  3.08)
threads-pipe-30         1-groups    1.00 (  5.12)    -5.73 (  4.97)
threads-pipe-30         2-groups    1.00 (  1.63)    -1.76 (  4.49)
threads-pipe-30         4-groups    1.00 (  3.41)    -0.99 (  2.50)
threads-pipe-30         8-groups    1.00 (  2.85)    +0.71 (  1.04)
threads-pipe-45         1-groups    1.00 (  5.13)    +0.08 (  5.72)
threads-pipe-45         2-groups    1.00 (  1.92)    -1.78 (  1.30)
threads-pipe-45         4-groups    1.00 (  2.41)    -3.79 (  0.81)
threads-pipe-45         8-groups    1.00 (  1.81)    -3.62 (  1.41)
threads-sockets-15      1-groups    1.00 (  4.72)    +2.52 (  2.66)
threads-sockets-15      2-groups    1.00 (  3.05)    -7.59 (  1.80)
threads-sockets-15      4-groups    1.00 (  5.92)    +1.59 (  7.12)
threads-sockets-15      8-groups    1.00 (  3.75)    -0.34 (  3.62)
threads-sockets-30      1-groups    1.00 (  5.96)    -2.45 (  4.89)
threads-sockets-30      2-groups    1.00 (  1.68)    -0.61 (  4.80)
threads-sockets-30      4-groups    1.00 (  5.02)    -2.15 (  8.62)
threads-sockets-30      8-groups    1.00 (  0.41)   -17.32 (  0.88) **
threads-sockets-45      1-groups    1.00 (  2.55)    -3.24 (  3.37)
threads-sockets-45      2-groups    1.00 (  3.53)    -1.38 (  2.40)
threads-sockets-45      4-groups    1.00 (  0.51)    -0.17 (  0.85)
threads-sockets-45      8-groups    1.00 (  3.01)   -14.59 (  5.48) **

Patch 3
case                    load        baseline(std%)  patch3%( std%)
process-pipe-15         1-groups    1.00 ( 14.03)   -10.18 (  3.39) **
process-pipe-15         2-groups    1.00 (  3.46)    +5.18 (  3.12)
process-pipe-15         4-groups    1.00 (  6.20)    +8.63 (  5.72)
process-pipe-15         8-groups    1.00 (  2.41)    +5.37 (  2.24)
process-pipe-30         1-groups    1.00 (  2.51)    +5.53 (  3.55)
process-pipe-30         2-groups    1.00 (  3.86)    +5.70 (  4.27)
process-pipe-30         4-groups    1.00 (  2.19)    +3.95 (  3.34)
process-pipe-30         8-groups    1.00 (  1.69)    -3.38 (  1.51)
process-pipe-45         1-groups    1.00 (  1.63)    +5.19 (  2.51)
process-pipe-45         2-groups    1.00 (  0.79)    -0.63 (  2.06)
process-pipe-45         4-groups    1.00 (  1.62)    -5.83 (  2.22)
process-pipe-45         8-groups    1.00 (  1.66)    -6.13 (  2.34)
process-sockets-15      1-groups    1.00 (  3.57)    -1.51 (  4.21)
process-sockets-15      2-groups    1.00 (  3.59)    -1.30 (  7.50)
process-sockets-15      4-groups    1.00 (  7.10)    -1.80 (  5.58)
process-sockets-15      8-groups    1.00 (  2.63)    -1.68 (  3.40)
process-sockets-30      1-groups    1.00 (  3.73)    -7.74 (  1.58)
process-sockets-30      2-groups    1.00 (  3.90)    -1.98 (  5.48)
process-sockets-30      4-groups    1.00 (  1.03)    -0.33 (  3.47)
process-sockets-30      8-groups    1.00 (  0.48)    -0.40 (  0.84)
process-sockets-45      1-groups    1.00 (  0.62)    -0.21 (  0.54)
process-sockets-45      2-groups    1.00 (  2.56)    -1.97 (  2.48)
process-sockets-45      4-groups    1.00 (  0.57)    -0.61 (  0.83)
process-sockets-45      8-groups    1.00 (  0.18)    -5.09 (  1.85)
threads-pipe-15         1-groups    1.00 (  5.30)    +3.62 ( 11.04)
threads-pipe-15         2-groups    1.00 (  7.97)    +8.08 (  4.63)
threads-pipe-15         4-groups    1.00 (  4.94)    +6.46 (  5.27)
threads-pipe-15         8-groups    1.00 (  1.69)    +2.68 (  3.23)
threads-pipe-30         1-groups    1.00 (  5.12)    +3.60 (  7.09)
threads-pipe-30         2-groups    1.00 (  1.63)    -0.80 (  4.43)
threads-pipe-30         4-groups    1.00 (  3.41)    +2.37 (  2.16)
threads-pipe-30         8-groups    1.00 (  2.85)    +4.17 (  1.41)
threads-pipe-45         1-groups    1.00 (  5.13)    +7.41 (  4.48)
threads-pipe-45         2-groups    1.00 (  1.92)    -1.40 (  2.69)
threads-pipe-45         4-groups    1.00 (  2.41)    -1.25 (  2.15)
threads-pipe-45         8-groups    1.00 (  1.81)    +1.62 (  0.73)
threads-sockets-15      1-groups    1.00 (  4.72)   +10.11 (  7.95)
threads-sockets-15      2-groups    1.00 (  3.05)    -8.41 (  5.93)
threads-sockets-15      4-groups    1.00 (  5.92)   -10.89 (  4.29) **
threads-sockets-15      8-groups    1.00 (  3.75)    -7.66 (  3.33)
threads-sockets-30      1-groups    1.00 (  5.96)    -5.18 (  2.77)
threads-sockets-30      2-groups    1.00 (  1.68)    -4.91 (  3.89)
threads-sockets-30      4-groups    1.00 (  5.02)    -6.32 (  4.19)
threads-sockets-30      8-groups    1.00 (  0.41)   -11.73 (  0.96) **
threads-sockets-45      1-groups    1.00 (  2.55)    -3.16 (  1.97)
threads-sockets-45      2-groups    1.00 (  3.53)    -0.21 (  4.33)
threads-sockets-45      4-groups    1.00 (  0.51)    -0.75 (  2.07)
threads-sockets-45      8-groups    1.00 (  3.01)   -20.52 (  1.44) **

Patch 4
case                    load        baseline(std%)  patch4%( std%)
process-pipe-15         1-groups    1.00 ( 14.03)    -2.68 (  9.64)
process-pipe-15         2-groups    1.00 (  3.46)    +1.82 (  7.55)
process-pipe-15         4-groups    1.00 (  6.20)    +3.67 (  8.17)
process-pipe-15         8-groups    1.00 (  2.41)    +1.87 (  0.92)
process-pipe-30         1-groups    1.00 (  2.51)    -3.34 (  3.96)
process-pipe-30         2-groups    1.00 (  3.86)    -0.33 (  3.53)
process-pipe-30         4-groups    1.00 (  2.19)    -3.22 (  1.31)
process-pipe-30         8-groups    1.00 (  1.69)    -1.95 (  1.07)
process-pipe-45         1-groups    1.00 (  1.63)    +0.63 (  2.86)
process-pipe-45         2-groups    1.00 (  0.79)    -1.27 (  1.39)
process-pipe-45         4-groups    1.00 (  1.62)    -2.04 (  1.87)
process-pipe-45         8-groups    1.00 (  1.66)    -1.45 (  3.20)
process-sockets-15      1-groups    1.00 (  3.57)    -9.16 (  5.33)
process-sockets-15      2-groups    1.00 (  3.59)    -1.83 (  5.36)
process-sockets-15      4-groups    1.00 (  7.10)    +7.55 (  6.34)
process-sockets-15      8-groups    1.00 (  2.63)    -2.98 (  5.95)
process-sockets-30      1-groups    1.00 (  3.73)    +3.50 (  4.92)
process-sockets-30      2-groups    1.00 (  3.90)    +1.80 (  5.68)
process-sockets-30      4-groups    1.00 (  1.03)    -1.23 (  4.79)
process-sockets-30      8-groups    1.00 (  0.48)    -0.15 (  0.33)
process-sockets-45      1-groups    1.00 (  0.62)    -0.70 (  1.12)
process-sockets-45      2-groups    1.00 (  2.56)    +0.64 (  0.86)
process-sockets-45      4-groups    1.00 (  0.57)    +0.09 (  0.53)
process-sockets-45      8-groups    1.00 (  0.18)    -7.31 (  2.11)
threads-pipe-15         1-groups    1.00 (  5.30)    +4.94 (  9.52)
threads-pipe-15         2-groups    1.00 (  7.97)    -4.28 (  2.30)
threads-pipe-15         4-groups    1.00 (  4.94)    -1.83 (  4.24)
threads-pipe-15         8-groups    1.00 (  1.69)    -2.35 (  1.50)
threads-pipe-30         1-groups    1.00 (  5.12)    +2.06 (  5.00)
threads-pipe-30         2-groups    1.00 (  1.63)    +0.93 (  4.53)
threads-pipe-30         4-groups    1.00 (  3.41)    -2.85 (  3.20)
threads-pipe-30         8-groups    1.00 (  2.85)    -2.20 (  2.68)
threads-pipe-45         1-groups    1.00 (  5.13)    -0.97 (  4.70)
threads-pipe-45         2-groups    1.00 (  1.92)    -2.11 (  1.21)
threads-pipe-45         4-groups    1.00 (  2.41)    -2.69 (  1.33)
threads-pipe-45         8-groups    1.00 (  1.81)    -2.41 (  1.14)
threads-sockets-15      1-groups    1.00 (  4.72)    +0.82 (  4.21)
threads-sockets-15      2-groups    1.00 (  3.05)    -1.28 (  2.48)
threads-sockets-15      4-groups    1.00 (  5.92)    -1.75 (  7.25)
threads-sockets-15      8-groups    1.00 (  3.75)    -2.54 (  3.49)
threads-sockets-30      1-groups    1.00 (  5.96)    -0.46 (  5.30)
threads-sockets-30      2-groups    1.00 (  1.68)    -0.45 (  1.75)
threads-sockets-30      4-groups    1.00 (  5.02)    -1.48 (  6.51)
threads-sockets-30      8-groups    1.00 (  0.41)   -13.09 (  1.61) **
threads-sockets-45      1-groups    1.00 (  2.55)    -1.68 (  0.66)
threads-sockets-45      2-groups    1.00 (  3.53)    +0.21 (  2.23)
threads-sockets-45      4-groups    1.00 (  0.51)    -1.27 (  1.43)
threads-sockets-45      8-groups    1.00 (  3.01)    -3.41 (  0.43)

Additionally, profiling data was collected using `perf record -a` for
this workload.
First, the cycles distribution is almost the same between the baseline
and patches 1-4. Second, the patch-relevant symbols identified were
set_rd_overloaded/set_rd_overutilized, which are potentially invoked
(actually inlined) by `update_sd_lb_stats`. `update_sd_lb_stats` itself
takes ~2.6% of cycles in the baseline threads-sockets-45fd, 8-groups
configuration, and no regressions were observed in patches 1-4 for this
function. So I think the patches won't cause regressions to hackbench.

Schbench(old, 91ea787)
======================
The workload is run with the same methodology as hackbench, with a
runtime of 100s. Test results are as follows; regressions over -5% are
marked with ** at the end of the line. Again, when the tests were re-run
with either the test framework or the vanilla workload, the regressions
could not be reproduced.

case      load                    baseline(std%)  opt1%( std%)
normal    1-mthreads-8-workers    1.00 (  1.44)   -5.60 (  2.96) **
normal    1-mthreads-2-workers    1.00 (  2.79)   -2.65 (  5.48)
normal    1-mthreads-1-workers    1.00 (  1.27)   -1.60 (  1.03)
normal    1-mthreads-31-workers   1.00 (  1.30)   -0.87 (  2.34)
normal    1-mthreads-16-workers   1.00 (  1.74)   -2.23 (  1.15)
normal    1-mthreads-4-workers    1.00 (  3.35)   -1.92 (  1.62)
normal    2-mthreads-8-workers    1.00 (  2.17)   -2.09 (  1.38)
normal    2-mthreads-31-workers   1.00 (  1.83)   +1.93 (  1.84)
normal    2-mthreads-16-workers   1.00 (  2.06)   +0.36 (  2.38)
normal    2-mthreads-1-workers    1.00 (  3.86)   +0.50 (  2.46)
normal    2-mthreads-2-workers    1.00 (  1.76)   -6.91 (  2.55)
normal    2-mthreads-4-workers    1.00 (  1.59)   -5.58 (  5.99)
normal    4-mthreads-8-workers    1.00 (  0.85)   +0.59 (  0.54)
normal    4-mthreads-31-workers   1.00 ( 15.31)  +15.04 ( 12.71)
normal    4-mthreads-16-workers   1.00 (  0.99)   -2.62 (  2.15)
normal    4-mthreads-4-workers    1.00 (  1.42)   -2.72 (  1.70)
normal    4-mthreads-1-workers    1.00 (  1.43)   -2.84 (  1.73)
normal    4-mthreads-2-workers    1.00 (  1.78)   -4.28 (  2.08)
normal    8-mthreads-16-workers   1.00 ( 10.04)   +7.06 (  0.73)
normal    8-mthreads-31-workers   1.00 (  1.94)   -1.66 (  2.28)
normal    8-mthreads-2-workers    1.00 (  2.51)   -0.30 (  1.53)
normal    8-mthreads-8-workers    1.00 (  1.56)   -1.83 (  1.39)
normal    8-mthreads-1-workers    1.00 (  4.08)   +0.45 (  1.45)
normal    8-mthreads-4-workers    1.00 (  1.84)   +2.85 (  1.07)

case      load                    baseline(std%)  opt2%( std%)
normal    1-mthreads-8-workers    1.00 (  1.44)   -1.48 (  3.79)
normal    1-mthreads-2-workers    1.00 (  2.79)   +3.32 (  0.90)
normal    1-mthreads-1-workers    1.00 (  1.27)   +1.98 (  1.02)
normal    1-mthreads-31-workers   1.00 (  1.30)   +5.84 (  3.01)
normal    1-mthreads-16-workers   1.00 (  1.74)   +5.90 (  0.68)
normal    1-mthreads-4-workers    1.00 (  3.35)   +1.82 (  1.65)
normal    2-mthreads-8-workers    1.00 (  2.17)   +2.80 (  2.04)
normal    2-mthreads-31-workers   1.00 (  1.83)   -0.07 (  1.09)
normal    2-mthreads-16-workers   1.00 (  2.06)   +2.45 (  2.55)
normal    2-mthreads-1-workers    1.00 (  3.86)   +2.41 (  2.92)
normal    2-mthreads-2-workers    1.00 (  1.76)   -1.29 (  2.03)
normal    2-mthreads-4-workers    1.00 (  1.59)   +0.44 (  1.15)
normal    4-mthreads-8-workers    1.00 (  0.85)   -0.81 (  3.03)
normal    4-mthreads-31-workers   1.00 ( 15.31)   +2.06 ( 15.97)
normal    4-mthreads-16-workers   1.00 (  0.99)   -1.46 (  2.29)
normal    4-mthreads-4-workers    1.00 (  1.42)   -0.15 (  3.37)
normal    4-mthreads-1-workers    1.00 (  1.43)   +0.97 (  1.95)
normal    4-mthreads-2-workers    1.00 (  1.78)   -0.38 (  2.53)
normal    8-mthreads-16-workers   1.00 ( 10.04)   +5.80 (  1.72)
normal    8-mthreads-31-workers   1.00 (  1.94)   -0.76 (  2.33)
normal    8-mthreads-2-workers    1.00 (  2.51)   +2.47 (  2.17)
normal    8-mthreads-8-workers    1.00 (  1.56)   -0.66 (  1.47)
normal    8-mthreads-1-workers    1.00 (  4.08)   +2.71 (  2.78)
normal    8-mthreads-4-workers    1.00 (  1.84)   +2.35 (  4.88)

case      load                    baseline(std%)  opt3%( std%)
normal    1-mthreads-8-workers    1.00 (  1.44)   -6.90 (  3.85) **
normal    1-mthreads-2-workers    1.00 (  2.79)   +3.23 (  3.09)
normal    1-mthreads-1-workers    1.00 (  1.27)   -1.04 (  2.22)
normal    1-mthreads-31-workers   1.00 (  1.30)   +2.16 (  1.64)
normal    1-mthreads-16-workers   1.00 (  1.74)   -0.72 (  5.70)
normal    1-mthreads-4-workers    1.00 (  3.35)   -1.92 (  4.31)
normal    2-mthreads-8-workers    1.00 (  2.17)   +0.82 (  1.90)
normal    2-mthreads-31-workers   1.00 (  1.83)   +2.08 (  1.16)
normal    2-mthreads-16-workers   1.00 (  2.06)   +4.04 (  2.42)
normal    2-mthreads-1-workers    1.00 (  3.86)   +2.57 (  3.44)
normal    2-mthreads-2-workers    1.00 (  1.76)   -0.12 (  1.29)
normal    2-mthreads-4-workers    1.00 (  1.59)   -2.04 (  2.83)
normal    4-mthreads-8-workers    1.00 (  0.85)   +0.22 (  1.65)
normal    4-mthreads-31-workers   1.00 ( 15.31)  +15.09 (  9.83)
normal    4-mthreads-16-workers   1.00 (  0.99)   +1.46 (  1.88)
normal    4-mthreads-4-workers    1.00 (  1.42)   +2.34 (  1.57)
normal    4-mthreads-1-workers    1.00 (  1.43)   -0.77 (  2.45)
normal    4-mthreads-2-workers    1.00 (  1.78)   -1.16 (  1.85)
normal    8-mthreads-16-workers   1.00 ( 10.04)   +7.39 (  1.65)
normal    8-mthreads-31-workers   1.00 (  1.94)   -0.81 (  2.14)
normal    8-mthreads-2-workers    1.00 (  2.51)   -1.93 (  2.00)
normal    8-mthreads-8-workers    1.00 (  1.56)   +1.17 (  1.40)
normal    8-mthreads-1-workers    1.00 (  4.08)   +1.63 (  0.51)
normal    8-mthreads-4-workers    1.00 (  1.84)   +4.77 (  2.36)

case      load                    baseline(std%)  opt4%( std%)
normal    1-mthreads-8-workers    1.00 (  1.44)   -0.27 (  3.05)
normal    1-mthreads-2-workers    1.00 (  2.79)   -0.31 (  1.19)
normal    1-mthreads-1-workers    1.00 (  1.27)   +1.62 (  1.77)
normal    1-mthreads-31-workers   1.00 (  1.30)   +1.30 (  3.34)
normal    1-mthreads-16-workers   1.00 (  1.74)   +0.07 (  3.38)
normal    1-mthreads-4-workers    1.00 (  3.35)   +1.08 (  2.48)
normal    2-mthreads-8-workers    1.00 (  2.17)   +0.04 (  3.87)
normal    2-mthreads-31-workers   1.00 (  1.83)   +1.29 (  1.44)
normal    2-mthreads-16-workers   1.00 (  2.06)   +0.94 (  2.96)
normal    2-mthreads-1-workers    1.00 (  3.86)   +2.85 (  2.12)
normal    2-mthreads-2-workers    1.00 (  1.76)   -0.30 (  2.37)
normal    2-mthreads-4-workers    1.00 (  1.59)   +2.22 (  1.51)
normal    4-mthreads-8-workers    1.00 (  0.85)   +2.20 (  3.06)
normal    4-mthreads-31-workers   1.00 ( 15.31)  +15.65 ( 12.68)
normal    4-mthreads-16-workers   1.00 (  0.99)   -1.96 (  3.30)
normal    4-mthreads-4-workers    1.00 (  1.42)   -1.19 (  3.42)
normal    4-mthreads-1-workers    1.00 (  1.43)   +2.26 (  2.45)
normal    4-mthreads-2-workers    1.00 (  1.78)   -1.36 (  2.75)
normal    8-mthreads-16-workers   1.00 ( 10.04)   -0.33 ( 11.13)
normal    8-mthreads-31-workers   1.00 (  1.94)   -1.14 (  2.01)
normal    8-mthreads-2-workers    1.00 (  2.51)   +2.32 (  2.26)
normal    8-mthreads-8-workers    1.00 (  1.56)   -0.44 (  1.54)
normal    8-mthreads-1-workers    1.00 (  4.08)   +2.17 (  2.10)
normal    8-mthreads-4-workers    1.00 (  1.84)   +3.42 (  2.34)

Again, per the perf record data, the cycles distribution is almost the
same between the baseline and patches 1-4. The symbols related to
patches 1-4 are set_rd_overloaded/set_rd_overutilized, inlined in
`update_sd_lb_stats`, which accounts for ~0.47% (self) of cycles in the
baseline 1-message-thread, 8-workers configuration, and no regressions
were observed in patches 1-4 for this function. So I think the patches
won't cause regressions to schbench(old).

Schbench(new, 48aed1d)
======================
The workload was executed using the test framework available at
https://github.com/gormanm/mmtests. Each configuration was run for 5
iterations, with a runtime of 100 seconds per iteration. No significant
regressions were observed, as detailed below:
Notes:
1. The message thread# is always 6, the same as the NUMA node#.
2. 1/2/4/8/16/32/64/79 are the worker# per message thread.

                                    baseline    patch1
Amean    request-99.0th-qrtle-1     1.00         0.00%
Amean    rps-50.0th-qrtle-1         1.00         0.06%
Amean    wakeup-99.0th-qrtle-1      1.00         0.26%
Amean    request-99.0th-qrtle-2     1.00         0.23%
Amean    rps-50.0th-qrtle-2         1.00         0.00%
Amean    wakeup-99.0th-qrtle-2      1.00         1.09%
Amean    request-99.0th-qrtle-4     1.00        -1.32%
Amean    rps-50.0th-qrtle-4         1.00         0.11%
Amean    wakeup-99.0th-qrtle-4      1.00        -0.41%
Amean    request-99.0th-qrtle-8     1.00        -0.08%
Amean    rps-50.0th-qrtle-8         1.00        -0.17%
Amean    wakeup-99.0th-qrtle-8      1.00         0.37%
Amean    request-99.0th-qrtle-16    1.00         0.23%
Amean    rps-50.0th-qrtle-16        1.00        -0.06%
Amean    wakeup-99.0th-qrtle-16     1.00         1.03%
Amean    request-99.0th-qrtle-32    1.00         0.27%
Amean    rps-50.0th-qrtle-32        1.00         0.06%
Amean    wakeup-99.0th-qrtle-32     1.00        -0.37%
Amean    request-99.0th-qrtle-64    1.00         0.57%
Amean    rps-50.0th-qrtle-64        1.00        -0.28%
Amean    wakeup-99.0th-qrtle-64     1.00        -3.00%
Amean    request-99.0th-qrtle-79    1.00         0.21%
Amean    rps-50.0th-qrtle-79        1.00        -0.23%
Amean    wakeup-99.0th-qrtle-79     1.00         2.00%

                                    baseline    patch2
Amean    request-99.0th-qrtle-1     1.00        -0.46%
Amean    rps-50.0th-qrtle-1         1.00         0.11%
Amean    wakeup-99.0th-qrtle-1      1.00        -2.01%
Amean    request-99.0th-qrtle-2     1.00        -0.08%
Amean    rps-50.0th-qrtle-2         1.00         0.00%
Amean    wakeup-99.0th-qrtle-2      1.00        -1.42%
Amean    request-99.0th-qrtle-4     1.00        -1.16%
Amean    rps-50.0th-qrtle-4         1.00         0.11%
Amean    wakeup-99.0th-qrtle-4      1.00        -1.30%
Amean    request-99.0th-qrtle-8     1.00        -0.08%
Amean    rps-50.0th-qrtle-8         1.00        -0.40%
Amean    wakeup-99.0th-qrtle-8      1.00         1.25%
Amean    request-99.0th-qrtle-16    1.00         0.46%
Amean    rps-50.0th-qrtle-16        1.00        -0.06%
Amean    wakeup-99.0th-qrtle-16     1.00         2.52%
Amean    request-99.0th-qrtle-32    1.00        14.83%
Amean    rps-50.0th-qrtle-32        1.00         0.75%
Amean    wakeup-99.0th-qrtle-32     1.00         3.03%
Amean    request-99.0th-qrtle-64    1.00        -0.44%
Amean    rps-50.0th-qrtle-64        1.00         0.28%
Amean    wakeup-99.0th-qrtle-64     1.00        -3.50%
Amean    request-99.0th-qrtle-79    1.00        -0.09%
Amean    rps-50.0th-qrtle-79        1.00         0.08%
Amean    wakeup-99.0th-qrtle-79     1.00        -1.20%

                                    baseline    patch3
Amean    request-99.0th-qrtle-1     1.00         0.31%
Amean    rps-50.0th-qrtle-1         1.00        -0.17%
Amean    wakeup-99.0th-qrtle-1      1.00         0.44%
Amean    request-99.0th-qrtle-2     1.00        -0.61%
Amean    rps-50.0th-qrtle-2         1.00        -0.29%
Amean    wakeup-99.0th-qrtle-2      1.00         1.93%
Amean    request-99.0th-qrtle-4     1.00        -1.62%
Amean    rps-50.0th-qrtle-4         1.00        -0.17%
Amean    wakeup-99.0th-qrtle-4      1.00         0.00%
Amean    request-99.0th-qrtle-8     1.00         0.00%
Amean    rps-50.0th-qrtle-8         1.00        -0.40%
Amean    wakeup-99.0th-qrtle-8      1.00        -0.29%
Amean    request-99.0th-qrtle-16    1.00         0.53%
Amean    rps-50.0th-qrtle-16        1.00        -0.17%
Amean    wakeup-99.0th-qrtle-16     1.00        -1.03%
Amean    request-99.0th-qrtle-32    1.00         0.09%
Amean    rps-50.0th-qrtle-32        1.00        -0.17%
Amean    wakeup-99.0th-qrtle-32     1.00         2.41%
Amean    request-99.0th-qrtle-64    1.00         0.26%
Amean    rps-50.0th-qrtle-64        1.00        -0.16%
Amean    wakeup-99.0th-qrtle-64     1.00        -2.00%
Amean    request-99.0th-qrtle-79    1.00         0.26%
Amean    rps-50.0th-qrtle-79        1.00        -0.46%
Amean    wakeup-99.0th-qrtle-79     1.00         1.20%

                                    baseline    patch4
Amean    request-99.0th-qrtle-1     1.00        -0.15%
Amean    rps-50.0th-qrtle-1         1.00        -0.06%
Amean    wakeup-99.0th-qrtle-1      1.00        -2.88%
Amean    request-99.0th-qrtle-2     1.00        -0.31%
Amean    rps-50.0th-qrtle-2         1.00        -0.29%
Amean    wakeup-99.0th-qrtle-2      1.00        -0.59%
Amean    request-99.0th-qrtle-4     1.00        -0.23%
Amean    rps-50.0th-qrtle-4         1.00        -0.11%
Amean    wakeup-99.0th-qrtle-4      1.00        -0.41%
Amean    request-99.0th-qrtle-8     1.00        -0.08%
Amean    rps-50.0th-qrtle-8         1.00        -0.52%
Amean    wakeup-99.0th-qrtle-8      1.00         1.91%
Amean    request-99.0th-qrtle-16    1.00         0.76%
Amean    rps-50.0th-qrtle-16        1.00         0.06%
Amean    wakeup-99.0th-qrtle-16     1.00         1.03%
Amean    request-99.0th-qrtle-32    1.00         8.36%
Amean    rps-50.0th-qrtle-32        1.00         0.00%
Amean    wakeup-99.0th-qrtle-32     1.00        -1.05%
Amean    request-99.0th-qrtle-64    1.00         0.13%
Amean    rps-50.0th-qrtle-64        1.00         0.00%
Amean    wakeup-99.0th-qrtle-64     1.00        -4.00%
Amean    request-99.0th-qrtle-79    1.00        -0.39%
Amean    rps-50.0th-qrtle-79        1.00         0.14%
Amean    wakeup-99.0th-qrtle-79     1.00        -0.40%
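
For reference, a schbench (old version) configuration such as
"2-mthreads-16-workers" corresponds roughly to the following invocation
(illustrative; flags per the old schbench usage):

  # 2 message threads, 16 workers each, 100s runtime
  schbench -m 2 -t 16 -r 100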