[PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA node to reduce contention

Posted by Pan Deng 3 months ago
When running a multi-instance FFmpeg workload on an HCC (high core count)
system, significant contention is observed on the bitmap of
`cpupri_vec->cpumask`.

The SUT is a 2-socket machine with 240 physical cores and 480 logical
CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical cores
(8 logical CPUs) for transcoding tasks. Sub-threads use RT priority 99
with FIFO scheduling. FPS is used as the score.

perf c2c tool reveals:
cpumask (bitmap) cache line of `cpupri_vec->mask`:
- bits are loaded during cpupri_find
- bits are stored during cpupri_set
- cycles per load: ~2.2K to 8.7K

This change splits `cpupri_vec->cpumask` into per-NUMA-node data to
mitigate false sharing.

As a result:
- FPS improves by ~3.8%
- Kernel cycles% drops from ~20% to ~18.7%
- Cache line contention is mitigated; perf c2c shows cycles per load
  drop from ~2.2K-8.7K to ~0.5K-2.2K

Note: CONFIG_CPUMASK_OFFSTACK=n remains unchanged.
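
For reference, below is a minimal user-space sketch (illustration only, not
the kernel code) of the bookkeeping trick this patch relies on: each per-node
mask is initialized to the complement of that node's CPU set, so cpupri_set()
only ever writes the mask of the CPU's own node (its own cache line), while
cpupri_find() can still recover the full per-priority cpumask by ANDing all
per-node masks. The node count, CPU layout, 8-bit mask width and helper
cpus_of_node() are invented for illustration.

#include <stdint.h>
#include <stdio.h>

#define NR_NODES      2
#define CPUS_PER_NODE 4

static uint8_t cpus_of_node(int node)   /* node 0 = CPUs 0-3, node 1 = CPUs 4-7 */
{
	return (uint8_t)(0x0F << (node * CPUS_PER_NODE));
}

int main(void)
{
	uint8_t node_mask[NR_NODES];
	uint8_t global = 0xFF;
	int node;

	/* Init: mask[i] = ~cpus_of_node(i), as alloc_vec_masks() does */
	for (node = 0; node < NR_NODES; node++)
		node_mask[node] = (uint8_t)~cpus_of_node(node);

	/* "cpupri_set(cpu)": set the bit only in the owning node's mask */
	node_mask[0] |= 1u << 1;        /* CPU 1 (node 0) gets this priority */
	node_mask[1] |= 1u << 6;        /* CPU 6 (node 1) gets this priority */

	/* "cpupri_find()": AND of all per-node masks == global mask {1, 6} */
	for (node = 0; node < NR_NODES; node++)
		global &= node_mask[node];

	printf("global mask = 0x%02x\n", global);   /* prints 0x42 */
	return 0;
}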

Appendix:
1. Perf c2c report of `cpupri_vec->mask` bitmap cache line:
-------  -------  ------  ------  ------  ------  ------------------------
 Rmt      Lcl     Store   Data    Load    Total    Symbol
Hitm%    Hitm%   L1 Hit%  offset  cycles  records
-------  -------  ------  ------  ------  ------  ------------------------
 155       39       39    0xff14d52c4682d800
-------  -------  ------  ------  ------  ------  ------------------------
43.23%   43.59%    0.00%  0x0     3489    415   _find_first_and_bit
 3.23%    5.13%    0.00%  0x0     3478    107   __bitmap_and
 3.23%    0.00%    0.00%  0x0     2712    33    _find_first_and_bit
 1.94%    0.00%    7.69%  0x0     5992    33    cpupri_set
 0.00%    0.00%    5.13%  0x0     3733    19    cpupri_set
12.90%   12.82%    0.00%  0x8     3452    297   _find_first_and_bit
 1.29%    2.56%    0.00%  0x8     3007    117   __bitmap_and
 0.00%    5.13%    0.00%  0x8     3041    20    _find_first_and_bit
 0.00%    2.56%    2.56%  0x8     2374    22    cpupri_set
 0.00%    0.00%    7.69%  0x8     4194    38    cpupri_set
 8.39%    2.56%    0.00%  0x10    3336    264   _find_first_and_bit
 3.23%    0.00%    0.00%  0x10    3023    46    _find_first_and_bit
 2.58%    0.00%    0.00%  0x10    3040    130   __bitmap_and
 1.29%    0.00%   12.82%  0x10    4075    34    cpupri_set
 0.00%    0.00%    2.56%  0x10    2197    19    cpupri_set
 0.00%    2.56%    7.69%  0x18    4085    27    cpupri_set
 0.00%    2.56%    0.00%  0x18    3128    220   _find_first_and_bit
 0.00%    0.00%    5.13%  0x18    3028    20    cpupri_set
 2.58%    2.56%    0.00%  0x20    3089    198   _find_first_and_bit
 1.29%    0.00%    5.13%  0x20    5114    29    cpupri_set
 0.65%    2.56%    0.00%  0x20    3224    96    __bitmap_and
 0.65%    0.00%    7.69%  0x20    4392    31    cpupri_set
 2.58%    0.00%    0.00%  0x28    3327    214   _find_first_and_bit
 0.65%    2.56%    5.13%  0x28    5252    31    cpupri_set
 0.65%    0.00%    7.69%  0x28    8755    25    cpupri_set
 0.65%    0.00%    0.00%  0x28    4414    14    _find_first_and_bit
 1.29%    2.56%    0.00%  0x30    3139    171   _find_first_and_bit
 0.65%    0.00%    7.69%  0x30    2185    18    cpupri_set
 0.65%    0.00%    0.00%  0x30    3404    108   __bitmap_and
 0.00%    0.00%    2.56%  0x30    5542    21    cpupri_set
 3.23%    5.13%    0.00%  0x38    3493    190   _find_first_and_bit
 3.23%    2.56%    0.00%  0x38    3171    108   __bitmap_and
 0.00%    2.56%    7.69%  0x38    3285    14    cpupri_set
 0.00%    0.00%    5.13%  0x38    4035    27    cpupri_set

Signed-off-by: Pan Deng <pan.deng@intel.com>
Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
---
 kernel/sched/cpupri.c | 200 ++++++++++++++++++++++++++++++++++++++----
 kernel/sched/cpupri.h |   4 +
 2 files changed, 186 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 42c40cfdf836..306b6baff4cd 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -64,6 +64,143 @@ static int convert_prio(int prio)
 	return cpupri;
 }
 
+#ifdef	CONFIG_CPUMASK_OFFSTACK
+static inline int alloc_vec_masks(struct cpupri_vec *vec)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (!zalloc_cpumask_var_node(&vec->masks[i], GFP_KERNEL, i))
+			goto cleanup;
+
+		/* Initialize to ~cpumask_of_node(i): this node's bits clear, others set */
+		bitmap_complement(cpumask_bits(vec->masks[i]),
+			cpumask_bits(cpumask_of_node(i)), small_cpumask_bits);
+	}
+	return 0;
+
+cleanup:
+	while (i--)
+		free_cpumask_var(vec->masks[i]);
+	return -ENOMEM;
+}
+
+static inline void free_vec_masks(struct cpupri_vec *vec)
+{
+	for (int i = 0; i < nr_node_ids; i++)
+		free_cpumask_var(vec->masks[i]);
+}
+
+static inline int setup_vec_mask_var_ts(struct cpupri *cp)
+{
+	int i;
+
+	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+		vec->masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
+		if (!vec->masks)
+			goto cleanup;
+	}
+	return 0;
+
+cleanup:
+	/* Free any already allocated masks */
+	while (i--) {
+		kfree(cp->pri_to_cpu[i].masks);
+		cp->pri_to_cpu[i].masks = NULL;
+	}
+
+	return -ENOMEM;
+}
+
+static inline void free_vec_mask_var_ts(struct cpupri *cp)
+{
+	for (int i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		kfree(cp->pri_to_cpu[i].masks);
+		cp->pri_to_cpu[i].masks = NULL;
+	}
+}
+
+static inline int
+available_cpu_in_nodes(struct task_struct *p, struct cpupri_vec *vec)
+{
+	int cur_node = numa_node_id();
+
+	for (int i = 0; i < nr_node_ids; i++) {
+		int nid = (cur_node + i) % nr_node_ids;
+
+		if (cpumask_first_and_and(&p->cpus_mask, vec->masks[nid],
+					cpumask_of_node(nid)) < nr_cpu_ids)
+			return 1;
+	}
+
+	return 0;
+}
+
+#define available_cpu_in_vec available_cpu_in_nodes
+
+#else /* !CONFIG_CPUMASK_OFFSTACK */
+
+static inline int alloc_vec_masks(struct cpupri_vec *vec)
+{
+	if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void free_vec_masks(struct cpupri_vec *vec)
+{
+	free_cpumask_var(vec->mask);
+}
+
+static inline int setup_vec_mask_var_ts(struct cpupri *cp)
+{
+	return 0;
+}
+
+static inline void free_vec_mask_var_ts(struct cpupri *cp)
+{
+}
+
+static inline int
+available_cpu_in_vec(struct task_struct *p, struct cpupri_vec *vec)
+{
+	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
+		return 0;
+
+	return 1;
+}
+#endif
+
+static inline int alloc_all_masks(struct cpupri *cp)
+{
+	int i;
+
+	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		if (alloc_vec_masks(&cp->pri_to_cpu[i]))
+			goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	while (i--)
+		free_vec_masks(&cp->pri_to_cpu[i]);
+
+	return -ENOMEM;
+}
+
+static inline void setup_vec_counts(struct cpupri *cp)
+{
+	for (int i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+		atomic_set(&vec->count, 0);
+	}
+}
+
 static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
 				struct cpumask *lowest_mask, int idx)
 {
@@ -96,11 +233,24 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
 	if (skip)
 		return 0;
 
-	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
+	if (!available_cpu_in_vec(p, vec))
 		return 0;
 
+#ifdef	CONFIG_CPUMASK_OFFSTACK
+	struct cpumask *cpupri_mask = lowest_mask;
+
+	/* vec has an available CPU; combine per-node masks if lowest_mask is wanted */
+	if (lowest_mask) {
+		cpumask_copy(cpupri_mask, vec->masks[0]);
+		for (int nid = 1; nid < nr_node_ids; nid++)
+			cpumask_and(cpupri_mask, cpupri_mask, vec->masks[nid]);
+	}
+#else
+	struct cpumask *cpupri_mask = vec->mask;
+#endif
+
 	if (lowest_mask) {
-		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+		cpumask_and(lowest_mask, &p->cpus_mask, cpupri_mask);
 		cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
 
 		/*
@@ -229,7 +379,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 	if (likely(newpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
 
+#ifdef	CONFIG_CPUMASK_OFFSTACK
+		cpumask_set_cpu(cpu, vec->masks[cpu_to_node(cpu)]);
+#else
 		cpumask_set_cpu(cpu, vec->mask);
+#endif
 		/*
 		 * When adding a new vector, we update the mask first,
 		 * do a write memory barrier, and then update the count, to
@@ -263,7 +417,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 		 */
 		atomic_dec(&(vec)->count);
 		smp_mb__after_atomic();
+#ifdef	CONFIG_CPUMASK_OFFSTACK
+		cpumask_clear_cpu(cpu, vec->masks[cpu_to_node(cpu)]);
+#else
 		cpumask_clear_cpu(cpu, vec->mask);
+#endif
 	}
 
 	*currpri = newpri;
@@ -279,26 +437,31 @@ int cpupri_init(struct cpupri *cp)
 {
 	int i;
 
-	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
-		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
-
-		atomic_set(&vec->count, 0);
-		if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
-			goto cleanup;
-	}
-
+	/* Allocate the cpu_to_pri array */
 	cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
 	if (!cp->cpu_to_pri)
-		goto cleanup;
+		return -ENOMEM;
 
+	/* Initialize all CPUs to invalid priority */
 	for_each_possible_cpu(i)
 		cp->cpu_to_pri[i] = CPUPRI_INVALID;
 
+	/* Setup priority vectors */
+	setup_vec_counts(cp);
+	if (setup_vec_mask_var_ts(cp))
+		goto fail_setup_vectors;
+
+	/* Allocate masks for each priority vector */
+	if (alloc_all_masks(cp))
+		goto fail_alloc_masks;
+
 	return 0;
 
-cleanup:
-	for (i--; i >= 0; i--)
-		free_cpumask_var(cp->pri_to_cpu[i].mask);
+fail_alloc_masks:
+	free_vec_mask_var_ts(cp);
+
+fail_setup_vectors:
+	kfree(cp->cpu_to_pri);
 	return -ENOMEM;
 }
 
@@ -308,9 +471,10 @@ int cpupri_init(struct cpupri *cp)
  */
 void cpupri_cleanup(struct cpupri *cp)
 {
-	int i;
-
 	kfree(cp->cpu_to_pri);
-	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
-		free_cpumask_var(cp->pri_to_cpu[i].mask);
+
+	for (int i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+		free_vec_masks(&cp->pri_to_cpu[i]);
+
+	free_vec_mask_var_ts(cp);
 }
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 245b0fa626be..c53f1f4dad86 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -9,7 +9,11 @@
 
 struct cpupri_vec {
 	atomic_t		count;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	cpumask_var_t		*masks	____cacheline_aligned;
+#else
 	cpumask_var_t		mask	____cacheline_aligned;
+#endif
 };
 
 struct cpupri {
-- 
2.43.5
Re: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA node to reduce contention
Posted by Chen, Yu C 2 months, 2 weeks ago
On 7/7/2025 10:35 AM, Pan Deng wrote:
> When running a multi-instance FFmpeg workload on HCC system, significant
> contention is observed on bitmap of `cpupri_vec->cpumask`.
> 
> The SUT is a 2-socket machine with 240 physical cores and 480 logical
> CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical cores
> (8 logical CPUs) for transcoding tasks. Sub-threads use RT priority 99
> with FIFO scheduling. FPS is used as score.
> 
> perf c2c tool reveals:
> cpumask (bitmap) cache line of `cpupri_vec->mask`:
> - bits are loaded during cpupri_find
> - bits are stored during cpupri_set
> - cycles per load: ~2.2K to 8.7K
> 
> This change splits `cpupri_vec->cpumask` into per-NUMA-node data to
> mitigate false sharing.
> 
> As a result:
> - FPS improves by ~3.8%
> - Kernel cycles% drops from ~20% to ~18.7%
> - Cache line contention is mitigated, perf-c2c shows cycles per load
>    drops from ~2.2K-8.7K to ~0.5K-2.2K
> 

This brings a noticeable improvement for the RT workload, and it would
be even more convincing if we could also try normal task workloads and
confirm there is at least no regression (schbench/hackbench, etc.).

thanks,
Chenyu

> Note: CONFIG_CPUMASK_OFFSTACK=n remains unchanged.
>
RE: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA node to reduce contention
Posted by Deng, Pan 2 months, 2 weeks ago
> -----Original Message-----
> From: Chen, Yu C <yu.c.chen@intel.com>
> Sent: Monday, July 21, 2025 7:24 PM
> To: Deng, Pan <pan.deng@intel.com>
> Cc: linux-kernel@vger.kernel.org; Li, Tianyou <tianyou.li@intel.com>;
> tim.c.chen@linux.intel.com; peterz@infradead.org; mingo@kernel.org
> Subject: Re: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA
> node to reduce contention
> 
> On 7/7/2025 10:35 AM, Pan Deng wrote:
> > When running a multi-instance FFmpeg workload on HCC system,
> > significant contention is observed on bitmap of `cpupri_vec->cpumask`.
> >
> > The SUT is a 2-socket machine with 240 physical cores and 480 logical
> > CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical
> > cores
> > (8 logical CPUs) for transcoding tasks. Sub-threads use RT priority 99
> > with FIFO scheduling. FPS is used as score.
> >
> > perf c2c tool reveals:
> > cpumask (bitmap) cache line of `cpupri_vec->mask`:
> > - bits are loaded during cpupri_find
> > - bits are stored during cpupri_set
> > - cycles per load: ~2.2K to 8.7K
> >
> > This change splits `cpupri_vec->cpumask` into per-NUMA-node data to
> > mitigate false sharing.
> >
> > As a result:
> > - FPS improves by ~3.8%
> > - Kernel cycles% drops from ~20% to ~18.7%
> > - Cache line contention is mitigated, perf-c2c shows cycles per load
> >    drops from ~2.2K-8.7K to ~0.5K-2.2K
> >
> 
> This brings a noticeable improvement for the RT workload, and it would
> be even more convincing if we could also try normal task workloads and
> confirm there is at least no regression (schbench/hackbench, etc.).
>

Thanks Yu, hackbench and schbench data will be provided later.
 

> thanks,
> Chenyu
> 
> > Note: CONFIG_CPUMASK_OFFSTACK=n remains unchanged.
> >
> 

RE: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA node to reduce contention
Posted by Deng, Pan 2 months ago
> -----Original Message-----
> From: Deng, Pan
> Sent: Tuesday, July 22, 2025 10:47 PM
> To: Chen, Yu C <yu.c.chen@intel.com>
> Cc: linux-kernel@vger.kernel.org; Li, Tianyou <tianyou.li@intel.com>;
> tim.c.chen@linux.intel.com; peterz@infradead.org; mingo@kernel.org
> Subject: RE: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA
> node to reduce contention
> 
> 
> > -----Original Message-----
> > From: Chen, Yu C <yu.c.chen@intel.com>
> > Sent: Monday, July 21, 2025 7:24 PM
> > To: Deng, Pan <pan.deng@intel.com>
> > Cc: linux-kernel@vger.kernel.org; Li, Tianyou <tianyou.li@intel.com>;
> > tim.c.chen@linux.intel.com; peterz@infradead.org; mingo@kernel.org
> > Subject: Re: [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA
> > node to reduce contention
> >
> > On 7/7/2025 10:35 AM, Pan Deng wrote:
> > > When running a multi-instance FFmpeg workload on HCC system,
> > > significant contention is observed on bitmap of `cpupri_vec->cpumask`.
> > >
> > > The SUT is a 2-socket machine with 240 physical cores and 480 logical
> > > CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical
> > > cores
> > > (8 logical CPUs) for transcoding tasks. Sub-threads use RT priority 99
> > > with FIFO scheduling. FPS is used as score.
> > >
> > > perf c2c tool reveals:
> > > cpumask (bitmap) cache line of `cpupri_vec->mask`:
> > > - bits are loaded during cpupri_find
> > > - bits are stored during cpupri_set
> > > - cycles per load: ~2.2K to 8.7K
> > >
> > > This change splits `cpupri_vec->cpumask` into per-NUMA-node data to
> > > mitigate false sharing.
> > >
> > > As a result:
> > > - FPS improves by ~3.8%
> > > - Kernel cycles% drops from ~20% to ~18.7%
> > > - Cache line contention is mitigated, perf-c2c shows cycles per load
> > >    drops from ~2.2K-8.7K to ~0.5K-2.2K
> > >
> >
> > This brings a noticeable improvement for the RT workload, and it would
> > be even more convincing if we could also try normal task workloads and
> > confirm there is at least no regression (schbench/hackbench, etc.).
> >
> 
> Thanks Yu, hackbench and schbench data will be provided later.
> 
> 

TL;DR
=====
Hackbench and both the old and new versions of schbench were evaluated
on a SUT with 2 sockets / 6 NUMA nodes / 240 physical cores / 480
logical CPUs. No regressions were detected for patches 1-4. In addition,
symbol-level analysis of `perf record -a` profiling data indicates that
the changes introduced in patches 1-4 are unlikely to cause regressions
in hackbench or schbench.

Details
=======

Hackbench
=========
The workload is run by the test framework
https://github.com/yu-chen-surf/schedtests, with the following procedure:
1. Reboot the system to run a workload.
2. Run 5 iterations of the 1st configuration with a 30s cool-down period.
3. Run 5 iterations of the 2nd configuration.
...

The test results are as follows: regressions exceeding -10% are marked
with ** at the end of the line. However, when re-running the tests using
the test framework or a vanilla workload, the regressions could not be
reproduced.

Note: 15/30/45 is the number of fds, i.e., process/thread pairs, in one group.
Patch 1
case                              load             baseline(std%)  patch1%( std%)
process-pipe-15         1-groups         1.00 ( 14.03)   -8.81 (  6.53)
process-pipe-15         2-groups         1.00 (  3.46)   +1.82 (  2.59)
process-pipe-15         4-groups         1.00 (  6.20)   +8.60 (  5.59)
process-pipe-15         8-groups         1.00 (  2.41)   -0.21 (  3.22)
process-pipe-30         1-groups         1.00 (  2.51)   +2.24 (  3.12)
process-pipe-30         2-groups         1.00 (  3.86)   -0.58 (  2.46)
process-pipe-30         4-groups         1.00 (  2.19)   -1.81 (  1.05)
process-pipe-30         8-groups         1.00 (  1.69)   +0.52 (  3.01)
process-pipe-45         1-groups         1.00 (  1.63)   +1.63 (  1.23)
process-pipe-45         2-groups         1.00 (  0.79)   +0.08 (  1.82)
process-pipe-45         4-groups         1.00 (  1.62)   -0.06 (  0.63)
process-pipe-45         8-groups         1.00 (  1.66)   -4.12 (  3.27)
process-sockets-15      1-groups         1.00 (  3.57)   +2.36 (  5.15)
process-sockets-15      2-groups         1.00 (  3.59)   -1.33 (  6.86)
process-sockets-15      4-groups         1.00 (  7.10)   +5.44 (  6.97)
process-sockets-15      8-groups         1.00 (  2.63)   -3.05 (  1.94)
process-sockets-30      1-groups         1.00 (  3.73)   -2.69 (  4.89)
process-sockets-30      2-groups         1.00 (  3.90)   -4.25 (  3.94)
process-sockets-30      4-groups         1.00 (  1.03)   -1.58 (  1.51)
process-sockets-30      8-groups         1.00 (  0.48)   +1.09 (  0.68)
process-sockets-45      1-groups         1.00 (  0.62)   -2.25 (  0.57)
process-sockets-45      2-groups         1.00 (  2.56)   -0.61 (  0.63)
process-sockets-45      4-groups         1.00 (  0.57)   -0.51 (  0.79)
process-sockets-45      8-groups         1.00 (  0.18)   -5.23 (  2.18)
threads-pipe-15         1-groups         1.00 (  5.30)   -1.47 (  5.38)
threads-pipe-15         2-groups         1.00 (  7.97)   -1.31 (  8.61)
threads-pipe-15         4-groups         1.00 (  4.94)   -3.31 (  5.48)
threads-pipe-15         8-groups         1.00 (  1.69)   +7.28 (  5.54)
threads-pipe-30         1-groups         1.00 (  5.12)   -1.58 (  4.82)
threads-pipe-30         2-groups         1.00 (  1.63)   +3.29 (  1.72)
threads-pipe-30         4-groups         1.00 (  3.41)   +3.05 (  3.22)
threads-pipe-30         8-groups         1.00 (  2.85)   +1.58 (  4.05)
threads-pipe-45         1-groups         1.00 (  5.13)   -0.78 (  6.78)
threads-pipe-45         2-groups         1.00 (  1.92)   -2.87 (  1.27)
threads-pipe-45         4-groups         1.00 (  2.41)   -4.37 (  1.23)
threads-pipe-45         8-groups         1.00 (  1.81)   +1.85 (  1.54)
threads-sockets-15      1-groups         1.00 (  4.72)   -0.73 (  2.75)
threads-sockets-15      2-groups         1.00 (  3.05)   +3.09 (  3.39)
threads-sockets-15      4-groups         1.00 (  5.92)   +0.87 (  2.25)
threads-sockets-15      8-groups         1.00 (  3.75)   -7.24 (  3.34)
threads-sockets-30      1-groups         1.00 (  5.96)   -6.27 (  3.35)
threads-sockets-30      2-groups         1.00 (  1.68)   -1.78 (  3.60)
threads-sockets-30      4-groups         1.00 (  5.02)   -0.95 (  3.60)
threads-sockets-30      8-groups         1.00 (  0.41)   -3.09 (  2.03)
threads-sockets-45      1-groups         1.00 (  2.55)   -1.32 (  1.37)
threads-sockets-45      2-groups         1.00 (  3.53)   -0.46 (  3.99)
threads-sockets-45      4-groups         1.00 (  0.51)   +0.67 (  0.74)
threads-sockets-45      8-groups         1.00 (  3.01)  -16.85 (  2.13) **

Patch 2
case                              load             baseline(std%)  patch2%( std%)
process-pipe-15         1-groups         1.00 ( 14.03)   -3.32 ( 11.34)
process-pipe-15         2-groups         1.00 (  3.46)   +2.19 (  7.27)
process-pipe-15         4-groups         1.00 (  6.20)   +2.01 (  2.83)
process-pipe-15         8-groups         1.00 (  2.41)   +1.65 (  4.39)
process-pipe-30         1-groups         1.00 (  2.51)   -0.88 (  3.26)
process-pipe-30         2-groups         1.00 (  3.86)   +2.25 (  3.21)
process-pipe-30         4-groups         1.00 (  2.19)   +0.20 (  1.72)
process-pipe-30         8-groups         1.00 (  1.69)   +0.85 (  0.61)
process-pipe-45         1-groups         1.00 (  1.63)   +3.10 (  4.01)
process-pipe-45         2-groups         1.00 (  0.79)   -1.00 (  0.69)
process-pipe-45         4-groups         1.00 (  1.62)   +0.07 (  0.63)
process-pipe-45         8-groups         1.00 (  1.66)   +0.20 (  1.47)
process-sockets-15      1-groups         1.00 (  3.57)   -5.44 (  3.45)
process-sockets-15      2-groups         1.00 (  3.59)   +1.00 (  4.35)
process-sockets-15      4-groups         1.00 (  7.10)   +0.46 (  4.45)
process-sockets-15      8-groups         1.00 (  2.63)   -1.48 (  4.56)
process-sockets-30      1-groups         1.00 (  3.73)   -0.17 (  3.57)
process-sockets-30      2-groups         1.00 (  3.90)   +3.83 (  7.54)
process-sockets-30      4-groups         1.00 (  1.03)   -2.35 (  6.11)
process-sockets-30      8-groups         1.00 (  0.48)   -0.43 (  0.79)
process-sockets-45      1-groups         1.00 (  0.62)   -2.24 (  1.63)
process-sockets-45      2-groups         1.00 (  2.56)   -1.41 (  3.17)
process-sockets-45      4-groups         1.00 (  0.57)   -0.29 (  0.33)
process-sockets-45      8-groups         1.00 (  0.18)   -6.05 (  1.55)
threads-pipe-15         1-groups         1.00 (  5.30)   -5.83 (  7.96)
threads-pipe-15         2-groups         1.00 (  7.97)   -3.74 (  4.22)
threads-pipe-15         4-groups         1.00 (  4.94)   -2.23 (  5.75)
threads-pipe-15         8-groups         1.00 (  1.69)   +0.21 (  3.08)
threads-pipe-30         1-groups         1.00 (  5.12)   -5.73 (  4.97)
threads-pipe-30         2-groups         1.00 (  1.63)   -1.76 (  4.49)
threads-pipe-30         4-groups         1.00 (  3.41)   -0.99 (  2.50)
threads-pipe-30         8-groups         1.00 (  2.85)   +0.71 (  1.04)
threads-pipe-45         1-groups         1.00 (  5.13)   +0.08 (  5.72)
threads-pipe-45         2-groups         1.00 (  1.92)   -1.78 (  1.30)
threads-pipe-45         4-groups         1.00 (  2.41)   -3.79 (  0.81)
threads-pipe-45         8-groups         1.00 (  1.81)   -3.62 (  1.41)
threads-sockets-15      1-groups         1.00 (  4.72)   +2.52 (  2.66)
threads-sockets-15      2-groups         1.00 (  3.05)   -7.59 (  1.80)
threads-sockets-15      4-groups         1.00 (  5.92)   +1.59 (  7.12)
threads-sockets-15      8-groups         1.00 (  3.75)   -0.34 (  3.62)
threads-sockets-30      1-groups         1.00 (  5.96)   -2.45 (  4.89)
threads-sockets-30      2-groups         1.00 (  1.68)   -0.61 (  4.80)
threads-sockets-30      4-groups         1.00 (  5.02)   -2.15 (  8.62)
threads-sockets-30      8-groups         1.00 (  0.41)  -17.32 (  0.88) **
threads-sockets-45      1-groups         1.00 (  2.55)   -3.24 (  3.37)
threads-sockets-45      2-groups         1.00 (  3.53)   -1.38 (  2.40)
threads-sockets-45      4-groups         1.00 (  0.51)   -0.17 (  0.85)
threads-sockets-45      8-groups         1.00 (  3.01)  -14.59 (  5.48) **

Patch 3
case                              load             baseline(std%)  patch3%( std%)
process-pipe-15         1-groups         1.00 ( 14.03)  -10.18 (  3.39) **
process-pipe-15         2-groups         1.00 (  3.46)   +5.18 (  3.12)
process-pipe-15         4-groups         1.00 (  6.20)   +8.63 (  5.72)
process-pipe-15         8-groups         1.00 (  2.41)   +5.37 (  2.24)
process-pipe-30         1-groups         1.00 (  2.51)   +5.53 (  3.55)
process-pipe-30         2-groups         1.00 (  3.86)   +5.70 (  4.27)
process-pipe-30         4-groups         1.00 (  2.19)   +3.95 (  3.34)
process-pipe-30         8-groups         1.00 (  1.69)   -3.38 (  1.51)
process-pipe-45         1-groups         1.00 (  1.63)   +5.19 (  2.51)
process-pipe-45         2-groups         1.00 (  0.79)   -0.63 (  2.06)
process-pipe-45         4-groups         1.00 (  1.62)   -5.83 (  2.22)
process-pipe-45         8-groups         1.00 (  1.66)   -6.13 (  2.34)
process-sockets-15      1-groups         1.00 (  3.57)   -1.51 (  4.21)
process-sockets-15      2-groups         1.00 (  3.59)   -1.30 (  7.50)
process-sockets-15      4-groups         1.00 (  7.10)   -1.80 (  5.58)
process-sockets-15      8-groups         1.00 (  2.63)   -1.68 (  3.40)
process-sockets-30      1-groups         1.00 (  3.73)   -7.74 (  1.58)
process-sockets-30      2-groups         1.00 (  3.90)   -1.98 (  5.48)
process-sockets-30      4-groups         1.00 (  1.03)   -0.33 (  3.47)
process-sockets-30      8-groups         1.00 (  0.48)   -0.40 (  0.84)
process-sockets-45      1-groups         1.00 (  0.62)   -0.21 (  0.54)
process-sockets-45      2-groups         1.00 (  2.56)   -1.97 (  2.48)
process-sockets-45      4-groups         1.00 (  0.57)   -0.61 (  0.83)
process-sockets-45      8-groups         1.00 (  0.18)   -5.09 (  1.85)
threads-pipe-15         1-groups         1.00 (  5.30)   +3.62 ( 11.04)
threads-pipe-15         2-groups         1.00 (  7.97)   +8.08 (  4.63)
threads-pipe-15         4-groups         1.00 (  4.94)   +6.46 (  5.27)
threads-pipe-15         8-groups         1.00 (  1.69)   +2.68 (  3.23)
threads-pipe-30         1-groups         1.00 (  5.12)   +3.60 (  7.09)
threads-pipe-30         2-groups         1.00 (  1.63)   -0.80 (  4.43)
threads-pipe-30         4-groups         1.00 (  3.41)   +2.37 (  2.16)
threads-pipe-30         8-groups         1.00 (  2.85)   +4.17 (  1.41)
threads-pipe-45         1-groups         1.00 (  5.13)   +7.41 (  4.48)
threads-pipe-45         2-groups         1.00 (  1.92)   -1.40 (  2.69)
threads-pipe-45         4-groups         1.00 (  2.41)   -1.25 (  2.15)
threads-pipe-45         8-groups         1.00 (  1.81)   +1.62 (  0.73)
threads-sockets-15      1-groups         1.00 (  4.72)  +10.11 (  7.95)
threads-sockets-15      2-groups         1.00 (  3.05)   -8.41 (  5.93)
threads-sockets-15      4-groups         1.00 (  5.92)  -10.89 (  4.29) **
threads-sockets-15      8-groups         1.00 (  3.75)   -7.66 (  3.33)
threads-sockets-30      1-groups         1.00 (  5.96)   -5.18 (  2.77)
threads-sockets-30      2-groups         1.00 (  1.68)   -4.91 (  3.89)
threads-sockets-30      4-groups         1.00 (  5.02)   -6.32 (  4.19)
threads-sockets-30      8-groups         1.00 (  0.41)  -11.73 (  0.96) **
threads-sockets-45      1-groups         1.00 (  2.55)   -3.16 (  1.97)
threads-sockets-45      2-groups         1.00 (  3.53)   -0.21 (  4.33)
threads-sockets-45      4-groups         1.00 (  0.51)   -0.75 (  2.07)
threads-sockets-45      8-groups         1.00 (  3.01)  -20.52 (  1.44) **

Patch 4
case                              load             baseline(std%)  patch4%( std%)
process-pipe-15         1-groups         1.00 ( 14.03)   -2.68 (  9.64)
process-pipe-15         2-groups         1.00 (  3.46)   +1.82 (  7.55)
process-pipe-15         4-groups         1.00 (  6.20)   +3.67 (  8.17)
process-pipe-15         8-groups         1.00 (  2.41)   +1.87 (  0.92)
process-pipe-30         1-groups         1.00 (  2.51)   -3.34 (  3.96)
process-pipe-30         2-groups         1.00 (  3.86)   -0.33 (  3.53)
process-pipe-30         4-groups         1.00 (  2.19)   -3.22 (  1.31)
process-pipe-30         8-groups         1.00 (  1.69)   -1.95 (  1.07)
process-pipe-45         1-groups         1.00 (  1.63)   +0.63 (  2.86)
process-pipe-45         2-groups         1.00 (  0.79)   -1.27 (  1.39)
process-pipe-45         4-groups         1.00 (  1.62)   -2.04 (  1.87)
process-pipe-45         8-groups         1.00 (  1.66)   -1.45 (  3.20)
process-sockets-15      1-groups         1.00 (  3.57)   -9.16 (  5.33)
process-sockets-15      2-groups         1.00 (  3.59)   -1.83 (  5.36)
process-sockets-15      4-groups         1.00 (  7.10)   +7.55 (  6.34)
process-sockets-15      8-groups         1.00 (  2.63)   -2.98 (  5.95)
process-sockets-30      1-groups         1.00 (  3.73)   +3.50 (  4.92)
process-sockets-30      2-groups         1.00 (  3.90)   +1.80 (  5.68)
process-sockets-30      4-groups         1.00 (  1.03)   -1.23 (  4.79)
process-sockets-30      8-groups         1.00 (  0.48)   -0.15 (  0.33)
process-sockets-45      1-groups         1.00 (  0.62)   -0.70 (  1.12)
process-sockets-45      2-groups         1.00 (  2.56)   +0.64 (  0.86)
process-sockets-45      4-groups         1.00 (  0.57)   +0.09 (  0.53)
process-sockets-45      8-groups         1.00 (  0.18)   -7.31 (  2.11)
threads-pipe-15         1-groups         1.00 (  5.30)   +4.94 (  9.52)
threads-pipe-15         2-groups         1.00 (  7.97)   -4.28 (  2.30)
threads-pipe-15         4-groups         1.00 (  4.94)   -1.83 (  4.24)
threads-pipe-15         8-groups         1.00 (  1.69)   -2.35 (  1.50)
threads-pipe-30         1-groups         1.00 (  5.12)   +2.06 (  5.00)
threads-pipe-30         2-groups         1.00 (  1.63)   +0.93 (  4.53)
threads-pipe-30         4-groups         1.00 (  3.41)   -2.85 (  3.20)
threads-pipe-30         8-groups         1.00 (  2.85)   -2.20 (  2.68)
threads-pipe-45         1-groups         1.00 (  5.13)   -0.97 (  4.70)
threads-pipe-45         2-groups         1.00 (  1.92)   -2.11 (  1.21)
threads-pipe-45         4-groups         1.00 (  2.41)   -2.69 (  1.33)
threads-pipe-45         8-groups         1.00 (  1.81)   -2.41 (  1.14)
threads-sockets-15      1-groups         1.00 (  4.72)   +0.82 (  4.21)
threads-sockets-15      2-groups         1.00 (  3.05)   -1.28 (  2.48)
threads-sockets-15      4-groups         1.00 (  5.92)   -1.75 (  7.25)
threads-sockets-15      8-groups         1.00 (  3.75)   -2.54 (  3.49)
threads-sockets-30      1-groups         1.00 (  5.96)   -0.46 (  5.30)
threads-sockets-30      2-groups         1.00 (  1.68)   -0.45 (  1.75)
threads-sockets-30      4-groups         1.00 (  5.02)   -1.48 (  6.51)
threads-sockets-30      8-groups         1.00 (  0.41)  -13.09 (  1.61) **
threads-sockets-45      1-groups         1.00 (  2.55)   -1.68 (  0.66)
threads-sockets-45      2-groups         1.00 (  3.53)   +0.21 (  2.23)
threads-sockets-45      4-groups         1.00 (  0.51)   -1.27 (  1.43)
threads-sockets-45      8-groups         1.00 (  3.01)   -3.41 (  0.43)

Additionally, profiling data was collected using `perf record -a` for this
workload. First, the cycles distribution is almost the same between the
baseline and patches 1-4. Second, the symbols relevant to patches 1-4 are
set_rd_overloaded/set_rd_overutilized, which are potentially invoked
(actually inlined) by `update_sd_lb_stats`. `update_sd_lb_stats` itself
takes ~2.6% of cycles in the baseline threads-sockets-45fd, 8-groups
configuration, and no regression was observed for this function in
patches 1-4. So I think the patches won't cause regressions in hackbench.

Schbench(old, 91ea787)
======================
The workload is run with the same methodology as hackbench, with a
runtime of 100s.

The test results are as follows: regressions exceeding -5% are marked
with ** at the end of the line. However, when re-running the tests with
either the test framework or a vanilla workload, the regressions could
not be reproduced.

case                         load                     baseline(std%)  opt1%( std%)
normal       1-mthreads-8-workers     1.00 (  1.44)   -5.60 (  2.96) **
normal       1-mthreads-2-workers     1.00 (  2.79)   -2.65 (  5.48)
normal       1-mthreads-1-workers     1.00 (  1.27)   -1.60 (  1.03)
normal       1-mthreads-31-workers    1.00 (  1.30)   -0.87 (  2.34)
normal       1-mthreads-16-workers    1.00 (  1.74)   -2.23 (  1.15)
normal       1-mthreads-4-workers     1.00 (  3.35)   -1.92 (  1.62)
normal       2-mthreads-8-workers     1.00 (  2.17)   -2.09 (  1.38)
normal       2-mthreads-31-workers    1.00 (  1.83)   +1.93 (  1.84)
normal       2-mthreads-16-workers    1.00 (  2.06)   +0.36 (  2.38)
normal       2-mthreads-1-workers     1.00 (  3.86)   +0.50 (  2.46)
normal       2-mthreads-2-workers     1.00 (  1.76)   -6.91 (  2.55)
normal       2-mthreads-4-workers     1.00 (  1.59)   -5.58 (  5.99)
normal       4-mthreads-8-workers     1.00 (  0.85)   +0.59 (  0.54)
normal       4-mthreads-31-workers    1.00 ( 15.31)  +15.04 ( 12.71)
normal       4-mthreads-16-workers    1.00 (  0.99)   -2.62 (  2.15)
normal       4-mthreads-4-workers     1.00 (  1.42)   -2.72 (  1.70)
normal       4-mthreads-1-workers     1.00 (  1.43)   -2.84 (  1.73)
normal       4-mthreads-2-workers     1.00 (  1.78)   -4.28 (  2.08)
normal       8-mthreads-16-workers    1.00 ( 10.04)   +7.06 (  0.73)
normal       8-mthreads-31-workers    1.00 (  1.94)   -1.66 (  2.28)
normal       8-mthreads-2-workers     1.00 (  2.51)   -0.30 (  1.53)
normal       8-mthreads-8-workers     1.00 (  1.56)   -1.83 (  1.39)
normal       8-mthreads-1-workers     1.00 (  4.08)   +0.45 (  1.45)
normal       8-mthreads-4-workers     1.00 (  1.84)   +2.85 (  1.07)

case                         load                     baseline(std%)  opt2%( std%)
normal       1-mthreads-8-workers     1.00 (  1.44)   -1.48 (  3.79)
normal       1-mthreads-2-workers     1.00 (  2.79)   +3.32 (  0.90)
normal       1-mthreads-1-workers     1.00 (  1.27)   +1.98 (  1.02)
normal       1-mthreads-31-workers    1.00 (  1.30)   +5.84 (  3.01)
normal       1-mthreads-16-workers    1.00 (  1.74)   +5.90 (  0.68)
normal       1-mthreads-4-workers     1.00 (  3.35)   +1.82 (  1.65)
normal       2-mthreads-8-workers     1.00 (  2.17)   +2.80 (  2.04)
normal       2-mthreads-31-workers    1.00 (  1.83)   -0.07 (  1.09)
normal       2-mthreads-16-workers    1.00 (  2.06)   +2.45 (  2.55)
normal       2-mthreads-1-workers     1.00 (  3.86)   +2.41 (  2.92)
normal       2-mthreads-2-workers     1.00 (  1.76)   -1.29 (  2.03)
normal       2-mthreads-4-workers     1.00 (  1.59)   +0.44 (  1.15)
normal       4-mthreads-8-workers     1.00 (  0.85)   -0.81 (  3.03)
normal       4-mthreads-31-workers    1.00 ( 15.31)   +2.06 ( 15.97)
normal       4-mthreads-16-workers    1.00 (  0.99)   -1.46 (  2.29)
normal       4-mthreads-4-workers     1.00 (  1.42)   -0.15 (  3.37)
normal       4-mthreads-1-workers     1.00 (  1.43)   +0.97 (  1.95)
normal       4-mthreads-2-workers     1.00 (  1.78)   -0.38 (  2.53)
normal       8-mthreads-16-workers    1.00 ( 10.04)   +5.80 (  1.72)
normal       8-mthreads-31-workers    1.00 (  1.94)   -0.76 (  2.33)
normal       8-mthreads-2-workers     1.00 (  2.51)   +2.47 (  2.17)
normal       8-mthreads-8-workers     1.00 (  1.56)   -0.66 (  1.47)
normal       8-mthreads-1-workers     1.00 (  4.08)   +2.71 (  2.78)
normal       8-mthreads-4-workers     1.00 (  1.84)   +2.35 (  4.88)

case                          load                     baseline(std%)  opt3%( std%)
normal       1-mthreads-8-workers     1.00 (  1.44)   -6.90 (  3.85)  **
normal       1-mthreads-2-workers     1.00 (  2.79)   +3.23 (  3.09)
normal       1-mthreads-1-workers     1.00 (  1.27)   -1.04 (  2.22)
normal       1-mthreads-31-workers    1.00 (  1.30)   +2.16 (  1.64)
normal       1-mthreads-16-workers    1.00 (  1.74)   -0.72 (  5.70)
normal       1-mthreads-4-workers     1.00 (  3.35)   -1.92 (  4.31)
normal       2-mthreads-8-workers     1.00 (  2.17)   +0.82 (  1.90)
normal       2-mthreads-31-workers    1.00 (  1.83)   +2.08 (  1.16)
normal       2-mthreads-16-workers    1.00 (  2.06)   +4.04 (  2.42)
normal       2-mthreads-1-workers     1.00 (  3.86)   +2.57 (  3.44)
normal       2-mthreads-2-workers     1.00 (  1.76)   -0.12 (  1.29)
normal       2-mthreads-4-workers     1.00 (  1.59)   -2.04 (  2.83)
normal       4-mthreads-8-workers     1.00 (  0.85)   +0.22 (  1.65)
normal       4-mthreads-31-workers    1.00 ( 15.31)  +15.09 (  9.83)
normal       4-mthreads-16-workers    1.00 (  0.99)   +1.46 (  1.88)
normal       4-mthreads-4-workers     1.00 (  1.42)   +2.34 (  1.57)
normal       4-mthreads-1-workers     1.00 (  1.43)   -0.77 (  2.45)
normal       4-mthreads-2-workers     1.00 (  1.78)   -1.16 (  1.85)
normal       8-mthreads-16-workers    1.00 ( 10.04)   +7.39 (  1.65)
normal       8-mthreads-31-workers    1.00 (  1.94)   -0.81 (  2.14)
normal       8-mthreads-2-workers     1.00 (  2.51)   -1.93 (  2.00)
normal       8-mthreads-8-workers     1.00 (  1.56)   +1.17 (  1.40)
normal       8-mthreads-1-workers     1.00 (  4.08)   +1.63 (  0.51)
normal       8-mthreads-4-workers     1.00 (  1.84)   +4.77 (  2.36)

case                          load                     baseline(std%)  opt4%( std%)
normal       1-mthreads-8-workers     1.00 (  1.44)   -0.27 (  3.05)
normal       1-mthreads-2-workers     1.00 (  2.79)   -0.31 (  1.19)
normal       1-mthreads-1-workers     1.00 (  1.27)   +1.62 (  1.77)
normal       1-mthreads-31-workers    1.00 (  1.30)   +1.30 (  3.34)
normal       1-mthreads-16-workers    1.00 (  1.74)   +0.07 (  3.38)
normal       1-mthreads-4-workers     1.00 (  3.35)   +1.08 (  2.48)
normal       2-mthreads-8-workers     1.00 (  2.17)   +0.04 (  3.87)
normal       2-mthreads-31-workers    1.00 (  1.83)   +1.29 (  1.44)
normal       2-mthreads-16-workers    1.00 (  2.06)   +0.94 (  2.96)
normal       2-mthreads-1-workers     1.00 (  3.86)   +2.85 (  2.12)
normal       2-mthreads-2-workers     1.00 (  1.76)   -0.30 (  2.37)
normal       2-mthreads-4-workers     1.00 (  1.59)   +2.22 (  1.51)
normal       4-mthreads-8-workers     1.00 (  0.85)   +2.20 (  3.06)
normal       4-mthreads-31-workers    1.00 ( 15.31)  +15.65 ( 12.68)
normal       4-mthreads-16-workers    1.00 (  0.99)   -1.96 (  3.30)
normal       4-mthreads-4-workers     1.00 (  1.42)   -1.19 (  3.42)
normal       4-mthreads-1-workers     1.00 (  1.43)   +2.26 (  2.45)
normal       4-mthreads-2-workers     1.00 (  1.78)   -1.36 (  2.75)
normal       8-mthreads-16-workers    1.00 ( 10.04)   -0.33 ( 11.13)
normal       8-mthreads-31-workers    1.00 (  1.94)   -1.14 (  2.01)
normal       8-mthreads-2-workers     1.00 (  2.51)   +2.32 (  2.26)
normal       8-mthreads-8-workers     1.00 (  1.56)   -0.44 (  1.54)
normal       8-mthreads-1-workers     1.00 (  4.08)   +2.17 (  2.10)
normal       8-mthreads-4-workers     1.00 (  1.84)   +3.42 (  2.34)

Again, per the perf record data, the cycles distribution is almost the
same between the baseline and patches 1-4. The symbols related to
patches 1-4 are set_rd_overloaded/set_rd_overutilized, which are inlined
in `update_sd_lb_stats`; that function accounts for ~0.47% (self) of
cycles in the baseline 1-message-thread, 8-workers configuration, and no
regression was observed for this function in patches 1-4. So I think
the patches won't cause regressions in schbench (old).

Schbench(new, 48aed1d)
======================
The workload was executed using the test framework available at
https://github.com/gormanm/mmtests. Each configuration was run for
5 iterations, with a runtime of 100 seconds per iteration. No
significant regressions were observed, as detailed below:

Notes:
1. The number of message threads is always 6, the same as the number of NUMA nodes.
2. 1/2/4/8/16/32/64/79 is the number of workers per message thread.
                                 baseline    patch1
Amean request-99.0th-qrtle-1      1.00      0.00%
Amean rps-50.0th-qrtle-1              1.00      0.06%
Amean wakeup-99.0th-qrtle-1      1.00      0.26%
Amean request-99.0th-qrtle-2      1.00      0.23%
Amean rps-50.0th-qrtle-2              1.00      0.00%
Amean wakeup-99.0th-qrtle-2      1.00      1.09%
Amean request-99.0th-qrtle-4      1.00     -1.32%
Amean rps-50.0th-qrtle-4              1.00      0.11%
Amean wakeup-99.0th-qrtle-4      1.00     -0.41%
Amean request-99.0th-qrtle-8      1.00     -0.08%
Amean rps-50.0th-qrtle-8              1.00     -0.17%
Amean wakeup-99.0th-qrtle-8      1.00      0.37%
Amean request-99.0th-qrtle-16    1.00      0.23%
Amean rps-50.0th-qrtle-16            1.00     -0.06%
Amean wakeup-99.0th-qrtle-16    1.00      1.03%
Amean request-99.0th-qrtle-32    1.00      0.27%
Amean rps-50.0th-qrtle-32            1.00      0.06%
Amean wakeup-99.0th-qrtle-32    1.00     -0.37%
Amean request-99.0th-qrtle-64    1.00      0.57%
Amean rps-50.0th-qrtle-64            1.00     -0.28%
Amean wakeup-99.0th-qrtle-64    1.00     -3.00%
Amean request-99.0th-qrtle-79    1.00      0.21%
Amean rps-50.0th-qrtle-79            1.00     -0.23%
Amean wakeup-99.0th-qrtle-79    1.00      2.00%

                                 baseline    patch2
Amean request-99.0th-qrtle-1      1.00     -0.46%
Amean rps-50.0th-qrtle-1              1.00      0.11%
Amean wakeup-99.0th-qrtle-1      1.00     -2.01%
Amean request-99.0th-qrtle-2      1.00     -0.08%
Amean rps-50.0th-qrtle-2              1.00      0.00%
Amean wakeup-99.0th-qrtle-2      1.00     -1.42%
Amean request-99.0th-qrtle-4      1.00     -1.16%
Amean rps-50.0th-qrtle-4              1.00      0.11%
Amean wakeup-99.0th-qrtle-4      1.00     -1.30%
Amean request-99.0th-qrtle-8      1.00     -0.08%
Amean rps-50.0th-qrtle-8              1.00     -0.40%
Amean wakeup-99.0th-qrtle-8      1.00      1.25%
Amean request-99.0th-qrtle-16    1.00      0.46%
Amean rps-50.0th-qrtle-16            1.00     -0.06%
Amean wakeup-99.0th-qrtle-16    1.00      2.52%
Amean request-99.0th-qrtle-32    1.00     14.83%
Amean rps-50.0th-qrtle-32            1.00      0.75%
Amean wakeup-99.0th-qrtle-32    1.00      3.03%
Amean request-99.0th-qrtle-64    1.00     -0.44%
Amean rps-50.0th-qrtle-64            1.00      0.28%
Amean wakeup-99.0th-qrtle-64    1.00     -3.50%
Amean request-99.0th-qrtle-79    1.00     -0.09%
Amean rps-50.0th-qrtle-79            1.00      0.08%
Amean wakeup-99.0th-qrtle-79    1.00     -1.20%

                                 baseline    patch3
Amean request-99.0th-qrtle-1      1.00      0.31%
Amean rps-50.0th-qrtle-1              1.00     -0.17%
Amean wakeup-99.0th-qrtle-1      1.00      0.44%
Amean request-99.0th-qrtle-2      1.00     -0.61%
Amean rps-50.0th-qrtle-2              1.00     -0.29%
Amean wakeup-99.0th-qrtle-2      1.00      1.93%
Amean request-99.0th-qrtle-4      1.00     -1.62%
Amean rps-50.0th-qrtle-4              1.00     -0.17%
Amean wakeup-99.0th-qrtle-4      1.00      0.00%
Amean request-99.0th-qrtle-8      1.00      0.00%
Amean rps-50.0th-qrtle-8              1.00     -0.40%
Amean wakeup-99.0th-qrtle-8      1.00     -0.29%
Amean request-99.0th-qrtle-16    1.00      0.53%
Amean rps-50.0th-qrtle-16            1.00     -0.17%
Amean wakeup-99.0th-qrtle-16    1.00     -1.03%
Amean request-99.0th-qrtle-32    1.00      0.09%
Amean rps-50.0th-qrtle-32            1.00     -0.17%
Amean wakeup-99.0th-qrtle-32    1.00      2.41%
Amean request-99.0th-qrtle-64    1.00      0.26%
Amean rps-50.0th-qrtle-64            1.00     -0.16%
Amean wakeup-99.0th-qrtle-64    1.00     -2.00%
Amean request-99.0th-qrtle-79    1.00      0.26%
Amean rps-50.0th-qrtle-79            1.00     -0.46%
Amean wakeup-99.0th-qrtle-79    1.00      1.20%

                                 baseline    patch4
Amean request-99.0th-qrtle-1      1.00     -0.15%
Amean rps-50.0th-qrtle-1              1.00     -0.06%
Amean wakeup-99.0th-qrtle-1      1.00     -2.88%
Amean request-99.0th-qrtle-2      1.00     -0.31%
Amean rps-50.0th-qrtle-2              1.00     -0.29%
Amean wakeup-99.0th-qrtle-2      1.00     -0.59%
Amean request-99.0th-qrtle-4      1.00     -0.23%
Amean rps-50.0th-qrtle-4              1.00     -0.11%
Amean wakeup-99.0th-qrtle-4      1.00     -0.41%
Amean request-99.0th-qrtle-8      1.00     -0.08%
Amean rps-50.0th-qrtle-8              1.00     -0.52%
Amean wakeup-99.0th-qrtle-8      1.00      1.91%
Amean request-99.0th-qrtle-16    1.00      0.76%
Amean rps-50.0th-qrtle-16            1.00      0.06%
Amean wakeup-99.0th-qrtle-16    1.00      1.03%
Amean request-99.0th-qrtle-32    1.00      8.36%
Amean rps-50.0th-qrtle-32            1.00      0.00%
Amean wakeup-99.0th-qrtle-32    1.00     -1.05%
Amean request-99.0th-qrtle-64    1.00      0.13%
Amean rps-50.0th-qrtle-64            1.00      0.00%
Amean wakeup-99.0th-qrtle-64    1.00     -4.00%
Amean request-99.0th-qrtle-79    1.00     -0.39%
Amean rps-50.0th-qrtle-79            1.00      0.14%
Amean wakeup-99.0th-qrtle-79    1.00     -0.40%