The vcpu_is_preempted() test stops osq_lock() spinning if a virtual
cpu is no longer running.
Although patched out for bare-metal, the code still needs the cpu number.
Reading this from 'prev->cpu' is pretty much guaranteed to cause a cache miss
when osq_unlock() is waking up the next cpu.
Instead, save 'prev->cpu' in 'node->prev_cpu' and use that cached value.
Update it in the osq_lock() 'unqueue' path when 'node->prev' is changed.
This is simpler than checking for 'node->prev' changing and caching
'prev->cpu'.
Signed-off-by: David Laight <david.laight@aculab.com>
---
kernel/locking/osq_lock.c | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index e0bc74d85a76..eb8a6dfdb79d 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -14,8 +14,9 @@
 
 struct optimistic_spin_node {
 	struct optimistic_spin_node *next, *prev;
-	int locked; /* 1 if lock acquired */
-	int cpu; /* encoded CPU # + 1 value */
+	int locked;	/* 1 if lock acquired */
+	int cpu;	/* encoded CPU # + 1 value */
+	int prev_cpu;	/* encoded CPU # + 1 value */
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
@@ -29,11 +30,6 @@ static inline int encode_cpu(int cpu_nr)
 	return cpu_nr + 1;
 }
 
-static inline int node_cpu(struct optimistic_spin_node *node)
-{
-	return node->cpu - 1;
-}
-
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
 	int cpu_nr = encoded_cpu_val - 1;
@@ -110,9 +106,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	if (old == OSQ_UNLOCKED_VAL)
 		return true;
 
-	node->locked = 0;
+	node->prev_cpu = old;
 	prev = decode_cpu(old);
 	node->prev = prev;
+	node->locked = 0;
 
 	/*
 	 * osq_lock() unqueue
@@ -144,7 +141,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * polling, be careful.
 	 */
 	if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
-				  vcpu_is_preempted(node_cpu(node->prev))))
+				  vcpu_is_preempted(READ_ONCE(node->prev_cpu) - 1)))
 		return true;
 
 	/* unqueue */
@@ -201,6 +198,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * it will wait in Step-A.
 	 */
 
+	WRITE_ONCE(next->prev_cpu, prev->cpu);
 	WRITE_ONCE(next->prev, prev);
 	WRITE_ONCE(prev->next, next);
 
--
2.17.1
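
For illustration, the access pattern the patch changes can be shown with a
minimal, self-contained C sketch. This is not the kernel code (no atomics,
no per-CPU nodes, no smp_cond_load_relaxed(); poll_cpu_via_prev() and
poll_cpu_cached() are made-up names); only the struct mirrors
kernel/locking/osq_lock.c:

#include <stdio.h>

struct optimistic_spin_node {
	struct optimistic_spin_node *next, *prev;
	int locked;	/* 1 if lock acquired */
	int cpu;	/* encoded CPU # + 1 value */
	int prev_cpu;	/* encoded CPU # + 1 value */
};

/* Before: each poll dereferences node->prev and reads a field in the
 * previous node's cache line, which other CPUs are busy writing. */
static int poll_cpu_via_prev(const struct optimistic_spin_node *node)
{
	return node->prev->cpu - 1;	/* likely cache miss per poll */
}

/* After: the CPU number was copied into the spinner's own node when it
 * queued, so the poll stays within an already-local cache line. */
static int poll_cpu_cached(const struct optimistic_spin_node *node)
{
	return node->prev_cpu - 1;	/* local read */
}

int main(void)
{
	struct optimistic_spin_node prev = { .cpu = 3 };	/* CPU 2, encoded */
	struct optimistic_spin_node node = {
		.prev = &prev,
		.prev_cpu = 3,		/* cached copy of prev.cpu */
	};

	printf("via prev: CPU %d, cached: CPU %d\n",
	       poll_cpu_via_prev(&node), poll_cpu_cached(&node));
	return 0;
}

The point is purely locality: the 'before' read chases node->prev into a
cache line the lock handover is concurrently writing, while the 'after'
read stays in the spinner's own node.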
Hello,
kernel test robot noticed a 10.7% improvement of stress-ng.netlink-task.ops_per_sec on:
commit: d93300891f810c9498d09a6ecea2403d7a3546f0 ("[PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.")
url: https://github.com/intel-lab-lkp/linux/commits/David-Laight/locking-osq_lock-Defer-clearing-node-locked-until-the-slow-osq_lock-path/20240101-055853
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 610a9b8f49fbcf1100716370d3b5f6f884a2835a
patch link: https://lore.kernel.org/all/3a9d1782cd50436c99ced8c10175bae6@AcuMS.aculab.com/
patch subject: [PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.
testcase: stress-ng
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:
nr_threads: 100%
testtime: 60s
sc_pid_max: 4194304
class: scheduler
test: netlink-task
cpufreq_governor: performance
Details are as below:
-------------------------------------------------------------------------------------------------->
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240108/202401081557.641738f5-oliver.sang@intel.com
=========================================================================================
class/compiler/cpufreq_governor/kconfig/nr_threads/rootfs/sc_pid_max/tbox_group/test/testcase/testtime:
scheduler/gcc-12/performance/x86_64-rhel-8.3/100%/debian-11.1-x86_64-20220510.cgz/4194304/lkp-icl-2sp8/netlink-task/stress-ng/60s
commit:
ff787c1fd0 ("locking/osq_lock: Defer clearing node->locked until the slow osq_lock() path.")
d93300891f ("locking/osq_lock: Optimise the vcpu_is_preempted() check.")
ff787c1fd0c13f9b d93300891f810c9498d09a6ecea
---------------- ---------------------------
%stddev %change %stddev
\ | \
3880 ± 7% +26.4% 4903 ± 18% vmstat.system.cs
0.48 ±126% -99.8% 0.00 ±141% perf-sched.sch_delay.avg.ms.__cond_resched.aa_sk_perm.security_socket_recvmsg.sock_recvmsg.__sys_recvfrom
0.16 ± 23% -38.9% 0.10 ± 32% perf-sched.sch_delay.avg.ms.schedule_preempt_disabled.__mutex_lock.constprop.0.genl_rcv_msg
1.50 ±118% -99.9% 0.00 ±142% perf-sched.sch_delay.max.ms.__cond_resched.aa_sk_perm.security_socket_recvmsg.sock_recvmsg.__sys_recvfrom
2.55 ± 97% -83.7% 0.42 ±145% perf-sched.wait_time.max.ms.__cond_resched.__mutex_lock.constprop.0.genl_rcv_msg
32244865 +10.7% 35709040 stress-ng.netlink-task.ops
537413 +10.7% 595150 stress-ng.netlink-task.ops_per_sec
38094 ± 12% +42.2% 54160 ± 27% stress-ng.time.involuntary_context_switches
42290 ± 11% +39.8% 59117 ± 23% stress-ng.time.voluntary_context_switches
143.50 ± 7% -28.8% 102.17 ± 15% perf-c2c.DRAM.local
4955 ± 3% -26.4% 3647 ± 4% perf-c2c.DRAM.remote
4038 ± 2% -18.8% 3277 ± 4% perf-c2c.HITM.local
3483 ± 3% -21.1% 2747 ± 5% perf-c2c.HITM.remote
7521 ± 2% -19.9% 6024 ± 4% perf-c2c.HITM.total
0.42 ± 3% -16.2% 0.35 ± 5% perf-stat.i.MPKI
1.066e+10 +9.6% 1.169e+10 perf-stat.i.branch-instructions
51.90 -2.5 49.42 ± 2% perf-stat.i.cache-miss-rate%
22517746 ± 3% -13.4% 19503564 ± 5% perf-stat.i.cache-misses
3730 ± 7% +29.2% 4819 ± 19% perf-stat.i.context-switches
3.99 -3.1% 3.86 perf-stat.i.cpi
9535 ± 3% +15.4% 11003 ± 5% perf-stat.i.cycles-between-cache-misses
0.00 ± 3% +0.0 0.00 ± 3% perf-stat.i.dTLB-load-miss-rate%
1.419e+10 -14.9% 1.207e+10 perf-stat.i.dTLB-loads
8.411e+08 +8.4% 9.118e+08 perf-stat.i.dTLB-stores
5.36e+10 +3.1% 5.524e+10 perf-stat.i.instructions
0.26 +7.0% 0.28 ± 5% perf-stat.i.ipc
837.29 ± 3% -9.8% 755.30 ± 4% perf-stat.i.metric.K/sec
401.41 -4.1% 385.10 perf-stat.i.metric.M/sec
6404966 -23.2% 4920722 perf-stat.i.node-load-misses
141818 ± 4% -22.2% 110404 ± 4% perf-stat.i.node-loads
69.54 +13.8 83.36 perf-stat.i.node-store-miss-rate%
3935319 +10.4% 4345724 perf-stat.i.node-store-misses
1626033 -52.6% 771187 ± 5% perf-stat.i.node-stores
0.42 ± 3% -16.0% 0.35 ± 5% perf-stat.overall.MPKI
0.11 -0.0 0.10 ± 8% perf-stat.overall.branch-miss-rate%
51.32 -2.5 48.79 ± 2% perf-stat.overall.cache-miss-rate%
4.06 -3.0% 3.94 perf-stat.overall.cpi
9677 ± 3% +15.6% 11187 ± 5% perf-stat.overall.cycles-between-cache-misses
0.00 ± 3% +0.0 0.00 ± 4% perf-stat.overall.dTLB-load-miss-rate%
0.25 +3.1% 0.25 perf-stat.overall.ipc
70.78 +14.2 84.94 perf-stat.overall.node-store-miss-rate%
1.049e+10 +9.5% 1.149e+10 perf-stat.ps.branch-instructions
22167740 ± 3% -13.4% 19186498 ± 5% perf-stat.ps.cache-misses
3667 ± 7% +29.1% 4735 ± 19% perf-stat.ps.context-switches
1.396e+10 -15.0% 1.187e+10 perf-stat.ps.dTLB-loads
8.273e+08 +8.3% 8.963e+08 perf-stat.ps.dTLB-stores
5.274e+10 +3.0% 5.433e+10 perf-stat.ps.instructions
6303682 -23.2% 4839978 perf-stat.ps.node-load-misses
140690 ± 4% -22.5% 109023 ± 4% perf-stat.ps.node-loads
3875362 +10.3% 4276026 perf-stat.ps.node-store-misses
1599985 -52.6% 758184 ± 5% perf-stat.ps.node-stores
3.297e+12 +3.0% 3.396e+12 perf-stat.total.instructions
96.07 -0.2 95.87 perf-profile.calltrace.cycles-pp.osq_lock.__mutex_lock.genl_rcv_msg.netlink_rcv_skb.genl_rcv
97.52 -0.1 97.37 perf-profile.calltrace.cycles-pp.__mutex_lock.genl_rcv_msg.netlink_rcv_skb.genl_rcv.netlink_unicast
98.98 -0.1 98.90 perf-profile.calltrace.cycles-pp.netlink_rcv_skb.genl_rcv.netlink_unicast.netlink_sendmsg.__sys_sendto
98.99 -0.1 98.92 perf-profile.calltrace.cycles-pp.genl_rcv.netlink_unicast.netlink_sendmsg.__sys_sendto.__x64_sys_sendto
98.97 -0.1 98.89 perf-profile.calltrace.cycles-pp.genl_rcv_msg.netlink_rcv_skb.genl_rcv.netlink_unicast.netlink_sendmsg
99.09 -0.1 99.04 perf-profile.calltrace.cycles-pp.netlink_unicast.netlink_sendmsg.__sys_sendto.__x64_sys_sendto.do_syscall_64
99.47 -0.0 99.43 perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto.stress_netlink_taskstats_monitor.stress_netlink_task
99.44 -0.0 99.40 perf-profile.calltrace.cycles-pp.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto.stress_netlink_taskstats_monitor
99.35 -0.0 99.32 perf-profile.calltrace.cycles-pp.netlink_sendmsg.__sys_sendto.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe
99.44 -0.0 99.40 perf-profile.calltrace.cycles-pp.__sys_sendto.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto
96.08 -0.2 95.89 perf-profile.children.cycles-pp.osq_lock
97.52 -0.1 97.38 perf-profile.children.cycles-pp.__mutex_lock
98.98 -0.1 98.90 perf-profile.children.cycles-pp.netlink_rcv_skb
99.00 -0.1 98.92 perf-profile.children.cycles-pp.genl_rcv
98.97 -0.1 98.89 perf-profile.children.cycles-pp.genl_rcv_msg
99.20 -0.0 99.15 perf-profile.children.cycles-pp.netlink_unicast
0.13 ± 3% -0.0 0.08 ± 7% perf-profile.children.cycles-pp.genl_cmd_full_to_split
0.14 ± 4% -0.0 0.10 ± 5% perf-profile.children.cycles-pp.genl_get_cmd
99.36 -0.0 99.32 perf-profile.children.cycles-pp.netlink_sendmsg
99.44 -0.0 99.41 perf-profile.children.cycles-pp.__x64_sys_sendto
99.44 -0.0 99.41 perf-profile.children.cycles-pp.__sys_sendto
99.59 -0.0 99.56 perf-profile.children.cycles-pp.sendto
0.07 ± 5% +0.0 0.08 ± 5% perf-profile.children.cycles-pp.genl_family_rcv_msg_attrs_parse
0.11 +0.0 0.12 ± 6% perf-profile.children.cycles-pp.apparmor_capable
0.18 ± 3% +0.0 0.20 ± 4% perf-profile.children.cycles-pp.netlink_recvmsg
0.36 +0.0 0.38 perf-profile.children.cycles-pp.fill_stats
0.13 ± 3% +0.0 0.15 ± 4% perf-profile.children.cycles-pp.ns_capable
0.20 ± 3% +0.0 0.23 ± 4% perf-profile.children.cycles-pp.sock_recvmsg
0.24 ± 3% +0.0 0.27 ± 3% perf-profile.children.cycles-pp.__sys_recvfrom
0.24 ± 3% +0.0 0.27 ± 4% perf-profile.children.cycles-pp.__x64_sys_recvfrom
0.31 ± 3% +0.0 0.34 ± 3% perf-profile.children.cycles-pp.recv
1.22 +0.0 1.26 perf-profile.children.cycles-pp.genl_family_rcv_msg
0.85 +0.1 0.90 perf-profile.children.cycles-pp.cmd_attr_pid
0.94 +0.1 1.01 perf-profile.children.cycles-pp.genl_family_rcv_msg_doit
1.11 +0.1 1.23 perf-profile.children.cycles-pp.mutex_spin_on_owner
95.80 -0.2 95.62 perf-profile.self.cycles-pp.osq_lock
0.13 ± 3% -0.0 0.08 ± 7% perf-profile.self.cycles-pp.genl_cmd_full_to_split
0.11 ± 3% +0.0 0.12 ± 6% perf-profile.self.cycles-pp.apparmor_capable
1.11 +0.1 1.23 perf-profile.self.cycles-pp.mutex_spin_on_owner
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* David Laight <David.Laight@ACULAB.COM> wrote:
> The vcpu_is_preempted() test stops osq_lock() spinning if a virtual
> cpu is no longer running.
>
> Although patched out for bare-metal the code still needs the cpu number.
Comma missing.
> Reading this from 'prev->cpu' is a pretty much guaranteed have a cache miss
> when osq_unlock() is waking up the next cpu.
>
> Instead save 'prev->cpu' in 'node->prev_cpu' and use that value instead.
> Update in the osq_lock() 'unqueue' path when 'node->prev' is changed.
>
> This is simpler than checking for 'node->prev' changing and caching
> 'prev->cpu'.
Throughout the series, in changelogs and comments, please do:
  s/cpu/CPU/
Please be more careful about changelog quality.
>  struct optimistic_spin_node {
>  	struct optimistic_spin_node *next, *prev;
> -	int locked; /* 1 if lock acquired */
> -	int cpu; /* encoded CPU # + 1 value */
> +	int locked;	/* 1 if lock acquired */
> +	int cpu;	/* encoded CPU # + 1 value */
> +	int prev_cpu;	/* encoded CPU # + 1 value */
>  };
  s/encoded CPU # + 1 value/encoded value: CPU+1/
Thanks,
Ingo
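
For context on the suggested comment text: the +1 encoding exists so that
the lock's atomic tail value 0 can keep meaning "no spinner queued", i.e.
OSQ_UNLOCKED_VAL. A minimal userspace sketch of the encoding follows;
decode_cpu_nr() is a hypothetical helper for illustration only, since the
kernel's decode_cpu() returns the per-CPU node pointer rather than the
number:

#include <stdio.h>

#define OSQ_UNLOCKED_VAL 0	/* tail value meaning "queue empty" */

/* CPU numbers start at 0, so store CPU+1: CPU 0 must not
 * collide with OSQ_UNLOCKED_VAL. */
static inline int encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;
}

/* Hypothetical inverse, for illustration. */
static inline int decode_cpu_nr(int encoded_cpu_val)
{
	return encoded_cpu_val - 1;
}

int main(void)
{
	printf("CPU 0 encodes to %d (!= OSQ_UNLOCKED_VAL %d), decodes to %d\n",
	       encode_cpu(0), OSQ_UNLOCKED_VAL, decode_cpu_nr(encode_cpu(0)));
	return 0;
}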
On 12/31/23 16:52, David Laight wrote:
> The vcpu_is_preempted() test stops osq_lock() spinning if a virtual
> cpu is no longer running.
>
> Although patched out for bare-metal the code still needs the cpu number.
> Reading this from 'prev->cpu' is a pretty much guaranteed have a cache miss
> when osq_unlock() is waking up the next cpu.
>
> Instead save 'prev->cpu' in 'node->prev_cpu' and use that value instead.
> Update in the osq_lock() 'unqueue' path when 'node->prev' is changed.
>
> This is simpler than checking for 'node->prev' changing and caching
> 'prev->cpu'.
>
> Signed-off-by: David Laight <david.laight@aculab.com>
> ---
> kernel/locking/osq_lock.c | 16 +++++++---------
> 1 file changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
> index e0bc74d85a76..eb8a6dfdb79d 100644
> --- a/kernel/locking/osq_lock.c
> +++ b/kernel/locking/osq_lock.c
> @@ -14,8 +14,9 @@
>  
>  struct optimistic_spin_node {
>  	struct optimistic_spin_node *next, *prev;
> -	int locked; /* 1 if lock acquired */
> -	int cpu; /* encoded CPU # + 1 value */
> +	int locked;	/* 1 if lock acquired */
> +	int cpu;	/* encoded CPU # + 1 value */
> +	int prev_cpu;	/* encoded CPU # + 1 value */
>  };
>  
>  static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
> @@ -29,11 +30,6 @@ static inline int encode_cpu(int cpu_nr)
>  	return cpu_nr + 1;
>  }
>  
> -static inline int node_cpu(struct optimistic_spin_node *node)
> -{
> -	return node->cpu - 1;
> -}
> -
>  static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
>  {
>  	int cpu_nr = encoded_cpu_val - 1;
> @@ -110,9 +106,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
>  	if (old == OSQ_UNLOCKED_VAL)
>  		return true;
>  
> -	node->locked = 0;
> +	node->prev_cpu = old;
>  	prev = decode_cpu(old);
>  	node->prev = prev;
> +	node->locked = 0;
>  
>  	/*
>  	 * osq_lock() unqueue
> @@ -144,7 +141,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
>  	 * polling, be careful.
>  	 */
>  	if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
> -				  vcpu_is_preempted(node_cpu(node->prev))))
> +				  vcpu_is_preempted(READ_ONCE(node->prev_cpu) - 1)))
>  		return true;
>  
>  	/* unqueue */
> @@ -201,6 +198,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
>  	 * it will wait in Step-A.
>  	 */
>  
> +	WRITE_ONCE(next->prev_cpu, prev->cpu);
>  	WRITE_ONCE(next->prev, prev);
>  	WRITE_ONCE(prev->next, next);
>  
Reviewed-by: Waiman Long <longman@redhat.com>