[PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.

David Laight posted 5 patches 1 year, 11 months ago
Posted by David Laight 1 year, 11 months ago
The vcpu_is_preempted() test stops osq_lock() spinning if a virtual
CPU is no longer running.

Although patched out for bare-metal, the code still needs the CPU number.
Reading this from 'prev->cpu' is pretty much guaranteed to cause a cache miss
when osq_unlock() is waking up the next CPU.

Instead, save 'prev->cpu' in 'node->prev_cpu' and use that value.
Update it in the osq_lock() 'unqueue' path when 'node->prev' is changed.

This is simpler than checking whether 'node->prev' has changed and
re-caching 'prev->cpu'.
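
As a rough sketch (simplified from the diff below), the spin-wait
condition changes from chasing the 'prev' pointer on every poll to
reading a value cached in the spinner's own node:

	/* Before: each poll dereferences node->prev and reads prev->cpu,
	 * a cache line the unlocking CPU is about to write.
	 */
	smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
			      vcpu_is_preempted(node_cpu(node->prev)));

	/* After: the encoded CPU number is already in the local node, so
	 * the poll only touches data this CPU owns.
	 */
	smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
			      vcpu_is_preempted(READ_ONCE(node->prev_cpu) - 1));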

Signed-off-by: David Laight <david.laight@aculab.com>
---
 kernel/locking/osq_lock.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index e0bc74d85a76..eb8a6dfdb79d 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -14,8 +14,9 @@
 
 struct optimistic_spin_node {
 	struct optimistic_spin_node *next, *prev;
-	int locked; /* 1 if lock acquired */
-	int cpu; /* encoded CPU # + 1 value */
+	int locked;    /* 1 if lock acquired */
+	int cpu;       /* encoded CPU # + 1 value */
+	int prev_cpu;  /* encoded CPU # + 1 value */
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
@@ -29,11 +30,6 @@ static inline int encode_cpu(int cpu_nr)
 	return cpu_nr + 1;
 }
 
-static inline int node_cpu(struct optimistic_spin_node *node)
-{
-	return node->cpu - 1;
-}
-
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
 	int cpu_nr = encoded_cpu_val - 1;
@@ -110,9 +106,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	if (old == OSQ_UNLOCKED_VAL)
 		return true;
 
-	node->locked = 0;
+	node->prev_cpu = old;
 	prev = decode_cpu(old);
 	node->prev = prev;
+	node->locked = 0;
 
 	/*
 	 * osq_lock()			unqueue
@@ -144,7 +141,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * polling, be careful.
 	 */
 	if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
-				  vcpu_is_preempted(node_cpu(node->prev))))
+				  vcpu_is_preempted(READ_ONCE(node->prev_cpu) - 1)))
 		return true;
 
 	/* unqueue */
@@ -201,6 +198,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * it will wait in Step-A.
 	 */
 
+	WRITE_ONCE(next->prev_cpu, prev->cpu);
 	WRITE_ONCE(next->prev, prev);
 	WRITE_ONCE(prev->next, next);
 
-- 
2.17.1

Re: [PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.
Posted by kernel test robot 1 year, 11 months ago

Hello,

kernel test robot noticed a 10.7% improvement of stress-ng.netlink-task.ops_per_sec on:


commit: d93300891f810c9498d09a6ecea2403d7a3546f0 ("[PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.")
url: https://github.com/intel-lab-lkp/linux/commits/David-Laight/locking-osq_lock-Defer-clearing-node-locked-until-the-slow-osq_lock-path/20240101-055853
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 610a9b8f49fbcf1100716370d3b5f6f884a2835a
patch link: https://lore.kernel.org/all/3a9d1782cd50436c99ced8c10175bae6@AcuMS.aculab.com/
patch subject: [PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.

testcase: stress-ng
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:

	nr_threads: 100%
	testtime: 60s
	sc_pid_max: 4194304
	class: scheduler
	test: netlink-task
	cpufreq_governor: performance

Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240108/202401081557.641738f5-oliver.sang@intel.com

=========================================================================================
class/compiler/cpufreq_governor/kconfig/nr_threads/rootfs/sc_pid_max/tbox_group/test/testcase/testtime:
  scheduler/gcc-12/performance/x86_64-rhel-8.3/100%/debian-11.1-x86_64-20220510.cgz/4194304/lkp-icl-2sp8/netlink-task/stress-ng/60s

commit: 
  ff787c1fd0 ("locking/osq_lock: Defer clearing node->locked until the slow osq_lock() path.")
  d93300891f ("locking/osq_lock: Optimise the vcpu_is_preempted() check.")

ff787c1fd0c13f9b d93300891f810c9498d09a6ecea 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      3880 ±  7%     +26.4%       4903 ± 18%  vmstat.system.cs
      0.48 ±126%     -99.8%       0.00 ±141%  perf-sched.sch_delay.avg.ms.__cond_resched.aa_sk_perm.security_socket_recvmsg.sock_recvmsg.__sys_recvfrom
      0.16 ± 23%     -38.9%       0.10 ± 32%  perf-sched.sch_delay.avg.ms.schedule_preempt_disabled.__mutex_lock.constprop.0.genl_rcv_msg
      1.50 ±118%     -99.9%       0.00 ±142%  perf-sched.sch_delay.max.ms.__cond_resched.aa_sk_perm.security_socket_recvmsg.sock_recvmsg.__sys_recvfrom
      2.55 ± 97%     -83.7%       0.42 ±145%  perf-sched.wait_time.max.ms.__cond_resched.__mutex_lock.constprop.0.genl_rcv_msg
  32244865           +10.7%   35709040        stress-ng.netlink-task.ops
    537413           +10.7%     595150        stress-ng.netlink-task.ops_per_sec
     38094 ± 12%     +42.2%      54160 ± 27%  stress-ng.time.involuntary_context_switches
     42290 ± 11%     +39.8%      59117 ± 23%  stress-ng.time.voluntary_context_switches
    143.50 ±  7%     -28.8%     102.17 ± 15%  perf-c2c.DRAM.local
      4955 ±  3%     -26.4%       3647 ±  4%  perf-c2c.DRAM.remote
      4038 ±  2%     -18.8%       3277 ±  4%  perf-c2c.HITM.local
      3483 ±  3%     -21.1%       2747 ±  5%  perf-c2c.HITM.remote
      7521 ±  2%     -19.9%       6024 ±  4%  perf-c2c.HITM.total
      0.42 ±  3%     -16.2%       0.35 ±  5%  perf-stat.i.MPKI
 1.066e+10            +9.6%  1.169e+10        perf-stat.i.branch-instructions
     51.90            -2.5       49.42 ±  2%  perf-stat.i.cache-miss-rate%
  22517746 ±  3%     -13.4%   19503564 ±  5%  perf-stat.i.cache-misses
      3730 ±  7%     +29.2%       4819 ± 19%  perf-stat.i.context-switches
      3.99            -3.1%       3.86        perf-stat.i.cpi
      9535 ±  3%     +15.4%      11003 ±  5%  perf-stat.i.cycles-between-cache-misses
      0.00 ±  3%      +0.0        0.00 ±  3%  perf-stat.i.dTLB-load-miss-rate%
 1.419e+10           -14.9%  1.207e+10        perf-stat.i.dTLB-loads
 8.411e+08            +8.4%  9.118e+08        perf-stat.i.dTLB-stores
  5.36e+10            +3.1%  5.524e+10        perf-stat.i.instructions
      0.26            +7.0%       0.28 ±  5%  perf-stat.i.ipc
    837.29 ±  3%      -9.8%     755.30 ±  4%  perf-stat.i.metric.K/sec
    401.41            -4.1%     385.10        perf-stat.i.metric.M/sec
   6404966           -23.2%    4920722        perf-stat.i.node-load-misses
    141818 ±  4%     -22.2%     110404 ±  4%  perf-stat.i.node-loads
     69.54           +13.8       83.36        perf-stat.i.node-store-miss-rate%
   3935319           +10.4%    4345724        perf-stat.i.node-store-misses
   1626033           -52.6%     771187 ±  5%  perf-stat.i.node-stores
      0.42 ±  3%     -16.0%       0.35 ±  5%  perf-stat.overall.MPKI
      0.11            -0.0        0.10 ±  8%  perf-stat.overall.branch-miss-rate%
     51.32            -2.5       48.79 ±  2%  perf-stat.overall.cache-miss-rate%
      4.06            -3.0%       3.94        perf-stat.overall.cpi
      9677 ±  3%     +15.6%      11187 ±  5%  perf-stat.overall.cycles-between-cache-misses
      0.00 ±  3%      +0.0        0.00 ±  4%  perf-stat.overall.dTLB-load-miss-rate%
      0.25            +3.1%       0.25        perf-stat.overall.ipc
     70.78           +14.2       84.94        perf-stat.overall.node-store-miss-rate%
 1.049e+10            +9.5%  1.149e+10        perf-stat.ps.branch-instructions
  22167740 ±  3%     -13.4%   19186498 ±  5%  perf-stat.ps.cache-misses
      3667 ±  7%     +29.1%       4735 ± 19%  perf-stat.ps.context-switches
 1.396e+10           -15.0%  1.187e+10        perf-stat.ps.dTLB-loads
 8.273e+08            +8.3%  8.963e+08        perf-stat.ps.dTLB-stores
 5.274e+10            +3.0%  5.433e+10        perf-stat.ps.instructions
   6303682           -23.2%    4839978        perf-stat.ps.node-load-misses
    140690 ±  4%     -22.5%     109023 ±  4%  perf-stat.ps.node-loads
   3875362           +10.3%    4276026        perf-stat.ps.node-store-misses
   1599985           -52.6%     758184 ±  5%  perf-stat.ps.node-stores
 3.297e+12            +3.0%  3.396e+12        perf-stat.total.instructions
     96.07            -0.2       95.87        perf-profile.calltrace.cycles-pp.osq_lock.__mutex_lock.genl_rcv_msg.netlink_rcv_skb.genl_rcv
     97.52            -0.1       97.37        perf-profile.calltrace.cycles-pp.__mutex_lock.genl_rcv_msg.netlink_rcv_skb.genl_rcv.netlink_unicast
     98.98            -0.1       98.90        perf-profile.calltrace.cycles-pp.netlink_rcv_skb.genl_rcv.netlink_unicast.netlink_sendmsg.__sys_sendto
     98.99            -0.1       98.92        perf-profile.calltrace.cycles-pp.genl_rcv.netlink_unicast.netlink_sendmsg.__sys_sendto.__x64_sys_sendto
     98.97            -0.1       98.89        perf-profile.calltrace.cycles-pp.genl_rcv_msg.netlink_rcv_skb.genl_rcv.netlink_unicast.netlink_sendmsg
     99.09            -0.1       99.04        perf-profile.calltrace.cycles-pp.netlink_unicast.netlink_sendmsg.__sys_sendto.__x64_sys_sendto.do_syscall_64
     99.47            -0.0       99.43        perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto.stress_netlink_taskstats_monitor.stress_netlink_task
     99.44            -0.0       99.40        perf-profile.calltrace.cycles-pp.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto.stress_netlink_taskstats_monitor
     99.35            -0.0       99.32        perf-profile.calltrace.cycles-pp.netlink_sendmsg.__sys_sendto.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe
     99.44            -0.0       99.40        perf-profile.calltrace.cycles-pp.__sys_sendto.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto
     96.08            -0.2       95.89        perf-profile.children.cycles-pp.osq_lock
     97.52            -0.1       97.38        perf-profile.children.cycles-pp.__mutex_lock
     98.98            -0.1       98.90        perf-profile.children.cycles-pp.netlink_rcv_skb
     99.00            -0.1       98.92        perf-profile.children.cycles-pp.genl_rcv
     98.97            -0.1       98.89        perf-profile.children.cycles-pp.genl_rcv_msg
     99.20            -0.0       99.15        perf-profile.children.cycles-pp.netlink_unicast
      0.13 ±  3%      -0.0        0.08 ±  7%  perf-profile.children.cycles-pp.genl_cmd_full_to_split
      0.14 ±  4%      -0.0        0.10 ±  5%  perf-profile.children.cycles-pp.genl_get_cmd
     99.36            -0.0       99.32        perf-profile.children.cycles-pp.netlink_sendmsg
     99.44            -0.0       99.41        perf-profile.children.cycles-pp.__x64_sys_sendto
     99.44            -0.0       99.41        perf-profile.children.cycles-pp.__sys_sendto
     99.59            -0.0       99.56        perf-profile.children.cycles-pp.sendto
      0.07 ±  5%      +0.0        0.08 ±  5%  perf-profile.children.cycles-pp.genl_family_rcv_msg_attrs_parse
      0.11            +0.0        0.12 ±  6%  perf-profile.children.cycles-pp.apparmor_capable
      0.18 ±  3%      +0.0        0.20 ±  4%  perf-profile.children.cycles-pp.netlink_recvmsg
      0.36            +0.0        0.38        perf-profile.children.cycles-pp.fill_stats
      0.13 ±  3%      +0.0        0.15 ±  4%  perf-profile.children.cycles-pp.ns_capable
      0.20 ±  3%      +0.0        0.23 ±  4%  perf-profile.children.cycles-pp.sock_recvmsg
      0.24 ±  3%      +0.0        0.27 ±  3%  perf-profile.children.cycles-pp.__sys_recvfrom
      0.24 ±  3%      +0.0        0.27 ±  4%  perf-profile.children.cycles-pp.__x64_sys_recvfrom
      0.31 ±  3%      +0.0        0.34 ±  3%  perf-profile.children.cycles-pp.recv
      1.22            +0.0        1.26        perf-profile.children.cycles-pp.genl_family_rcv_msg
      0.85            +0.1        0.90        perf-profile.children.cycles-pp.cmd_attr_pid
      0.94            +0.1        1.01        perf-profile.children.cycles-pp.genl_family_rcv_msg_doit
      1.11            +0.1        1.23        perf-profile.children.cycles-pp.mutex_spin_on_owner
     95.80            -0.2       95.62        perf-profile.self.cycles-pp.osq_lock
      0.13 ±  3%      -0.0        0.08 ±  7%  perf-profile.self.cycles-pp.genl_cmd_full_to_split
      0.11 ±  3%      +0.0        0.12 ±  6%  perf-profile.self.cycles-pp.apparmor_capable
      1.11            +0.1        1.23        perf-profile.self.cycles-pp.mutex_spin_on_owner

Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.
Posted by Ingo Molnar 1 year, 11 months ago
* David Laight <David.Laight@ACULAB.COM> wrote:

> The vcpu_is_preempted() test stops osq_lock() spinning if a virtual
> cpu is no longer running.
> 
> Although patched out for bare-metal the code still needs the cpu number.

Comma missing.

> Reading this from 'prev->cpu' is a pretty much guaranteed have a cache miss
> when osq_unlock() is waking up the next cpu.
> 
> Instead save 'prev->cpu' in 'node->prev_cpu' and use that value instead.
> Update in the osq_lock() 'unqueue' path when 'node->prev' is changed.
> 
> This is simpler than checking for 'node->prev' changing and caching
> 'prev->cpu'.

Throughout the series, in changelogs and comments, please do:

 s/cpu
  /CPU

Please be more careful about changelog quality.

>  struct optimistic_spin_node {
>  	struct optimistic_spin_node *next, *prev;
> -	int locked; /* 1 if lock acquired */
> -	int cpu; /* encoded CPU # + 1 value */
> +	int locked;    /* 1 if lock acquired */
> +	int cpu;       /* encoded CPU # + 1 value */
> +	int prev_cpu;  /* encoded CPU # + 1 value */
>  };

s/ encoded CPU # + 1 value
 / encoded value: CPU+1
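
I.e., applied to the struct above, that would read:

	struct optimistic_spin_node {
		struct optimistic_spin_node *next, *prev;
		int locked;    /* 1 if lock acquired */
		int cpu;       /* encoded value: CPU+1 */
		int prev_cpu;  /* encoded value: CPU+1 */
	};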

Thanks,

	Ingo
Re: [PATCH next v2 2/5] locking/osq_lock: Optimise the vcpu_is_preempted() check.
Posted by Waiman Long 1 year, 12 months ago
On 12/31/23 16:52, David Laight wrote:
> The vcpu_is_preempted() test stops osq_lock() spinning if a virtual
> cpu is no longer running.
>
> Although patched out for bare-metal the code still needs the cpu number.
> Reading this from 'prev->cpu' is a pretty much guaranteed have a cache miss
> when osq_unlock() is waking up the next cpu.
>
> Instead save 'prev->cpu' in 'node->prev_cpu' and use that value instead.
> Update in the osq_lock() 'unqueue' path when 'node->prev' is changed.
>
> This is simpler than checking for 'node->prev' changing and caching
> 'prev->cpu'.
>
> Signed-off-by: David Laight <david.laight@aculab.com>
> ---
>   kernel/locking/osq_lock.c | 16 +++++++---------
>   1 file changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
> index e0bc74d85a76..eb8a6dfdb79d 100644
> --- a/kernel/locking/osq_lock.c
> +++ b/kernel/locking/osq_lock.c
> @@ -14,8 +14,9 @@
>   
>   struct optimistic_spin_node {
>   	struct optimistic_spin_node *next, *prev;
> -	int locked; /* 1 if lock acquired */
> -	int cpu; /* encoded CPU # + 1 value */
> +	int locked;    /* 1 if lock acquired */
> +	int cpu;       /* encoded CPU # + 1 value */
> +	int prev_cpu;  /* encoded CPU # + 1 value */
>   };
>   
>   static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
> @@ -29,11 +30,6 @@ static inline int encode_cpu(int cpu_nr)
>   	return cpu_nr + 1;
>   }
>   
> -static inline int node_cpu(struct optimistic_spin_node *node)
> -{
> -	return node->cpu - 1;
> -}
> -
>   static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
>   {
>   	int cpu_nr = encoded_cpu_val - 1;
> @@ -110,9 +106,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
>   	if (old == OSQ_UNLOCKED_VAL)
>   		return true;
>   
> -	node->locked = 0;
> +	node->prev_cpu = old;
>   	prev = decode_cpu(old);
>   	node->prev = prev;
> +	node->locked = 0;
>   
>   	/*
>   	 * osq_lock()			unqueue
> @@ -144,7 +141,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
>   	 * polling, be careful.
>   	 */
>   	if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
> -				  vcpu_is_preempted(node_cpu(node->prev))))
> +				  vcpu_is_preempted(READ_ONCE(node->prev_cpu) - 1)))
>   		return true;
>   
>   	/* unqueue */
> @@ -201,6 +198,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
>   	 * it will wait in Step-A.
>   	 */
>   
> +	WRITE_ONCE(next->prev_cpu, prev->cpu);
>   	WRITE_ONCE(next->prev, prev);
>   	WRITE_ONCE(prev->next, next);
>   

Reviewed-by: Waiman Long <longman@redhat.com>