[patch V3 14/18] posix-timers: Avoid false cacheline sharing

Thomas Gleixner posted 18 patches 9 months, 2 weeks ago
[patch V3 14/18] posix-timers: Avoid false cacheline sharing
Posted by Thomas Gleixner 9 months, 2 weeks ago
struct k_itimer has the hlist_node, which is used for lookup in the hash
bucket, and the timer lock in the same cache line.

That's obviously bad, if one CPU fiddles with a timer and the other is
walking the hash bucket on which that timer is queued.

Avoid this by restructuring struct k_itimer, so that the read mostly (only
modified during setup and teardown) fields are in the first cache line and
the lock and the rest of the fields which get written to are in cacheline
2-N.

Reduces cacheline contention in a test case of 64 processes creating and
accessing 20000 timers each by almost 30% according to perf.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
V2: New patch
---
 include/linux/posix-timers.h |   21 ++++++++++++---------
 kernel/time/posix-timers.c   |    4 ++--
 2 files changed, 14 insertions(+), 11 deletions(-)

--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -177,23 +177,26 @@ static inline void posix_cputimers_init_
  * @rcu:		RCU head for freeing the timer.
  */
 struct k_itimer {
-	struct hlist_node	list;
-	struct hlist_node	ignored_list;
+	/* 1st cacheline contains read-mostly fields */
 	struct hlist_node	t_hash;
-	spinlock_t		it_lock;
-	const struct k_clock	*kclock;
-	clockid_t		it_clock;
+	struct hlist_node	list;
 	timer_t			it_id;
+	clockid_t		it_clock;
+	int			it_sigev_notify;
+	enum pid_type		it_pid_type;
+	struct signal_struct	*it_signal;
+	const struct k_clock	*kclock;
+
+	/* 2nd cacheline and above contain fields which are modified regularly */
+	spinlock_t		it_lock;
 	int			it_status;
 	bool			it_sig_periodic;
 	s64			it_overrun;
 	s64			it_overrun_last;
 	unsigned int		it_signal_seq;
 	unsigned int		it_sigqueue_seq;
-	int			it_sigev_notify;
-	enum pid_type		it_pid_type;
 	ktime_t			it_interval;
-	struct signal_struct	*it_signal;
+	struct hlist_node	ignored_list;
 	union {
 		struct pid		*it_pid;
 		struct task_struct	*it_process;
@@ -210,7 +213,7 @@ struct k_itimer {
 		} alarm;
 	} it;
 	struct rcu_head		rcu;
-};
+} ____cacheline_aligned_in_smp;
 
 void run_posix_cpu_timers(void);
 void posix_cpu_timers_exit(struct task_struct *task);
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -260,8 +260,8 @@ static int posix_get_hrtimer_res(clockid
 
 static __init int init_posix_timers(void)
 {
-	posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), 0,
-					       SLAB_ACCOUNT, NULL);
+	posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
+					       __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
 	return 0;
 }
 __initcall(init_posix_timers);
Re: [patch V3 14/18] posix-timers: Avoid false cacheline sharing
Posted by Nysal Jan K.A. 9 months ago
On Sat, Mar 08, 2025 at 05:48:42PM +0100, Thomas Gleixner wrote:
> ---
> V2: New patch
> ---
>  include/linux/posix-timers.h |   21 ++++++++++++---------
>  kernel/time/posix-timers.c   |    4 ++--
>  2 files changed, 14 insertions(+), 11 deletions(-)
> 
> --- a/include/linux/posix-timers.h
> +++ b/include/linux/posix-timers.h
> @@ -177,23 +177,26 @@ static inline void posix_cputimers_init_
>   * @rcu:		RCU head for freeing the timer.
>   */
>  struct k_itimer {
> -	struct hlist_node	list;
> -	struct hlist_node	ignored_list;
> +	/* 1st cacheline contains read-mostly fields */
>  	struct hlist_node	t_hash;
> -	spinlock_t		it_lock;
> -	const struct k_clock	*kclock;
> -	clockid_t		it_clock;
> +	struct hlist_node	list;
>  	timer_t			it_id;
> +	clockid_t		it_clock;
> +	int			it_sigev_notify;
> +	enum pid_type		it_pid_type;
> +	struct signal_struct	*it_signal;
> +	const struct k_clock	*kclock;
> +
> +	/* 2nd cacheline and above contain fields which are modified regularly */

On architectures like powerpc, where the cache line size is 128 bytes, we might still
run into false sharing. Perhaps rearranging it towards the end of the struct might
help avoid it? Is the benchmark code public? I can collect perf c2c data on powerpc.

> +	spinlock_t		it_lock;
>  	int			it_status;
>  	bool			it_sig_periodic;
>  	s64			it_overrun;
>  	s64			it_overrun_last;
>  	unsigned int		it_signal_seq;
>  	unsigned int		it_sigqueue_seq;
> -	int			it_sigev_notify;
> -	enum pid_type		it_pid_type;
>  	ktime_t			it_interval;
> -	struct signal_struct	*it_signal;
> +	struct hlist_node	ignored_list;
>  	union {
>  		struct pid		*it_pid;
>  		struct task_struct	*it_process;
> @@ -210,7 +213,7 @@ struct k_itimer {
>  		} alarm;
>  	} it;
>  	struct rcu_head		rcu;
> -};
> +} ____cacheline_aligned_in_smp;
>  

--Nysal
Re: [patch V3 14/18] posix-timers: Avoid false cacheline sharing
Posted by David Laight 9 months, 1 week ago
On Sat,  8 Mar 2025 17:48:42 +0100 (CET)
Thomas Gleixner <tglx@linutronix.de> wrote:

> struct k_itimer has the hlist_node, which is used for lookup in the hash
> bucket, and the timer lock in the same cache line.
> 
> That's obviously bad, if one CPU fiddles with a timer and the other is
> walking the hash bucket on which that timer is queued.
> 
> Avoid this by restructuring struct k_itimer, so that the read mostly (only
> modified during setup and teardown) fields are in the first cache line and
> the lock and the rest of the fields which get written to are in cacheline
> 2-N.

How big is the structure?
If I count it correctly the first 'cacheline' is 64 bytes on 64bit
(and somewhat smaller on 32bit - if anyone cares).

But there are some CPUs (probably ppc) with quite large cache lines.
In that case you either need to waste the space by aligning the 2nd
part of the structure into an actual cache line, or just align the
structure to a 64 byte boundary.

	David

> 
> Reduces cacheline contention in a test case of 64 processes creating and
> accessing 20000 timers each by almost 30% according to perf.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> ---
> V2: New patch
> ---
>  include/linux/posix-timers.h |   21 ++++++++++++---------
>  kernel/time/posix-timers.c   |    4 ++--
>  2 files changed, 14 insertions(+), 11 deletions(-)
> 
> --- a/include/linux/posix-timers.h
> +++ b/include/linux/posix-timers.h
> @@ -177,23 +177,26 @@ static inline void posix_cputimers_init_
>   * @rcu:		RCU head for freeing the timer.
>   */
>  struct k_itimer {
> -	struct hlist_node	list;
> -	struct hlist_node	ignored_list;
> +	/* 1st cacheline contains read-mostly fields */
>  	struct hlist_node	t_hash;
> -	spinlock_t		it_lock;
> -	const struct k_clock	*kclock;
> -	clockid_t		it_clock;
> +	struct hlist_node	list;
>  	timer_t			it_id;
> +	clockid_t		it_clock;
> +	int			it_sigev_notify;
> +	enum pid_type		it_pid_type;
> +	struct signal_struct	*it_signal;
> +	const struct k_clock	*kclock;
> +
> +	/* 2nd cacheline and above contain fields which are modified regularly */
> +	spinlock_t		it_lock;
>  	int			it_status;
>  	bool			it_sig_periodic;
>  	s64			it_overrun;
>  	s64			it_overrun_last;
>  	unsigned int		it_signal_seq;
>  	unsigned int		it_sigqueue_seq;
> -	int			it_sigev_notify;
> -	enum pid_type		it_pid_type;
>  	ktime_t			it_interval;
> -	struct signal_struct	*it_signal;
> +	struct hlist_node	ignored_list;
>  	union {
>  		struct pid		*it_pid;
>  		struct task_struct	*it_process;
> @@ -210,7 +213,7 @@ struct k_itimer {
>  		} alarm;
>  	} it;
>  	struct rcu_head		rcu;
> -};
> +} ____cacheline_aligned_in_smp;
>  
>  void run_posix_cpu_timers(void);
>  void posix_cpu_timers_exit(struct task_struct *task);
> --- a/kernel/time/posix-timers.c
> +++ b/kernel/time/posix-timers.c
> @@ -260,8 +260,8 @@ static int posix_get_hrtimer_res(clockid
>  
>  static __init int init_posix_timers(void)
>  {
> -	posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), 0,
> -					       SLAB_ACCOUNT, NULL);
> +	posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
> +					       __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
>  	return 0;
>  }
>  __initcall(init_posix_timers);
> 
>
Re: [patch V3 14/18] posix-timers: Avoid false cacheline sharing
Posted by Frederic Weisbecker 9 months, 1 week ago
Le Sat, Mar 08, 2025 at 05:48:42PM +0100, Thomas Gleixner a écrit :
> struct k_itimer has the hlist_node, which is used for lookup in the hash
> bucket, and the timer lock in the same cache line.
> 
> That's obviously bad, if one CPU fiddles with a timer and the other is
> walking the hash bucket on which that timer is queued.
> 
> Avoid this by restructuring struct k_itimer, so that the read mostly (only
> modified during setup and teardown) fields are in the first cache line and
> the lock and the rest of the fields which get written to are in cacheline
> 2-N.
> 
> Reduces cacheline contention in a test case of 64 processes creating and
> accessing 20000 timers each by almost 30% according to perf.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Impressive what a field reshuffle and alignment can achieve!

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
[tip: timers/core] posix-timers: Avoid false cacheline sharing
Posted by tip-bot2 for Thomas Gleixner 9 months, 1 week ago
The following commit has been merged into the timers/core branch of tip:

Commit-ID:     5fa75a432f1a6b1402edd8802ecc14f8bbb90e49
Gitweb:        https://git.kernel.org/tip/5fa75a432f1a6b1402edd8802ecc14f8bbb90e49
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Sat, 08 Mar 2025 17:48:42 +01:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Thu, 13 Mar 2025 12:07:18 +01:00

posix-timers: Avoid false cacheline sharing

struct k_itimer has the hlist_node, which is used for lookup in the hash
bucket, and the timer lock in the same cache line.

That's obviously bad, if one CPU fiddles with a timer and the other is
walking the hash bucket on which that timer is queued.

Avoid this by restructuring struct k_itimer, so that the read mostly (only
modified during setup and teardown) fields are in the first cache line and
the lock and the rest of the fields which get written to are in cacheline
2-N.

Reduces cacheline contention in a test case of 64 processes creating and
accessing 20000 timers each by almost 30% according to perf.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/20250308155624.341108067@linutronix.de


---
 include/linux/posix-timers.h | 21 ++++++++++++---------
 kernel/time/posix-timers.c   |  4 ++--
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index e714a55..094ef57 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -177,23 +177,26 @@ static inline void posix_cputimers_init_work(void) { }
  * @rcu:		RCU head for freeing the timer.
  */
 struct k_itimer {
-	struct hlist_node	list;
-	struct hlist_node	ignored_list;
+	/* 1st cacheline contains read-mostly fields */
 	struct hlist_node	t_hash;
-	spinlock_t		it_lock;
-	const struct k_clock	*kclock;
-	clockid_t		it_clock;
+	struct hlist_node	list;
 	timer_t			it_id;
+	clockid_t		it_clock;
+	int			it_sigev_notify;
+	enum pid_type		it_pid_type;
+	struct signal_struct	*it_signal;
+	const struct k_clock	*kclock;
+
+	/* 2nd cacheline and above contain fields which are modified regularly */
+	spinlock_t		it_lock;
 	int			it_status;
 	bool			it_sig_periodic;
 	s64			it_overrun;
 	s64			it_overrun_last;
 	unsigned int		it_signal_seq;
 	unsigned int		it_sigqueue_seq;
-	int			it_sigev_notify;
-	enum pid_type		it_pid_type;
 	ktime_t			it_interval;
-	struct signal_struct	*it_signal;
+	struct hlist_node	ignored_list;
 	union {
 		struct pid		*it_pid;
 		struct task_struct	*it_process;
@@ -210,7 +213,7 @@ struct k_itimer {
 		} alarm;
 	} it;
 	struct rcu_head		rcu;
-};
+} ____cacheline_aligned_in_smp;
 
 void run_posix_cpu_timers(void);
 void posix_cpu_timers_exit(struct task_struct *task);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c4cee3..e4c92f4 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -260,8 +260,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
 
 static __init int init_posix_timers(void)
 {
-	posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), 0,
-					       SLAB_ACCOUNT, NULL);
+	posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
+					       __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
 	return 0;
 }
 __initcall(init_posix_timers);