[v2] timekeeping: Rework to prepare support of independent PTP clocks

[PATCH v2 06/25] timekeeping: Reorder struct timekeeper

Posted by Anna-Maria Behnsen 1 month, 2 weeks ago

From: Thomas Gleixner <tglx@linutronix.de>

struct timekeeper is ordered suboptimal vs. cachelines. The layout,
including the preceding seqcount (see struct tk_core in timekeeper.c) is:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   tkr_raw, xtime_sec
 cacheline 2:   ktime_sec ... tai_offset, internal variables
 cacheline 3:	next_leap_ktime, raw_sec, internal variables
 cacheline 4:	internal variables

So any access via ktime_get*() except for access to CLOCK_MONOTONIC_RAW
will use either cachelines 0 + 1 or cachelines 0 + 2. Access to
CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.

Reorder the members so that the result is more efficient:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   xtime_sec, ktime_sec ... tai_offset
 cacheline 2:	tkr_raw, raw_sec
 cacheline 3:	internal variables
 cacheline 4:	internal variables

That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW
access will use cachelines 0 + 2.

Update kernel-doc and fix formatting issues while at it. Also fix a typo
in struct tk_read_base kernel-doc.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
---
 include/linux/timekeeper_internal.h | 102 +++++++++++++++++++++---------------
 1 file changed, 61 insertions(+), 41 deletions(-)

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 902c20ef495a..430e40549136 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -26,7 +26,7 @@
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessors.
+ * for the fast NMI safe accessors.
  *
  * @base_real is for the fast NMI safe accessor to allow reading clock
  * realtime from any context.
@@ -44,33 +44,41 @@ struct tk_read_base {
 
 /**
  * struct timekeeper - Structure holding internal timekeeping values.
- * @tkr_mono:		The readout base structure for CLOCK_MONOTONIC
- * @tkr_raw:		The readout base structure for CLOCK_MONOTONIC_RAW
- * @xtime_sec:		Current CLOCK_REALTIME time in seconds
- * @ktime_sec:		Current CLOCK_MONOTONIC time in seconds
- * @wall_to_monotonic:	CLOCK_REALTIME to CLOCK_MONOTONIC offset
- * @offs_real:		Offset clock monotonic -> clock realtime
- * @offs_boot:		Offset clock monotonic -> clock boottime
- * @offs_tai:		Offset clock monotonic -> clock tai
- * @tai_offset:		The current UTC to TAI offset in seconds
- * @clock_was_set_seq:	The sequence number of clock was set events
- * @cs_was_changed_seq:	The sequence number of clocksource change events
- * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
- * @raw_sec:		CLOCK_MONOTONIC_RAW  time in seconds
- * @monotonic_to_boot:	CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
- * @cycle_interval:	Number of clock cycles in one NTP interval
- * @xtime_interval:	Number of clock shifted nano seconds in one NTP
- *			interval.
- * @xtime_remainder:	Shifted nano seconds left over when rounding
- *			@cycle_interval
- * @raw_interval:	Shifted raw nano seconds accumulated per NTP interval.
- * @ntp_error:		Difference between accumulated time and NTP time in ntp
- *			shifted nano seconds.
- * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
- *			ntp shifted nano seconds.
- * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
- * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
- * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
+ * @tkr_mono:			The readout base structure for CLOCK_MONOTONIC
+ * @xtime_sec:			Current CLOCK_REALTIME time in seconds
+ * @ktime_sec:			Current CLOCK_MONOTONIC time in seconds
+ * @wall_to_monotonic:		CLOCK_REALTIME to CLOCK_MONOTONIC offset
+ * @offs_real:			Offset clock monotonic -> clock realtime
+ * @offs_boot:			Offset clock monotonic -> clock boottime
+ * @offs_tai:			Offset clock monotonic -> clock tai
+ * @tai_offset:			The current UTC to TAI offset in seconds
+ * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
+ * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
+ * @clock_was_set_seq:		The sequence number of clock was set events
+ * @cs_was_changed_seq:		The sequence number of clocksource change events
+ * @monotonic_to_boot:		CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
+ * @cycle_interval:		Number of clock cycles in one NTP interval
+ * @xtime_interval:		Number of clock shifted nano seconds in one NTP
+ *				interval.
+ * @xtime_remainder:		Shifted nano seconds left over when rounding
+ *				@cycle_interval
+ * @raw_interval:		Shifted raw nano seconds accumulated per NTP interval.
+ * @next_leap_ktime:		CLOCK_MONOTONIC time value of a pending leap-second
+ * @ntp_tick:			The ntp_tick_length() value currently being
+ *				used. This cached copy ensures we consistently
+ *				apply the tick length for an entire tick, as
+ *				ntp_tick_length may change mid-tick, and we don't
+ *				want to apply that new value to the tick in
+ *				progress.
+ * @ntp_error:			Difference between accumulated time and NTP time in ntp
+ *				shifted nano seconds.
+ * @ntp_error_shift:		Shift conversion between clock shifted nano seconds and
+ *				ntp shifted nano seconds.
+ * @ntp_err_mult:		Multiplication factor for scaled math conversion
+ * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
+ * @last_warning:		Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:		Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:		Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -88,10 +96,25 @@ struct tk_read_base {
  *
  * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
  * accelerate the VDSO update for CLOCK_BOOTTIME.
+ *
+ * The cacheline ordering of the structure is optimized for in kernel usage
+ * of the ktime_get() and ktime_get_ts64() family of time accessors. Struct
+ * timekeeper is prepended in the core timekeeping code with a sequence
+ * count, which results in the following cacheline layout:
+ *
+ * 0:	seqcount, tkr_mono
+ * 1:	xtime_sec ... tai_offset
+ * 2:	tkr_raw, raw_sec
+ * 3,4: Internal variables
+ *
+ * Cacheline 0,1 contain the data which is used for accessing
+ * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
+ * data for accessing CLOCK_MONOTONIC_RAW.  Cacheline 3,4 are internal
+ * variables which are only accessed during timekeeper updates once per
+ * tick.
  */
 struct timekeeper {
 	struct tk_read_base	tkr_mono;
-	struct tk_read_base	tkr_raw;
 	u64			xtime_sec;
 	unsigned long		ktime_sec;
 	struct timespec64	wall_to_monotonic;
@@ -99,31 +122,28 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+
+	struct tk_read_base	tkr_raw;
+	u64			raw_sec;
+
+	/* The following members are for timekeeping internal use */
 	unsigned int		clock_was_set_seq;
 	u8			cs_was_changed_seq;
-	ktime_t			next_leap_ktime;
-	u64			raw_sec;
+
 	struct timespec64	monotonic_to_boot;
 
-	/* The following members are for timekeeping internal use */
 	u64			cycle_interval;
 	u64			xtime_interval;
 	s64			xtime_remainder;
 	u64			raw_interval;
-	/* The ntp_tick_length() value currently being used.
-	 * This cached copy ensures we consistently apply the tick
-	 * length for an entire tick, as ntp_tick_length may change
-	 * mid-tick, and we don't want to apply that new value to
-	 * the tick in progress.
-	 */
+
+	ktime_t			next_leap_ktime;
 	u64			ntp_tick;
-	/* Difference between accumulated time and NTP time in ntp
-	 * shifted nano seconds. */
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
-	/* Flag used to avoid updating NTP twice with same second */
 	u32			skip_second_overflow;
+
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 	long			last_warning;
 	/*

-- 
2.39.5

Re: [PATCH v2 06/25] timekeeping: Reorder struct timekeeper

Posted by John Stultz 1 month, 2 weeks ago

On Wed, Oct 9, 2024 at 1:29 AM Anna-Maria Behnsen
<anna-maria@linutronix.de> wrote:
>
> From: Thomas Gleixner <tglx@linutronix.de>
>
> From: Thomas Gleixner <tglx@linutronix.de>
>
> struct timekeeper is ordered suboptimal vs. cachelines. The layout,
> including the preceding seqcount (see struct tk_core in timekeeper.c) is:
>
>  cacheline 0:   seqcount, tkr_mono
>  cacheline 1:   tkr_raw, xtime_sec
>  cacheline 2:   ktime_sec ... tai_offset, internal variables
>  cacheline 3:   next_leap_ktime, raw_sec, internal variables
>  cacheline 4:   internal variables
>
> So any access to via ktime_get*() except for access to CLOCK_MONOTONIC_RAW
> will use either cachelines 0 + 1 or cachelines 0 + 2. Access to
> CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.
>
> Reorder the members so that the result is more efficient:
>
>  cacheline 0:   seqcount, tkr_mono
>  cacheline 1:   xtime_sec, ktime_sec ... tai_offset
>  cacheline 2:   tkr_raw, raw_sec
>  cacheline 3:   internal variables
>  cacheline 4:   internal variables
>
> That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW
> access will use cachelines 0 + 2.
>
> Update kernel-doc and fix formatting issues while at it. Also fix a typo
> in struct tk_read_base kernel-doc.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>

Acked-by: John Stultz <jstultz@google.com>

> ---
>  include/linux/timekeeper_internal.h | 102 +++++++++++++++++++++---------------
>  1 file changed, 61 insertions(+), 41 deletions(-)
>
> diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
> index 902c20ef495a..430e40549136 100644
> --- a/include/linux/timekeeper_internal.h
> +++ b/include/linux/timekeeper_internal.h
> @@ -26,7 +26,7 @@
>   * occupies a single 64byte cache line.
>   *
>   * The struct is separate from struct timekeeper as it is also used
> - * for a fast NMI safe accessors.
> + * for the fast NMI safe accessors.
>   *
>   * @base_real is for the fast NMI safe accessor to allow reading clock
>   * realtime from any context.
> @@ -44,33 +44,41 @@ struct tk_read_base {
>
>  /**
>   * struct timekeeper - Structure holding internal timekeeping values.
> - * @tkr_mono:          The readout base structure for CLOCK_MONOTONIC
> - * @tkr_raw:           The readout base structure for CLOCK_MONOTONIC_RAW
> - * @xtime_sec:         Current CLOCK_REALTIME time in seconds
> - * @ktime_sec:         Current CLOCK_MONOTONIC time in seconds
> - * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset
> - * @offs_real:         Offset clock monotonic -> clock realtime
> - * @offs_boot:         Offset clock monotonic -> clock boottime
> - * @offs_tai:          Offset clock monotonic -> clock tai
> - * @tai_offset:                The current UTC to TAI offset in seconds
> - * @clock_was_set_seq: The sequence number of clock was set events
> - * @cs_was_changed_seq:        The sequence number of clocksource change events
> - * @next_leap_ktime:   CLOCK_MONOTONIC time value of a pending leap-second
> - * @raw_sec:           CLOCK_MONOTONIC_RAW  time in seconds
> - * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
> - * @cycle_interval:    Number of clock cycles in one NTP interval
> - * @xtime_interval:    Number of clock shifted nano seconds in one NTP
> - *                     interval.
> - * @xtime_remainder:   Shifted nano seconds left over when rounding
> - *                     @cycle_interval
> - * @raw_interval:      Shifted raw nano seconds accumulated per NTP interval.
> - * @ntp_error:         Difference between accumulated time and NTP time in ntp
> - *                     shifted nano seconds.
> - * @ntp_error_shift:   Shift conversion between clock shifted nano seconds and
> - *                     ntp shifted nano seconds.
> - * @last_warning:      Warning ratelimiter (DEBUG_TIMEKEEPING)
> - * @underflow_seen:    Underflow warning flag (DEBUG_TIMEKEEPING)
> - * @overflow_seen:     Overflow warning flag (DEBUG_TIMEKEEPING)
> + * @tkr_mono:                  The readout base structure for CLOCK_MONOTONIC
> + * @xtime_sec:                 Current CLOCK_REALTIME time in seconds
> + * @ktime_sec:                 Current CLOCK_MONOTONIC time in seconds
> + * @wall_to_monotonic:         CLOCK_REALTIME to CLOCK_MONOTONIC offset
> + * @offs_real:                 Offset clock monotonic -> clock realtime
> + * @offs_boot:                 Offset clock monotonic -> clock boottime
> + * @offs_tai:                  Offset clock monotonic -> clock tai
> + * @tai_offset:                        The current UTC to TAI offset in seconds
> + * @tkr_raw:                   The readout base structure for CLOCK_MONOTONIC_RAW
> + * @raw_sec:                   CLOCK_MONOTONIC_RAW  time in seconds
> + * @clock_was_set_seq:         The sequence number of clock was set events
> + * @cs_was_changed_seq:                The sequence number of clocksource change events
> + * @monotonic_to_boot:         CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
> + * @cycle_interval:            Number of clock cycles in one NTP interval
> + * @xtime_interval:            Number of clock shifted nano seconds in one NTP
> + *                             interval.
> + * @xtime_remainder:           Shifted nano seconds left over when rounding
> + *                             @cycle_interval
> + * @raw_interval:              Shifted raw nano seconds accumulated per NTP interval.
> + * @next_leap_ktime:           CLOCK_MONOTONIC time value of a pending leap-second
> + * @ntp_tick:                  The ntp_tick_length() value currently being
> + *                             used. This cached copy ensures we consistently
> + *                             apply the tick length for an entire tick, as
> + *                             ntp_tick_length may change mid-tick, and we don't
> + *                             want to apply that new value to the tick in
> + *                             progress.
> + * @ntp_error:                 Difference between accumulated time and NTP time in ntp
> + *                             shifted nano seconds.
> + * @ntp_error_shift:           Shift conversion between clock shifted nano seconds and
> + *                             ntp shifted nano seconds.
> + * @ntp_err_mult:              Multiplication factor for scaled math conversion
> + * @skip_second_overflow:      Flag used to avoid updating NTP twice with same second
> + * @last_warning:              Warning ratelimiter (DEBUG_TIMEKEEPING)
> + * @underflow_seen:            Underflow warning flag (DEBUG_TIMEKEEPING)
> + * @overflow_seen:             Overflow warning flag (DEBUG_TIMEKEEPING)
>   *
>   * Note: For timespec(64) based interfaces wall_to_monotonic is what
>   * we need to add to xtime (or xtime corrected for sub jiffy times)
> @@ -88,10 +96,25 @@ struct tk_read_base {
>   *
>   * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
>   * accelerate the VDSO update for CLOCK_BOOTTIME.
> + *
> + * The cacheline ordering of the structure is optimized for in kernel usage
> + * of the ktime_get() and ktime_get_ts64() family of time accessors. Struct
> + * timekeeper is prepended in the core timekeeeping code with a sequence
> + * count, which results in the following cacheline layout:
> + *
> + * 0:  seqcount, tkr_mono
> + * 1:  xtime_sec ... tai_offset
> + * 2:  tkr_raw, raw_sec
> + * 3,4: Internal variables
> + *
> + * Cacheline 0,1 contain the data which is used for accessing
> + * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
> + * data for accessing CLOCK_MONOTONIC_RAW.  Cacheline 3,4 are internal
> + * variables which are only accessed during timekeeper updates once per
> + * tick.

Would it make sense to add divider comments or something in the struct
to make this more visible? I fret in the context of a patch, a + line
adding a new structure element that breaks the ordered alignment might
not be obvious.

thanks
-john

Re: [PATCH v2 06/25] timekeeping: Reorder struct timekeeper

Posted by Anna-Maria Behnsen 1 month, 2 weeks ago

John Stultz <jstultz@google.com> writes:

> On Wed, Oct 9, 2024 at 1:29 AM Anna-Maria Behnsen
> <anna-maria@linutronix.de> wrote:
>>
>> From: Thomas Gleixner <tglx@linutronix.de>
>>
>> From: Thomas Gleixner <tglx@linutronix.de>
>>
>> struct timekeeper is ordered suboptimal vs. cachelines. The layout,
>> including the preceding seqcount (see struct tk_core in timekeeper.c) is:
>>
>>  cacheline 0:   seqcount, tkr_mono
>>  cacheline 1:   tkr_raw, xtime_sec
>>  cacheline 2:   ktime_sec ... tai_offset, internal variables
>>  cacheline 3:   next_leap_ktime, raw_sec, internal variables
>>  cacheline 4:   internal variables
>>
>> So any access to via ktime_get*() except for access to CLOCK_MONOTONIC_RAW
>> will use either cachelines 0 + 1 or cachelines 0 + 2. Access to
>> CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.
>>
>> Reorder the members so that the result is more efficient:
>>
>>  cacheline 0:   seqcount, tkr_mono
>>  cacheline 1:   xtime_sec, ktime_sec ... tai_offset
>>  cacheline 2:   tkr_raw, raw_sec
>>  cacheline 3:   internal variables
>>  cacheline 4:   internal variables
>>
>> That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW
>> access will use cachelines 0 + 2.
>>
>> Update kernel-doc and fix formatting issues while at it. Also fix a typo
>> in struct tk_read_base kernel-doc.
>>
>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>> Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
>
> Acked-by: John Stultz <jstultz@google.com>
>
>> ---
>>  include/linux/timekeeper_internal.h | 102 +++++++++++++++++++++---------------
>>  1 file changed, 61 insertions(+), 41 deletions(-)
>>
>> diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
>> index 902c20ef495a..430e40549136 100644
>> --- a/include/linux/timekeeper_internal.h
>> +++ b/include/linux/timekeeper_internal.h
>> @@ -26,7 +26,7 @@
>>   * occupies a single 64byte cache line.
>>   *
>>   * The struct is separate from struct timekeeper as it is also used
>> - * for a fast NMI safe accessors.
>> + * for the fast NMI safe accessors.
>>   *
>>   * @base_real is for the fast NMI safe accessor to allow reading clock
>>   * realtime from any context.
>> @@ -44,33 +44,41 @@ struct tk_read_base {
>>
>>  /**
>>   * struct timekeeper - Structure holding internal timekeeping values.
>> - * @tkr_mono:          The readout base structure for CLOCK_MONOTONIC
>> - * @tkr_raw:           The readout base structure for CLOCK_MONOTONIC_RAW
>> - * @xtime_sec:         Current CLOCK_REALTIME time in seconds
>> - * @ktime_sec:         Current CLOCK_MONOTONIC time in seconds
>> - * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset
>> - * @offs_real:         Offset clock monotonic -> clock realtime
>> - * @offs_boot:         Offset clock monotonic -> clock boottime
>> - * @offs_tai:          Offset clock monotonic -> clock tai
>> - * @tai_offset:                The current UTC to TAI offset in seconds
>> - * @clock_was_set_seq: The sequence number of clock was set events
>> - * @cs_was_changed_seq:        The sequence number of clocksource change events
>> - * @next_leap_ktime:   CLOCK_MONOTONIC time value of a pending leap-second
>> - * @raw_sec:           CLOCK_MONOTONIC_RAW  time in seconds
>> - * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
>> - * @cycle_interval:    Number of clock cycles in one NTP interval
>> - * @xtime_interval:    Number of clock shifted nano seconds in one NTP
>> - *                     interval.
>> - * @xtime_remainder:   Shifted nano seconds left over when rounding
>> - *                     @cycle_interval
>> - * @raw_interval:      Shifted raw nano seconds accumulated per NTP interval.
>> - * @ntp_error:         Difference between accumulated time and NTP time in ntp
>> - *                     shifted nano seconds.
>> - * @ntp_error_shift:   Shift conversion between clock shifted nano seconds and
>> - *                     ntp shifted nano seconds.
>> - * @last_warning:      Warning ratelimiter (DEBUG_TIMEKEEPING)
>> - * @underflow_seen:    Underflow warning flag (DEBUG_TIMEKEEPING)
>> - * @overflow_seen:     Overflow warning flag (DEBUG_TIMEKEEPING)
>> + * @tkr_mono:                  The readout base structure for CLOCK_MONOTONIC
>> + * @xtime_sec:                 Current CLOCK_REALTIME time in seconds
>> + * @ktime_sec:                 Current CLOCK_MONOTONIC time in seconds
>> + * @wall_to_monotonic:         CLOCK_REALTIME to CLOCK_MONOTONIC offset
>> + * @offs_real:                 Offset clock monotonic -> clock realtime
>> + * @offs_boot:                 Offset clock monotonic -> clock boottime
>> + * @offs_tai:                  Offset clock monotonic -> clock tai
>> + * @tai_offset:                        The current UTC to TAI offset in seconds
>> + * @tkr_raw:                   The readout base structure for CLOCK_MONOTONIC_RAW
>> + * @raw_sec:                   CLOCK_MONOTONIC_RAW  time in seconds
>> + * @clock_was_set_seq:         The sequence number of clock was set events
>> + * @cs_was_changed_seq:                The sequence number of clocksource change events
>> + * @monotonic_to_boot:         CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
>> + * @cycle_interval:            Number of clock cycles in one NTP interval
>> + * @xtime_interval:            Number of clock shifted nano seconds in one NTP
>> + *                             interval.
>> + * @xtime_remainder:           Shifted nano seconds left over when rounding
>> + *                             @cycle_interval
>> + * @raw_interval:              Shifted raw nano seconds accumulated per NTP interval.
>> + * @next_leap_ktime:           CLOCK_MONOTONIC time value of a pending leap-second
>> + * @ntp_tick:                  The ntp_tick_length() value currently being
>> + *                             used. This cached copy ensures we consistently
>> + *                             apply the tick length for an entire tick, as
>> + *                             ntp_tick_length may change mid-tick, and we don't
>> + *                             want to apply that new value to the tick in
>> + *                             progress.
>> + * @ntp_error:                 Difference between accumulated time and NTP time in ntp
>> + *                             shifted nano seconds.
>> + * @ntp_error_shift:           Shift conversion between clock shifted nano seconds and
>> + *                             ntp shifted nano seconds.
>> + * @ntp_err_mult:              Multiplication factor for scaled math conversion
>> + * @skip_second_overflow:      Flag used to avoid updating NTP twice with same second
>> + * @last_warning:              Warning ratelimiter (DEBUG_TIMEKEEPING)
>> + * @underflow_seen:            Underflow warning flag (DEBUG_TIMEKEEPING)
>> + * @overflow_seen:             Overflow warning flag (DEBUG_TIMEKEEPING)
>>   *
>>   * Note: For timespec(64) based interfaces wall_to_monotonic is what
>>   * we need to add to xtime (or xtime corrected for sub jiffy times)
>> @@ -88,10 +96,25 @@ struct tk_read_base {
>>   *
>>   * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
>>   * accelerate the VDSO update for CLOCK_BOOTTIME.
>> + *
>> + * The cacheline ordering of the structure is optimized for in kernel usage
>> + * of the ktime_get() and ktime_get_ts64() family of time accessors. Struct
>> + * timekeeper is prepended in the core timekeeeping code with a sequence
>> + * count, which results in the following cacheline layout:
>> + *
>> + * 0:  seqcount, tkr_mono
>> + * 1:  xtime_sec ... tai_offset
>> + * 2:  tkr_raw, raw_sec
>> + * 3,4: Internal variables
>> + *
>> + * Cacheline 0,1 contain the data which is used for accessing
>> + * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
>> + * data for accessing CLOCK_MONOTONIC_RAW.  Cacheline 3,4 are internal
>> + * variables which are only accessed during timekeeper updates once per
>> + * tick.
>
> Would it make sense to add divider comments or something in the struct
> to make this more visible? I fret in the context of a patch, a + line
> adding a new structure element that breaks the ordered alignment might
> not be obvious.

That is a good argument! I'll add simple comments with /* Cacheline X: */

Thanks,

	Anna-Maria

[PATCH v2a] timekeeping: Reorder struct timekeeper

Posted by Anna-Maria Behnsen 1 month, 1 week ago

From: Thomas Gleixner <tglx@linutronix.de>

struct timekeeper is ordered suboptimal vs. cachelines. The layout,
including the preceding seqcount (see struct tk_core in timekeeper.c) is:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   tkr_raw, xtime_sec
 cacheline 2:   ktime_sec ... tai_offset, internal variables
 cacheline 3:	next_leap_ktime, raw_sec, internal variables
 cacheline 4:	internal variables

So any access via ktime_get*() except for access to CLOCK_MONOTONIC_RAW
will use either cachelines 0 + 1 or cachelines 0 + 2. Access to
CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.

Reorder the members so that the result is more efficient:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   xtime_sec, ktime_sec ... tai_offset
 cacheline 2:	tkr_raw, raw_sec
 cacheline 3:	internal variables
 cacheline 4:	internal variables

That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW
access will use cachelines 0 + 2.

Update kernel-doc and fix formatting issues while at it. Also fix a typo
in struct tk_read_base kernel-doc.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
---
 include/linux/timekeeper_internal.h | 106 +++++++++++++++++-----------
 1 file changed, 65 insertions(+), 41 deletions(-)

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 902c20ef495a..a3b6380a7777 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -26,7 +26,7 @@
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessors.
+ * for the fast NMI safe accessors.
  *
  * @base_real is for the fast NMI safe accessor to allow reading clock
  * realtime from any context.
@@ -44,33 +44,41 @@ struct tk_read_base {
 
 /**
  * struct timekeeper - Structure holding internal timekeeping values.
- * @tkr_mono:		The readout base structure for CLOCK_MONOTONIC
- * @tkr_raw:		The readout base structure for CLOCK_MONOTONIC_RAW
- * @xtime_sec:		Current CLOCK_REALTIME time in seconds
- * @ktime_sec:		Current CLOCK_MONOTONIC time in seconds
- * @wall_to_monotonic:	CLOCK_REALTIME to CLOCK_MONOTONIC offset
- * @offs_real:		Offset clock monotonic -> clock realtime
- * @offs_boot:		Offset clock monotonic -> clock boottime
- * @offs_tai:		Offset clock monotonic -> clock tai
- * @tai_offset:		The current UTC to TAI offset in seconds
- * @clock_was_set_seq:	The sequence number of clock was set events
- * @cs_was_changed_seq:	The sequence number of clocksource change events
- * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
- * @raw_sec:		CLOCK_MONOTONIC_RAW  time in seconds
- * @monotonic_to_boot:	CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
- * @cycle_interval:	Number of clock cycles in one NTP interval
- * @xtime_interval:	Number of clock shifted nano seconds in one NTP
- *			interval.
- * @xtime_remainder:	Shifted nano seconds left over when rounding
- *			@cycle_interval
- * @raw_interval:	Shifted raw nano seconds accumulated per NTP interval.
- * @ntp_error:		Difference between accumulated time and NTP time in ntp
- *			shifted nano seconds.
- * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
- *			ntp shifted nano seconds.
- * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
- * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
- * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
+ * @tkr_mono:			The readout base structure for CLOCK_MONOTONIC
+ * @xtime_sec:			Current CLOCK_REALTIME time in seconds
+ * @ktime_sec:			Current CLOCK_MONOTONIC time in seconds
+ * @wall_to_monotonic:		CLOCK_REALTIME to CLOCK_MONOTONIC offset
+ * @offs_real:			Offset clock monotonic -> clock realtime
+ * @offs_boot:			Offset clock monotonic -> clock boottime
+ * @offs_tai:			Offset clock monotonic -> clock tai
+ * @tai_offset:			The current UTC to TAI offset in seconds
+ * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
+ * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
+ * @clock_was_set_seq:		The sequence number of clock was set events
+ * @cs_was_changed_seq:		The sequence number of clocksource change events
+ * @monotonic_to_boot:		CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
+ * @cycle_interval:		Number of clock cycles in one NTP interval
+ * @xtime_interval:		Number of clock shifted nano seconds in one NTP
+ *				interval.
+ * @xtime_remainder:		Shifted nano seconds left over when rounding
+ *				@cycle_interval
+ * @raw_interval:		Shifted raw nano seconds accumulated per NTP interval.
+ * @next_leap_ktime:		CLOCK_MONOTONIC time value of a pending leap-second
+ * @ntp_tick:			The ntp_tick_length() value currently being
+ *				used. This cached copy ensures we consistently
+ *				apply the tick length for an entire tick, as
+ *				ntp_tick_length may change mid-tick, and we don't
+ *				want to apply that new value to the tick in
+ *				progress.
+ * @ntp_error:			Difference between accumulated time and NTP time in ntp
+ *				shifted nano seconds.
+ * @ntp_error_shift:		Shift conversion between clock shifted nano seconds and
+ *				ntp shifted nano seconds.
+ * @ntp_err_mult:		Multiplication factor for scaled math conversion
+ * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
+ * @last_warning:		Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:		Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:		Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -88,10 +96,28 @@ struct tk_read_base {
  *
  * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
  * accelerate the VDSO update for CLOCK_BOOTTIME.
+ *
+ * The cacheline ordering of the structure is optimized for in kernel usage of
+ * the ktime_get() and ktime_get_ts64() family of time accessors. Struct
+ * timekeeper is prepended in the core timekeeping code with a sequence count,
+ * which results in the following cacheline layout:
+ *
+ * 0:	seqcount, tkr_mono
+ * 1:	xtime_sec ... tai_offset
+ * 2:	tkr_raw, raw_sec
+ * 3,4: Internal variables
+ *
+ * Cacheline 0,1 contain the data which is used for accessing
+ * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
+ * data for accessing CLOCK_MONOTONIC_RAW.  Cacheline 3,4 are internal
+ * variables which are only accessed during timekeeper updates once per
+ * tick.
  */
 struct timekeeper {
+	/* Cacheline 0 (together with prepended seqcount of timekeeper core): */
 	struct tk_read_base	tkr_mono;
-	struct tk_read_base	tkr_raw;
+
+	/* Cacheline 1: */
 	u64			xtime_sec;
 	unsigned long		ktime_sec;
 	struct timespec64	wall_to_monotonic;
@@ -99,31 +125,29 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+
+	/* Cacheline 2: */
+	struct tk_read_base	tkr_raw;
+	u64			raw_sec;
+
+	/* Cacheline 3 and 4 (timekeeping internal variables): */
 	unsigned int		clock_was_set_seq;
 	u8			cs_was_changed_seq;
-	ktime_t			next_leap_ktime;
-	u64			raw_sec;
+
 	struct timespec64	monotonic_to_boot;
 
-	/* The following members are for timekeeping internal use */
 	u64			cycle_interval;
 	u64			xtime_interval;
 	s64			xtime_remainder;
 	u64			raw_interval;
-	/* The ntp_tick_length() value currently being used.
-	 * This cached copy ensures we consistently apply the tick
-	 * length for an entire tick, as ntp_tick_length may change
-	 * mid-tick, and we don't want to apply that new value to
-	 * the tick in progress.
-	 */
+
+	ktime_t			next_leap_ktime;
 	u64			ntp_tick;
-	/* Difference between accumulated time and NTP time in ntp
-	 * shifted nano seconds. */
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
-	/* Flag used to avoid updating NTP twice with same second */
 	u32			skip_second_overflow;
+
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 	long			last_warning;
 	/*
-- 
2.39.5

[tip: timers/core] timekeeping: Reorder struct timekeeper

Posted by tip-bot2 for Thomas Gleixner 1 month ago

The following commit has been merged into the timers/core branch of tip:

Commit-ID:     6860d28ccb2390b4eeda32ab2ce7eb10f71921e1
Gitweb:        https://git.kernel.org/tip/6860d28ccb2390b4eeda32ab2ce7eb10f71921e1
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Tue, 15 Oct 2024 12:08:39 +02:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Fri, 25 Oct 2024 19:49:13 +02:00

timekeeping: Reorder struct timekeeper

struct timekeeper is ordered suboptimally vs. cachelines. The layout,
including the preceding seqcount (see struct tk_core in timekeeper.c) is:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   tkr_raw, xtime_sec
 cacheline 2:   ktime_sec ... tai_offset, internal variables
 cacheline 3:	next_leap_ktime, raw_sec, internal variables
 cacheline 4:	internal variables

So any access via ktime_get*(), except for access to CLOCK_MONOTONIC_RAW,
will use either cachelines 0 + 1 or cachelines 0 + 2. Access to
CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.

Reorder the members so that the result is more efficient:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   xtime_sec, ktime_sec ... tai_offset
 cacheline 2:	tkr_raw, raw_sec
 cacheline 3:	internal variables
 cacheline 4:	internal variables

That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW
access will use cachelines 0 + 2.

Update kernel-doc and fix formatting issues while at it. Also fix a typo
in struct tk_read_base kernel-doc.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241015100839.12702-1-anna-maria@linutronix.de

---
 include/linux/timekeeper_internal.h | 106 ++++++++++++++++-----------
 1 file changed, 65 insertions(+), 41 deletions(-)

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 902c20e..a3b6380 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -26,7 +26,7 @@
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessors.
+ * for the fast NMI safe accessors.
  *
  * @base_real is for the fast NMI safe accessor to allow reading clock
  * realtime from any context.
@@ -44,33 +44,41 @@ struct tk_read_base {
 
 /**
  * struct timekeeper - Structure holding internal timekeeping values.
- * @tkr_mono:		The readout base structure for CLOCK_MONOTONIC
- * @tkr_raw:		The readout base structure for CLOCK_MONOTONIC_RAW
- * @xtime_sec:		Current CLOCK_REALTIME time in seconds
- * @ktime_sec:		Current CLOCK_MONOTONIC time in seconds
- * @wall_to_monotonic:	CLOCK_REALTIME to CLOCK_MONOTONIC offset
- * @offs_real:		Offset clock monotonic -> clock realtime
- * @offs_boot:		Offset clock monotonic -> clock boottime
- * @offs_tai:		Offset clock monotonic -> clock tai
- * @tai_offset:		The current UTC to TAI offset in seconds
- * @clock_was_set_seq:	The sequence number of clock was set events
- * @cs_was_changed_seq:	The sequence number of clocksource change events
- * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
- * @raw_sec:		CLOCK_MONOTONIC_RAW  time in seconds
- * @monotonic_to_boot:	CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
- * @cycle_interval:	Number of clock cycles in one NTP interval
- * @xtime_interval:	Number of clock shifted nano seconds in one NTP
- *			interval.
- * @xtime_remainder:	Shifted nano seconds left over when rounding
- *			@cycle_interval
- * @raw_interval:	Shifted raw nano seconds accumulated per NTP interval.
- * @ntp_error:		Difference between accumulated time and NTP time in ntp
- *			shifted nano seconds.
- * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
- *			ntp shifted nano seconds.
- * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
- * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
- * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
+ * @tkr_mono:			The readout base structure for CLOCK_MONOTONIC
+ * @xtime_sec:			Current CLOCK_REALTIME time in seconds
+ * @ktime_sec:			Current CLOCK_MONOTONIC time in seconds
+ * @wall_to_monotonic:		CLOCK_REALTIME to CLOCK_MONOTONIC offset
+ * @offs_real:			Offset clock monotonic -> clock realtime
+ * @offs_boot:			Offset clock monotonic -> clock boottime
+ * @offs_tai:			Offset clock monotonic -> clock tai
+ * @tai_offset:			The current UTC to TAI offset in seconds
+ * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
+ * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
+ * @clock_was_set_seq:		The sequence number of clock was set events
+ * @cs_was_changed_seq:		The sequence number of clocksource change events
+ * @monotonic_to_boot:		CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
+ * @cycle_interval:		Number of clock cycles in one NTP interval
+ * @xtime_interval:		Number of clock shifted nano seconds in one NTP
+ *				interval.
+ * @xtime_remainder:		Shifted nano seconds left over when rounding
+ *				@cycle_interval
+ * @raw_interval:		Shifted raw nano seconds accumulated per NTP interval.
+ * @next_leap_ktime:		CLOCK_MONOTONIC time value of a pending leap-second
+ * @ntp_tick:			The ntp_tick_length() value currently being
+ *				used. This cached copy ensures we consistently
+ *				apply the tick length for an entire tick, as
+ *				ntp_tick_length may change mid-tick, and we don't
+ *				want to apply that new value to the tick in
+ *				progress.
+ * @ntp_error:			Difference between accumulated time and NTP time in ntp
+ *				shifted nano seconds.
+ * @ntp_error_shift:		Shift conversion between clock shifted nano seconds and
+ *				ntp shifted nano seconds.
+ * @ntp_err_mult:		Multiplication factor for scaled math conversion
+ * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
+ * @last_warning:		Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:		Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:		Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -88,10 +96,28 @@ struct tk_read_base {
  *
  * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
  * accelerate the VDSO update for CLOCK_BOOTTIME.
+ *
+ * The cacheline ordering of the structure is optimized for in kernel usage of
+ * the ktime_get() and ktime_get_ts64() family of time accessors. Struct
+ * timekeeper is prepended in the core timekeeping code with a sequence count,
+ * which results in the following cacheline layout:
+ *
+ * 0:	seqcount, tkr_mono
+ * 1:	xtime_sec ... tai_offset
+ * 2:	tkr_raw, raw_sec
+ * 3,4: Internal variables
+ *
+ * Cacheline 0,1 contain the data which is used for accessing
+ * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
+ * data for accessing CLOCK_MONOTONIC_RAW.  Cacheline 3,4 are internal
+ * variables which are only accessed during timekeeper updates once per
+ * tick.
  */
 struct timekeeper {
+	/* Cacheline 0 (together with prepended seqcount of timekeeper core): */
 	struct tk_read_base	tkr_mono;
-	struct tk_read_base	tkr_raw;
+
+	/* Cacheline 1: */
 	u64			xtime_sec;
 	unsigned long		ktime_sec;
 	struct timespec64	wall_to_monotonic;
@@ -99,31 +125,29 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+
+	/* Cacheline 2: */
+	struct tk_read_base	tkr_raw;
+	u64			raw_sec;
+
+	/* Cacheline 3 and 4 (timekeeping internal variables): */
 	unsigned int		clock_was_set_seq;
 	u8			cs_was_changed_seq;
-	ktime_t			next_leap_ktime;
-	u64			raw_sec;
+
 	struct timespec64	monotonic_to_boot;
 
-	/* The following members are for timekeeping internal use */
 	u64			cycle_interval;
 	u64			xtime_interval;
 	s64			xtime_remainder;
 	u64			raw_interval;
-	/* The ntp_tick_length() value currently being used.
-	 * This cached copy ensures we consistently apply the tick
-	 * length for an entire tick, as ntp_tick_length may change
-	 * mid-tick, and we don't want to apply that new value to
-	 * the tick in progress.
-	 */
+
+	ktime_t			next_leap_ktime;
 	u64			ntp_tick;
-	/* Difference between accumulated time and NTP time in ntp
-	 * shifted nano seconds. */
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
-	/* Flag used to avoid updating NTP twice with same second */
 	u32			skip_second_overflow;
+
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 	long			last_warning;
 	/*

[tip: timers/core] timekeeping: Reorder struct timekeeper

Posted by tip-bot2 for Thomas Gleixner 1 month ago

The following commit has been merged into the timers/core branch of tip:

Commit-ID:     3b7b1f9c33bc25ace98ee5be766d0c8b23332c14
Gitweb:        https://git.kernel.org/tip/3b7b1f9c33bc25ace98ee5be766d0c8b23332c14
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Tue, 15 Oct 2024 12:08:39 +02:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Fri, 25 Oct 2024 16:41:11 +02:00

timekeeping: Reorder struct timekeeper

struct timekeeper is ordered suboptimally vs. cachelines. The layout,
including the preceding seqcount (see struct tk_core in timekeeper.c) is:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   tkr_raw, xtime_sec
 cacheline 2:   ktime_sec ... tai_offset, internal variables
 cacheline 3:	next_leap_ktime, raw_sec, internal variables
 cacheline 4:	internal variables

So any access via ktime_get*(), except for access to CLOCK_MONOTONIC_RAW,
will use either cachelines 0 + 1 or cachelines 0 + 2. Access to
CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3.

Reorder the members so that the result is more efficient:

 cacheline 0:   seqcount, tkr_mono
 cacheline 1:   xtime_sec, ktime_sec ... tai_offset
 cacheline 2:	tkr_raw, raw_sec
 cacheline 3:	internal variables
 cacheline 4:	internal variables

That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW
access will use cachelines 0 + 2.

Update kernel-doc and fix formatting issues while at it. Also fix a typo
in struct tk_read_base kernel-doc.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241015100839.12702-1-anna-maria@linutronix.de

---
 include/linux/timekeeper_internal.h | 106 ++++++++++++++++-----------
 1 file changed, 65 insertions(+), 41 deletions(-)

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 902c20e..a3b6380 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -26,7 +26,7 @@
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessors.
+ * for the fast NMI safe accessors.
  *
  * @base_real is for the fast NMI safe accessor to allow reading clock
  * realtime from any context.
@@ -44,33 +44,41 @@ struct tk_read_base {
 
 /**
  * struct timekeeper - Structure holding internal timekeeping values.
- * @tkr_mono:		The readout base structure for CLOCK_MONOTONIC
- * @tkr_raw:		The readout base structure for CLOCK_MONOTONIC_RAW
- * @xtime_sec:		Current CLOCK_REALTIME time in seconds
- * @ktime_sec:		Current CLOCK_MONOTONIC time in seconds
- * @wall_to_monotonic:	CLOCK_REALTIME to CLOCK_MONOTONIC offset
- * @offs_real:		Offset clock monotonic -> clock realtime
- * @offs_boot:		Offset clock monotonic -> clock boottime
- * @offs_tai:		Offset clock monotonic -> clock tai
- * @tai_offset:		The current UTC to TAI offset in seconds
- * @clock_was_set_seq:	The sequence number of clock was set events
- * @cs_was_changed_seq:	The sequence number of clocksource change events
- * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
- * @raw_sec:		CLOCK_MONOTONIC_RAW  time in seconds
- * @monotonic_to_boot:	CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
- * @cycle_interval:	Number of clock cycles in one NTP interval
- * @xtime_interval:	Number of clock shifted nano seconds in one NTP
- *			interval.
- * @xtime_remainder:	Shifted nano seconds left over when rounding
- *			@cycle_interval
- * @raw_interval:	Shifted raw nano seconds accumulated per NTP interval.
- * @ntp_error:		Difference between accumulated time and NTP time in ntp
- *			shifted nano seconds.
- * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
- *			ntp shifted nano seconds.
- * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
- * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
- * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
+ * @tkr_mono:			The readout base structure for CLOCK_MONOTONIC
+ * @xtime_sec:			Current CLOCK_REALTIME time in seconds
+ * @ktime_sec:			Current CLOCK_MONOTONIC time in seconds
+ * @wall_to_monotonic:		CLOCK_REALTIME to CLOCK_MONOTONIC offset
+ * @offs_real:			Offset clock monotonic -> clock realtime
+ * @offs_boot:			Offset clock monotonic -> clock boottime
+ * @offs_tai:			Offset clock monotonic -> clock tai
+ * @tai_offset:			The current UTC to TAI offset in seconds
+ * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
+ * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
+ * @clock_was_set_seq:		The sequence number of clock was set events
+ * @cs_was_changed_seq:		The sequence number of clocksource change events
+ * @monotonic_to_boot:		CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
+ * @cycle_interval:		Number of clock cycles in one NTP interval
+ * @xtime_interval:		Number of clock shifted nano seconds in one NTP
+ *				interval.
+ * @xtime_remainder:		Shifted nano seconds left over when rounding
+ *				@cycle_interval
+ * @raw_interval:		Shifted raw nano seconds accumulated per NTP interval.
+ * @next_leap_ktime:		CLOCK_MONOTONIC time value of a pending leap-second
+ * @ntp_tick:			The ntp_tick_length() value currently being
+ *				used. This cached copy ensures we consistently
+ *				apply the tick length for an entire tick, as
+ *				ntp_tick_length may change mid-tick, and we don't
+ *				want to apply that new value to the tick in
+ *				progress.
+ * @ntp_error:			Difference between accumulated time and NTP time in ntp
+ *				shifted nano seconds.
+ * @ntp_error_shift:		Shift conversion between clock shifted nano seconds and
+ *				ntp shifted nano seconds.
+ * @ntp_err_mult:		Multiplication factor for scaled math conversion
+ * @skip_second_overflow:	Flag used to avoid updating NTP twice with same second
+ * @last_warning:		Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:		Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:		Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -88,10 +96,28 @@ struct tk_read_base {
  *
  * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
  * accelerate the VDSO update for CLOCK_BOOTTIME.
+ *
+ * The cacheline ordering of the structure is optimized for in kernel usage of
+ * the ktime_get() and ktime_get_ts64() family of time accessors. Struct
+ * timekeeper is prepended in the core timekeeping code with a sequence count,
+ * which results in the following cacheline layout:
+ *
+ * 0:	seqcount, tkr_mono
+ * 1:	xtime_sec ... tai_offset
+ * 2:	tkr_raw, raw_sec
+ * 3,4: Internal variables
+ *
+ * Cacheline 0,1 contain the data which is used for accessing
+ * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the
+ * data for accessing CLOCK_MONOTONIC_RAW.  Cacheline 3,4 are internal
+ * variables which are only accessed during timekeeper updates once per
+ * tick.
  */
 struct timekeeper {
+	/* Cacheline 0 (together with prepended seqcount of timekeeper core): */
 	struct tk_read_base	tkr_mono;
-	struct tk_read_base	tkr_raw;
+
+	/* Cacheline 1: */
 	u64			xtime_sec;
 	unsigned long		ktime_sec;
 	struct timespec64	wall_to_monotonic;
@@ -99,31 +125,29 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+
+	/* Cacheline 2: */
+	struct tk_read_base	tkr_raw;
+	u64			raw_sec;
+
+	/* Cacheline 3 and 4 (timekeeping internal variables): */
 	unsigned int		clock_was_set_seq;
 	u8			cs_was_changed_seq;
-	ktime_t			next_leap_ktime;
-	u64			raw_sec;
+
 	struct timespec64	monotonic_to_boot;
 
-	/* The following members are for timekeeping internal use */
 	u64			cycle_interval;
 	u64			xtime_interval;
 	s64			xtime_remainder;
 	u64			raw_interval;
-	/* The ntp_tick_length() value currently being used.
-	 * This cached copy ensures we consistently apply the tick
-	 * length for an entire tick, as ntp_tick_length may change
-	 * mid-tick, and we don't want to apply that new value to
-	 * the tick in progress.
-	 */
+
+	ktime_t			next_leap_ktime;
 	u64			ntp_tick;
-	/* Difference between accumulated time and NTP time in ntp
-	 * shifted nano seconds. */
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
-	/* Flag used to avoid updating NTP twice with same second */
 	u32			skip_second_overflow;
+
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 	long			last_warning;
 	/*