nohz_full is a feature that only fits rare corner cases.
Yet distros enable it by default and therefore the related fields are
always reserved in the task struct.
Those task fields are stored in the middle of cache-hot areas such
as cputime accounting and context switch counting, which doesn't make
any sense for a feature that is disabled most of the time.
Move the nohz_full storage to colder places.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/linux/sched.h | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f96ac1982893..b5ce76db6d75 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1110,13 +1110,7 @@ struct task_struct {
#endif
u64 gtime;
struct prev_cputime prev_cputime;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- struct vtime vtime;
-#endif
-#ifdef CONFIG_NO_HZ_FULL
- atomic_t tick_dep_mask;
-#endif
/* Context switch counts: */
unsigned long nvcsw;
unsigned long nivcsw;
@@ -1438,6 +1432,14 @@ struct task_struct {
struct task_delay_info *delays;
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ struct vtime vtime;
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
+ atomic_t tick_dep_mask;
+#endif
+
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
unsigned int fail_nth;
--
2.48.1
On 4/10/25 20:53, Frederic Weisbecker wrote:
> nohz_full is a feature that only fits into rare and very corner cases.
> Yet distros enable it by default and therefore the related fields are
> always reserved in the task struct.
>
> Those task fields are stored in the middle of cacheline hot places such
> as cputime accounting and context switch counting, which doesn't make
> any sense for a feature that is disabled most of the time.
>
> Move the nohz_full storage to colder places.
>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
> include/linux/sched.h | 14 ++++++++------
> 1 file changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f96ac1982893..b5ce76db6d75 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1110,13 +1110,7 @@ struct task_struct {
> #endif
> u64 gtime;
> struct prev_cputime prev_cputime;
> -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> - struct vtime vtime;
> -#endif
>
> -#ifdef CONFIG_NO_HZ_FULL
> - atomic_t tick_dep_mask;
> -#endif
> /* Context switch counts: */
> unsigned long nvcsw;
> unsigned long nivcsw;
> @@ -1438,6 +1432,14 @@ struct task_struct {
> struct task_delay_info *delays;
> #endif
>
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> + struct vtime vtime;
> +#endif
> +
> +#ifdef CONFIG_NO_HZ_FULL
> + atomic_t tick_dep_mask;
> +#endif
> +
> #ifdef CONFIG_FAULT_INJECTION
> int make_it_fail;
> unsigned int fail_nth;
>
Hi Frederic.
Maybe move these nohz-related fields into their own cacheline instead?
On PowerPC, where we have 128-byte cache lines, I see
these fields crossing a cache line boundary.
without patch:
/* XXX last struct has 4 bytes of padding */
struct vtime vtime; /* 2360 48 */
atomic_t tick_dep_mask; /* 2408 4 */
/* XXX 4 bytes hole, try to pack */
long unsigned int nvcsw; /* 2416 8 */
long unsigned int nivcsw; /* 2424 8 */
/* --- cacheline 19 boundary (2432 bytes) --- */
With patch:
struct vtime vtime; /* 3272 48 */
struct callback_head nohz_full_work; /* 3320 16 */
/* --- cacheline 26 boundary (3328 bytes) was 8 bytes ago --- */
atomic_t tick_dep_mask; /* 3336 4 */
Le Thu, Apr 24, 2025 at 12:10:26AM +0530, Shrikanth Hegde a écrit :
>
>
> On 4/10/25 20:53, Frederic Weisbecker wrote:
> > nohz_full is a feature that only fits into rare and very corner cases.
> > Yet distros enable it by default and therefore the related fields are
> > always reserved in the task struct.
> >
> > Those task fields are stored in the middle of cacheline hot places such
> > as cputime accounting and context switch counting, which doesn't make
> > any sense for a feature that is disabled most of the time.
> >
> > Move the nohz_full storage to colder places.
> >
> > Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> > ---
> > include/linux/sched.h | 14 ++++++++------
> > 1 file changed, 8 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index f96ac1982893..b5ce76db6d75 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1110,13 +1110,7 @@ struct task_struct {
> > #endif
> > u64 gtime;
> > struct prev_cputime prev_cputime;
> > -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> > - struct vtime vtime;
> > -#endif
> > -#ifdef CONFIG_NO_HZ_FULL
> > - atomic_t tick_dep_mask;
> > -#endif
> > /* Context switch counts: */
> > unsigned long nvcsw;
> > unsigned long nivcsw;
> > @@ -1438,6 +1432,14 @@ struct task_struct {
> > struct task_delay_info *delays;
> > #endif
> > +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> > + struct vtime vtime;
> > +#endif
> > +
> > +#ifdef CONFIG_NO_HZ_FULL
> > + atomic_t tick_dep_mask;
> > +#endif
> > +
> > #ifdef CONFIG_FAULT_INJECTION
> > int make_it_fail;
> > unsigned int fail_nth;
> >
>
> Hi Frederic.
>
> maybe move these nohz related fields into their own cacheline instead?
>
>
> on PowerPC where we have 128byte cache instead, i see
> these fields are crossing a cache line boundary.
>
> without patch:
> /* XXX last struct has 4 bytes of padding */
>
> struct vtime vtime; /* 2360 48 */
> atomic_t tick_dep_mask; /* 2408 4 */
> /* XXX 4 bytes hole, try to pack */
>
> long unsigned int nvcsw; /* 2416 8 */
> long unsigned int nivcsw; /* 2424 8 */
> /* --- cacheline 19 boundary (2432 bytes) --- */
>
>
> With patch:
> struct vtime vtime; /* 3272 48 */
> struct callback_head nohz_full_work; /* 3320 16 */
> /* --- cacheline 26 boundary (3328 bytes) was 8 bytes ago --- */
> atomic_t tick_dep_mask; /* 3336 4 */
>
It's not much of a big deal because those fields shouldn't be accessed
close together in time. Also, such cache alignment is hard to maintain everywhere
when there are so many #ifdefs in that structure.
Thanks.
--
Frederic Weisbecker
SUSE Labs
© 2016 - 2026 Red Hat, Inc.