[PATCH 4/8] x86/fpu: Remove the thread::fpu pointer

Ingo Molnar posted 8 patches 10 months ago
[PATCH 4/8] x86/fpu: Remove the thread::fpu pointer
Posted by Ingo Molnar 10 months ago
As suggested by Oleg, remove the thread::fpu pointer, as we can
calculate it via x86_task_fpu() at compile-time.

This improves code generation a bit:

   kepler:~/tip> size vmlinux.before vmlinux.after
   text        data        bss        dec         hex        filename
   26475405    10435342    1740804    38651551    24dc69f    vmlinux.before
   26475339    10959630    1216516    38651485    24dc65d    vmlinux.after

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uros Bizjak <ubizjak@gmail.com>
Link: https://lore.kernel.org/r/20240605083557.2051480-3-mingo@kernel.org
---
 arch/x86/include/asm/processor.h | 5 +----
 arch/x86/kernel/fpu/core.c       | 4 +---
 arch/x86/kernel/fpu/init.c       | 1 -
 arch/x86/kernel/process.c        | 2 --
 arch/x86/kernel/vmlinux.lds.S    | 4 ++++
 5 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ea7e5d2c4de..b7f7c9c83409 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -514,12 +514,9 @@ struct thread_struct {
 
 	struct thread_shstk	shstk;
 #endif
-
-	/* Floating point and extended processor state */
-	struct fpu		*fpu;
 };
 
-#define x86_task_fpu(task) ((task)->thread.fpu)
+#define x86_task_fpu(task)	((struct fpu *)((void *)(task) + sizeof(*(task))))
 
 /*
  * X86 doesn't need any embedded-FPU-struct quirks:
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 853a738fdf2d..974b276ff0da 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -600,13 +600,11 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
 	 * This is safe because task_struct size is a multiple of cacheline size.
 	 */
 	struct fpu *src_fpu = x86_task_fpu(current);
-	struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
+	struct fpu *dst_fpu = x86_task_fpu(dst);
 
 	BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);
 	BUG_ON(!src_fpu);
 
-	dst->thread.fpu = dst_fpu;
-
 	/* The new task's FPU state cannot be valid in the hardware. */
 	dst_fpu->last_cpu = -1;
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 848ea79886ba..da41a1d2c40f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -76,7 +76,6 @@ static struct fpu x86_init_fpu __attribute__ ((aligned (64))) __read_mostly;
 static void __init fpu__init_system_early_generic(void)
 {
 	fpstate_reset(&x86_init_fpu);
-	current->thread.fpu = &x86_init_fpu;
 	set_thread_flag(TIF_NEED_FPU_LOAD);
 	x86_init_fpu.last_cpu = -1;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ce4cce46f3f..88868a90459e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -102,8 +102,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 #ifdef CONFIG_VM86
 	dst->thread.vm86 = NULL;
 #endif
-	/* Drop the copied pointer to current's fpstate */
-	dst->thread.fpu = NULL;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ccdc45e5b759..d9ca2d1754da 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -181,6 +181,10 @@ SECTIONS
 		/* equivalent to task_pt_regs(&init_task) */
 		__top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
 
+		__x86_init_fpu_begin = .;
+		. = __x86_init_fpu_begin + 128*PAGE_SIZE;
+		__x86_init_fpu_end = .;
+
 #ifdef CONFIG_X86_32
 		/* 32 bit has nosave before _edata */
 		NOSAVE_DATA
-- 
2.45.2
Re: [PATCH 4/8] x86/fpu: Remove the thread::fpu pointer
Posted by Peter Zijlstra 10 months ago
On Wed, Apr 09, 2025 at 11:11:23PM +0200, Ingo Molnar wrote:

> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 5ea7e5d2c4de..b7f7c9c83409 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -514,12 +514,9 @@ struct thread_struct {
>  
>  	struct thread_shstk	shstk;
>  #endif
> -
> -	/* Floating point and extended processor state */
> -	struct fpu		*fpu;
>  };
>  
> -#define x86_task_fpu(task) ((task)->thread.fpu)
> +#define x86_task_fpu(task)	((struct fpu *)((void *)(task) + sizeof(*(task))))

Doesn't our FPU state need to be cacheline aligned? Looking at struct
fpu, it has a bunch of members that have __aligned__(64) on them, and as
such the whole of struct fpu will inherit this alignment requirement.

This means that both task and sizeof(*task) need to be cacheline aligned
for this to work.

Would it make sense to add something like:

static_assert(sizeof(struct task_struct) % 64 == 0);

And I didn't check, but isn't task a page pointer and as such always
page aligned?
Re: [PATCH 4/8] x86/fpu: Remove the thread::fpu pointer
Posted by Ingo Molnar 10 months ago
* Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Apr 09, 2025 at 11:11:23PM +0200, Ingo Molnar wrote:
> 
> > diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> > index 5ea7e5d2c4de..b7f7c9c83409 100644
> > --- a/arch/x86/include/asm/processor.h
> > +++ b/arch/x86/include/asm/processor.h
> > @@ -514,12 +514,9 @@ struct thread_struct {
> >  
> >  	struct thread_shstk	shstk;
> >  #endif
> > -
> > -	/* Floating point and extended processor state */
> > -	struct fpu		*fpu;
> >  };
> >  
> > -#define x86_task_fpu(task) ((task)->thread.fpu)
> > +#define x86_task_fpu(task)	((struct fpu *)((void *)(task) + sizeof(*(task))))
> 
> Doesn't our FPU state need to be cacheline aligned?

Yeah, and we do have a check for that:

+       BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);

And task_struct is allocated cache-aligned, which means when we do this 
in fpu_clone():

+       struct fpu *dst_fpu = (void *)dst + sizeof(*dst);

the FPU pointer is guaranteed to be cacheline aligned as well.

'dst' in that context is the new task_struct.

BTW., Oleg suggested in a previous discussion for us to replace the 
task->thread.fpu pointer with a build-time calculation - but I'm still 
not sure it's a good idea.

Thanks,

	Ingo
Re: [PATCH 4/8] x86/fpu: Remove the thread::fpu pointer
Posted by Oleg Nesterov 10 months ago
On 04/10, Ingo Molnar wrote:
>
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Wed, Apr 09, 2025 at 11:11:23PM +0200, Ingo Molnar wrote:
> >
> > > -#define x86_task_fpu(task) ((task)->thread.fpu)
> > > +#define x86_task_fpu(task)	((struct fpu *)((void *)(task) + sizeof(*(task))))
> >
> > Doesn't our FPU state need to be cacheline aligned?
>
> Yeah, and we do have a check for that:
>
> +       BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);
>
> And task_struct is allocated cache-aligned, which means when we do this
> in fpu_clone():
>
> +       struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
>
> the FPU pointer is guaranteed to be cacheline aligned as well.
>
> 'dst' in that context is the new task_struct.
>
> BTW., Oleg suggested in a previous discussion for us to replace the
> task->thread.fpu pointer with a build-time calculation - but I'm still
> not sure it's a good idea.

To be honest, I forgot everything we discussed before ;)

But I have found this email,
https://lore.kernel.org/all/20240616105550.GA18292@redhat.com/
Perhaps

	#define X86_TASK_SIZE	\
		ALIGN(sizeof(struct task_struct), __alignof__(union fpregs_state))

and
	#define x86_task_fpu(task)	\
		((struct fpu *)((void *)(task) + X86_TASK_SIZE))

plus a bit more similar changes make more sense than
__attribute__ ((aligned (64))) for struct task_struct?

OK, even if yes, we can do this later on top of this series.

I'll try to read it tomorrow.

Oleg.
Re: [PATCH 4/8] x86/fpu: Remove the thread::fpu pointer
Posted by Ingo Molnar 10 months ago
* Ingo Molnar <mingo@kernel.org> wrote:

> BTW., Oleg suggested in a previous discussion for us to replace the 
> task->thread.fpu pointer with a build-time calculation - but I'm 
> still not sure it's a good idea.

Actually, I've implemented that already:

  arch/x86/include/asm/processor.h:# define x86_task_fpu(task)    ((struct fpu *)((void *)(task) + sizeof(*(task))))

... then promptly forgot about it. So never mind me. :-)

Thanks,

	Ingo
Re: [PATCH 4/8] x86/fpu: Remove the thread::fpu pointer
Posted by Peter Zijlstra 10 months ago
On Thu, Apr 10, 2025 at 12:10:56PM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Wed, Apr 09, 2025 at 11:11:23PM +0200, Ingo Molnar wrote:
> > 
> > > diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> > > index 5ea7e5d2c4de..b7f7c9c83409 100644
> > > --- a/arch/x86/include/asm/processor.h
> > > +++ b/arch/x86/include/asm/processor.h
> > > @@ -514,12 +514,9 @@ struct thread_struct {
> > >  
> > >  	struct thread_shstk	shstk;
> > >  #endif
> > > -
> > > -	/* Floating point and extended processor state */
> > > -	struct fpu		*fpu;
> > >  };
> > >  
> > > -#define x86_task_fpu(task) ((task)->thread.fpu)
> > > +#define x86_task_fpu(task)	((struct fpu *)((void *)(task) + sizeof(*(task))))
> > 
> > Doesn't our FPU state need to be cacheline aligned?
> 
> Yeah, and we do have a check for that:
> 
> +       BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);

Ah, missed that. Clearly I need to improve my reading skillz :-)
[PATCH] x86/fpu: Clarify FPU context cacheline alignment
Posted by Ingo Molnar 10 months ago

* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Apr 10, 2025 at 12:10:56PM +0200, Ingo Molnar wrote:
> > 
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Wed, Apr 09, 2025 at 11:11:23PM +0200, Ingo Molnar wrote:
> > > 
> > > > diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> > > > index 5ea7e5d2c4de..b7f7c9c83409 100644
> > > > --- a/arch/x86/include/asm/processor.h
> > > > +++ b/arch/x86/include/asm/processor.h
> > > > @@ -514,12 +514,9 @@ struct thread_struct {
> > > >  
> > > >  	struct thread_shstk	shstk;
> > > >  #endif
> > > > -
> > > > -	/* Floating point and extended processor state */
> > > > -	struct fpu		*fpu;
> > > >  };
> > > >  
> > > > -#define x86_task_fpu(task) ((task)->thread.fpu)
> > > > +#define x86_task_fpu(task)	((struct fpu *)((void *)(task) + sizeof(*(task))))
> > > 
> > > Doesn't our FPU state need to be cacheline aligned?
> > 
> > Yeah, and we do have a check for that:
> > 
> > +       BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);
> 
> Ah, missed that. Clearly I need to improve my reading skillz :-)

Admittedly it's written a bit obtusely - how about the patch below?

Thanks,

	Ingo

============================>
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 10 Apr 2025 12:52:16 +0200
Subject: [PATCH] x86/fpu: Clarify FPU context cacheline alignment

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uros Bizjak <ubizjak@gmail.com>
---
 arch/x86/kernel/fpu/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index d0a45f6492cb..3a19877a314e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -607,7 +607,8 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
 	 * We allocate the new FPU structure right after the end of the task struct.
 	 * task allocation size already took this into account.
 	 *
-	 * This is safe because task_struct size is a multiple of cacheline size.
+	 * This is safe because task_struct size is a multiple of cacheline size,
+	 * thus x86_task_fpu() will always be cacheline aligned as well.
 	 */
 	struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
[tip: x86/merge] x86/fpu: Clarify FPU context cacheline alignment
Posted by tip-bot2 for Ingo Molnar 10 months ago
The following commit has been merged into the x86/merge branch of tip:

Commit-ID:     e3a52b67f54aa36ab21265eeea016460b5fe1c46
Gitweb:        https://git.kernel.org/tip/e3a52b67f54aa36ab21265eeea016460b5fe1c46
Author:        Ingo Molnar <mingo@kernel.org>
AuthorDate:    Thu, 10 Apr 2025 12:52:16 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Mon, 14 Apr 2025 08:18:29 +02:00

x86/fpu: Clarify FPU context cacheline alignment

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Chang S. Bae <chang.seok.bae@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/Z_ejggklB5-IWB5W@gmail.com
---
 arch/x86/kernel/fpu/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index d0a45f6..3a19877 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -607,7 +607,8 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
 	 * We allocate the new FPU structure right after the end of the task struct.
 	 * task allocation size already took this into account.
 	 *
-	 * This is safe because task_struct size is a multiple of cacheline size.
+	 * This is safe because task_struct size is a multiple of cacheline size,
+	 * thus x86_task_fpu() will always be cacheline aligned as well.
 	 */
 	struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
[tip: x86/merge] x86/fpu: Remove the thread::fpu pointer
Posted by tip-bot2 for Ingo Molnar 10 months ago
The following commit has been merged into the x86/merge branch of tip:

Commit-ID:     55bc30f2e34dcc17a370d1f6c1c992be107c4502
Gitweb:        https://git.kernel.org/tip/55bc30f2e34dcc17a370d1f6c1c992be107c4502
Author:        Ingo Molnar <mingo@kernel.org>
AuthorDate:    Wed, 09 Apr 2025 23:11:23 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Mon, 14 Apr 2025 08:18:29 +02:00

x86/fpu: Remove the thread::fpu pointer

As suggested by Oleg, remove the thread::fpu pointer, as we can
calculate it via x86_task_fpu() at compile-time.

This improves code generation a bit:

   kepler:~/tip> size vmlinux.before vmlinux.after
   text        data        bss        dec         hex        filename
   26475405    10435342    1740804    38651551    24dc69f    vmlinux.before
   26475339    10959630    1216516    38651485    24dc65d    vmlinux.after

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Chang S. Bae <chang.seok.bae@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Uros Bizjak <ubizjak@gmail.com>
Link: https://lore.kernel.org/r/20250409211127.3544993-5-mingo@kernel.org
---
 arch/x86/include/asm/processor.h | 5 +----
 arch/x86/kernel/fpu/core.c       | 4 +---
 arch/x86/kernel/fpu/init.c       | 1 -
 arch/x86/kernel/process.c        | 2 --
 arch/x86/kernel/vmlinux.lds.S    | 4 ++++
 5 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ea7e5d..b7f7c9c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -514,12 +514,9 @@ struct thread_struct {
 
 	struct thread_shstk	shstk;
 #endif
-
-	/* Floating point and extended processor state */
-	struct fpu		*fpu;
 };
 
-#define x86_task_fpu(task) ((task)->thread.fpu)
+#define x86_task_fpu(task)	((struct fpu *)((void *)(task) + sizeof(*(task))))
 
 /*
  * X86 doesn't need any embedded-FPU-struct quirks:
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 853a738..974b276 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -600,13 +600,11 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
 	 * This is safe because task_struct size is a multiple of cacheline size.
 	 */
 	struct fpu *src_fpu = x86_task_fpu(current);
-	struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
+	struct fpu *dst_fpu = x86_task_fpu(dst);
 
 	BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);
 	BUG_ON(!src_fpu);
 
-	dst->thread.fpu = dst_fpu;
-
 	/* The new task's FPU state cannot be valid in the hardware. */
 	dst_fpu->last_cpu = -1;
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 848ea79..da41a1d 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -76,7 +76,6 @@ static struct fpu x86_init_fpu __attribute__ ((aligned (64))) __read_mostly;
 static void __init fpu__init_system_early_generic(void)
 {
 	fpstate_reset(&x86_init_fpu);
-	current->thread.fpu = &x86_init_fpu;
 	set_thread_flag(TIF_NEED_FPU_LOAD);
 	x86_init_fpu.last_cpu = -1;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ce4cce..88868a9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -102,8 +102,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 #ifdef CONFIG_VM86
 	dst->thread.vm86 = NULL;
 #endif
-	/* Drop the copied pointer to current's fpstate */
-	dst->thread.fpu = NULL;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ccdc45e..d9ca2d1 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -181,6 +181,10 @@ SECTIONS
 		/* equivalent to task_pt_regs(&init_task) */
 		__top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
 
+		__x86_init_fpu_begin = .;
+		. = __x86_init_fpu_begin + 128*PAGE_SIZE;
+		__x86_init_fpu_end = .;
+
 #ifdef CONFIG_X86_32
 		/* 32 bit has nosave before _edata */
 		NOSAVE_DATA