arch/x86/kernel/fpu/xstate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
When CONFIG_X86_DEBUG_FPU=y is set, x86_task_fpu() returns NULL for
kernel threads. The avx512_status() function would then dereference this
NULL pointer via READ_ONCE(x86_task_fpu(task)->avx512_timestamp)
when reading /proc/*/arch_status, causing a kernel NULL pointer dereference
and a system crash.
[ 8215.540977] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] SMP KASAN NOPTI
[ 8215.542290] CPU: 3 UID: 0 PID: 9285 Comm: cat Kdump: loaded Tainted: G W 6.16.0-rc1 #4 PREEMPT(full)
[ 8215.543000] Tainted: [W]=WARN
[ 8215.544481] RIP: 0010:proc_pid_arch_status+0x30/0xe0
[ 8215.545408] Code: 1f 44 00 00 55 48 89 fd 48 89 cf 53 48 83 ec 08 e8 e5 64 ff ff 48 ba 00 00 00 00 00 fc ff df 48 8d 78 08 48 8
9 f9 48 c1 e9 03 <80> 3c 11 00 75 7d 48 8b 58 08 48 c7 c2 ff ff ff ff 48 85 db 74 3d
[ 8215.548456] RSP: 0018:ff11000194107b08 EFLAGS: 00010202
[ 8215.549443] RAX: 0000000000000000 RBX: ff11000211a9c9a0 RCX: 0000000000000001
[ 8215.550581] RDX: dffffc0000000000 RSI: ffffffff96d0d020 RDI: 0000000000000008
[ 8215.551740] RBP: ff11000111792490 R08: 0000000000000001 R09: ffe21c002117d61d
[ 8215.552917] R10: ff11000108beb0eb R11: 0000000000000000 R12: ff11000108a80b80
[ 8215.554111] R13: ff11000108beb0e8 R14: ffffffff96d0d020 R15: 0000000000000001
[ 8215.555323] FS: 00007f75c18ad740(0000) GS:ff11000e266d1000(0000) knlGS:0000000000000000
[ 8215.556629] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 8215.557880] CR2: 00005605184020f8 CR3: 0000000164499005 CR4: 0000000000771ef0
[ 8215.559553] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 8215.560882] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 8215.562205] PKRU: 55555554
[ 8215.563277] Call Trace:
[ 8215.564338] <TASK>
[ 8215.565383] proc_single_show+0x10c/0x1c0
[ 8215.566568] seq_read_iter+0x3e5/0x1050
[ 8215.567787] seq_read+0x24b/0x3b0
[ 8215.569305] ? __pfx_seq_read+0x10/0x10
[ 8215.570509] ? __pfx_handle_pte_fault+0x10/0x10
[ 8215.571782] ? __pfx_arch_get_unmapped_area_topdown+0x10/0x10
[ 8215.573142] ? __pfx_cp_new_stat+0x10/0x10
[ 8215.574417] vfs_read+0x186/0xad0
[ 8215.575657] ? __pfx_mas_prev+0x10/0x10
[ 8215.576947] ? __pfx_vfs_read+0x10/0x10
[ 8215.578234] ? count_memcg_events+0x1ce/0x410
[ 8215.579523] ? fdget_pos+0x1c9/0x4c0
[ 8215.580737] ksys_read+0xef/0x1c0
[ 8215.581896] ? __pfx_ksys_read+0x10/0x10
[ 8215.583265] ? do_user_addr_fault+0x4c6/0xb50
[ 8215.584633] do_syscall_64+0x73/0x330
[ 8215.585773] ? irqentry_exit_to_user_mode+0x32/0x210
[ 8215.586967] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 8215.588137] RIP: 0033:0x7f75c17147e2
[ 8215.589209] Code: c0 e9 b2 fe ff ff 50 48 8d 3d 8a b4 0c 00 e8 a5 1d 02 00 0f 1f 44 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 8
5 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24
[ 8215.592896] RSP: 002b:00007fffd6935ef8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 8215.594238] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f75c17147e2
[ 8215.595551] RDX: 0000000000020000 RSI: 00005605183e2000 RDI: 0000000000000003
[ 8215.596876] RBP: 00005605183e2000 R08: 0000000000000000 R09: 00005605183e10f0
[ 8215.598187] R10: 00005605183fe000 R11: 0000000000000246 R12: 0000000000000000
[ 8215.599494] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[ 8215.600807] </TASK>
Fixes: 22aafe3bcb67 ("x86/fpu: Remove init_task FPU state dependencies, add debugging warning for PF_KTHREAD tasks")
Signed-off-by: Fushuai Wang <wangfushuai@baidu.com>
---
arch/x86/kernel/fpu/xstate.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9aa9ac8399ae..16f813a42f42 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1859,9 +1859,14 @@ long fpu_xstate_prctl(int option, unsigned long arg2)
*/
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
- unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
+ unsigned long timestamp = 0;
long delta;
+#ifdef CONFIG_X86_DEBUG_FPU
+ if (!(task->flags & PF_KTHREAD))
+#endif
+ timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
+
if (!timestamp) {
/*
* Report -1 if no AVX512 usage
--
2.36.1
On 7/17/2025 2:43 AM, Fushuai Wang wrote: > When CONFIG_X86_DEBUG_FPU=y is set, x86_task_fpu() returns NULL for > kernel threads. It seems a bit odd that CONFIG_X86_DEBUG_FPU changes the behavior of x86_task_fpu(). We should probably change that behavior independent of the fix for this issue. Maybe? diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ea138583dd92..04afc71a4993 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -58,8 +58,7 @@ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); #ifdef CONFIG_X86_DEBUG_FPU struct fpu *x86_task_fpu(struct task_struct *task) { - if (WARN_ON_ONCE(task->flags & PF_KTHREAD)) - return NULL; + WARN_ON_ONCE(task->flags & PF_KTHREAD) return (void *)task + sizeof(*task); } > The avx512_status() function would then dereference this > NULL pointer via READ_ONCE(x86_task_fpu(task)->avx512_timestamp). > when reading /proc/*/arch_status, causing a kernel NULL pointer dereference > and system will crash. > The kernel seems to assume that a Kthread would never call x86_task_fpu(). That assumption is breaking in this scenario, which causes the below issue. Can you please share any other warnings that were triggered before this Oops message? Also, I'll try to generate this locally. Any specific configuration needed for reproducing this apart from CONFIG_X86_DEBUG_FPU? 
> [ 8215.540977] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] SMP KASAN NOPTI > [ 8215.542290] CPU: 3 UID: 0 PID: 9285 Comm: cat Kdump: loaded Tainted: G W 6.16.0-rc1 #4 PREEMPT(full) > [ 8215.543000] Tainted: [W]=WARN > [ 8215.544481] RIP: 0010:proc_pid_arch_status+0x30/0xe0 > [ 8215.545408] Code: 1f 44 00 00 55 48 89 fd 48 89 cf 53 48 83 ec 08 e8 e5 64 ff ff 48 ba 00 00 00 00 00 fc ff df 48 8d 78 08 48 8 > 9 f9 48 c1 e9 03 <80> 3c 11 00 75 7d 48 8b 58 08 48 c7 c2 ff ff ff ff 48 85 db 74 3d > [ 8215.548456] RSP: 0018:ff11000194107b08 EFLAGS: 00010202 > [ 8215.549443] RAX: 0000000000000000 RBX: ff11000211a9c9a0 RCX: 0000000000000001 > [ 8215.550581] RDX: dffffc0000000000 RSI: ffffffff96d0d020 RDI: 0000000000000008 > [ 8215.551740] RBP: ff11000111792490 R08: 0000000000000001 R09: ffe21c002117d61d > [ 8215.552917] R10: ff11000108beb0eb R11: 0000000000000000 R12: ff11000108a80b80 > [ 8215.554111] R13: ff11000108beb0e8 R14: ffffffff96d0d020 R15: 0000000000000001 > [ 8215.555323] FS: 00007f75c18ad740(0000) GS:ff11000e266d1000(0000) knlGS:0000000000000000 > [ 8215.556629] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 8215.557880] CR2: 00005605184020f8 CR3: 0000000164499005 CR4: 0000000000771ef0 > [ 8215.559553] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 8215.560882] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > [ 8215.562205] PKRU: 55555554 > [ 8215.563277] Call Trace: > [ 8215.564338] <TASK> > [ 8215.565383] proc_single_show+0x10c/0x1c0 > [ 8215.566568] seq_read_iter+0x3e5/0x1050 > [ 8215.567787] seq_read+0x24b/0x3b0 > [ 8215.569305] ? __pfx_seq_read+0x10/0x10 > [ 8215.570509] ? __pfx_handle_pte_fault+0x10/0x10 > [ 8215.571782] ? __pfx_arch_get_unmapped_area_topdown+0x10/0x10 > [ 8215.573142] ? __pfx_cp_new_stat+0x10/0x10 > [ 8215.574417] vfs_read+0x186/0xad0 > [ 8215.575657] ? __pfx_mas_prev+0x10/0x10 > [ 8215.576947] ? __pfx_vfs_read+0x10/0x10 > [ 8215.578234] ? 
count_memcg_events+0x1ce/0x410 > [ 8215.579523] ? fdget_pos+0x1c9/0x4c0 > [ 8215.580737] ksys_read+0xef/0x1c0 > [ 8215.581896] ? __pfx_ksys_read+0x10/0x10 > [ 8215.583265] ? do_user_addr_fault+0x4c6/0xb50 > [ 8215.584633] do_syscall_64+0x73/0x330 > [ 8215.585773] ? irqentry_exit_to_user_mode+0x32/0x210 > [ 8215.586967] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 8215.588137] RIP: 0033:0x7f75c17147e2 > [ 8215.589209] Code: c0 e9 b2 fe ff ff 50 48 8d 3d 8a b4 0c 00 e8 a5 1d 02 00 0f 1f 44 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 8 > 5 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24 > [ 8215.592896] RSP: 002b:00007fffd6935ef8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 > [ 8215.594238] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f75c17147e2 > [ 8215.595551] RDX: 0000000000020000 RSI: 00005605183e2000 RDI: 0000000000000003 > [ 8215.596876] RBP: 00005605183e2000 R08: 0000000000000000 R09: 00005605183e10f0 > [ 8215.598187] R10: 00005605183fe000 R11: 0000000000000246 R12: 0000000000000000 > [ 8215.599494] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 > [ 8215.600807] </TASK> > For quoting backtraces in commit message, please see: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#backtraces-in-commit-messages > Fixes: 22aafe3bcb67 ("x86/fpu: Remove init_task FPU state dependencies, add debugging warning for PF_KTHREAD tasks") > Signed-off-by: Fushuai Wang <wangfushuai@baidu.com> > --- > arch/x86/kernel/fpu/xstate.c | 7 ++++++- > 1 file changed, 6 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c > index 9aa9ac8399ae..16f813a42f42 100644 > --- a/arch/x86/kernel/fpu/xstate.c > +++ b/arch/x86/kernel/fpu/xstate.c > @@ -1859,9 +1859,14 @@ long fpu_xstate_prctl(int option, unsigned long arg2) > */ > static void avx512_status(struct seq_file *m, struct task_struct *task) > { > - unsigned long timestamp = 
READ_ONCE(x86_task_fpu(task)->avx512_timestamp); > + unsigned long timestamp = 0; > long delta; > > +#ifdef CONFIG_X86_DEBUG_FPU > + if (!(task->flags & PF_KTHREAD)) > +#endif The logical code flow should not change based on X86_DEBUG_FPU. The fix for this issue likely needs to be somewhere else. Though, I am still working on identifying the exact root cause. > + timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); > + > if (!timestamp) { > /* > * Report -1 if no AVX512 usage
On 7/17/2025 12:21 PM, Sohil Mehta wrote: > On 7/17/2025 2:43 AM, Fushuai Wang wrote: >> The avx512_status() function would then dereference this >> NULL pointer via READ_ONCE(x86_task_fpu(task)->avx512_timestamp). >> when reading /proc/*/arch_status, causing a kernel NULL pointer dereference >> and system will crash. >> > > The kernel seems to assume that a Kthread would never call > x86_task_fpu(). That assumption is breaking in this scenario, which > causes the below issue. > This concern was discussed while adding the checks: https://lore.kernel.org/all/ZmFziN0i10sILaIo@gmail.com/ Adding a few folks who were involved in the discussion that time. > Can you please share any other warnings that were triggered before this > Oops message? Also, I'll try to generate this locally. Any specific > configuration needed for reproducing this apart from CONFIG_X86_DEBUG_FPU? > I was able to reproduce this on a system with X86_FEATURE_AVX512F. The issue only happens while reading arch_status on a kthread. $cat /proc/[kthread]/arch_status => NULL pointer exception $cat /proc/[user thread]/arch_status => No issue seen Can you confirm that you are seeing the same behavior? Unfortunately, avx512_timestamp resides within struct fpu. So getting that value for a kthread would mean going through x86_task_fpu(). I am wondering if we ever need to expose the AVX512 usage for kernel threads? If not, then we can do what you currently have but without the CONFIG_X86_DEBUG_FPU restriction. All kernel threads would always print the AVX512_elapsed_ms as -1. However, this would be a user visible change so we should probably get more inputs. I tried this experiment on an older kernel without the above issue. Among all the active kthreads on my system a handful of them show a valid value for AVX512 usage. The rest of them all show -1. 
PID: 2594 CMD: avahi-daemon: running [SAP.local] /proc/2594/arch_status content: AVX512_elapsed_ms: 46032 PID: 2729 CMD: sshd: /usr/sbin/sshd -D [listener] 0 of 10-100 startups /proc/2729/arch_status content: AVX512_elapsed_ms: 396656 To keep the older behavior, we might need to consider moving avx512_timestamp out of struct fpu. Though, I am uncertain about its implication. >> >> diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c >> index 9aa9ac8399ae..16f813a42f42 100644 >> --- a/arch/x86/kernel/fpu/xstate.c >> +++ b/arch/x86/kernel/fpu/xstate.c >> @@ -1859,9 +1859,14 @@ long fpu_xstate_prctl(int option, unsigned long arg2) >> */ >> static void avx512_status(struct seq_file *m, struct task_struct *task) >> { >> - unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); >> + unsigned long timestamp = 0; >> long delta; >> >> +#ifdef CONFIG_X86_DEBUG_FPU >> + if (!(task->flags & PF_KTHREAD)) >> +#endif > > The logical code flow should not change based on X86_DEBUG_FPU. The fix > for this issue likely needs to be somewhere else. Though, I am still > working on identifying the exact root cause. > >> + timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); >> + >> if (!timestamp) { >> /* >> * Report -1 if no AVX512 usage >
>> Can you please share any other warnings that were triggered before this >> Oops message? Also, I'll try to generate this locally. Any specific >> configuration needed for reproducing this apart from CONFIG_X86_DEBUG_FPU? > >I was able to reproduce this on a system with X86_FEATURE_AVX512F. The >issue only happens while reading arch_status on a kthread. > >$cat /proc/[kthread]/arch_status => NULL pointer exception >$cat /proc/[user thread]/arch_status => No issue seen > >Can you confirm that you are seeing the same behavior? Confirmed, same issue here. >Unfortunately, avx512_timestamp resides within struct fpu. So getting >that value for a kthread would mean going through x86_task_fpu(). > >I am wondering if we ever need to expose the AVX512 usage for kernel >threads? If not, then we can do what you currently have but without the >CONFIG_X86_DEBUG_FPU restriction. All kernel threads would always print >the AVX512_elapsed_ms as -1. > >However, this would be a user visible change so we should probably get >more inputs. I tried this experiment on an older kernel without the >above issue. Among all the active kthreads on my system a handful of >them show a valid value for AVX512 usage. The rest of them all show -1. > >PID: 2594 >CMD: avahi-daemon: running [SAP.local] > /proc/2594/arch_status content: >AVX512_elapsed_ms: 46032 > >PID: 2729 >CMD: sshd: /usr/sbin/sshd -D [listener] 0 of 10-100 startups > /proc/2729/arch_status content: >AVX512_elapsed_ms: 396656 > >To keep the older behavior, we might need to consider moving >avx512_timestamp out of struct fpu. Though, I am uncertain about its >implication. I think avx512_elapsed_ms should logically belong in the FPU structure, as it's a field inherently tied to FPU operations? To keep the older behavior, we can set CONFIG_X86_DEBUG_FPU=n, maybe? Is there a better approach to ensure kernel threads always correctly output avx512_elapsed_ms. Directly get FPU struct pointer without using x86_task_fpu()? 
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9aa9ac8399ae..f989bc125e9b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1859,9 +1859,10 @@ long fpu_xstate_prctl(int option, unsigned long arg2) */ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); + unsigned long timestamp; long delta; + timestamp = READ_ONCE((struct fpu *)((void *)(task) + sizeof(*(task)))->avx512_timestamp); if (!timestamp) { /* * Report -1 if no AVX512 usage
>> I am wondering if we ever need to expose the AVX512 usage for kernel >> threads? If not, then we can do what you currently have but without the >> CONFIG_X86_DEBUG_FPU restriction. All kernel threads would always print >> the AVX512_elapsed_ms as -1. >> Let's go with this approach. See below. >> However, this would be a user visible change so we should probably get >> more inputs. I tried this experiment on an older kernel without the >> above issue. Among all the active kthreads on my system a handful of >> them show a valid value for AVX512 usage. The rest of them all show -1. >> >> PID: 2594 >> CMD: avahi-daemon: running [SAP.local] >> /proc/2594/arch_status content: >> AVX512_elapsed_ms: 46032 >> >> PID: 2729 >> CMD: sshd: /usr/sbin/sshd -D [listener] 0 of 10-100 startups >> /proc/2729/arch_status content: >> AVX512_elapsed_ms: 396656 >> Correction: These aren't really Kthreads. There was a slight error in the script that I used. Reporting AVX512 usage doesn't seem very useful for Kthreads. The usage is mainly for userspace schedulers. Let's just report -1 for all Kthreads. How about something like below. This should work with and without CONFIG_X86_DEBUG_FPU. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9aa9ac8399ae..10c3994295f2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1855,19 +1855,18 @@ long fpu_xstate_prctl(int option, unsigned long arg2) #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 - * use in the task. + * use in the task. Report -1 if no AVX512 usage. 
*/ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); - long delta; + unsigned long timestamp = 0; + long delta = -1; - if (!timestamp) { - /* - * Report -1 if no AVX512 usage - */ - delta = -1; - } else { + /* Do not report AVX512 usage for kernel threads */ + if (!(task->flags & (PF_KTHREAD | PF_USER_WORKER))) + timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); + + if (timestamp) { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX
On 7/18/25 16:48, Sohil Mehta wrote: > + /* Do not report AVX512 usage for kernel threads */ > + if (!(task->flags & (PF_KTHREAD | PF_USER_WORKER))) > + timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); But the original reason that folks wanted this was so they can go find all the AVX-512 users and cluster them together. They obviously can't do that today if they're oopsing their kernels. But the real question to ask here is whether kernel threads can use AVX-512, and whether it's important to let userspace know which threads are using it. Let's fix the oops, then circle around and figure out whether tracking AVX-512 use in kernel threads is needed.
On Mon, Jul 21, 2025 at 07:09:52AM -0700, Dave Hansen wrote: > On 7/18/25 16:48, Sohil Mehta wrote: > > + /* Do not report AVX512 usage for kernel threads */ > > + if (!(task->flags & (PF_KTHREAD | PF_USER_WORKER))) > > + timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); > > But the original reason that folks wanted this was so they can go find > all the AVX-512 users and cluster them together. They obviously can't do > that today if they're oopsing their kernels. > > But the real question to ask here is whether kernel threads can use > AVX-512, and whether it's important to let userspace know which threads > are using it. > > Let's fix the oops, then circle around and figure out whether tracking > AVX-512 use in kernel threads is needed. The RAID6 Q-stripe calculation has an AVX512 implementation which is usually called from workqueue context.
© 2016 - 2025 Red Hat, Inc.