[PATCH] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race

Posted by Qiliang Yuan 2 weeks, 2 days ago
During the early initialization of the hardlockup detector, the
hardlockup_detector_perf_init() function probes for PMU hardware availability.
It originally used hardlockup_detector_event_create(), which interacts with
the per-cpu 'watchdog_ev' variable.

If the initializing task migrates to another CPU during this probe phase,
two issues arise:
1. The 'watchdog_ev' pointer on the original CPU is set but not cleared,
   leaving a stale pointer to a freed perf event.
2. The 'watchdog_ev' pointer on the new CPU might be incorrectly cleared.

This race condition was observed in console logs (captured by adding debug printks):

[23.038376] hardlockup_detector_perf_init 313 cur_cpu=2
...
[23.076385] hardlockup_detector_event_create 203 cpu(cur)=2 set watchdog_ev
...
[23.095788] perf_event_release_kernel 4623 cur_cpu=2
...
[23.116963] lockup_detector_reconfigure 577 cur_cpu=3

The log shows the task started on CPU 2, set watchdog_ev on CPU 2,
released the event on CPU 2, but then migrated to CPU 3 before the
cleanup logic (which would clear watchdog_ev) could run. This left
watchdog_ev on CPU 2 pointing to a freed event.
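
For reference, here is the pre-fix probe sequence (condensed from the code
removed below, error path omitted), annotated with the CPUs from the log above:

	/* Ran on CPU 2: creates the event and stores it in CPU 2's watchdog_ev */
	ret = hardlockup_detector_event_create();

	/* Still on CPU 2: frees the event; CPU 2's watchdog_ev still points at it */
	perf_event_release_kernel(this_cpu_read(watchdog_ev));

	/* <-- task migrates from CPU 2 to CPU 3 --> */

	/* Now on CPU 3: clears CPU 3's watchdog_ev; CPU 2 keeps the stale pointer */
	this_cpu_write(watchdog_ev, NULL);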

Later, when the watchdog is enabled/disabled on CPU 2, this stale pointer
leads to a Use-After-Free (UAF) in perf_event_disable(), as detected by KASAN:
[26.539140] ==================================================================
[26.540732] BUG: KASAN: use-after-free in perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.542442] Read of size 8 at addr ff110006b360d718 by task kworker/2:1/94
[26.543954]
[26.544744] CPU: 2 PID: 94 Comm: kworker/2:1 Not tainted 4.19.90-debugkasan #11
[26.546505] Hardware name: GoStack Foundation OpenStack Nova, BIOS 1.16.3-3.ctl3 04/01/2014
[26.548256] Workqueue: events smp_call_on_cpu_callback
[26.549267] Call Trace:
[26.549936]  dump_stack+0x8b/0xbb
[26.550731]  print_address_description+0x6a/0x270
[26.551688]  kasan_report+0x179/0x2c0
[26.552519]  ? perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.553654]  ? watchdog_disable+0x80/0x80
[26.553657]  perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.556951]  ? dump_stack+0xa0/0xbb
[26.564006]  ? watchdog_disable+0x80/0x80
[26.564886]  perf_event_disable+0xa/0x30
[26.565746]  hardlockup_detector_perf_disable+0x1b/0x60
[26.566776]  watchdog_disable+0x51/0x80
[26.567624]  softlockup_stop_fn+0x11/0x20
[26.568499]  smp_call_on_cpu_callback+0x5b/0xb0
[26.569443]  process_one_work+0x389/0x770
[26.570311]  worker_thread+0x57/0x5a0
[26.571124]  ? process_one_work+0x770/0x770
[26.572031]  kthread+0x1ae/0x1d0
[26.572810]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[26.573821]  ret_from_fork+0x1f/0x40
[26.574638]
[26.575178] Allocated by task 1:
[26.575990]  kasan_kmalloc+0xa0/0xd0
[26.576814]  kmem_cache_alloc_trace+0xf3/0x1e0
[26.577732]  perf_event_alloc.part.89+0xb5/0x12b0
[26.578700]  perf_event_create_kernel_counter+0x1e/0x1d0
[26.579728]  hardlockup_detector_event_create+0x4e/0xc0
[26.580744]  hardlockup_detector_perf_init+0x2f/0x60
[26.581746]  lockup_detector_init+0x85/0xdc
[26.582645]  kernel_init_freeable+0x34d/0x40e
[26.583568]  kernel_init+0xf/0x130
[26.584428]  ret_from_fork+0x1f/0x40
[26.584429]
[26.584430] Freed by task 0:
[26.584433]  __kasan_slab_free+0x130/0x180
[26.584436]  kfree+0x90/0x1a0
[26.589641]  rcu_process_callbacks+0x2cb/0x6e0
[26.590935]  __do_softirq+0x119/0x3a2
[26.591965]
[26.592630] The buggy address belongs to the object at ff110006b360d500
[26.592630]  which belongs to the cache kmalloc-2048 of size 2048
[26.592633] The buggy address is located 536 bytes inside of
[26.592633]  2048-byte region [ff110006b360d500, ff110006b360dd00)
[26.592634] The buggy address belongs to the page:
[26.592637] page:ffd400001acd8200 count:1 mapcount:0 mapping:ff11000107c0e800 index:0x0 compound_mapcount: 0
[26.600959] flags: 0x17ffffc0010200(slab|head)
[26.601891] raw: 0017ffffc0010200 dead000000000100 dead000000000200 ff11000107c0e800
[26.603541] raw: 0000000000000000 00000000800f000f 00000001ffffffff 0000000000000000
[26.605546] page dumped because: kasan: bad access detected
[26.606788]
[26.607351] Memory state around the buggy address:
[26.608556]  ff110006b360d600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610565]  ff110006b360d680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610567] >ff110006b360d700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610568]                             ^
[26.610570]  ff110006b360d780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610573]  ff110006b360d800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.618955] ==================================================================

Fix this by making the probe logic stateless. Use a local variable for the
perf event and avoid accessing the per-cpu 'watchdog_ev' during initialization.
This ensures that the probe event is always properly released regardless of
task migration, and no stale global state is left behind.

Signed-off-by: Shouxin Sun <sunshx@chinatelecom.cn>
Signed-off-by: Junnan Zhang <zhangjn11@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
---
 kernel/watchdog_perf.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index d3ca70e3c256..5066be7bba03 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -264,18 +264,38 @@ bool __weak __init arch_perf_nmi_is_available(void)
 int __init watchdog_hardlockup_probe(void)
 {
 	int ret;
+	struct perf_event_attr *wd_attr = &wd_hw_attr;
+	struct perf_event *evt;
+	unsigned int cpu;
 
 	if (!arch_perf_nmi_is_available())
 		return -ENODEV;
 
-	ret = hardlockup_detector_event_create();
+	/*
+	 * Test hardware PMU availability. Avoid using
+	 * hardlockup_detector_event_create() to prevent migration-related
+	 * stale pointers in the per-cpu watchdog_ev during early probe.
+	 */
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	if (!wd_attr->sample_period)
+		return -EINVAL;
 
-	if (ret) {
+	/*
+	 * Use raw_smp_processor_id() for probing in preemptible init code.
+	 * Migration after reading ID is acceptable as counter creation on
+	 * the old CPU is sufficient for the probe.
+	 */
+	cpu = raw_smp_processor_id();
+	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+					       watchdog_overflow_callback, NULL);
+	if (IS_ERR(evt)) {
 		pr_info("Perf NMI watchdog permanently disabled\n");
+		ret = PTR_ERR(evt);
 	} else {
-		perf_event_release_kernel(this_cpu_read(watchdog_ev));
-		this_cpu_write(watchdog_ev, NULL);
+		perf_event_release_kernel(evt);
+		ret = 0;
 	}
+
 	return ret;
 }
 
-- 
2.51.0
[PATCH v2] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Qiliang Yuan 2 weeks, 2 days ago
During the early initialization of the hardlockup detector, the
hardlockup_detector_perf_init() function probes for PMU hardware availability.
It originally used hardlockup_detector_event_create(), which interacts with
the per-cpu 'watchdog_ev' variable.

If the initializing task migrates to another CPU during this probe phase,
two issues arise:
1. The 'watchdog_ev' pointer on the original CPU is set but not cleared,
   leaving a stale pointer to a freed perf event.
2. The 'watchdog_ev' pointer on the new CPU might be incorrectly cleared.

This race condition was observed in console logs (captured by adding debug printks):

[23.038376] hardlockup_detector_perf_init 313 cur_cpu=2
...
[23.076385] hardlockup_detector_event_create 203 cpu(cur)=2 set watchdog_ev
...
[23.095788] perf_event_release_kernel 4623 cur_cpu=2
...
[23.116963] lockup_detector_reconfigure 577 cur_cpu=3

The log shows the task started on CPU 2, set watchdog_ev on CPU 2,
released the event on CPU 2, but then migrated to CPU 3 before the
cleanup logic (which would clear watchdog_ev) could run. This left
watchdog_ev on CPU 2 pointing to a freed event.

Later, when the watchdog is enabled/disabled on CPU 2, this stale pointer
leads to a Use-After-Free (UAF) in perf_event_disable(), as detected by KASAN:
[26.539140] ==================================================================
[26.540732] BUG: KASAN: use-after-free in perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.542442] Read of size 8 at addr ff110006b360d718 by task kworker/2:1/94
[26.543954]
[26.544744] CPU: 2 PID: 94 Comm: kworker/2:1 Not tainted 4.19.90-debugkasan #11
[26.546505] Hardware name: GoStack Foundation OpenStack Nova, BIOS 1.16.3-3.ctl3 04/01/2014
[26.548256] Workqueue: events smp_call_on_cpu_callback
[26.549267] Call Trace:
[26.549936]  dump_stack+0x8b/0xbb
[26.550731]  print_address_description+0x6a/0x270
[26.551688]  kasan_report+0x179/0x2c0
[26.552519]  ? perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.553654]  ? watchdog_disable+0x80/0x80
[26.553657]  perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.556951]  ? dump_stack+0xa0/0xbb
[26.564006]  ? watchdog_disable+0x80/0x80
[26.564886]  perf_event_disable+0xa/0x30
[26.565746]  hardlockup_detector_perf_disable+0x1b/0x60
[26.566776]  watchdog_disable+0x51/0x80
[26.567624]  softlockup_stop_fn+0x11/0x20
[26.568499]  smp_call_on_cpu_callback+0x5b/0xb0
[26.569443]  process_one_work+0x389/0x770
[26.570311]  worker_thread+0x57/0x5a0
[26.571124]  ? process_one_work+0x770/0x770
[26.572031]  kthread+0x1ae/0x1d0
[26.572810]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[26.573821]  ret_from_fork+0x1f/0x40
[26.574638]
[26.575178] Allocated by task 1:
[26.575990]  kasan_kmalloc+0xa0/0xd0
[26.576814]  kmem_cache_alloc_trace+0xf3/0x1e0
[26.577732]  perf_event_alloc.part.89+0xb5/0x12b0
[26.578700]  perf_event_create_kernel_counter+0x1e/0x1d0
[26.579728]  hardlockup_detector_event_create+0x4e/0xc0
[26.580744]  hardlockup_detector_perf_init+0x2f/0x60
[26.581746]  lockup_detector_init+0x85/0xdc
[26.582645]  kernel_init_freeable+0x34d/0x40e
[26.583568]  kernel_init+0xf/0x130
[26.584428]  ret_from_fork+0x1f/0x40
[26.584429]
[26.584430] Freed by task 0:
[26.584433]  __kasan_slab_free+0x130/0x180
[26.584436]  kfree+0x90/0x1a0
[26.589641]  rcu_process_callbacks+0x2cb/0x6e0
[26.590935]  __do_softirq+0x119/0x3a2
[26.591965]
[26.592630] The buggy address belongs to the object at ff110006b360d500
[26.592630]  which belongs to the cache kmalloc-2048 of size 2048
[26.592633] The buggy address is located 536 bytes inside of
[26.592633]  2048-byte region [ff110006b360d500, ff110006b360dd00)
[26.592634] The buggy address belongs to the page:
[26.592637] page:ffd400001acd8200 count:1 mapcount:0 mapping:ff11000107c0e800 index:0x0 compound_mapcount: 0
[26.600959] flags: 0x17ffffc0010200(slab|head)
[26.601891] raw: 0017ffffc0010200 dead000000000100 dead000000000200 ff11000107c0e800
[26.603541] raw: 0000000000000000 00000000800f000f 00000001ffffffff 0000000000000000
[26.605546] page dumped because: kasan: bad access detected
[26.606788]
[26.607351] Memory state around the buggy address:
[26.608556]  ff110006b360d600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610565]  ff110006b360d680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610567] >ff110006b360d700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610568]                             ^
[26.610570]  ff110006b360d780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610573]  ff110006b360d800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.618955] ==================================================================

Fix this by making the probe logic stateless. Use a local variable for the
perf event and avoid accessing the per-cpu 'watchdog_ev' during initialization.
This ensures that the probe event is always properly released regardless of
task migration, and no stale global state is left behind.

Cc: stable@vger.kernel.org
Signed-off-by: Shouxin Sun <sunshx@chinatelecom.cn>
Signed-off-by: Junnan Zhang <zhangjn11@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
---
v2:
- Add Cc: stable@vger.kernel.org tag.
---
 kernel/watchdog_perf.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index d3ca70e3c256..5066be7bba03 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -264,18 +264,38 @@ bool __weak __init arch_perf_nmi_is_available(void)
 int __init watchdog_hardlockup_probe(void)
 {
 	int ret;
+	struct perf_event_attr *wd_attr = &wd_hw_attr;
+	struct perf_event *evt;
+	unsigned int cpu;
 
 	if (!arch_perf_nmi_is_available())
 		return -ENODEV;
 
-	ret = hardlockup_detector_event_create();
+	/*
+	 * Test hardware PMU availability. Avoid using
+	 * hardlockup_detector_event_create() to prevent migration-related
+	 * stale pointers in the per-cpu watchdog_ev during early probe.
+	 */
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	if (!wd_attr->sample_period)
+		return -EINVAL;
 
-	if (ret) {
+	/*
+	 * Use raw_smp_processor_id() for probing in preemptible init code.
+	 * Migration after reading ID is acceptable as counter creation on
+	 * the old CPU is sufficient for the probe.
+	 */
+	cpu = raw_smp_processor_id();
+	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+					       watchdog_overflow_callback, NULL);
+	if (IS_ERR(evt)) {
 		pr_info("Perf NMI watchdog permanently disabled\n");
+		ret = PTR_ERR(evt);
 	} else {
-		perf_event_release_kernel(this_cpu_read(watchdog_ev));
-		this_cpu_write(watchdog_ev, NULL);
+		perf_event_release_kernel(evt);
+		ret = 0;
 	}
+
 	return ret;
 }
 
-- 
2.51.0
Re: [PATCH v2] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Andrew Morton 2 weeks, 1 day ago
On Thu, 22 Jan 2026 00:24:42 -0500 Qiliang Yuan <realwujing@gmail.com> wrote:

> During the early initialization of the hardlockup detector, the
> hardlockup_detector_perf_init() function probes for PMU hardware availability.
> It originally used hardlockup_detector_event_create(), which interacts with
> the per-cpu 'watchdog_ev' variable.

Thanks.

For a -stable backport it's desirable to have a Fixes: target.  But it
appears this is very old code?

Also, I'm not sure who best to ask to help review this change.  I'll
add a few cc's here.

> If the initializing task migrates to another CPU during this probe phase,
> two issues arise:
> 1. The 'watchdog_ev' pointer on the original CPU is set but not cleared,
>    leaving a stale pointer to a freed perf event.
> 2. The 'watchdog_ev' pointer on the new CPU might be incorrectly cleared.
> 
> This race condition was observed in console logs (captured by adding debug printks):
> 
> [23.038376] hardlockup_detector_perf_init 313 cur_cpu=2
> ...
> [23.076385] hardlockup_detector_event_create 203 cpu(cur)=2 set watchdog_ev
> ...
> [23.095788] perf_event_release_kernel 4623 cur_cpu=2
> ...
> [23.116963] lockup_detector_reconfigure 577 cur_cpu=3
> 
> The log shows the task started on CPU 2, set watchdog_ev on CPU 2,
> released the event on CPU 2, but then migrated to CPU 3 before the
> cleanup logic (which would clear watchdog_ev) could run. This left
> watchdog_ev on CPU 2 pointing to a freed event.
> 
> Later, when the watchdog is enabled/disabled on CPU 2, this stale pointer
> leads to a Use-After-Free (UAF) in perf_event_disable(), as detected by KASAN:
> [26.539140] ==================================================================
> [26.540732] BUG: KASAN: use-after-free in perf_event_ctx_lock_nested.isra.72+0x6b/0x140
> [26.542442] Read of size 8 at addr ff110006b360d718 by task kworker/2:1/94
> [26.543954]
> [26.544744] CPU: 2 PID: 94 Comm: kworker/2:1 Not tainted 4.19.90-debugkasan #11
> [26.546505] Hardware name: GoStack Foundation OpenStack Nova, BIOS 1.16.3-3.ctl3 04/01/2014
> [26.548256] Workqueue: events smp_call_on_cpu_callback
> [26.549267] Call Trace:
> [26.549936]  dump_stack+0x8b/0xbb
> [26.550731]  print_address_description+0x6a/0x270
> [26.551688]  kasan_report+0x179/0x2c0
> [26.552519]  ? perf_event_ctx_lock_nested.isra.72+0x6b/0x140
> [26.553654]  ? watchdog_disable+0x80/0x80
> [26.553657]  perf_event_ctx_lock_nested.isra.72+0x6b/0x140
> [26.556951]  ? dump_stack+0xa0/0xbb
> [26.564006]  ? watchdog_disable+0x80/0x80
> [26.564886]  perf_event_disable+0xa/0x30
> [26.565746]  hardlockup_detector_perf_disable+0x1b/0x60
> [26.566776]  watchdog_disable+0x51/0x80
> [26.567624]  softlockup_stop_fn+0x11/0x20
> [26.568499]  smp_call_on_cpu_callback+0x5b/0xb0
> [26.569443]  process_one_work+0x389/0x770
> [26.570311]  worker_thread+0x57/0x5a0
> [26.571124]  ? process_one_work+0x770/0x770
> [26.572031]  kthread+0x1ae/0x1d0
> [26.572810]  ? kthread_create_worker_on_cpu+0xc0/0xc0
> [26.573821]  ret_from_fork+0x1f/0x40
> [26.574638]
> [26.575178] Allocated by task 1:
> [26.575990]  kasan_kmalloc+0xa0/0xd0
> [26.576814]  kmem_cache_alloc_trace+0xf3/0x1e0
> [26.577732]  perf_event_alloc.part.89+0xb5/0x12b0
> [26.578700]  perf_event_create_kernel_counter+0x1e/0x1d0
> [26.579728]  hardlockup_detector_event_create+0x4e/0xc0
> [26.580744]  hardlockup_detector_perf_init+0x2f/0x60
> [26.581746]  lockup_detector_init+0x85/0xdc
> [26.582645]  kernel_init_freeable+0x34d/0x40e
> [26.583568]  kernel_init+0xf/0x130
> [26.584428]  ret_from_fork+0x1f/0x40
> [26.584429]
> [26.584430] Freed by task 0:
> [26.584433]  __kasan_slab_free+0x130/0x180
> [26.584436]  kfree+0x90/0x1a0
> [26.589641]  rcu_process_callbacks+0x2cb/0x6e0
> [26.590935]  __do_softirq+0x119/0x3a2
> [26.591965]
> [26.592630] The buggy address belongs to the object at ff110006b360d500
> [26.592630]  which belongs to the cache kmalloc-2048 of size 2048
> [26.592633] The buggy address is located 536 bytes inside of
> [26.592633]  2048-byte region [ff110006b360d500, ff110006b360dd00)
> [26.592634] The buggy address belongs to the page:
> [26.592637] page:ffd400001acd8200 count:1 mapcount:0 mapping:ff11000107c0e800 index:0x0 compound_mapcount: 0
> [26.600959] flags: 0x17ffffc0010200(slab|head)
> [26.601891] raw: 0017ffffc0010200 dead000000000100 dead000000000200 ff11000107c0e800
> [26.603541] raw: 0000000000000000 00000000800f000f 00000001ffffffff 0000000000000000
> [26.605546] page dumped because: kasan: bad access detected
> [26.606788]
> [26.607351] Memory state around the buggy address:
> [26.608556]  ff110006b360d600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [26.610565]  ff110006b360d680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [26.610567] >ff110006b360d700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [26.610568]                             ^
> [26.610570]  ff110006b360d780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [26.610573]  ff110006b360d800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [26.618955] ==================================================================
> 
> Fix this by making the probe logic stateless. Use a local variable for the
> perf event and avoid accessing the per-cpu 'watchdog_ev' during initialization.
> This ensures that the probe event is always properly released regardless of
> task migration, and no stale global state is left behind.
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Shouxin Sun <sunshx@chinatelecom.cn>
> Signed-off-by: Junnan Zhang <zhangjn11@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> ---
> v2:
> - Add Cc: stable@vger.kernel.org tag.
> ---
>  kernel/watchdog_perf.c | 28 ++++++++++++++++++++++++----
>  1 file changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
> index d3ca70e3c256..5066be7bba03 100644
> --- a/kernel/watchdog_perf.c
> +++ b/kernel/watchdog_perf.c
> @@ -264,18 +264,38 @@ bool __weak __init arch_perf_nmi_is_available(void)
>  int __init watchdog_hardlockup_probe(void)
>  {
>  	int ret;
> +	struct perf_event_attr *wd_attr = &wd_hw_attr;
> +	struct perf_event *evt;
> +	unsigned int cpu;
>  
>  	if (!arch_perf_nmi_is_available())
>  		return -ENODEV;
>  
> -	ret = hardlockup_detector_event_create();
> +	/*
> +	 * Test hardware PMU availability. Avoid using
> +	 * hardlockup_detector_event_create() to prevent migration-related
> +	 * stale pointers in the per-cpu watchdog_ev during early probe.
> +	 */
> +	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
> +	if (!wd_attr->sample_period)
> +		return -EINVAL;
>  
> -	if (ret) {
> +	/*
> +	 * Use raw_smp_processor_id() for probing in preemptible init code.
> +	 * Migration after reading ID is acceptable as counter creation on
> +	 * the old CPU is sufficient for the probe.
> +	 */
> +	cpu = raw_smp_processor_id();
> +	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
> +					       watchdog_overflow_callback, NULL);
> +	if (IS_ERR(evt)) {
>  		pr_info("Perf NMI watchdog permanently disabled\n");
> +		ret = PTR_ERR(evt);
>  	} else {
> -		perf_event_release_kernel(this_cpu_read(watchdog_ev));
> -		this_cpu_write(watchdog_ev, NULL);
> +		perf_event_release_kernel(evt);
> +		ret = 0;
>  	}
> +
>  	return ret;
>  }
>  
> -- 
> 2.51.0
Re: [PATCH v2] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Doug Anderson 2 weeks, 1 day ago
Hi,

On Thu, Jan 22, 2026 at 1:59 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu, 22 Jan 2026 00:24:42 -0500 Qiliang Yuan <realwujing@gmail.com> wrote:
>
> > During the early initialization of the hardlockup detector, the
> > hardlockup_detector_perf_init() function probes for PMU hardware availability.
> > It originally used hardlockup_detector_event_create(), which interacts with
> > the per-cpu 'watchdog_ev' variable.
>
> Thanks.
>
> For a -stable backport it's desirable to have a Fixes: target.  But it
> appears this is very old code?
>
> Also, I'm not sure who best to ask to help review this change.  I'll
> add a few cc's here.

I'm nowhere near an expert on the perf system or the perf-specific
bits of the hardlockup detector, but I took a quick look...

I guess my first question is: why didn't the
"WARN_ON(!is_percpu_thread());" in hardlockup_detector_event_create()
hit in this case?

I guess my second question is: your new code doesn't seem to use
"fallback_wd_hw_attr" if there is an error. Is that important?

My last thought is: why not just move the "this_cpu_write(watchdog_ev,
evt)" out of hardlockup_detector_event_create() and into
watchdog_hardlockup_enable()? You can just return evt from
hardlockup_detector_event_create(), right? Then you can keep using
hardlockup_detector_event_create() and share the code...
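In other words, something roughly like this (totally untested, just to
illustrate the shape I mean):

	void watchdog_hardlockup_enable(unsigned int cpu)
	{
		/* event_create() now just returns evt (or an ERR_PTR) and no
		 * longer touches the per-cpu watchdog_ev itself */
		struct perf_event *evt = hardlockup_detector_event_create();

		if (IS_ERR(evt))
			return;

		/* counter bookkeeping / pr_info omitted here */
		this_cpu_write(watchdog_ev, evt);
		watchdog_init_timestamp();
		perf_event_enable(evt);
	}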

Full disclosure: I don't know this code and I looked at it quickly. If
something I said sounds stupid, please call me out on it.


-Doug
[PATCH v3] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Qiliang Yuan 2 weeks, 1 day ago
During the early initialization of the hardlockup detector, the
hardlockup_detector_perf_init() function probes for PMU hardware availability.
It originally used hardlockup_detector_event_create(), which interacts with
the per-cpu 'watchdog_ev' variable.

If the initializing task migrates to another CPU during this probe phase,
two issues arise:
1. The 'watchdog_ev' pointer on the original CPU is set but not cleared,
   leaving a stale pointer to a freed perf event.
2. The 'watchdog_ev' pointer on the new CPU might be incorrectly cleared.

This race condition was observed in console logs (captured by adding debug printks):

[23.038376] hardlockup_detector_perf_init 313 cur_cpu=2
...
[23.076385] hardlockup_detector_event_create 203 cpu(cur)=2 set watchdog_ev
...
[23.095788] perf_event_release_kernel 4623 cur_cpu=2
...
[23.116963] lockup_detector_reconfigure 577 cur_cpu=3

The log shows the task started on CPU 2, set watchdog_ev on CPU 2,
released the event on CPU 2, but then migrated to CPU 3 before the
cleanup logic (which would clear watchdog_ev) could run. This left
watchdog_ev on CPU 2 pointing to a freed event.

Later, when the watchdog is enabled/disabled on CPU 2, this stale pointer
leads to a Use-After-Free (UAF) in perf_event_disable(), as detected by KASAN:
[26.539140] ==================================================================
[26.540732] BUG: KASAN: use-after-free in perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.542442] Read of size 8 at addr ff110006b360d718 by task kworker/2:1/94
[26.543954]
[26.544744] CPU: 2 PID: 94 Comm: kworker/2:1 Not tainted 4.19.90-debugkasan #11
[26.546505] Hardware name: GoStack Foundation OpenStack Nova, BIOS 1.16.3-3.ctl3 04/01/2014
[26.548256] Workqueue: events smp_call_on_cpu_callback
[26.549267] Call Trace:
[26.549936]  dump_stack+0x8b/0xbb
[26.550731]  print_address_description+0x6a/0x270
[26.551688]  kasan_report+0x179/0x2c0
[26.552519]  ? perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.553654]  ? watchdog_disable+0x80/0x80
[26.553657]  perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.556951]  ? dump_stack+0xa0/0xbb
[26.564006]  ? watchdog_disable+0x80/0x80
[26.564886]  perf_event_disable+0xa/0x30
[26.565746]  hardlockup_detector_perf_disable+0x1b/0x60
[26.566776]  watchdog_disable+0x51/0x80
[26.567624]  softlockup_stop_fn+0x11/0x20
[26.568499]  smp_call_on_cpu_callback+0x5b/0xb0
[26.569443]  process_one_work+0x389/0x770
[26.570311]  worker_thread+0x57/0x5a0
[26.571124]  ? process_one_work+0x770/0x770
[26.572031]  kthread+0x1ae/0x1d0
[26.572810]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[26.573821]  ret_from_fork+0x1f/0x40
[26.574638]
[26.575178] Allocated by task 1:
[26.575990]  kasan_kmalloc+0xa0/0xd0
[26.576814]  kmem_cache_alloc_trace+0xf3/0x1e0
[26.577732]  perf_event_alloc.part.89+0xb5/0x12b0
[26.578700]  perf_event_create_kernel_counter+0x1e/0x1d0
[26.579728]  hardlockup_detector_event_create+0x4e/0xc0
[26.580744]  hardlockup_detector_perf_init+0x2f/0x60
[26.581746]  lockup_detector_init+0x85/0xdc
[26.582645]  kernel_init_freeable+0x34d/0x40e
[26.583568]  kernel_init+0xf/0x130
[26.584428]  ret_from_fork+0x1f/0x40
[26.584429]
[26.584430] Freed by task 0:
[26.584433]  __kasan_slab_free+0x130/0x180
[26.584436]  kfree+0x90/0x1a0
[26.589641]  rcu_process_callbacks+0x2cb/0x6e0
[26.590935]  __do_softirq+0x119/0x3a2
[26.591965]
[26.592630] The buggy address belongs to the object at ff110006b360d500
[26.592630]  which belongs to the cache kmalloc-2048 of size 2048
[26.592633] The buggy address is located 536 bytes inside of
[26.592633]  2048-byte region [ff110006b360d500, ff110006b360dd00)
[26.592634] The buggy address belongs to the page:
[26.592637] page:ffd400001acd8200 count:1 mapcount:0 mapping:ff11000107c0e800 index:0x0 compound_mapcount: 0
[26.600959] flags: 0x17ffffc0010200(slab|head)
[26.601891] raw: 0017ffffc0010200 dead000000000100 dead000000000200 ff11000107c0e800
[26.603541] raw: 0000000000000000 00000000800f000f 00000001ffffffff 0000000000000000
[26.605546] page dumped because: kasan: bad access detected
[26.606788]
[26.607351] Memory state around the buggy address:
[26.608556]  ff110006b360d600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610565]  ff110006b360d680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610567] >ff110006b360d700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610568]                             ^
[26.610570]  ff110006b360d780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.610573]  ff110006b360d800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[26.618955] ==================================================================

Fix this by refactoring hardlockup_detector_event_create() to return the
created perf event instead of directly assigning it to the per-cpu variable.
This allows the probe logic to reuse the creation code (including fallback
logic) without affecting the global state, ensuring that task migration
during probe no longer leaves stale pointers in 'watchdog_ev'.

Signed-off-by: Shouxin Sun <sunshx@chinatelecom.cn>
Signed-off-by: Junnan Zhang <zhangjn11@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
Cc: Song Liu <song@kernel.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Jinchao Wang <wangjinchao600@gmail.com>
Cc: Wang Jinchao <wangjinchao600@gmail.com>
Cc: <stable@vger.kernel.org>
---
v3: Refactor creation logic to return the event pointer; restore the PMU cycle fallback and unify the probe/enable paths.
v2: Add Cc: <stable@vger.kernel.org>.
v1: Avoid 'watchdog_ev' in probe path by manually creating and releasing a local perf event.

 kernel/watchdog_perf.c | 51 ++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index d3ca70e3c256..d045b92bc514 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -118,18 +118,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	watchdog_hardlockup_check(smp_processor_id(), regs);
 }
 
-static int hardlockup_detector_event_create(void)
+static struct perf_event *hardlockup_detector_event_create(unsigned int cpu)
 {
-	unsigned int cpu;
 	struct perf_event_attr *wd_attr;
 	struct perf_event *evt;
 
-	/*
-	 * Preemption is not disabled because memory will be allocated.
-	 * Ensure CPU-locality by calling this in per-CPU kthread.
-	 */
-	WARN_ON(!is_percpu_thread());
-	cpu = raw_smp_processor_id();
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
 
@@ -143,14 +136,7 @@ static int hardlockup_detector_event_create(void)
 						       watchdog_overflow_callback, NULL);
 	}
 
-	if (IS_ERR(evt)) {
-		pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
-			 PTR_ERR(evt));
-		return PTR_ERR(evt);
-	}
-	WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
-	this_cpu_write(watchdog_ev, evt);
-	return 0;
+	return evt;
 }
 
 /**
@@ -159,17 +145,26 @@ static int hardlockup_detector_event_create(void)
  */
 void watchdog_hardlockup_enable(unsigned int cpu)
 {
+	struct perf_event *evt;
+
 	WARN_ON_ONCE(cpu != smp_processor_id());
 
-	if (hardlockup_detector_event_create())
+	evt = hardlockup_detector_event_create(cpu);
+	if (IS_ERR(evt)) {
+		pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
+			 PTR_ERR(evt));
 		return;
+	}
 
 	/* use original value for check */
 	if (!atomic_fetch_inc(&watchdog_cpus))
 		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
 
+	WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
+	this_cpu_write(watchdog_ev, evt);
+
 	watchdog_init_timestamp();
-	perf_event_enable(this_cpu_read(watchdog_ev));
+	perf_event_enable(evt);
 }
 
 /**
@@ -263,19 +258,31 @@ bool __weak __init arch_perf_nmi_is_available(void)
  */
 int __init watchdog_hardlockup_probe(void)
 {
+	struct perf_event *evt;
+	unsigned int cpu;
 	int ret;
 
 	if (!arch_perf_nmi_is_available())
 		return -ENODEV;
 
-	ret = hardlockup_detector_event_create();
+	if (!hw_nmi_get_sample_period(watchdog_thresh))
+		return -EINVAL;
 
-	if (ret) {
+	/*
+	 * Test hardware PMU availability by creating a temporary perf event.
+	 * Allow migration during the check as any successfully created per-cpu
+	 * event validates PMU support. The event is released immediately.
+	 */
+	cpu = raw_smp_processor_id();
+	evt = hardlockup_detector_event_create(cpu);
+	if (IS_ERR(evt)) {
 		pr_info("Perf NMI watchdog permanently disabled\n");
+		ret = PTR_ERR(evt);
 	} else {
-		perf_event_release_kernel(this_cpu_read(watchdog_ev));
-		this_cpu_write(watchdog_ev, NULL);
+		perf_event_release_kernel(evt);
+		ret = 0;
 	}
+
 	return ret;
 }
 
-- 
2.51.0
Re: [PATCH v3] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Doug Anderson 2 weeks ago
Hi,

On Thu, Jan 22, 2026 at 10:34 PM Qiliang Yuan <realwujing@gmail.com> wrote:
>
> During the early initialization of the hardlockup detector, the
> hardlockup_detector_perf_init() function probes for PMU hardware availability.
> It originally used hardlockup_detector_event_create(), which interacts with
> the per-cpu 'watchdog_ev' variable.
>
> If the initializing task migrates to another CPU during this probe phase,
> two issues arise:
> 1. The 'watchdog_ev' pointer on the original CPU is set but not cleared,
>    leaving a stale pointer to a freed perf event.
> 2. The 'watchdog_ev' pointer on the new CPU might be incorrectly cleared.
>
> This race condition was observed in console logs (captured by adding debug printks):
>
> [23.038376] hardlockup_detector_perf_init 313 cur_cpu=2

Wait a second... The above function hasn't existed for 2.5 years. It
was removed in commit d9b3629ade8e ("watchdog/hardlockup: have the
perf hardlockup use __weak functions more cleanly"). All that's left
in the ToT kernel referencing that function is an old comment...

Oh, and I guess I can see below that your stack traces are on 4.19,
which is ancient! Things have changed a bit in the meantime. Are you
certain that the problem still reproduces on ToT?


> Signed-off-by: Shouxin Sun <sunshx@chinatelecom.cn>
> Signed-off-by: Junnan Zhang <zhangjn11@chinatelecom.cn>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> Cc: Song Liu <song@kernel.org>
> Cc: Douglas Anderson <dianders@chromium.org>
> Cc: Jinchao Wang <wangjinchao600@gmail.com>
> Cc: Wang Jinchao <wangjinchao600@gmail.com>
> Cc: <stable@vger.kernel.org>

Probably want a "Fixes" tag? If I had to guess, maybe?

Fixes: 930d8f8dbab9 ("watchdog/perf: adapt the watchdog_perf interface
for async model")

Why? I think before that the init function could only be called
directly from the kernel init code and before smp_init(). After that,
a worker could call it, which is the case where preemption could have
been enabled. Does my logic sound correct?

Can you confirm that you're only seeing the problem when the retry
hits? In other words when called from lockup_detector_delay_init()?
Oh, though if you're on 4.19 then I'm not sure what to think...


> @@ -118,18 +118,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
>         watchdog_hardlockup_check(smp_processor_id(), regs);
>  }
>
> -static int hardlockup_detector_event_create(void)
> +static struct perf_event *hardlockup_detector_event_create(unsigned int cpu)
>  {
> -       unsigned int cpu;
>         struct perf_event_attr *wd_attr;
>         struct perf_event *evt;
>
> -       /*
> -        * Preemption is not disabled because memory will be allocated.
> -        * Ensure CPU-locality by calling this in per-CPU kthread.
> -        */
> -       WARN_ON(!is_percpu_thread());

I'm still a bit confused why this warning didn't trigger previously.
Do you know why?


> @@ -263,19 +258,31 @@ bool __weak __init arch_perf_nmi_is_available(void)
>   */
>  int __init watchdog_hardlockup_probe(void)
>  {
> +       struct perf_event *evt;
> +       unsigned int cpu;
>         int ret;
>
>         if (!arch_perf_nmi_is_available())
>                 return -ENODEV;
>
> -       ret = hardlockup_detector_event_create();
> +       if (!hw_nmi_get_sample_period(watchdog_thresh))
> +               return -EINVAL;
>
> -       if (ret) {
> +       /*
> +        * Test hardware PMU availability by creating a temporary perf event.
> +        * Allow migration during the check as any successfully created per-cpu
> +        * event validates PMU support. The event is released immediately.

I guess it's implied by the "Allow migration during the check", but I
might even word it more strongly and say something like "The cpu we
use here is arbitrary, so we don't disable preemption and use
raw_smp_processor_id() to get a CPU."

I guess that should be OK. Hopefully the arbitrary CPU that you pick
doesn't go offline during this function. I don't know "perf" well, but
I could imagine that it might be upset if you tried to create a perf
event for a CPU that has gone offline. I guess you could be paranoid
and surround this with cpu_hotplug_disable() / cpu_hotplug_enable()?


I guess overall thoughts: the problem you're describing does seem
real, but the fact that your reports are from an ancient 4.19 kernel
make me concerned about whether you really tested all the cases on a
new kernel...

-Doug
Re: [PATCH v3] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Qiliang Yuan 2 weeks ago
Thanks for the detailed review!

> Wait a second... The above function hasn't existed for 2.5 years. It
> was removed in commit d9b3629ade8e ("watchdog/hardlockup: have the
> perf hardlockup use __weak functions more cleanly"). All that's left
> in the ToT kernel referencing that function is an old comment...
>
> Oh, and I guess I can see below that your stack traces are on 4.19,
> which is ancient! Things have changed a bit in the meantime. Are you
> certain that the problem still reproduces on ToT?

The function hardlockup_detector_perf_init() was renamed to
watchdog_hardlockup_probe() in commit d9b3629ade8e ("watchdog/hardlockup:
have the perf hardlockup use __weak functions more cleanly").
Additionally, the source file was moved from kernel/watchdog_hld.c to
kernel/watchdog_perf.c in commit 6ea0d04211a7. The v3 commit message
inadvertently retained legacy terminology from the 4.19 kernel; this will
be updated in V4 to reflect current ToT naming.

The core logic remains the same: the race condition persists despite the
renaming and cleanup of the __weak function logic.

Regarding ToT reproducibility: while the KASAN report originated from
4.19, the underlying logic is still problematic in ToT. In
watchdog_hardlockup_probe(), the call to
hardlockup_detector_event_create() still writes to the per-cpu
watchdog_ev. Task migration between event creation and the subsequent
perf_event_release_kernel() leaves a stale pointer in the watchdog_ev of
the original CPU.

> Probably want a "Fixes" tag? If I had to guess, maybe?
>
> Fixes: 930d8f8dbab9 ("watchdog/perf: adapt the watchdog_perf interface
> for async model")

Commit 930d8f8dbab9 introduced the async initialization which allows
preemption/migration during the probe phase. This tag will be included in
V4.

> I'm still a bit confused why this warning didn't trigger previously.
> Do you know why?

In 4.19, hardlockup_detector_event_create() did not include the
WARN_ON(!is_percpu_thread()) check, which was added in later versions. In
ToT, this warning is expected to trigger if watchdog_hardlockup_probe()
is called from a non-per-cpu-bound thread (such as kernel_init). This
further justifies refactoring the creation logic to be CPU-agnostic for
probing.

> I guess it's implied by the "Allow migration during the check", but I
> might even word it more strongly and say something like "The cpu we
> use here is arbitrary, so we don't disable preemption and use
> raw_smp_processor_id() to get a CPU."
>
> I guess that should be OK. Hopefully the arbitrary CPU that you pick
> doesn't go offline during this function. I don't know "perf" well, but
> I could imagine that it might be upset if you tried to create a perf
> event for a CPU that has gone offline. I guess you could be paranoid
> and surround this with cpu_hotplug_disable() / cpu_hotplug_enable()?

The point is well-taken. While unlikely during early boot, adding
cpu_hotplug_disable() ensures robustness.

V4 will be submitted with the following changes:
1. Clarified commit message (retaining 4.19 logs while explaining the
   renaming to watchdog_hardlockup_probe).
2. Inclusion of the "Fixes" tag.
3. Addition of cpu_hotplug_disable() around the probe.
4. Refined comments.

Best regards,
Qiliang
Re: [PATCH v3] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Doug Anderson 1 week, 6 days ago
Hi,

On Fri, Jan 23, 2026 at 10:57 PM Qiliang Yuan <realwujing@gmail.com> wrote:
>
> Thanks for the detailed review!
>
> > Wait a second... The above function hasn't existed for 2.5 years. It
> > was removed in commit d9b3629ade8e ("watchdog/hardlockup: have the
> > perf hardlockup use __weak functions more cleanly"). All that's left
> > in the ToT kernel referencing that function is an old comment...
> >
> > Oh, and I guess I can see below that your stack traces are on 4.19,
> > which is ancient! Things have changed a bit in the meantime. Are you
> > certain that the problem still reproduces on ToT?
>
> The function hardlockup_detector_perf_init() was renamed to
> watchdog_hardlockup_probe() in commit d9b3629ade8e ("watchdog/hardlockup:
> have the perf hardlockup use __weak functions more cleanly").
> Additionally, the source file was moved from kernel/watchdog_hld.c to
> kernel/watchdog_perf.c in commit 6ea0d04211a7. The v3 commit message
> inadvertently retained legacy terminology from the 4.19 kernel; this will
> be updated in V4 to reflect current ToT naming.
>
> The core logic remains the same: the race condition persists despite the
> renaming and cleanup of the __weak function logic.
>
> Regarding ToT reproducibility: while the KASAN report originated from
> 4.19, the underlying logic is still problematic in ToT. In
> watchdog_hardlockup_probe(), the call to
> hardlockup_detector_event_create() still writes to the per-cpu
> watchdog_ev. Task migration between event creation and the subsequent
> perf_event_release_kernel() leaves a stale pointer in the watchdog_ev of
> the original CPU.
>
> > Probably want a "Fixes" tag? If I had to guess, maybe?
> >
> > Fixes: 930d8f8dbab9 ("watchdog/perf: adapt the watchdog_perf interface
> > for async model")
>
> Commit 930d8f8dbab9 introduced the async initialization which allows
> preemption/migration during the probe phase. This tag will be included in
> V4.

The part that doesn't make a lot of sense to me, though, is that v4.19
also doesn't have commit 930d8f8dbab9 ("watchdog/perf: adapt the
watchdog_perf interface for async model"), which is where we are
saying the problem was introduced.

...so in v4.19 I think:
* hardlockup_detector_perf_init() is only called from watchdog_nmi_probe()
* watchdog_nmi_probe() is only called from lockup_detector_init()
* lockup_detector_init() is only called from kernel_init_freeable()
right before smp_init()

Thus I'm super confused about how you could have seen the problem on
v4.19. Maybe your v4.19 kernel has some backported patches that makes
this possible?

While I'm not saying that the v4 patch you just posted is incorrect,
I'm just trying to make sure that:

1. We actually understand the problem you were seeing.

2. We are identifying the correct "Fixes" commit.


> > I'm still a bit confused why this warning didn't trigger previously.
> > Do you know why?
>
> In 4.19, hardlockup_detector_event_create() did not include the
> WARN_ON(!is_percpu_thread()) check, which was added in later versions. In
> ToT, this warning is expected to trigger if watchdog_hardlockup_probe()
> is called from a non-per-cpu-bound thread (such as kernel_init). This
> further justifies refactoring the creation logic to be CPU-agnostic for
> probing.

OK, fair enough. ...but I'm a bit curious why nobody else saw this
WARN_ON(). I'm also curious if you have tested the hardlockup detector
on newer kernels, or if all of your work has been done on 4.19. If all
your work has been done on 4.19, do we need to find someone to test
your patch on a newer kernel and make sure it works OK? If you've
tested on a newer kernel, did the hardlockup detector init from the
kernel's early-init code, or the retry code?

-Doug
Re: [PATCH v3] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Qiliang Yuan 1 week, 5 days ago
Hi Doug,

Thanks for your further questions and for digging into the 4.19 vs ToT
differences.

On Sat, 24 Jan 2026 15:36:01 Doug Anderson <dianders@chromium.org> wrote:
> The part that doesn't make a lot of sense to me, though, is that v4.19
> also doesn't have commit 930d8f8dbab9 ("watchdog/perf: adapt the
> watchdog_perf interface for async model"), which is where we are
> saying the problem was introduced.
> 
> ...so in v4.19 I think:
> * hardlockup_detector_perf_init() is only called from watchdog_nmi_probe()
> * watchdog_nmi_probe() is only called from lockup_detector_init()
> * lockup_detector_init() is only called from kernel_init_freeable()
> right before smp_init()
> 
> Thus I'm super confused about how you could have seen the problem on
> v4.19. Maybe your v4.19 kernel has some backported patches that makes
> this possible?

You caught it! Here is the context for the differences:

1. Mainline (ToT):
   - `lockup_detector_init()` is always called before `smp_init()`
     (pre-SMP phase).
   - Risk source: The asynchronous retry path (`lockup_detector_delay_init`)
     introduced by 930d8f8dbab9, which runs in a workqueue (post-SMP)
     context and triggers the UAF.

2. openEuler (4.19/5.10):
   - Local `euler inclusion` patches moved `lockup_detector_init()` after
     `do_basic_setup()` (post-SMP phase).
   - Risk source: The initial probe occurs directly in a post-SMP
     environment, exposing the race condition.

For openEuler (4.19/5.10) kernel, the call stack looks like this:
  kernel_init()
  -> kernel_init_freeable()
    -> lockup_detector_init()       <-- Called after smp_init()
      -> watchdog_nmi_probe()
        -> hardlockup_detector_perf_init()
          -> hardlockup_detector_event_create()

In mainline (ToT), the initial probe (safe) call stack is:
  kernel_init()
  -> kernel_init_freeable()
    -> lockup_detector_init()       <-- Called before smp_init()
      -> watchdog_hardlockup_probe()
        -> hardlockup_detector_event_create()

However, the asynchronous retry mechanism (commit 930d8f8dbab9) executes the
probe logic in a post-SMP, preemptible context. 

For the mainline (ToT) retry path (at risk), the call stack is:
  kworker thread
  -> process_one_work()
    -> lockup_detector_delay_init()
      -> watchdog_hardlockup_probe()
        -> hardlockup_detector_event_create()

Thus, `930d8f8dbab9` remains the correct "Fixes" target for ToT.

> OK, fair enough. ...but I'm a bit curious why nobody else saw this
> WARN_ON(). I'm also curious if you have tested the hardlockup detector
> on newer kernels, or if all of your work has been done on 4.19. If all
> your work has been done on 4.19, do we need to find someone to test
> your patch on a newer kernel and make sure it works OK? If you've
> tested on a newer kernel, did the hardlockup detector init from the
> kernel's early-init code, or the retry code?

In newer kernels, when the probe fails initially and falls
back to the retry workqueue (or even during early init if preemption is
enabled), the `WARN_ON(!is_percpu_thread())` in
`hardlockup_detector_event_create()` does indeed trigger because
`watchdog_hardlockup_probe()` is called from a non-bound context.

I have verified this patch on the openEuler 4.19 kernel. During our stress
testing, where we start dozens of VMs simultaneously to create high resource
contention, the UAF was consistently reproducible without this fix and is now
confirmed resolved.

The v4 patch addresses this by refactoring the creation logic to be stateless
and adding `cpu_hotplug_disable()` to ensure the probed CPU stays alive.

I'll wait for your further thoughts on v4:
https://lore.kernel.org/all/20260124070814.806828-1-realwujing@gmail.com/

Best regards,
Qiliang
Re: [PATCH v3] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Doug Anderson 1 week, 4 days ago
Hi,

On Sun, Jan 25, 2026 at 7:30 PM Qiliang Yuan <realwujing@gmail.com> wrote:
>
> Hi Doug,
>
> Thanks for your further questions and for digging into the 4.19 vs ToT
> differences.
>
> On Sat, 24 Jan 2026 15:36:01 Doug Anderson <dianders@chromium.org> wrote:
> > The part that doesn't make a lot of sense to me, though, is that v4.19
> > also doesn't have commit 930d8f8dbab9 ("watchdog/perf: adapt the
> > watchdog_perf interface for async model"), which is where we are
> > saying the problem was introduced.
> >
> > ...so in v4.19 I think:
> > * hardlockup_detector_perf_init() is only called from watchdog_nmi_probe()
> > * watchdog_nmi_probe() is only called from lockup_detector_init()
> > * lockup_detector_init() is only called from kernel_init_freeable()
> > right before smp_init()
> >
> > Thus I'm super confused about how you could have seen the problem on
> > v4.19. Maybe your v4.19 kernel has some backported patches that makes
> > this possible?
>
> You caught it! Here is the context for the differences:
>
> 1. Mainline (ToT):
>    - `lockup_detector_init()` is always called before `smp_init()`
>      (pre-SMP phase).
>    - Risk source: The asynchronous retry path (`lockup_detector_delay_init`)
>      introduced by 930d8f8dbab9, which runs in a workqueue (post-SMP)
>      context and triggers the UAF.
>
> 2. openEuler (4.19/5.10):
>    - Local `euler inclusion` patches moved `lockup_detector_init()` after
>      `do_basic_setup()` (post-SMP phase).
>    - Risk source: The initial probe occurs directly in a post-SMP
>      environment, exposing the race condition.
>
> For openEuler (4.19/5.10) kernel, the call stack looks like this:
>   kernel_init()
>   -> kernel_init_freeable()
>     -> lockup_detector_init()       <-- Called after smp_init()
>       -> watchdog_nmi_probe()
>         -> hardlockup_detector_perf_init()
>           -> hardlockup_detector_event_create()
>
> In mainline (ToT), the initial probe (safe) call stack is:
>   kernel_init()
>   -> kernel_init_freeable()
>     -> lockup_detector_init()       <-- Called before smp_init()
>       -> watchdog_hardlockup_probe()
>         -> hardlockup_detector_event_create()
>
> However, the asynchronous retry mechanism (commit 930d8f8dbab9) executes the
> probe logic in a post-SMP, preemptible context.
>
> For the mainline (ToT) retry path (at risk), the call stack is:
>   kworker thread
>   -> process_one_work()
>     -> lockup_detector_delay_init()
>       -> watchdog_hardlockup_probe()
>         -> hardlockup_detector_event_create()
>
> Thus, `930d8f8dbab9` remains the correct "Fixes" target for ToT.

OK, at least I'm not crazy! That does indeed explain why things seemed
so wonky...


> > OK, fair enough. ...but I'm a bit curious why nobody else saw this
> > WARN_ON(). I'm also curious if you have tested the hardlockup detector
> > on newer kernels, or if all of your work has been done on 4.19. If all
> > your work has been done on 4.19, do we need to find someone to test
> > your patch on a newer kernel and make sure it works OK? If you've
> > tested on a newer kernel, did the hardlockup detector init from the
> > kernel's early-init code, or the retry code?
>
> In newer kernels, when the probe fails initially and falls
> back to the retry workqueue (or even during early init if preemption is
> enabled), the `WARN_ON(!is_percpu_thread())` in
> `hardlockup_detector_event_create()` does indeed trigger because
> `watchdog_hardlockup_probe()` is called from a non-bound context.
>
> I have verified this patch on the openEuler 4.19 kernel. During our stress
> testing, where we start dozens of VMs simultaneously to create high resource
> contention, the UAF was consistently reproducible without this fix and is now
> confirmed resolved.
>
> The v4 patch addresses this by refactoring the creation logic to be stateless
> and adding `cpu_hotplug_disable()` to ensure the probed CPU stays alive.

OK, so I think the answer is: you haven't actually seen the problem
(or the WARN_ON) on a mainline kernel, only on the openEuler 4.19
kernel...

...actually, I looked and now think the problem doesn't exist on a
mainline kernel. Specifically, when we run lockup_detector_retry_init()
we call schedule_work() to do the work. That schedules work on the
"system_percpu_wq". While the work ends up being queued with
"WORK_CPU_UNBOUND", I believe that we still end up running on a thread
that's bound to just one CPU in the end.  This is presumably why
nobody has reported the "WARN_ON(!is_percpu_thread())" actually
hitting on mainline.

Given the above, it sounds to me like the problem you're having is
with a downstream kernel and upstream is actually fine. Did I
understand that correctly?

If that's the case, we'd definitely want to at least change the
description and presumably _remove_ the Fixes tag? I actually still
think the code looks nicer after your CL and (maybe?) we could even
remove the whole schedule_work() for running this code? Maybe it was
only added to deal with this exact problem? ...but the CL description
would definitely need to be updated.


> I'll wait for your further thoughts on v4:
> https://lore.kernel.org/all/20260124070814.806828-1-realwujing@gmail.com/

Sure. At the very least the CL description would need to be updated
(assuming my understanding is correct), but for now let's avoid
forking the conversation and resolve things here?

-Doug
[PATCH v4] watchdog/hardlockup: Fix UAF in perf event cleanup due to migration race
Posted by Qiliang Yuan 2 weeks ago
Original analysis on Linux 4.19 showed a race condition in the hardlockup
detector's initialization phase. Specifically, during the early probe
phase, hardlockup_detector_perf_init() (renamed to
watchdog_hardlockup_probe() in newer kernels via commit d9b3629ade8e)
interacted with the per-cpu 'watchdog_ev' variable.

If the initializing task migrates to another CPU during this probe phase,
two issues arise:
1. The 'watchdog_ev' pointer on the original CPU is set but not cleared,
   leaving a stale pointer to a freed perf event.
2. The 'watchdog_ev' pointer on the new CPU might be incorrectly cleared.

Note: Although the logs below reference hardlockup_detector_perf_init(),
the same logic persists in the current watchdog_hardlockup_probe()
implementation.

This race condition was observed in console logs:
[23.038376] hardlockup_detector_perf_init 313 cur_cpu=2
...
[23.076385] hardlockup_detector_event_create 203 cpu(cur)=2 set watchdog_ev
...
[23.095788] perf_event_release_kernel 4623 cur_cpu=2
...
[23.116963] lockup_detector_reconfigure 577 cur_cpu=3

The log shows the task started on CPU 2, set watchdog_ev on CPU 2,
released the event on CPU 2, but then migrated to CPU 3 before the
cleanup logic could run. This left watchdog_ev on CPU 2 pointing to a
freed event, resulting in a UAF when later accessed:

[26.540732] BUG: KASAN: use-after-free in perf_event_ctx_lock_nested.isra.72+0x6b/0x140
[26.542442] Read of size 8 at addr ff110006b360d718 by task kworker/2:1/94

Fix this by refactoring hardlockup_detector_event_create() to return the
created perf event instead of directly assigning it to the per-cpu variable.
In the probe function, use an arbitrary CPU but ensure it remains
online via cpu_hotplug_disable() during the check.

Fixes: 930d8f8dbab9 ("watchdog/perf: adapt the watchdog_perf interface for async model")
Signed-off-by: Shouxin Sun <sunshx@chinatelecom.cn>
Signed-off-by: Junnan Zhang <zhangjn11@chinatelecom.cn>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
Cc: Song Liu <song@kernel.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Jinchao Wang <wangjinchao600@gmail.com>
Cc: Wang Jinchao <wangjinchao600@gmail.com>
Cc: <stable@vger.kernel.org>
---
v4:
- Add cpu_hotplug_disable() in watchdog_hardlockup_probe() to ensure the
  sampled CPU remains online during probing. 
- Update commit message to explain the relevance of 4.19 logs even
  though functions were renamed in modern kernels. 
v3:
- Refactor hardlockup_detector_event_create() to return the event pointer
  instead of directly assigning to per-cpu variables to fix the UAF.
- Restore PMU cycle fallback and unify the enable/probe paths.
v2:
- Add Cc: <stable@vger.kernel.org>.
v1:
- Avoid 'watchdog_ev' in probe path by manually creating and releasing a
  local perf event.
 kernel/watchdog_perf.c | 56 +++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index d3ca70e3c256..887b61c65c1b 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -17,6 +17,7 @@
 #include <linux/atomic.h>
 #include <linux/module.h>
 #include <linux/sched/debug.h>
+#include <linux/cpu.h>
 
 #include <asm/irq_regs.h>
 #include <linux/perf_event.h>
@@ -118,18 +119,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	watchdog_hardlockup_check(smp_processor_id(), regs);
 }
 
-static int hardlockup_detector_event_create(void)
+static struct perf_event *hardlockup_detector_event_create(unsigned int cpu)
 {
-	unsigned int cpu;
 	struct perf_event_attr *wd_attr;
 	struct perf_event *evt;
 
-	/*
-	 * Preemption is not disabled because memory will be allocated.
-	 * Ensure CPU-locality by calling this in per-CPU kthread.
-	 */
-	WARN_ON(!is_percpu_thread());
-	cpu = raw_smp_processor_id();
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
 
@@ -143,14 +137,7 @@ static int hardlockup_detector_event_create(void)
 						       watchdog_overflow_callback, NULL);
 	}
 
-	if (IS_ERR(evt)) {
-		pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
-			 PTR_ERR(evt));
-		return PTR_ERR(evt);
-	}
-	WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
-	this_cpu_write(watchdog_ev, evt);
-	return 0;
+	return evt;
 }
 
 /**
@@ -159,17 +146,26 @@ static int hardlockup_detector_event_create(void)
  */
 void watchdog_hardlockup_enable(unsigned int cpu)
 {
+	struct perf_event *evt;
+
 	WARN_ON_ONCE(cpu != smp_processor_id());
 
-	if (hardlockup_detector_event_create())
+	evt = hardlockup_detector_event_create(cpu);
+	if (IS_ERR(evt)) {
+		pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
+			 PTR_ERR(evt));
 		return;
+	}
 
 	/* use original value for check */
 	if (!atomic_fetch_inc(&watchdog_cpus))
 		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
 
+	WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
+	this_cpu_write(watchdog_ev, evt);
+
 	watchdog_init_timestamp();
-	perf_event_enable(this_cpu_read(watchdog_ev));
+	perf_event_enable(evt);
 }
 
 /**
@@ -263,19 +259,35 @@ bool __weak __init arch_perf_nmi_is_available(void)
  */
 int __init watchdog_hardlockup_probe(void)
 {
+	struct perf_event *evt;
+	unsigned int cpu;
 	int ret;
 
 	if (!arch_perf_nmi_is_available())
 		return -ENODEV;
 
-	ret = hardlockup_detector_event_create();
+	if (!hw_nmi_get_sample_period(watchdog_thresh))
+		return -EINVAL;
 
-	if (ret) {
+	/*
+	 * Test hardware PMU availability by creating a temporary perf event.
+	 * The requested CPU is arbitrary; preemption is not disabled, so
+	 * raw_smp_processor_id() is used. Surround with cpu_hotplug_disable()
+	 * to ensure the arbitrarily chosen CPU remains online during the check.
+	 * The event is released immediately.
+	 */
+	cpu_hotplug_disable();
+	cpu = raw_smp_processor_id();
+	evt = hardlockup_detector_event_create(cpu);
+	if (IS_ERR(evt)) {
 		pr_info("Perf NMI watchdog permanently disabled\n");
+		ret = PTR_ERR(evt);
 	} else {
-		perf_event_release_kernel(this_cpu_read(watchdog_ev));
-		this_cpu_write(watchdog_ev, NULL);
+		perf_event_release_kernel(evt);
+		ret = 0;
 	}
+	cpu_hotplug_enable();
+
 	return ret;
 }
 
-- 
2.51.0