From: Chen Ridong <chenridong@huawei.com>
A hung task can occur during LTP cgroup testing [1] when repeatedly
mounting and unmounting the perf_event and net_prio controllers with
systemd.unified_cgroup_hierarchy=1. The hang manifests in
cgroup_lock_and_drain_offline() during root destruction.
Related test cases:
cgroup_fj_function_perf_event cgroup_fj_function.sh perf_event
cgroup_fj_function_net_prio cgroup_fj_function.sh net_prio
Call Trace:
cgroup_lock_and_drain_offline+0x14c/0x1e8
cgroup_destroy_root+0x3c/0x2c0
css_free_rwork_fn+0x248/0x338
process_one_work+0x16c/0x3b8
worker_thread+0x22c/0x3b0
kthread+0xec/0x100
ret_from_fork+0x10/0x20
Root Cause:
CPU0 CPU1
mount perf_event umount net_prio
cgroup1_get_tree cgroup_kill_sb
rebind_subsystems // root destruction enqueues
// cgroup_destroy_wq
// kill all perf_event css
// one perf_event css A is dying
// css A offline enqueues cgroup_destroy_wq
// root destruction will be executed first
css_free_rwork_fn
cgroup_destroy_root
cgroup_lock_and_drain_offline
// some perf descendants are dying
// cgroup_destroy_wq max_active = 1
// waiting for css A to die
Problem scenario:
1. CPU0 mounts perf_event (rebind_subsystems)
2. CPU1 unmounts net_prio (cgroup_kill_sb), queuing root destruction work
3. A dying perf_event CSS gets queued for offline after root destruction
4. Root destruction waits for offline completion, but offline work is
blocked behind root destruction in cgroup_destroy_wq (max_active=1)
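For illustration only (not part of the patch), here is a minimal,
hypothetical kernel-module sketch of the same pattern: on a max_active=1
workqueue, a work item that waits on a second work item queued behind it
from the same CPU can never be satisfied, because the second item never
gets the single worker slot. All names here (demo_*, css_offline_fn,
root_destroy_fn) are invented for the sketch:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/completion.h>

static struct workqueue_struct *demo_wq;
static struct work_struct root_destroy_work, css_offline_work;
static DECLARE_COMPLETION(css_offlined);

/* Stands in for CSS offline: never runs, since it sits behind
 * root_destroy_work on the same max_active=1 queue. */
static void css_offline_fn(struct work_struct *work)
{
        complete(&css_offlined);
}

/* Stands in for cgroup_destroy_root()->cgroup_lock_and_drain_offline():
 * occupies the only worker slot while waiting for the offline work. */
static void root_destroy_fn(struct work_struct *work)
{
        wait_for_completion(&css_offlined); /* hangs forever */
}

static int __init demo_init(void)
{
        demo_wq = alloc_workqueue("demo_destroy", 0, 1); /* max_active = 1 */
        if (!demo_wq)
                return -ENOMEM;
        INIT_WORK(&root_destroy_work, root_destroy_fn);
        INIT_WORK(&css_offline_work, css_offline_fn);
        /* Both queued from this CPU, so they share the single slot. */
        queue_work(demo_wq, &root_destroy_work); /* "root destruction" first */
        queue_work(demo_wq, &css_offline_work);  /* "css offline" second */
        return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");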
Solution:
Split cgroup_destroy_wq into three dedicated workqueues:
cgroup_offline_wq: handles CSS offline operations
cgroup_release_wq: manages resource release
cgroup_free_wq: performs final memory deallocation
With this split, root destruction (whose free work runs on cgroup_free_wq)
can wait for CSS offline operations without blocking them, since offline
work now runs on its own queue.
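Sketching the fix in the same hypothetical module: queue the waiter and
the waited-on work on separate workqueues so the wait can always make
progress (again, all demo_* names are invented for illustration):

static struct workqueue_struct *demo_offline_wq, *demo_free_wq;

static int __init demo_fixed_init(void)
{
        demo_offline_wq = alloc_workqueue("demo_offline", 0, 1);
        demo_free_wq = alloc_workqueue("demo_free", 0, 1);
        if (!demo_offline_wq || !demo_free_wq)
                return -ENOMEM;
        INIT_WORK(&root_destroy_work, root_destroy_fn);
        INIT_WORK(&css_offline_work, css_offline_fn);
        /* The waiter no longer shares a worker slot with the work it
         * waits on: css_offline_fn runs on demo_offline_wq and calls
         * complete(), so root_destroy_fn's wait_for_completion()
         * returns. */
        queue_work(demo_free_wq, &root_destroy_work);
        queue_work(demo_offline_wq, &css_offline_work);
        return 0;
}

Each queue keeps max_active = 1, matching the original cgroup_destroy_wq;
the fix removes the cross-stage dependency rather than adding concurrency
within a stage.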
[1] https://github.com/linux-test-project/ltp/blob/master/runtest/controllers
Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends")
Reported-by: Gao Yingjie <gaoyingjie@uniontech.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Suggested-by: Tejun Heo <tj@kernel.org>
---
kernel/cgroup/cgroup.c | 43 +++++++++++++++++++++++++++++++++++-------
1 file changed, 36 insertions(+), 7 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..79b1d79f86a3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction enqueues work sequentially through:
+ * cgroup_offline_wq: css offline work
+ * cgroup_release_wq: css release work
+ * cgroup_free_wq: css free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio root destruction runs cgroup_destroy_root()->cgroup_lock_and_drain_offline().
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -5558,7 +5581,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5567,7 +5590,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5701,7 +5724,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
list_del_rcu(&css->sibling);
err_free_css:
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -5939,7 +5962,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -6325,8 +6348,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
--
2.34.1
On Tue, Aug 19, 2025 at 01:07:24AM +0000, Chen Ridong wrote:
> [full patch quoted; trimmed]

Applied to cgroup/for-6.17-fixes. Sorry about the delay. I missed the
thread.

Thanks.

--
tejun
On 2025/8/23 1:45, Tejun Heo wrote:
> On Tue, Aug 19, 2025 at 01:07:24AM +0000, Chen Ridong wrote:
>> [full patch quoted; trimmed]
>
> Applied to cgroup/for-6.17-fixes. Sorry about the delay. I missed the
> thread.
>
> Thanks.

Thanks

--
Best regards,
Ridong
On 2025/8/19 9:07, Chen Ridong wrote:
> [full patch quoted; trimmed]

Hi, Tj,

Just checking in on the v6 I sent here. I am not sure whether you missed it.

--
Best regards,
Ridong