[PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT

Tejun Heo posted 1 patch 1 month, 1 week ago
There is a newer version of this series
include/linux/sched.h  |    5 +++-
kernel/cgroup/cgroup.c |   53 ++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 56 insertions(+), 2 deletions(-)
[PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Tejun Heo 1 month, 1 week ago
cgroup_task_dead() is called from finish_task_switch() which runs with
preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
function needs to acquire css_set_lock which is a regular spinlock that can
sleep on RT kernels, leading to "sleeping function called from invalid
context" warnings.

css_set_lock is too large in scope to convert to a raw_spinlock. However,
the unlinking operations don't need to run synchronously - they just need
to complete after the task is done running.

On PREEMPT_RT, defer the work through irq_work.

Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@kernel.org>
---
Hello,

Calvin, this seems to work fine here but can you please try it out?
Sebastian, Peter, does this look okay to you guys?

Thanks.

 include/linux/sched.h  |    5 +++-
 kernel/cgroup/cgroup.c |   53 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 2 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
 	struct css_set __rcu		*cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head		cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+	struct llist_node		cg_dead_lnode;
+#endif	/* CONFIG_PREEMPT_RT */
+#endif	/* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32				closid;
 	u32				rmid;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsy
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS	noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));
 
 	get_user_ns(init_cgroup_ns.user_ns);
+	cgroup_rt_init();
 
 	cgroup_lock();
 
@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct
 	} while_each_subsys_mask();
 }
 
-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
 {
 	struct css_set *cset;
 	unsigned long flags;
@@ -7016,6 +7018,55 @@ void cgroup_task_dead(struct task_struct
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously. Bounce through irq_work instead.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+	struct llist_node *lnode;
+	struct task_struct *task, *next;
+
+	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+		do_cgroup_task_dead(task);
+		put_task_struct(task);
+	}
+}
+
+static void __init cgroup_rt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+		init_irq_work(per_cpu_ptr(&cgrp_dead_tasks_iwork, cpu),
+			      cgrp_dead_tasks_iwork_fn);
+	}
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	get_task_struct(task);
+	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else	/* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	do_cgroup_task_dead(task);
+}
+#endif	/* CONFIG_PREEMPT_RT */
+
 void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;
Re: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Sebastian Andrzej Siewior 1 month, 1 week ago
On 2025-11-05 09:03:55 [-1000], Tejun Heo wrote:
> +#ifdef CONFIG_PREEMPT_RT
> +/*
> + * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
> + * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
> + * this lead to sleeping in the invalid context warning bug. css_set_lock is too
> + * big to become a raw_spinlock. The task_dead path doesn't need to run
> + * synchronously. Bounce through irq_work instead.
> + */
> +static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
> +static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
> +
> +static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
> +{
> +	struct llist_node *lnode;
> +	struct task_struct *task, *next;
> +
> +	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
> +	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
> +		do_cgroup_task_dead(task);
> +		put_task_struct(task);
> +	}
> +}
> +
> +static void __init cgroup_rt_init(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
> +		init_irq_work(per_cpu_ptr(&cgrp_dead_tasks_iwork, cpu),
> +			      cgrp_dead_tasks_iwork_fn);

How important is it, that it happens right away? Written as-is, this
leads to an interrupt then wakes irq_work/$cpu thread which then runs
this callback. That thread runs as SCHED_FIFO-1. This means the
termination of a SCHED_OTHER tasks on a single CPU will run as follows:
 - TASK_DEAD
   schedule()
     - queue IRQ_WORK
     -> INTERRUPT
     -> WAKE irq_work
   -> preempt to irq_work/
      -> handle one callback
      schedule()
 back to next TASK_DEAD

So cgrp_dead_tasks_iwork_fn() will never have the opportunity to batch.
Unless the exiting task's priority is > 1. Then it will be delayed
until all RT tasks are done.

My proposal would be to init the irq_work item with
	*per_cpu_ptr(&cgrp_dead_tasks_iwork, cpu) = IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);

instead which won't raise an IRQ immediately and delay the callback
until the next timer tick. So it could batch multiple tasks.

[ queue_work() should work, too but the overhead to schedule is greater
  imho so this makes sense ]

Sebastian
Re: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Tejun Heo 1 month, 1 week ago
Hello,

On Thu, Nov 06, 2025 at 04:07:17PM +0100, Sebastian Andrzej Siewior wrote:
> How important is it, that it happens right away? Written as-is, this

Not important at all.

> leads to an interrupt then wakes irq_work/$cpu thread which then runs
> this callback. That thread runs as SCHED_FIFO-1. This means the
> termination of a SCHED_OTHER tasks on a single CPU will run as follows:
>  - TASK_DEAD
>    schedule()
>      - queue IRQ_WORK
>      -> INTERRUPT
>      -> WAKE irq_work
>    -> preempt to irq_work/
>       -> handle one callback
>       schedule()
>  back to next TASK_DEAD
> 
> So cgrp_dead_tasks_iwork_fn() will never have the opportunity to batch.
> Unless the exiting task's priority is > 1. Then it will be delayed
> until all RT tasks are done.
> 
> My proposal would be to init the irq_work item with
> 	*per_cpu_ptr(&cgrp_dead_tasks_iwork, cpu) = IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
> 
> instead which won't raise an IRQ immediately and delay the callback
> until the next timer tick. So it could batch multiple tasks.
> 
> [ queue_work() should work, too but the overhead to schedule is greater
>   imho so this makes sense ]

Will switch to IRQ_WORK_INIT_LAZY.

Thanks for the review.

-- 
tejun
Re: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Sebastian Andrzej Siewior 1 month, 1 week ago
On 2025-11-06 07:37:16 [-1000], Tejun Heo wrote:
> Hello,
Hi,

> On Thu, Nov 06, 2025 at 04:07:17PM +0100, Sebastian Andrzej Siewior wrote:
> > How important is it, that it happens right away? Written as-is, this
> 
> Not important at all.
> 
…
> Will switch to IRQ_WORK_INIT_LAZY.

Quick question: Since it is not important at all, would it work to have
it in task's RCU callback, __put_task_struct()?

> Thanks for the review.
> 

Sebastian
Re: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Tejun Heo 1 month, 1 week ago
Hello,

On Thu, Nov 06, 2025 at 06:46:14PM +0100, Sebastian Andrzej Siewior wrote:
> On 2025-11-06 07:37:16 [-1000], Tejun Heo wrote:
> > Will switch to IRQ_WORK_INIT_LAZY.
> 
> Quick question: Since it is not important at all, would it work to have
> it in task's RCU callback, __put_task_struct()?

It doesn't have to run right away but it better run in some definite time
frame because at this point the task is not visible from userspace otherwise
(it doesn't have a pid) but is still pinning the cgroup, so we're in this
limbo state where reading cgroup.procs should return empty (there may be a
bug here right now. I think the code will try to deref NULL pid pointer) but
the cgroup is not empty. This window is not really broken in itself because
cgroup empty state is tracked and notified separately. However, task_struct
can be pinned and can linger for indefinite amount of time after being dead,
and that would become an actual problem.

So, to add a bit of a qualifier, while it's okay to run it with some amount
of delay that's not very significant to human perception, we definitely
don't want to allow delaying it indefinitely.

Thanks.

-- 
tejun
Re: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Sebastian Andrzej Siewior 1 month, 1 week ago
On 2025-11-06 07:55:02 [-1000], Tejun Heo wrote:
> Hello,
Hi,

> On Thu, Nov 06, 2025 at 06:46:14PM +0100, Sebastian Andrzej Siewior wrote:
> > On 2025-11-06 07:37:16 [-1000], Tejun Heo wrote:
> > > Will switch to IRQ_WORK_INIT_LAZY.
> > 
> > Quick question: Since it is not important at all, would it work to have
> > it in task's RCU callback, __put_task_struct()?
> 
> It doesn't have to run right away but it better run in some definite time
> frame because at this point the task is not visible from userspace otherwise
> (it doesn't have a pid) but is still pinning the cgroup, so we're in this
> limbo state where reading cgroup.procs should return empty (there may be a
> bug here right now. I think the code will try to deref NULL pid pointer) but
> the cgroup is not empty. This window is not really broken in itself because
> cgroup empty state is tracked and notified separately. However, task_struct
> can be pinned and can linger for indefinite amount of time after being dead,
> and that would become an actual problem.
> 
> So, to add a bit of a qualifier, while it's okay to run it with some amount
> of delay that's not very significant to human perception, we definitely
> don't want to allow delaying it indefinitely.

Okay. That is an argument that justifies the additional extension
of task_struct :)
Understood.

> Thanks.

Sebastian
Re: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Calvin Owens 1 month, 1 week ago
On Wednesday 11/05 at 09:03 -1000, Tejun Heo wrote:
> cgroup_task_dead() is called from finish_task_switch() which runs with
> preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
> function needs to acquire css_set_lock which is a regular spinlock that can
> sleep on RT kernels, leading to "sleeping function called from invalid
> context" warnings.
> 
> css_set_lock is too large in scope to convert to a raw_spinlock. However,
> the unlinking operations don't need to run synchronously - they just need
> to complete after the task is done running.
> 
> On PREEMPT_RT, defer the work through irq_work.
> 
> Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out")
> Reported-by: Calvin Owens <calvin@wbinvd.org>
> Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> Hello,
> 
> Calvin, this seems to work fine here but can you please try it out?

Works for me, no splats with that on top of next-20251104.
Re: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
Posted by Tejun Heo 1 month, 1 week ago
On Wed, Nov 05, 2025 at 05:15:50PM -0800, Calvin Owens wrote:
> Works for me, no splats with that on top of next-20251104.

Thanks for testing!

-- 
tejun