kernel/sched/core.c | 8 ++++++++ 1 file changed, 8 insertions(+)
Below recursion is observed in a rare scenario where __schedule()
takes the rq lock; at around the same time, the task's affinity is being
changed. The bpf function for tracing sched_switch calls
migrate_enable(), checks for an affinity change (cpus_ptr != cpus_mask),
lands in __set_cpus_allowed_ptr(), which tries to acquire the rq lock,
causing the recursion bug.
Fix the issue by switching to preempt_disable/enable() for non-RT
Kernels.
-010 |spin_bug(lock = ???, msg = ???)
-011 |debug_spin_lock_before(inline)
-011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
-012 |_raw_spin_lock(inline)
-012 |raw_spin_rq_lock_nested(inline)
-012 |raw_spin_rq_lock(inline)
-012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
-013 |__set_cpus_allowed_ptr(inline)
-013 |migrate_enable()
-014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
-015 |perf_trace_run_bpf_submit(inline)
-015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
-016 |__traceiter_sched_switch(inline)
-016 |trace_sched_switch(inline)
-016 |__schedule(sched_mode = ?)
-017 |schedule()
-018 |arch_local_save_flags(inline)
-018 |arch_irqs_disabled(inline)
-018 |__raw_spin_lock_irq(inline)
-018 |_raw_spin_lock_irq(inline)
-018 |worker_thread(__worker = 0xFFFFFF88CE251300)
-019 |kthread(_create = 0xFFFFFF88730A5A80)
-020 |ret_from_fork(asm)
Signed-off-by: Satya Durga Srinivasu Prabhala <quic_satyap@quicinc.com>
---
kernel/sched/core.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfa7452ca92e..e254e9227341 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
void migrate_disable(void)
{
+#ifdef CONFIG_PREEMPT_RT
struct task_struct *p = current;
if (p->migration_disabled) {
@@ -2234,11 +2235,15 @@ void migrate_disable(void)
this_rq()->nr_pinned++;
p->migration_disabled = 1;
preempt_enable();
+#else
+ preempt_disable();
+#endif
}
EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
+#ifdef CONFIG_PREEMPT_RT
struct task_struct *p = current;
if (p->migration_disabled > 1) {
@@ -2265,6 +2270,9 @@ void migrate_enable(void)
p->migration_disabled = 0;
this_rq()->nr_pinned--;
preempt_enable();
+#else
+ preempt_enable();
+#endif
}
EXPORT_SYMBOL_GPL(migrate_enable);
--
2.36.1
On Fri, Jun 24, 2022 at 12:42:40AM -0700, Satya Durga Srinivasu Prabhala wrote:
> Below recursion is observed in a rare scenario where __schedule()
> takes rq lock, at around same time task's affinity is being changed,
> bpf function for tracing sched_switch calls migrate_enabled(),
> checks for affinity change (cpus_ptr != cpus_mask) lands into
> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
> recursion bug.
>
> Fix the issue by switching to preempt_enable/disable() for non-RT
> Kernels.
>
> -010 |spin_bug(lock = ???, msg = ???)
> -011 |debug_spin_lock_before(inline)
> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
> -012 |_raw_spin_lock(inline)
> -012 |raw_spin_rq_lock_nested(inline)
> -012 |raw_spin_rq_lock(inline)
> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
> -013 |__set_cpus_allowed_ptr(inline)
> -013 |migrate_enable()
> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
> -015 |perf_trace_run_bpf_submit(inline)
> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
> -016 |__traceiter_sched_switch(inline)
> -016 |trace_sched_switch(inline)
trace_sched_switch() disables preemption.
So how is this a fix?
-- Steve
> -016 |__schedule(sched_mode = ?)
> -017 |schedule()
> -018 |arch_local_save_flags(inline)
> -018 |arch_irqs_disabled(inline)
> -018 |__raw_spin_lock_irq(inline)
> -018 |_raw_spin_lock_irq(inline)
> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
> -019 |kthread(_create = 0xFFFFFF88730A5A80)
> -020 |ret_from_fork(asm)
>
> Signed-off-by: Satya Durga Srinivasu Prabhala <quic_satyap@quicinc.com>
> ---
> kernel/sched/core.c | 8 ++++++++
> 1 file changed, 8 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bfa7452ca92e..e254e9227341 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>
> void migrate_disable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled) {
> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
> this_rq()->nr_pinned++;
> p->migration_disabled = 1;
> preempt_enable();
> +#else
> + preempt_disable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_disable);
>
> void migrate_enable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled > 1) {
> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
> p->migration_disabled = 0;
> this_rq()->nr_pinned--;
> preempt_enable();
> +#else
> + preempt_enable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_enable);
>
> --
> 2.36.1
On 6/30/22 3:37 PM, Steven Rostedt wrote:
> On Fri, Jun 24, 2022 at 12:42:40AM -0700, Satya Durga Srinivasu Prabhala wrote:
>> Below recursion is observed in a rare scenario where __schedule()
>> takes rq lock, at around same time task's affinity is being changed,
>> bpf function for tracing sched_switch calls migrate_enabled(),
>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>> recursion bug.
>>
>> Fix the issue by switching to preempt_enable/disable() for non-RT
>> Kernels.
>>
>> -010 |spin_bug(lock = ???, msg = ???)
>> -011 |debug_spin_lock_before(inline)
>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
>> -012 |_raw_spin_lock(inline)
>> -012 |raw_spin_rq_lock_nested(inline)
>> -012 |raw_spin_rq_lock(inline)
>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
>> -013 |__set_cpus_allowed_ptr(inline)
>> -013 |migrate_enable()
>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
>> -015 |perf_trace_run_bpf_submit(inline)
>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
>> -016 |__traceiter_sched_switch(inline)
>> -016 |trace_sched_switch(inline)
> trace_sched_switch() disables preemption.
>
> So how is this a fix?
Thanks for your time and comments.
I was looking more at non-RT kernels, where switching to
preempt_disable/enable() helps as it's
just an increment/decrement of a count. I agree, this isn't the right fix.
I'm still cross-checking for an easy way to repro the issue. Will cross-
check further and get back.
>
> -- Steve
>
>> -016 |__schedule(sched_mode = ?)
>> -017 |schedule()
>> -018 |arch_local_save_flags(inline)
>> -018 |arch_irqs_disabled(inline)
>> -018 |__raw_spin_lock_irq(inline)
>> -018 |_raw_spin_lock_irq(inline)
>> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
>> -019 |kthread(_create = 0xFFFFFF88730A5A80)
>> -020 |ret_from_fork(asm)
>>
>> Signed-off-by: Satya Durga Srinivasu Prabhala <quic_satyap@quicinc.com>
>> ---
>> kernel/sched/core.c | 8 ++++++++
>> 1 file changed, 8 insertions(+)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index bfa7452ca92e..e254e9227341 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>>
>> void migrate_disable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled) {
>> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
>> this_rq()->nr_pinned++;
>> p->migration_disabled = 1;
>> preempt_enable();
>> +#else
>> + preempt_disable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_disable);
>>
>> void migrate_enable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled > 1) {
>> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
>> p->migration_disabled = 0;
>> this_rq()->nr_pinned--;
>> preempt_enable();
>> +#else
>> + preempt_enable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_enable);
>>
>> --
>> 2.36.1
On Thu, 30 Jun 2022 18:37:04 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > > > > Fix the issue by switching to preempt_enable/disable() for non-RT > > Kernels. > > > > -010 |spin_bug(lock = ???, msg = ???) > > -011 |debug_spin_lock_before(inline) > > -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600) > > -012 |_raw_spin_lock(inline) > > -012 |raw_spin_rq_lock_nested(inline) > > -012 |raw_spin_rq_lock(inline) > > -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8) > > -013 |__set_cpus_allowed_ptr(inline) > > -013 |migrate_enable() > > -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600) > > -015 |perf_trace_run_bpf_submit(inline) > > -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?) > > -016 |__traceiter_sched_switch(inline) > > -016 |trace_sched_switch(inline) > > trace_sched_switch() disables preemption. > > So how is this a fix? Let me rephrase my question. As trace_sched_switch() disables preemption, why is trace_call_bpf() calling migrate_disable()? Looks like you could modify the code to include a __bpf_prog_run_array() that skips the migrate_disable(). You even have a "cant_sleep()" call in trace_call_bpf(). -- Steve
On 7/1/22 8:54 AM, Steven Rostedt wrote: > On Thu, 30 Jun 2022 18:37:04 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > >>> Fix the issue by switching to preempt_enable/disable() for non-RT >>> Kernels. >>> >>> -010 |spin_bug(lock = ???, msg = ???) >>> -011 |debug_spin_lock_before(inline) >>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600) >>> -012 |_raw_spin_lock(inline) >>> -012 |raw_spin_rq_lock_nested(inline) >>> -012 |raw_spin_rq_lock(inline) >>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8) >>> -013 |__set_cpus_allowed_ptr(inline) >>> -013 |migrate_enable() >>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600) >>> -015 |perf_trace_run_bpf_submit(inline) >>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?) >>> -016 |__traceiter_sched_switch(inline) >>> -016 |trace_sched_switch(inline) >> trace_sched_switch() disables preemption. >> >> So how is this a fix? > Let me rephrase my question. > > As trace_sched_switch() disables preemption, why is trace_call_bpf() > calling migrate_disable()? I'm not sure, why we have migrate_disable/enable(). I will need to cross check further. > > Looks like you could modify the code to include a __bpf_prog_run_array() > that skips the migrate_disable(). You even have a "cant_sleep()" call in > trace_call_bpf(). Thanks for the inputs. I still need to cross check and find a better way to repro the issue to be able to comment further. Will cross check on your suggestion. > > -- Steve
Hi Satya
On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
> Below recursion is observed in a rare scenario where __schedule()
> takes rq lock, at around same time task's affinity is being changed,
> bpf function for tracing sched_switch calls migrate_enabled(),
> checks for affinity change (cpus_ptr != cpus_mask) lands into
> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
> recursion bug.
>
> Fix the issue by switching to preempt_enable/disable() for non-RT
> Kernels.
Interesting bug. Thanks for the report. Unfortunately I can't see this being
a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
I'm curious how the race with set affinity is happening. I would have thought
user space would get blocked as __schedule() will hold the rq lock.
Do you have more details on that?
Thanks
--
Qais Yousef
>
> -010 |spin_bug(lock = ???, msg = ???)
> -011 |debug_spin_lock_before(inline)
> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
> -012 |_raw_spin_lock(inline)
> -012 |raw_spin_rq_lock_nested(inline)
> -012 |raw_spin_rq_lock(inline)
> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
> -013 |__set_cpus_allowed_ptr(inline)
> -013 |migrate_enable()
> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
> -015 |perf_trace_run_bpf_submit(inline)
> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
> -016 |__traceiter_sched_switch(inline)
> -016 |trace_sched_switch(inline)
> -016 |__schedule(sched_mode = ?)
> -017 |schedule()
> -018 |arch_local_save_flags(inline)
> -018 |arch_irqs_disabled(inline)
> -018 |__raw_spin_lock_irq(inline)
> -018 |_raw_spin_lock_irq(inline)
> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
> -019 |kthread(_create = 0xFFFFFF88730A5A80)
> -020 |ret_from_fork(asm)
>
> Signed-off-by: Satya Durga Srinivasu Prabhala <quic_satyap@quicinc.com>
> ---
> kernel/sched/core.c | 8 ++++++++
> 1 file changed, 8 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bfa7452ca92e..e254e9227341 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>
> void migrate_disable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled) {
> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
> this_rq()->nr_pinned++;
> p->migration_disabled = 1;
> preempt_enable();
> +#else
> + preempt_disable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_disable);
>
> void migrate_enable(void)
> {
> +#ifdef CONFIG_PREEMPT_RT
> struct task_struct *p = current;
>
> if (p->migration_disabled > 1) {
> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
> p->migration_disabled = 0;
> this_rq()->nr_pinned--;
> preempt_enable();
> +#else
> + preempt_enable();
> +#endif
> }
> EXPORT_SYMBOL_GPL(migrate_enable);
>
> --
> 2.36.1
>
On 6/30/22 2:53 PM, Qais Yousef wrote:
> Hi Satya
>
> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
>> Below recursion is observed in a rare scenario where __schedule()
>> takes rq lock, at around same time task's affinity is being changed,
>> bpf function for tracing sched_switch calls migrate_enabled(),
>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>> recursion bug.
>>
>> Fix the issue by switching to preempt_enable/disable() for non-RT
>> Kernels.
> Interesting bug. Thanks for the report. Unfortunately I can't see this being
> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
Thanks for taking a look. That's right.
> I'm curious how the race with set affinity is happening. I would have thought
> user space would get blocked as __schedule() will hold the rq lock.
>
> Do you have more details on that?
I'm still trying to figure out how we even end up with such a race. No
easy way to repro
the issue; I'm checking internally with some debug logs.
>
> Thanks
>
> --
> Qais Yousef
>
>> -010 |spin_bug(lock = ???, msg = ???)
>> -011 |debug_spin_lock_before(inline)
>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600)
>> -012 |_raw_spin_lock(inline)
>> -012 |raw_spin_rq_lock_nested(inline)
>> -012 |raw_spin_rq_lock(inline)
>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8)
>> -013 |__set_cpus_allowed_ptr(inline)
>> -013 |migrate_enable()
>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600)
>> -015 |perf_trace_run_bpf_submit(inline)
>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?)
>> -016 |__traceiter_sched_switch(inline)
>> -016 |trace_sched_switch(inline)
>> -016 |__schedule(sched_mode = ?)
>> -017 |schedule()
>> -018 |arch_local_save_flags(inline)
>> -018 |arch_irqs_disabled(inline)
>> -018 |__raw_spin_lock_irq(inline)
>> -018 |_raw_spin_lock_irq(inline)
>> -018 |worker_thread(__worker = 0xFFFFFF88CE251300)
>> -019 |kthread(_create = 0xFFFFFF88730A5A80)
>> -020 |ret_from_fork(asm)
>>
>> Signed-off-by: Satya Durga Srinivasu Prabhala <quic_satyap@quicinc.com>
>> ---
>> kernel/sched/core.c | 8 ++++++++
>> 1 file changed, 8 insertions(+)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index bfa7452ca92e..e254e9227341 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -2223,6 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
>>
>> void migrate_disable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled) {
>> @@ -2234,11 +2235,15 @@ void migrate_disable(void)
>> this_rq()->nr_pinned++;
>> p->migration_disabled = 1;
>> preempt_enable();
>> +#else
>> + preempt_disable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_disable);
>>
>> void migrate_enable(void)
>> {
>> +#ifdef CONFIG_PREEMPT_RT
>> struct task_struct *p = current;
>>
>> if (p->migration_disabled > 1) {
>> @@ -2265,6 +2270,9 @@ void migrate_enable(void)
>> p->migration_disabled = 0;
>> this_rq()->nr_pinned--;
>> preempt_enable();
>> +#else
>> + preempt_enable();
>> +#endif
>> }
>> EXPORT_SYMBOL_GPL(migrate_enable);
>>
>> --
>> 2.36.1
>>
On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote: > Hi Satya > > On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote: > > Below recursion is observed in a rare scenario where __schedule() > > takes rq lock, at around same time task's affinity is being changed, > > bpf function for tracing sched_switch calls migrate_enabled(), > > checks for affinity change (cpus_ptr != cpus_mask) lands into > > __set_cpus_allowed_ptr which tries acquire rq lock and causing the > > recursion bug. > > > > Fix the issue by switching to preempt_enable/disable() for non-RT > > Kernels. > > Interesting bug. Thanks for the report. Unfortunately I can't see this being > a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix > anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure. Worse, there's !RT stuff that grew to rely on the preemptible migrate disable stuff, so this actively breaks things. > I'm curious how the race with set affinity is happening. I would have thought > user space would get blocked as __schedule() will hold the rq lock. > > Do you have more details on that? Yeah, I'm not seeing how this works either, in order for migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to have done migrate_disable() *before* schedule, schedule() will then have to call migrate_disable_swich(), and *then* migrate_enable() does this. However, if things are nicely balanced (as they should be), then trace_call_bpf() using migrate_disable()/migrate_enable() should never hit this path. If, OTOH, migrate_disable() was called prior to schedule() and we did do migrate_disable_switch(), then it should be impossible for the tracepoint/bpf stuff to reach p->migration_disabled == 0. > > -010 |spin_bug(lock = ???, msg = ???) 
> > -011 |debug_spin_lock_before(inline) > > -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600) > > -012 |_raw_spin_lock(inline) > > -012 |raw_spin_rq_lock_nested(inline) > > -012 |raw_spin_rq_lock(inline) > > -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8) > > -013 |__set_cpus_allowed_ptr(inline) > > -013 |migrate_enable() > > -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600) > > -015 |perf_trace_run_bpf_submit(inline) > > -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?) > > -016 |__traceiter_sched_switch(inline) > > -016 |trace_sched_switch(inline) > > -016 |__schedule(sched_mode = ?) > > -017 |schedule() > > -018 |arch_local_save_flags(inline) > > -018 |arch_irqs_disabled(inline) > > -018 |__raw_spin_lock_irq(inline) > > -018 |_raw_spin_lock_irq(inline) > > -018 |worker_thread(__worker = 0xFFFFFF88CE251300) > > -019 |kthread(_create = 0xFFFFFF88730A5A80) > > -020 |ret_from_fork(asm) This doesn't clarify much. Please explain how things got to be unbalanced, don't ever just make a problem dissapear like this without understanding what the root cause is, that'll just get your reputation sullied.
On 7/1/22 1:33 AM, Peter Zijlstra wrote: > On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote: >> Hi Satya >> >> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote: >>> Below recursion is observed in a rare scenario where __schedule() >>> takes rq lock, at around same time task's affinity is being changed, >>> bpf function for tracing sched_switch calls migrate_enabled(), >>> checks for affinity change (cpus_ptr != cpus_mask) lands into >>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the >>> recursion bug. >>> >>> Fix the issue by switching to preempt_enable/disable() for non-RT >>> Kernels. >> Interesting bug. Thanks for the report. Unfortunately I can't see this being >> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix >> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure. > Worse, there's !RT stuff that grew to rely on the preemptible migrate > disable stuff, so this actively breaks things. Sorry about that. I'm cross checking further on ways to repro issue easily. >> I'm curious how the race with set affinity is happening. I would have thought >> user space would get blocked as __schedule() will hold the rq lock. >> >> Do you have more details on that? > Yeah, I'm not seeing how this works either, in order for > migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to > have done migrate_disable() *before* schedule, schedule() will then have > to call migrate_disable_swich(), and *then* migrate_enable() does this. > > However, if things are nicely balanced (as they should be), then > trace_call_bpf() using migrate_disable()/migrate_enable() should never > hit this path. > > If, OTOH, migrate_disable() was called prior to schedule() and we did do > migrate_disable_switch(), then it should be impossible for the > tracepoint/bpf stuff to reach p->migration_disabled == 0. > Thanks for explanation. Will cross check further on these points. 
>>> -010 |spin_bug(lock = ???, msg = ???) >>> -011 |debug_spin_lock_before(inline) >>> -011 |do_raw_spin_lock(lock = 0xFFFFFF89323BB600) >>> -012 |_raw_spin_lock(inline) >>> -012 |raw_spin_rq_lock_nested(inline) >>> -012 |raw_spin_rq_lock(inline) >>> -012 |task_rq_lock(p = 0xFFFFFF88CFF1DA00, rf = 0xFFFFFFC03707BBE8) >>> -013 |__set_cpus_allowed_ptr(inline) >>> -013 |migrate_enable() >>> -014 |trace_call_bpf(call = ?, ctx = 0xFFFFFFFDEF954600) >>> -015 |perf_trace_run_bpf_submit(inline) >>> -015 |perf_trace_sched_switch(__data = 0xFFFFFFE82CF0BCB8, preempt = FALSE, prev = ?, next = ?) >>> -016 |__traceiter_sched_switch(inline) >>> -016 |trace_sched_switch(inline) >>> -016 |__schedule(sched_mode = ?) >>> -017 |schedule() >>> -018 |arch_local_save_flags(inline) >>> -018 |arch_irqs_disabled(inline) >>> -018 |__raw_spin_lock_irq(inline) >>> -018 |_raw_spin_lock_irq(inline) >>> -018 |worker_thread(__worker = 0xFFFFFF88CE251300) >>> -019 |kthread(_create = 0xFFFFFF88730A5A80) >>> -020 |ret_from_fork(asm) > This doesn't clarify much. Please explain how things got to be > unbalanced, don't ever just make a problem dissapear like this without > understanding what the root cause is, that'll just get your reputation > sullied. Agreed, thanks for the comments and suggestion. Yes, I'm trying to cross check further and find ways to repro the issue. Will get back once I find a better way to handle the issue. I should have just tried to get comments/feedback on the issue first instead proposing fix. Lesson learnt :)
On 07/01/22 10:33, Peter Zijlstra wrote: > On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote: > > Hi Satya > > > > On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote: > > > Below recursion is observed in a rare scenario where __schedule() > > > takes rq lock, at around same time task's affinity is being changed, > > > bpf function for tracing sched_switch calls migrate_enabled(), > > > checks for affinity change (cpus_ptr != cpus_mask) lands into > > > __set_cpus_allowed_ptr which tries acquire rq lock and causing the > > > recursion bug. > > > > > > Fix the issue by switching to preempt_enable/disable() for non-RT > > > Kernels. > > > > Interesting bug. Thanks for the report. Unfortunately I can't see this being > > a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix > > anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure. > > Worse, there's !RT stuff that grew to rely on the preemptible migrate > disable stuff, so this actively breaks things. > > > I'm curious how the race with set affinity is happening. I would have thought > > user space would get blocked as __schedule() will hold the rq lock. > > > > Do you have more details on that? > > Yeah, I'm not seeing how this works either, in order for > migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to > have done migrate_disable() *before* schedule, schedule() will then have > to call migrate_disable_swich(), and *then* migrate_enable() does this. > > However, if things are nicely balanced (as they should be), then > trace_call_bpf() using migrate_disable()/migrate_enable() should never > hit this path. > > If, OTOH, migrate_disable() was called prior to schedule() and we did do > migrate_disable_switch(), then it should be impossible for the > tracepoint/bpf stuff to reach p->migration_disabled == 0. I think it's worth to confirm which kernel Satya is on too. 
If it's GKI, then worth checking first this is actually reproducible on/applicable to mainline. Cheers -- Qais Yousef
On 7/1/22 4:48 AM, Qais Yousef wrote: > On 07/01/22 10:33, Peter Zijlstra wrote: >> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote: >>> Hi Satya >>> >>> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote: >>>> Below recursion is observed in a rare scenario where __schedule() >>>> takes rq lock, at around same time task's affinity is being changed, >>>> bpf function for tracing sched_switch calls migrate_enabled(), >>>> checks for affinity change (cpus_ptr != cpus_mask) lands into >>>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the >>>> recursion bug. >>>> >>>> Fix the issue by switching to preempt_enable/disable() for non-RT >>>> Kernels. >>> Interesting bug. Thanks for the report. Unfortunately I can't see this being >>> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix >>> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure. >> Worse, there's !RT stuff that grew to rely on the preemptible migrate >> disable stuff, so this actively breaks things. >> >>> I'm curious how the race with set affinity is happening. I would have thought >>> user space would get blocked as __schedule() will hold the rq lock. >>> >>> Do you have more details on that? >> Yeah, I'm not seeing how this works either, in order for >> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to >> have done migrate_disable() *before* schedule, schedule() will then have >> to call migrate_disable_swich(), and *then* migrate_enable() does this. >> >> However, if things are nicely balanced (as they should be), then >> trace_call_bpf() using migrate_disable()/migrate_enable() should never >> hit this path. >> >> If, OTOH, migrate_disable() was called prior to schedule() and we did do >> migrate_disable_switch(), then it should be impossible for the >> tracepoint/bpf stuff to reach p->migration_disabled == 0. > I think it's worth to confirm which kernel Satya is on too. 
If it's GKI, then > worth checking first this is actually reproducible on/applicable to mainline. We are seeing the issue on 5.15 GKI Kernel. On older Kernels, like 5.10 Kernel migrate_disable/enable() end-up calling preempt_disable/enable(). I will cross check further on comments and inputs I received so far. > > Cheers > > -- > Qais Yousef
On 01/07/2022 13:48, Qais Yousef wrote:
> On 07/01/22 10:33, Peter Zijlstra wrote:
>> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote:
>>> Hi Satya
>>>
>>> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote:
>>>> Below recursion is observed in a rare scenario where __schedule()
>>>> takes rq lock, at around same time task's affinity is being changed,
>>>> bpf function for tracing sched_switch calls migrate_enabled(),
>>>> checks for affinity change (cpus_ptr != cpus_mask) lands into
>>>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the
>>>> recursion bug.
>>>>
>>>> Fix the issue by switching to preempt_enable/disable() for non-RT
>>>> Kernels.
>>>
>>> Interesting bug. Thanks for the report. Unfortunately I can't see this being
>>> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix
>>> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure.
>>
>> Worse, there's !RT stuff that grew to rely on the preemptible migrate
>> disable stuff, so this actively breaks things.
>>
>>> I'm curious how the race with set affinity is happening. I would have thought
>>> user space would get blocked as __schedule() will hold the rq lock.
>>>
>>> Do you have more details on that?
>>
>> Yeah, I'm not seeing how this works either, in order for
>> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to
>> have done migrate_disable() *before* schedule, schedule() will then have
>> to call migrate_disable_swich(), and *then* migrate_enable() does this.
>>
>> However, if things are nicely balanced (as they should be), then
>> trace_call_bpf() using migrate_disable()/migrate_enable() should never
>> hit this path.
>>
>> If, OTOH, migrate_disable() was called prior to schedule() and we did do
>> migrate_disable_switch(), then it should be impossible for the
>> tracepoint/bpf stuff to reach p->migration_disabled == 0.
>
> I think it's worth to confirm which kernel Satya is on too. If it's GKI, then
> worth checking first this is actually reproducible on/applicable to mainline.
Satya, do you still have these lines from your spin_dump() output showing
current, the kernel version and the hardware? Or a way to recreate this?
I couldn't provoke it so far.
...
[ 212.196452] BUG: spinlock recursion on CPU#4, bpftrace/1662
^^^^^^^^^^^^^
[ 212.196473] lock: 0xffff00097ef7f500, .magic: dead4ead, .owner: bpftrace/1662, .owner_cpu: 4
[ 212.196500] CPU: 4 PID: 1662 Comm: bpftrace Not tainted 5.19.0-rc2-00018-gb7ce5b6b4622-dirty #96
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[ 212.196513] Hardware name: ARM Juno development board (r0) (DT)
^^^^^^^^^^^^^^^^^^^^^^^^^^
[ 212.196520] Call trace:
...
On 7/4/22 1:32 AM, Dietmar Eggemann wrote: > On 01/07/2022 13:48, Qais Yousef wrote: >> On 07/01/22 10:33, Peter Zijlstra wrote: >>> On Thu, Jun 30, 2022 at 10:53:10PM +0100, Qais Yousef wrote: >>>> Hi Satya >>>> >>>> On 06/24/22 00:42, Satya Durga Srinivasu Prabhala wrote: >>>>> Below recursion is observed in a rare scenario where __schedule() >>>>> takes rq lock, at around same time task's affinity is being changed, >>>>> bpf function for tracing sched_switch calls migrate_enabled(), >>>>> checks for affinity change (cpus_ptr != cpus_mask) lands into >>>>> __set_cpus_allowed_ptr which tries acquire rq lock and causing the >>>>> recursion bug. >>>>> >>>>> Fix the issue by switching to preempt_enable/disable() for non-RT >>>>> Kernels. >>>> Interesting bug. Thanks for the report. Unfortunately I can't see this being >>>> a fix as it just limits the bug visibility to PREEMPT_RT kernels, but won't fix >>>> anything, no? ie: Kernels compiled with PREEMPT_RT will still hit this failure. >>> Worse, there's !RT stuff that grew to rely on the preemptible migrate >>> disable stuff, so this actively breaks things. >>> >>>> I'm curious how the race with set affinity is happening. I would have thought >>>> user space would get blocked as __schedule() will hold the rq lock. >>>> >>>> Do you have more details on that? >>> Yeah, I'm not seeing how this works either, in order for >>> migrate_enable() to actually call __set_cpus_allowed_ptr(), it needs to >>> have done migrate_disable() *before* schedule, schedule() will then have >>> to call migrate_disable_swich(), and *then* migrate_enable() does this. >>> >>> However, if things are nicely balanced (as they should be), then >>> trace_call_bpf() using migrate_disable()/migrate_enable() should never >>> hit this path. >>> >>> If, OTOH, migrate_disable() was called prior to schedule() and we did do >>> migrate_disable_switch(), then it should be impossible for the >>> tracepoint/bpf stuff to reach p->migration_disabled == 0. 
>> I think it's worth to confirm which kernel Satya is on too. If it's GKI, then >> worth checking first this is actually reproducible on/applicable to mainline. > Satya, do you still have these lines from your spin_dump() output showing > current, the kernel version and the hardware? Or a way to recreate this? > I couldn't provoke it so far. > > ... > [ 212.196452] BUG: spinlock recursion on CPU#4, bpftrace/1662 > ^^^^^^^^^^^^^ > [ 212.196473] lock: 0xffff00097ef7f500, .magic: dead4ead, .owner: bpftrace/1662, .owner_cpu: 4 > [ 212.196500] CPU: 4 PID: 1662 Comm: bpftrace Not tainted 5.19.0-rc2-00018-gb7ce5b6b4622-dirty #96 > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > [ 212.196513] Hardware name: ARM Juno development board (r0) (DT) > ^^^^^^^^^^^^^^^^^^^^^^^^^^ > [ 212.196520] Call trace: > ... Thanks for cross checking. Below are the output lines from spin_dump. I'm on 5.15 GKI Kernel. [ 7447.326924] BUG: spinlock recursion on CPU#6, kworker/6:9/738 [ 7447.333615] lock: 0xffffff89321d8600, .magic: dead4ead, .owner: kworker/6:9/738, .owner_cpu: 6 I'm trying to get the issue reproduced with some additional debug logs, didn't find any easy way so far. Still cross checking.
© 2016 - 2026 Red Hat, Inc.