sched/all: Change BUG_ON() instances to WARN_ON()

[PATCH] sched/all: Change BUG_ON() instances to WARN_ON()

Posted by Ingo Molnar 3 years, 8 months ago


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> I just tried to find a valid BUG_ON() that would make me go "yeah, that's 
> actually worth it", and couldn't really find one. Yeah, there are several 
> ones in the scheduler that make me go "ok, if that triggers, the machine 
> is dead anyway", so in that sense there are certainly BUG_ON()s that 
> don't _hurt_.

That's a mostly accidental, historical accumulation of BUG_ON()s - I 
believe we can change all of them to WARN_ON() via the patch below.

As far as the scheduler is concerned, we don't need any BUG_ON()s.

[ This assumes that printk() itself is atomic and non-recursive wrt. the 
  scheduler in these code paths ... ]

Thanks,

	Ingo

===============>
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 11 Aug 2022 08:54:52 +0200
Subject: [PATCH] sched/all: Change BUG_ON() instances to WARN_ON()

There's no good reason to crash a user's system with a BUG_ON(),
chances are high that they'll never even see the crash message on
Xorg, and it won't make it into the syslog either.

By using a WARN_ON() we at least give the user a chance to report
any bugs triggered here - instead of getting silent hangs.

None of these WARN_ON()s are supposed to trigger, ever - so we ignore
cases where a NULL check is done via a BUG_ON() and we let a NULL
pointer through after a WARN_ON().

There's one exception: WARN_ON() arguments with side-effects,
such as locking - in this case we use the return value of the
WARN_ON(), such as in:

 -       BUG_ON(!lock_task_sighand(p, &flags));
 +       if (WARN_ON(!lock_task_sighand(p, &flags)))
 +               return;

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/autogroup.c |  3 ++-
 kernel/sched/core.c      |  2 +-
 kernel/sched/cpupri.c    |  2 +-
 kernel/sched/deadline.c  | 26 +++++++++++++-------------
 kernel/sched/fair.c      | 10 +++++-----
 kernel/sched/rt.c        |  2 +-
 kernel/sched/sched.h     |  6 +++---
 7 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 4ebaf97f7bd8..13f6b6da35a0 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -161,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	struct task_struct *t;
 	unsigned long flags;
 
-	BUG_ON(!lock_task_sighand(p, &flags));
+	if (WARN_ON(!lock_task_sighand(p, &flags)))
+		return;
 
 	prev = p->signal->autogroup;
 	if (prev == ag) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d3d61cbb6b3c..f84206bf42cd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2328,7 +2328,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 	rq = cpu_rq(new_cpu);
 
 	rq_lock(rq, rf);
-	BUG_ON(task_cpu(p) != new_cpu);
+	WARN_ON(task_cpu(p) != new_cpu);
 	activate_task(rq, p, 0);
 	check_preempt_curr(rq, p, 0);
 
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index fa9ce9d83683..9f719e4ea081 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -147,7 +147,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
 	int task_pri = convert_prio(p->prio);
 	int idx, cpu;
 
-	BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+	WARN_ON(task_pri >= CPUPRI_NR_PRIORITIES);
 
 	for (idx = 0; idx < task_pri; idx++) {
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1d9c90958baa..fb234077c317 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -310,7 +310,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
 {
 	struct rq *rq;
 
-	BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+	WARN_ON(p->dl.flags & SCHED_FLAG_SUGOV);
 
 	if (task_on_rq_queued(p))
 		return;
@@ -607,7 +607,7 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 {
 	struct rb_node *leftmost;
 
-	BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+	WARN_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
 
 	leftmost = rb_add_cached(&p->pushable_dl_tasks,
 				 &rq->dl.pushable_dl_tasks_root,
@@ -684,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 			 * Failed to find any suitable CPU.
 			 * The task will never come back!
 			 */
-			BUG_ON(dl_bandwidth_enabled());
+			WARN_ON(dl_bandwidth_enabled());
 
 			/*
 			 * If admission control is disabled we
@@ -830,7 +830,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
+	WARN_ON(pi_of(dl_se)->dl_runtime <= 0);
 
 	/*
 	 * This could be the case for a !-dl task that is boosted.
@@ -1616,7 +1616,7 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 
-	BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+	WARN_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
 
 	rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
 
@@ -1640,7 +1640,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
 static void
 enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 {
-	BUG_ON(on_dl_rq(dl_se));
+	WARN_ON(on_dl_rq(dl_se));
 
 	update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
 
@@ -2017,7 +2017,7 @@ static struct task_struct *pick_task_dl(struct rq *rq)
 		return NULL;
 
 	dl_se = pick_next_dl_entity(dl_rq);
-	BUG_ON(!dl_se);
+	WARN_ON(!dl_se);
 	p = dl_task_of(dl_se);
 
 	return p;
@@ -2277,12 +2277,12 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 
 	p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
 
-	BUG_ON(rq->cpu != task_cpu(p));
-	BUG_ON(task_current(rq, p));
-	BUG_ON(p->nr_cpus_allowed <= 1);
+	WARN_ON(rq->cpu != task_cpu(p));
+	WARN_ON(task_current(rq, p));
+	WARN_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!task_on_rq_queued(p));
-	BUG_ON(!dl_task(p));
+	WARN_ON(!task_on_rq_queued(p));
+	WARN_ON(!dl_task(p));
 
 	return p;
 }
@@ -2492,7 +2492,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 	struct root_domain *src_rd;
 	struct rq *rq;
 
-	BUG_ON(!dl_task(p));
+	WARN_ON(!dl_task(p));
 
 	rq = task_rq(p);
 	src_rd = rq->rd;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da388657d5ac..00c01b3232b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2600,7 +2600,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	if (!join)
 		return;
 
-	BUG_ON(irqs_disabled());
+	WARN_ON(irqs_disabled());
 	double_lock_irq(&my_grp->lock, &grp->lock);
 
 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -7279,7 +7279,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	find_matching_se(&se, &pse);
-	BUG_ON(!pse);
+	WARN_ON(!pse);
 
 	cse_is_idle = se_is_idle(se);
 	pse_is_idle = se_is_idle(pse);
@@ -8159,7 +8159,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 {
 	lockdep_assert_rq_held(rq);
 
-	BUG_ON(task_rq(p) != rq);
+	WARN_ON(task_rq(p) != rq);
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
 	check_preempt_curr(rq, p, 0);
 }
@@ -10134,7 +10134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		goto out_balanced;
 	}
 
-	BUG_ON(busiest == env.dst_rq);
+	WARN_ON(busiest == env.dst_rq);
 
 	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
 
@@ -10430,7 +10430,7 @@ static int active_load_balance_cpu_stop(void *data)
 	 * we need to fix it. Originally reported by
 	 * Bjorn Helgaas on a 128-CPU setup.
 	 */
-	BUG_ON(busiest_rq == target_rq);
+	WARN_ON(busiest_rq == target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 054b6711e961..acf9f5ce0c4a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -843,7 +843,7 @@ static void __disable_runtime(struct rq *rq)
 		 * We cannot be left wanting - that would mean some runtime
 		 * leaked out of the system.
 		 */
-		BUG_ON(want);
+		WARN_ON(want);
 balanced:
 		/*
 		 * Disable all the borrow logic by pretending we have inf
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b0bf2287dd9d..8e5df3bc3483 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2699,8 +2699,8 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	__acquires(rq1->lock)
 	__acquires(rq2->lock)
 {
-	BUG_ON(!irqs_disabled());
-	BUG_ON(rq1 != rq2);
+	WARN_ON(!irqs_disabled());
+	WARN_ON(rq1 != rq2);
 	raw_spin_rq_lock(rq1);
 	__acquire(rq2->lock);	/* Fake it out ;) */
 	double_rq_clock_clear_update(rq1, rq2);
@@ -2716,7 +2716,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__releases(rq1->lock)
 	__releases(rq2->lock)
 {
-	BUG_ON(rq1 != rq2);
+	WARN_ON(rq1 != rq2);
 	raw_spin_rq_unlock(rq1);
 	__release(rq2->lock);
 }

Re: [PATCH] sched/all: Change BUG_ON() instances to WARN_ON()

Posted by Linus Torvalds 3 years, 8 months ago

On Thu, Aug 11, 2022 at 12:13 AM Ingo Molnar <mingo@kernel.org> wrote:
>
> By using a WARN_ON() we at least give the user a chance to report
> any bugs triggered here - instead of getting silent hangs.
>
> None of these WARN_ON()s are supposed to trigger, ever - so we ignore
> cases where a NULL check is done via a BUG_ON() and we let a NULL
> pointer through after a WARN_ON().

May I suggest going one step further, and making these WARN_ON_ONCE() instead.

From personal experience, once some scheduler bug (or task struct
corruption) happens, ti often *keeps* happening, and the logs just
fill up with more and more data, to the point where you lose sight of
the original report (and the machine can even get unusable just from
the logging).

WARN_ON_ONCE() can help that situation.

Now, obviously

 (a) WARN_ON_ONCE *can* also result in less information, and maybe
there are situations where having more - possibly different - cases of
the same thing triggering could be useful.

 (b) WARN_ON_ONCE historically generated a bit bigger code than
WARN_ON simply due to the extra "did this already trigger" check.

I *think* (b) is no longer true, and it's just a flag these days, but
I didn't actually check.

so it's not like there aren't potential downsides, but in general I
think the sanest and most natural thing is to have BUG_ON() translate
to WARN_ON_ONCE().

For the "reboot-on-warn" people, it ends up being the same thing. And
for the rest of us, the "give me *one* warning" can end up making the
reporting a lot easier.

Obviously, with the "this never actually happens", the whole "once or
many times" is kind of moot. But if it never happens at all, to the
point where it doesn't even add a chance of helping debugging, maybe
the whole test should be removed entirely...

           Linus

Re: [PATCH] sched/all: Change BUG_ON() instances to WARN_ON()

Posted by Matthew Wilcox 3 years, 8 months ago

On Thu, Aug 11, 2022 at 01:43:09PM -0700, Linus Torvalds wrote:
> May I suggest going one step further, and making these WARN_ON_ONCE() instead.
> 
> >From personal experience, once some scheduler bug (or task struct
> corruption) happens, ti often *keeps* happening, and the logs just
> fill up with more and more data, to the point where you lose sight of
> the original report (and the machine can even get unusable just from
> the logging).

I've been thinking about magically turning all the WARN_ON_ONCE() into
(effectively) WARN_ON_RATELIMIT().  I had some patches in that direction
a while ago but never got round to tidying them up for submission.

Re: [PATCH] sched/all: Change BUG_ON() instances to WARN_ON()

Posted by Jason Gunthorpe 3 years, 8 months ago

On Thu, Aug 11, 2022 at 10:28:27PM +0100, Matthew Wilcox wrote:
> On Thu, Aug 11, 2022 at 01:43:09PM -0700, Linus Torvalds wrote:
> > May I suggest going one step further, and making these WARN_ON_ONCE() instead.
> > 
> > >From personal experience, once some scheduler bug (or task struct
> > corruption) happens, ti often *keeps* happening, and the logs just
> > fill up with more and more data, to the point where you lose sight of
> > the original report (and the machine can even get unusable just from
> > the logging).
> 
> I've been thinking about magically turning all the WARN_ON_ONCE() into
> (effectively) WARN_ON_RATELIMIT().  I had some patches in that direction
> a while ago but never got round to tidying them up for submission.

I often wonder if we have a justification for WARN_ON to even exist, I
see a lot of pressure to make things into WARN_ON_ONCE based on the
logic that spamming makes it useless..

Maybe a global limit of 10 warn ons per minute or something would be
interesting?

Jason

Re: [PATCH] sched/all: Change BUG_ON() instances to WARN_ON()

Posted by John Hubbard 3 years, 8 months ago

On 8/11/22 16:22, Jason Gunthorpe wrote:
> On Thu, Aug 11, 2022 at 10:28:27PM +0100, Matthew Wilcox wrote:
>> On Thu, Aug 11, 2022 at 01:43:09PM -0700, Linus Torvalds wrote:
>>> May I suggest going one step further, and making these WARN_ON_ONCE() instead.
>>>
>>> >From personal experience, once some scheduler bug (or task struct
>>> corruption) happens, ti often *keeps* happening, and the logs just
>>> fill up with more and more data, to the point where you lose sight of
>>> the original report (and the machine can even get unusable just from
>>> the logging).
>>
>> I've been thinking about magically turning all the WARN_ON_ONCE() into
>> (effectively) WARN_ON_RATELIMIT().  I had some patches in that direction
>> a while ago but never got round to tidying them up for submission.

If you do that, I'd like to suggest that you avoid using magic here, but
instead just rename at the call sites.

Because:

First and foremost, something named WARN_ON_ONCE() clearly has a solemn
responsibility to warn exactly "once times"! :)

Second, it's not yet clear (or is it?) that WARN_ON_ONCE() is always
worse than rate limiting. It's a trade-off, rather than a clear win for
either case, in my experience. The _ONCE variant can get overwritten
if the kernel log wraps, but the _RATELIMIT on the other hand, may be
excessive.

And finally, if it *is* agreed on here that WARN_ON_RATELIMIT() is
always better than WARN_ON_ONCE(), then there is still no harm in
spending a patch or two (coccinelle...) to rename WARN_ON_ONCE() -->
WARN_ON_RATELIMIT(), so that we end up with accurate names.

> 
> I often wonder if we have a justification for WARN_ON to even exist, I
> see a lot of pressure to make things into WARN_ON_ONCE based on the
> logic that spamming makes it useless..

Agreed. WARN_ON_ONCE() or WARN_ON_RATELIMIT(), take your pick. But not
WARN_ON_EVERY_TIME()--that usually causes a serious problems in the
logs.

thanks,
-- 
John Hubbard
NVIDIA

[PATCH v2] sched/all: Change all BUG_ON() instances in the scheduler to WARN_ON_ONCE()

Posted by Ingo Molnar 3 years, 8 months ago


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, Aug 11, 2022 at 12:13 AM Ingo Molnar <mingo@kernel.org> wrote:
> >
> > By using a WARN_ON() we at least give the user a chance to report
> > any bugs triggered here - instead of getting silent hangs.
> >
> > None of these WARN_ON()s are supposed to trigger, ever - so we ignore
> > cases where a NULL check is done via a BUG_ON() and we let a NULL
> > pointer through after a WARN_ON().
> 
> May I suggest going one step further, and making these WARN_ON_ONCE() instead.

Ok, agreed.

> From personal experience, once some scheduler bug (or task struct 
> corruption) happens, ti often *keeps* happening, and the logs just fill 
> up with more and more data, to the point where you lose sight of the 
> original report (and the machine can even get unusable just from the 
> logging).
> 
> WARN_ON_ONCE() can help that situation.

Yeah, true.

> Now, obviously
> 
>  (a) WARN_ON_ONCE *can* also result in less information, and maybe there 
> are situations where having more - possibly different - cases of the same 
> thing triggering could be useful.

None of these warnings are supposed to happen absent serious random data 
corruption, ever™, so if against expectations they do happen once, 
somewhere, and its brevity makes it hard to figure out, we can still 
reconsider on a case by case basis & increase verbosity.

>  (b) WARN_ON_ONCE historically generated a bit bigger code than
> WARN_ON simply due to the extra "did this already trigger" check.
> 
> I *think* (b) is no longer true, and it's just a flag these days, but I 
> didn't actually check.

Yeah, so on architectures that implement a smart __WARN_FLAGS() primitive, 
such as x86, overhead is pretty good:

#define __WARN_FLAGS(flags)                                     \
do {                                                            \
        __auto_type __flags = BUGFLAG_WARNING|(flags);          \
        instrumentation_begin();                                \
        _BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE);            \
        instrumentation_end();                                  \
} while (0)

For them the runtime overhead of a WARN_ON_ONCE() is basically just the 
check itself:

# no checks:

ffffffff810a3ae0 <check_preempt_curr>:
ffffffff810a3ae0:       48 8b 87 20 09 00 00    mov    0x920(%rdi),%rax
ffffffff810a3ae7:       48 8b 8e 90 02 00 00    mov    0x290(%rsi),%rcx
ffffffff810a3aee:       53                      push   %rbx
ffffffff810a3aef:       48 89 fb                mov    %rdi,%rbx
ffffffff810a3af2:       48 3b 88 90 02 00 00    cmp    0x290(%rax),%rcx

# Single-branch WARN_ON_ONCE():
#
#  void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#  {
# +       WARN_ON_ONCE(!rq);
# +
#         if (p->sched_class == rq->curr->sched_class)
#                 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
#         else if (sched_class_above(p->sched_class, rq->curr->sched_class))

ffffffff810a3ae0 <check_preempt_curr>:
ffffffff810a3ae0:       53                      push   %rbx
ffffffff810a3ae1:       48 89 fb                mov    %rdi,%rbx
ffffffff810a3ae4:  |    48 85 ff                test   %rdi,%rdi
ffffffff810a3ae7:  |    74 69                   je     ffffffff810a3b52 <check_preempt_curr+0x72>
ffffffff810a3ae9:       48 8b 83 20 09 00 00    mov    0x920(%rbx),%rax
ffffffff810a3af0:       48 8b 8e 90 02 00 00    mov    0x290(%rsi),%rcx
ffffffff810a3af7:       48 3b 88 90 02 00 00    cmp    0x290(%rax),%rcx
...
ffffffff810a3b50:       eb d1                   jmp    ffffffff810a3b23 <check_preempt_curr+0x43>    # tail-call
ffffffff810a3b52:  |    0f 0b                   ud2    

So it's a test instruction and an unlikely-forward-branch, plus a UD2 trap 
squeezed into the function epilogue, often hidden by alignment holes.

As lightweight as it gets, and no in-line penalty from being a 'once' 
warning.

> so it's not like there aren't potential downsides, but in general I think 
> the sanest and most natural thing is to have BUG_ON() translate to 
> WARN_ON_ONCE().
>
> For the "reboot-on-warn" people, it ends up being the same thing. And for 
> the rest of us, the "give me *one* warning" can end up making the 
> reporting a lot easier.

Agreed - updated v2 patch attached.

> Obviously, with the "this never actually happens", the whole "once or 
> many times" is kind of moot. But if it never happens at all, to the point 
> where it doesn't even add a chance of helping debugging, maybe the whole 
> test should be removed entirely...

Yeah.

Thanks,

	Ingo

========================================

From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 11 Aug 2022 08:54:52 +0200
Subject: [PATCH] sched/all: Change all BUG_ON() instances in the scheduler to WARN_ON_ONCE()

There's no good reason to crash a user's system with a BUG_ON(),
chances are high that they'll never even see the crash message on
Xorg, and it won't make it into the syslog either.

By using a WARN_ON_ONCE() we at least give the user a chance to report
any bugs triggered here - instead of getting silent hangs.

None of these WARN_ON_ONCE()s are supposed to trigger, ever - so we ignore
cases where a NULL check is done via a BUG_ON() and we let a NULL
pointer through after a WARN_ON_ONCE().

There's one exception: WARN_ON_ONCE() arguments with side-effects,
such as locking - in this case we use the return value of the
WARN_ON_ONCE(), such as in:

 -       BUG_ON(!lock_task_sighand(p, &flags));
 +       if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
 +               return;

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/YvSsKcAXISmshtHo@gmail.com
---
 kernel/sched/autogroup.c |  3 ++-
 kernel/sched/core.c      |  2 +-
 kernel/sched/cpupri.c    |  2 +-
 kernel/sched/deadline.c  | 26 +++++++++++++-------------
 kernel/sched/fair.c      | 10 +++++-----
 kernel/sched/rt.c        |  2 +-
 kernel/sched/sched.h     |  6 +++---
 7 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 4ebaf97f7bd8..991fc9002535 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -161,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	struct task_struct *t;
 	unsigned long flags;
 
-	BUG_ON(!lock_task_sighand(p, &flags));
+	if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
+		return;
 
 	prev = p->signal->autogroup;
 	if (prev == ag) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee28253c9ac0..813687a5f5cf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2328,7 +2328,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 	rq = cpu_rq(new_cpu);
 
 	rq_lock(rq, rf);
-	BUG_ON(task_cpu(p) != new_cpu);
+	WARN_ON_ONCE(task_cpu(p) != new_cpu);
 	activate_task(rq, p, 0);
 	check_preempt_curr(rq, p, 0);
 
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index fa9ce9d83683..a286e726eb4b 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -147,7 +147,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
 	int task_pri = convert_prio(p->prio);
 	int idx, cpu;
 
-	BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+	WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
 
 	for (idx = 0; idx < task_pri; idx++) {
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0ab79d819a0d..962b169b05cf 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -310,7 +310,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
 {
 	struct rq *rq;
 
-	BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+	WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
 
 	if (task_on_rq_queued(p))
 		return;
@@ -607,7 +607,7 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 {
 	struct rb_node *leftmost;
 
-	BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
 
 	leftmost = rb_add_cached(&p->pushable_dl_tasks,
 				 &rq->dl.pushable_dl_tasks_root,
@@ -684,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 			 * Failed to find any suitable CPU.
 			 * The task will never come back!
 			 */
-			BUG_ON(dl_bandwidth_enabled());
+			WARN_ON_ONCE(dl_bandwidth_enabled());
 
 			/*
 			 * If admission control is disabled we
@@ -830,7 +830,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
+	WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
 
 	/*
 	 * This could be the case for a !-dl task that is boosted.
@@ -1616,7 +1616,7 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 
-	BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
 
 	rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
 
@@ -1640,7 +1640,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
 static void
 enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 {
-	BUG_ON(on_dl_rq(dl_se));
+	WARN_ON_ONCE(on_dl_rq(dl_se));
 
 	update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
 
@@ -2017,7 +2017,7 @@ static struct task_struct *pick_task_dl(struct rq *rq)
 		return NULL;
 
 	dl_se = pick_next_dl_entity(dl_rq);
-	BUG_ON(!dl_se);
+	WARN_ON_ONCE(!dl_se);
 	p = dl_task_of(dl_se);
 
 	return p;
@@ -2277,12 +2277,12 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 
 	p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
 
-	BUG_ON(rq->cpu != task_cpu(p));
-	BUG_ON(task_current(rq, p));
-	BUG_ON(p->nr_cpus_allowed <= 1);
+	WARN_ON_ONCE(rq->cpu != task_cpu(p));
+	WARN_ON_ONCE(task_current(rq, p));
+	WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!task_on_rq_queued(p));
-	BUG_ON(!dl_task(p));
+	WARN_ON_ONCE(!task_on_rq_queued(p));
+	WARN_ON_ONCE(!dl_task(p));
 
 	return p;
 }
@@ -2492,7 +2492,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 	struct root_domain *src_rd;
 	struct rq *rq;
 
-	BUG_ON(!dl_task(p));
+	WARN_ON_ONCE(!dl_task(p));
 
 	rq = task_rq(p);
 	src_rd = rq->rd;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 914096c5b1ae..28f10dccd194 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2600,7 +2600,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	if (!join)
 		return;
 
-	BUG_ON(irqs_disabled());
+	WARN_ON_ONCE(irqs_disabled());
 	double_lock_irq(&my_grp->lock, &grp->lock);
 
 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -7279,7 +7279,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	find_matching_se(&se, &pse);
-	BUG_ON(!pse);
+	WARN_ON_ONCE(!pse);
 
 	cse_is_idle = se_is_idle(se);
 	pse_is_idle = se_is_idle(pse);
@@ -8159,7 +8159,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 {
 	lockdep_assert_rq_held(rq);
 
-	BUG_ON(task_rq(p) != rq);
+	WARN_ON_ONCE(task_rq(p) != rq);
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
 	check_preempt_curr(rq, p, 0);
 }
@@ -10134,7 +10134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		goto out_balanced;
 	}
 
-	BUG_ON(busiest == env.dst_rq);
+	WARN_ON_ONCE(busiest == env.dst_rq);
 
 	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
 
@@ -10430,7 +10430,7 @@ static int active_load_balance_cpu_stop(void *data)
 	 * we need to fix it. Originally reported by
 	 * Bjorn Helgaas on a 128-CPU setup.
 	 */
-	BUG_ON(busiest_rq == target_rq);
+	WARN_ON_ONCE(busiest_rq == target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 55f39c8f4203..2936fe55cef7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -843,7 +843,7 @@ static void __disable_runtime(struct rq *rq)
 		 * We cannot be left wanting - that would mean some runtime
 		 * leaked out of the system.
 		 */
-		BUG_ON(want);
+		WARN_ON_ONCE(want);
 balanced:
 		/*
 		 * Disable all the borrow logic by pretending we have inf
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e26688d387ae..7a44dceeb50a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2709,8 +2709,8 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	__acquires(rq1->lock)
 	__acquires(rq2->lock)
 {
-	BUG_ON(!irqs_disabled());
-	BUG_ON(rq1 != rq2);
+	WARN_ON_ONCE(!irqs_disabled());
+	WARN_ON_ONCE(rq1 != rq2);
 	raw_spin_rq_lock(rq1);
 	__acquire(rq2->lock);	/* Fake it out ;) */
 	double_rq_clock_clear_update(rq1, rq2);
@@ -2726,7 +2726,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__releases(rq1->lock)
 	__releases(rq2->lock)
 {
-	BUG_ON(rq1 != rq2);
+	WARN_ON_ONCE(rq1 != rq2);
 	raw_spin_rq_unlock(rq1);
 	__release(rq2->lock);
 }