kernel/sched/fair.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-)
kernel encounters the following error when running workload:
BUG: kernel NULL pointer dereference, address: 0000002c
EIP: set_next_entity (fair.c:?)
which was caused by NULL pointer returned by pick_eevdf().
Further investigation has shown that, the entity_eligible() has a
false-negative issue when the entity's vruntime is far behind the
cfs_rq.min_vruntime that, the (vruntime - cfs_rq->min_vruntime) * load
caused a s64 overflow, thus every entity on the rb-tree is not
eligible, which results in a NULL candidate.
The reason why entity's vruntime is far behind the cfs_rq.min_vruntime
is because during an on_rq task group's update_cfs_group()->reweight_eevdf(),
there is no limit on the new entity's vlag. If the new weight is much
smaller than the old one,
vlag = div_s64(vlag * old_weight, weight)
generates a huge vlag, and results in a very small (negative) vruntime.
Thus limit the range of vlag accordingly.
Reported-by: Sergei Trofimovich <slyich@gmail.com>
Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/
Reported-by: Igor Raits <igor@gooddata.com>
Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/
Reported-by: Breno Leitao <leitao@debian.org>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/
Reported-by: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
---
changes of v2:
-add reported-by (suggested by <yu.c.chen@intel.com>)
-rework the changelog (<yu.c.chen@intel.com>)
-remove the judgement of fork (Peter)
-remove the !on_rq case. (Peter)
---
Previous discussion link:
https://lore.kernel.org/lkml/20240226082349.302363-1-yu.c.chen@intel.com/
https://lore.kernel.org/all/20240130080643.1828-1-xuewen.yan@unisoc.com/
---
---
kernel/sched/fair.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03be0d1330a6..64826f406d6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -696,15 +696,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(u64 avruntime, struct sched_entity *se)
{
- s64 lag, limit;
+ s64 vlag, limit;
+
+ vlag = avruntime - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ return clamp(vlag, -limit, limit);
+}
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
SCHED_WARN_ON(!se->on_rq);
- lag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
- se->vlag = clamp(lag, -limit, limit);
+ se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
}
/*
@@ -3761,7 +3767,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
* = V - vl'
*/
if (avruntime != se->vruntime) {
- vlag = (s64)(avruntime - se->vruntime);
+ vlag = entity_lag(avruntime, se);
vlag = div_s64(vlag * old_weight, weight);
se->vruntime = avruntime - vlag;
}
--
2.25.1
On 2024-04-22 at 16:22:38 +0800, Xuewen Yan wrote:
> kernel encounters the following error when running workload:
>
> BUG: kernel NULL pointer dereference, address: 0000002c
> EIP: set_next_entity (fair.c:?)
>
> which was caused by NULL pointer returned by pick_eevdf().
>
> Further investigation has shown that, the entity_eligible() has an
> false-negative issue when the entity's vruntime is far behind the
> cfs_rq.min_vruntime that, the (vruntime - cfs_rq->min_vruntime) * load
> caused a s64 overflow, thus every entity on the rb-tree is not
> eligible, which results in a NULL candidate.
>
> The reason why entity's vruntime is far behind the cfs_rq.min_vruntime
> is because during a on_rq task group's update_cfs_group()->reweight_eevdf(),
> there is no limit on the new entity's vlag. If the new weight is much
> smaller than the old one,
>
> vlag = div_s64(vlag * old_weight, weight)
>
> generates a huge vlag, and results in very small(negative) vruntime.
>
> Thus limit the range of vlag accordingly.
>
Thanks for the fix.
Might also add comments from Tim suggested here:
https://lore.kernel.org/lkml/ec479251e6245148b89b226f734192f6d5343bbc.camel@linux.intel.com/
A fix tag might be needed.
Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
> Reported-by: Sergei Trofimovich <slyich@gmail.com>
> Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/
> Reported-by: Igor Raits <igor@gooddata.com>
> Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/
> Reported-by: Breno Leitao <leitao@debian.org>
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/
> Reported-by: Yujie Liu <yujie.liu@intel.com>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> ---
Cced Sergei, Igor, Breno who have encountered this NULL pointer issue before.
From my testing, with this applied I did not see the NULL pointer exception
after running trinity for 100 cycles, so
Reviewed-and-tested-by: Chen Yu <yu.c.chen@intel.com>
thanks,
Chenyu
> changes of v2:
> -add reported-by (suggested by <yu.c.chen@intel.com>)
> -remork the changelog (<yu.c.chen@intel.com>)
> -remove the judgement of fork (Peter)
> -remove the !on_rq case. (Peter)
> ---
> Previous discussion link:
> https://lore.kernel.org/lkml/20240226082349.302363-1-yu.c.chen@intel.com/
> https://lore.kernel.org/all/20240130080643.1828-1-xuewen.yan@unisoc.com/
> ---
> ---
> kernel/sched/fair.c | 18 ++++++++++++------
> 1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 03be0d1330a6..64826f406d6d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -696,15 +696,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
> *
> * XXX could add max_slice to the augmented data to track this.
> */
> -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static s64 entity_lag(u64 avruntime, struct sched_entity *se)
> {
> - s64 lag, limit;
> + s64 vlag, limit;
> +
> + vlag = avruntime - se->vruntime;
> + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
> +
> + return clamp(vlag, -limit, limit);
> +}
>
> +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> SCHED_WARN_ON(!se->on_rq);
> - lag = avg_vruntime(cfs_rq) - se->vruntime;
>
> - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
> - se->vlag = clamp(lag, -limit, limit);
> + se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
> }
>
> /*
> @@ -3761,7 +3767,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
> * = V - vl'
> */
> if (avruntime != se->vruntime) {
> - vlag = (s64)(avruntime - se->vruntime);
> + vlag = entity_lag(avruntime, se);
> vlag = div_s64(vlag * old_weight, weight);
> se->vruntime = avruntime - vlag;
> }
> --
> 2.25.1
>
On Mon, Apr 22, 2024 at 04:47:31PM +0800, Chen Yu wrote:
> On 2024-04-22 at 16:22:38 +0800, Xuewen Yan wrote:
> > kernel encounters the following error when running workload:
> >
> > BUG: kernel NULL pointer dereference, address: 0000002c
> > EIP: set_next_entity (fair.c:?)
> >
> > which was caused by NULL pointer returned by pick_eevdf().
> >
> > Further investigation has shown that, the entity_eligible() has an
> > false-negative issue when the entity's vruntime is far behind the
> > cfs_rq.min_vruntime that, the (vruntime - cfs_rq->min_vruntime) * load
> > caused a s64 overflow, thus every entity on the rb-tree is not
> > eligible, which results in a NULL candidate.
> >
> > The reason why entity's vruntime is far behind the cfs_rq.min_vruntime
> > is because during a on_rq task group's update_cfs_group()->reweight_eevdf(),
> > there is no limit on the new entity's vlag. If the new weight is much
> > smaller than the old one,
> >
> > vlag = div_s64(vlag * old_weight, weight)
> >
> > generates a huge vlag, and results in very small(negative) vruntime.
> >
> > Thus limit the range of vlag accordingly.
> >
>
> Thanks for the fix.
>
> Might also add comments from Tim suggested here:
> https://lore.kernel.org/lkml/ec479251e6245148b89b226f734192f6d5343bbc.camel@linux.intel.com/
>
> A fix tag might be needed.
> Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
>
> > Reported-by: Sergei Trofimovich <slyich@gmail.com>
> > Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/
> > Reported-by: Igor Raits <igor@gooddata.com>
> > Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/
> > Reported-by: Breno Leitao <leitao@debian.org>
> > Reported-by: kernel test robot <oliver.sang@intel.com>
> > Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/
> > Reported-by: Yujie Liu <yujie.liu@intel.com>
> > Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> > ---
>
> Cced Sergei, Igor, Breno who have encountered this NULL pointer issue before.
>
> From my testing, with this applied I did not see the NULL pointer exception
> after running trinity for 100 cycles, so
>
> Reviewed-and-tested-by: Chen Yu <yu.c.chen@intel.com>
>
> thanks,
> Chenyu
>
From 0-Day testing, with this patch applied on v6.9-rc5, we no longer
see the NULL pointer issue in 999 cycles of trinity test.
Tested-by: Yujie Liu <yujie.liu@intel.com>
=========================================================================================
tbox_group/testcase/rootfs/kconfig/compiler/runtime/group/nr_groups:
vm-snb/trinity/debian-11.1-i386-20220923.cgz/i386-randconfig-004-20240122/clang-17/300s/group-03/5
commit:
v6.9-rc5
v6.9-rc5+patch ("sched/eevdf: Prevent vlag from going out of bounds when reweight_eevdf")
v6.9-rc5 v6.9-rc5+patch
---------------- ---------------------------
fail:runs %reproduction fail:runs
| | |
41:999 -4% :999 dmesg.BUG:kernel_NULL_pointer_dereference,address
24:999 -2% :999 dmesg.EIP:pick_next_task_fair
17:999 -2% :999 dmesg.EIP:set_next_entity
41:999 -4% :999 dmesg.Kernel_panic-not_syncing:Fatal_exception
41:999 -4% :999 dmesg.Oops:#[##]
> > changes of v2:
> > -add reported-by (suggested by <yu.c.chen@intel.com>)
> > -remork the changelog (<yu.c.chen@intel.com>)
> > -remove the judgement of fork (Peter)
> > -remove the !on_rq case. (Peter)
> > ---
> > Previous discussion link:
> > https://lore.kernel.org/lkml/20240226082349.302363-1-yu.c.chen@intel.com/
> > https://lore.kernel.org/all/20240130080643.1828-1-xuewen.yan@unisoc.com/
> > ---
> > ---
> > kernel/sched/fair.c | 18 ++++++++++++------
> > 1 file changed, 12 insertions(+), 6 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 03be0d1330a6..64826f406d6d 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -696,15 +696,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
> > *
> > * XXX could add max_slice to the augmented data to track this.
> > */
> > -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > +static s64 entity_lag(u64 avruntime, struct sched_entity *se)
> > {
> > - s64 lag, limit;
> > + s64 vlag, limit;
> > +
> > + vlag = avruntime - se->vruntime;
> > + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
> > +
> > + return clamp(vlag, -limit, limit);
> > +}
> >
> > +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > +{
> > SCHED_WARN_ON(!se->on_rq);
> > - lag = avg_vruntime(cfs_rq) - se->vruntime;
> >
> > - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
> > - se->vlag = clamp(lag, -limit, limit);
> > + se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
> > }
> >
> > /*
> > @@ -3761,7 +3767,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
> > * = V - vl'
> > */
> > if (avruntime != se->vruntime) {
> > - vlag = (s64)(avruntime - se->vruntime);
> > + vlag = entity_lag(avruntime, se);
> > vlag = div_s64(vlag * old_weight, weight);
> > se->vruntime = avruntime - vlag;
> > }
> > --
> > 2.25.1
> >
Hi Peter,
Because the issue may be urgent for many people, I sent the patch first.
However, when I test on the Android system, I find there still are
vlags which exceed the limit.
On the Android system, the nice value of a task will change very
frequently. The limit can also be exceeded.
Maybe the !on_rq case is still necessary.
So I'm planning to propose another patch for !on_rq case later after
careful testing locally.
BR
On Mon, Apr 22, 2024 at 4:23 PM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
>
> kernel encounters the following error when running workload:
>
> BUG: kernel NULL pointer dereference, address: 0000002c
> EIP: set_next_entity (fair.c:?)
>
> which was caused by NULL pointer returned by pick_eevdf().
>
> Further investigation has shown that, the entity_eligible() has an
> false-negative issue when the entity's vruntime is far behind the
> cfs_rq.min_vruntime that, the (vruntime - cfs_rq->min_vruntime) * load
> caused a s64 overflow, thus every entity on the rb-tree is not
> eligible, which results in a NULL candidate.
>
> The reason why entity's vruntime is far behind the cfs_rq.min_vruntime
> is because during a on_rq task group's update_cfs_group()->reweight_eevdf(),
> there is no limit on the new entity's vlag. If the new weight is much
> smaller than the old one,
>
> vlag = div_s64(vlag * old_weight, weight)
>
> generates a huge vlag, and results in very small(negative) vruntime.
>
> Thus limit the range of vlag accordingly.
>
> Reported-by: Sergei Trofimovich <slyich@gmail.com>
> Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/
> Reported-by: Igor Raits <igor@gooddata.com>
> Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/
> Reported-by: Breno Leitao <leitao@debian.org>
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/
> Reported-by: Yujie Liu <yujie.liu@intel.com>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> ---
> changes of v2:
> -add reported-by (suggested by <yu.c.chen@intel.com>)
> -remork the changelog (<yu.c.chen@intel.com>)
> -remove the judgement of fork (Peter)
> -remove the !on_rq case. (Peter)
> ---
> Previous discussion link:
> https://lore.kernel.org/lkml/20240226082349.302363-1-yu.c.chen@intel.com/
> https://lore.kernel.org/all/20240130080643.1828-1-xuewen.yan@unisoc.com/
> ---
> ---
> kernel/sched/fair.c | 18 ++++++++++++------
> 1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 03be0d1330a6..64826f406d6d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -696,15 +696,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
> *
> * XXX could add max_slice to the augmented data to track this.
> */
> -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static s64 entity_lag(u64 avruntime, struct sched_entity *se)
> {
> - s64 lag, limit;
> + s64 vlag, limit;
> +
> + vlag = avruntime - se->vruntime;
> + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
> +
> + return clamp(vlag, -limit, limit);
> +}
>
> +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> SCHED_WARN_ON(!se->on_rq);
> - lag = avg_vruntime(cfs_rq) - se->vruntime;
>
> - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
> - se->vlag = clamp(lag, -limit, limit);
> + se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
> }
>
> /*
> @@ -3761,7 +3767,7 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
> * = V - vl'
> */
> if (avruntime != se->vruntime) {
> - vlag = (s64)(avruntime - se->vruntime);
> + vlag = entity_lag(avruntime, se);
> vlag = div_s64(vlag * old_weight, weight);
> se->vruntime = avruntime - vlag;
> }
> --
> 2.25.1
>
>
On Mon, Apr 22, 2024 at 04:33:37PM +0800, Xuewen Yan wrote: > On the Android system, the nice value of a task will change very > frequently. The limit can also be exceeded. > Maybe the !on_rq case is still necessary. > So I'm planning to propose another patch for !on_rq case later after > careful testing locally. So the scaling is: vlag = vlag * old_Weight / weight But given that integer devision is truncating, you could expect repeated application of such scaling would eventually decrease the vlag instead of grow it. Is there perhaps an invocation of reweight_task() missing? Looking at set_load_weight() I'm suspicious of the task_has_idle_policy() case.
On Mon, Apr 22, 2024 at 5:42 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Apr 22, 2024 at 04:33:37PM +0800, Xuewen Yan wrote: > > > On the Android system, the nice value of a task will change very > > frequently. The limit can also be exceeded. > > Maybe the !on_rq case is still necessary. > > So I'm planning to propose another patch for !on_rq case later after > > careful testing locally. > > So the scaling is: vlag = vlag * old_Weight / weight > > But given that integer devision is truncating, you could expect repeated > application of such scaling would eventually decrease the vlag instead > of grow it. > > Is there perhaps an invocation of reweight_task() missing? Looking at Is it necessary to add reweight_task in the prio_changed_fair()? > set_load_weight() I'm suspicious of the task_has_idle_policy() case.
On Mon, Apr 22, 2024 at 07:07:25PM +0800, Xuewen Yan wrote: > On Mon, Apr 22, 2024 at 5:42 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Mon, Apr 22, 2024 at 04:33:37PM +0800, Xuewen Yan wrote: > > > > > On the Android system, the nice value of a task will change very > > > frequently. The limit can also be exceeded. > > > Maybe the !on_rq case is still necessary. > > > So I'm planning to propose another patch for !on_rq case later after > > > careful testing locally. > > > > So the scaling is: vlag = vlag * old_Weight / weight > > > > But given that integer devision is truncating, you could expect repeated > > application of such scaling would eventually decrease the vlag instead > > of grow it. > > > > Is there perhaps an invocation of reweight_task() missing? Looking at > > Is it necessary to add reweight_task in the prio_changed_fair()? I think that's the wrong place. Note how __setscheduler_params() already has set_load_weight(). And all other callers of ->prio_changed() already seem to do set_load_weight() as well. But that idle policy thing there still looks wrong, that sets the weight very low but doesn't re-adjust anything.
Hi peter, On Mon, Apr 22, 2024 at 7:17 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Mon, Apr 22, 2024 at 07:07:25PM +0800, Xuewen Yan wrote: > > On Mon, Apr 22, 2024 at 5:42 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > On Mon, Apr 22, 2024 at 04:33:37PM +0800, Xuewen Yan wrote: > > > > > > > On the Android system, the nice value of a task will change very > > > > frequently. The limit can also be exceeded. > > > > Maybe the !on_rq case is still necessary. > > > > So I'm planning to propose another patch for !on_rq case later after > > > > careful testing locally. > > > > > > So the scaling is: vlag = vlag * old_Weight / weight > > > > > > But given that integer devision is truncating, you could expect repeated > > > application of such scaling would eventually decrease the vlag instead > > > of grow it. > > > > > > Is there perhaps an invocation of reweight_task() missing? Looking at > > > > Is it necessary to add reweight_task in the prio_changed_fair()? > > I think that's the wrong place. Note how __setscheduler_params() already > has set_load_weight(). And all other callers of ->prio_changed() already > seem to do set_load_weight() as well. > > But that idle policy thing there still looks wrong, that sets the weight > very low but doesn't re-adjust anything. 
By adding a log to observe weight changes in reweight_entity, I found that calc_group_shares() often causes new_weight to become very small: Hardware name: Unisoc UMS-base Board (DT) Call trace: dump_backtrace+0xec/0x138 show_stack+0x18/0x24 dump_stack_lvl+0x60/0x84 dump_stack+0x18/0x24 reweight_entity+0x3e8/0x5f4 dequeue_task_fair+0x448/0x948 dequeue_task+0xc4/0x398 deactivate_task+0x1c/0x28 pull_tasks+0x200/0x334 newidle_balance+0x3cc/0x438 pick_next_task_fair+0x58/0x670 __schedule+0x204/0x9a0 schedule+0x128/0x1a8 schedule_timeout+0x44/0x1c8 __skb_wait_for_more_packets+0xd0/0x17c __unix_dgram_recvmsg+0xdc/0x3a8 unix_seqpacket_recvmsg+0x64/0x74 __sys_recvfrom+0x14c/0x1e4 __arm64_sys_recvfrom+0x24/0x38 invoke_syscall+0x58/0x114 el0_svc_common+0xac/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x3c/0x70 el0t_64_sync_handler+0x68/0xbc el0t_64_sync+0x1a8/0x1ac reweight_entity: the lag=-831088603030 vruntime=2086205903 limit=3071999998 old_weight=237238 new_weight=2
On Mon, Apr 22, 2024 at 09:12:12PM +0800, Xuewen Yan wrote:
> By adding a log to observe weight changes in reweight_entity, I found
> that calc_group_shares() often causes new_weight to become very small:
Yes, cgroups do that. But over-all that should not matter no?
Specifically, the whole re-weight thing turns into a series like:
w_0 w_1 w_n-1 w_0
S = --- * --- * ... * ----- = ---
w_1 w_2 w_n w_n
Where S is our ultimate scale factor.
So even if w_m (0 < m < n) is 2, it completely disappears. But yes, it
will create a big term, which is why the initial vlag should be limited.
Notably, nice should not exceed 88761*1024 / 2, but I'm not sure I
remember the limits (if there are any on the cgroup muck).
But if roughly 27 bits go to weight, then vlag should not exceed 36 bits,
which should be well within the slice limit iirc.
Also, as said before, due to integer division being truncating, the
actual S should be smaller than the expected S due to error
accumulation.
Anyway, the things to verify are:
- the S series is complete -- missing terms will mess things up right
quick;
- the limits on both the weight and vlag part, their sum exceeding
63bit (plus 1 for sign) will also mess things up.
On Mon, Apr 22, 2024 at 11:59 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Apr 22, 2024 at 09:12:12PM +0800, Xuewen Yan wrote:
>
> > By adding a log to observe weight changes in reweight_entity, I found
> > that calc_group_shares() often causes new_weight to become very small:
>
> Yes, cgroups do that. But over-all that should not matter no?
>
> Specifically, the whole re-weight thing turns into a series like:
>
> w_0 w_1 w_n-1 w_0
> S = --- * --- * ... * ----- = ---
> w_1 w_2 w_n w_n
>
> Where S is our ultimate scale factor.
>
> So even if w_m (0 < m < n) is 2, it completely disappears. But yes, it
> will create a big term, which is why the initial vlag should be limited.
Okay, I understand what you mean. Even if the weight during dequeue is
very small, the weight will be eliminated during enqueue.
In this case, the necessity of the !on_rq case does not seem to be
very important.
On the other hand, the following case:
place_entity()
{
...
5244 load = cfs_rq->avg_load;
5245 if (curr && curr->on_rq)
5246 load += scale_load_down(curr->load.weight);
5247
5248 lag *= load + scale_load_down(se->load.weight);
5249 if (WARN_ON_ONCE(!load))
5250 load = 1;
5251 lag = div_s64(lag, load);<<<<
...
}
reweight_eevdf()
{
...
if (avruntime != se->vruntime) {
3770 vlag = entity_lag(avruntime, se);
3771 vlag = div_s64(vlag * old_weight, weight); <<<<
3772 se->vruntime = avruntime - vlag;
3773 }
.....
}
There is no need to clamp the above two positions because these two
calculations will not theoretically cause s64 overflow?
Thanks!
>
> Notably, nice should not exceed 88761*1024 / 2, but I'm not sure I
> remember the limits (if there are any on the cgrou pmuck).
>
> But if roughly 27 bits go to weight, then vlag should not exceed 36,
> which should be well within the slice limit iirc.
>
> Also, as said before, due to integer division being truncating, the
> actual S should be smaller than the expected S due to error
> accumulation.
>
> Anyway, the things to verify are:
>
> - the S series is complete -- missing terms will mess things up right
> quick;
>
> - the limits on both the weight and vlag part, their sum exceeding
> 63bit (plut 1 for sign) will also mess things up.
On Tue, Apr 23, 2024 at 11:05:20AM +0800, Xuewen Yan wrote:
> On Mon, Apr 22, 2024 at 11:59 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Apr 22, 2024 at 09:12:12PM +0800, Xuewen Yan wrote:
> >
> > > By adding a log to observe weight changes in reweight_entity, I found
> > > that calc_group_shares() often causes new_weight to become very small:
> >
> > Yes, cgroups do that. But over-all that should not matter no?
> >
> > Specifically, the whole re-weight thing turns into a series like:
> >
> > w_0 w_1 w_n-1 w_0
> > S = --- * --- * ... * ----- = ---
> > w_1 w_2 w_n w_n
> >
> > Where S is our ultimate scale factor.
> >
> > So even if w_m (0 < m < n) is 2, it completely disappears. But yes, it
> > will create a big term, which is why the initial vlag should be limited.
>
> Okay, I understand what you mean. Even if the weight during dequeue is
> very small, the weight will be eliminated during enqueue.
> In this case, the necessity of the !on_rq case does not seem to be
> very important.
>
> On the other hand, the following case:
> place_entity()
> {
> ...
> 5244 load = cfs_rq->avg_load;
> 5245 if (curr && curr->on_rq)
> 5246 load += scale_load_down(curr->load.weight);
> 5247
> 5248 lag *= load + scale_load_down(se->load.weight);
> 5249 if (WARN_ON_ONCE(!load))
> 5250 load = 1;
> 5251 lag = div_s64(lag, load);<<<<
> ...
> }
So this plays games with scale_load_down() because this is W, the sum of
all w, which can indeed grow quite large and cause overflow.
> reweight_eevdf()
> {
> ...
> if (avruntime != se->vruntime) {
> 3770 vlag = entity_lag(avruntime, se);
> 3771 vlag = div_s64(vlag * old_weight, weight); <<<<
> 3772 se->vruntime = avruntime - vlag;
> 3773 }
> .....
> }
While here we're talking about a single w, which is much more limited in
scope. And per the above, what we're trying to do is:
vlag = lag/w
lag/w * w/w' = lag/w'
That is, move vlag from one w to another.
> There is no need to clamp the above two positions because these two
> calculations will not theoretically cause s64 overflow?
Well, supposedly, if I didn't get it wrong etc.. (I do tend to get
things wrong from time to time :-).
I would think limited vlag would stay below 1 second or about 30 bits
this leaves another 30 bits for w which *should* be enough.
Anyway, if you're unsure, sprinkle some check_mul_overflow() and see if
you can tickle it.
On Tue, Apr 23, 2024 at 7:48 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Apr 23, 2024 at 11:05:20AM +0800, Xuewen Yan wrote:
> > On Mon, Apr 22, 2024 at 11:59 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Apr 22, 2024 at 09:12:12PM +0800, Xuewen Yan wrote:
> > >
> > > > By adding a log to observe weight changes in reweight_entity, I found
> > > > that calc_group_shares() often causes new_weight to become very small:
> > >
> > > Yes, cgroups do that. But over-all that should not matter no?
> > >
> > > Specifically, the whole re-weight thing turns into a series like:
> > >
> > > w_0 w_1 w_n-1 w_0
> > > S = --- * --- * ... * ----- = ---
> > > w_1 w_2 w_n w_n
> > >
> > > Where S is our ultimate scale factor.
> > >
> > > So even if w_m (0 < m < n) is 2, it completely disappears. But yes, it
> > > will create a big term, which is why the initial vlag should be limited.
> >
> > Okay, I understand what you mean. Even if the weight during dequeue is
> > very small, the weight will be eliminated during enqueue.
> > In this case, the necessity of the !on_rq case does not seem to be
> > very important.
> >
> > On the other hand, the following case:
> > place_entity()
> > {
> > ...
> > 5244 load = cfs_rq->avg_load;
> > 5245 if (curr && curr->on_rq)
> > 5246 load += scale_load_down(curr->load.weight);
> > 5247
> > 5248 lag *= load + scale_load_down(se->load.weight);
> > 5249 if (WARN_ON_ONCE(!load))
> > 5250 load = 1;
> > 5251 lag = div_s64(lag, load);<<<<
> > ...
> > }
>
> So this plays games with scale_load_down() because this is W, the sum of
> all w, which can indeed grow quite large and cause overflow.
>
> > reweight_eevdf()
> > {
> > ...
> > if (avruntime != se->vruntime) {
> > 3770 vlag = entity_lag(avruntime, se);
> > 3771 vlag = div_s64(vlag * old_weight, weight); <<<<
> > 3772 se->vruntime = avruntime - vlag;
> > 3773 }
> > .....
> > }
>
> While here we're talking about a single w, which is much more limited in
> scope. And per the above, what we're trying to do is:
>
> vlag = lag/w
> lag/w * w/w' = lag/w'
>
> That is, move vlag from one w to another.
>
> > There is no need to clamp the above two positions because these two
> > calculations will not theoretically cause s64 overflow?
>
> Well, supposedly, if I didn't get it wrong etc.. (I do tend to get
> things wrong from time to time :-).
>
> I would think limited vlag would stay below 1 second or about 30 bits
> this leaves another 30 bits for w which *should* be enough.
>
> Anyway, if you're unsure, sprinkle some check_mul_overflow() and see if
> you can tickle it.
Okay, Thank you for your patient answer:)
BR
---
xuewen
On 2024-04-22 at 21:12:12 +0800, Xuewen Yan wrote: > Hi peter, > > On Mon, Apr 22, 2024 at 7:17 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Mon, Apr 22, 2024 at 07:07:25PM +0800, Xuewen Yan wrote: > > > On Mon, Apr 22, 2024 at 5:42 PM Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > > > On Mon, Apr 22, 2024 at 04:33:37PM +0800, Xuewen Yan wrote: > > > > > > > > > On the Android system, the nice value of a task will change very > > > > > frequently. The limit can also be exceeded. > > > > > Maybe the !on_rq case is still necessary. > > > > > So I'm planning to propose another patch for !on_rq case later after > > > > > careful testing locally. > > > > > > > > So the scaling is: vlag = vlag * old_Weight / weight > > > > > > > > But given that integer devision is truncating, you could expect repeated > > > > application of such scaling would eventually decrease the vlag instead > > > > of grow it. > > > > > > > > Is there perhaps an invocation of reweight_task() missing? Looking at > > > > > > Is it necessary to add reweight_task in the prio_changed_fair()? > > > > I think that's the wrong place. Note how __setscheduler_params() already > > has set_load_weight(). And all other callers of ->prio_changed() already > > seem to do set_load_weight() as well. > > > > But that idle policy thing there still looks wrong, that sets the weight > > very low but doesn't re-adjust anything. > > By adding a log to observe weight changes in reweight_entity, I found > that calc_group_shares() often causes new_weight to become very small: > If I understand correctly, the on_rq matters when doing reweight. In the following calltrace, after the entity(task group) is dequeued from the tree, on_rq is 0, then subsequent update_cfs_group()->reweight_entity() does not clamp the vlag because reweight_eevdf() can not be invoked, which could result in the scaling(237238/2) of se->vlag quite large. 
thanks, Chenyu > Hardware name: Unisoc UMS-base Board (DT) > Call trace: > dump_backtrace+0xec/0x138 > show_stack+0x18/0x24 > dump_stack_lvl+0x60/0x84 > dump_stack+0x18/0x24 > reweight_entity+0x3e8/0x5f4 > dequeue_task_fair+0x448/0x948 > dequeue_task+0xc4/0x398 > deactivate_task+0x1c/0x28 > pull_tasks+0x200/0x334 > newidle_balance+0x3cc/0x438 > pick_next_task_fair+0x58/0x670 > __schedule+0x204/0x9a0 > schedule+0x128/0x1a8 > schedule_timeout+0x44/0x1c8 > __skb_wait_for_more_packets+0xd0/0x17c > __unix_dgram_recvmsg+0xdc/0x3a8 > unix_seqpacket_recvmsg+0x64/0x74 > __sys_recvfrom+0x14c/0x1e4 > __arm64_sys_recvfrom+0x24/0x38 > invoke_syscall+0x58/0x114 > el0_svc_common+0xac/0xe0 > do_el0_svc+0x1c/0x28 > el0_svc+0x3c/0x70 > el0t_64_sync_handler+0x68/0xbc > el0t_64_sync+0x1a8/0x1ac > reweight_entity: the lag=-831088603030 vruntime=2086205903 > limit=3071999998 old_weight=237238 new_weight=2
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 1560d1f6eb6b398bddd80c16676776c0325fe5fe
Gitweb: https://git.kernel.org/tip/1560d1f6eb6b398bddd80c16676776c0325fe5fe
Author: Xuewen Yan <xuewen.yan@unisoc.com>
AuthorDate: Mon, 22 Apr 2024 16:22:38 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 22 Apr 2024 13:01:27 +02:00
sched/eevdf: Prevent vlag from going out of bounds in reweight_eevdf()
It was possible to have pick_eevdf() return NULL, which then causes a
NULL-deref. This turned out to be due to entity_eligible() returning
falsely negative because of a s64 multiplication overflow.
Specifically, reweight_eevdf() computes the vlag without considering
the limit placed upon vlag as update_entity_lag() does, and then the
scaling multiplication (remember that weight is 20bit fixed point) can
overflow. This then leads to the new vruntime being weird which then
causes the above entity_eligible() to go side-ways and claim nothing
is eligible.
Thus limit the range of vlag accordingly.
All this was quite rare, but fatal when it does happen.
Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/
Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/
Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/
Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
Reported-by: Sergei Trofimovich <slyich@gmail.com>
Reported-by: Igor Raits <igor@gooddata.com>
Reported-by: Breno Leitao <leitao@debian.org>
Reported-by: kernel test robot <oliver.sang@intel.com>
Reported-by: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
Reviewed-and-tested-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240422082238.5784-1-xuewen.yan@unisoc.com
---
kernel/sched/fair.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6d26691..c62805d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -696,15 +696,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(u64 avruntime, struct sched_entity *se)
{
- s64 lag, limit;
+ s64 vlag, limit;
+
+ vlag = avruntime - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ return clamp(vlag, -limit, limit);
+}
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
SCHED_WARN_ON(!se->on_rq);
- lag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
- se->vlag = clamp(lag, -limit, limit);
+ se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
}
/*
@@ -3760,7 +3766,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
* = V - vl'
*/
if (avruntime != se->vruntime) {
- vlag = (s64)(avruntime - se->vruntime);
+ vlag = entity_lag(avruntime, se);
vlag = div_s64(vlag * old_weight, weight);
se->vruntime = avruntime - vlag;
}
© 2016 - 2026 Red Hat, Inc.