In reweight_task(), there are two situations:

1. The task was on_rq; its load_avg is accurate because we synchronized
it with the cfs_rq through update_load_avg() in dequeue_task().

2. The task is sleeping; its load_avg might not have been updated for
some time, which can result in an inaccurate dequeue_load_avg() in
reweight_entity().

This patch solves this by using sync_entity_load_avg() to synchronize
the load_avg of the se with its cfs_rq before dequeue_load_avg() in
reweight_entity(). For tasks that were on_rq, we already updated
load_avg to an accurate value in dequeue_task(), so this change has no
other effect given the short time interval between the two updates.
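[A minimal standalone sketch (not kernel code; the numbers are
invented) of why the stale value matters: while the task sleeps, the
cfs_rq keeps decaying the task's blocked contribution, but the task's
own se->avg.load_avg stays frozen, so subtracting the frozen value
removes more load than the task still contributes.]

#include <stdio.h>

int main(void)
{
	double y32      = 0.5;    /* PELT: load halves every 32 periods  */
	double se_load  = 600.0;  /* se->avg.load_avg, frozen at dequeue */
	double others   = 400.0;  /* load_avg of the other entities      */
	/* cfs_rq->avg.load_avg: the blocked part has decayed meanwhile. */
	double cfs_load = others + se_load * y32;            /* 700      */

	double stale  = cfs_load - se_load;       /* 100: subtracts too much */
	double synced = cfs_load - se_load * y32; /* 400: matches the others */

	printf("stale=%.0f synced=%.0f\n", stale, synced);
	return 0;
}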
Suggested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
Changes in v3:
- use sync_entity_load_avg() rather than update_load_avg() to sync the
  sleeping task with its cfs_rq, as suggested by Dietmar.
- Link to v2: https://lore.kernel.org/lkml/20240720051248.59608-1-zhouchuyi@bytedance.com/
Changes in v2:
- change the description in the commit log.
- use update_load_avg() in reweight_task() rather than in
  reweight_entity(), as suggested by Chengming.
- Link to v1: https://lore.kernel.org/lkml/20240716150840.23061-1-zhouchuyi@bytedance.com/
---
kernel/sched/fair.c | 43 ++++++++++++++++++++++++-------------------
1 file changed, 24 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9057584ec06d..da3cdd86ab2e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3669,11 +3669,32 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
+
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
+ cfs_rq->last_update_time_copy);
+}
+
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+static void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg_blocked_se(last_update_time, se);
+}
+
#else
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static void sync_entity_load_avg(struct sched_entity *se) { }
#endif
static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
@@ -3795,7 +3816,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
- }
+ } else if (entity_is_task(se))
+ sync_entity_load_avg(se);
+
dequeue_load_avg(cfs_rq, se);
if (se->on_rq) {
@@ -4034,11 +4057,6 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa)
return true;
}
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- return u64_u32_load_copy(cfs_rq->avg.last_update_time,
- cfs_rq->last_update_time_copy);
-}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
@@ -4773,19 +4791,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
}
}
-/*
- * Synchronize entity load avg of dequeued entity without locking
- * the previous rq.
- */
-static void sync_entity_load_avg(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
-
- last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg_blocked_se(last_update_time, se);
-}
-
/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
--
2.20.1
On 23/07/2024 13:42, Chuyi Zhou wrote:
> In reweight_task(), there are two situations:
>
> 1. The task was on_rq; its load_avg is accurate because we synchronized
> it with the cfs_rq through update_load_avg() in dequeue_task().

Just asking: That's the dequeue_task() in __sched_setscheduler() or
set_user_nice()? Maybe this is worth mentioning here?

[...]

> @@ -3795,7 +3816,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> if (!curr)
> __dequeue_entity(cfs_rq, se);
> update_load_sub(&cfs_rq->load, se->load.weight);
> - }
> + } else if (entity_is_task(se))
> + sync_entity_load_avg(se);
> +

IMHO, the 'if else' path needs braces. See
Documentation/process/coding-style.rst '3) Placing Braces and Spaces'.

> dequeue_load_avg(cfs_rq, se);
>
> if (se->on_rq) {

[...]

Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Hello,

On 2024/7/29 15:56, Dietmar Eggemann wrote:
> On 23/07/2024 13:42, Chuyi Zhou wrote:
>> In reweight_task(), there are two situations:
>>
>> 1. The task was on_rq; its load_avg is accurate because we synchronized
>> it with the cfs_rq through update_load_avg() in dequeue_task().
>
> Just asking: That's the dequeue_task() in __sched_setscheduler() or
> set_user_nice()? Maybe this is worth mentioning here?

OK.

[...]

> IMHO, the 'if else' path needs braces. See
> Documentation/process/coding-style.rst '3) Placing Braces and Spaces'.

Thanks, will fix it in the next version.

[...]

> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
On Tue, Jul 23, 2024 at 07:42:47PM +0800, Chuyi Zhou wrote:
> In reweight_task(), there are two situations:
>
> 1. The task was on_rq; its load_avg is accurate because we synchronized
> it with the cfs_rq through update_load_avg() in dequeue_task().
>
> 2. The task is sleeping; its load_avg might not have been updated for
> some time, which can result in an inaccurate dequeue_load_avg() in
> reweight_entity().

[...]

Reviewed-by: Vishal Chourasia <vishalc@linux.ibm.com>

Thank you for the patch!

--
vishal.c
On 2024/7/23 19:42, Chuyi Zhou wrote:
> In reweight_task(), there are two situations:
>
> 1. The task was on_rq; its load_avg is accurate because we synchronized
> it with the cfs_rq through update_load_avg() in dequeue_task().
>
> 2. The task is sleeping; its load_avg might not have been updated for
> some time, which can result in an inaccurate dequeue_load_avg() in
> reweight_entity().

[...]

> Suggested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>

Looks good to me!

Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>

Thanks.
On Tue, Jul 23, 2024 at 07:42:47PM +0800, Chuyi Zhou wrote:

[...]

> +/*
> + * Synchronize entity load avg of dequeued entity without locking
> + * the previous rq.
> + */
> +static void sync_entity_load_avg(struct sched_entity *se)
> +{
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + u64 last_update_time;
> +
> + last_update_time = cfs_rq_last_update_time(cfs_rq);
> + __update_load_avg_blocked_se(last_update_time, se);
> +}
> +

The difference between using update_load_avg() and
sync_entity_load_avg() is:

1. update_load_avg() uses the updated PELT clock value from the rq
structure.
2. sync_entity_load_avg() uses the last update time of the cfs_rq the
scheduling entity (se) is attached to.

Won't this affect the entity load sync?

[...]

--
vishal.c
On 23/07/2024 17:48, Vishal Chourasia wrote:
> On Tue, Jul 23, 2024 at 07:42:47PM +0800, Chuyi Zhou wrote:

[...]

> The difference between using update_load_avg() and
> sync_entity_load_avg() is:
> 1. update_load_avg() uses the updated PELT clock value from the rq
> structure.
> 2. sync_entity_load_avg() uses the last update time of the cfs_rq the
> scheduling entity (se) is attached to.
>
> Won't this affect the entity load sync?

Not sure what you mean exactly by entity load sync here.

The task has been sleeping for a long time, i.e. its PELT values haven't
been updated for a long time (its last_update_time (lut) value is pretty
old).

In the meantime the task's cfs_rq has potentially seen other PELT
updates due to PELT updates of other non-sleeping tasks related to this
cfs_rq. I.e. the cfs_rq lut is much more recent.

What we want to do here is to sync the sleeping task with its cfs_rq. If
the task was sleeping for more than 1us (1024ns) and we cross a 1ms PELT
period (1024us) when we use the cfs_rq's lut as the 'now' value for
__update_load_avg_blocked_se(), then we will see the task's PELT values
decay.

We rely on sync_entity_load_avg() for instance in the EAS wakeup path,
where the task's util_avg influences which CPU type the task will run on
next. So we sync the wakee with its cfs_rq to be able to work with a
current task util_avg.
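[For intuition, a minimal standalone sketch of the decay described
above (an approximation: the kernel uses fixed-point lookup tables in
kernel/sched/pelt.c, not floating point). Load halves every 32 PELT
periods of 1024us each, so using the cfs_rq's much more recent lut as
'now' decays a long-sleeping task's values accordingly.]

#include <stdio.h>
#include <math.h>

/* Approximate PELT decay of a blocked entity's load_avg after delta_us
 * microseconds of sleep; build with -lm. */
static double pelt_decay(double load_avg, unsigned long long delta_us)
{
	double periods = (double)delta_us / 1024.0; /* one PELT period = 1024us */
	double y = pow(0.5, 1.0 / 32.0);            /* y^32 == 0.5 */

	return load_avg * pow(y, periods);
}

int main(void)
{
	/* A task with load_avg 1024 that slept ~32ms loses about half. */
	printf("%.1f\n", pelt_decay(1024.0, 32 * 1024)); /* ~512.0 */
	return 0;
}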
On 24/07/24 2:40 am, Dietmar Eggemann wrote:
> On 23/07/2024 17:48, Vishal Chourasia wrote:

[...]

> Not sure what you mean exactly by entity load sync here.

load avg sync for the wakee

> The task has been sleeping for a long time, i.e. its PELT values
> haven't been updated for a long time (its last_update_time (lut) value
> is pretty old).
>
> In the meantime the task's cfs_rq has potentially seen other PELT
> updates due to PELT updates of other non-sleeping tasks related to
> this cfs_rq. I.e. the cfs_rq lut is much more recent.
>
> What we want to do here is to sync the sleeping task with its cfs_rq.
> If the task was sleeping for more than 1us (1024ns) and we cross a 1ms
> PELT period (1024us) when we use the cfs_rq's lut as the 'now' value
> for __update_load_avg_blocked_se(), then we will see the task's PELT
> values decay.
>
> We rely on sync_entity_load_avg() for instance in the EAS wakeup path,
> where the task's util_avg influences which CPU type the task will run
> on next. So we sync the wakee with its cfs_rq to be able to work with
> a current task util_avg.

I was talking about the case where all the tasks on a cfs_rq are
sleeping. In this case, the lut of the cfs_rq will be the same as at the
time of the last dequeue.

And the wakee has (suppose) been woken up after a 1us window.

I guess in this case the PELT metric would not have changed for the
cfs_rq.

Thanks
--
vishal.c
On 2024/7/24 07:00, Vishal Chourasia wrote:
> On 24/07/24 2:40 am, Dietmar Eggemann wrote:

[...]

> I was talking about the case where all the tasks on a cfs_rq are
> sleeping. In this case, the lut of the cfs_rq will be the same as at
> the time of the last dequeue.

In this case, the cfs_rq is not on_rq; its load_sum/avg will be updated
at the next enqueue (or periodically updated from load balance).

> And the wakee has (suppose) been woken up after a 1us window.
>
> I guess in this case the PELT metric would not have changed for the
> cfs_rq.

IMHO, as long as the task se's load is synced with the cfs_rq, it should
be ok.

Thanks.
On 24/07/2024 04:12, Chengming Zhou wrote:
> On 2024/7/24 07:00, Vishal Chourasia wrote:
>> On 24/07/24 2:40 am, Dietmar Eggemann wrote:
>>> On 23/07/2024 17:48, Vishal Chourasia wrote:

[...]

>> I was talking about the case where all the tasks on a cfs_rq are
>> sleeping. In this case, the lut of the cfs_rq will be the same as at
>> the time of the last dequeue.
>
> In this case, the cfs_rq is not on_rq; its load_sum/avg will be updated
> at the next enqueue (or periodically updated from load balance).

Yes, cfs_rq PELT values of an idle CPU decay via
sched_balance_update_blocked_averages() -> __update_blocked_fair()

[...]