From: Valentin Schneider <vschneid@redhat.com>
In the current throttle model, when a cfs_rq is throttled, its entity is
dequeued from the cpu's rq, making tasks attached to it unable to run and
thus achieving the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it cannot run until its task group's
next period comes, which can be a relatively long time. The waiting
writer will have to wait longer because of this, and it also makes
further readers build up and eventually triggers a task hung.

To improve this situation, change the throttle model to task based, i.e.
when a cfs_rq is throttled, record its throttled status but do not remove
it from the cpu's rq. Instead, for tasks that belong to this cfs_rq, add
a task work to them when they are picked so that, on return to user, they
get dequeued there. This way, throttled tasks do not hold any kernel
resources. On unthrottle, enqueue those tasks back so they can continue
to run.

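Roughly, the flow is as follows (a condensed sketch pieced together from
the hunks below, included as a reading aid; see the diff for the real
context):

        /* pick time: the picked path ran out of runtime, tag the task */
        p = task_of(se);
        if (unlikely(throttled))
                task_throttle_setup_work(p);

        /* return to user: throttle_cfs_rq_work() moves the task to limbo */
        dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
        list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
        p->throttled = true;
        resched_curr(rq);

        /* unthrottle: tg_unthrottle_up() re-enqueues the limbo tasks */
        list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
                list_del_init(&p->throttle_node);
                enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
        }
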
A throttled cfs_rq's PELT clock is handled differently now: previously,
the cfs_rq's PELT clock was stopped once it entered the throttled state,
but since tasks (in kernel mode) can now continue to run, the behaviour
is changed to stop the PELT clock only when the throttled cfs_rq has no
tasks left.

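Concretely, the PELT freeze test moves from "throttle_count != 0" to a
new pelt_clock_throttled flag that is only set once the throttled cfs_rq
becomes empty (again condensed from the dequeue_entity() and pelt.h hunks
below):

        /* dequeue: last entity left a throttled cfs_rq, freeze its PELT clock */
        if (cfs_rq->nr_queued == 0 && throttled_hierarchy(cfs_rq)) {
                cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
                cfs_rq->pelt_clock_throttled = 1;
        }

        /* pelt.h then tests the flag instead of throttle_count */
        if (unlikely(cfs_rq->pelt_clock_throttled))
                throttled = U64_MAX;
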
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 336 ++++++++++++++++++++++---------------------
kernel/sched/pelt.h | 4 +-
kernel/sched/sched.h | 3 +-
3 files changed, 176 insertions(+), 167 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54c2a4df6a5d1..0eeea7f2e693d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5285,18 +5285,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
struct rq *rq = rq_of(cfs_rq);
if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+
+ if (cfs_rq->pelt_clock_throttled) {
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
}
+#endif
}
}
@@ -5335,8 +5340,6 @@ static void set_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable--;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5357,8 +5360,6 @@ static void clear_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable++;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5444,8 +5445,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
return true;
}
@@ -5784,6 +5795,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
p->throttled = true;
resched_curr(rq);
}
@@ -5797,32 +5812,119 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}
+/*
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
+ */
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
+
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
+
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ /*
+ * If the throttled task is enqueued to a throttled cfs_rq,
+ * take the fast path by directly put the task on target
+ * cfs_rq's limbo list, except when p is current because
+ * the following race can cause p's group_node left in rq's
+ * cfs_tasks list when it's throttled:
+ *
+ * cpuX cpuY
+ * taskA ret2user
+ * throttle_cfs_rq_work() sched_move_task(taskA)
+ * task_rq_lock acquired
+ * dequeue_task_fair(taskA)
+ * task_rq_lock released
+ * task_rq_lock acquired
+ * task_current_donor(taskA) == true
+ * task_on_rq_queued(taskA) == true
+ * dequeue_task(taskA)
+ * put_prev_task(taskA)
+ * sched_change_group()
+ * enqueue_task(taskA) -> taskA's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(taskA)
+ * __set_next_task_fair(taskA)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+ * And in the above race case, the task's current cfs_rq is in the same
+ * rq as its previous cfs_rq because sched_move_task() doesn't migrate
+ * the task, so we can use its current cfs_rq to derive rq and test if the
+ * task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* we can't take the fast path, do an actual enqueue */
+ p->throttled = false;
+ return false;
+}
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
+
+ if (--cfs_rq->throttle_count)
+ return 0;
- cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
+ if (cfs_rq->pelt_clock_throttled) {
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ cfs_rq->throttled_clock_self = 0;
- cfs_rq->throttled_clock_self = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
- if (WARN_ON_ONCE((s64)delta < 0))
- delta = 0;
+ cfs_rq->throttled_clock_self_time += delta;
+ }
- cfs_rq->throttled_clock_self_time += delta;
- }
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
@@ -5851,17 +5953,25 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ if (cfs_rq->throttle_count++)
+ return 0;
+
+
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+ else {
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
list_del_leaf_cfs_rq(cfs_rq);
-
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
}
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -5869,8 +5979,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5893,68 +6002,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!dequeue)
return false; /* Throttle no longer required. */
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5970,9 +6022,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+ /*
+ * It's possible we are called with !runtime_remaining due to things
+ * like the user changing the quota setting (see tg_set_cfs_bandwidth()),
+ * or an async unthrottle giving us a positive runtime_remaining that other
+ * still running entities consumed before we reached here.
+ *
+ * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+ * because any enqueue in tg_unthrottle_up() will immediately trigger a
+ * throttle, which is not supposed to happen on unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6002,62 +6065,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
}
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6711,6 +6720,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -6903,6 +6914,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6955,10 +6969,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -6980,10 +6990,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7013,7 +7019,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -7068,10 +7073,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
@@ -7108,10 +7109,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
}
sub_nr_running(rq, h_nr_queued);
@@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (unlikely(task_is_throttled(p))) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
@@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
+ bool throttled;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_queued)
return NULL;
+ throttled = false;
+
do {
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f2..f921302dc40fb 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b0c9559992d8a..fc697d4bf6685 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,7 +737,8 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled;
+ int throttled:1;
+ int pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
--
2.39.5
On 7/15/2025 3:16 PM, Aaron Lu wrote: > From: Valentin Schneider <vschneid@redhat.com> > > In current throttle model, when a cfs_rq is throttled, its entity will > be dequeued from cpu's rq, making tasks attached to it not able to run, > thus achiveing the throttle target. > > This has a drawback though: assume a task is a reader of percpu_rwsem > and is waiting. When it gets woken, it can not run till its task group's > next period comes, which can be a relatively long time. Waiting writer > will have to wait longer due to this and it also makes further reader > build up and eventually trigger task hung. > > To improve this situation, change the throttle model to task based, i.e. > when a cfs_rq is throttled, record its throttled status but do not remove > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when > they get picked, add a task work to them so that when they return > to user, they can be dequeued there. In this way, tasks throttled will > not hold any kernel resources. And on unthrottle, enqueue back those > tasks so they can continue to run. > > Throttled cfs_rq's PELT clock is handled differently now: previously the > cfs_rq's PELT clock is stopped once it entered throttled state but since > now tasks(in kernel mode) can continue to run, change the behaviour to > stop PELT clock only when the throttled cfs_rq has no tasks left. > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick > Signed-off-by: Valentin Schneider <vschneid@redhat.com> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> > --- [snip] > @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) > { > struct sched_entity *se; > struct cfs_rq *cfs_rq; > + struct task_struct *p; > + bool throttled; > > again: > cfs_rq = &rq->cfs; > if (!cfs_rq->nr_queued) > return NULL; > > + throttled = false; > + > do { > /* Might not have done put_prev_entity() */ > if (cfs_rq->curr && cfs_rq->curr->on_rq) > update_curr(cfs_rq); > > - if (unlikely(check_cfs_rq_runtime(cfs_rq))) > - goto again; > + throttled |= check_cfs_rq_runtime(cfs_rq); > > se = pick_next_entity(rq, cfs_rq); > if (!se) > @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq) > cfs_rq = group_cfs_rq(se); > } while (cfs_rq); > > - return task_of(se); > + p = task_of(se); > + if (unlikely(throttled)) > + task_throttle_setup_work(p); > + return p; > } > Previously, I was wondering if the above change might impact wakeup latency in some corner cases: If there are many tasks enqueued on a throttled cfs_rq, the above pick-up mechanism might return an invalid p repeatedly (where p is dequeued, and a reschedule is triggered in throttle_cfs_rq_work() to pick the next p; then the new p is found again on a throttled cfs_rq). Before the above change, the entire cfs_rq's corresponding sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu) So I did some tests for this scenario on a Xeon with 6 NUMA nodes and 384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf cgroup. The results show that there is not much impact in terms of wakeup latency (considering the standard deviation). 
Based on the data and my understanding, for this series,

Tested-by: Chen Yu <yu.c.chen@intel.com>

Tested script parameters are borrowed from the previous attached ones:

#!/bin/bash

if [ $# -ne 1 ]; then
	echo "please provide cgroup level"
	exit
fi

N=$1
current_path="/sys/fs/cgroup"

for ((i=1; i<=N; i++)); do
	new_dir="${current_path}/${i}"
	mkdir -p "$new_dir"
	if [ "$i" -ne "$N" ]; then
		echo '+cpu +memory +pids' > ${new_dir}/cgroup.subtree_control
	fi
	current_path="$new_dir"
done

echo "current_path:${current_path}"
echo "1600000 100000" > ${current_path}/cpu.max
echo "34G" > ${current_path}/memory.max
echo $$ > ${current_path}/cgroup.procs

#./run-mmtests.sh --no-monitor --config config-schbench baseline
./run-mmtests.sh --no-monitor --config config-schbench sch_throt

pids=$(cat "${current_path}/cgroup.procs")
for pid in $pids; do
	echo $pid > "/sys/fs/cgroup/cgroup.procs" 2>/dev/null
done

for ((i=N; i>=1; i--)); do
	rmdir ${current_path}
	current_path=$(dirname "$current_path")
done

Results:

schbench thread = 1
Metric                     Base (mean±std)       Compare (mean±std)    Change
-------------------------------------------------------------------------------------
//the baseline's std% is 35%, the change should not be a problem
Wakeup Latencies 99.0th    15.00(5.29)           17.00(1.00)           -13.33%
Request Latencies 99.0th   3830.67(33.31)        3854.67(25.72)        -0.63%
RPS 50.0th                 1598.00(4.00)         1606.00(4.00)         +0.50%
Average RPS                1597.77(5.13)         1606.11(4.75)         +0.52%

schbench thread = 2
Metric                     Base (mean±std)       Compare (mean±std)    Change
-------------------------------------------------------------------------------------
Wakeup Latencies 99.0th    18.33(0.58)           18.67(0.58)           -1.85%
Request Latencies 99.0th   3868.00(49.96)        3854.67(44.06)        +0.34%
RPS 50.0th                 3185.33(4.62)         3204.00(8.00)         +0.59%
Average RPS                3186.49(2.70)         3204.21(11.25)        +0.56%

schbench thread = 4
Metric                     Base (mean±std)       Compare (mean±std)    Change
-------------------------------------------------------------------------------------
Wakeup Latencies 99.0th    19.33(1.15)           19.33(0.58)           0.00%
Request Latencies 99.0th   35690.67(517.31)      35946.67(517.31)      -0.72%
RPS 50.0th                 4418.67(18.48)        4434.67(9.24)         +0.36%
Average RPS                4414.38(16.94)        4436.02(8.77)         +0.49%

schbench thread = 8
Metric                     Base (mean±std)       Compare (mean±std)    Change
-------------------------------------------------------------------------------------
Wakeup Latencies 99.0th    22.67(0.58)           22.33(0.58)           +1.50%
Request Latencies 99.0th   73002.67(147.80)      72661.33(147.80)      +0.47%
RPS 50.0th                 4376.00(16.00)        4392.00(0.00)         +0.37%
Average RPS                4373.89(15.04)        4393.88(6.22)         +0.46%

schbench thread = 16
Metric                     Base (mean±std)       Compare (mean±std)    Change
-------------------------------------------------------------------------------------
Wakeup Latencies 99.0th    29.00(2.65)           29.00(3.61)           0.00%
Request Latencies 99.0th   88704.00(0.00)        88704.00(0.00)        0.00%
RPS 50.0th                 4274.67(24.44)        4290.67(9.24)         +0.37%
Average RPS                4277.27(24.80)        4287.97(9.80)         +0.25%

schbench thread = 32
Metric                     Base (mean±std)       Compare (mean±std)    Change
-------------------------------------------------------------------------------------
Wakeup Latencies 99.0th    100.00(22.61)         82.00(16.46)          +18.00%
Request Latencies 99.0th   100138.67(295.60)     100053.33(147.80)     +0.09%
RPS 50.0th                 3942.67(20.13)        3916.00(42.33)        -0.68%
Average RPS                3919.39(19.01)        3892.39(42.26)        -0.69%

schbench thread = 63
Metric                     Base (mean±std)       Compare (mean±std)    Change
-------------------------------------------------------------------------------------
Wakeup Latencies 99.0th    94848.00(0.00)        94336.00(0.00)        +0.54%
//the baseline's std% is 19%, the change should not be a problem
Request Latencies 99.0th   264618.67(51582.78)   298154.67(591.21)     -12.67%
RPS 50.0th                 2641.33(4.62)         2628.00(8.00)         -0.50%
Average RPS                2659.49(8.88)         2636.17(7.58)         -0.88%

thanks,
Chenyu
On Sun, Aug 17, 2025 at 04:50:50PM +0800, Chen, Yu C wrote: > On 7/15/2025 3:16 PM, Aaron Lu wrote: > > From: Valentin Schneider <vschneid@redhat.com> > > > > In current throttle model, when a cfs_rq is throttled, its entity will > > be dequeued from cpu's rq, making tasks attached to it not able to run, > > thus achiveing the throttle target. > > > > This has a drawback though: assume a task is a reader of percpu_rwsem > > and is waiting. When it gets woken, it can not run till its task group's > > next period comes, which can be a relatively long time. Waiting writer > > will have to wait longer due to this and it also makes further reader > > build up and eventually trigger task hung. > > > > To improve this situation, change the throttle model to task based, i.e. > > when a cfs_rq is throttled, record its throttled status but do not remove > > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when > > they get picked, add a task work to them so that when they return > > to user, they can be dequeued there. In this way, tasks throttled will > > not hold any kernel resources. And on unthrottle, enqueue back those > > tasks so they can continue to run. > > > > Throttled cfs_rq's PELT clock is handled differently now: previously the > > cfs_rq's PELT clock is stopped once it entered throttled state but since > > now tasks(in kernel mode) can continue to run, change the behaviour to > > stop PELT clock only when the throttled cfs_rq has no tasks left. > > > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick > > Signed-off-by: Valentin Schneider <vschneid@redhat.com> > > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> > > --- > > [snip] > > > > @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) > > { > > struct sched_entity *se; > > struct cfs_rq *cfs_rq; > > + struct task_struct *p; > > + bool throttled; > > again: > > cfs_rq = &rq->cfs; > > if (!cfs_rq->nr_queued) > > return NULL; > > + throttled = false; > > + > > do { > > /* Might not have done put_prev_entity() */ > > if (cfs_rq->curr && cfs_rq->curr->on_rq) > > update_curr(cfs_rq); > > - if (unlikely(check_cfs_rq_runtime(cfs_rq))) > > - goto again; > > + throttled |= check_cfs_rq_runtime(cfs_rq); > > se = pick_next_entity(rq, cfs_rq); > > if (!se) > > @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq) > > cfs_rq = group_cfs_rq(se); > > } while (cfs_rq); > > - return task_of(se); > > + p = task_of(se); > > + if (unlikely(throttled)) > > + task_throttle_setup_work(p); > > + return p; > > } > > Previously, I was wondering if the above change might impact > wakeup latency in some corner cases: If there are many tasks > enqueued on a throttled cfs_rq, the above pick-up mechanism > might return an invalid p repeatedly (where p is dequeued, By invalid, do you mean task that is in a throttled hierarchy? > and a reschedule is triggered in throttle_cfs_rq_work() to > pick the next p; then the new p is found again on a throttled > cfs_rq). Before the above change, the entire cfs_rq's corresponding > sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu) > Yes this is true and it sounds inefficient, but these newly woken tasks may hold some kernel resources like a reader lock so we really want them to finish their kernel jobs and release that resource before being throttled or it can block/impact other tasks and even cause the whole system to hung. 
> So I did some tests for this scenario on a Xeon with 6 NUMA nodes and > 384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf > cgroup. The results show that there is not much impact in terms of > wakeup latency (considering the standard deviation). Based on the data > and my understanding, for this series, > > Tested-by: Chen Yu <yu.c.chen@intel.com> Good to know this and thanks a lot for the test!
On Mon, Aug 18, 2025 at 10:50:14AM +0800, Aaron Lu wrote: > On Sun, Aug 17, 2025 at 04:50:50PM +0800, Chen, Yu C wrote: > > On 7/15/2025 3:16 PM, Aaron Lu wrote: > > > From: Valentin Schneider <vschneid@redhat.com> > > > > > > In current throttle model, when a cfs_rq is throttled, its entity will > > > be dequeued from cpu's rq, making tasks attached to it not able to run, > > > thus achiveing the throttle target. > > > > > > This has a drawback though: assume a task is a reader of percpu_rwsem > > > and is waiting. When it gets woken, it can not run till its task group's > > > next period comes, which can be a relatively long time. Waiting writer > > > will have to wait longer due to this and it also makes further reader > > > build up and eventually trigger task hung. > > > > > > To improve this situation, change the throttle model to task based, i.e. > > > when a cfs_rq is throttled, record its throttled status but do not remove > > > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when > > > they get picked, add a task work to them so that when they return > > > to user, they can be dequeued there. In this way, tasks throttled will > > > not hold any kernel resources. And on unthrottle, enqueue back those > > > tasks so they can continue to run. > > > > > > Throttled cfs_rq's PELT clock is handled differently now: previously the > > > cfs_rq's PELT clock is stopped once it entered throttled state but since > > > now tasks(in kernel mode) can continue to run, change the behaviour to > > > stop PELT clock only when the throttled cfs_rq has no tasks left. > > > > > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > > > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick > > > Signed-off-by: Valentin Schneider <vschneid@redhat.com> > > > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> > > > --- > > > > [snip] > > > > > > > @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) > > > { > > > struct sched_entity *se; > > > struct cfs_rq *cfs_rq; > > > + struct task_struct *p; > > > + bool throttled; > > > again: > > > cfs_rq = &rq->cfs; > > > if (!cfs_rq->nr_queued) > > > return NULL; > > > + throttled = false; > > > + > > > do { > > > /* Might not have done put_prev_entity() */ > > > if (cfs_rq->curr && cfs_rq->curr->on_rq) > > > update_curr(cfs_rq); > > > - if (unlikely(check_cfs_rq_runtime(cfs_rq))) > > > - goto again; > > > + throttled |= check_cfs_rq_runtime(cfs_rq); > > > se = pick_next_entity(rq, cfs_rq); > > > if (!se) > > > @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq) > > > cfs_rq = group_cfs_rq(se); > > > } while (cfs_rq); > > > - return task_of(se); > > > + p = task_of(se); > > > + if (unlikely(throttled)) > > > + task_throttle_setup_work(p); > > > + return p; > > > } > > > > Previously, I was wondering if the above change might impact > > wakeup latency in some corner cases: If there are many tasks > > enqueued on a throttled cfs_rq, the above pick-up mechanism > > might return an invalid p repeatedly (where p is dequeued, > > By invalid, do you mean task that is in a throttled hierarchy? > > > and a reschedule is triggered in throttle_cfs_rq_work() to > > pick the next p; then the new p is found again on a throttled > > cfs_rq). 
Before the above change, the entire cfs_rq's corresponding > > sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu) > > > > Yes this is true and it sounds inefficient, but these newly woken tasks > may hold some kernel resources like a reader lock so we really want them ~~~~ Sorry, I meant reader semaphore. > to finish their kernel jobs and release that resource before being > throttled or it can block/impact other tasks and even cause the whole > system to hung. > > > So I did some tests for this scenario on a Xeon with 6 NUMA nodes and > > 384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf > > cgroup. The results show that there is not much impact in terms of > > wakeup latency (considering the standard deviation). Based on the data > > and my understanding, for this series, > > > > Tested-by: Chen Yu <yu.c.chen@intel.com> > > Good to know this and thanks a lot for the test!
On 8/18/2025 10:50 AM, Aaron Lu wrote: > On Sun, Aug 17, 2025 at 04:50:50PM +0800, Chen, Yu C wrote: >> On 7/15/2025 3:16 PM, Aaron Lu wrote: >>> From: Valentin Schneider <vschneid@redhat.com> >>> >>> In current throttle model, when a cfs_rq is throttled, its entity will >>> be dequeued from cpu's rq, making tasks attached to it not able to run, >>> thus achiveing the throttle target. >>> >>> This has a drawback though: assume a task is a reader of percpu_rwsem >>> and is waiting. When it gets woken, it can not run till its task group's >>> next period comes, which can be a relatively long time. Waiting writer >>> will have to wait longer due to this and it also makes further reader >>> build up and eventually trigger task hung. >>> >>> To improve this situation, change the throttle model to task based, i.e. >>> when a cfs_rq is throttled, record its throttled status but do not remove >>> it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when >>> they get picked, add a task work to them so that when they return >>> to user, they can be dequeued there. In this way, tasks throttled will >>> not hold any kernel resources. And on unthrottle, enqueue back those >>> tasks so they can continue to run. >>> >>> Throttled cfs_rq's PELT clock is handled differently now: previously the >>> cfs_rq's PELT clock is stopped once it entered throttled state but since >>> now tasks(in kernel mode) can continue to run, change the behaviour to >>> stop PELT clock only when the throttled cfs_rq has no tasks left. >>> >>> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> >>> Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick >>> Signed-off-by: Valentin Schneider <vschneid@redhat.com> >>> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> >>> --- >> >> [snip] >> >> >>> @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) >>> { >>> struct sched_entity *se; >>> struct cfs_rq *cfs_rq; >>> + struct task_struct *p; >>> + bool throttled; >>> again: >>> cfs_rq = &rq->cfs; >>> if (!cfs_rq->nr_queued) >>> return NULL; >>> + throttled = false; >>> + >>> do { >>> /* Might not have done put_prev_entity() */ >>> if (cfs_rq->curr && cfs_rq->curr->on_rq) >>> update_curr(cfs_rq); >>> - if (unlikely(check_cfs_rq_runtime(cfs_rq))) >>> - goto again; >>> + throttled |= check_cfs_rq_runtime(cfs_rq); >>> se = pick_next_entity(rq, cfs_rq); >>> if (!se) >>> @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq) >>> cfs_rq = group_cfs_rq(se); >>> } while (cfs_rq); >>> - return task_of(se); >>> + p = task_of(se); >>> + if (unlikely(throttled)) >>> + task_throttle_setup_work(p); >>> + return p; >>> } >> >> Previously, I was wondering if the above change might impact >> wakeup latency in some corner cases: If there are many tasks >> enqueued on a throttled cfs_rq, the above pick-up mechanism >> might return an invalid p repeatedly (where p is dequeued, > > By invalid, do you mean task that is in a throttled hierarchy? > Yes. >> and a reschedule is triggered in throttle_cfs_rq_work() to >> pick the next p; then the new p is found again on a throttled >> cfs_rq). 
Before the above change, the entire cfs_rq's corresponding >> sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu) >> > > Yes this is true and it sounds inefficient, but these newly woken tasks > may hold some kernel resources like a reader lock so we really want them > to finish their kernel jobs and release that resource before being > throttled or it can block/impact other tasks and even cause the whole > system to hung. > I see. Always dequeue each task during their ret2user phase would be safer. thanks, Chenyu >> So I did some tests for this scenario on a Xeon with 6 NUMA nodes and >> 384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf >> cgroup. The results show that there is not much impact in terms of >> wakeup latency (considering the standard deviation). Based on the data >> and my understanding, for this series, >> >> Tested-by: Chen Yu <yu.c.chen@intel.com> > > Good to know this and thanks a lot for the test!
On 15/07/25 15:16, Aaron Lu wrote: > From: Valentin Schneider <vschneid@redhat.com> > > In current throttle model, when a cfs_rq is throttled, its entity will > be dequeued from cpu's rq, making tasks attached to it not able to run, > thus achiveing the throttle target. > > This has a drawback though: assume a task is a reader of percpu_rwsem > and is waiting. When it gets woken, it can not run till its task group's > next period comes, which can be a relatively long time. Waiting writer > will have to wait longer due to this and it also makes further reader > build up and eventually trigger task hung. > > To improve this situation, change the throttle model to task based, i.e. > when a cfs_rq is throttled, record its throttled status but do not remove > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when > they get picked, add a task work to them so that when they return > to user, they can be dequeued there. In this way, tasks throttled will > not hold any kernel resources. And on unthrottle, enqueue back those > tasks so they can continue to run. > Moving the actual throttle work to pick time is clever. In my previous versions I tried really hard to stay out of the enqueue/dequeue/pick paths, but this makes the code a lot more palatable. I'd like to see how this impacts performance though. > Throttled cfs_rq's PELT clock is handled differently now: previously the > cfs_rq's PELT clock is stopped once it entered throttled state but since > now tasks(in kernel mode) can continue to run, change the behaviour to > stop PELT clock only when the throttled cfs_rq has no tasks left. > I haven't spent much time looking at the PELT stuff yet; I'll do that next week. Thanks for all the work you've been putting into this, and sorry it got me this long to get a proper look at it! > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick > Signed-off-by: Valentin Schneider <vschneid@redhat.com> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> > +static bool enqueue_throttled_task(struct task_struct *p) > +{ > + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); > + > + /* > + * If the throttled task is enqueued to a throttled cfs_rq, > + * take the fast path by directly put the task on target > + * cfs_rq's limbo list, except when p is current because > + * the following race can cause p's group_node left in rq's > + * cfs_tasks list when it's throttled: > + * > + * cpuX cpuY > + * taskA ret2user > + * throttle_cfs_rq_work() sched_move_task(taskA) > + * task_rq_lock acquired > + * dequeue_task_fair(taskA) > + * task_rq_lock released > + * task_rq_lock acquired > + * task_current_donor(taskA) == true > + * task_on_rq_queued(taskA) == true > + * dequeue_task(taskA) > + * put_prev_task(taskA) > + * sched_change_group() > + * enqueue_task(taskA) -> taskA's new cfs_rq > + * is throttled, go > + * fast path and skip > + * actual enqueue > + * set_next_task(taskA) > + * __set_next_task_fair(taskA) > + * list_move(&se->group_node, &rq->cfs_tasks); // bug > + * schedule() > + * > + * And in the above race case, the task's current cfs_rq is in the same > + * rq as its previous cfs_rq because sched_move_task() doesn't migrate > + * task so we can use its current cfs_rq to derive rq and test if the > + * task is current. 
> + */ OK I think I got this; I slightly rephrased things to match similar comments in the sched code: --->8--- diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a7c86c5b8a2b..8566ee0399527 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5827,37 +5827,38 @@ static bool enqueue_throttled_task(struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); /* - * If the throttled task is enqueued to a throttled cfs_rq, - * take the fast path by directly put the task on target - * cfs_rq's limbo list, except when p is current because - * the following race can cause p's group_node left in rq's - * cfs_tasks list when it's throttled: + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorectly re-insterted in its in rq's + * cfs_tasks list, despite being throttled: * * cpuX cpuY - * taskA ret2user - * throttle_cfs_rq_work() sched_move_task(taskA) - * task_rq_lock acquired - * dequeue_task_fair(taskA) - * task_rq_lock released - * task_rq_lock acquired - * task_current_donor(taskA) == true - * task_on_rq_queued(taskA) == true - * dequeue_task(taskA) - * put_prev_task(taskA) - * sched_change_group() - * enqueue_task(taskA) -> taskA's new cfs_rq - * is throttled, go - * fast path and skip - * actual enqueue - * set_next_task(taskA) - * __set_next_task_fair(taskA) - * list_move(&se->group_node, &rq->cfs_tasks); // bug + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current_donor(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug * schedule() * - * And in the above race case, the task's current cfs_rq is in the same - * rq as its previous cfs_rq because sched_move_task() doesn't migrate - * task so we can use its current cfs_rq to derive rq and test if the - * task is current. + * In the above race case, @p current cfs_rq is in the same + * rq as its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the * task is current. */ if (throttled_hierarchy(cfs_rq) && !task_current_donor(rq_of(cfs_rq), p)) { ---8<--- > + if (throttled_hierarchy(cfs_rq) && > + !task_current_donor(rq_of(cfs_rq), p)) { > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); > + return true; > + } > + > + /* we can't take the fast path, do an actual enqueue*/ > + p->throttled = false; So we clear p->throttled but not p->throttle_node? Won't that cause issues when @p's previous cfs_rq gets unthrottled? > + return false; > +} > + > @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) > */ > static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) > { > + if (unlikely(task_is_throttled(p))) { > + dequeue_throttled_task(p, flags); > + return true; > + } > + Handling a throttled task's move pattern at dequeue does simplify things, however that's quite a hot path. I think this wants at the very least a if (cfs_bandwidth_used()) since that has a static key underneath. 
Some heavy EQ/DQ benchmark wouldn't hurt, but we can probably find some people who really care about that to run it for us ;) > if (!p->se.sched_delayed) > util_est_dequeue(&rq->cfs, p); >
On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote: > On 15/07/25 15:16, Aaron Lu wrote: > > From: Valentin Schneider <vschneid@redhat.com> > > > > In current throttle model, when a cfs_rq is throttled, its entity will > > be dequeued from cpu's rq, making tasks attached to it not able to run, > > thus achiveing the throttle target. > > > > This has a drawback though: assume a task is a reader of percpu_rwsem > > and is waiting. When it gets woken, it can not run till its task group's > > next period comes, which can be a relatively long time. Waiting writer > > will have to wait longer due to this and it also makes further reader > > build up and eventually trigger task hung. > > > > To improve this situation, change the throttle model to task based, i.e. > > when a cfs_rq is throttled, record its throttled status but do not remove > > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when > > they get picked, add a task work to them so that when they return > > to user, they can be dequeued there. In this way, tasks throttled will > > not hold any kernel resources. And on unthrottle, enqueue back those > > tasks so they can continue to run. > > > > Moving the actual throttle work to pick time is clever. In my previous > versions I tried really hard to stay out of the enqueue/dequeue/pick paths, > but this makes the code a lot more palatable. I'd like to see how this > impacts performance though. > Let me run some scheduler benchmark to see how it impacts performance. I'm thinking maybe running something like hackbench on server platforms, first with quota not set and see if performance changes; then also test with quota set and see how performance changes. Does this sound good to you? Or do you have any specific benchmark and test methodology in mind? > > Throttled cfs_rq's PELT clock is handled differently now: previously the > > cfs_rq's PELT clock is stopped once it entered throttled state but since > > now tasks(in kernel mode) can continue to run, change the behaviour to > > stop PELT clock only when the throttled cfs_rq has no tasks left. > > > > I haven't spent much time looking at the PELT stuff yet; I'll do that next > week. Thanks for all the work you've been putting into this, and sorry it > got me this long to get a proper look at it! 
> Never mind and thanks for taking a look now :) > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick > > Signed-off-by: Valentin Schneider <vschneid@redhat.com> > > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> > > > +static bool enqueue_throttled_task(struct task_struct *p) > > +{ > > + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); > > + > > + /* > > + * If the throttled task is enqueued to a throttled cfs_rq, > > + * take the fast path by directly put the task on target > > + * cfs_rq's limbo list, except when p is current because > > + * the following race can cause p's group_node left in rq's > > + * cfs_tasks list when it's throttled: > > + * > > + * cpuX cpuY > > + * taskA ret2user > > + * throttle_cfs_rq_work() sched_move_task(taskA) > > + * task_rq_lock acquired > > + * dequeue_task_fair(taskA) > > + * task_rq_lock released > > + * task_rq_lock acquired > > + * task_current_donor(taskA) == true > > + * task_on_rq_queued(taskA) == true > > + * dequeue_task(taskA) > > + * put_prev_task(taskA) > > + * sched_change_group() > > + * enqueue_task(taskA) -> taskA's new cfs_rq > > + * is throttled, go > > + * fast path and skip > > + * actual enqueue > > + * set_next_task(taskA) > > + * __set_next_task_fair(taskA) > > + * list_move(&se->group_node, &rq->cfs_tasks); // bug > > + * schedule() > > + * > > + * And in the above race case, the task's current cfs_rq is in the same > > + * rq as its previous cfs_rq because sched_move_task() doesn't migrate > > + * task so we can use its current cfs_rq to derive rq and test if the > > + * task is current. > > + */ > > OK I think I got this; I slightly rephrased things to match similar > comments in the sched code: > > --->8--- > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 3a7c86c5b8a2b..8566ee0399527 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -5827,37 +5827,38 @@ static bool enqueue_throttled_task(struct task_struct *p) > struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); > > /* > - * If the throttled task is enqueued to a throttled cfs_rq, > - * take the fast path by directly put the task on target > - * cfs_rq's limbo list, except when p is current because > - * the following race can cause p's group_node left in rq's > - * cfs_tasks list when it's throttled: > + * If the throttled task @p is enqueued to a throttled cfs_rq, > + * take the fast path by directly putting the task on the > + * target cfs_rq's limbo list. 
> + > + * Do not do that when @p is current because the following race can > + * cause @p's group_node to be incorectly re-insterted in its in rq's > + * cfs_tasks list, despite being throttled: > * > * cpuX cpuY > - * taskA ret2user > - * throttle_cfs_rq_work() sched_move_task(taskA) > - * task_rq_lock acquired > - * dequeue_task_fair(taskA) > - * task_rq_lock released > - * task_rq_lock acquired > - * task_current_donor(taskA) == true > - * task_on_rq_queued(taskA) == true > - * dequeue_task(taskA) > - * put_prev_task(taskA) > - * sched_change_group() > - * enqueue_task(taskA) -> taskA's new cfs_rq > - * is throttled, go > - * fast path and skip > - * actual enqueue > - * set_next_task(taskA) > - * __set_next_task_fair(taskA) > - * list_move(&se->group_node, &rq->cfs_tasks); // bug > + * p ret2user > + * throttle_cfs_rq_work() sched_move_task(p) > + * LOCK task_rq_lock > + * dequeue_task_fair(p) > + * UNLOCK task_rq_lock > + * LOCK task_rq_lock > + * task_current_donor(p) == true > + * task_on_rq_queued(p) == true > + * dequeue_task(p) > + * put_prev_task(p) > + * sched_change_group() > + * enqueue_task(p) -> p's new cfs_rq > + * is throttled, go > + * fast path and skip > + * actual enqueue > + * set_next_task(p) > + * list_move(&se->group_node, &rq->cfs_tasks); // bug > * schedule() > * > - * And in the above race case, the task's current cfs_rq is in the same > - * rq as its previous cfs_rq because sched_move_task() doesn't migrate > - * task so we can use its current cfs_rq to derive rq and test if the > - * task is current. > + * In the above race case, @p current cfs_rq is in the same > + * rq as its previous cfs_rq because sched_move_task() only moves a task > + * to a different group from the same rq, so we can use its current > + * cfs_rq to derive rq and test if the * task is current. > */ > if (throttled_hierarchy(cfs_rq) && > !task_current_donor(rq_of(cfs_rq), p)) { > ---8<--- > Will follow your suggestion in next version. > > + if (throttled_hierarchy(cfs_rq) && > > + !task_current_donor(rq_of(cfs_rq), p)) { > > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); > > + return true; > > + } > > + > > + /* we can't take the fast path, do an actual enqueue*/ > > + p->throttled = false; > > So we clear p->throttled but not p->throttle_node? Won't that cause issues > when @p's previous cfs_rq gets unthrottled? > p->throttle_node is already removed from its previous cfs_rq at dequeue time in dequeue_throttled_task(). This is done so because in enqueue time, we may not hold its previous rq's lock so can't touch its previous cfs_rq's limbo list, like when dealing with affinity changes. > > + return false; > > +} > > + > > > @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) > > */ > > static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) > > { > > + if (unlikely(task_is_throttled(p))) { > > + dequeue_throttled_task(p, flags); > > + return true; > > + } > > + > > Handling a throttled task's move pattern at dequeue does simplify things, > however that's quite a hot path. I think this wants at the very least a > > if (cfs_bandwidth_used()) > > since that has a static key underneath. Some heavy EQ/DQ benchmark wouldn't > hurt, but we can probably find some people who really care about that to > run it for us ;) > No problem. I'm thinking maybe I can add this cfs_bandwidth_used() in task_is_throttled()? 
So that other callsites of task_is_throttled() can also get the benefit of paying less cost when cfs bandwidth is not enabled. > > if (!p->se.sched_delayed) > > util_est_dequeue(&rq->cfs, p); > > >
On 08/08/25 18:13, Aaron Lu wrote: > On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote: >> On 15/07/25 15:16, Aaron Lu wrote: >> > From: Valentin Schneider <vschneid@redhat.com> >> > >> > In current throttle model, when a cfs_rq is throttled, its entity will >> > be dequeued from cpu's rq, making tasks attached to it not able to run, >> > thus achiveing the throttle target. >> > >> > This has a drawback though: assume a task is a reader of percpu_rwsem >> > and is waiting. When it gets woken, it can not run till its task group's >> > next period comes, which can be a relatively long time. Waiting writer >> > will have to wait longer due to this and it also makes further reader >> > build up and eventually trigger task hung. >> > >> > To improve this situation, change the throttle model to task based, i.e. >> > when a cfs_rq is throttled, record its throttled status but do not remove >> > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when >> > they get picked, add a task work to them so that when they return >> > to user, they can be dequeued there. In this way, tasks throttled will >> > not hold any kernel resources. And on unthrottle, enqueue back those >> > tasks so they can continue to run. >> > >> >> Moving the actual throttle work to pick time is clever. In my previous >> versions I tried really hard to stay out of the enqueue/dequeue/pick paths, >> but this makes the code a lot more palatable. I'd like to see how this >> impacts performance though. >> > > Let me run some scheduler benchmark to see how it impacts performance. > > I'm thinking maybe running something like hackbench on server platforms, > first with quota not set and see if performance changes; then also test > with quota set and see how performance changes. > > Does this sound good to you? Or do you have any specific benchmark and > test methodology in mind? > Yeah hackbench is pretty good for stressing the EQ/DQ paths. >> > + if (throttled_hierarchy(cfs_rq) && >> > + !task_current_donor(rq_of(cfs_rq), p)) { >> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); >> > + return true; >> > + } >> > + >> > + /* we can't take the fast path, do an actual enqueue*/ >> > + p->throttled = false; >> >> So we clear p->throttled but not p->throttle_node? Won't that cause issues >> when @p's previous cfs_rq gets unthrottled? >> > > p->throttle_node is already removed from its previous cfs_rq at dequeue > time in dequeue_throttled_task(). > > This is done so because in enqueue time, we may not hold its previous > rq's lock so can't touch its previous cfs_rq's limbo list, like when > dealing with affinity changes. > Ah right, the DQ/EQ_throttled_task() functions are when DQ/EQ is applied to an already-throttled task and it does the right thing. Does this mean we want this as enqueue_throttled_task()'s prologue? /* @p should have gone through dequeue_throttled_task() first */ WARN_ON_ONCE(!list_empty(&p->throttle_node)); >> > @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) >> > */ >> > static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) >> > { >> > + if (unlikely(task_is_throttled(p))) { >> > + dequeue_throttled_task(p, flags); >> > + return true; >> > + } >> > + >> >> Handling a throttled task's move pattern at dequeue does simplify things, >> however that's quite a hot path. I think this wants at the very least a >> >> if (cfs_bandwidth_used()) >> >> since that has a static key underneath. 
Some heavy EQ/DQ benchmark wouldn't >> hurt, but we can probably find some people who really care about that to >> run it for us ;) >> > > No problem. > > I'm thinking maybe I can add this cfs_bandwidth_used() in > task_is_throttled()? So that other callsites of task_is_throttled() can > also get the benefit of paying less cost when cfs bandwidth is not > enabled. > Sounds good to me; just drop the unlikely and let the static key do its thing :)
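For reference, a minimal sketch of the idea being agreed on here, i.e. letting the cfs_bandwidth_used() static key short-circuit the throttled check; the helper and the p->throttled flag come from this series, but the exact body below is an assumption rather than the final code:

static inline bool task_is_throttled(struct task_struct *p)
{
	/*
	 * cfs_bandwidth_used() is backed by a static key, so callers pay
	 * (nearly) nothing when CFS bandwidth is not enabled anywhere.
	 */
	return cfs_bandwidth_used() && p->throttled;
}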
On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote: > On 08/08/25 18:13, Aaron Lu wrote: > > On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote: ... ... > >> > + if (throttled_hierarchy(cfs_rq) && > >> > + !task_current_donor(rq_of(cfs_rq), p)) { > >> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); > >> > + return true; > >> > + } > >> > + > >> > + /* we can't take the fast path, do an actual enqueue*/ > >> > + p->throttled = false; > >> > >> So we clear p->throttled but not p->throttle_node? Won't that cause issues > >> when @p's previous cfs_rq gets unthrottled? > >> > > > > p->throttle_node is already removed from its previous cfs_rq at dequeue > > time in dequeue_throttled_task(). > > > > This is done so because in enqueue time, we may not hold its previous > > rq's lock so can't touch its previous cfs_rq's limbo list, like when > > dealing with affinity changes. > > > > Ah right, the DQ/EQ_throttled_task() functions are when DQ/EQ is applied to an > already-throttled task and it does the right thing. > > Does this mean we want this as enqueue_throttled_task()'s prologue? > > /* @p should have gone through dequeue_throttled_task() first */ > WARN_ON_ONCE(!list_empty(&p->throttle_node)); > While adding this change to the new version, I remembered this enqueue_throttled_task() also gets called for tasks that are going to be unthrottled on unthrottle path, i.e. unthrottle_cfs_rq() -> tg_unthrottle_up() -> enqueue_task_fair() because task's throttled flag is not cleared yet(but throttle_node is removed from the limbo list so the above warn still works as expected). I didn't clear p->throttled in tg_unthrottle_up() before calling enqueue_task_fair() because enqueue_throttled_task() will take care of that but now I look at it, I think it is better to clear p->throttled before calling enqueue_task_fair(): this saves some cycles by skipping enqueue_throttled_task() for these unthrottled tasks and enqueue_throttled_task() only has to deal with migrated throttled task. This feels cleaner and more efficient. I remember Prateek also suggested this before but I couldn't find his email now, not sure if I remembered wrong. Any way, just a note that I'm going to make below changes to the next version, let me know if this doesn't look right, thanks. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 785a15caffbcc..df8dc389af8e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5904,6 +5904,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) /* Re-enqueue the tasks that have been throttled at this level. */ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { list_del_init(&p->throttle_node); + p->throttled = false; enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); }
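Putting the pieces of this subthread together, a hedged sketch of how enqueue_throttled_task() could then open; the condition, the limbo-list add and the flag clearing are quoted from the patch earlier in the thread, while the signature, the WARN placement and the tail return are assumptions for illustration only:

static bool enqueue_throttled_task(struct task_struct *p)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);

	/* @p should have gone through dequeue_throttled_task() first */
	WARN_ON_ONCE(!list_empty(&p->throttle_node));

	/*
	 * Destination hierarchy is still throttled and @p is not the
	 * current donor: park @p on the limbo list and skip the real
	 * enqueue.
	 */
	if (throttled_hierarchy(cfs_rq) &&
	    !task_current_donor(rq_of(cfs_rq), p)) {
		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
		return true;
	}

	/* We can't take the fast path, do an actual enqueue. */
	p->throttled = false;
	return false;
}

With p->throttled now cleared in tg_unthrottle_up(), only migrated throttled tasks should reach this path.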
On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote: > On 08/08/25 18:13, Aaron Lu wrote: > > On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote: > >> On 15/07/25 15:16, Aaron Lu wrote: > >> > From: Valentin Schneider <vschneid@redhat.com> > >> > > >> > In current throttle model, when a cfs_rq is throttled, its entity will > >> > be dequeued from cpu's rq, making tasks attached to it not able to run, > >> > thus achiveing the throttle target. > >> > > >> > This has a drawback though: assume a task is a reader of percpu_rwsem > >> > and is waiting. When it gets woken, it can not run till its task group's > >> > next period comes, which can be a relatively long time. Waiting writer > >> > will have to wait longer due to this and it also makes further reader > >> > build up and eventually trigger task hung. > >> > > >> > To improve this situation, change the throttle model to task based, i.e. > >> > when a cfs_rq is throttled, record its throttled status but do not remove > >> > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when > >> > they get picked, add a task work to them so that when they return > >> > to user, they can be dequeued there. In this way, tasks throttled will > >> > not hold any kernel resources. And on unthrottle, enqueue back those > >> > tasks so they can continue to run. > >> > > >> > >> Moving the actual throttle work to pick time is clever. In my previous > >> versions I tried really hard to stay out of the enqueue/dequeue/pick paths, > >> but this makes the code a lot more palatable. I'd like to see how this > >> impacts performance though. > >> > > > > Let me run some scheduler benchmark to see how it impacts performance. > > > > I'm thinking maybe running something like hackbench on server platforms, > > first with quota not set and see if performance changes; then also test > > with quota set and see how performance changes. > > > > Does this sound good to you? Or do you have any specific benchmark and > > test methodology in mind? > > > > Yeah hackbench is pretty good for stressing the EQ/DQ paths. > Tested hackbench/pipe and netperf/UDP_RR on Intel EMR(2 sockets/240 cpus) and AMD Genoa(2 sockets/384 cpus), the tldr is: there is no clear performance change between base and this patchset(head). Below is detailed test data: (turbo/boost disabled, cpuidle disabled, cpufreq set to performance) hackbench/pipe/loops=150000 (seconds, smaller is better) On Intel EMR: nr_group base head change 1 3.62±2.99% 3.61±10.42% +0.28% 8 8.06±1.58% 7.88±5.82% +2.23% 16 11.40±2.57% 11.25±3.72% +1.32% For nr_group=16 case, configure a cgroup and set quota to half cpu and then let hackbench run in this cgroup: base head change quota=50% 18.35±2.40% 18.78±1.97% -2.34% On AMD Genoa: nr_group base head change 1 17.05±1.92% 16.99±2.81% +0.35% 8 16.54±0.71% 16.73±1.18% -1.15% 16 27.04±0.39% 26.72±2.37% +1.18% For nr_group=16 case, configure a cgroup and set quota to half cpu and then let hackbench run in this cgroup: base head change quota=50% 43.79±1.10% 44.65±0.37% -1.96% Netperf/UDP_RR/testlen=30s (throughput, higher is better) 25% means nr_clients set to 1/4 nr_cpu, 50% means nr_clients is 1/2 nr_cpu, etc. 
On Intel EMR: nr_clients base head change 25% 83,567±0.06% 84,298±0.23% +0.87% 50% 61,336±1.49% 60,816±0.63% -0.85% 75% 40,592±0.97% 40,461±0.14% -0.32% 100% 31,277±2.11% 30,948±1.84% -1.05% For nr_clients=100% case, configure a cgroup and set quota to half cpu and then let netperf run in this cgroup: nr_clients base head change 100% 25,532±0.56% 26,772±3.05% +4.86% On AMD Genoa: nr_clients base head change 25% 12,443±0.40% 12,525±0.06% +0.66% 50% 11,403±0.35% 11,472±0.50% +0.61% 75% 10,070±0.19% 10,071±0.95% 0.00% 100% 9,947±0.80% 9,881±0.58% -0.66% For nr_clients=100% case, configure a cgroup and set quota to half cpu and then let netperf run in this cgroup: nr_clients base head change 100% 4,954±0.24% 4,952±0.14% 0.00% > >> > + if (throttled_hierarchy(cfs_rq) && > >> > + !task_current_donor(rq_of(cfs_rq), p)) { > >> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); > >> > + return true; > >> > + } > >> > + > >> > + /* we can't take the fast path, do an actual enqueue*/ > >> > + p->throttled = false; > >> > >> So we clear p->throttled but not p->throttle_node? Won't that cause issues > >> when @p's previous cfs_rq gets unthrottled? > >> > > > > p->throttle_node is already removed from its previous cfs_rq at dequeue > > time in dequeue_throttled_task(). > > > > This is done so because in enqueue time, we may not hold its previous > > rq's lock so can't touch its previous cfs_rq's limbo list, like when > > dealing with affinity changes. > > > > Ah right, the DQ/EQ_throttled_task() functions are when DQ/EQ is applied to an > already-throttled task and it does the right thing. > > Does this mean we want this as enqueue_throttled_task()'s prologue? > > /* @p should have gone through dequeue_throttled_task() first */ > WARN_ON_ONCE(!list_empty(&p->throttle_node)); > Sure, will add it in next version. > >> > @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) > >> > */ > >> > static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) > >> > { > >> > + if (unlikely(task_is_throttled(p))) { > >> > + dequeue_throttled_task(p, flags); > >> > + return true; > >> > + } > >> > + > >> > >> Handling a throttled task's move pattern at dequeue does simplify things, > >> however that's quite a hot path. I think this wants at the very least a > >> > >> if (cfs_bandwidth_used()) > >> > >> since that has a static key underneath. Some heavy EQ/DQ benchmark wouldn't > >> hurt, but we can probably find some people who really care about that to > >> run it for us ;) > >> > > > > No problem. > > > > I'm thinking maybe I can add this cfs_bandwidth_used() in > > task_is_throttled()? So that other callsites of task_is_throttled() can > > also get the benefit of paying less cost when cfs bandwidth is not > > enabled. > > > > Sounds good to me; just drop the unlikely and let the static key do its > thing :) Got it, thanks for the suggestion.
On 12/08/25 16:48, Aaron Lu wrote: > On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote: >> On 08/08/25 18:13, Aaron Lu wrote: >> > Let me run some scheduler benchmark to see how it impacts performance. >> > >> > I'm thinking maybe running something like hackbench on server platforms, >> > first with quota not set and see if performance changes; then also test >> > with quota set and see how performance changes. >> > >> > Does this sound good to you? Or do you have any specific benchmark and >> > test methodology in mind? >> > >> >> Yeah hackbench is pretty good for stressing the EQ/DQ paths. >> > > Tested hackbench/pipe and netperf/UDP_RR on Intel EMR(2 sockets/240 > cpus) and AMD Genoa(2 sockets/384 cpus), the tldr is: there is no clear > performance change between base and this patchset(head). Below is > detailed test data: > (turbo/boost disabled, cpuidle disabled, cpufreq set to performance) > > hackbench/pipe/loops=150000 > (seconds, smaller is better) > > On Intel EMR: > > nr_group base head change > 1 3.62±2.99% 3.61±10.42% +0.28% > 8 8.06±1.58% 7.88±5.82% +2.23% > 16 11.40±2.57% 11.25±3.72% +1.32% > > For nr_group=16 case, configure a cgroup and set quota to half cpu and > then let hackbench run in this cgroup: > > base head change > quota=50% 18.35±2.40% 18.78±1.97% -2.34% > > On AMD Genoa: > > nr_group base head change > 1 17.05±1.92% 16.99±2.81% +0.35% > 8 16.54±0.71% 16.73±1.18% -1.15% > 16 27.04±0.39% 26.72±2.37% +1.18% > > For nr_group=16 case, configure a cgroup and set quota to half cpu and > then let hackbench run in this cgroup: > > base head change > quota=50% 43.79±1.10% 44.65±0.37% -1.96% > > Netperf/UDP_RR/testlen=30s > (throughput, higher is better) > > 25% means nr_clients set to 1/4 nr_cpu, 50% means nr_clients is 1/2 > nr_cpu, etc. > > On Intel EMR: > > nr_clients base head change > 25% 83,567±0.06% 84,298±0.23% +0.87% > 50% 61,336±1.49% 60,816±0.63% -0.85% > 75% 40,592±0.97% 40,461±0.14% -0.32% > 100% 31,277±2.11% 30,948±1.84% -1.05% > > For nr_clients=100% case, configure a cgroup and set quota to half cpu > and then let netperf run in this cgroup: > > nr_clients base head change > 100% 25,532±0.56% 26,772±3.05% +4.86% > > On AMD Genoa: > > nr_clients base head change > 25% 12,443±0.40% 12,525±0.06% +0.66% > 50% 11,403±0.35% 11,472±0.50% +0.61% > 75% 10,070±0.19% 10,071±0.95% 0.00% > 100% 9,947±0.80% 9,881±0.58% -0.66% > > For nr_clients=100% case, configure a cgroup and set quota to half cpu > and then let netperf run in this cgroup: > > nr_clients base head change > 100% 4,954±0.24% 4,952±0.14% 0.00% Thank you for running these, looks like mostly slightly bigger variance on a few of these but that's about it. I would also suggest running similar benchmarks but with deeper hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get when tg_unthrottle_up() goes up a bigger tree.
On Thu, Aug 14, 2025 at 05:54:34PM +0200, Valentin Schneider wrote: > On 12/08/25 16:48, Aaron Lu wrote: > > On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote: > >> On 08/08/25 18:13, Aaron Lu wrote: > >> > Let me run some scheduler benchmark to see how it impacts performance. > >> > > >> > I'm thinking maybe running something like hackbench on server platforms, > >> > first with quota not set and see if performance changes; then also test > >> > with quota set and see how performance changes. > >> > > >> > Does this sound good to you? Or do you have any specific benchmark and > >> > test methodology in mind? > >> > > >> > >> Yeah hackbench is pretty good for stressing the EQ/DQ paths. > >> > > > > Tested hackbench/pipe and netperf/UDP_RR on Intel EMR(2 sockets/240 > > cpus) and AMD Genoa(2 sockets/384 cpus), the tldr is: there is no clear > > performance change between base and this patchset(head). Below is > > detailed test data: > > (turbo/boost disabled, cpuidle disabled, cpufreq set to performance) > > > > hackbench/pipe/loops=150000 > > (seconds, smaller is better) > > > > On Intel EMR: > > > > nr_group base head change > > 1 3.62±2.99% 3.61±10.42% +0.28% > > 8 8.06±1.58% 7.88±5.82% +2.23% > > 16 11.40±2.57% 11.25±3.72% +1.32% > > > > For nr_group=16 case, configure a cgroup and set quota to half cpu and > > then let hackbench run in this cgroup: > > > > base head change > > quota=50% 18.35±2.40% 18.78±1.97% -2.34% > > > > On AMD Genoa: > > > > nr_group base head change > > 1 17.05±1.92% 16.99±2.81% +0.35% > > 8 16.54±0.71% 16.73±1.18% -1.15% > > 16 27.04±0.39% 26.72±2.37% +1.18% > > > > For nr_group=16 case, configure a cgroup and set quota to half cpu and > > then let hackbench run in this cgroup: > > > > base head change > > quota=50% 43.79±1.10% 44.65±0.37% -1.96% > > > > Netperf/UDP_RR/testlen=30s > > (throughput, higher is better) > > > > 25% means nr_clients set to 1/4 nr_cpu, 50% means nr_clients is 1/2 > > nr_cpu, etc. > > > > On Intel EMR: > > > > nr_clients base head change > > 25% 83,567±0.06% 84,298±0.23% +0.87% > > 50% 61,336±1.49% 60,816±0.63% -0.85% > > 75% 40,592±0.97% 40,461±0.14% -0.32% > > 100% 31,277±2.11% 30,948±1.84% -1.05% > > > > For nr_clients=100% case, configure a cgroup and set quota to half cpu > > and then let netperf run in this cgroup: > > > > nr_clients base head change > > 100% 25,532±0.56% 26,772±3.05% +4.86% > > > > On AMD Genoa: > > > > nr_clients base head change > > 25% 12,443±0.40% 12,525±0.06% +0.66% > > 50% 11,403±0.35% 11,472±0.50% +0.61% > > 75% 10,070±0.19% 10,071±0.95% 0.00% > > 100% 9,947±0.80% 9,881±0.58% -0.66% > > > > For nr_clients=100% case, configure a cgroup and set quota to half cpu > > and then let netperf run in this cgroup: > > > > nr_clients base head change > > 100% 4,954±0.24% 4,952±0.14% 0.00% > > Thank you for running these, looks like mostly slightly bigger variance on > a few of these but that's about it. > > I would also suggest running similar benchmarks but with deeper > hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get > when tg_unthrottle_up() goes up a bigger tree. No problem. I suppose I can reuse the previous shared test script: https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/ There I used: nr_level1=2 nr_level2=100 nr_level3=10 But I can tweak these numbers for this performance evaluation. I can make the leaf level to be 5 level deep and place tasks in leaf level cgroups and configure quota on 1st level cgroups. 
I'll get back to you once I finish collecting data; feel free to let me know if you have other ideas for testing this :)
On Fri, Aug 15, 2025 at 05:30:08PM +0800, Aaron Lu wrote: > On Thu, Aug 14, 2025 at 05:54:34PM +0200, Valentin Schneider wrote: ... ... > > I would also suggest running similar benchmarks but with deeper > > hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get > > when tg_unthrottle_up() goes up a bigger tree. > > No problem. > > I suppose I can reuse the previous shared test script: > https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/ > > There I used: > nr_level1=2 > nr_level2=100 > nr_level3=10 > > But I can tweak these numbers for this performance evaluation. I can make > the leaf level to be 5 level deep and place tasks in leaf level cgroups > and configure quota on 1st level cgroups. Tested on Intel EMR(2 sockets, 120cores, 240cpus) and AMD Genoa(2 sockets, 192cores, 384cpus), with turbo/boost disabled, cpufreq set to performance and cpuidle states all disabled. cgroup hierarchy: nr_level1=2 nr_level2=2 nr_level3=2 nr_level4=5 nr_level5=5 i.e. two cgroups in the root level, with each level1 cgroup having 2 child cgroups, and each level2 cgroup having 2 child cgroups, etc. This creates a 5 level deep, 200 leaf cgroups setup. Tasks are placed in leaf cgroups. Quota are set on the two level1 cgroups. The TLDR is, when there is a very large number of tasks(like 8000 tasks), task based throttle saw 10-20% performance drop on AMD Genoa; otherwise, no obvious performance change is observed. Detailed test results below. Netperf: measured in throughput, more is better - quota set to 50 cpu for each level1 cgroup; - each leaf cgroup run a pair of netserver and netperf with following cmdline: netserver -p $port_for_this_cgroup netperf -p $port_for_this_cgroup -H 127.0.0.1 -t UDP_RR -c -C -l 30 i.e. each cgroup has 2 tasks, total task number is 2 * 200 = 400 tasks. On Intel EMR: base head diff throughput 33305±8.40% 33995±7.84% noise On AMD Genoa: base head diff throughput 5013±1.16% 4967±1.82 noise Hackbench, measured in seconds, less is better: - quota set to 50cpu for each level1 cgroup; - each cgroup runs with the following cmdline: hackbench -p -g 1 -l $see_below i.e. each leaf cgroup has 20 sender tasks and 20 receiver tasks, total task number is 40 * 200 = 8000 tasks. On Intel EMR(loops set to 100000): base head diff Time 85.45±3.98% 86.41±3.98% noise On AMD Genoa(loops set to 20000): base head diff Time 104±4.33% 116±7.71% -11.54% So for this test case, task based throttle suffered ~10% performance drop. I also tested on another AMD Genoa(same cpu spec) to make sure it's not a machine problem and performance dropped there too: On 2nd AMD Genoa(loops set to 50000) base head diff Time 81±3.13% 101±7.05% -24.69% According to perf, __schedule() in head takes 7.29% cycles while in base it takes 4.61% cycles. I suppose with task based throttle, __schedule() is more frequent since tasks in a throttled cfs_rq have to be dequeued one by one while in current behaviour, the cfs_rq can be dequeued off rq in one go. This is most obvious when there are multiple tasks in a single cfs_rq; if there is only 1 task per cfs_rq, things should be roughly the same for the two throttling model. With this said, I reduced the task number and retested on this 2nd AMD Genoa: - quota set to 50 cpu for each level1 cgroup; - using only 1 fd pair, i.e. 2 task for each cgroup: hackbench -p -g 1 -f 1 -l 50000000 i.e. each leaf cgroup has 1 sender task and 1 receiver task, total task number is 2 * 200 = 400 tasks. 
        base            head            diff
Time    127.77±2.60%    127.49±2.63%    noise

In this setup, performance is about the same.

Now I'm wondering why, on Intel EMR, running that extreme setup (8000 tasks), performance of task based throttle didn't see a noticeable drop...
On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote: > On Fri, Aug 15, 2025 at 05:30:08PM +0800, Aaron Lu wrote: > > On Thu, Aug 14, 2025 at 05:54:34PM +0200, Valentin Schneider wrote: > ... ... > > > I would also suggest running similar benchmarks but with deeper > > > hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get > > > when tg_unthrottle_up() goes up a bigger tree. > > > > No problem. > > > > I suppose I can reuse the previous shared test script: > > https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/ > > > > There I used: > > nr_level1=2 > > nr_level2=100 > > nr_level3=10 > > > > But I can tweak these numbers for this performance evaluation. I can make > > the leaf level to be 5 level deep and place tasks in leaf level cgroups > > and configure quota on 1st level cgroups. > > Tested on Intel EMR(2 sockets, 120cores, 240cpus) and AMD Genoa(2 > sockets, 192cores, 384cpus), with turbo/boost disabled, cpufreq set to > performance and cpuidle states all disabled. > > cgroup hierarchy: > nr_level1=2 > nr_level2=2 > nr_level3=2 > nr_level4=5 > nr_level5=5 > i.e. two cgroups in the root level, with each level1 cgroup having 2 > child cgroups, and each level2 cgroup having 2 child cgroups, etc. This > creates a 5 level deep, 200 leaf cgroups setup. Tasks are placed in leaf > cgroups. Quota are set on the two level1 cgroups. > > The TLDR is, when there is a very large number of tasks(like 8000 tasks), > task based throttle saw 10-20% performance drop on AMD Genoa; otherwise, > no obvious performance change is observed. Detailed test results below. > > Netperf: measured in throughput, more is better > - quota set to 50 cpu for each level1 cgroup; > - each leaf cgroup run a pair of netserver and netperf with following > cmdline: > netserver -p $port_for_this_cgroup > netperf -p $port_for_this_cgroup -H 127.0.0.1 -t UDP_RR -c -C -l 30 > i.e. each cgroup has 2 tasks, total task number is 2 * 200 = 400 > tasks. > > On Intel EMR: > base head diff > throughput 33305±8.40% 33995±7.84% noise > > On AMD Genoa: > base head diff > throughput 5013±1.16% 4967±1.82 noise > > > Hackbench, measured in seconds, less is better: > - quota set to 50cpu for each level1 cgroup; > - each cgroup runs with the following cmdline: > hackbench -p -g 1 -l $see_below > i.e. each leaf cgroup has 20 sender tasks and 20 receiver tasks, total > task number is 40 * 200 = 8000 tasks. > > On Intel EMR(loops set to 100000): > > base head diff > Time 85.45±3.98% 86.41±3.98% noise > > On AMD Genoa(loops set to 20000): > > base head diff > Time 104±4.33% 116±7.71% -11.54% > > So for this test case, task based throttle suffered ~10% performance > drop. I also tested on another AMD Genoa(same cpu spec) to make sure > it's not a machine problem and performance dropped there too: > > On 2nd AMD Genoa(loops set to 50000) > > base head diff > Time 81±3.13% 101±7.05% -24.69% > > According to perf, __schedule() in head takes 7.29% cycles while in base > it takes 4.61% cycles. I suppose with task based throttle, __schedule() > is more frequent since tasks in a throttled cfs_rq have to be dequeued > one by one while in current behaviour, the cfs_rq can be dequeued off rq > in one go. This is most obvious when there are multiple tasks in a single > cfs_rq; if there is only 1 task per cfs_rq, things should be roughly the > same for the two throttling model. 
> > With this said, I reduced the task number and retested on this 2nd AMD > Genoa: > - quota set to 50 cpu for each level1 cgroup; > - using only 1 fd pair, i.e. 2 task for each cgroup: > hackbench -p -g 1 -f 1 -l 50000000 > i.e. each leaf cgroup has 1 sender task and 1 receiver task, total > task number is 2 * 200 = 400 tasks. > > base head diff > Time 127.77±2.60% 127.49±2.63% noise > > In this setup, performance is about the same. > > Now I'm wondering why on Intel EMR, running that extreme setup(8000 > tasks), performance of task based throttle didn't see noticeable drop... Looks like hackbench doesn't like task migration on this AMD system (domain0 SMT; domain1 MC; domain2 PKG; domain3 NUMA). If I revert patch5, running this 40 * 200 = 8000 hackbench workload again, performance is roughly the same now(head~1 is slightly worse but given the 4+% stddev in base, it can be considered in noise range): base head~1(patch1-4) diff head(patch1-5) Time 82.55±4.82% 84.45±2.70% -2.3% 99.69±6.71% According to /proc/schedstat, the lb_gained for domain2 is: NOT_IDLE IDLE NEWLY_IDLE base 0 8052 81791 head~1 0 7197 175096 head 1 14818 793065 Other domains have similar number: base has smallest migration number while head has the most and head~1 reduce the number a lot. I suppose this is expected, because we removed the throttled_lb_pair() restriction in patch5 and that can cause runnable tasks in throttled hierarchy to be balanced to other cpus while in base, this can not happen. I think patch5 still makes sense and is correct, it's just this specific workload doesn't like task migrations. Intel EMR doesn't suffer from this, I suppose that's because EMR has a much larger LLC while AMD Genoa has a relatively small LLC and task migrations across LLC boundary hurts hackbench's performance. I also tried to apply below hack to prove this "task migration across LLC boundary hurts hackbench" theory on both base and head: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c2..34c5f6b75e53d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9297,6 +9297,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; + if (!(env->sd->flags & SD_SHARE_LLC)) + return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; With this diff applied, the result is: base' head' diff Time 74.78±8.2% 78.87±15.4% -5.47% base': base + above diff head': head + above diff So both performs better now, but with much larger variance, I guess that's because no load balance on domain2 and above now. head' is still worse than base, but not as much as before. To conclude this: hackbench doesn't like task migration, especially when task is migrated across LLC boundary. patch5 removed the restriction of no balancing throttled tasks, this caused more balance to happen and hackbench doesn't like this. But balancing has its own merit and could still benefit other workloads so I think patch5 should stay, especially considering that when throttled tasks are eventually dequeued, they will not stay on rq's cfs_tasks list so no need to take special care for them when doing load balance. On a side note: should we increase the cost of balancing tasks out of LLC boundary? I tried to enlarge sysctl_sched_migration_cost 100 times for domains without SD_SHARE_LLC in task_hot() but that didn't help.
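For clarity, this is roughly what the sysctl_sched_migration_cost experiment above amounts to, as a fragment inside task_hot(); the factor of 100 is from the text, the exact placement is an assumption, and as noted it did not help here:

	u64 cost = sysctl_sched_migration_cost;

	/* Make migrations crossing an LLC boundary look "hotter". */
	if (!(env->sd->flags & SD_SHARE_LLC))
		cost *= 100;

	delta = rq_clock_task(env->src_rq) - p->se.exec_start;

	return delta < (s64)cost;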
Hello Aaron, On 9/3/2025 12:44 PM, Aaron Lu wrote: > On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote: >> With this said, I reduced the task number and retested on this 2nd AMD >> Genoa: >> - quota set to 50 cpu for each level1 cgroup; What exactly is the quota and period when you say 50cpu? >> - using only 1 fd pair, i.e. 2 task for each cgroup: >> hackbench -p -g 1 -f 1 -l 50000000 >> i.e. each leaf cgroup has 1 sender task and 1 receiver task, total >> task number is 2 * 200 = 400 tasks. >> >> base head diff >> Time 127.77±2.60% 127.49±2.63% noise >> >> In this setup, performance is about the same. >> >> Now I'm wondering why on Intel EMR, running that extreme setup(8000 >> tasks), performance of task based throttle didn't see noticeable drop... > > Looks like hackbench doesn't like task migration on this AMD system > (domain0 SMT; domain1 MC; domain2 PKG; domain3 NUMA). > > If I revert patch5, running this 40 * 200 = 8000 hackbench workload > again, performance is roughly the same now(head~1 is slightly worse but > given the 4+% stddev in base, it can be considered in noise range): > > base head~1(patch1-4) diff head(patch1-5) > Time 82.55±4.82% 84.45±2.70% -2.3% 99.69±6.71% > > According to /proc/schedstat, the lb_gained for domain2 is: > > NOT_IDLE IDLE NEWLY_IDLE > base 0 8052 81791 > head~1 0 7197 175096 > head 1 14818 793065 Since these are mostly idle and newidle balance, I wonder if we can run into a scenario where, 1. All the tasks are throttled. 2. CPU turning idle does a newidle balance. 3. CPU pulls a tasks from throttled hierarchy and selects it. 4. The task exits to user space and is dequeued. 5. Goto 1. and when the CPU is unthrottled, it has a large number of tasks on it that'll again require a load balance to even stuff out. > > Other domains have similar number: base has smallest migration number > while head has the most and head~1 reduce the number a lot. I suppose > this is expected, because we removed the throttled_lb_pair() restriction > in patch5 and that can cause runnable tasks in throttled hierarchy to be > balanced to other cpus while in base, this can not happen. > > I think patch5 still makes sense and is correct, it's just this specific > workload doesn't like task migrations. Intel EMR doesn't suffer from > this, I suppose that's because EMR has a much larger LLC while AMD Genoa > has a relatively small LLC and task migrations across LLC boundary hurts > hackbench's performance. I think we can leave the throttled_lb_pair() condition as is and revisit it later if this is visible in real world workloads. 
I cannot think of any easy way to avoid the case for potential pileup without accounting for the throttled tasks in limbo except for something like below at head~1: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bdc9bfa0b9ef..3dc807af21ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9385,7 +9385,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) throttled_lb_pair, or + * 2) throttled_lb_pair unless we migrate load, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or * 5) are cache-hot on their current CPU, or @@ -9394,7 +9394,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu) && + env->migration_type != migrate_load) return 0; /* --- Since load_avg moves slowly, it might be enough to avoid pileup of tasks. This is similar to the condition for migrating delayed tasks above but unlike the hierarchies of delayed tasks, the weight of throttled hierarchy does change when throttled tasks are transitioned to limbo so this needs some more staring at. > > I also tried to apply below hack to prove this "task migration across > LLC boundary hurts hackbench" theory on both base and head: > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index b173a059315c2..34c5f6b75e53d 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -9297,6 +9297,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) > if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) > return 0; > > + if (!(env->sd->flags & SD_SHARE_LLC)) > + return 0; > + > if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) > return 0; > > With this diff applied, the result is: > > > base' head' diff > Time 74.78±8.2% 78.87±15.4% -5.47% > > base': base + above diff > head': head + above diff > > So both performs better now, but with much larger variance, I guess > that's because no load balance on domain2 and above now. head' is still > worse than base, but not as much as before. > > To conclude this: hackbench doesn't like task migration, especially when > task is migrated across LLC boundary. patch5 removed the restriction of > no balancing throttled tasks, this caused more balance to happen and > hackbench doesn't like this. But balancing has its own merit and could > still benefit other workloads so I think patch5 should stay, especially > considering that when throttled tasks are eventually dequeued, they will > not stay on rq's cfs_tasks list so no need to take special care for them > when doing load balance. Mathieu had run some experiments a couple years ago where he too discovered reducing the number of migrations for hackbench helps but it wasn't clear if these strategies would benefit real-world workloads: https://lore.kernel.org/lkml/20230905171105.1005672-1-mathieu.desnoyers@efficios.com/ https://lore.kernel.org/lkml/20231018204511.1563390-1-mathieu.desnoyers@efficios.com/ > > On a side note: should we increase the cost of balancing tasks out of LLC > boundary? I tried to enlarge sysctl_sched_migration_cost 100 times for > domains without SD_SHARE_LLC in task_hot() but that didn't help. 
I'll take a look at sd->imbalance_pct and see if there is any room for improvement there.

Thank you again for the detailed analysis.

--
Thanks and Regards,
Prateek
Hi Prateek, On Wed, Sep 03, 2025 at 02:41:55PM +0530, K Prateek Nayak wrote: > Hello Aaron, > > On 9/3/2025 12:44 PM, Aaron Lu wrote: > > On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote: > >> With this said, I reduced the task number and retested on this 2nd AMD > >> Genoa: > >> - quota set to 50 cpu for each level1 cgroup; > > What exactly is the quota and period when you say 50cpu? period is the default 100000 and quota is set to 5000000. > > >> - using only 1 fd pair, i.e. 2 task for each cgroup: > >> hackbench -p -g 1 -f 1 -l 50000000 > >> i.e. each leaf cgroup has 1 sender task and 1 receiver task, total > >> task number is 2 * 200 = 400 tasks. > >> > >> base head diff > >> Time 127.77±2.60% 127.49±2.63% noise > >> > >> In this setup, performance is about the same. > >> > >> Now I'm wondering why on Intel EMR, running that extreme setup(8000 > >> tasks), performance of task based throttle didn't see noticeable drop... > > > > Looks like hackbench doesn't like task migration on this AMD system > > (domain0 SMT; domain1 MC; domain2 PKG; domain3 NUMA). > > > > If I revert patch5, running this 40 * 200 = 8000 hackbench workload > > again, performance is roughly the same now(head~1 is slightly worse but > > given the 4+% stddev in base, it can be considered in noise range): > > > > base head~1(patch1-4) diff head(patch1-5) > > Time 82.55±4.82% 84.45±2.70% -2.3% 99.69±6.71% > > > > According to /proc/schedstat, the lb_gained for domain2 is: > > > > NOT_IDLE IDLE NEWLY_IDLE > > base 0 8052 81791 > > head~1 0 7197 175096 > > head 1 14818 793065 > > Since these are mostly idle and newidle balance, I wonder if we can run > into a scenario where, > > 1. All the tasks are throttled. > 2. CPU turning idle does a newidle balance. > 3. CPU pulls a tasks from throttled hierarchy and selects it. > 4. The task exits to user space and is dequeued. > 5. Goto 1. > > and when the CPU is unthrottled, it has a large number of tasks on it > that'll again require a load balance to even stuff out. > I think it is because we allow balancing tasks under a throttled hirarchy that made the balance number much larger. > > > > Other domains have similar number: base has smallest migration number > > while head has the most and head~1 reduce the number a lot. I suppose > > this is expected, because we removed the throttled_lb_pair() restriction > > in patch5 and that can cause runnable tasks in throttled hierarchy to be > > balanced to other cpus while in base, this can not happen. > > > > I think patch5 still makes sense and is correct, it's just this specific > > workload doesn't like task migrations. Intel EMR doesn't suffer from > > this, I suppose that's because EMR has a much larger LLC while AMD Genoa > > has a relatively small LLC and task migrations across LLC boundary hurts > > hackbench's performance. > > I think we can leave the throttled_lb_pair() condition as is and revisit > it later if this is visible in real world workloads. 
I cannot think of > any easy way to avoid the case for potential pileup without accounting > for the throttled tasks in limbo except for something like below at > head~1: > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index bdc9bfa0b9ef..3dc807af21ba 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -9385,7 +9385,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) > /* > * We do not migrate tasks that are: > * 1) delayed dequeued unless we migrate load, or > - * 2) throttled_lb_pair, or > + * 2) throttled_lb_pair unless we migrate load, or > * 3) cannot be migrated to this CPU due to cpus_ptr, or > * 4) running (obviously), or > * 5) are cache-hot on their current CPU, or > @@ -9394,7 +9394,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) > if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) > return 0; > > - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) > + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu) && > + env->migration_type != migrate_load) > return 0; > > /* > --- > > Since load_avg moves slowly, it might be enough to avoid pileup of > tasks. This is similar to the condition for migrating delayed tasks > above but unlike the hierarchies of delayed tasks, the weight of > throttled hierarchy does change when throttled tasks are transitioned to > limbo so this needs some more staring at. > I was thinking: should we not allow task balancing to a throttled target cfs_rq? For task based throttle model, if a task is on rq's cfs_tasks list, it is allowed to run so we should not check src cfs_rq's throttle status but we should check if the target cfs_rq is throttled and if it is, then it's probably not very useful to do the balance. I tried below diff and the performance is restored: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df8dc389af8e1..3e927b9b7eeb6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9370,6 +9370,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; + if (throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu])) + return 0; + /* * We want to prioritize the migration of eligible tasks. * For ineligible tasks we soft-limit them and only allow base head' diff head(patch1-5) Time 82.55±4.82% 83.81±2.89% -1.5% 99.69±6.71% head': head + above diff I also tested netperf on this AMD system as well as hackbench and netperf on Intel EMR, no obvious performance difference observed after applying the above diff, i.e. base and head' performance is roughly the same. Does the above diff make sense? One thing I'm slightly concerned is, there may be one case when balancing a task to a throttled target cfs_rq makes sense: if the task holds some kernel resource and is running inside kernel, even its target cfs_rq is throttled, we still want this task to go there and finish its job in kernel mode sooner, this could help other resource waiters. But, this may not be a big deal and in most of the time, balancing a task to a throttled cfs_rq doesn't look like a meaningful thing to do. Best regards, Aaron
Hello Aaron, On 9/3/2025 3:41 PM, Aaron Lu wrote: > Hi Prateek, > > On Wed, Sep 03, 2025 at 02:41:55PM +0530, K Prateek Nayak wrote: >> Hello Aaron, >> >> On 9/3/2025 12:44 PM, Aaron Lu wrote: >>> On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote: >>>> With this said, I reduced the task number and retested on this 2nd AMD >>>> Genoa: >>>> - quota set to 50 cpu for each level1 cgroup; >> >> What exactly is the quota and period when you say 50cpu? > > period is the default 100000 and quota is set to 5000000. Thank you! I'll do some tests on my end as well. [..snip..] > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index df8dc389af8e1..3e927b9b7eeb6 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -9370,6 +9370,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) > if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) > return 0; > > + if (throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu])) > + return 0; > + This makes sense instead of the full throttled_lb_pair(). You'll still need to put it behind CONFIG_CGROUP_SCHED (or better yet CONFIG_CFS_BANDWIDTH) since task_group() can return NULL if GROUP_SCHED is not enabled. > /* > * We want to prioritize the migration of eligible tasks. > * For ineligible tasks we soft-limit them and only allow > > base head' diff head(patch1-5) > Time 82.55±4.82% 83.81±2.89% -1.5% 99.69±6.71% > > head': head + above diff > > I also tested netperf on this AMD system as well as hackbench and > netperf on Intel EMR, no obvious performance difference observed > after applying the above diff, i.e. base and head' performance is > roughly the same. > > Does the above diff make sense? One thing I'm slightly concerned is, > there may be one case when balancing a task to a throttled target > cfs_rq makes sense: if the task holds some kernel resource and is > running inside kernel, even its target cfs_rq is throttled, we still > want this task to go there and finish its job in kernel mode sooner, > this could help other resource waiters. But, this may not be a big deal I think it is still an improvement over per-cfs_rq throttling from a tail latency perspective. > and in most of the time, balancing a task to a throttled cfs_rq doesn't > look like a meaningful thing to do.Ack. -- Thanks and Regards, Prateek
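A hedged sketch of one way to guard the new check as suggested above, so it compiles away when CFS bandwidth is not built in; the wrapper name is made up for illustration and, as the reply below shows, simply testing task_group(p) first is another option:

#ifdef CONFIG_CFS_BANDWIDTH
static inline bool task_dst_cfs_rq_throttled(struct task_struct *p, int dst_cpu)
{
	/* Is @p's task group throttled on the balance destination CPU? */
	return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
}
#else
static inline bool task_dst_cfs_rq_throttled(struct task_struct *p, int dst_cpu)
{
	return false;
}
#endif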
On Wed, Sep 03, 2025 at 04:01:03PM +0530, K Prateek Nayak wrote: > Hello Aaron, > > On 9/3/2025 3:41 PM, Aaron Lu wrote: > > Hi Prateek, > > > > On Wed, Sep 03, 2025 at 02:41:55PM +0530, K Prateek Nayak wrote: > >> Hello Aaron, > >> > >> On 9/3/2025 12:44 PM, Aaron Lu wrote: > >>> On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote: > >>>> With this said, I reduced the task number and retested on this 2nd AMD > >>>> Genoa: > >>>> - quota set to 50 cpu for each level1 cgroup; > >> > >> What exactly is the quota and period when you say 50cpu? > > > > period is the default 100000 and quota is set to 5000000. > > Thank you! I'll do some tests on my end as well. > I've attached test scripts I've used for your reference. > [..snip..] > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > index df8dc389af8e1..3e927b9b7eeb6 100644 > > --- a/kernel/sched/fair.c > > +++ b/kernel/sched/fair.c > > @@ -9370,6 +9370,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) > > if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) > > return 0; > > > > + if (throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu])) > > + return 0; > > + > > This makes sense instead of the full throttled_lb_pair(). You'll still > need to put it behind CONFIG_CGROUP_SCHED (or better yet > CONFIG_CFS_BANDWIDTH) since task_group() can return NULL if GROUP_SCHED > is not enabled. > Got it, thanks for the remind. Maybe I can avoid adding new wrappers and just check task_group() first, something like this: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df8dc389af8e1..d9abde5e631b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9362,6 +9362,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or + * 2) target cfs_rq is in throttled hierarchy, or * 2) cannot be migrated to this CPU due to cpus_ptr, or * 3) running (obviously), or * 4) are cache-hot on their current CPU, or @@ -9370,6 +9371,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; + if (task_group(p) && + throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu])) + return 0; + /* * We want to prioritize the migration of eligible tasks. * For ineligible tasks we soft-limit them and only allow > > /* > > * We want to prioritize the migration of eligible tasks. > > * For ineligible tasks we soft-limit them and only allow > > > > base head' diff head(patch1-5) > > Time 82.55±4.82% 83.81±2.89% -1.5% 99.69±6.71% > > > > head': head + above diff > > > > I also tested netperf on this AMD system as well as hackbench and > > netperf on Intel EMR, no obvious performance difference observed > > after applying the above diff, i.e. base and head' performance is > > roughly the same. > > > > Does the above diff make sense? One thing I'm slightly concerned is, > > there may be one case when balancing a task to a throttled target > > cfs_rq makes sense: if the task holds some kernel resource and is > > running inside kernel, even its target cfs_rq is throttled, we still > > want this task to go there and finish its job in kernel mode sooner, > > this could help other resource waiters. But, this may not be a big deal > > I think it is still an improvement over per-cfs_rq throttling from a > tail latency perspective. 
> > and in most of the time, balancing a task to a throttled cfs_rq doesn't > > look like a meaningful thing to do. > Ack. Just want to add that with the above diff applied, I also tested songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't see any problem. [0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/ [1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/
Hi Aaron, On Wed, 2025-09-03 at 19:35 +0800, Aaron Lu wrote: > > > [..snip..] > > > > > Just want to add that with the above diff applied, I also tested > songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't > see any problem. Thanks a lot for taking the reproducer into account. But: To trigger PREEMPT_RT needs to be enabled, otherwise the rwlock in the epoll infrastructure (that we highly stress) won't sleep. Just to be sure: PREEMPT_RT was enabled on your end? > > [0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/ > [1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/
Hi Florian, On Thu, Sep 04, 2025 at 07:33:03AM +0000, Bezdeka, Florian wrote: > Hi Aaron, > > On Wed, 2025-09-03 at 19:35 +0800, Aaron Lu wrote: > > > > > [..snip..] > > > > > > > > Just want to add that with the above diff applied, I also tested > > songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't > > see any problem. > > Thanks a lot for taking the reproducer into account. But: To trigger > PREEMPT_RT needs to be enabled, otherwise the rwlock in the epoll > infrastructure (that we highly stress) won't sleep. Just to be sure: > PREEMPT_RT was enabled on your end? > Yes, understood. I had PREEMPT_RT enabled when running Jan's rt deadlock reproducer. I also ran base first and made sure the hang happened there before testing this series. Normally, base would hang within 2 minutes; when testing this series, I give it 5 minutes to see if anything abnormal happens. > > > > [0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/ > > [1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/
Hello Florian, On 9/4/2025 1:03 PM, Bezdeka, Florian wrote: > Hi Aaron, > > On Wed, 2025-09-03 at 19:35 +0800, Aaron Lu wrote: >> >>> [..snip..] >>>> >> >> Just want to add that with the above diff applied, I also tested >> songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't >> see any problem. > > Thanks a lot for taking the reproducer into account. But: To trigger > PREEMPT_RT needs to be enabled, otherwise the rwlock in the epoll > infrastructure (that we highly stress) won't sleep. Just to be sure: > PREEMPT_RT was enabled on your end? I too have tested this series with Jan's reproducer on PREEMPT_RT and can confirm no hang was triggered over a full weekend run lasting more than 72 hours with Aaron's changes applied on top of a v6.16-rc6 based kernel. > >> >> [0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/ >> [1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/ I used the same reproducer from [1] with a small modification to run.sh that periodically moves epoll-stall and epoll-stall-writer to one particular CPU and changes this CPU every minute. -- Thanks and Regards, Prateek
Hi Aaron, kernel test robot noticed the following build warnings: [auto build test WARNING on tip/sched/core] [also build test WARNING on next-20250716] [cannot apply to linus/master v6.16-rc6] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307 base: tip/sched/core patch link: https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model config: xtensa-randconfig-r121-20250716 (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/config) compiler: xtensa-linux-gcc (GCC) 8.5.0 reproduce: (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202507162238.qiw7kyu0-lkp@intel.com/ sparse warnings: (new ones prefixed by >>) kernel/sched/core.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/thread_info.h, ...): arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI kernel/sched/core.c: note: in included file: >> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces): kernel/sched/sched.h:2252:26: sparse: struct task_struct [noderef] __rcu * kernel/sched/sched.h:2252:26: sparse: struct task_struct * kernel/sched/sched.h:2429:9: sparse: sparse: incompatible types in comparison expression (different address spaces): kernel/sched/sched.h:2429:9: sparse: struct task_struct [noderef] __rcu * kernel/sched/sched.h:2429:9: sparse: struct task_struct * kernel/sched/core.c:2131:38: sparse: sparse: incompatible types in comparison expression (different address spaces): kernel/sched/core.c:2131:38: sparse: struct task_struct [noderef] __rcu * kernel/sched/core.c:2131:38: sparse: struct task_struct const * kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces): kernel/sched/sched.h:2252:26: sparse: struct task_struct [noderef] __rcu * kernel/sched/sched.h:2252:26: sparse: struct task_struct * kernel/sched/sched.h:2452:9: sparse: sparse: incompatible types in comparison expression (different address spaces): kernel/sched/sched.h:2452:9: sparse: struct task_struct [noderef] __rcu * kernel/sched/sched.h:2452:9: sparse: struct task_struct * kernel/sched/sched.h:2452:9: sparse: sparse: incompatible types in comparison expression (different address spaces): kernel/sched/sched.h:2452:9: sparse: struct task_struct [noderef] __rcu * kernel/sched/sched.h:2452:9: sparse: struct task_struct * kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces): kernel/sched/sched.h:2252:26: sparse: struct task_struct [noderef] __rcu * kernel/sched/sched.h:2252:26: 
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct *
   kernel/sched/sched.h:2429:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct *
--
   kernel/sched/fair.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/kernel.h, ...):
   arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
   arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
   kernel/sched/fair.c: note: in included file:
>> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/fair.c:6073:22: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/fair.c:6073:22: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/fair.c:6073:22: sparse:    struct task_struct *
   kernel/sched/fair.c:10625:22: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/fair.c:10625:22: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/fair.c:10625:22: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2452:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct *
--
   kernel/sched/build_policy.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/kernel.h, ...):
   arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
   arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
   kernel/sched/build_policy.c: note: in included file:
>> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/rt.c:2289:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/rt.c:2289:25: sparse:    struct task_struct *
   kernel/sched/rt.c:2289:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/rt.c:1994:13: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/rt.c:1994:13: sparse:    struct task_struct *
   kernel/sched/rt.c:1994:13: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/deadline.c:2675:13: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/deadline.c:2675:13: sparse:    struct task_struct *
   kernel/sched/deadline.c:2675:13: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/deadline.c:2781:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/deadline.c:2781:25: sparse:    struct task_struct *
   kernel/sched/deadline.c:2781:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/deadline.c:3024:23: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/deadline.c:3024:23: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/deadline.c:3024:23: sparse:    struct task_struct *
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/syscalls.c:206:22: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/syscalls.c:206:22: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/syscalls.c:206:22: sparse:    struct task_struct *
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2429:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct *
--
   kernel/sched/build_utility.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/kernel.h, ...):
   arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
   arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
   kernel/sched/build_utility.c: note: in included file:
>> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *

vim +741 kernel/sched/sched.h

   709	
   710	#ifdef CONFIG_FAIR_GROUP_SCHED
   711		struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
   712	
   713		/*
   714		 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
   715		 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
   716		 * (like users, containers etc.)
   717		 *
   718		 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
   719		 * This list is used during load balance.
   720		 */
   721		int			on_list;
   722		struct list_head	leaf_cfs_rq_list;
   723		struct task_group	*tg;	/* group that "owns" this runqueue */
   724	
   725		/* Locally cached copy of our task_group's idle value */
   726		int			idle;
   727	
   728	#ifdef CONFIG_CFS_BANDWIDTH
   729		int			runtime_enabled;
   730		s64			runtime_remaining;
   731	
   732		u64			throttled_pelt_idle;
   733	#ifndef CONFIG_64BIT
   734		u64			throttled_pelt_idle_copy;
   735	#endif
   736		u64			throttled_clock;
   737		u64			throttled_clock_pelt;
   738		u64			throttled_clock_pelt_time;
   739		u64			throttled_clock_self;
   740		u64			throttled_clock_self_time;
 > 741		int			throttled:1;
   742		int			pelt_clock_throttled:1;
   743		int			throttle_count;
   744		struct list_head	throttled_list;
   745		struct list_head	throttled_csd_list;
   746		struct list_head	throttled_limbo_list;
   747	#endif /* CONFIG_CFS_BANDWIDTH */
   748	#endif /* CONFIG_FAIR_GROUP_SCHED */
   749	};
   750	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Wed, Jul 16, 2025 at 11:20:55PM +0800, kernel test robot wrote:
> Hi Aaron,
>
> kernel test robot noticed the following build warnings:
>
> [auto build test WARNING on tip/sched/core]
> [also build test WARNING on next-20250716]
> [cannot apply to linus/master v6.16-rc6]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
> base:   tip/sched/core
> patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
> patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
> config: xtensa-randconfig-r121-20250716 (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/config)
> compiler: xtensa-linux-gcc (GCC) 8.5.0
> reproduce: (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202507162238.qiw7kyu0-lkp@intel.com/
>
> sparse warnings: (new ones prefixed by >>)
>    kernel/sched/core.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/thread_info.h, ...):
>    arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
>    arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
>    kernel/sched/core.c: note: in included file:
> >> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield

Same problem as last report.

I've downloaded this compiler from kernel.org and confirmed there is no
such warnings after using bool.

Thanks,
Aaron
hi, Aaron,

On Thu, Jul 17, 2025 at 11:52:43AM +0800, Aaron Lu wrote:
> On Wed, Jul 16, 2025 at 11:20:55PM +0800, kernel test robot wrote:
> > Hi Aaron,
> >
> > kernel test robot noticed the following build warnings:
> >
> > sparse warnings: (new ones prefixed by >>)
> >    kernel/sched/core.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/thread_info.h, ...):
> >    arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
> >    arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
> >    kernel/sched/core.c: note: in included file:
> > >> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
>
> Same problem as last report.
>
> I've downloaded this compiler from kernel.org and confirmed there is no
> such warnings after using bool.

want to confirm, do you mean you can reproduce the build sparse error

> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield

then after doing below change:

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3c3ea0089b0b5..6eb15b00edccd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -738,7 +738,7 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled:1;
+ bool throttled:1;
int pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;

the issue will disappear?

>
> Thanks,
> Aaron
>
Hi Oliver,

On Wed, Jul 23, 2025 at 04:21:59PM +0800, Oliver Sang wrote:
> hi, Aaron,
>
> On Thu, Jul 17, 2025 at 11:52:43AM +0800, Aaron Lu wrote:
> > On Wed, Jul 16, 2025 at 11:20:55PM +0800, kernel test robot wrote:
> > > Hi Aaron,
> > >
> > > kernel test robot noticed the following build warnings:
> > >
> > > sparse warnings: (new ones prefixed by >>)
> > > >> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
> >
> > Same problem as last report.
> >
> > I've downloaded this compiler from kernel.org and confirmed there is no
> > such warnings after using bool.
>
> want to confirm, do you mean you can reproduce the build sparse error
>
> > kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield

Right.

> then after doing below change:
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 3c3ea0089b0b5..6eb15b00edccd 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -738,7 +738,7 @@ struct cfs_rq {
> u64 throttled_clock_pelt_time;
> u64 throttled_clock_self;
> u64 throttled_clock_self_time;
> - int throttled:1;
> + bool throttled:1;
> int pelt_clock_throttled:1;
> int throttle_count;
> struct list_head throttled_list;
>
> the issue will disappear?

Yes, that's correct.
Hi Aaron,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on next-20250715]
[cannot apply to linus/master v6.16-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
base:   tip/sched/core
patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
config: i386-buildonly-randconfig-006-20250716 (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507160730.0cXkgs0S-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
    5456 |                 cfs_rq->pelt_clock_throttled = 1;
         |                                              ^ ~
   kernel/sched/fair.c:5971:32: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
    5971 |                cfs_rq->pelt_clock_throttled = 1;
         |                                             ^ ~
   kernel/sched/fair.c:6014:20: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
    6014 |        cfs_rq->throttled = 1;
         |                          ^ ~
   3 warnings generated.


vim +/int +5456 kernel/sched/fair.c

  5372	
  5373	static bool
  5374	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  5375	{
  5376		bool sleep = flags & DEQUEUE_SLEEP;
  5377		int action = UPDATE_TG;
  5378	
  5379		update_curr(cfs_rq);
  5380		clear_buddies(cfs_rq, se);
  5381	
  5382		if (flags & DEQUEUE_DELAYED) {
  5383			WARN_ON_ONCE(!se->sched_delayed);
  5384		} else {
  5385			bool delay = sleep;
  5386			/*
  5387			 * DELAY_DEQUEUE relies on spurious wakeups, special task
  5388			 * states must not suffer spurious wakeups, excempt them.
  5389			 */
  5390			if (flags & DEQUEUE_SPECIAL)
  5391				delay = false;
  5392	
  5393			WARN_ON_ONCE(delay && se->sched_delayed);
  5394	
  5395			if (sched_feat(DELAY_DEQUEUE) && delay &&
  5396			    !entity_eligible(cfs_rq, se)) {
  5397				update_load_avg(cfs_rq, se, 0);
  5398				set_delayed(se);
  5399				return false;
  5400			}
  5401		}
  5402	
  5403		if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
  5404			action |= DO_DETACH;
  5405	
  5406		/*
  5407		 * When dequeuing a sched_entity, we must:
  5408		 *   - Update loads to have both entity and cfs_rq synced with now.
  5409		 *   - For group_entity, update its runnable_weight to reflect the new
  5410		 *     h_nr_runnable of its group cfs_rq.
  5411		 *   - Subtract its previous weight from cfs_rq->load.weight.
  5412		 *   - For group entity, update its weight to reflect the new share
  5413		 *     of its group cfs_rq.
  5414		 */
  5415		update_load_avg(cfs_rq, se, action);
  5416		se_update_runnable(se);
  5417	
  5418		update_stats_dequeue_fair(cfs_rq, se, flags);
  5419	
  5420		update_entity_lag(cfs_rq, se);
  5421		if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
  5422			se->deadline -= se->vruntime;
  5423			se->rel_deadline = 1;
  5424		}
  5425	
  5426		if (se != cfs_rq->curr)
  5427			__dequeue_entity(cfs_rq, se);
  5428		se->on_rq = 0;
  5429		account_entity_dequeue(cfs_rq, se);
  5430	
  5431		/* return excess runtime on last dequeue */
  5432		return_cfs_rq_runtime(cfs_rq);
  5433	
  5434		update_cfs_group(se);
  5435	
  5436		/*
  5437		 * Now advance min_vruntime if @se was the entity holding it back,
  5438		 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
  5439		 * put back on, and if we advance min_vruntime, we'll be placed back
  5440		 * further than we started -- i.e. we'll be penalized.
  5441		 */
  5442		if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
  5443			update_min_vruntime(cfs_rq);
  5444	
  5445		if (flags & DEQUEUE_DELAYED)
  5446			finish_delayed_dequeue_entity(se);
  5447	
  5448		if (cfs_rq->nr_queued == 0) {
  5449			update_idle_cfs_rq_clock_pelt(cfs_rq);
  5450	#ifdef CONFIG_CFS_BANDWIDTH
  5451			if (throttled_hierarchy(cfs_rq)) {
  5452				struct rq *rq = rq_of(cfs_rq);
  5453	
  5454				list_del_leaf_cfs_rq(cfs_rq);
  5455				cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> 5456				cfs_rq->pelt_clock_throttled = 1;
  5457			}
  5458	#endif
  5459		}
  5460	
  5461		return true;
  5462	}
  5463	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Wed, Jul 16, 2025 at 07:29:37AM +0800, kernel test robot wrote:
> Hi Aaron,
>
> kernel test robot noticed the following build warnings:
>
> config: i386-buildonly-randconfig-006-20250716 (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/config)
> compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202507160730.0cXkgs0S-lkp@intel.com/
>
> All warnings (new ones prefixed by >>):
>
> >> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
>     5456 |                 cfs_rq->pelt_clock_throttled = 1;
>          |                                              ^ ~

Thanks for the report.

I don't think this will affect correctness since both cfs_rq's throttled
and pelt_clock_throttled fields are used as true(not 0) or false(0). I
used bitfield for them to save some space.

Change their types to either unsigned int or bool should cure this
warning, I suppose bool looks more clear?

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dbe52e18b93a0..434f816a56701 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,8 +737,8 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled:1;
- int pelt_clock_throttled:1;
+ bool throttled:1;
+ bool pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;

Hi LKP,

I tried using clang-19 but couldn't reproduce this warning and I don't
have clang-20 at hand. Can you please apply the above hunk on top of
this series and see if that warning is gone? Thanks.

Best regards,
Aaron

> kernel/sched/fair.c:5971:32: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
>     5971 |                cfs_rq->pelt_clock_throttled = 1;
>          |                                             ^ ~
> kernel/sched/fair.c:6014:20: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
>     6014 |        cfs_rq->throttled = 1;
>          |                          ^ ~
> 3 warnings generated.
On Wed, Jul 16, 2025 at 02:57:07PM +0800, Aaron Lu wrote:
> On Wed, Jul 16, 2025 at 07:29:37AM +0800, kernel test robot wrote:
> > Hi Aaron,
> >
> > kernel test robot noticed the following build warnings:
> >
> > All warnings (new ones prefixed by >>):
> >
> > >> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
> >     5456 |                 cfs_rq->pelt_clock_throttled = 1;
> >          |                                              ^ ~

Nice warning from clang.

> Thanks for the report.
>
> I don't think this will affect correctness since both cfs_rq's throttled
> and pelt_clock_throttled fields are used as true(not 0) or false(0). I
> used bitfield for them to save some space.
>
> Change their types to either unsigned int or bool should cure this
> warning, I suppose bool looks more clear?
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index dbe52e18b93a0..434f816a56701 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -737,8 +737,8 @@ struct cfs_rq {
> u64 throttled_clock_pelt_time;
> u64 throttled_clock_self;
> u64 throttled_clock_self_time;
> - int throttled:1;
> - int pelt_clock_throttled:1;
> + bool throttled:1;
> + bool pelt_clock_throttled:1;
> int throttle_count;
> struct list_head throttled_list;
> struct list_head throttled_csd_list;

Yeah, either this or any unsigned type will do.
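The behaviour under discussion can be reproduced in isolation with a small
userspace sketch (hypothetical struct and field names, not from the patch).
With GCC and clang a plain 'int' bit-field is treated as signed, so a
one-bit field can only represent 0 and -1 and an assignment of 1 is
truncated to -1; truth tests still see a non-zero value, which is why the
warning does not affect correctness, while a 'bool' (or any unsigned)
one-bit field stores 0 and 1 exactly and keeps both clang and sparse quiet.

/*
 * Standalone illustration only; not part of the kernel patch.
 * A signed one-bit bit-field can only hold 0 and -1, so assigning 1
 * stores -1. Tests like 'if (f.throttled)' still evaluate true because
 * -1 is non-zero; a bool bit-field stores 0 and 1 exactly.
 */
#include <stdbool.h>
#include <stdio.h>

struct flags_signed {
	int throttled:1;		/* value range: -1..0 */
};

struct flags_bool {
	bool throttled:1;		/* value range: 0..1 */
};

int main(void)
{
	struct flags_signed s = { .throttled = 1 };	/* stored as -1 */
	struct flags_bool b = { .throttled = 1 };	/* stored as 1 */

	printf("signed int:1 holds %d, bool:1 holds %d\n",
	       s.throttled, b.throttled);

	if (s.throttled)	/* still true: -1 is non-zero */
		printf("truth test unaffected\n");

	return 0;
}

Building this with a recent clang should reproduce the
-Wsingle-bit-bitfield-constant-conversion warning for the signed field
only, which is the same diagnostic the robot reported against fair.c.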
On Wed, Jul 16, 2025 at 02:57:07PM +0800, Aaron Lu wrote:
> On Wed, Jul 16, 2025 at 07:29:37AM +0800, kernel test robot wrote:
> > Hi Aaron,
> >
> > kernel test robot noticed the following build warnings:
> >
> > All warnings (new ones prefixed by >>):
> >
> > >> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
> >     5456 |                 cfs_rq->pelt_clock_throttled = 1;
> >          |                                              ^ ~
>
> Thanks for the report.
>
> I don't think this will affect correctness since both cfs_rq's throttled
> and pelt_clock_throttled fields are used as true(not 0) or false(0). I
> used bitfield for them to save some space.
>
> Change their types to either unsigned int or bool should cure this
> warning, I suppose bool looks more clear?
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index dbe52e18b93a0..434f816a56701 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -737,8 +737,8 @@ struct cfs_rq {
> u64 throttled_clock_pelt_time;
> u64 throttled_clock_self;
> u64 throttled_clock_self_time;
> - int throttled:1;
> - int pelt_clock_throttled:1;
> + bool throttled:1;
> + bool pelt_clock_throttled:1;
> int throttle_count;
> struct list_head throttled_list;
> struct list_head throttled_csd_list;
>
> Hi LKP,
>
> I tried using clang-19 but couldn't reproduce this warning and I don't
> have clang-20 at hand. Can you please apply the above hunk on top of
> this series and see if that warning is gone? Thanks.

Got it, is it possible to give a try for the reproduce step [1], which can
download clang-20 to local dir? If it has issue, we will follow up to check
as early as possible.

[1] https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce

>
> Best regards,
> Aaron
On Wed, Jul 16, 2025 at 03:40:26PM +0800, Philip Li wrote:
> On Wed, Jul 16, 2025 at 02:57:07PM +0800, Aaron Lu wrote:
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index dbe52e18b93a0..434f816a56701 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -737,8 +737,8 @@ struct cfs_rq {
> > u64 throttled_clock_pelt_time;
> > u64 throttled_clock_self;
> > u64 throttled_clock_self_time;
> > - int throttled:1;
> > - int pelt_clock_throttled:1;
> > + bool throttled:1;
> > + bool pelt_clock_throttled:1;
> > int throttle_count;
> > struct list_head throttled_list;
> > struct list_head throttled_csd_list;
> >
> > Hi LKP,
> >
> > I tried using clang-19 but couldn't reproduce this warning and I don't
> > have clang-20 at hand. Can you please apply the above hunk on top of
> > this series and see if that warning is gone? Thanks.
>
> Got it, is it possible to give a try for the reproduce step [1], which can
> download clang-20 to local dir? If it has issue, we will follow up to check
> as early as possible.
I managed to install clang-20 and can see this warning. With the above
hunk applied, the warning is gone.
Here is the updated patch3:
From: Valentin Schneider <vschneid@redhat.com>
Date: Fri, 23 May 2025 15:30:15 +0800
Subject: [PATCH v3 update 3/5] sched/fair: Switch to task based throttle model
In current throttle model, when a cfs_rq is throttled, its entity will
be dequeued from cpu's rq, making tasks attached to it not able to run,
thus achieving the throttle target.
This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it can not run till its task group's
next period comes, which can be a relatively long time. Waiting writer
will have to wait longer due to this and it also makes further readers
build up and eventually trigger task hung.
To improve this situation, change the throttle model to task based, i.e.
when a cfs_rq is throttled, record its throttled status but do not remove
it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
they get picked, add a task work to them so that when they return
to user, they can be dequeued there. In this way, tasks throttled will
not hold any kernel resources. And on unthrottle, enqueue back those
tasks so they can continue to run.
Throttled cfs_rq's PELT clock is handled differently now: previously the
cfs_rq's PELT clock is stopped once it entered throttled state but since
now tasks (in kernel mode) can continue to run, change the behaviour to
stop PELT clock only when the throttled cfs_rq has no tasks left.
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
v3 update:
- fix compiler warning about int bit-field.
kernel/sched/fair.c | 336 ++++++++++++++++++++++---------------------
kernel/sched/pelt.h | 4 +-
kernel/sched/sched.h | 3 +-
3 files changed, 176 insertions(+), 167 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54c2a4df6a5d1..0eeea7f2e693d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5285,18 +5285,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
struct rq *rq = rq_of(cfs_rq);
if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+
+ if (cfs_rq->pelt_clock_throttled) {
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
}
+#endif
}
}
@@ -5335,8 +5340,6 @@ static void set_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable--;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5357,8 +5360,6 @@ static void clear_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable++;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5444,8 +5445,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
return true;
}
@@ -5784,6 +5795,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
p->throttled = true;
resched_curr(rq);
}
@@ -5797,32 +5812,119 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}
+/*
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
+ */
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
+
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
+
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ /*
+ * If the throttled task is enqueued to a throttled cfs_rq,
+ * take the fast path by directly put the task on target
+ * cfs_rq's limbo list, except when p is current because
+ * the following race can cause p's group_node left in rq's
+ * cfs_tasks list when it's throttled:
+ *
+ * cpuX cpuY
+ * taskA ret2user
+ * throttle_cfs_rq_work() sched_move_task(taskA)
+ * task_rq_lock acquired
+ * dequeue_task_fair(taskA)
+ * task_rq_lock released
+ * task_rq_lock acquired
+ * task_current_donor(taskA) == true
+ * task_on_rq_queued(taskA) == true
+ * dequeue_task(taskA)
+ * put_prev_task(taskA)
+ * sched_change_group()
+ * enqueue_task(taskA) -> taskA's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(taskA)
+ * __set_next_task_fair(taskA)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+ * And in the above race case, the task's current cfs_rq is in the same
+ * rq as its previous cfs_rq because sched_move_task() doesn't migrate
+ * task so we can use its current cfs_rq to derive rq and test if the
+ * task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* we can't take the fast path, do an actual enqueue*/
+ p->throttled = false;
+ return false;
+}
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
+
+ if (--cfs_rq->throttle_count)
+ return 0;
- cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
+ if (cfs_rq->pelt_clock_throttled) {
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ cfs_rq->throttled_clock_self = 0;
- cfs_rq->throttled_clock_self = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
- if (WARN_ON_ONCE((s64)delta < 0))
- delta = 0;
+ cfs_rq->throttled_clock_self_time += delta;
+ }
- cfs_rq->throttled_clock_self_time += delta;
- }
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
@@ -5851,17 +5953,25 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ if (cfs_rq->throttle_count++)
+ return 0;
+
+
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+ else {
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
list_del_leaf_cfs_rq(cfs_rq);
-
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
}
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -5869,8 +5979,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5893,68 +6002,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!dequeue)
return false; /* Throttle no longer required. */
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5970,9 +6022,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+ /*
+ * It's possible we are called with !runtime_remaining due to things
+ * like user changed quota setting(see tg_set_cfs_bandwidth()) or async
+ * unthrottled us with a positive runtime_remaining but other still
+ * running entities consumed those runtime before we reached here.
+ *
+ * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+ * because any enqueue in tg_unthrottle_up() will immediately trigger a
+ * throttle, which is not supposed to happen on unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6002,62 +6065,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
}
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6711,6 +6720,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -6903,6 +6914,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6955,10 +6969,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -6980,10 +6990,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7013,7 +7019,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -7068,10 +7073,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
@@ -7108,10 +7109,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
}
sub_nr_running(rq, h_nr_queued);
@@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (unlikely(task_is_throttled(p))) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
@@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
+ bool throttled;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_queued)
return NULL;
+ throttled = false;
+
do {
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f2..f921302dc40fb 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b0c9559992d8a..51d000ab4a70d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,7 +737,8 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled;
+ bool throttled:1;
+ bool pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
--
-- 
2.39.5