Actively push out any task running on a CPU marked as avoid.
If a task is sleeping, it is pushed out when it wakes up on that CPU.

Since the task is running, we need to use the stopper class to push the
task out. Use __balance_push_cpu_stop to achieve that.

This currently works only for CFS and RT.
Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
kernel/sched/core.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 1 +
2 files changed, 45 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13e44d7a0b90..aea4232e3ec4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5577,6 +5577,10 @@ void sched_tick(void)
sched_clock_tick();
+ /* Push the current task out if the CPU is marked as avoid */
+ if (cpu_avoid(cpu))
+ push_current_task(rq);
+
rq_lock(rq, &rf);
donor = rq->donor;
@@ -8028,6 +8032,43 @@ static void balance_hotplug_wait(void)
TASK_UNINTERRUPTIBLE);
}
+static DEFINE_PER_CPU(struct cpu_stop_work, push_task_work);
+
+/* A CPU is marked as avoid when there is contention for the underlying
+ * physical CPU, and using this CPU will lead to hypervisor preemptions.
+ * It is better not to use this CPU.
+ *
+ * In case any task is scheduled on such a CPU, move it out. In
+ * select_fallback_rq a non-avoid CPU will be chosen, and henceforth the
+ * task shouldn't come back to this CPU.
+ */
+void push_current_task(struct rq *rq)
+{
+ struct task_struct *push_task = rq->curr;
+ unsigned long flags;
+
+ /* idle task can't be pushed out */
+ if (rq->curr == rq->idle || !cpu_avoid(rq->cpu))
+ return;
+
+ /* Do this only for SCHED_NORMAL and RT for now */
+ if (push_task->sched_class != &fair_sched_class &&
+ push_task->sched_class != &rt_sched_class)
+ return;
+
+ if (kthread_is_per_cpu(push_task) ||
+ is_migration_disabled(push_task))
+ return;
+
+ local_irq_save(flags);
+ get_task_struct(push_task);
+ preempt_disable();
+
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+ this_cpu_ptr(&push_task_work));
+ preempt_enable();
+ local_irq_restore(flags);
+}
#else /* !CONFIG_HOTPLUG_CPU: */
static inline void balance_push(struct rq *rq)
@@ -8042,6 +8083,9 @@ static inline void balance_hotplug_wait(void)
{
}
+void push_current_task(struct rq *rq)
+{
+}
#endif /* !CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 105190b18020..b9614873762e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1709,6 +1709,7 @@ struct rq_flags {
};
extern struct balance_callback balance_push_callback;
+void push_current_task(struct rq *rq);
#ifdef CONFIG_SCHED_CLASS_EXT
extern const struct sched_class ext_sched_class;
--
2.43.0
Sorry for the delay in responding to the bloat-o-meter report. Since stop_one_cpu_nowait needs protection
against a race, a field needs to be added in rq. So the ifdef check on CONFIG_PARAVIRT makes sense.
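Roughly the direction I have in mind, as a sketch only (the field name
push_task_work_pending matches the snippet further below; its type and exact
placement in struct rq are tentative):

/* In struct rq, under the same CONFIG_PARAVIRT guard as the rest: */
#ifdef CONFIG_PARAVIRT
	/* Set while a push via the stopper is in flight, so the per-cpu
	 * push_task_work is not reused before the stopper has run.
	 * Similar to rq->active_balance. */
	int			push_task_work_pending;
#endif

/* In push_current_task(), before queueing the stopper: */
if (rq->push_task_work_pending)
	return;
rq->push_task_work_pending = 1;

stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
		    this_cpu_ptr(&push_task_work));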
>
> Since the task is running, we need to use the stopper class to push the
> task out. Use __balance_push_cpu_stop to achieve that.
>
> This currently works only for CFS and RT.
>
> Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
> ---
> kernel/sched/core.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 1 +
> 2 files changed, 45 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 13e44d7a0b90..aea4232e3ec4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5577,6 +5577,10 @@ void sched_tick(void)
>
> sched_clock_tick();
>
> + /* Push the current task out if the CPU is marked as avoid */
> + if (cpu_avoid(cpu))
> + push_current_task(rq);
> +
> rq_lock(rq, &rf);
> donor = rq->donor;
>
> @@ -8028,6 +8032,43 @@ static void balance_hotplug_wait(void)
> TASK_UNINTERRUPTIBLE);
> }
>
> +static DEFINE_PER_CPU(struct cpu_stop_work, push_task_work);
> +
> +/* A CPU is marked as avoid when there is contention for the underlying
> + * physical CPU, and using this CPU will lead to hypervisor preemptions.
> + * It is better not to use this CPU.
> + *
> + * In case any task is scheduled on such a CPU, move it out. In
> + * select_fallback_rq a non-avoid CPU will be chosen, and henceforth the
> + * task shouldn't come back to this CPU.
> + */
> +void push_current_task(struct rq *rq)
> +{
> + struct task_struct *push_task = rq->curr;
> + unsigned long flags;
> +
> + /* idle task can't be pushed out */
> + if (rq->curr == rq->idle || !cpu_avoid(rq->cpu))
> + return;
> +
> + /* Do this only for SCHED_NORMAL and RT for now */
> + if (push_task->sched_class != &fair_sched_class &&
> + push_task->sched_class != &rt_sched_class)
> + return;
> +
> + if (kthread_is_per_cpu(push_task) ||
> + is_migration_disabled(push_task))
> + return;
> +
> + local_irq_save(flags);
> + get_task_struct(push_task);
> + preempt_disable();
> +
> + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
> + this_cpu_ptr(&push_task_work));
Doing a perf record occasionally caused a crash. This happens because stop_one_cpu_nowait
expects the callers to synchronize, and push_task_work should be untouched until the stopper executes.
So I had to do something similar to what's done in active_balance:
add a field in rq and set/unset it accordingly.

Using this field in __balance_push_cpu_stop is also hacky. I have to do something like the below:

	if (rq->balance_callback != &balance_push_callback)
		rq->push_task_work_pending = 0;

or I have to copy __balance_push_cpu_stop and do the above.
After this, it makes sense to put all of this under CONFIG_PARAVIRT.

(Also, I did explore using the stop_one_cpu variant, getting to it by scheduling a work item and then
executing it in preemptible context. That occasionally ends up in a deadlock; due to some issues at my end,
I haven't debugged it further. It remains a backup option for the nowait variant.)
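If I end up copying __balance_push_cpu_stop instead, the stopper side could
look roughly like this (sketch only; __push_task_cpu_stop is a hypothetical
copy of the existing __balance_push_cpu_stop with the flag clear added):

static int __push_task_cpu_stop(void *arg)
{
	struct task_struct *p = arg;
	struct rq *rq = this_rq();
	struct rq_flags rf;
	int cpu;

	raw_spin_lock_irq(&p->pi_lock);
	rq_lock(rq, &rf);

	update_rq_clock(rq);

	/* The pending request is now being serviced; clear the flag while
	 * this is still the source rq, before __migrate_task() can switch
	 * rq over to the destination runqueue. */
	rq->push_task_work_pending = 0;

	if (task_rq(p) == rq && task_on_rq_queued(p)) {
		cpu = select_fallback_rq(rq->cpu, p);
		rq = __migrate_task(rq, &rf, p, cpu);
	}

	rq_unlock(rq, &rf);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}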
> + preempt_enable();
> + local_irq_restore(flags);
> +}
> #else /* !CONFIG_HOTPLUG_CPU: */
>
> static inline void balance_push(struct rq *rq)
> @@ -8042,6 +8083,9 @@ static inline void balance_hotplug_wait(void)
> {
> }
>
> +void push_current_task(struct rq *rq)
> +{
> +}
> #endif /* !CONFIG_HOTPLUG_CPU */
>
> void set_rq_online(struct rq *rq)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 105190b18020..b9614873762e 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1709,6 +1709,7 @@ struct rq_flags {
> };
>
> extern struct balance_callback balance_push_callback;
> +void push_current_task(struct rq *rq);
>
> #ifdef CONFIG_SCHED_CLASS_EXT
> extern const struct sched_class ext_sched_class;
Hopefully I should be able to send out v3 soon addressing the comments.
Name-wise, I am going to keep cpu_paravirt_mask and cpu_paravirt(cpu).
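Something along these lines, as a sketch by analogy with
cpu_online_mask/cpu_online() (the actual v3 declarations may differ):

extern struct cpumask __cpu_paravirt_mask;
#define cpu_paravirt_mask ((const struct cpumask *)&__cpu_paravirt_mask)

static inline bool cpu_paravirt(unsigned int cpu)
{
	return cpumask_test_cpu(cpu, cpu_paravirt_mask);
}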