We remove the cfs_rq throttled_csd_list entry *before* doing the
unthrottle. The problem with that is that destroy_bandwidth() does a
lockless scan of the system for any non-empty CSD lists. As a result,
it is possible that destroy_bandwidth() returns while we still have a
cfs_rq from the task group about to be unthrottled.
For full correctness, we should avoid removal from the list until after
we're done unthrottling in __cfsb_csd_unthrottle().
For consistency, we make the same change to distribute_cfs_runtime(),
even though this should already be safe due to destroy_bandwidth()
cancelling the bandwidth hrtimers.
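
To make the race concrete, the scan in question has roughly the following
shape (a condensed sketch of destroy_cfs_bandwidth(), assuming the current
mainline structure; not the verbatim code):

	/*
	 * Sketch: destroy_cfs_bandwidth() waits for any pending async
	 * unthrottle work, but it checks each CPU's CSD list without
	 * taking that rq's lock.
	 */
	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
	{
		int i;

		hrtimer_cancel(&cfs_b->period_timer);
		hrtimer_cancel(&cfs_b->slack_timer);

		for_each_possible_cpu(i) {
			struct rq *rq = cpu_rq(i);

			/*
			 * Lockless check: if __cfsb_csd_unthrottle() has
			 * already done list_del_init() but has not yet
			 * finished the unthrottle, the list looks empty
			 * here and we return with an unthrottle still in
			 * flight.
			 */
			if (list_empty(&rq->cfsb_csd_list))
				continue;

			/* ... flush the pending CSD work ... */
		}
	}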
Signed-off-by: Josh Don <joshdon@google.com>
---
kernel/sched/fair.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 34fe6e9490c2..78f542ab03cf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5917,10 +5917,10 @@ static void __cfsb_csd_unthrottle(void *arg)
 
 	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
 				 throttled_csd_list) {
-		list_del_init(&cursor->throttled_csd_list);
-
 		if (cfs_rq_throttled(cursor))
 			unthrottle_cfs_rq(cursor);
+
+		list_del_init(&cursor->throttled_csd_list);
 	}
 
 	rcu_read_unlock();
@@ -6034,11 +6034,11 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
 		rq_lock_irqsave(rq, &rf);
 
-		list_del_init(&cfs_rq->throttled_csd_list);
-
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 
+		list_del_init(&cfs_rq->throttled_csd_list);
+
 		rq_unlock_irqrestore(rq, &rf);
 	}
 	SCHED_WARN_ON(!list_empty(&local_unthrottle));
--
2.48.1.502.g6dc24dfdaf-goog
…
> For full correctness, we should avoid removal from the list until after
> we're done unthrottling in __cfsb_csd_unthrottle().
…

What do you think about adding tags (like “Fixes” and “Cc”) accordingly?
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.14-rc3#n145

Regards,
Markus
On Wed, Feb 19, 2025 at 12:11 PM Markus Elfring <Markus.Elfring@web.de> wrote:
>
> …
> > For full correctness, we should avoid removal from the list until after
> > we're done unthrottling in __cfsb_csd_unthrottle().
> …
>
> What do you think about adding tags (like “Fixes” and “Cc”) accordingly?
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.14-rc3#n145

Thanks, yeah, I'll resend with the Fixes tag and with all the additional
review tags collected.

Best,
Josh
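For reference, the tag format described in the linked document looks like
the following (the commit id below is only a placeholder, not the actual
offending commit, which would be the one that introduced the async
CSD-list unthrottle path):

	Fixes: 123456789abc ("subsystem: subject of the commit that introduced the bug")
	Cc: stable@vger.kernel.org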
On 2025/2/11 03:51, Josh Don wrote:
> We remove the cfs_rq throttled_csd_list entry *before* doing the
> unthrottle. The problem with that is that destroy_bandwidth() does a
> lockless scan of the system for any non-empty CSD lists. As a result,
> it is possible that destroy_bandwidth() returns while we still have a
> cfs_rq from the task group about to be unthrottled.
>
> For full correctness, we should avoid removal from the list until after
> we're done unthrottling in __cfsb_csd_unthrottle().
>
> For consistency, we make the same change to distribute_cfs_runtime(),
> even though this should already be safe due to destroy_bandwidth()
> cancelling the bandwidth hrtimers.
>
> Signed-off-by: Josh Don <joshdon@google.com>
Good catch!
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
BTW, I just drew the cfs_rq UAF race out as below:
CPU0                                        CPU1
__cfsb_csd_unthrottle()
  rq lock
  for each cfs_rq on list
    list_del_init from list
                                            unregister_fair_sched_group()
                                              destroy_cfs_bandwidth()
                                                if (list_empty(&rq->cfsb_csd_list))
                                                  continue; // skip rq0
                                              if (cfs_rq->on_list) // maybe false
    unthrottle_cfs_rq()
      add cfs_rq to list
  rq unlock
                                            cfs_rq freed after RCU grace period
cfs_rq UAF!
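
In code terms, the CPU1 side of the diagram corresponds roughly to the
following (a condensed sketch of unregister_fair_sched_group(), assuming
the mainline structure; not the verbatim code):

	static void unregister_fair_sched_group(struct task_group *tg)
	{
		int cpu;

		/* Can return while an unthrottle is still in flight: */
		destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

		for_each_possible_cpu(cpu) {
			struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

			/*
			 * If CPU0 has not yet re-added cfs_rq to the leaf
			 * list via unthrottle_cfs_rq(), on_list is still
			 * false and the removal below is skipped, so the
			 * soon-to-be-freed cfs_rq stays reachable from
			 * rq->leaf_cfs_rq_list.
			 */
			if (!cfs_rq->on_list)
				continue;

			/* rq lock, list_del_leaf_cfs_rq(cfs_rq), unlock */
		}
	}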
Thanks!
> ---
> kernel/sched/fair.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 34fe6e9490c2..78f542ab03cf 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5917,10 +5917,10 @@ static void __cfsb_csd_unthrottle(void *arg)
>  
>  	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
>  				 throttled_csd_list) {
> -		list_del_init(&cursor->throttled_csd_list);
> -
>  		if (cfs_rq_throttled(cursor))
>  			unthrottle_cfs_rq(cursor);
> +
> +		list_del_init(&cursor->throttled_csd_list);
>  	}
>  
>  	rcu_read_unlock();
> @@ -6034,11 +6034,11 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
>  
>  		rq_lock_irqsave(rq, &rf);
>  
> -		list_del_init(&cfs_rq->throttled_csd_list);
> -
>  		if (cfs_rq_throttled(cfs_rq))
>  			unthrottle_cfs_rq(cfs_rq);
>  
> +		list_del_init(&cfs_rq->throttled_csd_list);
> +
>  		rq_unlock_irqrestore(rq, &rf);
>  	}
>  	SCHED_WARN_ON(!list_empty(&local_unthrottle));
Hello Josh,
On 2/11/2025 1:21 AM, Josh Don wrote:
> We remove the cfs_rq throttled_csd_list entry *before* doing the
> unthrottle. The problem with that is that destroy_bandwidth() does a
> lockless scan of the system for any non-empty CSD lists. As a result,
> it is possible that destroy_bandwidth() returns while we still have a
> cfs_rq from the task group about to be unthrottled.
>
> For full correctness, we should avoid removal from the list until after
> we're done unthrottling in __cfsb_csd_unthrottle().
>
> For consistency, we make the same change to distribute_cfs_runtime(),
> even though this should already be safe due to destroy_bandwidth()
> cancelling the bandwidth hrtimers.
>
> Signed-off-by: Josh Don <joshdon@google.com>
Other than a small nit (s/destroy_bandwidth/destroy_cfs_bandwidth/g),
please feel free to add:
Reviewed-and-tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
--
Thanks and Regards,
Prateek
> ---
> kernel/sched/fair.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 34fe6e9490c2..78f542ab03cf 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5917,10 +5917,10 @@ static void __cfsb_csd_unthrottle(void *arg)
>  
>  	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
>  				 throttled_csd_list) {
> -		list_del_init(&cursor->throttled_csd_list);
> -
>  		if (cfs_rq_throttled(cursor))
>  			unthrottle_cfs_rq(cursor);
> +
> +		list_del_init(&cursor->throttled_csd_list);
>  	}
>  
>  	rcu_read_unlock();
> @@ -6034,11 +6034,11 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
>  
>  		rq_lock_irqsave(rq, &rf);
>  
> -		list_del_init(&cfs_rq->throttled_csd_list);
> -
>  		if (cfs_rq_throttled(cfs_rq))
>  			unthrottle_cfs_rq(cfs_rq);
>  
> +		list_del_init(&cfs_rq->throttled_csd_list);
> +
>  		rq_unlock_irqrestore(rq, &rf);
>  	}
>  	SCHED_WARN_ON(!list_empty(&local_unthrottle));