kernel/cgroup/cgroup.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-)
A potential race condition exists between pressure write and cgroup file
release regarding the priv member of struct kernfs_open_file, which
triggers the uaf reported in [1].
Consider the following scenario involving execution on two separate CPUs:
CPU0                                    CPU1
====                                    ====
                                        vfs_rmdir()
                                        kernfs_iop_rmdir()
                                        cgroup_rmdir()
                                        cgroup_kn_lock_live()
                                        cgroup_destroy_locked()
                                        cgroup_addrm_files()
                                        cgroup_rm_file()
                                        kernfs_remove_by_name()
                                        kernfs_remove_by_name_ns()
vfs_write()                             __kernfs_remove()
new_sync_write()                        kernfs_drain()
kernfs_fop_write_iter()                 kernfs_drain_open_files()
cgroup_file_write()                     kernfs_release_file()
pressure_write()                        cgroup_file_release()
ctx = of->priv;
                                        kfree(ctx);
                                        of->priv = NULL;
                                        cgroup_kn_unlock()
cgroup_kn_lock_live()
cgroup_get(cgrp)
cgroup_kn_unlock()
if (ctx->psi.trigger) // here, trigger uaf for ctx, that is of->priv
cgroup_rmdir() is protected by the cgroup_mutex, which also safeguards
the memory deallocation of of->priv performed within cgroup_file_release().
However, the operations involving of->priv executed within pressure_write()
are not entirely covered by the protection of cgroup_mutex. Consequently,
if the code in pressure_write(), specifically the section handling the
ctx variable executes after cgroup_file_release() has completed, a uaf
vulnerability involving of->priv is triggered.
Therefore, the issue can be resolved by extending the scope of the
cgroup_mutex lock within pressure_write() to encompass all code paths
involving of->priv, thereby properly synchronizing the race condition
occurring between cgroup_file_release() and pressure_write().
[1]
BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
Call Trace:
pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
Allocated by task 9352:
cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
do_dentry_open+0x83d/0x13e0 fs/open.c:949
Freed by task 9353:
cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
kernfs_release_file fs/kernfs/file.c:764 [inline]
kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
v1 -> v2: refactor unlock and update comments
kernel/cgroup/cgroup.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4ca3cb993da2..46db30de817b 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3995,34 +3995,47 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
+ ctx = of->priv;
+ if (!ctx) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
+
return nbytes;
}
--
2.43.0
Hello,
On Fri, Apr 10, 2026 at 08:39:45PM +0800, Edward Adam Davis wrote:
> static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
> size_t nbytes, enum psi_res res)
> {
> - struct cgroup_file_ctx *ctx = of->priv;
> + struct cgroup_file_ctx *ctx;
> struct psi_trigger *new;
> struct cgroup *cgrp;
> struct psi_group *psi;
> + ssize_t ret = 0;
>
> cgrp = cgroup_kn_lock_live(of->kn, false);
> if (!cgrp)
> return -ENODEV;
>
> + ctx = of->priv;
> + if (!ctx) {
This test likely isn't necessary but that's pre-existing.
> + ret = -ENODEV;
> + goto out_unlock;
> + }
> +
> cgroup_get(cgrp);
We don't need get/put if we don't drop the mutex, right?
Thanks.
--
tejun
On Fri, 10 Apr 2026 09:14:05 -1000, Tejun Heo wrote:
> > static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
> > size_t nbytes, enum psi_res res)
> > {
> > - struct cgroup_file_ctx *ctx = of->priv;
> > + struct cgroup_file_ctx *ctx;
> > struct psi_trigger *new;
> > struct cgroup *cgrp;
> > struct psi_group *psi;
> > + ssize_t ret = 0;
> >
> > cgrp = cgroup_kn_lock_live(of->kn, false);
> > if (!cgrp)
> > return -ENODEV;
> >
> > + ctx = of->priv;
> > + if (!ctx) {
>
> This test likely isn't necessary but that's pre-existing.
Where?
Are you referring to the check for of->released within:
'kernfs_fop_write_iter()->kernfs_get_active_of()'? This check is not
performed under the protection of the cgroup_mutex; consequently, it
is susceptible to race conditions, rendering the value unreliable, as
it could be updated at any moment.
>
> > + ret = -ENODEV;
> > + goto out_unlock;
> > + }
> > +
> > cgroup_get(cgrp);
>
> We don't need get/put if we don't drop the mutex, right?
I believe that is indeed the case; the cgroup_get() call here is intended
to facilitate subsequent operations, such as executing an smp store.
Edward
BR
On 2026/4/11 12:25, Edward Adam Davis wrote:
> On Fri, 10 Apr 2026 09:14:05 -1000, Tejun Heo wrote:
>>> static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
>>> size_t nbytes, enum psi_res res)
>>> {
>>> - struct cgroup_file_ctx *ctx = of->priv;
>>> + struct cgroup_file_ctx *ctx;
>>> struct psi_trigger *new;
>>> struct cgroup *cgrp;
>>> struct psi_group *psi;
>>> + ssize_t ret = 0;
>>>
>>> cgrp = cgroup_kn_lock_live(of->kn, false);
>>> if (!cgrp)
>>> return -ENODEV;
>>>
>>> + ctx = of->priv;
>>> + if (!ctx) {
>>
>> This test likely isn't necessary but that's pre-existing.
> Where?
> Are you referring to the check for of->released within:
> 'kernfs_fop_write_iter()->kernfs_get_active_of()'? This check is not
> performed under the protection of the cgroup_mutex; consequently, it
> is susceptible to race conditions, rendering the value unreliable, as
> it could be updated at any moment.
>>
>>> + ret = -ENODEV;
>>> + goto out_unlock;
>>> + }
>>> +
>>> cgroup_get(cgrp);
>>
>> We don't need get/put if we don't drop the mutex, right?
> I believe that is indeed the case; the cgroup_get() call here is intended
> to facilitate subsequent operations, such as executing an smp store.
>
Sorry, I don’t quite understand why get/put is needed. Could you elaborate a bit
more?
--
Best regards,
Ridong
On Mon, 13 Apr 2026 09:51:19 +0800, Chen Ridong wrote:
>On 2026/4/11 12:25, Edward Adam Davis wrote:
>> On Fri, 10 Apr 2026 09:14:05 -1000, Tejun Heo wrote:
>>>> static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
>>>> size_t nbytes, enum psi_res res)
>>>> {
>>>> - struct cgroup_file_ctx *ctx = of->priv;
>>>> + struct cgroup_file_ctx *ctx;
>>>> struct psi_trigger *new;
>>>> struct cgroup *cgrp;
>>>> struct psi_group *psi;
>>>> + ssize_t ret = 0;
>>>>
>>>> cgrp = cgroup_kn_lock_live(of->kn, false);
>>>> if (!cgrp)
>>>> return -ENODEV;
>>>>
>>>> + ctx = of->priv;
>>>> + if (!ctx) {
>>>
>>> This test likely isn't necessary but that's pre-existing.
>> Where?
>> Are you referring to the check for of->released within:
>> 'kernfs_fop_write_iter()->kernfs_get_active_of()'? This check is not
>> performed under the protection of the cgroup_mutex; consequently, it
>> is susceptible to race conditions, rendering the value unreliable, as
>> it could be updated at any moment.
>>>
>>>> + ret = -ENODEV;
>>>> + goto out_unlock;
>>>> + }
>>>> +
>>>> cgroup_get(cgrp);
>>>
>>> We don't need get/put if we don't drop the mutex, right?
>> I believe that is indeed the case; the cgroup_get() call here is intended
>> to facilitate subsequent operations, such as executing an smp store.
>>
>
>Sorry, I don't quite understand why get/put is needed. Could you elaborate a bit
>more?
Oh, I had overlooked the fact that acquiring the live kn lock simultaneously
performs a cgroup_get; therefore, after extending the scope of the mutex,
the original explicit get/put operations are no longer necessary. I will
issue a new version of the patch shortly to address this specific point.
Edward
BR
A potential race condition exists between pressure write and cgroup file
release regarding the priv member of struct kernfs_open_file, which
triggers the uaf reported in [1].
Consider the following scenario involving execution on two separate CPUs:
CPU0 CPU1
==== ====
vfs_rmdir()
kernfs_iop_rmdir()
cgroup_rmdir()
cgroup_kn_lock_live()
cgroup_destroy_locked()
cgroup_addrm_files()
cgroup_rm_file()
kernfs_remove_by_name()
kernfs_remove_by_name_ns()
vfs_write() __kernfs_remove()
new_sync_write() kernfs_drain()
kernfs_fop_write_iter() kernfs_drain_open_files()
cgroup_file_write() kernfs_release_file()
pressure_write() cgroup_file_release()
ctx = of->priv;
kfree(ctx);
of->priv = NULL;
cgroup_kn_unlock()
cgroup_kn_lock_live()
cgroup_get(cgrp)
cgroup_kn_unlock()
if (ctx->psi.trigger) // here, trigger uaf for ctx, that is of->priv
The cgroup_rmdir() is protected by the cgroup_mutex, it also safeguards
the memory deallocation of of->priv performed within cgroup_file_release().
However, the operations involving of->priv executed within pressure_write()
are not entirely covered by the protection of cgroup_mutex. Consequently,
if the code in pressure_write(), specifically the section handling the
ctx variable executes after cgroup_file_release() has completed, a uaf
vulnerability involving of->priv is triggered.
Therefore, the issue can be resolved by extending the scope of the
cgroup_mutex lock within pressure_write() to encompass all code paths
involving of->priv, thereby properly synchronizing the race condition
occurring between cgroup_file_release() and pressure_write().
And, if an active kn lock can be successfully acquired while executing
the pressure write operation, it indicates that the cgroup deletion
process has not yet reached its final stage; consequently, the priv
pointer within open_file cannot be NULL. Therefore, the operation to
retrieve the ctx value must be moved to a point *after* the active kn
lock has been successfully acquired.
Now that the scope of the cgroup_mutex has been expanded, the original
explicit cgroup_get/put operations are no longer necessary, this is
because acquiring/releasing the live kn lock inherently executes a
cgroup get/put operation.
[1]
BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
Call Trace:
pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
Allocated by task 9352:
cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
do_dentry_open+0x83d/0x13e0 fs/open.c:949
Freed by task 9353:
cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
kernfs_release_file fs/kernfs/file.c:764 [inline]
kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
v1 -> v2: refactor unlock and update comments
v2 -> v3: remove check for !ctx and update comments
v3 -> v4: remove the original explicit cgroup_get()/cgroup_put() calls and update comments
kernel/cgroup/cgroup.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4ca3cb993da2..c94a16352c33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3995,33 +3995,38 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
+ /* of->priv can not be NULL, because cgroup is CSS_ONLINE */
+ ctx = of->priv;
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
- cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
- cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
- cgroup_put(cgrp);
+
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
return nbytes;
}
--
2.43.0
On 2026/4/13 10:44, Edward Adam Davis wrote:
> A potential race condition exists between pressure write and cgroup file
> release regarding the priv member of struct kernfs_open_file, which
> triggers the uaf reported in [1].
>
> Consider the following scenario involving execution on two separate CPUs:
>
> CPU0 CPU1
> ==== ====
> vfs_rmdir()
> kernfs_iop_rmdir()
> cgroup_rmdir()
> cgroup_kn_lock_live()
> cgroup_destroy_locked()
> cgroup_addrm_files()
> cgroup_rm_file()
> kernfs_remove_by_name()
> kernfs_remove_by_name_ns()
> vfs_write() __kernfs_remove()
> new_sync_write() kernfs_drain()
> kernfs_fop_write_iter() kernfs_drain_open_files()
> cgroup_file_write() kernfs_release_file()
> pressure_write() cgroup_file_release()
> ctx = of->priv;
> kfree(ctx);
> of->priv = NULL;
> cgroup_kn_unlock()
> cgroup_kn_lock_live()
> cgroup_get(cgrp)
> cgroup_kn_unlock()
> if (ctx->psi.trigger) // here, trigger uaf for ctx, that is of->priv
>
> The cgroup_rmdir() is protected by the cgroup_mutex, it also safeguards
> the memory deallocation of of->priv performed within cgroup_file_release().
> However, the operations involving of->priv executed within pressure_write()
> are not entirely covered by the protection of cgroup_mutex. Consequently,
> if the code in pressure_write(), specifically the section handling the
> ctx variable executes after cgroup_file_release() has completed, a uaf
> vulnerability involving of->priv is triggered.
>
> Therefore, the issue can be resolved by extending the scope of the
> cgroup_mutex lock within pressure_write() to encompass all code paths
> involving of->priv, thereby properly synchronizing the race condition
> occurring between cgroup_file_release() and pressure_write().
>
> And, if an active kn lock can be successfully acquired while executing
> the pressure write operation, it indicates that the cgroup deletion
> process has not yet reached its final stage; consequently, the priv
> pointer within open_file cannot be NULL. Therefore, the operation to
> retrieve the ctx value must be moved to a point *after* the active kn
> lock has been successfully acquired.
>
> Now that the scope of the cgroup_mutex has been expanded, the original
> explicit cgroup_get/put operations are no longer necessary, this is
> because acquiring/releasing the live kn lock inherently executes a
> cgroup get/put operation.
>
> [1]
> BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
> Call Trace:
> pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
> cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
> kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
>
> Allocated by task 9352:
> cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
> kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
> do_dentry_open+0x83d/0x13e0 fs/open.c:949
>
> Freed by task 9353:
> cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
> kernfs_release_file fs/kernfs/file.c:764 [inline]
> kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
> kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
>
> Fixes: 0e94682b73bf ("psi: introduce psi monitor")
> Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
> Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Signed-off-by: Edward Adam Davis <eadavis@qq.com>
> ---
> v1 -> v2: refactor unlock and update comments
> v2 -> v3: remove check for !ctx and update comments
> v3 -> v4: remove orig get/put for get cgroup refcnt and update comments
>
> kernel/cgroup/cgroup.c | 21 +++++++++++++--------
> 1 file changed, 13 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 4ca3cb993da2..c94a16352c33 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3995,33 +3995,38 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
> static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
> size_t nbytes, enum psi_res res)
> {
> - struct cgroup_file_ctx *ctx = of->priv;
> + struct cgroup_file_ctx *ctx;
> struct psi_trigger *new;
> struct cgroup *cgrp;
> struct psi_group *psi;
> + ssize_t ret = 0;
>
> cgrp = cgroup_kn_lock_live(of->kn, false);
> if (!cgrp)
> return -ENODEV;
>
> - cgroup_get(cgrp);
> - cgroup_kn_unlock(of->kn);
> + /* of->priv can not be NULL, because cgroup is CSS_ONLINE */
> + ctx = of->priv;
>
> /* Allow only one trigger per file descriptor */
> if (ctx->psi.trigger) {
> - cgroup_put(cgrp);
> - return -EBUSY;
> + ret = -EBUSY;
> + goto out_unlock;
> }
>
CPU0: write memory.pressure              CPU1: write cgroup.pressure=0
==================================       ==================================
kernfs_fop_write_iter()
kernfs_get_active_of(of)
pressure_write()
cgroup_kn_lock_live(memory.pressure)
cgroup_tryget(cgrp)
kernfs_break_active_protection(kn)
... blocks on cgroup_mutex
                                         cgroup_pressure_write()
                                         cgroup_kn_lock_live(cgroup.pressure)
                                         cgroup_file_show(memory.pressure, false)
                                         kernfs_show(false)
                                         kernfs_drain_open_files()
                                         cgroup_file_release(of)
                                         kfree(ctx)
                                         of->priv = NULL
                                         cgroup_kn_unlock()
... acquires cgroup_mutex
ctx = of->priv; // may now be NULL
if (ctx->psi.trigger) // NULL dereference
IIUC, for rmdir, 'of->priv cannot be NULL' may be true, but for the other path
shown above, it might not be.
> psi = cgroup_psi(cgrp);
> new = psi_trigger_create(psi, buf, res, of->file, of);
> if (IS_ERR(new)) {
> - cgroup_put(cgrp);
> - return PTR_ERR(new);
> + ret = PTR_ERR(new);
> + goto out_unlock;
> }
>
> smp_store_release(&ctx->psi.trigger, new);
> - cgroup_put(cgrp);
> +
> +out_unlock:
> + cgroup_kn_unlock(of->kn);
> + if (ret)
> + return ret;
>
> return nbytes;
> }
--
Best regards,
Ridong
On Tue, 14 Apr 2026 10:29:20 +0800, Chen Ridong wrote:
> CPU0: write memory.pressure              CPU1: write cgroup.pressure=0
> ==================================       ==================================
>
> kernfs_fop_write_iter()
> kernfs_get_active_of(of)
> pressure_write()
> cgroup_kn_lock_live(memory.pressure)
> cgroup_tryget(cgrp)
> kernfs_break_active_protection(kn)
> ... blocks on cgroup_mutex
>
>                                          cgroup_pressure_write()
>                                          cgroup_kn_lock_live(cgroup.pressure)
>                                          cgroup_file_show(memory.pressure, false)
>                                          kernfs_show(false)
>                                          kernfs_drain_open_files()
>                                          cgroup_file_release(of)
>                                          kfree(ctx)
>                                          of->priv = NULL
>                                          cgroup_kn_unlock()
>
> ... acquires cgroup_mutex
> ctx = of->priv; // may now be NULL
> if (ctx->psi.trigger) // NULL dereference
>
> IIUC, for rmdir, 'of->priv cannot be NULL' may be true, but for the other patch
> shown above, it might not be.
Marvelous!
Edward
BR
A potential race condition exists between pressure write and cgroup file
release regarding the priv member of struct kernfs_open_file, which
triggers the uaf reported in [1].
Consider the following scenario involving execution on two separate CPUs:
CPU0 CPU1
==== ====
vfs_rmdir()
kernfs_iop_rmdir()
cgroup_rmdir()
cgroup_kn_lock_live()
cgroup_destroy_locked()
cgroup_addrm_files()
cgroup_rm_file()
kernfs_remove_by_name()
kernfs_remove_by_name_ns()
vfs_write() __kernfs_remove()
new_sync_write() kernfs_drain()
kernfs_fop_write_iter() kernfs_drain_open_files()
cgroup_file_write() kernfs_release_file()
pressure_write() cgroup_file_release()
ctx = of->priv;
kfree(ctx);
of->priv = NULL;
cgroup_kn_unlock()
cgroup_kn_lock_live()
cgroup_get(cgrp)
cgroup_kn_unlock()
if (ctx->psi.trigger) // here, trigger uaf for ctx, that is of->priv
The cgroup_rmdir() is protected by the cgroup_mutex, it also safeguards
the memory deallocation of of->priv performed within cgroup_file_release().
However, the operations involving of->priv executed within pressure_write()
are not entirely covered by the protection of cgroup_mutex. Consequently,
if the code in pressure_write(), specifically the section handling the
ctx variable executes after cgroup_file_release() has completed, a uaf
vulnerability involving of->priv is triggered.
Therefore, the issue can be resolved by extending the scope of the
cgroup_mutex lock within pressure_write() to encompass all code paths
involving of->priv, thereby properly synchronizing the race condition
occurring between cgroup_file_release() and pressure_write().
And, if a live kn lock can be successfully acquired while executing
the pressure write operation, it indicates that the cgroup deletion
process has not yet reached its final stage; consequently, in the rmdir
scenario above, the priv pointer within open_file cannot be NULL.
Therefore, the operation to retrieve the ctx value must be moved to a
point *after* the live kn lock has been successfully acquired.
In another situation, specifically after entering cgroup_kn_lock_live()
but before acquiring cgroup_mutex, there exists a different class of
race condition:
CPU0: write memory.pressure CPU1: write cgroup.pressure=0
=========================== =============================
kernfs_fop_write_iter()
kernfs_get_active_of(of)
pressure_write()
cgroup_kn_lock_live(memory.pressure)
cgroup_tryget(cgrp)
kernfs_break_active_protection(kn)
... blocks on cgroup_mutex
cgroup_pressure_write()
cgroup_kn_lock_live(cgroup.pressure)
cgroup_file_show(memory.pressure, false)
kernfs_show(false)
kernfs_drain_open_files()
cgroup_file_release(of)
kfree(ctx)
of->priv = NULL
cgroup_kn_unlock()
... acquires cgroup_mutex
ctx = of->priv; // may now be NULL
if (ctx->psi.trigger) // NULL dereference
Consequently, of->priv may be NULL by the time cgroup_mutex is acquired,
so pressure_write() needs to check for this before dereferencing it.
Now that the scope of the cgroup_mutex has been expanded, the original
explicit cgroup_get/put operations are no longer necessary, because
acquiring/releasing the live kn lock inherently executes a
cgroup get/put operation.
[1]
BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
Call Trace:
pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
Allocated by task 9352:
cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
do_dentry_open+0x83d/0x13e0 fs/open.c:949
Freed by task 9353:
cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
kernfs_release_file fs/kernfs/file.c:764 [inline]
kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
v1 -> v2: refactor unlock and update comments
v2 -> v3: remove check for !ctx and update comments
v3 -> v4: remove orig get/put for get cgroup refcnt and update comments
v4 -> v5: check !ctx
kernel/cgroup/cgroup.c | 24 ++++++++++++++++--------
1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4ca3cb993da2..4366fd62eb3d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3995,33 +3995,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
+ ctx = of->priv;
+ if (!ctx) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
- cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
- cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
- cgroup_put(cgrp);
+
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
return nbytes;
}
--
2.43.0
Hello,
> Edward Adam Davis (1):
> sched/psi: fix race between file release and pressure write
Applied to cgroup/for-7.1-fixes.
Thanks.
--
tejun
On 2026/4/14 14:15, Edward Adam Davis wrote:
> A potential race condition exists between pressure write and cgroup file
> release regarding the priv member of struct kernfs_open_file, which
> triggers the uaf reported in [1].
>
> Consider the following scenario involving execution on two separate CPUs:
>
> CPU0 CPU1
> ==== ====
> vfs_rmdir()
> kernfs_iop_rmdir()
> cgroup_rmdir()
> cgroup_kn_lock_live()
> cgroup_destroy_locked()
> cgroup_addrm_files()
> cgroup_rm_file()
> kernfs_remove_by_name()
> kernfs_remove_by_name_ns()
> vfs_write() __kernfs_remove()
> new_sync_write() kernfs_drain()
> kernfs_fop_write_iter() kernfs_drain_open_files()
> cgroup_file_write() kernfs_release_file()
> pressure_write() cgroup_file_release()
> ctx = of->priv;
> kfree(ctx);
> of->priv = NULL;
> cgroup_kn_unlock()
> cgroup_kn_lock_live()
> cgroup_get(cgrp)
> cgroup_kn_unlock()
> if (ctx->psi.trigger) // here, trigger uaf for ctx, that is of->priv
>
> The cgroup_rmdir() is protected by the cgroup_mutex, it also safeguards
> the memory deallocation of of->priv performed within cgroup_file_release().
> However, the operations involving of->priv executed within pressure_write()
> are not entirely covered by the protection of cgroup_mutex. Consequently,
> if the code in pressure_write(), specifically the section handling the
> ctx variable executes after cgroup_file_release() has completed, a uaf
> vulnerability involving of->priv is triggered.
>
> Therefore, the issue can be resolved by extending the scope of the
> cgroup_mutex lock within pressure_write() to encompass all code paths
> involving of->priv, thereby properly synchronizing the race condition
> occurring between cgroup_file_release() and pressure_write().
>
> And, if an live kn lock can be successfully acquired while executing
> the pressure write operation, it indicates that the cgroup deletion
> process has not yet reached its final stage; consequently, the priv
> pointer within open_file cannot be NULL. Therefore, the operation to
> retrieve the ctx value must be moved to a point *after* the live kn
> lock has been successfully acquired.
>
> In another situation, specifically after entering cgroup_kn_lock_live()
> but before acquiring cgroup_mutex, there exists a different class of
> race condition:
>
> CPU0: write memory.pressure CPU1: write cgroup.pressure=0
> =========================== =============================
>
> kernfs_fop_write_iter()
> kernfs_get_active_of(of)
> pressure_write()
> cgroup_kn_lock_live(memory.pressure)
> cgroup_tryget(cgrp)
> kernfs_break_active_protection(kn)
> ... blocks on cgroup_mutex
>
> cgroup_pressure_write()
> cgroup_kn_lock_live(cgroup.pressure)
> cgroup_file_show(memory.pressure, false)
> kernfs_show(false)
> kernfs_drain_open_files()
> cgroup_file_release(of)
> kfree(ctx)
> of->priv = NULL
> cgroup_kn_unlock()
>
> ... acquires cgroup_mutex
> ctx = of->priv; // may now be NULL
> if (ctx->psi.trigger) // NULL dereference
>
> Consequently, there is a possibility that of->priv is NULL, the pressure
> write needs to check for this.
>
> Now that the scope of the cgroup_mutex has been expanded, the original
> explicit cgroup_get/put operations are no longer necessary, this is
> because acquiring/releasing the live kn lock inherently executes a
> cgroup get/put operation.
>
> [1]
> BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
> Call Trace:
> pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
> cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
> kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
>
> Allocated by task 9352:
> cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
> kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
> do_dentry_open+0x83d/0x13e0 fs/open.c:949
>
> Freed by task 9353:
> cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
> kernfs_release_file fs/kernfs/file.c:764 [inline]
> kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
> kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
>
> Fixes: 0e94682b73bf ("psi: introduce psi monitor")
> Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
> Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Signed-off-by: Edward Adam Davis <eadavis@qq.com>
> ---
> v1 -> v2: refactor unlock and update comments
> v2 -> v3: remove check for !ctx and update comments
> v3 -> v4: remove orig get/put for get cgroup refcnt and update comments
> v4 -> v5: check !ctx
>
> kernel/cgroup/cgroup.c | 24 ++++++++++++++++--------
> 1 file changed, 16 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 4ca3cb993da2..4366fd62eb3d 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3995,33 +3995,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
> static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
> size_t nbytes, enum psi_res res)
> {
> - struct cgroup_file_ctx *ctx = of->priv;
> + struct cgroup_file_ctx *ctx;
> struct psi_trigger *new;
> struct cgroup *cgrp;
> struct psi_group *psi;
> + ssize_t ret = 0;
>
> cgrp = cgroup_kn_lock_live(of->kn, false);
> if (!cgrp)
> return -ENODEV;
>
> - cgroup_get(cgrp);
> - cgroup_kn_unlock(of->kn);
> + ctx = of->priv;
> + if (!ctx) {
> + ret = -ENODEV;
> + goto out_unlock;
> + }
>
> /* Allow only one trigger per file descriptor */
> if (ctx->psi.trigger) {
> - cgroup_put(cgrp);
> - return -EBUSY;
> + ret = -EBUSY;
> + goto out_unlock;
> }
>
> psi = cgroup_psi(cgrp);
> new = psi_trigger_create(psi, buf, res, of->file, of);
> if (IS_ERR(new)) {
> - cgroup_put(cgrp);
> - return PTR_ERR(new);
> + ret = PTR_ERR(new);
> + goto out_unlock;
> }
>
> smp_store_release(&ctx->psi.trigger, new);
> - cgroup_put(cgrp);
> +
> +out_unlock:
> + cgroup_kn_unlock(of->kn);
> + if (ret)
> + return ret;
>
> return nbytes;
> }
LGTM.
Thanks.
Reviewed-by: Chen Ridong <chenridong@huaweicloud.com>
--
Best regards,
Ridong
Hello, Edward.
On Sat, Apr 11, 2026 at 12:25:47PM +0800, Edward Adam Davis wrote:
> > > + ctx = of->priv;
> > > + if (!ctx) {
> >
> > This test likely isn't necessary but that's pre-existing.
> Where?
> Are you referring to the check for of->released within:
No, I'm talking about of->priv. I don't think it can be NULL while a live
cgroup kn is locked, can it?
Thanks.
--
tejun
On Fri, 10 Apr 2026 21:39:49 -1000, Tejun Heo wrote:
> > > > + ctx = of->priv;
> > > > + if (!ctx) {
> > >
> > > This test likely isn't necessary but that's pre-existing.
> > Where?
> > Are you referring to the check for of->released within:
>
> No, I'm talking about of->priv. I don't think it can be NULL while a live
> cgroup kn is locked, can it?
If the lock is acquired before the execution of cgroup_file_release()
completes, it will not be NULL; however, if acquired afterwards, it
will invariably be NULL.
Edward
BR
Hello,
On Sat, Apr 11, 2026 at 04:29:22PM +0800, Edward Adam Davis wrote:
> On Fri, 10 Apr 2026 21:39:49 -1000, Tejun Heo wrote:
> > > > > + ctx = of->priv;
> > > > > + if (!ctx) {
> > > >
> > > > This test likely isn't necessary but that's pre-existing.
> > > Where?
> > > Are you referring to the check for of->released within:
> >
> > No, I'm talking about of->priv. I don't think it can be NULL while a live
> > cgroup kn is locked, can it?
>
> If the lock is acquired before the execution of cgroup_file_release()
> completes, it will not be NULL; however, if acquired afterwards, it
> will invariably be NULL.
Hmmm? While the write is in flight the file can't be released and the cgroup
couldn't have been dead if lock_live succeeded. This part is tangential
anyway. Let's ignore for now.
Thanks.
--
tejun
On Sat, 11 Apr 2026 10:40:13 -1000, Tejun Heo wrote:
> On Sat, Apr 11, 2026 at 04:29:22PM +0800, Edward Adam Davis wrote:
> > On Fri, 10 Apr 2026 21:39:49 -1000, Tejun Heo wrote:
> > > > > > + ctx = of->priv;
> > > > > > + if (!ctx) {
> > > > >
> > > > > This test likely isn't necessary but that's pre-existing.
> > > > Where?
> > > > Are you referring to the check for of->released within:
> > >
> > > No, I'm talking about of->priv. I don't think it can be NULL while a live
> > > cgroup kn is locked, can it?
> >
> > If the lock is acquired before the execution of cgroup_file_release()
> > completes, it will not be NULL; however, if acquired afterwards, it
> > will invariably be NULL.
>
> Hmmm? While the write is in flight the file can't be released and the cgroup
> couldn't have been dead if lock_live succeeded. This part is tangential
> anyway. Let's ignore for now.
I have once again walked through the entire workflow for the cgroup
deletion operation. Indeed, if the active kn lock can be successfully
acquired while executing pressure write, it indicates that the cgroup
deletion process has not yet reached its final stage; therefore, the
`priv` pointer within open_file cannot possibly be NULL.
I will submit the third version of the patch shortly.
Edward
BR
A potential race condition exists between pressure write and cgroup file
release regarding the priv member of struct kernfs_open_file, which
triggers the use-after-free (UAF) reported in [1].
Consider the following scenario involving execution on two separate CPUs:
CPU0                            CPU1
====                            ====
                                vfs_rmdir()
                                kernfs_iop_rmdir()
                                cgroup_rmdir()
                                cgroup_kn_lock_live()
                                cgroup_destroy_locked()
                                cgroup_addrm_files()
                                cgroup_rm_file()
                                kernfs_remove_by_name()
                                kernfs_remove_by_name_ns()
vfs_write()                     __kernfs_remove()
new_sync_write()                kernfs_drain()
kernfs_fop_write_iter()         kernfs_drain_open_files()
cgroup_file_write()             kernfs_release_file()
pressure_write()                cgroup_file_release()
ctx = of->priv;
                                kfree(ctx);
                                of->priv = NULL;
                                cgroup_kn_unlock()
cgroup_kn_lock_live()
cgroup_get(cgrp)
cgroup_kn_unlock()
if (ctx->psi.trigger)  // here, trigger uaf for ctx, that is of->priv
cgroup_rmdir() is protected by cgroup_mutex, which also safeguards the
freeing of of->priv performed within cgroup_file_release(). However, the
operations involving of->priv executed within pressure_write() are not
entirely covered by the protection of cgroup_mutex. Consequently, if the
code in pressure_write() that handles the ctx variable executes after
cgroup_file_release() has completed, a use-after-free of of->priv is
triggered.
Therefore, the issue can be resolved by extending the scope of the
cgroup_mutex critical section within pressure_write() to encompass all
code paths involving of->priv, thereby closing the race between
cgroup_file_release() and pressure_write().
In addition, if the active kn lock can be successfully acquired while
executing the pressure write operation, the cgroup deletion process has
not yet reached its final stage; consequently, the priv pointer within
open_file cannot be NULL. The ctx value must therefore be read from
of->priv only *after* the active kn lock has been successfully acquired.
[1]
BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
Call Trace:
pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
Allocated by task 9352:
cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
do_dentry_open+0x83d/0x13e0 fs/open.c:949
Freed by task 9353:
cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
kernfs_release_file fs/kernfs/file.c:764 [inline]
kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
v1 -> v2: refactor unlock and update comments
v2 -> v3: remove check for !ctx and update comments
kernel/cgroup/cgroup.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4ca3cb993da2..1d89fab82850 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3995,34 +3995,43 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
+ /* of->priv can not be NULL, because cgroup is CSS_ONLINE */
+ ctx = of->priv;
cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
+
return nbytes;
}
--
2.43.0
© 2016 - 2026 Red Hat, Inc.