[PATCH v4] sched/psi: fix race between file release and pressure write

Edward Adam Davis posted 1 patch 2 months ago
There is a newer version of this series
kernel/cgroup/cgroup.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
[PATCH v4] sched/psi: fix race between file release and pressure write
Posted by Edward Adam Davis 2 months ago
A potential race condition exists between pressure write and cgroup file
release regarding the priv member of struct kernfs_open_file, which
triggers the uaf reported in [1].

Consider the following scenario involving execution on two separate CPUs:

   CPU0					CPU1
   ====					====
					vfs_rmdir()
					kernfs_iop_rmdir()
					cgroup_rmdir()
					cgroup_kn_lock_live()
					cgroup_destroy_locked()
					cgroup_addrm_files()
					cgroup_rm_file()
					kernfs_remove_by_name()
					kernfs_remove_by_name_ns()
 vfs_write()				__kernfs_remove()
 new_sync_write()			kernfs_drain()
 kernfs_fop_write_iter()		kernfs_drain_open_files()
 cgroup_file_write()			kernfs_release_file()
 pressure_write()			cgroup_file_release()
 ctx = of->priv;
					kfree(ctx);
 					of->priv = NULL;
					cgroup_kn_unlock()
 cgroup_kn_lock_live()
 cgroup_get(cgrp)
 cgroup_kn_unlock()
 if (ctx->psi.trigger)  // here, trigger uaf for ctx, that is of->priv

cgroup_rmdir() runs under cgroup_mutex, which also protects the freeing
of of->priv in cgroup_file_release(). However, the accesses to of->priv
in pressure_write() are not entirely covered by cgroup_mutex.
Consequently, if the code in pressure_write() that uses the ctx variable
executes after cgroup_file_release() has completed, a use-after-free
(uaf) of of->priv is triggered.

Therefore, the issue can be resolved by extending the scope of the
cgroup_mutex lock within pressure_write() to encompass all code paths
involving of->priv, thereby properly synchronizing the race condition
occurring between cgroup_file_release() and pressure_write().

And, if an active kn lock can be successfully acquired while executing
the pressure write operation, it indicates that the cgroup deletion
process has not yet reached its final stage; consequently, the priv
pointer within open_file cannot be NULL. Therefore, the operation to
retrieve the ctx value must be moved to a point *after* the active kn
lock has been successfully acquired.

Now that the scope of the cgroup_mutex has been expanded, the original
explicit cgroup_get/put operations are no longer necessary, because
acquiring/releasing the live kn lock inherently executes a cgroup
get/put operation.

[1]
BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
Call Trace:
 pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
 cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
 kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352

Allocated by task 9352:
 cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
 kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
 do_dentry_open+0x83d/0x13e0 fs/open.c:949

Freed by task 9353:
 cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
 kernfs_release_file fs/kernfs/file.c:764 [inline]
 kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
 kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525

Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
v1 -> v2: refactor unlock and update comments
v2 -> v3: remove check for !ctx and update comments
v3 -> v4: remove orig get/put for get cgroup refcnt and update comments

 kernel/cgroup/cgroup.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4ca3cb993da2..c94a16352c33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3995,33 +3995,38 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, enum psi_res res)
 {
-	struct cgroup_file_ctx *ctx = of->priv;
+	struct cgroup_file_ctx *ctx;
 	struct psi_trigger *new;
 	struct cgroup *cgrp;
 	struct psi_group *psi;
+	ssize_t ret = 0;
 
 	cgrp = cgroup_kn_lock_live(of->kn, false);
 	if (!cgrp)
 		return -ENODEV;
 
-	cgroup_get(cgrp);
-	cgroup_kn_unlock(of->kn);
+	/* of->priv can not be NULL, because cgroup is CSS_ONLINE */
+	ctx = of->priv;
 
 	/* Allow only one trigger per file descriptor */
 	if (ctx->psi.trigger) {
-		cgroup_put(cgrp);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto out_unlock;
 	}
 
 	psi = cgroup_psi(cgrp);
 	new = psi_trigger_create(psi, buf, res, of->file, of);
 	if (IS_ERR(new)) {
-		cgroup_put(cgrp);
-		return PTR_ERR(new);
+		ret = PTR_ERR(new);
+		goto out_unlock;
 	}
 
 	smp_store_release(&ctx->psi.trigger, new);
-	cgroup_put(cgrp);
+
+out_unlock:
+	cgroup_kn_unlock(of->kn);
+	if (ret)
+		return ret;
 
 	return nbytes;
 }
-- 
2.43.0
Re: [PATCH v4] sched/psi: fix race between file release and pressure write
Posted by Chen Ridong 2 months ago

On 2026/4/13 10:44, Edward Adam Davis wrote:
> A potential race condition exists between pressure write and cgroup file
> release regarding the priv member of struct kernfs_open_file, which
> triggers the uaf reported in [1].
> 
> Consider the following scenario involving execution on two separate CPUs:
> 
>    CPU0					CPU1
>    ====					====
> 					vfs_rmdir()
> 					kernfs_iop_rmdir()
> 					cgroup_rmdir()
> 					cgroup_kn_lock_live()
> 					cgroup_destroy_locked()
> 					cgroup_addrm_files()
> 					cgroup_rm_file()
> 					kernfs_remove_by_name()
> 					kernfs_remove_by_name_ns()
>  vfs_write()				__kernfs_remove()
>  new_sync_write()			kernfs_drain()
>  kernfs_fop_write_iter()		kernfs_drain_open_files()
>  cgroup_file_write()			kernfs_release_file()
>  pressure_write()			cgroup_file_release()
>  ctx = of->priv;
> 					kfree(ctx);
>  					of->priv = NULL;
> 					cgroup_kn_unlock()
>  cgroup_kn_lock_live()
>  cgroup_get(cgrp)
>  cgroup_kn_unlock()
>  if (ctx->psi.trigger)  // here, trigger uaf for ctx, that is of->priv
> 
> The cgroup_rmdir() is protected by the cgroup_mutex, it also safeguards
> the memory deallocation of of->priv performed within cgroup_file_release().
> However, the operations involving of->priv executed within pressure_write()
> are not entirely covered by the protection of cgroup_mutex. Consequently,
> if the code in pressure_write(), specifically the section handling the
> ctx variable executes after cgroup_file_release() has completed, a uaf
> vulnerability involving of->priv is triggered.
> 
> Therefore, the issue can be resolved by extending the scope of the
> cgroup_mutex lock within pressure_write() to encompass all code paths
> involving of->priv, thereby properly synchronizing the race condition
> occurring between cgroup_file_release() and pressure_write().
> 
> And, if an active kn lock can be successfully acquired while executing
> the pressure write operation, it indicates that the cgroup deletion
> process has not yet reached its final stage; consequently, the priv
> pointer within open_file cannot be NULL. Therefore, the operation to
> retrieve the ctx value must be moved to a point *after* the active kn
> lock has been successfully acquired.
> 
> Now that the scope of the cgroup_mutex has been expanded, the original
> explicit cgroup_get/put operations are no longer necessary, this is
> because acquiring/releasing the live kn lock inherently executes a
> cgroup get/put operation. 
> 
> [1]
> BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
> Call Trace:
>  pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
>  cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
>  kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
> 
> Allocated by task 9352:
>  cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
>  kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
>  do_dentry_open+0x83d/0x13e0 fs/open.c:949
> 
> Freed by task 9353:
>  cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
>  kernfs_release_file fs/kernfs/file.c:764 [inline]
>  kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
>  kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
> 
> Fixes: 0e94682b73bf ("psi: introduce psi monitor")
> Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
> Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Signed-off-by: Edward Adam Davis <eadavis@qq.com>
> ---
> v1 -> v2: refactor unlock and update comments
> v2 -> v3: remove check for !ctx and update comments
> v3 -> v4: remove orig get/put for get cgroup refcnt and update comments
> 
>  kernel/cgroup/cgroup.c | 21 +++++++++++++--------
>  1 file changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 4ca3cb993da2..c94a16352c33 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3995,33 +3995,38 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
>  static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
>  			      size_t nbytes, enum psi_res res)
>  {
> -	struct cgroup_file_ctx *ctx = of->priv;
> +	struct cgroup_file_ctx *ctx;
>  	struct psi_trigger *new;
>  	struct cgroup *cgrp;
>  	struct psi_group *psi;
> +	ssize_t ret = 0;
>  
>  	cgrp = cgroup_kn_lock_live(of->kn, false);
>  	if (!cgrp)
>  		return -ENODEV;
>  
> -	cgroup_get(cgrp);
> -	cgroup_kn_unlock(of->kn);
> +	/* of->priv can not be NULL, because cgroup is CSS_ONLINE */
> +	ctx = of->priv;
>  
>  	/* Allow only one trigger per file descriptor */
>  	if (ctx->psi.trigger) {
> -		cgroup_put(cgrp);
> -		return -EBUSY;
> +		ret = -EBUSY;
> +		goto out_unlock;
>  	}
>  

CPU0: write memory.pressure               CPU1: write cgroup.pressure=0
  ==================================       ==================================

  kernfs_fop_write_iter()
    kernfs_get_active_of(of)
    pressure_write()
      cgroup_kn_lock_live(memory.pressure)
        cgroup_tryget(cgrp)
        kernfs_break_active_protection(kn)
        ... blocks on cgroup_mutex

                                        cgroup_pressure_write()
                                        cgroup_kn_lock_live(cgroup.pressure)
                                        cgroup_file_show(memory.pressure, false)
                                          kernfs_show(false)
                                            kernfs_drain_open_files()
                                              cgroup_file_release(of)
                                                kfree(ctx)
                                                  of->priv = NULL
                                        cgroup_kn_unlock()

      ... acquires cgroup_mutex
      ctx = of->priv;        // may now be NULL
      if (ctx->psi.trigger)  // NULL dereference

IIUC, for rmdir, 'of->priv cannot be NULL' may be true, but for the other path
shown above, it might not be.

>  	psi = cgroup_psi(cgrp);
>  	new = psi_trigger_create(psi, buf, res, of->file, of);
>  	if (IS_ERR(new)) {
> -		cgroup_put(cgrp);
> -		return PTR_ERR(new);
> +		ret = PTR_ERR(new);
> +		goto out_unlock;
>  	}
>  
>  	smp_store_release(&ctx->psi.trigger, new);
> -	cgroup_put(cgrp);
> +
> +out_unlock:
> +	cgroup_kn_unlock(of->kn);
> +	if (ret)
> +		return ret;
>  
>  	return nbytes;
>  }

-- 
Best regards,
Ridong
Re: [PATCH v4] sched/psi: fix race between file release and pressure write
Posted by Edward Adam Davis 2 months ago
On Tue, 14 Apr 2026 10:29:20 +0800, Chen Ridong wrote:
> CPU0: write memory.pressure               CPU1: write cgroup.pressure=0
>   ==================================       ==================================
> 
>   kernfs_fop_write_iter()
>     kernfs_get_active_of(of)
>     pressure_write()
>       cgroup_kn_lock_live(memory.pressure)
>         cgroup_tryget(cgrp)
>         kernfs_break_active_protection(kn)
>         ... blocks on cgroup_mutex
> 
>                                         cgroup_pressure_write()
>                                         cgroup_kn_lock_live(cgroup.pressure)
>                                         cgroup_file_show(memory.pressure, false)
>                                           kernfs_show(false)
>                                             kernfs_drain_open_files()
>                                               cgroup_file_release(of)
>                                                 kfree(ctx)
>                                                   of->priv = NULL
>                                         cgroup_kn_unlock()
> 
>       ... acquires cgroup_mutex
>       ctx = of->priv;        // may now be NULL
>       if (ctx->psi.trigger)  // NULL dereference
> 
> IIUC, for rmdir, 'of->priv cannot be NULL' may be true, but for the other path
> shown above, it might not be.
Marvelous!

Edward
BR
[PATCH v5] sched/psi: fix race between file release and pressure write
Posted by Edward Adam Davis 2 months ago
A potential race condition exists between pressure write and cgroup file
release regarding the priv member of struct kernfs_open_file, which
triggers the uaf reported in [1].

Consider the following scenario involving execution on two separate CPUs:

   CPU0					CPU1
   ====					====
					vfs_rmdir()
					kernfs_iop_rmdir()
					cgroup_rmdir()
					cgroup_kn_lock_live()
					cgroup_destroy_locked()
					cgroup_addrm_files()
					cgroup_rm_file()
					kernfs_remove_by_name()
					kernfs_remove_by_name_ns()
 vfs_write()				__kernfs_remove()
 new_sync_write()			kernfs_drain()
 kernfs_fop_write_iter()		kernfs_drain_open_files()
 cgroup_file_write()			kernfs_release_file()
 pressure_write()			cgroup_file_release()
 ctx = of->priv;
					kfree(ctx);
 					of->priv = NULL;
					cgroup_kn_unlock()
 cgroup_kn_lock_live()
 cgroup_get(cgrp)
 cgroup_kn_unlock()
 if (ctx->psi.trigger)  // here, trigger uaf for ctx, that is of->priv

cgroup_rmdir() runs under cgroup_mutex, which also protects the freeing
of of->priv in cgroup_file_release(). However, the accesses to of->priv
in pressure_write() are not entirely covered by cgroup_mutex.
Consequently, if the code in pressure_write() that uses the ctx variable
executes after cgroup_file_release() has completed, a use-after-free
(uaf) of of->priv is triggered.

Therefore, the issue can be resolved by extending the scope of the
cgroup_mutex lock within pressure_write() to encompass all code paths
involving of->priv, thereby properly synchronizing the race condition
occurring between cgroup_file_release() and pressure_write().

And, if a live kn lock can be successfully acquired while executing
the pressure write operation, it indicates that the cgroup deletion
process has not yet reached its final stage; consequently, on the rmdir
path, the priv pointer within open_file cannot be NULL. Therefore, the
operation to retrieve the ctx value must be moved to a point *after* the
live kn lock has been successfully acquired.

In another situation, specifically after entering cgroup_kn_lock_live()
but before acquiring cgroup_mutex, there exists a different class of
race condition:

CPU0: write memory.pressure               CPU1: write cgroup.pressure=0
===========================		  =============================

kernfs_fop_write_iter()
 kernfs_get_active_of(of)
 pressure_write()
   cgroup_kn_lock_live(memory.pressure)
     cgroup_tryget(cgrp)
     kernfs_break_active_protection(kn)
     ... blocks on cgroup_mutex

                                     	  cgroup_pressure_write()
                                     	  cgroup_kn_lock_live(cgroup.pressure)
                                     	  cgroup_file_show(memory.pressure, false)
                                     	    kernfs_show(false)
                                     	      kernfs_drain_open_files()
                                     	        cgroup_file_release(of)
                                     	          kfree(ctx)
                                     	            of->priv = NULL
                                     	  cgroup_kn_unlock()

   ... acquires cgroup_mutex
   ctx = of->priv;        // may now be NULL
   if (ctx->psi.trigger)  // NULL dereference

Consequently, of->priv may be NULL by the time cgroup_mutex is acquired,
so the pressure write needs to check for this.

Now that the scope of the cgroup_mutex has been expanded, the original
explicit cgroup_get/put operations are no longer necessary, because
acquiring/releasing the live kn lock inherently executes a cgroup
get/put operation.

[1]
BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
Call Trace:
 pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
 cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
 kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352

Allocated by task 9352:
 cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
 kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
 do_dentry_open+0x83d/0x13e0 fs/open.c:949

Freed by task 9353:
 cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
 kernfs_release_file fs/kernfs/file.c:764 [inline]
 kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
 kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525

Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
v1 -> v2: refactor unlock and update comments
v2 -> v3: remove check for !ctx and update comments
v3 -> v4: remove orig get/put for get cgroup refcnt and update comments
v4 -> v5: check !ctx

 kernel/cgroup/cgroup.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4ca3cb993da2..4366fd62eb3d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3995,33 +3995,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, enum psi_res res)
 {
-	struct cgroup_file_ctx *ctx = of->priv;
+	struct cgroup_file_ctx *ctx;
 	struct psi_trigger *new;
 	struct cgroup *cgrp;
 	struct psi_group *psi;
+	ssize_t ret = 0;
 
 	cgrp = cgroup_kn_lock_live(of->kn, false);
 	if (!cgrp)
 		return -ENODEV;
 
-	cgroup_get(cgrp);
-	cgroup_kn_unlock(of->kn);
+	ctx = of->priv;
+	if (!ctx) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
 
 	/* Allow only one trigger per file descriptor */
 	if (ctx->psi.trigger) {
-		cgroup_put(cgrp);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto out_unlock;
 	}
 
 	psi = cgroup_psi(cgrp);
 	new = psi_trigger_create(psi, buf, res, of->file, of);
 	if (IS_ERR(new)) {
-		cgroup_put(cgrp);
-		return PTR_ERR(new);
+		ret = PTR_ERR(new);
+		goto out_unlock;
 	}
 
 	smp_store_release(&ctx->psi.trigger, new);
-	cgroup_put(cgrp);
+
+out_unlock:
+	cgroup_kn_unlock(of->kn);
+	if (ret)
+		return ret;
 
 	return nbytes;
 }
-- 
2.43.0
Re: [PATCH v5] sched/psi: fix race between file release and pressure write
Posted by Tejun Heo 1 month, 4 weeks ago
Hello,

> Edward Adam Davis (1):
>   sched/psi: fix race between file release and pressure write

Applied to cgroup/for-7.1-fixes.

Thanks.

--
tejun
Re: [PATCH v5] sched/psi: fix race between file release and pressure write
Posted by Chen Ridong 2 months ago

On 2026/4/14 14:15, Edward Adam Davis wrote:
> A potential race condition exists between pressure write and cgroup file
> release regarding the priv member of struct kernfs_open_file, which
> triggers the uaf reported in [1].
> 
> Consider the following scenario involving execution on two separate CPUs:
> 
>    CPU0					CPU1
>    ====					====
> 					vfs_rmdir()
> 					kernfs_iop_rmdir()
> 					cgroup_rmdir()
> 					cgroup_kn_lock_live()
> 					cgroup_destroy_locked()
> 					cgroup_addrm_files()
> 					cgroup_rm_file()
> 					kernfs_remove_by_name()
> 					kernfs_remove_by_name_ns()
>  vfs_write()				__kernfs_remove()
>  new_sync_write()			kernfs_drain()
>  kernfs_fop_write_iter()		kernfs_drain_open_files()
>  cgroup_file_write()			kernfs_release_file()
>  pressure_write()			cgroup_file_release()
>  ctx = of->priv;
> 					kfree(ctx);
>  					of->priv = NULL;
> 					cgroup_kn_unlock()
>  cgroup_kn_lock_live()
>  cgroup_get(cgrp)
>  cgroup_kn_unlock()
>  if (ctx->psi.trigger)  // here, trigger uaf for ctx, that is of->priv
> 
> The cgroup_rmdir() is protected by the cgroup_mutex, it also safeguards
> the memory deallocation of of->priv performed within cgroup_file_release().
> However, the operations involving of->priv executed within pressure_write()
> are not entirely covered by the protection of cgroup_mutex. Consequently,
> if the code in pressure_write(), specifically the section handling the
> ctx variable executes after cgroup_file_release() has completed, a uaf
> vulnerability involving of->priv is triggered.
> 
> Therefore, the issue can be resolved by extending the scope of the
> cgroup_mutex lock within pressure_write() to encompass all code paths
> involving of->priv, thereby properly synchronizing the race condition
> occurring between cgroup_file_release() and pressure_write().
> 
> And, if an live kn lock can be successfully acquired while executing
> the pressure write operation, it indicates that the cgroup deletion
> process has not yet reached its final stage; consequently, the priv
> pointer within open_file cannot be NULL. Therefore, the operation to
> retrieve the ctx value must be moved to a point *after* the live kn
> lock has been successfully acquired.
> 
> In another situation, specifically after entering cgroup_kn_lock_live()
> but before acquiring cgroup_mutex, there exists a different class of
> race condition:
> 
> CPU0: write memory.pressure               CPU1: write cgroup.pressure=0
> ===========================		  =============================
> 
> kernfs_fop_write_iter()
>  kernfs_get_active_of(of)
>  pressure_write()
>    cgroup_kn_lock_live(memory.pressure)
>      cgroup_tryget(cgrp)
>      kernfs_break_active_protection(kn)
>      ... blocks on cgroup_mutex
> 
>                                      	  cgroup_pressure_write()
>                                      	  cgroup_kn_lock_live(cgroup.pressure)
>                                      	  cgroup_file_show(memory.pressure, false)
>                                      	    kernfs_show(false)
>                                      	      kernfs_drain_open_files()
>                                      	        cgroup_file_release(of)
>                                      	          kfree(ctx)
>                                      	            of->priv = NULL
>                                      	  cgroup_kn_unlock()
> 
>    ... acquires cgroup_mutex
>    ctx = of->priv;        // may now be NULL
>    if (ctx->psi.trigger)  // NULL dereference
> 
> Consequently, there is a possibility that of->priv is NULL, the pressure
> write needs to check for this.
> 
> Now that the scope of the cgroup_mutex has been expanded, the original
> explicit cgroup_get/put operations are no longer necessary, this is
> because acquiring/releasing the live kn lock inherently executes a
> cgroup get/put operation. 
> 
> [1]
> BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
> Call Trace:
>  pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011
>  cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311
>  kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352
> 
> Allocated by task 9352:
>  cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256
>  kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724
>  do_dentry_open+0x83d/0x13e0 fs/open.c:949
> 
> Freed by task 9353:
>  cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283
>  kernfs_release_file fs/kernfs/file.c:764 [inline]
>  kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834
>  kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525
> 
> Fixes: 0e94682b73bf ("psi: introduce psi monitor")
> Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c
> Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com
> Signed-off-by: Edward Adam Davis <eadavis@qq.com>
> ---
> v1 -> v2: refactor unlock and update comments
> v2 -> v3: remove check for !ctx and update comments
> v3 -> v4: remove orig get/put for get cgroup refcnt and update comments
> v4 -> v5: check !ctx
> 
>  kernel/cgroup/cgroup.c | 24 ++++++++++++++++--------
>  1 file changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 4ca3cb993da2..4366fd62eb3d 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3995,33 +3995,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
>  static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
>  			      size_t nbytes, enum psi_res res)
>  {
> -	struct cgroup_file_ctx *ctx = of->priv;
> +	struct cgroup_file_ctx *ctx;
>  	struct psi_trigger *new;
>  	struct cgroup *cgrp;
>  	struct psi_group *psi;
> +	ssize_t ret = 0;
>  
>  	cgrp = cgroup_kn_lock_live(of->kn, false);
>  	if (!cgrp)
>  		return -ENODEV;
>  
> -	cgroup_get(cgrp);
> -	cgroup_kn_unlock(of->kn);
> +	ctx = of->priv;
> +	if (!ctx) {
> +		ret = -ENODEV;
> +		goto out_unlock;
> +	}
>  
>  	/* Allow only one trigger per file descriptor */
>  	if (ctx->psi.trigger) {
> -		cgroup_put(cgrp);
> -		return -EBUSY;
> +		ret = -EBUSY;
> +		goto out_unlock;
>  	}
>  
>  	psi = cgroup_psi(cgrp);
>  	new = psi_trigger_create(psi, buf, res, of->file, of);
>  	if (IS_ERR(new)) {
> -		cgroup_put(cgrp);
> -		return PTR_ERR(new);
> +		ret = PTR_ERR(new);
> +		goto out_unlock;
>  	}
>  
>  	smp_store_release(&ctx->psi.trigger, new);
> -	cgroup_put(cgrp);
> +
> +out_unlock:
> +	cgroup_kn_unlock(of->kn);
> +	if (ret)
> +		return ret;
>  
>  	return nbytes;
>  }

LGTM.

Thanks.

Reviewed-by: Chen Ridong <chenridong@huaweicloud.com>

-- 
Best regards,
Ridong