[v1] nfsd: put dl_stid if fail to queue dl_recall

[PATCH] nfsd: put dl_stid if fail to queue dl_recall

Posted by Li Lingfeng 11 months, 4 weeks ago

Before calling nfsd4_run_cb to queue dl_recall to the callback_wq, we
increment the reference count of dl_stid.
We expect that after the corresponding work_struct is processed, the
reference count of dl_stid will be decremented through the callback
function nfsd4_cb_recall_release.
However, if the call to nfsd4_run_cb fails, the incremented reference
count of dl_stid will not be decremented correspondingly, leading to the
following nfs4_stid leak:
unreferenced object 0xffff88812067b578 (size 344):
  comm "nfsd", pid 2761, jiffies 4295044002 (age 5541.241s)
  hex dump (first 32 bytes):
    01 00 00 00 6b 6b 6b 6b b8 02 c0 e2 81 88 ff ff  ....kkkk........
    00 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  .kkkkkkk.....N..
  backtrace:
    kmem_cache_alloc+0x4b9/0x700
    nfsd4_process_open1+0x34/0x300
    nfsd4_open+0x2d1/0x9d0
    nfsd4_proc_compound+0x7a2/0xe30
    nfsd_dispatch+0x241/0x3e0
    svc_process_common+0x5d3/0xcc0
    svc_process+0x2a3/0x320
    nfsd+0x180/0x2e0
    kthread+0x199/0x1d0
    ret_from_fork+0x30/0x50
    ret_from_fork_asm+0x1b/0x30
unreferenced object 0xffff8881499f4d28 (size 368):
  comm "nfsd", pid 2761, jiffies 4295044005 (age 5541.239s)
  hex dump (first 32 bytes):
    01 00 00 00 00 00 00 00 30 4d 9f 49 81 88 ff ff  ........0M.I....
    30 4d 9f 49 81 88 ff ff 20 00 00 00 01 00 00 00  0M.I.... .......
  backtrace:
    kmem_cache_alloc+0x4b9/0x700
    nfs4_alloc_stid+0x29/0x210
    alloc_init_deleg+0x92/0x2e0
    nfs4_set_delegation+0x284/0xc00
    nfs4_open_delegation+0x216/0x3f0
    nfsd4_process_open2+0x2b3/0xee0
    nfsd4_open+0x770/0x9d0
    nfsd4_proc_compound+0x7a2/0xe30
    nfsd_dispatch+0x241/0x3e0
    svc_process_common+0x5d3/0xcc0
    svc_process+0x2a3/0x320
    nfsd+0x180/0x2e0
    kthread+0x199/0x1d0
    ret_from_fork+0x30/0x50
    ret_from_fork_asm+0x1b/0x30
Fix it by checking the result of nfsd4_run_cb and call nfs4_put_stid if
fail to queue dl_recall.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
---
 fs/nfsd/nfs4state.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 153eeea2c7c9..0ccb87be47b7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5414,6 +5414,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
 
 static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 {
+	bool queued;
 	/*
 	 * We're assuming the state code never drops its reference
 	 * without first removing the lease.  Since we're in this lease
@@ -5422,7 +5423,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	 * we know it's safe to take a reference.
 	 */
 	refcount_inc(&dp->dl_stid.sc_count);
-	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
+	queued = nfsd4_run_cb(&dp->dl_recall);
+	WARN_ON_ONCE(!queued);
+	if (!queued)
+		nfs4_put_stid(&dp->dl_stid);
 }
 
 /* Called from break_lease() with flc_lock held. */
-- 
2.31.1

Re: [PATCH] nfsd: put dl_stid if fail to queue dl_recall

Posted by Jeff Layton 11 months, 4 weeks ago

On Thu, 2025-02-13 at 15:25 +0800, Li Lingfeng wrote:
> Before calling nfsd4_run_cb to queue dl_recall to the callback_wq, we
> increment the reference count of dl_stid.
> We expect that after the corresponding work_struct is processed, the
> reference count of dl_stid will be decremented through the callback
> function nfsd4_cb_recall_release.
> However, if the call to nfsd4_run_cb fails, the incremented reference
> count of dl_stid will not be decremented correspondingly, leading to the
> following nfs4_stid leak:
> unreferenced object 0xffff88812067b578 (size 344):
>   comm "nfsd", pid 2761, jiffies 4295044002 (age 5541.241s)
>   hex dump (first 32 bytes):
>     01 00 00 00 6b 6b 6b 6b b8 02 c0 e2 81 88 ff ff  ....kkkk........
>     00 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  .kkkkkkk.....N..
>   backtrace:
>     kmem_cache_alloc+0x4b9/0x700
>     nfsd4_process_open1+0x34/0x300
>     nfsd4_open+0x2d1/0x9d0
>     nfsd4_proc_compound+0x7a2/0xe30
>     nfsd_dispatch+0x241/0x3e0
>     svc_process_common+0x5d3/0xcc0
>     svc_process+0x2a3/0x320
>     nfsd+0x180/0x2e0
>     kthread+0x199/0x1d0
>     ret_from_fork+0x30/0x50
>     ret_from_fork_asm+0x1b/0x30
> unreferenced object 0xffff8881499f4d28 (size 368):
>   comm "nfsd", pid 2761, jiffies 4295044005 (age 5541.239s)
>   hex dump (first 32 bytes):
>     01 00 00 00 00 00 00 00 30 4d 9f 49 81 88 ff ff  ........0M.I....
>     30 4d 9f 49 81 88 ff ff 20 00 00 00 01 00 00 00  0M.I.... .......
>   backtrace:
>     kmem_cache_alloc+0x4b9/0x700
>     nfs4_alloc_stid+0x29/0x210
>     alloc_init_deleg+0x92/0x2e0
>     nfs4_set_delegation+0x284/0xc00
>     nfs4_open_delegation+0x216/0x3f0
>     nfsd4_process_open2+0x2b3/0xee0
>     nfsd4_open+0x770/0x9d0
>     nfsd4_proc_compound+0x7a2/0xe30
>     nfsd_dispatch+0x241/0x3e0
>     svc_process_common+0x5d3/0xcc0
>     svc_process+0x2a3/0x320
>     nfsd+0x180/0x2e0
>     kthread+0x199/0x1d0
>     ret_from_fork+0x30/0x50
>     ret_from_fork_asm+0x1b/0x30
> Fix it by checking the result of nfsd4_run_cb and call nfs4_put_stid if
> fail to queue dl_recall.
> 
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
> ---
>  fs/nfsd/nfs4state.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 153eeea2c7c9..0ccb87be47b7 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -5414,6 +5414,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
>  
>  static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
>  {
> +	bool queued;
>  	/*
>  	 * We're assuming the state code never drops its reference
>  	 * without first removing the lease.  Since we're in this lease
> @@ -5422,7 +5423,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
>  	 * we know it's safe to take a reference.
>  	 */
>  	refcount_inc(&dp->dl_stid.sc_count);
> -	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
> +	queued = nfsd4_run_cb(&dp->dl_recall);
> +	WARN_ON_ONCE(!queued);
> +	if (!queued)
> +		nfs4_put_stid(&dp->dl_stid);
>  }
>  
>  /* Called from break_lease() with flc_lock held. */


Have you actually seen the WARN_ON_ONCE() pop under normal usage, or
was the problem you reproduced done via fault injection?

Unfortunately, this won't work. nfsd_break_one_deleg() is called from
the ->break_lease callback with the flc->flc_lock held, and
nfs4_put_stid can sleep in the sc_free callbacks.
-- 
Jeff Layton <jlayton@kernel.org>

Re: [PATCH] nfsd: put dl_stid if fail to queue dl_recall

Posted by Li Lingfeng 11 months, 4 weeks ago

在 2025/2/13 19:49, Jeff Layton 写道:
> On Thu, 2025-02-13 at 15:25 +0800, Li Lingfeng wrote:
>> Before calling nfsd4_run_cb to queue dl_recall to the callback_wq, we
>> increment the reference count of dl_stid.
>> We expect that after the corresponding work_struct is processed, the
>> reference count of dl_stid will be decremented through the callback
>> function nfsd4_cb_recall_release.
>> However, if the call to nfsd4_run_cb fails, the incremented reference
>> count of dl_stid will not be decremented correspondingly, leading to the
>> following nfs4_stid leak:
>> unreferenced object 0xffff88812067b578 (size 344):
>>    comm "nfsd", pid 2761, jiffies 4295044002 (age 5541.241s)
>>    hex dump (first 32 bytes):
>>      01 00 00 00 6b 6b 6b 6b b8 02 c0 e2 81 88 ff ff  ....kkkk........
>>      00 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  .kkkkkkk.....N..
>>    backtrace:
>>      kmem_cache_alloc+0x4b9/0x700
>>      nfsd4_process_open1+0x34/0x300
>>      nfsd4_open+0x2d1/0x9d0
>>      nfsd4_proc_compound+0x7a2/0xe30
>>      nfsd_dispatch+0x241/0x3e0
>>      svc_process_common+0x5d3/0xcc0
>>      svc_process+0x2a3/0x320
>>      nfsd+0x180/0x2e0
>>      kthread+0x199/0x1d0
>>      ret_from_fork+0x30/0x50
>>      ret_from_fork_asm+0x1b/0x30
>> unreferenced object 0xffff8881499f4d28 (size 368):
>>    comm "nfsd", pid 2761, jiffies 4295044005 (age 5541.239s)
>>    hex dump (first 32 bytes):
>>      01 00 00 00 00 00 00 00 30 4d 9f 49 81 88 ff ff  ........0M.I....
>>      30 4d 9f 49 81 88 ff ff 20 00 00 00 01 00 00 00  0M.I.... .......
>>    backtrace:
>>      kmem_cache_alloc+0x4b9/0x700
>>      nfs4_alloc_stid+0x29/0x210
>>      alloc_init_deleg+0x92/0x2e0
>>      nfs4_set_delegation+0x284/0xc00
>>      nfs4_open_delegation+0x216/0x3f0
>>      nfsd4_process_open2+0x2b3/0xee0
>>      nfsd4_open+0x770/0x9d0
>>      nfsd4_proc_compound+0x7a2/0xe30
>>      nfsd_dispatch+0x241/0x3e0
>>      svc_process_common+0x5d3/0xcc0
>>      svc_process+0x2a3/0x320
>>      nfsd+0x180/0x2e0
>>      kthread+0x199/0x1d0
>>      ret_from_fork+0x30/0x50
>>      ret_from_fork_asm+0x1b/0x30
>> Fix it by checking the result of nfsd4_run_cb and call nfs4_put_stid if
>> fail to queue dl_recall.
>>
>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>> Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
>> ---
>>   fs/nfsd/nfs4state.c | 6 +++++-
>>   1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index 153eeea2c7c9..0ccb87be47b7 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -5414,6 +5414,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
>>   
>>   static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
>>   {
>> +	bool queued;
>>   	/*
>>   	 * We're assuming the state code never drops its reference
>>   	 * without first removing the lease.  Since we're in this lease
>> @@ -5422,7 +5423,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
>>   	 * we know it's safe to take a reference.
>>   	 */
>>   	refcount_inc(&dp->dl_stid.sc_count);
>> -	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
>> +	queued = nfsd4_run_cb(&dp->dl_recall);
>> +	WARN_ON_ONCE(!queued);
>> +	if (!queued)
>> +		nfs4_put_stid(&dp->dl_stid);
>>   }
>>   
>>   /* Called from break_lease() with flc_lock held. */
>
> Have you actually seen the WARN_ON_ONCE() pop under normal usage, or
> was the problem you reproduced done via fault injection?

Hi,

I add mdelay for cb_work before processing it and add msleep in 
nfs_do_return_delegation to reproduce it.

--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1188,6 +1188,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct 
nfs4_file *fp,
      dp->dl_recalled = false;
      nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
                &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+    dp->dl_recall.cb_work.debug_flag = 0x1234;
+    printk("%s init work func %px\n", __func__, 
dp->dl_recall.cb_work.func);
      get_nfs4_file(fp);
      dp->dl_stid.sc_file = fp;
      return dp;

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2708,6 +2708,12 @@ static void process_scheduled_works(struct worker 
*worker)
              worker->pool->watchdog_ts = jiffies;
              first = false;
          }
+        if (work->debug_flag == 0x1234) {
+            printk("%s func %px\n", __func__, work->func);
+            printk("%s delay before clear pending...\n", __func__);
+            mdelay(10 * 1000);
+            printk("%s delay done\n", __func__);
+        }
          process_one_work(worker, work);
      }
  }

--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -263,6 +263,15 @@ static int nfs_do_return_delegation(struct inode 
*inode, struct nfs_delegation *
      const struct cred *cred;
      int res = 0;

+    printk("sleep before deleg return...\n");
+    while (1) {
+        ifdebug(PROC)
+            msleep(10 * 1000);
+        else
+            break;
+    }
+    printk("sleep done\n");
+
      if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
          spin_lock(&delegation->lock);
          cred = get_cred(delegation->cred);

> Unfortunately, this won't work. nfsd_break_one_deleg() is called from
> the ->break_lease callback with the flc->flc_lock held, and
> nfs4_put_stid can sleep in the sc_free callbacks.

Are you referring to nfs4_free_deleg? I didn't quite see where it might 
sleep. Could you please explain it to me?

Thanks.

Re: [PATCH] nfsd: put dl_stid if fail to queue dl_recall

Posted by Jeff Layton 11 months, 4 weeks ago

On Thu, 2025-02-13 at 20:28 +0800, Li Lingfeng wrote:
> 在 2025/2/13 19:49, Jeff Layton 写道:
> > On Thu, 2025-02-13 at 15:25 +0800, Li Lingfeng wrote:
> > > Before calling nfsd4_run_cb to queue dl_recall to the callback_wq, we
> > > increment the reference count of dl_stid.
> > > We expect that after the corresponding work_struct is processed, the
> > > reference count of dl_stid will be decremented through the callback
> > > function nfsd4_cb_recall_release.
> > > However, if the call to nfsd4_run_cb fails, the incremented reference
> > > count of dl_stid will not be decremented correspondingly, leading to the
> > > following nfs4_stid leak:
> > > unreferenced object 0xffff88812067b578 (size 344):
> > >    comm "nfsd", pid 2761, jiffies 4295044002 (age 5541.241s)
> > >    hex dump (first 32 bytes):
> > >      01 00 00 00 6b 6b 6b 6b b8 02 c0 e2 81 88 ff ff  ....kkkk........
> > >      00 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  .kkkkkkk.....N..
> > >    backtrace:
> > >      kmem_cache_alloc+0x4b9/0x700
> > >      nfsd4_process_open1+0x34/0x300
> > >      nfsd4_open+0x2d1/0x9d0
> > >      nfsd4_proc_compound+0x7a2/0xe30
> > >      nfsd_dispatch+0x241/0x3e0
> > >      svc_process_common+0x5d3/0xcc0
> > >      svc_process+0x2a3/0x320
> > >      nfsd+0x180/0x2e0
> > >      kthread+0x199/0x1d0
> > >      ret_from_fork+0x30/0x50
> > >      ret_from_fork_asm+0x1b/0x30
> > > unreferenced object 0xffff8881499f4d28 (size 368):
> > >    comm "nfsd", pid 2761, jiffies 4295044005 (age 5541.239s)
> > >    hex dump (first 32 bytes):
> > >      01 00 00 00 00 00 00 00 30 4d 9f 49 81 88 ff ff  ........0M.I....
> > >      30 4d 9f 49 81 88 ff ff 20 00 00 00 01 00 00 00  0M.I.... .......
> > >    backtrace:
> > >      kmem_cache_alloc+0x4b9/0x700
> > >      nfs4_alloc_stid+0x29/0x210
> > >      alloc_init_deleg+0x92/0x2e0
> > >      nfs4_set_delegation+0x284/0xc00
> > >      nfs4_open_delegation+0x216/0x3f0
> > >      nfsd4_process_open2+0x2b3/0xee0
> > >      nfsd4_open+0x770/0x9d0
> > >      nfsd4_proc_compound+0x7a2/0xe30
> > >      nfsd_dispatch+0x241/0x3e0
> > >      svc_process_common+0x5d3/0xcc0
> > >      svc_process+0x2a3/0x320
> > >      nfsd+0x180/0x2e0
> > >      kthread+0x199/0x1d0
> > >      ret_from_fork+0x30/0x50
> > >      ret_from_fork_asm+0x1b/0x30
> > > Fix it by checking the result of nfsd4_run_cb and call nfs4_put_stid if
> > > fail to queue dl_recall.
> > > 
> > > Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> > > Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
> > > ---
> > >   fs/nfsd/nfs4state.c | 6 +++++-
> > >   1 file changed, 5 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index 153eeea2c7c9..0ccb87be47b7 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -5414,6 +5414,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
> > >   
> > >   static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
> > >   {
> > > +	bool queued;
> > >   	/*
> > >   	 * We're assuming the state code never drops its reference
> > >   	 * without first removing the lease.  Since we're in this lease
> > > @@ -5422,7 +5423,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
> > >   	 * we know it's safe to take a reference.
> > >   	 */
> > >   	refcount_inc(&dp->dl_stid.sc_count);
> > > -	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
> > > +	queued = nfsd4_run_cb(&dp->dl_recall);
> > > +	WARN_ON_ONCE(!queued);
> > > +	if (!queued)
> > > +		nfs4_put_stid(&dp->dl_stid);
> > >   }
> > >   
> > >   /* Called from break_lease() with flc_lock held. */
> > 
> > Have you actually seen the WARN_ON_ONCE() pop under normal usage, or
> > was the problem you reproduced done via fault injection?
> 
> Hi,
> 
> I add mdelay for cb_work before processing it and add msleep in 
> nfs_do_return_delegation to reproduce it.
> 
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -1188,6 +1188,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct 
> nfs4_file *fp,
>       dp->dl_recalled = false;
>       nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
>                 &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
> +    dp->dl_recall.cb_work.debug_flag = 0x1234;
> +    printk("%s init work func %px\n", __func__, 
> dp->dl_recall.cb_work.func);
>       get_nfs4_file(fp);
>       dp->dl_stid.sc_file = fp;
>       return dp;
> 
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -2708,6 +2708,12 @@ static void process_scheduled_works(struct worker 
> *worker)
>               worker->pool->watchdog_ts = jiffies;
>               first = false;
>           }
> +        if (work->debug_flag == 0x1234) {
> +            printk("%s func %px\n", __func__, work->func);
> +            printk("%s delay before clear pending...\n", __func__);
> +            mdelay(10 * 1000);
> +            printk("%s delay done\n", __func__);
> +        }
>           process_one_work(worker, work);
>       }
>   }
> 
> --- a/fs/nfs/delegation.c
> +++ b/fs/nfs/delegation.c
> @@ -263,6 +263,15 @@ static int nfs_do_return_delegation(struct inode 
> *inode, struct nfs_delegation *
>       const struct cred *cred;
>       int res = 0;
> 
> +    printk("sleep before deleg return...\n");
> +    while (1) {
> +        ifdebug(PROC)
> +            msleep(10 * 1000);
> +        else
> +            break;
> +    }
> +    printk("sleep done\n");
> +
>       if (!test_bit(NFS_DELEGATION_REVOKED, &delegation-flags)) {
>           spin_lock(&delegation->lock);
>           cred = get_cred(delegation->cred);
> 
> > Unfortunately, this won't work. nfsd_break_one_deleg() is called from
> > the ->break_lease callback with the flc->flc_lock held, and
> > nfs4_put_stid can sleep in the sc_free callbacks.
> 
> Are you referring to nfs4_free_deleg? I didn't quite see where it might 
> sleep. Could you please explain it to me?
> 

I stand corrected. nfs4_free_deleg() won't sleep, but
nfs4_free_lock_stateid() can. In this case, you know that this is a
deleg stateid, so this should actually be safe after all.

If you do this though, please add a comment to nfs4_free_deleg()
mentioning that it mustn't ever sleep, since it can be called from that
context.
-- 
Jeff Layton <jlayton@kernel.org>

Re: [PATCH] nfsd: put dl_stid if fail to queue dl_recall

Posted by Li Lingfeng 11 months, 4 weeks ago

在 2025/2/13 20:34, Jeff Layton 写道:
> On Thu, 2025-02-13 at 20:28 +0800, Li Lingfeng wrote:
>> 在 2025/2/13 19:49, Jeff Layton 写道:
>>> On Thu, 2025-02-13 at 15:25 +0800, Li Lingfeng wrote:
>>>> Before calling nfsd4_run_cb to queue dl_recall to the callback_wq, we
>>>> increment the reference count of dl_stid.
>>>> We expect that after the corresponding work_struct is processed, the
>>>> reference count of dl_stid will be decremented through the callback
>>>> function nfsd4_cb_recall_release.
>>>> However, if the call to nfsd4_run_cb fails, the incremented reference
>>>> count of dl_stid will not be decremented correspondingly, leading to the
>>>> following nfs4_stid leak:
>>>> unreferenced object 0xffff88812067b578 (size 344):
>>>>     comm "nfsd", pid 2761, jiffies 4295044002 (age 5541.241s)
>>>>     hex dump (first 32 bytes):
>>>>       01 00 00 00 6b 6b 6b 6b b8 02 c0 e2 81 88 ff ff  ....kkkk........
>>>>       00 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  .kkkkkkk.....N..
>>>>     backtrace:
>>>>       kmem_cache_alloc+0x4b9/0x700
>>>>       nfsd4_process_open1+0x34/0x300
>>>>       nfsd4_open+0x2d1/0x9d0
>>>>       nfsd4_proc_compound+0x7a2/0xe30
>>>>       nfsd_dispatch+0x241/0x3e0
>>>>       svc_process_common+0x5d3/0xcc0
>>>>       svc_process+0x2a3/0x320
>>>>       nfsd+0x180/0x2e0
>>>>       kthread+0x199/0x1d0
>>>>       ret_from_fork+0x30/0x50
>>>>       ret_from_fork_asm+0x1b/0x30
>>>> unreferenced object 0xffff8881499f4d28 (size 368):
>>>>     comm "nfsd", pid 2761, jiffies 4295044005 (age 5541.239s)
>>>>     hex dump (first 32 bytes):
>>>>       01 00 00 00 00 00 00 00 30 4d 9f 49 81 88 ff ff  ........0M.I....
>>>>       30 4d 9f 49 81 88 ff ff 20 00 00 00 01 00 00 00  0M.I.... .......
>>>>     backtrace:
>>>>       kmem_cache_alloc+0x4b9/0x700
>>>>       nfs4_alloc_stid+0x29/0x210
>>>>       alloc_init_deleg+0x92/0x2e0
>>>>       nfs4_set_delegation+0x284/0xc00
>>>>       nfs4_open_delegation+0x216/0x3f0
>>>>       nfsd4_process_open2+0x2b3/0xee0
>>>>       nfsd4_open+0x770/0x9d0
>>>>       nfsd4_proc_compound+0x7a2/0xe30
>>>>       nfsd_dispatch+0x241/0x3e0
>>>>       svc_process_common+0x5d3/0xcc0
>>>>       svc_process+0x2a3/0x320
>>>>       nfsd+0x180/0x2e0
>>>>       kthread+0x199/0x1d0
>>>>       ret_from_fork+0x30/0x50
>>>>       ret_from_fork_asm+0x1b/0x30
>>>> Fix it by checking the result of nfsd4_run_cb and call nfs4_put_stid if
>>>> fail to queue dl_recall.
>>>>
>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>> Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
>>>> ---
>>>>    fs/nfsd/nfs4state.c | 6 +++++-
>>>>    1 file changed, 5 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>> index 153eeea2c7c9..0ccb87be47b7 100644
>>>> --- a/fs/nfsd/nfs4state.c
>>>> +++ b/fs/nfsd/nfs4state.c
>>>> @@ -5414,6 +5414,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
>>>>    
>>>>    static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
>>>>    {
>>>> +	bool queued;
>>>>    	/*
>>>>    	 * We're assuming the state code never drops its reference
>>>>    	 * without first removing the lease.  Since we're in this lease
>>>> @@ -5422,7 +5423,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
>>>>    	 * we know it's safe to take a reference.
>>>>    	 */
>>>>    	refcount_inc(&dp->dl_stid.sc_count);
>>>> -	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
>>>> +	queued = nfsd4_run_cb(&dp->dl_recall);
>>>> +	WARN_ON_ONCE(!queued);
>>>> +	if (!queued)
>>>> +		nfs4_put_stid(&dp->dl_stid);
>>>>    }
>>>>    
>>>>    /* Called from break_lease() with flc_lock held. */
>>> Have you actually seen the WARN_ON_ONCE() pop under normal usage, or
>>> was the problem you reproduced done via fault injection?
>> Hi,
>>
>> I add mdelay for cb_work before processing it and add msleep in
>> nfs_do_return_delegation to reproduce it.
>>
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -1188,6 +1188,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct
>> nfs4_file *fp,
>>        dp->dl_recalled = false;
>>        nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
>>                  &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
>> +    dp->dl_recall.cb_work.debug_flag = 0x1234;
>> +    printk("%s init work func %px\n", __func__,
>> dp->dl_recall.cb_work.func);
>>        get_nfs4_file(fp);
>>        dp->dl_stid.sc_file = fp;
>>        return dp;
>>
>> --- a/kernel/workqueue.c
>> +++ b/kernel/workqueue.c
>> @@ -2708,6 +2708,12 @@ static void process_scheduled_works(struct worker
>> *worker)
>>                worker->pool->watchdog_ts = jiffies;
>>                first = false;
>>            }
>> +        if (work->debug_flag == 0x1234) {
>> +            printk("%s func %px\n", __func__, work->func);
>> +            printk("%s delay before clear pending...\n", __func__);
>> +            mdelay(10 * 1000);
>> +            printk("%s delay done\n", __func__);
>> +        }
>>            process_one_work(worker, work);
>>        }
>>    }
>>
>> --- a/fs/nfs/delegation.c
>> +++ b/fs/nfs/delegation.c
>> @@ -263,6 +263,15 @@ static int nfs_do_return_delegation(struct inode
>> *inode, struct nfs_delegation *
>>        const struct cred *cred;
>>        int res = 0;
>>
>> +    printk("sleep before deleg return...\n");
>> +    while (1) {
>> +        ifdebug(PROC)
>> +            msleep(10 * 1000);
>> +        else
>> +            break;
>> +    }
>> +    printk("sleep done\n");
>> +
>>        if (!test_bit(NFS_DELEGATION_REVOKED, &delegation-flags)) {
>>            spin_lock(&delegation->lock);
>>            cred = get_cred(delegation->cred);
>>
>>> Unfortunately, this won't work. nfsd_break_one_deleg() is called from
>>> the ->break_lease callback with the flc->flc_lock held, and
>>> nfs4_put_stid can sleep in the sc_free callbacks.
>> Are you referring to nfs4_free_deleg? I didn't quite see where it might
>> sleep. Could you please explain it to me?
>>
> I stand corrected. nfs4_free_deleg() won't sleep, but
> nfs4_free_lock_stateid() can. In this case, you know that this is a
> deleg stateid, so this should actually be safe after all.
>
> If you do this though, please add a comment to nfs4_free_deleg()
> mentioning that it mustn't ever sleep, since it can be called from that
> context.
Thank you for your advice, I will send v2 soon.