[v2] nfsd: Invoke tracking callbacks only after initialization is complete

[PATCH v2] nfsd: Invoke tracking callbacks only after initialization is complete

Posted by Li Lingfeng 8 months ago

Checking whether tracking callbacks can be called based on whether
nn->client_tracking_ops is NULL may lead to callbacks being invoked
before tracking initialization completes, causing resource access
violations (UAF, NULL pointer dereference). Examples:

1) nfsd4_client_tracking_init
   // set nn->client_tracking_ops
   nfsd4_cld_tracking_init
    nfs4_cld_state_init
     nn->reclaim_str_hashtbl = kmalloc_array
    ... // error path, goto err
    nfs4_cld_state_shutdown
     kfree(nn->reclaim_str_hashtbl)
                                      write_v4_end_grace
                                       nfsd4_end_grace
                                        nfsd4_record_grace_done
                                         nfsd4_cld_grace_done
                                          nfs4_release_reclaim
                                           nn->reclaim_str_hashtbl[i]
                                           // UAF
   // clear nn->client_tracking_ops

2) nfsd4_client_tracking_init
   // set nn->client_tracking_ops
   nfsd4_cld_tracking_init
                                      write_v4_end_grace
                                       nfsd4_end_grace
                                        nfsd4_record_grace_done
                                         nfsd4_cld_grace_done
                                          alloc_cld_upcall
                                           cn = nn->cld_net
                                           spin_lock // cn->cn_lock
                                           // NULL deref
   // error path, skip init pipe
   __nfsd4_init_cld_pipe
    cn = kzalloc
    nn->cld_net = cn
   // clear nn->client_tracking_ops

After nfsd mounts, users can trigger grace_done callbacks via
/proc/fs/nfsd/v4_end_grace. If resources are uninitialized or freed
in error paths, this causes access violations.

Resolve the issue by leveraging nfsd_mutex to prevent concurrency.

Fixes: 52e19c09a183 ("nfsd: make reclaim_str_hashtbl allocated per net")
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
---
  Changes in v2:
    Use nfsd_mutex instead of adding a new flag to prevent concurrency.
 fs/nfsd/nfs4recover.c | 8 ++++++++
 fs/nfsd/nfs4state.c   | 4 ++++
 fs/nfsd/nfsctl.c      | 2 ++
 3 files changed, 14 insertions(+)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 82785db730d9..8ac089f8134c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -162,7 +162,9 @@ legacy_recdir_name_error(struct nfs4_client *clp, int error)
 	if (error == -ENOENT) {
 		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
 			"Reboot recovery will not function correctly!\n");
+		mutex_lock(&nfsd_mutex);
 		nfsd4_client_tracking_exit(clp->net);
+		mutex_unlock(&nfsd_mutex);
 	}
 }
 
@@ -2083,8 +2085,10 @@ nfsd4_client_record_create(struct nfs4_client *clp)
 {
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
+	mutex_lock(&nfsd_mutex);
 	if (nn->client_tracking_ops)
 		nn->client_tracking_ops->create(clp);
+	mutex_unlock(&nfsd_mutex);
 }
 
 void
@@ -2092,8 +2096,10 @@ nfsd4_client_record_remove(struct nfs4_client *clp)
 {
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
+	mutex_lock(&nfsd_mutex);
 	if (nn->client_tracking_ops)
 		nn->client_tracking_ops->remove(clp);
+	mutex_unlock(&nfsd_mutex);
 }
 
 int
@@ -2101,8 +2107,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
 {
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
+	mutex_lock(&nfsd_mutex);
 	if (nn->client_tracking_ops)
 		return nn->client_tracking_ops->check(clp);
+	mutex_unlock(&nfsd_mutex);
 
 	return -EOPNOTSUPP;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d5694987f86f..2794fdc8b678 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2529,7 +2529,9 @@ static void inc_reclaim_complete(struct nfs4_client *clp)
 			nn->reclaim_str_hashtbl_size) {
 		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
 				clp->net->ns.inum);
+		mutex_lock(&nfsd_mutex);
 		nfsd4_end_grace(nn);
+		mutex_unlock(&nfsd_mutex);
 	}
 }
 
@@ -6773,7 +6775,9 @@ nfs4_laundromat(struct nfsd_net *nn)
 		lt.new_timeo = 0;
 		goto out;
 	}
+	mutex_lock(&nfsd_mutex);
 	nfsd4_end_grace(nn);
+	mutex_unlock(&nfsd_mutex);
 
 	spin_lock(&nn->s2s_cp_lock);
 	idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3f3e9f6c4250..649850b4bb60 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1085,7 +1085,9 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
 			if (!nn->nfsd_serv)
 				return -EBUSY;
 			trace_nfsd_end_grace(netns(file));
+			mutex_lock(&nfsd_mutex);
 			nfsd4_end_grace(nn);
+			mutex_lock(&nfsd_mutex);
 			break;
 		default:
 			return -EINVAL;
-- 
2.46.1

Re: [PATCH v2] nfsd: Invoke tracking callbacks only after initialization is complete

Posted by NeilBrown 7 months, 3 weeks ago

On Thu, 12 Jun 2025, Li Lingfeng wrote:
> Checking whether tracking callbacks can be called based on whether
> nn->client_tracking_ops is NULL may lead to callbacks being invoked
> before tracking initialization completes, causing resource access
> violations (UAF, NULL pointer dereference). Examples:
> 
> 1) nfsd4_client_tracking_init
>    // set nn->client_tracking_ops
>    nfsd4_cld_tracking_init
>     nfs4_cld_state_init
>      nn->reclaim_str_hashtbl = kmalloc_array
>     ... // error path, goto err
>     nfs4_cld_state_shutdown
>      kfree(nn->reclaim_str_hashtbl)
>                                       write_v4_end_grace

I suspect the problem here is that write_v4_end_grace() is one of the
few write_op functions which doesn't take nfsd_mutex.  It should hold
that lock while testing ->nfsd_serv and calling nfsd4_end_grace().

write_filehandle(), write_unlock_ip(), and write_unlock_fs() also don't
take that lock.  They don't even check if nfsd_serv is NULL.  I suspect
they all should.

NeilBrown


>                                        nfsd4_end_grace
>                                         nfsd4_record_grace_done
>                                          nfsd4_cld_grace_done
>                                           nfs4_release_reclaim
>                                            nn->reclaim_str_hashtbl[i]
>                                            // UAF
>    // clear nn->client_tracking_ops
> 
> 2) nfsd4_client_tracking_init
>    // set nn->client_tracking_ops
>    nfsd4_cld_tracking_init
>                                       write_v4_end_grace
>                                        nfsd4_end_grace
>                                         nfsd4_record_grace_done
>                                          nfsd4_cld_grace_done
>                                           alloc_cld_upcall
>                                            cn = nn->cld_net
>                                            spin_lock // cn->cn_lock
>                                            // NULL deref
>    // error path, skip init pipe
>    __nfsd4_init_cld_pipe
>     cn = kzalloc
>     nn->cld_net = cn
>    // clear nn->client_tracking_ops
> 
> After nfsd mounts, users can trigger grace_done callbacks via
> /proc/fs/nfsd/v4_end_grace. If resources are uninitialized or freed
> in error paths, this causes access violations.
> 
> Resolve the issue by leveraging nfsd_mutex to prevent concurrency.
> 
> Fixes: 52e19c09a183 ("nfsd: make reclaim_str_hashtbl allocated per net")
> Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
> ---
>   Changes in v2:
>     Use nfsd_mutex instead of adding a new flag to prevent concurrency.
>  fs/nfsd/nfs4recover.c | 8 ++++++++
>  fs/nfsd/nfs4state.c   | 4 ++++
>  fs/nfsd/nfsctl.c      | 2 ++
>  3 files changed, 14 insertions(+)
> 
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 82785db730d9..8ac089f8134c 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -162,7 +162,9 @@ legacy_recdir_name_error(struct nfs4_client *clp, int error)
>  	if (error == -ENOENT) {
>  		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
>  			"Reboot recovery will not function correctly!\n");
> +		mutex_lock(&nfsd_mutex);
>  		nfsd4_client_tracking_exit(clp->net);
> +		mutex_unlock(&nfsd_mutex);
>  	}
>  }
>  
> @@ -2083,8 +2085,10 @@ nfsd4_client_record_create(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		nn->client_tracking_ops->create(clp);
> +	mutex_unlock(&nfsd_mutex);
>  }
>  
>  void
> @@ -2092,8 +2096,10 @@ nfsd4_client_record_remove(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		nn->client_tracking_ops->remove(clp);
> +	mutex_unlock(&nfsd_mutex);
>  }
>  
>  int
> @@ -2101,8 +2107,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		return nn->client_tracking_ops->check(clp);
> +	mutex_unlock(&nfsd_mutex);
>  
>  	return -EOPNOTSUPP;
>  }
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index d5694987f86f..2794fdc8b678 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2529,7 +2529,9 @@ static void inc_reclaim_complete(struct nfs4_client *clp)
>  			nn->reclaim_str_hashtbl_size) {
>  		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
>  				clp->net->ns.inum);
> +		mutex_lock(&nfsd_mutex);
>  		nfsd4_end_grace(nn);
> +		mutex_unlock(&nfsd_mutex);
>  	}
>  }
>  
> @@ -6773,7 +6775,9 @@ nfs4_laundromat(struct nfsd_net *nn)
>  		lt.new_timeo = 0;
>  		goto out;
>  	}
> +	mutex_lock(&nfsd_mutex);
>  	nfsd4_end_grace(nn);
> +	mutex_unlock(&nfsd_mutex);
>  
>  	spin_lock(&nn->s2s_cp_lock);
>  	idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 3f3e9f6c4250..649850b4bb60 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1085,7 +1085,9 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
>  			if (!nn->nfsd_serv)
>  				return -EBUSY;
>  			trace_nfsd_end_grace(netns(file));
> +			mutex_lock(&nfsd_mutex);
>  			nfsd4_end_grace(nn);
> +			mutex_lock(&nfsd_mutex);
>  			break;
>  		default:
>  			return -EINVAL;
> -- 
> 2.46.1
> 
>

Re: [PATCH v2] nfsd: Invoke tracking callbacks only after initialization is complete

Posted by Jeff Layton 7 months, 3 weeks ago

On Thu, 2025-06-12 at 11:55 +0800, Li Lingfeng wrote:
> Checking whether tracking callbacks can be called based on whether
> nn->client_tracking_ops is NULL may lead to callbacks being invoked
> before tracking initialization completes, causing resource access
> violations (UAF, NULL pointer dereference). Examples:
> 
> 1) nfsd4_client_tracking_init
>    // set nn->client_tracking_ops
>    nfsd4_cld_tracking_init
>     nfs4_cld_state_init
>      nn->reclaim_str_hashtbl = kmalloc_array
>     ... // error path, goto err
>     nfs4_cld_state_shutdown
>      kfree(nn->reclaim_str_hashtbl)
>                                       write_v4_end_grace
>                                        nfsd4_end_grace
>                                         nfsd4_record_grace_done
>                                          nfsd4_cld_grace_done
>                                           nfs4_release_reclaim
>                                            nn->reclaim_str_hashtbl[i]
>                                            // UAF
>    // clear nn->client_tracking_ops
> 
> 2) nfsd4_client_tracking_init
>    // set nn->client_tracking_ops
>    nfsd4_cld_tracking_init
>                                       write_v4_end_grace
>                                        nfsd4_end_grace
>                                         nfsd4_record_grace_done
>                                          nfsd4_cld_grace_done
>                                           alloc_cld_upcall
>                                            cn = nn->cld_net
>                                            spin_lock // cn->cn_lock
>                                            // NULL deref
>    // error path, skip init pipe
>    __nfsd4_init_cld_pipe
>     cn = kzalloc
>     nn->cld_net = cn
>    // clear nn->client_tracking_ops
> 


Have you seen this race in the wild?

Looking at this more closely, I don't think this race is possible.
You'd need to invoke the ->init routine concurrently from two different
tasks, but nfsd4_client_tracking_init is called during net ns
initialization, which should ensure that only one task invokes it.



> After nfsd mounts, users can trigger grace_done callbacks via
> /proc/fs/nfsd/v4_end_grace. If resources are uninitialized or freed
> in error paths, this causes access violations.
> 
> Resolve the issue by leveraging nfsd_mutex to prevent concurrency.
> 
> Fixes: 52e19c09a183 ("nfsd: make reclaim_str_hashtbl allocated per net")
> Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
> ---
>   Changes in v2:
>     Use nfsd_mutex instead of adding a new flag to prevent concurrency.
>  fs/nfsd/nfs4recover.c | 8 ++++++++
>  fs/nfsd/nfs4state.c   | 4 ++++
>  fs/nfsd/nfsctl.c      | 2 ++
>  3 files changed, 14 insertions(+)
> 
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 82785db730d9..8ac089f8134c 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -162,7 +162,9 @@ legacy_recdir_name_error(struct nfs4_client *clp, int error)
>  	if (error == -ENOENT) {
>  		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
>  			"Reboot recovery will not function correctly!\n");
> +		mutex_lock(&nfsd_mutex);
>  		nfsd4_client_tracking_exit(clp->net);
> +		mutex_unlock(&nfsd_mutex);
>  	}
>  }
>  
> @@ -2083,8 +2085,10 @@ nfsd4_client_record_create(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		nn->client_tracking_ops->create(clp);
> +	mutex_unlock(&nfsd_mutex);
>  }
>  
>  void
> @@ -2092,8 +2096,10 @@ nfsd4_client_record_remove(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		nn->client_tracking_ops->remove(clp);
> +	mutex_unlock(&nfsd_mutex);
>  }
>  
>  int
> @@ -2101,8 +2107,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		return nn->client_tracking_ops->check(clp);
> +	mutex_unlock(&nfsd_mutex);
>  
>  	return -EOPNOTSUPP;
>  }
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index d5694987f86f..2794fdc8b678 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2529,7 +2529,9 @@ static void inc_reclaim_complete(struct nfs4_client *clp)
>  			nn->reclaim_str_hashtbl_size) {
>  		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
>  				clp->net->ns.inum);
> +		mutex_lock(&nfsd_mutex);
>  		nfsd4_end_grace(nn);
> +		mutex_unlock(&nfsd_mutex);
>  	}
>  }
>  
> @@ -6773,7 +6775,9 @@ nfs4_laundromat(struct nfsd_net *nn)
>  		lt.new_timeo = 0;
>  		goto out;
>  	}
> +	mutex_lock(&nfsd_mutex);
>  	nfsd4_end_grace(nn);
> +	mutex_unlock(&nfsd_mutex);
>  
>  	spin_lock(&nn->s2s_cp_lock);
>  	idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 3f3e9f6c4250..649850b4bb60 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1085,7 +1085,9 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
>  			if (!nn->nfsd_serv)
>  				return -EBUSY;
>  			trace_nfsd_end_grace(netns(file));
> +			mutex_lock(&nfsd_mutex);
>  			nfsd4_end_grace(nn);
> +			mutex_lock(&nfsd_mutex);
>  			break;
>  		default:
>  			return -EINVAL;

-- 
Jeff Layton <jlayton@kernel.org>

Re: [PATCH v2] nfsd: Invoke tracking callbacks only after initialization is complete

Posted by Jeff Layton 7 months, 3 weeks ago

On Wed, 2025-06-18 at 08:35 -0400, Jeff Layton wrote:
> On Thu, 2025-06-12 at 11:55 +0800, Li Lingfeng wrote:
> > Checking whether tracking callbacks can be called based on whether
> > nn->client_tracking_ops is NULL may lead to callbacks being invoked
> > before tracking initialization completes, causing resource access
> > violations (UAF, NULL pointer dereference). Examples:
> > 
> > 1) nfsd4_client_tracking_init
> >    // set nn->client_tracking_ops
> >    nfsd4_cld_tracking_init
> >     nfs4_cld_state_init
> >      nn->reclaim_str_hashtbl = kmalloc_array
> >     ... // error path, goto err
> >     nfs4_cld_state_shutdown
> >      kfree(nn->reclaim_str_hashtbl)
> >                                       write_v4_end_grace
> >                                        nfsd4_end_grace
> >                                         nfsd4_record_grace_done
> >                                          nfsd4_cld_grace_done
> >                                           nfs4_release_reclaim
> >                                            nn->reclaim_str_hashtbl[i]
> >                                            // UAF
> >    // clear nn->client_tracking_ops
> > 
> > 2) nfsd4_client_tracking_init
> >    // set nn->client_tracking_ops
> >    nfsd4_cld_tracking_init
> >                                       write_v4_end_grace
> >                                        nfsd4_end_grace
> >                                         nfsd4_record_grace_done
> >                                          nfsd4_cld_grace_done
> >                                           alloc_cld_upcall
> >                                            cn = nn->cld_net
> >                                            spin_lock // cn->cn_lock
> >                                            // NULL deref
> >    // error path, skip init pipe
> >    __nfsd4_init_cld_pipe
> >     cn = kzalloc
> >     nn->cld_net = cn
> >    // clear nn->client_tracking_ops
> > 
> 
> 
> Have you seen this race in the wild?
> 
> Looking at this more closely, I don't think this race is possible.
> You'd need to invoke the ->init routine concurrently from two different
> tasks, but nfsd4_client_tracking_init is called during net ns
> initialization, which should ensure that only one task invokes it.
> 

My bad. It's not called during net namespace initialization, but during
server startup. But, the nfsd_mutex is held during this initialization,
so I still don't think this race can happen.

> 
> 
> > After nfsd mounts, users can trigger grace_done callbacks via
> > /proc/fs/nfsd/v4_end_grace. If resources are uninitialized or freed
> > in error paths, this causes access violations.
> > 
> > Resolve the issue by leveraging nfsd_mutex to prevent concurrency.
> > 
> > Fixes: 52e19c09a183 ("nfsd: make reclaim_str_hashtbl allocated per net")
> > Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
> > ---
> >   Changes in v2:
> >     Use nfsd_mutex instead of adding a new flag to prevent concurrency.
> >  fs/nfsd/nfs4recover.c | 8 ++++++++
> >  fs/nfsd/nfs4state.c   | 4 ++++
> >  fs/nfsd/nfsctl.c      | 2 ++
> >  3 files changed, 14 insertions(+)
> > 
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 82785db730d9..8ac089f8134c 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -162,7 +162,9 @@ legacy_recdir_name_error(struct nfs4_client *clp, int error)
> >  	if (error == -ENOENT) {
> >  		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
> >  			"Reboot recovery will not function correctly!\n");
> > +		mutex_lock(&nfsd_mutex);
> >  		nfsd4_client_tracking_exit(clp->net);
> > +		mutex_unlock(&nfsd_mutex);
> >  	}
> >  }
> >  
> > @@ -2083,8 +2085,10 @@ nfsd4_client_record_create(struct nfs4_client *clp)
> >  {
> >  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> >  
> > +	mutex_lock(&nfsd_mutex);
> >  	if (nn->client_tracking_ops)
> >  		nn->client_tracking_ops->create(clp);
> > +	mutex_unlock(&nfsd_mutex);
> >  }
> >  
> >  void
> > @@ -2092,8 +2096,10 @@ nfsd4_client_record_remove(struct nfs4_client *clp)
> >  {
> >  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> >  
> > +	mutex_lock(&nfsd_mutex);
> >  	if (nn->client_tracking_ops)
> >  		nn->client_tracking_ops->remove(clp);
> > +	mutex_unlock(&nfsd_mutex);
> >  }
> >  
> >  int
> > @@ -2101,8 +2107,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
> >  {
> >  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> >  
> > +	mutex_lock(&nfsd_mutex);
> >  	if (nn->client_tracking_ops)
> >  		return nn->client_tracking_ops->check(clp);
> > +	mutex_unlock(&nfsd_mutex);
> >  
> >  	return -EOPNOTSUPP;
> >  }
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index d5694987f86f..2794fdc8b678 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -2529,7 +2529,9 @@ static void inc_reclaim_complete(struct nfs4_client *clp)
> >  			nn->reclaim_str_hashtbl_size) {
> >  		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> >  				clp->net->ns.inum);
> > +		mutex_lock(&nfsd_mutex);
> >  		nfsd4_end_grace(nn);
> > +		mutex_unlock(&nfsd_mutex);
> >  	}
> >  }
> >  
> > @@ -6773,7 +6775,9 @@ nfs4_laundromat(struct nfsd_net *nn)
> >  		lt.new_timeo = 0;
> >  		goto out;
> >  	}
> > +	mutex_lock(&nfsd_mutex);
> >  	nfsd4_end_grace(nn);
> > +	mutex_unlock(&nfsd_mutex);
> >  
> >  	spin_lock(&nn->s2s_cp_lock);
> >  	idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 3f3e9f6c4250..649850b4bb60 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -1085,7 +1085,9 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
> >  			if (!nn->nfsd_serv)
> >  				return -EBUSY;
> >  			trace_nfsd_end_grace(netns(file));
> > +			mutex_lock(&nfsd_mutex);
> >  			nfsd4_end_grace(nn);
> > +			mutex_lock(&nfsd_mutex);
> >  			break;
> >  		default:
> >  			return -EINVAL;

-- 
Jeff Layton <jlayton@kernel.org>

Re: [PATCH v2] nfsd: Invoke tracking callbacks only after initialization is complete

Posted by Jeff Layton 7 months, 3 weeks ago

On Thu, 2025-06-12 at 11:55 +0800, Li Lingfeng wrote:
> Checking whether tracking callbacks can be called based on whether
> nn->client_tracking_ops is NULL may lead to callbacks being invoked
> before tracking initialization completes, causing resource access
> violations (UAF, NULL pointer dereference). Examples:
> 
> 1) nfsd4_client_tracking_init
>    // set nn->client_tracking_ops
>    nfsd4_cld_tracking_init
>     nfs4_cld_state_init
>      nn->reclaim_str_hashtbl = kmalloc_array
>     ... // error path, goto err
>     nfs4_cld_state_shutdown
>      kfree(nn->reclaim_str_hashtbl)
>                                       write_v4_end_grace
>                                        nfsd4_end_grace
>                                         nfsd4_record_grace_done
>                                          nfsd4_cld_grace_done
>                                           nfs4_release_reclaim
>                                            nn->reclaim_str_hashtbl[i]
>                                            // UAF
>    // clear nn->client_tracking_ops
> 
> 2) nfsd4_client_tracking_init
>    // set nn->client_tracking_ops
>    nfsd4_cld_tracking_init
>                                       write_v4_end_grace
>                                        nfsd4_end_grace
>                                         nfsd4_record_grace_done
>                                          nfsd4_cld_grace_done
>                                           alloc_cld_upcall
>                                            cn = nn->cld_net
>                                            spin_lock // cn->cn_lock
>                                            // NULL deref
>    // error path, skip init pipe
>    __nfsd4_init_cld_pipe
>     cn = kzalloc
>     nn->cld_net = cn
>    // clear nn->client_tracking_ops
> 
> After nfsd mounts, users can trigger grace_done callbacks via
> /proc/fs/nfsd/v4_end_grace. If resources are uninitialized or freed
> in error paths, this causes access violations.
> 
> Resolve the issue by leveraging nfsd_mutex to prevent concurrency.
> 
> Fixes: 52e19c09a183 ("nfsd: make reclaim_str_hashtbl allocated per net")
> Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
> ---
>   Changes in v2:
>     Use nfsd_mutex instead of adding a new flag to prevent concurrency.
>  fs/nfsd/nfs4recover.c | 8 ++++++++
>  fs/nfsd/nfs4state.c   | 4 ++++
>  fs/nfsd/nfsctl.c      | 2 ++
>  3 files changed, 14 insertions(+)
> 
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 82785db730d9..8ac089f8134c 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -162,7 +162,9 @@ legacy_recdir_name_error(struct nfs4_client *clp, int error)
>  	if (error == -ENOENT) {
>  		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
>  			"Reboot recovery will not function correctly!\n");
> +		mutex_lock(&nfsd_mutex);
>  		nfsd4_client_tracking_exit(clp->net);
> +		mutex_unlock(&nfsd_mutex);
>  	}
>  }
>  
> @@ -2083,8 +2085,10 @@ nfsd4_client_record_create(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		nn->client_tracking_ops->create(clp);
> +	mutex_unlock(&nfsd_mutex);
>  }
>  
>  void
> @@ -2092,8 +2096,10 @@ nfsd4_client_record_remove(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		nn->client_tracking_ops->remove(clp);
> +	mutex_unlock(&nfsd_mutex);
>  }
>  
>  int
> @@ -2101,8 +2107,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
>  {
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> +	mutex_lock(&nfsd_mutex);
>  	if (nn->client_tracking_ops)
>  		return nn->client_tracking_ops->check(clp);
> +	mutex_unlock(&nfsd_mutex);
>  
>  	return -EOPNOTSUPP;
>  }
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index d5694987f86f..2794fdc8b678 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2529,7 +2529,9 @@ static void inc_reclaim_complete(struct nfs4_client *clp)
>  			nn->reclaim_str_hashtbl_size) {
>  		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
>  				clp->net->ns.inum);
> +		mutex_lock(&nfsd_mutex);
>  		nfsd4_end_grace(nn);
> +		mutex_unlock(&nfsd_mutex);
>  	}
>  }
>  
> @@ -6773,7 +6775,9 @@ nfs4_laundromat(struct nfsd_net *nn)
>  		lt.new_timeo = 0;
>  		goto out;
>  	}
> +	mutex_lock(&nfsd_mutex);
>  	nfsd4_end_grace(nn);
> +	mutex_unlock(&nfsd_mutex);
>  
>  	spin_lock(&nn->s2s_cp_lock);
>  	idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 3f3e9f6c4250..649850b4bb60 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1085,7 +1085,9 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
>  			if (!nn->nfsd_serv)
>  				return -EBUSY;
>  			trace_nfsd_end_grace(netns(file));
> +			mutex_lock(&nfsd_mutex);
>  			nfsd4_end_grace(nn);
> +			mutex_lock(&nfsd_mutex);
>  			break;
>  		default:
>  			return -EINVAL;

This seems like a very heavyweight way to ensure that this doesn't
race, especially since the client tracking ops are per net-namespace
and the nfsd_mutex is global.

Also, how do you get two different tasks calling
nfsd4_client_tracking_init() at the same time? That's called when the
net namespace is set up, so there shouldn't be more than one copy
running.

I thought from an earlier patch that we were going to change this to
just ensure that the client_tracking_ops pointer did a NULL -> non-NULL
transition only once?

I think that's sufficient to ensure that this race can't occur.
-- 
Jeff Layton <jlayton@kernel.org>