[PATCH] nfs: pnfs: handle early layoutreturn failures gracefully

Robert Milkowski posted 1 patch 1 week, 3 days ago
fs/nfs/nfs4proc.c | 37 +++++++++++++++++++++++++------------
fs/nfs/pnfs.c     | 21 +++++++++++++++++++--
2 files changed, 44 insertions(+), 14 deletions(-)
[PATCH] nfs: pnfs: handle early layoutreturn failures gracefully
Posted by Robert Milkowski 1 week, 3 days ago
pnfs_layoutreturn_before_put_layout_hdr() bumps the layout header refcount
and sets NFS_LAYOUT_RETURN before the layout driver's
prepare_layoutreturn() hook runs and before rpc_run_task() dispatches the
call. If the prepare hook fails, or rpc_run_task() fails to queue the call,
we currently leak references and leave waiters stuck in
pnfs_prepare_to_retry_layoutget().

Mirror the normal completion path for these early failures: log a
rate-limited warning when prepare fails, call
pnfs_layoutreturn_retry_later(), free any reserved slot, drop the layout
header, cred and inode references, and clear the wait bit.

Signed-off-by: Robert Milkowski <rmilkowski@gmail.com>
---
 fs/nfs/nfs4proc.c | 37 +++++++++++++++++++++++++------------
 fs/nfs/pnfs.c     | 21 +++++++++++++++++++--
 2 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 93c6ce04332b..6066a1c7227d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10132,25 +10132,34 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 	rpc_restart_call_prepare(task);
 }
 
-static void nfs4_layoutreturn_release(void *calldata)
+static void nfs4_layoutreturn_cleanup(struct nfs4_layoutreturn *lrp, int status)
 {
-	struct nfs4_layoutreturn *lrp = calldata;
 	struct pnfs_layout_hdr *lo = lrp->args.layout;
 
-	if (lrp->rpc_status == 0 || !lrp->inode)
-		pnfs_layoutreturn_free_lsegs(
-			lo, &lrp->args.stateid, &lrp->args.range,
-			lrp->res.lrs_present ? &lrp->res.stateid : NULL);
+	if (status == 0 || !lrp->inode)
+		pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid,
+					     &lrp->args.range,
+					     lrp->res.lrs_present ?
+					     &lrp->res.stateid : NULL);
 	else
 		pnfs_layoutreturn_retry_later(lo, &lrp->args.stateid,
 					      &lrp->args.range);
-	nfs4_sequence_free_slot(&lrp->res.seq_res);
+	if (lrp->res.seq_res.sr_slot)
+		nfs4_sequence_free_slot(&lrp->res.seq_res);
 	if (lrp->ld_private.ops && lrp->ld_private.ops->free)
 		lrp->ld_private.ops->free(&lrp->ld_private);
-	pnfs_put_layout_hdr(lrp->args.layout);
-	nfs_iput_and_deactive(lrp->inode);
+	pnfs_put_layout_hdr(lo);
+	if (lrp->inode)
+		nfs_iput_and_deactive(lrp->inode);
 	put_cred(lrp->cred);
-	kfree(calldata);
+	kfree(lrp);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+
+	nfs4_layoutreturn_cleanup(lrp, lrp->rpc_status);
 }
 
 static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
@@ -10198,8 +10207,12 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, unsigned int flags)
 		nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
 				   0);
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);
+		trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
+		nfs4_layoutreturn_cleanup(lrp, status);
+		return status;
+	}
 	if (!(flags & PNFS_FL_LAYOUTRETURN_ASYNC))
 		status = task->tk_status;
 	trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f157d43d1312..a489f43344b8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1370,13 +1370,30 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
 	lrp->args.ld_private = &lrp->ld_private;
 	lrp->clp = NFS_SERVER(ino)->nfs_client;
 	lrp->cred = cred;
-	if (ld->prepare_layoutreturn)
-		ld->prepare_layoutreturn(&lrp->args);
+	if (ld->prepare_layoutreturn) {
+		status = ld->prepare_layoutreturn(&lrp->args);
+		if (status) {
+			pr_warn_ratelimited("NFS: pNFS layoutreturn prepare failed (%d) for layout driver %s\n",
+				status, ld->name ? ld->name : "unknown");
+			goto out_prepare_fail;
+		}
+	}
 
 	status = nfs4_proc_layoutreturn(lrp, flags);
 out:
 	dprintk("<-- %s status: %d\n", __func__, status);
 	return status;
+
+out_prepare_fail:
+	pnfs_layoutreturn_retry_later(lo, &lrp->args.stateid, &lrp->args.range);
+	if (lrp->ld_private.ops && lrp->ld_private.ops->free)
+		lrp->ld_private.ops->free(&lrp->ld_private);
+	if (lrp->inode)
+		nfs_iput_and_deactive(lrp->inode);
+	put_cred(cred);
+	kfree(lrp);
+	pnfs_put_layout_hdr(lo);
+	return status;
 }
 
 /* Return true if layoutreturn is needed */

base-commit: cb015814f8b6eebcbb8e46e111d108892c5e6821
-- 
2.47.1
Re: [PATCH] nfs: pnfs: handle early layoutreturn failures gracefully
Posted by Trond Myklebust 1 week, 3 days ago
On Tue, 2025-12-09 at 14:53 +0000, Robert Milkowski wrote:
> pnfs_layoutreturn_before_put_layout_hdr() bumps the layout header
> refcount
> and sets NFS_LAYOUT_RETURN before prepare or rpc_run_task dispatch.
> If the
> layout driver fails prepare or rpc_run_task() fails to queue the
> call, we
> currently leak refs and leave waiters stuck on
> pnfs_prepare_to_retry_layoutget().
> 
> Mirror the normal completion path for these early failures: warn and
> schedule pnfs_layoutreturn_retry_later(), free any reserved slot,
> drop
> refs/creds/inode, and clear the wait bit.
> 
> Signed-off-by: Robert Milkowski <rmilkowski@gmail.com>
> ---
>  fs/nfs/nfs4proc.c | 37 +++++++++++++++++++++++++------------
>  fs/nfs/pnfs.c     | 21 +++++++++++++++++++--
>  2 files changed, 44 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 93c6ce04332b..6066a1c7227d 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -10132,25 +10132,34 @@ static void nfs4_layoutreturn_done(struct
> rpc_task *task, void *calldata)
>  	rpc_restart_call_prepare(task);
>  }
>  
> -static void nfs4_layoutreturn_release(void *calldata)
> +static void nfs4_layoutreturn_cleanup(struct nfs4_layoutreturn *lrp,
> int status)
>  {
> -	struct nfs4_layoutreturn *lrp = calldata;
>  	struct pnfs_layout_hdr *lo = lrp->args.layout;
>  
> -	if (lrp->rpc_status == 0 || !lrp->inode)
> -		pnfs_layoutreturn_free_lsegs(
> -			lo, &lrp->args.stateid, &lrp->args.range,
> -			lrp->res.lrs_present ? &lrp->res.stateid :
> NULL);
> +	if (status == 0 || !lrp->inode)
> +		pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid,
> +					     &lrp->args.range,
> +					     lrp->res.lrs_present ?
> +					     &lrp->res.stateid :
> NULL);
>  	else
>  		pnfs_layoutreturn_retry_later(lo, &lrp-
> >args.stateid,
>  					      &lrp->args.range);
> -	nfs4_sequence_free_slot(&lrp->res.seq_res);
> +	if (lrp->res.seq_res.sr_slot)
> +		nfs4_sequence_free_slot(&lrp->res.seq_res);
>  	if (lrp->ld_private.ops && lrp->ld_private.ops->free)
>  		lrp->ld_private.ops->free(&lrp->ld_private);
> -	pnfs_put_layout_hdr(lrp->args.layout);
> -	nfs_iput_and_deactive(lrp->inode);
> +	pnfs_put_layout_hdr(lo);
> +	if (lrp->inode)
> +		nfs_iput_and_deactive(lrp->inode);
>  	put_cred(lrp->cred);
> -	kfree(calldata);
> +	kfree(lrp);
> +}
> +
> +static void nfs4_layoutreturn_release(void *calldata)
> +{
> +	struct nfs4_layoutreturn *lrp = calldata;
> +
> +	nfs4_layoutreturn_cleanup(lrp, lrp->rpc_status);
>  }
>  
>  static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
> @@ -10198,8 +10207,12 @@ int nfs4_proc_layoutreturn(struct
> nfs4_layoutreturn *lrp, unsigned int flags)
>  		nfs4_init_sequence(&lrp->args.seq_args, &lrp-
> >res.seq_res, 1,
>  				   0);
>  	task = rpc_run_task(&task_setup_data);
> -	if (IS_ERR(task))
> -		return PTR_ERR(task);
> +	if (IS_ERR(task)) {
> +		status = PTR_ERR(task);
> +		trace_nfs4_layoutreturn(lrp->args.inode, &lrp-
> >args.stateid, status);
> +		nfs4_layoutreturn_cleanup(lrp, status);
> +		return status;
> +	}

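NACK. The above introduces a use-after-free. There is no need to call
the release routine after a failed call to rpc_run_task(): on failure it
has already invoked the rpc_release callback on the calldata, so lrp has
been freed by the time the new tracepoint and cleanup call touch it.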

>  	if (!(flags & PNFS_FL_LAYOUTRETURN_ASYNC))
>  		status = task->tk_status;
>  	trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid,
> status);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index f157d43d1312..a489f43344b8 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1370,13 +1370,30 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr
> *lo,
>  	lrp->args.ld_private = &lrp->ld_private;
>  	lrp->clp = NFS_SERVER(ino)->nfs_client;
>  	lrp->cred = cred;
> -	if (ld->prepare_layoutreturn)
> -		ld->prepare_layoutreturn(&lrp->args);
> +	if (ld->prepare_layoutreturn) {
> +		status = ld->prepare_layoutreturn(&lrp->args);
> +		if (status) {
> +			pr_warn_ratelimited("NFS: pNFS layoutreturn
> prepare failed (%d) for layout driver %s\n",
> +				status, ld->name ? ld->name :
> "unknown");
> +			goto out_prepare_fail;
> +		}
> +	}

This is also unnecessary. The existing code will cope just fine with
args->ld_private being unset.

>  
>  	status = nfs4_proc_layoutreturn(lrp, flags);
>  out:
>  	dprintk("<-- %s status: %d\n", __func__, status);
>  	return status;
> +
> +out_prepare_fail:
> +	pnfs_layoutreturn_retry_later(lo, &lrp->args.stateid, &lrp-
> >args.range);
> +	if (lrp->ld_private.ops && lrp->ld_private.ops->free)
> +		lrp->ld_private.ops->free(&lrp->ld_private);
> +	if (lrp->inode)
> +		nfs_iput_and_deactive(lrp->inode);
> +	put_cred(cred);
> +	kfree(lrp);
> +	pnfs_put_layout_hdr(lo);
> +	return status;
>  }
>  
>  /* Return true if layoutreturn is needed */
> 
> base-commit: cb015814f8b6eebcbb8e46e111d108892c5e6821

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@kernel.org, trond.myklebust@hammerspace.com