[PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT

Jeff Layton posted 2 patches 3 months ago
[PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by Jeff Layton 3 months ago
Recent testing has shown that keeping pagecache pages around for too
long can be detrimental to performance with nfsd. Clients only rarely
revisit the same data, so the pages tend to just hang around.

This patch changes the pc_release callbacks for NFSv3 READ, WRITE and
COMMIT to call generic_fadvise(..., POSIX_FADV_DONTNEED) on the accessed
range.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/debugfs.c  |  2 ++
 fs/nfsd/nfs3proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------
 fs/nfsd/nfsd.h     |  1 +
 fs/nfsd/nfsproc.c  |  4 ++--
 fs/nfsd/vfs.c      | 21 ++++++++++++++-----
 fs/nfsd/vfs.h      |  5 +++--
 fs/nfsd/xdr3.h     |  3 +++
 7 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index 84b0c8b559dc90bd5c2d9d5e15c8e0682c0d610c..b007718dd959bc081166ec84e06f577a8fc2b46b 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -44,4 +44,6 @@ void nfsd_debugfs_init(void)
 
 	debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
 			    nfsd_top_dir, NULL, &nfsd_dsr_fops);
+	debugfs_create_bool("enable-fadvise-dontneed", 0644,
+			    nfsd_top_dir, &nfsd_enable_fadvise_dontneed);
 }
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index b6d03e1ef5f7a5e8dd111b0d56c061f1e91abff7..11261cf67ea817ec566626f08b733e09c9e121de 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -9,6 +9,7 @@
 #include <linux/ext2_fs.h>
 #include <linux/magic.h>
 #include <linux/namei.h>
+#include <linux/fadvise.h>
 
 #include "cache.h"
 #include "xdr3.h"
@@ -206,11 +207,25 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
 
 	fh_copy(&resp->fh, &argp->fh);
 	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
-				 &resp->count, &resp->eof);
+				 &resp->count, &resp->eof, &resp->nf);
 	resp->status = nfsd3_map_status(resp->status);
 	return rpc_success;
 }
 
+static void
+nfsd3_release_read(struct svc_rqst *rqstp)
+{
+	struct nfsd3_readargs *argp = rqstp->rq_argp;
+	struct nfsd3_readres *resp = rqstp->rq_resp;
+
+	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok)
+		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
+				POSIX_FADV_DONTNEED);
+	if (resp->nf)
+		nfsd_file_put(resp->nf);
+	fh_put(&resp->fh);
+}
+
 /*
  * Write data to a file
  */
@@ -236,12 +251,26 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
 	resp->committed = argp->stable;
 	resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
 				  &argp->payload, &cnt,
-				  resp->committed, resp->verf);
+				  resp->committed, resp->verf, &resp->nf);
 	resp->count = cnt;
 	resp->status = nfsd3_map_status(resp->status);
 	return rpc_success;
 }
 
+static void
+nfsd3_release_write(struct svc_rqst *rqstp)
+{
+	struct nfsd3_writeargs *argp = rqstp->rq_argp;
+	struct nfsd3_writeres *resp = rqstp->rq_resp;
+
+	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok && resp->committed)
+		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
+				POSIX_FADV_DONTNEED);
+	if (resp->nf)
+		nfsd_file_put(resp->nf);
+	fh_put(&resp->fh);
+}
+
 /*
  * Implement NFSv3's unchecked, guarded, and exclusive CREATE
  * semantics for regular files. Except for the created file,
@@ -748,7 +777,6 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
 {
 	struct nfsd3_commitargs *argp = rqstp->rq_argp;
 	struct nfsd3_commitres *resp = rqstp->rq_resp;
-	struct nfsd_file *nf;
 
 	dprintk("nfsd: COMMIT(3)   %s %u@%Lu\n",
 				SVCFH_fmt(&argp->fh),
@@ -757,17 +785,30 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
 
 	fh_copy(&resp->fh, &argp->fh);
 	resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
-					    NFSD_MAY_NOT_BREAK_LEASE, &nf);
+					    NFSD_MAY_NOT_BREAK_LEASE, &resp->nf);
 	if (resp->status)
 		goto out;
-	resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
+	resp->status = nfsd_commit(rqstp, &resp->fh, resp->nf, argp->offset,
 				   argp->count, resp->verf);
-	nfsd_file_put(nf);
 out:
 	resp->status = nfsd3_map_status(resp->status);
 	return rpc_success;
 }
 
+static void
+nfsd3_release_commit(struct svc_rqst *rqstp)
+{
+	struct nfsd3_commitargs *argp = rqstp->rq_argp;
+	struct nfsd3_commitres *resp = rqstp->rq_resp;
+
+	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok)
+		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, argp->count,
+				POSIX_FADV_DONTNEED);
+	if (resp->nf)
+		nfsd_file_put(resp->nf);
+	fh_put(&resp->fh);
+}
+
 
 /*
  * NFSv3 Server procedures.
@@ -864,7 +905,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_func = nfsd3_proc_read,
 		.pc_decode = nfs3svc_decode_readargs,
 		.pc_encode = nfs3svc_encode_readres,
-		.pc_release = nfs3svc_release_fhandle,
+		.pc_release = nfsd3_release_read,
 		.pc_argsize = sizeof(struct nfsd3_readargs),
 		.pc_argzero = sizeof(struct nfsd3_readargs),
 		.pc_ressize = sizeof(struct nfsd3_readres),
@@ -876,7 +917,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_func = nfsd3_proc_write,
 		.pc_decode = nfs3svc_decode_writeargs,
 		.pc_encode = nfs3svc_encode_writeres,
-		.pc_release = nfs3svc_release_fhandle,
+		.pc_release = nfsd3_release_write,
 		.pc_argsize = sizeof(struct nfsd3_writeargs),
 		.pc_argzero = sizeof(struct nfsd3_writeargs),
 		.pc_ressize = sizeof(struct nfsd3_writeres),
@@ -1039,7 +1080,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_func = nfsd3_proc_commit,
 		.pc_decode = nfs3svc_decode_commitargs,
 		.pc_encode = nfs3svc_encode_commitres,
-		.pc_release = nfs3svc_release_fhandle,
+		.pc_release = nfsd3_release_commit,
 		.pc_argsize = sizeof(struct nfsd3_commitargs),
 		.pc_argzero = sizeof(struct nfsd3_commitargs),
 		.pc_ressize = sizeof(struct nfsd3_commitres),
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 1cd0bed57bc2f27248fd66a8efef692a5e9a390c..288904d88b9245c03eae0aa347e867037b7c85c5 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -152,6 +152,7 @@ static inline void nfsd_debugfs_exit(void) {}
 #endif
 
 extern bool nfsd_disable_splice_read __read_mostly;
+extern bool nfsd_enable_fadvise_dontneed __read_mostly;
 
 extern int nfsd_max_blksize;
 
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 8f71f5748c75b69f15bae5e63799842e0e8b3139..bb8f98adb3e31b10adc4694987f8f5036726bf7f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -225,7 +225,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
 	resp->count = argp->count;
 	fh_copy(&resp->fh, &argp->fh);
 	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
-				 &resp->count, &eof);
+				 &resp->count, &eof, NULL);
 	if (resp->status == nfs_ok)
 		resp->status = fh_getattr(&resp->fh, &resp->stat);
 	else if (resp->status == nfserr_jukebox)
@@ -258,7 +258,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
 
 	fh_copy(&resp->fh, &argp->fh);
 	resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
-				  &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
+				  &argp->payload, &cnt, NFS_DATA_SYNC, NULL, NULL);
 	if (resp->status == nfs_ok)
 		resp->status = fh_getattr(&resp->fh, &resp->stat);
 	else if (resp->status == nfserr_jukebox)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ee78b6fb17098b788b07f5cd90598e678244b57e..f23eb3a07bb99dc231be9ea6db4e58b9e328a689 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -49,6 +49,7 @@
 #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
 
 bool nfsd_disable_splice_read __read_mostly;
+bool nfsd_enable_fadvise_dontneed __read_mostly;
 
 /**
  * nfserrno - Map Linux errnos to NFS errnos
@@ -1280,6 +1281,7 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
  * @offset: starting byte offset
  * @count: IN: requested number of bytes; OUT: number of bytes read
  * @eof: OUT: set non-zero if operation reached the end of the file
+ * @pnf: optional return pointer to pass back nfsd_file reference
  *
  * The caller must verify that there is enough space in @rqstp.rq_res
  * to perform this operation.
@@ -1290,7 +1292,8 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
  * returned.
  */
 __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
-		 loff_t offset, unsigned long *count, u32 *eof)
+		 loff_t offset, unsigned long *count, u32 *eof,
+		 struct nfsd_file **pnf)
 {
 	struct nfsd_file	*nf;
 	struct file *file;
@@ -1307,7 +1310,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	else
 		err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
 
-	nfsd_file_put(nf);
+	if (pnf && err == nfs_ok)
+		*pnf = nf;
+	else
+		nfsd_file_put(nf);
 	trace_nfsd_read_done(rqstp, fhp, offset, *count);
 	return err;
 }
@@ -1321,8 +1327,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
  * @stable: An NFS stable_how value
  * @verf: NFS WRITE verifier
+ * @pnf: optional return pointer to pass back nfsd_file reference
  *
- * Upon return, caller must invoke fh_put on @fhp.
+ * Upon return, caller must invoke fh_put() on @fhp. If it sets @pnf,
+ * then it must also call nfsd_file_put() on the returned reference.
  *
  * Return values:
  *   An nfsstat value in network byte order.
@@ -1330,7 +1338,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 __be32
 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 	   const struct xdr_buf *payload, unsigned long *cnt, int stable,
-	   __be32 *verf)
+	   __be32 *verf, struct nfsd_file **pnf)
 {
 	struct nfsd_file *nf;
 	__be32 err;
@@ -1343,7 +1351,10 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 
 	err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
 			     stable, verf);
-	nfsd_file_put(nf);
+	if (pnf && err == nfs_ok)
+		*pnf = nf;
+	else
+		nfsd_file_put(nf);
 out:
 	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
 	return err;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index eff04959606fe55c141ab4a2eed97c7e0716a5f5..2d063ee7786f499f34c39cd3ba7e776bb7562d57 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -127,10 +127,11 @@ __be32		nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 bool		nfsd_read_splice_ok(struct svc_rqst *rqstp);
 __be32		nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				loff_t offset, unsigned long *count,
-				u32 *eof);
+				u32 *eof, struct nfsd_file **pnf);
 __be32		nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				loff_t offset, const struct xdr_buf *payload,
-				unsigned long *cnt, int stable, __be32 *verf);
+				unsigned long *cnt, int stable, __be32 *verf,
+				struct nfsd_file **pnf);
 __be32		nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct nfsd_file *nf, loff_t offset,
 				const struct xdr_buf *payload,
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 522067b7fd755930a1c2e42b826d9132ac2993be..3f4c51983003188be51a0f8c2db2e0acc9a1363b 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -146,6 +146,7 @@ struct nfsd3_readres {
 	unsigned long		count;
 	__u32			eof;
 	struct page		**pages;
+	struct nfsd_file	*nf;
 };
 
 struct nfsd3_writeres {
@@ -154,6 +155,7 @@ struct nfsd3_writeres {
 	unsigned long		count;
 	int			committed;
 	__be32			verf[2];
+	struct nfsd_file	*nf;
 };
 
 struct nfsd3_renameres {
@@ -217,6 +219,7 @@ struct nfsd3_commitres {
 	__be32			status;
 	struct svc_fh		fh;
 	__be32			verf[2];
+	struct nfsd_file	*nf;
 };
 
 struct nfsd3_getaclres {

-- 
2.50.0
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by NeilBrown 3 months ago
On Fri, 04 Jul 2025, Jeff Layton wrote:
> Recent testing has shown that keeping pagecache pages around for too
> long can be detrimental to performance with nfsd. Clients only rarely
> revisit the same data, so the pages tend to just hang around.
> 
> This patch changes the pc_release callbacks for NFSv3 READ, WRITE and
> COMMIT to call generic_fadvise(..., POSIX_FADV_DONTNEED) on the accessed
> range.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/debugfs.c  |  2 ++
>  fs/nfsd/nfs3proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------
>  fs/nfsd/nfsd.h     |  1 +
>  fs/nfsd/nfsproc.c  |  4 ++--
>  fs/nfsd/vfs.c      | 21 ++++++++++++++-----
>  fs/nfsd/vfs.h      |  5 +++--
>  fs/nfsd/xdr3.h     |  3 +++
>  7 files changed, 77 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> index 84b0c8b559dc90bd5c2d9d5e15c8e0682c0d610c..b007718dd959bc081166ec84e06f577a8fc2b46b 100644
> --- a/fs/nfsd/debugfs.c
> +++ b/fs/nfsd/debugfs.c
> @@ -44,4 +44,6 @@ void nfsd_debugfs_init(void)
>  
>  	debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
>  			    nfsd_top_dir, NULL, &nfsd_dsr_fops);
> +	debugfs_create_bool("enable-fadvise-dontneed", 0644,
> +			    nfsd_top_dir, &nfsd_enable_fadvise_dontneed);
>  }
> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> index b6d03e1ef5f7a5e8dd111b0d56c061f1e91abff7..11261cf67ea817ec566626f08b733e09c9e121de 100644
> --- a/fs/nfsd/nfs3proc.c
> +++ b/fs/nfsd/nfs3proc.c
> @@ -9,6 +9,7 @@
>  #include <linux/ext2_fs.h>
>  #include <linux/magic.h>
>  #include <linux/namei.h>
> +#include <linux/fadvise.h>
>  
>  #include "cache.h"
>  #include "xdr3.h"
> @@ -206,11 +207,25 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
>  
>  	fh_copy(&resp->fh, &argp->fh);
>  	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
> -				 &resp->count, &resp->eof);
> +				 &resp->count, &resp->eof, &resp->nf);
>  	resp->status = nfsd3_map_status(resp->status);
>  	return rpc_success;
>  }
>  
> +static void
> +nfsd3_release_read(struct svc_rqst *rqstp)
> +{
> +	struct nfsd3_readargs *argp = rqstp->rq_argp;
> +	struct nfsd3_readres *resp = rqstp->rq_resp;
> +
> +	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok)
> +		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
> +				POSIX_FADV_DONTNEED);
> +	if (resp->nf)
> +		nfsd_file_put(resp->nf);
> +	fh_put(&resp->fh);

This looks wrong - testing resp->nf after assuming it was non-NULL.
I don't think it *is* wrong because ->state == nfs_ok ensures
->nf is valid. But still....

How about:

    fh_put(resp->fh);
    if (!resp->nf)
         return;
    if (nfsd_enable_fadvise_dontneed)
	generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
		POSIX_FADV_DONTNEED);
    nfsd_file_put(resp->nf);

??
Note that we don't test ->status because that is identical to testing ->nf.
Ditto for other release functions.

Otherwise it makes sense for exploring how to optimise IO.

Reviewed-by: NeilBrown <neil@brown.name>

NeilBrown
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by NeilBrown 3 months ago
On Fri, 04 Jul 2025, NeilBrown wrote:
> 
> Otherwise it makes sense for exploring how to optimise IO.
> 
> Reviewed-by: NeilBrown <neil@brown.name>

Actually - I take that back.  generic_fadvise() is the wrong interface.
It is for filesystems to use if the don't have any special requirements,
and for vfs_fadvise() to use if the file system hasn't give a function
to use.

nfsd should be calling vfs_fadvise().

NeilBrown
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by Jeff Layton 3 months ago
On Fri, 2025-07-04 at 17:26 +1000, NeilBrown wrote:
> On Fri, 04 Jul 2025, NeilBrown wrote:
> > 
> > Otherwise it makes sense for exploring how to optimise IO.
> > 
> > Reviewed-by: NeilBrown <neil@brown.name>
> 
> Actually - I take that back.  generic_fadvise() is the wrong interface.
> It is for filesystems to use if the don't have any special requirements,
> and for vfs_fadvise() to use if the file system hasn't give a function
> to use.
> 
> nfsd should be calling vfs_fadvise().

Good catch. I'll fix that up in the next version.

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by Jeff Layton 3 months ago
On Fri, 2025-07-04 at 09:44 +1000, NeilBrown wrote:
> On Fri, 04 Jul 2025, Jeff Layton wrote:
> > Recent testing has shown that keeping pagecache pages around for too
> > long can be detrimental to performance with nfsd. Clients only rarely
> > revisit the same data, so the pages tend to just hang around.
> > 
> > This patch changes the pc_release callbacks for NFSv3 READ, WRITE and
> > COMMIT to call generic_fadvise(..., POSIX_FADV_DONTNEED) on the accessed
> > range.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/nfsd/debugfs.c  |  2 ++
> >  fs/nfsd/nfs3proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------
> >  fs/nfsd/nfsd.h     |  1 +
> >  fs/nfsd/nfsproc.c  |  4 ++--
> >  fs/nfsd/vfs.c      | 21 ++++++++++++++-----
> >  fs/nfsd/vfs.h      |  5 +++--
> >  fs/nfsd/xdr3.h     |  3 +++
> >  7 files changed, 77 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > index 84b0c8b559dc90bd5c2d9d5e15c8e0682c0d610c..b007718dd959bc081166ec84e06f577a8fc2b46b 100644
> > --- a/fs/nfsd/debugfs.c
> > +++ b/fs/nfsd/debugfs.c
> > @@ -44,4 +44,6 @@ void nfsd_debugfs_init(void)
> >  
> >  	debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
> >  			    nfsd_top_dir, NULL, &nfsd_dsr_fops);
> > +	debugfs_create_bool("enable-fadvise-dontneed", 0644,
> > +			    nfsd_top_dir, &nfsd_enable_fadvise_dontneed);
> >  }
> > diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> > index b6d03e1ef5f7a5e8dd111b0d56c061f1e91abff7..11261cf67ea817ec566626f08b733e09c9e121de 100644
> > --- a/fs/nfsd/nfs3proc.c
> > +++ b/fs/nfsd/nfs3proc.c
> > @@ -9,6 +9,7 @@
> >  #include <linux/ext2_fs.h>
> >  #include <linux/magic.h>
> >  #include <linux/namei.h>
> > +#include <linux/fadvise.h>
> >  
> >  #include "cache.h"
> >  #include "xdr3.h"
> > @@ -206,11 +207,25 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
> >  
> >  	fh_copy(&resp->fh, &argp->fh);
> >  	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
> > -				 &resp->count, &resp->eof);
> > +				 &resp->count, &resp->eof, &resp->nf);
> >  	resp->status = nfsd3_map_status(resp->status);
> >  	return rpc_success;
> >  }
> >  
> > +static void
> > +nfsd3_release_read(struct svc_rqst *rqstp)
> > +{
> > +	struct nfsd3_readargs *argp = rqstp->rq_argp;
> > +	struct nfsd3_readres *resp = rqstp->rq_resp;
> > +
> > +	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok)
> > +		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
> > +				POSIX_FADV_DONTNEED);
> > +	if (resp->nf)
> > +		nfsd_file_put(resp->nf);
> > +	fh_put(&resp->fh);
> 
> This looks wrong - testing resp->nf after assuming it was non-NULL.
> I don't think it *is* wrong because ->state == nfs_ok ensures
> ->nf is valid. But still....
> 

That was my thinking, but I agree that it's a bit fragile.

> How about:
> 
>     fh_put(resp->fh);
>     if (!resp->nf)
>          return;
>     if (nfsd_enable_fadvise_dontneed)
> 	generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
> 		POSIX_FADV_DONTNEED);
>     nfsd_file_put(resp->nf);
> 
> ??
> Note that we don't test ->status because that is identical to testing ->nf.
> Ditto for other release functions.
> 

That looks good. I'll plan to do that in the next respin.

> Otherwise it makes sense for exploring how to optimise IO.
> 
> Reviewed-by: NeilBrown <neil@brown.name>
> 
> NeilBrown


Thanks for the reviews!
-- 
Jeff Layton <jlayton@kernel.org>
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by Chuck Lever 3 months ago
On 7/3/25 3:53 PM, Jeff Layton wrote:
> Recent testing has shown that keeping pagecache pages around for too
> long can be detrimental to performance with nfsd. Clients only rarely
> revisit the same data, so the pages tend to just hang around.
> 
> This patch changes the pc_release callbacks for NFSv3 READ, WRITE and
> COMMIT to call generic_fadvise(..., POSIX_FADV_DONTNEED) on the accessed
> range.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/debugfs.c  |  2 ++
>  fs/nfsd/nfs3proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------
>  fs/nfsd/nfsd.h     |  1 +
>  fs/nfsd/nfsproc.c  |  4 ++--
>  fs/nfsd/vfs.c      | 21 ++++++++++++++-----
>  fs/nfsd/vfs.h      |  5 +++--
>  fs/nfsd/xdr3.h     |  3 +++
>  7 files changed, 77 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> index 84b0c8b559dc90bd5c2d9d5e15c8e0682c0d610c..b007718dd959bc081166ec84e06f577a8fc2b46b 100644
> --- a/fs/nfsd/debugfs.c
> +++ b/fs/nfsd/debugfs.c
> @@ -44,4 +44,6 @@ void nfsd_debugfs_init(void)
>  
>  	debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
>  			    nfsd_top_dir, NULL, &nfsd_dsr_fops);
> +	debugfs_create_bool("enable-fadvise-dontneed", 0644,
> +			    nfsd_top_dir, &nfsd_enable_fadvise_dontneed);

I prefer that this setting is folded into the new io_cache_read /
io_cache_write tune-ables that Mike's patch adds, rather than adding
a new boolean.

That might make a hybrid "DONTCACHE for READ and fadvise for WRITE"
pretty easy.


>  }
> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> index b6d03e1ef5f7a5e8dd111b0d56c061f1e91abff7..11261cf67ea817ec566626f08b733e09c9e121de 100644
> --- a/fs/nfsd/nfs3proc.c
> +++ b/fs/nfsd/nfs3proc.c
> @@ -9,6 +9,7 @@
>  #include <linux/ext2_fs.h>
>  #include <linux/magic.h>
>  #include <linux/namei.h>
> +#include <linux/fadvise.h>
>  
>  #include "cache.h"
>  #include "xdr3.h"
> @@ -206,11 +207,25 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
>  
>  	fh_copy(&resp->fh, &argp->fh);
>  	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
> -				 &resp->count, &resp->eof);
> +				 &resp->count, &resp->eof, &resp->nf);
>  	resp->status = nfsd3_map_status(resp->status);
>  	return rpc_success;
>  }
>  
> +static void
> +nfsd3_release_read(struct svc_rqst *rqstp)
> +{
> +	struct nfsd3_readargs *argp = rqstp->rq_argp;
> +	struct nfsd3_readres *resp = rqstp->rq_resp;
> +
> +	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok)
> +		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
> +				POSIX_FADV_DONTNEED);
> +	if (resp->nf)
> +		nfsd_file_put(resp->nf);
> +	fh_put(&resp->fh);
> +}
> +
>  /*
>   * Write data to a file
>   */
> @@ -236,12 +251,26 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
>  	resp->committed = argp->stable;
>  	resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
>  				  &argp->payload, &cnt,
> -				  resp->committed, resp->verf);
> +				  resp->committed, resp->verf, &resp->nf);
>  	resp->count = cnt;
>  	resp->status = nfsd3_map_status(resp->status);
>  	return rpc_success;
>  }
>  
> +static void
> +nfsd3_release_write(struct svc_rqst *rqstp)
> +{
> +	struct nfsd3_writeargs *argp = rqstp->rq_argp;
> +	struct nfsd3_writeres *resp = rqstp->rq_resp;
> +
> +	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok && resp->committed)
> +		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, resp->count,
> +				POSIX_FADV_DONTNEED);
> +	if (resp->nf)
> +		nfsd_file_put(resp->nf);
> +	fh_put(&resp->fh);
> +}
> +
>  /*
>   * Implement NFSv3's unchecked, guarded, and exclusive CREATE
>   * semantics for regular files. Except for the created file,
> @@ -748,7 +777,6 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
>  {
>  	struct nfsd3_commitargs *argp = rqstp->rq_argp;
>  	struct nfsd3_commitres *resp = rqstp->rq_resp;
> -	struct nfsd_file *nf;
>  
>  	dprintk("nfsd: COMMIT(3)   %s %u@%Lu\n",
>  				SVCFH_fmt(&argp->fh),
> @@ -757,17 +785,30 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
>  
>  	fh_copy(&resp->fh, &argp->fh);
>  	resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
> -					    NFSD_MAY_NOT_BREAK_LEASE, &nf);
> +					    NFSD_MAY_NOT_BREAK_LEASE, &resp->nf);
>  	if (resp->status)
>  		goto out;
> -	resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
> +	resp->status = nfsd_commit(rqstp, &resp->fh, resp->nf, argp->offset,
>  				   argp->count, resp->verf);
> -	nfsd_file_put(nf);
>  out:
>  	resp->status = nfsd3_map_status(resp->status);
>  	return rpc_success;
>  }
>  
> +static void
> +nfsd3_release_commit(struct svc_rqst *rqstp)
> +{
> +	struct nfsd3_commitargs *argp = rqstp->rq_argp;
> +	struct nfsd3_commitres *resp = rqstp->rq_resp;
> +
> +	if (nfsd_enable_fadvise_dontneed && resp->status == nfs_ok)
> +		generic_fadvise(nfsd_file_file(resp->nf), argp->offset, argp->count,
> +				POSIX_FADV_DONTNEED);
> +	if (resp->nf)
> +		nfsd_file_put(resp->nf);
> +	fh_put(&resp->fh);
> +}
> +
>  
>  /*
>   * NFSv3 Server procedures.
> @@ -864,7 +905,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
>  		.pc_func = nfsd3_proc_read,
>  		.pc_decode = nfs3svc_decode_readargs,
>  		.pc_encode = nfs3svc_encode_readres,
> -		.pc_release = nfs3svc_release_fhandle,
> +		.pc_release = nfsd3_release_read,
>  		.pc_argsize = sizeof(struct nfsd3_readargs),
>  		.pc_argzero = sizeof(struct nfsd3_readargs),
>  		.pc_ressize = sizeof(struct nfsd3_readres),
> @@ -876,7 +917,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
>  		.pc_func = nfsd3_proc_write,
>  		.pc_decode = nfs3svc_decode_writeargs,
>  		.pc_encode = nfs3svc_encode_writeres,
> -		.pc_release = nfs3svc_release_fhandle,
> +		.pc_release = nfsd3_release_write,
>  		.pc_argsize = sizeof(struct nfsd3_writeargs),
>  		.pc_argzero = sizeof(struct nfsd3_writeargs),
>  		.pc_ressize = sizeof(struct nfsd3_writeres),
> @@ -1039,7 +1080,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
>  		.pc_func = nfsd3_proc_commit,
>  		.pc_decode = nfs3svc_decode_commitargs,
>  		.pc_encode = nfs3svc_encode_commitres,
> -		.pc_release = nfs3svc_release_fhandle,
> +		.pc_release = nfsd3_release_commit,
>  		.pc_argsize = sizeof(struct nfsd3_commitargs),
>  		.pc_argzero = sizeof(struct nfsd3_commitargs),
>  		.pc_ressize = sizeof(struct nfsd3_commitres),
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index 1cd0bed57bc2f27248fd66a8efef692a5e9a390c..288904d88b9245c03eae0aa347e867037b7c85c5 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -152,6 +152,7 @@ static inline void nfsd_debugfs_exit(void) {}
>  #endif
>  
>  extern bool nfsd_disable_splice_read __read_mostly;
> +extern bool nfsd_enable_fadvise_dontneed __read_mostly;
>  
>  extern int nfsd_max_blksize;
>  
> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
> index 8f71f5748c75b69f15bae5e63799842e0e8b3139..bb8f98adb3e31b10adc4694987f8f5036726bf7f 100644
> --- a/fs/nfsd/nfsproc.c
> +++ b/fs/nfsd/nfsproc.c
> @@ -225,7 +225,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
>  	resp->count = argp->count;
>  	fh_copy(&resp->fh, &argp->fh);
>  	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
> -				 &resp->count, &eof);
> +				 &resp->count, &eof, NULL);
>  	if (resp->status == nfs_ok)
>  		resp->status = fh_getattr(&resp->fh, &resp->stat);
>  	else if (resp->status == nfserr_jukebox)
> @@ -258,7 +258,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
>  
>  	fh_copy(&resp->fh, &argp->fh);
>  	resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
> -				  &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
> +				  &argp->payload, &cnt, NFS_DATA_SYNC, NULL, NULL);
>  	if (resp->status == nfs_ok)
>  		resp->status = fh_getattr(&resp->fh, &resp->stat);
>  	else if (resp->status == nfserr_jukebox)
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index ee78b6fb17098b788b07f5cd90598e678244b57e..f23eb3a07bb99dc231be9ea6db4e58b9e328a689 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -49,6 +49,7 @@
>  #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
>  
>  bool nfsd_disable_splice_read __read_mostly;
> +bool nfsd_enable_fadvise_dontneed __read_mostly;
>  
>  /**
>   * nfserrno - Map Linux errnos to NFS errnos
> @@ -1280,6 +1281,7 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
>   * @offset: starting byte offset
>   * @count: IN: requested number of bytes; OUT: number of bytes read
>   * @eof: OUT: set non-zero if operation reached the end of the file
> + * @pnf: optional return pointer to pass back nfsd_file reference
>   *
>   * The caller must verify that there is enough space in @rqstp.rq_res
>   * to perform this operation.
> @@ -1290,7 +1292,8 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
>   * returned.
>   */
>  __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> -		 loff_t offset, unsigned long *count, u32 *eof)
> +		 loff_t offset, unsigned long *count, u32 *eof,
> +		 struct nfsd_file **pnf)
>  {
>  	struct nfsd_file	*nf;
>  	struct file *file;
> @@ -1307,7 +1310,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  	else
>  		err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
>  
> -	nfsd_file_put(nf);
> +	if (pnf && err == nfs_ok)
> +		*pnf = nf;
> +	else
> +		nfsd_file_put(nf);
>  	trace_nfsd_read_done(rqstp, fhp, offset, *count);
>  	return err;
>  }
> @@ -1321,8 +1327,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>   * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
>   * @stable: An NFS stable_how value
>   * @verf: NFS WRITE verifier
> + * @pnf: optional return pointer to pass back nfsd_file reference
>   *
> - * Upon return, caller must invoke fh_put on @fhp.
> + * Upon return, caller must invoke fh_put() on @fhp. If it sets @pnf,
> + * then it must also call nfsd_file_put() on the returned reference.
>   *
>   * Return values:
>   *   An nfsstat value in network byte order.
> @@ -1330,7 +1338,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  __be32
>  nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>  	   const struct xdr_buf *payload, unsigned long *cnt, int stable,
> -	   __be32 *verf)
> +	   __be32 *verf, struct nfsd_file **pnf)
>  {
>  	struct nfsd_file *nf;
>  	__be32 err;
> @@ -1343,7 +1351,10 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>  
>  	err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
>  			     stable, verf);
> -	nfsd_file_put(nf);
> +	if (pnf && err == nfs_ok)
> +		*pnf = nf;
> +	else
> +		nfsd_file_put(nf);
>  out:
>  	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
>  	return err;
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index eff04959606fe55c141ab4a2eed97c7e0716a5f5..2d063ee7786f499f34c39cd3ba7e776bb7562d57 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -127,10 +127,11 @@ __be32		nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  bool		nfsd_read_splice_ok(struct svc_rqst *rqstp);
>  __be32		nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  				loff_t offset, unsigned long *count,
> -				u32 *eof);
> +				u32 *eof, struct nfsd_file **pnf);
>  __be32		nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  				loff_t offset, const struct xdr_buf *payload,
> -				unsigned long *cnt, int stable, __be32 *verf);
> +				unsigned long *cnt, int stable, __be32 *verf,
> +				struct nfsd_file **pnf);
>  __be32		nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  				struct nfsd_file *nf, loff_t offset,
>  				const struct xdr_buf *payload,
> diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
> index 522067b7fd755930a1c2e42b826d9132ac2993be..3f4c51983003188be51a0f8c2db2e0acc9a1363b 100644
> --- a/fs/nfsd/xdr3.h
> +++ b/fs/nfsd/xdr3.h
> @@ -146,6 +146,7 @@ struct nfsd3_readres {
>  	unsigned long		count;
>  	__u32			eof;
>  	struct page		**pages;
> +	struct nfsd_file	*nf;
>  };
>  
>  struct nfsd3_writeres {
> @@ -154,6 +155,7 @@ struct nfsd3_writeres {
>  	unsigned long		count;
>  	int			committed;
>  	__be32			verf[2];
> +	struct nfsd_file	*nf;
>  };
>  
>  struct nfsd3_renameres {
> @@ -217,6 +219,7 @@ struct nfsd3_commitres {
>  	__be32			status;
>  	struct svc_fh		fh;
>  	__be32			verf[2];
> +	struct nfsd_file	*nf;
>  };
>  
>  struct nfsd3_getaclres {
> 


-- 
Chuck Lever
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by Mike Snitzer 3 months ago
On Thu, Jul 03, 2025 at 04:07:51PM -0400, Chuck Lever wrote:
> On 7/3/25 3:53 PM, Jeff Layton wrote:
> > Recent testing has shown that keeping pagecache pages around for too
> > long can be detrimental to performance with nfsd. Clients only rarely
> > revisit the same data, so the pages tend to just hang around.
> > 
> > This patch changes the pc_release callbacks for NFSv3 READ, WRITE and
> > COMMIT to call generic_fadvise(..., POSIX_FADV_DONTNEED) on the accessed
> > range.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/nfsd/debugfs.c  |  2 ++
> >  fs/nfsd/nfs3proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------
> >  fs/nfsd/nfsd.h     |  1 +
> >  fs/nfsd/nfsproc.c  |  4 ++--
> >  fs/nfsd/vfs.c      | 21 ++++++++++++++-----
> >  fs/nfsd/vfs.h      |  5 +++--
> >  fs/nfsd/xdr3.h     |  3 +++
> >  7 files changed, 77 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > index 84b0c8b559dc90bd5c2d9d5e15c8e0682c0d610c..b007718dd959bc081166ec84e06f577a8fc2b46b 100644
> > --- a/fs/nfsd/debugfs.c
> > +++ b/fs/nfsd/debugfs.c
> > @@ -44,4 +44,6 @@ void nfsd_debugfs_init(void)
> >  
> >  	debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
> >  			    nfsd_top_dir, NULL, &nfsd_dsr_fops);
> > +	debugfs_create_bool("enable-fadvise-dontneed", 0644,
> > +			    nfsd_top_dir, &nfsd_enable_fadvise_dontneed);
> 
> I prefer that this setting is folded into the new io_cache_read /
> io_cache_write tune-ables that Mike's patch adds, rather than adding
> a new boolean.
> 
> That might make a hybrid "DONTCACHE for READ and fadvise for WRITE"
> pretty easy.

That'd be really easy.  Jeff, maybe have a look to rebase your changes
on this patchset:
https://lore.kernel.org/linux-nfs/20250708160619.64800-1-snitzer@kernel.org/

Ontop of this patch in particular:
https://lore.kernel.org/linux-nfs/20250708160619.64800-8-snitzer@kernel.org/

My git branch with this patchset at the tip is available here:
https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/log/?h=kernel-6.12.24/nfsd-testing-snitm

Mike
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by Jeff Layton 3 months ago
On Thu, 2025-07-03 at 16:07 -0400, Chuck Lever wrote:
> On 7/3/25 3:53 PM, Jeff Layton wrote:
> > Recent testing has shown that keeping pagecache pages around for too
> > long can be detrimental to performance with nfsd. Clients only rarely
> > revisit the same data, so the pages tend to just hang around.
> > 
> > This patch changes the pc_release callbacks for NFSv3 READ, WRITE and
> > COMMIT to call generic_fadvise(..., POSIX_FADV_DONTNEED) on the accessed
> > range.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/nfsd/debugfs.c  |  2 ++
> >  fs/nfsd/nfs3proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------
> >  fs/nfsd/nfsd.h     |  1 +
> >  fs/nfsd/nfsproc.c  |  4 ++--
> >  fs/nfsd/vfs.c      | 21 ++++++++++++++-----
> >  fs/nfsd/vfs.h      |  5 +++--
> >  fs/nfsd/xdr3.h     |  3 +++
> >  7 files changed, 77 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > index 84b0c8b559dc90bd5c2d9d5e15c8e0682c0d610c..b007718dd959bc081166ec84e06f577a8fc2b46b 100644
> > --- a/fs/nfsd/debugfs.c
> > +++ b/fs/nfsd/debugfs.c
> > @@ -44,4 +44,6 @@ void nfsd_debugfs_init(void)
> >  
> >  	debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
> >  			    nfsd_top_dir, NULL, &nfsd_dsr_fops);
> > +	debugfs_create_bool("enable-fadvise-dontneed", 0644,
> > +			    nfsd_top_dir, &nfsd_enable_fadvise_dontneed);
> 
> I prefer that this setting is folded into the new io_cache_read /
> io_cache_write tune-ables that Mike's patch adds, rather than adding
> a new boolean.
> 
> That might make a hybrid "DONTCACHE for READ and fadvise for WRITE"
> pretty easy.
> 

I ended up rebasing Mike's dontcache branch on top of v6.16-rc5 with
all of Chuck's trees in. I then added the attached patch and did some
testing with a couple of machines I checked out internally at Meta.
This is the throughput results with the fio-seq-RW test with the file
size set to 100G and the duration at 5 mins.

Note that:

read and writes buffered:
   READ: bw=3024MiB/s (3171MB/s), 186MiB/s-191MiB/s (195MB/s-201MB/s), io=889GiB (954GB), run=300012-300966msec
  WRITE: bw=2015MiB/s (2113MB/s), 124MiB/s-128MiB/s (131MB/s-134MB/s), io=592GiB (636GB), run=300012-300966msec

   READ: bw=2902MiB/s (3043MB/s), 177MiB/s-183MiB/s (186MB/s-192MB/s), io=851GiB (913GB), run=300027-300118msec
  WRITE: bw=1934MiB/s (2027MB/s), 119MiB/s-122MiB/s (124MB/s-128MB/s), io=567GiB (608GB), run=300027-300118msec

   READ: bw=2897MiB/s (3037MB/s), 178MiB/s-183MiB/s (186MB/s-192MB/s), io=849GiB (911GB), run=300006-300078msec
  WRITE: bw=1930MiB/s (2023MB/s), 119MiB/s-122MiB/s (125MB/s-128MB/s), io=565GiB (607GB), run=300006-300078msec

reads and writes RWF_DONTCACHE:
   READ: bw=3090MiB/s (3240MB/s), 190MiB/s-195MiB/s (199MB/s-205MB/s), io=906GiB (972GB), run=300015-300113msec
  WRITE: bw=2060MiB/s (2160MB/s), 126MiB/s-130MiB/s (132MB/s-137MB/s), io=604GiB (648GB), run=300015-300113msec

   READ: bw=3057MiB/s (3205MB/s), 188MiB/s-193MiB/s (198MB/s-203MB/s), io=897GiB (963GB), run=300329-300450msec
  WRITE: bw=2037MiB/s (2136MB/s), 126MiB/s-129MiB/s (132MB/s-135MB/s), io=598GiB (642GB), run=300329-300450msec

   READ: bw=3166MiB/s (3320MB/s), 196MiB/s-200MiB/s (205MB/s-210MB/s), io=928GiB (996GB), run=300021-300090msec
  WRITE: bw=2111MiB/s (2213MB/s), 131MiB/s-133MiB/s (137MB/s-140MB/s), io=619GiB (664GB), run=300021-300090msec

reads and writes witg O_DIRECT:
   READ: bw=3115MiB/s (3267MB/s), 192MiB/s-198MiB/s (201MB/s-208MB/s), io=913GiB (980GB), run=300025-300078msec
  WRITE: bw=2077MiB/s (2178MB/s), 128MiB/s-131MiB/s (134MB/s-138MB/s), io=609GiB (653GB), run=300025-300078msec

   READ: bw=3189MiB/s (3343MB/s), 197MiB/s-202MiB/s (207MB/s-211MB/s), io=934GiB (1003GB), run=300023-300096msec
  WRITE: bw=2125MiB/s (2228MB/s), 132MiB/s-134MiB/s (138MB/s-140MB/s), io=623GiB (669GB), run=300023-300096msec

   READ: bw=3113MiB/s (3264MB/s), 191MiB/s-197MiB/s (200MB/s-207MB/s), io=912GiB (979GB), run=300020-300098msec
  WRITE: bw=2075MiB/s (2175MB/s), 127MiB/s-131MiB/s (134MB/s-138MB/s), io=608GiB (653GB), run=300020-300098msec

RWF_DONTCACHE on reads and stable writes + fadvise DONTNEED after COMMIT:
   READ: bw=2888MiB/s (3029MB/s), 178MiB/s-182MiB/s (187MB/s-191MB/s), io=846GiB (909GB), run=300012-300109msec
  WRITE: bw=1924MiB/s (2017MB/s), 118MiB/s-121MiB/s (124MB/s-127MB/s), io=564GiB (605GB), run=300012-300109msec

   READ: bw=2899MiB/s (3040MB/s), 180MiB/s-183MiB/s (188MB/s-192MB/s), io=852GiB (915GB), run=300022-300940msec
  WRITE: bw=1931MiB/s (2025MB/s), 119MiB/s-122MiB/s (125MB/s-128MB/s), io=567GiB (609GB), run=300022-300940msec

   READ: bw=2902MiB/s (3043MB/s), 179MiB/s-184MiB/s (188MB/s-193MB/s), io=853GiB (916GB), run=300913-301146msec
  WRITE: bw=1933MiB/s (2027MB/s), 119MiB/s-122MiB/s (125MB/s-128MB/s), io=568GiB (610GB), run=300913-301146msec


The fadvise case is clearly slower than the others. Interestingly it
also slowed down read performance, which leads me to believe that maybe
the fadvise calls were interfering with concurrent reads. Given the
disappointing numbers, I'll probably drop the last patch.

There is probably a case to be made for patch #1, on the general
principle of expediting sending the reply as much as possible. Chuck,
let me know if you want me to submit that individually.
-- 
Jeff Layton <jlayton@kernel.org>
Re: [PATCH RFC 2/2] nfsd: call generic_fadvise after v3 READ, stable WRITE or COMMIT
Posted by Mike Snitzer 3 months ago
On Tue, Jul 08, 2025 at 10:34:15AM -0400, Jeff Layton wrote:
> On Thu, 2025-07-03 at 16:07 -0400, Chuck Lever wrote:
> > On 7/3/25 3:53 PM, Jeff Layton wrote:
> > > Recent testing has shown that keeping pagecache pages around for too
> > > long can be detrimental to performance with nfsd. Clients only rarely
> > > revisit the same data, so the pages tend to just hang around.
> > > 
> > > This patch changes the pc_release callbacks for NFSv3 READ, WRITE and
> > > COMMIT to call generic_fadvise(..., POSIX_FADV_DONTNEED) on the accessed
> > > range.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/nfsd/debugfs.c  |  2 ++
> > >  fs/nfsd/nfs3proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------
> > >  fs/nfsd/nfsd.h     |  1 +
> > >  fs/nfsd/nfsproc.c  |  4 ++--
> > >  fs/nfsd/vfs.c      | 21 ++++++++++++++-----
> > >  fs/nfsd/vfs.h      |  5 +++--
> > >  fs/nfsd/xdr3.h     |  3 +++
> > >  7 files changed, 77 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> > > index 84b0c8b559dc90bd5c2d9d5e15c8e0682c0d610c..b007718dd959bc081166ec84e06f577a8fc2b46b 100644
> > > --- a/fs/nfsd/debugfs.c
> > > +++ b/fs/nfsd/debugfs.c
> > > @@ -44,4 +44,6 @@ void nfsd_debugfs_init(void)
> > >  
> > >  	debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
> > >  			    nfsd_top_dir, NULL, &nfsd_dsr_fops);
> > > +	debugfs_create_bool("enable-fadvise-dontneed", 0644,
> > > +			    nfsd_top_dir, &nfsd_enable_fadvise_dontneed);
> > 
> > I prefer that this setting is folded into the new io_cache_read /
> > io_cache_write tune-ables that Mike's patch adds, rather than adding
> > a new boolean.
> > 
> > That might make a hybrid "DONTCACHE for READ and fadvise for WRITE"
> > pretty easy.
> > 
> 
> I ended up rebasing Mike's dontcache branch on top of v6.16-rc5 with
> all of Chuck's trees in. I then added the attached patch and did some
> testing with a couple of machines I checked out internally at Meta.
> This is the throughput results with the fio-seq-RW test with the file
> size set to 100G and the duration at 5 mins.
> 
> Note that:
> 
> read and writes buffered:
>    READ: bw=3024MiB/s (3171MB/s), 186MiB/s-191MiB/s (195MB/s-201MB/s), io=889GiB (954GB), run=300012-300966msec
>   WRITE: bw=2015MiB/s (2113MB/s), 124MiB/s-128MiB/s (131MB/s-134MB/s), io=592GiB (636GB), run=300012-300966msec
> 
>    READ: bw=2902MiB/s (3043MB/s), 177MiB/s-183MiB/s (186MB/s-192MB/s), io=851GiB (913GB), run=300027-300118msec
>   WRITE: bw=1934MiB/s (2027MB/s), 119MiB/s-122MiB/s (124MB/s-128MB/s), io=567GiB (608GB), run=300027-300118msec
> 
>    READ: bw=2897MiB/s (3037MB/s), 178MiB/s-183MiB/s (186MB/s-192MB/s), io=849GiB (911GB), run=300006-300078msec
>   WRITE: bw=1930MiB/s (2023MB/s), 119MiB/s-122MiB/s (125MB/s-128MB/s), io=565GiB (607GB), run=300006-300078msec
> 
> reads and writes RWF_DONTCACHE:
>    READ: bw=3090MiB/s (3240MB/s), 190MiB/s-195MiB/s (199MB/s-205MB/s), io=906GiB (972GB), run=300015-300113msec
>   WRITE: bw=2060MiB/s (2160MB/s), 126MiB/s-130MiB/s (132MB/s-137MB/s), io=604GiB (648GB), run=300015-300113msec
> 
>    READ: bw=3057MiB/s (3205MB/s), 188MiB/s-193MiB/s (198MB/s-203MB/s), io=897GiB (963GB), run=300329-300450msec
>   WRITE: bw=2037MiB/s (2136MB/s), 126MiB/s-129MiB/s (132MB/s-135MB/s), io=598GiB (642GB), run=300329-300450msec
> 
>    READ: bw=3166MiB/s (3320MB/s), 196MiB/s-200MiB/s (205MB/s-210MB/s), io=928GiB (996GB), run=300021-300090msec
>   WRITE: bw=2111MiB/s (2213MB/s), 131MiB/s-133MiB/s (137MB/s-140MB/s), io=619GiB (664GB), run=300021-300090msec
> 
> reads and writes witg O_DIRECT:
>    READ: bw=3115MiB/s (3267MB/s), 192MiB/s-198MiB/s (201MB/s-208MB/s), io=913GiB (980GB), run=300025-300078msec
>   WRITE: bw=2077MiB/s (2178MB/s), 128MiB/s-131MiB/s (134MB/s-138MB/s), io=609GiB (653GB), run=300025-300078msec
> 
>    READ: bw=3189MiB/s (3343MB/s), 197MiB/s-202MiB/s (207MB/s-211MB/s), io=934GiB (1003GB), run=300023-300096msec
>   WRITE: bw=2125MiB/s (2228MB/s), 132MiB/s-134MiB/s (138MB/s-140MB/s), io=623GiB (669GB), run=300023-300096msec
> 
>    READ: bw=3113MiB/s (3264MB/s), 191MiB/s-197MiB/s (200MB/s-207MB/s), io=912GiB (979GB), run=300020-300098msec
>   WRITE: bw=2075MiB/s (2175MB/s), 127MiB/s-131MiB/s (134MB/s-138MB/s), io=608GiB (653GB), run=300020-300098msec
> 
> RWF_DONTCACHE on reads and stable writes + fadvise DONTNEED after COMMIT:
>    READ: bw=2888MiB/s (3029MB/s), 178MiB/s-182MiB/s (187MB/s-191MB/s), io=846GiB (909GB), run=300012-300109msec
>   WRITE: bw=1924MiB/s (2017MB/s), 118MiB/s-121MiB/s (124MB/s-127MB/s), io=564GiB (605GB), run=300012-300109msec
> 
>    READ: bw=2899MiB/s (3040MB/s), 180MiB/s-183MiB/s (188MB/s-192MB/s), io=852GiB (915GB), run=300022-300940msec
>   WRITE: bw=1931MiB/s (2025MB/s), 119MiB/s-122MiB/s (125MB/s-128MB/s), io=567GiB (609GB), run=300022-300940msec
> 
>    READ: bw=2902MiB/s (3043MB/s), 179MiB/s-184MiB/s (188MB/s-193MB/s), io=853GiB (916GB), run=300913-301146msec
>   WRITE: bw=1933MiB/s (2027MB/s), 119MiB/s-122MiB/s (125MB/s-128MB/s), io=568GiB (610GB), run=300913-301146msec
> 
> 
> The fadvise case is clearly slower than the others. Interestingly it
> also slowed down read performance, which leads me to believe that maybe
> the fadvise calls were interfering with concurrent reads. Given the
> disappointing numbers, I'll probably drop the last patch.
> 
> There is probably a case to be made for patch #1, on the general
> principle of expediting sending the reply as much as possible. Chuck,
> let me know if you want me to submit that individually.
> -- 
> Jeff Layton <jlayton@kernel.org>

> From 14958516bf45f92a8609cb6ad504e92550b416d7 Mon Sep 17 00:00:00 2001
> From: Jeff Layton <jlayton@kernel.org>
> Date: Mon, 7 Jul 2025 11:00:34 -0400
> Subject: [PATCH] nfsd: add a NFSD_IO_FADVISE setting to io_cache_write
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>

Ah, you raced ahead and did what I suggested in my previous reply just
now.  Too bad the performance is lacking... but I applaud you're
trying out a new/interesting idea!

BTW, measuring CPU and memory use is also important to capture to get
full picture.

Mike