[PATCH] NFSD: Impl multiple extents in block/scsi layoutget

Sergey Bashirov posted 1 patch 3 weeks ago
fs/nfsd/blocklayout.c    | 167 +++++++++++++++++++++++++++++----------
fs/nfsd/blocklayoutxdr.c |  36 ++++++---
fs/nfsd/blocklayoutxdr.h |   5 ++
3 files changed, 157 insertions(+), 51 deletions(-)
[PATCH] NFSD: Impl multiple extents in block/scsi layoutget
Posted by Sergey Bashirov 3 weeks ago
This patch allows the pNFS server to respond with multiple extents
in a layoutget request. As a result, the number of layoutget requests
is significantly reduced for various file access patterns, including
random and parallel writes, avoiding unnecessary load to the server.
On the client side, this improves the performance of writing large
files and allows requesting layouts with minimum length greater than
PAGE_SIZE.

Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
---
Checked with smatch, tested on pNFS block volume setup.

 fs/nfsd/blocklayout.c    | 167 +++++++++++++++++++++++++++++----------
 fs/nfsd/blocklayoutxdr.c |  36 ++++++---
 fs/nfsd/blocklayoutxdr.h |   5 ++
 3 files changed, 157 insertions(+), 51 deletions(-)

diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index fde5539cf6a6..d53f3ec8823a 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -17,48 +17,39 @@
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
 
+/**
+ * nfsd4_block_map_extent - get extent that covers the start of segment
+ * @inode: inode of the file requested by the client
+ * @fhp: handle of the file requested by the client
+ * @seg: layout subrange requested by the client
+ * @minlength: layout min length requested by the client
+ * @bex: output block extent structure
+ *
+ * Get an extent from the file system that starts at @seg->offset or below,
+ * but may be shorter than @seg->length.
+ *
+ * Return values:
+ *   %nfs_ok: Success, @bex is initialized and valid
+ *   %nfserr_layoutunavailable: Failed to get extent for requested @seg
+ *   OS errors converted to NFS errors
+ */
 static __be32
-nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
-		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
+nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp,
+		const struct nfsd4_layout_seg *seg, u64 minlength,
+		struct pnfs_block_extent *bex)
 {
-	struct nfsd4_layout_seg *seg = &args->lg_seg;
 	struct super_block *sb = inode->i_sb;
-	u32 block_size = i_blocksize(inode);
-	struct pnfs_block_extent *bex;
 	struct iomap iomap;
 	u32 device_generation = 0;
 	int error;
 
-	if (locks_in_grace(SVC_NET(rqstp)))
-		return nfserr_grace;
-
-	if (seg->offset & (block_size - 1)) {
-		dprintk("pnfsd: I/O misaligned\n");
-		goto out_layoutunavailable;
-	}
-
-	/*
-	 * Some clients barf on non-zero block numbers for NONE or INVALID
-	 * layouts, so make sure to zero the whole structure.
-	 */
-	error = -ENOMEM;
-	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
-	if (!bex)
-		goto out_error;
-	args->lg_content = bex;
-
 	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
 					    &iomap, seg->iomode != IOMODE_READ,
 					    &device_generation);
 	if (error) {
 		if (error == -ENXIO)
-			goto out_layoutunavailable;
-		goto out_error;
-	}
-
-	if (iomap.length < args->lg_minlength) {
-		dprintk("pnfsd: extent smaller than minlength\n");
-		goto out_layoutunavailable;
+			return nfserr_layoutunavailable;
+		return nfserrno(error);
 	}
 
 	switch (iomap.type) {
@@ -74,9 +65,9 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
 			/*
 			 * Crack monkey special case from section 2.3.1.
 			 */
-			if (args->lg_minlength == 0) {
+			if (minlength == 0) {
 				dprintk("pnfsd: no soup for you!\n");
-				goto out_layoutunavailable;
+				return nfserr_layoutunavailable;
 			}
 
 			bex->es = PNFS_BLOCK_INVALID_DATA;
@@ -93,27 +84,119 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
 	case IOMAP_DELALLOC:
 	default:
 		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
-		goto out_layoutunavailable;
+		return nfserr_layoutunavailable;
 	}
 
 	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
 	if (error)
-		goto out_error;
+		return nfserrno(error);
+
 	bex->foff = iomap.offset;
 	bex->len = iomap.length;
+	return nfs_ok;
+}
 
-	seg->offset = iomap.offset;
-	seg->length = iomap.length;
+/**
+ * nfsd4_block_map_segment - get extent array for the requested layout
+ * @inode: inode of the file requested by the client
+ * @fhp: handle of the file requested by the client
+ * @seg: layout range requested by the client
+ * @minlength: layout min length requested by the client
+ * @bl: output array of block extents
+ *
+ * Get an array of consecutive block extents that span the requested
+ * layout range. The resulting range may be shorter than requested if
+ * all preallocated block extents are used.
+ *
+ * Return values:
+ *   %nfs_ok: Success, @bl initialized and valid
+ *   %nfserr_layoutunavailable: Failed to get extents for requested @seg
+ *   OS errors converted to NFS errors
+ */
+static __be32
+nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp,
+		const struct nfsd4_layout_seg *seg, u64 minlength,
+		struct pnfs_block_layout *bl)
+{
+	struct nfsd4_layout_seg subseg = *seg;
+	u32 i;
+	__be32 nfserr;
 
-	dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
-	return 0;
+	for (i = 0; i < bl->nr_extents; i++) {
+		struct pnfs_block_extent *extent = bl->extents + i;
+		u64 extent_len;
+
+		nfserr = nfsd4_block_map_extent(inode, fhp, &subseg,
+				minlength, extent);
+		if (nfserr != nfs_ok)
+			return nfserr;
+
+		extent_len = extent->len - (subseg.offset - extent->foff);
+		if (extent_len >= subseg.length) {
+			bl->nr_extents = i + 1;
+			return nfs_ok;
+		}
+
+		subseg.offset = extent->foff + extent->len;
+		subseg.length -= extent_len;
+	}
+
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
+		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
+{
+	struct nfsd4_layout_seg *seg = &args->lg_seg;
+	u64 seg_length;
+	struct pnfs_block_extent *first_bex, *last_bex;
+	struct pnfs_block_layout *bl;
+	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;
+	u32 block_size = i_blocksize(inode);
+	__be32 nfserr;
+
+	if (locks_in_grace(SVC_NET(rqstp)))
+		return nfserr_grace;
+
+	nfserr = nfserr_layoutunavailable;
+	if (seg->offset & (block_size - 1)) {
+		dprintk("pnfsd: I/O misaligned\n");
+		goto out_error;
+	}
+
+	/*
+	 * Some clients barf on non-zero block numbers for NONE or INVALID
+	 * layouts, so make sure to zero the whole structure.
+	 */
+	nfserr = nfserrno(-ENOMEM);
+	bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL);
+	if (!bl)
+		goto out_error;
+	bl->nr_extents = nr_extents_max;
+	args->lg_content = bl;
+
+	nfserr = nfsd4_block_map_segment(inode, fhp, seg,
+			args->lg_minlength, bl);
+	if (nfserr != nfs_ok)
+		goto out_error;
+	first_bex = bl->extents;
+	last_bex = bl->extents + bl->nr_extents - 1;
+
+	nfserr = nfserr_layoutunavailable;
+	seg_length = last_bex->foff + last_bex->len - seg->offset;
+	if (seg_length < args->lg_minlength) {
+		dprintk("pnfsd: extent smaller than minlength\n");
+		goto out_error;
+	}
+
+	seg->offset = first_bex->foff;
+	seg->length = last_bex->foff - first_bex->foff + last_bex->len;
+	return nfs_ok;
 
 out_error:
 	seg->length = 0;
-	return nfserrno(error);
-out_layoutunavailable:
-	seg->length = 0;
-	return nfserr_layoutunavailable;
+	return nfserr;
 }
 
 static __be32
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index e50afe340737..68c112d47cee 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -14,12 +14,25 @@
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
 
+/**
+ * nfsd4_block_encode_layoutget - encode block/scsi layout extent array
+ * @xdr: stream for data encoding
+ * @lgp: layoutget content, actually an array of extents to encode
+ *
+ * This function encodes the opaque loc_body field in the layoutget response.
+ * Since the pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire
+ * are the same, this function is used by both layout drivers.
+ *
+ * Return values:
+ *   %nfs_ok: Success, all extents encoded into @xdr
+ *   %nfserr_toosmall: Not enough space in @xdr to encode all the data
+ */
 __be32
 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
 		const struct nfsd4_layoutget *lgp)
 {
-	const struct pnfs_block_extent *b = lgp->lg_content;
-	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+	const struct pnfs_block_layout *bl = lgp->lg_content;
+	u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
@@ -27,14 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
 		return nfserr_toosmall;
 
 	*p++ = cpu_to_be32(len);
-	*p++ = cpu_to_be32(1);		/* we always return a single extent */
+	*p++ = cpu_to_be32(bl->nr_extents);
 
-	p = svcxdr_encode_deviceid4(p, &b->vol_id);
-	p = xdr_encode_hyper(p, b->foff);
-	p = xdr_encode_hyper(p, b->len);
-	p = xdr_encode_hyper(p, b->soff);
-	*p++ = cpu_to_be32(b->es);
-	return 0;
+	for (i = 0; i < bl->nr_extents; i++) {
+		const struct pnfs_block_extent *bex = bl->extents + i;
+
+		p = svcxdr_encode_deviceid4(p, &bex->vol_id);
+		p = xdr_encode_hyper(p, bex->foff);
+		p = xdr_encode_hyper(p, bex->len);
+		p = xdr_encode_hyper(p, bex->soff);
+		*p++ = cpu_to_be32(bex->es);
+	}
+
+	return nfs_ok;
 }
 
 static int
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 7d25ef689671..54fe7f517a94 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -21,6 +21,11 @@ struct pnfs_block_range {
 	u64				len;
 };
 
+struct pnfs_block_layout {
+	u32				nr_extents;
+	struct pnfs_block_extent	extents[] __counted_by(nr_extents);
+};
+
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
  * Not actually limited by the protocol.
-- 
2.43.0
Re: [PATCH] NFSD: Impl multiple extents in block/scsi layoutget
Posted by Christoph Hellwig 3 days, 10 hours ago
The subject line is odd.  What is impl supposed to mean?

I'd say something like:

nfsd/blocklayout: support multiple extent per LAYOUTGET

On Thu, Sep 11, 2025 at 07:02:03PM +0300, Sergey Bashirov wrote:
> This patch allows the pNFS server to respond with multiple extents

This patch is redundant and should generally be avoided in commit log.

> Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
> ---
> Checked with smatch, tested on pNFS block volume setup.

Any chance we could get testing for this added to pynfs?

Also what is this patch against?  If fails to apply to linux-next for
me.

> +/**
> + * nfsd4_block_map_extent - get extent that covers the start of segment
> + * @inode: inode of the file requested by the client
> + * @fhp: handle of the file requested by the client
> + * @seg: layout subrange requested by the client
> + * @minlength: layout min length requested by the client
> + * @bex: output block extent structure
> + *
> + * Get an extent from the file system that starts at @seg->offset or below,
> + * but may be shorter than @seg->length.
> + *
> + * Return values:
> + *   %nfs_ok: Success, @bex is initialized and valid
> + *   %nfserr_layoutunavailable: Failed to get extent for requested @seg
> + *   OS errors converted to NFS errors
> + */

I'm not sure how Chuck handles this, but kerneldoc comments for static
functions and thus not as public documentation can be really annoying
due to the verbose formatting of the paramters.  I'd generally avoid
them and instead of have short comments explaining the interesting bits
about the calling convention and/or functionality.

> +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp,
> +		const struct nfsd4_layout_seg *seg, u64 minlength,
> +		struct pnfs_block_extent *bex)
>  {
>  	struct super_block *sb = inode->i_sb;
>  	struct iomap iomap;
>  	u32 device_generation = 0;
>  	int error;
>  
> -	if (locks_in_grace(SVC_NET(rqstp)))
> -		return nfserr_grace;
> -
> -	if (seg->offset & (block_size - 1)) {
> -		dprintk("pnfsd: I/O misaligned\n");
> -		goto out_layoutunavailable;
> -	}
> -

There is a lot of code movement here mixed with functional changes,
which makes it very hard to follow.  Any chance you could first
split the functions, then introduce the new pnfs_block_layout structure
and only then actually add multiple extents per layoutget?  That'll
make it a lot easier there are no unintended changes.

> + *   %nfserr_layoutunavailable: Failed to get extents for requested @seg
> + *   OS errors converted to NFS errors
> + */
> +static __be32
> +nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp,
> +		const struct nfsd4_layout_seg *seg, u64 minlength,
> +		struct pnfs_block_layout *bl)

I struggle to see what the point of the nfsd4_block_map_segment helpers
is, as it seems to add no real value while making the code harder to
follow.

> +{
> +	struct nfsd4_layout_seg subseg = *seg;
> +	u32 i;
> +	__be32 nfserr;

Nit: nfserr can be local in the loop.

The subextent handling confuses me a bit - there is no such thing
as a subsegment in NFS.  I'd pass a never changing end of layout,
a moving offset, and a constant iomode directly, which should simply
the code quite a bit, i.e. something like:

	u64 map_start = seg->offset;
	u64 seg_end = seg->offset + seg->len;

	for (..) {

		nfserr = nfsd4_block_map_extent(inode, fhp, ...
				map_start, &end);
		if (end > seg_end) {
			bl->nr_extents = i + 1;
			break;
		}

		map_start = end;
	}


> +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
> +		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
> +{
> +	struct nfsd4_layout_seg *seg = &args->lg_seg;
> +	u64 seg_length;
> +	struct pnfs_block_extent *first_bex, *last_bex;
> +	struct pnfs_block_layout *bl;
> +	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;

Where is that -1 coming from?  Or the magic PAGE_SIZE?
Re: [PATCH] NFSD: Impl multiple extents in block/scsi layoutget
Posted by Sergey Bashirov 2 days, 23 hours ago
On Mon, Sep 29, 2025 at 12:23:48AM -0700, Christoph Hellwig wrote:
> The subject line is odd.  What is impl supposed to mean?
> 
> I'd say something like:
> 
> nfsd/blocklayout: support multiple extent per LAYOUTGET

Impl is short for implement. But let's rephrase that if it looks odd.

> On Thu, Sep 11, 2025 at 07:02:03PM +0300, Sergey Bashirov wrote:
> > Checked with smatch, tested on pNFS block volume setup.
> 
> Any chance we could get testing for this added to pynfs?

Thanks for pointing out pynfs! I'm not familiar with it yet, but adding
tests is a good idea. Will take a look on it.

> Also what is this patch against?  If fails to apply to linux-next for
> me.

At the time of submission, the patch was made on top of nfsd-testing.

> There is a lot of code movement here mixed with functional changes,
> which makes it very hard to follow.  Any chance you could first
> split the functions, then introduce the new pnfs_block_layout structure
> and only then actually add multiple extents per layoutget?  That'll
> make it a lot easier there are no unintended changes.

Agree. I will try to refactor or split the patch to reduce unnecessary
code movement produced by diff tool.

> I struggle to see what the point of the nfsd4_block_map_segment helpers
> is, as it seems to add no real value while making the code harder to
> follow

The main idea of nfsd4_block_map_segment is to logically separate work with
XFS, handling single extent, from the loop through the pnfs_block_layout
structure across all extents. If we combine nfsd4_block_map_segment and
nfsd4_block_map_extent, the switch statement will end up inside the for
loop, which will result in a large code movement in diff due to the right
shifts and line breaks.

> The subextent handling confuses me a bit - there is no such thing
> as a subsegment in NFS.  I'd pass a never changing end of layout,
> a moving offset, and a constant iomode directly, which should simply
> the code quite a bit, i.e. something like:

You are right, subsegment here simply means a smaller portion of the
original requested segment and is not related to NFS structures or
specification. Actually, I first made a version with three separate
arguments. But then reworked it to reduce the number of arguments in the
nfsd4_block_map_extent function. I am ok with any option, let's pass
three variables without using structure.

> > +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
> > +		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
> > +{
> > +	struct nfsd4_layout_seg *seg = &args->lg_seg;
> > +	u64 seg_length;
> > +	struct pnfs_block_extent *first_bex, *last_bex;
> > +	struct pnfs_block_layout *bl;
> > +	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;
> 
> Where is that -1 coming from?  Or the magic PAGE_SIZE?

PAGE_SIZE is the maximum possible size of the pnfs_block_layout
structure in the server's memory that we would like to have. And -1
ensures that the nr_extents field will also fit in it
if PAGE_SIZE % sizeof(struct pnfs_block_extent) == 0.

Yes, it would be nice to add a comment in the code, or even better,
an inline function that calculates the number of extents in a given
memory limit for the pnfs_block_layout structure.


I will prepare new patch(es).

--
Sergey Bashirov
Re: [PATCH] NFSD: Impl multiple extents in block/scsi layoutget
Posted by Chuck Lever 3 days, 5 hours ago
On 9/29/25 12:23 AM, Christoph Hellwig wrote:
> The subject line is odd.  What is impl supposed to mean?
> 
> I'd say something like:
> 
> nfsd/blocklayout: support multiple extent per LAYOUTGET
> 
> On Thu, Sep 11, 2025 at 07:02:03PM +0300, Sergey Bashirov wrote:
>> This patch allows the pNFS server to respond with multiple extents
> 
> This patch is redundant and should generally be avoided in commit log.

To put this another way, the patch description should explain "why" you
want this change -- what's the benefit going to be. And maybe a little
"how" it is done, if that isn't obvious from the diff.


>> Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
>> ---
>> Checked with smatch, tested on pNFS block volume setup.
> 
> Any chance we could get testing for this added to pynfs?
> 
> Also what is this patch against?  If fails to apply to linux-next for
> me.
> 
>> +/**
>> + * nfsd4_block_map_extent - get extent that covers the start of segment
>> + * @inode: inode of the file requested by the client
>> + * @fhp: handle of the file requested by the client
>> + * @seg: layout subrange requested by the client
>> + * @minlength: layout min length requested by the client
>> + * @bex: output block extent structure
>> + *
>> + * Get an extent from the file system that starts at @seg->offset or below,
>> + * but may be shorter than @seg->length.
>> + *
>> + * Return values:
>> + *   %nfs_ok: Success, @bex is initialized and valid
>> + *   %nfserr_layoutunavailable: Failed to get extent for requested @seg
>> + *   OS errors converted to NFS errors
>> + */
> 
> I'm not sure how Chuck handles this, but kerneldoc comments for static
> functions and thus not as public documentation can be really annoying
> due to the verbose formatting of the paramters.  I'd generally avoid
> them and instead of have short comments explaining the interesting bits
> about the calling convention and/or functionality.

For NFSD, the tradition is to avoid kdoc comments for static functions
unless they are the target of a virtual function call.

Static functions otherwise can have big comments. Just not kdoc style.


>> +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp,
>> +		const struct nfsd4_layout_seg *seg, u64 minlength,
>> +		struct pnfs_block_extent *bex)
>>  {
>>  	struct super_block *sb = inode->i_sb;
>>  	struct iomap iomap;
>>  	u32 device_generation = 0;
>>  	int error;
>>  
>> -	if (locks_in_grace(SVC_NET(rqstp)))
>> -		return nfserr_grace;
>> -
>> -	if (seg->offset & (block_size - 1)) {
>> -		dprintk("pnfsd: I/O misaligned\n");
>> -		goto out_layoutunavailable;
>> -	}
>> -
> 
> There is a lot of code movement here mixed with functional changes,
> which makes it very hard to follow.  Any chance you could first
> split the functions, then introduce the new pnfs_block_layout structure
> and only then actually add multiple extents per layoutget?  That'll
> make it a lot easier there are no unintended changes.
> 
>> + *   %nfserr_layoutunavailable: Failed to get extents for requested @seg
>> + *   OS errors converted to NFS errors
>> + */
>> +static __be32
>> +nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp,
>> +		const struct nfsd4_layout_seg *seg, u64 minlength,
>> +		struct pnfs_block_layout *bl)
> 
> I struggle to see what the point of the nfsd4_block_map_segment helpers
> is, as it seems to add no real value while making the code harder to
> follow.
> 
>> +{
>> +	struct nfsd4_layout_seg subseg = *seg;
>> +	u32 i;
>> +	__be32 nfserr;
> 
> Nit: nfserr can be local in the loop.
> 
> The subextent handling confuses me a bit - there is no such thing
> as a subsegment in NFS.  I'd pass a never changing end of layout,
> a moving offset, and a constant iomode directly, which should simply
> the code quite a bit, i.e. something like:
> 
> 	u64 map_start = seg->offset;
> 	u64 seg_end = seg->offset + seg->len;
> 
> 	for (..) {
> 
> 		nfserr = nfsd4_block_map_extent(inode, fhp, ...
> 				map_start, &end);
> 		if (end > seg_end) {
> 			bl->nr_extents = i + 1;
> 			break;
> 		}
> 
> 		map_start = end;
> 	}
> 
> 
>> +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>> +		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
>> +{
>> +	struct nfsd4_layout_seg *seg = &args->lg_seg;
>> +	u64 seg_length;
>> +	struct pnfs_block_extent *first_bex, *last_bex;
>> +	struct pnfs_block_layout *bl;
>> +	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;
> 
> Where is that -1 coming from?  Or the magic PAGE_SIZE?
> 


-- 
Chuck Lever
Re: [PATCH] NFSD: Impl multiple extents in block/scsi layoutget
Posted by Dai Ngo 2 weeks, 2 days ago
On 9/11/25 9:02 AM, Sergey Bashirov wrote:
> This patch allows the pNFS server to respond with multiple extents
> in a layoutget request. As a result, the number of layoutget requests
> is significantly reduced for various file access patterns, including
> random and parallel writes, avoiding unnecessary load to the server.
> On the client side, this improves the performance of writing large
> files and allows requesting layouts with minimum length greater than
> PAGE_SIZE.
>
> Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
> ---
> Checked with smatch, tested on pNFS block volume setup.
>
>   fs/nfsd/blocklayout.c    | 167 +++++++++++++++++++++++++++++----------
>   fs/nfsd/blocklayoutxdr.c |  36 ++++++---
>   fs/nfsd/blocklayoutxdr.h |   5 ++
>   3 files changed, 157 insertions(+), 51 deletions(-)
>
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index fde5539cf6a6..d53f3ec8823a 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -17,48 +17,39 @@
>   #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>   
>   
> +/**
> + * nfsd4_block_map_extent - get extent that covers the start of segment
> + * @inode: inode of the file requested by the client
> + * @fhp: handle of the file requested by the client
> + * @seg: layout subrange requested by the client
> + * @minlength: layout min length requested by the client
> + * @bex: output block extent structure
> + *
> + * Get an extent from the file system that starts at @seg->offset or below,
> + * but may be shorter than @seg->length.
> + *
> + * Return values:
> + *   %nfs_ok: Success, @bex is initialized and valid
> + *   %nfserr_layoutunavailable: Failed to get extent for requested @seg
> + *   OS errors converted to NFS errors
> + */
>   static __be32
> -nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
> -		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
> +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp,
> +		const struct nfsd4_layout_seg *seg, u64 minlength,
> +		struct pnfs_block_extent *bex)
>   {
> -	struct nfsd4_layout_seg *seg = &args->lg_seg;
>   	struct super_block *sb = inode->i_sb;
> -	u32 block_size = i_blocksize(inode);
> -	struct pnfs_block_extent *bex;
>   	struct iomap iomap;
>   	u32 device_generation = 0;
>   	int error;
>   
> -	if (locks_in_grace(SVC_NET(rqstp)))
> -		return nfserr_grace;
> -
> -	if (seg->offset & (block_size - 1)) {
> -		dprintk("pnfsd: I/O misaligned\n");
> -		goto out_layoutunavailable;
> -	}
> -
> -	/*
> -	 * Some clients barf on non-zero block numbers for NONE or INVALID
> -	 * layouts, so make sure to zero the whole structure.
> -	 */
> -	error = -ENOMEM;
> -	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
> -	if (!bex)
> -		goto out_error;
> -	args->lg_content = bex;
> -
>   	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
>   					    &iomap, seg->iomode != IOMODE_READ,
>   					    &device_generation);
>   	if (error) {
>   		if (error == -ENXIO)
> -			goto out_layoutunavailable;
> -		goto out_error;
> -	}
> -
> -	if (iomap.length < args->lg_minlength) {
> -		dprintk("pnfsd: extent smaller than minlength\n");
> -		goto out_layoutunavailable;
> +			return nfserr_layoutunavailable;
> +		return nfserrno(error);
>   	}
>   
>   	switch (iomap.type) {
> @@ -74,9 +65,9 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>   			/*
>   			 * Crack monkey special case from section 2.3.1.
>   			 */
> -			if (args->lg_minlength == 0) {
> +			if (minlength == 0) {
>   				dprintk("pnfsd: no soup for you!\n");
> -				goto out_layoutunavailable;
> +				return nfserr_layoutunavailable;
>   			}
>   
>   			bex->es = PNFS_BLOCK_INVALID_DATA;
> @@ -93,27 +84,119 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>   	case IOMAP_DELALLOC:
>   	default:
>   		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
> -		goto out_layoutunavailable;
> +		return nfserr_layoutunavailable;
>   	}
>   
>   	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
>   	if (error)
> -		goto out_error;
> +		return nfserrno(error);
> +
>   	bex->foff = iomap.offset;
>   	bex->len = iomap.length;
> +	return nfs_ok;
> +}
>   
> -	seg->offset = iomap.offset;
> -	seg->length = iomap.length;
> +/**
> + * nfsd4_block_map_segment - get extent array for the requested layout
> + * @inode: inode of the file requested by the client
> + * @fhp: handle of the file requested by the client
> + * @seg: layout range requested by the client
> + * @minlength: layout min length requested by the client
> + * @bl: output array of block extents
> + *
> + * Get an array of consecutive block extents that span the requested
> + * layout range. The resulting range may be shorter than requested if
> + * all preallocated block extents are used.
> + *
> + * Return values:
> + *   %nfs_ok: Success, @bl initialized and valid
> + *   %nfserr_layoutunavailable: Failed to get extents for requested @seg
> + *   OS errors converted to NFS errors
> + */
> +static __be32
> +nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp,
> +		const struct nfsd4_layout_seg *seg, u64 minlength,
> +		struct pnfs_block_layout *bl)
> +{
> +	struct nfsd4_layout_seg subseg = *seg;
> +	u32 i;
> +	__be32 nfserr;
>   
> -	dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
> -	return 0;
> +	for (i = 0; i < bl->nr_extents; i++) {
> +		struct pnfs_block_extent *extent = bl->extents + i;
> +		u64 extent_len;
> +
> +		nfserr = nfsd4_block_map_extent(inode, fhp, &subseg,
> +				minlength, extent);
> +		if (nfserr != nfs_ok)
> +			return nfserr;
> +
> +		extent_len = extent->len - (subseg.offset - extent->foff);
> +		if (extent_len >= subseg.length) {
> +			bl->nr_extents = i + 1;
> +			return nfs_ok;
> +		}
> +
> +		subseg.offset = extent->foff + extent->len;
> +		subseg.length -= extent_len;
> +	}
> +
> +	return nfs_ok;
> +}
> +
> +static __be32
> +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
> +		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
> +{
> +	struct nfsd4_layout_seg *seg = &args->lg_seg;
> +	u64 seg_length;
> +	struct pnfs_block_extent *first_bex, *last_bex;
> +	struct pnfs_block_layout *bl;
> +	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;
> +	u32 block_size = i_blocksize(inode);
> +	__be32 nfserr;
> +
> +	if (locks_in_grace(SVC_NET(rqstp)))
> +		return nfserr_grace;
> +
> +	nfserr = nfserr_layoutunavailable;
> +	if (seg->offset & (block_size - 1)) {
> +		dprintk("pnfsd: I/O misaligned\n");
> +		goto out_error;
> +	}
> +
> +	/*
> +	 * Some clients barf on non-zero block numbers for NONE or INVALID
> +	 * layouts, so make sure to zero the whole structure.
> +	 */
> +	nfserr = nfserrno(-ENOMEM);
> +	bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL);
> +	if (!bl)
> +		goto out_error;
> +	bl->nr_extents = nr_extents_max;
> +	args->lg_content = bl;
> +
> +	nfserr = nfsd4_block_map_segment(inode, fhp, seg,
> +			args->lg_minlength, bl);
> +	if (nfserr != nfs_ok)
> +		goto out_error;
> +	first_bex = bl->extents;
> +	last_bex = bl->extents + bl->nr_extents - 1;
> +
> +	nfserr = nfserr_layoutunavailable;
> +	seg_length = last_bex->foff + last_bex->len - seg->offset;
> +	if (seg_length < args->lg_minlength) {
> +		dprintk("pnfsd: extent smaller than minlength\n");
> +		goto out_error;
> +	}
> +
> +	seg->offset = first_bex->foff;
> +	seg->length = last_bex->foff - first_bex->foff + last_bex->len;
> +	return nfs_ok;
>   
>   out_error:
>   	seg->length = 0;
> -	return nfserrno(error);
> -out_layoutunavailable:
> -	seg->length = 0;
> -	return nfserr_layoutunavailable;
> +	return nfserr;
>   }
>   
>   static __be32
> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
> index e50afe340737..68c112d47cee 100644
> --- a/fs/nfsd/blocklayoutxdr.c
> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -14,12 +14,25 @@
>   #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>   
>   
> +/**
> + * nfsd4_block_encode_layoutget - encode block/scsi layout extent array
> + * @xdr: stream for data encoding
> + * @lgp: layoutget content, actually an array of extents to encode
> + *
> + * This function encodes the opaque loc_body field in the layoutget response.
> + * Since the pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire
> + * are the same, this function is used by both layout drivers.
> + *
> + * Return values:
> + *   %nfs_ok: Success, all extents encoded into @xdr
> + *   %nfserr_toosmall: Not enough space in @xdr to encode all the data
> + */
>   __be32
>   nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>   		const struct nfsd4_layoutget *lgp)
>   {
> -	const struct pnfs_block_extent *b = lgp->lg_content;
> -	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
> +	const struct pnfs_block_layout *bl = lgp->lg_content;
> +	u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE;
>   	__be32 *p;
>   
>   	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
> @@ -27,14 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>   		return nfserr_toosmall;
>   
>   	*p++ = cpu_to_be32(len);
> -	*p++ = cpu_to_be32(1);		/* we always return a single extent */
> +	*p++ = cpu_to_be32(bl->nr_extents);
>   
> -	p = svcxdr_encode_deviceid4(p, &b->vol_id);
> -	p = xdr_encode_hyper(p, b->foff);
> -	p = xdr_encode_hyper(p, b->len);
> -	p = xdr_encode_hyper(p, b->soff);
> -	*p++ = cpu_to_be32(b->es);
> -	return 0;
> +	for (i = 0; i < bl->nr_extents; i++) {
> +		const struct pnfs_block_extent *bex = bl->extents + i;
> +
> +		p = svcxdr_encode_deviceid4(p, &bex->vol_id);
> +		p = xdr_encode_hyper(p, bex->foff);
> +		p = xdr_encode_hyper(p, bex->len);
> +		p = xdr_encode_hyper(p, bex->soff);
> +		*p++ = cpu_to_be32(bex->es);
> +	}
> +
> +	return nfs_ok;
>   }
>   
>   static int
> diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
> index 7d25ef689671..54fe7f517a94 100644
> --- a/fs/nfsd/blocklayoutxdr.h
> +++ b/fs/nfsd/blocklayoutxdr.h
> @@ -21,6 +21,11 @@ struct pnfs_block_range {
>   	u64				len;
>   };
>   
> +struct pnfs_block_layout {
> +	u32				nr_extents;
> +	struct pnfs_block_extent	extents[] __counted_by(nr_extents);
> +};
> +
>   /*
>    * Random upper cap for the uuid length to avoid unbounded allocation.
>    * Not actually limited by the protocol.

Tested-by: Dai Ngo <dai.ngo@oracle.com>
Reviewed-by: Dai Ngo <dai.ngo@oracle.com>

-Dai
Re: [PATCH] NFSD: Impl multiple extents in block/scsi layoutget
Posted by Chuck Lever 3 weeks ago
On 9/11/25 12:02 PM, Sergey Bashirov wrote:
> This patch allows the pNFS server to respond with multiple extents
> in a layoutget request. As a result, the number of layoutget requests
> is significantly reduced for various file access patterns, including
> random and parallel writes, avoiding unnecessary load to the server.
> On the client side, this improves the performance of writing large
> files and allows requesting layouts with minimum length greater than
> PAGE_SIZE.
> 
> Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
> ---
> Checked with smatch, tested on pNFS block volume setup.
> 
>  fs/nfsd/blocklayout.c    | 167 +++++++++++++++++++++++++++++----------
>  fs/nfsd/blocklayoutxdr.c |  36 ++++++---
>  fs/nfsd/blocklayoutxdr.h |   5 ++
>  3 files changed, 157 insertions(+), 51 deletions(-)
> 
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index fde5539cf6a6..d53f3ec8823a 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -17,48 +17,39 @@
>  #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>  
>  
> +/**
> + * nfsd4_block_map_extent - get extent that covers the start of segment
> + * @inode: inode of the file requested by the client
> + * @fhp: handle of the file requested by the client
> + * @seg: layout subrange requested by the client
> + * @minlength: layout min length requested by the client
> + * @bex: output block extent structure
> + *
> + * Get an extent from the file system that starts at @seg->offset or below,
> + * but may be shorter than @seg->length.
> + *
> + * Return values:
> + *   %nfs_ok: Success, @bex is initialized and valid
> + *   %nfserr_layoutunavailable: Failed to get extent for requested @seg
> + *   OS errors converted to NFS errors
> + */
>  static __be32
> -nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
> -		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
> +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp,
> +		const struct nfsd4_layout_seg *seg, u64 minlength,
> +		struct pnfs_block_extent *bex)
>  {
> -	struct nfsd4_layout_seg *seg = &args->lg_seg;
>  	struct super_block *sb = inode->i_sb;
> -	u32 block_size = i_blocksize(inode);
> -	struct pnfs_block_extent *bex;
>  	struct iomap iomap;
>  	u32 device_generation = 0;
>  	int error;
>  
> -	if (locks_in_grace(SVC_NET(rqstp)))
> -		return nfserr_grace;
> -
> -	if (seg->offset & (block_size - 1)) {
> -		dprintk("pnfsd: I/O misaligned\n");
> -		goto out_layoutunavailable;
> -	}
> -
> -	/*
> -	 * Some clients barf on non-zero block numbers for NONE or INVALID
> -	 * layouts, so make sure to zero the whole structure.
> -	 */
> -	error = -ENOMEM;
> -	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
> -	if (!bex)
> -		goto out_error;
> -	args->lg_content = bex;
> -
>  	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
>  					    &iomap, seg->iomode != IOMODE_READ,
>  					    &device_generation);
>  	if (error) {
>  		if (error == -ENXIO)
> -			goto out_layoutunavailable;
> -		goto out_error;
> -	}
> -
> -	if (iomap.length < args->lg_minlength) {
> -		dprintk("pnfsd: extent smaller than minlength\n");
> -		goto out_layoutunavailable;
> +			return nfserr_layoutunavailable;
> +		return nfserrno(error);
>  	}
>  
>  	switch (iomap.type) {
> @@ -74,9 +65,9 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>  			/*
>  			 * Crack monkey special case from section 2.3.1.
>  			 */
> -			if (args->lg_minlength == 0) {
> +			if (minlength == 0) {
>  				dprintk("pnfsd: no soup for you!\n");
> -				goto out_layoutunavailable;
> +				return nfserr_layoutunavailable;
>  			}
>  
>  			bex->es = PNFS_BLOCK_INVALID_DATA;
> @@ -93,27 +84,119 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>  	case IOMAP_DELALLOC:
>  	default:
>  		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
> -		goto out_layoutunavailable;
> +		return nfserr_layoutunavailable;
>  	}
>  
>  	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
>  	if (error)
> -		goto out_error;
> +		return nfserrno(error);
> +
>  	bex->foff = iomap.offset;
>  	bex->len = iomap.length;
> +	return nfs_ok;
> +}
>  
> -	seg->offset = iomap.offset;
> -	seg->length = iomap.length;
> +/**
> + * nfsd4_block_map_segment - get extent array for the requested layout
> + * @inode: inode of the file requested by the client
> + * @fhp: handle of the file requested by the client
> + * @seg: layout range requested by the client
> + * @minlength: layout min length requested by the client
> + * @bl: output array of block extents
> + *
> + * Get an array of consecutive block extents that span the requested
> + * layout range. The resulting range may be shorter than requested if
> + * all preallocated block extents are used.
> + *
> + * Return values:
> + *   %nfs_ok: Success, @bl initialized and valid
> + *   %nfserr_layoutunavailable: Failed to get extents for requested @seg
> + *   OS errors converted to NFS errors
> + */
> +static __be32
> +nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp,
> +		const struct nfsd4_layout_seg *seg, u64 minlength,
> +		struct pnfs_block_layout *bl)
> +{
> +	struct nfsd4_layout_seg subseg = *seg;
> +	u32 i;
> +	__be32 nfserr;
>  
> -	dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
> -	return 0;
> +	for (i = 0; i < bl->nr_extents; i++) {
> +		struct pnfs_block_extent *extent = bl->extents + i;
> +		u64 extent_len;
> +
> +		nfserr = nfsd4_block_map_extent(inode, fhp, &subseg,
> +				minlength, extent);
> +		if (nfserr != nfs_ok)
> +			return nfserr;
> +
> +		extent_len = extent->len - (subseg.offset - extent->foff);
> +		if (extent_len >= subseg.length) {
> +			bl->nr_extents = i + 1;
> +			return nfs_ok;
> +		}
> +
> +		subseg.offset = extent->foff + extent->len;
> +		subseg.length -= extent_len;
> +	}
> +
> +	return nfs_ok;
> +}
> +
> +static __be32
> +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
> +		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
> +{
> +	struct nfsd4_layout_seg *seg = &args->lg_seg;
> +	u64 seg_length;
> +	struct pnfs_block_extent *first_bex, *last_bex;
> +	struct pnfs_block_layout *bl;
> +	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;
> +	u32 block_size = i_blocksize(inode);
> +	__be32 nfserr;
> +
> +	if (locks_in_grace(SVC_NET(rqstp)))
> +		return nfserr_grace;
> +
> +	nfserr = nfserr_layoutunavailable;
> +	if (seg->offset & (block_size - 1)) {
> +		dprintk("pnfsd: I/O misaligned\n");
> +		goto out_error;
> +	}
> +
> +	/*
> +	 * Some clients barf on non-zero block numbers for NONE or INVALID
> +	 * layouts, so make sure to zero the whole structure.
> +	 */
> +	nfserr = nfserrno(-ENOMEM);
> +	bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL);
> +	if (!bl)
> +		goto out_error;
> +	bl->nr_extents = nr_extents_max;
> +	args->lg_content = bl;
> +
> +	nfserr = nfsd4_block_map_segment(inode, fhp, seg,
> +			args->lg_minlength, bl);
> +	if (nfserr != nfs_ok)
> +		goto out_error;
> +	first_bex = bl->extents;
> +	last_bex = bl->extents + bl->nr_extents - 1;
> +
> +	nfserr = nfserr_layoutunavailable;
> +	seg_length = last_bex->foff + last_bex->len - seg->offset;
> +	if (seg_length < args->lg_minlength) {
> +		dprintk("pnfsd: extent smaller than minlength\n");
> +		goto out_error;
> +	}
> +
> +	seg->offset = first_bex->foff;
> +	seg->length = last_bex->foff - first_bex->foff + last_bex->len;
> +	return nfs_ok;
>  
>  out_error:
>  	seg->length = 0;
> -	return nfserrno(error);
> -out_layoutunavailable:
> -	seg->length = 0;
> -	return nfserr_layoutunavailable;
> +	return nfserr;
>  }
>  
>  static __be32
> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
> index e50afe340737..68c112d47cee 100644
> --- a/fs/nfsd/blocklayoutxdr.c
> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -14,12 +14,25 @@
>  #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>  
>  
> +/**
> + * nfsd4_block_encode_layoutget - encode block/scsi layout extent array
> + * @xdr: stream for data encoding
> + * @lgp: layoutget content, actually an array of extents to encode
> + *
> + * This function encodes the opaque loc_body field in the layoutget response.
> + * Since the pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire
> + * are the same, this function is used by both layout drivers.
> + *
> + * Return values:
> + *   %nfs_ok: Success, all extents encoded into @xdr
> + *   %nfserr_toosmall: Not enough space in @xdr to encode all the data
> + */
>  __be32
>  nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>  		const struct nfsd4_layoutget *lgp)
>  {
> -	const struct pnfs_block_extent *b = lgp->lg_content;
> -	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
> +	const struct pnfs_block_layout *bl = lgp->lg_content;
> +	u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE;
>  	__be32 *p;
>  
>  	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
> @@ -27,14 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>  		return nfserr_toosmall;
>  
>  	*p++ = cpu_to_be32(len);
> -	*p++ = cpu_to_be32(1);		/* we always return a single extent */
> +	*p++ = cpu_to_be32(bl->nr_extents);
>  
> -	p = svcxdr_encode_deviceid4(p, &b->vol_id);
> -	p = xdr_encode_hyper(p, b->foff);
> -	p = xdr_encode_hyper(p, b->len);
> -	p = xdr_encode_hyper(p, b->soff);
> -	*p++ = cpu_to_be32(b->es);
> -	return 0;
> +	for (i = 0; i < bl->nr_extents; i++) {
> +		const struct pnfs_block_extent *bex = bl->extents + i;
> +
> +		p = svcxdr_encode_deviceid4(p, &bex->vol_id);
> +		p = xdr_encode_hyper(p, bex->foff);
> +		p = xdr_encode_hyper(p, bex->len);
> +		p = xdr_encode_hyper(p, bex->soff);
> +		*p++ = cpu_to_be32(bex->es);
> +	}
> +
> +	return nfs_ok;
>  }
>  
>  static int
> diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
> index 7d25ef689671..54fe7f517a94 100644
> --- a/fs/nfsd/blocklayoutxdr.h
> +++ b/fs/nfsd/blocklayoutxdr.h
> @@ -21,6 +21,11 @@ struct pnfs_block_range {
>  	u64				len;
>  };
>  
> +struct pnfs_block_layout {
> +	u32				nr_extents;
> +	struct pnfs_block_extent	extents[] __counted_by(nr_extents);
> +};
> +
>  /*
>   * Random upper cap for the uuid length to avoid unbounded allocation.
>   * Not actually limited by the protocol.

Dai, Christoph - please review and/or test.


-- 
Chuck Lever
Re: [PATCH] NFSD: Impl multiple extents in block/scsi layoutget
Posted by Dai Ngo 2 weeks, 2 days ago
On 9/11/25 11:05 AM, Chuck Lever wrote:
> On 9/11/25 12:02 PM, Sergey Bashirov wrote:
>> This patch allows the pNFS server to respond with multiple extents
>> in a layoutget request. As a result, the number of layoutget requests
>> is significantly reduced for various file access patterns, including
>> random and parallel writes, avoiding unnecessary load to the server.
>> On the client side, this improves the performance of writing large
>> files and allows requesting layouts with minimum length greater than
>> PAGE_SIZE.
>>
>> Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
>> ---
>> Checked with smatch, tested on pNFS block volume setup.
>>
>>   fs/nfsd/blocklayout.c    | 167 +++++++++++++++++++++++++++++----------
>>   fs/nfsd/blocklayoutxdr.c |  36 ++++++---
>>   fs/nfsd/blocklayoutxdr.h |   5 ++
>>   3 files changed, 157 insertions(+), 51 deletions(-)
>>
>> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
>> index fde5539cf6a6..d53f3ec8823a 100644
>> --- a/fs/nfsd/blocklayout.c
>> +++ b/fs/nfsd/blocklayout.c
>> @@ -17,48 +17,39 @@
>>   #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>>   
>>   
>> +/**
>> + * nfsd4_block_map_extent - get extent that covers the start of segment
>> + * @inode: inode of the file requested by the client
>> + * @fhp: handle of the file requested by the client
>> + * @seg: layout subrange requested by the client
>> + * @minlength: layout min length requested by the client
>> + * @bex: output block extent structure
>> + *
>> + * Get an extent from the file system that starts at @seg->offset or below,
>> + * but may be shorter than @seg->length.
>> + *
>> + * Return values:
>> + *   %nfs_ok: Success, @bex is initialized and valid
>> + *   %nfserr_layoutunavailable: Failed to get extent for requested @seg
>> + *   OS errors converted to NFS errors
>> + */
>>   static __be32
>> -nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>> -		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
>> +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp,
>> +		const struct nfsd4_layout_seg *seg, u64 minlength,
>> +		struct pnfs_block_extent *bex)
>>   {
>> -	struct nfsd4_layout_seg *seg = &args->lg_seg;
>>   	struct super_block *sb = inode->i_sb;
>> -	u32 block_size = i_blocksize(inode);
>> -	struct pnfs_block_extent *bex;
>>   	struct iomap iomap;
>>   	u32 device_generation = 0;
>>   	int error;
>>   
>> -	if (locks_in_grace(SVC_NET(rqstp)))
>> -		return nfserr_grace;
>> -
>> -	if (seg->offset & (block_size - 1)) {
>> -		dprintk("pnfsd: I/O misaligned\n");
>> -		goto out_layoutunavailable;
>> -	}
>> -
>> -	/*
>> -	 * Some clients barf on non-zero block numbers for NONE or INVALID
>> -	 * layouts, so make sure to zero the whole structure.
>> -	 */
>> -	error = -ENOMEM;
>> -	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
>> -	if (!bex)
>> -		goto out_error;
>> -	args->lg_content = bex;
>> -
>>   	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
>>   					    &iomap, seg->iomode != IOMODE_READ,
>>   					    &device_generation);
>>   	if (error) {
>>   		if (error == -ENXIO)
>> -			goto out_layoutunavailable;
>> -		goto out_error;
>> -	}
>> -
>> -	if (iomap.length < args->lg_minlength) {
>> -		dprintk("pnfsd: extent smaller than minlength\n");
>> -		goto out_layoutunavailable;
>> +			return nfserr_layoutunavailable;
>> +		return nfserrno(error);
>>   	}
>>   
>>   	switch (iomap.type) {
>> @@ -74,9 +65,9 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>>   			/*
>>   			 * Crack monkey special case from section 2.3.1.
>>   			 */
>> -			if (args->lg_minlength == 0) {
>> +			if (minlength == 0) {
>>   				dprintk("pnfsd: no soup for you!\n");
>> -				goto out_layoutunavailable;
>> +				return nfserr_layoutunavailable;
>>   			}
>>   
>>   			bex->es = PNFS_BLOCK_INVALID_DATA;
>> @@ -93,27 +84,119 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>>   	case IOMAP_DELALLOC:
>>   	default:
>>   		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
>> -		goto out_layoutunavailable;
>> +		return nfserr_layoutunavailable;
>>   	}
>>   
>>   	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
>>   	if (error)
>> -		goto out_error;
>> +		return nfserrno(error);
>> +
>>   	bex->foff = iomap.offset;
>>   	bex->len = iomap.length;
>> +	return nfs_ok;
>> +}
>>   
>> -	seg->offset = iomap.offset;
>> -	seg->length = iomap.length;
>> +/**
>> + * nfsd4_block_map_segment - get extent array for the requested layout
>> + * @inode: inode of the file requested by the client
>> + * @fhp: handle of the file requested by the client
>> + * @seg: layout range requested by the client
>> + * @minlength: layout min length requested by the client
>> + * @bl: output array of block extents
>> + *
>> + * Get an array of consecutive block extents that span the requested
>> + * layout range. The resulting range may be shorter than requested if
>> + * all preallocated block extents are used.
>> + *
>> + * Return values:
>> + *   %nfs_ok: Success, @bl initialized and valid
>> + *   %nfserr_layoutunavailable: Failed to get extents for requested @seg
>> + *   OS errors converted to NFS errors
>> + */
>> +static __be32
>> +nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp,
>> +		const struct nfsd4_layout_seg *seg, u64 minlength,
>> +		struct pnfs_block_layout *bl)
>> +{
>> +	struct nfsd4_layout_seg subseg = *seg;
>> +	u32 i;
>> +	__be32 nfserr;
>>   
>> -	dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
>> -	return 0;
>> +	for (i = 0; i < bl->nr_extents; i++) {
>> +		struct pnfs_block_extent *extent = bl->extents + i;
>> +		u64 extent_len;
>> +
>> +		nfserr = nfsd4_block_map_extent(inode, fhp, &subseg,
>> +				minlength, extent);
>> +		if (nfserr != nfs_ok)
>> +			return nfserr;
>> +
>> +		extent_len = extent->len - (subseg.offset - extent->foff);
>> +		if (extent_len >= subseg.length) {
>> +			bl->nr_extents = i + 1;
>> +			return nfs_ok;
>> +		}
>> +
>> +		subseg.offset = extent->foff + extent->len;
>> +		subseg.length -= extent_len;
>> +	}
>> +
>> +	return nfs_ok;
>> +}
>> +
>> +static __be32
>> +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
>> +		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
>> +{
>> +	struct nfsd4_layout_seg *seg = &args->lg_seg;
>> +	u64 seg_length;
>> +	struct pnfs_block_extent *first_bex, *last_bex;
>> +	struct pnfs_block_layout *bl;
>> +	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;
>> +	u32 block_size = i_blocksize(inode);
>> +	__be32 nfserr;
>> +
>> +	if (locks_in_grace(SVC_NET(rqstp)))
>> +		return nfserr_grace;
>> +
>> +	nfserr = nfserr_layoutunavailable;
>> +	if (seg->offset & (block_size - 1)) {
>> +		dprintk("pnfsd: I/O misaligned\n");
>> +		goto out_error;
>> +	}
>> +
>> +	/*
>> +	 * Some clients barf on non-zero block numbers for NONE or INVALID
>> +	 * layouts, so make sure to zero the whole structure.
>> +	 */
>> +	nfserr = nfserrno(-ENOMEM);
>> +	bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL);
>> +	if (!bl)
>> +		goto out_error;
>> +	bl->nr_extents = nr_extents_max;
>> +	args->lg_content = bl;
>> +
>> +	nfserr = nfsd4_block_map_segment(inode, fhp, seg,
>> +			args->lg_minlength, bl);
>> +	if (nfserr != nfs_ok)
>> +		goto out_error;
>> +	first_bex = bl->extents;
>> +	last_bex = bl->extents + bl->nr_extents - 1;
>> +
>> +	nfserr = nfserr_layoutunavailable;
>> +	seg_length = last_bex->foff + last_bex->len - seg->offset;
>> +	if (seg_length < args->lg_minlength) {
>> +		dprintk("pnfsd: extent smaller than minlength\n");
>> +		goto out_error;
>> +	}
>> +
>> +	seg->offset = first_bex->foff;
>> +	seg->length = last_bex->foff - first_bex->foff + last_bex->len;
>> +	return nfs_ok;
>>   
>>   out_error:
>>   	seg->length = 0;
>> -	return nfserrno(error);
>> -out_layoutunavailable:
>> -	seg->length = 0;
>> -	return nfserr_layoutunavailable;
>> +	return nfserr;
>>   }
>>   
>>   static __be32
>> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
>> index e50afe340737..68c112d47cee 100644
>> --- a/fs/nfsd/blocklayoutxdr.c
>> +++ b/fs/nfsd/blocklayoutxdr.c
>> @@ -14,12 +14,25 @@
>>   #define NFSDDBG_FACILITY	NFSDDBG_PNFS
>>   
>>   
>> +/**
>> + * nfsd4_block_encode_layoutget - encode block/scsi layout extent array
>> + * @xdr: stream for data encoding
>> + * @lgp: layoutget content, actually an array of extents to encode
>> + *
>> + * This function encodes the opaque loc_body field in the layoutget response.
>> + * Since the pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire
>> + * are the same, this function is used by both layout drivers.
>> + *
>> + * Return values:
>> + *   %nfs_ok: Success, all extents encoded into @xdr
>> + *   %nfserr_toosmall: Not enough space in @xdr to encode all the data
>> + */
>>   __be32
>>   nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>>   		const struct nfsd4_layoutget *lgp)
>>   {
>> -	const struct pnfs_block_extent *b = lgp->lg_content;
>> -	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
>> +	const struct pnfs_block_layout *bl = lgp->lg_content;
>> +	u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE;
>>   	__be32 *p;
>>   
>>   	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
>> @@ -27,14 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
>>   		return nfserr_toosmall;
>>   
>>   	*p++ = cpu_to_be32(len);
>> -	*p++ = cpu_to_be32(1);		/* we always return a single extent */
>> +	*p++ = cpu_to_be32(bl->nr_extents);
>>   
>> -	p = svcxdr_encode_deviceid4(p, &b->vol_id);
>> -	p = xdr_encode_hyper(p, b->foff);
>> -	p = xdr_encode_hyper(p, b->len);
>> -	p = xdr_encode_hyper(p, b->soff);
>> -	*p++ = cpu_to_be32(b->es);
>> -	return 0;
>> +	for (i = 0; i < bl->nr_extents; i++) {
>> +		const struct pnfs_block_extent *bex = bl->extents + i;
>> +
>> +		p = svcxdr_encode_deviceid4(p, &bex->vol_id);
>> +		p = xdr_encode_hyper(p, bex->foff);
>> +		p = xdr_encode_hyper(p, bex->len);
>> +		p = xdr_encode_hyper(p, bex->soff);
>> +		*p++ = cpu_to_be32(bex->es);
>> +	}
>> +
>> +	return nfs_ok;
>>   }
>>   
>>   static int
>> diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
>> index 7d25ef689671..54fe7f517a94 100644
>> --- a/fs/nfsd/blocklayoutxdr.h
>> +++ b/fs/nfsd/blocklayoutxdr.h
>> @@ -21,6 +21,11 @@ struct pnfs_block_range {
>>   	u64				len;
>>   };
>>   
>> +struct pnfs_block_layout {
>> +	u32				nr_extents;
>> +	struct pnfs_block_extent	extents[] __counted_by(nr_extents);
>> +};
>> +
>>   /*
>>    * Random upper cap for the uuid length to avoid unbounded allocation.
>>    * Not actually limited by the protocol.
> Dai, Christoph - please review and/or test.

LGTM.

Tested with SCSI layout and verified with multi-extent IO Read and Write.

-Dai

>
>