fs/nfsd/blocklayout.c | 8 ++-- fs/nfsd/blocklayoutxdr.c | 89 +++++++++++++++++++++++++++++----------- fs/nfsd/blocklayoutxdr.h | 8 ++-- fs/nfsd/nfs4xdr.c | 11 +++-- fs/nfsd/xdr4.h | 3 +- 5 files changed, 78 insertions(+), 41 deletions(-)
When pNFS client in block layout mode sends layoutcommit RPC to MDS,
a variable length array of modified extents is supplied within request.
This patch allows NFS server to accept such extent arrays if they do not
fit within single memory page.
The issue can be reproduced when writing to a 1GB file using FIO with small
block size and large I/O depth. Server returns NFSERR_BADXDR to the client.
Co-developed-by: Konstantin Evtushenko <koevtushenko@yandex.com>
Signed-off-by: Konstantin Evtushenko <koevtushenko@yandex.com>
Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
---
Changes in v2:
- Drop the len argument in *_decode_layoutupdate()
- Remove lc_up_len field from nfsd4_layoutcommit structure
- Fix code formatting
fs/nfsd/blocklayout.c | 8 ++--
fs/nfsd/blocklayoutxdr.c | 89 +++++++++++++++++++++++++++++-----------
fs/nfsd/blocklayoutxdr.h | 8 ++--
fs/nfsd/nfs4xdr.c | 11 +++--
fs/nfsd/xdr4.h | 3 +-
5 files changed, 78 insertions(+), 41 deletions(-)
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 08a20e5bcf7f..e74c3a19aeb9 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -179,8 +179,8 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
struct iomap *iomaps;
int nr_iomaps;
- nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
+ nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout,
+ &iomaps, i_blocksize(inode));
if (nr_iomaps < 0)
return nfserrno(nr_iomaps);
@@ -317,8 +317,8 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
struct iomap *iomaps;
int nr_iomaps;
- nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
+ nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout,
+ &iomaps, i_blocksize(inode));
if (nr_iomaps < 0)
return nfserrno(nr_iomaps);
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index ce78f74715ee..b76dbb119e19 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -113,26 +113,27 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
}
int
-nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
- u32 nr_iomaps, i;
+ u32 nr_iomaps, expected, len, i;
+ struct xdr_stream xdr;
+ char scratch[sizeof(struct pnfs_block_extent)];
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
- return -EINVAL;
- }
- len -= sizeof(u32);
- if (len % PNFS_BLOCK_EXTENT_SIZE) {
- dprintk("%s: extent array invalid: %u\n", __func__, len);
+ xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
+ xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
+
+ if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
+ dprintk("%s: failed to decode extent array size\n", __func__);
return -EINVAL;
}
- nr_iomaps = be32_to_cpup(p++);
- if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
+ len = sizeof(__be32) + xdr_stream_remaining(&xdr);
+ expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
+ if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, nr_iomaps);
+ __func__, len, expected);
return -EINVAL;
}
@@ -144,29 +145,54 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
for (i = 0; i < nr_iomaps; i++) {
struct pnfs_block_extent bex;
+ ssize_t ret;
- memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
- p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+ ret = xdr_stream_decode_opaque_fixed(&xdr,
+ &bex.vol_id, sizeof(bex.vol_id));
+ if (ret < sizeof(bex.vol_id)) {
+ dprintk("%s: failed to decode device id for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
- p = xdr_decode_hyper(p, &bex.foff);
+ if (xdr_stream_decode_u64(&xdr, &bex.foff)) {
+ dprintk("%s: failed to decode offset for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.foff & (block_size - 1)) {
dprintk("%s: unaligned offset 0x%llx\n",
__func__, bex.foff);
goto fail;
}
- p = xdr_decode_hyper(p, &bex.len);
+
+ if (xdr_stream_decode_u64(&xdr, &bex.len)) {
+ dprintk("%s: failed to decode length for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.len & (block_size - 1)) {
dprintk("%s: unaligned length 0x%llx\n",
__func__, bex.foff);
goto fail;
}
- p = xdr_decode_hyper(p, &bex.soff);
+
+ if (xdr_stream_decode_u64(&xdr, &bex.soff)) {
+ dprintk("%s: failed to decode soffset for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.soff & (block_size - 1)) {
dprintk("%s: unaligned disk offset 0x%llx\n",
__func__, bex.soff);
goto fail;
}
- bex.es = be32_to_cpup(p++);
+
+ if (xdr_stream_decode_u32(&xdr, &bex.es)) {
+ dprintk("%s: failed to decode estate for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
dprintk("%s: incorrect extent state %d\n",
__func__, bex.es);
@@ -185,18 +211,23 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
}
int
-nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
- u32 nr_iomaps, expected, i;
+ u32 nr_iomaps, expected, len, i;
+ struct xdr_stream xdr;
+ char scratch[sizeof(struct pnfs_block_range)];
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
+ xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
+ xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
+
+ if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
+ dprintk("%s: failed to decode extent array size\n", __func__);
return -EINVAL;
}
- nr_iomaps = be32_to_cpup(p++);
+ len = sizeof(__be32) + xdr_stream_remaining(&xdr);
expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
@@ -213,14 +244,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
for (i = 0; i < nr_iomaps; i++) {
u64 val;
- p = xdr_decode_hyper(p, &val);
+ if (xdr_stream_decode_u64(&xdr, &val)) {
+ dprintk("%s: failed to decode offset for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (val & (block_size - 1)) {
dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].offset = val;
- p = xdr_decode_hyper(p, &val);
+ if (xdr_stream_decode_u64(&xdr, &val)) {
+ dprintk("%s: failed to decode length for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (val & (block_size - 1)) {
dprintk("%s: unaligned length 0x%llx\n", __func__, val);
goto fail;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 4e28ac8f1127..6e603cfec0a7 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
const struct nfsd4_layoutget *lgp);
-int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
-int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
+int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf,
+ struct iomap **iomapp, u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf,
+ struct iomap **iomapp, u32 block_size);
#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 3afcdbed6e14..659e60b85d5f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -604,6 +604,8 @@ static __be32
nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
struct nfsd4_layoutcommit *lcp)
{
+ u32 len;
+
if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0)
return nfserr_bad_xdr;
if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES)
@@ -611,13 +613,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX)
return nfserr_bad_xdr;
- if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
+ return nfserr_bad_xdr;
+ if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len))
return nfserr_bad_xdr;
- if (lcp->lc_up_len > 0) {
- lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len);
- if (!lcp->lc_up_layout)
- return nfserr_bad_xdr;
- }
return nfs_ok;
}
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index aa2a356da784..02887029a81c 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -630,8 +630,7 @@ struct nfsd4_layoutcommit {
u64 lc_last_wr; /* request */
struct timespec64 lc_mtime; /* request */
u32 lc_layout_type; /* request */
- u32 lc_up_len; /* layout length */
- void *lc_up_layout; /* decoded by callback */
+ struct xdr_buf lc_up_layout; /* decoded by callback */
bool lc_size_chg; /* response */
u64 lc_newsize; /* response */
};
--
2.43.0
On 6/9/25 9:18 PM, Sergey Bashirov wrote: > When pNFS client in block layout mode sends layoutcommit RPC to MDS, > a variable length array of modified extents is supplied within request. > This patch allows NFS server to accept such extent arrays if they do not > fit within single memory page. > > The issue can be reproduced when writing to a 1GB file using FIO with small > block size and large I/O depth. Server returns NFSERR_BADXDR to the client. > > Co-developed-by: Konstantin Evtushenko <koevtushenko@yandex.com> > Signed-off-by: Konstantin Evtushenko <koevtushenko@yandex.com> > Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com> > --- > Changes in v2: > - Drop the len argument in *_decode_layoutupdate() > - Remove lc_up_len field from nfsd4_layoutcommit structure > - Fix code formatting > > fs/nfsd/blocklayout.c | 8 ++-- > fs/nfsd/blocklayoutxdr.c | 89 +++++++++++++++++++++++++++++----------- > fs/nfsd/blocklayoutxdr.h | 8 ++-- > fs/nfsd/nfs4xdr.c | 11 +++-- > fs/nfsd/xdr4.h | 3 +- > 5 files changed, 78 insertions(+), 41 deletions(-) > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 08a20e5bcf7f..e74c3a19aeb9 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -179,8 +179,8 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout, > + &iomaps, i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > @@ -317,8 +317,8 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout, > + &iomaps, i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c > index ce78f74715ee..b76dbb119e19 100644 > --- a/fs/nfsd/blocklayoutxdr.c > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -113,26 +113,27 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > } > > int > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp, > u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, i; > + u32 nr_iomaps, expected, len, i; > + struct xdr_stream xdr; > + char scratch[sizeof(struct pnfs_block_extent)]; Using "sizeof(struct yada)" for an XDR decoding buffer is incorrect. struct pnfs_block_extent is a C structure, with padding and architecture-dependent field sizes. It does not represent the size of the XDR data items on the wire. > - if (len < sizeof(u32)) { > - dprintk("%s: extent array too small: %u\n", __func__, len); > - return -EINVAL; > - } > - len -= sizeof(u32); > - if (len % PNFS_BLOCK_EXTENT_SIZE) { > - dprintk("%s: extent array invalid: %u\n", __func__, len); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); Consider using svcxdr_init_decode() instead. > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array size\n", __func__); > return -EINVAL; > } Throughout, please drop the dprintk's. At least don't add any new ones. I find the use of -EINVAL to signal an XDR decoding error to be somewhat inappropriate, but I guess it's historical (ie, its usage is not introduced by your patch). I would have expected perhaps NFS4ERR_BADXDR. Perhaps that should be corrected in a prerequisite patch. NFS4ERR_EINVAL means the server was able to decode the request but the decoded values are invalid. Here, the decoding has failed, and that's really a different problem. > - nr_iomaps = be32_to_cpup(p++); > - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { > + len = sizeof(__be32) + xdr_stream_remaining(&xdr); > + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; > + if (len != expected) { > dprintk("%s: extent array size mismatch: %u/%u\n", > - __func__, len, nr_iomaps); > + __func__, len, expected); > return -EINVAL; > } > > @@ -144,29 +145,54 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > > for (i = 0; i < nr_iomaps; i++) { > struct pnfs_block_extent bex; > + ssize_t ret; > > - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); > - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); > + ret = xdr_stream_decode_opaque_fixed(&xdr, > + &bex.vol_id, sizeof(bex.vol_id)); > + if (ret < sizeof(bex.vol_id)) { > + dprintk("%s: failed to decode device id for entry %u\n", > + __func__, i); > + goto fail; > + } > > - p = xdr_decode_hyper(p, &bex.foff); > + if (xdr_stream_decode_u64(&xdr, &bex.foff)) { > + dprintk("%s: failed to decode offset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.foff & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.len); > + > + if (xdr_stream_decode_u64(&xdr, &bex.len)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.len & (block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.soff); > + > + if (xdr_stream_decode_u64(&xdr, &bex.soff)) { > + dprintk("%s: failed to decode soffset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.soff & (block_size - 1)) { > dprintk("%s: unaligned disk offset 0x%llx\n", > __func__, bex.soff); > goto fail; > } > - bex.es = be32_to_cpup(p++); > + > + if (xdr_stream_decode_u32(&xdr, &bex.es)) { > + dprintk("%s: failed to decode estate for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.es != PNFS_BLOCK_READWRITE_DATA) { > dprintk("%s: incorrect extent state %d\n", > __func__, bex.es); > @@ -185,18 +211,23 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > } > > int > -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp, > u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, expected, i; > + u32 nr_iomaps, expected, len, i; > + struct xdr_stream xdr; > + char scratch[sizeof(struct pnfs_block_range)]; > > - if (len < sizeof(u32)) { > - dprintk("%s: extent array too small: %u\n", __func__, len); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array size\n", __func__); > return -EINVAL; > } > > - nr_iomaps = be32_to_cpup(p++); > + len = sizeof(__be32) + xdr_stream_remaining(&xdr); > expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; > if (len != expected) { > dprintk("%s: extent array size mismatch: %u/%u\n", > @@ -213,14 +244,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > for (i = 0; i < nr_iomaps; i++) { > u64 val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode offset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", __func__, val); > goto fail; > } > iomaps[i].offset = val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", __func__, val); > goto fail; > diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h > index 4e28ac8f1127..6e603cfec0a7 100644 > --- a/fs/nfsd/blocklayoutxdr.h > +++ b/fs/nfsd/blocklayoutxdr.h > @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > const struct nfsd4_getdeviceinfo *gdp); > __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > const struct nfsd4_layoutget *lgp); > -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > +int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, > + struct iomap **iomapp, u32 block_size); > +int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, > + struct iomap **iomapp, u32 block_size); > > #endif /* _NFSD_BLOCKLAYOUTXDR_H */ > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 3afcdbed6e14..659e60b85d5f 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -604,6 +604,8 @@ static __be32 > nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > struct nfsd4_layoutcommit *lcp) > { > + u32 len; > + > if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) > return nfserr_bad_xdr; > if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) > @@ -611,13 +613,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) > return nfserr_bad_xdr; > > - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) > + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) > + return nfserr_bad_xdr; > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) > return nfserr_bad_xdr; The layout is effectively an opaque payload at this point, so using xdr_stream_subsegment() makes sense to me. > - if (lcp->lc_up_len > 0) { > - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); > - if (!lcp->lc_up_layout) > - return nfserr_bad_xdr; > - } > > return nfs_ok; > } > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h > index aa2a356da784..02887029a81c 100644 > --- a/fs/nfsd/xdr4.h > +++ b/fs/nfsd/xdr4.h > @@ -630,8 +630,7 @@ struct nfsd4_layoutcommit { > u64 lc_last_wr; /* request */ > struct timespec64 lc_mtime; /* request */ > u32 lc_layout_type; /* request */ > - u32 lc_up_len; /* layout length */ > - void *lc_up_layout; /* decoded by callback */ > + struct xdr_buf lc_up_layout; /* decoded by callback */ > bool lc_size_chg; /* response */ > u64 lc_newsize; /* response */ > }; -- Chuck Lever
On Tue, Jun 10, 2025 at 02:10:46PM -0400, Chuck Lever wrote: > On 6/9/25 9:18 PM, Sergey Bashirov wrote: > > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); > > Consider using svcxdr_init_decode() instead. I see that svcxdr_init_decode() does the same two steps. What I concerned about is that it takes the top-level svc_rqst struct and modifies it. Of course, we can pass rqstp from nfsd4_layoutcommit() to the layout driver callback. But then we would need to make a backup of the original xdr buffer and stream position, set up and initialize the xdr sub-buffer, and at the end restore back the original xdr stream. All these actions seem somewhat unnecessary and not so elegant to me. Is it acceptable to keep the current solution in the patch or am I missing something? -- Sergey Bashirov
On 6/13/25 8:01 AM, Sergey Bashirov wrote: > On Tue, Jun 10, 2025 at 02:10:46PM -0400, Chuck Lever wrote: >> On 6/9/25 9:18 PM, Sergey Bashirov wrote: >>> + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); >>> + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); >> >> Consider using svcxdr_init_decode() instead. > > I see that svcxdr_init_decode() does the same two steps. What I > concerned about is that it takes the top-level svc_rqst struct > and modifies it. Of course, we can pass rqstp from nfsd4_layoutcommit() > to the layout driver callback. But then we would need to make a backup > of the original xdr buffer and stream position, set up and initialize > the xdr sub-buffer, and at the end restore back the original xdr stream. > All these actions seem somewhat unnecessary and not so elegant to me. > > Is it acceptable to keep the current solution in the patch or am I > missing something? At the very least, you will need to allocate a proper scratch buffer and free it when the layout decode is complete if you do not use svcxdr_init_decode(). IIUC nfsd4_layoutcommit() is called after the COMPOUND arguments have been XDR decoded, and before the COMPOUND result is encoded. (I'm guessing) it should be safe to use svcxdr_init_decode() in nfsd4_layoutcommit(), but of course you should test first. If it turns out not to be safe, you might just use the idea of invoking alloc_page() and put_page() for the scratch buffer. -- Chuck Lever
On Tue, Jun 10, 2025 at 02:10:46PM -0400, Chuck Lever wrote: > > + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) > > + return nfserr_bad_xdr; > > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) > > return nfserr_bad_xdr; > > The layout is effectively an opaque payload at this point, so using > xdr_stream_subsegment() makes sense to me. Btw, when trying to switch XDR to work with bvec backing, xdr_stream_subsegment has been a very painful primitive. I also don't really understand what the benefit of it is vs just keeping on decoding the subsegment normally. That might just be me not understanding the code, though.
On 6/11/25 2:56 AM, Christoph Hellwig wrote: > On Tue, Jun 10, 2025 at 02:10:46PM -0400, Chuck Lever wrote: >>> + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) >>> + return nfserr_bad_xdr; >>> + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) >>> return nfserr_bad_xdr; >> >> The layout is effectively an opaque payload at this point, so using >> xdr_stream_subsegment() makes sense to me. > > Btw, when trying to switch XDR to work with bvec backing, > xdr_stream_subsegment has been a very painful primitive. I also don't > really understand what the benefit of it is vs just keeping on decoding > the subsegment normally. That might just be me not understanding the > code, though. XDR subsegments are useful when another layer will be responsible for encoding or decoding the field. Generally a subsegment contains an opaque (an operation in an NFSv4 COMPOUND, a read or write payload, or an RPCGSS_SEC payload that is checksummed or encrypted). For pNFS, the layout metadata has to be skipped over when it is encountered in a generic operation like LAYOUTCOMMIT, but then it is later handled by another layer (the layouttype-specific plug-in, which is part of an "upper layer"). The generic layer has to confirm that the length of the opaque is correct, skip over that length, and then continue marshaling or unmarshaling fields after the opaque. -- Chuck Lever
© 2016 - 2025 Red Hat, Inc.