Set iomap->extent_size when sub-extent zeroing is required.
We treat a sub-extent write the same as an unaligned write, so we can leverage
the existing sub-FSblock unaligned write support, i.e. try a shared lock
with the IOMAP_DIO_OVERWRITE_ONLY flag and, if this fails, then try the
exclusive lock.
In xfs_iomap_write_unwritten(), FSB calcs are now based on the extsize.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 35 ++++++++++++++++++++++-------------
fs/xfs/xfs_iomap.c | 13 +++++++++++--
2 files changed, 33 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e81e01e6b22b..ee4f94cf6f4e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -620,18 +620,19 @@ xfs_file_dio_write_aligned(
* Handle block unaligned direct I/O writes
*
* In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
- * them to be done in parallel with reads and other direct I/O writes. However,
- * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
- * to do sub-block zeroing and that requires serialisation against other direct
- * I/O to the same block. In this case we need to serialise the submission of
- * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
- * In the case where sub-block zeroing is not required, we can do concurrent
- * sub-block dios to the same block successfully.
+ * them to be done in parallel with reads and other direct I/O writes.
+ * However if the I/O is not aligned to filesystem blocks/extent, the direct
+ * I/O layer may need to do sub-block/extent zeroing and that requires
+ * serialisation against other direct I/O to the same block/extent. In this
+ * case we need to serialise the submission of the unaligned I/O so that we
+ * don't get racing block/extent zeroing in the dio layer.
+ * In the case where sub-block/extent zeroing is not required, we can do
+ * concurrent sub-block/extent dios to the same block/extent successfully.
*
* Optimistically submit the I/O using the shared lock first, but use the
* IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
- * if block allocation or partial block zeroing would be required. In that case
- * we try again with the exclusive lock.
+ * if block/extent allocation or partial block/extent zeroing would be
+ * required. In that case we try again with the exclusive lock.
*/
static noinline ssize_t
xfs_file_dio_write_unaligned(
@@ -646,9 +647,9 @@ xfs_file_dio_write_unaligned(
ssize_t ret;
/*
- * Extending writes need exclusivity because of the sub-block zeroing
- * that the DIO code always does for partial tail blocks beyond EOF, so
- * don't even bother trying the fast path in this case.
+ * Extending writes need exclusivity because of the sub-block/extent
+ * zeroing that the DIO code always does for partial tail blocks
+ * beyond EOF, so don't even bother trying the fast path in this case.
*/
if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -714,11 +715,19 @@ xfs_file_dio_write(
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
size_t count = iov_iter_count(from);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int blockmask;
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+
+ if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1)
+ blockmask = XFS_FSB_TO_B(mp, ip->i_extsize) - 1;
+ else
+ blockmask = mp->m_blockmask;
+
+ if ((iocb->ki_pos | count) & blockmask)
return xfs_file_dio_write_unaligned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from);
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 4087af7f3c9f..1a3692bbc84d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -138,6 +138,8 @@ xfs_bmbt_to_iomap(
iomap->validity_cookie = sequence_cookie;
iomap->folio_ops = &xfs_iomap_folio_ops;
+ if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1)
+ iomap->extent_size = XFS_FSB_TO_B(mp, ip->i_extsize);
return 0;
}
@@ -570,8 +572,15 @@ xfs_iomap_write_unwritten(
trace_xfs_unwritten_convert(ip, offset, count);
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1) {
+ xfs_extlen_t extsize_bytes = mp->m_sb.sb_blocksize * ip->i_extsize;
+
+ offset_fsb = XFS_B_TO_FSBT(mp, round_down(offset, extsize_bytes));
+ count_fsb = XFS_B_TO_FSB(mp, round_up(offset + count, extsize_bytes));
+ } else {
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ }
count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
/*
--
2.31.1
On Mon, Apr 29, 2024 at 05:47:40PM +0000, John Garry wrote:
> Set iomap->extent_size when sub-extent zeroing is required.
>
> We treat a sub-extent write same as an unaligned write, so we can leverage
> the existing sub-FSblock unaligned write support, i.e. try a shared lock
> with IOMAP_DIO_OVERWRITE_ONLY flag, if this fails then try the exclusive
> lock.
>
> In xfs_iomap_write_unwritten(), FSB calcs are now based on the extsize.
If forcealign is set, should we just reject unaligned DIOs?
.....
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> fs/xfs/xfs_file.c | 35 ++++++++++++++++++++++-------------
> fs/xfs/xfs_iomap.c | 13 +++++++++++--
> 2 files changed, 33 insertions(+), 15 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index e81e01e6b22b..ee4f94cf6f4e 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -620,18 +620,19 @@ xfs_file_dio_write_aligned(
> * Handle block unaligned direct I/O writes
* Handle unaligned direct IO writes.
> *
> * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
> - * them to be done in parallel with reads and other direct I/O writes. However,
> - * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
> - * to do sub-block zeroing and that requires serialisation against other direct
> - * I/O to the same block. In this case we need to serialise the submission of
> - * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
> - * In the case where sub-block zeroing is not required, we can do concurrent
> - * sub-block dios to the same block successfully.
> + * them to be done in parallel with reads and other direct I/O writes.
> + * However if the I/O is not aligned to filesystem blocks/extent, the direct
> + * I/O layer may need to do sub-block/extent zeroing and that requires
> + * serialisation against other direct I/O to the same block/extent. In this
> + * case we need to serialise the submission of the unaligned I/O so that we
> + * don't get racing block/extent zeroing in the dio layer.
> + * In the case where sub-block/extent zeroing is not required, we can do
> + * concurrent sub-block/extent dios to the same block/extent successfully.
> *
> * Optimistically submit the I/O using the shared lock first, but use the
> * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
> - * if block allocation or partial block zeroing would be required. In that case
> - * we try again with the exclusive lock.
> + * if block/extent allocation or partial block/extent zeroing would be
> + * required. In that case we try again with the exclusive lock.
Rather than changing every "block" to "block/extent", leave the bulk
of the comment unchanged and add another paragraph to it that says
something like:
* If forced extent alignment is turned on, then serialisation
* constraints are extended from filesystem block alignment
* to extent alignment boundaries. In this case, we treat any
* non-extent-aligned DIO the same as a sub-block DIO.
> */
> static noinline ssize_t
> xfs_file_dio_write_unaligned(
> @@ -646,9 +647,9 @@ xfs_file_dio_write_unaligned(
> ssize_t ret;
>
> /*
> - * Extending writes need exclusivity because of the sub-block zeroing
> - * that the DIO code always does for partial tail blocks beyond EOF, so
> - * don't even bother trying the fast path in this case.
> + * Extending writes need exclusivity because of the sub-block/extent
> + * zeroing that the DIO code always does for partial tail blocks
> + * beyond EOF, so don't even bother trying the fast path in this case.
> */
> if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
> if (iocb->ki_flags & IOCB_NOWAIT)
> @@ -714,11 +715,19 @@ xfs_file_dio_write(
> struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
> struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> size_t count = iov_iter_count(from);
> + struct xfs_mount *mp = ip->i_mount;
> + unsigned int blockmask;
>
> /* direct I/O must be aligned to device logical sector size */
> if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> return -EINVAL;
> - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
> +
> + if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1)
> + blockmask = XFS_FSB_TO_B(mp, ip->i_extsize) - 1;
> + else
> + blockmask = mp->m_blockmask;
alignmask = XFS_FSB_TO_B(mp, xfs_inode_alignment(ip)) - 1;
Note that this would consider sub rt_extsize IO as unaligned, which
may be undesirable. In that case, we should define a second helper
such as xfs_inode_io_alignment() that doesn't take into account RT
extent sizes because we can still do filesystem block sized
unwritten extent conversion on those devices. The same IO-specific
wrapper would be used for the other cases in this patch, too.
> +
> + if ((iocb->ki_pos | count) & blockmask)
> return xfs_file_dio_write_unaligned(ip, iocb, from);
> return xfs_file_dio_write_aligned(ip, iocb, from);
> }
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 4087af7f3c9f..1a3692bbc84d 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -138,6 +138,8 @@ xfs_bmbt_to_iomap(
>
> iomap->validity_cookie = sequence_cookie;
> iomap->folio_ops = &xfs_iomap_folio_ops;
> + if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1)
> + iomap->extent_size = XFS_FSB_TO_B(mp, ip->i_extsize);
iomap->io_block_size = XFS_FSB_TO_B(mp, xfs_inode_alignment(ip));
> return 0;
> }
>
> @@ -570,8 +572,15 @@ xfs_iomap_write_unwritten(
>
> trace_xfs_unwritten_convert(ip, offset, count);
>
> - offset_fsb = XFS_B_TO_FSBT(mp, offset);
> - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
> + if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1) {
> + xfs_extlen_t extsize_bytes = mp->m_sb.sb_blocksize * ip->i_extsize;
> +
> + offset_fsb = XFS_B_TO_FSBT(mp, round_down(offset, extsize_bytes));
> + count_fsb = XFS_B_TO_FSB(mp, round_up(offset + count, extsize_bytes));
> + } else {
> + offset_fsb = XFS_B_TO_FSBT(mp, offset);
> + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
> + }
More places we can use a xfs_inode_alignment() helper.
offset_fsb = XFS_B_TO_FSBT(mp, offset);
count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
rounding = XFS_FSB_TO_B(mp, xfs_inode_alignment(ip));
if (rounding > 1) {
offset_fsb = rounddown_64(offset_fsb, rounding);
count_fsb = roundup_64(count_fsb, rounding);
}
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
On 01/05/2024 02:32, Dave Chinner wrote:
> On Mon, Apr 29, 2024 at 05:47:40PM +0000, John Garry wrote:
>> Set iomap->extent_size when sub-extent zeroing is required.
>>
>> We treat a sub-extent write same as an unaligned write, so we can leverage
>> the existing sub-FSblock unaligned write support, i.e. try a shared lock
>> with IOMAP_DIO_OVERWRITE_ONLY flag, if this fails then try the exclusive
>> lock.
>>
>> In xfs_iomap_write_unwritten(), FSB calcs are now based on the extsize.
>
> If forcedalign is set, should we just reject unaligned DIOs?
Why would we? That's very restrictive. Indeed, we got to the point of
adding the sub-extent zeroing precisely to support unaligned DIOs.
>
> .....
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>> fs/xfs/xfs_file.c | 35 ++++++++++++++++++++++-------------
>> fs/xfs/xfs_iomap.c | 13 +++++++++++--
>> 2 files changed, 33 insertions(+), 15 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index e81e01e6b22b..ee4f94cf6f4e 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -620,18 +620,19 @@ xfs_file_dio_write_aligned(
>> * Handle block unaligned direct I/O writes
>
> * Handle unaligned direct IO writes.
>
>> *
>> * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
>> - * them to be done in parallel with reads and other direct I/O writes. However,
>> - * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
>> - * to do sub-block zeroing and that requires serialisation against other direct
>> - * I/O to the same block. In this case we need to serialise the submission of
>> - * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
>> - * In the case where sub-block zeroing is not required, we can do concurrent
>> - * sub-block dios to the same block successfully.
>> + * them to be done in parallel with reads and other direct I/O writes.
>> + * However if the I/O is not aligned to filesystem blocks/extent, the direct
>> + * I/O layer may need to do sub-block/extent zeroing and that requires
>> + * serialisation against other direct I/O to the same block/extent. In this
>> + * case we need to serialise the submission of the unaligned I/O so that we
>> + * don't get racing block/extent zeroing in the dio layer.
>> + * In the case where sub-block/extent zeroing is not required, we can do
>> + * concurrent sub-block/extent dios to the same block/extent successfully.
>> *
>> * Optimistically submit the I/O using the shared lock first, but use the
>> * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
>> - * if block allocation or partial block zeroing would be required. In that case
>> - * we try again with the exclusive lock.
>> + * if block/extent allocation or partial block/extent zeroing would be
>> + * required. In that case we try again with the exclusive lock.
>
> Rather than changing every "block" to "block/extent", leave the bulk
> of the comment unchanged and add another paragraph to it that says
> something like:
>
> * If forced extent alignment is turned on, then serialisation
> * constraints are extended from filesystem block alignment
> * to extent alignment boundaries. In this case, we treat any
> * non-extent-aligned DIO the same as a sub-block DIO.
ok, fine
>
>> */
>> static noinline ssize_t
>> xfs_file_dio_write_unaligned(
>> @@ -646,9 +647,9 @@ xfs_file_dio_write_unaligned(
>> ssize_t ret;
>>
>> /*
>> - * Extending writes need exclusivity because of the sub-block zeroing
>> - * that the DIO code always does for partial tail blocks beyond EOF, so
>> - * don't even bother trying the fast path in this case.
>> + * Extending writes need exclusivity because of the sub-block/extent
>> + * zeroing that the DIO code always does for partial tail blocks
>> + * beyond EOF, so don't even bother trying the fast path in this case.
>> */
>> if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
>> if (iocb->ki_flags & IOCB_NOWAIT)
>> @@ -714,11 +715,19 @@ xfs_file_dio_write(
>> struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
>> struct xfs_buftarg *target = xfs_inode_buftarg(ip);
>> size_t count = iov_iter_count(from);
>> + struct xfs_mount *mp = ip->i_mount;
>> + unsigned int blockmask;
>>
>> /* direct I/O must be aligned to device logical sector size */
>> if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
>> return -EINVAL;
>> - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
>> +
>> + if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1)
>> + blockmask = XFS_FSB_TO_B(mp, ip->i_extsize) - 1;
>> + else
>> + blockmask = mp->m_blockmask;
>
> alignmask = XFS_FSB_TO_B(mp, xfs_inode_alignment(ip)) - 1;
Do you mean xfs_extent_alignment() instead of xfs_inode_alignment()?
>
> Note that this would consider sub rt_extsize IO as unaligned, which
> may be undesirable. In that case, we should define a second helper
> such as xfs_inode_io_alignment() that doesn't take into account RT
> extent sizes because we can still do filesystem block sized
> unwritten extent conversion on those devices. The same IO-specific
> wrapper would be used for the other cases in this patch, too.
ok, fine
>
>> +
>> + if ((iocb->ki_pos | count) & blockmask)
>> return xfs_file_dio_write_unaligned(ip, iocb, from);
>> return xfs_file_dio_write_aligned(ip, iocb, from);
>> }
>> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
>> index 4087af7f3c9f..1a3692bbc84d 100644
>> --- a/fs/xfs/xfs_iomap.c
>> +++ b/fs/xfs/xfs_iomap.c
>> @@ -138,6 +138,8 @@ xfs_bmbt_to_iomap(
>>
>> iomap->validity_cookie = sequence_cookie;
>> iomap->folio_ops = &xfs_iomap_folio_ops;
>> + if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1)
>> + iomap->extent_size = XFS_FSB_TO_B(mp, ip->i_extsize);
>
> iomap->io_block_size = XFS_FSB_TO_B(mp, xfs_inode_alignment(ip));
>
>> return 0;
>> }
>>
>> @@ -570,8 +572,15 @@ xfs_iomap_write_unwritten(
>>
>> trace_xfs_unwritten_convert(ip, offset, count);
>>
>> - offset_fsb = XFS_B_TO_FSBT(mp, offset);
>> - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
>> + if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1) {
>> + xfs_extlen_t extsize_bytes = mp->m_sb.sb_blocksize * ip->i_extsize;
>> +
>> + offset_fsb = XFS_B_TO_FSBT(mp, round_down(offset, extsize_bytes));
>> + count_fsb = XFS_B_TO_FSB(mp, round_up(offset + count, extsize_bytes));
>> + } else {
>> + offset_fsb = XFS_B_TO_FSBT(mp, offset);
>> + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
>> + }
>
> More places we can use a xfs_inode_alignment() helper.
>
> offset_fsb = XFS_B_TO_FSBT(mp, offset);
> count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
> rounding = XFS_FSB_TO_B(mp, xfs_inode_alignment(ip));
> if (rounding > 1) {
> offset_fsb = rounddown_64(offset_fsb, rounding);
> count_fsb = roundup_64(count_fsb, rounding);
> }
ok, but again I assume you mean xfs_extent_alignment()
Thanks,
John
On Wed, May 01, 2024 at 12:36:02PM +0100, John Garry wrote:
> On 01/05/2024 02:32, Dave Chinner wrote:
> > On Mon, Apr 29, 2024 at 05:47:40PM +0000, John Garry wrote:
> > > Set iomap->extent_size when sub-extent zeroing is required.
> > >
> > > We treat a sub-extent write same as an unaligned write, so we can leverage
> > > the existing sub-FSblock unaligned write support, i.e. try a shared lock
> > > with IOMAP_DIO_OVERWRITE_ONLY flag, if this fails then try the exclusive
> > > lock.
> > >
> > > In xfs_iomap_write_unwritten(), FSB calcs are now based on the extsize.
> >
> > If forcedalign is set, should we just reject unaligned DIOs?
>
> Why would we? That's very restrictive. Indeed, we got to the point of adding
> the sub-extent zeroing just for supporting that.
> > > @@ -646,9 +647,9 @@ xfs_file_dio_write_unaligned(
> > > ssize_t ret;
> > > /*
> > > - * Extending writes need exclusivity because of the sub-block zeroing
> > > - * that the DIO code always does for partial tail blocks beyond EOF, so
> > > - * don't even bother trying the fast path in this case.
> > > + * Extending writes need exclusivity because of the sub-block/extent
> > > + * zeroing that the DIO code always does for partial tail blocks
> > > + * beyond EOF, so don't even bother trying the fast path in this case.
> > > */
> > > if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
> > > if (iocb->ki_flags & IOCB_NOWAIT)
> > > @@ -714,11 +715,19 @@ xfs_file_dio_write(
> > > struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
> > > struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> > > size_t count = iov_iter_count(from);
> > > + struct xfs_mount *mp = ip->i_mount;
> > > + unsigned int blockmask;
> > > /* direct I/O must be aligned to device logical sector size */
> > > if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> > > return -EINVAL;
> > > - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
> > > +
> > > + if (xfs_inode_has_forcealign(ip) && ip->i_extsize > 1)
> > > + blockmask = XFS_FSB_TO_B(mp, ip->i_extsize) - 1;
> > > + else
> > > + blockmask = mp->m_blockmask;
> >
> > alignmask = XFS_FSB_TO_B(mp, xfs_inode_alignment(ip)) - 1;
>
> Do you mean xfs_extent_alignment() instead of xfs_inode_alignment()?
Yes, I did.
I probably should have named it xfs_inode_extent_alignment() because
clearly I kept thinking of it as "inode alignment"... :)
-Dave.
--
Dave Chinner
david@fromorbit.com
© 2016 - 2025 Red Hat, Inc.