In cases of an atomic write occurs for misaligned or discontiguous disk
blocks, we will use a CoW-based method to issue the atomic write.
So, for that case, return -EAGAIN to request that the write be issued in
CoW atomic write mode. The dio write path should detect this, similar to
how misaligned regular DIO writes are handled.
For normal HW-based mode, when the range which we are atomic writing to
covers a shared data extent, try to allocate a new CoW fork. However, if
we find that what we allocated does not meet atomic write requirements
in terms of length and alignment, then fallback on the CoW-based mode
for the atomic write.
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_iomap.c | 137 ++++++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_iomap.h | 1 +
2 files changed, 136 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f3a6ec2d3a40..6c963786530d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -798,6 +798,23 @@ imap_spans_range(
return true;
}
+static bool
+xfs_bmap_valid_for_atomic_write(
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ /* Misaligned start block wrt size */
+ if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
+ return false;
+
+ /* Discontiguous or mixed extents */
+ if (!imap_spans_range(imap, offset_fsb, end_fsb))
+ return false;
+
+ return true;
+}
+
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -812,10 +829,13 @@ xfs_direct_write_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ xfs_fileoff_t orig_end_fsb = end_fsb;
+ bool atomic_hw = flags & IOMAP_ATOMIC_HW;
int nimaps = 1, error = 0;
unsigned int reflink_flags = 0;
bool shared = false;
u16 iomap_flags = 0;
+ bool needs_alloc;
unsigned int lockmode;
u64 seq;
@@ -874,13 +894,37 @@ xfs_direct_write_iomap_begin(
&lockmode, reflink_flags);
if (error)
goto out_unlock;
- if (shared)
+ if (shared) {
+ if (atomic_hw &&
+ !xfs_bmap_valid_for_atomic_write(&cmap,
+ offset_fsb, end_fsb)) {
+ error = -EAGAIN;
+ goto out_unlock;
+ }
goto out_found_cow;
+ }
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
+
+ if (atomic_hw) {
+ error = -EAGAIN;
+ /*
+ * Use CoW method for when we need to alloc > 1 block,
+ * otherwise we might allocate less than what we need here and
+ * have multiple mappings.
+ */
+ if (needs_alloc && orig_end_fsb - offset_fsb > 1)
+ goto out_unlock;
+
+ if (!xfs_bmap_valid_for_atomic_write(&imap, offset_fsb,
+ orig_end_fsb))
+ goto out_unlock;
+ }
+
+ if (needs_alloc)
goto allocate_blocks;
/*
@@ -1021,6 +1065,95 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
+static int
+xfs_atomic_write_sw_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap, cmap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ int nimaps = 1, error;
+ bool shared = false;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+ u64 seq;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (!xfs_has_reflink(mp))
+ return -EINVAL;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, 0);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
+ &lockmode, XFS_REFLINK_CONVERT |
+ XFS_REFLINK_ATOMIC_SW);
+ /*
+ * Don't check @shared. For atomic writes, we should error when
+ * we don't get a COW mapping
+ */
+ if (error)
+ goto out_unlock;
+
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ if (imap.br_startblock != HOLESTARTBLOCK) {
+ seq = xfs_iomap_inode_sequence(ip, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+ if (error)
+ goto out_unlock;
+ }
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, lockmode);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+
+out_unlock:
+ if (lockmode)
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+static int
+xfs_atomic_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ ASSERT(flags & IOMAP_WRITE);
+ ASSERT(flags & IOMAP_DIRECT);
+
+ if (flags & IOMAP_ATOMIC_SW)
+ return xfs_atomic_write_sw_iomap_begin(inode, offset, length,
+ flags, iomap, srcmap);
+
+ ASSERT(flags & IOMAP_ATOMIC_HW);
+ return xfs_direct_write_iomap_begin(inode, offset, length, flags,
+ iomap, srcmap);
+}
+
+const struct iomap_ops xfs_atomic_write_iomap_ops = {
+ .iomap_begin = xfs_atomic_write_iomap_begin,
+};
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index d330c4a581b1..5272cf9ec9d3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
+extern const struct iomap_ops xfs_atomic_write_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
--
2.31.1
On Mon, Mar 10, 2025 at 06:39:41PM +0000, John Garry wrote:
> In cases of an atomic write occurs for misaligned or discontiguous disk
> blocks, we will use a CoW-based method to issue the atomic write.
>
> So, for that case, return -EAGAIN to request that the write be issued in
> CoW atomic write mode. The dio write path should detect this, similar to
> how misaligned regular DIO writes are handled.
How is -EAGAIN going to work here given that it is also used to defer
non-blocking requests to the caller blocking context?
What is the probem with only setting the flag that causes REQ_ATOMIC
to be set from the file system instead of forcing it when calling
iomap_dio_rw?
Also how you ensure this -EAGAIN only happens on the first extent
mapped and you doesn't cause double writes?
> + bool atomic_hw = flags & IOMAP_ATOMIC_HW;
Again, atomic_hw is not a very useful variable name. But the
whole idea of using a non-descriptive bool variable for a flags
field feels like an antipattern to me.
> - if (shared)
> + if (shared) {
> + if (atomic_hw &&
> + !xfs_bmap_valid_for_atomic_write(&cmap,
> + offset_fsb, end_fsb)) {
> + error = -EAGAIN;
> + goto out_unlock;
> + }
> goto out_found_cow;
This needs a big fat comment explaining why bailing out here is
fine and how it works.
> + /*
> + * Use CoW method for when we need to alloc > 1 block,
> + * otherwise we might allocate less than what we need here and
> + * have multiple mappings.
> + */
Describe why this is done, not just what is done.
On 12/03/2025 07:37, Christoph Hellwig wrote:
> On Mon, Mar 10, 2025 at 06:39:41PM +0000, John Garry wrote:
>> In cases of an atomic write occurs for misaligned or discontiguous disk
>> blocks, we will use a CoW-based method to issue the atomic write.
>>
>> So, for that case, return -EAGAIN to request that the write be issued in
>> CoW atomic write mode. The dio write path should detect this, similar to
>> how misaligned regular DIO writes are handled.
>
> How is -EAGAIN going to work here given that it is also used to defer
> non-blocking requests to the caller blocking context?
You are talking about IOMAP_NOWAIT handling, right? If so, we handle
that in xfs_file_dio_write_atomic(), similar to
xfs_file_dio_write_unaligned(), i.e. if IOMAP_NOWAIT is set and we get
-EAGAIN, then we will return -EAGAIN directly to the caller.
>
> What is the probem with only setting the flag that causes REQ_ATOMIC
> to be set from the file system instead of forcing it when calling
> iomap_dio_rw?
We have this in __iomap_dio_rw():
if (dio_flags & IOMAP_DIO_ATOMIC_SW)
iomi.flags |= IOMAP_ATOMIC_SW;
else if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC_HW;
I do admit that the checks are a bit uneven, i.e. check vs
IOMAP_DIO_ATOMIC_SW and IOCB_ATOMIC
If we want a flag to set REQ_ATOMIC from the FS then we need
IOMAP_DIO_BIO_ATOMIC, and that would set IOMAP_BIO_ATOMIC. Is that better?
>
> Also how you ensure this -EAGAIN only happens on the first extent
> mapped and you doesn't cause double writes?
When we find that a mapping does not suit REQ_ATOMIC-based atomic write,
then we immediately bail and retry with FS-based atomic write. And that
check should cover all requirements for a REQ_ATOMIC-based atomic write:
- aligned
- contiguous blocks, i.e. the mapping covers the full write
And we also have the check in iomap_dio_bit_iter() to ensure that the
mapping covers the full write (for REQ_ATOMIC-based atomic write).
>
>> + bool atomic_hw = flags & IOMAP_ATOMIC_HW;
>
> Again, atomic_hw is not a very useful variable name. But the
> whole idea of using a non-descriptive bool variable for a flags
> field feels like an antipattern to me.
>
>> - if (shared)
>> + if (shared) {
>> + if (atomic_hw &&
>> + !xfs_bmap_valid_for_atomic_write(&cmap,
>> + offset_fsb, end_fsb)) {
>> + error = -EAGAIN;
>> + goto out_unlock;
>> + }
>> goto out_found_cow;
>
> This needs a big fat comment explaining why bailing out here is
> fine and how it works.
ok
>
>> + /*
>> + * Use CoW method for when we need to alloc > 1 block,
>> + * otherwise we might allocate less than what we need here and
>> + * have multiple mappings.
>> + */
>
> Describe why this is done, not just what is done.
I did say that we may get multiple mappings, which obvs is not useful
for REQ_ATOMIC-based atomic write. But I can add a bit more detail.
Thanks,
John
On Wed, Mar 12, 2025 at 09:00:52AM +0000, John Garry wrote: > > How is -EAGAIN going to work here given that it is also used to defer > > non-blocking requests to the caller blocking context? > > You are talking about IOMAP_NOWAIT handling, right? Yes. > If so, we handle that in > xfs_file_dio_write_atomic(), similar to xfs_file_dio_write_unaligned(), i.e. > if IOMAP_NOWAIT is set and we get -EAGAIN, then we will return -EAGAIN > directly to the caller. Can you document this including the interaction between the different cases of -EAGAIN somewhere? > > What is the probem with only setting the flag that causes REQ_ATOMIC > > to be set from the file system instead of forcing it when calling > > iomap_dio_rw? > > We have this in __iomap_dio_rw(): > > if (dio_flags & IOMAP_DIO_ATOMIC_SW) > iomi.flags |= IOMAP_ATOMIC_SW; > else if (iocb->ki_flags & IOCB_ATOMIC) > iomi.flags |= IOMAP_ATOMIC_HW; > > I do admit that the checks are a bit uneven, i.e. check vs > IOMAP_DIO_ATOMIC_SW and IOCB_ATOMIC > > If we want a flag to set REQ_ATOMIC from the FS then we need > IOMAP_DIO_BIO_ATOMIC, and that would set IOMAP_BIO_ATOMIC. Is that better? My expectation from a very cursory view is that iomap would be that there is a IOMAP_F_REQ_ATOMIC that is set in ->iomap_begin and which would make the core iomap code set REQ_ATOMIC on the bio for that iteration. > > Also how you ensure this -EAGAIN only happens on the first extent > > mapped and you doesn't cause double writes? > > When we find that a mapping does not suit REQ_ATOMIC-based atomic write, > then we immediately bail and retry with FS-based atomic write. And that > check should cover all requirements for a REQ_ATOMIC-based atomic write: > - aligned > - contiguous blocks, i.e. the mapping covers the full write > > And we also have the check in iomap_dio_bit_iter() to ensure that the > mapping covers the full write (for REQ_ATOMIC-based atomic write). Ah, I guess that's the if (bio_atomic && length != iter->len) return -EINVAL; So yes, please adda comment there that this is about a single iteration covering the entire write.
On 12/03/2025 13:52, Christoph Hellwig wrote: > On Wed, Mar 12, 2025 at 09:00:52AM +0000, John Garry wrote: >>> How is -EAGAIN going to work here given that it is also used to defer >>> non-blocking requests to the caller blocking context? >> >> You are talking about IOMAP_NOWAIT handling, right? > > Yes. > >> If so, we handle that in >> xfs_file_dio_write_atomic(), similar to xfs_file_dio_write_unaligned(), i.e. >> if IOMAP_NOWAIT is set and we get -EAGAIN, then we will return -EAGAIN >> directly to the caller. > > Can you document this including the interaction between the different > cases of -EAGAIN somewhere? Sure We do the same for dio write unaligned - but that already has a big comment explaining the retry mechanism. > >>> What is the probem with only setting the flag that causes REQ_ATOMIC >>> to be set from the file system instead of forcing it when calling >>> iomap_dio_rw? >> >> We have this in __iomap_dio_rw(): >> >> if (dio_flags & IOMAP_DIO_ATOMIC_SW) >> iomi.flags |= IOMAP_ATOMIC_SW; >> else if (iocb->ki_flags & IOCB_ATOMIC) >> iomi.flags |= IOMAP_ATOMIC_HW; >> >> I do admit that the checks are a bit uneven, i.e. check vs >> IOMAP_DIO_ATOMIC_SW and IOCB_ATOMIC >> >> If we want a flag to set REQ_ATOMIC from the FS then we need >> IOMAP_DIO_BIO_ATOMIC, and that would set IOMAP_BIO_ATOMIC. Is that better? > > My expectation from a very cursory view is that iomap would be that > there is a IOMAP_F_REQ_ATOMIC that is set in ->iomap_begin and which > would make the core iomap code set REQ_ATOMIC on the bio for that > iteration. but we still need to tell ->iomap_begin about IOCB_ATOMIC, hence IOMAP_DIO_BIO_ATOMIC which sets IOMAP_BIO_ATOMIC. We can't allow __iomap_dio_rw() check IOCB_ATOMIC only (and set IOMAP_BIO_ATOMIC), as this is the common path for COW and regular atomic write > >>> Also how you ensure this -EAGAIN only happens on the first extent >>> mapped and you doesn't cause double writes? >> >> When we find that a mapping does not suit REQ_ATOMIC-based atomic write, >> then we immediately bail and retry with FS-based atomic write. And that >> check should cover all requirements for a REQ_ATOMIC-based atomic write: >> - aligned >> - contiguous blocks, i.e. the mapping covers the full write >> >> And we also have the check in iomap_dio_bit_iter() to ensure that the >> mapping covers the full write (for REQ_ATOMIC-based atomic write). > > Ah, I guess that's the > > if (bio_atomic && length != iter->len) > return -EINVAL; > > So yes, please adda comment there that this is about a single iteration > covering the entire write. ok, fine. Thanks, John >
On Wed, Mar 12, 2025 at 02:57:38PM +0000, John Garry wrote: > > > I do admit that the checks are a bit uneven, i.e. check vs > > > IOMAP_DIO_ATOMIC_SW and IOCB_ATOMIC > > > > > > If we want a flag to set REQ_ATOMIC from the FS then we need > > > IOMAP_DIO_BIO_ATOMIC, and that would set IOMAP_BIO_ATOMIC. Is that better? > > > > My expectation from a very cursory view is that iomap would be that > > there is a IOMAP_F_REQ_ATOMIC that is set in ->iomap_begin and which > > would make the core iomap code set REQ_ATOMIC on the bio for that > > iteration. > > but we still need to tell ->iomap_begin about IOCB_ATOMIC, hence Yeah, ->iomap_begin can't directly look at the iocb. > IOMAP_DIO_BIO_ATOMIC which sets IOMAP_BIO_ATOMIC. > > We can't allow __iomap_dio_rw() check IOCB_ATOMIC only (and set > IOMAP_BIO_ATOMIC), as this is the common path for COW and regular atomic > write Well, I'd imagine __iomap_dio_rw just sets IOMAP_ATOMIC from IOCB_ATOMIC and then it's up to file system internal state if it wants to set IOMAP_F_REQ_ATOMIC based on that, i.e. the actual setting of IOMAP_F_REQ_ATOMIC is fully controlled by the file system and not by the iomap core.
On 12/03/2025 15:55, Christoph Hellwig wrote: >>> would make the core iomap code set REQ_ATOMIC on the bio for that >>> iteration. >> but we still need to tell ->iomap_begin about IOCB_ATOMIC, hence > Yeah, ->iomap_begin can't directly look at the iocb. > >> IOMAP_DIO_BIO_ATOMIC which sets IOMAP_BIO_ATOMIC. >> We can't allow __iomap_dio_rw() check IOCB_ATOMIC only (and set >> IOMAP_BIO_ATOMIC), as this is the common path for COW and regular atomic >> write > Well, I'd imagine __iomap_dio_rw just sets IOMAP_ATOMIC from IOCB_ATOMIC > and then it's up to file system internal state if it wants to set > IOMAP_F_REQ_ATOMIC based on that, i.e. the actual setting of > IOMAP_F_REQ_ATOMIC is fully controlled by the file system and not > by the iomap core. ok, fine. But I will also need to change ext4 to do the same (in terms of setting IOMAP_F_REQ_ATOMIC). And I am thinking of IOMAP_F_ATOMIC_BIO as the name, as we mentioned earlier that it is not so nice to with clash block layer names Thanks, John
© 2016 - 2026 Red Hat, Inc.