Add the boilerplate needed to start supporting RWF_WRITETHROUGH in XFS.
We use the direct write ->iomap_begin() functions to ensure the range
under writethrough always has a real non-delalloc extent. We reuse the
XFS dio end IO function to perform extent conversion and i_size handling
for us.
*Note on COW extent over DATA hole case*
In case of an unmapped COW extent over a DATA hole
(due to COW preallocations), leave the extent unmapped until we are just
about to send IO. At that time, use the ->writethrough_submit()
callback to convert the COW extent to written.
We initially tried converting during iomap_begin() time (like dio does)
but that results in a stale data exposure as follows:
1. iomap_begin() - converts the COW extent over the DATA hole to written
and marks IOMAP_F_NEW to handle zeroing.
2. iomap_write_begin() - realises the extent is stale and returns
without zeroing.
3. iomap_begin() - again sees the same COW extent, but it is already
written this time, so we don't mark IOMAP_F_NEW.
4. Since IOMAP_F_NEW is not set, we never zero out the folio and hence
expose stale data.
To avoid the above, take the buffered IO approach of converting the
extent just before IO, when we are sure to have zeroed out the folio.
Co-developed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
fs/xfs/xfs_file.c | 53 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 47 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6246f34df9fd..d8436d840476 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -988,6 +988,39 @@ xfs_file_dax_write(
return ret;
}
+/*
+ * ->writethrough_submit() hook for XFS.
+ *
+ * If the mapping is flagged IOMAP_F_SHARED, convert the CoW extents covering
+ * [offset, offset + count) to regular extents. Returns 0 when there is
+ * nothing to convert, otherwise the result of xfs_reflink_convert_cow().
+ */
+static int
+xfs_writethrough_submit(
+	struct inode		*inode,
+	struct iomap		*iomap,
+	loff_t			offset,
+	u64			count)
+{
+	unsigned int		nofs_ctx;
+	int			ret;
+
+	/* Nothing to do unless the mapping is backed by a CoW extent. */
+	if (!(iomap->flags & IOMAP_F_SHARED))
+		return 0;
+
+	/*
+	 * We are under writethrough context with folio lock possibly held. To
+	 * avoid memory allocation deadlocks, run the conversion inside the
+	 * task-wide nofs context.
+	 */
+	nofs_ctx = memalloc_nofs_save();
+	ret = xfs_reflink_convert_cow(XFS_I(inode), offset, count);
+	memalloc_nofs_restore(nofs_ctx);
+
+	return ret;
+}
+
+/*
+ * Operations used for RWF_WRITETHROUGH buffered writes.
+ *
+ * Mappings come from the direct write iomap ops and completion handling from
+ * the dio write ops; ->writethrough_submit() converts CoW extents to regular
+ * ones. Only referenced from this file, hence static.
+ */
+static const struct iomap_writethrough_ops xfs_writethrough_ops = {
+	.ops			= &xfs_direct_write_iomap_ops,
+	.write_ops		= &xfs_iomap_write_ops,
+	.dops			= &xfs_dio_write_ops,
+	.writethrough_submit	= xfs_writethrough_submit,
+};
+
+
STATIC ssize_t
xfs_file_buffered_write(
struct kiocb *iocb,
@@ -1010,9 +1043,13 @@ xfs_file_buffered_write(
goto out;
trace_xfs_file_buffered_write(iocb, from);
- ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
- NULL);
+ if (iocb->ki_flags & IOCB_WRITETHROUGH) {
+ ret = iomap_file_writethrough_write(iocb, from,
+ &xfs_writethrough_ops, NULL);
+ } else
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops,
+ &xfs_iomap_write_ops, NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -1047,8 +1084,12 @@ xfs_file_buffered_write(
if (ret > 0) {
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
- /* Handle various SYNC-type writes */
- ret = generic_write_sync(iocb, ret);
+ /*
+ * Handle various SYNC-type writes.
+ * For writethrough, we handle sync during completion.
+ */
+ if (!(iocb->ki_flags & IOCB_WRITETHROUGH))
+ ret = generic_write_sync(iocb, ret);
}
return ret;
}
@@ -2042,7 +2083,7 @@ const struct file_operations xfs_file_operations = {
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
- FOP_DONTCACHE,
+ FOP_DONTCACHE | FOP_WRITETHROUGH,
.setlease = generic_setlease,
};
--
2.53.0