Add special handling of PG_atomic flag to iomap buffered write path.
To flag an iomap iter for an atomic write, set IOMAP_ATOMIC.
For a folio associated with a write which has IOMAP_ATOMIC set, set
PG_atomic.
Otherwise, when IOMAP_ATOMIC is unset, clear PG_atomic.
This means that an "atomic" folio which has not yet been written back
loses its "atomicity" if it is overwritten non-atomically. So if userspace
issues a write with RWF_ATOMIC set and then another write with RWF_ATOMIC
unset which fully or partially overwrites the region of the first write,
that folio is not written back atomically. Such a scenario would be
considered a userspace usage error.
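As an illustration, a hypothetical userspace sequence which hits this
usage error (fd, iov and pos are placeholders):

	/* folio gets PG_atomic set */
	pwritev2(fd, &iov, 1, pos, RWF_ATOMIC);
	/* overlapping plain write: PG_atomic is cleared again, so the
	 * data is no longer guaranteed to be written back atomically */
	pwritev2(fd, &iov, 1, pos, 0);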
To ensure that a buffered atomic write has actually been written back
atomically by the time the write syscall returns, RWF_SYNC or similar
needs to be used in conjunction with RWF_ATOMIC.
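For example, a hypothetical userspace sketch (assuming a kernel and
filesystem supporting RWF_ATOMIC for buffered IO; buf, len, fd and pos
are placeholders):

	struct iovec iov = { .iov_base = buf, .iov_len = len };

	/* RWF_SYNC forces the (atomic) writeback before pwritev2() returns */
	ret = pwritev2(fd, &iov, 1, pos, RWF_ATOMIC | RWF_SYNC);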
As a safety check, when getting a folio for an atomic write in
iomap_get_folio(), ensure that the folio order implied by the write length
matches both the minimum and maximum folio order of the inode mapping.
Only a single BIO should ever be submitted for an atomic write. So modify
iomap_add_to_ioend() to ensure that we don't try to write back an atomic
folio as part of a larger mixed-atomicity BIO.
In iomap_alloc_ioend(), handle an atomic write by setting REQ_ATOMIC for
the allocated BIO.
When a folio is written back, again clear PG_atomic, as it is no longer
required. I assume it will not be needlessly written back a second time...
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/iomap/buffered-io.c | 53 ++++++++++++++++++++++++++++++++++++------
fs/iomap/trace.h | 3 ++-
include/linux/iomap.h | 1 +
3 files changed, 49 insertions(+), 8 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 4e8e41c8b3c0..ac2a014c91a9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -586,13 +586,25 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
*/
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{
+ struct address_space *mapping = iter->inode->i_mapping;
fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
fgp |= fgf_set_order(len);
- return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ if (iter->flags & IOMAP_ATOMIC) {
+ unsigned int min_order = mapping_min_folio_order(mapping);
+ unsigned int max_order = mapping_max_folio_order(mapping);
+ unsigned int order = FGF_GET_ORDER(fgp);
+
+ if (order != min_order)
+ return ERR_PTR(-EINVAL);
+ if (order != max_order)
+ return ERR_PTR(-EINVAL);
+ }
+
+ return __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
}
EXPORT_SYMBOL_GPL(iomap_get_folio);
@@ -769,6 +781,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
{
const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ bool is_atomic = iter->flags & IOMAP_ATOMIC;
struct folio *folio;
int status = 0;
@@ -786,6 +799,11 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
if (IS_ERR(folio))
return PTR_ERR(folio);
+ if (is_atomic)
+ folio_set_atomic(folio);
+ else
+ folio_clear_atomic(folio);
+
/*
* Now we have a locked folio, before we do anything with it we need to
* check that the iomap we have cached is not stale. The inode extent
@@ -1010,6 +1028,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iter.flags |= IOMAP_ATOMIC;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
@@ -1499,8 +1519,10 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) {
+ folio_clear_atomic(folio);
folio_end_writeback(folio);
+ }
}
/*
@@ -1679,14 +1701,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos)
+ struct writeback_control *wbc, struct inode *inode, loff_t pos,
+ bool atomic)
{
+ blk_opf_t opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
struct iomap_ioend *ioend;
struct bio *bio;
+ if (atomic)
+ opf |= REQ_ATOMIC;
+
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
- REQ_OP_WRITE | wbc_to_write_flags(wbc),
- GFP_NOFS, &iomap_ioend_bioset);
+ opf, GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
@@ -1744,14 +1770,27 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ bool is_atomic = folio_test_atomic(folio);
int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+ if (!wpc->ioend || is_atomic || !iomap_can_add_to_ioend(wpc, pos)) {
new_ioend:
error = iomap_submit_ioend(wpc, 0);
if (error)
return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, is_atomic);
+ }
+
+ /* We must not append anything later if atomic, so submit now */
+ if (is_atomic) {
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ return -EINVAL;
+ wpc->ioend->io_size = len;
+ wbc_account_cgroup_owner(wbc, &folio->page, len);
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
+
+ return iomap_submit_ioend(wpc, 0);
}
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
- { IOMAP_NOWAIT, "NOWAIT" }
+ { IOMAP_NOWAIT, "NOWAIT" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f726f0058fd6..2f50abe06f27 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -179,6 +179,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*
--
2.31.1
On Mon, Apr 22, 2024 at 02:39:21PM +0000, John Garry wrote:
> Add special handling of PG_atomic flag to iomap buffered write path.
>
> [...]
>
> When a folio is written back, again clear PG_atomic, as it is no longer
> required. I assume it will not be needlessly written back a second time...

I'm not taking a position on the mechanism yet; need to think about it
some more.  But there's a hole here I also don't have a solution to,
so we can all start thinking about it.

In iomap_write_iter(), we call copy_folio_from_iter_atomic().  Through no
fault of the application, if the range crosses a page boundary, we might
partially copy the bytes from the first page, then take a page fault on
the second page, hence doing a short write into the folio.  And there's
nothing preventing writeback from writing back a partially copied folio.

Now, if it's not dirty, then it can't be written back.  So if we're
doing an atomic write, we could clear the dirty bit after calling
iomap_write_begin() (given the usage scenarios we've discussed, it should
always be clear ...)

We need to prevent the "fall back to a short copy" logic in
iomap_write_iter() as well.  But then we also need to make sure we don't
get stuck in a loop, so maybe go three times around, and if it's still
not readable as a chunk, -EFAULT?
On 22/04/2024 16:03, Matthew Wilcox wrote:
> On Mon, Apr 22, 2024 at 02:39:21PM +0000, John Garry wrote:
>> Add special handling of PG_atomic flag to iomap buffered write path.
>>
>> [...]
>
> [...]
>
> Now, if it's not dirty, then it can't be written back.  So if we're
> doing an atomic write, we could clear the dirty bit after calling
> iomap_write_begin() (given the usage scenarios we've discussed, it should
> always be clear ...)
>
> We need to prevent the "fall back to a short copy" logic in
> iomap_write_iter() as well.  But then we also need to make sure we don't
> get stuck in a loop, so maybe go three times around, and if it's still
> not readable as a chunk, -EFAULT?

This idea sounds reasonable. So at what stage would the dirty flag be
set? Would it be only when all bytes are copied successfully as a single
chunk?

FWIW, we do have somewhat equivalent handling in the direct IO path: if
the iomap iter loops more than once, such that we would need to create
more than one bio in the DIO bio submission handler, then we return
-EINVAL, as something has gone wrong. But that's not so relevant here.

Thanks,
John
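A rough, untested sketch of the retry idea discussed above (illustrative
only, not part of the posted patch; the "retries" counter and the
"retry_chunk" label are hypothetical additions to iomap_write_iter()):

	copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
	if ((iter->flags & IOMAP_ATOMIC) && copied < bytes) {
		/*
		 * Short copy: skip iomap_write_end(), so the folio is not
		 * dirtied and writeback cannot pick up a partial copy.
		 * Drop the folio, fault the user buffer in and retry the
		 * whole chunk, giving up with -EFAULT after a few attempts.
		 */
		__iomap_put_folio(iter, pos, 0, folio);
		if (++retries >= 3 ||
		    fault_in_iov_iter_readable(i, bytes) == bytes)
			return -EFAULT;
		iov_iter_revert(i, copied);
		goto retry_chunk;
	}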