[PATCH 12/16] ext4: add RWF_UNCACHED write support

Jens Axboe posted 16 patches 1 week, 4 days ago
There is a newer version of this series
[PATCH 12/16] ext4: add RWF_UNCACHED write support
Posted by Jens Axboe 1 week, 4 days ago
IOCB_UNCACHED IO needs to prune writeback regions on IO completion,
and hence need the worker punt that ext4 also does for unwritten
extents. Add an io_end flag to manage that.

If foliop is set to foliop_uncached in ext4_write_begin(), then set
FGP_UNCACHED so that __filemap_get_folio() will mark newly created
folios as uncached. That in turn will make writeback completion drop
these ranges from the page cache.

Now that ext4 supports both uncached reads and writes, add the fop_flag
FOP_UNCACHED to enable it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/file.c    |  2 +-
 fs/ext4/inline.c  |  7 ++++++-
 fs/ext4/inode.c   | 18 ++++++++++++++++--
 fs/ext4/page-io.c | 28 ++++++++++++++++------------
 5 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 44b0d418143c..60dc9ffae076 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -279,6 +279,7 @@ struct ext4_system_blocks {
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_UNCACHED	0x0002
 
 struct ext4_io_end_vec {
 	struct list_head list;		/* list of io_end_vec */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f14aed14b9cf..0ef39d738598 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -944,7 +944,7 @@ const struct file_operations ext4_file_operations = {
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= ext4_fallocate,
 	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
-			  FOP_DIO_PARALLEL_WRITE,
+			  FOP_DIO_PARALLEL_WRITE | FOP_UNCACHED,
 };
 
 const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3536ca7e4fcc..4089d0744164 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -667,6 +667,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 	handle_t *handle;
 	struct folio *folio;
 	struct ext4_iloc iloc;
+	fgf_t fgp_flags;
 
 	if (pos + len > ext4_get_max_inline_size(inode))
 		goto convert;
@@ -702,7 +703,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 	if (ret)
 		goto out;
 
-	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
+	fgp_flags = FGP_WRITEBEGIN | FGP_NOFS;
+	if (*foliop == foliop_uncached)
+		fgp_flags |= FGP_UNCACHED;
+
+	folio = __filemap_get_folio(mapping, 0, fgp_flags,
 					mapping_gfp_mask(mapping));
 	if (IS_ERR(folio)) {
 		ret = PTR_ERR(folio);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 54bdd4884fe6..afae3ab64c9e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1138,6 +1138,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	int ret, needed_blocks;
 	handle_t *handle;
 	int retries = 0;
+	fgf_t fgp_flags;
 	struct folio *folio;
 	pgoff_t index;
 	unsigned from, to;
@@ -1164,6 +1165,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 			return 0;
 	}
 
+	/*
+	 * Set FGP_WRITEBEGIN, and FGP_UNCACHED if foliop contains
+	 * foliop_uncached. That's how generic_perform_write() informs us
+	 * that this is an uncached write.
+	 */
+	fgp_flags = FGP_WRITEBEGIN;
+	if (*foliop == foliop_uncached)
+		fgp_flags |= FGP_UNCACHED;
+
 	/*
 	 * __filemap_get_folio() can take a long time if the
 	 * system is thrashing due to memory pressure, or if the folio
@@ -1172,7 +1182,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	 * the folio (if needed) without using GFP_NOFS.
 	 */
 retry_grab:
-	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+	folio = __filemap_get_folio(mapping, index, fgp_flags,
 					mapping_gfp_mask(mapping));
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
@@ -2903,6 +2913,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	struct folio *folio;
 	pgoff_t index;
 	struct inode *inode = mapping->host;
+	fgf_t fgp_flags;
 
 	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
 		return -EIO;
@@ -2926,8 +2937,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 			return 0;
 	}
 
+	fgp_flags = FGP_WRITEBEGIN;
+	if (*foliop == foliop_uncached)
+		fgp_flags |= FGP_UNCACHED;
 retry:
-	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+	folio = __filemap_get_folio(mapping, index, fgp_flags,
 			mapping_gfp_mask(mapping));
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ad5543866d21..10447c3c4ff1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -226,8 +226,6 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
 	unsigned long flags;
 
 	/* Only reserved conversions from writeback should enter here */
-	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-	WARN_ON(!io_end->handle && sbi->s_journal);
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	wq = sbi->rsv_conversion_wq;
 	if (list_empty(&ei->i_rsv_conversion_list))
@@ -252,7 +250,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 
 	while (!list_empty(&unwritten)) {
 		io_end = list_entry(unwritten.next, ext4_io_end_t, list);
-		BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+		BUG_ON(!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_UNCACHED)));
 		list_del_init(&io_end->list);
 
 		err = ext4_end_io_end(io_end);
@@ -287,14 +285,15 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 
 void ext4_put_io_end_defer(ext4_io_end_t *io_end)
 {
-	if (refcount_dec_and_test(&io_end->count)) {
-		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
-				list_empty(&io_end->list_vec)) {
-			ext4_release_io_end(io_end);
-			return;
-		}
-		ext4_add_complete_io(io_end);
+	if (!refcount_dec_and_test(&io_end->count))
+		return;
+	if ((!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+	    list_empty(&io_end->list_vec)) &&
+	    !(io_end->flag & EXT4_IO_UNCACHED)) {
+		ext4_release_io_end(io_end);
+		return;
 	}
+	ext4_add_complete_io(io_end);
 }
 
 int ext4_put_io_end(ext4_io_end_t *io_end)
@@ -348,7 +347,7 @@ static void ext4_end_bio(struct bio *bio)
 				blk_status_to_errno(bio->bi_status));
 	}
 
-	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+	if (io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_UNCACHED)) {
 		/*
 		 * Link bio into list hanging from io_end. We have to do it
 		 * atomically as bio completions can be racing against each
@@ -417,8 +416,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
 submit_and_retry:
 		ext4_io_submit(io);
 	}
-	if (io->io_bio == NULL)
+	if (io->io_bio == NULL) {
 		io_submit_init_bio(io, bh);
+		if (folio_test_uncached(folio)) {
+			ext4_io_end_t *io_end = io->io_bio->bi_private;
+			io_end->flag |= EXT4_IO_UNCACHED;
+		}
+	}
 	if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
 		goto submit_and_retry;
 	wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
-- 
2.45.2
Re: [PATCH 12/16] ext4: add RWF_UNCACHED write support
Posted by Brian Foster 1 week, 4 days ago
On Mon, Nov 11, 2024 at 04:37:39PM -0700, Jens Axboe wrote:
> IOCB_UNCACHED IO needs to prune writeback regions on IO completion,
> and hence need the worker punt that ext4 also does for unwritten
> extents. Add an io_end flag to manage that.
> 
> If foliop is set to foliop_uncached in ext4_write_begin(), then set
> FGP_UNCACHED so that __filemap_get_folio() will mark newly created
> folios as uncached. That in turn will make writeback completion drop
> these ranges from the page cache.
> 
> Now that ext4 supports both uncached reads and writes, add the fop_flag
> FOP_UNCACHED to enable it.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/ext4/ext4.h    |  1 +
>  fs/ext4/file.c    |  2 +-
>  fs/ext4/inline.c  |  7 ++++++-
>  fs/ext4/inode.c   | 18 ++++++++++++++++--
>  fs/ext4/page-io.c | 28 ++++++++++++++++------------
>  5 files changed, 40 insertions(+), 16 deletions(-)
> 
...
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 54bdd4884fe6..afae3ab64c9e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1138,6 +1138,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>  	int ret, needed_blocks;
>  	handle_t *handle;
>  	int retries = 0;
> +	fgf_t fgp_flags;
>  	struct folio *folio;
>  	pgoff_t index;
>  	unsigned from, to;
> @@ -1164,6 +1165,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>  			return 0;
>  	}
>  
> +	/*
> +	 * Set FGP_WRITEBEGIN, and FGP_UNCACHED if foliop contains
> +	 * foliop_uncached. That's how generic_perform_write() informs us
> +	 * that this is an uncached write.
> +	 */
> +	fgp_flags = FGP_WRITEBEGIN;
> +	if (*foliop == foliop_uncached)
> +		fgp_flags |= FGP_UNCACHED;
> +
>  	/*
>  	 * __filemap_get_folio() can take a long time if the
>  	 * system is thrashing due to memory pressure, or if the folio
> @@ -1172,7 +1182,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>  	 * the folio (if needed) without using GFP_NOFS.
>  	 */
>  retry_grab:
> -	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
> +	folio = __filemap_get_folio(mapping, index, fgp_flags,
>  					mapping_gfp_mask(mapping));
>  	if (IS_ERR(folio))
>  		return PTR_ERR(folio);

JFYI, I notice that ext4 cycles the folio lock here in this path and
thus follows up with a couple checks presumably to accommodate that. One
is whether i_mapping has changed, which I assume means uncached state
would have been handled/cleared externally somewhere..? I.e., if an
uncached folio is somehow truncated/freed without ever having been
written back?

The next is a folio_wait_stable() call "in case writeback began ..."
It's not immediately clear to me if that is possible here, but taking
that at face value, is it an issue if we were to create an uncached
folio, drop the folio lock, then have some other task dirty and
writeback the folio (due to a sync write or something), then have
writeback completion invalidate the folio before we relock it here?

Brian

> @@ -2903,6 +2913,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
>  	struct folio *folio;
>  	pgoff_t index;
>  	struct inode *inode = mapping->host;
> +	fgf_t fgp_flags;
>  
>  	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
>  		return -EIO;
> @@ -2926,8 +2937,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
>  			return 0;
>  	}
>  
> +	fgp_flags = FGP_WRITEBEGIN;
> +	if (*foliop == foliop_uncached)
> +		fgp_flags |= FGP_UNCACHED;
>  retry:
> -	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
> +	folio = __filemap_get_folio(mapping, index, fgp_flags,
>  			mapping_gfp_mask(mapping));
>  	if (IS_ERR(folio))
>  		return PTR_ERR(folio);
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index ad5543866d21..10447c3c4ff1 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -226,8 +226,6 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
>  	unsigned long flags;
>  
>  	/* Only reserved conversions from writeback should enter here */
> -	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
> -	WARN_ON(!io_end->handle && sbi->s_journal);
>  	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
>  	wq = sbi->rsv_conversion_wq;
>  	if (list_empty(&ei->i_rsv_conversion_list))
> @@ -252,7 +250,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
>  
>  	while (!list_empty(&unwritten)) {
>  		io_end = list_entry(unwritten.next, ext4_io_end_t, list);
> -		BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
> +		BUG_ON(!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_UNCACHED)));
>  		list_del_init(&io_end->list);
>  
>  		err = ext4_end_io_end(io_end);
> @@ -287,14 +285,15 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
>  
>  void ext4_put_io_end_defer(ext4_io_end_t *io_end)
>  {
> -	if (refcount_dec_and_test(&io_end->count)) {
> -		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
> -				list_empty(&io_end->list_vec)) {
> -			ext4_release_io_end(io_end);
> -			return;
> -		}
> -		ext4_add_complete_io(io_end);
> +	if (!refcount_dec_and_test(&io_end->count))
> +		return;
> +	if ((!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
> +	    list_empty(&io_end->list_vec)) &&
> +	    !(io_end->flag & EXT4_IO_UNCACHED)) {
> +		ext4_release_io_end(io_end);
> +		return;
>  	}
> +	ext4_add_complete_io(io_end);
>  }
>  
>  int ext4_put_io_end(ext4_io_end_t *io_end)
> @@ -348,7 +347,7 @@ static void ext4_end_bio(struct bio *bio)
>  				blk_status_to_errno(bio->bi_status));
>  	}
>  
> -	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
> +	if (io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_UNCACHED)) {
>  		/*
>  		 * Link bio into list hanging from io_end. We have to do it
>  		 * atomically as bio completions can be racing against each
> @@ -417,8 +416,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
>  submit_and_retry:
>  		ext4_io_submit(io);
>  	}
> -	if (io->io_bio == NULL)
> +	if (io->io_bio == NULL) {
>  		io_submit_init_bio(io, bh);
> +		if (folio_test_uncached(folio)) {
> +			ext4_io_end_t *io_end = io->io_bio->bi_private;
> +			io_end->flag |= EXT4_IO_UNCACHED;
> +		}
> +	}
>  	if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
>  		goto submit_and_retry;
>  	wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
> -- 
> 2.45.2
> 
>
Re: [PATCH 12/16] ext4: add RWF_UNCACHED write support
Posted by Jens Axboe 1 week, 4 days ago
On 11/12/24 9:36 AM, Brian Foster wrote:
> On Mon, Nov 11, 2024 at 04:37:39PM -0700, Jens Axboe wrote:
>> IOCB_UNCACHED IO needs to prune writeback regions on IO completion,
>> and hence need the worker punt that ext4 also does for unwritten
>> extents. Add an io_end flag to manage that.
>>
>> If foliop is set to foliop_uncached in ext4_write_begin(), then set
>> FGP_UNCACHED so that __filemap_get_folio() will mark newly created
>> folios as uncached. That in turn will make writeback completion drop
>> these ranges from the page cache.
>>
>> Now that ext4 supports both uncached reads and writes, add the fop_flag
>> FOP_UNCACHED to enable it.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  fs/ext4/ext4.h    |  1 +
>>  fs/ext4/file.c    |  2 +-
>>  fs/ext4/inline.c  |  7 ++++++-
>>  fs/ext4/inode.c   | 18 ++++++++++++++++--
>>  fs/ext4/page-io.c | 28 ++++++++++++++++------------
>>  5 files changed, 40 insertions(+), 16 deletions(-)
>>
> ...
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 54bdd4884fe6..afae3ab64c9e 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -1138,6 +1138,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>>  	int ret, needed_blocks;
>>  	handle_t *handle;
>>  	int retries = 0;
>> +	fgf_t fgp_flags;
>>  	struct folio *folio;
>>  	pgoff_t index;
>>  	unsigned from, to;
>> @@ -1164,6 +1165,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>>  			return 0;
>>  	}
>>  
>> +	/*
>> +	 * Set FGP_WRITEBEGIN, and FGP_UNCACHED if foliop contains
>> +	 * foliop_uncached. That's how generic_perform_write() informs us
>> +	 * that this is an uncached write.
>> +	 */
>> +	fgp_flags = FGP_WRITEBEGIN;
>> +	if (*foliop == foliop_uncached)
>> +		fgp_flags |= FGP_UNCACHED;
>> +
>>  	/*
>>  	 * __filemap_get_folio() can take a long time if the
>>  	 * system is thrashing due to memory pressure, or if the folio
>> @@ -1172,7 +1182,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>>  	 * the folio (if needed) without using GFP_NOFS.
>>  	 */
>>  retry_grab:
>> -	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
>> +	folio = __filemap_get_folio(mapping, index, fgp_flags,
>>  					mapping_gfp_mask(mapping));
>>  	if (IS_ERR(folio))
>>  		return PTR_ERR(folio);
> 
> JFYI, I notice that ext4 cycles the folio lock here in this path and
> thus follows up with a couple checks presumably to accommodate that. One
> is whether i_mapping has changed, which I assume means uncached state
> would have been handled/cleared externally somewhere..? I.e., if an
> uncached folio is somehow truncated/freed without ever having been
> written back?
> 
> The next is a folio_wait_stable() call "in case writeback began ..."
> It's not immediately clear to me if that is possible here, but taking
> that at face value, is it an issue if we were to create an uncached
> folio, drop the folio lock, then have some other task dirty and
> writeback the folio (due to a sync write or something), then have
> writeback completion invalidate the folio before we relock it here?

I don't either of those are an issue. The UNCACHED flag will only be set
on a newly created folio, it does not get inherited for folios that
already exist.

-- 
Jens Axboe
Re: [PATCH 12/16] ext4: add RWF_UNCACHED write support
Posted by Brian Foster 1 week, 4 days ago
On Tue, Nov 12, 2024 at 10:13:12AM -0700, Jens Axboe wrote:
> On 11/12/24 9:36 AM, Brian Foster wrote:
> > On Mon, Nov 11, 2024 at 04:37:39PM -0700, Jens Axboe wrote:
> >> IOCB_UNCACHED IO needs to prune writeback regions on IO completion,
> >> and hence need the worker punt that ext4 also does for unwritten
> >> extents. Add an io_end flag to manage that.
> >>
> >> If foliop is set to foliop_uncached in ext4_write_begin(), then set
> >> FGP_UNCACHED so that __filemap_get_folio() will mark newly created
> >> folios as uncached. That in turn will make writeback completion drop
> >> these ranges from the page cache.
> >>
> >> Now that ext4 supports both uncached reads and writes, add the fop_flag
> >> FOP_UNCACHED to enable it.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> >>  fs/ext4/ext4.h    |  1 +
> >>  fs/ext4/file.c    |  2 +-
> >>  fs/ext4/inline.c  |  7 ++++++-
> >>  fs/ext4/inode.c   | 18 ++++++++++++++++--
> >>  fs/ext4/page-io.c | 28 ++++++++++++++++------------
> >>  5 files changed, 40 insertions(+), 16 deletions(-)
> >>
> > ...
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index 54bdd4884fe6..afae3ab64c9e 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -1138,6 +1138,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
> >>  	int ret, needed_blocks;
> >>  	handle_t *handle;
> >>  	int retries = 0;
> >> +	fgf_t fgp_flags;
> >>  	struct folio *folio;
> >>  	pgoff_t index;
> >>  	unsigned from, to;
> >> @@ -1164,6 +1165,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
> >>  			return 0;
> >>  	}
> >>  
> >> +	/*
> >> +	 * Set FGP_WRITEBEGIN, and FGP_UNCACHED if foliop contains
> >> +	 * foliop_uncached. That's how generic_perform_write() informs us
> >> +	 * that this is an uncached write.
> >> +	 */
> >> +	fgp_flags = FGP_WRITEBEGIN;
> >> +	if (*foliop == foliop_uncached)
> >> +		fgp_flags |= FGP_UNCACHED;
> >> +
> >>  	/*
> >>  	 * __filemap_get_folio() can take a long time if the
> >>  	 * system is thrashing due to memory pressure, or if the folio
> >> @@ -1172,7 +1182,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
> >>  	 * the folio (if needed) without using GFP_NOFS.
> >>  	 */
> >>  retry_grab:
> >> -	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
> >> +	folio = __filemap_get_folio(mapping, index, fgp_flags,
> >>  					mapping_gfp_mask(mapping));
> >>  	if (IS_ERR(folio))
> >>  		return PTR_ERR(folio);
> > 
> > JFYI, I notice that ext4 cycles the folio lock here in this path and
> > thus follows up with a couple checks presumably to accommodate that. One
> > is whether i_mapping has changed, which I assume means uncached state
> > would have been handled/cleared externally somewhere..? I.e., if an
> > uncached folio is somehow truncated/freed without ever having been
> > written back?
> > 
> > The next is a folio_wait_stable() call "in case writeback began ..."
> > It's not immediately clear to me if that is possible here, but taking
> > that at face value, is it an issue if we were to create an uncached
> > folio, drop the folio lock, then have some other task dirty and
> > writeback the folio (due to a sync write or something), then have
> > writeback completion invalidate the folio before we relock it here?
> 
> I don't either of those are an issue. The UNCACHED flag will only be set
> on a newly created folio, it does not get inherited for folios that
> already exist.
> 

Right.. but what I was wondering for that latter case is if the folio is
created here by ext4, so uncached is set before it is unlocked.

On second look I guess the uncached completion invalidation should clear
mapping and thus trigger the retry logic here. That seems reasonable
enough, but is it still possible to race with writeback?

Maybe this is a better way to ask.. what happens if a write completes to
an uncached folio that is already under writeback? For example, uncached
write 1 completes, submits for writeback and returns to userspace. Then
write 2 begins and redirties the same folio before the uncached
writeback completes.

If I follow correctly, if write 2 is also uncached, it eventually blocks
in writeback submission (folio_prepare_writeback() ->
folio_wait_writeback()). It looks like folio lock is held there, so
presumably that would bypass the completion time invalidation in
folio_end_uncached(). But what if write 2 was not uncached or perhaps
writeback completion won the race for folio lock vs. the write side
(between locking the folio for dirtying and later for writeback
submission)? Does anything prevent invalidation of the folio before the
second write is submitted for writeback?

IOW, I'm wondering if the uncached completion time invalidation also
needs a folio dirty check..?

Brian

> -- 
> Jens Axboe
>
Re: [PATCH 12/16] ext4: add RWF_UNCACHED write support
Posted by Jens Axboe 1 week, 3 days ago
On 11/12/24 11:11 AM, Brian Foster wrote:
> On Tue, Nov 12, 2024 at 10:13:12AM -0700, Jens Axboe wrote:
>> On 11/12/24 9:36 AM, Brian Foster wrote:
>>> On Mon, Nov 11, 2024 at 04:37:39PM -0700, Jens Axboe wrote:
>>>> IOCB_UNCACHED IO needs to prune writeback regions on IO completion,
>>>> and hence need the worker punt that ext4 also does for unwritten
>>>> extents. Add an io_end flag to manage that.
>>>>
>>>> If foliop is set to foliop_uncached in ext4_write_begin(), then set
>>>> FGP_UNCACHED so that __filemap_get_folio() will mark newly created
>>>> folios as uncached. That in turn will make writeback completion drop
>>>> these ranges from the page cache.
>>>>
>>>> Now that ext4 supports both uncached reads and writes, add the fop_flag
>>>> FOP_UNCACHED to enable it.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>>  fs/ext4/ext4.h    |  1 +
>>>>  fs/ext4/file.c    |  2 +-
>>>>  fs/ext4/inline.c  |  7 ++++++-
>>>>  fs/ext4/inode.c   | 18 ++++++++++++++++--
>>>>  fs/ext4/page-io.c | 28 ++++++++++++++++------------
>>>>  5 files changed, 40 insertions(+), 16 deletions(-)
>>>>
>>> ...
>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>> index 54bdd4884fe6..afae3ab64c9e 100644
>>>> --- a/fs/ext4/inode.c
>>>> +++ b/fs/ext4/inode.c
>>>> @@ -1138,6 +1138,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>>>>  	int ret, needed_blocks;
>>>>  	handle_t *handle;
>>>>  	int retries = 0;
>>>> +	fgf_t fgp_flags;
>>>>  	struct folio *folio;
>>>>  	pgoff_t index;
>>>>  	unsigned from, to;
>>>> @@ -1164,6 +1165,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>>>>  			return 0;
>>>>  	}
>>>>  
>>>> +	/*
>>>> +	 * Set FGP_WRITEBEGIN, and FGP_UNCACHED if foliop contains
>>>> +	 * foliop_uncached. That's how generic_perform_write() informs us
>>>> +	 * that this is an uncached write.
>>>> +	 */
>>>> +	fgp_flags = FGP_WRITEBEGIN;
>>>> +	if (*foliop == foliop_uncached)
>>>> +		fgp_flags |= FGP_UNCACHED;
>>>> +
>>>>  	/*
>>>>  	 * __filemap_get_folio() can take a long time if the
>>>>  	 * system is thrashing due to memory pressure, or if the folio
>>>> @@ -1172,7 +1182,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
>>>>  	 * the folio (if needed) without using GFP_NOFS.
>>>>  	 */
>>>>  retry_grab:
>>>> -	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
>>>> +	folio = __filemap_get_folio(mapping, index, fgp_flags,
>>>>  					mapping_gfp_mask(mapping));
>>>>  	if (IS_ERR(folio))
>>>>  		return PTR_ERR(folio);
>>>
>>> JFYI, I notice that ext4 cycles the folio lock here in this path and
>>> thus follows up with a couple checks presumably to accommodate that. One
>>> is whether i_mapping has changed, which I assume means uncached state
>>> would have been handled/cleared externally somewhere..? I.e., if an
>>> uncached folio is somehow truncated/freed without ever having been
>>> written back?
>>>
>>> The next is a folio_wait_stable() call "in case writeback began ..."
>>> It's not immediately clear to me if that is possible here, but taking
>>> that at face value, is it an issue if we were to create an uncached
>>> folio, drop the folio lock, then have some other task dirty and
>>> writeback the folio (due to a sync write or something), then have
>>> writeback completion invalidate the folio before we relock it here?
>>
>> I don't either of those are an issue. The UNCACHED flag will only be set
>> on a newly created folio, it does not get inherited for folios that
>> already exist.
>>
> 
> Right.. but what I was wondering for that latter case is if the folio is
> created here by ext4, so uncached is set before it is unlocked.
> 
> On second look I guess the uncached completion invalidation should clear
> mapping and thus trigger the retry logic here. That seems reasonable
> enough, but is it still possible to race with writeback?
> 
> Maybe this is a better way to ask.. what happens if a write completes to
> an uncached folio that is already under writeback? For example, uncached
> write 1 completes, submits for writeback and returns to userspace. Then
> write 2 begins and redirties the same folio before the uncached
> writeback completes.
> 
> If I follow correctly, if write 2 is also uncached, it eventually blocks
> in writeback submission (folio_prepare_writeback() ->
> folio_wait_writeback()). It looks like folio lock is held there, so
> presumably that would bypass the completion time invalidation in
> folio_end_uncached(). But what if write 2 was not uncached or perhaps
> writeback completion won the race for folio lock vs. the write side
> (between locking the folio for dirtying and later for writeback
> submission)? Does anything prevent invalidation of the folio before the
> second write is submitted for writeback?
> 
> IOW, I'm wondering if the uncached completion time invalidation also
> needs a folio dirty check..?

Ah ok, I see what you mean. If the folio is dirty, the unmapping will
fail. But I guess with the recent change, we'll actually unmap it first.
I'll add the folio dirty check, thanks!

-- 
Jens Axboe