[v1] ext4: refactor partial block zero-out for iomap conversion

[PATCH 10/10] ext4: zero post-EOF partial block before appending write

Posted by Zhang Yi 1 month ago

From: Zhang Yi <yi.zhang@huawei.com>

In cases of appending write beyond EOF, ext4_zero_partial_blocks() is
called within ext4_*_write_end() to zero out the partial block beyond
EOF. This prevents exposing stale data that might be written through
mmap.

However, supporting only the regular buffered write path is
insufficient. It is also necessary to support the DAX path as well as
the upcoming iomap buffered write path. Therefore, move this operation
to ext4_write_checks().

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/file.c  | 14 ++++++++++++++
 fs/ext4/inode.c | 21 +++++++--------------
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f1dc5ce791a7..b2e44601ab6a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -271,6 +271,8 @@ static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
 
 static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 {
+	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t old_size = i_size_read(inode);
 	ssize_t ret, count;
 
 	count = ext4_generic_write_checks(iocb, from);
@@ -280,6 +282,18 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	ret = file_modified(iocb->ki_filp);
 	if (ret)
 		return ret;
+
+	/*
+	 * If the position is beyond the EOF, it is necessary to zero out the
+	 * partial block that beyond the existing EOF, as it may contains
+	 * stale data written through mmap.
+	 */
+	if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) {
+		ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos);
+		if (ret < 0)
+			return ret;
+	}
+
 	return count;
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5288d36b0f09..67a4d12fcb4d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1456,10 +1456,9 @@ static int ext4_write_end(const struct kiocb *iocb,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	if (old_size < pos && !verity) {
+	if (old_size < pos && !verity)
 		pagecache_isize_extended(inode, old_size, pos);
-		ext4_block_zero_eof(inode, old_size, pos);
-	}
+
 	/*
 	 * Don't mark the inode dirty under folio lock. First, it unnecessarily
 	 * makes the holding time of folio lock longer. Second, it forces lock
@@ -1574,10 +1573,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	if (old_size < pos && !verity) {
+	if (old_size < pos && !verity)
 		pagecache_isize_extended(inode, old_size, pos);
-		ext4_block_zero_eof(inode, old_size, pos);
-	}
 
 	if (size_changed) {
 		ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -3196,7 +3193,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	loff_t old_size = inode->i_size;
 	bool disksize_changed = false;
-	loff_t new_i_size, zero_len = 0;
+	loff_t new_i_size;
 	handle_t *handle;
 
 	if (unlikely(!folio_buffers(folio))) {
@@ -3240,19 +3237,15 @@ static int ext4_da_do_write_end(struct address_space *mapping,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	if (pos > old_size) {
+	if (pos > old_size)
 		pagecache_isize_extended(inode, old_size, pos);
-		zero_len = pos - old_size;
-	}
 
-	if (!disksize_changed && !zero_len)
+	if (!disksize_changed)
 		return copied;
 
-	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
-	if (zero_len)
-		ext4_block_zero_eof(inode, old_size, pos);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
 
-- 
2.52.0

Re: [PATCH 10/10] ext4: zero post-EOF partial block before appending write

Posted by Jan Kara 2 weeks, 2 days ago

On Tue 10-03-26 09:41:01, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> In cases of appending write beyond EOF, ext4_zero_partial_blocks() is
> called within ext4_*_write_end() to zero out the partial block beyond
> EOF. This prevents exposing stale data that might be written through
> mmap.
> 
> However, supporting only the regular buffered write path is
> insufficient. It is also necessary to support the DAX path as well as
> the upcoming iomap buffered write path. Therefore, move this operation
> to ext4_write_checks().
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

I'd note that this allows page fault to race in between the zeroing and
actual write resulting in new possible result - previously for file size 8,
pwrite('WWWW...', 8, 16) racing with mmap writes of 'MMMMMM...' at offset 8
into the page you could see either:

DDDDDDDD00000000WWWWWWWW
or
DDDDDDDDMMMMMMMMMMMMMMMM


now you can see both of the above an also

DDDDDDDMMMMMMMMWWWWWWWWW

But I don't think that's strictly invalid content and userspace that would
depend on the outcome of such race would be silly. So feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/file.c  | 14 ++++++++++++++
>  fs/ext4/inode.c | 21 +++++++--------------
>  2 files changed, 21 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index f1dc5ce791a7..b2e44601ab6a 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -271,6 +271,8 @@ static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
>  
>  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  {
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	loff_t old_size = i_size_read(inode);
>  	ssize_t ret, count;
>  
>  	count = ext4_generic_write_checks(iocb, from);
> @@ -280,6 +282,18 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	ret = file_modified(iocb->ki_filp);
>  	if (ret)
>  		return ret;
> +
> +	/*
> +	 * If the position is beyond the EOF, it is necessary to zero out the
> +	 * partial block that beyond the existing EOF, as it may contains
> +	 * stale data written through mmap.
> +	 */
> +	if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) {
> +		ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	return count;
>  }
>  
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5288d36b0f09..67a4d12fcb4d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1456,10 +1456,9 @@ static int ext4_write_end(const struct kiocb *iocb,
>  	folio_unlock(folio);
>  	folio_put(folio);
>  
> -	if (old_size < pos && !verity) {
> +	if (old_size < pos && !verity)
>  		pagecache_isize_extended(inode, old_size, pos);
> -		ext4_block_zero_eof(inode, old_size, pos);
> -	}
> +
>  	/*
>  	 * Don't mark the inode dirty under folio lock. First, it unnecessarily
>  	 * makes the holding time of folio lock longer. Second, it forces lock
> @@ -1574,10 +1573,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
>  	folio_unlock(folio);
>  	folio_put(folio);
>  
> -	if (old_size < pos && !verity) {
> +	if (old_size < pos && !verity)
>  		pagecache_isize_extended(inode, old_size, pos);
> -		ext4_block_zero_eof(inode, old_size, pos);
> -	}
>  
>  	if (size_changed) {
>  		ret2 = ext4_mark_inode_dirty(handle, inode);
> @@ -3196,7 +3193,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
>  	struct inode *inode = mapping->host;
>  	loff_t old_size = inode->i_size;
>  	bool disksize_changed = false;
> -	loff_t new_i_size, zero_len = 0;
> +	loff_t new_i_size;
>  	handle_t *handle;
>  
>  	if (unlikely(!folio_buffers(folio))) {
> @@ -3240,19 +3237,15 @@ static int ext4_da_do_write_end(struct address_space *mapping,
>  	folio_unlock(folio);
>  	folio_put(folio);
>  
> -	if (pos > old_size) {
> +	if (pos > old_size)
>  		pagecache_isize_extended(inode, old_size, pos);
> -		zero_len = pos - old_size;
> -	}
>  
> -	if (!disksize_changed && !zero_len)
> +	if (!disksize_changed)
>  		return copied;
>  
> -	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
>  	if (IS_ERR(handle))
>  		return PTR_ERR(handle);
> -	if (zero_len)
> -		ext4_block_zero_eof(inode, old_size, pos);
>  	ext4_mark_inode_dirty(handle, inode);
>  	ext4_journal_stop(handle);
>  
> -- 
> 2.52.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH 10/10] ext4: zero post-EOF partial block before appending write

Posted by Zhang Yi 2 weeks, 2 days ago

On 3/24/2026 4:31 AM, Jan Kara wrote:
> On Tue 10-03-26 09:41:01, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> In cases of appending write beyond EOF, ext4_zero_partial_blocks() is
>> called within ext4_*_write_end() to zero out the partial block beyond
>> EOF. This prevents exposing stale data that might be written through
>> mmap.
>>
>> However, supporting only the regular buffered write path is
>> insufficient. It is also necessary to support the DAX path as well as
>> the upcoming iomap buffered write path. Therefore, move this operation
>> to ext4_write_checks().
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> 
> I'd note that this allows page fault to race in between the zeroing and
> actual write resulting in new possible result - previously for file size 8,
> pwrite('WWWW...', 8, 16) racing with mmap writes of 'MMMMMM...' at offset 8
> into the page you could see either:
> 
> DDDDDDDD00000000WWWWWWWW
> or
> DDDDDDDDMMMMMMMMMMMMMMMM
> 
> 
> now you can see both of the above an also
> 
> DDDDDDDMMMMMMMMWWWWWWWWW
> 
> But I don't think that's strictly invalid content and userspace that would
> depend on the outcome of such race would be silly. So feel free to add:

Yes exactly.  :-)

Thanks,
Yi.

> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> 
> 								Honza
> 
>> ---
>>  fs/ext4/file.c  | 14 ++++++++++++++
>>  fs/ext4/inode.c | 21 +++++++--------------
>>  2 files changed, 21 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index f1dc5ce791a7..b2e44601ab6a 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -271,6 +271,8 @@ static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
>>  
>>  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>>  {
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	loff_t old_size = i_size_read(inode);
>>  	ssize_t ret, count;
>>  
>>  	count = ext4_generic_write_checks(iocb, from);
>> @@ -280,6 +282,18 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>>  	ret = file_modified(iocb->ki_filp);
>>  	if (ret)
>>  		return ret;
>> +
>> +	/*
>> +	 * If the position is beyond the EOF, it is necessary to zero out the
>> +	 * partial block that beyond the existing EOF, as it may contains
>> +	 * stale data written through mmap.
>> +	 */
>> +	if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) {
>> +		ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos);
>> +		if (ret < 0)
>> +			return ret;
>> +	}
>> +
>>  	return count;
>>  }
>>  
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 5288d36b0f09..67a4d12fcb4d 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -1456,10 +1456,9 @@ static int ext4_write_end(const struct kiocb *iocb,
>>  	folio_unlock(folio);
>>  	folio_put(folio);
>>  
>> -	if (old_size < pos && !verity) {
>> +	if (old_size < pos && !verity)
>>  		pagecache_isize_extended(inode, old_size, pos);
>> -		ext4_block_zero_eof(inode, old_size, pos);
>> -	}
>> +
>>  	/*
>>  	 * Don't mark the inode dirty under folio lock. First, it unnecessarily
>>  	 * makes the holding time of folio lock longer. Second, it forces lock
>> @@ -1574,10 +1573,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
>>  	folio_unlock(folio);
>>  	folio_put(folio);
>>  
>> -	if (old_size < pos && !verity) {
>> +	if (old_size < pos && !verity)
>>  		pagecache_isize_extended(inode, old_size, pos);
>> -		ext4_block_zero_eof(inode, old_size, pos);
>> -	}
>>  
>>  	if (size_changed) {
>>  		ret2 = ext4_mark_inode_dirty(handle, inode);
>> @@ -3196,7 +3193,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
>>  	struct inode *inode = mapping->host;
>>  	loff_t old_size = inode->i_size;
>>  	bool disksize_changed = false;
>> -	loff_t new_i_size, zero_len = 0;
>> +	loff_t new_i_size;
>>  	handle_t *handle;
>>  
>>  	if (unlikely(!folio_buffers(folio))) {
>> @@ -3240,19 +3237,15 @@ static int ext4_da_do_write_end(struct address_space *mapping,
>>  	folio_unlock(folio);
>>  	folio_put(folio);
>>  
>> -	if (pos > old_size) {
>> +	if (pos > old_size)
>>  		pagecache_isize_extended(inode, old_size, pos);
>> -		zero_len = pos - old_size;
>> -	}
>>  
>> -	if (!disksize_changed && !zero_len)
>> +	if (!disksize_changed)
>>  		return copied;
>>  
>> -	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
>> +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
>>  	if (IS_ERR(handle))
>>  		return PTR_ERR(handle);
>> -	if (zero_len)
>> -		ext4_block_zero_eof(inode, old_size, pos);
>>  	ext4_mark_inode_dirty(handle, inode);
>>  	ext4_journal_stop(handle);
>>  
>> -- 
>> 2.52.0
>>