[PATCH] 9p: fix data corruption with writeback caching during concurrent stat

Pierre Barre posted 1 patch 1 month, 1 week ago
fs/9p/vfs_inode.c      | 11 ++++++++---
fs/9p/vfs_inode_dotl.c | 13 +++++++++----
2 files changed, 17 insertions(+), 7 deletions(-)
[PATCH] 9p: fix data corruption with writeback caching during concurrent stat
Posted by Pierre Barre 1 month, 1 week ago
When using writeback caching (cache=mmap), v9fs_vfs_getattr/setattr have
two issues that can cause data corruption:

1. filemap_fdatawrite() initiates writeback but doesn't wait for
   completion. The subsequent server stat sees stale file size.

2. v9fs_stat2inode()/v9fs_stat2inode_dotl() unconditionally overwrite
   i_size from the server response, even when dirty pages exist locally.
   This causes processes using lseek(SEEK_END) to see incorrect file
   sizes.

Fix by using filemap_write_and_wait() instead of filemap_fdatawrite(),
and passing V9FS_STAT2INODE_KEEP_ISIZE when CACHE_WRITEBACK is enabled
to preserve the local i_size.

Also fix v9fs_vfs_getattr_dotl() to check for CACHE_WRITEBACK specifically
rather than any cache mode.

Signed-off-by: Pierre Barre <pierre@barre.sh>
---
 fs/9p/vfs_inode.c      | 11 ++++++++---
 fs/9p/vfs_inode_dotl.c | 13 +++++++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 97abe65bf7c1..f4c294ca759b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -977,7 +977,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
 		return 0;
 	} else if (v9ses->cache & CACHE_WRITEBACK) {
 		if (S_ISREG(inode->i_mode)) {
-			int retval = filemap_fdatawrite(inode->i_mapping);
+			int retval = filemap_write_and_wait(inode->i_mapping);
 
 			if (retval)
 				p9_debug(P9_DEBUG_ERROR,
@@ -993,7 +993,12 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
-	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
+	/*
+	 * With writeback caching, the client is authoritative for i_size.
+	 * Don't let the server overwrite it with a potentially stale value.
+	 */
+	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb,
+		(v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);
 	generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
 
 	p9stat_free(st);
@@ -1058,7 +1063,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
 
 	/* Write all dirty data */
 	if (d_is_reg(dentry)) {
-		retval = filemap_fdatawrite(inode->i_mapping);
+		retval = filemap_write_and_wait(inode->i_mapping);
 		if (retval)
 			p9_debug(P9_DEBUG_ERROR,
 			    "flushing writeback during setattr returned %d\n", retval);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 643e759eacb2..362a68a2bca3 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -431,9 +431,9 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
 	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
 		generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
 		return 0;
-	} else if (v9ses->cache) {
+	} else if (v9ses->cache & CACHE_WRITEBACK) {
 		if (S_ISREG(inode->i_mode)) {
-			int retval = filemap_fdatawrite(inode->i_mapping);
+			int retval = filemap_write_and_wait(inode->i_mapping);
 
 			if (retval)
 				p9_debug(P9_DEBUG_ERROR,
@@ -453,7 +453,12 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
-	v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
+	/*
+	 * With writeback caching, the client is authoritative for i_size.
+	 * Don't let the server overwrite it with a potentially stale value.
+	 */
+	v9fs_stat2inode_dotl(st, d_inode(dentry),
+		(v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);
 	generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
 	/* Change block size to what the server returned */
 	stat->blksize = st->st_blksize;
@@ -561,7 +566,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
 
 	/* Write all dirty data */
 	if (S_ISREG(inode->i_mode)) {
-		retval = filemap_fdatawrite(inode->i_mapping);
+		retval = filemap_write_and_wait(inode->i_mapping);
 		if (retval < 0)
 			p9_debug(P9_DEBUG_ERROR,
 			    "Flushing file prior to setattr failed: %d\n", retval);
-- 
2.43.0
Re: [PATCH] 9p: fix data corruption with writeback caching during concurrent stat
Posted by Christian Schoenebeck 1 month, 1 week ago
On Saturday, 27 December 2025 09:37:51 CET Pierre Barre wrote:
> When using writeback caching (cache=mmap), v9fs_vfs_getattr/setattr have
> two issues that can cause data corruption:
> 
> 1. filemap_fdatawrite() initiates writeback but doesn't wait for
>    completion. The subsequent server stat sees stale file size.
> 
> 2. v9fs_stat2inode()/v9fs_stat2inode_dotl() unconditionally overwrite
>    i_size from the server response, even when dirty pages exist locally.
>    This causes processes using lseek(SEEK_END) to see incorrect file
>    sizes.
> 
> Fix by using filemap_write_and_wait() instead of filemap_fdatawrite(),
> and passing V9FS_STAT2INODE_KEEP_ISIZE when CACHE_WRITEBACK is enabled
> to preserve the local i_size.
> 
> Also fix v9fs_vfs_getattr_dotl() to check for CACHE_WRITEBACK specifically
> rather than any cache mode.
> 
> Signed-off-by: Pierre Barre <pierre@barre.sh>
> ---
>  fs/9p/vfs_inode.c      | 11 ++++++++---
>  fs/9p/vfs_inode_dotl.c | 13 +++++++++----
>  2 files changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
> index 97abe65bf7c1..f4c294ca759b 100644
> --- a/fs/9p/vfs_inode.c
> +++ b/fs/9p/vfs_inode.c
> @@ -977,7 +977,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct
> path *path, return 0;
>  	} else if (v9ses->cache & CACHE_WRITEBACK) {
>  		if (S_ISREG(inode->i_mode)) {
> -			int retval = filemap_fdatawrite(inode->i_mapping);
> +			int retval = filemap_write_and_wait(inode->i_mapping);

Haven't reviewed thorougly, but this looks wrong to me. The point about write-
back is not having to wait for completion.

>  			if (retval)
>  				p9_debug(P9_DEBUG_ERROR,
> @@ -993,7 +993,12 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct
> path *path, if (IS_ERR(st))
>  		return PTR_ERR(st);
> 
> -	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
> +	/*
> +	 * With writeback caching, the client is authoritative for i_size.
> +	 * Don't let the server overwrite it with a potentially stale value.
> +	 */
> +	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb,
> +		(v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);

And this measure alone (along with the same change in v9fs_vfs_getattr_dotl()
that is) would not fix the misbehavior you encountered?

>  	generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
> 
>  	p9stat_free(st);
> @@ -1058,7 +1063,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
> 
>  	/* Write all dirty data */
>  	if (d_is_reg(dentry)) {
> -		retval = filemap_fdatawrite(inode->i_mapping);
> +		retval = filemap_write_and_wait(inode->i_mapping);
>  		if (retval)
>  			p9_debug(P9_DEBUG_ERROR,
>  			    "flushing writeback during setattr returned %d\n", retval);
> diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
> index 643e759eacb2..362a68a2bca3 100644
> --- a/fs/9p/vfs_inode_dotl.c
> +++ b/fs/9p/vfs_inode_dotl.c
> @@ -431,9 +431,9 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
>  	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
>  		generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
>  		return 0;
> -	} else if (v9ses->cache) {
> +	} else if (v9ses->cache & CACHE_WRITEBACK) {

OK, so here is an inconsistency between v9fs_vfs_getattr_dotl() and
v9fs_vfs_getattr(). But to me it should be the other way around, i.e.
v9fs_vfs_getattr() should do it any cache mode, not only for write-back?

/Christian

>  		if (S_ISREG(inode->i_mode)) {
> -			int retval = filemap_fdatawrite(inode->i_mapping);
> +			int retval = filemap_write_and_wait(inode->i_mapping);
> 
>  			if (retval)
>  				p9_debug(P9_DEBUG_ERROR,
> @@ -453,7 +453,12 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
>  	if (IS_ERR(st))
>  		return PTR_ERR(st);
> 
> -	v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
> +	/*
> +	 * With writeback caching, the client is authoritative for i_size.
> +	 * Don't let the server overwrite it with a potentially stale value.
> +	 */
> +	v9fs_stat2inode_dotl(st, d_inode(dentry),
> +		(v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);
>  	generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
>  	/* Change block size to what the server returned */
>  	stat->blksize = st->st_blksize;
> @@ -561,7 +566,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
> 
>  	/* Write all dirty data */
>  	if (S_ISREG(inode->i_mode)) {
> -		retval = filemap_fdatawrite(inode->i_mapping);
> +		retval = filemap_write_and_wait(inode->i_mapping);
>  		if (retval < 0)
>  			p9_debug(P9_DEBUG_ERROR,
>  			    "Flushing file prior to setattr failed: %d\n", retval);
Re: [PATCH] 9p: fix data corruption with writeback caching during concurrent stat
Posted by Pierre Barre 1 month, 1 week ago
Hi Christian,

On Sat, Dec 27, 2025, at 12:30, Christian Schoenebeck wrote:
> On Saturday, 27 December 2025 09:37:51 CET Pierre Barre wrote:
>> When using writeback caching (cache=mmap), v9fs_vfs_getattr/setattr have
>> two issues that can cause data corruption:
>> 
>> 1. filemap_fdatawrite() initiates writeback but doesn't wait for
>>    completion. The subsequent server stat sees stale file size.
>> 
>> 2. v9fs_stat2inode()/v9fs_stat2inode_dotl() unconditionally overwrite
>>    i_size from the server response, even when dirty pages exist locally.
>>    This causes processes using lseek(SEEK_END) to see incorrect file
>>    sizes.
>> 
>> Fix by using filemap_write_and_wait() instead of filemap_fdatawrite(),
>> and passing V9FS_STAT2INODE_KEEP_ISIZE when CACHE_WRITEBACK is enabled
>> to preserve the local i_size.
>> 
>> Also fix v9fs_vfs_getattr_dotl() to check for CACHE_WRITEBACK specifically
>> rather than any cache mode.
>> 
>> Signed-off-by: Pierre Barre <pierre@barre.sh>
>> ---
>>  fs/9p/vfs_inode.c      | 11 ++++++++---
>>  fs/9p/vfs_inode_dotl.c | 13 +++++++++----
>>  2 files changed, 17 insertions(+), 7 deletions(-)
>> 
>> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
>> index 97abe65bf7c1..f4c294ca759b 100644
>> --- a/fs/9p/vfs_inode.c
>> +++ b/fs/9p/vfs_inode.c
>> @@ -977,7 +977,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct
>> path *path, return 0;
>>  	} else if (v9ses->cache & CACHE_WRITEBACK) {
>>  		if (S_ISREG(inode->i_mode)) {
>> -			int retval = filemap_fdatawrite(inode->i_mapping);
>> +			int retval = filemap_write_and_wait(inode->i_mapping);
>
> Haven't reviewed thorougly, but this looks wrong to me. The point about write-
> back is not having to wait for completion.
>

You're right, apologies for not thinking that through properly.

>>  			if (retval)
>>  				p9_debug(P9_DEBUG_ERROR,
>> @@ -993,7 +993,12 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct
>> path *path, if (IS_ERR(st))
>>  		return PTR_ERR(st);
>> 
>> -	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
>> +	/*
>> +	 * With writeback caching, the client is authoritative for i_size.
>> +	 * Don't let the server overwrite it with a potentially stale value.
>> +	 */
>> +	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb,
>> +		(v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);
>
> And this measure alone (along with the same change in v9fs_vfs_getattr_dotl()
> that is) would not fix the misbehavior you encountered?

Indeed, I tested with only that change and the same workload, and this fixes it.

>>  	generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
>> 
>>  	p9stat_free(st);
>> @@ -1058,7 +1063,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
>> 
>>  	/* Write all dirty data */
>>  	if (d_is_reg(dentry)) {
>> -		retval = filemap_fdatawrite(inode->i_mapping);
>> +		retval = filemap_write_and_wait(inode->i_mapping);
>>  		if (retval)
>>  			p9_debug(P9_DEBUG_ERROR,
>>  			    "flushing writeback during setattr returned %d\n", retval);
>> diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
>> index 643e759eacb2..362a68a2bca3 100644
>> --- a/fs/9p/vfs_inode_dotl.c
>> +++ b/fs/9p/vfs_inode_dotl.c
>> @@ -431,9 +431,9 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
>>  	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
>>  		generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
>>  		return 0;
>> -	} else if (v9ses->cache) {
>> +	} else if (v9ses->cache & CACHE_WRITEBACK) {
>
> OK, so here is an inconsistency between v9fs_vfs_getattr_dotl() and
> v9fs_vfs_getattr(). But to me it should be the other way around, i.e.
> v9fs_vfs_getattr() should do it any cache mode, not only for write-back?

My understanding was that dirty pages only exist with CACHE_WRITEBACK, so filemap_fdatawrite() would be a no-op for other modes. Is there a case I'm missing? Anyhow, even if this understanding is correct, this should probably be a separate patch.

Thanks for the review,
Pierre

> /Christian
>
>>  		if (S_ISREG(inode->i_mode)) {
>> -			int retval = filemap_fdatawrite(inode->i_mapping);
>> +			int retval = filemap_write_and_wait(inode->i_mapping);
>> 
>>  			if (retval)
>>  				p9_debug(P9_DEBUG_ERROR,
>> @@ -453,7 +453,12 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
>>  	if (IS_ERR(st))
>>  		return PTR_ERR(st);
>> 
>> -	v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
>> +	/*
>> +	 * With writeback caching, the client is authoritative for i_size.
>> +	 * Don't let the server overwrite it with a potentially stale value.
>> +	 */
>> +	v9fs_stat2inode_dotl(st, d_inode(dentry),
>> +		(v9ses->cache & CACHE_WRITEBACK) ? V9FS_STAT2INODE_KEEP_ISIZE : 0);
>>  	generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
>>  	/* Change block size to what the server returned */
>>  	stat->blksize = st->st_blksize;
>> @@ -561,7 +566,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
>> 
>>  	/* Write all dirty data */
>>  	if (S_ISREG(inode->i_mode)) {
>> -		retval = filemap_fdatawrite(inode->i_mapping);
>> +		retval = filemap_write_and_wait(inode->i_mapping);
>>  		if (retval < 0)
>>  			p9_debug(P9_DEBUG_ERROR,
>>  			    "Flushing file prior to setattr failed: %d\n", retval);