[PATCH v5 2/2] ocfs2: detect released suballocator BG for fh_to_[dentry|parent]

Heming Zhao posted 2 patches 1 day, 22 hours ago
There is a newer version of this series
[PATCH v5 2/2] ocfs2: detect released suballocator BG for fh_to_[dentry|parent]
Posted by Heming Zhao 1 day, 22 hours ago
After ocfs2 gained the ability to reclaim free suballocator block
groups (BGs), a suballocator block group may be released. This change
causes the xfstests case generic/426 to fail.

generic/426 expects return value -ENOENT or -ESTALE, but the current
code triggers -EROFS.

Call stack before ocfs2 gained the ability to reclaim bg:

ocfs2_fh_to_dentry //or ocfs2_fh_to_parent
 ocfs2_get_dentry
  + ocfs2_test_inode_bit
  |  ocfs2_test_suballoc_bit
  |   + ocfs2_read_group_descriptor //Since ocfs2 never releases the bg,
  |   |                             //the bg block was always found.
  |   + *res = ocfs2_test_bit //unlink was called, and the bit is zero
  |
  + if (!set) //because the above *res is 0
     status = -ESTALE //the generic/426 expected return value

Current call stack that triggers -EROFS:

ocfs2_get_dentry
 ocfs2_test_inode_bit
  ocfs2_test_suballoc_bit
   ocfs2_read_group_descriptor
    + if reading a released bg, validation fails and triggers -EROFS

How to fix:
Since the read BG is already released, we must avoid triggering -EROFS.
With this commit, we use ocfs2_read_hint_group_descriptor() to detect
the released BG block. This approach quietly handles this type of error:
ocfs2_test_suballoc_bit() returns -ESTALE without logging, and the
callers in export.c pass -ESTALE through without logging an error.

Signed-off-by: Heming Zhao <heming.zhao@suse.com>
Reviewed-by: Su Yue <glass.su@suse.com>
---
 fs/ocfs2/export.c   |  6 ++++--
 fs/ocfs2/suballoc.c | 28 ++++++++++++++++++----------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index b95724b767e1..9c2665dd24e2 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -74,8 +74,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 			 * nice
 			 */
 			status = -ESTALE;
-		} else
+		} else if (status != -ESTALE) {
 			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		}
 		goto unlock_nfs_sync;
 	}
 
@@ -162,8 +163,9 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	if (status < 0) {
 		if (status == -EINVAL) {
 			status = -ESTALE;
-		} else
+		} else if (status != -ESTALE) {
 			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		}
 		parent = ERR_PTR(status);
 		goto bail_unlock;
 	}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 9a19f5230c8c..ddcfa6e001e8 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -3152,7 +3152,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 	struct ocfs2_group_desc *group;
 	struct buffer_head *group_bh = NULL;
 	u64 bg_blkno;
-	int status;
+	int status, quiet = 0, released;
 
 	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
 				      (unsigned int)bit);
@@ -3168,11 +3168,15 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 
 	bg_blkno = group_blkno ? group_blkno :
 		   ocfs2_which_suballoc_group(blkno, bit);
-	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
-					     &group_bh);
-	if (status < 0) {
+	status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno,
+					     &group_bh, &released);
+	if (released) {
+		quiet = 1;
+		status = -ESTALE;
+		goto bail;
+	} else if (status < 0) {
 		mlog(ML_ERROR, "read group %llu failed %d\n",
-		     (unsigned long long)bg_blkno, status);
+				(unsigned long long)bg_blkno, status);
 		goto bail;
 	}
 
@@ -3182,7 +3186,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 bail:
 	brelse(group_bh);
 
-	if (status)
+	if (status && (!quiet))
 		mlog_errno(status);
 	return status;
 }
@@ -3202,7 +3206,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
  */
 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 {
-	int status;
+	int status, quiet = 0;
 	u64 group_blkno = 0;
 	u16 suballoc_bit = 0, suballoc_slot = 0;
 	struct inode *inode_alloc_inode;
@@ -3244,8 +3248,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 
 	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
 					 group_blkno, blkno, suballoc_bit, res);
-	if (status < 0)
-		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+	if (status < 0) {
+		if (status == -ESTALE)
+			quiet = 1;
+		else
+			mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+	}
 
 	ocfs2_inode_unlock(inode_alloc_inode, 0);
 	inode_unlock(inode_alloc_inode);
@@ -3253,7 +3261,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 	iput(inode_alloc_inode);
 	brelse(alloc_bh);
 bail:
-	if (status)
+	if (status && !quiet)
 		mlog_errno(status);
 	return status;
 }
-- 
2.43.0
Re: [PATCH v5 2/2] ocfs2: detect released suballocator BG for fh_to_[dentry|parent]
Posted by Joseph Qi 1 day, 22 hours ago

On 2025/12/12 15:00, Heming Zhao wrote:
> After ocfs2 gained the ability to reclaim suballocator free block
> group (BGs), a suballocator block group may be released. This change
> causes the xfstest case generic/426 to fail.
> 
> generic/426 expects return value -ENOENT or -ESTALE, but the current
> code triggers -EROFS.
> 
> Call stack before ocfs2 gained the ability to reclaim bg:
> 
> ocfs2_fh_to_dentry //or ocfs2_fh_to_parent
>  ocfs2_get_dentry
>   + ocfs2_test_inode_bit
>   |  ocfs2_test_suballoc_bit
>   |   + ocfs2_read_group_descriptor //Since ocfs2 never releases the bg,
>   |   |                             //the bg block was always found.
>   |   + *res = ocfs2_test_bit //unlink was called, and the bit is zero
>   |
>   + if (!set) //because the above *res is 0
>      status = -ESTALE //the generic/426 expected return value
> 
> Current call stack that triggers -EROFS:
> 
> ocfs2_get_dentry
>  ocfs2_test_inode_bit
>   ocfs2_test_suballoc_bit
>    ocfs2_read_group_descriptor
>     + if reading a released bg, validation fails and triggers -EROFS
> 
> How to fix:
> Since the read BG is already released, we must avoid triggering -EROFS.
> With this commit, we use ocfs2_read_hint_group_descriptor() to detect
> the released BG block. This approach quietly handles this type of error
> and returns -EINVAL, which triggers the caller's existing conversion
> path to -ESTALE.
> 
> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> Reviewed-by: Su Yue <glass.su@suse.com>
> ---
>  fs/ocfs2/export.c   |  6 ++++--
>  fs/ocfs2/suballoc.c | 28 ++++++++++++++++++----------
>  2 files changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
> index b95724b767e1..9c2665dd24e2 100644
> --- a/fs/ocfs2/export.c
> +++ b/fs/ocfs2/export.c
> @@ -74,8 +74,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
>  			 * nice
>  			 */
>  			status = -ESTALE;
> -		} else
> +		} else if (status != -ESTALE) {
>  			mlog(ML_ERROR, "test inode bit failed %d\n", status);
> +		}
>  		goto unlock_nfs_sync;
>  	}
>  
> @@ -162,8 +163,9 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
>  	if (status < 0) {
>  		if (status == -EINVAL) {
>  			status = -ESTALE;
> -		} else
> +		} else if (status != -ESTALE) {
>  			mlog(ML_ERROR, "test inode bit failed %d\n", status);
> +		}
>  		parent = ERR_PTR(status);
>  		goto bail_unlock;
>  	}
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
> index 9a19f5230c8c..ddcfa6e001e8 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -3152,7 +3152,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
>  	struct ocfs2_group_desc *group;
>  	struct buffer_head *group_bh = NULL;
>  	u64 bg_blkno;
> -	int status;
> +	int status, quiet = 0, released;
>  
>  	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
>  				      (unsigned int)bit);
> @@ -3168,11 +3168,15 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
>  
>  	bg_blkno = group_blkno ? group_blkno :
>  		   ocfs2_which_suballoc_group(blkno, bit);
> -	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
> -					     &group_bh);
> -	if (status < 0) {
> +	status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno,
> +					     &group_bh, &released);
> +	if (released) {
> +		quiet = 1;
> +		status = -ESTALE;
> +		goto bail;
> +	} else if (status < 0) {
>  		mlog(ML_ERROR, "read group %llu failed %d\n",
> -		     (unsigned long long)bg_blkno, status);
> +				(unsigned long long)bg_blkno, status);

This can be kept untouched.

>  		goto bail;
>  	}
>  
> @@ -3182,7 +3186,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
>  bail:
>  	brelse(group_bh);
>  
> -	if (status)
> +	if (status && (!quiet))

'!quiet' is enough; the parentheses are unneeded.

Otherwise, looks good to me.

Thanks,
Joseph

>  		mlog_errno(status);
>  	return status;
>  }
> @@ -3202,7 +3206,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
>   */
>  int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
>  {
> -	int status;
> +	int status, quiet = 0;
>  	u64 group_blkno = 0;
>  	u16 suballoc_bit = 0, suballoc_slot = 0;
>  	struct inode *inode_alloc_inode;
> @@ -3244,8 +3248,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
>  
>  	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
>  					 group_blkno, blkno, suballoc_bit, res);
> -	if (status < 0)
> -		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
> +	if (status < 0) {
> +		if (status == -ESTALE)
> +			quiet = 1;
> +		else
> +			mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
> +	}
>  
>  	ocfs2_inode_unlock(inode_alloc_inode, 0);
>  	inode_unlock(inode_alloc_inode);
> @@ -3253,7 +3261,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
>  	iput(inode_alloc_inode);
>  	brelse(alloc_bh);
>  bail:
> -	if (status)
> +	if (status && !quiet)
>  		mlog_errno(status);
>  	return status;
>  }
Re: [PATCH v5 2/2] ocfs2: detect released suballocator BG for fh_to_[dentry|parent]
Posted by Heming Zhao 1 day, 22 hours ago
On Fri, Dec 12, 2025 at 03:12:53PM +0800, Joseph Qi wrote:
> 
> 
> On 2025/12/12 15:00, Heming Zhao wrote:
> > After ocfs2 gained the ability to reclaim suballocator free block
> > group (BGs), a suballocator block group may be released. This change
> > causes the xfstest case generic/426 to fail.
> > 
> > generic/426 expects return value -ENOENT or -ESTALE, but the current
> > code triggers -EROFS.
> > 
> > Call stack before ocfs2 gained the ability to reclaim bg:
> > 
> > ocfs2_fh_to_dentry //or ocfs2_fh_to_parent
> >  ocfs2_get_dentry
> >   + ocfs2_test_inode_bit
> >   |  ocfs2_test_suballoc_bit
> >   |   + ocfs2_read_group_descriptor //Since ocfs2 never releases the bg,
> >   |   |                             //the bg block was always found.
> >   |   + *res = ocfs2_test_bit //unlink was called, and the bit is zero
> >   |
> >   + if (!set) //because the above *res is 0
> >      status = -ESTALE //the generic/426 expected return value
> > 
> > Current call stack that triggers -EROFS:
> > 
> > ocfs2_get_dentry
> >  ocfs2_test_inode_bit
> >   ocfs2_test_suballoc_bit
> >    ocfs2_read_group_descriptor
> >     + if reading a released bg, validation fails and triggers -EROFS
> > 
> > How to fix:
> > Since the read BG is already released, we must avoid triggering -EROFS.
> > With this commit, we use ocfs2_read_hint_group_descriptor() to detect
> > the released BG block. This approach quietly handles this type of error
> > and returns -EINVAL, which triggers the caller's existing conversion
> > path to -ESTALE.
> > 
> > Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> > Reviewed-by: Su Yue <glass.su@suse.com>
> > ---
> >  fs/ocfs2/export.c   |  6 ++++--
> >  fs/ocfs2/suballoc.c | 28 ++++++++++++++++++----------
> >  2 files changed, 22 insertions(+), 12 deletions(-)
> > 
> > diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
> > index b95724b767e1..9c2665dd24e2 100644
> > --- a/fs/ocfs2/export.c
> > +++ b/fs/ocfs2/export.c
> > @@ -74,8 +74,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
> >  			 * nice
> >  			 */
> >  			status = -ESTALE;
> > -		} else
> > +		} else if (status != -ESTALE) {
> >  			mlog(ML_ERROR, "test inode bit failed %d\n", status);
> > +		}
> >  		goto unlock_nfs_sync;
> >  	}
> >  
> > @@ -162,8 +163,9 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
> >  	if (status < 0) {
> >  		if (status == -EINVAL) {
> >  			status = -ESTALE;
> > -		} else
> > +		} else if (status != -ESTALE) {
> >  			mlog(ML_ERROR, "test inode bit failed %d\n", status);
> > +		}
> >  		parent = ERR_PTR(status);
> >  		goto bail_unlock;
> >  	}
> > diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
> > index 9a19f5230c8c..ddcfa6e001e8 100644
> > --- a/fs/ocfs2/suballoc.c
> > +++ b/fs/ocfs2/suballoc.c
> > @@ -3152,7 +3152,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
> >  	struct ocfs2_group_desc *group;
> >  	struct buffer_head *group_bh = NULL;
> >  	u64 bg_blkno;
> > -	int status;
> > +	int status, quiet = 0, released;
> >  
> >  	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
> >  				      (unsigned int)bit);
> > @@ -3168,11 +3168,15 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
> >  
> >  	bg_blkno = group_blkno ? group_blkno :
> >  		   ocfs2_which_suballoc_group(blkno, bit);
> > -	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
> > -					     &group_bh);
> > -	if (status < 0) {
> > +	status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno,
> > +					     &group_bh, &released);
> > +	if (released) {
> > +		quiet = 1;
> > +		status = -ESTALE;
> > +		goto bail;
> > +	} else if (status < 0) {
> >  		mlog(ML_ERROR, "read group %llu failed %d\n",
> > -		     (unsigned long long)bg_blkno, status);
> > +				(unsigned long long)bg_blkno, status);

OK
> 
> This can be kept untouched.
> 
> >  		goto bail;
> >  	}
> >  
> > @@ -3182,7 +3186,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
> >  bail:
> >  	brelse(group_bh);
> >  
> > -	if (status)
> > +	if (status && (!quiet))
> 
> '!quiet' is enough, the parentheses is unneeded.

OK. Very interesting, this is not my code style.
Thanks for your careful review.

Heming
> 
> Other looks good to me.
> 
> Thanks,
> Joseph
> 
> >  		mlog_errno(status);
> >  	return status;
> >  }
> > @@ -3202,7 +3206,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
> >   */
> >  int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
> >  {
> > -	int status;
> > +	int status, quiet = 0;
> >  	u64 group_blkno = 0;
> >  	u16 suballoc_bit = 0, suballoc_slot = 0;
> >  	struct inode *inode_alloc_inode;
> > @@ -3244,8 +3248,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
> >  
> >  	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
> >  					 group_blkno, blkno, suballoc_bit, res);
> > -	if (status < 0)
> > -		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
> > +	if (status < 0) {
> > +		if (status == -ESTALE)
> > +			quiet = 1;
> > +		else
> > +			mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
> > +	}
> >  
> >  	ocfs2_inode_unlock(inode_alloc_inode, 0);
> >  	inode_unlock(inode_alloc_inode);
> > @@ -3253,7 +3261,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
> >  	iput(inode_alloc_inode);
> >  	brelse(alloc_bh);
> >  bail:
> > -	if (status)
> > +	if (status && !quiet)
> >  		mlog_errno(status);
> >  	return status;
> >  }
>