[PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block

John Garry posted 15 patches 9 months, 3 weeks ago
There is a newer version of this series
[PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by John Garry 9 months, 3 weeks ago
Currently only HW which can write at least 1x block is supported.

For supporting atomic writes > 1x block, a CoW-based method will also be
used and this will not be resticted to using HW which can write >= 1x
block.

However for deciding if HW-based atomic writes can be used, we need to
start adding checks for write length < HW min, which complicates the code.
Indeed, a statx field similar to unit_max_opt should also be added for this
minimum, which is undesirable.

HW which can only write > 1x blocks would be uncommon and quite weird, so
let's just not support it.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_inode.h | 17 ++++++++---------
 fs/xfs/xfs_mount.c | 14 ++++++++++++++
 fs/xfs/xfs_mount.h |  4 ++++
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index cff643cd03fc..725cd7c16a6e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -355,20 +355,19 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
 #define xfs_inode_buftarg(ip) \
 	(XFS_IS_REALTIME_INODE(ip) ? \
 		(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+/*
+ * Return max atomic write unit for a given inode.
+ */
+#define xfs_inode_hw_atomicwrite_max(ip) \
+	(XFS_IS_REALTIME_INODE(ip) ? \
+		(ip)->i_mount->m_rt_awu_hw_max : \
+		(ip)->i_mount->m_dd_awu_hw_max)
 
 static inline bool
 xfs_inode_can_hw_atomicwrite(
 	struct xfs_inode	*ip)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
-
-	if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
-		return false;
-	if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
-		return false;
-
-	return true;
+	return xfs_inode_hw_atomicwrite_max(ip);
 }
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 00b53f479ece..ee68c026e6cd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1082,6 +1082,20 @@ xfs_mountfs(
 		xfs_zone_gc_start(mp);
 	}
 
+	/*
+	 * Set atomic write unit max for mp. Ignore devices which cannot atomic
+	 * a single block, as they would be uncommon and more difficult to
+	 * support.
+	 */
+	if (mp->m_ddev_targp->bt_bdev_awu_min <= mp->m_sb.sb_blocksize &&
+	    mp->m_ddev_targp->bt_bdev_awu_max >= mp->m_sb.sb_blocksize)
+		mp->m_dd_awu_hw_max = mp->m_ddev_targp->bt_bdev_awu_max;
+
+	if (mp->m_rtdev_targp &&
+	    mp->m_rtdev_targp->bt_bdev_awu_min <= mp->m_sb.sb_blocksize &&
+	    mp->m_rtdev_targp->bt_bdev_awu_max >= mp->m_sb.sb_blocksize)
+		mp->m_rt_awu_hw_max = mp->m_rtdev_targp->bt_bdev_awu_max;
+
 	return 0;
 
  out_agresv:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e5192c12e7ac..2819e160f0e9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -231,6 +231,10 @@ typedef struct xfs_mount {
 	unsigned int		m_max_open_zones;
 	unsigned int		m_zonegc_low_space;
 
+	/* ddev and rtdev HW max atomic write size */
+	unsigned int		m_dd_awu_hw_max;
+	unsigned int		m_rt_awu_hw_max;
+
 	/*
 	 * Bitsets of per-fs metadata that have been checked and/or are sick.
 	 * Callers must hold m_sb_lock to access these two fields.
-- 
2.31.1
Re: [PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by Darrick J. Wong 9 months, 3 weeks ago
On Tue, Apr 22, 2025 at 12:27:29PM +0000, John Garry wrote:
> Currently only HW which can write at least 1x block is supported.
> 
> For supporting atomic writes > 1x block, a CoW-based method will also be
> used and this will not be resticted to using HW which can write >= 1x
> block.
> 
> However for deciding if HW-based atomic writes can be used, we need to
> start adding checks for write length < HW min, which complicates the code.
> Indeed, a statx field similar to unit_max_opt should also be added for this
> minimum, which is undesirable.
> 
> HW which can only write > 1x blocks would be uncommon and quite weird, so
> let's just not support it.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_inode.h | 17 ++++++++---------
>  fs/xfs/xfs_mount.c | 14 ++++++++++++++
>  fs/xfs/xfs_mount.h |  4 ++++
>  3 files changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index cff643cd03fc..725cd7c16a6e 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -355,20 +355,19 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
>  #define xfs_inode_buftarg(ip) \
>  	(XFS_IS_REALTIME_INODE(ip) ? \
>  		(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
> +/*
> + * Return max atomic write unit for a given inode.
> + */
> +#define xfs_inode_hw_atomicwrite_max(ip) \
> +	(XFS_IS_REALTIME_INODE(ip) ? \
> +		(ip)->i_mount->m_rt_awu_hw_max : \
> +		(ip)->i_mount->m_dd_awu_hw_max)
>  
>  static inline bool
>  xfs_inode_can_hw_atomicwrite(
>  	struct xfs_inode	*ip)
>  {
> -	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> -
> -	if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
> -		return false;
> -	if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
> -		return false;
> -
> -	return true;
> +	return xfs_inode_hw_atomicwrite_max(ip);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 00b53f479ece..ee68c026e6cd 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -1082,6 +1082,20 @@ xfs_mountfs(
>  		xfs_zone_gc_start(mp);
>  	}
>  
> +	/*
> +	 * Set atomic write unit max for mp. Ignore devices which cannot atomic
> +	 * a single block, as they would be uncommon and more difficult to
> +	 * support.
> +	 */
> +	if (mp->m_ddev_targp->bt_bdev_awu_min <= mp->m_sb.sb_blocksize &&
> +	    mp->m_ddev_targp->bt_bdev_awu_max >= mp->m_sb.sb_blocksize)
> +		mp->m_dd_awu_hw_max = mp->m_ddev_targp->bt_bdev_awu_max;

If we don't want to use the device's atomic write capabilities due to
fsblock alignment problems, why not just zero out bt_bdev_awu_min/max?
That would cut down on the number of "awu" variables around the
codebase.

/*
 * Ignore hardware atomic writes if the device can't handle a single
 * fsblock for us.  Most devices set the min_awu to the LBA size, but
 * the spec allows for a functionality gap.
 */
static void
xfs_buftarg_reconcile_awu(
	struct xfs_buftarg	*btp)
{
	struct xfs_mount	*mp = btp->bt_mount;

	if (btp->bt_bdev_awu_min > mp->m_sb.sb_blocksize ||
	    btp->bt_bdev_awu_max < mp->m_sb.sb_blocksize) {
		btp->bt_bdev_awu_min = 0;
		btp->bt_bdev_awu_max = 0;
	}
}

	xfs_buftarg_reconcile_awu(mp->m_ddev_targp);
	if (mp->m_rtdev_targp)
		xfs_buftarg_reconcile_awu(mp->m_rtdev_targp);

Hrm?

--D

> +
> +	if (mp->m_rtdev_targp &&
> +	    mp->m_rtdev_targp->bt_bdev_awu_min <= mp->m_sb.sb_blocksize &&
> +	    mp->m_rtdev_targp->bt_bdev_awu_max >= mp->m_sb.sb_blocksize)
> +		mp->m_rt_awu_hw_max = mp->m_rtdev_targp->bt_bdev_awu_max;
> +
>  	return 0;
>  
>   out_agresv:
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index e5192c12e7ac..2819e160f0e9 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -231,6 +231,10 @@ typedef struct xfs_mount {
>  	unsigned int		m_max_open_zones;
>  	unsigned int		m_zonegc_low_space;
>  
> +	/* ddev and rtdev HW max atomic write size */
> +	unsigned int		m_dd_awu_hw_max;
> +	unsigned int		m_rt_awu_hw_max;
> +
>  	/*
>  	 * Bitsets of per-fs metadata that have been checked and/or are sick.
>  	 * Callers must hold m_sb_lock to access these two fields.
> -- 
> 2.31.1
> 
>
Re: [PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by John Garry 9 months, 3 weeks ago
On 23/04/2025 01:38, Darrick J. Wong wrote:
>>   
>> +	/*
>> +	 * Set atomic write unit max for mp. Ignore devices which cannot atomic
>> +	 * a single block, as they would be uncommon and more difficult to
>> +	 * support.
>> +	 */
>> +	if (mp->m_ddev_targp->bt_bdev_awu_min <= mp->m_sb.sb_blocksize &&
>> +	    mp->m_ddev_targp->bt_bdev_awu_max >= mp->m_sb.sb_blocksize)
>> +		mp->m_dd_awu_hw_max = mp->m_ddev_targp->bt_bdev_awu_max;
> If we don't want to use the device's atomic write capabilities due to
> fsblock alignment problems, why not just zero out bt_bdev_awu_min/max?
> That would cut down on the number of "awu" variables around the
> codebase.

Sure, I did consider this, but thought it a bit unpleasant to zero out 
structure members like this.

Ideally we could have not set them in the first place, but need to know 
the blocksize when xfs_alloc_buftarg() is called, but it is not yet set 
for mp/sb. Is there any neat way to know the blocksize when 
xfs_alloc_buftarg() is called?

> 
> /*
>   * Ignore hardware atomic writes if the device can't handle a single
>   * fsblock for us.  Most devices set the min_awu to the LBA size, but
>   * the spec allows for a functionality gap.
>   */
> static void

You would call this around the same point in xfs_mountfs(), right?

> xfs_buftarg_reconcile_awu(
> 	struct xfs_buftarg	*btp)
> {
> 	struct xfs_mount	*mp = btp->bt_mount;
> 
> 	if (btp->bt_bdev_awu_min > mp->m_sb.sb_blocksize ||
> 	    btp->bt_bdev_awu_max < mp->m_sb.sb_blocksize) {
> 		btp->bt_bdev_awu_min = 0;
> 		btp->bt_bdev_awu_max = 0;
> 	}
> }
> 
> 	xfs_buftarg_reconcile_awu(mp->m_ddev_targp);
> 	if (mp->m_rtdev_targp)
> 		xfs_buftarg_reconcile_awu(mp->m_rtdev_targp);
> 
> Hrm?

Cheers,
John
Re: [PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by Christoph Hellwig 9 months, 3 weeks ago
On Wed, Apr 23, 2025 at 08:15:43AM +0100, John Garry wrote:
> Ideally we could have not set them in the first place, but need to know the 
> blocksize when xfs_alloc_buftarg() is called, but it is not yet set for 
> mp/sb. Is there any neat way to know the blocksize when xfs_alloc_buftarg() 
> is called?

The buftarg is needed to read the superblock, which is used to determine
the block size, so no.

But maybe we should just delay setting the atomic values until later so
that it can be done in a single pass?  E.g. into xfs_setsize_buftarg
which then should probably be rename to something like
xfs_buftarg_setup.
Re: [PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by John Garry 9 months, 3 weeks ago
On 23/04/2025 09:10, Christoph Hellwig wrote:
> On Wed, Apr 23, 2025 at 08:15:43AM +0100, John Garry wrote:
>> Ideally we could have not set them in the first place, but need to know the
>> blocksize when xfs_alloc_buftarg() is called, but it is not yet set for
>> mp/sb. Is there any neat way to know the blocksize when xfs_alloc_buftarg()
>> is called?
> 
> The buftarg is needed to read the superblock, which is used to determine
> the block size, so no.
> 
> But maybe we should just delay setting the atomic values until later so
> that it can be done in a single pass?  E.g. into xfs_setsize_buftarg
> which then should probably be rename to something like
> xfs_buftarg_setup.
> 

How about just do away with btp->bt_bdev_awu_{min, max} struct members, 
and call bdev_atomic_write_unit_max(mp->m_ddev_targp->bt_bdev) [and same 
for RT] to later to set the mp awu max values at mountfs time? I think 
that would work..
Re: [PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by Christoph Hellwig 9 months, 3 weeks ago
On Wed, Apr 23, 2025 at 09:28:14AM +0100, John Garry wrote:
>> But maybe we should just delay setting the atomic values until later so
>> that it can be done in a single pass?  E.g. into xfs_setsize_buftarg
>> which then should probably be rename to something like
>> xfs_buftarg_setup.
>>
>
> How about just do away with btp->bt_bdev_awu_{min, max} struct members, and 
> call bdev_atomic_write_unit_max(mp->m_ddev_targp->bt_bdev) [and same for 
> RT] to later to set the mp awu max values at mountfs time? I think that 
> would work..

Sounds reasonable.
Re: [PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by Darrick J. Wong 9 months, 3 weeks ago
On Wed, Apr 23, 2025 at 10:33:17AM +0200, Christoph Hellwig wrote:
> On Wed, Apr 23, 2025 at 09:28:14AM +0100, John Garry wrote:
> >> But maybe we should just delay setting the atomic values until later so
> >> that it can be done in a single pass?  E.g. into xfs_setsize_buftarg
> >> which then should probably be rename to something like
> >> xfs_buftarg_setup.
> >>
> >
> > How about just do away with btp->bt_bdev_awu_{min, max} struct members, and 
> > call bdev_atomic_write_unit_max(mp->m_ddev_targp->bt_bdev) [and same for 
> > RT] to later to set the mp awu max values at mountfs time? I think that 
> > would work..
> 
> Sounds reasonable.

I disagree, leaving the hardware awu_min/max in the buftarg makes more
sense to me because the buftarg is our abstraction for a block device,
and these fields describe the atomic write units that we can use with
that block device.

IOWs, I don't like dumping even more into struct xfs_mount.  xfs_group
has an awu_max for the software fallback, xfs_buftarg has an awu_min/max
for hardware, and even this V8 has yet a third pair of awu_min/max in
xfs_mount which I think is just the buftarg version but possibly
truncated.  I find those last two pairs confusing.

--D
Re: [PATCH v8 05/15] xfs: ignore HW which cannot atomic write a single block
Posted by Christoph Hellwig 9 months, 3 weeks ago
On Wed, Apr 23, 2025 at 08:12:24AM -0700, Darrick J. Wong wrote:
> I disagree, leaving the hardware awu_min/max in the buftarg makes more
> sense to me because the buftarg is our abstraction for a block device,
> and these fields describe the atomic write units that we can use with
> that block device.

Yeah.  If you want to keep it I'd suggest we go back to my earlier
suggestion.