[PATCH RFC 09/10] xfs: Update atomic write max size

John Garry posted 10 patches 10 months, 1 week ago
There is a newer version of this series
[PATCH RFC 09/10] xfs: Update atomic write max size
Posted by John Garry 10 months, 1 week ago
Now that CoW-based atomic writes are supported, update the max size of an
atomic write.

For simplicity, limit at the max of what the mounted bdev can support in
terms of atomic write limits. Maybe in future we will have a better way
to advertise this optimised limit.

In addition, the max atomic write size needs to be aligned to the agsize.
Currently when attempting to use HW offload, we just check that the
mapping startblock is aligned. However, that is just the startblock within
the AG, and the AG may not be properly aligned to the underlying block
device atomic write limits.

As such, limit atomic writes to the greatest power-of-2 which fits in an
AG, so that aligning to the startblock will mean that we are also
aligned to the disk block.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_iops.c  |  7 ++++++-
 fs/xfs/xfs_mount.c | 28 ++++++++++++++++++++++++++++
 fs/xfs/xfs_mount.h |  1 +
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ea79fb246e33..95681d6c2bcd 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -606,12 +606,17 @@ xfs_get_atomic_write_attr(
 	unsigned int		*unit_min,
 	unsigned int		*unit_max)
 {
+	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+
 	if (!xfs_inode_can_atomicwrite(ip)) {
 		*unit_min = *unit_max = 0;
 		return;
 	}
 
-	*unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+	*unit_min = ip->i_mount->m_sb.sb_blocksize;
+	*unit_max =  min_t(unsigned int, XFS_FSB_TO_B(mp, mp->awu_max),
+					target->bt_bdev_awu_max);
 }
 
 static void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 477c5262cf91..4e60347f6b7e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -651,6 +651,32 @@ xfs_agbtree_compute_maxlevels(
 	levels = max(levels, mp->m_rmap_maxlevels);
 	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
 }
+static inline void
+xfs_mp_compute_awu_max(
+	struct xfs_mount	*mp)
+{
+	xfs_agblock_t		agsize = mp->m_sb.sb_agblocks;
+	xfs_agblock_t		awu_max;
+
+	if (!xfs_has_reflink(mp)) {
+		mp->awu_max = 1;
+		return;
+	}
+
+	/*
+	 * Find highest power-of-2 evenly divisible into agsize and which
+	 * also fits into an unsigned int field.
+	 */
+	awu_max = 1;
+	while (1) {
+		if (agsize % (awu_max * 2))
+			break;
+		if (XFS_FSB_TO_B(mp, awu_max * 2) > UINT_MAX)
+			break;
+		awu_max *= 2;
+	}
+	mp->awu_max = awu_max;
+}
 
 /* Compute maximum possible height for realtime btree types for this fs. */
 static inline void
@@ -736,6 +762,8 @@ xfs_mountfs(
 	xfs_agbtree_compute_maxlevels(mp);
 	xfs_rtbtree_compute_maxlevels(mp);
 
+	xfs_mp_compute_awu_max(mp);
+
 	/*
 	 * Check if sb_agblocks is aligned at stripe boundary.  If sb_agblocks
 	 * is NOT aligned turn off m_dalign since allocator alignment is within
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fbed172d6770..34286c87ac4a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -198,6 +198,7 @@ typedef struct xfs_mount {
 	bool			m_fail_unmount;
 	bool			m_finobt_nores; /* no per-AG finobt resv. */
 	bool			m_update_sb;	/* sb needs update in mount */
+	xfs_extlen_t		awu_max;	/* max atomic write */
 
 	/*
 	 * Bitsets of per-fs metadata that have been checked and/or are sick.
-- 
2.31.1
Re: [PATCH RFC 09/10] xfs: Update atomic write max size
Posted by Darrick J. Wong 10 months, 1 week ago
On Tue, Feb 04, 2025 at 12:01:26PM +0000, John Garry wrote:
> Now that CoW-based atomic writes are supported, update the max size of an
> atomic write.
> 
> For simplicity, limit at the max of what the mounted bdev can support in
> terms of atomic write limits. Maybe in future we will have a better way
> to advertise this optimised limit.
> 
> In addition, the max atomic write size needs to be aligned to the agsize.
> Currently when attempting to use HW offload, we  just check that the
> mapping startblock is aligned. However, that is just the startblock within
> the AG, and the AG may not be properly aligned to the underlying block
> device atomic write limits.
> 
> As such, limit atomic writes to the greatest power-of-2 which fits in an
> AG, so that aligning to the startblock will be mean that we are also
> aligned to the disk block.

I don't understand this sentence -- what are we "aligning to the
startblock"?  I think you're saying that you want to limit the size of
untorn writes to the greatest power-of-two factor of the agsize so that
allocations for an untorn write will always be aligned compatibly with
the alignment requirements of the storage for an untorn write?

> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_iops.c  |  7 ++++++-
>  fs/xfs/xfs_mount.c | 28 ++++++++++++++++++++++++++++
>  fs/xfs/xfs_mount.h |  1 +
>  3 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index ea79fb246e33..95681d6c2bcd 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -606,12 +606,17 @@ xfs_get_atomic_write_attr(
>  	unsigned int		*unit_min,
>  	unsigned int		*unit_max)
>  {
> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> +	struct xfs_mount	*mp = ip->i_mount;
> +
>  	if (!xfs_inode_can_atomicwrite(ip)) {
>  		*unit_min = *unit_max = 0;
>  		return;
>  	}
>  
> -	*unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
> +	*unit_min = ip->i_mount->m_sb.sb_blocksize;
> +	*unit_max =  min_t(unsigned int, XFS_FSB_TO_B(mp, mp->awu_max),
> +					target->bt_bdev_awu_max);
>  }
>  
>  static void
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 477c5262cf91..4e60347f6b7e 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -651,6 +651,32 @@ xfs_agbtree_compute_maxlevels(
>  	levels = max(levels, mp->m_rmap_maxlevels);
>  	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
>  }
> +static inline void
> +xfs_mp_compute_awu_max(

xfs_compute_awu_max() ?

> +	struct xfs_mount	*mp)
> +{
> +	xfs_agblock_t		agsize = mp->m_sb.sb_agblocks;
> +	xfs_agblock_t		awu_max;
> +
> +	if (!xfs_has_reflink(mp)) {
> +		mp->awu_max = 1;
> +		return;
> +	}
> +
> +	/*
> +	 * Find highest power-of-2 evenly divisible into agsize and which
> +	 * also fits into an unsigned int field.
> +	 */
> +	awu_max = 1;
> +	while (1) {
> +		if (agsize % (awu_max * 2))
> +			break;
> +		if (XFS_FSB_TO_B(mp, awu_max * 2) > UINT_MAX)
> +			break;
> +		awu_max *= 2;
> +	}
> +	mp->awu_max = awu_max;

I think you need two awu_maxes here -- one for the data device, and
another for the realtime device.  The rt computation is probably more
complex since I think it's the greatest power of two that fits in the rt
extent size if it isn't a power of two; or the greatest power of two
that fits in the rtgroup if rtgroups are enabled; or probably just no
limit otherwise.

--D

> +}
>  
>  /* Compute maximum possible height for realtime btree types for this fs. */
>  static inline void
> @@ -736,6 +762,8 @@ xfs_mountfs(
>  	xfs_agbtree_compute_maxlevels(mp);
>  	xfs_rtbtree_compute_maxlevels(mp);
>  
> +	xfs_mp_compute_awu_max(mp);
> +
>  	/*
>  	 * Check if sb_agblocks is aligned at stripe boundary.  If sb_agblocks
>  	 * is NOT aligned turn off m_dalign since allocator alignment is within
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index fbed172d6770..34286c87ac4a 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -198,6 +198,7 @@ typedef struct xfs_mount {
>  	bool			m_fail_unmount;
>  	bool			m_finobt_nores; /* no per-AG finobt resv. */
>  	bool			m_update_sb;	/* sb needs update in mount */
> +	xfs_extlen_t		awu_max;	/* max atomic write */
>  
>  	/*
>  	 * Bitsets of per-fs metadata that have been checked and/or are sick.
> -- 
> 2.31.1
> 
>
Re: [PATCH RFC 09/10] xfs: Update atomic write max size
Posted by John Garry 10 months, 1 week ago
On 05/02/2025 19:41, Darrick J. Wong wrote:
> On Tue, Feb 04, 2025 at 12:01:26PM +0000, John Garry wrote:
>> Now that CoW-based atomic writes are supported, update the max size of an
>> atomic write.
>>
>> For simplicity, limit at the max of what the mounted bdev can support in
>> terms of atomic write limits. Maybe in future we will have a better way
>> to advertise this optimised limit.
>>
>> In addition, the max atomic write size needs to be aligned to the agsize.
>> Currently when attempting to use HW offload, we  just check that the
>> mapping startblock is aligned. However, that is just the startblock within
>> the AG, and the AG may not be properly aligned to the underlying block
>> device atomic write limits.
>>
>> As such, limit atomic writes to the greatest power-of-2 which fits in an
>> AG, so that aligning to the startblock will be mean that we are also
>> aligned to the disk block.

Right, "startblock" is a bit vague

> 
> I don't understand this sentence -- what are we "aligning to the
> startblock"?  I think you're saying that you want to limit the size of
> untorn writes to the greatest power-of-two factor of the agsize so that
> allocations for an untorn write will always be aligned compatibly with
> the alignment requirements of the storage for an untorn write?

Yes, that's it. I'll borrow your wording :)

> 
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/xfs_iops.c  |  7 ++++++-
>>   fs/xfs/xfs_mount.c | 28 ++++++++++++++++++++++++++++
>>   fs/xfs/xfs_mount.h |  1 +
>>   3 files changed, 35 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
>> index ea79fb246e33..95681d6c2bcd 100644
>> --- a/fs/xfs/xfs_iops.c
>> +++ b/fs/xfs/xfs_iops.c
>> @@ -606,12 +606,17 @@ xfs_get_atomic_write_attr(
>>   	unsigned int		*unit_min,
>>   	unsigned int		*unit_max)
>>   {
>> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
>> +	struct xfs_mount	*mp = ip->i_mount;
>> +
>>   	if (!xfs_inode_can_atomicwrite(ip)) {
>>   		*unit_min = *unit_max = 0;
>>   		return;
>>   	}
>>   
>> -	*unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
>> +	*unit_min = ip->i_mount->m_sb.sb_blocksize;
>> +	*unit_max =  min_t(unsigned int, XFS_FSB_TO_B(mp, mp->awu_max),
>> +					target->bt_bdev_awu_max);
>>   }
>>   
>>   static void
>> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
>> index 477c5262cf91..4e60347f6b7e 100644
>> --- a/fs/xfs/xfs_mount.c
>> +++ b/fs/xfs/xfs_mount.c
>> @@ -651,6 +651,32 @@ xfs_agbtree_compute_maxlevels(
>>   	levels = max(levels, mp->m_rmap_maxlevels);
>>   	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
>>   }
>> +static inline void
>> +xfs_mp_compute_awu_max(
> 
> xfs_compute_awu_max() ?

ok

> 
>> +	struct xfs_mount	*mp)
>> +{
>> +	xfs_agblock_t		agsize = mp->m_sb.sb_agblocks;
>> +	xfs_agblock_t		awu_max;
>> +
>> +	if (!xfs_has_reflink(mp)) {
>> +		mp->awu_max = 1;
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * Find highest power-of-2 evenly divisible into agsize and which
>> +	 * also fits into an unsigned int field.
>> +	 */
>> +	awu_max = 1;
>> +	while (1) {
>> +		if (agsize % (awu_max * 2))
>> +			break;
>> +		if (XFS_FSB_TO_B(mp, awu_max * 2) > UINT_MAX)
>> +			break;
>> +		awu_max *= 2;
>> +	}
>> +	mp->awu_max = awu_max;
> 
> I think you need two awu_maxes here -- one for the data device, and
> another for the realtime device.
How about we just don't support rtdev initially for this CoW-based 
method, i.e. stick at 1x FSB awu max?

> The rt computation is probably more
> complex since I think it's the greatest power of two that fits in the rt
> extent size if it isn't a power of two; or the greatest power of two
> that fits in the rtgroup if rtgroups are enabled; or probably just no
> limit otherwise.
>

Thanks,
John
Re: [PATCH RFC 09/10] xfs: Update atomic write max size
Posted by Darrick J. Wong 10 months, 1 week ago
On Thu, Feb 06, 2025 at 09:15:16AM +0000, John Garry wrote:
> On 05/02/2025 19:41, Darrick J. Wong wrote:
> > On Tue, Feb 04, 2025 at 12:01:26PM +0000, John Garry wrote:
> > > Now that CoW-based atomic writes are supported, update the max size of an
> > > atomic write.
> > > 
> > > For simplicity, limit at the max of what the mounted bdev can support in
> > > terms of atomic write limits. Maybe in future we will have a better way
> > > to advertise this optimised limit.
> > > 
> > > In addition, the max atomic write size needs to be aligned to the agsize.
> > > Currently when attempting to use HW offload, we  just check that the
> > > mapping startblock is aligned. However, that is just the startblock within
> > > the AG, and the AG may not be properly aligned to the underlying block
> > > device atomic write limits.
> > > 
> > > As such, limit atomic writes to the greatest power-of-2 which fits in an
> > > AG, so that aligning to the startblock will be mean that we are also
> > > aligned to the disk block.
> 
> Right, "startblock" is a bit vague
> 
> > 
> > I don't understand this sentence -- what are we "aligning to the
> > startblock"?  I think you're saying that you want to limit the size of
> > untorn writes to the greatest power-of-two factor of the agsize so that
> > allocations for an untorn write will always be aligned compatibly with
> > the alignment requirements of the storage for an untorn write?
> 
> Yes, that's it. I'll borrow your wording :)
> 
> > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/xfs/xfs_iops.c  |  7 ++++++-
> > >   fs/xfs/xfs_mount.c | 28 ++++++++++++++++++++++++++++
> > >   fs/xfs/xfs_mount.h |  1 +
> > >   3 files changed, 35 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > > index ea79fb246e33..95681d6c2bcd 100644
> > > --- a/fs/xfs/xfs_iops.c
> > > +++ b/fs/xfs/xfs_iops.c
> > > @@ -606,12 +606,17 @@ xfs_get_atomic_write_attr(
> > >   	unsigned int		*unit_min,
> > >   	unsigned int		*unit_max)
> > >   {
> > > +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> > > +	struct xfs_mount	*mp = ip->i_mount;
> > > +
> > >   	if (!xfs_inode_can_atomicwrite(ip)) {
> > >   		*unit_min = *unit_max = 0;
> > >   		return;
> > >   	}
> > > -	*unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
> > > +	*unit_min = ip->i_mount->m_sb.sb_blocksize;
> > > +	*unit_max =  min_t(unsigned int, XFS_FSB_TO_B(mp, mp->awu_max),
> > > +					target->bt_bdev_awu_max);
> > >   }
> > >   static void
> > > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > > index 477c5262cf91..4e60347f6b7e 100644
> > > --- a/fs/xfs/xfs_mount.c
> > > +++ b/fs/xfs/xfs_mount.c
> > > @@ -651,6 +651,32 @@ xfs_agbtree_compute_maxlevels(
> > >   	levels = max(levels, mp->m_rmap_maxlevels);
> > >   	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
> > >   }
> > > +static inline void
> > > +xfs_mp_compute_awu_max(
> > 
> > xfs_compute_awu_max() ?
> 
> ok
> 
> > 
> > > +	struct xfs_mount	*mp)
> > > +{
> > > +	xfs_agblock_t		agsize = mp->m_sb.sb_agblocks;
> > > +	xfs_agblock_t		awu_max;
> > > +
> > > +	if (!xfs_has_reflink(mp)) {
> > > +		mp->awu_max = 1;
> > > +		return;
> > > +	}
> > > +
> > > +	/*
> > > +	 * Find highest power-of-2 evenly divisible into agsize and which
> > > +	 * also fits into an unsigned int field.
> > > +	 */
> > > +	awu_max = 1;
> > > +	while (1) {
> > > +		if (agsize % (awu_max * 2))
> > > +			break;
> > > +		if (XFS_FSB_TO_B(mp, awu_max * 2) > UINT_MAX)
> > > +			break;
> > > +		awu_max *= 2;
> > > +	}
> > > +	mp->awu_max = awu_max;
> > 
> > I think you need two awu_maxes here -- one for the data device, and
> > another for the realtime device.
> How about we just don't support rtdev initially for this CoW-based method,
> i.e. stick at 1x FSB awu max?

I guess, but that's more unfinished business.

--D

> > The rt computation is probably more
> > complex since I think it's the greatest power of two that fits in the rt
> > extent size if it isn't a power of two; or the greatest power of two
> > that fits in the rtgroup if rtgroups are enabled; or probably just no
> > limit otherwise.
> >
> 
> Thanks,
> John
>
Re: [PATCH RFC 09/10] xfs: Update atomic write max size
Posted by John Garry 10 months, 1 week ago
On 06/02/2025 21:54, Darrick J. Wong wrote:
>>> I think you need two awu_maxes here -- one for the data device, and
>>> another for the realtime device.
>> How about we just don't support rtdev initially for this CoW-based method,
>> i.e. stick at 1x FSB awu max?
> I guess, but that's more unfinished business.

Understood. Let me see how the changes look for RT and then reconsider.

Cheers,
John