[v4] large atomic writes for xfs with CoW

[PATCH v4 12/12] xfs: Allow block allocator to take an alignment hint

Posted by John Garry 11 months, 1 week ago

When issuing an atomic write by the CoW method, give the block allocator a
hint to align to the extszhint.

This means that we have a better chance to issuing the atomic write via
HW offload next time.

It does mean that the inode extszhint should be set appropriately for the
expected atomic write size.

Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 7 ++++++-
 fs/xfs/libxfs/xfs_bmap.h | 6 +++++-
 fs/xfs/xfs_reflink.c     | 8 ++++++--
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 0ef19f1469ec..9bfdfb7cdcae 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3454,6 +3454,12 @@ xfs_bmap_compute_alignments(
 		align = xfs_get_cowextsz_hint(ap->ip);
 	else if (ap->datatype & XFS_ALLOC_USERDATA)
 		align = xfs_get_extsz_hint(ap->ip);
+
+	if (align > 1 && ap->flags & XFS_BMAPI_EXTSZALIGN)
+		args->alignment = align;
+	else
+		args->alignment = 1;
+
 	if (align) {
 		if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
 					ap->eof, 0, ap->conv, &ap->offset,
@@ -3782,7 +3788,6 @@ xfs_bmap_btalloc(
 		.wasdel		= ap->wasdel,
 		.resv		= XFS_AG_RESV_NONE,
 		.datatype	= ap->datatype,
-		.alignment	= 1,
 		.minalignslop	= 0,
 	};
 	xfs_fileoff_t		orig_offset;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 4b721d935994..e6baa81e20d8 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -87,6 +87,9 @@ struct xfs_bmalloca {
 /* Do not update the rmap btree.  Used for reconstructing bmbt from rmapbt. */
 #define XFS_BMAPI_NORMAP	(1u << 10)
 
+/* Try to align allocations to the extent size hint */
+#define XFS_BMAPI_EXTSZALIGN	(1u << 11)
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
 	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
 	{ XFS_BMAPI_NODISCARD,	"NODISCARD" }, \
-	{ XFS_BMAPI_NORMAP,	"NORMAP" }
+	{ XFS_BMAPI_NORMAP,	"NORMAP" },\
+	{ XFS_BMAPI_EXTSZALIGN,	"EXTSZALIGN" }
 
 
 static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 844e2b43357b..72fb60df9a53 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -445,6 +445,11 @@ xfs_reflink_fill_cow_hole(
 	int			error;
 	bool			found;
 	bool			atomic_sw = flags & XFS_REFLINK_ATOMIC_SW;
+	uint32_t		bmapi_flags = XFS_BMAPI_COWFORK |
+					      XFS_BMAPI_PREALLOC;
+
+	if (atomic_sw)
+		bmapi_flags |= XFS_BMAPI_EXTSZALIGN;
 
 	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
 		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
@@ -478,8 +483,7 @@ xfs_reflink_fill_cow_hole(
 	/* Allocate the entire reservation as unwritten blocks. */
 	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
-			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
-			&nimaps);
+			bmapi_flags, 0, cmap, &nimaps);
 	if (error)
 		goto out_trans_cancel;
 
-- 
2.31.1

Re: [PATCH v4 12/12] xfs: Allow block allocator to take an alignment hint

Posted by Dave Chinner 11 months, 1 week ago

On Mon, Mar 03, 2025 at 05:11:20PM +0000, John Garry wrote:
> When issuing an atomic write by the CoW method, give the block allocator a
> hint to align to the extszhint.
> 
> This means that we have a better chance to issuing the atomic write via
> HW offload next time.
> 
> It does mean that the inode extszhint should be set appropriately for the
> expected atomic write size.
> 
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_bmap.c | 7 ++++++-
>  fs/xfs/libxfs/xfs_bmap.h | 6 +++++-
>  fs/xfs/xfs_reflink.c     | 8 ++++++--
>  3 files changed, 17 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 0ef19f1469ec..9bfdfb7cdcae 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3454,6 +3454,12 @@ xfs_bmap_compute_alignments(
>  		align = xfs_get_cowextsz_hint(ap->ip);
>  	else if (ap->datatype & XFS_ALLOC_USERDATA)
>  		align = xfs_get_extsz_hint(ap->ip);
> +
> +	if (align > 1 && ap->flags & XFS_BMAPI_EXTSZALIGN)

needs () around the & logic.

	if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))

> +		args->alignment = align;
> +	else
> +		args->alignment = 1;

When is  args->alignment not already initialised to 1?

> +
>  	if (align) {
>  		if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
>  					ap->eof, 0, ap->conv, &ap->offset,
> @@ -3782,7 +3788,6 @@ xfs_bmap_btalloc(
>  		.wasdel		= ap->wasdel,
>  		.resv		= XFS_AG_RESV_NONE,
>  		.datatype	= ap->datatype,
> -		.alignment	= 1,
>  		.minalignslop	= 0,
>  	};

Oh, you removed the initialisation to 1, so now we have the
possibility of getting args->alignment = 0 anywhere in the
allocation stack?

FWIW, we've been trying to get rid of that case - args->alignment should
always be 1 if no alignment is necessary so we don't ahve to special
case alignment of 0  (meaning no alignemnt) anywhere. This seems
like a step backwards from that perspective...



>  	xfs_fileoff_t		orig_offset;
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index 4b721d935994..e6baa81e20d8 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -87,6 +87,9 @@ struct xfs_bmalloca {
>  /* Do not update the rmap btree.  Used for reconstructing bmbt from rmapbt. */
>  #define XFS_BMAPI_NORMAP	(1u << 10)
>  
> +/* Try to align allocations to the extent size hint */
> +#define XFS_BMAPI_EXTSZALIGN	(1u << 11)

Don't we already do that?

Or is this doing something subtle and non-obvious like overriding
stripe width alignment for large atomic writes?

-Dave.
-- 
Dave Chinner
david@fromorbit.com

Re: [PATCH v4 12/12] xfs: Allow block allocator to take an alignment hint

Posted by John Garry 11 months ago

On 09/03/2025 22:03, Dave Chinner wrote:
> On Mon, Mar 03, 2025 at 05:11:20PM +0000, John Garry wrote:
>> When issuing an atomic write by the CoW method, give the block allocator a
>> hint to align to the extszhint.
>>
>> This means that we have a better chance to issuing the atomic write via
>> HW offload next time.
>>
>> It does mean that the inode extszhint should be set appropriately for the
>> expected atomic write size.
>>
>> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/libxfs/xfs_bmap.c | 7 ++++++-
>>   fs/xfs/libxfs/xfs_bmap.h | 6 +++++-
>>   fs/xfs/xfs_reflink.c     | 8 ++++++--
>>   3 files changed, 17 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
>> index 0ef19f1469ec..9bfdfb7cdcae 100644
>> --- a/fs/xfs/libxfs/xfs_bmap.c
>> +++ b/fs/xfs/libxfs/xfs_bmap.c
>> @@ -3454,6 +3454,12 @@ xfs_bmap_compute_alignments(
>>   		align = xfs_get_cowextsz_hint(ap->ip);
>>   	else if (ap->datatype & XFS_ALLOC_USERDATA)
>>   		align = xfs_get_extsz_hint(ap->ip);
>> +
>> +	if (align > 1 && ap->flags & XFS_BMAPI_EXTSZALIGN)
> 
> needs () around the & logic.

ok

> 
> 	if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
> 
>> +		args->alignment = align;
>> +	else
>> +		args->alignment = 1;
> 
> When is  args->alignment not already initialised to 1?
> 
>> +
>>   	if (align) {
>>   		if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
>>   					ap->eof, 0, ap->conv, &ap->offset,
>> @@ -3782,7 +3788,6 @@ xfs_bmap_btalloc(
>>   		.wasdel		= ap->wasdel,
>>   		.resv		= XFS_AG_RESV_NONE,
>>   		.datatype	= ap->datatype,
>> -		.alignment	= 1,
>>   		.minalignslop	= 0,
>>   	};
> 
> Oh, you removed the initialisation to 1, so now we have the
> possibility of getting args->alignment = 0 anywhere in the
> allocation stack?
> 
> FWIW, we've been trying to get rid of that case - args->alignment should
> always be 1 if no alignment is necessary so we don't ahve to special
> case alignment of 0  (meaning no alignemnt) anywhere. This seems
> like a step backwards from that perspective...

As I recall, doing this was a suggestion when developing the forcealign 
support (as it had similar logic).

Anyway, I can leave the init to 1 in xfs_bmap_btalloc()

> 
> 
> 
>>   	xfs_fileoff_t		orig_offset;
>> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
>> index 4b721d935994..e6baa81e20d8 100644
>> --- a/fs/xfs/libxfs/xfs_bmap.h
>> +++ b/fs/xfs/libxfs/xfs_bmap.h
>> @@ -87,6 +87,9 @@ struct xfs_bmalloca {
>>   /* Do not update the rmap btree.  Used for reconstructing bmbt from rmapbt. */
>>   #define XFS_BMAPI_NORMAP	(1u << 10)
>>   
>> +/* Try to align allocations to the extent size hint */
>> +#define XFS_BMAPI_EXTSZALIGN	(1u << 11)
> 
> Don't we already do that?
> 
> Or is this doing something subtle and non-obvious like overriding
> stripe width alignment for large atomic writes?
>

stripe alignment only comes into play for eof allocation.

args->alignment is used in xfs_alloc_compute_aligned() to actually align 
the start bno.

If I don't have this, then we can get this ping-pong affect when 
overwriting atomically the same region:

# dd if=/dev/zero of=mnt/file bs=1M count=10 conv=fsync
# xfs_bmap -vp mnt/file
mnt/file:
EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..20479]:      192..20671        0 (192..20671)     20480 000000
# /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file
wrote 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0525 sec (1.190 MiB/sec and 19.0425 ops/sec)
# xfs_bmap -vp mnt/file
mnt/file:
EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..127]:        20672..20799      0 (20672..20799)     128 000000
   1: [128..20479]:    320..20671        0 (320..20671)     20352 000000
# /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file
wrote 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0524 sec (1.191 MiB/sec and 19.0581 ops/sec)
# xfs_bmap -vp mnt/file
mnt/file:
EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..20479]:      192..20671        0 (192..20671)     20480 000000
# /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file
wrote 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0524 sec (1.191 MiB/sec and 19.0611 ops/sec)
# xfs_bmap -vp mnt/file
mnt/file:
EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..127]:        20672..20799      0 (20672..20799)     128 000000
   1: [128..20479]:    320..20671        0 (320..20671)     20352 000000

We are never getting aligned extents wrt write length, and so have to 
fall back to the SW-based atomic write always. That is not what we want.

Thanks,
John

Re: [PATCH v4 12/12] xfs: Allow block allocator to take an alignment hint

Posted by Dave Chinner 11 months ago

On Mon, Mar 10, 2025 at 12:10:44PM +0000, John Garry wrote:
> On 09/03/2025 22:03, Dave Chinner wrote:
> > On Mon, Mar 03, 2025 at 05:11:20PM +0000, John Garry wrote:
> > > diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> > > index 4b721d935994..e6baa81e20d8 100644
> > > --- a/fs/xfs/libxfs/xfs_bmap.h
> > > +++ b/fs/xfs/libxfs/xfs_bmap.h
> > > @@ -87,6 +87,9 @@ struct xfs_bmalloca {
> > >   /* Do not update the rmap btree.  Used for reconstructing bmbt from rmapbt. */
> > >   #define XFS_BMAPI_NORMAP	(1u << 10)
> > > +/* Try to align allocations to the extent size hint */
> > > +#define XFS_BMAPI_EXTSZALIGN	(1u << 11)
> > 
> > Don't we already do that?
> > 
> > Or is this doing something subtle and non-obvious like overriding
> > stripe width alignment for large atomic writes?
> > 
> 
> stripe alignment only comes into play for eof allocation.
> 
> args->alignment is used in xfs_alloc_compute_aligned() to actually align the
> start bno.
> 
> If I don't have this, then we can get this ping-pong affect when overwriting
> atomically the same region:
> 
> # dd if=/dev/zero of=mnt/file bs=1M count=10 conv=fsync
> # xfs_bmap -vp mnt/file
> mnt/file:
> EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
>   0: [0..20479]:      192..20671        0 (192..20671)     20480 000000
> # /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file
> wrote 65536/65536 bytes at offset 0
> 64 KiB, 1 ops; 0.0525 sec (1.190 MiB/sec and 19.0425 ops/sec)
> # xfs_bmap -vp mnt/file
> mnt/file:
> EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
>   0: [0..127]:        20672..20799      0 (20672..20799)     128 000000
>   1: [128..20479]:    320..20671        0 (320..20671)     20352 000000
> # /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file
> wrote 65536/65536 bytes at offset 0
> 64 KiB, 1 ops; 0.0524 sec (1.191 MiB/sec and 19.0581 ops/sec)
> # xfs_bmap -vp mnt/file
> mnt/file:
> EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
>   0: [0..20479]:      192..20671        0 (192..20671)     20480 000000
> # /xfs_io -d -C "pwrite -b 64k -V 1 -A -D 0 64k" mnt/file
> wrote 65536/65536 bytes at offset 0
> 64 KiB, 1 ops; 0.0524 sec (1.191 MiB/sec and 19.0611 ops/sec)
> # xfs_bmap -vp mnt/file
> mnt/file:
> EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
>   0: [0..127]:        20672..20799      0 (20672..20799)     128 000000
>   1: [128..20479]:    320..20671        0 (320..20671)     20352 000000
> 
> We are never getting aligned extents wrt write length, and so have to fall
> back to the SW-based atomic write always. That is not what we want.

Please add a comment to explain this where the XFS_BMAPI_EXTSZALIGN
flag is set, because it's not at all obvious what it is doing or why
it is needed from the name of the variable or the implementation.

-Dave.

-- 
Dave Chinner
david@fromorbit.com

Re: [PATCH v4 12/12] xfs: Allow block allocator to take an alignment hint

Posted by John Garry 11 months ago

On 12/03/2025 19:51, Dave Chinner wrote:
>> We are never getting aligned extents wrt write length, and so have to fall
>> back to the SW-based atomic write always. That is not what we want.
> Please add a comment to explain this where the XFS_BMAPI_EXTSZALIGN
> flag is set, because it's not at all obvious what it is doing or why
> it is needed from the name of the variable or the implementation.

ok, fine. But that is as long as we just are doing the alignment for 
XFS_BMAPI_EXTSZALIGN (and not always, as Christoph queried).

Thanks,
John