[v1] large atomic writes for xfs with CoW

[PATCH RFC 08/10] xfs: Commit CoW-based atomic writes atomically

Posted by John Garry 1 year ago

When completing a CoW-based write, each extent range mapping update is
covered by a separate transaction.

For a CoW-based atomic write, all mappings must be changed at once, so
change to use a single transaction.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_file.c    |  5 ++++-
 fs/xfs/xfs_reflink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_reflink.h |  3 +++
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 12af5cdc3094..170d7891f90d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -527,7 +527,10 @@ xfs_dio_write_end_io(
 	nofs_flag = memalloc_nofs_save();
 
 	if (flags & IOMAP_DIO_COW) {
-		error = xfs_reflink_end_cow(ip, offset, size);
+		if (iocb->ki_flags & IOCB_ATOMIC)
+			error = xfs_reflink_end_atomic_cow(ip, offset, size);
+		else
+			error = xfs_reflink_end_cow(ip, offset, size);
 		if (error)
 			goto out;
 	}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index dbce333b60eb..60c986300faa 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -990,6 +990,54 @@ xfs_reflink_end_cow(
 		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
 	return error;
 }
+int
+xfs_reflink_end_atomic_cow(
+	struct xfs_inode		*ip,
+	xfs_off_t			offset,
+	xfs_off_t			count)
+{
+	xfs_fileoff_t			offset_fsb;
+	xfs_fileoff_t			end_fsb;
+	int				error = 0;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_trans		*tp;
+	unsigned int			resblks;
+	bool				commit = false;
+
+	trace_xfs_reflink_end_cow(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
+				(unsigned int)(end_fsb - offset_fsb),
+				XFS_DATA_FORK);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+			XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	while (end_fsb > offset_fsb && !error)
+		error = xfs_reflink_end_cow_extent_locked(ip, &offset_fsb,
+						end_fsb, tp, &commit);
+
+	if (error || !commit)
+		goto out_cancel;
+
+	if (error)
+		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
 
 /*
  * Free all CoW staging blocks that are still referenced by the ondisk refcount
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index ef5c8b2398d8..2c3b096c1386 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -45,6 +45,9 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count, bool cancel_real);
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
+		int
+xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
 extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
 		struct file *file_out, loff_t pos_out, loff_t len,
-- 
2.31.1

Re: [PATCH RFC 08/10] xfs: Commit CoW-based atomic writes atomically

Posted by Darrick J. Wong 1 year ago

On Tue, Feb 04, 2025 at 12:01:25PM +0000, John Garry wrote:
> When completing a CoW-based write, each extent range mapping update is
> covered by a separate transaction.
> 
> For a CoW-based atomic write, all mappings must be changed at once, so
> change to use a single transaction.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_file.c    |  5 ++++-
>  fs/xfs/xfs_reflink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_reflink.h |  3 +++
>  3 files changed, 55 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 12af5cdc3094..170d7891f90d 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -527,7 +527,10 @@ xfs_dio_write_end_io(
>  	nofs_flag = memalloc_nofs_save();
>  
>  	if (flags & IOMAP_DIO_COW) {
> -		error = xfs_reflink_end_cow(ip, offset, size);
> +		if (iocb->ki_flags & IOCB_ATOMIC)
> +			error = xfs_reflink_end_atomic_cow(ip, offset, size);
> +		else
> +			error = xfs_reflink_end_cow(ip, offset, size);
>  		if (error)
>  			goto out;
>  	}
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index dbce333b60eb..60c986300faa 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -990,6 +990,54 @@ xfs_reflink_end_cow(
>  		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
>  	return error;
>  }
> +int
> +xfs_reflink_end_atomic_cow(
> +	struct xfs_inode		*ip,
> +	xfs_off_t			offset,
> +	xfs_off_t			count)
> +{
> +	xfs_fileoff_t			offset_fsb;
> +	xfs_fileoff_t			end_fsb;
> +	int				error = 0;
> +	struct xfs_mount		*mp = ip->i_mount;
> +	struct xfs_trans		*tp;
> +	unsigned int			resblks;
> +	bool				commit = false;
> +
> +	trace_xfs_reflink_end_cow(ip, offset, count);
> +
> +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
> +
> +	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
> +				(unsigned int)(end_fsb - offset_fsb),
> +				XFS_DATA_FORK);
> +
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,

xfs gained reflink support for realtime volumes in 6.14-rc1, so you now
have to calculate for that in here too.

> +			XFS_TRANS_RESERVE, &tp);
> +	if (error)
> +		return error;
> +
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	xfs_trans_ijoin(tp, ip, 0);
> +
> +	while (end_fsb > offset_fsb && !error)
> +		error = xfs_reflink_end_cow_extent_locked(ip, &offset_fsb,
> +						end_fsb, tp, &commit);

Hmm.  Attaching intent items to a transaction consumes space in that
transaction, so we probably ought to limit the amount that we try to do
here.  Do you know what that limit is?  I don't, but it's roughly
tr_logres divided by the average size of a log intent item.

This means we need to restrict the size of an untorn write to a
double-digit number of fsblocks for safety.

The logic in here looks reasonable though.

--D

> +
> +	if (error || !commit)
> +		goto out_cancel;
> +
> +	if (error)
> +		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> +	error = xfs_trans_commit(tp);
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +out_cancel:
> +	xfs_trans_cancel(tp);
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +}
>  
>  /*
>   * Free all CoW staging blocks that are still referenced by the ondisk refcount
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index ef5c8b2398d8..2c3b096c1386 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -45,6 +45,9 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count, bool cancel_real);
>  extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count);
> +		int
> +xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
> +		xfs_off_t count);
>  extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
>  extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
>  		struct file *file_out, loff_t pos_out, loff_t len,
> -- 
> 2.31.1
> 
>

Re: [PATCH RFC 08/10] xfs: Commit CoW-based atomic writes atomically

Posted by John Garry 1 year ago

On 05/02/2025 19:47, Darrick J. Wong wrote:
> On Tue, Feb 04, 2025 at 12:01:25PM +0000, John Garry wrote:
>> When completing a CoW-based write, each extent range mapping update is
>> covered by a separate transaction.
>>
>> For a CoW-based atomic write, all mappings must be changed at once, so
>> change to use a single transaction.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/xfs_file.c    |  5 ++++-
>>   fs/xfs/xfs_reflink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
>>   fs/xfs/xfs_reflink.h |  3 +++
>>   3 files changed, 55 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 12af5cdc3094..170d7891f90d 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -527,7 +527,10 @@ xfs_dio_write_end_io(
>>   	nofs_flag = memalloc_nofs_save();
>>   
>>   	if (flags & IOMAP_DIO_COW) {
>> -		error = xfs_reflink_end_cow(ip, offset, size);
>> +		if (iocb->ki_flags & IOCB_ATOMIC)
>> +			error = xfs_reflink_end_atomic_cow(ip, offset, size);
>> +		else
>> +			error = xfs_reflink_end_cow(ip, offset, size);
>>   		if (error)
>>   			goto out;
>>   	}
>> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
>> index dbce333b60eb..60c986300faa 100644
>> --- a/fs/xfs/xfs_reflink.c
>> +++ b/fs/xfs/xfs_reflink.c
>> @@ -990,6 +990,54 @@ xfs_reflink_end_cow(
>>   		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
>>   	return error;
>>   }
>> +int
>> +xfs_reflink_end_atomic_cow(
>> +	struct xfs_inode		*ip,
>> +	xfs_off_t			offset,
>> +	xfs_off_t			count)
>> +{
>> +	xfs_fileoff_t			offset_fsb;
>> +	xfs_fileoff_t			end_fsb;
>> +	int				error = 0;
>> +	struct xfs_mount		*mp = ip->i_mount;
>> +	struct xfs_trans		*tp;
>> +	unsigned int			resblks;
>> +	bool				commit = false;
>> +
>> +	trace_xfs_reflink_end_cow(ip, offset, count);
>> +
>> +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
>> +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
>> +
>> +	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
>> +				(unsigned int)(end_fsb - offset_fsb),
>> +				XFS_DATA_FORK);
>> +
>> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> 
> xfs gained reflink support for realtime volumes in 6.14-rc1, so you now
> have to calculate for that in here too.
> 
>> +			XFS_TRANS_RESERVE, &tp);
>> +	if (error)
>> +		return error;
>> +
>> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
>> +	xfs_trans_ijoin(tp, ip, 0);
>> +
>> +	while (end_fsb > offset_fsb && !error)
>> +		error = xfs_reflink_end_cow_extent_locked(ip, &offset_fsb,
>> +						end_fsb, tp, &commit);
> 
> Hmm.  Attaching intent items to a transaction consumes space in that
> transaction, so we probably ought to limit the amount that we try to do
> here.  Do you know what that limit is?  I don't, 

nor do I ...

> but it's roughly
> tr_logres divided by the average size of a log intent item.

So you have a ballpark figure on the average size of a log intent item, 
or an idea on how to get it?

> 
> This means we need to restrict the size of an untorn write to a
> double-digit number of fsblocks for safety.

Sure, but won't we also still be liable to suffer the same issue which 
was fixed in commit d6f215f359637?

> 
> The logic in here looks reasonable though.
> 

Thanks,
John

> --D
> 
>> +
>> +	if (error || !commit)
>> +		goto out_cancel;
>> +
>> +	if (error)
>> +		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
>> +	error = xfs_trans_commit(tp);
>> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +	return error;
>> +out_cancel:
>> +	xfs_trans_cancel(tp);
>> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +	return error;
>> +}
>>   
>>   /*
>>    * Free all CoW staging blocks that are still referenced by the ondisk refcount
>> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
>> index ef5c8b2398d8..2c3b096c1386 100644
>> --- a/fs/xfs/xfs_reflink.h
>> +++ b/fs/xfs/xfs_reflink.h
>> @@ -45,6 +45,9 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
>>   		xfs_off_t count, bool cancel_real);
>>   extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
>>   		xfs_off_t count);
>> +		int
>> +xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
>> +		xfs_off_t count);
>>   extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
>>   extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
>>   		struct file *file_out, loff_t pos_out, loff_t len,
>> -- 
>> 2.31.1
>>
>>

Re: [PATCH RFC 08/10] xfs: Commit CoW-based atomic writes atomically

Posted by Darrick J. Wong 1 year ago

On Thu, Feb 06, 2025 at 10:27:45AM +0000, John Garry wrote:
> On 05/02/2025 19:47, Darrick J. Wong wrote:
> > On Tue, Feb 04, 2025 at 12:01:25PM +0000, John Garry wrote:
> > > When completing a CoW-based write, each extent range mapping update is
> > > covered by a separate transaction.
> > > 
> > > For a CoW-based atomic write, all mappings must be changed at once, so
> > > change to use a single transaction.
> > > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/xfs/xfs_file.c    |  5 ++++-
> > >   fs/xfs/xfs_reflink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/xfs/xfs_reflink.h |  3 +++
> > >   3 files changed, 55 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 12af5cdc3094..170d7891f90d 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -527,7 +527,10 @@ xfs_dio_write_end_io(
> > >   	nofs_flag = memalloc_nofs_save();
> > >   	if (flags & IOMAP_DIO_COW) {
> > > -		error = xfs_reflink_end_cow(ip, offset, size);
> > > +		if (iocb->ki_flags & IOCB_ATOMIC)
> > > +			error = xfs_reflink_end_atomic_cow(ip, offset, size);
> > > +		else
> > > +			error = xfs_reflink_end_cow(ip, offset, size);
> > >   		if (error)
> > >   			goto out;
> > >   	}
> > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > index dbce333b60eb..60c986300faa 100644
> > > --- a/fs/xfs/xfs_reflink.c
> > > +++ b/fs/xfs/xfs_reflink.c
> > > @@ -990,6 +990,54 @@ xfs_reflink_end_cow(
> > >   		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> > >   	return error;
> > >   }
> > > +int
> > > +xfs_reflink_end_atomic_cow(
> > > +	struct xfs_inode		*ip,
> > > +	xfs_off_t			offset,
> > > +	xfs_off_t			count)
> > > +{
> > > +	xfs_fileoff_t			offset_fsb;
> > > +	xfs_fileoff_t			end_fsb;
> > > +	int				error = 0;
> > > +	struct xfs_mount		*mp = ip->i_mount;
> > > +	struct xfs_trans		*tp;
> > > +	unsigned int			resblks;
> > > +	bool				commit = false;
> > > +
> > > +	trace_xfs_reflink_end_cow(ip, offset, count);
> > > +
> > > +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> > > +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
> > > +
> > > +	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
> > > +				(unsigned int)(end_fsb - offset_fsb),
> > > +				XFS_DATA_FORK);
> > > +
> > > +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> > 
> > xfs gained reflink support for realtime volumes in 6.14-rc1, so you now
> > have to calculate for that in here too.
> > 
> > > +			XFS_TRANS_RESERVE, &tp);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> > > +	xfs_trans_ijoin(tp, ip, 0);
> > > +
> > > +	while (end_fsb > offset_fsb && !error)
> > > +		error = xfs_reflink_end_cow_extent_locked(ip, &offset_fsb,
> > > +						end_fsb, tp, &commit);
> > 
> > Hmm.  Attaching intent items to a transaction consumes space in that
> > transaction, so we probably ought to limit the amount that we try to do
> > here.  Do you know what that limit is?  I don't,
> 
> nor do I ...
> 
> > but it's roughly
> > tr_logres divided by the average size of a log intent item.
> 
> So you have a ballpark figure on the average size of a log intent item, or
> an idea on how to get it?

You could add up the size of struct
xfs_{bui,rmap,refcount,efi}_log_format structures and add 20%, that will
give you a ballpark figure of the worst case per-block requirements.

My guess is that 64 blocks is ok provided resblks is big enough.  But I
guess we could estimate it (very conservatively) dynamically too.

(also note tr_itruncate declares more logres)

> > This means we need to restrict the size of an untorn write to a
> > double-digit number of fsblocks for safety.
> 
> Sure, but won't we also still be liable to suffer the same issue which was
> fixed in commit d6f215f359637?

Yeah, come to think of it, you need to reserve the worst case space
reservation, i.e. each of the blocks between offset_fsb and end_fsb
becomes a separate btree update.

	resblks = (end_fsb - offset_fsb) *
			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);

--D

> > 
> > The logic in here looks reasonable though.
> > 
> 
> Thanks,
> John
> 
> > --D
> > 
> > > +
> > > +	if (error || !commit)
> > > +		goto out_cancel;
> > > +
> > > +	if (error)
> > > +		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> > > +	error = xfs_trans_commit(tp);
> > > +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > > +	return error;
> > > +out_cancel:
> > > +	xfs_trans_cancel(tp);
> > > +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > > +	return error;
> > > +}
> > >   /*
> > >    * Free all CoW staging blocks that are still referenced by the ondisk refcount
> > > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > > index ef5c8b2398d8..2c3b096c1386 100644
> > > --- a/fs/xfs/xfs_reflink.h
> > > +++ b/fs/xfs/xfs_reflink.h
> > > @@ -45,6 +45,9 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
> > >   		xfs_off_t count, bool cancel_real);
> > >   extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
> > >   		xfs_off_t count);
> > > +		int
> > > +xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
> > > +		xfs_off_t count);
> > >   extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
> > >   extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
> > >   		struct file *file_out, loff_t pos_out, loff_t len,
> > > -- 
> > > 2.31.1
> > > 
> > > 
> 
>

Re: [PATCH RFC 08/10] xfs: Commit CoW-based atomic writes atomically

Posted by John Garry 1 year ago

On 06/02/2025 21:50, Darrick J. Wong wrote:
> On Thu, Feb 06, 2025 at 10:27:45AM +0000, John Garry wrote:
>> On 05/02/2025 19:47, Darrick J. Wong wrote:
>>> On Tue, Feb 04, 2025 at 12:01:25PM +0000, John Garry wrote:
>>>> When completing a CoW-based write, each extent range mapping update is
>>>> covered by a separate transaction.
>>>>
>>>> For a CoW-based atomic write, all mappings must be changed at once, so
>>>> change to use a single transaction.
>>>>
>>>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>>>> ---
>>>>    fs/xfs/xfs_file.c    |  5 ++++-
>>>>    fs/xfs/xfs_reflink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
>>>>    fs/xfs/xfs_reflink.h |  3 +++
>>>>    3 files changed, 55 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>>>> index 12af5cdc3094..170d7891f90d 100644
>>>> --- a/fs/xfs/xfs_file.c
>>>> +++ b/fs/xfs/xfs_file.c
>>>> @@ -527,7 +527,10 @@ xfs_dio_write_end_io(
>>>>    	nofs_flag = memalloc_nofs_save();
>>>>    	if (flags & IOMAP_DIO_COW) {
>>>> -		error = xfs_reflink_end_cow(ip, offset, size);
>>>> +		if (iocb->ki_flags & IOCB_ATOMIC)
>>>> +			error = xfs_reflink_end_atomic_cow(ip, offset, size);
>>>> +		else
>>>> +			error = xfs_reflink_end_cow(ip, offset, size);
>>>>    		if (error)
>>>>    			goto out;
>>>>    	}
>>>> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
>>>> index dbce333b60eb..60c986300faa 100644
>>>> --- a/fs/xfs/xfs_reflink.c
>>>> +++ b/fs/xfs/xfs_reflink.c
>>>> @@ -990,6 +990,54 @@ xfs_reflink_end_cow(
>>>>    		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
>>>>    	return error;
>>>>    }
>>>> +int
>>>> +xfs_reflink_end_atomic_cow(
>>>> +	struct xfs_inode		*ip,
>>>> +	xfs_off_t			offset,
>>>> +	xfs_off_t			count)
>>>> +{
>>>> +	xfs_fileoff_t			offset_fsb;
>>>> +	xfs_fileoff_t			end_fsb;
>>>> +	int				error = 0;
>>>> +	struct xfs_mount		*mp = ip->i_mount;
>>>> +	struct xfs_trans		*tp;
>>>> +	unsigned int			resblks;
>>>> +	bool				commit = false;
>>>> +
>>>> +	trace_xfs_reflink_end_cow(ip, offset, count);
>>>> +
>>>> +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
>>>> +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
>>>> +
>>>> +	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
>>>> +				(unsigned int)(end_fsb - offset_fsb),
>>>> +				XFS_DATA_FORK);
>>>> +
>>>> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
>>>
>>> xfs gained reflink support for realtime volumes in 6.14-rc1, so you now
>>> have to calculate for that in here too.
>>>
>>>> +			XFS_TRANS_RESERVE, &tp);
>>>> +	if (error)
>>>> +		return error;
>>>> +
>>>> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
>>>> +	xfs_trans_ijoin(tp, ip, 0);
>>>> +
>>>> +	while (end_fsb > offset_fsb && !error)
>>>> +		error = xfs_reflink_end_cow_extent_locked(ip, &offset_fsb,
>>>> +						end_fsb, tp, &commit);
>>>
>>> Hmm.  Attaching intent items to a transaction consumes space in that
>>> transaction, so we probably ought to limit the amount that we try to do
>>> here.  Do you know what that limit is?  I don't,
>>
>> nor do I ...
>>
>>> but it's roughly
>>> tr_logres divided by the average size of a log intent item.
>>
>> So you have a ballpark figure on the average size of a log intent item, or
>> an idea on how to get it?
> 
> You could add up the size of struct
> xfs_{bui,rmap,refcount,efi}_log_format structures and add 20%, that will
> give you a ballpark figure of the worst case per-block requirements.
> 
> My guess is that 64 blocks is ok provided resblks is big enough.  But I
> guess we could estimate it (very conservatively) dynamically too.
> 
> (also note tr_itruncate declares more logres)

64 blocks gives quite a large awu max, so that would be ok

> 
>>> This means we need to restrict the size of an untorn write to a
>>> double-digit number of fsblocks for safety.
>>
>> Sure, but won't we also still be liable to suffer the same issue which was
>> fixed in commit d6f215f359637?
> 
> Yeah, come to think of it, you need to reserve the worst case space
> reservation, i.e. each of the blocks between offset_fsb and end_fsb
> becomes a separate btree update.
> 
> 	resblks = (end_fsb - offset_fsb) *
> 			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
> 
>

ok

Thanks,
John