fs/btrfs/extent_io.c | 39 ++++++++++++++++++++++++++++++++++++--- fs/btrfs/reflink.c | 21 +++++---------------- 2 files changed, 41 insertions(+), 19 deletions(-)
For archival purposes, forwarding an incoming command email to
linux-kernel@vger.kernel.org, syzkaller-bugs@googlegroups.com.
***
Subject: [PATCH] btrfs: fix hung task and deadlock when cloning inline extents
Author: kartikey406@gmail.com
#syz test: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
When cloning or deduplicating inline extents, the clone operation sets
BTRFS_INODE_NO_DELALLOC_FLUSH on the destination inode inside
copy_inline_to_page() after the extent range is locked and the page is
dirtied. This creates a race window where writeback can be triggered
before the flag is set, causing a hung task or deadlock.
The sequence that causes the deadlock is:
1. copy_inline_to_page() locks extent range [0, block_size) in the
destination inode's io_tree and dirties a page. The inode's i_size
is still 0 at this point since clone_finish_inode_update() has not
been called yet.
2. The clone calls start_transaction() which may block waiting for the
current transaction to commit (pid A waits for pid B).
3. The transaction commit (pid B) calls btrfs_start_delalloc_flush()
which calls try_to_writeback_inodes_sb(), queuing a kworker to
flush the destination inode.
4. The kworker calls btrfs_writepages() -> extent_writepage(). Since
i_size is still 0, the dirty page appears beyond EOF, causing
extent_writepage() to call folio_invalidate() ->
btrfs_invalidate_folio() -> btrfs_lock_extent(), which blocks
forever because the clone already holds that lock.
This creates a circular deadlock:
clone -> waits for transaction commit (pid A waits for pid B)
commit -> waits for kworker writeback (pid B waits for pid C)
kworker -> waits for extent lock held by clone (pid C waits for pid A)
Additionally, any periodic background writeback racing with the clone
operation will block on the same extent lock, causing a hung task warning
even without the circular deadlock.
The existing fix set BTRFS_INODE_NO_DELALLOC_FLUSH inside
copy_inline_to_page() only after marking the range as delalloc, which left
a race window in which writeback could start before the flag was set. In
addition, the flag was checked only in start_delalloc_inodes(), which is
reached only through the btrfs metadata reclaim path. The transaction
commit path goes through try_to_writeback_inodes_sb(), a VFS function
that bypasses start_delalloc_inodes() entirely.
Fix this with two changes:
1. Move the set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH) to before the
btrfs_lock_extent() call in both btrfs_clone_files() and
btrfs_extent_same_range(), and clear it after btrfs_unlock_extent().
This ensures the flag is set before the extent lock is taken and
before any page is dirtied, closing the race window completely.
The destination inode is protected by i_mutex for the entire
operation, so a concurrent clone/dedupe on the same inode is
impossible, making plain set_bit/clear_bit safe without reference
counting.
2. Check BTRFS_INODE_NO_DELALLOC_FLUSH at the top of
btrfs_writepages() and return early if set. This catches all
writeback paths since both the VFS writeback path via
try_to_writeback_inodes_sb() and the btrfs delalloc path
eventually call btrfs_writepages(). The inode will be safely
written after the clone operation finishes and clears the flag,
at which point all locks are released and i_size is properly set.
Reported-by: syzbot+63056bf627663701bbbf@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=63056bf627663701bbbf
Signed-off-by: Deepanshu Kartikey <Kartikey406@gmail.com>
---
fs/btrfs/extent_io.c | 39 ++++++++++++++++++++++++++++++++++++---
fs/btrfs/reflink.c | 21 +++++----------------
2 files changed, 41 insertions(+), 19 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5f97a3d2a8d7..f7df7c0c8955 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2698,21 +2698,54 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- struct inode *inode = mapping->host;
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
int ret = 0;
struct btrfs_bio_ctrl bio_ctrl = {
.wbc = wbc,
.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
};
+ /*
+ * If this inode is being used for a clone/reflink operation that
+ * copied an inline extent into a page of the destination inode, skip
+ * writeback to avoid a deadlock or a long blocked task.
+ *
+ * The clone operation holds the extent range locked in the inode's
+ * io_tree for its entire duration. Any writeback attempt on this
+ * inode will block trying to lock that same extent range inside
+ * writepage_delalloc() or btrfs_invalidate_folio(), causing a
+ * hung task.
+ *
+ * When writeback is triggered from the transaction commit path via
+ * btrfs_start_delalloc_flush() -> try_to_writeback_inodes_sb(),
+ * this becomes a true circular deadlock:
+ *
+ * clone -> waits for transaction commit to finish
+ * commit -> waits for kworker writeback to finish
+ * kworker -> waits for extent lock held by clone
+ *
+ * The flag BTRFS_INODE_NO_DELALLOC_FLUSH was already checked in
+ * start_delalloc_inodes() but only for the btrfs metadata reclaim
+ * path. The transaction commit path goes through
+ * try_to_writeback_inodes_sb() which bypasses that check entirely
+ * and calls btrfs_writepages() directly.
+ *
+ * By checking the flag here we catch all writeback paths. The inode
+ * will be safely written after the clone operation finishes and
+ * clears BTRFS_INODE_NO_DELALLOC_FLUSH, at which point all locks
+ * are released and writeback can proceed normally.
+ */
+ if (test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
+ return 0;
+
/*
* Allow only a single thread to do the reloc work in zoned mode to
* protect the write pointer updates.
*/
- btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
+ btrfs_zoned_data_reloc_lock(inode);
ret = extent_write_cache_pages(mapping, &bio_ctrl);
submit_write_bio(&bio_ctrl, ret);
- btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
+ btrfs_zoned_data_reloc_unlock(inode);
return ret;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 314cb95ba846..f3387baa71ae 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -100,19 +100,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
- /*
- * After dirtying the page our caller will need to start a transaction,
- * and if we are low on metadata free space, that can cause flushing of
- * delalloc for all inodes in order to get metadata space released.
- * However we are holding the range locked for the whole duration of
- * the clone/dedupe operation, so we may deadlock if that happens and no
- * other task releases enough space. So mark this inode as not being
- * possible to flush to avoid such deadlock. We will clear that flag
- * when we finish cloning all extents, since a transaction is started
- * after finding each extent to clone.
- */
- set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
-
if (comp_type == BTRFS_COMPRESS_NONE) {
memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
datal);
@@ -610,8 +597,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
}
out:
- clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
-
return ret;
}
@@ -644,11 +629,12 @@ static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
* because we have already locked the inode's i_mmap_lock in exclusive
* mode.
*/
+ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &dst->runtime_flags);
btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
ALIGN(len, bs), dst_loff, 1);
btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
-
+ clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &dst->runtime_flags);
btrfs_btree_balance_dirty(fs_info);
return ret;
@@ -746,9 +732,12 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* mode.
*/
end = destoff + len - 1;
+ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
+ clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
if (ret < 0)
return ret;
--
2.43.0
© 2016 - 2026 Red Hat, Inc.