[v2] ext4: fix race condition between buffer write and page_mkwrite

[PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Baokun Li 2 years, 8 months ago

Syzbot reported a BUG_ON:
==================================================================
EXT4-fs (loop0): mounted filesystem without journal. Quota mode: none.
EXT4-fs error (device loop0): ext4_mb_generate_buddy:1098: group 0, block
     bitmap and bg descriptor inconsistent: 25 vs 150994969 free clusters
------------[ cut here ]------------
kernel BUG at fs/ext4/ext4_jbd2.c:53!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 494 Comm: syz-executor.0 6.1.0-rc7-syzkaller-ga4412fdd49dc #0
RIP: 0010:__ext4_journal_stop+0x1b3/0x1c0
 [...]
Call Trace:
 ext4_write_inline_data_end+0xa39/0xdf0
 ext4_da_write_end+0x1e2/0x950
 generic_perform_write+0x401/0x5f0
 ext4_buffered_write_iter+0x35f/0x640
 ext4_file_write_iter+0x198/0x1cd0
 vfs_write+0x8b5/0xef0
 [...]
==================================================================

The above BUG_ON is triggered by the following race:

           cpu1                    cpu2
________________________|________________________
ksys_write
 vfs_write
  new_sync_write
   ext4_file_write_iter
    ext4_buffered_write_iter
     generic_perform_write
      ext4_da_write_begin
                          do_fault
                           do_page_mkwrite
                            ext4_page_mkwrite
                             ext4_convert_inline_data
                              ext4_convert_inline_data_nolock
                               ext4_destroy_inline_data_nolock
                                //clear EXT4_STATE_MAY_INLINE_DATA
                               ext4_map_blocks --> return error
       ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
       ext4_block_write_begin
                               ext4_restore_inline_data
                                // set EXT4_STATE_MAY_INLINE_DATA
      ext4_da_write_end
       ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
       ext4_write_inline_data_end
        handle=NULL
        ext4_journal_stop(handle)
         __ext4_journal_stop
          ext4_put_nojournal(handle)
           ref_cnt = (unsigned long)handle
           BUG_ON(ref_cnt == 0)  ---> BUG_ON

The root cause of this problem is that the ext4_convert_inline_data() in
ext4_page_mkwrite() does not grab i_rwsem, so it may race with
ext4_buffered_write_iter() and cause the write_begin() and write_end()
functions to be inconsistent and trigger BUG_ON.

To solve the above issue, we cannot simply take inode_lock in
ext4_page_mkwrite(), since that would not only degrade performance but
also cause an ABBA deadlock (see Link). Hence we move
ext4_convert_inline_data() to ext4_file_mmap(), and take the lock to do the
conversion only when inline_data is enabled and a writable file is mmapped
in shared mode, which avoids the above problems.

Link: https://lore.kernel.org/r/20230530102804.6t7np7om6tczscuo@quack3/
Reported-by: Jun Nie <jun.nie@linaro.org>
Closes: https://lore.kernel.org/lkml/63903521.5040307@huawei.com/t/
Reported-by: syzbot+a158d886ca08a3fecca4@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?id=899b37f20ce4072bcdfecfe1647b39602e956e36
Fixes: 7b4cc9787fe3 ("ext4: evict inline data when writing to memory map")
CC: stable@vger.kernel.org # 4.12+
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/ext4/file.c  | 24 +++++++++++++++++++++++-
 fs/ext4/inode.c |  4 ----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d101b3b0c7da..9df82d72eb90 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -795,7 +795,8 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_mapping->host;
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct dax_device *dax_dev = sbi->s_daxdev;
 
 	if (unlikely(ext4_forced_shutdown(sbi)))
@@ -808,6 +809,27 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!daxdev_mapping_supported(vma, dax_dev))
 		return -EOPNOTSUPP;
 
+	/*
+	 * Writing via mmap has no logic to handle inline data, so we
+	 * need to call ext4_convert_inline_data() to convert the inode
+	 * to normal format before doing so, otherwise a BUG_ON will be
+	 * triggered in ext4_writepages() due to the
+	 * EXT4_STATE_MAY_INLINE_DATA flag. Moreover, we need to grab
+	 * i_rwsem during conversion, since clearing and setting the
+	 * inline data flag may race with ext4_buffered_write_iter()
+	 * to trigger a BUG_ON.
+	 */
+	if (ext4_has_feature_inline_data(sb) &&
+	    vma->vm_flags & VM_SHARED && vma->vm_flags & VM_MAYWRITE) {
+		int err;
+
+		inode_lock(inode);
+		err = ext4_convert_inline_data(inode);
+		inode_unlock(inode);
+		if (err)
+			return err;
+	}
+
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
 		vma->vm_ops = &ext4_dax_vm_ops;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ce5f21b6c2b3..31844c4ec9fe 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6043,10 +6043,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 
 	filemap_invalidate_lock_shared(mapping);
 
-	err = ext4_convert_inline_data(inode);
-	if (err)
-		goto out_ret;
-
 	/*
 	 * On data journalling we skip straight to the transaction handle:
 	 * there's no delalloc; page truncated will be checked later; the
-- 
2.31.1

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Theodore Ts'o 2 years, 8 months ago

I tried testing to see if this fixed [1], and it appears to be
triggering a lockdep warning[2] at this line in the patch:

[1] https://syzkaller.appspot.com/bug?extid=f4582777a19ec422b517
[2] https://syzkaller.appspot.com/x/report.txt?x=17260843280000

> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index d101b3b0c7da..9df82d72eb90 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -808,6 +809,27 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>  	if (!daxdev_mapping_supported(vma, dax_dev))
>  		return -EOPNOTSUPP;
>  
> +	/*
> +	 * Writing via mmap has no logic to handle inline data, so we
> +	 * need to call ext4_convert_inline_data() to convert the inode
> +	 * to normal format before doing so, otherwise a BUG_ON will be
> +	 * triggered in ext4_writepages() due to the
> +	 * EXT4_STATE_MAY_INLINE_DATA flag. Moreover, we need to grab
> +	 * i_rwsem during conversion, since clearing and setting the
> +	 * inline data flag may race with ext4_buffered_write_iter()
> +	 * to trigger a BUG_ON.
> +	 */
> +	if (ext4_has_feature_inline_data(sb) &&
> +	    vma->vm_flags & VM_SHARED && vma->vm_flags & VM_MAYWRITE) {
> +		int err;
> +
> +		inode_lock(inode); <=================== LOCKDEP warning
> +		err = ext4_convert_inline_data(inode);
> +		inode_unlock(inode);
> +		if (err)
> +			return err;
> +	}


The details of the lockdep warning from [2], which appears to be a
mmap(2) racing with a buffered write(2) are below.  Could you take a
look?

Thanks!

					- Ted

======================================================
WARNING: possible circular locking dependency detected
6.4.0-rc4-syzkaller-geb1f822c76be-dirty #0 Not tainted
------------------------------------------------------
syz-executor.4/5589 is trying to acquire lock:
ffff888024228168 (&mm->mmap_lock){++++}-{3:3}, at: mmap_read_lock include/linux/mmap_lock.h:142 [inline]
ffff888024228168 (&mm->mmap_lock){++++}-{3:3}, at: do_user_addr_fault+0xb3d/0x1210 arch/x86/mm/fault.c:1391

but task is already holding lock:
ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: inode_lock include/linux/fs.h:775 [inline]
ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: ext4_buffered_write_iter+0xb0/0x460 fs/ext4/file.c:283

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (&sb->s_type->i_mutex_key#8){++++}-{3:3}:
       down_write+0x92/0x200 kernel/locking/rwsem.c:1573
       inode_lock include/linux/fs.h:775 [inline]
       ext4_file_mmap+0x62e/0x800 fs/ext4/file.c:826
       call_mmap include/linux/fs.h:1873 [inline]
       mmap_region+0x694/0x28d0 mm/mmap.c:2652
       do_mmap+0x831/0xf60 mm/mmap.c:1394
       vm_mmap_pgoff+0x1a2/0x3b0 mm/util.c:543
       ksys_mmap_pgoff+0x41f/0x5a0 mm/mmap.c:1440
       do_syscall_x64 arch/x86/entry/common.c:50 [inline]
       do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
       entry_SYSCALL_64_after_hwframe+0x63/0xcd

-> #0 (&mm->mmap_lock){++++}-{3:3}:
       check_prev_add kernel/locking/lockdep.c:3113 [inline]
       check_prevs_add kernel/locking/lockdep.c:3232 [inline]
       validate_chain kernel/locking/lockdep.c:3847 [inline]
       __lock_acquire+0x2fcd/0x5f30 kernel/locking/lockdep.c:5088
       lock_acquire kernel/locking/lockdep.c:5705 [inline]
       lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5670
       down_read+0x9c/0x480 kernel/locking/rwsem.c:1520
       mmap_read_lock include/linux/mmap_lock.h:142 [inline]
       do_user_addr_fault+0xb3d/0x1210 arch/x86/mm/fault.c:1391
       handle_page_fault arch/x86/mm/fault.c:1534 [inline]
       exc_page_fault+0x98/0x170 arch/x86/mm/fault.c:1590
       asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:570
       fault_in_readable+0x1a5/0x210 mm/gup.c:1856
       fault_in_iov_iter_readable+0x252/0x2c0 lib/iov_iter.c:362
       generic_perform_write+0x1ae/0x570 mm/filemap.c:3913
       ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:289
       ext4_file_write_iter+0xbe0/0x1740 fs/ext4/file.c:710
       call_write_iter include/linux/fs.h:1868 [inline]
       new_sync_write fs/read_write.c:491 [inline]
       vfs_write+0x945/0xd50 fs/read_write.c:584
       ksys_write+0x12b/0x250 fs/read_write.c:637
       do_syscall_x64 arch/x86/entry/common.c:50 [inline]
       do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
       entry_SYSCALL_64_after_hwframe+0x63/0xcd

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&sb->s_type->i_mutex_key#8);
                               lock(&mm->mmap_lock);
                               lock(&sb->s_type->i_mutex_key#8);
  rlock(&mm->mmap_lock);

 *** DEADLOCK ***

3 locks held by syz-executor.4/5589:
 #0: ffff88802a7fe0e8 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe7/0x100 fs/file.c:1047
 #1: ffff888021fe0460 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0x12b/0x250 fs/read_write.c:637
 #2: ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: inode_lock include/linux/fs.h:775 [inline]
 #2: ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: ext4_buffered_write_iter+0xb0/0x460 fs/ext4/file.c:283

stack backtrace:
CPU: 0 PID: 5589 Comm: syz-executor.4 Not tainted 6.4.0-rc4-syzkaller-geb1f822c76be-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/25/2023
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
 check_noncircular+0x25f/0x2e0 kernel/locking/lockdep.c:2188
 check_prev_add kernel/locking/lockdep.c:3113 [inline]
 check_prevs_add kernel/locking/lockdep.c:3232 [inline]
 validate_chain kernel/locking/lockdep.c:3847 [inline]
 __lock_acquire+0x2fcd/0x5f30 kernel/locking/lockdep.c:5088
 lock_acquire kernel/locking/lockdep.c:5705 [inline]
 lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5670
 down_read+0x9c/0x480 kernel/locking/rwsem.c:1520
 mmap_read_lock include/linux/mmap_lock.h:142 [inline]
 do_user_addr_fault+0xb3d/0x1210 arch/x86/mm/fault.c:1391
 handle_page_fault arch/x86/mm/fault.c:1534 [inline]
 exc_page_fault+0x98/0x170 arch/x86/mm/fault.c:1590
 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:570
RIP: 0010:fault_in_readable+0x1a5/0x210 mm/gup.c:1856
Code: fc ff df 48 c7 04 02 00 00 00 00 48 83 c4 48 4c 89 e0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 45 31 e4 eb ce e8 ae 51 c4 ff 45 31 f6 <41> 8a 45 00 31 ff 44 89 f6 88 44 24 28 e8 b9 4d c4 ff 45 85 f6 75
RSP: 0018:ffffc90006187a38 EFLAGS: 00050246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888026905940 RSI: ffffffff81bff672 RDI: 0000000000000007
RBP: 00000000200002cc R08: 0000000000000007 R09: 0000000000000000
R10: 00000000000002c0 R11: 1ffffffff219cbe3 R12: 000000000000000c
R13: 00000000200002c0 R14: 0000000000000000 R15: 1ffff92000c30f48
 fault_in_iov_iter_readable+0x252/0x2c0 lib/iov_iter.c:362
 generic_perform_write+0x1ae/0x570 mm/filemap.c:3913
 ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:289
 ext4_file_write_iter+0xbe0/0x1740 fs/ext4/file.c:710
 call_write_iter include/linux/fs.h:1868 [inline]
 new_sync_write fs/read_write.c:491 [inline]
 vfs_write+0x945/0xd50 fs/read_write.c:584
 ksys_write+0x12b/0x250 fs/read_write.c:637
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7f2359a8c169
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f235a721168 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 00007f2359bac050 RCX: 00007f2359a8c169
RDX: 000000000000000c RSI: 00000000200002c0 RDI: 0000000000000004
RBP: 00007f2359ae7ca1 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffea89659df R14: 00007f235a721300 R15: 0000000000022000
 </TASK>
----------------
Code disassembly (best guess), 2 bytes skipped:
   0:	df 48 c7             	fisttps -0x39(%rax)
   3:	04 02                	add    $0x2,%al
   5:	00 00                	add    %al,(%rax)
   7:	00 00                	add    %al,(%rax)
   9:	48 83 c4 48          	add    $0x48,%rsp
   d:	4c 89 e0             	mov    %r12,%rax
  10:	5b                   	pop    %rbx
  11:	5d                   	pop    %rbp
  12:	41 5c                	pop    %r12
  14:	41 5d                	pop    %r13
  16:	41 5e                	pop    %r14
  18:	41 5f                	pop    %r15
  1a:	c3                   	retq
  1b:	45 31 e4             	xor    %r12d,%r12d
  1e:	eb ce                	jmp    0xffffffee
  20:	e8 ae 51 c4 ff       	callq  0xffc451d3
  25:	45 31 f6             	xor    %r14d,%r14d
* 28:	41 8a 45 00          	mov    0x0(%r13),%al <-- trapping instruction
  2c:	31 ff                	xor    %edi,%edi
  2e:	44 89 f6             	mov    %r14d,%esi
  31:	88 44 24 28          	mov    %al,0x28(%rsp)
  35:	e8 b9 4d c4 ff       	callq  0xffc44df3
  3a:	45 85 f6             	test   %r14d,%r14d
  3d:	75                   	.byte 0x75

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Baokun Li 2 years, 8 months ago

On 2023/6/4 11:04, Theodore Ts'o wrote:
> I tried testing to see if this fixed [1], and it appears to be
> triggering a lockdep warning[2] at this line in the patch:
>
> [1] https://syzkaller.appspot.com/bug?extid=f4582777a19ec422b517
> [2] https://syzkaller.appspot.com/x/report.txt?x=17260843280000
>
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index d101b3b0c7da..9df82d72eb90 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -808,6 +809,27 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>>   	if (!daxdev_mapping_supported(vma, dax_dev))
>>   		return -EOPNOTSUPP;
>>   
>> +	/*
>> +	 * Writing via mmap has no logic to handle inline data, so we
>> +	 * need to call ext4_convert_inline_data() to convert the inode
>> +	 * to normal format before doing so, otherwise a BUG_ON will be
>> +	 * triggered in ext4_writepages() due to the
>> +	 * EXT4_STATE_MAY_INLINE_DATA flag. Moreover, we need to grab
>> +	 * i_rwsem during conversion, since clearing and setting the
>> +	 * inline data flag may race with ext4_buffered_write_iter()
>> +	 * to trigger a BUG_ON.
>> +	 */
>> +	if (ext4_has_feature_inline_data(sb) &&
>> +	    vma->vm_flags & VM_SHARED && vma->vm_flags & VM_MAYWRITE) {
>> +		int err;
>> +
>> +		inode_lock(inode); <=================== LOCKDEP warning
>> +		err = ext4_convert_inline_data(inode);
>> +		inode_unlock(inode);
>> +		if (err)
>> +			return err;
>> +	}
>
> The details of the lockdep warning from [2], which appears to be a
> mmap(2) racing with a buffered write(2) are below.  Could you take a
> look?
>
> Thanks!
>
> 					- Ted

Sorry for the late reply!

I had a look at this issue; it is similar to the one Honza mentioned
earlier.
Concurrent write and mmap, as shown below, can lead to an ABBA deadlock:

     CPU0                   CPU1
   write(2)                mmap(2)
ext4_file_write_iter
  ext4_buffered_write_iter
   inode_lock(inode)  ---> LOCK A
   generic_perform_write
                          ksys_mmap_pgoff
                           vm_mmap_pgoff
                            mmap_write_lock_killable(mm) ---> LOCK B
                            do_mmap
                             mmap_region
                              call_mmap
                               ext4_file_mmap
                                inode_lock(inode)  ---> try LOCK A again
    fault_in_iov_iter_readable              |
     fault_in_readable                      |
      asm_exc_page_fault                ABBA deadlock
       handle_page_fault                    |
        do_user_addr_fault                  |
         mmap_read_lock(mm) ---> try LOCK B again


Thanks!
-- 
With Best Regards,
Baokun Li
.

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Theodore Ts'o 2 years, 8 months ago

On Sat, Jun 03, 2023 at 11:04:45PM -0400, Theodore Ts'o wrote:
> I tried testing to see if this fixed [1], and it appears to be
> triggering a lockdep warning[2] at this line in the patch:
> 
> [1] https://syzkaller.appspot.com/bug?extid=f4582777a19ec422b517
> [2] https://syzkaller.appspot.com/x/report.txt?x=17260843280000

Looking at this more closely, the fundamental problem is that by the time
ext4_file_mmap() is called, the mm layer has already taken
current->mm->mmap_lock, and when we try to take the inode_lock, this
causes lock ordering problems with how the buffered write path works,
which takes the inode_lock first, and then in some cases, may end up
taking the mmap_lock if there is a page fault for the buffer used for
the buffered write.

If we're going to stick with the approach in this patch, I think what
we would need is to add a pre_mmap() function to file_operations
struct, which would get called by the mmap path *before* taking
current->mm->mmap_lock, so we can do the inline conversion before we
take the mmap_lock.

I'm not sure how the mm folks would react to such a proposal, though.
It could be seen as a bit hacky, and it's not clear that any file
system other than ext4 would need something like this.  Willy, as
someone who does a lot of work in both mm and fs worlds --- I'm
curious what you think about this idea?

Thanks,

    	     	     	      	    	  - Ted

> > diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> > index d101b3b0c7da..9df82d72eb90 100644
> > --- a/fs/ext4/file.c
> > +++ b/fs/ext4/file.c
> > @@ -808,6 +809,27 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
> >  	if (!daxdev_mapping_supported(vma, dax_dev))
> >  		return -EOPNOTSUPP;
> >  
> > +	/*
> > +	 * Writing via mmap has no logic to handle inline data, so we
> > +	 * need to call ext4_convert_inline_data() to convert the inode
> > +	 * to normal format before doing so, otherwise a BUG_ON will be
> > +	 * triggered in ext4_writepages() due to the
> > +	 * EXT4_STATE_MAY_INLINE_DATA flag. Moreover, we need to grab
> > +	 * i_rwsem during conversion, since clearing and setting the
> > +	 * inline data flag may race with ext4_buffered_write_iter()
> > +	 * to trigger a BUG_ON.
> > +	 */
> > +	if (ext4_has_feature_inline_data(sb) &&
> > +	    vma->vm_flags & VM_SHARED && vma->vm_flags & VM_MAYWRITE) {
> > +		int err;
> > +
> > +		inode_lock(inode); <=================== LOCKDEP warning
> > +		err = ext4_convert_inline_data(inode);
> > +		inode_unlock(inode);
> > +		if (err)
> > +			return err;
> > +	}
> 
> 
> The details of the lockdep warning from [2], which appears to be a
> mmap(2) racing with a buffered write(2) are below.  Could you take a
> look?
> 
> Thanks!
> 
> 					- Ted
> 
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.4.0-rc4-syzkaller-geb1f822c76be-dirty #0 Not tainted
> ------------------------------------------------------
> syz-executor.4/5589 is trying to acquire lock:
> ffff888024228168 (&mm->mmap_lock){++++}-{3:3}, at: mmap_read_lock include/linux/mmap_lock.h:142 [inline]
> ffff888024228168 (&mm->mmap_lock){++++}-{3:3}, at: do_user_addr_fault+0xb3d/0x1210 arch/x86/mm/fault.c:1391
> 
> but task is already holding lock:
> ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: inode_lock include/linux/fs.h:775 [inline]
> ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: ext4_buffered_write_iter+0xb0/0x460 fs/ext4/file.c:283
> 
> which lock already depends on the new lock.
> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #1 (&sb->s_type->i_mutex_key#8){++++}-{3:3}:
>        down_write+0x92/0x200 kernel/locking/rwsem.c:1573
>        inode_lock include/linux/fs.h:775 [inline]
>        ext4_file_mmap+0x62e/0x800 fs/ext4/file.c:826
>        call_mmap include/linux/fs.h:1873 [inline]
>        mmap_region+0x694/0x28d0 mm/mmap.c:2652
>        do_mmap+0x831/0xf60 mm/mmap.c:1394
>        vm_mmap_pgoff+0x1a2/0x3b0 mm/util.c:543
>        ksys_mmap_pgoff+0x41f/0x5a0 mm/mmap.c:1440
>        do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>        do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
>        entry_SYSCALL_64_after_hwframe+0x63/0xcd
> 
> -> #0 (&mm->mmap_lock){++++}-{3:3}:
>        check_prev_add kernel/locking/lockdep.c:3113 [inline]
>        check_prevs_add kernel/locking/lockdep.c:3232 [inline]
>        validate_chain kernel/locking/lockdep.c:3847 [inline]
>        __lock_acquire+0x2fcd/0x5f30 kernel/locking/lockdep.c:5088
>        lock_acquire kernel/locking/lockdep.c:5705 [inline]
>        lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5670
>        down_read+0x9c/0x480 kernel/locking/rwsem.c:1520
>        mmap_read_lock include/linux/mmap_lock.h:142 [inline]
>        do_user_addr_fault+0xb3d/0x1210 arch/x86/mm/fault.c:1391
>        handle_page_fault arch/x86/mm/fault.c:1534 [inline]
>        exc_page_fault+0x98/0x170 arch/x86/mm/fault.c:1590
>        asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:570
>        fault_in_readable+0x1a5/0x210 mm/gup.c:1856
>        fault_in_iov_iter_readable+0x252/0x2c0 lib/iov_iter.c:362
>        generic_perform_write+0x1ae/0x570 mm/filemap.c:3913
>        ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:289
>        ext4_file_write_iter+0xbe0/0x1740 fs/ext4/file.c:710
>        call_write_iter include/linux/fs.h:1868 [inline]
>        new_sync_write fs/read_write.c:491 [inline]
>        vfs_write+0x945/0xd50 fs/read_write.c:584
>        ksys_write+0x12b/0x250 fs/read_write.c:637
>        do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>        do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
>        entry_SYSCALL_64_after_hwframe+0x63/0xcd
> 
> other info that might help us debug this:
> 
>  Possible unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(&sb->s_type->i_mutex_key#8);
>                                lock(&mm->mmap_lock);
>                                lock(&sb->s_type->i_mutex_key#8);
>   rlock(&mm->mmap_lock);
> 
>  *** DEADLOCK ***
> 
> 3 locks held by syz-executor.4/5589:
>  #0: ffff88802a7fe0e8 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe7/0x100 fs/file.c:1047
>  #1: ffff888021fe0460 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0x12b/0x250 fs/read_write.c:637
>  #2: ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: inode_lock include/linux/fs.h:775 [inline]
>  #2: ffff88806a066800 (&sb->s_type->i_mutex_key#8){++++}-{3:3}, at: ext4_buffered_write_iter+0xb0/0x460 fs/ext4/file.c:283
> 
> stack backtrace:
> CPU: 0 PID: 5589 Comm: syz-executor.4 Not tainted 6.4.0-rc4-syzkaller-geb1f822c76be-dirty #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/25/2023
> Call Trace:
>  <TASK>
>  __dump_stack lib/dump_stack.c:88 [inline]
>  dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
>  check_noncircular+0x25f/0x2e0 kernel/locking/lockdep.c:2188
>  check_prev_add kernel/locking/lockdep.c:3113 [inline]
>  check_prevs_add kernel/locking/lockdep.c:3232 [inline]
>  validate_chain kernel/locking/lockdep.c:3847 [inline]
>  __lock_acquire+0x2fcd/0x5f30 kernel/locking/lockdep.c:5088
>  lock_acquire kernel/locking/lockdep.c:5705 [inline]
>  lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5670
>  down_read+0x9c/0x480 kernel/locking/rwsem.c:1520
>  mmap_read_lock include/linux/mmap_lock.h:142 [inline]
>  do_user_addr_fault+0xb3d/0x1210 arch/x86/mm/fault.c:1391
>  handle_page_fault arch/x86/mm/fault.c:1534 [inline]
>  exc_page_fault+0x98/0x170 arch/x86/mm/fault.c:1590
>  asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:570
> RIP: 0010:fault_in_readable+0x1a5/0x210 mm/gup.c:1856
> Code: fc ff df 48 c7 04 02 00 00 00 00 48 83 c4 48 4c 89 e0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 45 31 e4 eb ce e8 ae 51 c4 ff 45 31 f6 <41> 8a 45 00 31 ff 44 89 f6 88 44 24 28 e8 b9 4d c4 ff 45 85 f6 75
> RSP: 0018:ffffc90006187a38 EFLAGS: 00050246
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff888026905940 RSI: ffffffff81bff672 RDI: 0000000000000007
> RBP: 00000000200002cc R08: 0000000000000007 R09: 0000000000000000
> R10: 00000000000002c0 R11: 1ffffffff219cbe3 R12: 000000000000000c
> R13: 00000000200002c0 R14: 0000000000000000 R15: 1ffff92000c30f48
>  fault_in_iov_iter_readable+0x252/0x2c0 lib/iov_iter.c:362
>  generic_perform_write+0x1ae/0x570 mm/filemap.c:3913
>  ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:289
>  ext4_file_write_iter+0xbe0/0x1740 fs/ext4/file.c:710
>  call_write_iter include/linux/fs.h:1868 [inline]
>  new_sync_write fs/read_write.c:491 [inline]
>  vfs_write+0x945/0xd50 fs/read_write.c:584
>  ksys_write+0x12b/0x250 fs/read_write.c:637
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> RIP: 0033:0x7f2359a8c169
> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007f235a721168 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 00007f2359bac050 RCX: 00007f2359a8c169
> RDX: 000000000000000c RSI: 00000000200002c0 RDI: 0000000000000004
> RBP: 00007f2359ae7ca1 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> R13: 00007ffea89659df R14: 00007f235a721300 R15: 0000000000022000
>  </TASK>
> ----------------
> Code disassembly (best guess), 2 bytes skipped:
>    0:	df 48 c7             	fisttps -0x39(%rax)
>    3:	04 02                	add    $0x2,%al
>    5:	00 00                	add    %al,(%rax)
>    7:	00 00                	add    %al,(%rax)
>    9:	48 83 c4 48          	add    $0x48,%rsp
>    d:	4c 89 e0             	mov    %r12,%rax
>   10:	5b                   	pop    %rbx
>   11:	5d                   	pop    %rbp
>   12:	41 5c                	pop    %r12
>   14:	41 5d                	pop    %r13
>   16:	41 5e                	pop    %r14
>   18:	41 5f                	pop    %r15
>   1a:	c3                   	retq
>   1b:	45 31 e4             	xor    %r12d,%r12d
>   1e:	eb ce                	jmp    0xffffffee
>   20:	e8 ae 51 c4 ff       	callq  0xffc451d3
>   25:	45 31 f6             	xor    %r14d,%r14d
> * 28:	41 8a 45 00          	mov    0x0(%r13),%al <-- trapping instruction
>   2c:	31 ff                	xor    %edi,%edi
>   2e:	44 89 f6             	mov    %r14d,%esi
>   31:	88 44 24 28          	mov    %al,0x28(%rsp)
>   35:	e8 b9 4d c4 ff       	callq  0xffc44df3
>   3a:	45 85 f6             	test   %r14d,%r14d
>   3d:	75                   	.byte 0x75
>

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Matthew Wilcox 2 years, 8 months ago

On Sun, Jun 04, 2023 at 05:08:21PM -0400, Theodore Ts'o wrote:
> On Sat, Jun 03, 2023 at 11:04:45PM -0400, Theodore Ts'o wrote:
> > I tried testing to see if this fixed [1], and it appears to be
> > triggering a lockdep warning[2] at this line in the patch:
> > 
> > [1] https://syzkaller.appspot.com/bug?extid=f4582777a19ec422b517
> > [2] https://syzkaller.appspot.com/x/report.txt?x=17260843280000
> 
> Looking at this more closely, the fundamental problem is by the time
> ext4_file_mmap() is called, the mm layer has already taken
> current->mm->mmap_lock, and when we try to take the inode_lock, this
> causes locking ordering problems with how buffered write path works,
> which take the inode_lock first, and then in some cases, may end up
> taking the mmap_lock if there is a page fault for the buffer used for
> the buffered write.
> 
> If we're going to stick with the approach in this patch, I think what
> we would need is to add a pre_mmap() function to file_operations
> struct, which would get called by the mmap path *before* taking
> current->mm->mmap_lock, so we can do the inline conversion before we
> take the mmap_lock.
> 
> I'm not sure how the mm folks would react to such a proposal, though.
> I could be seen as a bit hacky, and it's not clear that any file
> system other than ext4 would need something like this.  Willy, as
> someone who does a lot of work in both mm and fs worlds --- I'm
> curious what you think about this idea?

I'm probably missing something here, but why do we need to convert inline
data in page_mkwrite?  mmap() can't change i_size (stores past i_size are
discarded), so we should be able to simply copy the data from the page
cache into the inode and write the inode when it comes to writepages()
time.

Unless somebody does a truncate() or write() that expands i_size, but we
should be able to do the conversion then without the mmap_lock held.  No?
I'm not too familiar with inline data.

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Jan Kara 2 years, 8 months ago

On Mon 05-06-23 02:58:15, Matthew Wilcox wrote:
> On Sun, Jun 04, 2023 at 05:08:21PM -0400, Theodore Ts'o wrote:
> > On Sat, Jun 03, 2023 at 11:04:45PM -0400, Theodore Ts'o wrote:
> > > I tried testing to see if this fixed [1], and it appears to be
> > > triggering a lockdep warning[2] at this line in the patch:
> > > 
> > > [1] https://syzkaller.appspot.com/bug?extid=f4582777a19ec422b517
> > > [2] https://syzkaller.appspot.com/x/report.txt?x=17260843280000
> > 
> > Looking at this more closely, the fundamental problem is by the time
> > ext4_file_mmap() is called, the mm layer has already taken
> > current->mm->mmap_lock, and when we try to take the inode_lock, this
> > causes locking ordering problems with how buffered write path works,
> > which take the inode_lock first, and then in some cases, may end up
> > taking the mmap_lock if there is a page fault for the buffer used for
> > the buffered write.
> > 
> > If we're going to stick with the approach in this patch, I think what
> > we would need is to add a pre_mmap() function to file_operations
> > struct, which would get called by the mmap path *before* taking
> > current->mm->mmap_lock, so we can do the inline conversion before we
> > take the mmap_lock.
> > 
> > I'm not sure how the mm folks would react to such a proposal, though.
> > I could be seen as a bit hacky, and it's not clear that any file
> > system other than ext4 would need something like this.  Willy, as
> > someone who does a lot of work in both mm and fs worlds --- I'm
> > curious what you think about this idea?
> 
> I'm probably missing something here, but why do we need to convert inline
> data in page_mkwrite?  mmap() can't change i_size (stores past i_size are
> discarded), so we should be able to simply copy the data from the page
> cache into the inode and write the inode when it comes to writepages()
> time.
> 
> Unless somebody does a truncate() or write() that expands i_size, but we
> should be able to do the conversion then without the mmap_lock held.  No?
> I'm not too familiar with inline data.

Yeah, I agree, that is also the conclusion I have arrived at when thinking
about this problem now. We should be able to just remove the conversion
from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
growing i_size.

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Jan Kara 2 years, 8 months ago

On Mon 05-06-23 11:16:55, Jan Kara wrote:
> On Mon 05-06-23 02:58:15, Matthew Wilcox wrote:
> > On Sun, Jun 04, 2023 at 05:08:21PM -0400, Theodore Ts'o wrote:
> > > On Sat, Jun 03, 2023 at 11:04:45PM -0400, Theodore Ts'o wrote:
> > > > I tried testing to see if this fixed [1], and it appears to be
> > > > triggering a lockdep warning[2] at this line in the patch:
> > > > 
> > > > [1] https://syzkaller.appspot.com/bug?extid=f4582777a19ec422b517
> > > > [2] https://syzkaller.appspot.com/x/report.txt?x=17260843280000
> > > 
> > > Looking at this more closely, the fundamental problem is by the time
> > > ext4_file_mmap() is called, the mm layer has already taken
> > > current->mm->mmap_lock, and when we try to take the inode_lock, this
> > > causes locking ordering problems with how buffered write path works,
> > > which take the inode_lock first, and then in some cases, may end up
> > > taking the mmap_lock if there is a page fault for the buffer used for
> > > the buffered write.
> > > 
> > > If we're going to stick with the approach in this patch, I think what
> > > we would need is to add a pre_mmap() function to file_operations
> > > struct, which would get called by the mmap path *before* taking
> > > current->mm->mmap_lock, so we can do the inline conversion before we
> > > take the mmap_lock.
> > > 
> > > I'm not sure how the mm folks would react to such a proposal, though.
> > > > It could be seen as a bit hacky, and it's not clear that any file
> > > system other than ext4 would need something like this.  Willy, as
> > > someone who does a lot of work in both mm and fs worlds --- I'm
> > > curious what you think about this idea?
> > 
> > I'm probably missing something here, but why do we need to convert inline
> > data in page_mkwrite?  mmap() can't change i_size (stores past i_size are
> > discarded), so we should be able to simply copy the data from the page
> > cache into the inode and write the inode when it comes to writepages()
> > time.
> > 
> > Unless somebody does a truncate() or write() that expands i_size, but we
> > should be able to do the conversion then without the mmap_lock held.  No?
> > I'm not too familiar with inline data.
> 
> Yeah, I agree, that is also the conclusion I have arrived at when thinking
> about this problem now. We should be able to just remove the conversion
> from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
> growing i_size.

OK, thinking more about this and searching through the history, I've
realized why the conversion is originally in ext4_page_mkwrite(). The
problem is described in commit 7b4cc9787fe35b ("ext4: evict inline data
when writing to memory map") but essentially it boils down to the fact that
ext4 writeback code does not expect dirty page for a file with inline data
because ext4_write_inline_data_end() should have copied the data into the
inode and cleared the folio's dirty flag.

Indeed messing with xattrs from the writeback path to copy page contents
into inline data xattr would be ... interesting. Hum, out of good ideas for
now :-|.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Matthew Wilcox 2 years, 8 months ago

On Mon, Jun 05, 2023 at 02:21:41PM +0200, Jan Kara wrote:
> On Mon 05-06-23 11:16:55, Jan Kara wrote:
> > Yeah, I agree, that is also the conclusion I have arrived at when thinking
> > about this problem now. We should be able to just remove the conversion
> > from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
> > growing i_size.
> 
> OK, thinking more about this and searching through the history, I've
> realized why the conversion is originally in ext4_page_mkwrite(). The
> problem is described in commit 7b4cc9787fe35b ("ext4: evict inline data
> when writing to memory map") but essentially it boils down to the fact that
> ext4 writeback code does not expect dirty page for a file with inline data
> because ext4_write_inline_data_end() should have copied the data into the
> inode and cleared the folio's dirty flag.
> 
> Indeed messing with xattrs from the writeback path to copy page contents
> into inline data xattr would be ... interesting. Hum, out of good ideas for
> now :-|.

Is it so bad?  Now that we don't have writepage in ext4, only
writepages, it seems like we have a considerably more benign locking
environment to work in.

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Jan Kara 2 years, 8 months ago

On Mon 05-06-23 15:55:35, Matthew Wilcox wrote:
> On Mon, Jun 05, 2023 at 02:21:41PM +0200, Jan Kara wrote:
> > On Mon 05-06-23 11:16:55, Jan Kara wrote:
> > > Yeah, I agree, that is also the conclusion I have arrived at when thinking
> > > about this problem now. We should be able to just remove the conversion
> > > from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
> > > growing i_size.
> > 
> > OK, thinking more about this and searching through the history, I've
> > realized why the conversion is originally in ext4_page_mkwrite(). The
> > problem is described in commit 7b4cc9787fe35b ("ext4: evict inline data
> > when writing to memory map") but essentially it boils down to the fact that
> > ext4 writeback code does not expect dirty page for a file with inline data
> > because ext4_write_inline_data_end() should have copied the data into the
> > inode and cleared the folio's dirty flag.
> > 
> > Indeed messing with xattrs from the writeback path to copy page contents
> > into inline data xattr would be ... interesting. Hum, out of good ideas for
> > now :-|.
> 
> Is it so bad?  Now that we don't have writepage in ext4, only
> writepages, it seems like we have a considerably more benign locking
> environment to work in.

Well, yes, without ->writepage() it might be *possible*. But still rather
ugly. The problem is that in ->writepages() i_size is not stable. Thus also
whether the inode data is inline or not is not stable. We'd need inode_lock
for that but that is not easily doable in the writeback path - inode lock
would then become fs_reclaim unsafe...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Baokun Li 1 year, 9 months ago

On 2023/6/5 23:08, Jan Kara wrote:
> On Mon 05-06-23 15:55:35, Matthew Wilcox wrote:
>> On Mon, Jun 05, 2023 at 02:21:41PM +0200, Jan Kara wrote:
>>> On Mon 05-06-23 11:16:55, Jan Kara wrote:
>>>> Yeah, I agree, that is also the conclusion I have arrived at when thinking
>>>> about this problem now. We should be able to just remove the conversion
>>>> from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
>>>> growing i_size.
>>> OK, thinking more about this and searching through the history, I've
>>> realized why the conversion is originally in ext4_page_mkwrite(). The
>>> problem is described in commit 7b4cc9787fe35b ("ext4: evict inline data
>>> when writing to memory map") but essentially it boils down to the fact that
>>> ext4 writeback code does not expect dirty page for a file with inline data
>>> because ext4_write_inline_data_end() should have copied the data into the
>>> inode and cleared the folio's dirty flag.
>>>
>>> Indeed messing with xattrs from the writeback path to copy page contents
>>> into inline data xattr would be ... interesting. Hum, out of good ideas for
>>> now :-|.
>> Is it so bad?  Now that we don't have writepage in ext4, only
>> writepages, it seems like we have a considerably more benign locking
>> environment to work in.
> Well, yes, without ->writepage() it might be *possible*. But still rather
> ugly. The problem is that in ->writepages() i_size is not stable. Thus also
> whether the inode data is inline or not is not stable. We'd need inode_lock
> for that but that is not easily doable in the writeback path - inode lock
> would then become fs_reclaim unsafe...
>
> 								Honza
Hi Honza!
Hi Ted!
Hi Matthew!

I'm coming back to this after a long time because, while I was discussing
another similar ABBA problem with Hou Tao, he mentioned VM_FAULT_RETRY, and
I thought that it could be used to solve this problem as well.

The general idea is that if we see a file with inline data in
ext4_page_mkwrite(), we release the mmap_lock and grab the inode_lock to
convert the inline data, and then return VM_FAULT_RETRY so that the fault is
retried with the mmap_lock re-acquired.

The code implementation is as follows, do you have any thoughts?

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 537803250ca9..e044c11c9cf6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6056,15 +6056,36 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
      if (unlikely(IS_IMMUTABLE(inode)))
          return VM_FAULT_SIGBUS;

+    /*
+     * The ext4 writeback code does not expect dirty page for a file with
+     * inline data, so inline data needs to be converted. But it needs to
+     * hold the inode_lock when converting, and trying to lock the inode
+     * while holding the mmap_lock may result in an ABBA deadlock. So here
+     * we release the mmap_lock for conversion and retry after conversion.
+     * Only one retry is allowed to avoid endless loops.
+     * Acquire xattr_sem to avoid race with inline data conversion.
+     */
+    down_read(&EXT4_I(inode)->xattr_sem);
+    if (ext4_has_inline_data(inode)) {
+        up_read(&EXT4_I(inode)->xattr_sem);
+
+        if (!fault_flag_allow_retry_first(vmf->flags))
+            return VM_FAULT_SIGBUS;
+
+        release_fault_lock(vmf);
+
+        inode_lock(inode);
+        ext4_convert_inline_data(inode);
+        inode_unlock(inode);
+        return VM_FAULT_RETRY;
+    }
+    up_read(&EXT4_I(inode)->xattr_sem);
+
      sb_start_pagefault(inode->i_sb);
      file_update_time(vma->vm_file);

      filemap_invalidate_lock_shared(mapping);

-    err = ext4_convert_inline_data(inode);
-    if (err)
-        goto out_ret;
-
      /*
       * On data journalling we skip straight to the transaction handle:
       * there's no delalloc; page truncated will be checked later; the

-- 
With Best Regards,
Baokun Li
.

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Jan Kara 1 year, 9 months ago

On Mon 15-04-24 12:28:01, Baokun Li wrote:
> On 2023/6/5 23:08, Jan Kara wrote:
> > On Mon 05-06-23 15:55:35, Matthew Wilcox wrote:
> > > On Mon, Jun 05, 2023 at 02:21:41PM +0200, Jan Kara wrote:
> > > > On Mon 05-06-23 11:16:55, Jan Kara wrote:
> > > > > Yeah, I agree, that is also the conclusion I have arrived at when thinking
> > > > > about this problem now. We should be able to just remove the conversion
> > > > > from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
> > > > > growing i_size.
> > > > OK, thinking more about this and searching through the history, I've
> > > > realized why the conversion is originally in ext4_page_mkwrite(). The
> > > > problem is described in commit 7b4cc9787fe35b ("ext4: evict inline data
> > > > when writing to memory map") but essentially it boils down to the fact that
> > > > ext4 writeback code does not expect dirty page for a file with inline data
> > > > because ext4_write_inline_data_end() should have copied the data into the
> > > > inode and cleared the folio's dirty flag.
> > > > 
> > > > Indeed messing with xattrs from the writeback path to copy page contents
> > > > into inline data xattr would be ... interesting. Hum, out of good ideas for
> > > > now :-|.
> > > Is it so bad?  Now that we don't have writepage in ext4, only
> > > writepages, it seems like we have a considerably more benign locking
> > > environment to work in.
> > Well, yes, without ->writepage() it might be *possible*. But still rather
> > ugly. The problem is that in ->writepages() i_size is not stable. Thus also
> > whether the inode data is inline or not is not stable. We'd need inode_lock
> > for that but that is not easily doable in the writeback path - inode lock
> > would then become fs_reclaim unsafe...
> > 
> > 								Honza
> Hi Honza!
> Hi Ted!
> Hi Matthew!
> 
> Long time later came back to this, because while discussing another similar
> ABBA problem with Hou Tao, he mentioned VM_FAULT_RETRY, and then I
> thought that this could be used to solve this problem as well.
> 
> The general idea is that if we see a file with inline data in
> ext4_page_mkwrite(),
> we release the mmap_lock and grab the inode_lock to convert the inline data,
> and then return VM_FAULT_RETRY to retry to get the mmap_lock.
> 
> The code implementation is as follows, do you have any thoughts?

So the problem with this is that VM_FAULT_RETRY is not always an option -
in particular the caller has to set FAULT_FLAG_ALLOW_RETRY to indicate it
is prepared to handle VM_FAULT_RETRY return. See how
maybe_unlock_mmap_for_io() is carefully checking this. There are callers
(most notably some get_user_pages() users) that don't set
FAULT_FLAG_ALLOW_RETRY so the escape through VM_FAULT_RETRY is sadly not a
reliable solution.

My long-term wish is that we were always allowed to use VM_FAULT_RETRY, and
that was actually what motivated some get_user_pages() cleanups I did a couple
of years ago. But dealing with all the cases in various drivers was too
difficult and I ran out of time. Now maybe it would be worth it to
revisit this since things have changed noticeably and maybe now it would be
easier to achieve the goal...

								Honza

> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 537803250ca9..e044c11c9cf6 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -6056,15 +6056,36 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>      if (unlikely(IS_IMMUTABLE(inode)))
>          return VM_FAULT_SIGBUS;
> 
> +    /*
> +     * The ext4 writeback code does not expect dirty page for a file with
> +     * inline data, so inline data needs to be converted. But it needs to
> +     * hold the inode_lock when converting, and trying to lock the inode
> +     * while holding the mmap_lock may result in an ABBA deadlock. So here
> +     * we release the mmap_lock for conversion and retry after conversion.
> +     * Only one retry is allowed to avoid endless loops.
> +     * Acquire xattr_sem to avoid race with inline data conversion.
> +     */
> +    down_read(&EXT4_I(inode)->xattr_sem);
> +    if (ext4_has_inline_data(inode)) {
> +        up_read(&EXT4_I(inode)->xattr_sem);
> +
> +        if (!fault_flag_allow_retry_first(vmf->flags))
> +            return VM_FAULT_SIGBUS;
> +
> +        release_fault_lock(vmf);
> +
> +        inode_lock(inode);
> +        ext4_convert_inline_data(inode);
> +        inode_unlock(inode);
> +        return VM_FAULT_RETRY;
> +    }
> +    up_read(&EXT4_I(inode)->xattr_sem);
> +
>      sb_start_pagefault(inode->i_sb);
>      file_update_time(vma->vm_file);
> 
>      filemap_invalidate_lock_shared(mapping);
> 
> -    err = ext4_convert_inline_data(inode);
> -    if (err)
> -        goto out_ret;
> -
>      /*
>       * On data journalling we skip straight to the transaction handle:
>       * there's no delalloc; page truncated will be checked later; the
> 
> -- 
> With Best Regards,
> Baokun Li
> .
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Baokun Li 1 year, 9 months ago

On 2024/4/15 20:34, Jan Kara wrote:
> On Mon 15-04-24 12:28:01, Baokun Li wrote:
>> On 2023/6/5 23:08, Jan Kara wrote:
>>> On Mon 05-06-23 15:55:35, Matthew Wilcox wrote:
>>>> On Mon, Jun 05, 2023 at 02:21:41PM +0200, Jan Kara wrote:
>>>>> On Mon 05-06-23 11:16:55, Jan Kara wrote:
>>>>>> Yeah, I agree, that is also the conclusion I have arrived at when thinking
>>>>>> about this problem now. We should be able to just remove the conversion
>>>>>> from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
>>>>>> growing i_size.
>>>>> OK, thinking more about this and searching through the history, I've
>>>>> realized why the conversion is originally in ext4_page_mkwrite(). The
>>>>> problem is described in commit 7b4cc9787fe35b ("ext4: evict inline data
>>>>> when writing to memory map") but essentially it boils down to the fact that
>>>>> ext4 writeback code does not expect dirty page for a file with inline data
>>>>> because ext4_write_inline_data_end() should have copied the data into the
>>>>> inode and cleared the folio's dirty flag.
>>>>>
>>>>> Indeed messing with xattrs from the writeback path to copy page contents
>>>>> into inline data xattr would be ... interesting. Hum, out of good ideas for
>>>>> now :-|.
>>>> Is it so bad?  Now that we don't have writepage in ext4, only
>>>> writepages, it seems like we have a considerably more benign locking
>>>> environment to work in.
>>> Well, yes, without ->writepage() it might be *possible*. But still rather
>>> ugly. The problem is that in ->writepages() i_size is not stable. Thus also
>>> whether the inode data is inline or not is not stable. We'd need inode_lock
>>> for that but that is not easily doable in the writeback path - inode lock
>>> would then become fs_reclaim unsafe...
>>>
>>> 								Honza
>> Hi Honza!
>> Hi Ted!
>> Hi Matthew!
>>
>> Long time later came back to this, because while discussing another similar
>> ABBA problem with Hou Tao, he mentioned VM_FAULT_RETRY, and then I
>> thought that this could be used to solve this problem as well.
>>
>> The general idea is that if we see a file with inline data in
>> ext4_page_mkwrite(),
>> we release the mmap_lock and grab the inode_lock to convert the inline data,
>> and then return VM_FAULT_RETRY to retry to get the mmap_lock.
>>
>> The code implementation is as follows, do you have any thoughts?
> So the problem with this is that VM_FAULT_RETRY is not always an option -
> in particular the caller has to set FAULT_FLAG_ALLOW_RETRY to indicate it
> is prepared to handle VM_FAULT_RETRY return. See how
> maybe_unlock_mmap_for_io() is carefully checking this.
Yes, at least we need to check for FAULT_FLAG_RETRY_NOWAIT.
> There are callers
> (most notably some get_user_pages() users) that don't set
> FAULT_FLAG_ALLOW_RETRY so the escape through VM_FAULT_RETRY is sadly not a
> reliable solution.
It is indeed sad.  I'm going to go learn more about the code for
FAULT_FLAG_ALLOW_RETRY.
> My long-term wish is we were always allowed to use VM_FAULT_RETRY and that
> was actually what motivated some get_user_pages() cleanups I did couple
> years ago. But dealing with all the cases in various drivers was too
> difficult and I've run out of time. Now maybe it would be worth it to
> revisit this since things have changed noticeably and maybe now it would be
> easier to achieve the goal...
>
> 								Honza
That sounds like a great idea. I will try to get the history on it and
then come back.

Thank you very much for your patient explanation!
-- 
With Best Regards,
Baokun Li
.

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Baokun Li 2 years, 8 months ago

On 2023/6/5 23:08, Jan Kara wrote:
> On Mon 05-06-23 15:55:35, Matthew Wilcox wrote:
>> On Mon, Jun 05, 2023 at 02:21:41PM +0200, Jan Kara wrote:
>>> On Mon 05-06-23 11:16:55, Jan Kara wrote:
>>>> Yeah, I agree, that is also the conclusion I have arrived at when thinking
>>>> about this problem now. We should be able to just remove the conversion
>>>> from ext4_page_mkwrite() and rely on write(2) or truncate(2) doing it when
>>>> growing i_size.
>>> OK, thinking more about this and searching through the history, I've
>>> realized why the conversion is originally in ext4_page_mkwrite(). The
>>> problem is described in commit 7b4cc9787fe35b ("ext4: evict inline data
>>> when writing to memory map") but essentially it boils down to the fact that
>>> ext4 writeback code does not expect dirty page for a file with inline data
>>> because ext4_write_inline_data_end() should have copied the data into the
>>> inode and cleared the folio's dirty flag.
>>>
>>> Indeed messing with xattrs from the writeback path to copy page contents
>>> into inline data xattr would be ... interesting. Hum, out of good ideas for
>>> now :-|.
>> Is it so bad?  Now that we don't have writepage in ext4, only
>> writepages, it seems like we have a considerably more benign locking
>> environment to work in.
> Well, yes, without ->writepage() it might be *possible*. But still rather
> ugly. The problem is that in ->writepages() i_size is not stable. Thus also
> whether the inode data is inline or not is not stable. We'd need inode_lock
> for that but that is not easily doable in the writeback path - inode lock
> would then become fs_reclaim unsafe...
>
> 								Honza

If we try to add inode_lock to ext4_writepages to complete the
conversion, there may be a deadlock as follows:

       CPU0             CPU1
writeback_single_inode
  spin_lock(&inode->i_lock) ---> LOCK B
                    enable_verity
                     inode_lock(inode)  ---> LOCK A
                     vops->begin_enable_verity
                     ext4_begin_enable_verity
                      ext4_inode_attach_jinode
                       spin_lock(&inode->i_lock)   ---> try LOCK B
  __writeback_single_inode          |
   do_writepages                ABBA deadlock
    ext4_writepages                 |
     inode_lock(inode)  ---> try LOCK A

If we add inode_lock to the writeback process to complete the inline
conversion, it seems that we still have to add an ops ...

I've been going over this problem for a long time, but I can't think of a
good way to solve it.

-- 
With Best Regards,
Baokun Li
.

Re: [PATCH v2] ext4: fix race condition between buffer write and page_mkwrite

Posted by Jan Kara 2 years, 8 months ago

On Tue 30-05-23 21:44:05, Baokun Li wrote:
> Syzbot reported a BUG_ON:
> ==================================================================
> EXT4-fs (loop0): mounted filesystem without journal. Quota mode: none.
> EXT4-fs error (device loop0): ext4_mb_generate_buddy:1098: group 0, block
>      bitmap and bg descriptor inconsistent: 25 vs 150994969 free clusters
> ------------[ cut here ]------------
> kernel BUG at fs/ext4/ext4_jbd2.c:53!
> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> CPU: 1 PID: 494 Comm: syz-executor.0 6.1.0-rc7-syzkaller-ga4412fdd49dc #0
> RIP: 0010:__ext4_journal_stop+0x1b3/0x1c0
>  [...]
> Call Trace:
>  ext4_write_inline_data_end+0xa39/0xdf0
>  ext4_da_write_end+0x1e2/0x950
>  generic_perform_write+0x401/0x5f0
>  ext4_buffered_write_iter+0x35f/0x640
>  ext4_file_write_iter+0x198/0x1cd0
>  vfs_write+0x8b5/0xef0
>  [...]
> ==================================================================
> 
> The above BUG_ON is triggered by the following race:
> 
>            cpu1                    cpu2
> ________________________|________________________
> ksys_write
>  vfs_write
>   new_sync_write
>    ext4_file_write_iter
>     ext4_buffered_write_iter
>      generic_perform_write
>       ext4_da_write_begin
>                           do_fault
>                            do_page_mkwrite
>                             ext4_page_mkwrite
>                              ext4_convert_inline_data
>                               ext4_convert_inline_data_nolock
>                                ext4_destroy_inline_data_nolock
>                                 //clear EXT4_STATE_MAY_INLINE_DATA
>                                ext4_map_blocks --> return error
>        ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
>        ext4_block_write_begin
>                                ext4_restore_inline_data
>                                 // set EXT4_STATE_MAY_INLINE_DATA
>       ext4_da_write_end
>        ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
>        ext4_write_inline_data_end
>         handle=NULL
>         ext4_journal_stop(handle)
>          __ext4_journal_stop
>           ext4_put_nojournal(handle)
>            ref_cnt = (unsigned long)handle
>            BUG_ON(ref_cnt == 0)  ---> BUG_ON
> 
> The root cause of this problem is that the ext4_convert_inline_data() in
> ext4_page_mkwrite() does not grab i_rwsem, so it may race with
> ext4_buffered_write_iter() and cause the write_begin() and write_end()
> functions to be inconsistent and trigger BUG_ON.
> 
> To solve the above issue, we can not add inode_lock directly to
> ext4_page_mkwrite(), which would not only cause performance degradation but
> also ABBA deadlock (see Link). Hence we move ext4_convert_inline_data() to
> ext4_file_mmap(), and only when inline_data is enabled and mmap a writeable
> file in shared mode, we hold the lock to convert, which avoids the above
> problems.
> 
> Link: https://lore.kernel.org/r/20230530102804.6t7np7om6tczscuo@quack3/
> Reported-by: Jun Nie <jun.nie@linaro.org>
> Closes: https://lore.kernel.org/lkml/63903521.5040307@huawei.com/t/
> Reported-by: syzbot+a158d886ca08a3fecca4@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?id=899b37f20ce4072bcdfecfe1647b39602e956e36
> Fixes: 7b4cc9787fe3 ("ext4: evict inline data when writing to memory map")
> CC: stable@vger.kernel.org # 4.12+
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

Nice. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/file.c  | 24 +++++++++++++++++++++++-
>  fs/ext4/inode.c |  4 ----
>  2 files changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index d101b3b0c7da..9df82d72eb90 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -795,7 +795,8 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
>  static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>  	struct inode *inode = file->f_mapping->host;
> -	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> +	struct super_block *sb = inode->i_sb;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  	struct dax_device *dax_dev = sbi->s_daxdev;
>  
>  	if (unlikely(ext4_forced_shutdown(sbi)))
> @@ -808,6 +809,27 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>  	if (!daxdev_mapping_supported(vma, dax_dev))
>  		return -EOPNOTSUPP;
>  
> +	/*
> +	 * Writing via mmap has no logic to handle inline data, so we
> +	 * need to call ext4_convert_inline_data() to convert the inode
> +	 * to normal format before doing so, otherwise a BUG_ON will be
> +	 * triggered in ext4_writepages() due to the
> +	 * EXT4_STATE_MAY_INLINE_DATA flag. Moreover, we need to grab
> +	 * i_rwsem during conversion, since clearing and setting the
> +	 * inline data flag may race with ext4_buffered_write_iter()
> +	 * to trigger a BUG_ON.
> +	 */
> +	if (ext4_has_feature_inline_data(sb) &&
> +	    vma->vm_flags & VM_SHARED && vma->vm_flags & VM_MAYWRITE) {
> +		int err;
> +
> +		inode_lock(inode);
> +		err = ext4_convert_inline_data(inode);
> +		inode_unlock(inode);
> +		if (err)
> +			return err;
> +	}
> +
>  	file_accessed(file);
>  	if (IS_DAX(file_inode(file))) {
>  		vma->vm_ops = &ext4_dax_vm_ops;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ce5f21b6c2b3..31844c4ec9fe 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -6043,10 +6043,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  
>  	filemap_invalidate_lock_shared(mapping);
>  
> -	err = ext4_convert_inline_data(inode);
> -	if (err)
> -		goto out_ret;
> -
>  	/*
>  	 * On data journalling we skip straight to the transaction handle:
>  	 * there's no delalloc; page truncated will be checked later; the
> -- 
> 2.31.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR