[PATCH] jbd2: fix data-race and null-ptr-deref in jbd2_journal_dirty_metadata()

Jeongjun Park posted 1 patch 8 months, 4 weeks ago
There is a newer version of this series
fs/jbd2/transaction.c | 49 +++++++++++++++----------------------------
1 file changed, 17 insertions(+), 32 deletions(-)
[PATCH] jbd2: fix data-race and null-ptr-deref in jbd2_journal_dirty_metadata()
Posted by Jeongjun Park 8 months, 4 weeks ago
Since handle->h_transaction may be a NULL pointer, so we should change it
to call is_handle_aborted(handle) first before dereferencing it.

And the following data-race was reported in my fuzzer:

==================================================================
BUG: KCSAN: data-race in jbd2_journal_dirty_metadata / jbd2_journal_dirty_metadata

write to 0xffff888011024104 of 4 bytes by task 10881 on cpu 1:
 jbd2_journal_dirty_metadata+0x2a5/0x770 fs/jbd2/transaction.c:1556
 __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358
 ext4_do_update_inode fs/ext4/inode.c:5220 [inline]
 ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869
 __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074
 ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103
....

read to 0xffff888011024104 of 4 bytes by task 10880 on cpu 0:
 jbd2_journal_dirty_metadata+0xf2/0x770 fs/jbd2/transaction.c:1512
 __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358
 ext4_do_update_inode fs/ext4/inode.c:5220 [inline]
 ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869
 __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074
 ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103
....

value changed: 0x00000000 -> 0x00000001
==================================================================

According to this crash report, there is a read/write data-race in
jh->b_modified.

This is because the b_state_lock is locked too late.

For some reason, jbd2_journal_dirty_metadata() has been written in a way
that it does not lock b_state_lock before checking jh->b_transaction.

However, This makes the code that checks jh->b_transaction messy and
causes a data-race in jh->b_* variables. Since locking b_state_lock
earlier is not enough to significantly affect performance and most of the
functions defined in transaction.c lock b_state_lock before
reading/writing jh->b_*.

Thereforce, I think it would be appropriate to modify 
jbd2_journal_dirty_metadata() to lock b_state_lock earlier as well.

Reported-by: syzbot+de24c3fe3c4091051710@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=de24c3fe3c4091051710
Fixes: 6e06ae88edae ("jbd2: speedup jbd2_journal_dirty_metadata()")
Signed-off-by: Jeongjun Park <aha310510@gmail.com>
---
 fs/jbd2/transaction.c | 49 +++++++++++++++----------------------------
 1 file changed, 17 insertions(+), 32 deletions(-)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cbc4785462f5..7e6dbf37396f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1496,41 +1496,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	jbd2_debug(5, "journal_head %p\n", jh);
 	JBUFFER_TRACE(jh, "entry");
 
-	/*
-	 * This and the following assertions are unreliable since we may see jh
-	 * in inconsistent state unless we grab bh_state lock. But this is
-	 * crucial to catch bugs so let's do a reliable check until the
-	 * lockless handling is fully proven.
-	 */
-	if (data_race(jh->b_transaction != transaction &&
-	    jh->b_next_transaction != transaction)) {
-		spin_lock(&jh->b_state_lock);
-		J_ASSERT_JH(jh, jh->b_transaction == transaction ||
-				jh->b_next_transaction == transaction);
-		spin_unlock(&jh->b_state_lock);
-	}
+	spin_lock(&jh->b_state_lock);
+
+	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+			jh->b_next_transaction == transaction);
+
 	if (jh->b_modified == 1) {
 		/* If it's in our transaction it must be in BJ_Metadata list. */
-		if (data_race(jh->b_transaction == transaction &&
-		    jh->b_jlist != BJ_Metadata)) {
-			spin_lock(&jh->b_state_lock);
-			if (jh->b_transaction == transaction &&
-			    jh->b_jlist != BJ_Metadata)
-				pr_err("JBD2: assertion failure: h_type=%u "
-				       "h_line_no=%u block_no=%llu jlist=%u\n",
-				       handle->h_type, handle->h_line_no,
-				       (unsigned long long) bh->b_blocknr,
-				       jh->b_jlist);
-			J_ASSERT_JH(jh, jh->b_transaction != transaction ||
-					jh->b_jlist == BJ_Metadata);
-			spin_unlock(&jh->b_state_lock);
-		}
-		goto out;
+		if (jh->b_transaction == transaction &&
+			jh->b_jlist != BJ_Metadata)
+			pr_err("JBD2: assertion failure: h_type=%u "
+			       "h_line_no=%u block_no=%llu jlist=%u\n",
+			       handle->h_type, handle->h_line_no,
+			       (unsigned long long) bh->b_blocknr,
+			       jh->b_jlist);
+		J_ASSERT_JH(jh, jh->b_transaction != transaction ||
+				jh->b_jlist == BJ_Metadata);
+		goto out_unlock_bh;
 	}
 
-	journal = transaction->t_journal;
-	spin_lock(&jh->b_state_lock);
-
 	if (is_handle_aborted(handle)) {
 		/*
 		 * Check journal aborting with @jh->b_state_lock locked,
@@ -1543,6 +1527,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 		goto out_unlock_bh;
 	}
 
+	journal = transaction->t_journal;
+
 	if (jh->b_modified == 0) {
 		/*
 		 * This buffer's got modified and becoming part
@@ -1628,7 +1614,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
 	spin_unlock(&jh->b_state_lock);
-out:
 	JBUFFER_TRACE(jh, "exit");
 	return ret;
 }
--
Re: [PATCH] jbd2: fix data-race and null-ptr-deref in jbd2_journal_dirty_metadata()
Posted by Jan Kara 8 months, 4 weeks ago
On Wed 14-05-25 02:04:41, Jeongjun Park wrote:
> Since handle->h_transaction may be a NULL pointer, so we should change it
> to call is_handle_aborted(handle) first before dereferencing it.
> 
> And the following data-race was reported in my fuzzer:
> 
> ==================================================================
> BUG: KCSAN: data-race in jbd2_journal_dirty_metadata / jbd2_journal_dirty_metadata
> 
> write to 0xffff888011024104 of 4 bytes by task 10881 on cpu 1:
>  jbd2_journal_dirty_metadata+0x2a5/0x770 fs/jbd2/transaction.c:1556
>  __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358
>  ext4_do_update_inode fs/ext4/inode.c:5220 [inline]
>  ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869
>  __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074
>  ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103
> ....
> 
> read to 0xffff888011024104 of 4 bytes by task 10880 on cpu 0:
>  jbd2_journal_dirty_metadata+0xf2/0x770 fs/jbd2/transaction.c:1512
>  __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358
>  ext4_do_update_inode fs/ext4/inode.c:5220 [inline]
>  ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869
>  __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074
>  ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103
> ....
> 
> value changed: 0x00000000 -> 0x00000001
> ==================================================================
> 
> According to this crash report, there is a read/write data-race in
> jh->b_modified.
> 
> This is because the b_state_lock is locked too late.
> 
> For some reason, jbd2_journal_dirty_metadata() has been written in a way
> that it does not lock b_state_lock before checking jh->b_transaction.

Yes and that is deliberate because for some buffers like bitmaps the
contention (and cacheline bouncing) on the b_state_lock can be actually
quite significant. See commit 6e06ae88edae ("jbd2: speedup
jbd2_journal_dirty_metadata()") for a bit more description.

> However, This makes the code that checks jh->b_transaction messy and
> causes a data-race in jh->b_* variables. Since locking b_state_lock
> earlier is not enough to significantly affect performance and most of the
> functions defined in transaction.c lock b_state_lock before
> reading/writing jh->b_*.

Well, the code is written with the expectation that b_transaction, b_jlist,
and b_modified fields can be changing underneath. Since this was
implemented 10 years ago, we didn't really bother with properly annotating
this but the code as is is safe. I agree the data_race() annotation for the
b_modified check was missed when adding data_race() annotations 4 years
ago so that should be fixed.

								Honza

> Thereforce, I think it would be appropriate to modify 
> jbd2_journal_dirty_metadata() to lock b_state_lock earlier as well.
> 
> Reported-by: syzbot+de24c3fe3c4091051710@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=de24c3fe3c4091051710
> Fixes: 6e06ae88edae ("jbd2: speedup jbd2_journal_dirty_metadata()")
> Signed-off-by: Jeongjun Park <aha310510@gmail.com>
> ---
>  fs/jbd2/transaction.c | 49 +++++++++++++++----------------------------
>  1 file changed, 17 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index cbc4785462f5..7e6dbf37396f 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -1496,41 +1496,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
>  	jbd2_debug(5, "journal_head %p\n", jh);
>  	JBUFFER_TRACE(jh, "entry");
>  
> -	/*
> -	 * This and the following assertions are unreliable since we may see jh
> -	 * in inconsistent state unless we grab bh_state lock. But this is
> -	 * crucial to catch bugs so let's do a reliable check until the
> -	 * lockless handling is fully proven.
> -	 */
> -	if (data_race(jh->b_transaction != transaction &&
> -	    jh->b_next_transaction != transaction)) {
> -		spin_lock(&jh->b_state_lock);
> -		J_ASSERT_JH(jh, jh->b_transaction == transaction ||
> -				jh->b_next_transaction == transaction);
> -		spin_unlock(&jh->b_state_lock);
> -	}
> +	spin_lock(&jh->b_state_lock);
> +
> +	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
> +			jh->b_next_transaction == transaction);
> +
>  	if (jh->b_modified == 1) {
>  		/* If it's in our transaction it must be in BJ_Metadata list. */
> -		if (data_race(jh->b_transaction == transaction &&
> -		    jh->b_jlist != BJ_Metadata)) {
> -			spin_lock(&jh->b_state_lock);
> -			if (jh->b_transaction == transaction &&
> -			    jh->b_jlist != BJ_Metadata)
> -				pr_err("JBD2: assertion failure: h_type=%u "
> -				       "h_line_no=%u block_no=%llu jlist=%u\n",
> -				       handle->h_type, handle->h_line_no,
> -				       (unsigned long long) bh->b_blocknr,
> -				       jh->b_jlist);
> -			J_ASSERT_JH(jh, jh->b_transaction != transaction ||
> -					jh->b_jlist == BJ_Metadata);
> -			spin_unlock(&jh->b_state_lock);
> -		}
> -		goto out;
> +		if (jh->b_transaction == transaction &&
> +			jh->b_jlist != BJ_Metadata)
> +			pr_err("JBD2: assertion failure: h_type=%u "
> +			       "h_line_no=%u block_no=%llu jlist=%u\n",
> +			       handle->h_type, handle->h_line_no,
> +			       (unsigned long long) bh->b_blocknr,
> +			       jh->b_jlist);
> +		J_ASSERT_JH(jh, jh->b_transaction != transaction ||
> +				jh->b_jlist == BJ_Metadata);
> +		goto out_unlock_bh;
>  	}
>  
> -	journal = transaction->t_journal;
> -	spin_lock(&jh->b_state_lock);
> -
>  	if (is_handle_aborted(handle)) {
>  		/*
>  		 * Check journal aborting with @jh->b_state_lock locked,
> @@ -1543,6 +1527,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
>  		goto out_unlock_bh;
>  	}
>  
> +	journal = transaction->t_journal;
> +
>  	if (jh->b_modified == 0) {
>  		/*
>  		 * This buffer's got modified and becoming part
> @@ -1628,7 +1614,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
>  	spin_unlock(&journal->j_list_lock);
>  out_unlock_bh:
>  	spin_unlock(&jh->b_state_lock);
> -out:
>  	JBUFFER_TRACE(jh, "exit");
>  	return ret;
>  }
> --
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH] jbd2: fix data-race and null-ptr-deref in jbd2_journal_dirty_metadata()
Posted by Jeongjun Park 8 months, 4 weeks ago
Jan Kara <jack@suse.cz> wrote:
>
> On Wed 14-05-25 02:04:41, Jeongjun Park wrote:
> > Since handle->h_transaction may be a NULL pointer, so we should change it
> > to call is_handle_aborted(handle) first before dereferencing it.
> >
> > And the following data-race was reported in my fuzzer:
> >
> > ==================================================================
> > BUG: KCSAN: data-race in jbd2_journal_dirty_metadata / jbd2_journal_dirty_metadata
> >
> > write to 0xffff888011024104 of 4 bytes by task 10881 on cpu 1:
> >  jbd2_journal_dirty_metadata+0x2a5/0x770 fs/jbd2/transaction.c:1556
> >  __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358
> >  ext4_do_update_inode fs/ext4/inode.c:5220 [inline]
> >  ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869
> >  __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074
> >  ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103
> > ....
> >
> > read to 0xffff888011024104 of 4 bytes by task 10880 on cpu 0:
> >  jbd2_journal_dirty_metadata+0xf2/0x770 fs/jbd2/transaction.c:1512
> >  __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358
> >  ext4_do_update_inode fs/ext4/inode.c:5220 [inline]
> >  ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869
> >  __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074
> >  ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103
> > ....
> >
> > value changed: 0x00000000 -> 0x00000001
> > ==================================================================
> >
> > According to this crash report, there is a read/write data-race in
> > jh->b_modified.
> >
> > This is because the b_state_lock is locked too late.
> >
> > For some reason, jbd2_journal_dirty_metadata() has been written in a way
> > that it does not lock b_state_lock before checking jh->b_transaction.
>
> Yes and that is deliberate because for some buffers like bitmaps the
> contention (and cacheline bouncing) on the b_state_lock can be actually
> quite significant. See commit 6e06ae88edae ("jbd2: speedup
> jbd2_journal_dirty_metadata()") for a bit more description.
>
> > However, This makes the code that checks jh->b_transaction messy and
> > causes a data-race in jh->b_* variables. Since locking b_state_lock
> > earlier is not enough to significantly affect performance and most of the
> > functions defined in transaction.c lock b_state_lock before
> > reading/writing jh->b_*.
>
> Well, the code is written with the expectation that b_transaction, b_jlist,
> and b_modified fields can be changing underneath. Since this was
> implemented 10 years ago, we didn't really bother with properly annotating
> this but the code as is is safe. I agree the data_race() annotation for the
> b_modified check was missed when adding data_race() annotations 4 years
> ago so that should be fixed.
>
>                                                                 Honza
>

It's surprising that the spinlock protection for just two if statements
has such a big impact on performance. I'll change it to annotate
jh->b_modified with data_race() and send v2 patch.

Regards,

Jeongjun Park

> > Thereforce, I think it would be appropriate to modify
> > jbd2_journal_dirty_metadata() to lock b_state_lock earlier as well.
> >
> > Reported-by: syzbot+de24c3fe3c4091051710@syzkaller.appspotmail.com
> > Closes: https://syzkaller.appspot.com/bug?extid=de24c3fe3c4091051710
> > Fixes: 6e06ae88edae ("jbd2: speedup jbd2_journal_dirty_metadata()")
> > Signed-off-by: Jeongjun Park <aha310510@gmail.com>
> > ---
> >  fs/jbd2/transaction.c | 49 +++++++++++++++----------------------------
> >  1 file changed, 17 insertions(+), 32 deletions(-)
> >
> > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> > index cbc4785462f5..7e6dbf37396f 100644
> > --- a/fs/jbd2/transaction.c
> > +++ b/fs/jbd2/transaction.c
> > @@ -1496,41 +1496,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
> >       jbd2_debug(5, "journal_head %p\n", jh);
> >       JBUFFER_TRACE(jh, "entry");
> >
> > -     /*
> > -      * This and the following assertions are unreliable since we may see jh
> > -      * in inconsistent state unless we grab bh_state lock. But this is
> > -      * crucial to catch bugs so let's do a reliable check until the
> > -      * lockless handling is fully proven.
> > -      */
> > -     if (data_race(jh->b_transaction != transaction &&
> > -         jh->b_next_transaction != transaction)) {
> > -             spin_lock(&jh->b_state_lock);
> > -             J_ASSERT_JH(jh, jh->b_transaction == transaction ||
> > -                             jh->b_next_transaction == transaction);
> > -             spin_unlock(&jh->b_state_lock);
> > -     }
> > +     spin_lock(&jh->b_state_lock);
> > +
> > +     J_ASSERT_JH(jh, jh->b_transaction == transaction ||
> > +                     jh->b_next_transaction == transaction);
> > +
> >       if (jh->b_modified == 1) {
> >               /* If it's in our transaction it must be in BJ_Metadata list. */
> > -             if (data_race(jh->b_transaction == transaction &&
> > -                 jh->b_jlist != BJ_Metadata)) {
> > -                     spin_lock(&jh->b_state_lock);
> > -                     if (jh->b_transaction == transaction &&
> > -                         jh->b_jlist != BJ_Metadata)
> > -                             pr_err("JBD2: assertion failure: h_type=%u "
> > -                                    "h_line_no=%u block_no=%llu jlist=%u\n",
> > -                                    handle->h_type, handle->h_line_no,
> > -                                    (unsigned long long) bh->b_blocknr,
> > -                                    jh->b_jlist);
> > -                     J_ASSERT_JH(jh, jh->b_transaction != transaction ||
> > -                                     jh->b_jlist == BJ_Metadata);
> > -                     spin_unlock(&jh->b_state_lock);
> > -             }
> > -             goto out;
> > +             if (jh->b_transaction == transaction &&
> > +                     jh->b_jlist != BJ_Metadata)
> > +                     pr_err("JBD2: assertion failure: h_type=%u "
> > +                            "h_line_no=%u block_no=%llu jlist=%u\n",
> > +                            handle->h_type, handle->h_line_no,
> > +                            (unsigned long long) bh->b_blocknr,
> > +                            jh->b_jlist);
> > +             J_ASSERT_JH(jh, jh->b_transaction != transaction ||
> > +                             jh->b_jlist == BJ_Metadata);
> > +             goto out_unlock_bh;
> >       }
> >
> > -     journal = transaction->t_journal;
> > -     spin_lock(&jh->b_state_lock);
> > -
> >       if (is_handle_aborted(handle)) {
> >               /*
> >                * Check journal aborting with @jh->b_state_lock locked,
> > @@ -1543,6 +1527,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
> >               goto out_unlock_bh;
> >       }
> >
> > +     journal = transaction->t_journal;
> > +
> >       if (jh->b_modified == 0) {
> >               /*
> >                * This buffer's got modified and becoming part
> > @@ -1628,7 +1614,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
> >       spin_unlock(&journal->j_list_lock);
> >  out_unlock_bh:
> >       spin_unlock(&jh->b_state_lock);
> > -out:
> >       JBUFFER_TRACE(jh, "exit");
> >       return ret;
> >  }
> > --
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR