[PATCH v3 2/2] jbd2: gracefully abort on transaction state corruptions

Milos Nikic posted 2 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH v3 2/2] jbd2: gracefully abort on transaction state corruptions
Posted by Milos Nikic 1 month, 1 week ago
Auditing the jbd2 codebase reveals several legacy J_ASSERT calls
that enforce internal state machine invariants (e.g., verifying
jh->b_transaction or jh->b_next_transaction pointers).

When these invariants are broken, the journal is in a corrupted
state. However, triggering a fatal panic brings down the entire
system for a localized filesystem error.

This patch targets a specific class of these asserts: those
residing inside functions that natively return integer error codes,
booleans, or error pointers. It replaces the hard J_ASSERTs with
WARN_ON_ONCE to capture the offending stack trace, safely drops
any held locks, gracefully aborts the journal, and returns -EINVAL.

This prevents a catastrophic kernel panic while ensuring the
corrupted journal state is safely contained and upstream callers
(like ext4 or ocfs2) can gracefully handle the aborted handle.

Functions modified in fs/jbd2/transaction.c:
- jbd2__journal_start()
- do_get_write_access()
- jbd2_journal_dirty_metadata()
- jbd2_journal_forget()
- jbd2_journal_try_to_free_buffers()
- jbd2_journal_file_inode()

Signed-off-by: Milos Nikic <nikic.milos@gmail.com>
---
 fs/jbd2/transaction.c | 108 +++++++++++++++++++++++++++++++++---------
 1 file changed, 85 insertions(+), 23 deletions(-)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 04d17a5f2a82..3cb4e524e0a6 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -474,7 +474,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
 		return ERR_PTR(-EROFS);
 
 	if (handle) {
-		J_ASSERT(handle->h_transaction->t_journal == journal);
+		if (WARN_ON_ONCE(handle->h_transaction->t_journal != journal))
+			return ERR_PTR(-EINVAL);
 		handle->h_ref++;
 		return handle;
 	}
@@ -1036,7 +1037,13 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	 */
 	if (!jh->b_transaction) {
 		JBUFFER_TRACE(jh, "no transaction");
-		J_ASSERT_JH(jh, !jh->b_next_transaction);
+		if (WARN_ON_ONCE(jh->b_next_transaction)) {
+			spin_unlock(&jh->b_state_lock);
+			unlock_buffer(bh);
+			error = -EINVAL;
+			jbd2_journal_abort(journal, error);
+			goto out;
+		}
 		JBUFFER_TRACE(jh, "file as BJ_Reserved");
 		/*
 		 * Make sure all stores to jh (b_modified, b_frozen_data) are
@@ -1069,13 +1076,27 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	 */
 	if (jh->b_frozen_data) {
 		JBUFFER_TRACE(jh, "has frozen data");
-		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+		if (WARN_ON_ONCE(jh->b_next_transaction)) {
+			spin_unlock(&jh->b_state_lock);
+			error = -EINVAL;
+			jbd2_journal_abort(journal, error);
+			goto out;
+		}
 		goto attach_next;
 	}
 
 	JBUFFER_TRACE(jh, "owned by older transaction");
-	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
+	if (WARN_ON_ONCE(jh->b_next_transaction ||
+			 jh->b_transaction !=
+			 journal->j_committing_transaction)) {
+		pr_err("JBD2: %s: assertion failure: b_next_transaction=%p b_transaction=%p j_committing_transaction=%p\n",
+		       journal->j_devname, jh->b_next_transaction,
+		       jh->b_transaction, journal->j_committing_transaction);
+		spin_unlock(&jh->b_state_lock);
+		error = -EINVAL;
+		jbd2_journal_abort(journal, error);
+		goto out;
+	}
 
 	/*
 	 * There is one case we have to be very careful about.  If the
@@ -1520,8 +1541,14 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	if (data_race(jh->b_transaction != transaction &&
 	    jh->b_next_transaction != transaction)) {
 		spin_lock(&jh->b_state_lock);
-		J_ASSERT_JH(jh, jh->b_transaction == transaction ||
-				jh->b_next_transaction == transaction);
+		if (WARN_ON_ONCE(jh->b_transaction != transaction &&
+				 jh->b_next_transaction != transaction)) {
+			pr_err("JBD2: %s: assertion failure: b_transaction=%p transaction=%p b_next_transaction=%p\n",
+			       journal->j_devname, jh->b_transaction,
+			       transaction, jh->b_next_transaction);
+			ret = -EINVAL;
+			goto out_unlock_bh;
+		}
 		spin_unlock(&jh->b_state_lock);
 	}
 	if (data_race(jh->b_modified == 1)) {
@@ -1531,13 +1558,15 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 			spin_lock(&jh->b_state_lock);
 			if (jh->b_transaction == transaction &&
 			    jh->b_jlist != BJ_Metadata)
-				pr_err("JBD2: assertion failure: h_type=%u "
-				       "h_line_no=%u block_no=%llu jlist=%u\n",
+				pr_err("JBD2: assertion failure: h_type=%u h_line_no=%u block_no=%llu jlist=%u\n",
 				       handle->h_type, handle->h_line_no,
 				       (unsigned long long) bh->b_blocknr,
 				       jh->b_jlist);
-			J_ASSERT_JH(jh, jh->b_transaction != transaction ||
-					jh->b_jlist == BJ_Metadata);
+			if (WARN_ON_ONCE(jh->b_transaction == transaction &&
+					 jh->b_jlist != BJ_Metadata)) {
+				ret = -EINVAL;
+				goto out_unlock_bh;
+			}
 			spin_unlock(&jh->b_state_lock);
 		}
 		goto out;
@@ -1636,7 +1665,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	}
 
 	/* That test should have eliminated the following case: */
-	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
+	if (WARN_ON_ONCE(jh->b_frozen_data)) {
+		ret = -EINVAL;
+		goto out_unlock_bh;
+	}
 
 	JBUFFER_TRACE(jh, "file as BJ_Metadata");
 	spin_lock(&journal->j_list_lock);
@@ -1675,6 +1707,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
 	int err = 0;
 	int was_modified = 0;
 	int wait_for_writeback = 0;
+	int abort_journal = 0;
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
@@ -1708,7 +1741,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
 	jh->b_modified = 0;
 
 	if (jh->b_transaction == transaction) {
-		J_ASSERT_JH(jh, !jh->b_frozen_data);
+		if (WARN_ON_ONCE(jh->b_frozen_data)) {
+			err = -EINVAL;
+			abort_journal = 1;
+			goto drop;
+		}
 
 		/* If we are forgetting a buffer which is already part
 		 * of this transaction, then we can just drop it from
@@ -1747,8 +1784,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
 		}
 		spin_unlock(&journal->j_list_lock);
 	} else if (jh->b_transaction) {
-		J_ASSERT_JH(jh, (jh->b_transaction ==
-				 journal->j_committing_transaction));
+		if (WARN_ON_ONCE(jh->b_transaction != journal->j_committing_transaction)) {
+			err = -EINVAL;
+			abort_journal = 1;
+			goto drop;
+		}
 		/* However, if the buffer is still owned by a prior
 		 * (committing) transaction, we can't drop it yet... */
 		JBUFFER_TRACE(jh, "belongs to older transaction");
@@ -1766,7 +1806,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
 			jh->b_next_transaction = transaction;
 			spin_unlock(&journal->j_list_lock);
 		} else {
-			J_ASSERT(jh->b_next_transaction == transaction);
+			if (WARN_ON_ONCE(jh->b_next_transaction != transaction)) {
+				err = -EINVAL;
+				abort_journal = 1;
+				goto drop;
+			}
 
 			/*
 			 * only drop a reference if this transaction modified
@@ -1812,6 +1856,8 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
 drop:
 	__brelse(bh);
 	spin_unlock(&jh->b_state_lock);
+	if (abort_journal)
+		jbd2_journal_abort(journal, err);
 	if (wait_for_writeback)
 		wait_on_buffer(bh);
 	jbd2_journal_put_journal_head(jh);
@@ -2136,7 +2182,8 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
 	struct buffer_head *bh;
 	bool ret = false;
 
-	J_ASSERT(folio_test_locked(folio));
+	if (WARN_ON_ONCE(!folio_test_locked(folio)))
+		return false;
 
 	head = folio_buffers(folio);
 	bh = head;
@@ -2651,6 +2698,8 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
+	int err = 0;
+	int abort_transaction = 0;
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
@@ -2685,20 +2734,33 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
 	/* On some different transaction's list - should be
 	 * the committing one */
 	if (jinode->i_transaction) {
-		J_ASSERT(jinode->i_next_transaction == NULL);
-		J_ASSERT(jinode->i_transaction ==
-					journal->j_committing_transaction);
+		if (WARN_ON_ONCE(jinode->i_next_transaction ||
+				 jinode->i_transaction !=
+				 journal->j_committing_transaction)) {
+			pr_err("JBD2: %s: assertion failure: i_next_transaction=%p i_transaction=%p j_committing_transaction=%p\n",
+			       journal->j_devname, jinode->i_next_transaction,
+			       jinode->i_transaction,
+			       journal->j_committing_transaction);
+			err = -EINVAL;
+			abort_transaction = 1;
+			goto done;
+		}
 		jinode->i_next_transaction = transaction;
 		goto done;
 	}
 	/* Not on any transaction list... */
-	J_ASSERT(!jinode->i_next_transaction);
+	if (WARN_ON_ONCE(jinode->i_next_transaction)) {
+		err = -EINVAL;
+		abort_transaction = 1;
+		goto done;
+	}
 	jinode->i_transaction = transaction;
 	list_add(&jinode->i_list, &transaction->t_inode_list);
 done:
 	spin_unlock(&journal->j_list_lock);
-
-	return 0;
+	if (abort_transaction)
+		jbd2_journal_abort(journal, err);
+	return err;
 }
 
 int jbd2_journal_inode_ranged_write(handle_t *handle,
-- 
2.53.0
Re: [PATCH v3 2/2] jbd2: gracefully abort on transaction state corruptions
Posted by kernel test robot 1 month, 1 week ago
Hi Milos,

kernel test robot noticed the following build warnings:

[auto build test WARNING on brauner-vfs/vfs.all]
[also build test WARNING on tytso-ext4/dev next-20260302]
[cannot apply to linus/master v6.16-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Milos-Nikic/jbd2-gracefully-abort-instead-of-panicking-on-unlocked-buffer/20260303-085946
base:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git vfs.all
patch link:    https://lore.kernel.org/r/20260303005502.337108-3-nikic.milos%40gmail.com
patch subject: [PATCH v3 2/2] jbd2: gracefully abort on transaction state corruptions
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20260303/202603030706.NpyA7pqe-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260303/202603030706.NpyA7pqe-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603030706.NpyA7pqe-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> fs/jbd2/transaction.c:1547:11: warning: variable 'journal' is uninitialized when used here [-Wuninitialized]
    1547 |                                journal->j_devname, jh->b_transaction,
         |                                ^~~~~~~
   include/linux/printk.h:554:33: note: expanded from macro 'pr_err'
     554 |         printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                                        ^~~~~~~~~~~
   include/linux/printk.h:511:60: note: expanded from macro 'printk'
     511 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
         |                                                            ^~~~~~~~~~~
   include/linux/printk.h:483:19: note: expanded from macro 'printk_index_wrap'
     483 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                                 ^~~~~~~~~~~
   fs/jbd2/transaction.c:1520:20: note: initialize the variable 'journal' to silence this warning
    1520 |         journal_t *journal;
         |                           ^
         |                            = NULL
   1 warning generated.


vim +/journal +1547 fs/jbd2/transaction.c

  1493	
  1494	/**
  1495	 * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  1496	 * @handle: transaction to add buffer to.
  1497	 * @bh: buffer to mark
  1498	 *
  1499	 * mark dirty metadata which needs to be journaled as part of the current
  1500	 * transaction.
  1501	 *
  1502	 * The buffer must have previously had jbd2_journal_get_write_access()
  1503	 * called so that it has a valid journal_head attached to the buffer
  1504	 * head.
  1505	 *
  1506	 * The buffer is placed on the transaction's metadata list and is marked
  1507	 * as belonging to the transaction.
  1508	 *
  1509	 * Returns error number or 0 on success.
  1510	 *
  1511	 * Special care needs to be taken if the buffer already belongs to the
  1512	 * current committing transaction (in which case we should have frozen
  1513	 * data present for that commit).  In that case, we don't relink the
  1514	 * buffer: that only gets done when the old transaction finally
  1515	 * completes its commit.
  1516	 */
  1517	int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
  1518	{
  1519		transaction_t *transaction = handle->h_transaction;
  1520		journal_t *journal;
  1521		struct journal_head *jh;
  1522		int ret = 0;
  1523	
  1524		if (!buffer_jbd(bh))
  1525			return -EUCLEAN;
  1526	
  1527		/*
  1528		 * We don't grab jh reference here since the buffer must be part
  1529		 * of the running transaction.
  1530		 */
  1531		jh = bh2jh(bh);
  1532		jbd2_debug(5, "journal_head %p\n", jh);
  1533		JBUFFER_TRACE(jh, "entry");
  1534	
  1535		/*
  1536		 * This and the following assertions are unreliable since we may see jh
  1537		 * in inconsistent state unless we grab bh_state lock. But this is
  1538		 * crucial to catch bugs so let's do a reliable check until the
  1539		 * lockless handling is fully proven.
  1540		 */
  1541		if (data_race(jh->b_transaction != transaction &&
  1542		    jh->b_next_transaction != transaction)) {
  1543			spin_lock(&jh->b_state_lock);
  1544			if (WARN_ON_ONCE(jh->b_transaction != transaction &&
  1545					 jh->b_next_transaction != transaction)) {
  1546				pr_err("JBD2: %s: assertion failure: b_transaction=%p transaction=%p b_next_transaction=%p\n",
> 1547				       journal->j_devname, jh->b_transaction,
  1548				       transaction, jh->b_next_transaction);
  1549				ret = -EINVAL;
  1550				goto out_unlock_bh;
  1551			}
  1552			spin_unlock(&jh->b_state_lock);
  1553		}
  1554		if (data_race(jh->b_modified == 1)) {
  1555			/* If it's in our transaction it must be in BJ_Metadata list. */
  1556			if (data_race(jh->b_transaction == transaction &&
  1557			    jh->b_jlist != BJ_Metadata)) {
  1558				spin_lock(&jh->b_state_lock);
  1559				if (jh->b_transaction == transaction &&
  1560				    jh->b_jlist != BJ_Metadata)
  1561					pr_err("JBD2: assertion failure: h_type=%u h_line_no=%u block_no=%llu jlist=%u\n",
  1562					       handle->h_type, handle->h_line_no,
  1563					       (unsigned long long) bh->b_blocknr,
  1564					       jh->b_jlist);
  1565				if (WARN_ON_ONCE(jh->b_transaction == transaction &&
  1566						 jh->b_jlist != BJ_Metadata)) {
  1567					ret = -EINVAL;
  1568					goto out_unlock_bh;
  1569				}
  1570				spin_unlock(&jh->b_state_lock);
  1571			}
  1572			goto out;
  1573		}
  1574	
  1575		spin_lock(&jh->b_state_lock);
  1576	
  1577		if (is_handle_aborted(handle)) {
  1578			/*
  1579			 * Check journal aborting with @jh->b_state_lock locked,
  1580			 * since 'jh->b_transaction' could be replaced with
  1581			 * 'jh->b_next_transaction' during old transaction
  1582			 * committing if journal aborted, which may fail
  1583			 * assertion on 'jh->b_frozen_data == NULL'.
  1584			 */
  1585			ret = -EROFS;
  1586			goto out_unlock_bh;
  1587		}
  1588	
  1589		journal = transaction->t_journal;
  1590	
  1591		if (jh->b_modified == 0) {
  1592			/*
  1593			 * This buffer's got modified and becoming part
  1594			 * of the transaction. This needs to be done
  1595			 * once a transaction -bzzz
  1596			 */
  1597			if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
  1598				ret = -ENOSPC;
  1599				goto out_unlock_bh;
  1600			}
  1601			jh->b_modified = 1;
  1602			handle->h_total_credits--;
  1603		}
  1604	
  1605		/*
  1606		 * fastpath, to avoid expensive locking.  If this buffer is already
  1607		 * on the running transaction's metadata list there is nothing to do.
  1608		 * Nobody can take it off again because there is a handle open.
  1609		 * I _think_ we're OK here with SMP barriers - a mistaken decision will
  1610		 * result in this test being false, so we go in and take the locks.
  1611		 */
  1612		if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
  1613			JBUFFER_TRACE(jh, "fastpath");
  1614			if (unlikely(jh->b_transaction !=
  1615				     journal->j_running_transaction)) {
  1616				printk(KERN_ERR "JBD2: %s: "
  1617				       "jh->b_transaction (%llu, %p, %u) != "
  1618				       "journal->j_running_transaction (%p, %u)\n",
  1619				       journal->j_devname,
  1620				       (unsigned long long) bh->b_blocknr,
  1621				       jh->b_transaction,
  1622				       jh->b_transaction ? jh->b_transaction->t_tid : 0,
  1623				       journal->j_running_transaction,
  1624				       journal->j_running_transaction ?
  1625				       journal->j_running_transaction->t_tid : 0);
  1626				ret = -EINVAL;
  1627			}
  1628			goto out_unlock_bh;
  1629		}
  1630	
  1631		set_buffer_jbddirty(bh);
  1632	
  1633		/*
  1634		 * Metadata already on the current transaction list doesn't
  1635		 * need to be filed.  Metadata on another transaction's list must
  1636		 * be committing, and will be refiled once the commit completes:
  1637		 * leave it alone for now.
  1638		 */
  1639		if (jh->b_transaction != transaction) {
  1640			JBUFFER_TRACE(jh, "already on other transaction");
  1641			if (unlikely(((jh->b_transaction !=
  1642				       journal->j_committing_transaction)) ||
  1643				     (jh->b_next_transaction != transaction))) {
  1644				printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
  1645				       "bad jh for block %llu: "
  1646				       "transaction (%p, %u), "
  1647				       "jh->b_transaction (%p, %u), "
  1648				       "jh->b_next_transaction (%p, %u), jlist %u\n",
  1649				       journal->j_devname,
  1650				       (unsigned long long) bh->b_blocknr,
  1651				       transaction, transaction->t_tid,
  1652				       jh->b_transaction,
  1653				       jh->b_transaction ?
  1654				       jh->b_transaction->t_tid : 0,
  1655				       jh->b_next_transaction,
  1656				       jh->b_next_transaction ?
  1657				       jh->b_next_transaction->t_tid : 0,
  1658				       jh->b_jlist);
  1659				WARN_ON(1);
  1660				ret = -EINVAL;
  1661			}
  1662			/* And this case is illegal: we can't reuse another
  1663			 * transaction's data buffer, ever. */
  1664			goto out_unlock_bh;
  1665		}
  1666	
  1667		/* That test should have eliminated the following case: */
  1668		if (WARN_ON_ONCE(jh->b_frozen_data)) {
  1669			ret = -EINVAL;
  1670			goto out_unlock_bh;
  1671		}
  1672	
  1673		JBUFFER_TRACE(jh, "file as BJ_Metadata");
  1674		spin_lock(&journal->j_list_lock);
  1675		__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
  1676		spin_unlock(&journal->j_list_lock);
  1677	out_unlock_bh:
  1678		spin_unlock(&jh->b_state_lock);
  1679	out:
  1680		JBUFFER_TRACE(jh, "exit");
  1681		return ret;
  1682	}
  1683	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki