[PATCH] jfs: fix tlock double-allocation race in txLock()

Cen Zhang posted 1 patch 1 week ago
fs/jfs/jfs_txnmgr.c | 117 ++++++++++++++++++++++++--------------------
1 file changed, 63 insertions(+), 54 deletions(-)
[PATCH] jfs: fix tlock double-allocation race in txLock()
Posted by Cen Zhang 1 week ago
In txLock()'s allocateLock path, the newly allocated tlock is
published (mp->lid = lid) only after TXN_LOCK has been dropped via
TXN_UNLOCK(), creating a window in which a concurrent txLock() call
on the same metapage still observes mp->lid == 0, enters the
allocateLock path itself, and allocates a second tlock for the same
page.

This leads to two transactions holding separate tlocks for the same
metapage, causing journal log corruption: both transactions log
conflicting changes to the same page, and the second tlock's mp->lid
assignment silently orphans the first tlock.

The race is triggered in practice when two concurrent truncate
operations (via jfs_setattr and ftruncate) call diWrite() for
different inodes whose disk inodes share the same ipimap metapage.
Both call txLock(tid, ipimap, mp, tlckINODE|tlckENTRY) concurrently.

Fix this by completing all tlock initialization -- including flag,
type, linelock setup, page binding (mp->lid / xtlid), and
transaction/anonymous list enqueue -- while still holding TXN_LOCK,
before any concurrent txLock() caller can observe the tlock.

Only metapage_nohomeok() is deferred to after TXN_UNLOCK(), as it
may block on folio_lock() and folio_wait_writeback() and therefore
cannot execute under the TXN_LOCK spinlock.  By this point the tlock
is fully initialized and published, so a concurrent adopter will see
a complete tlock.

For anonymous transactions, the list_add_tail() to
TxAnchor.anon_list is now performed under the already-held TXN_LOCK
instead of dropping and re-acquiring the lock.  This also closes a
secondary window in which a concurrent caller could adopt the tlock
before its inode had been enqueued on the anonymous list.

Signed-off-by: Cen Zhang <zzzccc427@gmail.com>
---
 fs/jfs/jfs_txnmgr.c | 117 ++++++++++++++++++++++++--------------------
 1 file changed, 63 insertions(+), 54 deletions(-)

diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 51be07337c7a..3be6af49a770 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -692,29 +692,10 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
 	 */
 	tlck->tid = tid;
 
-	TXN_UNLOCK();
-
 	/* mark tlock for meta-data page */
-	if (mp->xflag & COMMIT_PAGE) {
-
+	if (mp->xflag & COMMIT_PAGE)
 		tlck->flag = tlckPAGELOCK;
-
-		/* mark the page dirty and nohomeok */
-		metapage_nohomeok(mp);
-
-		jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
-			 mp, mp->nohomeok, tid, tlck);
-
-		/* if anonymous transaction, and buffer is on the group
-		 * commit synclist, mark inode to show this.  This will
-		 * prevent the buffer from being marked nohomeok for too
-		 * long a time.
-		 */
-		if ((tid == 0) && mp->lsn)
-			set_cflag(COMMIT_Synclist, ip);
-	}
-	/* mark tlock for in-memory inode */
-	else
+	else /* mark tlock for in-memory inode */
 		tlck->flag = tlckINODELOCK;
 
 	if (S_ISDIR(ip->i_mode))
@@ -725,39 +706,6 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
 	/* bind the tlock and the page */
 	tlck->ip = ip;
 	tlck->mp = mp;
-	if (dir_xtree)
-		jfs_ip->xtlid = lid;
-	else
-		mp->lid = lid;
-
-	/*
-	 * enqueue transaction lock to transaction/inode
-	 */
-	/* insert the tlock at tail of transaction tlock list */
-	if (tid) {
-		tblk = tid_to_tblock(tid);
-		if (tblk->next)
-			lid_to_tlock(tblk->last)->next = lid;
-		else
-			tblk->next = lid;
-		tlck->next = 0;
-		tblk->last = lid;
-	}
-	/* anonymous transaction:
-	 * insert the tlock at head of inode anonymous tlock list
-	 */
-	else {
-		tlck->next = jfs_ip->atlhead;
-		jfs_ip->atlhead = lid;
-		if (tlck->next == 0) {
-			/* This inode's first anonymous transaction */
-			jfs_ip->atltail = lid;
-			TXN_LOCK();
-			list_add_tail(&jfs_ip->anon_inode_list,
-				      &TxAnchor.anon_list);
-			TXN_UNLOCK();
-		}
-	}
 
 	/* initialize type dependent area for linelock */
 	linelock = (struct linelock *) & tlck->lock;
@@ -807,6 +755,67 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
 		jfs_err("UFO tlock:0x%p", tlck);
 	}
 
+	/*
+	 * Publish the tlock (set mp->lid or jfs_ip->xtlid) and enqueue it
+	 * while still holding TXN_LOCK.  A concurrent txLock() on the same
+	 * metapage must see the non-zero lid here; otherwise it will take
+	 * the allocateLock path and allocate a duplicate tlock, corrupting
+	 * the transaction journal.
+	 */
+	if (dir_xtree)
+		jfs_ip->xtlid = lid;
+	else
+		mp->lid = lid;
+
+	/*
+	 * enqueue transaction lock to transaction/inode
+	 */
+	/* insert the tlock at tail of transaction tlock list */
+	if (tid) {
+		tblk = tid_to_tblock(tid);
+		if (tblk->next)
+			lid_to_tlock(tblk->last)->next = lid;
+		else
+			tblk->next = lid;
+		tlck->next = 0;
+		tblk->last = lid;
+	}
+	/* anonymous transaction:
+	 * insert the tlock at head of inode anonymous tlock list
+	 */
+	else {
+		tlck->next = jfs_ip->atlhead;
+		jfs_ip->atlhead = lid;
+		if (tlck->next == 0) {
+			/* This inode's first anonymous transaction */
+			jfs_ip->atltail = lid;
+			list_add_tail(&jfs_ip->anon_inode_list,
+				      &TxAnchor.anon_list);
+		}
+	}
+
+	TXN_UNLOCK();
+
+	/*
+	 * metapage_nohomeok() may block (folio_lock, folio_wait_writeback),
+	 * so it must be called after dropping TXN_LOCK.  The tlock is fully
+	 * initialized and visible at this point.
+	 */
+	if (mp->xflag & COMMIT_PAGE) {
+		metapage_nohomeok(mp);
+
+		jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
+			 mp, mp->nohomeok, tid, tlck);
+
+		/* if anonymous transaction, and buffer is on the group
+		 * commit synclist, mark inode to show this.  This will
+		 * prevent the buffer from being marked nohomeok for too
+		 * long a time.
+		 */
+		if ((tid == 0) && mp->lsn)
+			set_cflag(COMMIT_Synclist, ip);
+	}
+
 	/*
 	 * update tlock vector
 	 */
-- 
2.34.1