[PATCH v3 3/4] xfs: avoid dereferencing log items after push callbacks

Yuto Ohnuki posted 4 patches 1 month ago
There is a newer version of this series
[PATCH v3 3/4] xfs: avoid dereferencing log items after push callbacks
Posted by Yuto Ohnuki 1 month ago
After xfsaild_push_item() calls iop_push(), the log item may have been
freed if the AIL lock was dropped during the push. The tracepoints in
the switch statement dereference the log item after iop_push() returns,
which can result in a use-after-free.

Fix this by capturing the log item type, flags, and LSN before calling
xfsaild_push_item(), and introducing a new xfs_ail_push_class trace
event class that takes these pre-captured values and the ailp pointer
instead of the log item pointer.

Reported-by: syzbot+652af2b3c5569c4ab63c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=652af2b3c5569c4ab63c
Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary")
Cc: <stable@vger.kernel.org> # v5.9
Signed-off-by: Yuto Ohnuki <ytohnuki@amazon.com>
---
 fs/xfs/xfs_trace.h     | 36 ++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_trans_ail.c | 24 ++++++++++++++++--------
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 813e5a9f57eb..0e994b3f768f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -56,6 +56,7 @@
 #include <linux/tracepoint.h>
 
 struct xfs_agf;
+struct xfs_ail;
 struct xfs_alloc_arg;
 struct xfs_attr_list_context;
 struct xfs_buf_log_item;
@@ -1650,16 +1651,43 @@ TRACE_EVENT(xfs_log_force,
 DEFINE_EVENT(xfs_log_item_class, name, \
 	TP_PROTO(struct xfs_log_item *lip), \
 	TP_ARGS(lip))
-DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
-DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
-DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
-DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
 DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
 DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
 DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
 DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort);
 DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort);
 
+DECLARE_EVENT_CLASS(xfs_ail_push_class,
+	TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn),
+	TP_ARGS(ailp, type, flags, lsn),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(uint, type)
+		__field(unsigned long, flags)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = ailp->ail_log->l_mp->m_super->s_dev;
+		__entry->type = type;
+		__entry->flags = flags;
+		__entry->lsn = lsn;
+	),
+	TP_printk("dev %d:%d lsn %d/%d type %s flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_AIL_PUSH_EVENT(name) \
+DEFINE_EVENT(xfs_ail_push_class, name, \
+	TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn), \
+	TP_ARGS(ailp, type, flags, lsn))
+DEFINE_AIL_PUSH_EVENT(xfs_ail_push);
+DEFINE_AIL_PUSH_EVENT(xfs_ail_pinned);
+DEFINE_AIL_PUSH_EVENT(xfs_ail_locked);
+DEFINE_AIL_PUSH_EVENT(xfs_ail_flushing);
+
 DECLARE_EVENT_CLASS(xfs_ail_class,
 	TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
 	TP_ARGS(lip, old_lsn, new_lsn),
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index ac747804e1d6..14ffb77b12ea 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -365,6 +365,12 @@ xfsaild_resubmit_item(
 	return XFS_ITEM_SUCCESS;
 }
 
+/*
+ * Push a single log item from the AIL.
+ *
+ * @lip may have been released and freed by the time this function returns,
+ * so callers must not dereference the log item afterwards.
+ */
 static inline uint
 xfsaild_push_item(
 	struct xfs_ail		*ailp,
@@ -462,11 +468,13 @@ static void
 xfsaild_process_logitem(
 	struct xfs_ail		*ailp,
 	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn,
 	int			*stuck,
 	int			*flushing)
 {
 	struct xfs_mount	*mp = ailp->ail_log->l_mp;
+	uint			type = lip->li_type;
+	unsigned long		flags = lip->li_flags;
+	xfs_lsn_t		item_lsn = lip->li_lsn;
 	int			lock_result;
 
 	/*
@@ -478,9 +486,9 @@ xfsaild_process_logitem(
 	switch (lock_result) {
 	case XFS_ITEM_SUCCESS:
 		XFS_STATS_INC(mp, xs_push_ail_success);
-		trace_xfs_ail_push(lip);
+		trace_xfs_ail_push(ailp, type, flags, item_lsn);
 
-		ailp->ail_last_pushed_lsn = lsn;
+		ailp->ail_last_pushed_lsn = item_lsn;
 		break;
 
 	case XFS_ITEM_FLUSHING:
@@ -496,22 +504,22 @@ xfsaild_process_logitem(
 		 * AIL is being flushed.
 		 */
 		XFS_STATS_INC(mp, xs_push_ail_flushing);
-		trace_xfs_ail_flushing(lip);
+		trace_xfs_ail_flushing(ailp, type, flags, item_lsn);
 
 		(*flushing)++;
-		ailp->ail_last_pushed_lsn = lsn;
+		ailp->ail_last_pushed_lsn = item_lsn;
 		break;
 
 	case XFS_ITEM_PINNED:
 		XFS_STATS_INC(mp, xs_push_ail_pinned);
-		trace_xfs_ail_pinned(lip);
+		trace_xfs_ail_pinned(ailp, type, flags, item_lsn);
 
 		(*stuck)++;
 		ailp->ail_log_flush++;
 		break;
 	case XFS_ITEM_LOCKED:
 		XFS_STATS_INC(mp, xs_push_ail_locked);
-		trace_xfs_ail_locked(lip);
+		trace_xfs_ail_locked(ailp, type, flags, item_lsn);
 
 		(*stuck)++;
 		break;
@@ -572,7 +580,7 @@ xfsaild_push(
 		if (test_bit(XFS_LI_FLUSHING, &lip->li_flags))
 			goto next_item;
 
-		xfsaild_process_logitem(ailp, lip, lsn, &stuck, &flushing);
+		xfsaild_process_logitem(ailp, lip, &stuck, &flushing);
 		count++;
 
 		/*
-- 
2.50.1




Amazon Web Services EMEA SARL, 38 avenue John F. Kennedy, L-1855 Luxembourg, R.C.S. Luxembourg B186284

Amazon Web Services EMEA SARL, Irish Branch, One Burlington Plaza, Burlington Road, Dublin 4, Ireland, branch registration number 908705
Re: [PATCH v3 3/4] xfs: avoid dereferencing log items after push callbacks
Posted by Dave Chinner 1 month ago
On Sun, Mar 08, 2026 at 06:28:08PM +0000, Yuto Ohnuki wrote:
> After xfsaild_push_item() calls iop_push(), the log item may have been
> freed if the AIL lock was dropped during the push. The tracepoints in
> the switch statement dereference the log item after iop_push() returns,
> which can result in a use-after-free.
> 
> Fix this by capturing the log item type, flags, and LSN before calling
> xfsaild_push_item(), and introducing a new xfs_ail_push_class trace
> event class that takes these pre-captured values and the ailp pointer
> instead of the log item pointer.
> 
> Reported-by: syzbot+652af2b3c5569c4ab63c@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=652af2b3c5569c4ab63c
> Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary")
> Cc: <stable@vger.kernel.org> # v5.9
> Signed-off-by: Yuto Ohnuki <ytohnuki@amazon.com>

Reviewed-by: Dave Chinner <dchinner@redhat.com>
-- 
Dave Chinner
dgc@kernel.org
Re: [PATCH v3 3/4] xfs: avoid dereferencing log items after push callbacks
Posted by Yuto Ohnuki 4 weeks, 1 day ago
> > After xfsaild_push_item() calls iop_push(), the log item may have been
> > freed if the AIL lock was dropped during the push. The tracepoints in
> > the switch statement dereference the log item after iop_push() returns,
> > which can result in a use-after-free.
> > 
> > Fix this by capturing the log item type, flags, and LSN before calling
> > xfsaild_push_item(), and introducing a new xfs_ail_push_class trace
> > event class that takes these pre-captured values and the ailp pointer
> > instead of the log item pointer.
> > 
> > Reported-by: syzbot+652af2b3c5569c4ab63c@syzkaller.appspotmail.com
> > Closes: https://syzkaller.appspot.com/bug?extid=652af2b3c5569c4ab63c
> > Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary")
> > Cc: <stable@vger.kernel.org> # v5.9
> > Signed-off-by: Yuto Ohnuki <ytohnuki@amazon.com>
> 
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> -- 
> Dave Chinner
> dgc@kernel.org

Thanks for the review, Dave.

In v4, I reworked the patch ordering so that the bugfix patches come
before the refactoring.

Since the context has changed, I've dropped your Reviewed-by from
this patch in v4 just to be safe. I would appreciate another look
when you get a chance.

Yuto



Amazon Web Services EMEA SARL, 38 avenue John F. Kennedy, L-1855 Luxembourg, R.C.S. Luxembourg B186284

Amazon Web Services EMEA SARL, Irish Branch, One Burlington Plaza, Burlington Road, Dublin 4, Ireland, branch registration number 908705
Re: [PATCH v3 3/4] xfs: avoid dereferencing log items after push callbacks
Posted by Darrick J. Wong 1 month ago
On Sun, Mar 08, 2026 at 06:28:08PM +0000, Yuto Ohnuki wrote:
> After xfsaild_push_item() calls iop_push(), the log item may have been
> freed if the AIL lock was dropped during the push. The tracepoints in
> the switch statement dereference the log item after iop_push() returns,
> which can result in a use-after-free.

How difficult would it be to add a refcount to xfs_log_item so that any
other code walking through the AIL's log item list can't accidentally
suffer from this UAF?  I keep seeing periodic log item UAF bugfixes on
the list, which (to me anyway) suggests we ought to think about a
struct(ural) fix to this problem.

I /think/ the answer to that is "sort of nasty" because of things like
xfs_dquot embedding its own log item.  The other log item types might
not be so bad because at least they're allocated separately.  However,
refcount_t accesses also aren't free.

> Fix this by capturing the log item type, flags, and LSN before calling
> xfsaild_push_item(), and introducing a new xfs_ail_push_class trace
> event class that takes these pre-captured values and the ailp pointer
> instead of the log item pointer.
> 
> Reported-by: syzbot+652af2b3c5569c4ab63c@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=652af2b3c5569c4ab63c
> Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary")
> Cc: <stable@vger.kernel.org> # v5.9
> Signed-off-by: Yuto Ohnuki <ytohnuki@amazon.com>
> ---
>  fs/xfs/xfs_trace.h     | 36 ++++++++++++++++++++++++++++++++----
>  fs/xfs/xfs_trans_ail.c | 24 ++++++++++++++++--------
>  2 files changed, 48 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 813e5a9f57eb..0e994b3f768f 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -56,6 +56,7 @@
>  #include <linux/tracepoint.h>
>  
>  struct xfs_agf;
> +struct xfs_ail;
>  struct xfs_alloc_arg;
>  struct xfs_attr_list_context;
>  struct xfs_buf_log_item;
> @@ -1650,16 +1651,43 @@ TRACE_EVENT(xfs_log_force,
>  DEFINE_EVENT(xfs_log_item_class, name, \
>  	TP_PROTO(struct xfs_log_item *lip), \
>  	TP_ARGS(lip))
> -DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
> -DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
> -DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
> -DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
>  DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
>  DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
>  DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
>  DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort);
>  DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort);
>  
> +DECLARE_EVENT_CLASS(xfs_ail_push_class,
> +	TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn),
> +	TP_ARGS(ailp, type, flags, lsn),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(uint, type)
> +		__field(unsigned long, flags)
> +		__field(xfs_lsn_t, lsn)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = ailp->ail_log->l_mp->m_super->s_dev;
> +		__entry->type = type;
> +		__entry->flags = flags;
> +		__entry->lsn = lsn;
> +	),
> +	TP_printk("dev %d:%d lsn %d/%d type %s flags %s",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
> +		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
> +		  __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
> +)
> +
> +#define DEFINE_AIL_PUSH_EVENT(name) \
> +DEFINE_EVENT(xfs_ail_push_class, name, \
> +	TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn), \
> +	TP_ARGS(ailp, type, flags, lsn))
> +DEFINE_AIL_PUSH_EVENT(xfs_ail_push);
> +DEFINE_AIL_PUSH_EVENT(xfs_ail_pinned);
> +DEFINE_AIL_PUSH_EVENT(xfs_ail_locked);
> +DEFINE_AIL_PUSH_EVENT(xfs_ail_flushing);
> +
>  DECLARE_EVENT_CLASS(xfs_ail_class,
>  	TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
>  	TP_ARGS(lip, old_lsn, new_lsn),
> diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
> index ac747804e1d6..14ffb77b12ea 100644
> --- a/fs/xfs/xfs_trans_ail.c
> +++ b/fs/xfs/xfs_trans_ail.c
> @@ -365,6 +365,12 @@ xfsaild_resubmit_item(
>  	return XFS_ITEM_SUCCESS;
>  }
>  
> +/*
> + * Push a single log item from the AIL.
> + *
> + * @lip may have been released and freed by the time this function returns,
> + * so callers must not dereference the log item afterwards.

This is true after the xfsaild_push_item call, correct?  If so then I
think the comment for the call needs updating too:

	/*
	 * Note that iop_push may unlock and reacquire the AIL lock.  We
	 * rely on the AIL cursor implementation to be able to deal with
	 * the dropped lock.
	 *
	 * The log item may have been freed by the push, so it must not
	 * be accessed or dereferenced below this line.
	 */
	lock_result = xfsaild_push_item(ailp, lip);

Otherwise this looks ok to me.

--D

> + */
>  static inline uint
>  xfsaild_push_item(
>  	struct xfs_ail		*ailp,
> @@ -462,11 +468,13 @@ static void
>  xfsaild_process_logitem(
>  	struct xfs_ail		*ailp,
>  	struct xfs_log_item	*lip,
> -	xfs_lsn_t		lsn,
>  	int			*stuck,
>  	int			*flushing)
>  {
>  	struct xfs_mount	*mp = ailp->ail_log->l_mp;
> +	uint			type = lip->li_type;
> +	unsigned long		flags = lip->li_flags;
> +	xfs_lsn_t		item_lsn = lip->li_lsn;
>  	int			lock_result;
>  
>  	/*
> @@ -478,9 +486,9 @@ xfsaild_process_logitem(
>  	switch (lock_result) {
>  	case XFS_ITEM_SUCCESS:
>  		XFS_STATS_INC(mp, xs_push_ail_success);
> -		trace_xfs_ail_push(lip);
> +		trace_xfs_ail_push(ailp, type, flags, item_lsn);
>  
> -		ailp->ail_last_pushed_lsn = lsn;
> +		ailp->ail_last_pushed_lsn = item_lsn;
>  		break;
>  
>  	case XFS_ITEM_FLUSHING:
> @@ -496,22 +504,22 @@ xfsaild_process_logitem(
>  		 * AIL is being flushed.
>  		 */
>  		XFS_STATS_INC(mp, xs_push_ail_flushing);
> -		trace_xfs_ail_flushing(lip);
> +		trace_xfs_ail_flushing(ailp, type, flags, item_lsn);
>  
>  		(*flushing)++;
> -		ailp->ail_last_pushed_lsn = lsn;
> +		ailp->ail_last_pushed_lsn = item_lsn;
>  		break;
>  
>  	case XFS_ITEM_PINNED:
>  		XFS_STATS_INC(mp, xs_push_ail_pinned);
> -		trace_xfs_ail_pinned(lip);
> +		trace_xfs_ail_pinned(ailp, type, flags, item_lsn);
>  
>  		(*stuck)++;
>  		ailp->ail_log_flush++;
>  		break;
>  	case XFS_ITEM_LOCKED:
>  		XFS_STATS_INC(mp, xs_push_ail_locked);
> -		trace_xfs_ail_locked(lip);
> +		trace_xfs_ail_locked(ailp, type, flags, item_lsn);
>  
>  		(*stuck)++;
>  		break;
> @@ -572,7 +580,7 @@ xfsaild_push(
>  		if (test_bit(XFS_LI_FLUSHING, &lip->li_flags))
>  			goto next_item;
>  
> -		xfsaild_process_logitem(ailp, lip, lsn, &stuck, &flushing);
> +		xfsaild_process_logitem(ailp, lip, &stuck, &flushing);
>  		count++;
>  
>  		/*
> -- 
> 2.50.1
> 
> 
> 
> 
> Amazon Web Services EMEA SARL, 38 avenue John F. Kennedy, L-1855 Luxembourg, R.C.S. Luxembourg B186284
> 
> Amazon Web Services EMEA SARL, Irish Branch, One Burlington Plaza, Burlington Road, Dublin 4, Ireland, branch registration number 908705
> 
> 
> 
>
Re: [PATCH v3 3/4] xfs: avoid dereferencing log items after push callbacks
Posted by Yuto Ohnuki 4 weeks, 1 day ago
> How difficult would it be to add a refcount to xfs_log_item so that any
> other code walking through the AIL's log item list can't accidentally
> suffer from this UAF?  I keep seeing periodic log item UAF bugfixes on
> the list, which (to me anyway) suggests we ought to think about a
> struct(ural) fix to this problem.
> 
> I /think/ the answer to that is "sort of nasty" because of things like
> xfs_dquot embedding its own log item.  The other log item types might
> not be so bad because at least they're allocated separately.  However,
> refcount_t accesses also aren't free.

Agreed that a structural fix would be the right long-term approach.
As you noted, the dquot embedding makes it non-trivial. I'd like to
keep this series focused on the immediate syzbot fix and explore a
refcount-based approach as a separate effort.

> This is true after the xfsaild_push_item call, correct?  If so then I
> think the comment for the call needs updating too:
> 
>       /*
>        * Note that iop_push may unlock and reacquire the AIL lock.  We
>        * rely on the AIL cursor implementation to be able to deal with
>        * the dropped lock.
>        *
>        * The log item may have been freed by the push, so it must not
>        * be accessed or dereferenced below this line.
>        */
>       lock_result = xfsaild_push_item(ailp, lip);
> 
> Otherwise this looks ok to me.
> 
> --D

Thank you. In v4, I have added the comments you suggested.

Yuto



Amazon Web Services EMEA SARL, 38 avenue John F. Kennedy, L-1855 Luxembourg, R.C.S. Luxembourg B186284

Amazon Web Services EMEA SARL, Irish Branch, One Burlington Plaza, Burlington Road, Dublin 4, Ireland, branch registration number 908705
Re: [PATCH v3 3/4] xfs: avoid dereferencing log items after push callbacks
Posted by Dave Chinner 1 month ago
On Mon, Mar 09, 2026 at 09:27:10AM -0700, Darrick J. Wong wrote:
> On Sun, Mar 08, 2026 at 06:28:08PM +0000, Yuto Ohnuki wrote:
> > After xfsaild_push_item() calls iop_push(), the log item may have been
> > freed if the AIL lock was dropped during the push. The tracepoints in
> > the switch statement dereference the log item after iop_push() returns,
> > which can result in a use-after-free.
> 
> How difficult would it be to add a refcount to xfs_log_item so that any
> other code walking through the AIL's log item list can't accidentally
> suffer from this UAF?  I keep seeing periodic log item UAF bugfixes on
> the list, which (to me anyway) suggests we ought to think about a
> struct(ural) fix to this problem.
> 
> I /think/ the answer to that is "sort of nasty" because of things like
> xfs_dquot embedding its own log item.  The other log item types might
> not be so bad because at least they're allocated separately.  However,
> refcount_t accesses also aren't free.

It's nasty for many reasons. The biggest one is that the log item
isn't valid as a stand-alone object. Once the owner item has been
freed, any attempt to use the log item requires dereferencing back
to the owner object, and now we have a different set of UAF
problems.

For example, we can't leave log items in the AIL after freeing the
owner object because we have to write the owner object to disk to
remove the log item from the AIL. The log item has to be removed
from the AIL before we free the high level item the log item belongs
to.

Hence the life time of a log item must always be a subset of the
owner object. That is where log item reference counting becomes an
issue - for it to work the log item has to hold a reference to the
owner object.

We already have log items that do this: the BLI is one example.

However, other UAF issues on log items come from using reference
counts and then needing references and (potentially) locks on the
owner object. Those complexities end up causing - you guessed it -
UAF problems...

For example: the BLI keeps a reference count for all accesses to the
BLI *except* for the AIL reference, because the AIL can't keep
active references to dirty high level objects.

For example: releasing the last reference to some high level objects
(e.g. inodes) can result in them being journalled, and hence the
journalling subsystem now has to be able to track and process those
dirty high level items without holding an active reference to them.

For example: The BLI reference count/buffer locking model is where all
the complexity in freeing metadata extents (stale buffers) comes from.
At transaction completion, the transaction reference to the BLI and
the buffer lock is transferred to the CIL (the journal) and is only
then released on completion of the journal IO. This is how we
prevent a buffer from being reused whilst the transaction freeing
underlying storage is in flight - the buffer needs to remain locked
until the freeing transaction(s) is stable in the journal. This
complexity is where all the UAF in the BLI unpinning operations come
from.

Normally, the transaction reference and buffer lock are released
when the transaction context is torn down after the commit
completes. The problems with UAFs in this BLI code come from the
fact that stale, pinned buffers have been transferred to the CIL and
the transaction no longer owns the BLI reference...

And then, of course, is the fact that the AIL cannot rely on log
items with referenced owner objects.  Hence the high level items
tracked in the AIL are, at times, tracking otherwise unreferenced
items.

IOWs, we have problems with UAF w.r.t. buffers and BLIs because of
the mess of the BLI reference counting model. And we have problems
with ILI/inode life times because the ILI does not take references
to the inode and it is assumed it is never freed until the inode
itself is torn down. And neither buffers, inodes, BLIs nor ILIs are
reference counted when they are on the AIL.

The impact of this is two-fold:

1. it requires high level object reclaim to be aware of dirty items
and to be able to skip over them; and
2. unmount requires explicit AIL pushing because the AIL
might be the only remaining subsystem that tracks the unreferenced
object that we need to reclaim before unmount can progress. This is
especially true for shutdown filesystems.

Ideally xfs_reclaim_inode() would not be trying to abort dirty
inodes on shutdown. Historically speaking, this functionality has
been necessary because there were times without other mechanisms to
abort and clean dirty, unreferenced inodes and this would result in
unmount on shutdown filesystems hanging.

I suspect those times are long since passed - all dirty inodes are
tracked in the journal and unmount pushes all dirty objects - so
maybe the lesson here is that we could be carrying historic code
that worked around shutdown bugs that no longer occur and so we no
longer need...

So, yeah, I agree that it would be great to untangle all this
mess, but my experience with trying to untangle it over the years
is that the can of worms it opens gets all tangled up in the
ball of string I'm trying to untangle....

-Dave.
-- 
Dave Chinner
dgc@kernel.org