[PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()

Xin Yin posted 1 patch 1 month, 3 weeks ago
fs/notify/mark.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
[PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
Posted by Xin Yin 1 month, 3 weeks ago
fsnotify_recalc_mask() fails to handle the return value of
__fsnotify_recalc_mask(), which may return an inode pointer that needs
to be released via fsnotify_drop_object() when the connector's HAS_IREF
flag transitions from set to cleared.

This manifests as a hung task with the following call trace:

  INFO: task umount:1234 blocked for more than 120 seconds.
  Call Trace:
   __schedule
   schedule
   fsnotify_sb_delete
   generic_shutdown_super
   kill_anon_super
   cleanup_mnt
   task_work_run
   do_exit
   do_group_exit

The race window that triggers the iref leak:

  Thread A (adding mark)              Thread B (removing mark)
  ──────────────────────              ────────────────────────
  fsnotify_add_mark_locked():
    fsnotify_add_mark_list():
      spin_lock(conn->lock)
      add mark_B(evictable) to list
      spin_unlock(conn->lock)
    return

    /* ---- gap: no lock held ---- */

                                      fsnotify_detach_mark(mark_A):
                                        spin_lock(mark_A->lock)
                                        clear ATTACHED flag on mark_A
                                        spin_unlock(mark_A->lock)

    fsnotify_recalc_mask():
      spin_lock(conn->lock)
      __fsnotify_recalc_mask():
        /* mark_A skipped: ATTACHED cleared */
        /* only mark_B(evictable) remains */
        want_iref = false
        has_iref = true  /* not yet cleared */
        -> HAS_IREF transitions true -> false
        -> returns inode pointer
      spin_unlock(conn->lock)
      /* BUG: return value discarded!
       * iput() and fsnotify_put_sb_watched_objects()
       * are never called */

Fix this by capturing the return value of __fsnotify_recalc_mask() and
passing it to fsnotify_drop_object() after releasing the spinlock, which
is the same pattern used in fsnotify_put_mark().

Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
---
 fs/notify/mark.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
 	fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
 }
 
+static void fsnotify_drop_object(unsigned int type, void *objp);
+
 /*
  * Calculate mask of events for a list of marks. The caller must make sure
  * connector and connector->obj cannot disappear under us.  Callers achieve
@@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
 void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
 {
 	bool update_children;
+	unsigned int type;
+	void *objp;
 
 	if (!conn)
 		return;
 
 	spin_lock(&conn->lock);
 	update_children = !fsnotify_conn_watches_children(conn);
-	__fsnotify_recalc_mask(conn);
+	objp = __fsnotify_recalc_mask(conn);
+	type = conn->type;
 	update_children &= fsnotify_conn_watches_children(conn);
 	spin_unlock(&conn->lock);
+	fsnotify_drop_object(type, objp);
 	/*
 	 * Set children's PARENT_WATCHED flags only if parent started watching.
 	 * When parent stops watching, we clear false positive PARENT_WATCHED
-- 
2.20.1
Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
Posted by Amir Goldstein 1 month, 3 weeks ago
On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
>
> fsnotify_recalc_mask() fails to handle the return value of
> __fsnotify_recalc_mask(), which may return an inode pointer that needs
> to be released via fsnotify_drop_object() when the connector's HAS_IREF
> flag transitions from set to cleared.
>
> This manifests as a hung task with the following call trace:
>
>   INFO: task umount:1234 blocked for more than 120 seconds.
>   Call Trace:
>    __schedule
>    schedule
>    fsnotify_sb_delete
>    generic_shutdown_super
>    kill_anon_super
>    cleanup_mnt
>    task_work_run
>    do_exit
>    do_group_exit
>
> The race window that triggers the iref leak:
>
>   Thread A (adding mark)              Thread B (removing mark)
>   ──────────────────────              ────────────────────────
>   fsnotify_add_mark_locked():
>     fsnotify_add_mark_list():
>       spin_lock(conn->lock)
>       add mark_B(evictable) to list
>       spin_unlock(conn->lock)
>     return
>
>     /* ---- gap: no lock held ---- */
>
>                                       fsnotify_detach_mark(mark_A):
>                                         spin_lock(mark_A->lock)
>                                         clear ATTACHED flag on mark_A
>                                         spin_unlock(mark_A->lock)
>
>     fsnotify_recalc_mask():
>       spin_lock(conn->lock)
>       __fsnotify_recalc_mask():
>         /* mark_A skipped: ATTACHED cleared */
>         /* only mark_B(evictable) remains */
>         want_iref = false
>         has_iref = true  /* not yet cleared */
>         -> HAS_IREF transitions true -> false
>         -> returns inode pointer
>       spin_unlock(conn->lock)
>       /* BUG: return value discarded!
>        * iput() and fsnotify_put_sb_watched_objects()
>        * are never called */
>

Hi Xin!

Ouch! nasty one.
Very good analysis.
Do you also have a reproducer?


> Fix this by capturing the return value of __fsnotify_recalc_mask() and
> passing it to fsnotify_drop_object() after releasing the spinlock, which
> is the same pattern used in fsnotify_put_mark().

The fix looks correct, but I prefer for my mental model to keep
fsnotify_drop_object() in one place at the end of fsnotify_put_mark().

Please see the suggested (only sanity tested) patch.

Thanks,
Amir.


>
> Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> ---
>  fs/notify/mark.c | 8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
>         fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
>  }
>
> +static void fsnotify_drop_object(unsigned int type, void *objp);
> +
>  /*
>   * Calculate mask of events for a list of marks. The caller must make sure
>   * connector and connector->obj cannot disappear under us.  Callers achieve
> @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
>  void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  {
>         bool update_children;
> +       unsigned int type;
> +       void *objp;
>
>         if (!conn)
>                 return;
>
>         spin_lock(&conn->lock);
>         update_children = !fsnotify_conn_watches_children(conn);
> -       __fsnotify_recalc_mask(conn);
> +       objp = __fsnotify_recalc_mask(conn);
> +       type = conn->type;
>         update_children &= fsnotify_conn_watches_children(conn);
>         spin_unlock(&conn->lock);
> +       fsnotify_drop_object(type, objp);
>         /*
>          * Set children's PARENT_WATCHED flags only if parent started watching.
>          * When parent stops watching, we clear false positive PARENT_WATCHED
> --
> 2.20.1
Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
Posted by 尹欣 1 month, 3 weeks ago
> From: "Amir Goldstein"<amir73il@gmail.com>
> Date:  Mon, Apr 20, 2026, 21:34
> Subject:  Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
> To: "Xin Yin"<yinxin.x@bytedance.com>
> Cc: <jan@kara.cz>, <linux-fsdevel@vger.kernel.org>, <linux-kernel@vger.kernel.org>
> On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
> >
> > fsnotify_recalc_mask() fails to handle the return value of
> > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > flag transitions from set to cleared.
> >
> > This manifests as a hung task with the following call trace:
> >
> >   INFO: task umount:1234 blocked for more than 120 seconds.
> >   Call Trace:
> >    __schedule
> >    schedule
> >    fsnotify_sb_delete
> >    generic_shutdown_super
> >    kill_anon_super
> >    cleanup_mnt
> >    task_work_run
> >    do_exit
> >    do_group_exit
> >
> > The race window that triggers the iref leak:
> >
> >   Thread A (adding mark)              Thread B (removing mark)
> >   ──────────────────────              ────────────────────────
> >   fsnotify_add_mark_locked():
> >     fsnotify_add_mark_list():
> >       spin_lock(conn->lock)
> >       add mark_B(evictable) to list
> >       spin_unlock(conn->lock)
> >     return
> >
> >     /* ---- gap: no lock held ---- */
> >
> >                                       fsnotify_detach_mark(mark_A):
> >                                         spin_lock(mark_A->lock)
> >                                         clear ATTACHED flag on mark_A
> >                                         spin_unlock(mark_A->lock)
> >
> >     fsnotify_recalc_mask():
> >       spin_lock(conn->lock)
> >       __fsnotify_recalc_mask():
> >         /* mark_A skipped: ATTACHED cleared */
> >         /* only mark_B(evictable) remains */
> >         want_iref = false
> >         has_iref = true  /* not yet cleared */
> >         -> HAS_IREF transitions true -> false
> >         -> returns inode pointer
> >       spin_unlock(conn->lock)
> >       /* BUG: return value discarded!
> >        * iput() and fsnotify_put_sb_watched_objects()
> >        * are never called */
> >
> 
> Hi Xin!
> 
> Ouch! nasty one.
> Very good analysis.
> Do you also have a reproducer?

Yes, I have a reproducer attached. 
In my testing it reproduces reliably - typically within the first round.
./repro_nomod 200 16 3

Thanks,
Xin.
> 
> 
> > Fix this by capturing the return value of __fsnotify_recalc_mask() and
> > passing it to fsnotify_drop_object() after releasing the spinlock, which
> > is the same pattern used in fsnotify_put_mark().
> 
> The fix looks correct, but I prefer for my mental model to keep
> fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
> 
> Please see the suggested (only sanity tested) patch.
> 
> Thanks,
> Amir.
> 
> 
> >
> > Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> > Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> > ---
> >  fs/notify/mark.c | 8 +++++++-
> >  1 file changed, 7 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> > index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> > --- a/fs/notify/mark.c
> > +++ b/fs/notify/mark.c
> > @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
> >         fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
> >  }
> >
> > +static void fsnotify_drop_object(unsigned int type, void *objp);
> > +
> >  /*
> >   * Calculate mask of events for a list of marks. The caller must make sure
> >   * connector and connector->obj cannot disappear under us.  Callers achieve
> > @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
> >  void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> >  {
> >         bool update_children;
> > +       unsigned int type;
> > +       void *objp;
> >
> >         if (!conn)
> >                 return;
> >
> >         spin_lock(&conn->lock);
> >         update_children = !fsnotify_conn_watches_children(conn);
> > -       __fsnotify_recalc_mask(conn);
> > +       objp = __fsnotify_recalc_mask(conn);
> > +       type = conn->type;
> >         update_children &= fsnotify_conn_watches_children(conn);
> >         spin_unlock(&conn->lock);
> > +       fsnotify_drop_object(type, objp);
> >         /*
> >          * Set children's PARENT_WATCHED flags only if parent started watching.
> >          * When parent stops watching, we clear false positive PARENT_WATCHED
> > --
> > 2.20.1
> 
/*
 * Reproducer for fsnotify_recalc_mask() iref leak (NO kernel modification).
 *
 * Bug: fsnotify_recalc_mask() discards the return value of
 * __fsnotify_recalc_mask(), leaking an ihold() reference when HAS_IREF
 * transitions true->false. This prevents s_fsnotify_connectors from
 * reaching 0, blocking umount (fsnotify_sb_delete) forever.
 *
 * Strategy: Pure brute-force race without kernel delay injection.
 *
 * Thread group A (many threads): Rapidly add then remove a regular
 *   (non-evictable) fanotify mark on a file. The removal goes through
 *   fsnotify_put_mark -> fsnotify_detach_mark (clears ATTACHED flag).
 *
 * Thread group B (many threads): Rapidly add evictable marks on the
 *   same file. Addition calls fsnotify_recalc_mask (the buggy path).
 *   If thread A just cleared ATTACHED on the regular mark, recalc sees
 *   only evictable marks -> HAS_IREF transitions false -> inode leaked.
 *
 * Detection: After racing, attempt umount with a timeout. If umount
 *   hangs, an iref was leaked -> bug reproduced. We preserve the scene
 *   (no MNT_DETACH) so the hung task remains for dump analysis.
 *
 * Compile: gcc -static -O2 -pthread -o repro_nomod repro_nomod.c
 * Run:     ./repro_nomod [rounds] [threads] [duration_sec]
 *          Default: 100 rounds, 16 threads per group, 3 seconds each
 *
 * Needs: root, kernel with CONFIG_FANOTIFY=y
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <errno.h>
#include <signal.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/fanotify.h>
#include <sys/syscall.h>
#include <sched.h>

#ifndef FAN_MARK_EVICTABLE
#define FAN_MARK_EVICTABLE 0x00000200
#endif

static volatile int stop_flag = 0;
static const char *testfile;
static int num_threads = 16;

/* Bind thread to a specific CPU for cache locality */
static void pin_cpu(int cpu)
{
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu % sysconf(_SC_NPROCESSORS_ONLN), &set);
    sched_setaffinity(0, sizeof(set), &set);
}

/*
 * Thread A: add and remove regular (non-evictable) marks as fast as possible.
 * The key is the REMOVAL: fsnotify_detach_mark() clears ATTACHED flag,
 * which is what thread B's recalc needs to see.
 */
static void *thread_regular_mark(void *arg)
{
    int id = (int)(long)arg;
    int iter = 0;

    pin_cpu(id * 2);  /* even CPUs */

    while (!stop_flag) {
        int fan_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
        if (fan_fd < 0)
            continue;

        /* Add regular mark -> sets HAS_IREF on connector */
        fanotify_mark(fan_fd, FAN_MARK_ADD,
                      FAN_ACCESS | FAN_MODIFY,
                      AT_FDCWD, testfile);

        /*
         * NO delay here - close immediately to trigger removal ASAP.
         * close(fan_fd) -> fsnotify_put_mark -> fsnotify_detach_mark
         * which clears ATTACHED flag on the mark.
         */
        close(fan_fd);
        iter++;
    }
    return NULL;
}

/*
 * Thread B: add evictable marks as fast as possible.
 * fsnotify_add_mark -> fsnotify_recalc_mask (buggy path).
 * If it races with thread A's removal, recalc sees no regular marks
 * -> HAS_IREF transitions false -> returns inode -> LEAKED.
 */
static void *thread_evictable_mark(void *arg)
{
    int id = (int)(long)arg;
    int iter = 0;

    pin_cpu(id * 2 + 1);  /* odd CPUs */

    while (!stop_flag) {
        int fan_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
        if (fan_fd < 0)
            continue;

        /* Add evictable mark -> triggers fsnotify_recalc_mask */
        fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_EVICTABLE,
                      FAN_ACCESS,
                      AT_FDCWD, testfile);

        close(fan_fd);
        iter++;
    }
    return NULL;
}

/* Alarm handler for umount timeout detection */
static volatile int umount_timed_out = 0;
static void alarm_handler(int sig) { umount_timed_out = 1; }

int main(int argc, char *argv[])
{
    int rounds = 100;
    int duration_sec = 3;
    int umount_timeout = 15;  /* seconds to wait for umount */
    char filepath[256];
    char mntpoint[256];

    if (argc > 1) rounds = atoi(argv[1]);
    if (argc > 2) num_threads = atoi(argv[2]);
    if (argc > 3) duration_sec = atoi(argv[3]);

    printf("=== fsnotify iref leak reproducer (no kernel mod) ===\n");
    printf("Rounds: %d, Threads/group: %d, Duration/round: %ds\n",
           rounds, num_threads, duration_sec);
    printf("Umount timeout: %ds (hang = bug reproduced)\n\n", umount_timeout);

    signal(SIGALRM, alarm_handler);

    for (int round = 1; round <= rounds; round++) {
        /* Use unique mountpoint per round to avoid leftover issues */
        snprintf(mntpoint, sizeof(mntpoint), "/tmp/repro_nomod_%d", round);
        mkdir(mntpoint, 0755);

        if (mount("tmpfs", mntpoint, "tmpfs", 0, "size=4M") != 0) {
            umount2(mntpoint, MNT_DETACH);
            usleep(100000);
            if (mount("tmpfs", mntpoint, "tmpfs", 0, "size=4M") != 0) {
                perror("mount");
                continue;
            }
        }

        snprintf(filepath, sizeof(filepath), "%s/target", mntpoint);
        testfile = filepath;
        int fd = open(testfile, O_CREAT | O_WRONLY, 0644);
        if (fd < 0) { perror("open"); umount2(mntpoint, MNT_DETACH); continue; }
        write(fd, "x", 1);
        close(fd);

        /* Start racing threads */
        stop_flag = 0;
        pthread_t *ta = calloc(num_threads, sizeof(pthread_t));
        pthread_t *tb = calloc(num_threads, sizeof(pthread_t));

        for (int i = 0; i < num_threads; i++) {
            pthread_create(&ta[i], NULL, thread_regular_mark, (void *)(long)i);
            pthread_create(&tb[i], NULL, thread_evictable_mark, (void *)(long)i);
        }

        sleep(duration_sec);
        stop_flag = 1;

        for (int i = 0; i < num_threads; i++) {
            pthread_join(ta[i], NULL);
            pthread_join(tb[i], NULL);
        }
        free(ta);
        free(tb);

        /* Remove the test file so umount has fewer obstacles */
        unlink(filepath);

        /*
         * Attempt umount with timeout.
         * If iref leaked, fsnotify_sb_delete will block in wait_var_event
         * waiting for s_fsnotify_connectors to reach 0.
         */
        printf("[Round %d/%d] umount %s ... ", round, rounds, mntpoint);
        fflush(stdout);

        umount_timed_out = 0;
        alarm(umount_timeout);

        pid_t pid = fork();
        if (pid == 0) {
            /* Child: try umount */
            int ret = umount2(mntpoint, 0);
            _exit(ret == 0 ? 0 : 1);
        }

        int status;
        pid_t w = waitpid(pid, &status, 0);
        alarm(0);  /* cancel alarm */

        if (umount_timed_out || w <= 0) {
            printf("HUNG! iref leak detected! Preserving scene.\n");
            printf("\n");
            printf("========================================\n");
            printf("BUG REPRODUCED at round %d!\n", round);
            printf("umount is stuck in fsnotify_sb_delete.\n");
            printf("========================================\n");
            printf("\n");
            printf("Scene preserved. Do NOT force umount.\n");
            printf("Run: dmesg | grep -i hung\n");
            printf("To dump: kworkspace dump master_upstream\n");
            printf("Mount point: %s\n", mntpoint);
            fflush(stdout);
            /* Exit but leave everything intact for analysis */
            /* The child process is stuck in umount - leave it */
            exit(0);
        }

        if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
            printf("OK\n");
        } else {
            printf("failed (ret=%d), force detach\n",
                   WIFEXITED(status) ? WEXITSTATUS(status) : -1);
            umount2(mntpoint, MNT_DETACH);
        }
    }

    printf("\n=== All %d rounds complete without hang. ===\n", rounds);
    printf("Bug was NOT reproduced. The race window is very narrow without\n");
    printf("kernel delay injection. Try increasing rounds/threads/duration.\n");
    return 1;
}
Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
Posted by Amir Goldstein 1 month, 3 weeks ago
[fix Jan's cc]

On Mon, Apr 20, 2026 at 03:33:30PM +0200, Amir Goldstein wrote:
> On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
> >
> > fsnotify_recalc_mask() fails to handle the return value of
> > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > flag transitions from set to cleared.
> >
> > This manifests as a hung task with the following call trace:
> >
> >   INFO: task umount:1234 blocked for more than 120 seconds.
> >   Call Trace:
> >    __schedule
> >    schedule
> >    fsnotify_sb_delete
> >    generic_shutdown_super
> >    kill_anon_super
> >    cleanup_mnt
> >    task_work_run
> >    do_exit
> >    do_group_exit
> >
> > The race window that triggers the iref leak:
> >
> >   Thread A (adding mark)              Thread B (removing mark)
> >   ──────────────────────              ────────────────────────
> >   fsnotify_add_mark_locked():
> >     fsnotify_add_mark_list():
> >       spin_lock(conn->lock)
> >       add mark_B(evictable) to list
> >       spin_unlock(conn->lock)
> >     return
> >
> >     /* ---- gap: no lock held ---- */
> >
> >                                       fsnotify_detach_mark(mark_A):
> >                                         spin_lock(mark_A->lock)
> >                                         clear ATTACHED flag on mark_A
> >                                         spin_unlock(mark_A->lock)
> >
> >     fsnotify_recalc_mask():
> >       spin_lock(conn->lock)
> >       __fsnotify_recalc_mask():
> >         /* mark_A skipped: ATTACHED cleared */
> >         /* only mark_B(evictable) remains */
> >         want_iref = false
> >         has_iref = true  /* not yet cleared */
> >         -> HAS_IREF transitions true -> false
> >         -> returns inode pointer
> >       spin_unlock(conn->lock)
> >       /* BUG: return value discarded!
> >        * iput() and fsnotify_put_sb_watched_objects()
> >        * are never called */
> >
> 
> Hi Xin!
> 
> Ouch! nasty one.
> Very good analysis.
> Do you also have a reproducer?
> 
> 
> > Fix this by capturing the return value of __fsnotify_recalc_mask() and
> > passing it to fsnotify_drop_object() after releasing the spinlock, which
> > is the same pattern used in fsnotify_put_mark().
> 
> The fix looks correct, but I prefer for my mental model to keep
> fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
> 
> Please see the suggested (only sanity tested) patch.
> 
> Thanks,
> Amir.
> 
> 
> >
> > Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> > Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> > ---
> >  fs/notify/mark.c | 8 +++++++-
> >  1 file changed, 7 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> > index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> > --- a/fs/notify/mark.c
> > +++ b/fs/notify/mark.c
> > @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
> >         fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
> >  }
> >
> > +static void fsnotify_drop_object(unsigned int type, void *objp);
> > +
> >  /*
> >   * Calculate mask of events for a list of marks. The caller must make sure
> >   * connector and connector->obj cannot disappear under us.  Callers achieve
> > @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
> >  void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> >  {
> >         bool update_children;
> > +       unsigned int type;
> > +       void *objp;
> >
> >         if (!conn)
> >                 return;
> >
> >         spin_lock(&conn->lock);
> >         update_children = !fsnotify_conn_watches_children(conn);
> > -       __fsnotify_recalc_mask(conn);
> > +       objp = __fsnotify_recalc_mask(conn);
> > +       type = conn->type;
> >         update_children &= fsnotify_conn_watches_children(conn);
> >         spin_unlock(&conn->lock);
> > +       fsnotify_drop_object(type, objp);
> >         /*
> >          * Set children's PARENT_WATCHED flags only if parent started watching.
> >          * When parent stops watching, we clear false positive PARENT_WATCHED
> > --
> > 2.20.1

> From 8cc3e7e2e75f5c93875bffce6bf8398d94398c33 Mon Sep 17 00:00:00 2001
> From: Amir Goldstein <amir73il@gmail.com>
> Date: Mon, 20 Apr 2026 14:58:00 +0200
> Subject: [PATCH v2] fsnotify: fix inode reference leak in
>  fsnotify_recalc_mask()
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> fsnotify_recalc_mask() fails to handle the return value of
> __fsnotify_recalc_mask(), which may return an inode pointer that needs
> to be released via fsnotify_drop_object() when the connector's HAS_IREF
> flag transitions from set to cleared.
> 
> This manifests as a hung task with the following call trace:
> 
>   INFO: task umount:1234 blocked for more than 120 seconds.
>   Call Trace:
>    __schedule
>    schedule
>    fsnotify_sb_delete
>    generic_shutdown_super
>    kill_anon_super
>    cleanup_mnt
>    task_work_run
>    do_exit
>    do_group_exit
> 
> The race window that triggers the iref leak:
> 
>   Thread A (adding mark)              Thread B (removing mark)
>   ──────────────────────              ────────────────────────
>   fsnotify_add_mark_locked():
>     fsnotify_add_mark_list():
>       spin_lock(conn->lock)
>       add mark_B(evictable) to list
>       spin_unlock(conn->lock)
>     return
> 
>     /* ---- gap: no lock held ---- */
> 
>                                       fsnotify_detach_mark(mark_A):
>                                         spin_lock(mark_A->lock)
>                                         clear ATTACHED flag on mark_A
>                                         spin_unlock(mark_A->lock)
>                                         fsnotify_put_mark(mark_A)
> 
>     fsnotify_recalc_mask():
>       spin_lock(conn->lock)
>       __fsnotify_recalc_mask():
>         /* mark_A skipped: ATTACHED cleared */
>         /* only mark_B(evictable) remains */
>         want_iref = false
>         has_iref = true  /* not yet cleared */
>         -> HAS_IREF transitions true -> false
>         -> returns inode pointer
>       spin_unlock(conn->lock)
>       /* BUG: return value discarded!
>        * iput() and fsnotify_put_sb_watched_objects()
>        * are never called */
> 
> Fix this by deferring the transition true -> false of HAS_IREF flag from
> fsnotify_recalc_mask() (Thread A) to fsnotify_put_mark() (thread B).
> 
> Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  fs/notify/mark.c | 39 ++++++++++++++++++++++++++++++++++++---
>  1 file changed, 36 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 622f05977f86a..e256b420100dc 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -238,7 +238,12 @@ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
>  	return inode;
>  }
>  
> -static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> +/*
> + * Calculate mask of events for a list of marks.
> + *
> + * Return true if any of the attached marks want to hold an inode reference.
> + */
> +static bool __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  {
>  	u32 new_mask = 0;
>  	bool want_iref = false;
> @@ -262,6 +267,34 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  	 */
>  	WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);
>  
> +	return want_iref;
> +}
> +
> +/*
> + * Calculate mask of events for a list of marks after attach/modify mark
> + * and get an inode reference for the connector if needed.
> + *
> + * A concurrent add of evictable mark and detach of non-evictable mark can
> + * lead to __fsnotify_recalc_mask() returning false want_iref, but in this
> + * case we defer clearing iref to fsnotify_recalc_mask_clear_iref() called
> + * from fsnotify_put_mark().
> + */
> +static void fsnotify_recalc_mask_set_iref(struct fsnotify_mark_connector *conn)
> +{
> +	bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
> +	bool want_iref = __fsnotify_recalc_mask(conn) || has_iref;
> +
> +	(void) fsnotify_update_iref(conn, want_iref);
> +}
> +
> +/*
> + * Calculate mask of events for a list of marks after detach mark
> + * and return the inode object if its reference is no longer needed.
> + */
> +static void *fsnotify_recalc_mask_clear_iref(struct fsnotify_mark_connector *conn)
> +{
> +	bool want_iref = __fsnotify_recalc_mask(conn);
> +
>  	return fsnotify_update_iref(conn, want_iref);
>  }
>  
> @@ -298,7 +331,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  
>  	spin_lock(&conn->lock);
>  	update_children = !fsnotify_conn_watches_children(conn);
> -	__fsnotify_recalc_mask(conn);
> +	fsnotify_recalc_mask_set_iref(conn);
>  	update_children &= fsnotify_conn_watches_children(conn);
>  	spin_unlock(&conn->lock);
>  	/*
> @@ -419,7 +452,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
>  		/* Update watched objects after detaching mark */
>  		if (sb)
>  			fsnotify_update_sb_watchers(sb, conn);
> -		objp = __fsnotify_recalc_mask(conn);
> +		objp = fsnotify_recalc_mask_clear_iref(conn);
>  		type = conn->type;
>  	}
>  	WRITE_ONCE(mark->connector, NULL);
> -- 
> 2.53.0
> 

Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
Posted by Jan Kara 1 month, 3 weeks ago
On Mon 20-04-26 16:00:51, Amir Goldstein wrote:
> [fix Jan's cc]
> 
> On Mon, Apr 20, 2026 at 03:33:30PM +0200, Amir Goldstein wrote:
> > On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
> > >
> > > fsnotify_recalc_mask() fails to handle the return value of
> > > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > > flag transitions from set to cleared.
> > >
> > > This manifests as a hung task with the following call trace:
> > >
> > >   INFO: task umount:1234 blocked for more than 120 seconds.
> > >   Call Trace:
> > >    __schedule
> > >    schedule
> > >    fsnotify_sb_delete
> > >    generic_shutdown_super
> > >    kill_anon_super
> > >    cleanup_mnt
> > >    task_work_run
> > >    do_exit
> > >    do_group_exit
> > >
> > > The race window that triggers the iref leak:
> > >
> > >   Thread A (adding mark)              Thread B (removing mark)
> > >   ──────────────────────              ────────────────────────
> > >   fsnotify_add_mark_locked():
> > >     fsnotify_add_mark_list():
> > >       spin_lock(conn->lock)
> > >       add mark_B(evictable) to list
> > >       spin_unlock(conn->lock)
> > >     return
> > >
> > >     /* ---- gap: no lock held ---- */
> > >
> > >                                       fsnotify_detach_mark(mark_A):
> > >                                         spin_lock(mark_A->lock)
> > >                                         clear ATTACHED flag on mark_A
> > >                                         spin_unlock(mark_A->lock)
> > >
> > >     fsnotify_recalc_mask():
> > >       spin_lock(conn->lock)
> > >       __fsnotify_recalc_mask():
> > >         /* mark_A skipped: ATTACHED cleared */
> > >         /* only mark_B(evictable) remains */
> > >         want_iref = false
> > >         has_iref = true  /* not yet cleared */
> > >         -> HAS_IREF transitions true -> false
> > >         -> returns inode pointer
> > >       spin_unlock(conn->lock)
> > >       /* BUG: return value discarded!
> > >        * iput() and fsnotify_put_sb_watched_objects()
> > >        * are never called */
> > >
> > 
> > Hi Xin!
> > 
> > Ouch! nasty one.
> > Very good analysis.
> > Do you also have a reproducer?
> > 
> > 
> > > Fix this by capturing the return value of __fsnotify_recalc_mask() and
> > > passing it to fsnotify_drop_object() after releasing the spinlock, which
> > > is the same pattern used in fsnotify_put_mark().
> > 
> > The fix looks correct, but I prefer for my mental model to keep
> > fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
> > 
> > Please see the suggested (only sanity tested) patch.

Thanks for the fix guys. Frankly, I don't care much between Amir's and
original version. I guess Amir's version keeps iput() constrained to a
single location so it might be somewhat safer going forward so I'll merge
that. It reminds me I should dust off my patches to remove inode references
from fsnotify code altogether. That will remove these games as well.

								Honza


> > From 8cc3e7e2e75f5c93875bffce6bf8398d94398c33 Mon Sep 17 00:00:00 2001
> > From: Amir Goldstein <amir73il@gmail.com>
> > Date: Mon, 20 Apr 2026 14:58:00 +0200
> > Subject: [PATCH v2] fsnotify: fix inode reference leak in
> >  fsnotify_recalc_mask()
> > MIME-Version: 1.0
> > Content-Type: text/plain; charset=UTF-8
> > Content-Transfer-Encoding: 8bit
> > 
> > fsnotify_recalc_mask() fails to handle the return value of
> > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > flag transitions from set to cleared.
> > 
> > This manifests as a hung task with the following call trace:
> > 
> >   INFO: task umount:1234 blocked for more than 120 seconds.
> >   Call Trace:
> >    __schedule
> >    schedule
> >    fsnotify_sb_delete
> >    generic_shutdown_super
> >    kill_anon_super
> >    cleanup_mnt
> >    task_work_run
> >    do_exit
> >    do_group_exit
> > 
> > The race window that triggers the iref leak:
> > 
> >   Thread A (adding mark)              Thread B (removing mark)
> >   ──────────────────────              ────────────────────────
> >   fsnotify_add_mark_locked():
> >     fsnotify_add_mark_list():
> >       spin_lock(conn->lock)
> >       add mark_B(evictable) to list
> >       spin_unlock(conn->lock)
> >     return
> > 
> >     /* ---- gap: no lock held ---- */
> > 
> >                                       fsnotify_detach_mark(mark_A):
> >                                         spin_lock(mark_A->lock)
> >                                         clear ATTACHED flag on mark_A
> >                                         spin_unlock(mark_A->lock)
> >                                         fsnotify_put_mark(mark_A)
> > 
> >     fsnotify_recalc_mask():
> >       spin_lock(conn->lock)
> >       __fsnotify_recalc_mask():
> >         /* mark_A skipped: ATTACHED cleared */
> >         /* only mark_B(evictable) remains */
> >         want_iref = false
> >         has_iref = true  /* not yet cleared */
> >         -> HAS_IREF transitions true -> false
> >         -> returns inode pointer
> >       spin_unlock(conn->lock)
> >       /* BUG: return value discarded!
> >        * iput() and fsnotify_put_sb_watched_objects()
> >        * are never called */
> > 
> > Fix this by deferring the transition true -> false of HAS_IREF flag from
> > fsnotify_recalc_mask() (Thread A) to fsnotify_put_mark() (thread B).
> > 
> > Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> > Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> >  fs/notify/mark.c | 39 ++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 36 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> > index 622f05977f86a..e256b420100dc 100644
> > --- a/fs/notify/mark.c
> > +++ b/fs/notify/mark.c
> > @@ -238,7 +238,12 @@ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
> >  	return inode;
> >  }
> >  
> > -static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> > +/*
> > + * Calculate mask of events for a list of marks.
> > + *
> > + * Return true if any of the attached marks want to hold an inode reference.
> > + */
> > +static bool __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> >  {
> >  	u32 new_mask = 0;
> >  	bool want_iref = false;
> > @@ -262,6 +267,34 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> >  	 */
> >  	WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);
> >  
> > +	return want_iref;
> > +}
> > +
> > +/*
> > + * Calculate mask of events for a list of marks after attach/modify mark
> > + * and get an inode reference for the connector if needed.
> > + *
> > + * A concurrent add of evictable mark and detach of non-evictable mark can
> > + * lead to __fsnotify_recalc_mask() returning false want_iref, but in this
> > + * case we defer clearing iref to fsnotify_recalc_mask_clear_iref() called
> > + * from fsnotify_put_mark().
> > + */
> > +static void fsnotify_recalc_mask_set_iref(struct fsnotify_mark_connector *conn)
> > +{
> > +	bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
> > +	bool want_iref = __fsnotify_recalc_mask(conn) || has_iref;
> > +
> > +	(void) fsnotify_update_iref(conn, want_iref);
> > +}
> > +
> > +/*
> > + * Calculate mask of events for a list of marks after detach mark
> > + * and return the inode object if its reference is no longer needed.
> > + */
> > +static void *fsnotify_recalc_mask_clear_iref(struct fsnotify_mark_connector *conn)
> > +{
> > +	bool want_iref = __fsnotify_recalc_mask(conn);
> > +
> >  	return fsnotify_update_iref(conn, want_iref);
> >  }
> >  
> > @@ -298,7 +331,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> >  
> >  	spin_lock(&conn->lock);
> >  	update_children = !fsnotify_conn_watches_children(conn);
> > -	__fsnotify_recalc_mask(conn);
> > +	fsnotify_recalc_mask_set_iref(conn);
> >  	update_children &= fsnotify_conn_watches_children(conn);
> >  	spin_unlock(&conn->lock);
> >  	/*
> > @@ -419,7 +452,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
> >  		/* Update watched objects after detaching mark */
> >  		if (sb)
> >  			fsnotify_update_sb_watchers(sb, conn);
> > -		objp = __fsnotify_recalc_mask(conn);
> > +		objp = fsnotify_recalc_mask_clear_iref(conn);
> >  		type = conn->type;
> >  	}
> >  	WRITE_ONCE(mark->connector, NULL);
> > -- 
> > 2.53.0
> > 
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
Posted by 尹欣 1 month, 3 weeks ago
Cc  Jan
Sorry, I used the wrong email address.

Thanks,
Xin Yin

> From: "Xin Yin"<yinxin.x@bytedance.com>
> Date:  Mon, Apr 20, 2026, 14:51
> Subject:  [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
> To: <jan@kara.cz>, <amir73il@gmail.com>
> Cc: <linux-fsdevel@vger.kernel.org>, <linux-kernel@vger.kernel.org>, "Xin Yin"<yinxin.x@bytedance.com>
> fsnotify_recalc_mask() fails to handle the return value of
> __fsnotify_recalc_mask(), which may return an inode pointer that needs
> to be released via fsnotify_drop_object() when the connector's HAS_IREF
> flag transitions from set to cleared.
> 
> This manifests as a hung task with the following call trace:
> 
>   INFO: task umount:1234 blocked for more than 120 seconds.
>   Call Trace:
>    __schedule
>    schedule
>    fsnotify_sb_delete
>    generic_shutdown_super
>    kill_anon_super
>    cleanup_mnt
>    task_work_run
>    do_exit
>    do_group_exit
> 
> The race window that triggers the iref leak:
> 
>   Thread A (adding mark)              Thread B (removing mark)
>   ──────────────────────              ────────────────────────
>   fsnotify_add_mark_locked():
>     fsnotify_add_mark_list():
>       spin_lock(conn->lock)
>       add mark_B(evictable) to list
>       spin_unlock(conn->lock)
>     return
> 
>     /* ---- gap: no lock held ---- */
> 
>                                       fsnotify_detach_mark(mark_A):
>                                         spin_lock(mark_A->lock)
>                                         clear ATTACHED flag on mark_A
>                                         spin_unlock(mark_A->lock)
> 
>     fsnotify_recalc_mask():
>       spin_lock(conn->lock)
>       __fsnotify_recalc_mask():
>         /* mark_A skipped: ATTACHED cleared */
>         /* only mark_B(evictable) remains */
>         want_iref = false
>         has_iref = true  /* not yet cleared */
>         -> HAS_IREF transitions true -> false
>         -> returns inode pointer
>       spin_unlock(conn->lock)
>       /* BUG: return value discarded!
>        * iput() and fsnotify_put_sb_watched_objects()
>        * are never called */
> 
> Fix this by capturing the return value of __fsnotify_recalc_mask() and
> passing it to fsnotify_drop_object() after releasing the spinlock, which
> is the same pattern used in fsnotify_put_mark().
> 
> Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> ---
>  fs/notify/mark.c | 8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
>          fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
>  }
>  
> +static void fsnotify_drop_object(unsigned int type, void *objp);
> +
>  /*
>   * Calculate mask of events for a list of marks. The caller must make sure
>   * connector and connector->obj cannot disappear under us.  Callers achieve
> @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
>  void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  {
>          bool update_children;
> +        unsigned int type;
> +        void *objp;
>  
>          if (!conn)
>                  return;
>  
>          spin_lock(&conn->lock);
>          update_children = !fsnotify_conn_watches_children(conn);
> -        __fsnotify_recalc_mask(conn);
> +        objp = __fsnotify_recalc_mask(conn);
> +        type = conn->type;
>          update_children &= fsnotify_conn_watches_children(conn);
>          spin_unlock(&conn->lock);
> +        fsnotify_drop_object(type, objp);
>          /*
>           * Set children's PARENT_WATCHED flags only if parent started watching.
>           * When parent stops watching, we clear false positive PARENT_WATCHED
> -- 
> 2.20.1
>