fs/notify/mark.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
fsnotify_recalc_mask() fails to handle the return value of
__fsnotify_recalc_mask(), which may return an inode pointer that needs
to be released via fsnotify_drop_object() when the connector's HAS_IREF
flag transitions from set to cleared.
This manifests as a hung task with the following call trace:
INFO: task umount:1234 blocked for more than 120 seconds.
Call Trace:
__schedule
schedule
fsnotify_sb_delete
generic_shutdown_super
kill_anon_super
cleanup_mnt
task_work_run
do_exit
do_group_exit
The race window that triggers the iref leak:
Thread A (adding mark) Thread B (removing mark)
────────────────────── ────────────────────────
fsnotify_add_mark_locked():
fsnotify_add_mark_list():
spin_lock(conn->lock)
add mark_B(evictable) to list
spin_unlock(conn->lock)
return
/* ---- gap: no lock held ---- */
fsnotify_detach_mark(mark_A):
spin_lock(mark_A->lock)
clear ATTACHED flag on mark_A
spin_unlock(mark_A->lock)
fsnotify_recalc_mask():
spin_lock(conn->lock)
__fsnotify_recalc_mask():
/* mark_A skipped: ATTACHED cleared */
/* only mark_B(evictable) remains */
want_iref = false
has_iref = true /* not yet cleared */
-> HAS_IREF transitions true -> false
-> returns inode pointer
spin_unlock(conn->lock)
/* BUG: return value discarded!
* iput() and fsnotify_put_sb_watched_objects()
* are never called */
Fix this by capturing the return value of __fsnotify_recalc_mask() and
passing it to fsnotify_drop_object() after releasing the spinlock, which
is the same pattern used in fsnotify_put_mark().
Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
---
fs/notify/mark.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
}
+static void fsnotify_drop_object(unsigned int type, void *objp);
+
/*
* Calculate mask of events for a list of marks. The caller must make sure
* connector and connector->obj cannot disappear under us. Callers achieve
@@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
bool update_children;
+ unsigned int type;
+ void *objp;
if (!conn)
return;
spin_lock(&conn->lock);
update_children = !fsnotify_conn_watches_children(conn);
- __fsnotify_recalc_mask(conn);
+ objp = __fsnotify_recalc_mask(conn);
+ type = conn->type;
update_children &= fsnotify_conn_watches_children(conn);
spin_unlock(&conn->lock);
+ fsnotify_drop_object(type, objp);
/*
* Set children's PARENT_WATCHED flags only if parent started watching.
* When parent stops watching, we clear false positive PARENT_WATCHED
--
2.20.1
On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
>
> fsnotify_recalc_mask() fails to handle the return value of
> __fsnotify_recalc_mask(), which may return an inode pointer that needs
> to be released via fsnotify_drop_object() when the connector's HAS_IREF
> flag transitions from set to cleared.
>
> This manifests as a hung task with the following call trace:
>
> INFO: task umount:1234 blocked for more than 120 seconds.
> Call Trace:
> __schedule
> schedule
> fsnotify_sb_delete
> generic_shutdown_super
> kill_anon_super
> cleanup_mnt
> task_work_run
> do_exit
> do_group_exit
>
> The race window that triggers the iref leak:
>
> Thread A (adding mark) Thread B (removing mark)
> ────────────────────── ────────────────────────
> fsnotify_add_mark_locked():
> fsnotify_add_mark_list():
> spin_lock(conn->lock)
> add mark_B(evictable) to list
> spin_unlock(conn->lock)
> return
>
> /* ---- gap: no lock held ---- */
>
> fsnotify_detach_mark(mark_A):
> spin_lock(mark_A->lock)
> clear ATTACHED flag on mark_A
> spin_unlock(mark_A->lock)
>
> fsnotify_recalc_mask():
> spin_lock(conn->lock)
> __fsnotify_recalc_mask():
> /* mark_A skipped: ATTACHED cleared */
> /* only mark_B(evictable) remains */
> want_iref = false
> has_iref = true /* not yet cleared */
> -> HAS_IREF transitions true -> false
> -> returns inode pointer
> spin_unlock(conn->lock)
> /* BUG: return value discarded!
> * iput() and fsnotify_put_sb_watched_objects()
> * are never called */
>
Hi Xin!
Ouch! nasty one.
Very good analysis.
Do you also have a reproducer?
> Fix this by capturing the return value of __fsnotify_recalc_mask() and
> passing it to fsnotify_drop_object() after releasing the spinlock, which
> is the same pattern used in fsnotify_put_mark().
The fix looks correct, but I prefer for my mental model to keep
fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
Please see the suggested (only sanity tested) patch.
Thanks,
Amir.
>
> Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> ---
> fs/notify/mark.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
> fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
> }
>
> +static void fsnotify_drop_object(unsigned int type, void *objp);
> +
> /*
> * Calculate mask of events for a list of marks. The caller must make sure
> * connector and connector->obj cannot disappear under us. Callers achieve
> @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
> void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> {
> bool update_children;
> + unsigned int type;
> + void *objp;
>
> if (!conn)
> return;
>
> spin_lock(&conn->lock);
> update_children = !fsnotify_conn_watches_children(conn);
> - __fsnotify_recalc_mask(conn);
> + objp = __fsnotify_recalc_mask(conn);
> + type = conn->type;
> update_children &= fsnotify_conn_watches_children(conn);
> spin_unlock(&conn->lock);
> + fsnotify_drop_object(type, objp);
> /*
> * Set children's PARENT_WATCHED flags only if parent started watching.
> * When parent stops watching, we clear false positive PARENT_WATCHED
> --
> 2.20.1
> From: "Amir Goldstein"<amir73il@gmail.com>
> Date: Mon, Apr 20, 2026, 21:34
> Subject: Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
> To: "Xin Yin"<yinxin.x@bytedance.com>
> Cc: <jan@kara.cz>, <linux-fsdevel@vger.kernel.org>, <linux-kernel@vger.kernel.org>
> On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
> >
> > fsnotify_recalc_mask() fails to handle the return value of
> > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > flag transitions from set to cleared.
> >
> > This manifests as a hung task with the following call trace:
> >
> > INFO: task umount:1234 blocked for more than 120 seconds.
> > Call Trace:
> > __schedule
> > schedule
> > fsnotify_sb_delete
> > generic_shutdown_super
> > kill_anon_super
> > cleanup_mnt
> > task_work_run
> > do_exit
> > do_group_exit
> >
> > The race window that triggers the iref leak:
> >
> > Thread A (adding mark) Thread B (removing mark)
> > ────────────────────── ────────────────────────
> > fsnotify_add_mark_locked():
> > fsnotify_add_mark_list():
> > spin_lock(conn->lock)
> > add mark_B(evictable) to list
> > spin_unlock(conn->lock)
> > return
> >
> > /* ---- gap: no lock held ---- */
> >
> > fsnotify_detach_mark(mark_A):
> > spin_lock(mark_A->lock)
> > clear ATTACHED flag on mark_A
> > spin_unlock(mark_A->lock)
> >
> > fsnotify_recalc_mask():
> > spin_lock(conn->lock)
> > __fsnotify_recalc_mask():
> > /* mark_A skipped: ATTACHED cleared */
> > /* only mark_B(evictable) remains */
> > want_iref = false
> > has_iref = true /* not yet cleared */
> > -> HAS_IREF transitions true -> false
> > -> returns inode pointer
> > spin_unlock(conn->lock)
> > /* BUG: return value discarded!
> > * iput() and fsnotify_put_sb_watched_objects()
> > * are never called */
> >
>
> Hi Xin!
>
> Ouch! nasty one.
> Very good analysis.
> Do you also have a reproducer?
Yes, I have a reproducer attached.
In my testing it reproduces reliably - typically within the first round.
./repro_nomod 200 16 3
Thanks,
Xin.
>
>
> > Fix this by capturing the return value of __fsnotify_recalc_mask() and
> > passing it to fsnotify_drop_object() after releasing the spinlock, which
> > is the same pattern used in fsnotify_put_mark().
>
> The fix looks correct, but I prefer for my mental model to keep
> fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
>
> Please see the suggested (only sanity tested) patch.
>
> Thanks,
> Amir.
>
>
> >
> > Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> > Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> > ---
> > fs/notify/mark.c | 8 +++++++-
> > 1 file changed, 7 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> > index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> > --- a/fs/notify/mark.c
> > +++ b/fs/notify/mark.c
> > @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
> > fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
> > }
> >
> > +static void fsnotify_drop_object(unsigned int type, void *objp);
> > +
> > /*
> > * Calculate mask of events for a list of marks. The caller must make sure
> > * connector and connector->obj cannot disappear under us. Callers achieve
> > @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
> > void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> > {
> > bool update_children;
> > + unsigned int type;
> > + void *objp;
> >
> > if (!conn)
> > return;
> >
> > spin_lock(&conn->lock);
> > update_children = !fsnotify_conn_watches_children(conn);
> > - __fsnotify_recalc_mask(conn);
> > + objp = __fsnotify_recalc_mask(conn);
> > + type = conn->type;
> > update_children &= fsnotify_conn_watches_children(conn);
> > spin_unlock(&conn->lock);
> > + fsnotify_drop_object(type, objp);
> > /*
> > * Set children's PARENT_WATCHED flags only if parent started watching.
> > * When parent stops watching, we clear false positive PARENT_WATCHED
> > --
> > 2.20.1
>
/*
* Reproducer for fsnotify_recalc_mask() iref leak (NO kernel modification).
*
* Bug: fsnotify_recalc_mask() discards the return value of
* __fsnotify_recalc_mask(), leaking an ihold() reference when HAS_IREF
* transitions true->false. This prevents s_fsnotify_connectors from
* reaching 0, blocking umount (fsnotify_sb_delete) forever.
*
* Strategy: Pure brute-force race without kernel delay injection.
*
* Thread group A (many threads): Rapidly add then remove a regular
* (non-evictable) fanotify mark on a file. The removal goes through
* fsnotify_put_mark -> fsnotify_detach_mark (clears ATTACHED flag).
*
* Thread group B (many threads): Rapidly add evictable marks on the
* same file. Addition calls fsnotify_recalc_mask (the buggy path).
* If thread A just cleared ATTACHED on the regular mark, recalc sees
* only evictable marks -> HAS_IREF transitions false -> inode leaked.
*
* Detection: After racing, attempt umount with a timeout. If umount
* hangs, an iref was leaked -> bug reproduced. We preserve the scene
* (no MNT_DETACH) so the hung task remains for dump analysis.
*
* Compile: gcc -static -O2 -pthread -o repro_nomod repro_nomod.c
* Run: ./repro_nomod [rounds] [threads] [duration_sec]
* Default: 100 rounds, 16 threads per group, 3 seconds each
*
* Needs: root, kernel with CONFIG_FANOTIFY=y
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <errno.h>
#include <signal.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/fanotify.h>
#include <sys/syscall.h>
#include <sched.h>
#ifndef FAN_MARK_EVICTABLE
#define FAN_MARK_EVICTABLE 0x00000200
#endif
static volatile int stop_flag = 0;
static const char *testfile;
static int num_threads = 16;
/* Bind thread to a specific CPU for cache locality */
static void pin_cpu(int cpu)
{
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(cpu % sysconf(_SC_NPROCESSORS_ONLN), &set);
sched_setaffinity(0, sizeof(set), &set);
}
/*
* Thread A: add and remove regular (non-evictable) marks as fast as possible.
* The key is the REMOVAL: fsnotify_detach_mark() clears ATTACHED flag,
* which is what thread B's recalc needs to see.
*/
static void *thread_regular_mark(void *arg)
{
int id = (int)(long)arg;
int iter = 0;
pin_cpu(id * 2); /* even CPUs */
while (!stop_flag) {
int fan_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
if (fan_fd < 0)
continue;
/* Add regular mark -> sets HAS_IREF on connector */
fanotify_mark(fan_fd, FAN_MARK_ADD,
FAN_ACCESS | FAN_MODIFY,
AT_FDCWD, testfile);
/*
* NO delay here - close immediately to trigger removal ASAP.
* close(fan_fd) -> fsnotify_put_mark -> fsnotify_detach_mark
* which clears ATTACHED flag on the mark.
*/
close(fan_fd);
iter++;
}
return NULL;
}
/*
* Thread B: add evictable marks as fast as possible.
* fsnotify_add_mark -> fsnotify_recalc_mask (buggy path).
* If it races with thread A's removal, recalc sees no regular marks
* -> HAS_IREF transitions false -> returns inode -> LEAKED.
*/
static void *thread_evictable_mark(void *arg)
{
int id = (int)(long)arg;
int iter = 0;
pin_cpu(id * 2 + 1); /* odd CPUs */
while (!stop_flag) {
int fan_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
if (fan_fd < 0)
continue;
/* Add evictable mark -> triggers fsnotify_recalc_mask */
fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_EVICTABLE,
FAN_ACCESS,
AT_FDCWD, testfile);
close(fan_fd);
iter++;
}
return NULL;
}
/* Alarm handler for umount timeout detection */
static volatile int umount_timed_out = 0;
static void alarm_handler(int sig) { umount_timed_out = 1; }
int main(int argc, char *argv[])
{
int rounds = 100;
int duration_sec = 3;
int umount_timeout = 15; /* seconds to wait for umount */
char filepath[256];
char mntpoint[256];
if (argc > 1) rounds = atoi(argv[1]);
if (argc > 2) num_threads = atoi(argv[2]);
if (argc > 3) duration_sec = atoi(argv[3]);
printf("=== fsnotify iref leak reproducer (no kernel mod) ===\n");
printf("Rounds: %d, Threads/group: %d, Duration/round: %ds\n",
rounds, num_threads, duration_sec);
printf("Umount timeout: %ds (hang = bug reproduced)\n\n", umount_timeout);
signal(SIGALRM, alarm_handler);
for (int round = 1; round <= rounds; round++) {
/* Use unique mountpoint per round to avoid leftover issues */
snprintf(mntpoint, sizeof(mntpoint), "/tmp/repro_nomod_%d", round);
mkdir(mntpoint, 0755);
if (mount("tmpfs", mntpoint, "tmpfs", 0, "size=4M") != 0) {
umount2(mntpoint, MNT_DETACH);
usleep(100000);
if (mount("tmpfs", mntpoint, "tmpfs", 0, "size=4M") != 0) {
perror("mount");
continue;
}
}
snprintf(filepath, sizeof(filepath), "%s/target", mntpoint);
testfile = filepath;
int fd = open(testfile, O_CREAT | O_WRONLY, 0644);
if (fd < 0) { perror("open"); umount2(mntpoint, MNT_DETACH); continue; }
write(fd, "x", 1);
close(fd);
/* Start racing threads */
stop_flag = 0;
pthread_t *ta = calloc(num_threads, sizeof(pthread_t));
pthread_t *tb = calloc(num_threads, sizeof(pthread_t));
for (int i = 0; i < num_threads; i++) {
pthread_create(&ta[i], NULL, thread_regular_mark, (void *)(long)i);
pthread_create(&tb[i], NULL, thread_evictable_mark, (void *)(long)i);
}
sleep(duration_sec);
stop_flag = 1;
for (int i = 0; i < num_threads; i++) {
pthread_join(ta[i], NULL);
pthread_join(tb[i], NULL);
}
free(ta);
free(tb);
/* Remove the test file so umount has fewer obstacles */
unlink(filepath);
/*
* Attempt umount with timeout.
* If iref leaked, fsnotify_sb_delete will block in wait_var_event
* waiting for s_fsnotify_connectors to reach 0.
*/
printf("[Round %d/%d] umount %s ... ", round, rounds, mntpoint);
fflush(stdout);
umount_timed_out = 0;
alarm(umount_timeout);
pid_t pid = fork();
if (pid == 0) {
/* Child: try umount */
int ret = umount2(mntpoint, 0);
_exit(ret == 0 ? 0 : 1);
}
int status;
pid_t w = waitpid(pid, &status, 0);
alarm(0); /* cancel alarm */
if (umount_timed_out || w <= 0) {
printf("HUNG! iref leak detected! Preserving scene.\n");
printf("\n");
printf("========================================\n");
printf("BUG REPRODUCED at round %d!\n", round);
printf("umount is stuck in fsnotify_sb_delete.\n");
printf("========================================\n");
printf("\n");
printf("Scene preserved. Do NOT force umount.\n");
printf("Run: dmesg | grep -i hung\n");
printf("To dump: kworkspace dump master_upstream\n");
printf("Mount point: %s\n", mntpoint);
fflush(stdout);
/* Exit but leave everything intact for analysis */
/* The child process is stuck in umount - leave it */
exit(0);
}
if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
printf("OK\n");
} else {
printf("failed (ret=%d), force detach\n",
WIFEXITED(status) ? WEXITSTATUS(status) : -1);
umount2(mntpoint, MNT_DETACH);
}
}
printf("\n=== All %d rounds complete without hang. ===\n", rounds);
printf("Bug was NOT reproduced. The race window is very narrow without\n");
printf("kernel delay injection. Try increasing rounds/threads/duration.\n");
return 1;
}
[fix Jan's cc]
On Mon, Apr 20, 2026 at 03:33:30PM +0200, Amir Goldstein wrote:
> On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
> >
> > fsnotify_recalc_mask() fails to handle the return value of
> > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > flag transitions from set to cleared.
> >
> > This manifests as a hung task with the following call trace:
> >
> > INFO: task umount:1234 blocked for more than 120 seconds.
> > Call Trace:
> > __schedule
> > schedule
> > fsnotify_sb_delete
> > generic_shutdown_super
> > kill_anon_super
> > cleanup_mnt
> > task_work_run
> > do_exit
> > do_group_exit
> >
> > The race window that triggers the iref leak:
> >
> > Thread A (adding mark) Thread B (removing mark)
> > ────────────────────── ────────────────────────
> > fsnotify_add_mark_locked():
> > fsnotify_add_mark_list():
> > spin_lock(conn->lock)
> > add mark_B(evictable) to list
> > spin_unlock(conn->lock)
> > return
> >
> > /* ---- gap: no lock held ---- */
> >
> > fsnotify_detach_mark(mark_A):
> > spin_lock(mark_A->lock)
> > clear ATTACHED flag on mark_A
> > spin_unlock(mark_A->lock)
> >
> > fsnotify_recalc_mask():
> > spin_lock(conn->lock)
> > __fsnotify_recalc_mask():
> > /* mark_A skipped: ATTACHED cleared */
> > /* only mark_B(evictable) remains */
> > want_iref = false
> > has_iref = true /* not yet cleared */
> > -> HAS_IREF transitions true -> false
> > -> returns inode pointer
> > spin_unlock(conn->lock)
> > /* BUG: return value discarded!
> > * iput() and fsnotify_put_sb_watched_objects()
> > * are never called */
> >
>
> Hi Xin!
>
> Ouch! nasty one.
> Very good analysis.
> Do you also have a reproducer?
>
>
> > Fix this by capturing the return value of __fsnotify_recalc_mask() and
> > passing it to fsnotify_drop_object() after releasing the spinlock, which
> > is the same pattern used in fsnotify_put_mark().
>
> The fix looks correct, but I prefer for my mental model to keep
> fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
>
> Please see the suggested (only sanity tested) patch.
>
> Thanks,
> Amir.
>
>
> >
> > Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> > Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> > ---
> > fs/notify/mark.c | 8 +++++++-
> > 1 file changed, 7 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> > index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> > --- a/fs/notify/mark.c
> > +++ b/fs/notify/mark.c
> > @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
> > fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
> > }
> >
> > +static void fsnotify_drop_object(unsigned int type, void *objp);
> > +
> > /*
> > * Calculate mask of events for a list of marks. The caller must make sure
> > * connector and connector->obj cannot disappear under us. Callers achieve
> > @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
> > void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> > {
> > bool update_children;
> > + unsigned int type;
> > + void *objp;
> >
> > if (!conn)
> > return;
> >
> > spin_lock(&conn->lock);
> > update_children = !fsnotify_conn_watches_children(conn);
> > - __fsnotify_recalc_mask(conn);
> > + objp = __fsnotify_recalc_mask(conn);
> > + type = conn->type;
> > update_children &= fsnotify_conn_watches_children(conn);
> > spin_unlock(&conn->lock);
> > + fsnotify_drop_object(type, objp);
> > /*
> > * Set children's PARENT_WATCHED flags only if parent started watching.
> > * When parent stops watching, we clear false positive PARENT_WATCHED
> > --
> > 2.20.1
> From 8cc3e7e2e75f5c93875bffce6bf8398d94398c33 Mon Sep 17 00:00:00 2001
> From: Amir Goldstein <amir73il@gmail.com>
> Date: Mon, 20 Apr 2026 14:58:00 +0200
> Subject: [PATCH v2] fsnotify: fix inode reference leak in
> fsnotify_recalc_mask()
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> fsnotify_recalc_mask() fails to handle the return value of
> __fsnotify_recalc_mask(), which may return an inode pointer that needs
> to be released via fsnotify_drop_object() when the connector's HAS_IREF
> flag transitions from set to cleared.
>
> This manifests as a hung task with the following call trace:
>
> INFO: task umount:1234 blocked for more than 120 seconds.
> Call Trace:
> __schedule
> schedule
> fsnotify_sb_delete
> generic_shutdown_super
> kill_anon_super
> cleanup_mnt
> task_work_run
> do_exit
> do_group_exit
>
> The race window that triggers the iref leak:
>
> Thread A (adding mark) Thread B (removing mark)
> ────────────────────── ────────────────────────
> fsnotify_add_mark_locked():
> fsnotify_add_mark_list():
> spin_lock(conn->lock)
> add mark_B(evictable) to list
> spin_unlock(conn->lock)
> return
>
> /* ---- gap: no lock held ---- */
>
> fsnotify_detach_mark(mark_A):
> spin_lock(mark_A->lock)
> clear ATTACHED flag on mark_A
> spin_unlock(mark_A->lock)
> fsnotify_put_mark(mark_A)
>
> fsnotify_recalc_mask():
> spin_lock(conn->lock)
> __fsnotify_recalc_mask():
> /* mark_A skipped: ATTACHED cleared */
> /* only mark_B(evictable) remains */
> want_iref = false
> has_iref = true /* not yet cleared */
> -> HAS_IREF transitions true -> false
> -> returns inode pointer
> spin_unlock(conn->lock)
> /* BUG: return value discarded!
> * iput() and fsnotify_put_sb_watched_objects()
> * are never called */
>
> Fix this by deferring the transition true -> false of HAS_IREF flag from
> fsnotify_recalc_mask() (Thread A) to fsnotify_put_mark() (thread B).
>
> Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
> fs/notify/mark.c | 39 ++++++++++++++++++++++++++++++++++++---
> 1 file changed, 36 insertions(+), 3 deletions(-)
>
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 622f05977f86a..e256b420100dc 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -238,7 +238,12 @@ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
> return inode;
> }
>
> -static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> +/*
> + * Calculate mask of events for a list of marks.
> + *
> + * Return true if any of the attached marks want to hold an inode reference.
> + */
> +static bool __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> {
> u32 new_mask = 0;
> bool want_iref = false;
> @@ -262,6 +267,34 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> */
> WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);
>
> + return want_iref;
> +}
> +
> +/*
> + * Calculate mask of events for a list of marks after attach/modify mark
> + * and get an inode reference for the connector if needed.
> + *
> + * A concurrent add of evictable mark and detach of non-evictable mark can
> + * lead to __fsnotify_recalc_mask() returning false want_iref, but in this
> + * case we defer clearing iref to fsnotify_recalc_mask_clear_iref() called
> + * from fsnotify_put_mark().
> + */
> +static void fsnotify_recalc_mask_set_iref(struct fsnotify_mark_connector *conn)
> +{
> + bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
> + bool want_iref = __fsnotify_recalc_mask(conn) || has_iref;
> +
> + (void) fsnotify_update_iref(conn, want_iref);
> +}
> +
> +/*
> + * Calculate mask of events for a list of marks after detach mark
> + * and return the inode object if its reference is no longer needed.
> + */
> +static void *fsnotify_recalc_mask_clear_iref(struct fsnotify_mark_connector *conn)
> +{
> + bool want_iref = __fsnotify_recalc_mask(conn);
> +
> return fsnotify_update_iref(conn, want_iref);
> }
>
> @@ -298,7 +331,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>
> spin_lock(&conn->lock);
> update_children = !fsnotify_conn_watches_children(conn);
> - __fsnotify_recalc_mask(conn);
> + fsnotify_recalc_mask_set_iref(conn);
> update_children &= fsnotify_conn_watches_children(conn);
> spin_unlock(&conn->lock);
> /*
> @@ -419,7 +452,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
> /* Update watched objects after detaching mark */
> if (sb)
> fsnotify_update_sb_watchers(sb, conn);
> - objp = __fsnotify_recalc_mask(conn);
> + objp = fsnotify_recalc_mask_clear_iref(conn);
> type = conn->type;
> }
> WRITE_ONCE(mark->connector, NULL);
> --
> 2.53.0
>
On Mon 20-04-26 16:00:51, Amir Goldstein wrote:
> [fix Jan's cc]
>
> On Mon, Apr 20, 2026 at 03:33:30PM +0200, Amir Goldstein wrote:
> > On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@bytedance.com> wrote:
> > >
> > > fsnotify_recalc_mask() fails to handle the return value of
> > > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > > flag transitions from set to cleared.
> > >
> > > This manifests as a hung task with the following call trace:
> > >
> > > INFO: task umount:1234 blocked for more than 120 seconds.
> > > Call Trace:
> > > __schedule
> > > schedule
> > > fsnotify_sb_delete
> > > generic_shutdown_super
> > > kill_anon_super
> > > cleanup_mnt
> > > task_work_run
> > > do_exit
> > > do_group_exit
> > >
> > > The race window that triggers the iref leak:
> > >
> > > Thread A (adding mark) Thread B (removing mark)
> > > ────────────────────── ────────────────────────
> > > fsnotify_add_mark_locked():
> > > fsnotify_add_mark_list():
> > > spin_lock(conn->lock)
> > > add mark_B(evictable) to list
> > > spin_unlock(conn->lock)
> > > return
> > >
> > > /* ---- gap: no lock held ---- */
> > >
> > > fsnotify_detach_mark(mark_A):
> > > spin_lock(mark_A->lock)
> > > clear ATTACHED flag on mark_A
> > > spin_unlock(mark_A->lock)
> > >
> > > fsnotify_recalc_mask():
> > > spin_lock(conn->lock)
> > > __fsnotify_recalc_mask():
> > > /* mark_A skipped: ATTACHED cleared */
> > > /* only mark_B(evictable) remains */
> > > want_iref = false
> > > has_iref = true /* not yet cleared */
> > > -> HAS_IREF transitions true -> false
> > > -> returns inode pointer
> > > spin_unlock(conn->lock)
> > > /* BUG: return value discarded!
> > > * iput() and fsnotify_put_sb_watched_objects()
> > > * are never called */
> > >
> >
> > Hi Xin!
> >
> > Ouch! nasty one.
> > Very good analysis.
> > Do you also have a reproducer?
> >
> >
> > > Fix this by capturing the return value of __fsnotify_recalc_mask() and
> > > passing it to fsnotify_drop_object() after releasing the spinlock, which
> > > is the same pattern used in fsnotify_put_mark().
> >
> > The fix looks correct, but I prefer for my mental model to keep
> > fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
> >
> > Please see the suggested (only sanity tested) patch.
Thanks for the fix guys. Frankly, I don't care much between Amir's and
original version. I guess Amir's version keeps iput() constrained to a
single location so it might be somewhat safer going forward so I'll merge
that. It reminds me I should dust off my patches to remove inode references
from fsnotify code altogether. That will remove these games as well.
Honza
> > From 8cc3e7e2e75f5c93875bffce6bf8398d94398c33 Mon Sep 17 00:00:00 2001
> > From: Amir Goldstein <amir73il@gmail.com>
> > Date: Mon, 20 Apr 2026 14:58:00 +0200
> > Subject: [PATCH v2] fsnotify: fix inode reference leak in
> > fsnotify_recalc_mask()
> > MIME-Version: 1.0
> > Content-Type: text/plain; charset=UTF-8
> > Content-Transfer-Encoding: 8bit
> >
> > fsnotify_recalc_mask() fails to handle the return value of
> > __fsnotify_recalc_mask(), which may return an inode pointer that needs
> > to be released via fsnotify_drop_object() when the connector's HAS_IREF
> > flag transitions from set to cleared.
> >
> > This manifests as a hung task with the following call trace:
> >
> > INFO: task umount:1234 blocked for more than 120 seconds.
> > Call Trace:
> > __schedule
> > schedule
> > fsnotify_sb_delete
> > generic_shutdown_super
> > kill_anon_super
> > cleanup_mnt
> > task_work_run
> > do_exit
> > do_group_exit
> >
> > The race window that triggers the iref leak:
> >
> > Thread A (adding mark) Thread B (removing mark)
> > ────────────────────── ────────────────────────
> > fsnotify_add_mark_locked():
> > fsnotify_add_mark_list():
> > spin_lock(conn->lock)
> > add mark_B(evictable) to list
> > spin_unlock(conn->lock)
> > return
> >
> > /* ---- gap: no lock held ---- */
> >
> > fsnotify_detach_mark(mark_A):
> > spin_lock(mark_A->lock)
> > clear ATTACHED flag on mark_A
> > spin_unlock(mark_A->lock)
> > fsnotify_put_mark(mark_A)
> >
> > fsnotify_recalc_mask():
> > spin_lock(conn->lock)
> > __fsnotify_recalc_mask():
> > /* mark_A skipped: ATTACHED cleared */
> > /* only mark_B(evictable) remains */
> > want_iref = false
> > has_iref = true /* not yet cleared */
> > -> HAS_IREF transitions true -> false
> > -> returns inode pointer
> > spin_unlock(conn->lock)
> > /* BUG: return value discarded!
> > * iput() and fsnotify_put_sb_watched_objects()
> > * are never called */
> >
> > Fix this by deferring the transition true -> false of HAS_IREF flag from
> > fsnotify_recalc_mask() (Thread A) to fsnotify_put_mark() (thread B).
> >
> > Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> > Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> > fs/notify/mark.c | 39 ++++++++++++++++++++++++++++++++++++---
> > 1 file changed, 36 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> > index 622f05977f86a..e256b420100dc 100644
> > --- a/fs/notify/mark.c
> > +++ b/fs/notify/mark.c
> > @@ -238,7 +238,12 @@ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
> > return inode;
> > }
> >
> > -static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> > +/*
> > + * Calculate mask of events for a list of marks.
> > + *
> > + * Return true if any of the attached marks want to hold an inode reference.
> > + */
> > +static bool __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> > {
> > u32 new_mask = 0;
> > bool want_iref = false;
> > @@ -262,6 +267,34 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> > */
> > WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);
> >
> > + return want_iref;
> > +}
> > +
> > +/*
> > + * Calculate mask of events for a list of marks after attach/modify mark
> > + * and get an inode reference for the connector if needed.
> > + *
> > + * A concurrent add of evictable mark and detach of non-evictable mark can
> > + * lead to __fsnotify_recalc_mask() returning false want_iref, but in this
> > + * case we defer clearing iref to fsnotify_recalc_mask_clear_iref() called
> > + * from fsnotify_put_mark().
> > + */
> > +static void fsnotify_recalc_mask_set_iref(struct fsnotify_mark_connector *conn)
> > +{
> > + bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
> > + bool want_iref = __fsnotify_recalc_mask(conn) || has_iref;
> > +
> > + (void) fsnotify_update_iref(conn, want_iref);
> > +}
> > +
> > +/*
> > + * Calculate mask of events for a list of marks after detach mark
> > + * and return the inode object if its reference is no longer needed.
> > + */
> > +static void *fsnotify_recalc_mask_clear_iref(struct fsnotify_mark_connector *conn)
> > +{
> > + bool want_iref = __fsnotify_recalc_mask(conn);
> > +
> > return fsnotify_update_iref(conn, want_iref);
> > }
> >
> > @@ -298,7 +331,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> >
> > spin_lock(&conn->lock);
> > update_children = !fsnotify_conn_watches_children(conn);
> > - __fsnotify_recalc_mask(conn);
> > + fsnotify_recalc_mask_set_iref(conn);
> > update_children &= fsnotify_conn_watches_children(conn);
> > spin_unlock(&conn->lock);
> > /*
> > @@ -419,7 +452,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
> > /* Update watched objects after detaching mark */
> > if (sb)
> > fsnotify_update_sb_watchers(sb, conn);
> > - objp = __fsnotify_recalc_mask(conn);
> > + objp = fsnotify_recalc_mask_clear_iref(conn);
> > type = conn->type;
> > }
> > WRITE_ONCE(mark->connector, NULL);
> > --
> > 2.53.0
> >
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
Cc Jan
Sorry, I used the wrong email address.
Thanks,
Xin Yin
> From: "Xin Yin"<yinxin.x@bytedance.com>
> Date: Mon, Apr 20, 2026, 14:51
> Subject: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
> To: <jan@kara.cz>, <amir73il@gmail.com>
> Cc: <linux-fsdevel@vger.kernel.org>, <linux-kernel@vger.kernel.org>, "Xin Yin"<yinxin.x@bytedance.com>
> fsnotify_recalc_mask() fails to handle the return value of
> __fsnotify_recalc_mask(), which may return an inode pointer that needs
> to be released via fsnotify_drop_object() when the connector's HAS_IREF
> flag transitions from set to cleared.
>
> This manifests as a hung task with the following call trace:
>
> INFO: task umount:1234 blocked for more than 120 seconds.
> Call Trace:
> __schedule
> schedule
> fsnotify_sb_delete
> generic_shutdown_super
> kill_anon_super
> cleanup_mnt
> task_work_run
> do_exit
> do_group_exit
>
> The race window that triggers the iref leak:
>
> Thread A (adding mark) Thread B (removing mark)
> ────────────────────── ────────────────────────
> fsnotify_add_mark_locked():
> fsnotify_add_mark_list():
> spin_lock(conn->lock)
> add mark_B(evictable) to list
> spin_unlock(conn->lock)
> return
>
> /* ---- gap: no lock held ---- */
>
> fsnotify_detach_mark(mark_A):
> spin_lock(mark_A->lock)
> clear ATTACHED flag on mark_A
> spin_unlock(mark_A->lock)
>
> fsnotify_recalc_mask():
> spin_lock(conn->lock)
> __fsnotify_recalc_mask():
> /* mark_A skipped: ATTACHED cleared */
> /* only mark_B(evictable) remains */
> want_iref = false
> has_iref = true /* not yet cleared */
> -> HAS_IREF transitions true -> false
> -> returns inode pointer
> spin_unlock(conn->lock)
> /* BUG: return value discarded!
> * iput() and fsnotify_put_sb_watched_objects()
> * are never called */
>
> Fix this by capturing the return value of __fsnotify_recalc_mask() and
> passing it to fsnotify_drop_object() after releasing the spinlock, which
> is the same pattern used in fsnotify_put_mark().
>
> Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
> ---
> fs/notify/mark.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
> fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
> }
>
> +static void fsnotify_drop_object(unsigned int type, void *objp);
> +
> /*
> * Calculate mask of events for a list of marks. The caller must make sure
> * connector and connector->obj cannot disappear under us. Callers achieve
> @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
> void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> {
> bool update_children;
> + unsigned int type;
> + void *objp;
>
> if (!conn)
> return;
>
> spin_lock(&conn->lock);
> update_children = !fsnotify_conn_watches_children(conn);
> - __fsnotify_recalc_mask(conn);
> + objp = __fsnotify_recalc_mask(conn);
> + type = conn->type;
> update_children &= fsnotify_conn_watches_children(conn);
> spin_unlock(&conn->lock);
> + fsnotify_drop_object(type, objp);
> /*
> * Set children's PARENT_WATCHED flags only if parent started watching.
> * When parent stops watching, we clear false positive PARENT_WATCHED
> --
> 2.20.1
>
© 2016 - 2026 Red Hat, Inc.