Currently some kernfs files (e.g. cgroup.events, memory.events) support
inotify watches for IN_MODIFY, but unlike with regular filesystems, they
do not receive IN_DELETE_SELF or IN_IGNORED events when they are
removed.
This creates a problem for processes monitoring cgroups. For example, a
service monitoring memory.events for memory.high breaches needs to know
when a cgroup is removed to clean up its state. Where it's known that a
cgroup is removed when all processes die, without IN_DELETE_SELF the
service must resort to inefficient workarounds such as:
1. Periodically scanning procfs to detect process death (wastes CPU and
is susceptible to PID reuse).
2. Placing an additional IN_DELETE watch on the parent directory
(wastes resources managing double the watches).
3. Holding a pidfd for every monitored cgroup (can exhaust file
descriptors).
This patch enables kernfs to send IN_DELETE_SELF and IN_IGNORED events.
This allows applications to rely on a single existing watch on the file
of interest (e.g. memory.events) to receive notifications for both
modifications and the eventual removal of the file, as well as automatic
watch descriptor cleanup, simplifying userspace logic and improving
resource efficiency.
Implementation details:
The kernfs notification worker is updated to handle file deletion.
The optimized single call for MODIFY events to both the parent and the
file is retained, however because CREATE (parent) events remain
unsupported for kernfs files, support for DELETE (parent) events is not
added here to retain symmetry. Only support for DELETE_SELF events is
added.
Signed-off-by: T.J. Mercier <tjmercier@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
fs/kernfs/dir.c | 21 +++++++++++++++++
fs/kernfs/file.c | 45 ++++++++++++++++++++-----------------
fs/kernfs/kernfs-internal.h | 3 +++
3 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 29baeeb97871..e5bda829fcb8 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
@@ -1471,6 +1472,23 @@ void kernfs_show(struct kernfs_node *kn, bool show)
up_write(&root->kernfs_rwsem);
}
+static void kernfs_notify_file_deleted(struct kernfs_node *kn)
+{
+ static DECLARE_WORK(kernfs_notify_deleted_work,
+ kernfs_notify_workfn);
+
+ guard(spinlock_irqsave)(&kernfs_notify_lock);
+ /* may overwrite already pending FS_MODIFY events */
+ kn->attr.notify_event = FS_DELETE;
+
+ if (!kn->attr.notify_next) {
+ kernfs_get(kn);
+ kn->attr.notify_next = kernfs_notify_list;
+ kernfs_notify_list = kn;
+ schedule_work(&kernfs_notify_deleted_work);
+ }
+}
+
static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos, *parent;
@@ -1520,6 +1538,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
struct kernfs_iattrs *ps_iattr =
parent ? parent->iattr : NULL;
+ if (kernfs_type(pos) == KERNFS_FILE)
+ kernfs_notify_file_deleted(pos);
+
/* update timestamps on the parent */
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e978284ff983..4be9bbe29378 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -37,8 +37,8 @@ struct kernfs_open_node {
*/
#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
-static DEFINE_SPINLOCK(kernfs_notify_lock);
-static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+DEFINE_SPINLOCK(kernfs_notify_lock);
+struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
{
@@ -909,7 +909,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
return ret;
}
-static void kernfs_notify_workfn(struct work_struct *work)
+void kernfs_notify_workfn(struct work_struct *work)
{
struct kernfs_node *kn;
struct kernfs_super_info *info;
@@ -935,11 +935,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
down_read(&root->kernfs_supers_rwsem);
down_read(&root->kernfs_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
- struct kernfs_node *parent;
- struct inode *p_inode = NULL;
- const char *kn_name;
struct inode *inode;
- struct qstr name;
/*
* We want fsnotify_modify() on @kn but as the
@@ -951,24 +947,31 @@ static void kernfs_notify_workfn(struct work_struct *work)
if (!inode)
continue;
- kn_name = kernfs_rcu_name(kn);
- name = QSTR(kn_name);
- parent = kernfs_get_parent(kn);
- if (parent) {
- p_inode = ilookup(info->sb, kernfs_ino(parent));
- if (p_inode) {
- fsnotify(notify_event | FS_EVENT_ON_CHILD,
- inode, FSNOTIFY_EVENT_INODE,
- p_inode, &name, inode, 0);
- iput(p_inode);
+ if (notify_event == FS_DELETE) {
+ fsnotify_inoderemove(inode);
+ } else {
+ struct kernfs_node *parent = kernfs_get_parent(kn);
+ struct inode *p_inode = NULL;
+
+ if (parent) {
+ p_inode = ilookup(info->sb, kernfs_ino(parent));
+ if (p_inode) {
+ const char *kn_name = kernfs_rcu_name(kn);
+ struct qstr name = QSTR(kn_name);
+
+ fsnotify(notify_event | FS_EVENT_ON_CHILD,
+ inode, FSNOTIFY_EVENT_INODE,
+ p_inode, &name, inode, 0);
+ iput(p_inode);
+ }
+
+ kernfs_put(parent);
}
- kernfs_put(parent);
+ if (!p_inode)
+ fsnotify_inode(inode, notify_event);
}
- if (!p_inode)
- fsnotify_inode(inode, notify_event);
-
iput(inode);
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 6061b6f70d2a..cf4b21f4f3b6 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -199,6 +199,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
* file.c
*/
extern const struct file_operations kernfs_file_fops;
+extern struct kernfs_node *kernfs_notify_list;
+extern void kernfs_notify_workfn(struct work_struct *work);
bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);
@@ -212,4 +214,5 @@ extern const struct inode_operations kernfs_symlink_iops;
* kernfs locks
*/
extern struct kernfs_global_locks *kernfs_locks;
+extern spinlock_t kernfs_notify_lock;
#endif /* __KERNFS_INTERNAL_H */
--
2.53.0.310.g728cabbaf7-goog
On Wed, Feb 18, 2026 at 5:22 AM T.J. Mercier <tjmercier@google.com> wrote:
>
> Currently some kernfs files (e.g. cgroup.events, memory.events) support
> inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> removed.
>
> This creates a problem for processes monitoring cgroups. For example, a
> service monitoring memory.events for memory.high breaches needs to know
> when a cgroup is removed to clean up its state. Where it's known that a
> cgroup is removed when all processes die, without IN_DELETE_SELF the
> service must resort to inefficient workarounds such as:
> 1. Periodically scanning procfs to detect process death (wastes CPU and
> is susceptible to PID reuse).
> 2. Placing an additional IN_DELETE watch on the parent directory
> (wastes resources managing double the watches).
This sentence is a red flag for me.
"wastes resources"? What resources are you talking about?
A single inotify watch? That's nothing.
This is not a valid argument IMO.
I fail to see how managing N watches is different than managing 2N watches.
I have no objection to your patch, but we need to keep our arguments honest.
> 3. Holding a pidfd for every monitored cgroup (can exhaust file
> descriptors).
>
> This patch enables kernfs to send IN_DELETE_SELF and IN_IGNORED events.
> This allows applications to rely on a single existing watch on the file
> of interest (e.g. memory.events) to receive notifications for both
> modifications and the eventual removal of the file, as well as automatic
> watch descriptor cleanup, simplifying userspace logic and improving
> resource efficiency.
>
> Implementation details:
> The kernfs notification worker is updated to handle file deletion.
> The optimized single call for MODIFY events to both the parent and the
> file is retained, however because CREATE (parent) events remain
> unsupported for kernfs files, support for DELETE (parent) events is not
Either drop this story about DELETE or expand it.
inotify does not generate a DELETE event when watching a file,
because DELETE is an event notifying a change of a directory.
If you would have kept your DELETE implementation that would have
broken this rule.
> added here to retain symmetry. Only support for DELETE_SELF events is
> added.
>
> Signed-off-by: T.J. Mercier <tjmercier@google.com>
> Acked-by: Tejun Heo <tj@kernel.org>
> ---
> fs/kernfs/dir.c | 21 +++++++++++++++++
> fs/kernfs/file.c | 45 ++++++++++++++++++++-----------------
> fs/kernfs/kernfs-internal.h | 3 +++
> 3 files changed, 48 insertions(+), 21 deletions(-)
>
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 29baeeb97871..e5bda829fcb8 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -9,6 +9,7 @@
>
> #include <linux/sched.h>
> #include <linux/fs.h>
> +#include <linux/fsnotify_backend.h>
> #include <linux/namei.h>
> #include <linux/idr.h>
> #include <linux/slab.h>
> @@ -1471,6 +1472,23 @@ void kernfs_show(struct kernfs_node *kn, bool show)
> up_write(&root->kernfs_rwsem);
> }
>
> +static void kernfs_notify_file_deleted(struct kernfs_node *kn)
> +{
> + static DECLARE_WORK(kernfs_notify_deleted_work,
> + kernfs_notify_workfn);
> +
> + guard(spinlock_irqsave)(&kernfs_notify_lock);
> + /* may overwite already pending FS_MODIFY events */
Typo: overwite
> + kn->attr.notify_event = FS_DELETE;
FS_DELETE_SELF
> +
> + if (!kn->attr.notify_next) {
> + kernfs_get(kn);
> + kn->attr.notify_next = kernfs_notify_list;
> + kernfs_notify_list = kn;
> + schedule_work(&kernfs_notify_deleted_work);
> + }
> +}
> +
> static void __kernfs_remove(struct kernfs_node *kn)
> {
> struct kernfs_node *pos, *parent;
> @@ -1520,6 +1538,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
> struct kernfs_iattrs *ps_iattr =
> parent ? parent->iattr : NULL;
>
> + if (kernfs_type(pos) == KERNFS_FILE)
> + kernfs_notify_file_deleted(pos);
> +
Why are we not notifying a deleted directory?
If users expect DELETE_SELF on a watched cgroup file
they would definitely expect DELETE_SELF on a watched cgroup dir
when the cgroup is destroyed.
I claim that *this* should be the standard way to monitor
destroyed cgroups.
> /* update timestamps on the parent */
> down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
>
> diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> index e978284ff983..4be9bbe29378 100644
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -37,8 +37,8 @@ struct kernfs_open_node {
> */
> #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
>
> -static DEFINE_SPINLOCK(kernfs_notify_lock);
> -static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
> +DEFINE_SPINLOCK(kernfs_notify_lock);
> +struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
>
> static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
> {
> @@ -909,7 +909,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
> return ret;
> }
>
> -static void kernfs_notify_workfn(struct work_struct *work)
> +void kernfs_notify_workfn(struct work_struct *work)
> {
> struct kernfs_node *kn;
> struct kernfs_super_info *info;
> @@ -935,11 +935,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
> down_read(&root->kernfs_supers_rwsem);
> down_read(&root->kernfs_rwsem);
> list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
> - struct kernfs_node *parent;
> - struct inode *p_inode = NULL;
> - const char *kn_name;
> struct inode *inode;
> - struct qstr name;
>
> /*
> * We want fsnotify_modify() on @kn but as the
> @@ -951,24 +947,31 @@ static void kernfs_notify_workfn(struct work_struct *work)
> if (!inode)
> continue;
>
> - kn_name = kernfs_rcu_name(kn);
> - name = QSTR(kn_name);
> - parent = kernfs_get_parent(kn);
> - if (parent) {
> - p_inode = ilookup(info->sb, kernfs_ino(parent));
> - if (p_inode) {
> - fsnotify(notify_event | FS_EVENT_ON_CHILD,
> - inode, FSNOTIFY_EVENT_INODE,
> - p_inode, &name, inode, 0);
> - iput(p_inode);
> + if (notify_event == FS_DELETE) {
FS_DELETE_SELF
> + fsnotify_inoderemove(inode);
iput(inode);
continue;
}
Avoids all the churn and unneeded extra indentation that follows.
Thanks,
Amir.
On Tue 17-02-26 19:22:31, T.J. Mercier wrote:
> Currently some kernfs files (e.g. cgroup.events, memory.events) support
> inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> removed.
Please see my email:
https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb
I think this is actually a bug in kernfs...
Honza
>
> This creates a problem for processes monitoring cgroups. For example, a
> service monitoring memory.events for memory.high breaches needs to know
> when a cgroup is removed to clean up its state. Where it's known that a
> cgroup is removed when all processes die, without IN_DELETE_SELF the
> service must resort to inefficient workarounds such as:
> 1. Periodically scanning procfs to detect process death (wastes CPU and
> is susceptible to PID reuse).
> 2. Placing an additional IN_DELETE watch on the parent directory
> (wastes resources managing double the watches).
> 3. Holding a pidfd for every monitored cgroup (can exhaust file
> descriptors).
>
> This patch enables kernfs to send IN_DELETE_SELF and IN_IGNORED events.
> This allows applications to rely on a single existing watch on the file
> of interest (e.g. memory.events) to receive notifications for both
> modifications and the eventual removal of the file, as well as automatic
> watch descriptor cleanup, simplifying userspace logic and improving
> resource efficiency.
>
> Implementation details:
> The kernfs notification worker is updated to handle file deletion.
> The optimized single call for MODIFY events to both the parent and the
> file is retained, however because CREATE (parent) events remain
> unsupported for kernfs files, support for DELETE (parent) events is not
> added here to retain symmetry. Only support for DELETE_SELF events is
> added.
>
> Signed-off-by: T.J. Mercier <tjmercier@google.com>
> Acked-by: Tejun Heo <tj@kernel.org>
> ---
> fs/kernfs/dir.c | 21 +++++++++++++++++
> fs/kernfs/file.c | 45 ++++++++++++++++++++-----------------
> fs/kernfs/kernfs-internal.h | 3 +++
> 3 files changed, 48 insertions(+), 21 deletions(-)
>
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 29baeeb97871..e5bda829fcb8 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -9,6 +9,7 @@
>
> #include <linux/sched.h>
> #include <linux/fs.h>
> +#include <linux/fsnotify_backend.h>
> #include <linux/namei.h>
> #include <linux/idr.h>
> #include <linux/slab.h>
> @@ -1471,6 +1472,23 @@ void kernfs_show(struct kernfs_node *kn, bool show)
> up_write(&root->kernfs_rwsem);
> }
>
> +static void kernfs_notify_file_deleted(struct kernfs_node *kn)
> +{
> + static DECLARE_WORK(kernfs_notify_deleted_work,
> + kernfs_notify_workfn);
> +
> + guard(spinlock_irqsave)(&kernfs_notify_lock);
> + /* may overwite already pending FS_MODIFY events */
> + kn->attr.notify_event = FS_DELETE;
> +
> + if (!kn->attr.notify_next) {
> + kernfs_get(kn);
> + kn->attr.notify_next = kernfs_notify_list;
> + kernfs_notify_list = kn;
> + schedule_work(&kernfs_notify_deleted_work);
> + }
> +}
> +
> static void __kernfs_remove(struct kernfs_node *kn)
> {
> struct kernfs_node *pos, *parent;
> @@ -1520,6 +1538,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
> struct kernfs_iattrs *ps_iattr =
> parent ? parent->iattr : NULL;
>
> + if (kernfs_type(pos) == KERNFS_FILE)
> + kernfs_notify_file_deleted(pos);
> +
> /* update timestamps on the parent */
> down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
>
> diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> index e978284ff983..4be9bbe29378 100644
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -37,8 +37,8 @@ struct kernfs_open_node {
> */
> #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
>
> -static DEFINE_SPINLOCK(kernfs_notify_lock);
> -static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
> +DEFINE_SPINLOCK(kernfs_notify_lock);
> +struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
>
> static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
> {
> @@ -909,7 +909,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
> return ret;
> }
>
> -static void kernfs_notify_workfn(struct work_struct *work)
> +void kernfs_notify_workfn(struct work_struct *work)
> {
> struct kernfs_node *kn;
> struct kernfs_super_info *info;
> @@ -935,11 +935,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
> down_read(&root->kernfs_supers_rwsem);
> down_read(&root->kernfs_rwsem);
> list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
> - struct kernfs_node *parent;
> - struct inode *p_inode = NULL;
> - const char *kn_name;
> struct inode *inode;
> - struct qstr name;
>
> /*
> * We want fsnotify_modify() on @kn but as the
> @@ -951,24 +947,31 @@ static void kernfs_notify_workfn(struct work_struct *work)
> if (!inode)
> continue;
>
> - kn_name = kernfs_rcu_name(kn);
> - name = QSTR(kn_name);
> - parent = kernfs_get_parent(kn);
> - if (parent) {
> - p_inode = ilookup(info->sb, kernfs_ino(parent));
> - if (p_inode) {
> - fsnotify(notify_event | FS_EVENT_ON_CHILD,
> - inode, FSNOTIFY_EVENT_INODE,
> - p_inode, &name, inode, 0);
> - iput(p_inode);
> + if (notify_event == FS_DELETE) {
> + fsnotify_inoderemove(inode);
> + } else {
> + struct kernfs_node *parent = kernfs_get_parent(kn);
> + struct inode *p_inode = NULL;
> +
> + if (parent) {
> + p_inode = ilookup(info->sb, kernfs_ino(parent));
> + if (p_inode) {
> + const char *kn_name = kernfs_rcu_name(kn);
> + struct qstr name = QSTR(kn_name);
> +
> + fsnotify(notify_event | FS_EVENT_ON_CHILD,
> + inode, FSNOTIFY_EVENT_INODE,
> + p_inode, &name, inode, 0);
> + iput(p_inode);
> + }
> +
> + kernfs_put(parent);
> }
>
> - kernfs_put(parent);
> + if (!p_inode)
> + fsnotify_inode(inode, notify_event);
> }
>
> - if (!p_inode)
> - fsnotify_inode(inode, notify_event);
> -
> iput(inode);
> }
>
> diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
> index 6061b6f70d2a..cf4b21f4f3b6 100644
> --- a/fs/kernfs/kernfs-internal.h
> +++ b/fs/kernfs/kernfs-internal.h
> @@ -199,6 +199,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
> * file.c
> */
> extern const struct file_operations kernfs_file_fops;
> +extern struct kernfs_node *kernfs_notify_list;
> +extern void kernfs_notify_workfn(struct work_struct *work);
>
> bool kernfs_should_drain_open_files(struct kernfs_node *kn);
> void kernfs_drain_open_files(struct kernfs_node *kn);
> @@ -212,4 +214,5 @@ extern const struct inode_operations kernfs_symlink_iops;
> * kernfs locks
> */
> extern struct kernfs_global_locks *kernfs_locks;
> +extern spinlock_t kernfs_notify_lock;
> #endif /* __KERNFS_INTERNAL_H */
> --
> 2.53.0.310.g728cabbaf7-goog
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <jack@suse.cz> wrote:
>
> On Tue 17-02-26 19:22:31, T.J. Mercier wrote:
> > Currently some kernfs files (e.g. cgroup.events, memory.events) support
> > inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> > do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> > removed.
>
> Please see my email:
> https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb
>
> I think this is actually a bug in kernfs...
>
> Honza
Thanks, I'm looking at this now. I've tried calling clear_nlink in
kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir
and shrink_dcache_parent is called, d_walk doesn't find any entries,
so shrink_kill->__dentry_kill is not called. I'm investigating why
that is...
> >
> > This creates a problem for processes monitoring cgroups. For example, a
> > service monitoring memory.events for memory.high breaches needs to know
> > when a cgroup is removed to clean up its state. Where it's known that a
> > cgroup is removed when all processes die, without IN_DELETE_SELF the
> > service must resort to inefficient workarounds such as:
> > 1. Periodically scanning procfs to detect process death (wastes CPU and
> > is susceptible to PID reuse).
> > 2. Placing an additional IN_DELETE watch on the parent directory
> > (wastes resources managing double the watches).
> > 3. Holding a pidfd for every monitored cgroup (can exhaust file
> > descriptors).
> >
> > This patch enables kernfs to send IN_DELETE_SELF and IN_IGNORED events.
> > This allows applications to rely on a single existing watch on the file
> > of interest (e.g. memory.events) to receive notifications for both
> > modifications and the eventual removal of the file, as well as automatic
> > watch descriptor cleanup, simplifying userspace logic and improving
> > resource efficiency.
> >
> > Implementation details:
> > The kernfs notification worker is updated to handle file deletion.
> > The optimized single call for MODIFY events to both the parent and the
> > file is retained, however because CREATE (parent) events remain
> > unsupported for kernfs files, support for DELETE (parent) events is not
> > added here to retain symmetry. Only support for DELETE_SELF events is
> > added.
> >
> > Signed-off-by: T.J. Mercier <tjmercier@google.com>
> > Acked-by: Tejun Heo <tj@kernel.org>
> > ---
> > fs/kernfs/dir.c | 21 +++++++++++++++++
> > fs/kernfs/file.c | 45 ++++++++++++++++++++-----------------
> > fs/kernfs/kernfs-internal.h | 3 +++
> > 3 files changed, 48 insertions(+), 21 deletions(-)
> >
> > diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> > index 29baeeb97871..e5bda829fcb8 100644
> > --- a/fs/kernfs/dir.c
> > +++ b/fs/kernfs/dir.c
> > @@ -9,6 +9,7 @@
> >
> > #include <linux/sched.h>
> > #include <linux/fs.h>
> > +#include <linux/fsnotify_backend.h>
> > #include <linux/namei.h>
> > #include <linux/idr.h>
> > #include <linux/slab.h>
> > @@ -1471,6 +1472,23 @@ void kernfs_show(struct kernfs_node *kn, bool show)
> > up_write(&root->kernfs_rwsem);
> > }
> >
> > +static void kernfs_notify_file_deleted(struct kernfs_node *kn)
> > +{
> > + static DECLARE_WORK(kernfs_notify_deleted_work,
> > + kernfs_notify_workfn);
> > +
> > + guard(spinlock_irqsave)(&kernfs_notify_lock);
> > + /* may overwite already pending FS_MODIFY events */
> > + kn->attr.notify_event = FS_DELETE;
> > +
> > + if (!kn->attr.notify_next) {
> > + kernfs_get(kn);
> > + kn->attr.notify_next = kernfs_notify_list;
> > + kernfs_notify_list = kn;
> > + schedule_work(&kernfs_notify_deleted_work);
> > + }
> > +}
> > +
> > static void __kernfs_remove(struct kernfs_node *kn)
> > {
> > struct kernfs_node *pos, *parent;
> > @@ -1520,6 +1538,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
> > struct kernfs_iattrs *ps_iattr =
> > parent ? parent->iattr : NULL;
> >
> > + if (kernfs_type(pos) == KERNFS_FILE)
> > + kernfs_notify_file_deleted(pos);
> > +
> > /* update timestamps on the parent */
> > down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
> >
> > diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> > index e978284ff983..4be9bbe29378 100644
> > --- a/fs/kernfs/file.c
> > +++ b/fs/kernfs/file.c
> > @@ -37,8 +37,8 @@ struct kernfs_open_node {
> > */
> > #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
> >
> > -static DEFINE_SPINLOCK(kernfs_notify_lock);
> > -static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
> > +DEFINE_SPINLOCK(kernfs_notify_lock);
> > +struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
> >
> > static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
> > {
> > @@ -909,7 +909,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
> > return ret;
> > }
> >
> > -static void kernfs_notify_workfn(struct work_struct *work)
> > +void kernfs_notify_workfn(struct work_struct *work)
> > {
> > struct kernfs_node *kn;
> > struct kernfs_super_info *info;
> > @@ -935,11 +935,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
> > down_read(&root->kernfs_supers_rwsem);
> > down_read(&root->kernfs_rwsem);
> > list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
> > - struct kernfs_node *parent;
> > - struct inode *p_inode = NULL;
> > - const char *kn_name;
> > struct inode *inode;
> > - struct qstr name;
> >
> > /*
> > * We want fsnotify_modify() on @kn but as the
> > @@ -951,24 +947,31 @@ static void kernfs_notify_workfn(struct work_struct *work)
> > if (!inode)
> > continue;
> >
> > - kn_name = kernfs_rcu_name(kn);
> > - name = QSTR(kn_name);
> > - parent = kernfs_get_parent(kn);
> > - if (parent) {
> > - p_inode = ilookup(info->sb, kernfs_ino(parent));
> > - if (p_inode) {
> > - fsnotify(notify_event | FS_EVENT_ON_CHILD,
> > - inode, FSNOTIFY_EVENT_INODE,
> > - p_inode, &name, inode, 0);
> > - iput(p_inode);
> > + if (notify_event == FS_DELETE) {
> > + fsnotify_inoderemove(inode);
> > + } else {
> > + struct kernfs_node *parent = kernfs_get_parent(kn);
> > + struct inode *p_inode = NULL;
> > +
> > + if (parent) {
> > + p_inode = ilookup(info->sb, kernfs_ino(parent));
> > + if (p_inode) {
> > + const char *kn_name = kernfs_rcu_name(kn);
> > + struct qstr name = QSTR(kn_name);
> > +
> > + fsnotify(notify_event | FS_EVENT_ON_CHILD,
> > + inode, FSNOTIFY_EVENT_INODE,
> > + p_inode, &name, inode, 0);
> > + iput(p_inode);
> > + }
> > +
> > + kernfs_put(parent);
> > }
> >
> > - kernfs_put(parent);
> > + if (!p_inode)
> > + fsnotify_inode(inode, notify_event);
> > }
> >
> > - if (!p_inode)
> > - fsnotify_inode(inode, notify_event);
> > -
> > iput(inode);
> > }
> >
> > diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
> > index 6061b6f70d2a..cf4b21f4f3b6 100644
> > --- a/fs/kernfs/kernfs-internal.h
> > +++ b/fs/kernfs/kernfs-internal.h
> > @@ -199,6 +199,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
> > * file.c
> > */
> > extern const struct file_operations kernfs_file_fops;
> > +extern struct kernfs_node *kernfs_notify_list;
> > +extern void kernfs_notify_workfn(struct work_struct *work);
> >
> > bool kernfs_should_drain_open_files(struct kernfs_node *kn);
> > void kernfs_drain_open_files(struct kernfs_node *kn);
> > @@ -212,4 +214,5 @@ extern const struct inode_operations kernfs_symlink_iops;
> > * kernfs locks
> > */
> > extern struct kernfs_global_locks *kernfs_locks;
> > +extern spinlock_t kernfs_notify_lock;
> > #endif /* __KERNFS_INTERNAL_H */
> > --
> > 2.53.0.310.g728cabbaf7-goog
> >
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
On Wed 18-02-26 10:06:35, T.J. Mercier wrote:
> On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <jack@suse.cz> wrote:
> >
> > On Tue 17-02-26 19:22:31, T.J. Mercier wrote:
> > > Currently some kernfs files (e.g. cgroup.events, memory.events) support
> > > inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> > > do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> > > removed.
> >
> > Please see my email:
> > https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb
> >
> > I think this is actually a bug in kernfs...
> >
> > Honza
>
> Thanks, I'm looking at this now. I've tried calling clear_nlink in
> kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir
> and shrink_dcache_parent is called, d_walk doesn't find any entries,
> so shrink_kill->__dentry_kill is not called. I'm investigating why
> that is...
Strange because when I was experimenting with this in my VM I have seen
__dentry_kill being called (if the dentries were created by someone looking
up the names).
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
On Wed, Feb 18, 2026 at 10:37 AM Jan Kara <jack@suse.cz> wrote:
>
> On Wed 18-02-26 10:06:35, T.J. Mercier wrote:
> > On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Tue 17-02-26 19:22:31, T.J. Mercier wrote:
> > > > Currently some kernfs files (e.g. cgroup.events, memory.events) support
> > > > inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> > > > do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> > > > removed.
> > >
> > > Please see my email:
> > > https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb
> > >
> > > I think this is actually a bug in kernfs...
> > >
> > > Honza
> >
> > Thanks, I'm looking at this now. I've tried calling clear_nlink in
> > kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir
> > and shrink_dcache_parent is called, d_walk doesn't find any entries,
> > so shrink_kill->__dentry_kill is not called. I'm investigating why
> > that is...
>
> Strange because when I was experimenting with this in my VM I have seen
> __dentry_kill being called (if the dentries were created by someone looking
> up the names).
Ahh yes, that's the difference. I was just doing mkdir
/sys/fs/cgroup/foo immediately followed by rmdir /sys/fs/cgroup/foo.
kernfs creates the dentries in kernfs_iop_lookup, so there were none
when I did the rmdir because I didn't cause any lookups.
If I actually have a program watching
/sys/fs/cgroup/foo/memory.events, then I do see the __dentry_kill
calls, but despite the prior clear_nlink call i_nlink is 1 so
fsnotify_inoderemove is skipped. Something must be incrementing it.
On Wed, Feb 18, 2026 at 11:15 AM T.J. Mercier <tjmercier@google.com> wrote:
>
> On Wed, Feb 18, 2026 at 10:37 AM Jan Kara <jack@suse.cz> wrote:
> >
> > On Wed 18-02-26 10:06:35, T.J. Mercier wrote:
> > > On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <jack@suse.cz> wrote:
> > > >
> > > > On Tue 17-02-26 19:22:31, T.J. Mercier wrote:
> > > > > Currently some kernfs files (e.g. cgroup.events, memory.events) support
> > > > > inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> > > > > do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> > > > > removed.
> > > >
> > > > Please see my email:
> > > > https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb
> > > >
> > > > I think this is actually a bug in kernfs...
> > > >
> > > > Honza
> > >
> > > Thanks, I'm looking at this now. I've tried calling clear_nlink in
> > > kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir
> > > and shrink_dcache_parent is called, d_walk doesn't find any entries,
> > > so shrink_kill->__dentry_kill is not called. I'm investigating why
> > > that is...
> >
> > Strange because when I was experimenting with this in my VM I have seen
> > __dentry_kill being called (if the dentries were created by someone looking
> > up the names).
>
> Ahh yes, that's the difference. I was just doing mkdir
> /sys/fs/cgroup/foo immediately followed by rmdir /sys/fs/cgroup/foo.
> kernfs creates the dentries in kernfs_iop_lookup, so there were none
> when I did the rmdir because I didn't cause any lookups.
>
> If I actually have a program watching
> /sys/fs/cgroup/foo/memory.events, then I do see the __dentry_kill kill
> calls, but despite the prior clear_nlink call i_nlink is 1 so
> fsnotify_inoderemove is skipped. Something must be incrementing it.
The issue was that kernfs_remove unlinks the kernfs nodes, but doesn't
clear_nlink when it does so. Adding that seems to work to generate
IN_DELETE_SELF and IN_IGNORED. I'll do some more testing and get a
patch ready.
On Wed, Feb 18, 2026 at 11:58 AM T.J. Mercier <tjmercier@google.com> wrote: > > On Wed, Feb 18, 2026 at 11:15 AM T.J. Mercier <tjmercier@google.com> wrote: > > > > On Wed, Feb 18, 2026 at 10:37 AM Jan Kara <jack@suse.cz> wrote: > > > > > > On Wed 18-02-26 10:06:35, T.J. Mercier wrote: > > > > On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <jack@suse.cz> wrote: > > > > > > > > > > On Tue 17-02-26 19:22:31, T.J. Mercier wrote: > > > > > > Currently some kernfs files (e.g. cgroup.events, memory.events) support > > > > > > inotify watches for IN_MODIFY, but unlike with regular filesystems, they > > > > > > do not receive IN_DELETE_SELF or IN_IGNORED events when they are > > > > > > removed. > > > > > > > > > > Please see my email: > > > > > https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb > > > > > > > > > > I think this is actually a bug in kernfs... > > > > > > > > > > Honza > > > > > > > > Thanks, I'm looking at this now. I've tried calling clear_nlink in > > > > kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir > > > > and shrink_dcache_parent is called, d_walk doesn't find any entries, > > > > so shrink_kill->__dentry_kill is not called. I'm investigating why > > > > that is... > > > > > > Strange because when I was experimenting with this in my VM I have seen > > > __dentry_kill being called (if the dentries were created by someone looking > > > up the names). > > > > Ahh yes, that's the difference. I was just doing mkdir > > /sys/fs/cgroup/foo immediately followed by rmdir /sys/fs/cgroup/foo. > > kernfs creates the dentries in kernfs_iop_lookup, so there were none > > when I did the rmdir because I didn't cause any lookups. > > > > If I actually have a program watching > > /sys/fs/cgroup/foo/memory.events, then I do see the __dentry_kill kill > > calls, but despite the prior clear_nlink call i_nlink is 1 so > > fsnotify_inoderemove is skipped. Something must be incrementing it. 
> > The issue was that kernfs_remove unlinks the kernfs nodes, but doesn't > clear_nlink when it does so. Adding that seems to work to generate > IN_DELETE_SELF and IN_IGNORED. I'll do some more testing and get a > patch ready. This works for the rmdir case, because vfs_rmdir->shrink_dcache_parent->shrink_kill->__dentry_kill is invoked when the user runs rmdir. However, the case where a kernfs file is removed because a cgroup subsys is deactivated does not work, because it occurs when the user writes to cgroup.subtree_control. That is a vfs_write which calls fsnotify_modify for cgroup.subtree_control, but (very reasonably) there is no attempt made to clean up the dcache in VFS on writes. So I think kernfs still needs to generate fsnotify events manually for the cgroup_subtree_control_write->cgroup_apply_control_disable case. Those removals happen via kernfs_remove_by_name->__kernfs_remove, so that would look a lot like what I sent in this v3 patch, even if we also add clear_nlink calls for the rmdir case.
On Wed 18-02-26 14:10:42, T.J. Mercier wrote: > On Wed, Feb 18, 2026 at 11:58 AM T.J. Mercier <tjmercier@google.com> wrote: > > On Wed, Feb 18, 2026 at 11:15 AM T.J. Mercier <tjmercier@google.com> wrote: > > > On Wed, Feb 18, 2026 at 10:37 AM Jan Kara <jack@suse.cz> wrote: > > > > On Wed 18-02-26 10:06:35, T.J. Mercier wrote: > > > > > On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <jack@suse.cz> wrote: > > > > > > On Tue 17-02-26 19:22:31, T.J. Mercier wrote: > > > > > > > Currently some kernfs files (e.g. cgroup.events, memory.events) support > > > > > > > inotify watches for IN_MODIFY, but unlike with regular filesystems, they > > > > > > > do not receive IN_DELETE_SELF or IN_IGNORED events when they are > > > > > > > removed. > > > > > > > > > > > > Please see my email: > > > > > > https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb > > > > > > > > > > > > I think this is actually a bug in kernfs... > > > > > > > > > > > > Honza > > > > > > > > > > Thanks, I'm looking at this now. I've tried calling clear_nlink in > > > > > kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir > > > > > and shrink_dcache_parent is called, d_walk doesn't find any entries, > > > > > so shrink_kill->__dentry_kill is not called. I'm investigating why > > > > > that is... > > > > > > > > Strange because when I was experimenting with this in my VM I have seen > > > > __dentry_kill being called (if the dentries were created by someone looking > > > > up the names). > > > > > > Ahh yes, that's the difference. I was just doing mkdir > > > /sys/fs/cgroup/foo immediately followed by rmdir /sys/fs/cgroup/foo. > > > kernfs creates the dentries in kernfs_iop_lookup, so there were none > > > when I did the rmdir because I didn't cause any lookups. 
> > > > > > If I actually have a program watching > > > /sys/fs/cgroup/foo/memory.events, then I do see the __dentry_kill > > > calls, but despite the prior clear_nlink call i_nlink is 1 so > > > fsnotify_inoderemove is skipped. Something must be incrementing it. > > > > The issue was that kernfs_remove unlinks the kernfs nodes, but doesn't > > clear_nlink when it does so. Adding that seems to work to generate > > IN_DELETE_SELF and IN_IGNORED. I'll do some more testing and get a > > patch ready. > > This works for the rmdir case, because > vfs_rmdir->shrink_dcache_parent->shrink_kill->__dentry_kill is invoked > when the user runs rmdir. > > However the case where a kernfs file is removed because a cgroup > subsys is deactivated does not work, because it occurs when the user > writes to cgroup.subtree_control. That is a vfs_write which calls > fsnotify_modify for cgroup.subtree_control, but (very reasonably) > there is no attempt made to clean up the dcache in VFS on writes. OK, and is this mostly a theoretical concern or do you practically expect someone to monitor subsystem files in a cgroup with inotify to learn that the subsystem has been disabled? It doesn't look very probable to me... > So I think kernfs still needs to generate fsnotify events manually for > the cgroup_subtree_control_write->cgroup_apply_control_disable case. > Those removals happen via kernfs_remove_by_name->__kernfs_remove, so > that would look a lot like what I sent in this v3 patch, even if we > also add clear_nlink calls for the rmdir case. If there's a sensible usecase for monitoring of subsystem files being deleted, we could also d_delete() the dentry from cgroup_rm_file(). But maybe the performance overhead would be visible for some larger scale removals so maybe just using fsnotify_inoderemove() to paper over the problem would be easier if this case is really needed. Honza -- Jan Kara <jack@suse.com> SUSE Labs, CR
On Thu, Feb 19, 2026 at 3:05 AM Jan Kara <jack@suse.cz> wrote: > > On Wed 18-02-26 14:10:42, T.J. Mercier wrote: > > On Wed, Feb 18, 2026 at 11:58 AM T.J. Mercier <tjmercier@google.com> wrote: > > > On Wed, Feb 18, 2026 at 11:15 AM T.J. Mercier <tjmercier@google.com> wrote: > > > > On Wed, Feb 18, 2026 at 10:37 AM Jan Kara <jack@suse.cz> wrote: > > > > > On Wed 18-02-26 10:06:35, T.J. Mercier wrote: > > > > > > On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <jack@suse.cz> wrote: > > > > > > > On Tue 17-02-26 19:22:31, T.J. Mercier wrote: > > > > > > > > Currently some kernfs files (e.g. cgroup.events, memory.events) support > > > > > > > > inotify watches for IN_MODIFY, but unlike with regular filesystems, they > > > > > > > > do not receive IN_DELETE_SELF or IN_IGNORED events when they are > > > > > > > > removed. > > > > > > > > > > > > > > Please see my email: > > > > > > > https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb > > > > > > > > > > > > > > I think this is actually a bug in kernfs... > > > > > > > > > > > > > > Honza > > > > > > > > > > > > Thanks, I'm looking at this now. I've tried calling clear_nlink in > > > > > > kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir > > > > > > and shrink_dcache_parent is called, d_walk doesn't find any entries, > > > > > > so shrink_kill->__dentry_kill is not called. I'm investigating why > > > > > > that is... > > > > > > > > > > Strange because when I was experimenting with this in my VM I have seen > > > > > __dentry_kill being called (if the dentries were created by someone looking > > > > > up the names). > > > > > > > > Ahh yes, that's the difference. I was just doing mkdir > > > > /sys/fs/cgroup/foo immediately followed by rmdir /sys/fs/cgroup/foo. > > > > kernfs creates the dentries in kernfs_iop_lookup, so there were none > > > > when I did the rmdir because I didn't cause any lookups. 
> > > > > > > > If I actually have a program watching > > > > /sys/fs/cgroup/foo/memory.events, then I do see the __dentry_kill > > > > calls, but despite the prior clear_nlink call i_nlink is 1 so > > > > fsnotify_inoderemove is skipped. Something must be incrementing it. > > > > > > The issue was that kernfs_remove unlinks the kernfs nodes, but doesn't > > > clear_nlink when it does so. Adding that seems to work to generate > > > IN_DELETE_SELF and IN_IGNORED. I'll do some more testing and get a > > > patch ready. > > > > This works for the rmdir case, because > > vfs_rmdir->shrink_dcache_parent->shrink_kill->__dentry_kill is invoked > > when the user runs rmdir. > > > > However the case where a kernfs file is removed because a cgroup > > subsys is deactivated does not work, because it occurs when the user > > writes to cgroup.subtree_control. That is a vfs_write which calls > > fsnotify_modify for cgroup.subtree_control, but (very reasonably) > > there is no attempt made to clean up the dcache in VFS on writes. > > OK, and is this mostly a theoretical concern or do you practically expect > someone to monitor subsystem files in a cgroup with inotify to learn that > the subsystem has been disabled? It doesn't look very probable to me... The rmdir case is the main one I'd like to fix. In production we don't currently disable cgroup controllers after they have been enabled. I agree the monitor-for-subsystem-disable case seems improbable. > > So I think kernfs still needs to generate fsnotify events manually for > > the cgroup_subtree_control_write->cgroup_apply_control_disable case. > > Those removals happen via kernfs_remove_by_name->__kernfs_remove, so > > that would look a lot like what I sent in this v3 patch, even if we > > also add clear_nlink calls for the rmdir case. > > If there's a sensible usecase for monitoring of subsystem files being > deleted, we could also d_delete() the dentry from cgroup_rm_file(). 
But > maybe the performance overhead would be visible for some larger scale > removals so maybe just using fsnotify_inoderemove() to paper over the > problem would be easier if this case is really needed. > > Honza > -- > Jan Kara <jack@suse.com> > SUSE Labs, CR
© 2016 - 2026 Red Hat, Inc.