Adapt pidfs to use the rhashtable-based xattr path by switching from a
dedicated slab cache to simple_xattrs_alloc().
Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) to
allocate a struct simple_xattrs, which was then initialized with
simple_xattrs_init(). Replace this with simple_xattrs_alloc() which
combines kzalloc + rhashtable_init, and drop the dedicated slab cache
entirely.
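Use simple_xattr_free_rcu() for replaced xattr entries to allow
concurrent RCU readers to finish.
Because rhashtable_free_and_destroy() can sleep, pidfs_free_pid() no
longer frees attrs that carry xattrs directly: it queues them on an
llist and frees them from a work item. Attrs without xattrs are still
freed immediately with kfree().
pidfs_xattr_get() now returns -ENODATA instead of 0 when no xattrs have
been allocated for the pid.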
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/pidfs.c | 65 +++++++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 41 insertions(+), 24 deletions(-)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 1e20e36e0ed5..cb62000681df 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -21,6 +21,7 @@
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/coredump.h>
+#include <linux/llist.h>
#include <linux/xattr.h>
#include "internal.h"
@@ -29,7 +30,6 @@
#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
-static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
static struct path pidfs_root_path = {};
@@ -44,9 +44,8 @@ enum pidfs_attr_mask_bits {
PIDFS_ATTR_BIT_COREDUMP = 1,
};
-struct pidfs_attr {
+struct pidfs_anon_attr {
unsigned long attr_mask;
- struct simple_xattrs *xattrs;
struct /* exit info */ {
__u64 cgroupid;
__s32 exit_code;
@@ -55,6 +54,14 @@ struct pidfs_attr {
__u32 coredump_signal;
};
+struct pidfs_attr {
+ struct simple_xattrs *xattrs;
+ union {
+ struct pidfs_anon_attr;
+ struct llist_node pidfs_llist;
+ };
+};
+
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
@@ -147,10 +154,30 @@ void pidfs_remove_pid(struct pid *pid)
write_seqcount_end(&pidmap_lock_seq);
}
+static LLIST_HEAD(pidfs_free_list);
+
+static void pidfs_free_attr_work(struct work_struct *work)
+{
+ struct pidfs_attr *attr, *next;
+ struct llist_node *head;
+
+ head = llist_del_all(&pidfs_free_list);
+ llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
+ struct simple_xattrs *xattrs = attr->xattrs;
+
+ if (xattrs) {
+ simple_xattrs_free(xattrs, NULL);
+ kfree(xattrs);
+ }
+ kfree(attr);
+ }
+}
+
+static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
+
void pidfs_free_pid(struct pid *pid)
{
- struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
- struct simple_xattrs *xattrs __free(kfree) = NULL;
+ struct pidfs_attr *attr = pid->attr;
/*
* Any dentry must've been wiped from the pid by now.
@@ -169,9 +196,10 @@ void pidfs_free_pid(struct pid *pid)
if (IS_ERR(attr))
return;
- xattrs = no_free_ptr(attr->xattrs);
- if (xattrs)
- simple_xattrs_free(xattrs, NULL);
+ if (likely(!attr->xattrs))
+ kfree(attr);
+ else if (llist_add(&attr->pidfs_llist, &pidfs_free_list))
+ schedule_work(&pidfs_free_work);
}
#ifdef CONFIG_PROC_FS
@@ -998,7 +1026,7 @@ static int pidfs_xattr_get(const struct xattr_handler *handler,
xattrs = READ_ONCE(attr->xattrs);
if (!xattrs)
- return 0;
+ return -ENODATA;
name = xattr_full_name(handler, suffix);
return simple_xattr_get(xattrs, name, value, size);
@@ -1018,22 +1046,16 @@ static int pidfs_xattr_set(const struct xattr_handler *handler,
/* Ensure we're the only one to set @attr->xattrs. */
WARN_ON_ONCE(!inode_is_locked(inode));
- xattrs = READ_ONCE(attr->xattrs);
- if (!xattrs) {
- xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
- if (!xattrs)
- return -ENOMEM;
-
- simple_xattrs_init(xattrs);
- smp_store_release(&pid->attr->xattrs, xattrs);
- }
+ xattrs = simple_xattrs_lazy_alloc(&attr->xattrs, value, flags);
+ if (IS_ERR_OR_NULL(xattrs))
+ return PTR_ERR(xattrs);
name = xattr_full_name(handler, suffix);
old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
if (IS_ERR(old_xattr))
return PTR_ERR(old_xattr);
- simple_xattr_free(old_xattr);
+ simple_xattr_free_rcu(old_xattr);
return 0;
}
@@ -1108,11 +1130,6 @@ void __init pidfs_init(void)
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
SLAB_ACCOUNT | SLAB_PANIC), NULL);
- pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
- sizeof(struct simple_xattrs), 0,
- (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT | SLAB_PANIC), NULL);
-
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
--
2.47.3
On Mon 16-02-26 14:32:01, Christian Brauner wrote:
> Adapt pidfs to use the rhashtable-based xattr path by switching from a
> dedicated slab cache to simple_xattrs_alloc().
>
> Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) that
> allocated a struct containing an embedded simple_xattrs plus
> simple_xattrs_init(). Replace this with simple_xattrs_alloc() which
> combines kzalloc + rhashtable_init, and drop the dedicated slab cache
> entirely.
>
> Use simple_xattr_free_rcu() for replaced xattr entries to allow
> concurrent RCU readers to finish.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
One question below:
> +static LLIST_HEAD(pidfs_free_list);
> +
> +static void pidfs_free_attr_work(struct work_struct *work)
> +{
> + struct pidfs_attr *attr, *next;
> + struct llist_node *head;
> +
> + head = llist_del_all(&pidfs_free_list);
> + llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
> + struct simple_xattrs *xattrs = attr->xattrs;
> +
> + if (xattrs) {
> + simple_xattrs_free(xattrs, NULL);
> + kfree(xattrs);
> + }
> + kfree(attr);
> + }
> +}
> +
> +static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
> +
So you bother with postponing the freeing to a scheduled work because
put_pid() can be called from a context where taking the RCU read lock to
iterate the rhashtable would not be possible? Frankly, I have a hard time
imagining such a context (where the previous rbtree code wouldn't have
issues as well), in particular because AFAIR RCU read-side critical
sections can be nested arbitrarily. What am I missing?
Honza
> void pidfs_free_pid(struct pid *pid)
> {
> - struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
> - struct simple_xattrs *xattrs __free(kfree) = NULL;
> + struct pidfs_attr *attr = pid->attr;
>
> /*
> * Any dentry must've been wiped from the pid by now.
> @@ -169,9 +196,10 @@ void pidfs_free_pid(struct pid *pid)
> if (IS_ERR(attr))
> return;
>
> - xattrs = no_free_ptr(attr->xattrs);
> - if (xattrs)
> - simple_xattrs_free(xattrs, NULL);
> + if (likely(!attr->xattrs))
> + kfree(attr);
> + else if (llist_add(&attr->pidfs_llist, &pidfs_free_list))
> + schedule_work(&pidfs_free_work);
> }
>
> #ifdef CONFIG_PROC_FS
> @@ -998,7 +1026,7 @@ static int pidfs_xattr_get(const struct xattr_handler *handler,
>
> xattrs = READ_ONCE(attr->xattrs);
> if (!xattrs)
> - return 0;
> + return -ENODATA;
>
> name = xattr_full_name(handler, suffix);
> return simple_xattr_get(xattrs, name, value, size);
> @@ -1018,22 +1046,16 @@ static int pidfs_xattr_set(const struct xattr_handler *handler,
> /* Ensure we're the only one to set @attr->xattrs. */
> WARN_ON_ONCE(!inode_is_locked(inode));
>
> - xattrs = READ_ONCE(attr->xattrs);
> - if (!xattrs) {
> - xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
> - if (!xattrs)
> - return -ENOMEM;
> -
> - simple_xattrs_init(xattrs);
> - smp_store_release(&pid->attr->xattrs, xattrs);
> - }
> + xattrs = simple_xattrs_lazy_alloc(&attr->xattrs, value, flags);
> + if (IS_ERR_OR_NULL(xattrs))
> + return PTR_ERR(xattrs);
>
> name = xattr_full_name(handler, suffix);
> old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
> if (IS_ERR(old_xattr))
> return PTR_ERR(old_xattr);
>
> - simple_xattr_free(old_xattr);
> + simple_xattr_free_rcu(old_xattr);
> return 0;
> }
>
> @@ -1108,11 +1130,6 @@ void __init pidfs_init(void)
> (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
> SLAB_ACCOUNT | SLAB_PANIC), NULL);
>
> - pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
> - sizeof(struct simple_xattrs), 0,
> - (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
> - SLAB_ACCOUNT | SLAB_PANIC), NULL);
> -
> pidfs_mnt = kern_mount(&pidfs_type);
> if (IS_ERR(pidfs_mnt))
> panic("Failed to mount pidfs pseudo filesystem");
>
> --
> 2.47.3
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
On Fri 27-02-26 16:09:15, Jan Kara wrote:
> On Mon 16-02-26 14:32:01, Christian Brauner wrote:
> > Adapt pidfs to use the rhashtable-based xattr path by switching from a
> > dedicated slab cache to simple_xattrs_alloc().
> >
> > Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) that
> > allocated a struct containing an embedded simple_xattrs plus
> > simple_xattrs_init(). Replace this with simple_xattrs_alloc() which
> > combines kzalloc + rhashtable_init, and drop the dedicated slab cache
> > entirely.
> >
> > Use simple_xattr_free_rcu() for replaced xattr entries to allow
> > concurrent RCU readers to finish.
> >
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
>
> One question below:
>
> > +static LLIST_HEAD(pidfs_free_list);
> > +
> > +static void pidfs_free_attr_work(struct work_struct *work)
> > +{
> > + struct pidfs_attr *attr, *next;
> > + struct llist_node *head;
> > +
> > + head = llist_del_all(&pidfs_free_list);
> > + llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
> > + struct simple_xattrs *xattrs = attr->xattrs;
> > +
> > + if (xattrs) {
> > + simple_xattrs_free(xattrs, NULL);
> > + kfree(xattrs);
> > + }
> > + kfree(attr);
> > + }
> > +}
> > +
> > +static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
> > +
>
> So you bother with postponing the freeing to a scheduled work because
> put_pid() can be called from a context where acquiring rcu to iterate
> rhashtable would not be possible? Frankly I have hard time imagining such
> context (where previous rbtree code wouldn't have issues as well), in
> particular because AFAIR rcu is safe to arbitrarily nest. What am I
> missing?
Ah, I've now found out rhashtable_free_and_destroy() can sleep and that's
likely the reason. OK. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
On Fri, Feb 27, 2026 at 04:16:04PM +0100, Jan Kara wrote:
> On Fri 27-02-26 16:09:15, Jan Kara wrote:
> > On Mon 16-02-26 14:32:01, Christian Brauner wrote:
> > > Adapt pidfs to use the rhashtable-based xattr path by switching from a
> > > dedicated slab cache to simple_xattrs_alloc().
> > >
> > > Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) that
> > > allocated a struct containing an embedded simple_xattrs plus
> > > simple_xattrs_init(). Replace this with simple_xattrs_alloc() which
> > > combines kzalloc + rhashtable_init, and drop the dedicated slab cache
> > > entirely.
> > >
> > > Use simple_xattr_free_rcu() for replaced xattr entries to allow
> > > concurrent RCU readers to finish.
> > >
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> >
> > One question below:
> >
> > > +static LLIST_HEAD(pidfs_free_list);
> > > +
> > > +static void pidfs_free_attr_work(struct work_struct *work)
> > > +{
> > > + struct pidfs_attr *attr, *next;
> > > + struct llist_node *head;
> > > +
> > > + head = llist_del_all(&pidfs_free_list);
> > > + llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
> > > + struct simple_xattrs *xattrs = attr->xattrs;
> > > +
> > > + if (xattrs) {
> > > + simple_xattrs_free(xattrs, NULL);
> > > + kfree(xattrs);
> > > + }
> > > + kfree(attr);
> > > + }
> > > +}
> > > +
> > > +static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
> > > +
> >
> > So you bother with postponing the freeing to a scheduled work because
> > put_pid() can be called from a context where acquiring rcu to iterate
> > rhashtable would not be possible? Frankly I have hard time imagining such
> > context (where previous rbtree code wouldn't have issues as well), in
> > particular because AFAIR rcu is safe to arbitrarily nest. What am I
> > missing?
>
> Ah, I've now found out rhashtable_free_and_destroy() can sleep and that's
> likely the reason. OK. Feel free to add:
Yeah, it was a surprise to me too. :)
© 2016 - 2026 Red Hat, Inc.