[PATCH 05/14] pidfs: adapt to rhashtable-based simple_xattrs

Christian Brauner posted 14 patches 1 month ago
[PATCH 05/14] pidfs: adapt to rhashtable-based simple_xattrs
Posted by Christian Brauner 1 month ago
Adapt pidfs to use the rhashtable-based xattr path by switching from a
dedicated slab cache to simple_xattrs_alloc().

Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) to allocate
a struct simple_xattrs, which was then set up with simple_xattrs_init().
Replace this with the simple_xattrs_alloc() path, which combines kzalloc +
rhashtable_init; pidfs_xattr_set() now obtains the xattrs via
simple_xattrs_lazy_alloc(). Drop the dedicated slab cache entirely.

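Use simple_xattr_free_rcu() for replaced xattr entries to allow
concurrent RCU readers to finish.

Tearing down the rhashtable can sleep (rhashtable_free_and_destroy()), so
pidfs_free_pid() no longer frees the xattrs inline. A pidfs_attr that has
xattrs is pushed onto a lockless list and freed from a work item; the
common case without xattrs is still freed directly with kfree().

While at it, make pidfs_xattr_get() return -ENODATA instead of 0 when no
xattrs have been allocated yet.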

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c | 65 +++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 24 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 1e20e36e0ed5..cb62000681df 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -21,6 +21,7 @@
 #include <linux/utsname.h>
 #include <net/net_namespace.h>
 #include <linux/coredump.h>
+#include <linux/llist.h>
 #include <linux/xattr.h>
 
 #include "internal.h"
@@ -29,7 +30,6 @@
 #define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
 
 static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
-static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
 
 static struct path pidfs_root_path = {};
 
@@ -44,9 +44,8 @@ enum pidfs_attr_mask_bits {
 	PIDFS_ATTR_BIT_COREDUMP	= 1,
 };
 
-struct pidfs_attr {
+struct pidfs_anon_attr {
 	unsigned long attr_mask;
-	struct simple_xattrs *xattrs;
 	struct /* exit info */ {
 		__u64 cgroupid;
 		__s32 exit_code;
@@ -55,6 +54,14 @@ struct pidfs_attr {
 	__u32 coredump_signal;
 };
 
+struct pidfs_attr {
+	struct simple_xattrs *xattrs;
+	union {
+		struct pidfs_anon_attr;
+		struct llist_node pidfs_llist;
+	};
+};
+
 static struct rb_root pidfs_ino_tree = RB_ROOT;
 
 #if BITS_PER_LONG == 32
@@ -147,10 +154,30 @@ void pidfs_remove_pid(struct pid *pid)
 	write_seqcount_end(&pidmap_lock_seq);
 }
 
+static LLIST_HEAD(pidfs_free_list);
+
+static void pidfs_free_attr_work(struct work_struct *work)
+{
+	struct pidfs_attr *attr, *next;
+	struct llist_node *head;
+
+	head = llist_del_all(&pidfs_free_list);
+	llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
+		struct simple_xattrs *xattrs = attr->xattrs;
+
+		if (xattrs) {
+			simple_xattrs_free(xattrs, NULL);
+			kfree(xattrs);
+		}
+		kfree(attr);
+	}
+}
+
+static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
+
 void pidfs_free_pid(struct pid *pid)
 {
-	struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
-	struct simple_xattrs *xattrs __free(kfree) = NULL;
+	struct pidfs_attr *attr = pid->attr;
 
 	/*
 	 * Any dentry must've been wiped from the pid by now.
@@ -169,9 +196,10 @@ void pidfs_free_pid(struct pid *pid)
 	if (IS_ERR(attr))
 		return;
 
-	xattrs = no_free_ptr(attr->xattrs);
-	if (xattrs)
-		simple_xattrs_free(xattrs, NULL);
+	if (likely(!attr->xattrs))
+		kfree(attr);
+	else if (llist_add(&attr->pidfs_llist, &pidfs_free_list))
+		schedule_work(&pidfs_free_work);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -998,7 +1026,7 @@ static int pidfs_xattr_get(const struct xattr_handler *handler,
 
 	xattrs = READ_ONCE(attr->xattrs);
 	if (!xattrs)
-		return 0;
+		return -ENODATA;
 
 	name = xattr_full_name(handler, suffix);
 	return simple_xattr_get(xattrs, name, value, size);
@@ -1018,22 +1046,16 @@ static int pidfs_xattr_set(const struct xattr_handler *handler,
 	/* Ensure we're the only one to set @attr->xattrs. */
 	WARN_ON_ONCE(!inode_is_locked(inode));
 
-	xattrs = READ_ONCE(attr->xattrs);
-	if (!xattrs) {
-		xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
-		if (!xattrs)
-			return -ENOMEM;
-
-		simple_xattrs_init(xattrs);
-		smp_store_release(&pid->attr->xattrs, xattrs);
-	}
+	xattrs = simple_xattrs_lazy_alloc(&attr->xattrs, value, flags);
+	if (IS_ERR_OR_NULL(xattrs))
+		return PTR_ERR(xattrs);
 
 	name = xattr_full_name(handler, suffix);
 	old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
 	if (IS_ERR(old_xattr))
 		return PTR_ERR(old_xattr);
 
-	simple_xattr_free(old_xattr);
+	simple_xattr_free_rcu(old_xattr);
 	return 0;
 }
 
@@ -1108,11 +1130,6 @@ void __init pidfs_init(void)
 					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
 					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
 
-	pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
-					       sizeof(struct simple_xattrs), 0,
-					       (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
-						SLAB_ACCOUNT | SLAB_PANIC), NULL);
-
 	pidfs_mnt = kern_mount(&pidfs_type);
 	if (IS_ERR(pidfs_mnt))
 		panic("Failed to mount pidfs pseudo filesystem");

-- 
2.47.3
Re: [PATCH 05/14] pidfs: adapt to rhashtable-based simple_xattrs
Posted by Jan Kara 2 weeks, 5 days ago
On Mon 16-02-26 14:32:01, Christian Brauner wrote:
> Adapt pidfs to use the rhashtable-based xattr path by switching from a
> dedicated slab cache to simple_xattrs_alloc().
> 
> Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) that
> allocated a struct containing an embedded simple_xattrs plus
> simple_xattrs_init(). Replace this with simple_xattrs_alloc() which
> combines kzalloc + rhashtable_init, and drop the dedicated slab cache
> entirely.
> 
> Use simple_xattr_free_rcu() for replaced xattr entries to allow
> concurrent RCU readers to finish.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>

One question below:

> +static LLIST_HEAD(pidfs_free_list);
> +
> +static void pidfs_free_attr_work(struct work_struct *work)
> +{
> +	struct pidfs_attr *attr, *next;
> +	struct llist_node *head;
> +
> +	head = llist_del_all(&pidfs_free_list);
> +	llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
> +		struct simple_xattrs *xattrs = attr->xattrs;
> +
> +		if (xattrs) {
> +			simple_xattrs_free(xattrs, NULL);
> +			kfree(xattrs);
> +		}
> +		kfree(attr);
> +	}
> +}
> +
> +static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
> +

So you bother with postponing the freeing to scheduled work because
put_pid() can be called from a context where taking the RCU read lock to
iterate the rhashtable would not be possible? Frankly, I have a hard time
imagining such a context (where the previous rbtree code wouldn't have
issues as well), in particular because AFAIR RCU read-side critical
sections are safe to nest arbitrarily. What am I missing?

								Honza


>  void pidfs_free_pid(struct pid *pid)
>  {
> -	struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
> -	struct simple_xattrs *xattrs __free(kfree) = NULL;
> +	struct pidfs_attr *attr = pid->attr;
>  
>  	/*
>  	 * Any dentry must've been wiped from the pid by now.
> @@ -169,9 +196,10 @@ void pidfs_free_pid(struct pid *pid)
>  	if (IS_ERR(attr))
>  		return;
>  
> -	xattrs = no_free_ptr(attr->xattrs);
> -	if (xattrs)
> -		simple_xattrs_free(xattrs, NULL);
> +	if (likely(!attr->xattrs))
> +		kfree(attr);
> +	else if (llist_add(&attr->pidfs_llist, &pidfs_free_list))
> +		schedule_work(&pidfs_free_work);
>  }
>  
>  #ifdef CONFIG_PROC_FS
> @@ -998,7 +1026,7 @@ static int pidfs_xattr_get(const struct xattr_handler *handler,
>  
>  	xattrs = READ_ONCE(attr->xattrs);
>  	if (!xattrs)
> -		return 0;
> +		return -ENODATA;
>  
>  	name = xattr_full_name(handler, suffix);
>  	return simple_xattr_get(xattrs, name, value, size);
> @@ -1018,22 +1046,16 @@ static int pidfs_xattr_set(const struct xattr_handler *handler,
>  	/* Ensure we're the only one to set @attr->xattrs. */
>  	WARN_ON_ONCE(!inode_is_locked(inode));
>  
> -	xattrs = READ_ONCE(attr->xattrs);
> -	if (!xattrs) {
> -		xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
> -		if (!xattrs)
> -			return -ENOMEM;
> -
> -		simple_xattrs_init(xattrs);
> -		smp_store_release(&pid->attr->xattrs, xattrs);
> -	}
> +	xattrs = simple_xattrs_lazy_alloc(&attr->xattrs, value, flags);
> +	if (IS_ERR_OR_NULL(xattrs))
> +		return PTR_ERR(xattrs);
>  
>  	name = xattr_full_name(handler, suffix);
>  	old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
>  	if (IS_ERR(old_xattr))
>  		return PTR_ERR(old_xattr);
>  
> -	simple_xattr_free(old_xattr);
> +	simple_xattr_free_rcu(old_xattr);
>  	return 0;
>  }
>  
> @@ -1108,11 +1130,6 @@ void __init pidfs_init(void)
>  					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
>  					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
>  
> -	pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
> -					       sizeof(struct simple_xattrs), 0,
> -					       (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
> -						SLAB_ACCOUNT | SLAB_PANIC), NULL);
> -
>  	pidfs_mnt = kern_mount(&pidfs_type);
>  	if (IS_ERR(pidfs_mnt))
>  		panic("Failed to mount pidfs pseudo filesystem");
> 
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH 05/14] pidfs: adapt to rhashtable-based simple_xattrs
Posted by Jan Kara 2 weeks, 5 days ago
On Fri 27-02-26 16:09:15, Jan Kara wrote:
> On Mon 16-02-26 14:32:01, Christian Brauner wrote:
> > Adapt pidfs to use the rhashtable-based xattr path by switching from a
> > dedicated slab cache to simple_xattrs_alloc().
> > 
> > Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) that
> > allocated a struct containing an embedded simple_xattrs plus
> > simple_xattrs_init(). Replace this with simple_xattrs_alloc() which
> > combines kzalloc + rhashtable_init, and drop the dedicated slab cache
> > entirely.
> > 
> > Use simple_xattr_free_rcu() for replaced xattr entries to allow
> > concurrent RCU readers to finish.
> > 
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> 
> One question below:
> 
> > +static LLIST_HEAD(pidfs_free_list);
> > +
> > +static void pidfs_free_attr_work(struct work_struct *work)
> > +{
> > +	struct pidfs_attr *attr, *next;
> > +	struct llist_node *head;
> > +
> > +	head = llist_del_all(&pidfs_free_list);
> > +	llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
> > +		struct simple_xattrs *xattrs = attr->xattrs;
> > +
> > +		if (xattrs) {
> > +			simple_xattrs_free(xattrs, NULL);
> > +			kfree(xattrs);
> > +		}
> > +		kfree(attr);
> > +	}
> > +}
> > +
> > +static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
> > +
> 
> So you bother with postponing the freeing to a scheduled work because
> put_pid() can be called from a context where acquiring rcu to iterate
> rhashtable would not be possible? Frankly I have hard time imagining such
> context (where previous rbtree code wouldn't have issues as well), in
> particular because AFAIR rcu is safe to arbitrarily nest. What am I
> missing?

Ah, I've now found out rhashtable_free_and_destroy() can sleep and that's
likely the reason. OK. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH 05/14] pidfs: adapt to rhashtable-based simple_xattrs
Posted by Christian Brauner 2 weeks, 2 days ago
On Fri, Feb 27, 2026 at 04:16:04PM +0100, Jan Kara wrote:
> On Fri 27-02-26 16:09:15, Jan Kara wrote:
> > On Mon 16-02-26 14:32:01, Christian Brauner wrote:
> > > Adapt pidfs to use the rhashtable-based xattr path by switching from a
> > > dedicated slab cache to simple_xattrs_alloc().
> > > 
> > > Previously pidfs used a custom kmem_cache (pidfs_xattr_cachep) that
> > > allocated a struct containing an embedded simple_xattrs plus
> > > simple_xattrs_init(). Replace this with simple_xattrs_alloc() which
> > > combines kzalloc + rhashtable_init, and drop the dedicated slab cache
> > > entirely.
> > > 
> > > Use simple_xattr_free_rcu() for replaced xattr entries to allow
> > > concurrent RCU readers to finish.
> > > 
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > 
> > One question below:
> > 
> > > +static LLIST_HEAD(pidfs_free_list);
> > > +
> > > +static void pidfs_free_attr_work(struct work_struct *work)
> > > +{
> > > +	struct pidfs_attr *attr, *next;
> > > +	struct llist_node *head;
> > > +
> > > +	head = llist_del_all(&pidfs_free_list);
> > > +	llist_for_each_entry_safe(attr, next, head, pidfs_llist) {
> > > +		struct simple_xattrs *xattrs = attr->xattrs;
> > > +
> > > +		if (xattrs) {
> > > +			simple_xattrs_free(xattrs, NULL);
> > > +			kfree(xattrs);
> > > +		}
> > > +		kfree(attr);
> > > +	}
> > > +}
> > > +
> > > +static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work);
> > > +
> > 
> > So you bother with postponing the freeing to a scheduled work because
> > put_pid() can be called from a context where acquiring rcu to iterate
> > rhashtable would not be possible? Frankly I have hard time imagining such
> > context (where previous rbtree code wouldn't have issues as well), in
> > particular because AFAIR rcu is safe to arbitrarily nest. What am I
> > missing?
> 
> Ah, I've now found out rhashtable_free_and_destroy() can sleep and that's
> likely the reason. OK. Feel free to add:

Yeah, it was a surprise to me too. :)