[PATCH] pidfs: implement ino allocation without the pidmap lock

Mateusz Guzik posted 1 patch 2 weeks, 3 days ago
fs/pidfs.c   | 107 +++++++++++++++++++++++++++++++++++----------------
kernel/pid.c |   3 +-
2 files changed, 74 insertions(+), 36 deletions(-)
[PATCH] pidfs: implement ino allocation without the pidmap lock
Posted by Mateusz Guzik 2 weeks, 3 days ago
This paves the way for scalable PID allocation later.

The 32 bit variant merely takes a spinlock for simplicity, the 64 bit
variant uses a scalable scheme.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

this patch assumes the rb -> rhashtable conversion landed

i booted the 32 bit code on the 64 bit kernel, i take it its fine

I'm slightly worried about error handling. It seems pid->pidfs_hash.next
= NULL is supposed to sort it out.

Given that ino of 0 is not legal, I think it should be used as a
sentinel value for presence in the table instead.

so something like:

alloc_pid:
pid->ino = 0;
....

then:

void pidfs_remove_pid(struct pid *pid)
{
        if (unlikely(!pid->ino))
                return;
        rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
                               pidfs_ino_ht_params);
}

 fs/pidfs.c   | 107 +++++++++++++++++++++++++++++++++++----------------
 kernel/pid.c |   3 +-
 2 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 3da5e8e0a76b..46b46a484d45 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -65,7 +65,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
 	.automatic_shrinking	= true,
 };
 
+/*
+ * inode number handling
+ *
+ * On 64 bit nothing special happens. The 64bit number assigned
+ * to struct pid is the inode number.
+ *
+ * On 32 bit the 64 bit number assigned to struct pid is split
+ * into two 32 bit numbers. The lower 32 bits are used as the
+ * inode number and the upper 32 bits are used as the inode
+ * generation number.
+ *
+ * On 32 bit pidfs_ino() will return the lower 32 bit. When
+ * pidfs_ino() returns zero a wrap around happened. When a
+ * wraparound happens the 64 bit number will be incremented by 2
+ * so inode numbering starts at 2 again.
+ *
+ * On 64 bit comparing two pidfds is as simple as comparing
+ * inode numbers.
+ *
+ * When a wraparound happens on 32 bit multiple pidfds with the
+ * same inode number are likely to exist (This isn't a problem
+ * since before pidfs pidfds used the anonymous inode meaning
+ * all pidfds had the same inode number.). Userspace can
+ * reconstruct the 64 bit identifier by retrieving both the
+ * inode number and the inode generation number to compare or
+ * use file handles.
+ */
+
 #if BITS_PER_LONG == 32
+
+DEFINE_SPINLOCK(pidfs_ino_lock);
+static u64 pidfs_ino_nr = 2;
+
 static inline unsigned long pidfs_ino(u64 ino)
 {
 	return lower_32_bits(ino);
@@ -77,6 +109,18 @@ static inline u32 pidfs_gen(u64 ino)
 	return upper_32_bits(ino);
 }
 
+static inline u64 pidfs_alloc_ino(void)
+{
+	u64 ino;
+
+	spin_lock(&pidfs_ino_lock);
+	if (pidfs_ino(pidfs_ino_nr) == 0)
+		pidfs_ino_nr += 2;
+	ino = pidfs_ino_nr++;
+	spin_unlock(&pidfs_ino_lock);
+	return ino;
+}
+
 #else
 
 /* On 64 bit simply return ino. */
@@ -90,53 +134,48 @@ static inline u32 pidfs_gen(u64 ino)
 {
 	return 0;
 }
-#endif
 
 /*
- * Allocate inode number and initialize pidfs fields.
- * Called with pidmap_lock held.
+ * A patched up copy of get_next_ino(). Uses 64 bit, does not do overflow checks
+ * and guarantees ino of at least 2.
  */
-void pidfs_prepare_pid(struct pid *pid)
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(u64, pidfs_last_ino);
+
+static u64 pidfs_alloc_ino(void)
 {
-	static u64 pidfs_ino_nr = 2;
+	u64 *p = &get_cpu_var(pidfs_last_ino);
+	u64 res = *p;
+
+#ifdef CONFIG_SMP
+	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
+		static atomic64_t pidfs_shared_last_ino = ATOMIC_INIT(2);
+		u64 next = atomic64_add_return(LAST_INO_BATCH, &pidfs_shared_last_ino);
+		res = next - LAST_INO_BATCH;
+	}
+#endif
 
-	/*
-	 * On 64 bit nothing special happens. The 64bit number assigned
-	 * to struct pid is the inode number.
-	 *
-	 * On 32 bit the 64 bit number assigned to struct pid is split
-	 * into two 32 bit numbers. The lower 32 bits are used as the
-	 * inode number and the upper 32 bits are used as the inode
-	 * generation number.
-	 *
-	 * On 32 bit pidfs_ino() will return the lower 32 bit. When
-	 * pidfs_ino() returns zero a wrap around happened. When a
-	 * wraparound happens the 64 bit number will be incremented by 2
-	 * so inode numbering starts at 2 again.
-	 *
-	 * On 64 bit comparing two pidfds is as simple as comparing
-	 * inode numbers.
-	 *
-	 * When a wraparound happens on 32 bit multiple pidfds with the
-	 * same inode number are likely to exist (This isn't a problem
-	 * since before pidfs pidfds used the anonymous inode meaning
-	 * all pidfds had the same inode number.). Userspace can
-	 * reconstruct the 64 bit identifier by retrieving both the
-	 * inode number and the inode generation number to compare or
-	 * use file handles.
-	 */
-	if (pidfs_ino(pidfs_ino_nr) == 0)
-		pidfs_ino_nr += 2;
+	res++;
+	*p = res;
+	put_cpu_var(pidfs_last_ino);
+	return res;
+}
+
+#endif
 
-	pid->ino = pidfs_ino_nr;
+/*
+ * Initialize pidfs fields.
+ */
+void pidfs_prepare_pid(struct pid *pid)
+{
 	pid->pidfs_hash.next = NULL;
 	pid->stashed = NULL;
 	pid->attr = NULL;
-	pidfs_ino_nr++;
 }
 
 int pidfs_add_pid(struct pid *pid)
 {
+	pid->ino = pidfs_alloc_ino();
 	return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
 				      pidfs_ino_ht_params);
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 06356e40ac00..72c9372b84b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 	init_waitqueue_head(&pid->wait_pidfd);
 	INIT_HLIST_HEAD(&pid->inodes);
+	pidfs_prepare_pid(pid);
 
 	/*
 	 * 2. perm check checkpoint_restore_ns_capable()
@@ -314,8 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 	retval = -ENOMEM;
 	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
 		goto out_free;
-	pidfs_prepare_pid(pid);
-
 	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
-- 
2.48.1
Re: [PATCH] pidfs: implement ino allocation without the pidmap lock
Posted by Christian Brauner 2 weeks ago
On Tue, Jan 20, 2026 at 07:45:39PM +0100, Mateusz Guzik wrote:
> This paves the way for scalable PID allocation later.
> 
> The 32 bit variant merely takes a spinlock for simplicity, the 64 bit
> variant uses a scalable scheme.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
> 
> this patch assumes the rb -> rhashtable conversion landed
> 
> i booted the 32 bit code on the 64 bit kernel, i take it its fine
> 
> I'm slightly worried about error handling. It seems pid->pidfs_hash.next
> = NULL is supposed to sort it out.

r
> 
> Given that ino of 0 is not legal, I think it should be used as a
> sentinel value for presence in the table instead.
> 
> so something like:
> 
> alloc_pid:
> pid->ino = 0;
> ....
> 
> then:
> 
> void pidfs_remove_pid(struct pid *pid)
> {
>         if (unlikely(!pid->ino))
>                 return;
>         rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
>                                pidfs_ino_ht_params);
> }

All fine, but we should probably just use DEFINE_COOKIE() and accept
that numbering starts at 1 for 64-bit. Does the below look good to you
too?

From aaaa5cbb6e6920854aaee0ed59382d71614e785e Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Tue, 20 Jan 2026 19:45:39 +0100
Subject: [PATCH] pidfs: implement ino allocation without the pidmap lock

This paves the way for scalable PID allocation later.

The 32 bit variant merely takes a spinlock for simplicity, the 64 bit
variant uses a scalable scheme.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20260120184539.1480930-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c   | 115 ++++++++++++++++++++++++++++++++++-----------------
 kernel/pid.c |   3 +-
 2 files changed, 78 insertions(+), 40 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index ee0e36dd29d2..3e7e7bdda780 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -23,6 +23,7 @@
 #include <linux/coredump.h>
 #include <linux/rhashtable.h>
 #include <linux/xattr.h>
+#include <linux/cookie.h>
 
 #include "internal.h"
 #include "mount.h"
@@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
 	.automatic_shrinking	= true,
 };
 
+/*
+ * inode number handling
+ *
+ * On 64 bit nothing special happens. The 64bit number assigned
+ * to struct pid is the inode number.
+ *
+ * On 32 bit the 64 bit number assigned to struct pid is split
+ * into two 32 bit numbers. The lower 32 bits are used as the
+ * inode number and the upper 32 bits are used as the inode
+ * generation number.
+ *
+ * On 32 bit pidfs_ino() will return the lower 32 bit. When
+ * pidfs_ino() returns zero a wrap around happened. When a
+ * wraparound happens the 64 bit number will be incremented by 2
+ * so inode numbering starts at 2 again.
+ *
+ * On 64 bit comparing two pidfds is as simple as comparing
+ * inode numbers.
+ *
+ * When a wraparound happens on 32 bit multiple pidfds with the
+ * same inode number are likely to exist (This isn't a problem
+ * since before pidfs pidfds used the anonymous inode meaning
+ * all pidfds had the same inode number.). Userspace can
+ * reconstruct the 64 bit identifier by retrieving both the
+ * inode number and the inode generation number to compare or
+ * use file handles.
+ */
+
 #if BITS_PER_LONG == 32
+
+DEFINE_SPINLOCK(pidfs_ino_lock);
+static u64 pidfs_ino_nr = 2;
+
 static inline unsigned long pidfs_ino(u64 ino)
 {
 	return lower_32_bits(ino);
@@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
 	return upper_32_bits(ino);
 }
 
+static inline u64 pidfs_alloc_ino(void)
+{
+	u64 ino;
+
+	spin_lock(&pidfs_ino_lock);
+	if (pidfs_ino(pidfs_ino_nr) == 0)
+		pidfs_ino_nr += 2;
+	ino = pidfs_ino_nr++;
+	spin_unlock(&pidfs_ino_lock);
+	return ino;
+}
+
 #else
 
 /* On 64 bit simply return ino. */
@@ -90,61 +135,55 @@ static inline u32 pidfs_gen(u64 ino)
 {
 	return 0;
 }
+
+DEFINE_COOKIE(pidfs_ino_cookie);
+
+static u64 pidfs_alloc_ino(void)
+{
+	u64 ino;
+
+	preempt_disable();
+	ino = gen_cookie_next(&pidfs_ino_cookie);
+	preempt_enable();
+
+	VFS_WARN_ON_ONCE(ino < 1);
+	return ino;
+}
+
 #endif
 
 /*
- * Allocate inode number and initialize pidfs fields.
- * Called with pidmap_lock held.
+ * Initialize pidfs fields.
  */
 void pidfs_prepare_pid(struct pid *pid)
 {
-	static u64 pidfs_ino_nr = 2;
-
-	/*
-	 * On 64 bit nothing special happens. The 64bit number assigned
-	 * to struct pid is the inode number.
-	 *
-	 * On 32 bit the 64 bit number assigned to struct pid is split
-	 * into two 32 bit numbers. The lower 32 bits are used as the
-	 * inode number and the upper 32 bits are used as the inode
-	 * generation number.
-	 *
-	 * On 32 bit pidfs_ino() will return the lower 32 bit. When
-	 * pidfs_ino() returns zero a wrap around happened. When a
-	 * wraparound happens the 64 bit number will be incremented by 2
-	 * so inode numbering starts at 2 again.
-	 *
-	 * On 64 bit comparing two pidfds is as simple as comparing
-	 * inode numbers.
-	 *
-	 * When a wraparound happens on 32 bit multiple pidfds with the
-	 * same inode number are likely to exist (This isn't a problem
-	 * since before pidfs pidfds used the anonymous inode meaning
-	 * all pidfds had the same inode number.). Userspace can
-	 * reconstruct the 64 bit identifier by retrieving both the
-	 * inode number and the inode generation number to compare or
-	 * use file handles.
-	 */
-	if (pidfs_ino(pidfs_ino_nr) == 0)
-		pidfs_ino_nr += 2;
-
-	pid->ino = pidfs_ino_nr;
 	pid->pidfs_hash.next = NULL;
 	pid->stashed = NULL;
 	pid->attr = NULL;
-	pidfs_ino_nr++;
 }
 
 int pidfs_add_pid(struct pid *pid)
 {
-	return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
-				      pidfs_ino_ht_params);
+	int ret;
+
+	pid->ino = pidfs_alloc_ino();
+	ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+				     pidfs_ino_ht_params);
+	/*
+	 * This is fine. The pid will not have a task attached to it so
+	 * no pidfd can be created for it. So we can unset the inode
+	 * number.
+	 */
+	if (unlikely(ret))
+		pid->ino = 0;
+	return ret;
 }
 
 void pidfs_remove_pid(struct pid *pid)
 {
-	rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
-			       pidfs_ino_ht_params);
+	if (likely(pid->ino))
+		rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+				       pidfs_ino_ht_params);
 }
 
 void pidfs_free_pid(struct pid *pid)
diff --git a/kernel/pid.c b/kernel/pid.c
index 6077da774652..dfa545a97c00 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 	init_waitqueue_head(&pid->wait_pidfd);
 	INIT_HLIST_HEAD(&pid->inodes);
+	pidfs_prepare_pid(pid);
 
 	/*
 	 * 2. perm check checkpoint_restore_ns_capable()
@@ -313,8 +314,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 	retval = -ENOMEM;
 	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
 		goto out_free;
-	pidfs_prepare_pid(pid);
-
 	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
-- 
2.47.3
Re: [PATCH] pidfs: implement ino allocation without the pidmap lock
Posted by Mateusz Guzik 2 weeks ago
On Fri, Jan 23, 2026 at 1:06 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, Jan 20, 2026 at 07:45:39PM +0100, Mateusz Guzik wrote:
> > This paves the way for scalable PID allocation later.
> >
> > The 32 bit variant merely takes a spinlock for simplicity, the 64 bit
> > variant uses a scalable scheme.
> >
> > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> > ---
> >
> > this patch assumes the rb -> rhashtable conversion landed
> >
> > i booted the 32 bit code on the 64 bit kernel, i take it its fine
> >
> > I'm slightly worried about error handling. It seems pid->pidfs_hash.next
> > = NULL is supposed to sort it out.
>
> r
> >
> > Given that ino of 0 is not legal, I think it should be used as a
> > sentinel value for presence in the table instead.
> >
> > so something like:
> >
> > alloc_pid:
> > pid->ino = 0;
> > ....
> >
> > then:
> >
> > void pidfs_remove_pid(struct pid *pid)
> > {
> >         if (unlikely(!pid->ino))
> >                 return;
> >         rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> >                                pidfs_ino_ht_params);
> > }
>
> All fine, but we should probably just use DEFINE_COOKIE() and accept
> that numbering starts at 1 for 64-bit. Does the below look good to you
> too?
>
> From aaaa5cbb6e6920854aaee0ed59382d71614e785e Mon Sep 17 00:00:00 2001
> From: Mateusz Guzik <mjguzik@gmail.com>
> Date: Tue, 20 Jan 2026 19:45:39 +0100
> Subject: [PATCH] pidfs: implement ino allocation without the pidmap lock
>
> This paves the way for scalable PID allocation later.
>
> The 32 bit variant merely takes a spinlock for simplicity, the 64 bit
> variant uses a scalable scheme.
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> Link: https://patch.msgid.link/20260120184539.1480930-1-mjguzik@gmail.com
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/pidfs.c   | 115 ++++++++++++++++++++++++++++++++++-----------------
>  kernel/pid.c |   3 +-
>  2 files changed, 78 insertions(+), 40 deletions(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index ee0e36dd29d2..3e7e7bdda780 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -23,6 +23,7 @@
>  #include <linux/coredump.h>
>  #include <linux/rhashtable.h>
>  #include <linux/xattr.h>
> +#include <linux/cookie.h>
>
>  #include "internal.h"
>  #include "mount.h"
> @@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
>         .automatic_shrinking    = true,
>  };
>
> +/*
> + * inode number handling
> + *
> + * On 64 bit nothing special happens. The 64bit number assigned
> + * to struct pid is the inode number.
> + *
> + * On 32 bit the 64 bit number assigned to struct pid is split
> + * into two 32 bit numbers. The lower 32 bits are used as the
> + * inode number and the upper 32 bits are used as the inode
> + * generation number.
> + *
> + * On 32 bit pidfs_ino() will return the lower 32 bit. When
> + * pidfs_ino() returns zero a wrap around happened. When a
> + * wraparound happens the 64 bit number will be incremented by 2
> + * so inode numbering starts at 2 again.
> + *
> + * On 64 bit comparing two pidfds is as simple as comparing
> + * inode numbers.
> + *
> + * When a wraparound happens on 32 bit multiple pidfds with the
> + * same inode number are likely to exist (This isn't a problem
> + * since before pidfs pidfds used the anonymous inode meaning
> + * all pidfds had the same inode number.). Userspace can
> + * reconstruct the 64 bit identifier by retrieving both the
> + * inode number and the inode generation number to compare or
> + * use file handles.
> + */
> +
>  #if BITS_PER_LONG == 32
> +
> +DEFINE_SPINLOCK(pidfs_ino_lock);
> +static u64 pidfs_ino_nr = 2;
> +
>  static inline unsigned long pidfs_ino(u64 ino)
>  {
>         return lower_32_bits(ino);
> @@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
>         return upper_32_bits(ino);
>  }
>
> +static inline u64 pidfs_alloc_ino(void)
> +{
> +       u64 ino;
> +
> +       spin_lock(&pidfs_ino_lock);
> +       if (pidfs_ino(pidfs_ino_nr) == 0)
> +               pidfs_ino_nr += 2;

this should + 1 for consistency with the 64 bit variant

> +       ino = pidfs_ino_nr++;
> +       spin_unlock(&pidfs_ino_lock);
> +       return ino;
> +}
> +
>  #else
>
>  /* On 64 bit simply return ino. */
> @@ -90,61 +135,55 @@ static inline u32 pidfs_gen(u64 ino)
>  {
>         return 0;
>  }
> +
> +DEFINE_COOKIE(pidfs_ino_cookie);
> +
> +static u64 pidfs_alloc_ino(void)
> +{
> +       u64 ino;
> +
> +       preempt_disable();
> +       ino = gen_cookie_next(&pidfs_ino_cookie);
> +       preempt_enable();
> +
> +       VFS_WARN_ON_ONCE(ino < 1);
> +       return ino;
> +}
> +
>  #endif
>
>  /*
> - * Allocate inode number and initialize pidfs fields.
> - * Called with pidmap_lock held.
> + * Initialize pidfs fields.
>   */
>  void pidfs_prepare_pid(struct pid *pid)
>  {
> -       static u64 pidfs_ino_nr = 2;
> -
> -       /*
> -        * On 64 bit nothing special happens. The 64bit number assigned
> -        * to struct pid is the inode number.
> -        *
> -        * On 32 bit the 64 bit number assigned to struct pid is split
> -        * into two 32 bit numbers. The lower 32 bits are used as the
> -        * inode number and the upper 32 bits are used as the inode
> -        * generation number.
> -        *
> -        * On 32 bit pidfs_ino() will return the lower 32 bit. When
> -        * pidfs_ino() returns zero a wrap around happened. When a
> -        * wraparound happens the 64 bit number will be incremented by 2
> -        * so inode numbering starts at 2 again.
> -        *
> -        * On 64 bit comparing two pidfds is as simple as comparing
> -        * inode numbers.
> -        *
> -        * When a wraparound happens on 32 bit multiple pidfds with the
> -        * same inode number are likely to exist (This isn't a problem
> -        * since before pidfs pidfds used the anonymous inode meaning
> -        * all pidfds had the same inode number.). Userspace can
> -        * reconstruct the 64 bit identifier by retrieving both the
> -        * inode number and the inode generation number to compare or
> -        * use file handles.
> -        */
> -       if (pidfs_ino(pidfs_ino_nr) == 0)
> -               pidfs_ino_nr += 2;
> -
> -       pid->ino = pidfs_ino_nr;
>         pid->pidfs_hash.next = NULL;
>         pid->stashed = NULL;
>         pid->attr = NULL;
> -       pidfs_ino_nr++;
>  }
>
>  int pidfs_add_pid(struct pid *pid)
>  {
> -       return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> -                                     pidfs_ino_ht_params);
> +       int ret;
> +
> +       pid->ino = pidfs_alloc_ino();
> +       ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> +                                    pidfs_ino_ht_params);
> +       /*
> +        * This is fine. The pid will not have a task attached to it so
> +        * no pidfd can be created for it. So we can unset the inode
> +        * number.
> +        */
> +       if (unlikely(ret))
> +               pid->ino = 0;

I would pid->ino = 0 unconditoinally in pidfs_prepare_pid. this
gurantees safe execution of pidfs_remove_pid even with pidfs_add_pid
did not get called

this removes the above comment as well

then pidfs_prepare_pid can also stop nullyifying the hash thing

then that's something i'm happy ti sing off on

> +       return ret;
>  }
>
>  void pidfs_remove_pid(struct pid *pid)
>  {
> -       rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> -                              pidfs_ino_ht_params);
> +       if (likely(pid->ino))
> +               rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> +                                      pidfs_ino_ht_params);
>  }
>
>  void pidfs_free_pid(struct pid *pid)
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 6077da774652..dfa545a97c00 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
>                 INIT_HLIST_HEAD(&pid->tasks[type]);
>         init_waitqueue_head(&pid->wait_pidfd);
>         INIT_HLIST_HEAD(&pid->inodes);
> +       pidfs_prepare_pid(pid);
>
>         /*
>          * 2. perm check checkpoint_restore_ns_capable()
> @@ -313,8 +314,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
>         retval = -ENOMEM;
>         if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
>                 goto out_free;
> -       pidfs_prepare_pid(pid);
> -
>         for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
>                 /* Make the PID visible to find_pid_ns. */
>                 idr_replace(&upid->ns->idr, pid, upid->nr);
> --
> 2.47.3
>