kernel/exit.c | 3 +- kernel/fork.c | 5 +- kernel/pid.c | 17 +- kernel/pid_namespace.c | 9 - .../selftests/pid_namespace/.gitignore | 1 + .../testing/selftests/pid_namespace/Makefile | 2 +- .../pid_namespace/pidns_init_via_setns.c | 238 ++++++++++++++++++ 7 files changed, 256 insertions(+), 19 deletions(-) create mode 100644 tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
The first patch properly annotates accesses to ->child_reaper with _ONCE macroses, to protect unlocked accesses from possible cpu/compiler optimization problems. The second patch makes sure that the init is always a first process in the pid namespace, previously this was only checked for set_tid case. The third patch allows to join pid namespace before pid namespace init is created, that allows to create pid namespace by one process and then create pid namespace init from another process after setns(). Please see the detailed description in the patch commit message. It depends on the second patch. The forth and the final patch is a comprehansive test, that tests both basic usecase of creating pid namespace and init separately, and a more specific usecase which shows how we can improve clone3(set_tid) usability after this change. This change is generally useful as it makes clone3(set_tid) more universal, and let's it work in all the cases evenly. Also it is highly useful to CRIU to handle nested containers. v2: Use *_ONCE for ->child_reaper accesses atomicity, and avoid taking task_list lock for reading it. Rebase to master. v3: Separate *_ONCE change and "init is first" checks into separate commits. v4: Update second patch commit message. Include Oleg's review tags. This series is also available here: https://github.com/Snorch/linux/commits/allow-creating-pid-namespace-init-after-setns-v4/ Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> Pavel Tikhomirov (4): pid_namespace: avoid optimization of accesses to ->child_reaper pid: check init is created first after idr alloc pid_namespace: allow opening pid_for_children before init was created selftests: Add tests for creating pidns init via setns kernel/exit.c | 3 +- kernel/fork.c | 5 +- kernel/pid.c | 17 +- kernel/pid_namespace.c | 9 - .../selftests/pid_namespace/.gitignore | 1 + .../testing/selftests/pid_namespace/Makefile | 2 +- .../pid_namespace/pidns_init_via_setns.c | 238 ++++++++++++++++++ 7 files changed, 256 insertions(+), 19 deletions(-) create mode 100644 tools/testing/selftests/pid_namespace/pidns_init_via_setns.c -- 2.53.0
Andrew, On top of Pavel's series but but doesn't depend on it. Can be applied before or after. Oleg. --- kernel/fork.c | 6 +++++- kernel/pid.c | 18 +++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-)
Currently we allow only one attempt to create init in a new namespace.
If the first fork() fails after alloc_pid() succeeds, free_pid() clears
PIDNS_ADDING and thus disables further PID allocations.
Nowadays this looks like an unnecessary limitation. The original reason
to handle "case PIDNS_ADDING" in free_pid() is gone, most probably after
commit 69879c01a0c3 ("proc: Remove the now unnecessary internal mount of
proc").
Change free_pid() to keep ns->pid_allocated == PIDNS_ADDING, and change
alloc_pid() to reset the cursor early, right after taking pidmap_lock.
Test-case:
#define _GNU_SOURCE
#include <linux/sched.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <assert.h>
#include <sched.h>
#include <errno.h>
int main(void)
{
struct clone_args args = {
.exit_signal = SIGCHLD,
.flags = CLONE_PIDFD,
.pidfd = 0,
};
unsigned long pidfd;
int pid;
assert(unshare(CLONE_NEWPID) == 0);
pid = syscall(__NR_clone3, &args, sizeof(args));
assert(pid == -1 && errno == EFAULT);
args.pidfd = (unsigned long)&pidfd;
pid = syscall(__NR_clone3, &args, sizeof(args));
if (pid)
assert(pid > 0 && wait(NULL) == pid);
else
assert(getpid() == 1);
return 0;
}
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/pid.c | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index ebf013f35cb3..1a0d2ac1f4a9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -131,9 +131,8 @@ void free_pid(struct pid *pid)
wake_up_process(ns->child_reaper);
break;
case PIDNS_ADDING:
- /* Handle a fork failure of the first process */
- WARN_ON(ns->child_reaper);
- ns->pid_allocated = 0;
+ /* Only possible if the 1st fork fails */
+ WARN_ON(READ_ONCE(ns->child_reaper));
break;
}
@@ -230,6 +229,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
retried_preload = false;
idr_preload(GFP_KERNEL);
spin_lock(&pidmap_lock);
+ /* For the case when the previous attempt to create init failed */
+ if (ns->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
for (tmp = ns, i = ns->level; i >= 0;) {
int tid = set_tid[ns->level - i];
@@ -341,10 +344,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
idr_remove(&upid->ns->idr, upid->nr);
}
- /* On failure to allocate the first pid, reset the state */
- if (ns->pid_allocated == PIDNS_ADDING)
- idr_set_cursor(&ns->idr, 0);
-
spin_unlock(&pidmap_lock);
idr_preload_end();
--
2.52.0
On Fri, Feb 27, 2026 at 4:04 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> Currently we allow only one attempt to create init in a new namespace.
> If the first fork() fails after alloc_pid() succeeds, free_pid() clears
> PIDNS_ADDING and thus disables further PID allocations.
>
> Nowadays this looks like an unnecessary limitation. The original reason
> to handle "case PIDNS_ADDING" in free_pid() is gone, most probably after
> commit 69879c01a0c3 ("proc: Remove the now unnecessary internal mount of
> proc").
>
> Change free_pid() to keep ns->pid_allocated == PIDNS_ADDING, and change
> alloc_pid() to reset the cursor early, right after taking pidmap_lock.
>
> Test-case:
>
> #define _GNU_SOURCE
> #include <linux/sched.h>
> #include <sys/syscall.h>
> #include <sys/wait.h>
> #include <assert.h>
> #include <sched.h>
> #include <errno.h>
>
> int main(void)
> {
> struct clone_args args = {
> .exit_signal = SIGCHLD,
> .flags = CLONE_PIDFD,
> .pidfd = 0,
> };
> unsigned long pidfd;
> int pid;
>
> assert(unshare(CLONE_NEWPID) == 0);
>
> pid = syscall(__NR_clone3, &args, sizeof(args));
> assert(pid == -1 && errno == EFAULT);
>
> args.pidfd = (unsigned long)&pidfd;
> pid = syscall(__NR_clone3, &args, sizeof(args));
> if (pid)
> assert(pid > 0 && wait(NULL) == pid);
> else
> assert(getpid() == 1);
>
> return 0;
> }
>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Andrei Vagin <avagin@gmail.com>
Thanks,
Andrei
Both copy_process() and alloc_pid() do the same PIDNS_ADDING check.
The reasons for these checks, and the fact that both are necessary,
are not immediately obvious. Add the comments.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/fork.c | 6 +++++-
kernel/pid.c | 5 +++++
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 544fe1b43d88..7cfa8addc080 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2392,7 +2392,11 @@ __latent_entropy struct task_struct *copy_process(
rseq_fork(p, clone_flags);
- /* Don't start children in a dying pid namespace */
+ /*
+ * If zap_pid_ns_processes() was called after alloc_pid(), the new
+ * child missed SIGKILL. If current is not in the same namespace,
+ * we can't rely on fatal_signal_pending() below.
+ */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_core_free;
diff --git a/kernel/pid.c b/kernel/pid.c
index 1a0d2ac1f4a9..082a3c4a053f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -317,6 +317,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
*
* This can't be done earlier because we need to preserve other
* error conditions.
+ *
+ * We need this even if copy_process() does the same check. If two
+ * or more tasks from parent namespace try to inject a child into a
+ * dead namespace, one of free_pid() calls from the copy_process()
+ * error path may try to wakeup the possibly freed ns->child_reaper.
*/
retval = -ENOMEM;
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
--
2.52.0
© 2016 - 2026 Red Hat, Inc.