[PATCH 1/2] Revert "pid: allow pid_max to be set per pid namespace"

Michal Koutný posted 2 patches 9 months, 4 weeks ago
[PATCH 1/2] Revert "pid: allow pid_max to be set per pid namespace"
Posted by Michal Koutný 9 months, 4 weeks ago
This reverts commit 7863dcc72d0f4b13a641065670426435448b3d80.

It is already difficult for users to troubleshoot which of multiple pid
limits restricts their workload. I'm afraid making pid_max
per-(hierarchical-)NS will contribute to confusion.
Also, the implementation copies the limit from the parent upon creation.
This pattern proved cumbersome with some attributes in legacy cgroup
controllers -- it is subject to a race condition between the parent's
limit modification and child creation, and once copied, the limit must
be changed in the descendant itself.

This is very similar to what a cgroup's pids.max (already) does, which
can be used as an alternative.

Link: https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 include/linux/pid.h               |   3 +
 include/linux/pid_namespace.h     |  10 +--
 kernel/pid.c                      | 125 ++----------------------------
 kernel/pid_namespace.c            |  43 +++-------
 kernel/sysctl.c                   |   9 +++
 kernel/trace/pid_list.c           |   2 +-
 kernel/trace/trace.h              |   2 +
 kernel/trace/trace_sched_switch.c |   2 +-
 8 files changed, 35 insertions(+), 161 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f33..fe575fcdb4afa 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -108,6 +108,9 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
 
+extern int pid_max;
+extern int pid_max_min, pid_max_max;
+
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
  * or rcu_read_lock() held.
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 7c67a58111998..f9f9931e02d6a 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -30,7 +30,6 @@ struct pid_namespace {
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
-	int pid_max;
 	struct pid_namespace *parent;
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct fs_pin *bacct;
@@ -39,14 +38,9 @@ struct pid_namespace {
 	struct ucounts *ucounts;
 	int reboot;	/* group exit code if this pidns was rebooted */
 	struct ns_common ns;
-	struct work_struct	work;
-#ifdef CONFIG_SYSCTL
-	struct ctl_table_set	set;
-	struct ctl_table_header *sysctls;
-#if defined(CONFIG_MEMFD_CREATE)
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	int memfd_noexec_scope;
 #endif
-#endif
 } __randomize_layout;
 
 extern struct pid_namespace init_pid_ns;
@@ -123,8 +117,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
 void pid_idr_init(void);
-int register_pidns_sysctls(struct pid_namespace *pidns);
-void unregister_pidns_sysctls(struct pid_namespace *pidns);
 
 static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
 {
diff --git a/kernel/pid.c b/kernel/pid.c
index 924084713be8b..aa2a7d4da4555 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -61,8 +61,10 @@ struct pid init_struct_pid = {
 	}, }
 };
 
-static int pid_max_min = RESERVED_PIDS + 1;
-static int pid_max_max = PID_MAX_LIMIT;
+int pid_max = PID_MAX_DEFAULT;
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -81,7 +83,6 @@ struct pid_namespace init_pid_ns = {
 #ifdef CONFIG_PID_NS
 	.ns.ops = &pidns_operations,
 #endif
-	.pid_max = PID_MAX_DEFAULT,
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
 #endif
@@ -190,7 +191,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 
 	for (i = ns->level; i >= 0; i--) {
 		int tid = 0;
-		int pid_max = READ_ONCE(tmp->pid_max);
 
 		if (set_tid_size) {
 			tid = set_tid[ns->level - i];
@@ -644,118 +644,17 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	return fd;
 }
 
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
-{
-	return &task_active_pid_ns(current)->set;
-}
-
-static int set_is_seen(struct ctl_table_set *set)
-{
-	return &task_active_pid_ns(current)->set == set;
-}
-
-static int pid_table_root_permissions(struct ctl_table_header *head,
-				      const struct ctl_table *table)
-{
-	struct pid_namespace *pidns =
-		container_of(head->set, struct pid_namespace, set);
-	int mode = table->mode;
-
-	if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
-	    uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
-		mode = (mode & S_IRWXU) >> 6;
-	else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
-		mode = (mode & S_IRWXG) >> 3;
-	else
-		mode = mode & S_IROTH;
-	return (mode << 6) | (mode << 3) | mode;
-}
-
-static void pid_table_root_set_ownership(struct ctl_table_header *head,
-					 kuid_t *uid, kgid_t *gid)
-{
-	struct pid_namespace *pidns =
-		container_of(head->set, struct pid_namespace, set);
-	kuid_t ns_root_uid;
-	kgid_t ns_root_gid;
-
-	ns_root_uid = make_kuid(pidns->user_ns, 0);
-	if (uid_valid(ns_root_uid))
-		*uid = ns_root_uid;
-
-	ns_root_gid = make_kgid(pidns->user_ns, 0);
-	if (gid_valid(ns_root_gid))
-		*gid = ns_root_gid;
-}
-
-static struct ctl_table_root pid_table_root = {
-	.lookup		= pid_table_root_lookup,
-	.permissions	= pid_table_root_permissions,
-	.set_ownership	= pid_table_root_set_ownership,
-};
-
-static const struct ctl_table pid_table[] = {
-	{
-		.procname	= "pid_max",
-		.data		= &init_pid_ns.pid_max,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &pid_max_min,
-		.extra2		= &pid_max_max,
-	},
-};
-#endif
-
-int register_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
-	struct ctl_table *tbl;
-
-	setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
-
-	tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
-	if (!tbl)
-		return -ENOMEM;
-	tbl->data = &pidns->pid_max;
-	pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
-			     PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
-
-	pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
-						 ARRAY_SIZE(pid_table));
-	if (!pidns->sysctls) {
-		kfree(tbl);
-		retire_sysctl_set(&pidns->set);
-		return -ENOMEM;
-	}
-#endif
-	return 0;
-}
-
-void unregister_pidns_sysctls(struct pid_namespace *pidns)
-{
-#ifdef CONFIG_SYSCTL
-	const struct ctl_table *tbl;
-
-	tbl = pidns->sysctls->ctl_table_arg;
-	unregister_sysctl_table(pidns->sysctls);
-	retire_sysctl_set(&pidns->set);
-	kfree(tbl);
-#endif
-}
-
 void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
-	init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
-				  PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+	pid_max = min(pid_max_max, max_t(int, pid_max,
+				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 	pid_max_min = max_t(int, pid_max_min,
 				PIDS_PER_CPU_MIN * num_possible_cpus());
-	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
+	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
 	idr_init(&init_pid_ns.idr);
 
@@ -766,16 +665,6 @@ void __init pid_idr_init(void)
 			NULL);
 }
 
-static __init int pid_namespace_sysctl_init(void)
-{
-#ifdef CONFIG_SYSCTL
-	/* "kernel" directory will have already been initialized. */
-	BUG_ON(register_pidns_sysctls(&init_pid_ns));
-#endif
-	return 0;
-}
-subsys_initcall(pid_namespace_sysctl_init);
-
 static struct file *__pidfd_fget(struct task_struct *task, int fd)
 {
 	struct file *file;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 8f6cfec87555a..0f23285be4f92 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -70,8 +70,6 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
 	dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
 }
 
-static void destroy_pid_namespace_work(struct work_struct *work);
-
 static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
 	struct pid_namespace *parent_pid_ns)
 {
@@ -107,27 +105,17 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 		goto out_free_idr;
 	ns->ns.ops = &pidns_operations;
 
-	ns->pid_max = parent_pid_ns->pid_max;
-	err = register_pidns_sysctls(ns);
-	if (err)
-		goto out_free_inum;
-
 	refcount_set(&ns->ns.count, 1);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
 	ns->pid_allocated = PIDNS_ADDING;
-	INIT_WORK(&ns->work, destroy_pid_namespace_work);
-
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
 #endif
-
 	return ns;
 
-out_free_inum:
-	ns_free_inum(&ns->ns);
 out_free_idr:
 	idr_destroy(&ns->idr);
 	kmem_cache_free(pid_ns_cachep, ns);
@@ -149,28 +137,12 @@ static void delayed_free_pidns(struct rcu_head *p)
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
-	unregister_pidns_sysctls(ns);
-
 	ns_free_inum(&ns->ns);
 
 	idr_destroy(&ns->idr);
 	call_rcu(&ns->rcu, delayed_free_pidns);
 }
 
-static void destroy_pid_namespace_work(struct work_struct *work)
-{
-	struct pid_namespace *ns =
-		container_of(work, struct pid_namespace, work);
-
-	do {
-		struct pid_namespace *parent;
-
-		parent = ns->parent;
-		destroy_pid_namespace(ns);
-		ns = parent;
-	} while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
-}
-
 struct pid_namespace *copy_pid_ns(unsigned long flags,
 	struct user_namespace *user_ns, struct pid_namespace *old_ns)
 {
@@ -183,8 +155,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
 
 void put_pid_ns(struct pid_namespace *ns)
 {
-	if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
-		schedule_work(&ns->work);
+	struct pid_namespace *parent;
+
+	while (ns != &init_pid_ns) {
+		parent = ns->parent;
+		if (!refcount_dec_and_test(&ns->ns.count))
+			break;
+		destroy_pid_namespace(ns);
+		ns = parent;
+	}
 }
 EXPORT_SYMBOL_GPL(put_pid_ns);
 
@@ -295,7 +274,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
 	next = idr_get_cursor(&pid_ns->idr) - 1;
 
 	tmp.data = &next;
-	tmp.extra2 = &pid_ns->pid_max;
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 	if (!ret && write)
 		idr_set_cursor(&pid_ns->idr, next + 1);
@@ -303,6 +281,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
 	return ret;
 }
 
+extern int pid_max;
 static const struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
@@ -310,7 +289,7 @@ static const struct ctl_table pid_ns_ctl_table[] = {
 		.mode = 0666, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
 		.extra1 = SYSCTL_ZERO,
-		.extra2 = &init_pid_ns.pid_max,
+		.extra2 = &pid_max,
 	},
 };
 #endif	/* CONFIG_CHECKPOINT_RESTORE */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb57da499ebb1..bb739608680f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1803,6 +1803,15 @@ static const struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+	{
+		.procname	= "pid_max",
+		.data		= &pid_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &pid_max_min,
+		.extra2		= &pid_max_max,
+	},
 	{
 		.procname	= "panic_on_oops",
 		.data		= &panic_on_oops,
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index c62b9b3cfb3d8..4966e6bbdf6f3 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
 	int i;
 
 	/* According to linux/thread.h, pids can be no bigger that 30 bits */
-	WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
+	WARN_ON_ONCE(pid_max > (1 << 30));
 
 	pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
 	if (!pid_list)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9c21ba45b7af6..46c65402ad7e5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -732,6 +732,8 @@ extern unsigned long tracing_thresh;
 
 /* PID filtering */
 
+extern int pid_max;
+
 bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
 			     pid_t search_pid);
 bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb49f7279dc80..573b5d8e8a28e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
 	if (tgid_map)
 		return 0;
 
-	tgid_map_max = init_pid_ns.pid_max;
+	tgid_map_max = pid_max;
 	map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
 		       GFP_KERNEL);
 	if (!map)
-- 
2.48.1

Re: [PATCH 1/2] Revert "pid: allow pid_max to be set per pid namespace"
Posted by kernel test robot 9 months, 1 week ago

Hello,

kernel test robot noticed a 23.4% improvement of stress-ng.sigxfsz.ops_per_sec on:


commit: ee2a5c3e36093d0ff5709bc8f21d3793cf55f746 ("[PATCH 1/2] Revert "pid: allow pid_max to be set per pid namespace"")
url: https://github.com/intel-lab-lkp/linux/commits/Michal-Koutn/Revert-pid-allow-pid_max-to-be-set-per-pid-namespace/20250222-010942
patch link: https://lore.kernel.org/all/20250221170249.890014-2-mkoutny@suse.com/
patch subject: [PATCH 1/2] Revert "pid: allow pid_max to be set per pid namespace"

testcase: stress-ng
config: x86_64-rhel-9.4
compiler: gcc-12
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:

	nr_threads: 100%
	testtime: 60s
	test: sigxfsz
	cpufreq_governor: performance


In addition to that, the commit also has a significant impact on the following tests:

+------------------+-------------------------------------------------------------------------------------------+
| testcase: change | stress-ng: stress-ng.mprotect.ops_per_sec 4.5% improvement                                |
| test machine     | 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory |
| test parameters  | cpufreq_governor=performance                                                              |
|                  | nr_threads=100%                                                                           |
|                  | test=mprotect                                                                             |
|                  | testtime=60s                                                                              |
+------------------+-------------------------------------------------------------------------------------------+
| testcase: change | stress-ng: stress-ng.sigrt.ops_per_sec 15.7% improvement                                  |
| test machine     | 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory |
| test parameters  | cpufreq_governor=performance                                                              |
|                  | nr_threads=100%                                                                           |
|                  | test=sigrt                                                                                |
|                  | testtime=60s                                                                              |
+------------------+-------------------------------------------------------------------------------------------+
| testcase: change | stress-ng: stress-ng.sigbus.ops_per_sec 20.6% improvement                                 |
| test machine     | 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory |
| test parameters  | cpufreq_governor=performance                                                              |
|                  | nr_threads=100%                                                                           |
|                  | test=sigbus                                                                               |
|                  | testtime=60s                                                                              |
+------------------+-------------------------------------------------------------------------------------------+




Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250310/202503101532.348576bb-lkp@intel.com

=========================================================================================
compiler/cpufreq_governor/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  gcc-12/performance/x86_64-rhel-9.4/100%/debian-12-x86_64-20240206.cgz/lkp-icl-2sp8/sigxfsz/stress-ng/60s

commit: 
  3344260945 ("Merge tag 'for-v6.14-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply")
  ee2a5c3e36 ("Revert "pid: allow pid_max to be set per pid namespace"")

334426094588f817 ee2a5c3e36093d0ff5709bc8f21 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      5.11            +1.3        6.43        mpstat.cpu.all.usr%
      3737 ±  6%     -38.8%       2286 ± 42%  proc-vmstat.numa_hint_faults_local
   1212920 ±  4%     -10.4%    1086901 ±  5%  sched_debug.cpu.avg_idle.max
     35.50 ± 16%     -30.0%      24.83 ± 20%  perf-c2c.DRAM.local
      1517 ±  4%     -46.5%     812.17 ±  3%  perf-c2c.DRAM.remote
      1808 ±  2%     +57.0%       2840        perf-c2c.HITM.local
      1360 ±  5%     -49.9%     680.83 ±  2%  perf-c2c.HITM.remote
      5.22 ±  3%     +19.8%       6.26 ±  7%  perf-sched.wait_and_delay.avg.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
     53.33 ± 15%     +25.0%      66.67 ± 15%  perf-sched.wait_and_delay.count.__cond_resched.vfs_write.__x64_sys_pwrite64.do_syscall_64.entry_SYSCALL_64_after_hwframe
    953.83 ±  3%     -16.5%     796.33 ±  7%  perf-sched.wait_and_delay.count.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
      5.21 ±  3%     +20.0%       6.25 ±  7%  perf-sched.wait_time.avg.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
    163515           +27.8%     208915        stress-ng.sigxfsz.SIGXFSZ_signals_per_sec
 6.668e+08           +23.4%   8.23e+08        stress-ng.sigxfsz.ops
  11113966           +23.4%   13716156        stress-ng.sigxfsz.ops_per_sec
      3623            -1.4%       3573        stress-ng.time.system_time
    163.26           +31.7%     214.98        stress-ng.time.user_time
      0.25           -54.7%       0.12 ±  2%  perf-stat.i.MPKI
 1.125e+10           +22.1%  1.373e+10        perf-stat.i.branch-instructions
      0.54            -0.0        0.50        perf-stat.i.branch-miss-rate%
  59748239           +10.9%   66264440        perf-stat.i.branch-misses
     33.30           -17.9       15.38 ±  2%  perf-stat.i.cache-miss-rate%
  13040640           -45.8%    7066419 ±  2%  perf-stat.i.cache-misses
  39047103           +15.5%   45098530        perf-stat.i.cache-references
      4.39           -18.2%       3.59        perf-stat.i.cpi
     17823           +97.0%      35113        perf-stat.i.cycles-between-cache-misses
 5.144e+10           +22.0%  6.275e+10        perf-stat.i.instructions
      0.23           +21.3%       0.28        perf-stat.i.ipc
      0.25           -55.6%       0.11 ±  2%  perf-stat.overall.MPKI
      0.53            -0.0        0.48        perf-stat.overall.branch-miss-rate%
     33.40           -17.7       15.67 ±  2%  perf-stat.overall.cache-miss-rate%
      4.40           -18.0%       3.60        perf-stat.overall.cpi
     17350           +84.6%      32027 ±  2%  perf-stat.overall.cycles-between-cache-misses
      0.23           +22.0%       0.28        perf-stat.overall.ipc
 1.106e+10           +22.1%   1.35e+10        perf-stat.ps.branch-instructions
  58763534           +10.9%   65180843        perf-stat.ps.branch-misses
  12827760           -45.8%    6951883 ±  2%  perf-stat.ps.cache-misses
  38411225           +15.5%   44365626        perf-stat.ps.cache-references
  5.06e+10           +22.0%  6.172e+10        perf-stat.ps.instructions
 3.106e+12           +21.9%  3.787e+12        perf-stat.total.instructions


***************************************************************************************************
lkp-icl-2sp7: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  gcc-12/performance/x86_64-rhel-9.4/100%/debian-12-x86_64-20240206.cgz/lkp-icl-2sp7/mprotect/stress-ng/60s

commit: 
  3344260945 ("Merge tag 'for-v6.14-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply")
  ee2a5c3e36 ("Revert "pid: allow pid_max to be set per pid namespace"")

334426094588f817 ee2a5c3e36093d0ff5709bc8f21 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
     10205 ± 25%     +33.5%      13621 ± 16%  numa-meminfo.node0.KernelStack
      0.02 ± 37%     -37.8%       0.01 ± 13%  perf-sched.sch_delay.avg.ms.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      0.82 ± 32%     -37.7%       0.51 ±  7%  perf-sched.sch_delay.max.ms.schedule_timeout.kcompactd.kthread.ret_from_fork
    807.17 ±  5%      -8.5%     738.67 ±  5%  perf-sched.wait_and_delay.count.__cond_resched.down_write.vma_prepare.__split_vma.vma_modify
    433709            +4.9%     454923 ±  5%  proc-vmstat.nr_active_anon
     61940 ±  3%     +31.3%      81315 ± 35%  proc-vmstat.nr_shmem
    433709            +4.9%     454923 ±  5%  proc-vmstat.nr_zone_active_anon
 4.903e+08            +4.5%  5.124e+08        stress-ng.mprotect.ops
   8163833            +4.5%    8533021        stress-ng.mprotect.ops_per_sec
    239.55            +4.7%     250.91        stress-ng.time.user_time
   3960356 ±  7%     -16.0%    3325457        numa-numastat.node0.local_node
   3990670 ±  7%     -16.1%    3348370        numa-numastat.node0.numa_hit
   2608139 ±  6%     +34.5%    3507199 ±  4%  numa-numastat.node1.local_node
   2644058 ±  6%     +34.3%    3550893 ±  4%  numa-numastat.node1.numa_hit
   3986137 ±  7%     -16.0%    3349506        numa-vmstat.node0.numa_hit
   3955823 ±  7%     -15.9%    3326594        numa-vmstat.node0.numa_local
   2639425 ±  6%     +34.6%    3552253 ±  4%  numa-vmstat.node1.numa_hit
   2603506 ±  6%     +34.8%    3508559 ±  4%  numa-vmstat.node1.numa_local
      1.11 ± 20%     -38.9%       0.68 ± 31%  sched_debug.cfs_rq:/.h_nr_queued.stddev
      1.11 ± 19%     -38.6%       0.68 ± 31%  sched_debug.cfs_rq:/.h_nr_runnable.stddev
      5890 ±  6%     -10.7%       5262        sched_debug.cfs_rq:/.runnable_avg.max
      1064 ± 20%     -41.1%     626.67 ± 33%  sched_debug.cfs_rq:/.runnable_avg.stddev
      1151           -12.2%       1010        sched_debug.cpu.clock_task.stddev
      1.11 ± 20%     -39.1%       0.68 ± 32%  sched_debug.cpu.nr_running.stddev
 1.861e+10            +4.5%  1.945e+10        perf-stat.i.branch-instructions
 1.264e+08            +4.1%  1.316e+08        perf-stat.i.branch-misses
  1.45e+08            +5.3%  1.526e+08        perf-stat.i.cache-references
      2.28            -4.3%       2.18        perf-stat.i.cpi
 8.533e+10            +4.5%   8.92e+10        perf-stat.i.instructions
      0.44            +4.5%       0.46        perf-stat.i.ipc
     63.03            +4.5%      65.90        perf-stat.i.metric.K/sec
   4035009            +4.5%    4218051        perf-stat.i.page-faults
      2.29            -4.4%       2.19        perf-stat.overall.cpi
      0.44            +4.6%       0.46        perf-stat.overall.ipc
 1.829e+10            +4.5%  1.912e+10        perf-stat.ps.branch-instructions
 1.242e+08            +4.1%  1.293e+08        perf-stat.ps.branch-misses
 1.424e+08            +5.3%  1.499e+08        perf-stat.ps.cache-references
 8.385e+10            +4.6%  8.767e+10        perf-stat.ps.instructions
   3966080            +4.6%    4146673        perf-stat.ps.page-faults
 5.154e+12            +4.6%  5.389e+12        perf-stat.total.instructions
     36.24            -1.9       34.36 ±  2%  perf-profile.calltrace.cycles-pp.asm_exc_page_fault.stress_mprotect_mem
     38.30            -1.7       36.58 ±  2%  perf-profile.calltrace.cycles-pp.stress_mprotect_mem
     14.45 ±  2%      -1.7       12.80 ±  2%  perf-profile.calltrace.cycles-pp.get_signal.arch_do_signal_or_restart.irqentry_exit_to_user_mode.asm_exc_page_fault.stress_mprotect_mem
     17.12            -1.5       15.58 ±  2%  perf-profile.calltrace.cycles-pp.irqentry_exit_to_user_mode.asm_exc_page_fault.stress_mprotect_mem
     17.06            -1.5       15.54 ±  2%  perf-profile.calltrace.cycles-pp.arch_do_signal_or_restart.irqentry_exit_to_user_mode.asm_exc_page_fault.stress_mprotect_mem
     12.44 ±  2%      -1.5       10.92 ±  2%  perf-profile.calltrace.cycles-pp.do_dec_rlimit_put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.irqentry_exit_to_user_mode
     12.46 ±  2%      -1.5       10.94 ±  2%  perf-profile.calltrace.cycles-pp.__sigqueue_free.get_signal.arch_do_signal_or_restart.irqentry_exit_to_user_mode.asm_exc_page_fault
      0.54 ±  2%      -0.1        0.43 ± 44%  perf-profile.calltrace.cycles-pp.up_read.__bad_area.bad_area_access_error.exc_page_fault.asm_exc_page_fault
      0.84            -0.1        0.75 ±  4%  perf-profile.calltrace.cycles-pp.down_write.__split_vma.vma_modify.vma_modify_flags.mprotect_fixup
      1.60            -0.1        1.51 ±  2%  perf-profile.calltrace.cycles-pp.asm_exc_page_fault.stress_sig_handler
      1.59            -0.1        1.51 ±  2%  perf-profile.calltrace.cycles-pp.irqentry_exit_to_user_mode.asm_exc_page_fault.stress_sig_handler
      0.82 ±  3%      -0.1        0.74 ±  2%  perf-profile.calltrace.cycles-pp.sigprocmask.__x64_sys_rt_sigprocmask.do_syscall_64.entry_SYSCALL_64_after_hwframe.pthread_sigmask
      1.44            -0.1        1.37 ±  2%  perf-profile.calltrace.cycles-pp.arch_do_signal_or_restart.irqentry_exit_to_user_mode.asm_exc_page_fault.stress_sig_handler
      1.03 ±  2%      -0.1        0.98        perf-profile.calltrace.cycles-pp.__x64_sys_rt_sigprocmask.do_syscall_64.entry_SYSCALL_64_after_hwframe.pthread_sigmask
      1.29 ±  2%      -0.1        1.23        perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.pthread_sigmask
      0.68 ±  3%      -0.0        0.64 ±  2%  perf-profile.calltrace.cycles-pp.up_write.vma_complete.__split_vma.vma_modify.vma_modify_flags
      0.58 ±  2%      -0.0        0.54 ±  3%  perf-profile.calltrace.cycles-pp.__bad_area.bad_area_access_error.exc_page_fault.asm_exc_page_fault.stress_mprotect_mem
      0.58 ±  2%      -0.0        0.56        perf-profile.calltrace.cycles-pp.fpu__clear_user_states.handle_signal.arch_do_signal_or_restart.irqentry_exit_to_user_mode.asm_exc_page_fault
      0.62 ±  3%      +0.1        0.67 ±  2%  perf-profile.calltrace.cycles-pp.mas_prev_slot.do_mprotect_pkey.__x64_sys_mprotect.do_syscall_64.entry_SYSCALL_64_after_hwframe
      1.01            +0.1        1.07        perf-profile.calltrace.cycles-pp.copy_fpstate_to_sigframe.get_sigframe.x64_setup_rt_frame.handle_signal.arch_do_signal_or_restart
      1.23            +0.1        1.30 ±  2%  perf-profile.calltrace.cycles-pp.do_user_addr_fault.exc_page_fault.asm_exc_page_fault.stress_mprotect_mem
      0.84 ±  3%      +0.1        0.91 ±  2%  perf-profile.calltrace.cycles-pp.vma_interval_tree_insert.vma_complete.commit_merge.vma_merge_existing_range.vma_modify
      0.84 ±  2%      +0.1        0.91        perf-profile.calltrace.cycles-pp.mas_preallocate.__split_vma.vma_modify.vma_modify_flags.mprotect_fixup
      1.75 ±  2%      +0.1        1.83        perf-profile.calltrace.cycles-pp.entry_SYSCALL_64.__mprotect
      0.59 ±  2%      +0.1        0.67 ±  2%  perf-profile.calltrace.cycles-pp.simple_dname.perf_event_mmap_event.perf_event_mmap.mprotect_fixup.do_mprotect_pkey
      2.41 ±  2%      +0.1        2.50        perf-profile.calltrace.cycles-pp.clear_bhb_loop.__mprotect
      1.77            +0.1        1.88        perf-profile.calltrace.cycles-pp.get_sigframe.x64_setup_rt_frame.handle_signal.arch_do_signal_or_restart.irqentry_exit_to_user_mode
      2.02            +0.1        2.14        perf-profile.calltrace.cycles-pp.x64_setup_rt_frame.handle_signal.arch_do_signal_or_restart.irqentry_exit_to_user_mode.asm_exc_page_fault
      0.98 ± 18%      +0.1        1.10        perf-profile.calltrace.cycles-pp.change_protection_range.mprotect_fixup.do_mprotect_pkey.__x64_sys_mprotect.do_syscall_64
      2.57            +0.1        2.70        perf-profile.calltrace.cycles-pp.handle_signal.arch_do_signal_or_restart.irqentry_exit_to_user_mode.asm_exc_page_fault.stress_mprotect_mem
      3.13 ±  3%      +0.2        3.34 ±  2%  perf-profile.calltrace.cycles-pp.asm_exc_page_fault.__mprotect
      0.00            +0.6        0.55 ±  2%  perf-profile.calltrace.cycles-pp.prepend_copy.simple_dname.perf_event_mmap_event.perf_event_mmap.mprotect_fixup
     34.00            +1.1       35.12 ±  2%  perf-profile.calltrace.cycles-pp.mprotect_fixup.do_mprotect_pkey.__x64_sys_mprotect.do_syscall_64.entry_SYSCALL_64_after_hwframe
     46.05            +1.1       47.19        perf-profile.calltrace.cycles-pp.do_mprotect_pkey.__x64_sys_mprotect.do_syscall_64.entry_SYSCALL_64_after_hwframe.__mprotect
     46.28            +1.2       47.43        perf-profile.calltrace.cycles-pp.__x64_sys_mprotect.do_syscall_64.entry_SYSCALL_64_after_hwframe.__mprotect
     48.43            +1.2       49.61        perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.__mprotect
     48.86            +1.2       50.06        perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.__mprotect
     55.84            +1.6       57.41        perf-profile.calltrace.cycles-pp.__mprotect
     39.48            -1.9       37.62 ±  2%  perf-profile.children.cycles-pp.asm_exc_page_fault
     14.48 ±  2%      -1.6       12.83 ±  2%  perf-profile.children.cycles-pp.get_signal
     18.72            -1.6       17.11        perf-profile.children.cycles-pp.irqentry_exit_to_user_mode
     39.92            -1.6       38.32 ±  2%  perf-profile.children.cycles-pp.stress_mprotect_mem
     18.52            -1.6       16.92        perf-profile.children.cycles-pp.arch_do_signal_or_restart
     12.47 ±  2%      -1.5       10.94 ±  2%  perf-profile.children.cycles-pp.__sigqueue_free
     12.44 ±  2%      -1.5       10.92 ±  2%  perf-profile.children.cycles-pp.do_dec_rlimit_put_ucounts
      5.00            -0.2        4.83 ±  2%  perf-profile.children.cycles-pp.up_write
      0.47 ± 10%      -0.1        0.34 ±  7%  perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt
      0.47 ± 10%      -0.1        0.34 ±  7%  perf-profile.children.cycles-pp.hrtimer_interrupt
      1.16 ±  3%      -0.1        1.05        perf-profile.children.cycles-pp.recalc_sigpending
      0.35 ±  7%      -0.1        0.24 ±  6%  perf-profile.children.cycles-pp.__hrtimer_run_queues
      0.89 ±  6%      -0.1        0.79 ±  5%  perf-profile.children.cycles-pp._raw_spin_lock_irq
      0.34 ±  8%      -0.1        0.24 ±  6%  perf-profile.children.cycles-pp.tick_nohz_handler
      0.86 ±  2%      -0.1        0.78        perf-profile.children.cycles-pp.sigprocmask
      0.28 ± 10%      -0.1        0.21 ±  6%  perf-profile.children.cycles-pp.update_process_times
      1.05 ±  2%      -0.1        0.98        perf-profile.children.cycles-pp.__x64_sys_rt_sigprocmask
      0.30 ±  3%      -0.0        0.26 ±  3%  perf-profile.children.cycles-pp.fpregs_mark_activate
      0.17 ± 10%      -0.0        0.13 ±  6%  perf-profile.children.cycles-pp.sched_tick
      0.47 ±  3%      -0.0        0.43 ±  3%  perf-profile.children.cycles-pp.complete_signal
      0.54 ±  2%      -0.0        0.51 ±  2%  perf-profile.children.cycles-pp.up_read
      0.58 ±  2%      -0.0        0.55 ±  2%  perf-profile.children.cycles-pp.__bad_area
      0.61            -0.0        0.58        perf-profile.children.cycles-pp.fpu__clear_user_states
      0.12 ±  5%      +0.0        0.14 ±  4%  perf-profile.children.cycles-pp.__get_user_nocheck_4
      0.13 ±  3%      +0.0        0.14 ±  3%  perf-profile.children.cycles-pp.ima_file_mprotect
      0.22 ±  5%      +0.0        0.24 ±  2%  perf-profile.children.cycles-pp.security_file_mprotect
      0.25 ±  3%      +0.0        0.28 ±  4%  perf-profile.children.cycles-pp.stress_mwc16
      0.18 ±  5%      +0.0        0.20 ±  6%  perf-profile.children.cycles-pp.stress_mwc16modn
      0.34 ±  3%      +0.0        0.37 ±  3%  perf-profile.children.cycles-pp.mas_ascend
      0.12 ±  4%      +0.0        0.15 ±  5%  perf-profile.children.cycles-pp.copy_from_kernel_nofault_allowed
      0.30 ±  8%      +0.0        0.33 ±  2%  perf-profile.children.cycles-pp.rcu_all_qs
      0.26 ±  4%      +0.0        0.29 ±  6%  perf-profile.children.cycles-pp.mas_pop_node
      0.44 ±  2%      +0.0        0.47        perf-profile.children.cycles-pp.vma_set_page_prot
      0.49 ±  3%      +0.0        0.53 ±  3%  perf-profile.children.cycles-pp.save_xstate_epilog
      0.66 ±  2%      +0.0        0.71 ±  2%  perf-profile.children.cycles-pp.native_irq_return_iret
      0.02 ± 99%      +0.1        0.08 ± 11%  perf-profile.children.cycles-pp.anon_vma_clone
      1.27            +0.1        1.33        perf-profile.children.cycles-pp.do_user_addr_fault
      0.84            +0.1        0.90        perf-profile.children.cycles-pp.mas_prev_slot
      1.04            +0.1        1.11        perf-profile.children.cycles-pp.copy_fpstate_to_sigframe
      0.73 ±  7%      +0.1        0.79 ±  2%  perf-profile.children.cycles-pp.__cond_resched
      0.46 ±  3%      +0.1        0.53 ±  2%  perf-profile.children.cycles-pp.copy_from_kernel_nofault
      1.30 ±  2%      +0.1        1.37        perf-profile.children.cycles-pp.entry_SYSCALL_64
      0.50 ±  2%      +0.1        0.58 ±  2%  perf-profile.children.cycles-pp.prepend_copy
      1.68            +0.1        1.75        perf-profile.children.cycles-pp.mas_preallocate
      0.61 ±  3%      +0.1        0.70 ±  3%  perf-profile.children.cycles-pp.simple_dname
      2.77 ±  2%      +0.1        2.87        perf-profile.children.cycles-pp.clear_bhb_loop
      3.27            +0.1        3.37        perf-profile.children.cycles-pp.handle_signal
      1.78            +0.1        1.89        perf-profile.children.cycles-pp.get_sigframe
      2.05            +0.1        2.16        perf-profile.children.cycles-pp.x64_setup_rt_frame
      0.99 ± 18%      +0.1        1.11        perf-profile.children.cycles-pp.change_protection_range
      7.00            +0.2        7.24 ±  2%  perf-profile.children.cycles-pp.vma_prepare
     34.09            +1.1       35.22 ±  2%  perf-profile.children.cycles-pp.mprotect_fixup
     50.17            +1.1       51.31        perf-profile.children.cycles-pp.do_syscall_64
     46.24            +1.2       47.39        perf-profile.children.cycles-pp.do_mprotect_pkey
     46.33            +1.2       47.49        perf-profile.children.cycles-pp.__x64_sys_mprotect
     50.61            +1.2       51.78        perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
     55.94            +1.6       57.52        perf-profile.children.cycles-pp.__mprotect
     12.44 ±  2%      -1.5       10.91 ±  2%  perf-profile.self.cycles-pp.do_dec_rlimit_put_ucounts
      4.36            -0.1        4.22 ±  2%  perf-profile.self.cycles-pp.up_write
      1.14 ±  3%      -0.1        1.03        perf-profile.self.cycles-pp.recalc_sigpending
      0.87 ±  6%      -0.1        0.78 ±  5%  perf-profile.self.cycles-pp._raw_spin_lock_irq
      2.83            -0.1        2.75        perf-profile.self.cycles-pp.down_write
      0.28 ±  5%      -0.0        0.23 ±  5%  perf-profile.self.cycles-pp.fpregs_mark_activate
      0.19 ± 10%      -0.0        0.14 ± 12%  perf-profile.self.cycles-pp.__perf_event_header__init_id
      0.40 ±  3%      -0.0        0.36 ±  5%  perf-profile.self.cycles-pp.complete_signal
      0.52 ±  2%      -0.0        0.48 ±  2%  perf-profile.self.cycles-pp.up_read
      0.15 ±  2%      -0.0        0.14 ±  3%  perf-profile.self.cycles-pp.__send_signal_locked
      0.10 ±  4%      -0.0        0.09 ±  4%  perf-profile.self.cycles-pp.__bad_area_nosemaphore
      0.30 ±  3%      +0.0        0.33 ±  4%  perf-profile.self.cycles-pp.mas_ascend
      0.10 ±  5%      +0.0        0.12 ±  5%  perf-profile.self.cycles-pp.do_user_addr_fault
      0.10 ±  4%      +0.0        0.12 ±  3%  perf-profile.self.cycles-pp.copy_from_kernel_nofault_allowed
      0.21 ±  6%      +0.0        0.24 ±  4%  perf-profile.self.cycles-pp.rwsem_down_write_slowpath
      0.40            +0.0        0.43 ±  2%  perf-profile.self.cycles-pp.change_protection_range
      0.44            +0.0        0.47        perf-profile.self.cycles-pp.entry_SYSCALL_64_after_hwframe
      0.24 ±  3%      +0.0        0.27 ±  6%  perf-profile.self.cycles-pp.mas_pop_node
      0.34 ±  2%      +0.0        0.38 ±  3%  perf-profile.self.cycles-pp.mas_preallocate
      0.37 ±  8%      +0.0        0.41 ±  3%  perf-profile.self.cycles-pp.__cond_resched
      0.72            +0.0        0.76 ±  2%  perf-profile.self.cycles-pp.copy_fpstate_to_sigframe
      0.41            +0.0        0.45 ±  3%  perf-profile.self.cycles-pp.mas_prev_slot
      0.66 ±  2%      +0.0        0.71 ±  2%  perf-profile.self.cycles-pp.native_irq_return_iret
      0.30 ±  4%      +0.0        0.35 ±  2%  perf-profile.self.cycles-pp.copy_from_kernel_nofault
      0.02 ±141%      +0.1        0.08 ± 11%  perf-profile.self.cycles-pp.anon_vma_clone
      1.21 ±  2%      +0.1        1.30 ±  2%  perf-profile.self.cycles-pp.__mprotect
      2.73 ±  2%      +0.1        2.83        perf-profile.self.cycles-pp.clear_bhb_loop
      2.76            +0.1        2.88        perf-profile.self.cycles-pp.do_mprotect_pkey
      3.48 ±  3%      +0.3        3.74 ±  2%  perf-profile.self.cycles-pp.stress_mprotect_mem



***************************************************************************************************
lkp-icl-2sp8: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  gcc-12/performance/x86_64-rhel-9.4/100%/debian-12-x86_64-20240206.cgz/lkp-icl-2sp8/sigrt/stress-ng/60s

commit: 
  3344260945 ("Merge tag 'for-v6.14-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply")
  ee2a5c3e36 ("Revert "pid: allow pid_max to be set per pid namespace"")

334426094588f817 ee2a5c3e36093d0ff5709bc8f21 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      1345 ±  9%     -15.8%       1132 ±  5%  perf-c2c.HITM.remote
   5328778           +18.0%    6289475        vmstat.system.cs
    197362            +2.0%     201296        vmstat.system.in
     45.97 ±118%     -85.4%       6.71 ± 55%  perf-sched.sch_delay.max.ms.__cond_resched.__alloc_frozen_pages_noprof.alloc_pages_mpol.folio_alloc_mpol_noprof.shmem_alloc_folio
    582.79 ± 39%     -39.2%     354.28 ± 31%  perf-sched.sch_delay.max.ms.schedule_hrtimeout_range.do_sigtimedwait.isra.0.__x64_sys_rt_sigtimedwait
      1260 ± 46%     -43.7%     709.74 ± 31%  perf-sched.wait_and_delay.max.ms.schedule_hrtimeout_range.do_sigtimedwait.isra.0.__x64_sys_rt_sigtimedwait
     45.97 ±118%     -85.4%       6.71 ± 55%  perf-sched.wait_time.max.ms.__cond_resched.__alloc_frozen_pages_noprof.alloc_pages_mpol.folio_alloc_mpol_noprof.shmem_alloc_folio
    705.59 ± 50%     -48.9%     360.90 ± 32%  perf-sched.wait_time.max.ms.schedule_hrtimeout_range.do_sigtimedwait.isra.0.__x64_sys_rt_sigtimedwait
     83250           -16.0%      69935        stress-ng.sigrt.nanosecs_between_sigqueue_and_sigwaitinfo_completion
 3.362e+08           +15.7%   3.89e+08        stress-ng.sigrt.ops
   5601334           +15.7%    6480915        stress-ng.sigrt.ops_per_sec
  65582158           +17.7%   77176472        stress-ng.time.involuntary_context_switches
      3423            -1.4%       3375        stress-ng.time.system_time
    335.13 ±  2%     +14.5%     383.80 ±  2%  stress-ng.time.user_time
 2.714e+08           +17.4%  3.185e+08        stress-ng.time.voluntary_context_switches
   4202907 ± 15%     -24.2%    3184715 ± 12%  sched_debug.cfs_rq:/.avg_vruntime.max
     82.07 ± 12%    +391.9%     403.68 ± 94%  sched_debug.cfs_rq:/.load_avg.avg
    169.48 ±  8%   +1182.4%       2173 ±115%  sched_debug.cfs_rq:/.load_avg.stddev
   4202907 ± 15%     -24.2%    3184715 ± 12%  sched_debug.cfs_rq:/.min_vruntime.max
      1239 ±  8%     +14.2%       1415 ± 12%  sched_debug.cfs_rq:/.util_avg.max
   2593172           +17.4%    3044316        sched_debug.cpu.nr_switches.avg
   1526897 ±  3%     +66.4%    2540867 ±  2%  sched_debug.cpu.nr_switches.min
    606805           -67.2%     198918 ±  9%  sched_debug.cpu.nr_switches.stddev
 1.902e+10           +14.8%  2.184e+10        perf-stat.i.branch-instructions
  1.42e+08 ±  3%     +16.2%   1.65e+08        perf-stat.i.branch-misses
      6.65 ±  4%      -0.9        5.77 ±  7%  perf-stat.i.cache-miss-rate%
 3.931e+08 ±  9%     +17.1%  4.605e+08 ±  6%  perf-stat.i.cache-references
   5534190           +17.4%    6498045        perf-stat.i.context-switches
      2.71           -14.3%       2.33        perf-stat.i.cpi
 8.694e+10           +14.8%  9.976e+10        perf-stat.i.instructions
      0.39           +14.2%       0.45        perf-stat.i.ipc
     86.53           +17.4%     101.60        perf-stat.i.metric.K/sec
      6.82 ±  5%      -0.9        5.91 ±  9%  perf-stat.overall.cache-miss-rate%
      2.59           -12.9%       2.26        perf-stat.overall.cpi
      0.39           +14.7%       0.44        perf-stat.overall.ipc
 1.871e+10           +14.8%  2.149e+10        perf-stat.ps.branch-instructions
 1.396e+08 ±  3%     +16.2%  1.622e+08        perf-stat.ps.branch-misses
 3.868e+08 ±  9%     +17.1%   4.53e+08 ±  6%  perf-stat.ps.cache-references
   5443676           +17.4%    6391319        perf-stat.ps.context-switches
 8.552e+10           +14.8%  9.813e+10        perf-stat.ps.instructions
 5.251e+12           +14.3%      6e+12        perf-stat.total.instructions



***************************************************************************************************
lkp-icl-2sp8: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  gcc-12/performance/x86_64-rhel-9.4/100%/debian-12-x86_64-20240206.cgz/lkp-icl-2sp8/sigbus/stress-ng/60s

commit: 
  3344260945 ("Merge tag 'for-v6.14-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply")
  ee2a5c3e36 ("Revert "pid: allow pid_max to be set per pid namespace"")

334426094588f817 ee2a5c3e36093d0ff5709bc8f21 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      7.64            +1.7        9.30        mpstat.cpu.all.usr%
     36.50 ± 16%     -42.9%      20.83 ± 31%  perf-c2c.DRAM.local
      2312 ±  6%     -68.7%     723.17 ±  4%  perf-c2c.DRAM.remote
      3690 ±  3%     +44.9%       5347 ±  6%  perf-c2c.HITM.local
      2155 ±  6%     -71.8%     608.17 ±  4%  perf-c2c.HITM.remote
      4477 ± 69%     -70.3%       1328 ± 35%  proc-vmstat.numa_hint_faults
      2459 ± 11%     -64.8%     866.33 ± 47%  proc-vmstat.numa_hint_faults_local
    140611 ± 21%     -33.6%      93302 ± 45%  proc-vmstat.numa_pte_updates
 7.197e+08           +20.7%  8.685e+08        proc-vmstat.pgfault
 7.201e+08           +20.6%  8.682e+08        stress-ng.sigbus.ops
  12001759           +20.6%   14469786        stress-ng.sigbus.ops_per_sec
      3526            -1.8%       3461        stress-ng.time.system_time
    261.31           +25.4%     327.64        stress-ng.time.user_time
      0.03 ± 55%     -64.6%       0.01 ± 17%  perf-sched.sch_delay.avg.ms.do_wait.kernel_wait4.do_syscall_64.entry_SYSCALL_64_after_hwframe
      0.86 ±150%     -90.1%       0.09 ±201%  perf-sched.sch_delay.avg.ms.schedule_hrtimeout_range.ep_poll.do_epoll_wait.__x64_sys_epoll_wait
      0.02 ± 50%     -58.7%       0.01 ± 14%  perf-sched.sch_delay.avg.ms.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      1.08 ± 18%     -34.1%       0.71 ± 14%  perf-sched.sch_delay.avg.ms.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.[unknown]
      0.31 ± 72%     -65.9%       0.11 ± 71%  perf-sched.sch_delay.avg.ms.worker_thread.kthread.ret_from_fork.ret_from_fork_asm
      0.02 ± 10%     -23.4%       0.01 ± 15%  perf-sched.sch_delay.max.ms.rcu_gp_kthread.kthread.ret_from_fork.ret_from_fork_asm
      1.91 ±218%     -99.2%       0.02 ± 11%  perf-sched.sch_delay.max.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
      4.00 ± 49%     -71.6%       1.14 ± 56%  perf-sched.sch_delay.max.ms.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
    261.25 ± 37%    +199.1%     781.43 ± 15%  perf-sched.wait_and_delay.avg.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
     81.02 ± 59%    +274.1%     303.13 ± 50%  perf-sched.wait_and_delay.avg.ms.schedule_hrtimeout_range.ep_poll.do_epoll_wait.__x64_sys_epoll_wait
      6.60 ±  2%     +16.9%       7.71 ±  3%  perf-sched.wait_and_delay.avg.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
    108.83 ± 63%     -81.2%      20.50 ±113%  perf-sched.wait_and_delay.count.devkmsg_read.vfs_read.ksys_read.do_syscall_64
      3107 ±  3%     -12.6%       2714 ±  5%  perf-sched.wait_and_delay.count.irqentry_exit_to_user_mode.asm_exc_page_fault.[unknown]
    124.17 ± 63%     -70.1%      37.17 ± 60%  perf-sched.wait_and_delay.count.schedule_hrtimeout_range.ep_poll.do_epoll_wait.__x64_sys_epoll_wait
    751.00 ±  2%     -17.0%     623.50 ±  2%  perf-sched.wait_and_delay.count.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
      1550 ± 31%    +119.7%       3406 ± 19%  perf-sched.wait_and_delay.max.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
    261.24 ± 37%    +199.1%     781.42 ± 15%  perf-sched.wait_time.avg.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
     80.16 ± 60%    +278.0%     303.05 ± 50%  perf-sched.wait_time.avg.ms.schedule_hrtimeout_range.ep_poll.do_epoll_wait.__x64_sys_epoll_wait
      6.59 ±  2%     +17.0%       7.71 ±  3%  perf-sched.wait_time.avg.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
      1550 ± 31%    +119.7%       3406 ± 19%  perf-sched.wait_time.max.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      0.18           -49.0%       0.09 ±  3%  perf-stat.i.MPKI
  1.59e+10           +19.7%  1.903e+10        perf-stat.i.branch-instructions
      0.28            -0.0        0.25        perf-stat.i.branch-miss-rate%
  40989724            +5.3%   43173098 ±  2%  perf-stat.i.branch-misses
     32.63           -15.8       16.81 ±  2%  perf-stat.i.cache-miss-rate%
  12733301 ±  2%     -40.3%    7597041 ±  3%  perf-stat.i.cache-misses
  38933806           +14.5%   44591128        perf-stat.i.cache-references
      3.17           -16.4%       2.65        perf-stat.i.cpi
     18224           +75.2%      31921        perf-stat.i.cycles-between-cache-misses
 7.098e+10           +19.6%  8.489e+10        perf-stat.i.instructions
      0.32           +19.0%       0.38        perf-stat.i.ipc
    184.67           +20.6%     222.65        perf-stat.i.metric.K/sec
  11819123           +20.6%   14249011        perf-stat.i.page-faults
      0.18           -50.1%       0.09 ±  3%  perf-stat.overall.MPKI
      0.26            -0.0        0.23        perf-stat.overall.branch-miss-rate%
     32.70           -15.7       17.04 ±  3%  perf-stat.overall.cache-miss-rate%
      3.19           -16.4%       2.66        perf-stat.overall.cpi
     17772 ±  2%     +67.6%      29795 ±  2%  perf-stat.overall.cycles-between-cache-misses
      0.31           +19.6%       0.38        perf-stat.overall.ipc
 1.564e+10           +19.7%  1.871e+10        perf-stat.ps.branch-instructions
  40314687            +5.4%   42478375 ±  2%  perf-stat.ps.branch-misses
  12525837 ±  2%     -40.3%    7473864 ±  3%  perf-stat.ps.cache-misses
  38300912           +14.5%   43866104        perf-stat.ps.cache-references
 6.982e+10           +19.6%   8.35e+10        perf-stat.ps.instructions
  11626044           +20.6%   14016280        perf-stat.ps.page-faults
 4.284e+12           +19.5%  5.117e+12        perf-stat.total.instructions





Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH 1/2] Revert "pid: allow pid_max to be set per pid namespace"
Posted by Alexander Mikhalitsyn 9 months, 3 weeks ago
Am Fr., 21. Feb. 2025 um 18:02 Uhr schrieb Michal Koutný <mkoutny@suse.com>:
>
> This reverts commit 7863dcc72d0f4b13a641065670426435448b3d80.

If we revert this one, then we should also revert the corresponding kselftest:
https://github.com/torvalds/linux/commit/615ab43b838bb982dc234feff75ee9ad35447c5d

>
> It is already difficult for users to troubleshoot which of multiple pid
> limits restricts their workload. I'm afraid making pid_max
> per-(hierarchical-)NS will contribute to confusion.
> Also, the implementation copies the limit upon creation from
> parent, this pattern showed cumbersome with some attributes in legacy
> cgroup controllers -- it's subject to race condition between parent's
> limit modification and children creation and once copied it must be
> changed in the descendant.
>
> This is very similar to what pids.max of a cgroup (already) does that
> can be used as an alternative.
>
> Link: https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/
> Signed-off-by: Michal Koutný <mkoutny@suse.com>
> ---
>  include/linux/pid.h               |   3 +
>  include/linux/pid_namespace.h     |  10 +--
>  kernel/pid.c                      | 125 ++----------------------------
>  kernel/pid_namespace.c            |  43 +++-------
>  kernel/sysctl.c                   |   9 +++
>  kernel/trace/pid_list.c           |   2 +-
>  kernel/trace/trace.h              |   2 +
>  kernel/trace/trace_sched_switch.c |   2 +-
>  8 files changed, 35 insertions(+), 161 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 98837a1ff0f33..fe575fcdb4afa 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -108,6 +108,9 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
>  extern void transfer_pid(struct task_struct *old, struct task_struct *new,
>                          enum pid_type);
>
> +extern int pid_max;
> +extern int pid_max_min, pid_max_max;
> +
>  /*
>   * look up a PID in the hash table. Must be called with the tasklist_lock
>   * or rcu_read_lock() held.
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 7c67a58111998..f9f9931e02d6a 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -30,7 +30,6 @@ struct pid_namespace {
>         struct task_struct *child_reaper;
>         struct kmem_cache *pid_cachep;
>         unsigned int level;
> -       int pid_max;
>         struct pid_namespace *parent;
>  #ifdef CONFIG_BSD_PROCESS_ACCT
>         struct fs_pin *bacct;
> @@ -39,14 +38,9 @@ struct pid_namespace {
>         struct ucounts *ucounts;
>         int reboot;     /* group exit code if this pidns was rebooted */
>         struct ns_common ns;
> -       struct work_struct      work;
> -#ifdef CONFIG_SYSCTL
> -       struct ctl_table_set    set;
> -       struct ctl_table_header *sysctls;
> -#if defined(CONFIG_MEMFD_CREATE)
> +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>         int memfd_noexec_scope;
>  #endif
> -#endif
>  } __randomize_layout;
>
>  extern struct pid_namespace init_pid_ns;
> @@ -123,8 +117,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>  extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>  void pidhash_init(void);
>  void pid_idr_init(void);
> -int register_pidns_sysctls(struct pid_namespace *pidns);
> -void unregister_pidns_sysctls(struct pid_namespace *pidns);
>
>  static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
>  {
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 924084713be8b..aa2a7d4da4555 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -61,8 +61,10 @@ struct pid init_struct_pid = {
>         }, }
>  };
>
> -static int pid_max_min = RESERVED_PIDS + 1;
> -static int pid_max_max = PID_MAX_LIMIT;
> +int pid_max = PID_MAX_DEFAULT;
> +
> +int pid_max_min = RESERVED_PIDS + 1;
> +int pid_max_max = PID_MAX_LIMIT;
>
>  /*
>   * PID-map pages start out as NULL, they get allocated upon
> @@ -81,7 +83,6 @@ struct pid_namespace init_pid_ns = {
>  #ifdef CONFIG_PID_NS
>         .ns.ops = &pidns_operations,
>  #endif
> -       .pid_max = PID_MAX_DEFAULT,
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>         .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
>  #endif
> @@ -190,7 +191,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>
>         for (i = ns->level; i >= 0; i--) {
>                 int tid = 0;
> -               int pid_max = READ_ONCE(tmp->pid_max);
>
>                 if (set_tid_size) {
>                         tid = set_tid[ns->level - i];
> @@ -644,118 +644,17 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
>         return fd;
>  }
>
> -#ifdef CONFIG_SYSCTL
> -static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
> -{
> -       return &task_active_pid_ns(current)->set;
> -}
> -
> -static int set_is_seen(struct ctl_table_set *set)
> -{
> -       return &task_active_pid_ns(current)->set == set;
> -}
> -
> -static int pid_table_root_permissions(struct ctl_table_header *head,
> -                                     const struct ctl_table *table)
> -{
> -       struct pid_namespace *pidns =
> -               container_of(head->set, struct pid_namespace, set);
> -       int mode = table->mode;
> -
> -       if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
> -           uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
> -               mode = (mode & S_IRWXU) >> 6;
> -       else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
> -               mode = (mode & S_IRWXG) >> 3;
> -       else
> -               mode = mode & S_IROTH;
> -       return (mode << 6) | (mode << 3) | mode;
> -}
> -
> -static void pid_table_root_set_ownership(struct ctl_table_header *head,
> -                                        kuid_t *uid, kgid_t *gid)
> -{
> -       struct pid_namespace *pidns =
> -               container_of(head->set, struct pid_namespace, set);
> -       kuid_t ns_root_uid;
> -       kgid_t ns_root_gid;
> -
> -       ns_root_uid = make_kuid(pidns->user_ns, 0);
> -       if (uid_valid(ns_root_uid))
> -               *uid = ns_root_uid;
> -
> -       ns_root_gid = make_kgid(pidns->user_ns, 0);
> -       if (gid_valid(ns_root_gid))
> -               *gid = ns_root_gid;
> -}
> -
> -static struct ctl_table_root pid_table_root = {
> -       .lookup         = pid_table_root_lookup,
> -       .permissions    = pid_table_root_permissions,
> -       .set_ownership  = pid_table_root_set_ownership,
> -};
> -
> -static const struct ctl_table pid_table[] = {
> -       {
> -               .procname       = "pid_max",
> -               .data           = &init_pid_ns.pid_max,
> -               .maxlen         = sizeof(int),
> -               .mode           = 0644,
> -               .proc_handler   = proc_dointvec_minmax,
> -               .extra1         = &pid_max_min,
> -               .extra2         = &pid_max_max,
> -       },
> -};
> -#endif
> -
> -int register_pidns_sysctls(struct pid_namespace *pidns)
> -{
> -#ifdef CONFIG_SYSCTL
> -       struct ctl_table *tbl;
> -
> -       setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
> -
> -       tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
> -       if (!tbl)
> -               return -ENOMEM;
> -       tbl->data = &pidns->pid_max;
> -       pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
> -                            PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
> -
> -       pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
> -                                                ARRAY_SIZE(pid_table));
> -       if (!pidns->sysctls) {
> -               kfree(tbl);
> -               retire_sysctl_set(&pidns->set);
> -               return -ENOMEM;
> -       }
> -#endif
> -       return 0;
> -}
> -
> -void unregister_pidns_sysctls(struct pid_namespace *pidns)
> -{
> -#ifdef CONFIG_SYSCTL
> -       const struct ctl_table *tbl;
> -
> -       tbl = pidns->sysctls->ctl_table_arg;
> -       unregister_sysctl_table(pidns->sysctls);
> -       retire_sysctl_set(&pidns->set);
> -       kfree(tbl);
> -#endif
> -}
> -
>  void __init pid_idr_init(void)
>  {
>         /* Verify no one has done anything silly: */
>         BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
>
>         /* bump default and minimum pid_max based on number of cpus */
> -       init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
> -                                 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
> +       pid_max = min(pid_max_max, max_t(int, pid_max,
> +                               PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
>         pid_max_min = max_t(int, pid_max_min,
>                                 PIDS_PER_CPU_MIN * num_possible_cpus());
> -       pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
> +       pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
>
>         idr_init(&init_pid_ns.idr);
>
> @@ -766,16 +665,6 @@ void __init pid_idr_init(void)
>                         NULL);
>  }
>
> -static __init int pid_namespace_sysctl_init(void)
> -{
> -#ifdef CONFIG_SYSCTL
> -       /* "kernel" directory will have already been initialized. */
> -       BUG_ON(register_pidns_sysctls(&init_pid_ns));
> -#endif
> -       return 0;
> -}
> -subsys_initcall(pid_namespace_sysctl_init);
> -
>  static struct file *__pidfd_fget(struct task_struct *task, int fd)
>  {
>         struct file *file;
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 8f6cfec87555a..0f23285be4f92 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -70,8 +70,6 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
>         dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
>  }
>
> -static void destroy_pid_namespace_work(struct work_struct *work);
> -
>  static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
>         struct pid_namespace *parent_pid_ns)
>  {
> @@ -107,27 +105,17 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>                 goto out_free_idr;
>         ns->ns.ops = &pidns_operations;
>
> -       ns->pid_max = parent_pid_ns->pid_max;
> -       err = register_pidns_sysctls(ns);
> -       if (err)
> -               goto out_free_inum;
> -
>         refcount_set(&ns->ns.count, 1);
>         ns->level = level;
>         ns->parent = get_pid_ns(parent_pid_ns);
>         ns->user_ns = get_user_ns(user_ns);
>         ns->ucounts = ucounts;
>         ns->pid_allocated = PIDNS_ADDING;
> -       INIT_WORK(&ns->work, destroy_pid_namespace_work);
> -
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>         ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
>  #endif
> -
>         return ns;
>
> -out_free_inum:
> -       ns_free_inum(&ns->ns);
>  out_free_idr:
>         idr_destroy(&ns->idr);
>         kmem_cache_free(pid_ns_cachep, ns);
> @@ -149,28 +137,12 @@ static void delayed_free_pidns(struct rcu_head *p)
>
>  static void destroy_pid_namespace(struct pid_namespace *ns)
>  {
> -       unregister_pidns_sysctls(ns);
> -
>         ns_free_inum(&ns->ns);
>
>         idr_destroy(&ns->idr);
>         call_rcu(&ns->rcu, delayed_free_pidns);
>  }
>
> -static void destroy_pid_namespace_work(struct work_struct *work)
> -{
> -       struct pid_namespace *ns =
> -               container_of(work, struct pid_namespace, work);
> -
> -       do {
> -               struct pid_namespace *parent;
> -
> -               parent = ns->parent;
> -               destroy_pid_namespace(ns);
> -               ns = parent;
> -       } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
> -}
> -
>  struct pid_namespace *copy_pid_ns(unsigned long flags,
>         struct user_namespace *user_ns, struct pid_namespace *old_ns)
>  {
> @@ -183,8 +155,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
>
>  void put_pid_ns(struct pid_namespace *ns)
>  {
> -       if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
> -               schedule_work(&ns->work);
> +       struct pid_namespace *parent;
> +
> +       while (ns != &init_pid_ns) {
> +               parent = ns->parent;
> +               if (!refcount_dec_and_test(&ns->ns.count))
> +                       break;
> +               destroy_pid_namespace(ns);
> +               ns = parent;
> +       }
>  }
>  EXPORT_SYMBOL_GPL(put_pid_ns);
>
> @@ -295,7 +274,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>         next = idr_get_cursor(&pid_ns->idr) - 1;
>
>         tmp.data = &next;
> -       tmp.extra2 = &pid_ns->pid_max;
>         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
>         if (!ret && write)
>                 idr_set_cursor(&pid_ns->idr, next + 1);
> @@ -303,6 +281,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>         return ret;
>  }
>
> +extern int pid_max;
>  static const struct ctl_table pid_ns_ctl_table[] = {
>         {
>                 .procname = "ns_last_pid",
> @@ -310,7 +289,7 @@ static const struct ctl_table pid_ns_ctl_table[] = {
>                 .mode = 0666, /* permissions are checked in the handler */
>                 .proc_handler = pid_ns_ctl_handler,
>                 .extra1 = SYSCTL_ZERO,
> -               .extra2 = &init_pid_ns.pid_max,
> +               .extra2 = &pid_max,
>         },
>  };
>  #endif /* CONFIG_CHECKPOINT_RESTORE */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index cb57da499ebb1..bb739608680f2 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1803,6 +1803,15 @@ static const struct ctl_table kern_table[] = {
>                 .proc_handler   = proc_dointvec,
>         },
>  #endif
> +       {
> +               .procname       = "pid_max",
> +               .data           = &pid_max,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec_minmax,
> +               .extra1         = &pid_max_min,
> +               .extra2         = &pid_max_max,
> +       },
>         {
>                 .procname       = "panic_on_oops",
>                 .data           = &panic_on_oops,
> diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
> index c62b9b3cfb3d8..4966e6bbdf6f3 100644
> --- a/kernel/trace/pid_list.c
> +++ b/kernel/trace/pid_list.c
> @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
>         int i;
>
>         /* According to linux/thread.h, pids can be no bigger that 30 bits */
> -       WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
> +       WARN_ON_ONCE(pid_max > (1 << 30));
>
>         pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
>         if (!pid_list)
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 9c21ba45b7af6..46c65402ad7e5 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -732,6 +732,8 @@ extern unsigned long tracing_thresh;
>
>  /* PID filtering */
>
> +extern int pid_max;
> +
>  bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
>                              pid_t search_pid);
>  bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
> diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
> index cb49f7279dc80..573b5d8e8a28e 100644
> --- a/kernel/trace/trace_sched_switch.c
> +++ b/kernel/trace/trace_sched_switch.c
> @@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
>         if (tgid_map)
>                 return 0;
>
> -       tgid_map_max = init_pid_ns.pid_max;
> +       tgid_map_max = pid_max;
>         map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
>                        GFP_KERNEL);
>         if (!map)
> --
> 2.48.1
>