include/linux/sysctl.h | 3 +++ kernel/panic.c | 6 ++--- kernel/printk/sysctl.c | 10 +------- kernel/reboot.c | 3 ++- kernel/sysctl.c | 58 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 13 deletions(-)
From f5ff7a45f093ebdf4d1db0f3dad244d2c5065943 Mon Sep 17 00:00:00 2001
From: wooridge <yurenwang152@gmail.com>
Date: Fri, 17 Apr 2026 20:59:09 +0800
Subject: [PATCH] sysctl: add CAP_SYS_ADMIN check to panic/ctrl-alt-del sysctls
Several kernel sysctls that control critical system behavior use proc_dointvec() as their handler with mode 0644, but proc_dointvec() does not perform any capability checks. In a user namespace where a process has uid 0, the VFS layer allows writes based on the file permission bits (0644), enabling unprivileged modification of critical kernel parameters.
Affected sysctls: kernel/panic, kernel/panic_on_oops, kernel/panic_on_warn, kernel/ctrl-alt-del
Fix by adding proc_dointvec_sysadmin() and proc_dointvec_minmax_sysadmin() as generic wrappers that check capable(CAP_SYS_ADMIN) on writes, then delegate to proc_dointvec()/proc_dointvec_minmax(). Also remove the existing static proc_dointvec_minmax_sysadmin() from kernel/printk/sysctl.c in favor of the new shared implementation.
Signed-off-by: wooridge <yurenwang152@gmail.com>
---
include/linux/sysctl.h | 3 +++
kernel/panic.c | 6 ++---
kernel/printk/sysctl.c | 10 +-------
kernel/reboot.c | 3 ++-
kernel/sysctl.c | 58 ++++++++++++++++++++++++++++++++++++++++++
5 files changed, 67 insertions(+), 13 deletions(-)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 2886fbceb5d6..6322822206a7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -82,8 +82,11 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *);
+int proc_dointvec_sysadmin(const struct ctl_table *, int, void *, size_t *, loff_t *);
int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer,
size_t *lenp, loff_t *ppos);
+int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos);
int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer,
size_t *lenp, loff_t *ppos,
int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr,
diff --git a/kernel/panic.c b/kernel/panic.c
index c78600212b6c..a966a4c81473 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -162,14 +162,14 @@ static const struct ctl_table kern_panic_table[] = {
.data = &panic_timeout,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_sysadmin,
},
{
.procname = "panic_on_oops",
.data = &panic_on_oops,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_sysadmin,
},
{
.procname = "panic_print",
@@ -183,7 +183,7 @@ static const struct ctl_table kern_panic_table[] = {
.data = &panic_on_warn,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
index f15732e93c2e..c48694739fee 100644
--- a/kernel/printk/sysctl.c
+++ b/kernel/printk/sysctl.c
@@ -5,20 +5,12 @@
#include <linux/printk.h>
#include <linux/capability.h>
+#include <linux/sysctl.h>
#include <linux/ratelimit.h>
#include "internal.h"
static const int ten_thousand = 10000;
-static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-
static const struct ctl_table printk_sysctls[] = {
{
.procname = "printk",
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 695c33e75efd..47055fedabbc 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -16,6 +16,7 @@
#include <linux/reboot.h>
#include <linux/suspend.h>
#include <linux/syscalls.h>
+#include <linux/sysctl.h>
#include <linux/syscore_ops.h>
#include <linux/uaccess.h>
@@ -1379,7 +1380,7 @@ static const struct ctl_table kern_reboot_table[] = {
.data = &C_A_D,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_sysadmin,
},
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d3a666ffde1..4e0bc095ffeb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -843,6 +843,52 @@ int proc_dointvec(const struct ctl_table *table, int dir, void *buffer,
return do_proc_dointvec(table, dir, buffer, lenp, ppos, NULL);
}
+/**
+ * proc_dointvec_sysadmin - proc_dointvec() with a CAP_SYS_ADMIN check on writes
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads go straight to proc_dointvec(). A write is refused unless
+ * capable(CAP_SYS_ADMIN) succeeds at the time the handler runs, so uid 0
+ * in a user namespace passing the file mode check is not enough by itself.
+ *
+ * Returns -EPERM if a write lacks CAP_SYS_ADMIN, else proc_dointvec()'s result.
+ */
+int proc_dointvec_sysadmin(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ if (dir && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec(table, dir, buffer, lenp, ppos);
+}
+EXPORT_SYMBOL_GPL(proc_dointvec_sysadmin);
+
+/**
+ * proc_dointvec_minmax_sysadmin - proc_dointvec_minmax() with a CAP_SYS_ADMIN check on writes
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads go straight to proc_dointvec_minmax(); a write first needs capable(CAP_SYS_ADMIN).
+ *
+ * Returns -EPERM if a write lacks CAP_SYS_ADMIN, else proc_dointvec_minmax()'s result.
+ */
+int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (dir && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, dir, buffer, lenp, ppos);
+}
+EXPORT_SYMBOL_GPL(proc_dointvec_minmax_sysadmin);
+
/**
* proc_douintvec - read a vector of unsigned integers
* @table: the sysctl table
@@ -1260,6 +1306,12 @@ int proc_dointvec(const struct ctl_table *table, int dir,
return -ENOSYS;
}
+int proc_dointvec_sysadmin(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS; /* stub: always fails, no capability check and no access to @table */
+}
+
int proc_douintvec(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -1272,6 +1324,12 @@ int proc_dointvec_minmax(const struct ctl_table *table, int dir,
return -ENOSYS;
}
+int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS; /* stub: always fails, no capability check and no range validation */
+}
+
int proc_douintvec_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos)
{
--
2.50.1 (Apple Git-155)
On 2026-04-17, wooridge <yurenwang152@gmail.com> wrote:
> Several kernel sysctls that control critical system behavior use
> proc_dointvec() as their handler with mode 0644, but proc_dointvec()
> does not perform any capability checks. In a user namespace where a
> process has uid 0, the VFS layer allows writes based on the file
> permission bits (0644), enabling unprivileged modification of critical
> kernel parameters.
"unprivileged" and "uid 0" are not really compatible terms.
More importantly, these kinds of check-permissions-at-write checks are
actually quite weak because of setuid binaries -- you can always
redirect the output of a setuid binary to a file to bypass these kinds
of checks. A very classic example is mempodipper (CVE-2012-0056) but the
recent "CrackArmor" bug (in particular CVE-2026-23268) is an even more
obvious example. For those reasons, if an attacker can get access to an
O_RDWR handle to a sysctl file, you're long since screwed.
For reference, proc_dointvec_minmax_sysadmin() was added in commit
bfdc0b497faa ("sysctl: restrict write access to dmesg_restrict") but
there are two things to consider:
1. This code was added before mempodipper was discovered and so the
understanding around why these kinds of access controls are not that
useful was not as developed as it is today.
2. The change was primarily focused on trying to avoid a loophole when
people were running with non-userns containers and had access to
writable /proc/sys. This is considered wildly insecure by modern
standards and would not be considered a security hole worthy of fixing
today. (Most container runtimes use user namespaces, which block this,
and most container runtimes also bind-mount /proc/sys as read-only
which also blocks this.)
So, in my view this logic was somewhat flawed back then but makes
absolutely no sense today and should not be propagated to other sysctls
because it just gives a false sense of security.
> Affected sysctls: kernel/panic, kernel/panic_on_oops, kernel/panic_on_warn, kernel/ctrl-alt-del
I also want to point out that the files you mentioned aren't even the
juiciest targets -- kernel.core_pattern gives you free root code
execution on the host if you can write to it.
--
Aleksa Sarai
https://www.cyphar.com/
On Sat, Apr 18, 2026 at 05:14:06AM +1000, Aleksa Sarai wrote:
> More importantly, these kinds of check-permissions-at-write checks are
> actually quite weak because of setuid binaries -- you can always
> redirect the output of a setuid binary to a file to bypass these kinds
> of checks. A very classic example is mempodipper (CVE-2012-0056) but the
> recent "CrackArmor" bug (in particular CVE-2026-23268) is an even more
> obvious example. For those reasons, if an attacker can get access to an
> O_RDWR handle to a sysctl file, you're long since screwed.
>
> So, in my view this logic was somewhat flawed back then but makes
> absolutely no sense today and should not be propagated to other sysctls
> because it just gives a false sense of security.
>
> I also want to point out that the files you mentioned aren't even the
> juiciest targets -- kernel.core_pattern gives you free root code
> execution on the host if you can write to it.
Hi,
I was looking at this in the context of sethostname()/setdomainname(),
and procfs/sysctl seems inconsistent with the syscall interface there:
- the syscalls require:
ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)
- /proc/sys/kernel/hostname and /proc/sys/kernel/domainname mutate the
same UTS namespace state, but only go through the generic procfs DAC
permission check.
So a process can fail the syscall after dropping effective
CAP_SYS_ADMIN, but still change the same value through sysctl if DAC
allows the write.
Is the "false sense of security" argument meant to apply to this direct
API consistency issue as well? The already-open-fd delegation case is a
valid concern, but it seems to show that write-time checks are
incomplete, not that the direct write path should be weaker than the
syscall path.
To summarize the current situation: no fd delegation or privileged helper
is needed. A task that passes procfs DAC can mutate UTS state
(hostname/domainname) directly after dropping CAP_SYS_ADMIN, while the
syscall path rejects the same operation.
If the already-open fd case is the concern, then the check should happen
when the file is opened for write, or should use file->f_cred so the
write is authorized against the opener's credentials. The kernel already
has examples in this direction: /proc/kcore rejects open without
CAP_SYS_RAWIO, /proc/<pid>/setgroups rejects open-for-write without
CAP_SYS_ADMIN in the target user namespace, and file_ns_capable() checks
the opener's credentials.
If those other files require capabilities, it seems to me that
kernel.core_pattern should as well. It controls global coredump
behaviour, including pipe helpers, and a pipe helper runs through the
usermodehelper path with root credentials and the default usermodehelper
capability set. Direct writes to this sysctl should therefore require
CAP_SYS_ADMIN rather than only procfs DAC permissions.
If you agree with this, should I send a patch for hostname/domainname
that makes its procfs writes (through proc_do_uts_string()) require the
same ns_capable() check, together with a selftest for this case?
Longer term, I think the sysctl documentation should state the policy
more explicitly. If writable /proc/sys is not meant to be a security
boundary, Documentation/admin-guide/sysctl/kernel.rst should say so
directly, so users know that environments exposing it writable (e.g.
containers) must treat it as privileged. Then individual entries should
document the non-obvious details: whether writes rely only on procfs DAC,
whether an additional capability or opener-credential check is done, and
whether the knob can have sensitive side effects such as core_pattern's
pipe helper execution. That would avoid having users infer the intended
security model from kernel behaviour and scattered examples.
© 2016 - 2026 Red Hat, Inc.