security/commoncap.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-)
If a program enables `NO_NEW_PRIVS` and sets up
differing real/effective/saved/fs ids, the effective ids are
downgraded during exec because the kernel believes it should "get no
more than they had, and maybe less".
I believe it is safe to keep differing ids even if `NO_NEW_PRIVS` is
set. The newly executed program doesn't get any more, but there's no
reason to give it less.
This is different from "set[ug]id/setpcap" execution where privileges
may be raised; here, the assumption that it's "set[ug]id" if
effective!=real is too broad.
If we verify that all user/group ids remain as they were, we can
safely allow the new program to keep them.
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
---
security/commoncap.c | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/security/commoncap.c b/security/commoncap.c
index 58a0c1c3e409..057a7400ef7d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -861,6 +861,26 @@ static inline bool __is_setuid(struct cred *new, const struct cred *old)
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{ return !gid_eq(new->egid, old->gid); }
+/**
+ * Are all user/group ids in both cred instances identical?
+ *
+ * It can be used after __is_setuid() / __is_setgid() to check whether
+ * this is really a set*id operation or whether both processes just
+ * have differing real/effective ids. It is safe to keep differing
+ * real/effective ids in "unsafe" program execution.
+ */
+static bool has_identical_uids_gids(const struct cred *a, const struct cred *b)
+{
+ return uid_eq(a->uid, b->uid) &&
+ gid_eq(a->gid, b->gid) &&
+ uid_eq(a->suid, b->suid) &&
+ gid_eq(a->sgid, b->sgid) &&
+ uid_eq(a->euid, b->euid) &&
+ gid_eq(a->egid, b->egid) &&
+ uid_eq(a->fsuid, b->fsuid) &&
+ gid_eq(a->fsgid, b->fsgid);
+}
+
/*
* 1) Audit candidate if current->cap_effective is set
*
@@ -940,7 +960,8 @@ int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
*
* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
*/
- is_setid = __is_setuid(new, old) || __is_setgid(new, old);
+ is_setid = (__is_setuid(new, old) || __is_setgid(new, old)) &&
+ !has_identical_uids_gids(new, old);
if ((is_setid || __cap_gained(permitted, new, old)) &&
((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
--
2.47.2
On Thu, Mar 06, 2025 at 09:26:15AM +0100, Max Kellermann wrote:
> If a program enables `NO_NEW_PRIVS` and sets up
> differing real/effective/saved/fs ids, the effective ids are
> downgraded during exec because the kernel believes it should "get no
> more than they had, and maybe less".
>
> I believe it is safe to keep differing ids even if `NO_NEW_PRIVS` is
> set. The newly executed program doesn't get any more, but there's no
> reason to give it less.
>
> This is different from "set[ug]id/setpcap" execution where privileges
> may be raised; here, the assumption that it's "set[ug]id" if
> effective!=real is too broad.
>
> If we verify that all user/group ids remain as they were, we can
> safely allow the new program to keep them.
Thanks, it's an interesting point. Seems to mainly depend on what users
of the feature have come to expect.
Andy, what do you think?
> Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
> ---
> security/commoncap.c | 23 ++++++++++++++++++++++-
> 1 file changed, 22 insertions(+), 1 deletion(-)
>
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 58a0c1c3e409..057a7400ef7d 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -861,6 +861,26 @@ static inline bool __is_setuid(struct cred *new, const struct cred *old)
> static inline bool __is_setgid(struct cred *new, const struct cred *old)
> { return !gid_eq(new->egid, old->gid); }
>
> +/**
> + * Are all user/group ids in both cred instances identical?
> + *
> + * It can be used after __is_setuid() / __is_setgid() to check whether
> + * this is really a set*id operation or whether both processes just
> + * have differing real/effective ids. It is safe to keep differing
> + * real/effective ids in "unsafe" program execution.
> + */
> +static bool has_identical_uids_gids(const struct cred *a, const struct cred *b)
> +{
> + return uid_eq(a->uid, b->uid) &&
> + gid_eq(a->gid, b->gid) &&
> + uid_eq(a->suid, b->suid) &&
> + gid_eq(a->sgid, b->sgid) &&
> + uid_eq(a->euid, b->euid) &&
> + gid_eq(a->egid, b->egid) &&
> + uid_eq(a->fsuid, b->fsuid) &&
> + gid_eq(a->fsgid, b->fsgid);
> +}
> +
> /*
> * 1) Audit candidate if current->cap_effective is set
> *
> @@ -940,7 +960,8 @@ int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
> *
> * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
> */
> - is_setid = __is_setuid(new, old) || __is_setgid(new, old);
> + is_setid = (__is_setuid(new, old) || __is_setgid(new, old)) &&
> + !has_identical_uids_gids(new, old);
>
> if ((is_setid || __cap_gained(permitted, new, old)) &&
> ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
> --
> 2.47.2
On Sun, Mar 9, 2025 at 4:19 PM Serge E. Hallyn <serge@hallyn.com> wrote: > > On Thu, Mar 06, 2025 at 09:26:15AM +0100, Max Kellermann wrote: > > If a program enables `NO_NEW_PRIVS` and sets up > > differing real/effective/saved/fs ids, the effective ids are > > downgraded during exec because the kernel believes it should "get no > > more than they had, and maybe less". > > > > I believe it is safe to keep differing ids even if `NO_NEW_PRIVS` is > > set. The newly executed program doesn't get any more, but there's no > > reason to give it less. > > > > This is different from "set[ug]id/setpcap" execution where privileges > > may be raised; here, the assumption that it's "set[ug]id" if > > effective!=real is too broad. > > > > If we verify that all user/group ids remain as they were, we can > > safely allow the new program to keep them. > > Thanks, it's an interesting point. Seems to mainly depend on what users > of the feature have come to expect. > > Andy, what do you think? Serge & Andy, what's your opinion on my patch?
On Mon, Apr 28, 2025 at 01:43:43PM +0200, Max Kellermann wrote: > On Sun, Mar 9, 2025 at 4:19 PM Serge E. Hallyn <serge@hallyn.com> wrote: > > Adding Kees and Andrew Morgan for their opinions. Sorry, I had snipped the actual patch below. Pre-b4 I would have appended it here, but as you can just b4 mbox CAKPOu+_vTuZqsBLfRH+kyphiWAtRfWq=nKAcAYu=Wn2JBAkkYg@mail.gmail.com I won't do so unless you ask me to. > > On Thu, Mar 06, 2025 at 09:26:15AM +0100, Max Kellermann wrote: > > > If a program enables `NO_NEW_PRIVS` and sets up > > > differing real/effective/saved/fs ids, the effective ids are > > > downgraded during exec because the kernel believes it should "get no > > > more than they had, and maybe less". Just to quibble here: I don't use NO_NEW_PRIVS, but it seems to me quite likely that your claim is wrong here. The whole SECBIT_KEEP_CAPS etc dance is based on the idea that you understand that once you exec, you lose some of your existing privilege. Similarly, it seems quite likely to me that people using NO_NEW_PRIVS understand, expect, and count on the fact that their effective ids will be cleared on exec. Note also that so far I'm only asking about the intent of the patch. Apart from that, I do think the implementation is wrong, because you are impacting non-NO_NEW_PRIVS behavior as well, such as calculation of cap_permitted and the clearing of ambient capabilities. And, I'm not sure the has_identical_uids_gids() is quite right, as I'm not sure what the bprm->cred->fsuid and suid make sense, though the process's fsuid and suid of course need to be checked. > > > I believe it is safe to keep differing ids even if `NO_NEW_PRIVS` is > > > set. The newly executed program doesn't get any more, but there's no > > > reason to give it less. > > > > > > This is different from "set[ug]id/setpcap" execution where privileges > > > may be raised; here, the assumption that it's "set[ug]id" if > > > effective!=real is too broad. > > > > > > If we verify that all user/group ids remain as they were, we can > > > safely allow the new program to keep them. > > > > Thanks, it's an interesting point. Seems to mainly depend on what users > > of the feature have come to expect. > > > > Andy, what do you think? > > Serge & Andy, what's your opinion on my patch?
On Tue, May 6, 2025 at 3:22 PM Serge E. Hallyn <serge@hallyn.com> wrote: > Just to quibble here: I don't use NO_NEW_PRIVS, but it seems to me quite > likely that your claim is wrong here. The whole SECBIT_KEEP_CAPS etc > dance is based on the idea that you understand that once you exec, you > lose some of your existing privilege. Similarly, it seems quite > likely to me that people using NO_NEW_PRIVS understand, expect, and > count on the fact that their effective ids will be cleared on exec. One could define NO_NEW_PRIVS that way, but that's not how it is documented. Of course, we can't rule out that somewhere, somebody exists who relies on the current behavior, and that we must preserve it for ABI stability (I think this was your point). If you desire ABI stability, then this behavior should really be documented. To me, the current implementation looks weird and buggy (but that's just my opinion). The code figures that that it's a set-id exec when effective!=real, which is indeed how set-id execution looks like, but still that check is slightly off: 1. it's really only set-id when new!=old; checking real!=effective is conceptually the wrong angle 2. there may be other reasons why real!=effective My patch is an attempt to fix this in an unintrusive way, by not rewriting it but adding another check to rule out some special case. If I were to rewrite this from scratch, I'd do it differently (only compare new!=old), but I don't want to mess too much with security code that I'm not very familiar with. I believe the guy who initially wrote it made wrong assumptions, but maybe I'm just wrong, I'm not the expert here. > Note also that so far I'm only asking about the intent of the patch. In a shared webhosting environment, we want to run an Apache (or nginx) in each website's container. If the website owner does "chmod 600", the Apache should not be able to read the file; but PHP processes spawned by the Apache should have full access. Therefore, we run Apache with a different fsuid; when Apache executes PHP, the fsuid is reverted. But how to spawn Apache with a different fsuid? Not possible directly (fsuid is always reverted on exec), but by giving it a different euid (and ruid = website uid), granting it access to that secondary uid. After exec, Apache swaps uids, sets effective=real=apache_uid, and fsuid=website_uid. That works fine, until we enable NO_NEW_PRIVS - which is surprising, because we indeed don't want any new privs - just keep the existing ones. The documentation doesn't explain this behavior, and we don't want to omit NO_NEW_PRIVS as a workaround. > Apart from that, I do think the implementation is wrong, because you > are impacting non-NO_NEW_PRIVS behavior as well, such as calculation > of cap_permitted and the clearing of ambient capabilities. You are right, it affects all three code blocks that are checking "is_setid", but why do you believe it's wrong? I can move the new check to the bottom, covering only the "secureexec=1" line, if that worries you. What sure is flawed is that my patch description fails to mention the other two changes. Sorry for that, I'll amend the description (if/when we agree that my patch is ok). Though I do believe that all 3 changes are correct. Why would you want to clear ambient capabilities just because real!=effective? The manpage says: "Executing a program that changes UID or GID due to the set-user-ID or set-group-ID bits or executing a program that has any file capabilities set will clear the ambient set." Documentation and code disagree! Currently, the kernel does not check for "changes UID/GID", but whether effective!=real. These two are orthogonal, the kernel is buggy, and my patch makes it a little bit more correct (but does not remove the wrong real!=effective check, see above). > And, I'm not sure the has_identical_uids_gids() is quite right, as I'm > not sure what the bprm->cred->fsuid and suid make sense, though the > process's fsuid and suid of course need to be checked. Sorry, I don't get that. What do you mean?
On Tue, May 06, 2025 at 04:51:39PM +0200, Max Kellermann wrote: > On Tue, May 6, 2025 at 3:22 PM Serge E. Hallyn <serge@hallyn.com> wrote: > > Just to quibble here: I don't use NO_NEW_PRIVS, but it seems to me quite > > likely that your claim is wrong here. The whole SECBIT_KEEP_CAPS etc > > dance is based on the idea that you understand that once you exec, you > > lose some of your existing privilege. Similarly, it seems quite > > likely to me that people using NO_NEW_PRIVS understand, expect, and > > count on the fact that their effective ids will be cleared on exec. > > One could define NO_NEW_PRIVS that way, but that's not how it is documented. > Of course, we can't rule out that somewhere, somebody exists who > relies on the current behavior, and that we must preserve it for ABI > stability (I think this was your point). If you desire ABI stability, > then this behavior should really be documented. ABI stability is about the most important thing to Linus, so yes, if documentation and code disagree, then we should fix the documentation, except in the case where the current behavior just really is wrong or insecure. > To me, the current implementation looks weird and buggy (but that's > just my opinion). The code figures that that it's a set-id exec when > effective!=real, which is indeed how set-id execution looks like, but > still that check is slightly off: Here's my current reading: The bprm->cred is initially set from current's, with suid and fsuid set to euid. So new->euid will be same as old->euid. Then bprm_creds_from_file() will call bprm_fill_uid() right before security_bprm_creds_from_file(). bprm_fill_uid() will set new->euid to the file's i_uid - only if the setuid bit is set AND only if not task_no_new_privs(current). Meaning that for NNP tasks it never sets new->euid to the file's i_uid. I think the summary is that I don't object to your patch per se (except the ambient creds part, which I'll reply to later - oh and possibly the potential for capability dropping, also for later), but your terminology. setuid and setgid mean something very specific: a file which, when any user executes it, causes execution to happen under the file's owner credentials. And the behavior you are changing has nothing to do with that. What you are changing is that a NNP process with different ruid and euid will continue to run, after exec, with those previous ruid and euid, whether or not the file is setXid. > 1. it's really only set-id when new!=old; checking real!=effective is > conceptually the wrong angle > 2. there may be other reasons why real!=effective > > My patch is an attempt to fix this in an unintrusive way, by not > rewriting it but adding another check to rule out some special case. > If I were to rewrite this from scratch, I'd do it differently (only > compare new!=old), but I don't want to mess too much with security > code that I'm not very familiar with. I believe the guy who initially > wrote it made wrong assumptions, but maybe I'm just wrong, I'm not the > expert here. > > > Note also that so far I'm only asking about the intent of the patch. > > In a shared webhosting environment, we want to run an Apache (or > nginx) in each website's container. If the website owner does "chmod > 600", the Apache should not be able to read the file; but PHP > processes spawned by the Apache should have full access. Therefore, we > run Apache with a different fsuid; when Apache executes PHP, the fsuid > is reverted. > > But how to spawn Apache with a different fsuid? Not possible directly > (fsuid is always reverted on exec), but by giving it a different euid > (and ruid = website uid), granting it access to that secondary uid. > After exec, Apache swaps uids, sets effective=real=apache_uid, and > fsuid=website_uid. > That works fine, until we enable NO_NEW_PRIVS - which is surprising, > because we indeed don't want any new privs - just keep the existing > ones. > The documentation doesn't explain this behavior, and we don't want to > omit NO_NEW_PRIVS as a workaround. > > > Apart from that, I do think the implementation is wrong, because you > > are impacting non-NO_NEW_PRIVS behavior as well, such as calculation > > of cap_permitted and the clearing of ambient capabilities. > > You are right, it affects all three code blocks that are checking > "is_setid", but why do you believe it's wrong? > I can move the new check to the bottom, covering only the > "secureexec=1" line, if that worries you. > > What sure is flawed is that my patch description fails to mention the > other two changes. Sorry for that, I'll amend the description (if/when > we agree that my patch is ok). > > Though I do believe that all 3 changes are correct. Why would you want > to clear ambient capabilities just because real!=effective? The > manpage says: "Executing a program that changes UID or GID due to the > set-user-ID or set-group-ID bits or executing a program that has any > file capabilities set will clear the ambient set." > > Documentation and code disagree! Currently, the kernel does not check > for "changes UID/GID", but whether effective!=real. These two are > orthogonal, the kernel is buggy, and my patch makes it a little bit > more correct (but does not remove the wrong real!=effective check, see > above). > > > And, I'm not sure the has_identical_uids_gids() is quite right, as I'm > > not sure what the bprm->cred->fsuid and suid make sense, though the > > process's fsuid and suid of course need to be checked. > > Sorry, I don't get that. What do you mean? I meant I hadn't yet delved back into the location of bprm_fill_uid() etc, and I know that code has moved around a bit in the last few years.
On Fri, May 9, 2025 at 12:12 AM <sergeh@kernel.org> wrote: > ABI stability is about the most important thing to Linus, so yes, if > documentation and code disagree, then we should fix the documentation, > except in the case where the current behavior just really is wrong > or insecure. It is insecure indeed (can be abused for LD_PRELOAD attacks):https://lore.kernel.org/lkml/CAKPOu+8+1uVrDJHwmHJd2d46-N6AwjR4_bbtoSJS+sx6J=rkjg@mail.gmail.com/
Max Kellermann <max.kellermann@ionos.com> writes: > On Fri, May 9, 2025 at 12:12 AM <sergeh@kernel.org> wrote: >> ABI stability is about the most important thing to Linus, so yes, if >> documentation and code disagree, then we should fix the documentation, >> except in the case where the current behavior just really is wrong >> or insecure. > > It is insecure indeed (can be abused for LD_PRELOAD > attacks):https://lore.kernel.org/lkml/CAKPOu+8+1uVrDJHwmHJd2d46-N6AwjR4_bbtoSJS+sx6J=rkjg@mail.gmail.com/ I don't understand what you are trying to solve, but the patch at the top of the thread introduces a has_identical_uids_gids and is pure nonsense. In particular __is_setuid or __is_setgid being true guarantees that has_identical_uids_gids will be false. Which means has_identical_uids_gids adds nothing, and the patch is pointless. If your concern is LD_PRELOAD and the like please don't play with the uids/gids and instead just make certain bprm->secureexec gets set. At this point I am pretty certain that changing the logic and leaving extra uids/gids set will result in security vulnerabilities for someone who actually depends upon how the code works today. I see no evidence in this conversation that anyone has surveyed the users of NO_NEW_PRIVS and verified how anyone actually uses it. Without such evidence we have to assume that userspace depends upon the current behavior. Eric
On Fri, May 9, 2025 at 4:45 PM Eric W. Biederman <ebiederm@xmission.com> wrote: > In particular __is_setuid or __is_setgid being true guarantees > that has_identical_uids_gids will be false. Sorry, no, that's completely wrong! __is_setXid() compares effective with real. has_identical_uids_gids() compares effective with effective, real with real etc. See the difference? > Which means has_identical_uids_gids adds nothing, and the patch is > pointless. Also wrong. If that were correct, then my patch would not have an observable effect. But it does. Try it, try the small program I posted! It seems your whole email is based on this misunderstanding. Please reconsider. > If your concern is LD_PRELOAD and the like please don't play with > the uids/gids and instead just make certain bprm->secureexec gets > set. LD_PRELOAD is not my concern at all. I just observed that the current kernel behavior can annul the LD_PRELOAD/suid protection as implemented in glibc. > I see no evidence > in this conversation that anyone has surveyed the users of NO_NEW_PRIVS > and verified how anyone actually uses it. Without such evidence we > have to assume that userspace depends upon the current behavior. That's fine for me. But this behavior should be documented, because it is rather surprising. (In any case, we will keep the patch in our kernel fork because we need this part of the kernel to work properly. Our machines don't run any code that depends on the buggy behavior.) Max
On Fri, May 09, 2025 at 06:53:11PM +0200, Max Kellermann wrote: > On Fri, May 9, 2025 at 4:45 PM Eric W. Biederman <ebiederm@xmission.com> wrote: > > In particular __is_setuid or __is_setgid being true guarantees > > that has_identical_uids_gids will be false. > > Sorry, no, that's completely wrong! > > __is_setXid() compares effective with real. > has_identical_uids_gids() compares effective with effective, real with real etc. > > See the difference? > > > Which means has_identical_uids_gids adds nothing, and the patch is > > pointless. > > Also wrong. If that were correct, then my patch would not have an > observable effect. But it does. Try it, try the small program I > posted! > > It seems your whole email is based on this misunderstanding. Please reconsider. > > > If your concern is LD_PRELOAD and the like please don't play with > > the uids/gids and instead just make certain bprm->secureexec gets > > set. > > LD_PRELOAD is not my concern at all. I just observed that the current Right, it is an aside, though an important one. > kernel behavior can annul the LD_PRELOAD/suid protection as > implemented in glibc. Hm, but no, it doesn't annul glibc's protection, right? The concern is that: a. musl doesn't implement LD_PRELOAD clearing b. with NNP, setuid-exec followed by setting NNP followed by exec, will lead to different behavior from non-NNP. In non-NNP, you'll continue to have euid=0, ruid=1000, and caps. With NNP, you'll have euid=ruid=1000, and still full caps. So, if someone is using NNP believing that it is a safe way to execute untrusted code with privilege, because they see they are now uid 1000, they will be confused. Worse, they are subject with musl to LD_PRELOAD from the user before setuid-root. So two things we can do are 1. have NNP drop privilege. 2. have NNP not force euid to be ruid. (1) sort of makes sense since you've bothered to use NNP, but as Max, who is a user of NNP, says, that is not the behavior that would be useful to him. It also leaves the non-NNP and NNP exec behavior - which is already - obviously - far too complicated - with yet more cases. (2) is concerning because it is a change in behavior for NNP users, but on the other hand, it leaves us with fewer special cases. At this point I'm kind of leaning towards (2), though with the obvious modification Max has already found should be added (for secureexec). > > I see no evidence > > in this conversation that anyone has surveyed the users of NO_NEW_PRIVS > > and verified how anyone actually uses it. Without such evidence we Max is such a user. I don't know what we can do to get input from more users. Perhaps scan the debian codebase results at https://codesearch.debian.net/search?q=NO_NEW_PRIVS&literal=1 I'll take a look through those in a bit. > > have to assume that userspace depends upon the current behavior. > > That's fine for me. But this behavior should be documented, because it > is rather surprising. > > (In any case, we will keep the patch in our kernel fork because we > need this part of the kernel to work properly. Our machines don't run > any code that depends on the buggy behavior.) > > Max
Usually, execve() preserves the effective ids. Many programs rely on
this to detect setuid/setgid execution and will disable certain
features (such as rejecting certain user input / environment
variables).
However, if NO_NEW_PRIVS is set, effective ids are always reset by
cap_bprm_creds_from_file(), but capabilities are not revoked. That
means the process looks like it's not setuid/setgid, but has full
capabilities, and is effectively a superuser process. This breaks
userspace assumptions.
It was argued [1] that this surprising behavior must not change
because programs might rely on it:
Of course, this leaves many programs vulnerable, but if we decide the
behavior must remain, we should at least document it with a warning.
[1] https://lore.kernel.org/lkml/87h61t7siv.fsf@email.froward.int.ebiederm.org/
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
---
Documentation/userspace-api/no_new_privs.rst | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/Documentation/userspace-api/no_new_privs.rst b/Documentation/userspace-api/no_new_privs.rst
index d060ea217ea1..89b0884991e9 100644
--- a/Documentation/userspace-api/no_new_privs.rst
+++ b/Documentation/userspace-api/no_new_privs.rst
@@ -29,6 +29,12 @@ bits will no longer change the uid or gid; file capabilities will not
add to the permitted set, and LSMs will not relax constraints after
execve.
+A successful execve call with ``no_new_privs`` will reset the
+effective uid/gid to the real uid/gid, but does not drop capabilities.
+This means that comparing effective and real ids is not a valid method
+to detect setuid/setgid execution; the proper way to do that is
+getauxval(AT_SECURE).
+
To set ``no_new_privs``, use::
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
--
2.47.2
On Tue, May 6, 2025 at 7:51 AM Max Kellermann <max.kellermann@ionos.com> wrote: > To me, the current implementation looks weird and buggy (but that's > just my opinion). The code figures that that it's a set-id exec when > effective!=real, which is indeed how set-id execution looks like, but > still that check is slightly off: > > 1. it's really only set-id when new!=old; checking real!=effective is > conceptually the wrong angle > 2. there may be other reasons why real!=effective > > My patch is an attempt to fix this in an unintrusive way, by not > rewriting it but adding another check to rule out some special case. > If I were to rewrite this from scratch, I'd do it differently (only > compare new!=old), but I don't want to mess too much with security > code that I'm not very familiar with. I believe the guy who initially > wrote it made wrong assumptions, but maybe I'm just wrong, I'm not the > expert here. > > > Note also that so far I'm only asking about the intent of the patch. > > In a shared webhosting environment, we want to run an Apache (or > nginx) in each website's container. If the website owner does "chmod > 600", the Apache should not be able to read the file; but PHP > processes spawned by the Apache should have full access. Therefore, we > run Apache with a different fsuid; when Apache executes PHP, the fsuid > is reverted. [...] > > Apart from that, I do think the implementation is wrong, because you > > are impacting non-NO_NEW_PRIVS behavior as well, such as calculation > > of cap_permitted and the clearing of ambient capabilities. > > You are right, it affects all three code blocks that are checking > "is_setid", but why do you believe it's wrong? > I can move the new check to the bottom, covering only the > "secureexec=1" line, if that worries you. If a setuid program execs itself, does the presence of this code undo any protection the kernel afforded it on its first invocation? It feels like changing that could easily turn out to be exploitable in some context. Programs re-exec themselves all the time. > > And, I'm not sure the has_identical_uids_gids() is quite right, as I'm > > not sure what the bprm->cred->fsuid and suid make sense, though the > > process's fsuid and suid of course need to be checked. > > Sorry, I don't get that. What do you mean? I suspect Serge is noting that s*id and f*uid are forced to be the corresponding e*id later in the code (indeed you mention this feature above where I've put [...]), so comparing the values before that takes effect isn't really comparing the right values in your added code. FWIW I ran the libcap quicktest.sh script against your change and it doesn't break any capability thing I test for when making libcap releases. That being said, in general this whole zoo of *[ug]id values and their subtle behavior is not casually obvious. I'd recommend using a userspace workaround for your use case, and not changing the legacy behavior of the kernel. Cheers Andrew
On Wed, May 7, 2025 at 5:16 AM Andrew G. Morgan <morgan@kernel.org> wrote: > If a setuid program execs itself, does the presence of this code undo > any protection the kernel afforded it on its first invocation? What protection do you mean, and what behavior do you expect when setid execs itself? I see this affects: 1. reset effective ids to real ids (only affects NO_NEW_PRIVS) 2. new cap_permitted cannot be higher than old cap_permitted 3. clear cap_ambient 4. clear pdeath_signal (in begin_new_exec) 5. reset stack limits (in begin_new_exec) About these (from my very limited knowledge of this part of the kernel): 1. is my primary goal, and really no new privs gained by allowing the process to keep existing ids 2. only ever changes anything if new cap_permitted is higher, but if that's the case, the is_setid check is irrelevant because __cap_gained is true, therefore no change with my patch 3. as I already described, the kernel is wrong (or the documentation is wrong), and my patch adjusts kernel to behave as documented 4. I don't see how this is dangerous for anything regarding re-exec; if pdeath_signal wasn't reset on the first exec, it's safe to keep it after the re-exec, too 5. same as 4, I think Did I miss anything? > FWIW I ran the libcap quicktest.sh script against your change and it > doesn't break any capability thing I test for when making libcap > releases. Thanks for taking the time to run these tests. I'm glad the existing tests didn't find any obvious bugs. If we identify an actual problem with my patch, let's write a new test that fails with my patch! > That being said, in general this whole zoo of *[ug]id values > and their subtle behavior is not casually obvious. I'd recommend using > a userspace workaround for your use case, and not changing the legacy > behavior of the kernel. What userspace workaround would be possible? The only thing that comes to my mind is to apply NO_NEW_PRIVS in the child process after exec - but that's too late, because arbitrary untrusted code has already been executed at this point. This means I lose NO_NEW_PRIVS completely, because the attacker can simply skip the call. Max
On Wed, May 7, 2025 at 8:33 AM Max Kellermann <max.kellermann@ionos.com> wrote: > What protection do you mean, and what behavior do you expect when > setid execs itself? I see this affects: > > 1. reset effective ids to real ids (only affects NO_NEW_PRIVS) > 2. new cap_permitted cannot be higher than old cap_permitted > 3. clear cap_ambient > 4. clear pdeath_signal (in begin_new_exec) > 5. reset stack limits (in begin_new_exec) > [...] > > Did I miss anything? Indeed I missed something (but this was apparently so hard to find that nobody could answer my question, until I found out myself). The "secureexec" flag is not just used for resetting pdeath_signal and stack limits; its primary purpose is to set the AT_SECURE ELF flag. So yes, my patch is wrong. The "secureexec" setting must be excluded from my patch. Max
I'm looking at this code:
/* Check for privilege-elevated exec. */
if (is_setid ||
(!__is_real(root_uid, new) &&
(effective ||
__cap_grew(permitted, ambient, new))))
bprm->secureexec = 1;
If a luser runs a setuid program, then the kernel will set this
bprm->secureexec bit. Indeed, every time this program re-exec's
itself, that bit will consistently be set. Today.
However, with your change, that behavior will change. The first time
the program is exec'd by luser this bit will be set. However, it will
"surprisingly" not occur should the program exec itself again. If you
are unaware of this bit's purpose there is a nice writeup here:
https://www.kernel.org/doc/html/v5.1/security/LSM.html
See the "bprm_set_creds" paragraph. My concern is that there is an
exploit vector associated with an abuser setting LD_LIBRARY_PATH= to
something nefarious, and then invoking a setuid program that happens
to re-exec itself for some reason. The first invocation will be as
before, but when the binary re-exec's itself, I am concerned that this
could cause the privileged binary to load an exploit.
This has nothing to do with your interest in NO_NEW_PRIV but more to
do with the legacy behavior changes like this are exposed to.
Cheers
Andrew
On Tue, May 6, 2025 at 11:33 PM Max Kellermann <max.kellermann@ionos.com> wrote:
>
> On Wed, May 7, 2025 at 5:16 AM Andrew G. Morgan <morgan@kernel.org> wrote:
> > If a setuid program execs itself, does the presence of this code undo
> > any protection the kernel afforded it on its first invocation?
>
> What protection do you mean, and what behavior do you expect when
> setid execs itself? I see this affects:
>
> 1. reset effective ids to real ids (only affects NO_NEW_PRIVS)
> 2. new cap_permitted cannot be higher than old cap_permitted
> 3. clear cap_ambient
> 4. clear pdeath_signal (in begin_new_exec)
> 5. reset stack limits (in begin_new_exec)
>
> About these (from my very limited knowledge of this part of the kernel):
>
> 1. is my primary goal, and really no new privs gained by allowing the
> process to keep existing ids
> 2. only ever changes anything if new cap_permitted is higher, but if
> that's the case, the is_setid check is irrelevant because __cap_gained
> is true, therefore no change with my patch
> 3. as I already described, the kernel is wrong (or the documentation
> is wrong), and my patch adjusts kernel to behave as documented
> 4. I don't see how this is dangerous for anything regarding re-exec;
> if pdeath_signal wasn't reset on the first exec, it's safe to keep it
> after the re-exec, too
> 5. same as 4, I think
>
> Did I miss anything?
>
> > FWIW I ran the libcap quicktest.sh script against your change and it
> > doesn't break any capability thing I test for when making libcap
> > releases.
>
> Thanks for taking the time to run these tests. I'm glad the existing
> tests didn't find any obvious bugs. If we identify an actual problem
> with my patch, let's write a new test that fails with my patch!
>
> > That being said, in general this whole zoo of *[ug]id values
> > and their subtle behavior is not casually obvious. I'd recommend using
> > a userspace workaround for your use case, and not changing the legacy
> > behavior of the kernel.
>
> What userspace workaround would be possible? The only thing that comes
> to my mind is to apply NO_NEW_PRIVS in the child process after exec -
> but that's too late, because arbitrary untrusted code has already been
> executed at this point. This means I lose NO_NEW_PRIVS completely,
> because the attacker can simply skip the call.
>
> Max
On Thu, May 8, 2025 at 5:32 AM Andrew G. Morgan <morgan@kernel.org> wrote:
> See the "bprm_set_creds" paragraph. My concern is that there is an
> exploit vector associated with an abuser setting LD_LIBRARY_PATH= to
> something nefarious, and then invoking a setuid program that happens
> to re-exec itself for some reason. The first invocation will be as
> before, but when the binary re-exec's itself, I am concerned that this
> could cause the privileged binary to load an exploit.
Let's talk about this potential vulnerability again. Consider the
following code:
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>
int main(int argc, char **argv) {
printf("ruid=%d euid=%d\n", getuid(), geteuid());
if (argc > 1) {
printf("setting NO_NEW_PRIVS\n");
prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
}
if (**argv) {
printf("re-exec\n");
execl(*argv, "", NULL);
perror("exec");
}
}
Without my patch:
$ /reexec
ruid=1000 euid=0
re-exec
ruid=1000 euid=0
$ /reexec 1
ruid=1000 euid=0
setting NO_NEW_PRIVS
re-exec
ruid=1000 euid=1000
Without NO_NEW_PRIVS, the re-exec keeps the real/effective, but also
gains setuid again, but the suid is no-op; the euid is already 0.
With NO_NEW_PRIVS, the re-exec drops the euid=0, and doesn't regain it
- the setuid bit is ignored.
With my patch:
$ /reexec
ruid=1000 euid=0
re-exec
ruid=1000 euid=0
$ /reexec 1
ruid=1000 euid=0
setting NO_NEW_PRIVS
re-exec
ruid=1000 euid=0
Same without NO_NEW_PRIVS (but internally, the re-exec is not
considered "secureexec" because the setuid bit is effectively a
no-op).
With NO_NEW_PRIVS, the setuid bit is ignored, but the process is
allowed to keep the euid=0 - which is the whole point of my patch.
Indeed, no new privs are gained - just like NO_NEW_PRIVS is
documented!
Back to your LD_LIBRARY_PATH example: with my patch, glibc ignores
LD_LIBRARY_PATH because effective!==real (still).
But without my patch, with the vanilla kernel, glibc does use
LD_LIBRARY_PATH (effective==real because effective was reset) and
you're actually vulnerable! It seems to be quite opposite of what you
have been assuming?
(Don't get confused that the kernel has reverted the euid to 1000;
this is a privileged superuser process with a full set of
capabilities!)
Am I missing something obvious, or is NO_NEW_PRIVS+reexec is a serious
Linux kernel vulnerability? (And it is fixed by my patch)
Max
On Thu, May 8, 2025 at 5:32 AM Andrew G. Morgan <morgan@kernel.org> wrote: > If a luser runs a setuid program, then the kernel will set this > bprm->secureexec bit. Indeed, every time this program re-exec's > itself, that bit will consistently be set. Today. > > However, with your change, that behavior will change. The first time > the program is exec'd by luser this bit will be set. However, it will > "surprisingly" not occur should the program exec itself again. I had covered this case in my previous email already. This flag is only used by begin_new_exec(), and the only consequence is resetting pdeath_signal (point 4) and stack limit reset (point 5). I thought both are no deal at all for the second exec - or are they? I don't know. > If you are unaware of this bit's purpose there is a nice writeup here: > > https://www.kernel.org/doc/html/v5.1/security/LSM.html I don't see this document describing the purpose anywhere. It only states that bprm_set_creds should set it "if a secure exec has happened" but doesn't say what a "secure exec" is, and doesn't say what is supposed to happen when it's set. Is it really a "secureexec" if executing a non-suid program while having effective!=real? (Currently: true, my patch: false) Is it really a "secureexec" if executing a suid program but no actual uid/gid changes occurred (no-op) because they're the same as before (and no capabilities raised)? (Currently: true, my patch: false) > See the "bprm_set_creds" paragraph. My concern is that there is an > exploit vector associated with an abuser setting LD_LIBRARY_PATH= to > something nefarious, and then invoking a setuid program that happens > to re-exec itself for some reason. The first invocation will be as > before, but when the binary re-exec's itself, I am concerned that this > could cause the privileged binary to load an exploit. How would resetting pdeath_signal and stack limit affect this problem? LD_LIBRARY_PATH is an environment variable that's used by the userspace linker, not by the kernel. Userspace doesn't have access to the "secureexec" flag. The usual protection against LD_LIBRARY_PATH/LD_PRELOAD attacks is that the glibc ld.so ignores (or removes?) these when it observes real!=effective. That check still works with my patch, and has nothing to do with the "secureexec" flag, does it? > This has nothing to do with your interest in NO_NEW_PRIV but more to > do with the legacy behavior changes like this are exposed to. Yes, I understand your worries, and I can limit my patch to the NO_NEW_PRIVS case, if you prefer that. But I think it is worth exploring the legacy behavior and see if there is really a problem (i.e. if there is really a userspace-visible effect), and if that legacy behavior really makes sense. To me, it seems like the legacy code doesn't make much sense; the checks for "secureexec" are wrong currently, and maybe we can fix this without adding more complexity, but by cutting unnecessary complexity away. Simpler code that is easier to understand is less likely to have vulnerabilities. Max
Hi Max,
kernel test robot noticed the following build warnings:
[auto build test WARNING on pcmoore-selinux/next]
[also build test WARNING on linus/master v6.14-rc5 next-20250306]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Max-Kellermann/security-commoncap-don-t-assume-setid-if-all-ids-are-identical/20250306-162826
base: https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git next
patch link: https://lore.kernel.org/r/20250306082615.174777-1-max.kellermann%40ionos.com
patch subject: [PATCH] security/commoncap: don't assume "setid" if all ids are identical
config: arc-allnoconfig (https://download.01.org/0day-ci/archive/20250307/202503071808.FE4vwUc4-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250307/202503071808.FE4vwUc4-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503071808.FE4vwUc4-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> security/commoncap.c:865: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
* Are all user/group ids in both cred instances identical?
vim +865 security/commoncap.c
863
864 /**
> 865 * Are all user/group ids in both cred instances identical?
866 *
867 * It can be used after __is_setuid() / __is_setgid() to check whether
868 * this is really a set*id operation or whether both processes just
869 * have differing real/effective ids. It is safe to keep differing
870 * real/effective ids in "unsafe" program execution.
871 */
872 static bool has_identical_uids_gids(const struct cred *a, const struct cred *b)
873 {
874 return uid_eq(a->uid, b->uid) &&
875 gid_eq(a->gid, b->gid) &&
876 uid_eq(a->suid, b->suid) &&
877 gid_eq(a->sgid, b->sgid) &&
878 uid_eq(a->euid, b->euid) &&
879 gid_eq(a->egid, b->egid) &&
880 uid_eq(a->fsuid, b->fsuid) &&
881 gid_eq(a->fsgid, b->fsgid);
882 }
883
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2026 Red Hat, Inc.