fs/exec.c | 22 ++++++++++++++++++---- include/uapi/linux/fcntl.h | 3 ++- 2 files changed, 20 insertions(+), 5 deletions(-)
From: Tycho Andersen <tandersen@netflix.com>
Zbigniew mentioned at Linux Plumber's that systemd is interested in
switching to execveat() for service execution, but can't, because the
contents of /proc/pid/comm are the file descriptor which was used,
instead of the path to the binary. This makes the output of tools like
top and ps useless, especially in a world where most fds are opened
CLOEXEC so the number is truly meaningless.
This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the
contents of argv[0], instead of the fdno.
Signed-off-by: Tycho Andersen <tandersen@netflix.com>
Suggested-by: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
CC: Aleksa Sarai <cyphar@cyphar.com>
---
There is some question about what to name the flag; it seems to me that
"everyone wants this" instead of the fdno, but probably "REASONABLE" is not
a good choice.
Also, requiring the arg to alloc_bprm() is a bit ugly: kernel-based execs
will never use this, so they just have to pass an empty thing. We could
introduce a bprm_fixup_comm() to do the munging there, but then the code
paths start to diverge, which is maybe not nice. I left it this way because
this is the smallest patch in terms of size, but I'm happy to change it.
Finally, here is a small set of test programs, I'm happy to turn them into
kselftests if we agree on an API
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
int main(void)
{
int fd;
char buf[128];
fd = open("/proc/self/comm", O_RDONLY);
if (fd < 0) {
perror("open comm");
exit(1);
}
if (read(fd, buf, 128) < 0) {
perror("read");
exit(1);
}
printf("comm: %s", buf);
exit(0);
}
#define _GNU_SOURCE
#include <stdio.h>
#include <syscall.h>
#include <stdbool.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/wait.h>
#ifndef AT_EMPTY_PATH
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative */
#endif
#ifndef AT_EXEC_REASONABLE_COMM
#define AT_EXEC_REASONABLE_COMM 0x200
#endif
int main(int argc, char *argv[])
{
pid_t pid;
int status;
bool wants_reasonable_comm = argc > 1;
pid = fork();
if (pid < 0) {
perror("fork");
exit(1);
}
if (pid == 0) {
int fd;
long ret, flags;
fd = open("./catprocselfcomm", O_PATH);
if (fd < 0) {
perror("open catprocselfname");
exit(1);
}
flags = AT_EMPTY_PATH;
if (wants_reasonable_comm)
flags |= AT_EXEC_REASONABLE_COMM;
syscall(__NR_execveat, fd, "", (char *[]){"./catprocselfcomm", NULL}, NULL, flags);
fprintf(stderr, "execveat failed %d\n", errno);
exit(1);
}
if (waitpid(pid, &status, 0) != pid) {
fprintf(stderr, "wrong child\n");
exit(1);
}
if (!WIFEXITED(status)) {
fprintf(stderr, "exit status %x\n", status);
exit(1);
}
if (WEXITSTATUS(status) != 0) {
fprintf(stderr, "child failed\n");
exit(1);
}
return 0;
}
---
fs/exec.c | 22 ++++++++++++++++++----
include/uapi/linux/fcntl.h | 3 ++-
2 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index dad402d55681..36434feddb7b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1569,11 +1569,15 @@ static void free_bprm(struct linux_binprm *bprm)
kfree(bprm);
}
-static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename,
+ struct user_arg_ptr argv, int flags)
{
struct linux_binprm *bprm;
struct file *file;
int retval = -ENOMEM;
+ bool needs_comm_fixup = flags & AT_EXEC_REASONABLE_COMM;
+
+ flags &= ~AT_EXEC_REASONABLE_COMM;
file = do_open_execat(fd, filename, flags);
if (IS_ERR(file))
@@ -1590,11 +1594,20 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl
if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
} else {
- if (filename->name[0] == '\0')
+ if (needs_comm_fixup) {
+ const char __user *p = get_user_arg_ptr(argv, 0);
+
+ retval = -EFAULT;
+ if (!p)
+ goto out_free;
+
+ bprm->fdpath = strndup_user(p, MAX_ARG_STRLEN);
+ } else if (filename->name[0] == '\0')
bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
else
bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
fd, filename->name);
+ retval = -ENOMEM;
if (!bprm->fdpath)
goto out_free;
@@ -1969,7 +1982,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
- bprm = alloc_bprm(fd, filename, flags);
+ bprm = alloc_bprm(fd, filename, argv, flags);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -2034,6 +2047,7 @@ int kernel_execve(const char *kernel_filename,
struct linux_binprm *bprm;
int fd = AT_FDCWD;
int retval;
+ struct user_arg_ptr user_argv = {};
/* It is non-sense for kernel threads to call execve */
if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
@@ -2043,7 +2057,7 @@ int kernel_execve(const char *kernel_filename,
if (IS_ERR(filename))
return PTR_ERR(filename);
- bprm = alloc_bprm(fd, filename, 0);
+ bprm = alloc_bprm(fd, filename, user_argv, 0);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 87e2dec79fea..7178d1e4a3de 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -100,7 +100,8 @@
/* Reserved for per-syscall flags 0xff. */
#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic
links. */
-/* Reserved for per-syscall flags 0x200 */
+#define AT_EXEC_REASONABLE_COMM 0x200 /* Use argv[0] for comm in
+ execveat */
#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount
traversal. */
base-commit: baeb9a7d8b60b021d907127509c44507539c15e5
--
2.34.1
On Tue, Sep 24, 2024 at 08:10:01AM GMT, Tycho Andersen wrote: > From: Tycho Andersen <tandersen@netflix.com> > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > switching to execveat() for service execution, but can't, because the > contents of /proc/pid/comm are the file descriptor which was used, > instead of the path to the binary. This makes the output of tools like > top and ps useless, especially in a world where most fds are opened > CLOEXEC so the number is truly meaningless. > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > contents of argv[0], instead of the fdno. > > Signed-off-by: Tycho Andersen <tandersen@netflix.com> > Suggested-by: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> > CC: Aleksa Sarai <cyphar@cyphar.com> > --- > There is some question about what to name the flag; it seems to me that > "everyone wants this" instead of the fdno, but probably "REASONABLE" is not > a good choice. > > Also, requiring the arg to alloc_bprm() is a bit ugly: kernel-based execs > will never use this, so they just have to pass an empty thing. We could > introduce a bprm_fixup_comm() to do the munging there, but then the code > paths start to diverge, which is maybe not nice. I left it this way because > this is the smallest patch in terms of size, but I'm happy to change it. > > Finally, here is a small set of test programs, I'm happy to turn them into > kselftests if we agree on an API > > #include <stdio.h> > #include <unistd.h> > #include <stdlib.h> > #include <sys/types.h> > #include <sys/stat.h> > #include <fcntl.h> > > int main(void) > { > int fd; > char buf[128]; > > fd = open("/proc/self/comm", O_RDONLY); > if (fd < 0) { > perror("open comm"); > exit(1); > } > > if (read(fd, buf, 128) < 0) { > perror("read"); > exit(1); > } > > printf("comm: %s", buf); > exit(0); > } > > #define _GNU_SOURCE > #include <stdio.h> > #include <syscall.h> > #include <stdbool.h> > #include <unistd.h> > #include <fcntl.h> > #include <stdlib.h> > #include <errno.h> > #include <sys/wait.h> > > #ifndef AT_EMPTY_PATH > #define AT_EMPTY_PATH 0x1000 /* Allow empty relative */ > #endif > > #ifndef AT_EXEC_REASONABLE_COMM > #define AT_EXEC_REASONABLE_COMM 0x200 > #endif > > int main(int argc, char *argv[]) > { > pid_t pid; > int status; > bool wants_reasonable_comm = argc > 1; > > pid = fork(); > if (pid < 0) { > perror("fork"); > exit(1); > } > > if (pid == 0) { > int fd; > long ret, flags; > > fd = open("./catprocselfcomm", O_PATH); > if (fd < 0) { > perror("open catprocselfname"); > exit(1); > } > > flags = AT_EMPTY_PATH; > if (wants_reasonable_comm) > flags |= AT_EXEC_REASONABLE_COMM; > syscall(__NR_execveat, fd, "", (char *[]){"./catprocselfcomm", NULL}, NULL, flags); Yes, that one is the actually palatable solution that I mentioned during the session and not the questionable version where the path argument is overloaded by the flag. Please add a: Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec to the commit where this originated from. > fprintf(stderr, "execveat failed %d\n", errno); > exit(1); > } > > if (waitpid(pid, &status, 0) != pid) { > fprintf(stderr, "wrong child\n"); > exit(1); > } > > if (!WIFEXITED(status)) { > fprintf(stderr, "exit status %x\n", status); > exit(1); > } > > if (WEXITSTATUS(status) != 0) { > fprintf(stderr, "child failed\n"); > exit(1); > } > > return 0; > } > --- > fs/exec.c | 22 ++++++++++++++++++---- > include/uapi/linux/fcntl.h | 3 ++- > 2 files changed, 20 insertions(+), 5 deletions(-) > > diff --git a/fs/exec.c b/fs/exec.c > index dad402d55681..36434feddb7b 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1569,11 +1569,15 @@ static void free_bprm(struct linux_binprm *bprm) > kfree(bprm); > } > > -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) > +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, > + struct user_arg_ptr argv, int flags) > { > struct linux_binprm *bprm; > struct file *file; > int retval = -ENOMEM; > + bool needs_comm_fixup = flags & AT_EXEC_REASONABLE_COMM; > + > + flags &= ~AT_EXEC_REASONABLE_COMM; > > file = do_open_execat(fd, filename, flags); > if (IS_ERR(file)) > @@ -1590,11 +1594,20 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl > if (fd == AT_FDCWD || filename->name[0] == '/') { > bprm->filename = filename->name; > } else { > - if (filename->name[0] == '\0') > + if (needs_comm_fixup) { > + const char __user *p = get_user_arg_ptr(argv, 0); > + > + retval = -EFAULT; > + if (!p) > + goto out_free; > + > + bprm->fdpath = strndup_user(p, MAX_ARG_STRLEN); > + } else if (filename->name[0] == '\0') > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); > else > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", > fd, filename->name); > + retval = -ENOMEM; > if (!bprm->fdpath) > goto out_free; > > @@ -1969,7 +1982,7 @@ static int do_execveat_common(int fd, struct filename *filename, > * further execve() calls fail. */ > current->flags &= ~PF_NPROC_EXCEEDED; > > - bprm = alloc_bprm(fd, filename, flags); > + bprm = alloc_bprm(fd, filename, argv, flags); > if (IS_ERR(bprm)) { > retval = PTR_ERR(bprm); > goto out_ret; > @@ -2034,6 +2047,7 @@ int kernel_execve(const char *kernel_filename, > struct linux_binprm *bprm; > int fd = AT_FDCWD; > int retval; > + struct user_arg_ptr user_argv = {}; > > /* It is non-sense for kernel threads to call execve */ > if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) > @@ -2043,7 +2057,7 @@ int kernel_execve(const char *kernel_filename, > if (IS_ERR(filename)) > return PTR_ERR(filename); > > - bprm = alloc_bprm(fd, filename, 0); > + bprm = alloc_bprm(fd, filename, user_argv, 0); > if (IS_ERR(bprm)) { > retval = PTR_ERR(bprm); > goto out_ret; > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h > index 87e2dec79fea..7178d1e4a3de 100644 > --- a/include/uapi/linux/fcntl.h > +++ b/include/uapi/linux/fcntl.h > @@ -100,7 +100,8 @@ > /* Reserved for per-syscall flags 0xff. */ > #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic > links. */ > -/* Reserved for per-syscall flags 0x200 */ > +#define AT_EXEC_REASONABLE_COMM 0x200 /* Use argv[0] for comm in > + execveat */ > #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ > #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount > traversal. */ > > base-commit: baeb9a7d8b60b021d907127509c44507539c15e5 > -- > 2.34.1 >
Christian Brauner <brauner@kernel.org> writes: > Please add a: > > Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec > > to the commit where this originated from. What standing does some random github project have? Eric
On Wed, Sep 25, 2024 at 08:18:39AM GMT, Eric W. Biederman wrote: > Christian Brauner <brauner@kernel.org> writes: > > > Please add a: > > > > Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec > > > > to the commit where this originated from. ^^^^^^^^^^^^^^^^^^^^^^^^^^ > What standing does some random github project have? git log --grep "Link: https://github.com/" git log --grep "\[.\]: https://github.com/" For any additional questions, I'm sure searching for uapi group will bring forth the LWN articles and homepage. Otherwise the Plumber's video stream for the associated uapi group microconference where this idea among many other was discussed will surely help. Christian
Tycho Andersen <tycho@tycho.pizza> writes: > From: Tycho Andersen <tandersen@netflix.com> > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > switching to execveat() for service execution, but can't, because the > contents of /proc/pid/comm are the file descriptor which was used, > instead of the path to the binary. This makes the output of tools like > top and ps useless, especially in a world where most fds are opened > CLOEXEC so the number is truly meaningless. > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > contents of argv[0], instead of the fdno. The kernel allows prctl(PR_SET_NAME, ...) without any permission checks so adding an AT_ flat to use argv[0] instead of the execed filename seems reasonable. Maybe the flag should be called AT_NAME_ARGV0. That said I am trying to remember why we picked /dev/fd/N, as the filename. My memory is that we couldn't think of anything more reasonable to use. Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system call") unfortunately doesn't clarify anything for me, except that /dev/fd/N was a reasonable choice. I am thinking the code could reasonably try: get_fs_root_rcu(current->fs, &root); path = __d_path(file->f_path, root, buf, buflen); To see if a path to the file from the current root directory can be found. For files that are not reachable from the current root the code still need to fallback to /dev/fd/N. Do you think you can investigate that and see if that would generate a reasonable task->comm? If for no other reason than because it would generate a usable result for #! scripts, without /proc mounted. It looks like a reasonable case can be made that while /dev/fd/N is a good path for interpreters, it is never a good choice for comm, so perhaps we could always use argv[0] if the fdpath is of the form /dev/fd/N. All of that said I am not a fan of the implementation below as it has the side effect of replacing /dev/fd/N with a filename that is not usable by #! interpreters. So I suggest an implementation that affects task->comm and not brpm->filename. Eric > Signed-off-by: Tycho Andersen <tandersen@netflix.com> > Suggested-by: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> > CC: Aleksa Sarai <cyphar@cyphar.com> > --- > There is some question about what to name the flag; it seems to me that > "everyone wants this" instead of the fdno, but probably "REASONABLE" is not > a good choice. > > Also, requiring the arg to alloc_bprm() is a bit ugly: kernel-based execs > will never use this, so they just have to pass an empty thing. We could > introduce a bprm_fixup_comm() to do the munging there, but then the code > paths start to diverge, which is maybe not nice. I left it this way because > this is the smallest patch in terms of size, but I'm happy to change it. > > Finally, here is a small set of test programs, I'm happy to turn them into > kselftests if we agree on an API > > #include <stdio.h> > #include <unistd.h> > #include <stdlib.h> > #include <sys/types.h> > #include <sys/stat.h> > #include <fcntl.h> > > int main(void) > { > int fd; > char buf[128]; > > fd = open("/proc/self/comm", O_RDONLY); > if (fd < 0) { > perror("open comm"); > exit(1); > } > > if (read(fd, buf, 128) < 0) { > perror("read"); > exit(1); > } > > printf("comm: %s", buf); > exit(0); > } > > #define _GNU_SOURCE > #include <stdio.h> > #include <syscall.h> > #include <stdbool.h> > #include <unistd.h> > #include <fcntl.h> > #include <stdlib.h> > #include <errno.h> > #include <sys/wait.h> > > #ifndef AT_EMPTY_PATH > #define AT_EMPTY_PATH 0x1000 /* Allow empty relative */ > #endif > > #ifndef AT_EXEC_REASONABLE_COMM > #define AT_EXEC_REASONABLE_COMM 0x200 > #endif > > int main(int argc, char *argv[]) > { > pid_t pid; > int status; > bool wants_reasonable_comm = argc > 1; > > pid = fork(); > if (pid < 0) { > perror("fork"); > exit(1); > } > > if (pid == 0) { > int fd; > long ret, flags; > > fd = open("./catprocselfcomm", O_PATH); > if (fd < 0) { > perror("open catprocselfname"); > exit(1); > } > > flags = AT_EMPTY_PATH; > if (wants_reasonable_comm) > flags |= AT_EXEC_REASONABLE_COMM; > syscall(__NR_execveat, fd, "", (char *[]){"./catprocselfcomm", NULL}, NULL, flags); > fprintf(stderr, "execveat failed %d\n", errno); > exit(1); > } > > if (waitpid(pid, &status, 0) != pid) { > fprintf(stderr, "wrong child\n"); > exit(1); > } > > if (!WIFEXITED(status)) { > fprintf(stderr, "exit status %x\n", status); > exit(1); > } > > if (WEXITSTATUS(status) != 0) { > fprintf(stderr, "child failed\n"); > exit(1); > } > > return 0; > } > --- > fs/exec.c | 22 ++++++++++++++++++---- > include/uapi/linux/fcntl.h | 3 ++- > 2 files changed, 20 insertions(+), 5 deletions(-) > > diff --git a/fs/exec.c b/fs/exec.c > index dad402d55681..36434feddb7b 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1569,11 +1569,15 @@ static void free_bprm(struct linux_binprm *bprm) > kfree(bprm); > } > > -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) > +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, > + struct user_arg_ptr argv, int flags) > { > struct linux_binprm *bprm; > struct file *file; > int retval = -ENOMEM; > + bool needs_comm_fixup = flags & AT_EXEC_REASONABLE_COMM; > + > + flags &= ~AT_EXEC_REASONABLE_COMM; > > file = do_open_execat(fd, filename, flags); > if (IS_ERR(file)) > @@ -1590,11 +1594,20 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl > if (fd == AT_FDCWD || filename->name[0] == '/') { > bprm->filename = filename->name; > } else { > - if (filename->name[0] == '\0') > + if (needs_comm_fixup) { > + const char __user *p = get_user_arg_ptr(argv, 0); > + > + retval = -EFAULT; > + if (!p) > + goto out_free; > + > + bprm->fdpath = strndup_user(p, MAX_ARG_STRLEN); > + } else if (filename->name[0] == '\0') > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); > else > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", > fd, filename->name); > + retval = -ENOMEM; > if (!bprm->fdpath) > goto out_free; > > @@ -1969,7 +1982,7 @@ static int do_execveat_common(int fd, struct filename *filename, > * further execve() calls fail. */ > current->flags &= ~PF_NPROC_EXCEEDED; > > - bprm = alloc_bprm(fd, filename, flags); > + bprm = alloc_bprm(fd, filename, argv, flags); > if (IS_ERR(bprm)) { > retval = PTR_ERR(bprm); > goto out_ret; > @@ -2034,6 +2047,7 @@ int kernel_execve(const char *kernel_filename, > struct linux_binprm *bprm; > int fd = AT_FDCWD; > int retval; > + struct user_arg_ptr user_argv = {}; > > /* It is non-sense for kernel threads to call execve */ > if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) > @@ -2043,7 +2057,7 @@ int kernel_execve(const char *kernel_filename, > if (IS_ERR(filename)) > return PTR_ERR(filename); > > - bprm = alloc_bprm(fd, filename, 0); > + bprm = alloc_bprm(fd, filename, user_argv, 0); > if (IS_ERR(bprm)) { > retval = PTR_ERR(bprm); > goto out_ret; > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h > index 87e2dec79fea..7178d1e4a3de 100644 > --- a/include/uapi/linux/fcntl.h > +++ b/include/uapi/linux/fcntl.h > @@ -100,7 +100,8 @@ > /* Reserved for per-syscall flags 0xff. */ > #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic > links. */ > -/* Reserved for per-syscall flags 0x200 */ > +#define AT_EXEC_REASONABLE_COMM 0x200 /* Use argv[0] for comm in > + execveat */ > #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ > #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount > traversal. */ > > base-commit: baeb9a7d8b60b021d907127509c44507539c15e5
On Tue, Sep 24, 2024 at 12:39:35PM -0500, Eric W. Biederman wrote: > Tycho Andersen <tycho@tycho.pizza> writes: > > > From: Tycho Andersen <tandersen@netflix.com> > > > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > > switching to execveat() for service execution, but can't, because the > > contents of /proc/pid/comm are the file descriptor which was used, > > instead of the path to the binary. This makes the output of tools like > > top and ps useless, especially in a world where most fds are opened > > CLOEXEC so the number is truly meaningless. > > > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > > contents of argv[0], instead of the fdno. I tried this version (with a local modification to drop the flag and enable the new codepath if get_user_arg_ptr(argv, 0) returns nonnull as suggested later in the thread), and it seems to work as expected. In particular, 'pgrep' finds for the original name in case of symlinks. > All of that said I am not a fan of the implementation below as it has > the side effect of replacing /dev/fd/N with a filename that is not > usable by #! interpreters. So I suggest an implementation that affects > task->comm and not brpm->filename. Hmm, I don't understand this. /dev/fd/ would not generally contain an open fd for the original binary. It only would if the caller uses fexecve with an fd opened without O_CLOEXEC, but then it'd be something like /dev/fd/3 or /dev/fd/4 and the callee would be confused by having an extra fd, so except for some specialed cases, the caller should always use O_CLOEXEC. With this patch: $ sudo ln -sv /bin/sleep /usr/local/bin/sleep-link $ sudo systemd-run sleep-link 10000 $ sudo strace -f -e execve,execveat -p 1 ... [pid 1200] execve("/proc/self/fd/9", ["/usr/lib/systemd/systemd-executo"..., "--deserialize", "150", "--log-level", "info", "--log-target", "journal-or-kmsg"], 0x7ffe97b98178 /* 3 vars */) = 0 [pid 1200] execveat(4, "", ["/usr/local/bin/sleep-link", "10000"], 0xd8edf70 /* 9 vars */, AT_EMPTY_PATH) = 0 ^C $ pgrep sleep-link 1200 $ sudo ls -l /proc/1200/fd total 0 lr-x------ 1 root root 64 Oct 2 17:13 0 -> /dev/null lrwx------ 1 root root 64 Oct 2 17:13 1 -> 'socket:[8585]' lrwx------ 1 root root 64 Oct 2 17:13 2 -> 'socket:[8585]' $ head -n1 /proc/1200/{comm,status,stat} ==> /proc/1200/comm <== sleep-link ==> /proc/1200/status <== Name: sleep-link ==> /proc/1200/stat <== 1200 (sleep-link) ... This all looks good. Zbyszek
On Wed, Oct 02, 2024 at 02:34:43PM +0000, Zbigniew Jędrzejewski-Szmek wrote: > On Tue, Sep 24, 2024 at 12:39:35PM -0500, Eric W. Biederman wrote: > > Tycho Andersen <tycho@tycho.pizza> writes: > > > > > From: Tycho Andersen <tandersen@netflix.com> > > > > > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > > > switching to execveat() for service execution, but can't, because the > > > contents of /proc/pid/comm are the file descriptor which was used, > > > instead of the path to the binary. This makes the output of tools like > > > top and ps useless, especially in a world where most fds are opened > > > CLOEXEC so the number is truly meaningless. > > > > > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > > > contents of argv[0], instead of the fdno. > > I tried this version (with a local modification to drop the flag and > enable the new codepath if get_user_arg_ptr(argv, 0) returns nonnull > as suggested later in the thread), and it seems to work as expected. > In particular, 'pgrep' finds for the original name in case of > symlinks. Here is a version that only affects /proc/pid/comm, without a flag. We still have to do the dance of keeping the user argv0 before actually doing __set_task_comm(), since we want to surface the resulting fault if people pass a bad argv0. Thoughts? Tycho diff --git a/fs/exec.c b/fs/exec.c index dad402d55681..61de8a71f316 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1416,7 +1416,16 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + + /* + * If fdpath was set, execveat() made up a path that will + * probably not be useful to admins running ps or similar. + * Let's fix it up to be something reasonable. + */ + if (bprm->argv0) + __set_task_comm(me, kbasename(bprm->argv0), true); + else + __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ @@ -1566,9 +1575,30 @@ static void free_bprm(struct linux_binprm *bprm) if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); + kfree(bprm->argv0); kfree(bprm); } +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) +{ + const char __user *p = get_user_arg_ptr(argv, 0); + + /* + * In keeping with the logic in do_execveat_common(), we say p == NULL + * => "" for comm. + */ + if (!p) { + bprm->argv0 = kstrdup("", GFP_KERNEL); + return 0; + } + + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); + if (bprm->argv0) + return 0; + + return -EFAULT; +} + static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; @@ -1975,6 +2005,10 @@ static int do_execveat_common(int fd, struct filename *filename, goto out_ret; } + retval = bprm_add_fixup_comm(bprm, argv); + if (retval != 0) + goto out_free; + retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..0cd1f2d0e8c6 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -55,6 +55,7 @@ struct linux_binprm { of the time same as filename, but could be different for binfmt_{misc,script} */ const char *fdpath; /* generated filename for execveat */ + const char *argv0; /* argv0 from execveat */ unsigned interp_flags; int execfd; /* File descriptor of the executable */ unsigned long loader, exec;
On Wed, Oct 09, 2024 at 08:41:31AM -0600, Tycho Andersen wrote: > On Wed, Oct 02, 2024 at 02:34:43PM +0000, Zbigniew Jędrzejewski-Szmek wrote: > > On Tue, Sep 24, 2024 at 12:39:35PM -0500, Eric W. Biederman wrote: > > > Tycho Andersen <tycho@tycho.pizza> writes: > > > > > > > From: Tycho Andersen <tandersen@netflix.com> > > > > > > > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > > > > switching to execveat() for service execution, but can't, because the > > > > contents of /proc/pid/comm are the file descriptor which was used, > > > > instead of the path to the binary. This makes the output of tools like > > > > top and ps useless, especially in a world where most fds are opened > > > > CLOEXEC so the number is truly meaningless. > > > > > > > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > > > > contents of argv[0], instead of the fdno. > > > > I tried this version (with a local modification to drop the flag and > > enable the new codepath if get_user_arg_ptr(argv, 0) returns nonnull > > as suggested later in the thread), and it seems to work as expected. > > In particular, 'pgrep' finds for the original name in case of > > symlinks. > > Here is a version that only affects /proc/pid/comm, without a flag. We > still have to do the dance of keeping the user argv0 before actually > doing __set_task_comm(), since we want to surface the resulting fault > if people pass a bad argv0. Thoughts? > > Tycho > > > > diff --git a/fs/exec.c b/fs/exec.c > index dad402d55681..61de8a71f316 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1416,7 +1416,16 @@ int begin_new_exec(struct linux_binprm * bprm) > set_dumpable(current->mm, SUID_DUMP_USER); > > perf_event_exec(); > - __set_task_comm(me, kbasename(bprm->filename), true); > + > + /* > + * If fdpath was set, execveat() made up a path that will > + * probably not be useful to admins running ps or similar. > + * Let's fix it up to be something reasonable. > + */ > + if (bprm->argv0) > + __set_task_comm(me, kbasename(bprm->argv0), true); > + else > + __set_task_comm(me, kbasename(bprm->filename), true); This isn't checking fdpath? > > /* An exec changes our domain. We are no longer part of the thread > group */ > @@ -1566,9 +1575,30 @@ static void free_bprm(struct linux_binprm *bprm) > if (bprm->interp != bprm->filename) > kfree(bprm->interp); > kfree(bprm->fdpath); > + kfree(bprm->argv0); > kfree(bprm); > } > > +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) > +{ > + const char __user *p = get_user_arg_ptr(argv, 0); > + > + /* > + * In keeping with the logic in do_execveat_common(), we say p == NULL > + * => "" for comm. > + */ > + if (!p) { > + bprm->argv0 = kstrdup("", GFP_KERNEL); > + return 0; > + } > + > + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); > + if (bprm->argv0) > + return 0; > + > + return -EFAULT; > +} I'd rather this logic got done in copy_strings() and to avoid duplicating a copy for all exec users. I think it should be possible to just do this, to find the __user char *: diff --git a/fs/exec.c b/fs/exec.c index 77364806b48d..e12fd706f577 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -642,6 +642,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv, goto out; } } + if (argc == 0) + bprm->argv0 = str; } ret = 0; out: Once we get to begin_new_exec(), only if we need to do the work (fdpath set), then we can do the strndup_user() instead of making every exec hold a copy regardless of whether it will be needed. -Kees > + > static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) > { > struct linux_binprm *bprm; > @@ -1975,6 +2005,10 @@ static int do_execveat_common(int fd, struct filename *filename, > goto out_ret; > } > > + retval = bprm_add_fixup_comm(bprm, argv); > + if (retval != 0) > + goto out_free; > + > retval = count(argv, MAX_ARG_STRINGS); > if (retval == 0) > pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", > diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h > index e6c00e860951..0cd1f2d0e8c6 100644 > --- a/include/linux/binfmts.h > +++ b/include/linux/binfmts.h > @@ -55,6 +55,7 @@ struct linux_binprm { > of the time same as filename, but could be > different for binfmt_{misc,script} */ > const char *fdpath; /* generated filename for execveat */ > + const char *argv0; /* argv0 from execveat */ > unsigned interp_flags; > int execfd; /* File descriptor of the executable */ > unsigned long loader, exec; -- Kees Cook
On Mon, Oct 14, 2024 at 02:13:32PM -0700, Kees Cook wrote: > On Wed, Oct 09, 2024 at 08:41:31AM -0600, Tycho Andersen wrote: > > +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) > > +{ > > + const char __user *p = get_user_arg_ptr(argv, 0); > > + > > + /* > > + * In keeping with the logic in do_execveat_common(), we say p == NULL > > + * => "" for comm. > > + */ > > + if (!p) { > > + bprm->argv0 = kstrdup("", GFP_KERNEL); > > + return 0; > > + } > > + > > + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); > > + if (bprm->argv0) > > + return 0; > > + > > + return -EFAULT; > > +} > > I'd rather this logic got done in copy_strings() and to avoid duplicating > a copy for all exec users. I think it should be possible to just do > this, to find the __user char *: > > diff --git a/fs/exec.c b/fs/exec.c > index 77364806b48d..e12fd706f577 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -642,6 +642,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv, > goto out; > } > } > + if (argc == 0) > + bprm->argv0 = str; > } > ret = 0; > out: Isn't str here a __user? We want a kernel string for setting comm, so I guess kaddr+offset? But that's not mapped any more... > Once we get to begin_new_exec(), only if we need to do the work (fdpath > set), then we can do the strndup_user() instead of making every exec > hold a copy regardless of whether it will be needed. What happens if that allocation fails? begin_new_exec() says it is the point of no return, so we would just swallow the exec? Or have mysteriously inconsistent behavior? I think we could check ->fdpath in the bprm_add_fixup_comm() above, and only do the allocation when really necessary. I should have done that in the above version, which would have made the comment about checking fdpath even somewhat true :) Something like the below? Tycho diff --git a/fs/exec.c b/fs/exec.c index dad402d55681..7ec0bbfbc3c3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1416,7 +1416,16 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + + /* + * If argv0 was set, execveat() made up a path that will + * probably not be useful to admins running ps or similar. + * Let's fix it up to be something reasonable. + */ + if (bprm->argv0) + __set_task_comm(me, kbasename(bprm->argv0), true); + else + __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ @@ -1566,9 +1575,36 @@ static void free_bprm(struct linux_binprm *bprm) if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); + kfree(bprm->argv0); kfree(bprm); } +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) +{ + const char __user *p = get_user_arg_ptr(argv, 0); + + /* + * If this isn't an execveat(), we don't need to fix up the command. + */ + if (!bprm->fdpath) + return 0; + + /* + * In keeping with the logic in do_execveat_common(), we say p == NULL + * => "" for comm. + */ + if (!p) { + bprm->argv0 = kstrdup("", GFP_KERNEL); + return 0; + } + + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); + if (bprm->argv0) + return 0; + + return -EFAULT; +} + static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; @@ -1975,6 +2011,10 @@ static int do_execveat_common(int fd, struct filename *filename, goto out_ret; } + retval = bprm_add_fixup_comm(bprm, argv); + if (retval != 0) + goto out_free; + retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
On Thu, Oct 17, 2024 at 08:34:43AM -0600, Tycho Andersen wrote: > On Mon, Oct 14, 2024 at 02:13:32PM -0700, Kees Cook wrote: > > On Wed, Oct 09, 2024 at 08:41:31AM -0600, Tycho Andersen wrote: > > > +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) > > > +{ > > > + const char __user *p = get_user_arg_ptr(argv, 0); > > > + > > > + /* > > > + * In keeping with the logic in do_execveat_common(), we say p == NULL > > > + * => "" for comm. > > > + */ > > > + if (!p) { > > > + bprm->argv0 = kstrdup("", GFP_KERNEL); > > > + return 0; > > > + } > > > + > > > + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); > > > + if (bprm->argv0) > > > + return 0; > > > + > > > + return -EFAULT; > > > +} > > > > I'd rather this logic got done in copy_strings() and to avoid duplicating > > a copy for all exec users. I think it should be possible to just do > > this, to find the __user char *: > > > > diff --git a/fs/exec.c b/fs/exec.c > > index 77364806b48d..e12fd706f577 100644 > > --- a/fs/exec.c > > +++ b/fs/exec.c > > @@ -642,6 +642,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv, > > goto out; > > } > > } > > + if (argc == 0) > > + bprm->argv0 = str; > > } > > ret = 0; > > out: > > Isn't str here a __user? We want a kernel string for setting comm, so > I guess kaddr+offset? But that's not mapped any more... Yes, but it'll be valid __user addr in the new process. (IIUC) > > Once we get to begin_new_exec(), only if we need to do the work (fdpath > > set), then we can do the strndup_user() instead of making every exec > > hold a copy regardless of whether it will be needed. > > What happens if that allocation fails? begin_new_exec() says it is the > point of no return, so we would just swallow the exec? Or have > mysteriously inconsistent behavior? If we can't alloc a string in begin_new_exec() we're going to have much later problems, so yeah, I'm fine with it failing there. > I think we could check ->fdpath in the bprm_add_fixup_comm() above, > and only do the allocation when really necessary. I should have done > that in the above version, which would have made the comment about > checking fdpath even somewhat true :) But to keep this more readable, I do like your version below, with some notes. > > Something like the below? > > Tycho > > > > diff --git a/fs/exec.c b/fs/exec.c > index dad402d55681..7ec0bbfbc3c3 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1416,7 +1416,16 @@ int begin_new_exec(struct linux_binprm * bprm) > set_dumpable(current->mm, SUID_DUMP_USER); > > perf_event_exec(); > - __set_task_comm(me, kbasename(bprm->filename), true); > + > + /* > + * If argv0 was set, execveat() made up a path that will > + * probably not be useful to admins running ps or similar. > + * Let's fix it up to be something reasonable. > + */ > + if (bprm->argv0) > + __set_task_comm(me, kbasename(bprm->argv0), true); > + else > + __set_task_comm(me, kbasename(bprm->filename), true); > > /* An exec changes our domain. We are no longer part of the thread > group */ > @@ -1566,9 +1575,36 @@ static void free_bprm(struct linux_binprm *bprm) > if (bprm->interp != bprm->filename) > kfree(bprm->interp); > kfree(bprm->fdpath); > + kfree(bprm->argv0); > kfree(bprm); > } > > +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) > +{ > + const char __user *p = get_user_arg_ptr(argv, 0); To keep this const but not call get_user_arg_ptr() before the fdpath check, how about externalizing it. See further below... > + > + /* > + * If this isn't an execveat(), we don't need to fix up the command. > + */ > + if (!bprm->fdpath) > + return 0; > + > + /* > + * In keeping with the logic in do_execveat_common(), we say p == NULL > + * => "" for comm. > + */ > + if (!p) { > + bprm->argv0 = kstrdup("", GFP_KERNEL); Do we want an empty argv0, though? Shouldn't an empty fall back to fdpath? > + return 0; > + } > + > + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); > + if (bprm->argv0) > + return 0; > + > + return -EFAULT; > +} > + > static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) > { > struct linux_binprm *bprm; > @@ -1975,6 +2011,10 @@ static int do_execveat_common(int fd, struct filename *filename, > goto out_ret; > } > > + retval = bprm_add_fixup_comm(bprm, argv); > + if (retval != 0) > + goto out_free; How about: if (unlikely(bprm->fdpath)) { retval = bprm_add_fixup_comm(bprm, argv); if (retval != 0) goto out_free; } with the fdpath removed from bprm_add_fixup_comm()? > + > retval = count(argv, MAX_ARG_STRINGS); > if (retval == 0) > pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", -- Kees Cook
On Thu, Oct 17, 2024 at 08:47:03AM -0700, Kees Cook wrote: > On Thu, Oct 17, 2024 at 08:34:43AM -0600, Tycho Andersen wrote: > > On Mon, Oct 14, 2024 at 02:13:32PM -0700, Kees Cook wrote: > > > On Wed, Oct 09, 2024 at 08:41:31AM -0600, Tycho Andersen wrote: > > > > +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) > > > > +{ > > > > + const char __user *p = get_user_arg_ptr(argv, 0); > > > > + > > > > + /* > > > > + * In keeping with the logic in do_execveat_common(), we say p == NULL > > > > + * => "" for comm. > > > > + */ > > > > + if (!p) { > > > > + bprm->argv0 = kstrdup("", GFP_KERNEL); > > > > + return 0; > > > > + } > > > > + > > > > + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); > > > > + if (bprm->argv0) > > > > + return 0; > > > > + > > > > + return -EFAULT; > > > > +} > > > > > > I'd rather this logic got done in copy_strings() and to avoid duplicating > > > a copy for all exec users. I think it should be possible to just do > > > this, to find the __user char *: > > > > > > diff --git a/fs/exec.c b/fs/exec.c > > > index 77364806b48d..e12fd706f577 100644 > > > --- a/fs/exec.c > > > +++ b/fs/exec.c > > > @@ -642,6 +642,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv, > > > goto out; > > > } > > > } > > > + if (argc == 0) > > > + bprm->argv0 = str; > > > } > > > ret = 0; > > > out: > > > > Isn't str here a __user? We want a kernel string for setting comm, so > > I guess kaddr+offset? But that's not mapped any more... > > Yes, but it'll be valid __user addr in the new process. (IIUC) Yes, it's valid, but we need a kernel pointer for __set_task_comm(). > > > Once we get to begin_new_exec(), only if we need to do the work (fdpath > > > set), then we can do the strndup_user() instead of making every exec > > > hold a copy regardless of whether it will be needed. > > > > What happens if that allocation fails? begin_new_exec() says it is the > > point of no return, so we would just swallow the exec? Or have > > mysteriously inconsistent behavior? > > If we can't alloc a string in begin_new_exec() we're going to have much > later problems, so yeah, I'm fine with it failing there. Ok, cool. But with your notes below, the allocation will still be before begin_new_execexit(), just only in the cases where we actually need it, hopefully that's okay. > > +static int bprm_add_fixup_comm(struct linux_binprm *bprm, struct user_arg_ptr argv) > > +{ > > + const char __user *p = get_user_arg_ptr(argv, 0); > > To keep this const but not call get_user_arg_ptr() before the fdpath > check, how about externalizing it. See further below... > > > + > > + /* > > + * If this isn't an execveat(), we don't need to fix up the command. > > + */ > > + if (!bprm->fdpath) > > + return 0; > > + > > + /* > > + * In keeping with the logic in do_execveat_common(), we say p == NULL > > + * => "" for comm. > > + */ > > + if (!p) { > > + bprm->argv0 = kstrdup("", GFP_KERNEL); > > Do we want an empty argv0, though? Shouldn't an empty fall back to > fdpath? Yes, sounds good. > > + return 0; > > + } > > + > > + bprm->argv0 = strndup_user(p, MAX_ARG_STRLEN); > > + if (bprm->argv0) > > + return 0; > > + > > + return -EFAULT; > > +} > > + > > static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) > > { > > struct linux_binprm *bprm; > > @@ -1975,6 +2011,10 @@ static int do_execveat_common(int fd, struct filename *filename, > > goto out_ret; > > } > > > > + retval = bprm_add_fixup_comm(bprm, argv); > > + if (retval != 0) > > + goto out_free; > > How about: > > if (unlikely(bprm->fdpath)) { > retval = bprm_add_fixup_comm(bprm, argv); > if (retval != 0) > goto out_free; > } > > with the fdpath removed from bprm_add_fixup_comm()? Yep, this is much clearer, thanks. I will respin with these as a Real Patch. Tycho
On 2024-09-24, Eric W. Biederman <ebiederm@xmission.com> wrote: > Tycho Andersen <tycho@tycho.pizza> writes: > > > From: Tycho Andersen <tandersen@netflix.com> > > > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > > switching to execveat() for service execution, but can't, because the > > contents of /proc/pid/comm are the file descriptor which was used, > > instead of the path to the binary. This makes the output of tools like > > top and ps useless, especially in a world where most fds are opened > > CLOEXEC so the number is truly meaningless. > > > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > > contents of argv[0], instead of the fdno. > > The kernel allows prctl(PR_SET_NAME, ...) without any permission > checks so adding an AT_ flat to use argv[0] instead of the execed > filename seems reasonable. > > Maybe the flag should be called AT_NAME_ARGV0. > > > That said I am trying to remember why we picked /dev/fd/N, as the > filename. > > My memory is that we couldn't think of anything more reasonable to use. > Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system > call") unfortunately doesn't clarify anything for me, except that > /dev/fd/N was a reasonable choice. > > I am thinking the code could reasonably try: > get_fs_root_rcu(current->fs, &root); > path = __d_path(file->f_path, root, buf, buflen); > > To see if a path to the file from the current root directory can be > found. For files that are not reachable from the current root the code > still need to fallback to /dev/fd/N. > > Do you think you can investigate that and see if that would generate > a reasonable task->comm? The problem mentioned during the discussion after the talk was that busybox symlinks everything to the same program, so using d_path will give somewhat confusing results and so separate behaviour is still needed (though to be fair, the current results are also confusing). > If for no other reason than because it would generate a usable result > for #! scripts, without /proc mounted. For interpreters, wouldn't there be a race condition where the path might change after doing d_path? I don't know if any interpreter actually cares about that, but it seems possible that it could lead to issues. Though for O_CLOEXEC, the fd will always be closed (as Zbigniew said in his talk) so maybe this isn't a problem in practice. > It looks like a reasonable case can be made that while /dev/fd/N is > a good path for interpreters, it is never a good choice for comm, > so perhaps we could always use argv[0] if the fdpath is of the > form /dev/fd/N. > > All of that said I am not a fan of the implementation below as it has > the side effect of replacing /dev/fd/N with a filename that is not > usable by #! interpreters. So I suggest an implementation that affects > task->comm and not brpm->filename. I think only affecting task->comm would be ideal. > Eric > > > > Signed-off-by: Tycho Andersen <tandersen@netflix.com> > > Suggested-by: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> > > CC: Aleksa Sarai <cyphar@cyphar.com> > > --- > > There is some question about what to name the flag; it seems to me that > > "everyone wants this" instead of the fdno, but probably "REASONABLE" is not > > a good choice. > > > > Also, requiring the arg to alloc_bprm() is a bit ugly: kernel-based execs > > will never use this, so they just have to pass an empty thing. We could > > introduce a bprm_fixup_comm() to do the munging there, but then the code > > paths start to diverge, which is maybe not nice. I left it this way because > > this is the smallest patch in terms of size, but I'm happy to change it. > > > > Finally, here is a small set of test programs, I'm happy to turn them into > > kselftests if we agree on an API > > > > #include <stdio.h> > > #include <unistd.h> > > #include <stdlib.h> > > #include <sys/types.h> > > #include <sys/stat.h> > > #include <fcntl.h> > > > > int main(void) > > { > > int fd; > > char buf[128]; > > > > fd = open("/proc/self/comm", O_RDONLY); > > if (fd < 0) { > > perror("open comm"); > > exit(1); > > } > > > > if (read(fd, buf, 128) < 0) { > > perror("read"); > > exit(1); > > } > > > > printf("comm: %s", buf); > > exit(0); > > } > > > > #define _GNU_SOURCE > > #include <stdio.h> > > #include <syscall.h> > > #include <stdbool.h> > > #include <unistd.h> > > #include <fcntl.h> > > #include <stdlib.h> > > #include <errno.h> > > #include <sys/wait.h> > > > > #ifndef AT_EMPTY_PATH > > #define AT_EMPTY_PATH 0x1000 /* Allow empty relative */ > > #endif > > > > #ifndef AT_EXEC_REASONABLE_COMM > > #define AT_EXEC_REASONABLE_COMM 0x200 > > #endif > > > > int main(int argc, char *argv[]) > > { > > pid_t pid; > > int status; > > bool wants_reasonable_comm = argc > 1; > > > > pid = fork(); > > if (pid < 0) { > > perror("fork"); > > exit(1); > > } > > > > if (pid == 0) { > > int fd; > > long ret, flags; > > > > fd = open("./catprocselfcomm", O_PATH); > > if (fd < 0) { > > perror("open catprocselfname"); > > exit(1); > > } > > > > flags = AT_EMPTY_PATH; > > if (wants_reasonable_comm) > > flags |= AT_EXEC_REASONABLE_COMM; > > syscall(__NR_execveat, fd, "", (char *[]){"./catprocselfcomm", NULL}, NULL, flags); > > fprintf(stderr, "execveat failed %d\n", errno); > > exit(1); > > } > > > > if (waitpid(pid, &status, 0) != pid) { > > fprintf(stderr, "wrong child\n"); > > exit(1); > > } > > > > if (!WIFEXITED(status)) { > > fprintf(stderr, "exit status %x\n", status); > > exit(1); > > } > > > > if (WEXITSTATUS(status) != 0) { > > fprintf(stderr, "child failed\n"); > > exit(1); > > } > > > > return 0; > > } > > --- > > fs/exec.c | 22 ++++++++++++++++++---- > > include/uapi/linux/fcntl.h | 3 ++- > > 2 files changed, 20 insertions(+), 5 deletions(-) > > > > diff --git a/fs/exec.c b/fs/exec.c > > index dad402d55681..36434feddb7b 100644 > > --- a/fs/exec.c > > +++ b/fs/exec.c > > @@ -1569,11 +1569,15 @@ static void free_bprm(struct linux_binprm *bprm) > > kfree(bprm); > > } > > > > -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) > > +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, > > + struct user_arg_ptr argv, int flags) > > { > > struct linux_binprm *bprm; > > struct file *file; > > int retval = -ENOMEM; > > + bool needs_comm_fixup = flags & AT_EXEC_REASONABLE_COMM; > > + > > + flags &= ~AT_EXEC_REASONABLE_COMM; > > > > file = do_open_execat(fd, filename, flags); > > if (IS_ERR(file)) > > @@ -1590,11 +1594,20 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl > > if (fd == AT_FDCWD || filename->name[0] == '/') { > > bprm->filename = filename->name; > > } else { > > - if (filename->name[0] == '\0') > > + if (needs_comm_fixup) { > > + const char __user *p = get_user_arg_ptr(argv, 0); > > + > > + retval = -EFAULT; > > + if (!p) > > + goto out_free; > > + > > + bprm->fdpath = strndup_user(p, MAX_ARG_STRLEN); > > + } else if (filename->name[0] == '\0') > > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); > > else > > bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", > > fd, filename->name); > > + retval = -ENOMEM; > > if (!bprm->fdpath) > > goto out_free; > > > > @@ -1969,7 +1982,7 @@ static int do_execveat_common(int fd, struct filename *filename, > > * further execve() calls fail. */ > > current->flags &= ~PF_NPROC_EXCEEDED; > > > > - bprm = alloc_bprm(fd, filename, flags); > > + bprm = alloc_bprm(fd, filename, argv, flags); > > if (IS_ERR(bprm)) { > > retval = PTR_ERR(bprm); > > goto out_ret; > > @@ -2034,6 +2047,7 @@ int kernel_execve(const char *kernel_filename, > > struct linux_binprm *bprm; > > int fd = AT_FDCWD; > > int retval; > > + struct user_arg_ptr user_argv = {}; > > > > /* It is non-sense for kernel threads to call execve */ > > if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) > > @@ -2043,7 +2057,7 @@ int kernel_execve(const char *kernel_filename, > > if (IS_ERR(filename)) > > return PTR_ERR(filename); > > > > - bprm = alloc_bprm(fd, filename, 0); > > + bprm = alloc_bprm(fd, filename, user_argv, 0); > > if (IS_ERR(bprm)) { > > retval = PTR_ERR(bprm); > > goto out_ret; > > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h > > index 87e2dec79fea..7178d1e4a3de 100644 > > --- a/include/uapi/linux/fcntl.h > > +++ b/include/uapi/linux/fcntl.h > > @@ -100,7 +100,8 @@ > > /* Reserved for per-syscall flags 0xff. */ > > #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic > > links. */ > > -/* Reserved for per-syscall flags 0x200 */ > > +#define AT_EXEC_REASONABLE_COMM 0x200 /* Use argv[0] for comm in > > + execveat */ > > #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ > > #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount > > traversal. */ > > > > base-commit: baeb9a7d8b60b021d907127509c44507539c15e5 -- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH <https://www.cyphar.com/>
On Wed, Sep 25, 2024 at 05:50:10PM +0200, Aleksa Sarai wrote: > On 2024-09-24, Eric W. Biederman <ebiederm@xmission.com> wrote: > > Tycho Andersen <tycho@tycho.pizza> writes: > > > > > From: Tycho Andersen <tandersen@netflix.com> > > > > > > Zbigniew mentioned at Linux Plumber's that systemd is interested in > > > switching to execveat() for service execution, but can't, because the > > > contents of /proc/pid/comm are the file descriptor which was used, > > > instead of the path to the binary. This makes the output of tools like > > > top and ps useless, especially in a world where most fds are opened > > > CLOEXEC so the number is truly meaningless. > > > > > > This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > > > contents of argv[0], instead of the fdno. > > > > The kernel allows prctl(PR_SET_NAME, ...) without any permission > > checks so adding an AT_ flat to use argv[0] instead of the execed > > filename seems reasonable. > > > > Maybe the flag should be called AT_NAME_ARGV0. > > > > > > That said I am trying to remember why we picked /dev/fd/N, as the > > filename. > > > > My memory is that we couldn't think of anything more reasonable to use. > > Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system > > call") unfortunately doesn't clarify anything for me, except that > > /dev/fd/N was a reasonable choice. > > > > I am thinking the code could reasonably try: > > get_fs_root_rcu(current->fs, &root); > > path = __d_path(file->f_path, root, buf, buflen); > > > > To see if a path to the file from the current root directory can be > > found. For files that are not reachable from the current root the code > > still need to fallback to /dev/fd/N. > > > > Do you think you can investigate that and see if that would generate > > a reasonable task->comm? > > The problem mentioned during the discussion after the talk was that > busybox symlinks everything to the same program, so using d_path will > give somewhat confusing results and so separate behaviour is still > needed (though to be fair, the current results are also confusing). I also remember that busybox used to do symlinks, but I just looked the latest version on the docker hub (perhaps not representative...) and it's all hard links, which works fine with the __d_path() trick. > > It looks like a reasonable case can be made that while /dev/fd/N is > > a good path for interpreters, it is never a good choice for comm, > > so perhaps we could always use argv[0] if the fdpath is of the > > form /dev/fd/N. > > > > All of that said I am not a fan of the implementation below as it has > > the side effect of replacing /dev/fd/N with a filename that is not > > usable by #! interpreters. So I suggest an implementation that affects > > task->comm and not brpm->filename. > > I think only affecting task->comm would be ideal. Yep, I did this for the test above, and it worked fine: if (bprm->fdpath) { /* * If fdpath was set, execveat() made up a path that will * probably not be useful to admins running ps or similar. * Let's fix it up to be something reasonable. */ struct path root; char *path, buf[1024]; get_fs_root(current->fs, &root); path = __d_path(&bprm->file->f_path, &root, buf, sizeof(buf)); __set_task_comm(me, kbasename(path), true); } else { __set_task_comm(me, kbasename(bprm->filename), true); } obviously we don't want a stack allocated buffer, but triggering on ->fdpath != NULL seems like the right thing, so we won't need a flag either. The question is: argv[0] or __d_path()? Tycho
Tycho Andersen <tycho@tycho.pizza> writes: > Yep, I did this for the test above, and it worked fine: > > if (bprm->fdpath) { > /* > * If fdpath was set, execveat() made up a path that will > * probably not be useful to admins running ps or similar. > * Let's fix it up to be something reasonable. > */ > struct path root; > char *path, buf[1024]; > > get_fs_root(current->fs, &root); > path = __d_path(&bprm->file->f_path, &root, buf, sizeof(buf)); > > __set_task_comm(me, kbasename(path), true); > } else { > __set_task_comm(me, kbasename(bprm->filename), true); > } > > obviously we don't want a stack allocated buffer, but triggering on > ->fdpath != NULL seems like the right thing, so we won't need a flag > either. > > The question is: argv[0] or __d_path()? You know. I think we can just do: BUILD_BUG_ON(DNAME_INLINE_LEN >= TASK_COMM_LEN); __set_task_comm(me, bprm->file->f_path.dentry->d_name.name, true); Barring cache misses that should be faster and more reliable than what we currently have and produce the same output in all of the cases we like, and produce better output in all of the cases that are a problem today. Does anyone see any problem with that? Eric
On Wed, Sep 25, 2024 at 09:09:18PM -0500, Eric W. Biederman wrote: > Tycho Andersen <tycho@tycho.pizza> writes: > > > Yep, I did this for the test above, and it worked fine: > > > > if (bprm->fdpath) { > > /* > > * If fdpath was set, execveat() made up a path that will > > * probably not be useful to admins running ps or similar. > > * Let's fix it up to be something reasonable. > > */ > > struct path root; > > char *path, buf[1024]; > > > > get_fs_root(current->fs, &root); > > path = __d_path(&bprm->file->f_path, &root, buf, sizeof(buf)); > > > > __set_task_comm(me, kbasename(path), true); > > } else { > > __set_task_comm(me, kbasename(bprm->filename), true); > > } > > > > obviously we don't want a stack allocated buffer, but triggering on > > ->fdpath != NULL seems like the right thing, so we won't need a flag > > either. > > > > The question is: argv[0] or __d_path()? > > You know. I think we can just do: > > BUILD_BUG_ON(DNAME_INLINE_LEN >= TASK_COMM_LEN); > __set_task_comm(me, bprm->file->f_path.dentry->d_name.name, true); > > Barring cache misses that should be faster and more reliable than what > we currently have and produce the same output in all of the cases we > like, and produce better output in all of the cases that are a problem > today. > > Does anyone see any problem with that? Nice, this works great. We need to drop the BUILD_BUG_ON() since it is violated in today's tree, but I think this is safe to do anyway since __set_task_comm() does strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)). I will respin with this and dropping the flag. Tycho
Tycho Andersen <tycho@tycho.pizza> writes: > On Wed, Sep 25, 2024 at 09:09:18PM -0500, Eric W. Biederman wrote: >> Tycho Andersen <tycho@tycho.pizza> writes: >> >> > Yep, I did this for the test above, and it worked fine: >> > >> > if (bprm->fdpath) { >> > /* >> > * If fdpath was set, execveat() made up a path that will >> > * probably not be useful to admins running ps or similar. >> > * Let's fix it up to be something reasonable. >> > */ >> > struct path root; >> > char *path, buf[1024]; >> > >> > get_fs_root(current->fs, &root); >> > path = __d_path(&bprm->file->f_path, &root, buf, sizeof(buf)); >> > >> > __set_task_comm(me, kbasename(path), true); >> > } else { >> > __set_task_comm(me, kbasename(bprm->filename), true); >> > } >> > >> > obviously we don't want a stack allocated buffer, but triggering on >> > ->fdpath != NULL seems like the right thing, so we won't need a flag >> > either. >> > >> > The question is: argv[0] or __d_path()? >> >> You know. I think we can just do: >> >> BUILD_BUG_ON(DNAME_INLINE_LEN >= TASK_COMM_LEN); >> __set_task_comm(me, bprm->file->f_path.dentry->d_name.name, true); >> >> Barring cache misses that should be faster and more reliable than what >> we currently have and produce the same output in all of the cases we >> like, and produce better output in all of the cases that are a problem >> today. >> >> Does anyone see any problem with that? > > Nice, this works great. We need to drop the BUILD_BUG_ON() since it is > violated in today's tree, but I think this is safe to do anyway since > __set_task_comm() does strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)). Doh. I simply put the conditional in the wrong order. That should have been: BUILD_BUG_ON(TASK_COMM_LEN > DNAME_INLINE_LEN); Sorry I was thinking of the invariant that needs to be preserved rather than the bug that happens. Eric
On Fri, Sep 27, 2024 at 09:43:22AM -0500, Eric W. Biederman wrote: > Tycho Andersen <tycho@tycho.pizza> writes: > > > On Wed, Sep 25, 2024 at 09:09:18PM -0500, Eric W. Biederman wrote: > >> Tycho Andersen <tycho@tycho.pizza> writes: > >> > >> > Yep, I did this for the test above, and it worked fine: > >> > > >> > if (bprm->fdpath) { > >> > /* > >> > * If fdpath was set, execveat() made up a path that will > >> > * probably not be useful to admins running ps or similar. > >> > * Let's fix it up to be something reasonable. > >> > */ > >> > struct path root; > >> > char *path, buf[1024]; > >> > > >> > get_fs_root(current->fs, &root); > >> > path = __d_path(&bprm->file->f_path, &root, buf, sizeof(buf)); > >> > > >> > __set_task_comm(me, kbasename(path), true); > >> > } else { > >> > __set_task_comm(me, kbasename(bprm->filename), true); > >> > } > >> > > >> > obviously we don't want a stack allocated buffer, but triggering on > >> > ->fdpath != NULL seems like the right thing, so we won't need a flag > >> > either. > >> > > >> > The question is: argv[0] or __d_path()? > >> > >> You know. I think we can just do: > >> > >> BUILD_BUG_ON(DNAME_INLINE_LEN >= TASK_COMM_LEN); > >> __set_task_comm(me, bprm->file->f_path.dentry->d_name.name, true); > >> > >> Barring cache misses that should be faster and more reliable than what > >> we currently have and produce the same output in all of the cases we > >> like, and produce better output in all of the cases that are a problem > >> today. > >> > >> Does anyone see any problem with that? > > > > Nice, this works great. We need to drop the BUILD_BUG_ON() since it is > > violated in today's tree, but I think this is safe to do anyway since > > __set_task_comm() does strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)). > > Doh. I simply put the conditional in the wrong order. That should have > been: > BUILD_BUG_ON(TASK_COMM_LEN > DNAME_INLINE_LEN); > > Sorry I was thinking of the invariant that needs to be preserved rather > than the bug that happens. Thanks, I will include that. Just for my own education: this is still *safe* to do, because of _pad, it's just that it is a userspace visible break if TASK_COMM_LEN > DNAME_INLINE_LEN is ever true? Tycho
Tycho Andersen <tycho@tycho.pizza> writes: > On Fri, Sep 27, 2024 at 09:43:22AM -0500, Eric W. Biederman wrote: >> Tycho Andersen <tycho@tycho.pizza> writes: >> >> > On Wed, Sep 25, 2024 at 09:09:18PM -0500, Eric W. Biederman wrote: >> >> Tycho Andersen <tycho@tycho.pizza> writes: >> >> >> >> > Yep, I did this for the test above, and it worked fine: >> >> > >> >> > if (bprm->fdpath) { >> >> > /* >> >> > * If fdpath was set, execveat() made up a path that will >> >> > * probably not be useful to admins running ps or similar. >> >> > * Let's fix it up to be something reasonable. >> >> > */ >> >> > struct path root; >> >> > char *path, buf[1024]; >> >> > >> >> > get_fs_root(current->fs, &root); >> >> > path = __d_path(&bprm->file->f_path, &root, buf, sizeof(buf)); >> >> > >> >> > __set_task_comm(me, kbasename(path), true); >> >> > } else { >> >> > __set_task_comm(me, kbasename(bprm->filename), true); >> >> > } >> >> > >> >> > obviously we don't want a stack allocated buffer, but triggering on >> >> > ->fdpath != NULL seems like the right thing, so we won't need a flag >> >> > either. >> >> > >> >> > The question is: argv[0] or __d_path()? >> >> >> >> You know. I think we can just do: >> >> >> >> BUILD_BUG_ON(DNAME_INLINE_LEN >= TASK_COMM_LEN); >> >> __set_task_comm(me, bprm->file->f_path.dentry->d_name.name, true); >> >> >> >> Barring cache misses that should be faster and more reliable than what >> >> we currently have and produce the same output in all of the cases we >> >> like, and produce better output in all of the cases that are a problem >> >> today. >> >> >> >> Does anyone see any problem with that? >> > >> > Nice, this works great. We need to drop the BUILD_BUG_ON() since it is >> > violated in today's tree, but I think this is safe to do anyway since >> > __set_task_comm() does strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)). >> >> Doh. I simply put the conditional in the wrong order. That should have >> been: >> BUILD_BUG_ON(TASK_COMM_LEN > DNAME_INLINE_LEN); >> >> Sorry I was thinking of the invariant that needs to be preserved rather >> than the bug that happens. > > Thanks, I will include that. Just for my own education: this is still > *safe* to do, because of _pad, it's just that it is a userspace > visible break if TASK_COMM_LEN > DNAME_INLINE_LEN is ever true? Not a userspace visible issue at all. With TASK_COMM_LEN <= DNAME_INLINE_LEN we could just use a memcpy of TASK_COMM_LEN bytes, and everything will be safe. (But we aren't guaranteed a terminating '\0'). If you look at d_move and copy_name in dcache.c you can see that there are cases where a rename of the dentry that happens as we are reading it to fill task->comm a terminating '\0' might be missed. strscpy_pad relies on either finding a final '\0' after which is adds more '\0's or on finding the end of the source buffer. strscpy_pad will guarantee that there is a final '\0' in task->comm. There might be some race in reading dentry->d_name, but I don't think we much care. Eric
On September 24, 2024 10:39:35 AM PDT, "Eric W. Biederman" <ebiederm@xmission.com> wrote: >Tycho Andersen <tycho@tycho.pizza> writes: > >> From: Tycho Andersen <tandersen@netflix.com> >> >> Zbigniew mentioned at Linux Plumber's that systemd is interested in >> switching to execveat() for service execution, but can't, because the >> contents of /proc/pid/comm are the file descriptor which was used, >> instead of the path to the binary. This makes the output of tools like >> top and ps useless, especially in a world where most fds are opened >> CLOEXEC so the number is truly meaningless. And just to double check: systemd's use would be entirely cosmetic, yes? >> >> This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the >> contents of argv[0], instead of the fdno. > >The kernel allows prctl(PR_SET_NAME, ...) without any permission >checks so adding an AT_ flat to use argv[0] instead of the execed >filename seems reasonable. > >Maybe the flag should be called AT_NAME_ARGV0. If we add an AT flag I like this name. > > >That said I am trying to remember why we picked /dev/fd/N, as the >filename. > >My memory is that we couldn't think of anything more reasonable to use. >Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system >call") unfortunately doesn't clarify anything for me, except that >/dev/fd/N was a reasonable choice. > >I am thinking the code could reasonably try: > get_fs_root_rcu(current->fs, &root); > path = __d_path(file->f_path, root, buf, buflen); > >To see if a path to the file from the current root directory can be >found. For files that are not reachable from the current root the code >still need to fallback to /dev/fd/N. > >Do you think you can investigate that and see if that would generate >a reasonable task->comm? > >If for no other reason than because it would generate a usable result >for #! scripts, without /proc mounted. > > >It looks like a reasonable case can be made that while /dev/fd/N is >a good path for interpreters, it is never a good choice for comm, >so perhaps we could always use argv[0] if the fdpath is of the >form /dev/fd/N. I haven't had a chance to go look closely yet, but this was the same thought I had when I first read this RFC. Nobody really wants a dev path in comm. Can we do this unconditionally? (And if argv0 is empty, use dev path...) >All of that said I am not a fan of the implementation below as it has >the side effect of replacing /dev/fd/N with a filename that is not >usable by #! interpreters. So I suggest an implementation that affects >task->comm and not brpm->filename. Also agreed. There is already enough fiddly usage of the bprm filename/interpreter/fdpath members -- the argv0 stuff should be distinct. Perhaps store a pointer to argv0 during arg copy? I need to go look but I'm still AFK/OoO... -Kees -- Kees Cook
On Tue, Sep 24, 2024 at 02:37:13PM -0700, Kees Cook wrote: > > > On September 24, 2024 10:39:35 AM PDT, "Eric W. Biederman" <ebiederm@xmission.com> wrote: > >Tycho Andersen <tycho@tycho.pizza> writes: > > > >> From: Tycho Andersen <tandersen@netflix.com> > >> > >> Zbigniew mentioned at Linux Plumber's that systemd is interested in > >> switching to execveat() for service execution, but can't, because the > >> contents of /proc/pid/comm are the file descriptor which was used, > >> instead of the path to the binary. This makes the output of tools like > >> top and ps useless, especially in a world where most fds are opened > >> CLOEXEC so the number is truly meaningless. > > And just to double check: systemd's use would be entirely cosmetic, yes? I think it's not really systemd, but their concern for admins looking at `ps` and being confused by "4 is using lots of CPU". IIUC systemd won't actually use the value at all. Zbigniew can confirm though. > >> > >> This patch adds an AT_ flag to fix up /proc/pid/comm to instead be the > >> contents of argv[0], instead of the fdno. > > > >The kernel allows prctl(PR_SET_NAME, ...) without any permission > >checks so adding an AT_ flat to use argv[0] instead of the execed > >filename seems reasonable. > > > >Maybe the flag should be called AT_NAME_ARGV0. > > If we add an AT flag I like this name. +1 > > > > > >That said I am trying to remember why we picked /dev/fd/N, as the > >filename. > > > >My memory is that we couldn't think of anything more reasonable to use. > >Looking at commit 51f39a1f0cea ("syscalls: implement execveat() system > >call") unfortunately doesn't clarify anything for me, except that > >/dev/fd/N was a reasonable choice. > > > >I am thinking the code could reasonably try: > > get_fs_root_rcu(current->fs, &root); > > path = __d_path(file->f_path, root, buf, buflen); > > > >To see if a path to the file from the current root directory can be > >found. For files that are not reachable from the current root the code > >still need to fallback to /dev/fd/N. > > > >Do you think you can investigate that and see if that would generate > >a reasonable task->comm? > > > >If for no other reason than because it would generate a usable result > >for #! scripts, without /proc mounted. > > > > > >It looks like a reasonable case can be made that while /dev/fd/N is > >a good path for interpreters, it is never a good choice for comm, > >so perhaps we could always use argv[0] if the fdpath is of the > >form /dev/fd/N. > > I haven't had a chance to go look closely yet, but this was the same thought I had when I first read this RFC. Nobody really wants a dev path in comm. Can we do this unconditionally? (And if argv0 is empty, use dev path...) We can, I was just worried about the behavior change. But it seems we are all in violent agreement that the current behavior isn't very good, so maybe it's fine to change. > >All of that said I am not a fan of the implementation below as it has > >the side effect of replacing /dev/fd/N with a filename that is not > >usable by #! interpreters. So I suggest an implementation that affects > >task->comm and not brpm->filename. > > Also agreed. There is already enough fiddly usage of the bprm filename/interpreter/fdpath members -- the argv0 stuff should be distinct. Perhaps store a pointer to argv0 during arg copy? I need to go look but I'm still AFK/OoO... Yeah, on second thought we could do something like: diff --git a/fs/exec.c b/fs/exec.c index 36434feddb7b..a45ea270cc43 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1416,7 +1416,10 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + if (needs_comm_fixup) + __set_task_comm(me, argv0, true); + else + __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ and then we don't need to mess with bprm at all. Seems much cleaner. I will see about the get_fs_root_rcu(current->fs, &root); path = __d_path(file->f_path, root, buf, buflen); that Eric suggested and how that works with the above. Tycho
Tycho Andersen <tycho@tycho.pizza> writes: > Yeah, on second thought we could do something like: > > diff --git a/fs/exec.c b/fs/exec.c > index 36434feddb7b..a45ea270cc43 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1416,7 +1416,10 @@ int begin_new_exec(struct linux_binprm * bprm) > set_dumpable(current->mm, SUID_DUMP_USER); > > perf_event_exec(); > - __set_task_comm(me, kbasename(bprm->filename), true); > + if (needs_comm_fixup) > + __set_task_comm(me, argv0, true); ^^^^^ nit: make that kbasename(argv0) The typical case is for applications to use the filename as argv0, at which point the directories in the pathname are just noise. With only 16 characters in TASK_COMM we want to keep the noise down. Eric
© 2016 - 2024 Red Hat, Inc.