linux-user: Add option to run `execve`d programs through QEMU

Noah Goldstein posted 1 patch 2 days, 19 hours ago
There is a newer version of this series
linux-user/main.c                             | 21 ++++++
linux-user/syscall.c                          | 21 ++++--
linux-user/user-internals.h                   |  4 ++
tests/tcg/multiarch/Makefile.target           |  8 +++
.../linux/linux-execve-qemu-children.c        | 68 +++++++++++++++++++
5 files changed, 117 insertions(+), 5 deletions(-)
create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c
linux-user: Add option to run `execve`d programs through QEMU
Posted by Noah Goldstein 2 days, 19 hours ago
The new option '-qemu-children' makes it so that on `execve` the child
process will be launch by the same `qemu` executable that is currently
running along with its current commandline arguments.

The motivation for the change is to make it so that plugins running
through `qemu` can continue to run on children.  Why not just
`binfmt`?: Plugins can be desirable regardless of system/architecture
emulation, and can sometimes be useful for elf files that can run
natively. Enabling `binfmt` for all natively runnable elf files may
not be desirable.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 linux-user/main.c                             | 21 ++++++
 linux-user/syscall.c                          | 21 ++++--
 linux-user/user-internals.h                   |  4 ++
 tests/tcg/multiarch/Makefile.target           |  8 +++
 .../linux/linux-execve-qemu-children.c        | 68 +++++++++++++++++++
 5 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c

diff --git a/linux-user/main.c b/linux-user/main.c
index 8143a0d4b0..5e3d41dc2b 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -81,6 +81,10 @@ unsigned long mmap_min_addr;
 uintptr_t guest_base;
 bool have_guest_base;
 
+bool qemu_dup_for_children;
+int qemu_argc;
+char **qemu_argv;
+
 /*
  * Used to implement backwards-compatibility for the `-strace`, and
  * QEMU_STRACE options. Without this, the QEMU_LOG can be overwritten by
@@ -451,6 +455,11 @@ static void handle_arg_jitdump(const char *arg)
     perf_enable_jitdump();
 }
 
+static void handle_arg_qemu_children(const char *arg)
+{
+    qemu_dup_for_children = true;
+}
+
 static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins);
 
 #ifdef CONFIG_PLUGIN
@@ -526,6 +535,10 @@ static const struct qemu_argument arg_table[] = {
      "",           "Generate a /tmp/perf-${pid}.map file for perf"},
     {"jitdump",    "QEMU_JITDUMP",     false, handle_arg_jitdump,
      "",           "Generate a jit-${pid}.dump file for perf"},
+    {"qemu-children",
+                   "QEMU_CHILDREN",    false, handle_arg_qemu_children,
+     "",           "Run child processes (created with execve) with qemu "
+                   "(as instantiated for the parent)"},
     {NULL, NULL, false, NULL, NULL, NULL}
 };
 
@@ -729,6 +742,14 @@ int main(int argc, char **argv, char **envp)
 
     optind = parse_args(argc, argv);
 
+    if (qemu_dup_for_children) {
+        qemu_argc = optind;
+        qemu_argv = g_new0(char *, qemu_argc);
+        for (i = 0; i < optind; ++i) {
+            qemu_argv[i] = strdup(argv[i]);
+        }
+    }
+
     qemu_set_log_filename_flags(last_log_filename,
                                 last_log_mask | (enable_strace * LOG_STRACE),
                                 &error_fatal);
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 59b2080b98..96b105e9ce 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -8550,13 +8550,14 @@ static int do_execv(CPUArchState *cpu_env, int dirfd,
                     abi_long pathname, abi_long guest_argp,
                     abi_long guest_envp, int flags, bool is_execveat)
 {
-    int ret;
+    int ret, argp_offset;
     char **argp, **envp;
     int argc, envc;
     abi_ulong gp;
     abi_ulong addr;
     char **q;
     void *p;
+    bool through_qemu = dirfd == AT_FDCWD && qemu_dup_for_children;
 
     argc = 0;
 
@@ -8580,10 +8581,12 @@ static int do_execv(CPUArchState *cpu_env, int dirfd,
         envc++;
     }
 
-    argp = g_new0(char *, argc + 1);
+    argp_offset = through_qemu ? qemu_argc : 0;
+    argp = g_new0(char *, argc + argp_offset + 1);
     envp = g_new0(char *, envc + 1);
 
-    for (gp = guest_argp, q = argp; gp; gp += sizeof(abi_ulong), q++) {
+    for (gp = guest_argp, q = argp + argp_offset;
+         gp; gp += sizeof(abi_ulong), q++) {
         if (get_user_ual(addr, gp)) {
             goto execve_efault;
         }
@@ -8628,9 +8631,16 @@ static int do_execv(CPUArchState *cpu_env, int dirfd,
     }
 
     const char *exe = p;
-    if (is_proc_myself(p, "exe")) {
+    if (through_qemu) {
+        int i;
+        for (i = 0; i < argp_offset; ++i) {
+            argp[i] = qemu_argv[i];
+        }
+        exe = qemu_argv[0];
+    } else if (is_proc_myself(p, "exe")) {
         exe = exec_path;
     }
+
     ret = is_execveat
         ? safe_execveat(dirfd, exe, argp, envp, flags)
         : safe_execve(exe, argp, envp);
@@ -8644,7 +8654,8 @@ execve_efault:
     ret = -TARGET_EFAULT;
 
 execve_end:
-    for (gp = guest_argp, q = argp; *q; gp += sizeof(abi_ulong), q++) {
+    for (gp = guest_argp, q = argp + argp_offset;
+         *q; gp += sizeof(abi_ulong), q++) {
         if (get_user_ual(addr, gp) || !addr) {
             break;
         }
diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h
index 46ffc093f4..ed3ed666a0 100644
--- a/linux-user/user-internals.h
+++ b/linux-user/user-internals.h
@@ -30,6 +30,10 @@ void stop_all_tasks(void);
 extern const char *qemu_uname_release;
 extern unsigned long mmap_min_addr;
 
+extern bool qemu_dup_for_children;
+extern int qemu_argc;
+extern char **qemu_argv;
+
 typedef struct IOCTLEntry IOCTLEntry;
 
 typedef abi_long do_ioctl_fn(const IOCTLEntry *ie, uint8_t *buf_temp,
diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target
index 78b83d5575..0e220953e7 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -30,6 +30,14 @@ run-float_%: float_%
 	$(call conditional-diff-out,$<,$(SRC_PATH)/tests/tcg/$(TARGET_NAME)/$<.ref)
 
 
+run-linux-execve-qemu-children: linux-execve-qemu-children
+	$(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0)
+	$(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip)
+
+run-plugin-linux-execve-qemu-children-with-%: linux-execve-qemu-children
+	$(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0)
+	$(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip)
+
 testthread: LDFLAGS+=-lpthread
 
 threadcount: LDFLAGS+=-lpthread
diff --git a/tests/tcg/multiarch/linux/linux-execve-qemu-children.c b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c
new file mode 100644
index 0000000000..60d6537666
--- /dev/null
+++ b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c
@@ -0,0 +1,68 @@
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#define MAX_COMM_SIZE (4096)
+
+int
+main(int argc, char ** argv, char ** envp) {
+    int          fd;
+    char         next_arg[2];
+    char *       buf;
+    ssize_t      off;
+    const char * expec_comm;
+    assert(argc == 3 || argc == 4);
+    fd = open("/proc/self/comm", O_RDONLY);
+    assert(fd > 0);
+
+    buf = calloc(MAX_COMM_SIZE + 1, 1);
+    assert(buf != NULL);
+
+    off = 0;
+    for (;;) {
+        ssize_t res = read(fd, buf + off, 1);
+        if (res < 0 && errno != EAGAIN) {
+            perror("Failed to read comm");
+            return -1;
+        }
+        if (res == 0) {
+            break;
+        }
+
+        off += res;
+
+        if (off >= MAX_COMM_SIZE) {
+            fprintf(stderr, "/proc/self/comm too large for test\n");
+            return -1;
+        }
+    }
+    assert(off && buf[off] == '\0' && buf[off - 1] == '\n');
+    buf[off - 1] = '\0';
+    expec_comm   = basename(argv[1]);
+    if (argc == 3 && strncmp(buf, expec_comm, strlen(expec_comm))) {
+        fprintf(stderr,
+                "Didn't propagate qemu settings\nComm:  '%s'\nExpec: '%s'\n",
+                buf, expec_comm);
+        return -1;
+    }
+    free(buf);
+    next_arg[0] = argv[2][0];
+    next_arg[1] = '\0';
+    if (next_arg[0] == '9') {
+        return 0;
+    }
+    next_arg[0] += 1;
+    char * next_args[] = { argv[0], argv[1], next_arg, NULL };
+    int    eres        = execve(argv[0], &next_args[0], envp);
+    if (eres != 0) {
+        fprintf(stderr, "Unable to execve: %d/%d -> %s\n", eres, errno,
+                strerror(errno));
+        return -1;
+    }
+    return 0;
+}
-- 
2.43.0
Re: linux-user: Add option to run `execve`d programs through QEMU
Posted by Noah Goldstein 2 days, 19 hours ago
On Wed, Oct 30, 2024 at 9:10 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new option '-qemu-children' makes it so that on `execve` the child
> process will be launch by the same `qemu` executable that is currently
> running along with its current commandline arguments.
>
> The motivation for the change is to make it so that plugins running
> through `qemu` can continue to run on children.  Why not just
> `binfmt`?: Plugins can be desirable regardless of system/architecture
> emulation, and can sometimes be useful for elf files that can run
> natively. Enabling `binfmt` for all natively runnable elf files may
> not be desirable.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  linux-user/main.c                             | 21 ++++++
>  linux-user/syscall.c                          | 21 ++++--
>  linux-user/user-internals.h                   |  4 ++
>  tests/tcg/multiarch/Makefile.target           |  8 +++
>  .../linux/linux-execve-qemu-children.c        | 68 +++++++++++++++++++
>  5 files changed, 117 insertions(+), 5 deletions(-)
>  create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c
>
> diff --git a/linux-user/main.c b/linux-user/main.c
> index 8143a0d4b0..5e3d41dc2b 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -81,6 +81,10 @@ unsigned long mmap_min_addr;
>  uintptr_t guest_base;
>  bool have_guest_base;
>
> +bool qemu_dup_for_children;
> +int qemu_argc;
> +char **qemu_argv;
> +
>  /*
>   * Used to implement backwards-compatibility for the `-strace`, and
>   * QEMU_STRACE options. Without this, the QEMU_LOG can be overwritten by
> @@ -451,6 +455,11 @@ static void handle_arg_jitdump(const char *arg)
>      perf_enable_jitdump();
>  }
>
> +static void handle_arg_qemu_children(const char *arg)
> +{
> +    qemu_dup_for_children = true;
> +}
> +
>  static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins);
>
>  #ifdef CONFIG_PLUGIN
> @@ -526,6 +535,10 @@ static const struct qemu_argument arg_table[] = {
>       "",           "Generate a /tmp/perf-${pid}.map file for perf"},
>      {"jitdump",    "QEMU_JITDUMP",     false, handle_arg_jitdump,
>       "",           "Generate a jit-${pid}.dump file for perf"},
> +    {"qemu-children",
> +                   "QEMU_CHILDREN",    false, handle_arg_qemu_children,
> +     "",           "Run child processes (created with execve) with qemu "
> +                   "(as instantiated for the parent)"},
>      {NULL, NULL, false, NULL, NULL, NULL}
>  };
>
> @@ -729,6 +742,14 @@ int main(int argc, char **argv, char **envp)
>
>      optind = parse_args(argc, argv);
>
> +    if (qemu_dup_for_children) {
> +        qemu_argc = optind;
> +        qemu_argv = g_new0(char *, qemu_argc);
> +        for (i = 0; i < optind; ++i) {
> +            qemu_argv[i] = strdup(argv[i]);
> +        }
> +    }
> +
>      qemu_set_log_filename_flags(last_log_filename,
>                                  last_log_mask | (enable_strace * LOG_STRACE),
>                                  &error_fatal);
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 59b2080b98..96b105e9ce 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -8550,13 +8550,14 @@ static int do_execv(CPUArchState *cpu_env, int dirfd,
>                      abi_long pathname, abi_long guest_argp,
>                      abi_long guest_envp, int flags, bool is_execveat)
>  {
> -    int ret;
> +    int ret, argp_offset;
>      char **argp, **envp;
>      int argc, envc;
>      abi_ulong gp;
>      abi_ulong addr;
>      char **q;
>      void *p;
> +    bool through_qemu = dirfd == AT_FDCWD && qemu_dup_for_children;
>
>      argc = 0;
>
> @@ -8580,10 +8581,12 @@ static int do_execv(CPUArchState *cpu_env, int dirfd,
>          envc++;
>      }
>
> -    argp = g_new0(char *, argc + 1);
> +    argp_offset = through_qemu ? qemu_argc : 0;
> +    argp = g_new0(char *, argc + argp_offset + 1);
>      envp = g_new0(char *, envc + 1);
>
> -    for (gp = guest_argp, q = argp; gp; gp += sizeof(abi_ulong), q++) {
> +    for (gp = guest_argp, q = argp + argp_offset;
> +         gp; gp += sizeof(abi_ulong), q++) {
>          if (get_user_ual(addr, gp)) {
>              goto execve_efault;
>          }
> @@ -8628,9 +8631,16 @@ static int do_execv(CPUArchState *cpu_env, int dirfd,
>      }
>
>      const char *exe = p;
> -    if (is_proc_myself(p, "exe")) {
> +    if (through_qemu) {
> +        int i;
> +        for (i = 0; i < argp_offset; ++i) {
> +            argp[i] = qemu_argv[i];
> +        }
> +        exe = qemu_argv[0];
> +    } else if (is_proc_myself(p, "exe")) {
>          exe = exec_path;
>      }
> +
>      ret = is_execveat
>          ? safe_execveat(dirfd, exe, argp, envp, flags)
>          : safe_execve(exe, argp, envp);
> @@ -8644,7 +8654,8 @@ execve_efault:
>      ret = -TARGET_EFAULT;
>
>  execve_end:
> -    for (gp = guest_argp, q = argp; *q; gp += sizeof(abi_ulong), q++) {
> +    for (gp = guest_argp, q = argp + argp_offset;
> +         *q; gp += sizeof(abi_ulong), q++) {
>          if (get_user_ual(addr, gp) || !addr) {
>              break;
>          }
> diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h
> index 46ffc093f4..ed3ed666a0 100644
> --- a/linux-user/user-internals.h
> +++ b/linux-user/user-internals.h
> @@ -30,6 +30,10 @@ void stop_all_tasks(void);
>  extern const char *qemu_uname_release;
>  extern unsigned long mmap_min_addr;
>
> +extern bool qemu_dup_for_children;
> +extern int qemu_argc;
> +extern char **qemu_argv;
> +
>  typedef struct IOCTLEntry IOCTLEntry;
>
>  typedef abi_long do_ioctl_fn(const IOCTLEntry *ie, uint8_t *buf_temp,
> diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target
> index 78b83d5575..0e220953e7 100644
> --- a/tests/tcg/multiarch/Makefile.target
> +++ b/tests/tcg/multiarch/Makefile.target
> @@ -30,6 +30,14 @@ run-float_%: float_%
>         $(call conditional-diff-out,$<,$(SRC_PATH)/tests/tcg/$(TARGET_NAME)/$<.ref)
>
>
> +run-linux-execve-qemu-children: linux-execve-qemu-children
> +       $(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0)
> +       $(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip)
> +
> +run-plugin-linux-execve-qemu-children-with-%: linux-execve-qemu-children
> +       $(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0)
> +       $(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip)
> +
>  testthread: LDFLAGS+=-lpthread
>
>  threadcount: LDFLAGS+=-lpthread
> diff --git a/tests/tcg/multiarch/linux/linux-execve-qemu-children.c b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c
> new file mode 100644
> index 0000000000..60d6537666
> --- /dev/null
> +++ b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c
> @@ -0,0 +1,68 @@
> +#include <assert.h>
> +#include <errno.h>
> +#include <fcntl.h>
> +#include <libgen.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <unistd.h>
> +
> +#define MAX_COMM_SIZE (4096)
> +
> +int
> +main(int argc, char ** argv, char ** envp) {
> +    int          fd;
> +    char         next_arg[2];
> +    char *       buf;
> +    ssize_t      off;
> +    const char * expec_comm;
> +    assert(argc == 3 || argc == 4);
> +    fd = open("/proc/self/comm", O_RDONLY);
> +    assert(fd > 0);
> +
> +    buf = calloc(MAX_COMM_SIZE + 1, 1);
> +    assert(buf != NULL);
> +
> +    off = 0;
> +    for (;;) {
> +        ssize_t res = read(fd, buf + off, 1);
> +        if (res < 0 && errno != EAGAIN) {
> +            perror("Failed to read comm");
> +            return -1;
> +        }
> +        if (res == 0) {
> +            break;
> +        }
> +
> +        off += res;
> +
> +        if (off >= MAX_COMM_SIZE) {
> +            fprintf(stderr, "/proc/self/comm too large for test\n");
> +            return -1;
> +        }
> +    }
> +    assert(off && buf[off] == '\0' && buf[off - 1] == '\n');
> +    buf[off - 1] = '\0';
> +    expec_comm   = basename(argv[1]);
> +    if (argc == 3 && strncmp(buf, expec_comm, strlen(expec_comm))) {
> +        fprintf(stderr,
> +                "Didn't propagate qemu settings\nComm:  '%s'\nExpec: '%s'\n",
> +                buf, expec_comm);
> +        return -1;
> +    }
> +    free(buf);
> +    next_arg[0] = argv[2][0];
> +    next_arg[1] = '\0';
> +    if (next_arg[0] == '9') {
> +        return 0;
> +    }
> +    next_arg[0] += 1;
> +    char * next_args[] = { argv[0], argv[1], next_arg, NULL };
> +    int    eres        = execve(argv[0], &next_args[0], envp);
> +    if (eres != 0) {
> +        fprintf(stderr, "Unable to execve: %d/%d -> %s\n", eres, errno,
> +                strerror(errno));
> +        return -1;
> +    }
> +    return 0;
> +}
> --
> 2.43.0
>

Added test that tests both old behavior (no propagation of qemu)
and new behavior (propagation of qemu + cmdline).

Tested on Aarch64 + Linux with:
```
make check-tcg
```