From: Arusekk <floss@arusekk.pl>
This commit adds support for the `prctl(PR_SET_SYSCALL_USER_DISPATCH)`
function in the Linux userspace emulator.
It is implemented as a fully host-independent function, by forcing
a SIGSYS early during syscall handling, if the PC is outside the
allowed range.
Since disabled Syscall User Dispatch (SUD) is indistinguishable from
enabled SUD with an always-allowed region of length ~0, this encoding
is used instead of introducing a new flag.
Tested on [uglendix][1]; it will probably also apply to software like
tiny-wine, rpcsx, limbo, lazypoline, vicar, sysfail and endokernel,
to name a few.
[1]: https://sr.ht/~arusekk/uglendix
Signed-off-by: Arusekk <floss@arusekk.pl>
Message-ID: <20250711225226.14652-1-floss@arusekk.pl>
[rth: Split out is_vdso_sigreturn region matching and other minor tweaks.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
linux-user/qemu.h | 5 +++
linux-user/signal-common.h | 5 +++
linux-user/syscall_defs.h | 6 +++
linux-user/main.c | 2 +
linux-user/syscall.c | 76 +++++++++++++++++++++++++++++++++++++-
5 files changed, 93 insertions(+), 1 deletion(-)
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index e4dca0c20f..cabb7bd6a8 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -155,6 +155,11 @@ struct TaskState {
/* This thread's sigaltstack, if it has one */
struct target_sigaltstack sigaltstack_used;
+ /* This thread's SYSCALL_USER_DISPATCH state, len=~0 means disabled */
+ vaddr sys_dispatch;
+ vaddr sys_dispatch_selector;
+ abi_ulong sys_dispatch_len;
+
/* Start time of task after system boot in clock ticks */
uint64_t start_boottime;
};
diff --git a/linux-user/signal-common.h b/linux-user/signal-common.h
index 0b04868727..8a44714251 100644
--- a/linux-user/signal-common.h
+++ b/linux-user/signal-common.h
@@ -28,6 +28,11 @@ extern abi_ulong default_rt_sigreturn;
extern abi_ulong vdso_sigreturn_region_start;
extern abi_ulong vdso_sigreturn_region_end;
+static inline bool is_vdso_sigreturn(abi_ulong pc)
+{ /* True iff vdso_sigreturn_region_start <= pc < vdso_sigreturn_region_end. */
+ return pc >= vdso_sigreturn_region_start && pc < vdso_sigreturn_region_end;
+}
+
void setup_sigtramp(abi_ulong tramp_page);
int on_sig_stack(unsigned long sp);
diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h
index df26a2d28f..cd9ff709b8 100644
--- a/linux-user/syscall_defs.h
+++ b/linux-user/syscall_defs.h
@@ -689,6 +689,12 @@ typedef struct target_siginfo {
#define TARGET_TRAP_HWBKPT (4) /* hardware breakpoint/watchpoint */
#define TARGET_TRAP_UNK (5) /* undiagnosed trap */
+/*
+ * SIGSYS si_codes
+ */
+#define TARGET_SYS_SECCOMP (1) /* seccomp triggered */
+#define TARGET_SYS_USER_DISPATCH (2) /* syscall user dispatch triggered */
+
/*
* SIGEMT si_codes
*/
diff --git a/linux-user/main.c b/linux-user/main.c
index 7b0ccb6fd6..d8b7df4a79 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -233,6 +233,8 @@ void init_task_state(TaskState *ts)
ts->start_boottime += bt.tv_nsec * (uint64_t) ticks_per_sec /
NANOSECONDS_PER_SECOND;
}
+
+ ts->sys_dispatch_len = -1;
}
CPUArchState *cpu_copy(CPUArchState *env)
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 91360a072c..9098cdb9fa 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -6344,6 +6344,10 @@ abi_long do_arch_prctl(CPUX86State *env, int code, abi_ulong addr)
#endif
#ifndef PR_SET_SYSCALL_USER_DISPATCH
# define PR_SET_SYSCALL_USER_DISPATCH 59
+# define PR_SYS_DISPATCH_OFF 0
+# define PR_SYS_DISPATCH_ON 1
+# define SYSCALL_DISPATCH_FILTER_ALLOW 0
+# define SYSCALL_DISPATCH_FILTER_BLOCK 1
#endif
#ifndef PR_SME_SET_VL
# define PR_SME_SET_VL 63
@@ -6398,6 +6402,36 @@ static abi_long do_prctl_inval1(CPUArchState *env, abi_long arg2)
#define do_prctl_sme_set_vl do_prctl_inval1
#endif
+static abi_long do_prctl_syscall_user_dispatch(CPUArchState *env,
+ abi_ulong arg2, abi_ulong arg3,
+ abi_ulong arg4, abi_ulong arg5)
+{ /* Handle prctl(PR_SET_SYSCALL_USER_DISPATCH) by updating this TaskState. */
+ CPUState *cpu = env_cpu(env);
+ TaskState *ts = get_task_state(cpu);
+ /* arg2: mode; arg3/arg4: allowed region start/length; arg5: selector addr */
+ switch (arg2) {
+ case PR_SYS_DISPATCH_OFF:
+ if (arg3 || arg4 || arg5) { /* OFF accepts no further arguments */
+ return -TARGET_EINVAL;
+ }
+ ts->sys_dispatch_len = -1; /* len == ~0 is the "disabled" encoding */
+ return 0;
+ case PR_SYS_DISPATCH_ON:
+ if (arg3 && arg3 + arg4 <= arg3) { /* zero-length or wrapping region */
+ return -TARGET_EINVAL;
+ }
+ if (arg5 && !access_ok(cpu, VERIFY_READ, arg5, 1)) { /* 1-byte selector */
+ return -TARGET_EFAULT;
+ }
+ ts->sys_dispatch = arg3;
+ ts->sys_dispatch_len = arg4;
+ ts->sys_dispatch_selector = arg5; /* 0 means no selector byte is used */
+ return 0;
+ default:
+ return -TARGET_EINVAL;
+ }
+}
+
static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
abi_long arg3, abi_long arg4, abi_long arg5)
{
@@ -6473,6 +6507,9 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
case PR_SET_UNALIGN:
return do_prctl_set_unalign(env, arg2);
+ case PR_SET_SYSCALL_USER_DISPATCH:
+ return do_prctl_syscall_user_dispatch(env, arg2, arg3, arg4, arg5);
+
case PR_CAP_AMBIENT:
case PR_CAPBSET_READ:
case PR_CAPBSET_DROP:
@@ -6527,7 +6564,6 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
case PR_SET_MM:
case PR_GET_SECCOMP:
case PR_SET_SECCOMP:
- case PR_SET_SYSCALL_USER_DISPATCH:
case PR_GET_THP_DISABLE:
case PR_SET_THP_DISABLE:
case PR_GET_TSC:
@@ -13897,12 +13933,46 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1,
return ret;
}
+static bool sys_dispatch(CPUState *cpu, TaskState *ts)
+{ /* Returns true when a signal was forced in place of running the syscall. */
+ abi_ptr pc;
+ /* Fast path: sys_dispatch_len == -1 means dispatch is disabled. */
+ if (likely(ts->sys_dispatch_len == -1)) {
+ return false;
+ }
+ /* A pc inside the allowed region lets the syscall proceed normally. */
+ pc = cpu->cc->get_pc(cpu);
+ if (likely(pc - ts->sys_dispatch < ts->sys_dispatch_len)) {
+ return false;
+ }
+ if (unlikely(is_vdso_sigreturn(pc))) { /* vdso sigreturn is never blocked */
+ return false;
+ }
+ if (likely(ts->sys_dispatch_selector)) { /* no selector: always block */
+ uint8_t sb;
+ if (get_user_u8(sb, ts->sys_dispatch_selector)) { /* selector unreadable */
+ force_sig(TARGET_SIGSEGV);
+ return true;
+ }
+ if (likely(sb == SYSCALL_DISPATCH_FILTER_ALLOW)) {
+ return false;
+ }
+ if (unlikely(sb != SYSCALL_DISPATCH_FILTER_BLOCK)) { /* invalid value */
+ force_sig(TARGET_SIGSYS);
+ return true;
+ }
+ }
+ force_sig_fault(TARGET_SIGSYS, TARGET_SYS_USER_DISPATCH, pc); /* blocked */
+ return true;
+}
+
abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
abi_long arg2, abi_long arg3, abi_long arg4,
abi_long arg5, abi_long arg6, abi_long arg7,
abi_long arg8)
{
CPUState *cpu = env_cpu(cpu_env);
+ TaskState *ts = get_task_state(cpu);
abi_long ret;
#ifdef DEBUG_ERESTARTSYS
@@ -13919,6 +13989,10 @@ abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
}
#endif
+ if (sys_dispatch(cpu, ts)) {
+ return -QEMU_ESIGRETURN;
+ }
+
record_syscall_start(cpu, num, arg1,
arg2, arg3, arg4, arg5, arg6, arg7, arg8);
--
2.43.0
© 2016 - 2025 Red Hat, Inc.