This commit introduces a syscall entry point for !MMU mode. The entry
function, __kernel_vsyscall, is a kernel-wide global symbol that is
accessible from any location.
Although it is outside the scope of this commit, the entry point can also
be exposed via the vDSO image, which is directly accessible from userspace.
A standard library (i.e., libc) can use this entry point to implement its
syscall wrappers; it can also be used by hooking syscalls for unmodified
userspace applications/libraries, which will be implemented in a subsequent
commit.
Only the 64-bit mode of the x86 architecture is supported.
Signed-off-by: Hajime Tazaki <thehajime@gmail.com>
Signed-off-by: Ricardo Koller <ricarkol@google.com>
---
arch/x86/um/Makefile | 4 ++
arch/x86/um/nommu/Makefile | 8 +++
arch/x86/um/nommu/do_syscall_64.c | 37 ++++++++++
arch/x86/um/nommu/entry_64.S | 91 +++++++++++++++++++++++++
arch/x86/um/nommu/syscalls.h | 16 +++++
arch/x86/um/shared/sysdep/syscalls_64.h | 6 ++
6 files changed, 162 insertions(+)
create mode 100644 arch/x86/um/nommu/Makefile
create mode 100644 arch/x86/um/nommu/do_syscall_64.c
create mode 100644 arch/x86/um/nommu/entry_64.S
create mode 100644 arch/x86/um/nommu/syscalls.h
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index b42c31cd2390..227af2a987e2 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -32,6 +32,10 @@ obj-y += syscalls_64.o vdso/
subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \
../lib/memmove_64.o ../lib/memset_64.o
+ifneq ($(CONFIG_MMU),y)
+obj-y += nommu/
+endif
+
endif
subarch-$(CONFIG_MODULES) += ../kernel/module.o
diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile
new file mode 100644
index 000000000000..d72c63afffa5
--- /dev/null
+++ b/arch/x86/um/nommu/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+ifeq ($(CONFIG_X86_32),y)
+ BITS := 32
+else
+ BITS := 64
+endif
+
+obj-y = do_syscall_$(BITS).o entry_$(BITS).o
diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c
new file mode 100644
index 000000000000..5d0fa83e7fdc
--- /dev/null
+++ b/arch/x86/um/nommu/do_syscall_64.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <kern_util.h>
+#include <sysdep/syscalls.h>
+#include <os.h>
+
+/* C half of __kernel_vsyscall: run the syscall described by @regs. */
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+	int syscall = PT_SYSCALL_NR(regs->regs.gp);
+
+	UPT_SYSCALL_NR(&regs->regs) = syscall;
+
+	/* Range-check first: the syscall number is supplied by the caller. */
+	if (likely(syscall >= 0 && syscall < NR_syscalls)) {
+		pr_debug("syscall(%d) (current=%lx) (fn=%lx)\n",
+			 syscall, (unsigned long)current,
+			 (unsigned long)sys_call_table[syscall]);
+		PT_REGS_SET_SYSCALL_RETURN(regs,
+				EXECUTE_SYSCALL(syscall, regs));
+	}
+
+	pr_debug("syscall(%d) --> %lx\n", syscall,
+		 regs->regs.gp[HOST_AX]);
+
+	PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX];
+
+	/* execve succeeded */
+	if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
+		userspace(&current->thread.regs.regs);
+
+	/* force do_signal() --> is_syscall() */
+	set_thread_flag(TIF_SIGPENDING);
+	interrupt_end();
+}
diff --git a/arch/x86/um/nommu/entry_64.S b/arch/x86/um/nommu/entry_64.S
new file mode 100644
index 000000000000..e9bfc7b93c84
--- /dev/null
+++ b/arch/x86/um/nommu/entry_64.S
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/errno.h>
+
+#include <linux/linkage.h>
+#include <asm/percpu.h>
+#include <asm/desc.h>
+
+#include "../entry/calling.h"
+
+#ifdef CONFIG_SMP
+#error need to stash these variables somewhere else
+#endif
+
+/* 8-byte slots: accessed with movq below and declared "extern long" in C. */
+#define UM_GLOBAL_VAR(x) .data; .align 8; .globl x; x:; .quad 0
+UM_GLOBAL_VAR(current_top_of_stack)
+UM_GLOBAL_VAR(current_ptregs)
+
+.code64
+.section .entry.text, "ax"
+
+.align 8
+#undef ENTRY
+#define ENTRY(x) .text; .globl x; .type x,%function; x:
+#undef END
+#define END(x) .size x, . - x
+
+/*
+ * Syscall entry point: build a struct pt_regs, call do_syscall_64() and
+ * jump back to the return address the caller placed in %rcx.
+ * Registers on entry:
+ * rax system call number
+ * rcx return address
+ * rdi arg0
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3
+ * r8 arg4
+ * r9 arg5
+ *
+ * (note: r11 is used as scratch for the caller's rsp; the C ABI lets a
+ * callee clobber r11)
+ */
+ENTRY(__kernel_vsyscall)
+
+ movq %rsp, %r11 /* keep the caller's rsp; saved as pt_regs->sp below */
+
+ /* Point rsp to the top of the ptregs array, so we can
+ just fill it with a bunch of push'es. */
+ movq current_ptregs, %rsp
+
+ /* 168 = 21 slots * 8 bytes; ss, pushed first, is the last slot (index 20) */
+ addq $168, %rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq $0 /* pt_regs->ss (index 20) */
+ pushq %r11 /* pt_regs->sp */
+ pushfq /* pt_regs->flags */
+ pushq $0 /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+ pushq %rax /* pt_regs->orig_ax */
+
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
+
+ mov %rsp, %rdi /* arg0 of do_syscall_64(): the pt_regs just built */
+
+ /*
+ * Switch to current top of stack, so "current->" points
+ * to the right task.
+ */
+ movq current_top_of_stack, %rsp
+
+ call do_syscall_64
+
+ movq current_ptregs, %rsp /* back onto the saved register frame */
+
+ POP_REGS
+
+ addq $8, %rsp /* skip orig_ax */
+ popq %rcx /* pt_regs->ip */
+ addq $8, %rsp /* skip cs */
+ addq $8, %rsp /* skip flags */
+ popq %rsp /* pt_regs->sp: the caller's rsp saved on entry */
+
+ /*
+ * Return with jmp instead of ret: the return address was passed in
+ * %rcx (reloaded from pt_regs->ip above), not pushed on the stack.
+ */
+ jmp *%rcx
+
+END(__kernel_vsyscall)
diff --git a/arch/x86/um/nommu/syscalls.h b/arch/x86/um/nommu/syscalls.h
new file mode 100644
index 000000000000..a2433756b1fc
--- /dev/null
+++ b/arch/x86/um/nommu/syscalls.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_NOMMU_SYSCALLS_H
+#define __UM_NOMMU_SYSCALLS_H
+
+/* Address just past the end of @task's stack (task->stack + THREAD_SIZE). */
+#define task_top_of_stack(task) \
+({ \
+	unsigned long __ptr = (unsigned long)(task)->stack; \
+	__ptr += THREAD_SIZE; \
+	__ptr; \
+})
+
+extern long current_top_of_stack;
+extern long current_ptregs;
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
index b6b997225841..ffd80ee3b9dc 100644
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ b/arch/x86/um/shared/sysdep/syscalls_64.h
@@ -25,4 +25,10 @@ extern syscall_handler_t *sys_call_table[];
extern syscall_handler_t sys_modify_ldt;
extern syscall_handler_t sys_arch_prctl;
+#ifndef CONFIG_MMU
+extern void do_syscall_64(struct pt_regs *regs);
+extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+ int64_t a4, int64_t a5, int64_t a6);
+#endif
+
#endif
--
2.43.0
Hi, On Thu, 2025-06-19 at 10:04 +0900, Hajime Tazaki wrote: > This commit introduces an entry point of syscall interface for !MMU > mode. It uses an entry function, __kernel_vsyscall, a kernel-wide global > symbol accessible from any locations. > > Although it isn't in the scope of this commit, it can be also exposed > via vdso image which is directly accessible from userspace. A standard > library (i.e., libc) can utilize this entry point to implement syscall > wrapper; we can also use this by hooking syscall for unmodified userspace > applications/libraries, which will be implemented in the subsequent > commit. > > This only supports 64-bit mode of x86 architecture. > > Signed-off-by: Hajime Tazaki <thehajime@gmail.com> > Signed-off-by: Ricardo Koller <ricarkol@google.com> > --- > arch/x86/um/Makefile | 4 ++ > arch/x86/um/nommu/Makefile | 8 +++ > arch/x86/um/nommu/do_syscall_64.c | 37 ++++++++++ > arch/x86/um/nommu/entry_64.S | 91 +++++++++++++++++++++++++ > arch/x86/um/nommu/syscalls.h | 16 +++++ > arch/x86/um/shared/sysdep/syscalls_64.h | 6 ++ > 6 files changed, 162 insertions(+) > create mode 100644 arch/x86/um/nommu/Makefile > create mode 100644 arch/x86/um/nommu/do_syscall_64.c > create mode 100644 arch/x86/um/nommu/entry_64.S > create mode 100644 arch/x86/um/nommu/syscalls.h > > diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile > index b42c31cd2390..227af2a987e2 100644 > --- a/arch/x86/um/Makefile > +++ b/arch/x86/um/Makefile > @@ -32,6 +32,10 @@ obj-y += syscalls_64.o vdso/ > subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \ > ../lib/memmove_64.o ../lib/memset_64.o > > +ifneq ($(CONFIG_MMU),y) > +obj-y += nommu/ > +endif > + > endif > > subarch-$(CONFIG_MODULES) += ../kernel/module.o > diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile > new file mode 100644 > index 000000000000..d72c63afffa5 > --- /dev/null > +++ b/arch/x86/um/nommu/Makefile > @@ -0,0 +1,8 @@ > +# SPDX-License-Identifier: GPL-2.0 > +ifeq 
($(CONFIG_X86_32),y) > + BITS := 32 > +else > + BITS := 64 > +endif > + > +obj-y = do_syscall_$(BITS).o entry_$(BITS).o > diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c > new file mode 100644 > index 000000000000..5d0fa83e7fdc > --- /dev/null > +++ b/arch/x86/um/nommu/do_syscall_64.c > @@ -0,0 +1,37 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +#include <linux/kernel.h> > +#include <linux/ptrace.h> > +#include <kern_util.h> > +#include <sysdep/syscalls.h> > +#include <os.h> > + > +__visible void do_syscall_64(struct pt_regs *regs) > +{ > + int syscall; > + > + syscall = PT_SYSCALL_NR(regs->regs.gp); > + UPT_SYSCALL_NR(&regs->regs) = syscall; > + > + pr_debug("syscall(%d) (current=%lx) (fn=%lx)\n", > + syscall, (unsigned long)current, > + (unsigned long)sys_call_table[syscall]); You probably want to drop the pr_debug from the syscall path. > + if (likely(syscall < NR_syscalls)) { > + PT_REGS_SET_SYSCALL_RETURN(regs, > + EXECUTE_SYSCALL(syscall, regs)); > + } > + > + pr_debug("syscall(%d) --> %lx\n", syscall, > + regs->regs.gp[HOST_AX]); > + > + PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX]; > + > + /* execve succeeded */ > + if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0) > + userspace(&current->thread.regs.regs); That said, this is what I am stumbling over. Why do you need to jump into userspace() here? It seems odd to me to need a special case in the syscall path itself. Aren't there other possibilities to hook/override the kernel task state? > + > + /* force do_signal() --> is_syscall() */ > + set_thread_flag(TIF_SIGPENDING); > + interrupt_end(); Same here. The MMU UML code seems to also do this, but restricted to ptrace'd processes? 
Maybe I am just missing something obvious … Benjamin > +} > diff --git a/arch/x86/um/nommu/entry_64.S b/arch/x86/um/nommu/entry_64.S > new file mode 100644 > index 000000000000..e9bfc7b93c84 > --- /dev/null > +++ b/arch/x86/um/nommu/entry_64.S > @@ -0,0 +1,91 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#include <asm/errno.h> > + > +#include <linux/linkage.h> > +#include <asm/percpu.h> > +#include <asm/desc.h> > + > +#include "../entry/calling.h" > + > +#ifdef CONFIG_SMP > +#error need to stash these variables somewhere else > +#endif > + > +#define UM_GLOBAL_VAR(x) .data; .align 8; .globl x; x:; .long 0 > + > +UM_GLOBAL_VAR(current_top_of_stack) > +UM_GLOBAL_VAR(current_ptregs) > + > +.code64 > +.section .entry.text, "ax" > + > +.align 8 > +#undef ENTRY > +#define ENTRY(x) .text; .globl x; .type x,%function; x: > +#undef END > +#define END(x) .size x, . - x > + > +/* > + * %rcx has the return address (we set it before entering __kernel_vsyscall). > + * > + * Registers on entry: > + * rax system call number > + * rcx return address > + * rdi arg0 > + * rsi arg1 > + * rdx arg2 > + * r10 arg3 > + * r8 arg4 > + * r9 arg5 > + * > + * (note: we are allowed to mess with r11: r11 is callee-clobbered > + * register in C ABI) > + */ > +ENTRY(__kernel_vsyscall) > + > + movq %rsp, %r11 > + > + /* Point rsp to the top of the ptregs array, so we can > + just fill it with a bunch of push'es. */ > + movq current_ptregs, %rsp > + > + /* 8 bytes * 20 registers (plus 8 for the push) */ > + addq $168, %rsp > + > + /* Construct struct pt_regs on stack */ > + pushq $0 /* pt_regs->ss (index 20) */ > + pushq %r11 /* pt_regs->sp */ > + pushfq /* pt_regs->flags */ > + pushq $0 /* pt_regs->cs */ > + pushq %rcx /* pt_regs->ip */ > + pushq %rax /* pt_regs->orig_ax */ > + > + PUSH_AND_CLEAR_REGS rax=$-ENOSYS > + > + mov %rsp, %rdi > + > + /* > + * Switch to current top of stack, so "current->" points > + * to the right task. 
> + */ > + movq current_top_of_stack, %rsp > + > + call do_syscall_64 > + > + movq current_ptregs, %rsp > + > + POP_REGS > + > + addq $8, %rsp /* skip orig_ax */ > + popq %rcx /* pt_regs->ip */ > + addq $8, %rsp /* skip cs */ > + addq $8, %rsp /* skip flags */ > + popq %rsp > + > + /* > + * not return w/ ret but w/ jmp as the stack is already popped before > + * entering __kernel_vsyscall > + */ > + jmp *%rcx > + > +END(__kernel_vsyscall) > diff --git a/arch/x86/um/nommu/syscalls.h b/arch/x86/um/nommu/syscalls.h > new file mode 100644 > index 000000000000..a2433756b1fc > --- /dev/null > +++ b/arch/x86/um/nommu/syscalls.h > @@ -0,0 +1,16 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef __UM_NOMMU_SYSCALLS_H > +#define __UM_NOMMU_SYSCALLS_H > + > + > +#define task_top_of_stack(task) \ > +({ \ > + unsigned long __ptr = (unsigned long)task->stack; \ > + __ptr += THREAD_SIZE; \ > + __ptr; \ > +}) > + > +extern long current_top_of_stack; > +extern long current_ptregs; > + > +#endif > diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h > index b6b997225841..ffd80ee3b9dc 100644 > --- a/arch/x86/um/shared/sysdep/syscalls_64.h > +++ b/arch/x86/um/shared/sysdep/syscalls_64.h > @@ -25,4 +25,10 @@ extern syscall_handler_t *sys_call_table[]; > extern syscall_handler_t sys_modify_ldt; > extern syscall_handler_t sys_arch_prctl; > > +#ifndef CONFIG_MMU > +extern void do_syscall_64(struct pt_regs *regs); > +extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3, > + int64_t a4, int64_t a5, int64_t a6); > +#endif > + > #endif
On Thu, 19 Jun 2025 19:31:53 +0900, Benjamin Berg wrote: > > diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c > > new file mode 100644 > > index 000000000000..5d0fa83e7fdc > > --- /dev/null > > +++ b/arch/x86/um/nommu/do_syscall_64.c > > @@ -0,0 +1,37 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > + > > +#include <linux/kernel.h> > > +#include <linux/ptrace.h> > > +#include <kern_util.h> > > +#include <sysdep/syscalls.h> > > +#include <os.h> > > + > > +__visible void do_syscall_64(struct pt_regs *regs) > > +{ > > + int syscall; > > + > > + syscall = PT_SYSCALL_NR(regs->regs.gp); > > + UPT_SYSCALL_NR(&regs->regs) = syscall; > > + > > + pr_debug("syscall(%d) (current=%lx) (fn=%lx)\n", > > + syscall, (unsigned long)current, > > + (unsigned long)sys_call_table[syscall]); > > You probably want to drop the pr_debug from the syscall path. okay, I'll update those parts. > > + if (likely(syscall < NR_syscalls)) { > > + PT_REGS_SET_SYSCALL_RETURN(regs, > > + EXECUTE_SYSCALL(syscall, regs)); > > + } > > + > > + pr_debug("syscall(%d) --> %lx\n", syscall, > > + regs->regs.gp[HOST_AX]); > > + > > + PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX]; > > + > > + /* execve succeeded */ > > + if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0) > > + userspace(&current->thread.regs.regs); > > That said, this is what I am stumbling over. Why do you need to jump > into userspace() here? It seems odd to me to need a special case in the > syscall path itself. > Aren't there other possibilities to hook/override the kernel task > state? thanks, I found that this is a leftover of our early implementation which doesn't have a proper schedule upon an exit from syscall. we can remove this part and I'll fix them after more investigation. > > + /* force do_signal() --> is_syscall() */ > > + set_thread_flag(TIF_SIGPENDING); > > + interrupt_end(); > > Same here. The MMU UML code seems to also do this, but restricted to > ptrace'd processes? 
Maybe I am just missing something obvious … nommu doesn't have separate process/context to indicate a schedule to the context here (syscall). without that part we do not have a chance to schedule tasks and signals to userspace. But the force on SIGPENDING flag is not actually needed so, I'll remove that part. thanks for pointing out. -- Hajime
© 2016 - 2025 Red Hat, Inc.