[v9] nommu UML | Patchew

[PATCH v9 04/13] x86/um: nommu: syscall handling

Posted by Hajime Tazaki 7 months, 3 weeks ago

This commit introduces an entry point for the syscall interface in !MMU
mode. It uses an entry function, __kernel_vsyscall, a kernel-wide global
symbol accessible from any location.

Although it isn't in the scope of this commit, it can also be exposed
via the vDSO image, which is directly accessible from userspace. A standard
library (e.g., libc) can utilize this entry point to implement its syscall
wrappers; we can also use it by hooking syscalls for unmodified userspace
applications/libraries, which will be implemented in a subsequent
commit.

This only supports the 64-bit mode of the x86 architecture.

Signed-off-by: Hajime Tazaki <thehajime@gmail.com>
Signed-off-by: Ricardo Koller <ricarkol@google.com>
---
 arch/x86/um/Makefile                    |  4 ++
 arch/x86/um/nommu/Makefile              |  8 +++
 arch/x86/um/nommu/do_syscall_64.c       | 37 ++++++++++
 arch/x86/um/nommu/entry_64.S            | 91 +++++++++++++++++++++++++
 arch/x86/um/nommu/syscalls.h            | 16 +++++
 arch/x86/um/shared/sysdep/syscalls_64.h |  6 ++
 6 files changed, 162 insertions(+)
 create mode 100644 arch/x86/um/nommu/Makefile
 create mode 100644 arch/x86/um/nommu/do_syscall_64.c
 create mode 100644 arch/x86/um/nommu/entry_64.S
 create mode 100644 arch/x86/um/nommu/syscalls.h

diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index b42c31cd2390..227af2a987e2 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -32,6 +32,10 @@ obj-y += syscalls_64.o vdso/
 subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \
 	../lib/memmove_64.o ../lib/memset_64.o
 
+ifneq ($(CONFIG_MMU),y)
+obj-y += nommu/
+endif
+
 endif
 
 subarch-$(CONFIG_MODULES) += ../kernel/module.o
diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile
new file mode 100644
index 000000000000..d72c63afffa5
--- /dev/null
+++ b/arch/x86/um/nommu/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+ifeq ($(CONFIG_X86_32),y)
+	BITS := 32
+else
+	BITS := 64
+endif
+
+obj-y = do_syscall_$(BITS).o entry_$(BITS).o
diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c
new file mode 100644
index 000000000000..5d0fa83e7fdc
--- /dev/null
+++ b/arch/x86/um/nommu/do_syscall_64.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <kern_util.h>
+#include <sysdep/syscalls.h>
+#include <os.h>
+
+/*
+ * Run the syscall described by @regs, the frame built by __kernel_vsyscall.
+ */
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+	int syscall = PT_SYSCALL_NR(regs->regs.gp);
+
+	UPT_SYSCALL_NR(&regs->regs) = syscall;
+
+	/* Reject negative numbers too: sys_call_table[] is indexed below. */
+	if (likely(syscall >= 0 && syscall < NR_syscalls)) {
+		pr_debug("syscall(%d) (current=%lx) (fn=%lx)\n",
+			 syscall, (unsigned long)current,
+			 (unsigned long)sys_call_table[syscall]);
+		PT_REGS_SET_SYSCALL_RETURN(regs, EXECUTE_SYSCALL(syscall, regs));
+	}
+
+	pr_debug("syscall(%d) --> %lx\n", syscall, regs->regs.gp[HOST_AX]);
+
+	PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX];
+
+	/* execve succeeded */
+	if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
+		userspace(&current->thread.regs.regs);
+
+	/* force do_signal() --> is_syscall() */
+	set_thread_flag(TIF_SIGPENDING);
+	interrupt_end();
+}
diff --git a/arch/x86/um/nommu/entry_64.S b/arch/x86/um/nommu/entry_64.S
new file mode 100644
index 000000000000..e9bfc7b93c84
--- /dev/null
+++ b/arch/x86/um/nommu/entry_64.S
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/errno.h>
+
+#include <linux/linkage.h>
+#include <asm/percpu.h>
+#include <asm/desc.h>
+
+#include "../entry/calling.h"
+
+#ifdef CONFIG_SMP
+#error need to stash these variables somewhere else
+#endif
+
+#define UM_GLOBAL_VAR(x) .data; .align 8; .globl x; x:; .quad 0 /* 8 bytes, used via movq */
+
+UM_GLOBAL_VAR(current_top_of_stack)
+UM_GLOBAL_VAR(current_ptregs)
+
+.code64
+.section .entry.text, "ax"
+
+.align 8
+#undef ENTRY
+#define ENTRY(x) .text; .globl x; .type x,%function; x:
+#undef END
+#define END(x)   .size x, . - x
+
+/*
+ * Syscall entry for !MMU mode; the caller puts the return address in %rcx.
+ *
+ * Registers on entry:
+ * rax  system call number
+ * rcx  return address
+ * rdi  arg0
+ * rsi  arg1
+ * rdx  arg2
+ * r10  arg3
+ * r8   arg4
+ * r9   arg5
+ *
+ * (note: r11 is used as scratch below to carry the caller's rsp; that is
+ * allowed because r11 is a callee-clobbered register in the C ABI)
+ */
+ENTRY(__kernel_vsyscall)
+
+	movq	%rsp, %r11		/* keep the caller's rsp for pt_regs->sp */
+
+	/* Load the base of the pt_regs area; together with the addq below
+	 * rsp ends up at its top, so we can fill it with push'es. */
+	movq	current_ptregs, %rsp
+
+	/* 168 = 8 bytes * 21 slots; the pushes below fill from index 20 down */
+	addq	$168, %rsp
+
+	/* Construct struct pt_regs on stack */
+	pushq	$0		/* pt_regs->ss (index 20) */
+	pushq   %r11		/* pt_regs->sp */
+	pushfq			/* pt_regs->flags */
+	pushq	$0		/* pt_regs->cs */
+	pushq	%rcx		/* pt_regs->ip */
+	pushq	%rax		/* pt_regs->orig_ax */
+
+	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
+
+	mov %rsp, %rdi			/* arg0 for do_syscall_64: the frame */
+
+	/*
+	 * Switch to current top of stack, so "current->" points
+	 * to the right task.
+	 */
+	movq	current_top_of_stack, %rsp
+
+	call	do_syscall_64
+
+	movq	current_ptregs, %rsp	/* reload the pt_regs base before popping */
+
+	POP_REGS
+
+	addq	$8, %rsp	/* skip orig_ax */
+	popq	%rcx		/* pt_regs->ip */
+	addq	$8, %rsp	/* skip cs */
+	addq	$8, %rsp	/* skip flags */
+	popq	%rsp		/* pt_regs->sp */
+
+	/*
+	 * Return with jmp rather than ret: the stack was already popped before
+	 * entering __kernel_vsyscall and the return address is in rcx.
+	 */
+	jmp	*%rcx
+
+END(__kernel_vsyscall)
diff --git a/arch/x86/um/nommu/syscalls.h b/arch/x86/um/nommu/syscalls.h
new file mode 100644
index 000000000000..a2433756b1fc
--- /dev/null
+++ b/arch/x86/um/nommu/syscalls.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_NOMMU_SYSCALLS_H
+#define __UM_NOMMU_SYSCALLS_H
+
+
+/* Evaluates to (task)->stack + THREAD_SIZE, as an unsigned long. */
+#define task_top_of_stack(task) \
+({									\
+	unsigned long __ptr = (unsigned long)(task)->stack + THREAD_SIZE; \
+	__ptr;					\
+})
+
+extern long current_top_of_stack;
+extern long current_ptregs;
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
index b6b997225841..ffd80ee3b9dc 100644
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ b/arch/x86/um/shared/sysdep/syscalls_64.h
@@ -25,4 +25,10 @@ extern syscall_handler_t *sys_call_table[];
 extern syscall_handler_t sys_modify_ldt;
 extern syscall_handler_t sys_arch_prctl;
 
+#ifndef CONFIG_MMU
+extern void do_syscall_64(struct pt_regs *regs);
+extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+			      int64_t a4, int64_t a5, int64_t a6);
+#endif
+
 #endif
-- 
2.43.0

Re: [PATCH v9 04/13] x86/um: nommu: syscall handling

Posted by Benjamin Berg 7 months, 3 weeks ago

Hi,

On Thu, 2025-06-19 at 10:04 +0900, Hajime Tazaki wrote:
> This commit introduces an entry point of syscall interface for !MMU
> mode. It uses an entry function, __kernel_vsyscall, a kernel-wide global
> symbol accessible from any locations.
> 
> Although it isn't in the scope of this commit, it can be also exposed
> via vdso image which is directly accessible from userspace. A standard
> library (i.e., libc) can utilize this entry point to implement syscall
> wrapper; we can also use this by hooking syscall for unmodified userspace
> applications/libraries, which will be implemented in the subsequent
> commit.
> 
> This only supports 64-bit mode of x86 architecture.
> 
> Signed-off-by: Hajime Tazaki <thehajime@gmail.com>
> Signed-off-by: Ricardo Koller <ricarkol@google.com>
> ---
>  arch/x86/um/Makefile                    |  4 ++
>  arch/x86/um/nommu/Makefile              |  8 +++
>  arch/x86/um/nommu/do_syscall_64.c       | 37 ++++++++++
>  arch/x86/um/nommu/entry_64.S            | 91 +++++++++++++++++++++++++
>  arch/x86/um/nommu/syscalls.h            | 16 +++++
>  arch/x86/um/shared/sysdep/syscalls_64.h |  6 ++
>  6 files changed, 162 insertions(+)
>  create mode 100644 arch/x86/um/nommu/Makefile
>  create mode 100644 arch/x86/um/nommu/do_syscall_64.c
>  create mode 100644 arch/x86/um/nommu/entry_64.S
>  create mode 100644 arch/x86/um/nommu/syscalls.h
> 
> diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
> index b42c31cd2390..227af2a987e2 100644
> --- a/arch/x86/um/Makefile
> +++ b/arch/x86/um/Makefile
> @@ -32,6 +32,10 @@ obj-y += syscalls_64.o vdso/
>  subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \
>  	../lib/memmove_64.o ../lib/memset_64.o
>  
> +ifneq ($(CONFIG_MMU),y)
> +obj-y += nommu/
> +endif
> +
>  endif
>  
>  subarch-$(CONFIG_MODULES) += ../kernel/module.o
> diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile
> new file mode 100644
> index 000000000000..d72c63afffa5
> --- /dev/null
> +++ b/arch/x86/um/nommu/Makefile
> @@ -0,0 +1,8 @@
> +# SPDX-License-Identifier: GPL-2.0
> +ifeq ($(CONFIG_X86_32),y)
> +	BITS := 32
> +else
> +	BITS := 64
> +endif
> +
> +obj-y = do_syscall_$(BITS).o entry_$(BITS).o
> diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c
> new file mode 100644
> index 000000000000..5d0fa83e7fdc
> --- /dev/null
> +++ b/arch/x86/um/nommu/do_syscall_64.c
> @@ -0,0 +1,37 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/kernel.h>
> +#include <linux/ptrace.h>
> +#include <kern_util.h>
> +#include <sysdep/syscalls.h>
> +#include <os.h>
> +
> +__visible void do_syscall_64(struct pt_regs *regs)
> +{
> +	int syscall;
> +
> +	syscall = PT_SYSCALL_NR(regs->regs.gp);
> +	UPT_SYSCALL_NR(&regs->regs) = syscall;
> +
> +	pr_debug("syscall(%d) (current=%lx) (fn=%lx)\n",
> +		 syscall, (unsigned long)current,
> +		 (unsigned long)sys_call_table[syscall]);

You probably want to drop the pr_debug from the syscall path.

> +	if (likely(syscall < NR_syscalls)) {
> +		PT_REGS_SET_SYSCALL_RETURN(regs,
> +				EXECUTE_SYSCALL(syscall, regs));
> +	}
> +
> +	pr_debug("syscall(%d) --> %lx\n", syscall,
> +		regs->regs.gp[HOST_AX]);
> +
> +	PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX];
> +
> +	/* execve succeeded */
> +	if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
> +		userspace(&current->thread.regs.regs);

That said, this is what I am stumbling over. Why do you need to jump
into userspace() here? It seems odd to me to need a special case in the
syscall path itself.
Aren't there other possibilities to hook/override the kernel task
state?

> +
> +	/* force do_signal() --> is_syscall() */
> +	set_thread_flag(TIF_SIGPENDING);
> +	interrupt_end();

Same here. The MMU UML code seems to also do this, but restricted to
ptrace'd processes? Maybe I am just missing something obvious …

Benjamin

> +}
> diff --git a/arch/x86/um/nommu/entry_64.S b/arch/x86/um/nommu/entry_64.S
> new file mode 100644
> index 000000000000..e9bfc7b93c84
> --- /dev/null
> +++ b/arch/x86/um/nommu/entry_64.S
> @@ -0,0 +1,91 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <asm/errno.h>
> +
> +#include <linux/linkage.h>
> +#include <asm/percpu.h>
> +#include <asm/desc.h>
> +
> +#include "../entry/calling.h"
> +
> +#ifdef CONFIG_SMP
> +#error need to stash these variables somewhere else
> +#endif
> +
> +#define UM_GLOBAL_VAR(x) .data; .align 8; .globl x; x:; .long 0
> +
> +UM_GLOBAL_VAR(current_top_of_stack)
> +UM_GLOBAL_VAR(current_ptregs)
> +
> +.code64
> +.section .entry.text, "ax"
> +
> +.align 8
> +#undef ENTRY
> +#define ENTRY(x) .text; .globl x; .type x,%function; x:
> +#undef END
> +#define END(x)   .size x, . - x
> +
> +/*
> + * %rcx has the return address (we set it before entering __kernel_vsyscall).
> + *
> + * Registers on entry:
> + * rax  system call number
> + * rcx  return address
> + * rdi  arg0
> + * rsi  arg1
> + * rdx  arg2
> + * r10  arg3
> + * r8   arg4
> + * r9   arg5
> + *
> + * (note: we are allowed to mess with r11: r11 is callee-clobbered
> + * register in C ABI)
> + */
> +ENTRY(__kernel_vsyscall)
> +
> +	movq	%rsp, %r11
> +
> +	/* Point rsp to the top of the ptregs array, so we can
> +           just fill it with a bunch of push'es. */
> +	movq	current_ptregs, %rsp
> +
> +	/* 8 bytes * 20 registers (plus 8 for the push) */
> +	addq	$168, %rsp
> +
> +	/* Construct struct pt_regs on stack */
> +	pushq	$0		/* pt_regs->ss (index 20) */
> +	pushq   %r11		/* pt_regs->sp */
> +	pushfq			/* pt_regs->flags */
> +	pushq	$0		/* pt_regs->cs */
> +	pushq	%rcx		/* pt_regs->ip */
> +	pushq	%rax		/* pt_regs->orig_ax */
> +
> +	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
> +
> +	mov %rsp, %rdi
> +
> +	/*
> +	 * Switch to current top of stack, so "current->" points
> +	 * to the right task.
> +	 */
> +	movq	current_top_of_stack, %rsp
> +
> +	call	do_syscall_64
> +
> +	movq	current_ptregs, %rsp
> +
> +	POP_REGS
> +
> +	addq	$8, %rsp	/* skip orig_ax */
> +	popq	%rcx		/* pt_regs->ip */
> +	addq	$8, %rsp	/* skip cs */
> +	addq	$8, %rsp	/* skip flags */
> +	popq	%rsp
> +
> +	/*
> +	* not return w/ ret but w/ jmp as the stack is already popped before
> +	* entering __kernel_vsyscall
> +	*/
> +	jmp	*%rcx
> +
> +END(__kernel_vsyscall)
> diff --git a/arch/x86/um/nommu/syscalls.h b/arch/x86/um/nommu/syscalls.h
> new file mode 100644
> index 000000000000..a2433756b1fc
> --- /dev/null
> +++ b/arch/x86/um/nommu/syscalls.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __UM_NOMMU_SYSCALLS_H
> +#define __UM_NOMMU_SYSCALLS_H
> +
> +
> +#define task_top_of_stack(task) \
> +({									\
> +	unsigned long __ptr = (unsigned long)task->stack;	\
> +	__ptr += THREAD_SIZE;			\
> +	__ptr;					\
> +})
> +
> +extern long current_top_of_stack;
> +extern long current_ptregs;
> +
> +#endif
> diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
> index b6b997225841..ffd80ee3b9dc 100644
> --- a/arch/x86/um/shared/sysdep/syscalls_64.h
> +++ b/arch/x86/um/shared/sysdep/syscalls_64.h
> @@ -25,4 +25,10 @@ extern syscall_handler_t *sys_call_table[];
>  extern syscall_handler_t sys_modify_ldt;
>  extern syscall_handler_t sys_arch_prctl;
>  
> +#ifndef CONFIG_MMU
> +extern void do_syscall_64(struct pt_regs *regs);
> +extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
> +			      int64_t a4, int64_t a5, int64_t a6);
> +#endif
> +
>  #endif

Re: [PATCH v9 04/13] x86/um: nommu: syscall handling

Posted by Hajime Tazaki 7 months, 3 weeks ago

On Thu, 19 Jun 2025 19:31:53 +0900,
Benjamin Berg wrote:

> > diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c
> > new file mode 100644
> > index 000000000000..5d0fa83e7fdc
> > --- /dev/null
> > +++ b/arch/x86/um/nommu/do_syscall_64.c
> > @@ -0,0 +1,37 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/ptrace.h>
> > +#include <kern_util.h>
> > +#include <sysdep/syscalls.h>
> > +#include <os.h>
> > +
> > +__visible void do_syscall_64(struct pt_regs *regs)
> > +{
> > +	int syscall;
> > +
> > +	syscall = PT_SYSCALL_NR(regs->regs.gp);
> > +	UPT_SYSCALL_NR(&regs->regs) = syscall;
> > +
> > +	pr_debug("syscall(%d) (current=%lx) (fn=%lx)\n",
> > +		 syscall, (unsigned long)current,
> > +		 (unsigned long)sys_call_table[syscall]);
> 
> You probably want to drop the pr_debug from the syscall path.

okay, I'll update those parts.

> > +	if (likely(syscall < NR_syscalls)) {
> > +		PT_REGS_SET_SYSCALL_RETURN(regs,
> > +				EXECUTE_SYSCALL(syscall, regs));
> > +	}
> > +
> > +	pr_debug("syscall(%d) --> %lx\n", syscall,
> > +		regs->regs.gp[HOST_AX]);
> > +
> > +	PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX];
> > +
> > +	/* execve succeeded */
> > +	if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
> > +		userspace(&current->thread.regs.regs);
> 
> That said, this is what I am stumbling over. Why do you need to jump
> into userspace() here? It seems odd to me to need a special case in the
> syscall path itself.
> Aren't there other possibilities to hook/override the kernel task
> state?

Thanks, I found that this is a leftover from our early implementation,
which didn't have a proper schedule upon exit from a syscall.  We
can remove this part, and I'll fix it after more investigation.

> > +	/* force do_signal() --> is_syscall() */
> > +	set_thread_flag(TIF_SIGPENDING);
> > +	interrupt_end();
> 
> Same here. The MMU UML code seems to also do this, but restricted to
> ptrace'd processes? Maybe I am just missing something obvious …

nommu doesn't have a separate process/context to indicate a schedule to
the context here (syscall).  Without that part we do not have a chance
to schedule tasks and deliver signals to userspace.

But forcing the SIGPENDING flag is not actually needed, so I'll
remove that part.

thanks for pointing out.

-- Hajime