Adding a new uprobe syscall that calls the uprobe handlers for a given
'breakpoint' address.

The idea is that the 'breakpoint' address calls the user space
trampoline, which executes the uprobe syscall.

The syscall handler reads the return address of the initial call
to retrieve the original 'breakpoint' address. With this address
we find the related uprobe object and call its consumers.

Adding the arch_uprobe_trampoline_mapping function that provides
the uprobe trampoline mapping. This mapping is backed with one global
page initialized at __init time and shared by all the mapping
instances.

We do not allow the uprobe syscall to be executed if the caller is
not from the uprobe trampoline mapping.

The uprobe syscall ensures the consumer (bpf program) sees register
values in the state before the trampoline was called.
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/kernel/uprobes.c              | 139 +++++++++++++++++++++++++
 include/linux/syscalls.h               |   2 +
 include/linux/uprobes.h                |   1 +
 kernel/events/uprobes.c                |  17 +++
 kernel/sys_ni.c                        |   1 +
 6 files changed, 161 insertions(+)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cfb5ca41e30d..9fd1291e7bdf 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
 333	common	io_pgetevents		sys_io_pgetevents
 334	common	rseq			sys_rseq
 335	common	uretprobe		sys_uretprobe
+336	common	uprobe			sys_uprobe
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c4dcbdd0c3c..d18e1ae59901 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
 	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
 		destroy_uprobe_trampoline(tramp);
 }
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+	struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+	return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+	struct mm_struct *mm = current->mm;
+	bool found, retry = true;
+	unsigned int seq;
+
+	rcu_read_lock();
+	if (mmap_lock_speculate_try_begin(mm, &seq)) {
+		found = __in_uprobe_trampoline(ip);
+		retry = mmap_lock_speculate_retry(mm, seq);
+	}
+	rcu_read_unlock();
+
+	if (retry) {
+		mmap_read_lock(mm);
+		found = __in_uprobe_trampoline(ip);
+		mmap_read_unlock(mm);
+	}
+	return found;
+}
+
+/*
+ * See uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+	unsigned long ax;
+	unsigned long r11;
+	unsigned long cx;
+	unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct uprobe_syscall_args args;
+	unsigned long ip, sp;
+	int err;
+
+	/* Allow execution only from uprobe trampolines. */
+	if (!in_uprobe_trampoline(regs->ip))
+		goto sigill;
+
+	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+	if (err)
+		goto sigill;
+
+	ip = regs->ip;
+
+	/*
+	 * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+	 * - adjust ip to the probe address (the call saved the next instruction address)
+	 * - adjust sp to the probe's stack frame (check trampoline code)
+	 */
+	regs->ax = args.ax;
+	regs->r11 = args.r11;
+	regs->cx = args.cx;
+	regs->ip = args.retaddr - 5;
+	regs->sp += sizeof(args);
+	regs->orig_ax = -1;
+
+	sp = regs->sp;
+
+	handle_syscall_uprobe(regs, regs->ip);
+
+	/*
+	 * Some of the uprobe consumers changed sp, we can do nothing,
+	 * just return via iret.
+	 */
+	if (regs->sp != sp) {
+		/* skip the trampoline call */
+		if (args.retaddr - 5 == regs->ip)
+			regs->ip += 5;
+		return regs->ax;
+	}
+
+	regs->sp -= sizeof(args);
+
+	/* for the case uprobe_consumer has changed ax/r11/cx */
+	args.ax = regs->ax;
+	args.r11 = regs->r11;
+	args.cx = regs->cx;
+
+	/* keep the return address unless we are instructed otherwise */
+	if (args.retaddr - 5 != regs->ip)
+		args.retaddr = regs->ip;
+
+	regs->ip = ip;
+
+	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+	if (err)
+		goto sigill;
+
+	/* ensure sysret, see do_syscall_64() */
+	regs->r11 = regs->flags;
+	regs->cx = regs->ip;
+	return 0;
+
+sigill:
+	force_sig(SIGILL);
+	return -1;
+}
+
+asm (
+	".pushsection .rodata\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	"uprobe_trampoline_entry:\n"
+	"push %rcx\n"
+	"push %r11\n"
+	"push %rax\n"
+	"movq $" __stringify(__NR_uprobe) ", %rax\n"
+	"syscall\n"
+	"pop %rax\n"
+	"pop %r11\n"
+	"pop %rcx\n"
+	"ret\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+	return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
 #else /* 32-bit: */
 /*
  * No RIP-relative addressing on 32-bit
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e5603cc91963..b0cc60f1c458 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -998,6 +998,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
asmlinkage long sys_uretprobe(void);
+asmlinkage long sys_uprobe(void);
+
/* pciconfig: alpha, arm, arm64, ia64, sparc */
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b40d33aae016..b6b077cc7d0f 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void);
extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
extern void arch_uprobe_clear_state(struct mm_struct *mm);
extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index acec91a676b7..cbba31c0495f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2772,6 +2772,23 @@ static void handle_swbp(struct pt_regs *regs)
 	rcu_read_unlock_trace();
 }
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+	struct uprobe *uprobe;
+	int is_swbp;
+
+	guard(rcu_tasks_trace)();
+
+	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+	if (!uprobe)
+		return;
+	if (!get_utask())
+		return;
+	if (arch_uprobe_ignore(&uprobe->arch, regs))
+		return;
+	handler_chain(uprobe, regs);
+}
+
 /*
  * Perform required fix-ups and disable singlestep.
  * Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);
COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
--
2.50.1
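
For readers following the ip/sp arithmetic in sys_uprobe above, here is a
sketch (derived from the patch, with P standing for a probed address) of
the user-space view once a 5-byte nop at P has been rewritten into a call
to the trampoline:

/*
 * P:	call uprobe_trampoline_entry	; 5-byte call, pushes P + 5
 *
 * Inside the trampoline (see uprobe_trampoline_entry above), after the
 * three pushes, the stack at the moment of "syscall" matches
 * struct uprobe_syscall_args:
 *
 *	regs->sp + 0x00:  saved rax	-> args.ax
 *	regs->sp + 0x08:  saved r11	-> args.r11
 *	regs->sp + 0x10:  saved rcx	-> args.cx
 *	regs->sp + 0x18:  return addr	-> args.retaddr == P + 5
 *
 * which is why the handler recovers the probe address as
 * args.retaddr - 5: the call instruction is 5 bytes long.
 */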
On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Adding a new uprobe syscall that calls the uprobe handlers for a given
> 'breakpoint' address.
>
> [...]
>
> +SYSCALL_DEFINE0(uprobe)
> +{
> +	struct pt_regs *regs = task_pt_regs(current);
> +	struct uprobe_syscall_args args;
> +	unsigned long ip, sp;
> +	int err;
> +
> +	/* Allow execution only from uprobe trampolines. */
> +	if (!in_uprobe_trampoline(regs->ip))
> +		goto sigill;

Hey Jiri,

So I've been thinking what's the simplest and most reliable way to
feature-detect support for this sys_uprobe (e.g., for libbpf to know
whether we should attach at nop5 vs nop1), and clearly that would be
to try to call the uprobe() syscall not from the trampoline, and expect
some error code.

How bad would it be to change this part to return some unique-enough
error code (-ENXIO, -EDOM, whatever)?

Is there any reason not to do this? Security-wise it will be just fine, right?

> +
> +	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
> +	if (err)
> +		goto sigill;
> +
> +	ip = regs->ip;
> +
[...]
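
For concreteness, a minimal sketch of the detection this would enable,
assuming sys_uprobe were changed to return an errno (ENXIO here, one of
the codes suggested above) instead of raising SIGILL when called from
outside the trampoline; have_sys_uprobe() is a hypothetical helper, not
an existing libbpf API:

#include <errno.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_uprobe
#define __NR_uprobe 336	/* from the x86-64 syscall table in the patch */
#endif

/*
 * Hypothetical feature probe: only valid if sys_uprobe returns an
 * error code (assumed to be -ENXIO here) rather than sending SIGILL
 * when invoked outside the uprobe trampoline.
 */
static bool have_sys_uprobe(void)
{
	long ret = syscall(__NR_uprobe);

	/* ENOSYS -> old kernel; assumed ENXIO -> syscall exists */
	return ret < 0 && errno == ENXIO;
}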
On Wed, Sep 03, 2025 at 11:24:31AM -0700, Andrii Nakryiko wrote:
> On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > [...]
>
> Hey Jiri,
>
> So I've been thinking what's the simplest and most reliable way to
> feature-detect support for this sys_uprobe (e.g., for libbpf to know
> whether we should attach at nop5 vs nop1), and clearly that would be

wrt nop5/nop1.. so the idea is to have the USDT macro emit both nop1 and
nop5 and store some info about that in the usdt's elf note, right?

libbpf will read the usdt record, and in case it has both nop1/nop5 and
sys_uprobe is detected, we will adjust the usdt address to nop1 or nop5

I recall you said you might have an idea where to store this flag in the
elf note.. or are we bumping the usdt's elf note n_type ?

thanks,
jirka

> to try to call the uprobe() syscall not from the trampoline, and expect
> some error code.
>
> How bad would it be to change this part to return some unique-enough
> error code (-ENXIO, -EDOM, whatever)?
>
> Is there any reason not to do this? Security-wise it will be just fine, right?
>
[...]
--
Andrii

On Thu, Sep 4, 2025 at 1:13 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Wed, Sep 03, 2025 at 11:24:31AM -0700, Andrii Nakryiko wrote:
> > On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > >
> > > [...]
>
> wrt nop5/nop1.. so the idea is to have the USDT macro emit both nop1 and
> nop5 and store some info about that in the usdt's elf note, right?
>
Yes.

> libbpf will read the usdt record, and in case it has both nop1/nop5 and
> sys_uprobe is detected, we will adjust the usdt address to nop1 or nop5
>
Yes.

> I recall you said you might have an idea where to store this flag in the
> elf note.. or are we bumping the usdt's elf note n_type ?
>
Neither. I was contemplating just to look whether there is a nop5 after
nop1 from the libbpf side, which would probably always work reliably in
practice, but, technically, might be misused if you artificially put a
nop5 after the USDT() call and then jump into that nop5 from somewhere
else. Then it would trigger USDT where it shouldn't.

But I don't want to change n_type, as I want anyone else doing their
own USDT parsing/attaching to work just as they used to.

So, here's the current idea. USDT lays out three strings one after the
other in the ELF note: provider, \0, name, \0, args, \0. We also record
the total note data size there, so we know how much content is there. I
was thinking we can add just one extra \0 at the end (and if necessary
we can treat that as a 4th string with extra arguments in the future,
who knows).

If libbpf detects that extra \0, then we can be reasonably confident
that the nop5 is part of the USDT and is safe to be used.

Unless there are some super paranoid USDT parsers out there, it should
be nicely backwards compatible. Libbpf seems to not care right now
about that extra zero, which is a good sign. readelf handles it just
fine as well. So all good signs so far. I haven't tested anything else
(bpftrace is currently broken for me w.r.t. USDT attachment even with
the old format, so I can't quickly check, please help, if you can).

> thanks,
> jirka
>
[...]
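
A rough sketch of that trailing-'\0' check, assuming desc/desc_sz cover
just the string portion of a .note.stapsdt descriptor (i.e. what follows
the three addresses: "provider\0name\0args\0", plus the proposed extra
'\0'); usdt_has_nop5_marker() is a made-up name, not a libbpf API:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

static bool usdt_has_nop5_marker(const char *desc, size_t desc_sz)
{
	size_t off = 0;

	/* skip the provider, name and args strings (each '\0'-terminated) */
	for (int i = 0; i < 3; i++) {
		size_t len = strnlen(desc + off, desc_sz - off);

		if (off + len >= desc_sz)
			return false;	/* malformed or truncated note */
		off += len + 1;		/* include the terminating '\0' */
	}

	/* an extra empty string at the end marks the nop5-aware layout */
	return off < desc_sz && desc[off] == '\0';
}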
On Thu, Sep 04, 2025 at 11:27:45AM -0700, Andrii Nakryiko wrote:

> > > So I've been thinking what's the simplest and most reliable way to
> > > feature-detect support for this sys_uprobe (e.g., for libbpf to know
> > > whether we should attach at nop5 vs nop1), and clearly that would be
> >
> > wrt nop5/nop1.. so the idea is to have the USDT macro emit both nop1 and
> > nop5 and store some info about that in the usdt's elf note, right?

Wait, what? You're going to emit 6 bytes and two nops? Why? Surely the
old kernel can INT3 on top of a NOP5?
On Thu, Sep 4, 2025 at 1:35 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Wait, what? You're going to emit 6 bytes and two nops? Why? Surely the
> old kernel can INT3 on top of a NOP5?

Yes it can, but it's 2x slower in terms of uprobe triggering compared
to nop1. So while it will work to use just nop5 on old kernels, it's
going to be a performance regression if we do this unconditionally.

So the idea was to have nop1 + nop5, stick to nop1 for old kernels,
attach to nop5 for newer ones (that's where the feature detection I was
asking about is important; libbpf will do this automatically and
transparently).

I know it's messy, but I think we have to do that.
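
Sketch of the nop1 + nop5 layout being discussed (not the actual
sys/sdt.h macro; the macro name and local labels are made up, and
0f 1f 44 00 00 is the standard 5-byte nop):

/*
 * Old kernels attach an int3-based uprobe at the 1-byte nop (990);
 * kernels with sys_uprobe attach at the 5-byte nop (991), which they
 * can rewrite into a call to the uprobe trampoline.
 */
#define USDT_PROBE_SITE()						\
	asm volatile (							\
		"990:	nop\n"						\
		"991:	.byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n"		\
	)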
On Thu, Sep 04, 2025 at 01:49:49PM -0700, Andrii Nakryiko wrote:
> On Thu, Sep 4, 2025 at 1:35 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Wait, what? You're going to emit 6 bytes and two nops? Why? Surely the
> > old kernel can INT3 on top of a NOP5?
>
> Yes it can, but it's 2x slower in terms of uprobe triggering compared
> to nop1.

Why? That doesn't really make sense.

I realize it's probably too late to fix the old kernel not to be stupid --
this must be something stupid, right? But now I need to know.
On Thu, Sep 4, 2025 at 1:52 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> > Yes it can, but it's 2x slower in terms of uprobe triggering compared
> > to nop1.
>
> Why? That doesn't really make sense.

Of course it's silly... It's because nop5 wasn't recognized as one of
the emulated instructions, so it was handled through single-stepping.

> I realize it's probably too late to fix the old kernel not to be stupid --
> this must be something stupid, right? But now I need to know.

Jiri fixed this, but as you said, too late for old kernels. See [0]
for the patch that landed not so long ago.

[0] https://lore.kernel.org/all/20250414083647.1234007-1-jolsa@kernel.org/
On Thu, Sep 04, 2025 at 02:44:03PM -0700, Andrii Nakryiko wrote:
> On Thu, Sep 4, 2025 at 1:52 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Why? That doesn't really make sense.
>
> Of course it's silly... It's because nop5 wasn't recognized as one of
> the emulated instructions, so it was handled through single-stepping.

*groan*

> > I realize it's probably too late to fix the old kernel not to be stupid --
> > this must be something stupid, right? But now I need to know.
>
> Jiri fixed this, but as you said, too late for old kernels. See [0]
> for the patch that landed not so long ago.
>
> [0] https://lore.kernel.org/all/20250414083647.1234007-1-jolsa@kernel.org/

Ooh, that suggests we do something like so:

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a5423..223f8925097b 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -309,6 +309,29 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
 	return -ENOTSUPP;
 }
 
+static bool insn_is_nop(struct insn *insn)
+{
+	return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
+}
+
+static bool insn_is_nopl(struct insn *insn)
+{
+	if (insn->opcode.nbytes != 2)
+		return false;
+
+	if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
+		return false;
+
+	if (!insn->modrm.nbytes)
+		return false;
+
+	if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
+		return false;
+
+	/* 0f 1f /0 - NOPL */
+	return true;
+}
+
 #ifdef CONFIG_X86_64
 
 struct uretprobe_syscall_args {
@@ -1158,29 +1181,6 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 	mmap_write_unlock(mm);
 }
 
-static bool insn_is_nop(struct insn *insn)
-{
-	return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
-}
-
-static bool insn_is_nopl(struct insn *insn)
-{
-	if (insn->opcode.nbytes != 2)
-		return false;
-
-	if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
-		return false;
-
-	if (!insn->modrm.nbytes)
-		return false;
-
-	if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
-		return false;
-
-	/* 0f 1f /0 - NOPL */
-	return true;
-}
-
 static bool can_optimize(struct insn *insn, unsigned long vaddr)
 {
 	if (!insn->x86_64 || insn->length != 5)
@@ -1428,17 +1428,13 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	insn_byte_t p;
 	int i;
 
-	/* x86_nops[insn->length]; same as jmp with .offs = 0 */
-	if (insn->length <= ASM_NOP_MAX &&
-	    !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+	if (insn_is_nop(insn) || insn_is_nopl(insn))
 		goto setup;
 
 	switch (opc1) {
 	case 0xeb:	/* jmp 8 */
 	case 0xe9:	/* jmp 32 */
 		break;
-	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
-		goto setup;
 
 	case 0xe8:	/* call relative */
 		branch_clear_offset(auprobe, insn);
On Thu, Sep 04, 2025 at 11:56:17PM +0200, Peter Zijlstra wrote:

> Ooh, that suggests we do something like so:

N/m, I need to go sleep, that doesn't work right for the 32bit nops that
use lea instead of nopl. I'll see if I can come up with something more
sensible.
On Thu, Sep 04, 2025 at 11:58:26PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 04, 2025 at 11:56:17PM +0200, Peter Zijlstra wrote:
>
> > Ooh, that suggests we do something like so:
>
> N/m, I need to go sleep, that doesn't work right for the 32bit nops that
> use lea instead of nopl. I'll see if I can come up with something more
> sensible.

Something like this. Can someone please look very critical at this fancy
insn_is_nop()?

---
 arch/x86/include/asm/insn-eval.h |  2 +
 arch/x86/kernel/alternative.c    | 20 +--------
 arch/x86/kernel/uprobes.c        | 32 ++-------------
 arch/x86/lib/insn-eval.c         | 92 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 54368a43abf6..4733e9064ee5 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -44,4 +44,6 @@ enum insn_mmio_type {
 
 enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
 
+bool insn_is_nop(struct insn *insn);
+
 #endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7bde68247b5f..e1f98189fe77 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -9,6 +9,7 @@
 #include <asm/text-patching.h>
 #include <asm/insn.h>
+#include <asm/insn-eval.h>
 #include <asm/ibt.h>
 #include <asm/set_memory.h>
 #include <asm/nmi.h>
@@ -345,25 +346,6 @@ static void add_nop(u8 *buf, unsigned int len)
 	*buf = INT3_INSN_OPCODE;
 }
 
-/*
- * Matches NOP and NOPL, not any of the other possible NOPs.
- */
-static bool insn_is_nop(struct insn *insn)
-{
-	/* Anything NOP, but no REP NOP */
-	if (insn->opcode.bytes[0] == 0x90 &&
-	    (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
-		return true;
-
-	/* NOPL */
-	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
-		return true;
-
-	/* TODO: more nops */
-
-	return false;
-}
-
 /*
  * Find the offset of the first non-NOP instruction starting at @offset
  * but no further than @len.
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a5423..32bc583e6fd4 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -17,6 +17,7 @@
 #include <linux/kdebug.h>
 #include <asm/processor.h>
 #include <asm/insn.h>
+#include <asm/insn-eval.h>
 #include <asm/mmu_context.h>
 #include <asm/nops.h>
 
@@ -1158,35 +1159,12 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 	mmap_write_unlock(mm);
 }
 
-static bool insn_is_nop(struct insn *insn)
-{
-	return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
-}
-
-static bool insn_is_nopl(struct insn *insn)
-{
-	if (insn->opcode.nbytes != 2)
-		return false;
-
-	if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
-		return false;
-
-	if (!insn->modrm.nbytes)
-		return false;
-
-	if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
-		return false;
-
-	/* 0f 1f /0 - NOPL */
-	return true;
-}
-
 static bool can_optimize(struct insn *insn, unsigned long vaddr)
 {
 	if (!insn->x86_64 || insn->length != 5)
 		return false;
 
-	if (!insn_is_nop(insn) && !insn_is_nopl(insn))
+	if (!insn_is_nop(insn))
 		return false;
 
 	/* We can't do cross page atomic writes yet. */
@@ -1428,17 +1406,13 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	insn_byte_t p;
 	int i;
 
-	/* x86_nops[insn->length]; same as jmp with .offs = 0 */
-	if (insn->length <= ASM_NOP_MAX &&
-	    !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+	if (insn_is_nop(insn))
 		goto setup;
 
 	switch (opc1) {
 	case 0xeb:	/* jmp 8 */
 	case 0xe9:	/* jmp 32 */
 		break;
-	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
-		goto setup;
 
 	case 0xe8:	/* call relative */
 		branch_clear_offset(auprobe, insn);
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 4e385cbfd444..3a67f1c5582c 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1676,3 +1676,95 @@ enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
 
 	return type;
 }
+
+bool insn_is_nop(struct insn *insn)
+{
+	u8 rex, rex_b = 0, rex_x = 0, rex_r = 0, rex_w = 0;
+	u8 modrm, modrm_mod, modrm_reg, modrm_rm;
+	u8 sib = 0, sib_scale, sib_index, sib_base;
+
+	if (insn->rex_prefix.nbytes) {
+		rex = insn->rex_prefix.bytes[0];
+		rex_w = !!X86_REX_W(rex);
+		rex_r = !!X86_REX_R(rex);
+		rex_x = !!X86_REX_X(rex);
+		rex_b = !!X86_REX_B(rex);
+	}
+
+	if (insn->modrm.nbytes) {
+		modrm = insn->modrm.bytes[0];
+		modrm_mod = X86_MODRM_MOD(modrm);
+		modrm_reg = X86_MODRM_REG(modrm) + 8*rex_r;
+		modrm_rm = X86_MODRM_RM(modrm) + 8*rex_b;
+	}
+
+	if (insn->sib.nbytes) {
+		sib = insn->sib.bytes[0];
+		sib_scale = X86_SIB_SCALE(sib);
+		sib_index = X86_SIB_INDEX(sib) + 8*rex_x;
+		sib_base = X86_SIB_BASE(sib) + 8*rex_b;
+
+		modrm_rm = sib_base;
+	}
+
+	switch (insn->opcode.bytes[0]) {
+	case 0x0f: /* 2nd byte */
+		break;
+
+	case 0x89: /* MOV */
+		if (modrm_mod != 3) /* register-direct */
+			return false;
+
+		if (insn->x86_64 && !rex_w) /* native size */
+			return false;
+
+		for (int i = 0; i < insn->prefixes.nbytes; i++) {
+			if (insn->prefixes.bytes[i] == 0x66) /* OSP */
+				return false;
+		}
+
+		return modrm_reg == modrm_rm; /* MOV %reg, %reg */
+
+	case 0x8d: /* LEA */
+		if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */
+			return false;
+
+		if (insn->x86_64 && !rex_w) /* native size */
+			return false;
+
+		if (insn->displacement.value != 0)
+			return false;
+
+		if (sib & (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */
+			return false;
+
+		for (int i = 0; i < insn->prefixes.nbytes; i++) {
+			if (insn->prefixes.bytes[i] != 0x3e) /* DS */
+				return false;
+		}
+
+		return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */
+
+	case 0x90: /* NOP */
+		for (int i = 0; i < insn->prefixes.nbytes; i++) {
+			if (insn->prefixes.bytes[i] == 0xf3) /* REP */
+				return false; /* REP NOP -- PAUSE */
+		}
+		return true;
+
+	case 0xe9: /* JMP.d32 */
+	case 0xeb: /* JMP.d8 */
+		return insn->immediate.value == 0; /* JMP +0 */
+
+	default:
+		return false;
+	}
+
+	switch (insn->opcode.bytes[1]) {
+	case 0x1f:
+		return modrm_reg == 0; /* 0f 1f /0 -- NOPL */
+
+	default:
+		return false;
+	}
+}
On Fri, Sep 5, 2025 at 4:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Something like this. Can someone please look very critical at this fancy
> insn_is_nop()?

Can't truly review that low-level instruction decoding logic (and you
seem to have found an issue yourself), but superficially the cases that
are claimed to be handled seem like legit no-op instructions. And the
overall logic of nop handling in can_optimize and emulation seems to be
intact as well. Thanks for generalizing all this!

To the extent that this means anything:

Acked-by: Andrii Nakryiko <andrii@kernel.org>

> ---
>  arch/x86/include/asm/insn-eval.h |  2 +
>  arch/x86/kernel/alternative.c    | 20 +--------
>  arch/x86/kernel/uprobes.c        | 32 ++-------------
>  arch/x86/lib/insn-eval.c         | 92 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 98 insertions(+), 48 deletions(-)
>
[...]
On Fri, Sep 05, 2025 at 10:24:47AM +0200, Peter Zijlstra wrote:

> +	case 0x8d: /* LEA */
> +		if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */
> +			return false;
> +
> +		if (insn->x86_64 && !rex_w) /* native size */
> +			return false;
> +
> +		if (insn->displacement.value != 0)
> +			return false;
> +
> +		if (sib & (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */

Argh, that should obviously be: &&

> +			return false;
> +
> +		for (int i = 0; i < insn->prefixes.nbytes; i++) {
> +			if (insn->prefixes.bytes[i] != 0x3e) /* DS */
> +				return false;
> +		}
> +
> +		return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */
> +
[...]
On Wed, Sep 03, 2025 at 11:24:31AM -0700, Andrii Nakryiko wrote:
> On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > [...]
> >
> > +	/* Allow execution only from uprobe trampolines. */
> > +	if (!in_uprobe_trampoline(regs->ip))
> > +		goto sigill;
>
> Hey Jiri,
>
> So I've been thinking what's the simplest and most reliable way to
> feature-detect support for this sys_uprobe (e.g., for libbpf to know
> whether we should attach at nop5 vs nop1), and clearly that would be
> to try to call the uprobe() syscall not from the trampoline, and expect
> some error code.
>
> How bad would it be to change this part to return some unique-enough
> error code (-ENXIO, -EDOM, whatever)?
>
> Is there any reason not to do this? Security-wise it will be just fine, right?

good question.. maybe :) the sys_uprobe sigill error path followed the
uprobe logic when things go bad, seems like a good idea to be strict

I understand it'd make the detection code simpler, but it could just
fork and check for sigill, right?

jirka
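
A minimal sketch of that fork-and-check approach under the current
SIGILL behavior (detect_sys_uprobe_fork() is a hypothetical helper;
__NR_uprobe is 336 per the syscall table in the patch):

#include <signal.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_uprobe
#define __NR_uprobe 336
#endif

static bool detect_sys_uprobe_fork(void)
{
	int status;
	pid_t pid = fork();

	if (pid < 0)
		return false;
	if (pid == 0) {
		/* dies with SIGILL here if sys_uprobe exists */
		long ret = syscall(__NR_uprobe);
		_exit(ret < 0 ? 1 : 0);	/* survived: ENOSYS on old kernels */
	}
	if (waitpid(pid, &status, 0) != pid)
		return false;
	return WIFSIGNALED(status) && WTERMSIG(status) == SIGILL;
}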
On Wed, Sep 03, 2025 at 10:56:10PM +0200, Jiri Olsa wrote:

> I understand it'd make the detection code simpler, but it could just
> fork and check for sigill, right?

Can't you simply uprobe your own nop5 and read back the text to see what
it turns into?
On Wed, Sep 3, 2025 at 2:01 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Can't you simply uprobe your own nop5 and read back the text to see what
> it turns into?

Sure, but none of that is fast, cheap, or that simple... (and it
requires elevated permissions just to detect)

Forking is also resource-intensive (think from libbpf's perspective,
it's not cool for a library to fork some application just to check such
a seemingly simple thing as whether to attach at nop5 vs nop1).

The question is why all that? That SIGILL when !in_uprobe_trampoline()
is just paranoid. I understand killing an application if it tries to
screw up the "protocol" in all the subsequent checks. But here it's
equally secure to just fail that syscall with a normal error, instead of
punishing by death.
On Wed, Sep 03, 2025 at 04:12:37PM -0700, Andrii Nakryiko wrote:

> The question is why all that? That SIGILL when !in_uprobe_trampoline()
> is just paranoid. I understand killing an application if it tries to
> screw up the "protocol" in all the subsequent checks. But here it's
> equally secure to just fail that syscall with a normal error, instead of
> punishing by death.

adding Jann to the loop, any thoughts on this ^^^ ?

thanks,
jirka
On Thu, Sep 4, 2025 at 9:56 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> adding Jann to the loop, any thoughts on this ^^^ ?

If I understand correctly, the main reason for the SIGILL is that if
you hit an error in here when coming from an actual uprobe, and if the
syscall were to just return an error, then you'd end up not restoring
registers as expected, which would probably end up crashing the process
in a pretty ugly way?

I guess as long as in_uprobe_trampoline() is reliable (which it should
be), it would be fine to return an error when in_uprobe_trampoline()
fails, though it would be nice to have a short comment describing that
calls from uprobe trampolines must never fail with an error.
On Thu, Sep 04, 2025 at 11:39:33AM +0200, Jann Horn wrote:

> If I understand correctly, the main reason for the SIGILL is that if
> you hit an error in here when coming from an actual uprobe, and if the
> syscall were to just return an error, then you'd end up not restoring
> registers as expected, which would probably end up crashing the process
> in a pretty ugly way?

for some cases yes, for the initial checks I think we could just skip
the uprobe and process would continue just fine

we use sigill because the trap code paths use it for errors and to be
paranoid about the !in_uprobe_trampoline check

jirka

> I guess as long as in_uprobe_trampoline() is reliable (which it should
> be), it would be fine to return an error when in_uprobe_trampoline()
> fails, though it would be nice to have a short comment describing that
> calls from uprobe trampolines must never fail with an error.
On Thu, Sep 4, 2025 at 7:03 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> On Thu, Sep 04, 2025 at 11:39:33AM +0200, Jann Horn wrote:

[...]

> > If I understand correctly, the main reason for the SIGILL is that if
> > you hit an error in here when coming from an actual uprobe, and if the
> > syscall were to just return an error, then you'd end up not restoring
> > registers as expected, which would probably end up crashing the process
> > in a pretty ugly way?
>
> for some cases yes, for the initial checks I think we could just skip
> the uprobe and the process would continue just fine

For a non-buggy kernel implementation, in_uprobe_trampoline(regs->ip)
will (should) always be true when triggered for a kernel-installed
uprobe. So this check can fail only for cases when someone
intentionally called sys_uprobe not from the kernel-generated and
kernel-controlled trampoline.

At which point it's totally fine to just return an error and do nothing.

> we use sigill because the trap code paths use it for errors and to be
> paranoid about the !in_uprobe_trampoline check

Yeah, and it should be totally fine to keep doing that.

It's just about that entry in_uprobe_trampoline() check. And that's
sufficient to make all this nicely integrated with USDT use cases.

(I'd say it would be nice to also amend this into the original patch to
avoid someone cherry-picking the original commit and forgetting/missing
the follow-up change. But that's up to Peter.)

Jiri, can you please send a quick patch and see how that goes? Thanks!

>
> jirka
>
> > I guess as long as in_uprobe_trampoline() is reliable (which it should
> > be), it would be fine to return an error when in_uprobe_trampoline()
> > fails, though it would be nice to have a short comment describing that
> > calls from uprobe trampolines must never fail with an error.
On Thu, Sep 04, 2025 at 11:32:06AM -0700, Andrii Nakryiko wrote:

[...]

> It's just about that entry in_uprobe_trampoline() check. And that's
> sufficient to make all this nicely integrated with USDT use cases.
>
> (I'd say it would be nice to also amend this into the original patch to
> avoid someone cherry-picking the original commit and forgetting/missing
> the follow-up change. But that's up to Peter.)
>
> Jiri, can you please send a quick patch and see how that goes? Thanks!

seems like it's as easy as the change below, I'll send formal patches
later if I don't hear otherwise.. we will also need a man page change

jirka


---
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a5423..845aeaf36b8d 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -810,7 +810,7 @@ SYSCALL_DEFINE0(uprobe)
 
 	/* Allow execution only from uprobe trampolines. */
 	if (!in_uprobe_trampoline(regs->ip))
-		goto sigill;
+		return -ENXIO;
 
 	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
 	if (err)
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 5da0b49eeaca..6d75ede16e7c 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -757,34 +757,12 @@ static void test_uprobe_race(void)
 #define __NR_uprobe 336
 #endif
 
-static void test_uprobe_sigill(void)
+static void test_uprobe_error(void)
 {
-	int status, err, pid;
+	long err = syscall(__NR_uprobe);
 
-	pid = fork();
-	if (!ASSERT_GE(pid, 0, "fork"))
-		return;
-	/* child */
-	if (pid == 0) {
-		asm volatile (
-			"pushq %rax\n"
-			"pushq %rcx\n"
-			"pushq %r11\n"
-			"movq $" __stringify(__NR_uprobe) ", %rax\n"
-			"syscall\n"
-			"popq %r11\n"
-			"popq %rcx\n"
-			"retq\n"
-		);
-		exit(0);
-	}
-
-	err = waitpid(pid, &status, 0);
-	ASSERT_EQ(err, pid, "waitpid");
-
-	/* verify the child got killed with SIGILL */
-	ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED");
-	ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG");
+	ASSERT_EQ(err, -1, "error");
+	ASSERT_EQ(errno, ENXIO, "errno");
 }
 
 static void __test_uprobe_syscall(void)
@@ -805,8 +783,8 @@ static void __test_uprobe_syscall(void)
 		test_uprobe_usdt();
 	if (test__start_subtest("uprobe_race"))
 		test_uprobe_race();
-	if (test__start_subtest("uprobe_sigill"))
-		test_uprobe_sigill();
+	if (test__start_subtest("uprobe_error"))
+		test_uprobe_error();
 	if (test__start_subtest("uprobe_regs_equal"))
 		test_uprobe_regs_equal(false);
 	if (test__start_subtest("regs_change"))
On Fri, Sep 5, 2025 at 3:46 AM Jiri Olsa <olsajiri@gmail.com> wrote:

[...]

> seems like it's as easy as the change below, I'll send formal patches
> later if I don't hear otherwise.. we will also need a man page change
>
> jirka
>
> ---
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 0a8c0a4a5423..845aeaf36b8d 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -810,7 +810,7 @@ SYSCALL_DEFINE0(uprobe)
>
>  	/* Allow execution only from uprobe trampolines. */
>  	if (!in_uprobe_trampoline(regs->ip))
> -		goto sigill;
> +		return -ENXIO;

thanks!

Acked-by: Andrii Nakryiko <andrii@kernel.org>

[...]
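[Editor's note: for readers following along, the feature probe this
change enables is only a couple of lines of user space code. Below is
a minimal sketch, not taken from the thread: it assumes the -ENXIO
change above is applied, uses the x86-64 syscall number 336 from the
patch, and the uprobe_supported() helper name is our own. Note that on
a kernel that still has the original SIGILL behaviour this direct call
would kill the process, which is exactly why the thread argues for
amending the change into the original commit.]

	#include <errno.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_uprobe
	#define __NR_uprobe 336	/* x86-64 number from syscall_64.tbl */
	#endif

	/*
	 * Returns 1 if sys_uprobe exists and rejects callers outside the
	 * uprobe trampoline with -ENXIO; 0 otherwise (errno == ENOSYS
	 * means the kernel predates the syscall entirely).
	 */
	static int uprobe_supported(void)
	{
		return syscall(__NR_uprobe) == -1 && errno == ENXIO;
	}

	int main(void)
	{
		return !uprobe_supported();
	}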
On Sun, 20 Jul 2025 13:21:19 +0200
Jiri Olsa <jolsa@kernel.org> wrote:

> Adding new uprobe syscall that calls uprobe handlers for given
> 'breakpoint' address.

[...]

Looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks!

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 07/20, Jiri Olsa wrote:
>
> Adding new uprobe syscall that calls uprobe handlers for given
> 'breakpoint' address.

[...]

My ack still stands,

Acked-by: Oleg Nesterov <oleg@redhat.com>
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 56101b69c9190667f473b9f93f8b6d8209aaa816
Gitweb: https://git.kernel.org/tip/56101b69c9190667f473b9f93f8b6d8209aaa816
Author: Jiri Olsa <jolsa@kernel.org>
AuthorDate: Sun, 20 Jul 2025 13:21:19 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 21 Aug 2025 20:09:20 +02:00
uprobes/x86: Add uprobe syscall to speed up uprobe
Adding new uprobe syscall that calls uprobe handlers for given
'breakpoint' address.
The idea is that the 'breakpoint' address calls the user space
trampoline which executes the uprobe syscall.
The syscall handler reads the return address of the initial call
to retrieve the original 'breakpoint' address. With this address
we find the related uprobe object and call its consumers.
Adding the arch_uprobe_trampoline_mapping function that provides
uprobe trampoline mapping. This mapping is backed with one global
page initialized at __init time and shared by the all the mapping
instances.
We do not allow to execute uprobe syscall if the caller is not
from uprobe trampoline mapping.
The uprobe syscall ensures the consumer (bpf program) sees registers
values in the state before the trampoline was called.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20250720112133.244369-10-jolsa@kernel.org
---
arch/x86/entry/syscalls/syscall_64.tbl | 1 +-
arch/x86/kernel/uprobes.c | 139 ++++++++++++++++++++++++-
include/linux/syscalls.h | 2 +-
include/linux/uprobes.h | 1 +-
kernel/events/uprobes.c | 17 +++-
kernel/sys_ni.c | 1 +-
6 files changed, 161 insertions(+)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92cf0fe..ced2a1d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
335 common uretprobe sys_uretprobe
+336 common uprobe sys_uprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c4dcbd..d18e1ae 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
destroy_uprobe_trampoline(tramp);
}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+ struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+ return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ bool found, retry = true;
+ unsigned int seq;
+
+ rcu_read_lock();
+ if (mmap_lock_speculate_try_begin(mm, &seq)) {
+ found = __in_uprobe_trampoline(ip);
+ retry = mmap_lock_speculate_retry(mm, seq);
+ }
+ rcu_read_unlock();
+
+ if (retry) {
+ mmap_read_lock(mm);
+ found = __in_uprobe_trampoline(ip);
+ mmap_read_unlock(mm);
+ }
+ return found;
+}
+
+/*
+ * See uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+ unsigned long ax;
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uprobe_syscall_args args;
+ unsigned long ip, sp;
+ int err;
+
+ /* Allow execution only from uprobe trampolines. */
+ if (!in_uprobe_trampoline(regs->ip))
+ goto sigill;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ ip = regs->ip;
+
+ /*
+ * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+ * - adjust ip to the probe address, call saved next instruction address
+ * - adjust sp to the probe's stack frame (check trampoline code)
+ */
+ regs->ax = args.ax;
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ip = args.retaddr - 5;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ sp = regs->sp;
+
+ handle_syscall_uprobe(regs, regs->ip);
+
+ /*
+ * Some of the uprobe consumers has changed sp, we can do nothing,
+ * just return via iret.
+ */
+ if (regs->sp != sp) {
+ /* skip the trampoline call */
+ if (args.retaddr - 5 == regs->ip)
+ regs->ip += 5;
+ return regs->ax;
+ }
+
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed ax/r11/cx */
+ args.ax = regs->ax;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /* keep return address unless we are instructed otherwise */
+ if (args.retaddr - 5 != regs->ip)
+ args.retaddr = regs->ip;
+
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+ return 0;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ "uprobe_trampoline_entry:\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "push %rax\n"
+ "movq $" __stringify(__NR_uprobe) ", %rax\n"
+ "syscall\n"
+ "pop %rax\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ "ret\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ ".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+ tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+ return 0;
+}
+
+late_initcall(arch_uprobes_init);
+

#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 77f45e5..66c06fc 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);

asmlinkage long sys_uretprobe(void);

+asmlinkage long sys_uprobe(void);
+
/* pciconfig: alpha, arm, arm64, ia64, sparc */
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b40d33a..b6b077c 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void);
extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
extern void arch_uprobe_clear_state(struct mm_struct *mm);
extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2cd7a4c..eb07e60 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2771,6 +2771,23 @@ out:
rcu_read_unlock_trace();
}

+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
+}
+
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a869..bf5d05c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);

COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
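
[Editor's note: one detail in the merged code that is easy to miss is
the repeated "- 5" arithmetic in SYSCALL_DEFINE0(uprobe): the probed
site is a 5-byte nop rewritten into a 5-byte call to the trampoline,
so the return address that call pushes is the probe address plus 5.
A minimal worked example follows; the addresses are purely
illustrative values of our own, not anything from the patch.]

	#include <stdio.h>

	int main(void)
	{
		/*
		 * Suppose a nop5 at 0x401000 was rewritten to a 5-byte
		 * "call trampoline" (e8 xx xx xx xx). The call pushes the
		 * address of the next instruction, 0x401005.
		 */
		unsigned long retaddr = 0x401005;	/* args.retaddr, read off the stack */
		unsigned long bp_vaddr = retaddr - 5;	/* back to the probed address */

		/*
		 * sys_uprobe does the same: regs->ip = args.retaddr - 5, so
		 * handle_syscall_uprobe() looks up the uprobe registered at
		 * the original nop5 site (0x401000 here). Later, the check
		 * "args.retaddr - 5 != regs->ip" detects whether a consumer
		 * redirected the instruction pointer while it ran.
		 */
		printf("probe address: 0x%lx\n", bp_vaddr);
		return 0;
	}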