Adding a new uprobe syscall that calls the uprobe handlers for a given
'breakpoint' address.

The idea is that the 'breakpoint' address calls the user space
trampoline, which executes the uprobe syscall.

The syscall handler reads the return address of the initial call
to retrieve the original 'breakpoint' address. With this address
we find the related uprobe object and call its consumers.

Adding the arch_uprobe_trampoline_mapping function that provides
the uprobe trampoline mapping. This mapping is backed with one global
page initialized at __init time and shared by all the mapping
instances.

We do not allow the uprobe syscall to be executed if the caller is
not from the uprobe trampoline mapping.

The uprobe syscall ensures the consumer (bpf program) sees register
values in the state before the trampoline was called.
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/kernel/uprobes.c              | 139 +++++++++++++++++++++++++
 include/linux/syscalls.h               |   2 +
 include/linux/uprobes.h                |   1 +
 kernel/events/uprobes.c                |  17 +++
 kernel/sys_ni.c                        |   1 +
 6 files changed, 161 insertions(+)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cfb5ca41e30d..9fd1291e7bdf 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
 333	common	io_pgetevents		sys_io_pgetevents
 334	common	rseq			sys_rseq
 335	common	uretprobe		sys_uretprobe
+336	common	uprobe			sys_uprobe
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c4dcbdd0c3c..d18e1ae59901 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
 	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
 		destroy_uprobe_trampoline(tramp);
 }
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+	struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+	return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+	struct mm_struct *mm = current->mm;
+	bool found, retry = true;
+	unsigned int seq;
+
+	rcu_read_lock();
+	if (mmap_lock_speculate_try_begin(mm, &seq)) {
+		found = __in_uprobe_trampoline(ip);
+		retry = mmap_lock_speculate_retry(mm, seq);
+	}
+	rcu_read_unlock();
+
+	if (retry) {
+		mmap_read_lock(mm);
+		found = __in_uprobe_trampoline(ip);
+		mmap_read_unlock(mm);
+	}
+	return found;
+}
+
+/*
+ * See uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+	unsigned long ax;
+	unsigned long r11;
+	unsigned long cx;
+	unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct uprobe_syscall_args args;
+	unsigned long ip, sp;
+	int err;
+
+	/* Allow execution only from uprobe trampolines. */
+	if (!in_uprobe_trampoline(regs->ip))
+		goto sigill;
+
+	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+	if (err)
+		goto sigill;
+
+	ip = regs->ip;
+
+	/*
+	 * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+	 * - adjust ip to the probe address (the call saved the next instruction address)
+	 * - adjust sp to the probe's stack frame (check trampoline code)
+	 */
+	regs->ax = args.ax;
+	regs->r11 = args.r11;
+	regs->cx = args.cx;
+	regs->ip = args.retaddr - 5;
+	regs->sp += sizeof(args);
+	regs->orig_ax = -1;
+
+	sp = regs->sp;
+
+	handle_syscall_uprobe(regs, regs->ip);
+
+	/*
+	 * Some of the uprobe consumers changed sp, we can do nothing,
+	 * just return via iret.
+	 */
+	if (regs->sp != sp) {
+		/* skip the trampoline call */
+		if (args.retaddr - 5 == regs->ip)
+			regs->ip += 5;
+		return regs->ax;
+	}
+
+	regs->sp -= sizeof(args);
+
+	/* for the case uprobe_consumer has changed ax/r11/cx */
+	args.ax = regs->ax;
+	args.r11 = regs->r11;
+	args.cx = regs->cx;
+
+	/* keep the return address unless we are instructed otherwise */
+	if (args.retaddr - 5 != regs->ip)
+		args.retaddr = regs->ip;
+
+	regs->ip = ip;
+
+	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+	if (err)
+		goto sigill;
+
+	/* ensure sysret, see do_syscall_64() */
+	regs->r11 = regs->flags;
+	regs->cx = regs->ip;
+	return 0;
+
+sigill:
+	force_sig(SIGILL);
+	return -1;
+}
+
+asm (
+	".pushsection .rodata\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	"uprobe_trampoline_entry:\n"
+	"push %rcx\n"
+	"push %r11\n"
+	"push %rax\n"
+	"movq $" __stringify(__NR_uprobe) ", %rax\n"
+	"syscall\n"
+	"pop %rax\n"
+	"pop %r11\n"
+	"pop %rcx\n"
+	"ret\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+	return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
 #else /* 32-bit: */
 /*
  * No RIP-relative addressing on 32-bit
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e5603cc91963..b0cc60f1c458 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -998,6 +998,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
asmlinkage long sys_uretprobe(void);
+asmlinkage long sys_uprobe(void);
+
/* pciconfig: alpha, arm, arm64, ia64, sparc */
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b40d33aae016..b6b077cc7d0f 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void);
extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
extern void arch_uprobe_clear_state(struct mm_struct *mm);
extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index acec91a676b7..cbba31c0495f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2772,6 +2772,23 @@ static void handle_swbp(struct pt_regs *regs)
 	rcu_read_unlock_trace();
 }
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+	struct uprobe *uprobe;
+	int is_swbp;
+
+	guard(rcu_tasks_trace)();
+
+	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+	if (!uprobe)
+		return;
+	if (!get_utask())
+		return;
+	if (arch_uprobe_ignore(&uprobe->arch, regs))
+		return;
+	handler_chain(uprobe, regs);
+}
+
 /*
  * Perform required fix-ups and disable singlestep.
  * Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);
COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
--
2.50.1
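
For readers following the ip/sp arithmetic in sys_uprobe above, here is a
sketch (derived from the patch, with P standing for a probed address) of
the user-space view once a 5-byte nop at P has been rewritten into a call
to the trampoline:

/*
 * P:	call uprobe_trampoline_entry	; 5-byte call, pushes P + 5
 *
 * Inside the trampoline (see uprobe_trampoline_entry above), after the
 * three pushes, the stack at the moment of "syscall" matches
 * struct uprobe_syscall_args:
 *
 *	regs->sp + 0x00:  saved rax	-> args.ax
 *	regs->sp + 0x08:  saved r11	-> args.r11
 *	regs->sp + 0x10:  saved rcx	-> args.cx
 *	regs->sp + 0x18:  return addr	-> args.retaddr == P + 5
 *
 * which is why the handler recovers the probe address as
 * args.retaddr - 5: the call instruction is 5 bytes long.
 */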
On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Adding a new uprobe syscall that calls the uprobe handlers for a given
> 'breakpoint' address.
>
> [...]
>
> +SYSCALL_DEFINE0(uprobe)
> +{
> +	struct pt_regs *regs = task_pt_regs(current);
> +	struct uprobe_syscall_args args;
> +	unsigned long ip, sp;
> +	int err;
> +
> +	/* Allow execution only from uprobe trampolines. */
> +	if (!in_uprobe_trampoline(regs->ip))
> +		goto sigill;

Hey Jiri,

So I've been thinking what's the simplest and most reliable way to
feature-detect support for this sys_uprobe (e.g., for libbpf to know
whether we should attach at nop5 vs nop1), and clearly that would be
to try to call the uprobe() syscall not from the trampoline, and expect
some error code.

How bad would it be to change this part to return some unique-enough
error code (-ENXIO, -EDOM, whatever)?

Is there any reason not to do this? Security-wise it will be just fine, right?

> +
> +	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
> +	if (err)
> +		goto sigill;
> +
> +	ip = regs->ip;
> +
[...]
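
For concreteness, a minimal sketch of the detection this would enable,
assuming sys_uprobe were changed to return an errno (ENXIO here, one of
the codes suggested above) instead of raising SIGILL when called from
outside the trampoline; have_sys_uprobe() is a hypothetical helper, not
an existing libbpf API:

#include <errno.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_uprobe
#define __NR_uprobe 336	/* from the x86-64 syscall table in the patch */
#endif

/*
 * Hypothetical feature probe: only valid if sys_uprobe returns an
 * error code (assumed to be -ENXIO here) rather than sending SIGILL
 * when invoked outside the uprobe trampoline.
 */
static bool have_sys_uprobe(void)
{
	long ret = syscall(__NR_uprobe);

	/* ENOSYS -> old kernel; assumed ENXIO -> syscall exists */
	return ret < 0 && errno == ENXIO;
}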
On Wed, Sep 03, 2025 at 11:24:31AM -0700, Andrii Nakryiko wrote:
> On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > [...]
>
> Hey Jiri,
>
> So I've been thinking what's the simplest and most reliable way to
> feature-detect support for this sys_uprobe (e.g., for libbpf to know
> whether we should attach at nop5 vs nop1), and clearly that would be

wrt nop5/nop1.. so the idea is to have the USDT macro emit both nop1 and
nop5 and store some info about that in the usdt's elf note, right?

libbpf will read the usdt record, and in case it has both nop1/nop5 and
sys_uprobe is detected, we will adjust the usdt address to nop1 or nop5

I recall you said you might have an idea where to store this flag in the
elf note.. or are we bumping the usdt's elf note n_type ?

thanks,
jirka

> to try to call the uprobe() syscall not from the trampoline, and expect
> some error code.
>
> How bad would it be to change this part to return some unique-enough
> error code (-ENXIO, -EDOM, whatever)?
>
> Is there any reason not to do this? Security-wise it will be just fine, right?
>
[...]
--
Andrii

On Thu, Sep 4, 2025 at 1:13 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Wed, Sep 03, 2025 at 11:24:31AM -0700, Andrii Nakryiko wrote:
> > On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > >
> > > [...]
>
> wrt nop5/nop1.. so the idea is to have the USDT macro emit both nop1 and
> nop5 and store some info about that in the usdt's elf note, right?
>
Yes.

> libbpf will read the usdt record, and in case it has both nop1/nop5 and
> sys_uprobe is detected, we will adjust the usdt address to nop1 or nop5
>
Yes.

> I recall you said you might have an idea where to store this flag in the
> elf note.. or are we bumping the usdt's elf note n_type ?
>
Neither. I was contemplating just to look whether there is a nop5 after
nop1 from the libbpf side, which would probably always work reliably in
practice, but, technically, might be misused if you artificially put a
nop5 after the USDT() call and then jump into that nop5 from somewhere
else. Then it would trigger USDT where it shouldn't.

But I don't want to change n_type, as I want anyone else doing their
own USDT parsing/attaching to work just as they used to.

So, here's the current idea. USDT lays out three strings one after the
other in the ELF note: provider, \0, name, \0, args, \0. We also record
the total note data size there, so we know how much content is there. I
was thinking we can add just one extra \0 at the end (and if necessary
we can treat that as a 4th string with extra arguments in the future,
who knows).

If libbpf detects that extra \0, then we can be reasonably confident
that the nop5 is part of the USDT and is safe to be used.

Unless there are some super paranoid USDT parsers out there, it should
be nicely backwards compatible. Libbpf seems to not care right now
about that extra zero, which is a good sign. readelf handles it just
fine as well. So all good signs so far. I haven't tested anything else
(bpftrace is currently broken for me w.r.t. USDT attachment even with
the old format, so I can't quickly check, please help, if you can).

> thanks,
> jirka
>
[...]
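
A rough sketch of that trailing-'\0' check, assuming desc/desc_sz cover
just the string portion of a .note.stapsdt descriptor (i.e. what follows
the three addresses: "provider\0name\0args\0", plus the proposed extra
'\0'); usdt_has_nop5_marker() is a made-up name, not a libbpf API:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

static bool usdt_has_nop5_marker(const char *desc, size_t desc_sz)
{
	size_t off = 0;

	/* skip the provider, name and args strings (each '\0'-terminated) */
	for (int i = 0; i < 3; i++) {
		size_t len = strnlen(desc + off, desc_sz - off);

		if (off + len >= desc_sz)
			return false;	/* malformed or truncated note */
		off += len + 1;		/* include the terminating '\0' */
	}

	/* an extra empty string at the end marks the nop5-aware layout */
	return off < desc_sz && desc[off] == '\0';
}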
On Thu, Sep 04, 2025 at 11:27:45AM -0700, Andrii Nakryiko wrote:

> > > So I've been thinking what's the simplest and most reliable way to
> > > feature-detect support for this sys_uprobe (e.g., for libbpf to know
> > > whether we should attach at nop5 vs nop1), and clearly that would be
> >
> > wrt nop5/nop1.. so the idea is to have the USDT macro emit both nop1 and
> > nop5 and store some info about that in the usdt's elf note, right?

Wait, what? You're going to emit 6 bytes and two nops? Why? Surely the
old kernel can INT3 on top of a NOP5?
On Thu, Sep 4, 2025 at 1:35 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Wait, what? You're going to emit 6 bytes and two nops? Why? Surely the
> old kernel can INT3 on top of a NOP5?

Yes it can, but it's 2x slower in terms of uprobe triggering compared
to nop1. So while it will work to use just nop5 on old kernels, it's
going to be a performance regression if we do this unconditionally.

So the idea was to have nop1 + nop5, stick to nop1 for old kernels,
attach to nop5 for newer ones (that's where the feature detection I was
asking about is important; libbpf will do this automatically and
transparently).

I know it's messy, but I think we have to do that.
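
Sketch of the nop1 + nop5 layout being discussed (not the actual
sys/sdt.h macro; the macro name and local labels are made up, and
0f 1f 44 00 00 is the standard 5-byte nop):

/*
 * Old kernels attach an int3-based uprobe at the 1-byte nop (990);
 * kernels with sys_uprobe attach at the 5-byte nop (991), which they
 * can rewrite into a call to the uprobe trampoline.
 */
#define USDT_PROBE_SITE()						\
	asm volatile (							\
		"990:	nop\n"						\
		"991:	.byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n"		\
	)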
On Thu, Sep 04, 2025 at 01:49:49PM -0700, Andrii Nakryiko wrote:
> On Thu, Sep 4, 2025 at 1:35 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Wait, what? You're going to emit 6 bytes and two nops? Why? Surely the
> > old kernel can INT3 on top of a NOP5?
>
> Yes it can, but it's 2x slower in terms of uprobe triggering compared
> to nop1.

Why? That doesn't really make sense.

I realize it's probably too late to fix the old kernel not to be stupid --
this must be something stupid, right? But now I need to know.
On Thu, Sep 4, 2025 at 1:52 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> > Yes it can, but it's 2x slower in terms of uprobe triggering compared
> > to nop1.
>
> Why? That doesn't really make sense.

Of course it's silly... It's because nop5 wasn't recognized as one of
the emulated instructions, so it was handled through single-stepping.

> I realize it's probably too late to fix the old kernel not to be stupid --
> this must be something stupid, right? But now I need to know.

Jiri fixed this, but as you said, too late for old kernels. See [0]
for the patch that landed not so long ago.

[0] https://lore.kernel.org/all/20250414083647.1234007-1-jolsa@kernel.org/
On Thu, Sep 04, 2025 at 02:44:03PM -0700, Andrii Nakryiko wrote:
> On Thu, Sep 4, 2025 at 1:52 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Why? That doesn't really make sense.
>
> Of course it's silly... It's because nop5 wasn't recognized as one of
> the emulated instructions, so it was handled through single-stepping.

*groan*

> > I realize it's probably too late to fix the old kernel not to be stupid --
> > this must be something stupid, right? But now I need to know.
>
> Jiri fixed this, but as you said, too late for old kernels. See [0]
> for the patch that landed not so long ago.
>
> [0] https://lore.kernel.org/all/20250414083647.1234007-1-jolsa@kernel.org/

Ooh, that suggests we do something like so:

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a5423..223f8925097b 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -309,6 +309,29 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
 	return -ENOTSUPP;
 }
 
+static bool insn_is_nop(struct insn *insn)
+{
+	return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
+}
+
+static bool insn_is_nopl(struct insn *insn)
+{
+	if (insn->opcode.nbytes != 2)
+		return false;
+
+	if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
+		return false;
+
+	if (!insn->modrm.nbytes)
+		return false;
+
+	if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
+		return false;
+
+	/* 0f 1f /0 - NOPL */
+	return true;
+}
+
 #ifdef CONFIG_X86_64
 
 struct uretprobe_syscall_args {
@@ -1158,29 +1181,6 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 	mmap_write_unlock(mm);
 }
 
-static bool insn_is_nop(struct insn *insn)
-{
-	return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
-}
-
-static bool insn_is_nopl(struct insn *insn)
-{
-	if (insn->opcode.nbytes != 2)
-		return false;
-
-	if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
-		return false;
-
-	if (!insn->modrm.nbytes)
-		return false;
-
-	if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
-		return false;
-
-	/* 0f 1f /0 - NOPL */
-	return true;
-}
-
 static bool can_optimize(struct insn *insn, unsigned long vaddr)
 {
 	if (!insn->x86_64 || insn->length != 5)
@@ -1428,17 +1428,13 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	insn_byte_t p;
 	int i;
 
-	/* x86_nops[insn->length]; same as jmp with .offs = 0 */
-	if (insn->length <= ASM_NOP_MAX &&
-	    !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+	if (insn_is_nop(insn) || insn_is_nopl(insn))
 		goto setup;
 
 	switch (opc1) {
 	case 0xeb:	/* jmp 8 */
 	case 0xe9:	/* jmp 32 */
 		break;
-	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
-		goto setup;
 
 	case 0xe8:	/* call relative */
 		branch_clear_offset(auprobe, insn);
On Thu, Sep 04, 2025 at 11:56:17PM +0200, Peter Zijlstra wrote:

> Ooh, that suggests we do something like so:

N/m, I need to go sleep, that doesn't work right for the 32bit nops that
use lea instead of nopl. I'll see if I can come up with something more
sensible.
On Thu, Sep 04, 2025 at 11:58:26PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 04, 2025 at 11:56:17PM +0200, Peter Zijlstra wrote:
>
> > Ooh, that suggests we do something like so:
>
> N/m, I need to go sleep, that doesn't work right for the 32bit nops that
> use lea instead of nopl. I'll see if I can come up with something more
> sensible.

Something like this. Can someone please look very critical at this fancy
insn_is_nop()?

---
 arch/x86/include/asm/insn-eval.h |  2 +
 arch/x86/kernel/alternative.c    | 20 +--------
 arch/x86/kernel/uprobes.c        | 32 ++-------------
 arch/x86/lib/insn-eval.c         | 92 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 54368a43abf6..4733e9064ee5 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -44,4 +44,6 @@ enum insn_mmio_type {
 
 enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
 
+bool insn_is_nop(struct insn *insn);
+
 #endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7bde68247b5f..e1f98189fe77 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -9,6 +9,7 @@
 #include <asm/text-patching.h>
 #include <asm/insn.h>
+#include <asm/insn-eval.h>
 #include <asm/ibt.h>
 #include <asm/set_memory.h>
 #include <asm/nmi.h>
@@ -345,25 +346,6 @@ static void add_nop(u8 *buf, unsigned int len)
 	*buf = INT3_INSN_OPCODE;
 }
 
-/*
- * Matches NOP and NOPL, not any of the other possible NOPs.
- */
-static bool insn_is_nop(struct insn *insn)
-{
-	/* Anything NOP, but no REP NOP */
-	if (insn->opcode.bytes[0] == 0x90 &&
-	    (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
-		return true;
-
-	/* NOPL */
-	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
-		return true;
-
-	/* TODO: more nops */
-
-	return false;
-}
-
 /*
  * Find the offset of the first non-NOP instruction starting at @offset
  * but no further than @len.
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a5423..32bc583e6fd4 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -17,6 +17,7 @@
 #include <linux/kdebug.h>
 #include <asm/processor.h>
 #include <asm/insn.h>
+#include <asm/insn-eval.h>
 #include <asm/mmu_context.h>
 #include <asm/nops.h>
 
@@ -1158,35 +1159,12 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 	mmap_write_unlock(mm);
 }
 
-static bool insn_is_nop(struct insn *insn)
-{
-	return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
-}
-
-static bool insn_is_nopl(struct insn *insn)
-{
-	if (insn->opcode.nbytes != 2)
-		return false;
-
-	if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
-		return false;
-
-	if (!insn->modrm.nbytes)
-		return false;
-
-	if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
-		return false;
-
-	/* 0f 1f /0 - NOPL */
-	return true;
-}
-
 static bool can_optimize(struct insn *insn, unsigned long vaddr)
 {
 	if (!insn->x86_64 || insn->length != 5)
 		return false;
 
-	if (!insn_is_nop(insn) && !insn_is_nopl(insn))
+	if (!insn_is_nop(insn))
 		return false;
 
 	/* We can't do cross page atomic writes yet. */
@@ -1428,17 +1406,13 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	insn_byte_t p;
 	int i;
 
-	/* x86_nops[insn->length]; same as jmp with .offs = 0 */
-	if (insn->length <= ASM_NOP_MAX &&
-	    !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+	if (insn_is_nop(insn))
 		goto setup;
 
 	switch (opc1) {
 	case 0xeb:	/* jmp 8 */
 	case 0xe9:	/* jmp 32 */
 		break;
-	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
-		goto setup;
 
 	case 0xe8:	/* call relative */
 		branch_clear_offset(auprobe, insn);
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 4e385cbfd444..3a67f1c5582c 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1676,3 +1676,95 @@ enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
 
 	return type;
 }
+
+bool insn_is_nop(struct insn *insn)
+{
+	u8 rex, rex_b = 0, rex_x = 0, rex_r = 0, rex_w = 0;
+	u8 modrm, modrm_mod, modrm_reg, modrm_rm;
+	u8 sib = 0, sib_scale, sib_index, sib_base;
+
+	if (insn->rex_prefix.nbytes) {
+		rex = insn->rex_prefix.bytes[0];
+		rex_w = !!X86_REX_W(rex);
+		rex_r = !!X86_REX_R(rex);
+		rex_x = !!X86_REX_X(rex);
+		rex_b = !!X86_REX_B(rex);
+	}
+
+	if (insn->modrm.nbytes) {
+		modrm = insn->modrm.bytes[0];
+		modrm_mod = X86_MODRM_MOD(modrm);
+		modrm_reg = X86_MODRM_REG(modrm) + 8*rex_r;
+		modrm_rm = X86_MODRM_RM(modrm) + 8*rex_b;
+	}
+
+	if (insn->sib.nbytes) {
+		sib = insn->sib.bytes[0];
+		sib_scale = X86_SIB_SCALE(sib);
+		sib_index = X86_SIB_INDEX(sib) + 8*rex_x;
+		sib_base = X86_SIB_BASE(sib) + 8*rex_b;
+
+		modrm_rm = sib_base;
+	}
+
+	switch (insn->opcode.bytes[0]) {
+	case 0x0f: /* 2nd byte */
+		break;
+
+	case 0x89: /* MOV */
+		if (modrm_mod != 3) /* register-direct */
+			return false;
+
+		if (insn->x86_64 && !rex_w) /* native size */
+			return false;
+
+		for (int i = 0; i < insn->prefixes.nbytes; i++) {
+			if (insn->prefixes.bytes[i] == 0x66) /* OSP */
+				return false;
+		}
+
+		return modrm_reg == modrm_rm; /* MOV %reg, %reg */
+
+	case 0x8d: /* LEA */
+		if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */
+			return false;
+
+		if (insn->x86_64 && !rex_w) /* native size */
+			return false;
+
+		if (insn->displacement.value != 0)
+			return false;
+
+		if (sib & (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */
+			return false;
+
+		for (int i = 0; i < insn->prefixes.nbytes; i++) {
+			if (insn->prefixes.bytes[i] != 0x3e) /* DS */
+				return false;
+		}
+
+		return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */
+
+	case 0x90: /* NOP */
+		for (int i = 0; i < insn->prefixes.nbytes; i++) {
+			if (insn->prefixes.bytes[i] == 0xf3) /* REP */
+				return false; /* REP NOP -- PAUSE */
+		}
+		return true;
+
+	case 0xe9: /* JMP.d32 */
+	case 0xeb: /* JMP.d8 */
+		return insn->immediate.value == 0; /* JMP +0 */
+
+	default:
+		return false;
+	}
+
+	switch (insn->opcode.bytes[1]) {
+	case 0x1f:
+		return modrm_reg == 0; /* 0f 1f /0 -- NOPL */
+
+	default:
+		return false;
+	}
+}
On Fri, Sep 5, 2025 at 4:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Something like this. Can someone please look very critical at this fancy
> insn_is_nop()?

Can't truly review that low-level instruction decoding logic (and you
seem to have found an issue yourself), but superficially the cases that
are claimed to be handled seem like legit no-op instructions. And the
overall logic of nop handling in can_optimize and emulation seems to be
intact as well. Thanks for generalizing all this!

To the extent that this means anything:

Acked-by: Andrii Nakryiko <andrii@kernel.org>

> ---
>  arch/x86/include/asm/insn-eval.h |  2 +
>  arch/x86/kernel/alternative.c    | 20 +--------
>  arch/x86/kernel/uprobes.c        | 32 ++-------------
>  arch/x86/lib/insn-eval.c         | 92 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 98 insertions(+), 48 deletions(-)
>
[...]
On Fri, Sep 05, 2025 at 10:24:47AM +0200, Peter Zijlstra wrote:

> +	case 0x8d: /* LEA */
> +		if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */
> +			return false;
> +
> +		if (insn->x86_64 && !rex_w) /* native size */
> +			return false;
> +
> +		if (insn->displacement.value != 0)
> +			return false;
> +
> +		if (sib & (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */

Argh, that should obviously be: &&

> +			return false;
> +
> +		for (int i = 0; i < insn->prefixes.nbytes; i++) {
> +			if (insn->prefixes.bytes[i] != 0x3e) /* DS */
> +				return false;
> +		}
> +
> +		return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */
> +
[...]
On Wed, Sep 03, 2025 at 11:24:31AM -0700, Andrii Nakryiko wrote:
> On Sun, Jul 20, 2025 at 4:23 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > [...]
> >
> > +	/* Allow execution only from uprobe trampolines. */
> > +	if (!in_uprobe_trampoline(regs->ip))
> > +		goto sigill;
>
> Hey Jiri,
>
> So I've been thinking what's the simplest and most reliable way to
> feature-detect support for this sys_uprobe (e.g., for libbpf to know
> whether we should attach at nop5 vs nop1), and clearly that would be
> to try to call the uprobe() syscall not from the trampoline, and expect
> some error code.
>
> How bad would it be to change this part to return some unique-enough
> error code (-ENXIO, -EDOM, whatever)?
>
> Is there any reason not to do this? Security-wise it will be just fine, right?

good question.. maybe :) the sys_uprobe sigill error path followed the
uprobe logic when things go bad, seems like a good idea to be strict

I understand it'd make the detection code simpler, but it could just
fork and check for sigill, right?

jirka
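
A minimal sketch of that fork-and-check approach under the current
SIGILL behavior (detect_sys_uprobe_fork() is a hypothetical helper;
__NR_uprobe is 336 per the syscall table in the patch):

#include <signal.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_uprobe
#define __NR_uprobe 336
#endif

static bool detect_sys_uprobe_fork(void)
{
	int status;
	pid_t pid = fork();

	if (pid < 0)
		return false;
	if (pid == 0) {
		/* dies with SIGILL here if sys_uprobe exists */
		long ret = syscall(__NR_uprobe);
		_exit(ret < 0 ? 1 : 0);	/* survived: ENOSYS on old kernels */
	}
	if (waitpid(pid, &status, 0) != pid)
		return false;
	return WIFSIGNALED(status) && WTERMSIG(status) == SIGILL;
}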
On Wed, Sep 03, 2025 at 10:56:10PM +0200, Jiri Olsa wrote:

> I understand it'd make the detection code simpler, but it could just
> fork and check for sigill, right?

Can't you simply uprobe your own nop5 and read back the text to see what
it turns into?
On Wed, Sep 3, 2025 at 2:01 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Can't you simply uprobe your own nop5 and read back the text to see what
> it turns into?

Sure, but none of that is fast, cheap, or that simple... (and it
requires elevated permissions just to detect)

Forking is also resource-intensive (think from libbpf's perspective,
it's not cool for a library to fork some application just to check such
a seemingly simple thing as whether to attach at nop5 vs nop1).

The question is why all that? That SIGILL when !in_uprobe_trampoline()
is just paranoid. I understand killing an application if it tries to
screw up the "protocol" in all the subsequent checks. But here it's
equally secure to just fail that syscall with a normal error, instead of
punishing by death.
On Wed, Sep 03, 2025 at 04:12:37PM -0700, Andrii Nakryiko wrote:

> The question is why all that? That SIGILL when !in_uprobe_trampoline()
> is just paranoid. I understand killing an application if it tries to
> screw up the "protocol" in all the subsequent checks. But here it's
> equally secure to just fail that syscall with a normal error, instead of
> punishing by death.

adding Jann to the loop, any thoughts on this ^^^ ?

thanks,
jirka
On Thu, Sep 4, 2025 at 9:56 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> adding Jann to the loop, any thoughts on this ^^^ ?

If I understand correctly, the main reason for the SIGILL is that if
you hit an error in here when coming from an actual uprobe, and if the
syscall were to just return an error, then you'd end up not restoring
registers as expected, which would probably end up crashing the process
in a pretty ugly way?

I guess as long as in_uprobe_trampoline() is reliable (which it should
be), it would be fine to return an error when in_uprobe_trampoline()
fails, though it would be nice to have a short comment describing that
calls from uprobe trampolines must never fail with an error.
On Thu, Sep 04, 2025 at 11:39:33AM +0200, Jann Horn wrote:

> If I understand correctly, the main reason for the SIGILL is that if
> you hit an error in here when coming from an actual uprobe, and if the
> syscall were to just return an error, then you'd end up not restoring
> registers as expected, which would probably end up crashing the process
> in a pretty ugly way?

for some cases yes, for the initial checks I think we could just skip
the uprobe and process would continue just fine

we use sigill because the trap code paths use it for errors and to be
paranoid about the !in_uprobe_trampoline check

jirka

> I guess as long as in_uprobe_trampoline() is reliable (which it should
> be), it would be fine to return an error when in_uprobe_trampoline()
> fails, though it would be nice to have a short comment describing that
> calls from uprobe trampolines must never fail with an error.
On Thu, Sep 4, 2025 at 7:03 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> On Thu, Sep 04, 2025 at 11:39:33AM +0200, Jann Horn wrote:

[...]

> > If I understand correctly, the main reason for the SIGILL is that if
> > you hit an error in here when coming from an actual uprobe, and if the
> > syscall were to just return an error, then you'd end up not restoring
> > registers as expected, which would probably end up crashing the process
> > in a pretty ugly way?
>
> for some cases yes, for the initial checks I think we could just skip
> the uprobe and the process would continue just fine

For a non-buggy kernel implementation, in_uprobe_trampoline(regs->ip)
will (should) always be true when triggered for a kernel-installed
uprobe. So this check can fail only for cases when someone
intentionally called sys_uprobe not from the kernel-generated and
kernel-controlled trampoline.

At which point it's totally fine to just return an error and do nothing.

> we use sigill because the trap code paths use it for errors and to be
> paranoid about the !in_uprobe_trampoline check

Yeah, and it should be totally fine to keep doing that.

It's just about that entry in_uprobe_trampoline() check. And that's
sufficient to make all this nicely integrated with USDT use cases.

(I'd say it would be nice to also amend this into the original patch to
avoid someone cherry-picking the original commit and forgetting/missing
the follow-up change. But that's up to Peter.)

Jiri, can you please send a quick patch and see how that goes? Thanks!

>
> jirka
>
> > I guess as long as in_uprobe_trampoline() is reliable (which it should
> > be), it would be fine to return an error when in_uprobe_trampoline()
> > fails, though it would be nice to have a short comment describing that
> > calls from uprobe trampolines must never fail with an error.
On Thu, Sep 04, 2025 at 11:32:06AM -0700, Andrii Nakryiko wrote:

[...]

> It's just about that entry in_uprobe_trampoline() check. And that's
> sufficient to make all this nicely integrated with USDT use cases.
>
> (I'd say it would be nice to also amend this into the original patch to
> avoid someone cherry-picking the original commit and forgetting/missing
> the follow-up change. But that's up to Peter.)
>
> Jiri, can you please send a quick patch and see how that goes? Thanks!

seems like it's as easy as the change below, I'll send formal patches
later if I don't hear otherwise.. we will also need a man page change

jirka


---
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a5423..845aeaf36b8d 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -810,7 +810,7 @@ SYSCALL_DEFINE0(uprobe)
 
 	/* Allow execution only from uprobe trampolines. */
 	if (!in_uprobe_trampoline(regs->ip))
-		goto sigill;
+		return -ENXIO;
 
 	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
 	if (err)
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 5da0b49eeaca..6d75ede16e7c 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -757,34 +757,12 @@ static void test_uprobe_race(void)
 #define __NR_uprobe 336
 #endif
 
-static void test_uprobe_sigill(void)
+static void test_uprobe_error(void)
 {
-	int status, err, pid;
+	long err = syscall(__NR_uprobe);
 
-	pid = fork();
-	if (!ASSERT_GE(pid, 0, "fork"))
-		return;
-	/* child */
-	if (pid == 0) {
-		asm volatile (
-			"pushq %rax\n"
-			"pushq %rcx\n"
-			"pushq %r11\n"
-			"movq $" __stringify(__NR_uprobe) ", %rax\n"
-			"syscall\n"
-			"popq %r11\n"
-			"popq %rcx\n"
-			"retq\n"
-		);
-		exit(0);
-	}
-
-	err = waitpid(pid, &status, 0);
-	ASSERT_EQ(err, pid, "waitpid");
-
-	/* verify the child got killed with SIGILL */
-	ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED");
-	ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG");
+	ASSERT_EQ(err, -1, "error");
+	ASSERT_EQ(errno, ENXIO, "errno");
 }
 
 static void __test_uprobe_syscall(void)
@@ -805,8 +783,8 @@ static void __test_uprobe_syscall(void)
 		test_uprobe_usdt();
 	if (test__start_subtest("uprobe_race"))
 		test_uprobe_race();
-	if (test__start_subtest("uprobe_sigill"))
-		test_uprobe_sigill();
+	if (test__start_subtest("uprobe_error"))
+		test_uprobe_error();
 	if (test__start_subtest("uprobe_regs_equal"))
 		test_uprobe_regs_equal(false);
 	if (test__start_subtest("regs_change"))
On Fri, Sep 5, 2025 at 3:46 AM Jiri Olsa <olsajiri@gmail.com> wrote:

[...]

> seems like it's as easy as the change below, I'll send formal patches
> later if I don't hear otherwise.. we will also need a man page change
>
> jirka
>
> ---
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 0a8c0a4a5423..845aeaf36b8d 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -810,7 +810,7 @@ SYSCALL_DEFINE0(uprobe)
>
>  	/* Allow execution only from uprobe trampolines. */
>  	if (!in_uprobe_trampoline(regs->ip))
> -		goto sigill;
> +		return -ENXIO;

thanks!

Acked-by: Andrii Nakryiko <andrii@kernel.org>

[...]
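[Editor's note: for readers following along, the feature probe this
change enables is only a couple of lines of user space code. Below is
a minimal sketch, not taken from the thread: it assumes the -ENXIO
change above is applied, uses the x86-64 syscall number 336 from the
patch, and the uprobe_supported() helper name is our own. Note that on
a kernel that still has the original SIGILL behaviour this direct call
would kill the process, which is exactly why the thread argues for
amending the change into the original commit.]

	#include <errno.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_uprobe
	#define __NR_uprobe 336	/* x86-64 number from syscall_64.tbl */
	#endif

	/*
	 * Returns 1 if sys_uprobe exists and rejects callers outside the
	 * uprobe trampoline with -ENXIO; 0 otherwise (errno == ENOSYS
	 * means the kernel predates the syscall entirely).
	 */
	static int uprobe_supported(void)
	{
		return syscall(__NR_uprobe) == -1 && errno == ENXIO;
	}

	int main(void)
	{
		return !uprobe_supported();
	}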
On Sun, 20 Jul 2025 13:21:19 +0200
Jiri Olsa <jolsa@kernel.org> wrote:

> Adding new uprobe syscall that calls uprobe handlers for given
> 'breakpoint' address.

[...]

Looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks!

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 07/20, Jiri Olsa wrote:
>
> Adding new uprobe syscall that calls uprobe handlers for given
> 'breakpoint' address.

[...]

My ack still stands,

Acked-by: Oleg Nesterov <oleg@redhat.com>
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 56101b69c9190667f473b9f93f8b6d8209aaa816
Gitweb: https://git.kernel.org/tip/56101b69c9190667f473b9f93f8b6d8209aaa816
Author: Jiri Olsa <jolsa@kernel.org>
AuthorDate: Sun, 20 Jul 2025 13:21:19 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 21 Aug 2025 20:09:20 +02:00
uprobes/x86: Add uprobe syscall to speed up uprobe
Adding new uprobe syscall that calls uprobe handlers for given
'breakpoint' address.
The idea is that the 'breakpoint' address calls the user space
trampoline which executes the uprobe syscall.
The syscall handler reads the return address of the initial call
to retrieve the original 'breakpoint' address. With this address
we find the related uprobe object and call its consumers.
Adding the arch_uprobe_trampoline_mapping function that provides
uprobe trampoline mapping. This mapping is backed with one global
page initialized at __init time and shared by the all the mapping
instances.
We do not allow to execute uprobe syscall if the caller is not
from uprobe trampoline mapping.
The uprobe syscall ensures the consumer (bpf program) sees registers
values in the state before the trampoline was called.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20250720112133.244369-10-jolsa@kernel.org
---
arch/x86/entry/syscalls/syscall_64.tbl | 1 +-
arch/x86/kernel/uprobes.c | 139 ++++++++++++++++++++++++-
include/linux/syscalls.h | 2 +-
include/linux/uprobes.h | 1 +-
kernel/events/uprobes.c | 17 +++-
kernel/sys_ni.c | 1 +-
6 files changed, 161 insertions(+)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92cf0fe..ced2a1d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
335 common uretprobe sys_uretprobe
+336 common uprobe sys_uprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c4dcbd..d18e1ae 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
destroy_uprobe_trampoline(tramp);
}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+ struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+ return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ bool found, retry = true;
+ unsigned int seq;
+
+ rcu_read_lock();
+ if (mmap_lock_speculate_try_begin(mm, &seq)) {
+ found = __in_uprobe_trampoline(ip);
+ retry = mmap_lock_speculate_retry(mm, seq);
+ }
+ rcu_read_unlock();
+
+ if (retry) {
+ mmap_read_lock(mm);
+ found = __in_uprobe_trampoline(ip);
+ mmap_read_unlock(mm);
+ }
+ return found;
+}
+
+/*
+ * See uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+ unsigned long ax;
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uprobe_syscall_args args;
+ unsigned long ip, sp;
+ int err;
+
+ /* Allow execution only from uprobe trampolines. */
+ if (!in_uprobe_trampoline(regs->ip))
+ goto sigill;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ ip = regs->ip;
+
+ /*
+ * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+ * - adjust ip to the probe address, call saved next instruction address
+ * - adjust sp to the probe's stack frame (check trampoline code)
+ */
+ regs->ax = args.ax;
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ip = args.retaddr - 5;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ sp = regs->sp;
+
+ handle_syscall_uprobe(regs, regs->ip);
+
+ /*
+ * Some of the uprobe consumers has changed sp, we can do nothing,
+ * just return via iret.
+ */
+ if (regs->sp != sp) {
+ /* skip the trampoline call */
+ if (args.retaddr - 5 == regs->ip)
+ regs->ip += 5;
+ return regs->ax;
+ }
+
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed ax/r11/cx */
+ args.ax = regs->ax;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /* keep return address unless we are instructed otherwise */
+ if (args.retaddr - 5 != regs->ip)
+ args.retaddr = regs->ip;
+
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+ return 0;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ "uprobe_trampoline_entry:\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "push %rax\n"
+ "movq $" __stringify(__NR_uprobe) ", %rax\n"
+ "syscall\n"
+ "pop %rax\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ "ret\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ ".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+ tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+ return 0;
+}
+
+late_initcall(arch_uprobes_init);
+

#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 77f45e5..66c06fc 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);

asmlinkage long sys_uretprobe(void);

+asmlinkage long sys_uprobe(void);
+
/* pciconfig: alpha, arm, arm64, ia64, sparc */
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b40d33a..b6b077c 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void);
extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
extern void arch_uprobe_clear_state(struct mm_struct *mm);
extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2cd7a4c..eb07e60 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2771,6 +2771,23 @@ out:
rcu_read_unlock_trace();
}

+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
+}
+
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a869..bf5d05c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);

COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
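
[Editor's note: one detail in the merged code that is easy to miss is
the repeated "- 5" arithmetic in SYSCALL_DEFINE0(uprobe): the probed
site is a 5-byte nop rewritten into a 5-byte call to the trampoline,
so the return address that call pushes is the probe address plus 5.
A minimal worked example follows; the addresses are purely
illustrative values of our own, not anything from the patch.]

	#include <stdio.h>

	int main(void)
	{
		/*
		 * Suppose a nop5 at 0x401000 was rewritten to a 5-byte
		 * "call trampoline" (e8 xx xx xx xx). The call pushes the
		 * address of the next instruction, 0x401005.
		 */
		unsigned long retaddr = 0x401005;	/* args.retaddr, read off the stack */
		unsigned long bp_vaddr = retaddr - 5;	/* back to the probed address */

		/*
		 * sys_uprobe does the same: regs->ip = args.retaddr - 5, so
		 * handle_syscall_uprobe() looks up the uprobe registered at
		 * the original nop5 site (0x401000 here). Later, the check
		 * "args.retaddr - 5 != regs->ip" detects whether a consumer
		 * redirected the instruction pointer while it ran.
		 */
		printf("probe address: 0x%lx\n", bp_vaddr);
		return 0;
	}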