Adding support for a special mapping for the user space trampoline,
with the following functions:
uprobe_trampoline_get - find or add uprobe_trampoline
uprobe_trampoline_put - remove or destroy uprobe_trampoline
The user space trampoline is exported as an arch-specific user space special
mapping through tramp_mapping, which is initialized in the following changes
together with the new uprobe syscall.
The uprobe trampoline needs to be callable/reachable from the probed address,
so while searching for an available address we use the is_reachable_by_call
function to decide whether the uprobe trampoline is callable from the probe
address. An x86_64 near call is 5 bytes (opcode plus a signed 32-bit
displacement relative to the next instruction), hence the vaddr + 5 in the
check.
All uprobe_trampoline objects are stored in the uprobes_state object and are
cleaned up when the process mm_struct goes down. New arch hooks are added for
that, because this change is x86_64 specific.
Locking is provided by the callers in the following changes.
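
For illustration, the intended get/put pairing looks roughly like this (a
minimal sketch; the real callers arrive in the following changes, hold the
required locks, and the error value here is just a placeholder):

	/*
	 * Hypothetical caller: find or create a trampoline reachable
	 * from vaddr, drop the reference when done with it.
	 */
	struct uprobe_trampoline *tramp;

	tramp = uprobe_trampoline_get(vaddr);
	if (!tramp)
		return -ENOMEM;
	/* ... emit a 5-byte call from vaddr to tramp->vaddr ... */
	uprobe_trampoline_put(tramp);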
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
arch/x86/kernel/uprobes.c | 131 ++++++++++++++++++++++++++++++++++++++
include/linux/uprobes.h | 6 ++
kernel/events/uprobes.c | 10 +++
kernel/fork.c | 1 +
4 files changed, 148 insertions(+)
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 77050e5a4680..023c55d52138 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -608,6 +608,137 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
*sr = utask->autask.saved_scratch_register;
}
}
+
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+ .name = "[uprobes-trampoline]",
+ .mremap = tramp_mremap,
+ .pages = tramp_mapping_pages,
+};
+
+struct uprobe_trampoline {
+ struct hlist_node node;
+ unsigned long vaddr;
+ atomic64_t ref;
+};
+
+static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
+{
+ long delta = (long)(vaddr + 5 - vtramp);
+
+ return delta >= INT_MIN && delta <= INT_MAX;
+}
+
+static unsigned long find_nearest_page(unsigned long vaddr)
+{
+ struct vm_area_struct *vma, *prev = NULL;
+ unsigned long prev_vm_end = PAGE_SIZE;
+ VMA_ITERATOR(vmi, current->mm, 0);
+
+ vma = vma_next(&vmi);
+ while (vma) {
+ if (prev)
+ prev_vm_end = prev->vm_end;
+ if (vma->vm_start - prev_vm_end >= PAGE_SIZE) {
+ if (is_reachable_by_call(prev_vm_end, vaddr))
+ return prev_vm_end;
+ if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr))
+ return vma->vm_start - PAGE_SIZE;
+ }
+ prev = vma;
+ vma = vma_next(&vmi);
+ }
+
+ return 0;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct mm_struct *mm = current->mm;
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+
+ if (!user_64bit_mode(regs))
+ return NULL;
+
+ vaddr = find_nearest_page(vaddr);
+ if (!vaddr)
+ return NULL;
+
+ tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+ if (unlikely(!tramp))
+ return NULL;
+
+ atomic64_set(&tramp->ref, 1);
+ tramp->vaddr = vaddr;
+
+ vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+ VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+ &tramp_mapping);
+ if (IS_ERR(vma))
+ goto free_area;
+ return tramp;
+
+free_area:
+ kfree(tramp);
+ return NULL;
+}
+
+__maybe_unused
+static struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr)
+{
+ struct uprobes_state *state = &current->mm->uprobes_state;
+ struct uprobe_trampoline *tramp = NULL;
+
+ hlist_for_each_entry(tramp, &state->head_tramps, node) {
+ if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+ atomic64_inc(&tramp->ref);
+ return tramp;
+ }
+ }
+
+ tramp = create_uprobe_trampoline(vaddr);
+ if (!tramp)
+ return NULL;
+
+ hlist_add_head(&tramp->node, &state->head_tramps);
+ return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+ hlist_del(&tramp->node);
+ kfree(tramp);
+}
+
+__maybe_unused
+static void uprobe_trampoline_put(struct uprobe_trampoline *tramp)
+{
+ if (tramp && atomic64_dec_and_test(&tramp->ref))
+ destroy_uprobe_trampoline(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+ INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+ struct uprobes_state *state = &mm->uprobes_state;
+ struct uprobe_trampoline *tramp;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+ destroy_uprobe_trampoline(tramp);
+}
#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 6af61e977bfb..bc532d086813 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -17,6 +17,7 @@
#include <linux/wait.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
+#include <linux/mutex.h>
struct uprobe;
struct vm_area_struct;
@@ -185,6 +186,9 @@ struct xol_area;
struct uprobes_state {
struct xol_area *xol_area;
+#ifdef CONFIG_X86_64
+ struct hlist_head head_tramps;
+#endif
};
typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
@@ -233,6 +237,8 @@ extern void uprobe_handle_trampoline(struct pt_regs *regs);
extern void *arch_uretprobe_trampoline(unsigned long *psize);
extern unsigned long uprobe_get_trampoline_vaddr(void);
extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
+extern void arch_uprobe_clear_state(struct mm_struct *mm);
+extern void arch_uprobe_init_state(struct mm_struct *mm);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d256c695d7ff..a3107f63f295 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1812,6 +1812,14 @@ static struct xol_area *get_xol_area(void)
return area;
}
+void __weak arch_uprobe_clear_state(struct mm_struct *mm)
+{
+}
+
+void __weak arch_uprobe_init_state(struct mm_struct *mm)
+{
+}
+
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1823,6 +1831,8 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
+ arch_uprobe_clear_state(mm);
+
if (!area)
return;
diff --git a/kernel/fork.c b/kernel/fork.c
index c4b26cd8998b..4c2df3816728 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1269,6 +1269,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
+ arch_uprobe_init_state(mm);
#endif
}
--
2.49.0
On 04/21, Jiri Olsa wrote:
>
> +struct uprobe_trampoline {
> + struct hlist_node node;
> + unsigned long vaddr;
> + atomic64_t ref;
> +};
I don't really understand the point of uprobe_trampoline->ref...
set_orig_insn/swbp_unoptimize paths don't call uprobe_trampoline_put().
It is only called in the unlikely case when swbp_optimize() fails, so perhaps
we can kill this member and uprobe_trampoline_put()? At least in the initial
version.
> +static void uprobe_trampoline_put(struct uprobe_trampoline *tramp)
> +{
> + if (tramp && atomic64_dec_and_test(&tramp->ref))
> + destroy_uprobe_trampoline(tramp);
> +}
Why does it check tramp != NULL ?
Oleg.
On Sun, Apr 27, 2025 at 08:04:32PM +0200, Oleg Nesterov wrote:
> On 04/21, Jiri Olsa wrote:
> >
> > +struct uprobe_trampoline {
> > + struct hlist_node node;
> > + unsigned long vaddr;
> > + atomic64_t ref;
> > +};
>
> I don't really understand the point of uprobe_trampoline->ref...
>
> set_orig_insn/swbp_unoptimize paths don't call uprobe_trampoline_put().
> It is only called in the unlikely case when swbp_optimize() fails, so perhaps
> we can kill this member and uprobe_trampoline_put()? At least in the initial
> version.
right, we can remove that
>
> > +static void uprobe_trampoline_put(struct uprobe_trampoline *tramp)
> > +{
> > + if (tramp && atomic64_dec_and_test(&tramp->ref))
> > + destroy_uprobe_trampoline(tramp);
> > +}
>
> Why does it check tramp != NULL ?
I think some earlier version of the code could have called that with NULL,
will remove that
thanks,
jirka
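
For reference, the simplification agreed on above would look roughly like this
(a sketch only; with the refcount gone, trampolines simply live until
arch_uprobe_clear_state() tears them down):

	/* sketch: uprobe_trampoline with the ref member dropped */
	struct uprobe_trampoline {
		struct hlist_node node;
		unsigned long vaddr;
	};

	static struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr)
	{
		struct uprobes_state *state = &current->mm->uprobes_state;
		struct uprobe_trampoline *tramp;

		hlist_for_each_entry(tramp, &state->head_tramps, node) {
			if (is_reachable_by_call(tramp->vaddr, vaddr))
				return tramp;
		}

		tramp = create_uprobe_trampoline(vaddr);
		if (tramp)
			hlist_add_head(&tramp->node, &state->head_tramps);
		return tramp;
	}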
On 04/21, Jiri Olsa wrote:
>
> +static unsigned long find_nearest_page(unsigned long vaddr)
> +{
> + struct vm_area_struct *vma, *prev = NULL;
> + unsigned long prev_vm_end = PAGE_SIZE;
> + VMA_ITERATOR(vmi, current->mm, 0);
> +
> + vma = vma_next(&vmi);
> + while (vma) {
> + if (prev)
> + prev_vm_end = prev->vm_end;
> + if (vma->vm_start - prev_vm_end >= PAGE_SIZE) {
> + if (is_reachable_by_call(prev_vm_end, vaddr))
> + return prev_vm_end;
> + if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr))
> + return vma->vm_start - PAGE_SIZE;
> + }
> + prev = vma;
> + vma = vma_next(&vmi);
> + }
> +
> + return 0;
> +}
This can be simplified afaics... We don't really need prev, and we can
use for_each_vma(),
static unsigned long find_nearest_page(unsigned long vaddr)
{
struct vm_area_struct *vma;
unsigned long prev_vm_end = PAGE_SIZE;
VMA_ITERATOR(vmi, current->mm, 0);
for_each_vma(vmi, vma) {
if (vma->vm_start - prev_vm_end >= PAGE_SIZE) {
if (is_reachable_by_call(prev_vm_end, vaddr))
return prev_vm_end;
if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr))
return vma->vm_start - PAGE_SIZE;
}
prev_vm_end = vma->vm_end;
}
return 0;
}
> +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
> +{
> + struct pt_regs *regs = task_pt_regs(current);
> + struct mm_struct *mm = current->mm;
> + struct uprobe_trampoline *tramp;
> + struct vm_area_struct *vma;
> +
> + if (!user_64bit_mode(regs))
> + return NULL;
Cosmetic, but I think it would be better to move this check into the
caller, uprobe_trampoline_get().
> + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
> + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
> + &tramp_mapping);
Note that xol_add_vma() -> _install_special_mapping() uses VM_SEALED_SYSMAP.
Perhaps create_uprobe_trampoline() should use this flag too for consistency?
Oleg.
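
For illustration, that suggestion would amount to something like the following
(a sketch, assuming VM_SEALED_SYSMAP is available, as in xol_add_vma()):

	/* sketch: seal the trampoline mapping the way xol_add_vma() does */
	vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
			VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|
			VM_DONTCOPY|VM_IO|VM_SEALED_SYSMAP,
			&tramp_mapping);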
On 04/27, Oleg Nesterov wrote:
>
> On 04/21, Jiri Olsa wrote:
> >
> > +static unsigned long find_nearest_page(unsigned long vaddr)
> > +{
> > + struct vm_area_struct *vma, *prev = NULL;
> > + unsigned long prev_vm_end = PAGE_SIZE;
> > + VMA_ITERATOR(vmi, current->mm, 0);
> > +
> > + vma = vma_next(&vmi);
> > + while (vma) {
> > + if (prev)
> > + prev_vm_end = prev->vm_end;
> > + if (vma->vm_start - prev_vm_end >= PAGE_SIZE) {
> > + if (is_reachable_by_call(prev_vm_end, vaddr))
> > + return prev_vm_end;
> > + if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr))
> > + return vma->vm_start - PAGE_SIZE;
> > + }
> > + prev = vma;
> > + vma = vma_next(&vmi);
> > + }
> > +
> > + return 0;
> > +}
>
> This can be simplified afaics... We don't really need prev, and we can
> use for_each_vma(),
>
> static unsigned long find_nearest_page(unsigned long vaddr)
> {
> struct vm_area_struct *vma;
> unsigned long prev_vm_end = PAGE_SIZE;
> VMA_ITERATOR(vmi, current->mm, 0);
>
> for_each_vma(vmi, vma) {
> if (vma->vm_start - prev_vm_end >= PAGE_SIZE) {
> if (is_reachable_by_call(prev_vm_end, vaddr))
> return prev_vm_end;
> if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr))
> return vma->vm_start - PAGE_SIZE;
> }
> prev_vm_end = vma->vm_end;
> }
>
> return 0;
> }
Either way it doesn't look nice. If nothing else, we should respect
vm_start/end_gap(vma).
Can't we do something like
struct vm_unmapped_area_info info = {};
info.length = PAGE_SIZE;
info.low_limit = vaddr - INT_MIN + 5;
info.high_limit = vaddr + INT_MAX;
info.flags = VM_UNMAPPED_AREA_TOPDOWN; // makes sense?
return vm_unmapped_area(&info);
instead ?
Oleg.
On Sun, Apr 27, 2025 at 07:34:56PM +0200, Oleg Nesterov wrote:
> On 04/27, Oleg Nesterov wrote:
> >
> > On 04/21, Jiri Olsa wrote:
> > >
> > > +static unsigned long find_nearest_page(unsigned long vaddr)
> > > +{
> > > + struct vm_area_struct *vma, *prev = NULL;
> > > + unsigned long prev_vm_end = PAGE_SIZE;
> > > + VMA_ITERATOR(vmi, current->mm, 0);
> > > +
> > > + vma = vma_next(&vmi);
> > > + while (vma) {
> > > + if (prev)
> > > + prev_vm_end = prev->vm_end;
> > > + if (vma->vm_start - prev_vm_end >= PAGE_SIZE) {
> > > + if (is_reachable_by_call(prev_vm_end, vaddr))
> > > + return prev_vm_end;
> > > + if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr))
> > > + return vma->vm_start - PAGE_SIZE;
> > > + }
> > > + prev = vma;
> > > + vma = vma_next(&vmi);
> > > + }
> > > +
> > > + return 0;
> > > +}
> >
> > This can be simplified afaics... We don't really need prev, and we can
> > use for_each_vma(),
> >
> > static unsigned long find_nearest_page(unsigned long vaddr)
> > {
> > struct vm_area_struct *vma;
> > unsigned long prev_vm_end = PAGE_SIZE;
> > VMA_ITERATOR(vmi, current->mm, 0);
> >
> > for_each_vma(vmi, vma) {
> > if (vma->vm_start - prev_vm_end >= PAGE_SIZE) {
> > if (is_reachable_by_call(prev_vm_end, vaddr))
> > return prev_vm_end;
> > if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr))
> > return vma->vm_start - PAGE_SIZE;
> > }
> > prev_vm_end = vma->vm_end;
> > }
> >
> > return 0;
> > }
>
> Either way it doesn't look nice. If nothing else, we should respect
> vm_start/end_gap(vma).
>
> Can't we do something like
>
> struct vm_unmapped_area_info info = {};
>
> info.length = PAGE_SIZE;
> info.low_limit = vaddr - INT_MIN + 5;
> info.high_limit = vaddr + INT_MAX;
>
> info.flags = VM_UNMAPPED_AREA_TOPDOWN; // makes sense?
so this would return the highest available space, right? The current code goes
from the bottom now; not sure what's preferred.
>
> return vm_unmapped_area(&info);
>
> instead ?
yes, I did not realize we could use this, looks better, will try that
thanks,
jirka
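
For reference, a sketch of the vm_unmapped_area() variant being discussed. Note
the reachable window for the trampoline start is [vaddr + 5 - INT_MAX,
vaddr + 5 - INT_MIN]; the limits below are clamped so they stay inside the user
address space, and mmap_lock is assumed to be held by the caller:

	/*
	 * Sketch only: let the MM core pick a free slot (respecting guard
	 * gaps) inside the call-reachable window around vaddr.
	 */
	static unsigned long find_nearest_page(unsigned long vaddr)
	{
		struct vm_unmapped_area_info info = {};
		unsigned long low, high, addr;

		/* the trampoline start must satisfy is_reachable_by_call() */
		low  = (vaddr + 5 > (unsigned long)INT_MAX) ?
			vaddr + 5 - INT_MAX : PAGE_SIZE;
		high = vaddr + 5 - (unsigned long)INT_MIN;

		info.flags      = VM_UNMAPPED_AREA_TOPDOWN;
		info.length     = PAGE_SIZE;
		info.low_limit  = max(low, (unsigned long)PAGE_SIZE);
		info.high_limit = min(high, TASK_SIZE);

		addr = vm_unmapped_area(&info);
		return IS_ERR_VALUE(addr) ? 0 : addr;
	}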
On Mon, Apr 21, 2025 at 2:46 PM Jiri Olsa <jolsa@kernel.org> wrote:
> [...]

Acked-by: Andrii Nakryiko <andrii@kernel.org>