Putting together all the previously added pieces to support optimized
uprobes on top of the 5-byte nop instruction.
The current uprobe execution goes through the following steps:
- installs a breakpoint instruction over the original instruction
- the exception handler is hit and calls the related uprobe consumers
- either simulates the original instruction or does out-of-line single-step
execution of it
- returns to user space
The optimized uprobe path does the following:
- checks that the original instruction is a 5-byte nop (plus other checks)
- adds (or reuses an existing) user space trampoline with the uprobe syscall
- overwrites the original instruction (5-byte nop) with a call to the user
space trampoline
- the user space trampoline executes the uprobe syscall, which calls the
related uprobe consumers
- the trampoline returns back to the next instruction
This approach won't speed up all uprobes, as it's limited to nop5 as the
original instruction, but we plan to use nop5 as the USDT probe instruction
(which currently uses a single-byte nop) and thus speed up USDT probes.
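For illustration, this is roughly what a user space probe site emitting such
a nop5 could look like (a sketch only, not part of this patch; the attach side
via perf or a BPF uprobe link is omitted, and 0f 1f 44 00 00 is the 5-byte nop
encoding the optimization expects, i.e. x86_nops[5]):

  #include <stdio.h>

  /* emit the 5-byte nop: nopl 0x0(%rax,%rax,1) */
  #define PROBE_NOP5() asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00")

  int main(void)
  {
          int i;

          for (i = 0; i < 3; i++) {
                  PROBE_NOP5();   /* attach the uprobe here */
                  printf("hit %d\n", i);
          }
          return 0;
  }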
arch_uprobe_optimize triggers the uprobe optimization and is called after the
first uprobe hit. I originally had it called on uprobe installation, but that
clashed with the ELF loader, because the user space trampoline was added in a
place where the loader might need to put ELF segments, so I decided to do it
after the first uprobe hit, when loading is done.
The uprobe is un-optimized in the arch-specific set_orig_insn call.
The instruction overwrite is x86 arch specific and needs to go through 3
updates (on top of the nop5 instruction):
- write int3 into the 1st byte
- write the last 4 bytes of the call instruction
- update the call instruction opcode
And the cleanup goes through similar stages in reverse:
- overwrite the call opcode with a breakpoint (int3)
- write the last 4 bytes of the nop5 instruction
- write the first byte of the nop5 instruction
A byte-level sketch of both sequences follows the lists.
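Here's a stand-alone user space sketch of those byte sequences (illustration
only; the addresses are made up, and in the kernel the writes go through
uprobe_write() with text_poke_sync() between the stages):

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  static void dump(const char *what, const uint8_t *b)
  {
          printf("%-20s %02x %02x %02x %02x %02x\n",
                 what, b[0], b[1], b[2], b[3], b[4]);
  }

  int main(void)
  {
          static const uint8_t nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
          uint64_t vaddr = 0x401000, tramp = 0x403000;  /* made-up addresses */
          int32_t rel = (int32_t)(tramp - (vaddr + 5)); /* call displacement */
          uint8_t insn[5];

          memcpy(insn, nop5, 5);
          dump("nop5", insn);

          /* optimize: nop5 -> call tramp */
          insn[0] = 0xcc;                   /* 1) int3 into the 1st byte */
          dump("int3 installed", insn);
          memcpy(insn + 1, &rel, 4);        /* 2) last 4 bytes of the call (LE) */
          dump("call bytes written", insn);
          insn[0] = 0xe8;                   /* 3) call opcode replaces int3 */
          dump("call installed", insn);

          /* cleanup: call tramp -> nop5, same stages in reverse */
          insn[0] = 0xcc;                   /* 1) int3 over the call opcode */
          memcpy(insn + 1, nop5 + 1, 4);    /* 2) last 4 bytes of nop5 */
          insn[0] = nop5[0];                /* 3) first byte of nop5 */
          dump("nop5 restored", insn);
          return 0;
  }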
We do not unmap and release the uprobe trampoline when it's no longer needed,
because there's no easy way to make sure none of the threads is still
inside the trampoline. But we do not waste memory, because there's just a
single page backing all the uprobe trampoline mappings.
We do waste a page mapping for every 4GB region by keeping the uprobe
trampoline page mapped, but that seems ok.
We benefit from the fact that set_swbp and set_orig_insn are
called under mmap_write_lock(mm), so we can use the current instruction
as the state the uprobe is in - nop5/breakpoint/call trampoline -
and decide the needed action (optimize/un-optimize) based on that.
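A simplified sketch of that decision, keyed off the first byte at the probe
address (illustration only; the real code reads all 5 bytes and, for the call
case, also checks that the target lies within the uprobe trampoline):

  #include <stdio.h>

  enum probe_state { PS_NOP5, PS_SWBP, PS_CALL, PS_OTHER };

  static enum probe_state probe_state(unsigned char first_byte)
  {
          switch (first_byte) {
          case 0x0f: return PS_NOP5;  /* original nop5, nothing installed */
          case 0xcc: return PS_SWBP;  /* breakpoint, candidate to optimize */
          case 0xe8: return PS_CALL;  /* call to trampoline, already optimized */
          default:   return PS_OTHER;
          }
  }

  int main(void)
  {
          /* e.g. set_orig_insn() needs to un-optimize only in the PS_CALL case */
          printf("%d\n", probe_state(0xe8) == PS_CALL);
          return 0;
  }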
Attaching the speedup from the benchs/run_bench_uprobes.sh script:
current:
usermode-count : 152.604 ± 0.044M/s
syscall-count : 13.359 ± 0.042M/s
--> uprobe-nop : 3.229 ± 0.002M/s
uprobe-push : 3.086 ± 0.004M/s
uprobe-ret : 1.114 ± 0.004M/s
uprobe-nop5 : 1.121 ± 0.005M/s
uretprobe-nop : 2.145 ± 0.002M/s
uretprobe-push : 2.070 ± 0.001M/s
uretprobe-ret : 0.931 ± 0.001M/s
uretprobe-nop5 : 0.957 ± 0.001M/s
after the change:
usermode-count : 152.448 ± 0.244M/s
syscall-count : 14.321 ± 0.059M/s
uprobe-nop : 3.148 ± 0.007M/s
uprobe-push : 2.976 ± 0.004M/s
uprobe-ret : 1.068 ± 0.003M/s
--> uprobe-nop5 : 7.038 ± 0.007M/s
uretprobe-nop : 2.109 ± 0.004M/s
uretprobe-push : 2.035 ± 0.001M/s
uretprobe-ret : 0.908 ± 0.001M/s
uretprobe-nop5 : 3.377 ± 0.009M/s
I see a bit more speedup on Intel (above) compared to AMD. The big nop5
speedup is partly due to emulating nop5 and partly due to the optimization.
The key speedup we do this for is the USDT switch from nop to nop5:
uprobe-nop : 3.148 ± 0.007M/s
uprobe-nop5 : 7.038 ± 0.007M/s
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
arch/x86/include/asm/uprobes.h | 7 +
arch/x86/kernel/uprobes.c | 281 ++++++++++++++++++++++++++++++++-
include/linux/uprobes.h | 6 +-
kernel/events/uprobes.c | 15 +-
4 files changed, 301 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..1ee2e5115955 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
#define UPROBE_SWBP_INSN 0xcc
#define UPROBE_SWBP_INSN_SIZE 1
+enum {
+ ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0,
+ ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1,
+};
+
struct uprobe_xol_ops;
struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
u8 ilen;
} push;
};
+
+ unsigned long flags;
};
struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 01b3035e01ea..d5ef04a1626d 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <asm/insn.h>
#include <asm/mmu_context.h>
+#include <asm/nops.h>
/* Post-execution fixups. */
@@ -691,7 +692,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
return NULL;
}
-__maybe_unused
static struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr)
{
struct uprobes_state *state = &current->mm->uprobes_state;
@@ -718,7 +718,6 @@ static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
kfree(tramp);
}
-__maybe_unused
static void uprobe_trampoline_put(struct uprobe_trampoline *tramp)
{
if (tramp && atomic64_dec_and_test(&tramp->ref))
@@ -861,6 +860,277 @@ static int __init arch_uprobes_init(void)
late_initcall(arch_uprobes_init);
+enum {
+ OPT_PART,
+ OPT_INSN,
+ UNOPT_INT3,
+ UNOPT_PART,
+};
+
+struct write_opcode_ctx {
+ unsigned long base;
+ int update;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+ return *insn == CALL_INSN_OPCODE;
+}
+
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+ int nbytes, void *data)
+{
+ struct write_opcode_ctx *ctx = data;
+ uprobe_opcode_t old_opcode[5];
+
+ uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+ switch (ctx->update) {
+ case OPT_PART:
+ case OPT_INSN:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case UNOPT_INT3:
+ if (is_call_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case UNOPT_PART:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+static int write_insn(struct vm_area_struct *vma, unsigned long vaddr,
+ uprobe_opcode_t *insn, int nbytes, void *ctx)
+{
+ return uprobe_write(vma, vaddr, insn, nbytes, verify_insn, true, ctx);
+}
+
+static void relative_call(void *dest, long from, long to)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *insn;
+
+ insn = (struct __arch_relative_insn *)dest;
+ insn->raddr = (s32)(to - (from + 5));
+ insn->op = CALL_INSN_OPCODE;
+}
+
+static int swbp_optimize(struct vm_area_struct *vma, unsigned long vaddr, unsigned long tramp)
+{
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ char call[5];
+ int err;
+
+ relative_call(call, vaddr, tramp);
+
+ /*
+ * We are in state where breakpoint (int3) is installed on top of first
+ * byte of the nop5 instruction. We will do following steps to overwrite
+ * this to call instruction:
+ *
+ * - sync cores
+ * - write last 4 bytes of the call instruction
+ * - sync cores
+ * - update the call instruction opcode
+ */
+
+ text_poke_sync();
+
+ ctx.update = OPT_PART;
+ err = write_insn(vma, vaddr + 1, call + 1, 4, &ctx);
+ if (err)
+ return err;
+
+ text_poke_sync();
+
+ ctx.update = OPT_INSN;
+ return write_insn(vma, vaddr, call, 1, &ctx);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ int err;
+
+ /*
+ * We need to overwrite call instruction into nop5 instruction with
+ * breakpoint (int3) installed on top of its first byte. We will:
+ *
+ * - overwrite call opcode with breakpoint (int3)
+ * - sync cores
+ * - write last 4 bytes of the nop5 instruction
+ * - sync cores
+ */
+
+ ctx.update = UNOPT_INT3;
+ err = write_insn(vma, vaddr, &int3, 1, &ctx);
+ if (err)
+ return err;
+
+ text_poke_sync();
+
+ ctx.update = UNOPT_PART;
+ err = write_insn(vma, vaddr + 1, (uprobe_opcode_t *) auprobe->insn + 1, 4, &ctx);
+
+ text_poke_sync();
+ return err;
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+ unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ uprobe_copy_from_page(page, vaddr, dst, len);
+ put_page(page);
+ return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *call = (struct __arch_relative_insn *) insn;
+
+ if (!is_call_insn(insn))
+ return false;
+ return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized)
+{
+ uprobe_opcode_t insn[5];
+ int err;
+
+ err = copy_from_vaddr(mm, vaddr, &insn, 5);
+ if (err)
+ return err;
+ *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+ return 0;
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+ return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+ test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (should_optimize(auprobe)) {
+ bool optimized = false;
+ int err;
+
+ /*
+ * We could race with another thread that already optimized the probe,
+ * so let's not overwrite it with int3 again in this case.
+ */
+ err = is_optimized(vma->vm_mm, vaddr, &optimized);
+ if (err || optimized)
+ return err;
+ }
+ return uprobe_write_opcode(vma, vaddr, UPROBE_SWBP_INSN, true);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+ struct mm_struct *mm = vma->vm_mm;
+ bool optimized = false;
+ int err;
+
+ err = is_optimized(mm, vaddr, &optimized);
+ if (err)
+ return err;
+ if (optimized)
+ WARN_ON_ONCE(swbp_unoptimize(auprobe, vma, vaddr));
+ }
+ return uprobe_write_opcode(vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false);
+}
+
+static int __arch_uprobe_optimize(struct mm_struct *mm, unsigned long vaddr)
+{
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ return -1;
+ tramp = uprobe_trampoline_get(vaddr);
+ if (!tramp)
+ return -1;
+ err = swbp_optimize(vma, vaddr, tramp->vaddr);
+ if (WARN_ON_ONCE(err))
+ uprobe_trampoline_put(tramp);
+ return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn[5];
+
+ /*
+ * Do not optimize if shadow stack is enabled, the return address hijack
+ * code in arch_uretprobe_hijack_return_addr updates wrong frame when
+ * the entry uprobe is optimized and the shadow stack crashes the app.
+ */
+ if (shstk_is_enabled())
+ return;
+
+ if (!should_optimize(auprobe))
+ return;
+
+ mmap_write_lock(mm);
+
+ /*
+ * Check if some other thread already optimized the uprobe for us,
+ * if it's the case just go away silently.
+ */
+ if (copy_from_vaddr(mm, vaddr, &insn, 5))
+ goto unlock;
+ if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+ goto unlock;
+
+ /*
+ * If we fail to optimize the uprobe we set the fail bit so the
+ * above should_optimize will fail from now on.
+ */
+ if (__arch_uprobe_optimize(mm, vaddr))
+ set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+ mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ if (memcmp(&auprobe->insn, x86_nops[5], 5))
+ return false;
+ /* We can't do cross page atomic writes yet. */
+ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
@@ -874,6 +1144,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ return false;
+}
#endif /* CONFIG_X86_64 */
struct uprobe_xol_ops {
@@ -1240,6 +1514,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
if (ret)
return ret;
+ if (can_optimize(auprobe, addr))
+ set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
ret = branch_setup_xol_ops(auprobe, &insn);
if (ret != -ENOSYS)
return ret;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index bbe218ff16cc..d4c1fed9a9e4 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -192,7 +192,7 @@ struct uprobes_state {
};
typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
- uprobe_opcode_t *insn, int nbytes);
+ uprobe_opcode_t *insn, int nbytes, void *data);
extern void __init uprobes_init(void);
extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
extern int uprobe_write_opcode(struct vm_area_struct *vma, unsigned long vaddr,
uprobe_opcode_t opcode, bool is_register);
extern int uprobe_write(struct vm_area_struct *vma, const unsigned long insn_vaddr,
- uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register);
+ uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register,
+ void *data);
extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *
extern void arch_uprobe_clear_state(struct mm_struct *mm);
extern void arch_uprobe_init_state(struct mm_struct *mm);
extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 97a7b9f0c7ca..408a134c1a7b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
}
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
- int nbytes)
+ int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -491,12 +491,12 @@ int uprobe_write_opcode(struct vm_area_struct *vma, const unsigned long opcode_v
uprobe_opcode_t opcode, bool is_register)
{
return uprobe_write(vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
- verify_opcode, is_register);
+ verify_opcode, is_register, NULL);
}
int uprobe_write(struct vm_area_struct *vma, const unsigned long insn_vaddr,
uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify,
- bool is_register)
+ bool is_register, void *data)
{
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
@@ -527,7 +527,7 @@ int uprobe_write(struct vm_area_struct *vma, const unsigned long insn_vaddr,
goto out;
folio = page_folio(page);
- ret = verify(page, insn_vaddr, insn, nbytes);
+ ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
@@ -2707,6 +2707,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2771,6 +2775,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
--
2.49.0
I didn't actually read this patch yet, but let me ask anyway...
On 04/21, Jiri Olsa wrote:
>
> +static int swbp_optimize(struct vm_area_struct *vma, unsigned long vaddr, unsigned long tramp)
> +{
> + struct write_opcode_ctx ctx = {
> + .base = vaddr,
> + };
> + char call[5];
> + int err;
> +
> + relative_call(call, vaddr, tramp);
> +
> + /*
> + * We are in state where breakpoint (int3) is installed on top of first
> + * byte of the nop5 instruction. We will do following steps to overwrite
> + * this to call instruction:
> + *
> + * - sync cores
> + * - write last 4 bytes of the call instruction
> + * - sync cores
> + * - update the call instruction opcode
> + */
> +
> + text_poke_sync();
Hmm. I would like to understand why exactly we need at least this first
text_poke_sync() before "write last 4 bytes of the call instruction".
And... I don't suggest doing this right now, but I am wondering if we can
use mm_cpumask(vma->vm_mm) later; I guess we don't care if we race with
switch_mm_irqs_off(), which can add another CPU to this mask...
> +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> +{
> + struct mm_struct *mm = current->mm;
> + uprobe_opcode_t insn[5];
> +
> + /*
> + * Do not optimize if shadow stack is enabled, the return address hijack
> + * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> + * the entry uprobe is optimized and the shadow stack crashes the app.
> + */
> + if (shstk_is_enabled())
> + return;
Not sure I fully understand the comment/problem, but what if
prctl(ARCH_SHSTK_ENABLE) is called after arch_uprobe_optimize() succeeds?
Oleg.
On Sun, Apr 27, 2025 at 07:11:43PM +0200, Oleg Nesterov wrote:
> I didn't actually read this patch yet, but let me ask anyway...
>
> On 04/21, Jiri Olsa wrote:
> >
> > +static int swbp_optimize(struct vm_area_struct *vma, unsigned long vaddr, unsigned long tramp)
> > +{
> > + struct write_opcode_ctx ctx = {
> > + .base = vaddr,
> > + };
> > + char call[5];
> > + int err;
> > +
> > + relative_call(call, vaddr, tramp);
> > +
> > + /*
> > + * We are in state where breakpoint (int3) is installed on top of first
> > + * byte of the nop5 instruction. We will do following steps to overwrite
> > + * this to call instruction:
> > + *
> > + * - sync cores
> > + * - write last 4 bytes of the call instruction
> > + * - sync cores
> > + * - update the call instruction opcode
> > + */
> > +
> > + text_poke_sync();
>
> Hmm. I would like to understand why exactly we need at least this first
> text_poke_sync() before "write last 4 bytes of the call instruction".
I followed David's comment in here:
https://lore.kernel.org/bpf/e206df95d98d4cbab77824cf7a32a80f@AcuMS.aculab.com/
> That might work provided there are IPI (to flush the decode pipeline)
> after the write of the 'int3' and one before the write of the 'call'.
> You'll need to ensure the I-cache gets invalidated as well.
swbp_optimize is called when there's already an int3 in place, so the first
text_poke_sync() serves as the IPI after that int3 write, before the remaining
call bytes are written
>
>
> And... I don't suggest to do this right now, but I am wondering if we can
> use mm_cpumask(vma->vm_mm) later, I guess we don't care if we race with
> switch_mm_irqs_off() which can add another CPU to this mask...
hum, probably..
>
> > +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> > +{
> > + struct mm_struct *mm = current->mm;
> > + uprobe_opcode_t insn[5];
> > +
> > + /*
> > + * Do not optimize if shadow stack is enabled, the return address hijack
> > + * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> > + * the entry uprobe is optimized and the shadow stack crashes the app.
> > + */
> > + if (shstk_is_enabled())
> > + return;
>
> Not sure I fully understand the comment/problem, but what if
> prctl(ARCH_SHSTK_ENABLE) is called after arch_uprobe_optimize() succeeds?
I'll address this in separate email
thanks,
jirka
On Sun, Apr 27, 2025 at 07:11:43PM +0200, Oleg Nesterov wrote:
SNIP
> > +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> > +{
> > + struct mm_struct *mm = current->mm;
> > + uprobe_opcode_t insn[5];
> > +
> > + /*
> > + * Do not optimize if shadow stack is enabled, the return address hijack
> > + * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> > + * the entry uprobe is optimized and the shadow stack crashes the app.
> > + */
> > + if (shstk_is_enabled())
> > + return;
>
> Not sure I fully understand the comment/problem, but ...
the issue is that sys_uprobe adjusts rsp to skip the uprobe trampoline stack frame
(which is call + 3x push), so the uprobe consumers see the expected stack
then if we need to hijack the return address we:
- update the return address on the actual stack (at the adjusted rsp)
- update the shadow stack with shstk_update_last_frame (the last shadow stack frame)
which will cause a mismatch and the app crashes on the trampoline's ret instruction
I think we could make that work, but to keep it simple I think it's better
to skip it for now
> what if
> prctl(ARCH_SHSTK_ENABLE) is called after arch_uprobe_optimize() succeeds?
so that would look like this:
foo:
[int3 -> call tramp] hijack foo's return address
...
prctl(ARCH_SHSTK_ENABLE)
...
prctl(ARCH_SHSTK_DISABLE)
ret -> jumps to uretprobe trampoline
at the time 'prctl(ARCH_SHSTK_ENABLE)' is called the return address is already
hijacked/changed; in any case, IIUC, you need to disable the shadow stack before
'foo' returns
thanks,
jirka
On Mon, Apr 21, 2025 at 2:46 PM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Putting together all the previously added pieces to support optimized
> uprobes on top of 5-byte nop instruction.
>
SNIP
>
just minor nits, LGTM
Acked-by: Andrii Nakryiko <andrii@kernel.org>
> +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> + unsigned long vaddr)
> +{
> + if (should_optimize(auprobe)) {
> + bool optimized = false;
> + int err;
> +
> + /*
> + * We could race with another thread that already optimized the probe,
> + * so let's not overwrite it with int3 again in this case.
> + */
> + err = is_optimized(vma->vm_mm, vaddr, &optimized);
> + if (err || optimized)
> + return err;
IMO, this is a bit too clever, I'd go with plain
if (err)
return err;
if (optimized)
return 0; /* we are done */
(and mirror set_orig_insn() structure, consistently)
> + }
> + return uprobe_write_opcode(vma, vaddr, UPROBE_SWBP_INSN, true);
> +}
> +
> +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> + unsigned long vaddr)
> +{
> + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
> + struct mm_struct *mm = vma->vm_mm;
> + bool optimized = false;
> + int err;
> +
> + err = is_optimized(mm, vaddr, &optimized);
> + if (err)
> + return err;
> + if (optimized)
> + WARN_ON_ONCE(swbp_unoptimize(auprobe, vma, vaddr));
> + }
> + return uprobe_write_opcode(vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false);
> +}
> +
> +static int __arch_uprobe_optimize(struct mm_struct *mm, unsigned long vaddr)
> +{
> + struct uprobe_trampoline *tramp;
> + struct vm_area_struct *vma;
> + int err = 0;
> +
> + vma = find_vma(mm, vaddr);
> + if (!vma)
> + return -1;
this is EPERM, will be confusing to debug... why not -EINVAL?
> + tramp = uprobe_trampoline_get(vaddr);
> + if (!tramp)
> + return -1;
ditto
> + err = swbp_optimize(vma, vaddr, tramp->vaddr);
> + if (WARN_ON_ONCE(err))
> + uprobe_trampoline_put(tramp);
> + return err;
> +}
> +
[...]
On Tue, Apr 22, 2025 at 05:04:03PM -0700, Andrii Nakryiko wrote:
SNIP
> > arch/x86/include/asm/uprobes.h | 7 +
> > arch/x86/kernel/uprobes.c | 281 ++++++++++++++++++++++++++++++++-
> > include/linux/uprobes.h | 6 +-
> > kernel/events/uprobes.c | 15 +-
> > 4 files changed, 301 insertions(+), 8 deletions(-)
> >
>
> just minor nits, LGTM
>
> Acked-by: Andrii Nakryiko <andrii@kernel.org>
>
> > +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > + unsigned long vaddr)
> > +{
> > + if (should_optimize(auprobe)) {
> > + bool optimized = false;
> > + int err;
> > +
> > + /*
> > + * We could race with another thread that already optimized the probe,
> > + * so let's not overwrite it with int3 again in this case.
> > + */
> > + err = is_optimized(vma->vm_mm, vaddr, &optimized);
> > + if (err || optimized)
> > + return err;
>
> IMO, this is a bit too clever, I'd go with plain
>
> if (err)
> return err;
> if (optimized)
> return 0; /* we are done */
>
ok
> (and mirror set_orig_insn() structure, consistently)
set_orig_insn does that already, right?
>
>
> > + }
> > + return uprobe_write_opcode(vma, vaddr, UPROBE_SWBP_INSN, true);
> > +}
> > +
> > +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > + unsigned long vaddr)
> > +{
> > + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
> > + struct mm_struct *mm = vma->vm_mm;
> > + bool optimized = false;
> > + int err;
> > +
> > + err = is_optimized(mm, vaddr, &optimized);
> > + if (err)
> > + return err;
> > + if (optimized)
> > + WARN_ON_ONCE(swbp_unoptimize(auprobe, vma, vaddr));
> > + }
> > + return uprobe_write_opcode(vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false);
> > +}
> > +
> > +static int __arch_uprobe_optimize(struct mm_struct *mm, unsigned long vaddr)
> > +{
> > + struct uprobe_trampoline *tramp;
> > + struct vm_area_struct *vma;
> > + int err = 0;
> > +
> > + vma = find_vma(mm, vaddr);
> > + if (!vma)
> > + return -1;
>
> this is EPERM, will be confusing to debug... why not -EINVAL?
>
> > + tramp = uprobe_trampoline_get(vaddr);
> > + if (!tramp)
> > + return -1;
>
> ditto
so the error value is not exposed to user space in this case; we try to
optimize on the first hit with:
handle_swbp()
{
arch_uprobe_optimize()
{
if (__arch_uprobe_optimize(mm, vaddr))
set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
}
}
and we set the ARCH_UPROBE_FLAG_OPTIMIZE_FAIL flag bit in case of error;
plus there's a WARN around swbp_optimize, which should succeed once we
get that far
thanks,
jirka
>
> > + err = swbp_optimize(vma, vaddr, tramp->vaddr);
> > + if (WARN_ON_ONCE(err))
> > + uprobe_trampoline_put(tramp);
> > + return err;
> > +}
> > +
>
> [...]
On Thu, Apr 24, 2025 at 5:49 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Tue, Apr 22, 2025 at 05:04:03PM -0700, Andrii Nakryiko wrote:
>
> SNIP
>
> > > arch/x86/include/asm/uprobes.h | 7 +
> > > arch/x86/kernel/uprobes.c | 281 ++++++++++++++++++++++++++++++++-
> > > include/linux/uprobes.h | 6 +-
> > > kernel/events/uprobes.c | 15 +-
> > > 4 files changed, 301 insertions(+), 8 deletions(-)
> > >
> >
> > just minor nits, LGTM
> >
> > Acked-by: Andrii Nakryiko <andrii@kernel.org>
> >
> > > +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > > + unsigned long vaddr)
> > > +{
> > > + if (should_optimize(auprobe)) {
> > > + bool optimized = false;
> > > + int err;
> > > +
> > > + /*
> > > + * We could race with another thread that already optimized the probe,
> > > + * so let's not overwrite it with int3 again in this case.
> > > + */
> > > + err = is_optimized(vma->vm_mm, vaddr, &optimized);
> > > + if (err || optimized)
> > > + return err;
> >
> > IMO, this is a bit too clever, I'd go with plain
> >
> > if (err)
> > return err;
> > if (optimized)
> > return 0; /* we are done */
> >
>
> ok
>
> > (and mirror set_orig_insn() structure, consistently)
>
> set_orig_insn does that already, right?
>
right, and that was my point
> >
> >
> > > + }
> > > + return uprobe_write_opcode(vma, vaddr, UPROBE_SWBP_INSN, true);
> > > +}
> > > +
> > > +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > > + unsigned long vaddr)
> > > +{
> > > + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
> > > + struct mm_struct *mm = vma->vm_mm;
> > > + bool optimized = false;
> > > + int err;
> > > +
> > > + err = is_optimized(mm, vaddr, &optimized);
> > > + if (err)
> > > + return err;
> > > + if (optimized)
> > > + WARN_ON_ONCE(swbp_unoptimize(auprobe, vma, vaddr));
> > > + }
> > > + return uprobe_write_opcode(vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false);
> > > +}
> > > +
> > > +static int __arch_uprobe_optimize(struct mm_struct *mm, unsigned long vaddr)
> > > +{
> > > + struct uprobe_trampoline *tramp;
> > > + struct vm_area_struct *vma;
> > > + int err = 0;
> > > +
> > > + vma = find_vma(mm, vaddr);
> > > + if (!vma)
> > > + return -1;
> >
> > this is EPERM, will be confusing to debug... why not -EINVAL?
> >
> > > + tramp = uprobe_trampoline_get(vaddr);
> > > + if (!tramp)
> > > + return -1;
> >
> > ditto
>
> so the error value is not exposed to user space in this case,
> we try to optimize in the first hit with:
>
> handle_swbp()
> {
> arch_uprobe_optimize()
> {
>
> if (__arch_uprobe_optimize(mm, vaddr))
> set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
>
> }
> }
>
> and set ARCH_UPROBE_FLAG_OPTIMIZE_FAIL flags bit in case of error,
> plus there's WARN for swbp_optimize which should pass in case we
> get that far
yeah, I know, but I don't think we should deviate from kernel-wide
-Exxx convention for returning errors from functions just because this
error doesn't make it all the way to user space
>
> thanks,
> jirka
>
> >
> > > + err = swbp_optimize(vma, vaddr, tramp->vaddr);
> > > + if (WARN_ON_ONCE(err))
> > > + uprobe_trampoline_put(tramp);
> > > + return err;
> > > +}
> > > +
> >
> > [...]