[v6] uprobes: Add support to optimize usdt probes on x86_64

[PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 2 months, 2 weeks ago

Putting together all the previously added pieces to support optimized
uprobes on top of 5-byte nop instruction.

The current uprobe execution goes through following:

  - installs breakpoint instruction over original instruction
  - exception handler hit and calls related uprobe consumers
  - and either simulates original instruction or does out of line single step
    execution of it
  - returns to user space

The optimized uprobe path does following:

  - checks the original instruction is 5-byte nop (plus other checks)
  - adds (or uses existing) user space trampoline with uprobe syscall
  - overwrites original instruction (5-byte nop) with call to user space
    trampoline
  - the user space trampoline executes uprobe syscall that calls related uprobe
    consumers
  - trampoline returns back to next instruction

This approach won't speed up all uprobes as it's limited to using nop5 as
original instruction, but we plan to use nop5 as USDT probe instruction
(which currently uses single byte nop) and speed up the USDT probes.

The arch_uprobe_optimize triggers the uprobe optimization and is called after
first uprobe hit. I originally had it called on uprobe installation but then
it clashed with elf loader, because the user space trampoline was added in a
place where loader might need to put elf segments, so I decided to do it after
first uprobe hit when loading is done.

The uprobe is un-optimized in arch specific set_orig_insn call.

The instruction overwrite is x86 arch specific and needs to go through 3 updates:
(on top of nop5 instruction)

  - write int3 into 1st byte
  - write last 4 bytes of the call instruction
  - update the call instruction opcode

And cleanup goes though similar reverse stages:

  - overwrite call opcode with breakpoint (int3)
  - write last 4 bytes of the nop5 instruction
  - write the nop5 first instruction byte

We do not unmap and release uprobe trampoline when it's no longer needed,
because there's no easy way to make sure none of the threads is still
inside the trampoline. But we do not waste memory, because there's just
single page for all the uprobe trampoline mappings.

We do waste frame on page mapping for every 4GB by keeping the uprobe
trampoline page mapped, but that seems ok.

We take the benefit from the fact that set_swbp and set_orig_insn are
called under mmap_write_lock(mm), so we can use the current instruction
as the state the uprobe is in - nop5/breakpoint/call trampoline -
and decide the needed action (optimize/un-optimize) based on that.

Attaching the speed up from benchs/run_bench_uprobes.sh script:

current:
        usermode-count :  152.604 ± 0.044M/s
        syscall-count  :   13.359 ± 0.042M/s
-->     uprobe-nop     :    3.229 ± 0.002M/s
        uprobe-push    :    3.086 ± 0.004M/s
        uprobe-ret     :    1.114 ± 0.004M/s
        uprobe-nop5    :    1.121 ± 0.005M/s
        uretprobe-nop  :    2.145 ± 0.002M/s
        uretprobe-push :    2.070 ± 0.001M/s
        uretprobe-ret  :    0.931 ± 0.001M/s
        uretprobe-nop5 :    0.957 ± 0.001M/s

after the change:
        usermode-count :  152.448 ± 0.244M/s
        syscall-count  :   14.321 ± 0.059M/s
        uprobe-nop     :    3.148 ± 0.007M/s
        uprobe-push    :    2.976 ± 0.004M/s
        uprobe-ret     :    1.068 ± 0.003M/s
-->     uprobe-nop5    :    7.038 ± 0.007M/s
        uretprobe-nop  :    2.109 ± 0.004M/s
        uretprobe-push :    2.035 ± 0.001M/s
        uretprobe-ret  :    0.908 ± 0.001M/s
        uretprobe-nop5 :    3.377 ± 0.009M/s

I see bit more speed up on Intel (above) compared to AMD. The big nop5
speed up is partly due to emulating nop5 and partly due to optimization.

The key speed up we do this for is the USDT switch from nop to nop5:
        uprobe-nop     :    3.148 ± 0.007M/s
        uprobe-nop5    :    7.038 ± 0.007M/s

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/include/asm/uprobes.h |   7 +
 arch/x86/kernel/uprobes.c      | 283 ++++++++++++++++++++++++++++++++-
 include/linux/uprobes.h        |   6 +-
 kernel/events/uprobes.c        |  16 +-
 4 files changed, 305 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..1ee2e5115955 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		   1
 
+enum {
+	ARCH_UPROBE_FLAG_CAN_OPTIMIZE   = 0,
+	ARCH_UPROBE_FLAG_OPTIMIZE_FAIL  = 1,
+};
+
 struct uprobe_xol_ops;
 
 struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
 			u8	ilen;
 		}			push;
 	};
+
+	unsigned long flags;
 };
 
 struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index d18e1ae59901..209ce74ab93f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/insn.h>
 #include <asm/mmu_context.h>
+#include <asm/nops.h>
 
 /* Post-execution fixups. */
 
@@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
 	return tramp;
 }
 
-__maybe_unused
 static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
 {
 	struct uprobes_state *state = &current->mm->uprobes_state;
@@ -891,6 +891,280 @@ static int __init arch_uprobes_init(void)
 
 late_initcall(arch_uprobes_init);
 
+enum {
+	EXPECT_SWBP,
+	EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+	unsigned long base;
+	int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+	return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+		       int nbytes, void *data)
+{
+	struct write_opcode_ctx *ctx = data;
+	uprobe_opcode_t old_opcode[5];
+
+	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+	switch (ctx->expect) {
+	case EXPECT_SWBP:
+		if (is_swbp_insn(&old_opcode[0]))
+			return 1;
+		break;
+	case EXPECT_CALL:
+		if (is_call_insn(&old_opcode[0]))
+			return 1;
+		break;
+	}
+
+	return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ *   - Add an INT3 trap to the address that will be patched
+ *   - SMP sync all CPUs
+ *   - Update all but the first byte of the patched range
+ *   - SMP sync all CPUs
+ *   - Replace the first byte (INT3) by the first byte of the replacing opcode
+ *   - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+		       unsigned long vaddr, char *insn, bool optimize)
+{
+	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+	struct write_opcode_ctx ctx = {
+		.base = vaddr,
+	};
+	int err;
+
+	/*
+	 * Write int3 trap.
+	 *
+	 * The swbp_optimize path comes with breakpoint already installed,
+	 * so we can skip this step for optimize == true.
+	 */
+	if (!optimize) {
+		ctx.expect = EXPECT_CALL;
+		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+				   true /* is_register */, false /* do_update_ref_ctr */,
+				   &ctx);
+		if (err)
+			return err;
+	}
+
+	smp_text_poke_sync_each_cpu();
+
+	/* Write all but the first byte of the patched range. */
+	ctx.expect = EXPECT_SWBP;
+	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+			   true /* is_register */, false /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
+
+	smp_text_poke_sync_each_cpu();
+
+	/*
+	 * Write first byte.
+	 *
+	 * The swbp_unoptimize needs to finish uprobe removal together
+	 * with ref_ctr update, using uprobe_write with proper flags.
+	 */
+	err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+			   optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
+
+	smp_text_poke_sync_each_cpu();
+	return 0;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+			 unsigned long vaddr, unsigned long tramp)
+{
+	u8 call[5];
+
+	__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+			(const void *) tramp, CALL_INSN_SIZE);
+	return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+			   unsigned long vaddr)
+{
+	return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+	unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+	struct vm_area_struct *vma;
+	struct page *page;
+
+	page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	uprobe_copy_from_page(page, vaddr, dst, len);
+	put_page(page);
+	return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+	struct __packed __arch_relative_insn {
+		u8 op;
+		s32 raddr;
+	} *call = (struct __arch_relative_insn *) insn;
+
+	if (!is_call_insn(insn))
+		return false;
+	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized)
+{
+	uprobe_opcode_t insn[5];
+	int err;
+
+	err = copy_from_vaddr(mm, vaddr, &insn, 5);
+	if (err)
+		return err;
+	*optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+	return 0;
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+	return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+		test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+	     unsigned long vaddr)
+{
+	if (should_optimize(auprobe)) {
+		bool optimized = false;
+		int err;
+
+		/*
+		 * We could race with another thread that already optimized the probe,
+		 * so let's not overwrite it with int3 again in this case.
+		 */
+		err = is_optimized(vma->vm_mm, vaddr, &optimized);
+		if (err)
+			return err;
+		if (optimized)
+			return 0;
+	}
+	return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+				   true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+		  unsigned long vaddr)
+{
+	if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+		struct mm_struct *mm = vma->vm_mm;
+		bool optimized = false;
+		int err;
+
+		err = is_optimized(mm, vaddr, &optimized);
+		if (err)
+			return err;
+		if (optimized) {
+			err = swbp_unoptimize(auprobe, vma, vaddr);
+			WARN_ON_ONCE(err);
+			return err;
+		}
+	}
+	return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+				   false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+				  unsigned long vaddr)
+{
+	struct uprobe_trampoline *tramp;
+	struct vm_area_struct *vma;
+	bool new = false;
+	int err = 0;
+
+	vma = find_vma(mm, vaddr);
+	if (!vma)
+		return -EINVAL;
+	tramp = get_uprobe_trampoline(vaddr, &new);
+	if (!tramp)
+		return -EINVAL;
+	err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+	if (WARN_ON_ONCE(err) && new)
+		destroy_uprobe_trampoline(tramp);
+	return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	uprobe_opcode_t insn[5];
+
+	/*
+	 * Do not optimize if shadow stack is enabled, the return address hijack
+	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
+	 * the entry uprobe is optimized and the shadow stack crashes the app.
+	 */
+	if (shstk_is_enabled())
+		return;
+
+	if (!should_optimize(auprobe))
+		return;
+
+	mmap_write_lock(mm);
+
+	/*
+	 * Check if some other thread already optimized the uprobe for us,
+	 * if it's the case just go away silently.
+	 */
+	if (copy_from_vaddr(mm, vaddr, &insn, 5))
+		goto unlock;
+	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+		goto unlock;
+
+	/*
+	 * If we fail to optimize the uprobe we set the fail bit so the
+	 * above should_optimize will fail from now on.
+	 */
+	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+	mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	if (memcmp(&auprobe->insn, x86_nops[5], 5))
+		return false;
+	/* We can't do cross page atomic writes yet. */
+	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
 #else /* 32-bit: */
 /*
  * No RIP-relative addressing on 32-bit
@@ -904,6 +1178,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 }
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	return false;
+}
 #endif /* CONFIG_X86_64 */
 
 struct uprobe_xol_ops {
@@ -1270,6 +1548,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	if (ret)
 		return ret;
 
+	if (can_optimize(auprobe, addr))
+		set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
 	ret = branch_setup_xol_ops(auprobe, &insn);
 	if (ret != -ENOSYS)
 		return ret;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b6b077cc7d0f..08ef78439d0d 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -192,7 +192,7 @@ struct uprobes_state {
 };
 
 typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
-				     uprobe_opcode_t *insn, int nbytes);
+				     uprobe_opcode_t *insn, int nbytes, void *data);
 
 extern void __init uprobes_init(void);
 extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
 			       bool is_register);
 extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
-			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr);
+			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+			void *data);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *
 extern void arch_uprobe_clear_state(struct mm_struct *mm);
 extern void arch_uprobe_init_state(struct mm_struct *mm);
 extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cbba31c0495f..e54081beeab9 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
 }
 
 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
-			 int nbytes)
+			 int nbytes, void *data)
 {
 	uprobe_opcode_t old_opcode;
 	bool is_swbp;
@@ -492,12 +492,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		bool is_register)
 {
 	return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
-			    verify_opcode, is_register, true /* do_update_ref_ctr */);
+			    verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
 }
 
 int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		 const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
-		 uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr)
+		 uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+		 void *data)
 {
 	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
 	struct mm_struct *mm = vma->vm_mm;
@@ -531,7 +532,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		goto out;
 	folio = page_folio(page);
 
-	ret = verify(page, insn_vaddr, insn, nbytes);
+	ret = verify(page, insn_vaddr, insn, nbytes, data);
 	if (ret <= 0) {
 		folio_put(folio);
 		goto out;
@@ -2697,6 +2698,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
 	return true;
 }
 
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2761,6 +2766,9 @@ static void handle_swbp(struct pt_regs *regs)
 
 	handler_chain(uprobe, regs);
 
+	/* Try to optimize after first hit. */
+	arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
 	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
 		goto out;
 
-- 
2.50.1

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Peter Zijlstra 1 month, 2 weeks ago

On Sun, Jul 20, 2025 at 01:21:20PM +0200, Jiri Olsa wrote:

> +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
> +{
> +	struct __packed __arch_relative_insn {
> +		u8 op;
> +		s32 raddr;
> +	} *call = (struct __arch_relative_insn *) insn;

Not something you need to clean up now I suppose, but we could do with
unifying this thing. we have a bunch of instances around.

> +
> +	if (!is_call_insn(insn))
> +		return false;
> +	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
> +}

> +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> +{
> +	struct mm_struct *mm = current->mm;
> +	uprobe_opcode_t insn[5];
> +
> +	/*
> +	 * Do not optimize if shadow stack is enabled, the return address hijack
> +	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> +	 * the entry uprobe is optimized and the shadow stack crashes the app.
> +	 */
> +	if (shstk_is_enabled())
> +		return;

Kernel should be able to fix up userspace shadow stack just fine.

> +	if (!should_optimize(auprobe))
> +		return;
> +
> +	mmap_write_lock(mm);
> +
> +	/*
> +	 * Check if some other thread already optimized the uprobe for us,
> +	 * if it's the case just go away silently.
> +	 */
> +	if (copy_from_vaddr(mm, vaddr, &insn, 5))
> +		goto unlock;
> +	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
> +		goto unlock;
> +
> +	/*
> +	 * If we fail to optimize the uprobe we set the fail bit so the
> +	 * above should_optimize will fail from now on.
> +	 */
> +	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
> +		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
> +
> +unlock:
> +	mmap_write_unlock(mm);
> +}
> +
> +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> +{
> +	if (memcmp(&auprobe->insn, x86_nops[5], 5))
> +		return false;
> +	/* We can't do cross page atomic writes yet. */
> +	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
> +}

This seems needlessly restrictive. Something like:

is_nop5(const char *buf)
{
	struct insn insn;

	ret = insn_decode_kernel(&insn, buf)
	if (ret < 0)
		return false;

	if (insn.length != 5)
		return false;

	if (insn.opcode[0] != 0x0f ||
	    insn.opcode[1] != 0x1f)
	    	return false;

	return true;
}

Should do I suppose. Anyway, I think something like:

  f0 0f 1f 44 00 00	lock nopl 0(%eax, %eax, 1)

is a valid NOP5 at +1 and will 'optimize' and result in:

  f0 e8 disp32		lock call disp32

which will #UD.

But this is nearly unfixable. Just doing my best to find weirdo cases
;-)

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 1 month ago

On Tue, Aug 19, 2025 at 09:15:15PM +0200, Peter Zijlstra wrote:
> On Sun, Jul 20, 2025 at 01:21:20PM +0200, Jiri Olsa wrote:
> 
> > +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
> > +{
> > +	struct __packed __arch_relative_insn {
> > +		u8 op;
> > +		s32 raddr;
> > +	} *call = (struct __arch_relative_insn *) insn;
> 
> Not something you need to clean up now I suppose, but we could do with
> unifying this thing. we have a bunch of instances around.

found two below, maybe we could use 'union text_poke_insn' instead like below?

jirka


---
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6079d15dab8c..7fd03897d776 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -109,14 +109,10 @@ const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 static nokprobe_inline void
 __synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
 {
-	struct __arch_relative_insn {
-		u8 op;
-		s32 raddr;
-	} __packed *insn;
-
-	insn = (struct __arch_relative_insn *)dest;
-	insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
-	insn->op = op;
+	union text_poke_insn *insn = dest;
+
+	insn->disp = (s32)((long)(to) - ((long)(from) + 5));
+	insn->opcode = op;
 }
 
 /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0a8c0a4a5423..bac14f3165c3 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1046,14 +1046,11 @@ static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst,
 
 static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
 {
-	struct __packed __arch_relative_insn {
-		u8 op;
-		s32 raddr;
-	} *call = (struct __arch_relative_insn *) insn;
+	union text_poke_insn *call = (union text_poke_insn *) insn;
 
 	if (!is_call_insn(insn))
 		return false;
-	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+	return __in_uprobe_trampoline(vaddr + 5 + call->disp);
 }
 
 static int is_optimized(struct mm_struct *mm, unsigned long vaddr)

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Peter Zijlstra 1 month, 2 weeks ago

On Tue, Aug 19, 2025 at 09:15:15PM +0200, Peter Zijlstra wrote:
> On Sun, Jul 20, 2025 at 01:21:20PM +0200, Jiri Olsa wrote:

> > +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> > +{
> > +	struct mm_struct *mm = current->mm;
> > +	uprobe_opcode_t insn[5];
> > +
> > +	/*
> > +	 * Do not optimize if shadow stack is enabled, the return address hijack
> > +	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> > +	 * the entry uprobe is optimized and the shadow stack crashes the app.
> > +	 */
> > +	if (shstk_is_enabled())
> > +		return;
> 
> Kernel should be able to fix up userspace shadow stack just fine.
> 
> > +	if (!should_optimize(auprobe))
> > +		return;
> > +
> > +	mmap_write_lock(mm);
> > +
> > +	/*
> > +	 * Check if some other thread already optimized the uprobe for us,
> > +	 * if it's the case just go away silently.
> > +	 */
> > +	if (copy_from_vaddr(mm, vaddr, &insn, 5))
> > +		goto unlock;
> > +	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
> > +		goto unlock;
> > +
> > +	/*
> > +	 * If we fail to optimize the uprobe we set the fail bit so the
> > +	 * above should_optimize will fail from now on.
> > +	 */
> > +	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
> > +		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
> > +
> > +unlock:
> > +	mmap_write_unlock(mm);
> > +}

Something a little like this should do I suppose...

--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksi
 int restore_signal_shadow_stack(void);
 int shstk_update_last_frame(unsigned long val);
 bool shstk_is_enabled(void);
+int shstk_pop(u64 *val);
+int shstk_push(u64 val);
 #else
 static inline long shstk_prctl(struct task_struct *task, int option,
 			       unsigned long arg2) { return -EINVAL; }
@@ -35,6 +37,8 @@ static inline int setup_signal_shadow_st
 static inline int restore_signal_shadow_stack(void) { return 0; }
 static inline int shstk_update_last_frame(unsigned long val) { return 0; }
 static inline bool shstk_is_enabled(void) { return false; }
+static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
+static inline int shstk_push(u64 val) { return -ENOTSUPP; }
 #endif /* CONFIG_X86_USER_SHADOW_STACK */
 
 #endif /* __ASSEMBLER__ */
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr
 	return ssp;
 }
 
+int shstk_pop(u64 *val)
+{
+	int ret = 0;
+	u64 ssp;
+
+	if (!features_enabled(ARCH_SHSTK_SHSTK))
+		return -ENOTSUPP;
+
+	fpregs_lock_and_load();
+
+	rdmsrq(MSR_IA32_PL3_SSP, ssp);
+	if (val && get_user(*val, (__user u64 *)ssp))
+	    ret = -EFAULT;
+	ssp += SS_FRAME_SIZE;
+	wrmsrq(MSR_IA32_PL3_SSP, ssp);
+
+	fpregs_unlock();
+
+	return ret;
+}
+
+int shstk_push(u64 val)
+{
+	u64 ssp;
+	int ret;
+
+	if (!features_enabled(ARCH_SHSTK_SHSTK))
+		return -ENOTSUPP;
+
+	fpregs_lock_and_load();
+
+	rdmsrq(MSR_IA32_PL3_SSP, ssp);
+	ssp -= SS_FRAME_SIZE;
+	wrmsrq(MSR_IA32_PL3_SSP, ssp);
+	ret = write_user_shstk_64((__user void *)ssp, val);
+	fpregs_unlock();
+
+	return ret;
+}
+
 #define SHSTK_DATA_BIT BIT(63)
 
 static int put_shstk_data(u64 __user *addr, u64 data)
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -804,7 +804,7 @@ SYSCALL_DEFINE0(uprobe)
 {
 	struct pt_regs *regs = task_pt_regs(current);
 	struct uprobe_syscall_args args;
-	unsigned long ip, sp;
+	unsigned long ip, sp, sret;
 	int err;
 
 	/* Allow execution only from uprobe trampolines. */
@@ -831,6 +831,9 @@ SYSCALL_DEFINE0(uprobe)
 
 	sp = regs->sp;
 
+	if (shstk_pop(&sret) == 0 && sret != args.retaddr)
+		goto sigill;
+
 	handle_syscall_uprobe(regs, regs->ip);
 
 	/*
@@ -855,6 +858,9 @@ SYSCALL_DEFINE0(uprobe)
 	if (args.retaddr - 5 != regs->ip)
 		args.retaddr = regs->ip;
 
+	if (shstk_push(args.retaddr) == -EFAULT)
+		goto sigill;
+
 	regs->ip = ip;
 
 	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
@@ -1124,14 +1130,6 @@ void arch_uprobe_optimize(struct arch_up
 	struct mm_struct *mm = current->mm;
 	uprobe_opcode_t insn[5];
 
-	/*
-	 * Do not optimize if shadow stack is enabled, the return address hijack
-	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
-	 * the entry uprobe is optimized and the shadow stack crashes the app.
-	 */
-	if (shstk_is_enabled())
-		return;
-
 	if (!should_optimize(auprobe))
 		return;

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 1 month, 2 weeks ago

On Wed, Aug 20, 2025 at 02:30:33PM +0200, Peter Zijlstra wrote:
> On Tue, Aug 19, 2025 at 09:15:15PM +0200, Peter Zijlstra wrote:
> > On Sun, Jul 20, 2025 at 01:21:20PM +0200, Jiri Olsa wrote:
> 
> > > +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> > > +{
> > > +	struct mm_struct *mm = current->mm;
> > > +	uprobe_opcode_t insn[5];
> > > +
> > > +	/*
> > > +	 * Do not optimize if shadow stack is enabled, the return address hijack
> > > +	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> > > +	 * the entry uprobe is optimized and the shadow stack crashes the app.
> > > +	 */
> > > +	if (shstk_is_enabled())
> > > +		return;
> > 
> > Kernel should be able to fix up userspace shadow stack just fine.
> > 
> > > +	if (!should_optimize(auprobe))
> > > +		return;
> > > +
> > > +	mmap_write_lock(mm);
> > > +
> > > +	/*
> > > +	 * Check if some other thread already optimized the uprobe for us,
> > > +	 * if it's the case just go away silently.
> > > +	 */
> > > +	if (copy_from_vaddr(mm, vaddr, &insn, 5))
> > > +		goto unlock;
> > > +	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
> > > +		goto unlock;
> > > +
> > > +	/*
> > > +	 * If we fail to optimize the uprobe we set the fail bit so the
> > > +	 * above should_optimize will fail from now on.
> > > +	 */
> > > +	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
> > > +		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
> > > +
> > > +unlock:
> > > +	mmap_write_unlock(mm);
> > > +}
> 
> Something a little like this should do I suppose...
> 
> --- a/arch/x86/include/asm/shstk.h
> +++ b/arch/x86/include/asm/shstk.h
> @@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksi
>  int restore_signal_shadow_stack(void);
>  int shstk_update_last_frame(unsigned long val);
>  bool shstk_is_enabled(void);
> +int shstk_pop(u64 *val);
> +int shstk_push(u64 val);
>  #else
>  static inline long shstk_prctl(struct task_struct *task, int option,
>  			       unsigned long arg2) { return -EINVAL; }
> @@ -35,6 +37,8 @@ static inline int setup_signal_shadow_st
>  static inline int restore_signal_shadow_stack(void) { return 0; }
>  static inline int shstk_update_last_frame(unsigned long val) { return 0; }
>  static inline bool shstk_is_enabled(void) { return false; }
> +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
> +static inline int shstk_push(u64 val) { return -ENOTSUPP; }
>  #endif /* CONFIG_X86_USER_SHADOW_STACK */
>  
>  #endif /* __ASSEMBLER__ */
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr
>  	return ssp;
>  }
>  
> +int shstk_pop(u64 *val)
> +{
> +	int ret = 0;
> +	u64 ssp;
> +
> +	if (!features_enabled(ARCH_SHSTK_SHSTK))
> +		return -ENOTSUPP;
> +
> +	fpregs_lock_and_load();
> +
> +	rdmsrq(MSR_IA32_PL3_SSP, ssp);
> +	if (val && get_user(*val, (__user u64 *)ssp))
> +	    ret = -EFAULT;
> +	ssp += SS_FRAME_SIZE;
> +	wrmsrq(MSR_IA32_PL3_SSP, ssp);
> +
> +	fpregs_unlock();
> +
> +	return ret;
> +}
> +
> +int shstk_push(u64 val)
> +{
> +	u64 ssp;
> +	int ret;
> +
> +	if (!features_enabled(ARCH_SHSTK_SHSTK))
> +		return -ENOTSUPP;
> +
> +	fpregs_lock_and_load();
> +
> +	rdmsrq(MSR_IA32_PL3_SSP, ssp);
> +	ssp -= SS_FRAME_SIZE;
> +	wrmsrq(MSR_IA32_PL3_SSP, ssp);
> +	ret = write_user_shstk_64((__user void *)ssp, val);
> +	fpregs_unlock();
> +
> +	return ret;
> +}
> +
>  #define SHSTK_DATA_BIT BIT(63)
>  
>  static int put_shstk_data(u64 __user *addr, u64 data)
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -804,7 +804,7 @@ SYSCALL_DEFINE0(uprobe)
>  {
>  	struct pt_regs *regs = task_pt_regs(current);
>  	struct uprobe_syscall_args args;
> -	unsigned long ip, sp;
> +	unsigned long ip, sp, sret;
>  	int err;
>  
>  	/* Allow execution only from uprobe trampolines. */
> @@ -831,6 +831,9 @@ SYSCALL_DEFINE0(uprobe)
>  
>  	sp = regs->sp;
>  
> +	if (shstk_pop(&sret) == 0 && sret != args.retaddr)
> +		goto sigill;
> +
>  	handle_syscall_uprobe(regs, regs->ip);
>  
>  	/*
> @@ -855,6 +858,9 @@ SYSCALL_DEFINE0(uprobe)
>  	if (args.retaddr - 5 != regs->ip)
>  		args.retaddr = regs->ip;
>  
> +	if (shstk_push(args.retaddr) == -EFAULT)
> +		goto sigill;
> +
>  	regs->ip = ip;
>  
>  	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
> @@ -1124,14 +1130,6 @@ void arch_uprobe_optimize(struct arch_up
>  	struct mm_struct *mm = current->mm;
>  	uprobe_opcode_t insn[5];
>  
> -	/*
> -	 * Do not optimize if shadow stack is enabled, the return address hijack
> -	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> -	 * the entry uprobe is optimized and the shadow stack crashes the app.
> -	 */
> -	if (shstk_is_enabled())
> -		return;
> -

nice, we will need to adjust selftests for that, there's shadow stack part
in prog_tests/uprobe_syscall.c that expects non optimized uprobe after enabling
shadow stack.. I'll run it and send the change

thanks,
jirka

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Edgecombe, Rick P 1 month, 2 weeks ago

I'm not sure we should optimize for shadow stack yet. Unless it's easy to think
about... (below)

On Wed, 2025-08-20 at 14:30 +0200, Peter Zijlstra wrote:
> --- a/arch/x86/include/asm/shstk.h
> +++ b/arch/x86/include/asm/shstk.h
> @@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksi
>  int restore_signal_shadow_stack(void);
>  int shstk_update_last_frame(unsigned long val);
>  bool shstk_is_enabled(void);
> +int shstk_pop(u64 *val);
> +int shstk_push(u64 val);
>  #else
>  static inline long shstk_prctl(struct task_struct *task, int option,
>  			       unsigned long arg2) { return -EINVAL; }
> @@ -35,6 +37,8 @@ static inline int setup_signal_shadow_st
>  static inline int restore_signal_shadow_stack(void) { return 0; }
>  static inline int shstk_update_last_frame(unsigned long val) { return 0; }
>  static inline bool shstk_is_enabled(void) { return false; }
> +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
> +static inline int shstk_push(u64 val) { return -ENOTSUPP; }
>  #endif /* CONFIG_X86_USER_SHADOW_STACK */
>  
>  #endif /* __ASSEMBLER__ */
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr
>  	return ssp;
>  }
>  
> +int shstk_pop(u64 *val)
> +{
> +	int ret = 0;
> +	u64 ssp;
> +
> +	if (!features_enabled(ARCH_SHSTK_SHSTK))
> +		return -ENOTSUPP;
> +
> +	fpregs_lock_and_load();
> +
> +	rdmsrq(MSR_IA32_PL3_SSP, ssp);
> +	if (val && get_user(*val, (__user u64 *)ssp))
> +	    ret = -EFAULT;
> +	ssp += SS_FRAME_SIZE;
> +	wrmsrq(MSR_IA32_PL3_SSP, ssp);
> +
> +	fpregs_unlock();
> +
> +	return ret;
> +}
> +
> +int shstk_push(u64 val)
> +{
> +	u64 ssp;
> +	int ret;
> +
> +	if (!features_enabled(ARCH_SHSTK_SHSTK))
> +		return -ENOTSUPP;
> +
> +	fpregs_lock_and_load();
> +
> +	rdmsrq(MSR_IA32_PL3_SSP, ssp);
> +	ssp -= SS_FRAME_SIZE;
> +	wrmsrq(MSR_IA32_PL3_SSP, ssp);
> +	ret = write_user_shstk_64((__user void *)ssp, val);

Should we role back ssp if there is a fault?

> +	fpregs_unlock();
> +
> +	return ret;
> +}
> +
>  #define SHSTK_DATA_BIT BIT(63)
>  
>  static int put_shstk_data(u64 __user *addr, u64 data)
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -804,7 +804,7 @@ SYSCALL_DEFINE0(uprobe)
>  {
>  	struct pt_regs *regs = task_pt_regs(current);
>  	struct uprobe_syscall_args args;
> -	unsigned long ip, sp;
> +	unsigned long ip, sp, sret;
>  	int err;
>  
>  	/* Allow execution only from uprobe trampolines. */
> @@ -831,6 +831,9 @@ SYSCALL_DEFINE0(uprobe)
>  
>  	sp = regs->sp;
>  
> +	if (shstk_pop(&sret) == 0 && sret != args.retaddr)
> +		goto sigill;
> +
>  	handle_syscall_uprobe(regs, regs->ip);
>  
>  	/*
> @@ -855,6 +858,9 @@ SYSCALL_DEFINE0(uprobe)
>  	if (args.retaddr - 5 != regs->ip)
>  		args.retaddr = regs->ip;
>  
> +	if (shstk_push(args.retaddr) == -EFAULT)
> +		goto sigill;
> +

Are we effectively allowing arbitrary shadow stack push here? I see we need to
be in in_uprobe_trampoline(), but there is no mmap lock taken, so it's a racy
check. I'm questioning if the security posture tweak is worth thinking about for
whatever the level of intersection of uprobes usage and shadow stack is today.

>  	regs->ip = ip;
>  
>  	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
> @@ -1124,14 +1130,6 @@ void arch_uprobe_optimize(struct arch_up
>  	struct mm_struct *mm = current->mm;
>  	uprobe_opcode_t insn[5];
>  
> -	/*
> -	 * Do not optimize if shadow stack is enabled, the return address hijack
> -	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> -	 * the entry uprobe is optimized and the shadow stack crashes the app.
> -	 */
> -	if (shstk_is_enabled())
> -		return;
> -
>  	if (!should_optimize(auprobe))
>  		return;
>

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Peter Zijlstra 1 month, 2 weeks ago

On Wed, Aug 20, 2025 at 03:58:14PM +0000, Edgecombe, Rick P wrote:
> I'm not sure we should optimize for shadow stack yet. Unless it's easy to think
> about... (below)
> 
> On Wed, 2025-08-20 at 14:30 +0200, Peter Zijlstra wrote:
> > --- a/arch/x86/include/asm/shstk.h
> > +++ b/arch/x86/include/asm/shstk.h
> > @@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksi
> >  int restore_signal_shadow_stack(void);
> >  int shstk_update_last_frame(unsigned long val);
> >  bool shstk_is_enabled(void);
> > +int shstk_pop(u64 *val);
> > +int shstk_push(u64 val);
> >  #else
> >  static inline long shstk_prctl(struct task_struct *task, int option,
> >  			       unsigned long arg2) { return -EINVAL; }
> > @@ -35,6 +37,8 @@ static inline int setup_signal_shadow_st
> >  static inline int restore_signal_shadow_stack(void) { return 0; }
> >  static inline int shstk_update_last_frame(unsigned long val) { return 0; }
> >  static inline bool shstk_is_enabled(void) { return false; }
> > +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
> > +static inline int shstk_push(u64 val) { return -ENOTSUPP; }
> >  #endif /* CONFIG_X86_USER_SHADOW_STACK */
> >  
> >  #endif /* __ASSEMBLER__ */
> > --- a/arch/x86/kernel/shstk.c
> > +++ b/arch/x86/kernel/shstk.c
> > @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr
> >  	return ssp;
> >  }
> >  
> > +int shstk_pop(u64 *val)
> > +{
> > +	int ret = 0;
> > +	u64 ssp;
> > +
> > +	if (!features_enabled(ARCH_SHSTK_SHSTK))
> > +		return -ENOTSUPP;
> > +
> > +	fpregs_lock_and_load();
> > +
> > +	rdmsrq(MSR_IA32_PL3_SSP, ssp);
> > +	if (val && get_user(*val, (__user u64 *)ssp))
> > +	    ret = -EFAULT;
> > +	ssp += SS_FRAME_SIZE;
> > +	wrmsrq(MSR_IA32_PL3_SSP, ssp);
> > +
> > +	fpregs_unlock();
> > +
> > +	return ret;
> > +}
> > +
> > +int shstk_push(u64 val)
> > +{
> > +	u64 ssp;
> > +	int ret;
> > +
> > +	if (!features_enabled(ARCH_SHSTK_SHSTK))
> > +		return -ENOTSUPP;
> > +
> > +	fpregs_lock_and_load();
> > +
> > +	rdmsrq(MSR_IA32_PL3_SSP, ssp);
> > +	ssp -= SS_FRAME_SIZE;
> > +	wrmsrq(MSR_IA32_PL3_SSP, ssp);
> > +	ret = write_user_shstk_64((__user void *)ssp, val);
> 
> Should we role back ssp if there is a fault?

Ah, probably. And same with pop I suppose, don't adjust ssp if we can't
read it etc.

> > +	fpregs_unlock();
> > +
> > +	return ret;
> > +}
> > +
> >  #define SHSTK_DATA_BIT BIT(63)
> >  
> >  static int put_shstk_data(u64 __user *addr, u64 data)
> > --- a/arch/x86/kernel/uprobes.c
> > +++ b/arch/x86/kernel/uprobes.c
> > @@ -804,7 +804,7 @@ SYSCALL_DEFINE0(uprobe)
> >  {
> >  	struct pt_regs *regs = task_pt_regs(current);
> >  	struct uprobe_syscall_args args;
> > -	unsigned long ip, sp;
> > +	unsigned long ip, sp, sret;
> >  	int err;
> >  
> >  	/* Allow execution only from uprobe trampolines. */
> > @@ -831,6 +831,9 @@ SYSCALL_DEFINE0(uprobe)
> >  
> >  	sp = regs->sp;
> >  
> > +	if (shstk_pop(&sret) == 0 && sret != args.retaddr)
> > +		goto sigill;
> > +
> >  	handle_syscall_uprobe(regs, regs->ip);
> >  
> >  	/*
> > @@ -855,6 +858,9 @@ SYSCALL_DEFINE0(uprobe)
> >  	if (args.retaddr - 5 != regs->ip)
> >  		args.retaddr = regs->ip;
> >  
> > +	if (shstk_push(args.retaddr) == -EFAULT)
> > +		goto sigill;
> > +
> 
> Are we effectively allowing arbitrary shadow stack push here? 

Yeah, why not? Userspace shadow stacks does not, and cannot, protect
from the kernel being funneh. It fully relies on the kernel being
trusted. So the kernel doing a shstk_{pop,push}() to make things line up
properly shouldn't be a problem.

> I see we need to be in in_uprobe_trampoline(), but there is no mmap
> lock taken, so it's a racy check.

Racy how? Isn't this more or less equivalent to what a normal CALL
instruction would do?

> I'm questioning if the security posture tweak is worth thinking about for
> whatever the level of intersection of uprobes usage and shadow stack is today.

I have no idea how much code is built with shadow stack enabled today;
but I see no point in not supporting uprobes on it. The whole of
userspace shadow stacks only ever protects from userspace attacking
other userspace -- and that protection isn't changed by this.

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Edgecombe, Rick P 1 month, 2 weeks ago

On Wed, 2025-08-20 at 19:12 +0200, Peter Zijlstra wrote:
> > Are we effectively allowing arbitrary shadow stack push here? 
> 
> Yeah, why not? Userspace shadow stacks does not, and cannot, protect
> from the kernel being funneh. It fully relies on the kernel being
> trusted. So the kernel doing a shstk_{pop,push}() to make things line up
> properly shouldn't be a problem.

Emulating a call/ret should be fine.

> 
> > I see we need to be in in_uprobe_trampoline(), but there is no mmap
> > lock taken, so it's a racy check.
> 
> Racy how? Isn't this more or less equivalent to what a normal CALL
> instruction would do?

Racy in terms of the "is trampoline" check happening before the write to the
shadow stack. I was thinking like a TOCTOU thing. The "Allow execution only from
uprobe trampolines" check is not very strong.

As for call equivalence, args.retaddr comes from userspace, right?

> 
> > I'm questioning if the security posture tweak is worth thinking about for
> > whatever the level of intersection of uprobes usage and shadow stack is
> > today.
> 
> I have no idea how much code is built with shadow stack enabled today;
> but I see no point in not supporting uprobes on it. The whole of
> userspace shadow stacks only ever protects from userspace attacking
> other userspace -- and that protection isn't changed by this.

Isn't this just about whether to support an optimization for uprobes?

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Peter Zijlstra 1 month, 2 weeks ago

On Wed, Aug 20, 2025 at 05:26:38PM +0000, Edgecombe, Rick P wrote:
> On Wed, 2025-08-20 at 19:12 +0200, Peter Zijlstra wrote:
> > > Are we effectively allowing arbitrary shadow stack push here? 
> > 
> > Yeah, why not? Userspace shadow stacks does not, and cannot, protect
> > from the kernel being funneh. It fully relies on the kernel being
> > trusted. So the kernel doing a shstk_{pop,push}() to make things line up
> > properly shouldn't be a problem.
> 
> Emulating a call/ret should be fine.
> 
> > 
> > > I see we need to be in in_uprobe_trampoline(), but there is no mmap
> > > lock taken, so it's a racy check.
> > 
> > Racy how? Isn't this more or less equivalent to what a normal CALL
> > instruction would do?
> 
> Racy in terms of the "is trampoline" check happening before the write to the
> shadow stack. I was thinking like a TOCTOU thing. The "Allow execution only from
> uprobe trampolines" check is not very strong.
> 
> As for call equivalence, args.retaddr comes from userspace, right?

Yeah. So this whole thing is your random code having a 5 byte nop. And
instead of using INT3 to turn it into #BP, we turn it into "CALL
uprobe_trampoline".

That trampoline looks like:

	push %rcx
	push %r11
	push %rax;
	mov $__NR_uprobe, %rax
	syscall
	pop %rax
	pop %r11
	pop %rcx
	ret

Now, that syscall handler is the one doing shstk_pop/push. But it does
that right along with modifying the normal SP.

Basically the syscall copies the 4 (CALL,PUSH,PUSH,PUSH) words from the
stack into a local struct (args), adjusts SP, and adjusts IP to point to
the CALL instruction that got us here (retaddr-5).

This way, we get the same context as that #BP would've gotten. Then we
run uprobe crap, and on return:

 - sp changed, we take the (slow) IRET path out, and can just jump
   wherever -- typically right after the CALL that got us here, no need
   to re-adjust the stack and take the trampoline tail.

 - sp didn't change, we take the (fast) sysexit path out, and have to
   re-apply the CALL,PUSH,PUSH,PUSH such that the trampoline tail can
   undo them again.

The added shstk_pop/push() exactly match the above undo/redo of the CALL
(and other stack ops).

> > > I'm questioning if the security posture tweak is worth thinking about for
> > > whatever the level of intersection of uprobes usage and shadow stack is
> > > today.
> > 
> > I have no idea how much code is built with shadow stack enabled today;
> > but I see no point in not supporting uprobes on it. The whole of
> > userspace shadow stacks only ever protects from userspace attacking
> > other userspace -- and that protection isn't changed by this.
> 
> Isn't this just about whether to support an optimization for uprobes?

Yes. But supporting the shstk isn't hard (as per this patch), it exactly
matches what it already does to the normal stack. So I don't see a
reason not to do it.

Anyway, I'm not a huge fan of any of this. I suspect FRED will make all
this fancy code totally irrelevant. But until people have FRED enabled
hardware in large quantities, I suppose this has a use.

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Edgecombe, Rick P 1 month, 2 weeks ago

On Wed, 2025-08-20 at 19:43 +0200, Peter Zijlstra wrote:
> Yes. But supporting the shstk isn't hard (as per this patch), it exactly
> matches what it already does to the normal stack. So I don't see a
> reason not to do it.

Thanks for explaining, and sorry for being slow. Going to blame this head cold.

> 
> Anyway, I'm not a huge fan of any of this. I suspect FRED will make all
> this fancy code totally irrelevant. But until people have FRED enabled
> hardware in large quantities, I suppose this has a use.

It doesn't sound too unbounded and I guess as long as it's just an optimization
we can always back it out if someone finds a way to abuse it.

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 1 month, 2 weeks ago

On Tue, Aug 19, 2025 at 09:15:15PM +0200, Peter Zijlstra wrote:
> On Sun, Jul 20, 2025 at 01:21:20PM +0200, Jiri Olsa wrote:
> 
> > +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
> > +{
> > +	struct __packed __arch_relative_insn {
> > +		u8 op;
> > +		s32 raddr;
> > +	} *call = (struct __arch_relative_insn *) insn;
> 
> Not something you need to clean up now I suppose, but we could do with
> unifying this thing. we have a bunch of instances around.

ok, I noticed, will send patch for that

> 
> > +
> > +	if (!is_call_insn(insn))
> > +		return false;
> > +	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
> > +}
> 
> > +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> > +{
> > +	struct mm_struct *mm = current->mm;
> > +	uprobe_opcode_t insn[5];
> > +
> > +	/*
> > +	 * Do not optimize if shadow stack is enabled, the return address hijack
> > +	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> > +	 * the entry uprobe is optimized and the shadow stack crashes the app.
> > +	 */
> > +	if (shstk_is_enabled())
> > +		return;
> 
> Kernel should be able to fix up userspace shadow stack just fine.

ok, will send follow up fix

> 
> > +	if (!should_optimize(auprobe))
> > +		return;
> > +
> > +	mmap_write_lock(mm);
> > +
> > +	/*
> > +	 * Check if some other thread already optimized the uprobe for us,
> > +	 * if it's the case just go away silently.
> > +	 */
> > +	if (copy_from_vaddr(mm, vaddr, &insn, 5))
> > +		goto unlock;
> > +	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
> > +		goto unlock;
> > +
> > +	/*
> > +	 * If we fail to optimize the uprobe we set the fail bit so the
> > +	 * above should_optimize will fail from now on.
> > +	 */
> > +	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
> > +		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
> > +
> > +unlock:
> > +	mmap_write_unlock(mm);
> > +}
> > +
> > +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> > +{
> > +	if (memcmp(&auprobe->insn, x86_nops[5], 5))
> > +		return false;
> > +	/* We can't do cross page atomic writes yet. */
> > +	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
> > +}
> 
> This seems needlessly restrictive. Something like:
> 
> is_nop5(const char *buf)
> {
> 	struct insn insn;
> 
> 	ret = insn_decode_kernel(&insn, buf)
> 	if (ret < 0)
> 		return false;
> 
> 	if (insn.length != 5)
> 		return false;
> 
> 	if (insn.opcode[0] != 0x0f ||
> 	    insn.opcode[1] != 0x1f)
> 	    	return false;
> 
> 	return true;
> }
> 
> Should do I suppose.

ok, looks good, should I respin with this, or is follow up ok?

> Anyway, I think something like:
> 
>   f0 0f 1f 44 00 00	lock nopl 0(%eax, %eax, 1)
> 
> is a valid NOP5 at +1 and will 'optimize' and result in:
> 
>   f0 e8 disp32		lock call disp32
> 
> which will #UD.
> 
> But this is nearly unfixable. Just doing my best to find weirdo cases
> ;-)

nice, but I think if user puts not-optimized uprobe in the middle of the
instruction like to lock-nop5 + 1 the app would crash as well

thanks,
jirka

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Peter Zijlstra 1 month, 2 weeks ago

On Wed, Aug 20, 2025 at 02:19:15PM +0200, Jiri Olsa wrote:

> > This seems needlessly restrictive. Something like:
> > 
> > is_nop5(const char *buf)
> > {
> > 	struct insn insn;
> > 
> > 	ret = insn_decode_kernel(&insn, buf)
> > 	if (ret < 0)
> > 		return false;
> > 
> > 	if (insn.length != 5)
> > 		return false;
> > 
> > 	if (insn.opcode[0] != 0x0f ||
> > 	    insn.opcode[1] != 0x1f)
> > 	    	return false;
> > 
> > 	return true;
> > }
> > 
> > Should do I suppose.
> 
> ok, looks good, should I respin with this, or is follow up ok?

I cleaned up already; I pushed out these patches to queue/perf/core and
added a few of my own.

I will need to write better Changelogs, and post them, but I need to run
some errants first.

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Masami Hiramatsu (Google) 2 months, 1 week ago

On Sun, 20 Jul 2025 13:21:20 +0200
Jiri Olsa <jolsa@kernel.org> wrote:

> Putting together all the previously added pieces to support optimized
> uprobes on top of 5-byte nop instruction.
> 
> The current uprobe execution goes through following:
> 
>   - installs breakpoint instruction over original instruction
>   - exception handler hit and calls related uprobe consumers
>   - and either simulates original instruction or does out of line single step
>     execution of it
>   - returns to user space
> 
> The optimized uprobe path does following:
> 
>   - checks the original instruction is 5-byte nop (plus other checks)
>   - adds (or uses existing) user space trampoline with uprobe syscall
>   - overwrites original instruction (5-byte nop) with call to user space
>     trampoline
>   - the user space trampoline executes uprobe syscall that calls related uprobe
>     consumers
>   - trampoline returns back to next instruction
> 
> This approach won't speed up all uprobes as it's limited to using nop5 as
> original instruction, but we plan to use nop5 as USDT probe instruction
> (which currently uses single byte nop) and speed up the USDT probes.
> 
> The arch_uprobe_optimize triggers the uprobe optimization and is called after
> first uprobe hit. I originally had it called on uprobe installation but then
> it clashed with elf loader, because the user space trampoline was added in a
> place where loader might need to put elf segments, so I decided to do it after
> first uprobe hit when loading is done.
> 
> The uprobe is un-optimized in arch specific set_orig_insn call.
> 
> The instruction overwrite is x86 arch specific and needs to go through 3 updates:
> (on top of nop5 instruction)
> 
>   - write int3 into 1st byte
>   - write last 4 bytes of the call instruction
>   - update the call instruction opcode
> 
> And cleanup goes though similar reverse stages:
> 
>   - overwrite call opcode with breakpoint (int3)
>   - write last 4 bytes of the nop5 instruction
>   - write the nop5 first instruction byte
> 
> We do not unmap and release uprobe trampoline when it's no longer needed,
> because there's no easy way to make sure none of the threads is still
> inside the trampoline. But we do not waste memory, because there's just
> single page for all the uprobe trampoline mappings.
> 
> We do waste frame on page mapping for every 4GB by keeping the uprobe
> trampoline page mapped, but that seems ok.
> 
> We take the benefit from the fact that set_swbp and set_orig_insn are
> called under mmap_write_lock(mm), so we can use the current instruction
> as the state the uprobe is in - nop5/breakpoint/call trampoline -
> and decide the needed action (optimize/un-optimize) based on that.
> 
> Attaching the speed up from benchs/run_bench_uprobes.sh script:
> 
> current:
>         usermode-count :  152.604 ± 0.044M/s
>         syscall-count  :   13.359 ± 0.042M/s
> -->     uprobe-nop     :    3.229 ± 0.002M/s
>         uprobe-push    :    3.086 ± 0.004M/s
>         uprobe-ret     :    1.114 ± 0.004M/s
>         uprobe-nop5    :    1.121 ± 0.005M/s
>         uretprobe-nop  :    2.145 ± 0.002M/s
>         uretprobe-push :    2.070 ± 0.001M/s
>         uretprobe-ret  :    0.931 ± 0.001M/s
>         uretprobe-nop5 :    0.957 ± 0.001M/s
> 
> after the change:
>         usermode-count :  152.448 ± 0.244M/s
>         syscall-count  :   14.321 ± 0.059M/s
>         uprobe-nop     :    3.148 ± 0.007M/s
>         uprobe-push    :    2.976 ± 0.004M/s
>         uprobe-ret     :    1.068 ± 0.003M/s
> -->     uprobe-nop5    :    7.038 ± 0.007M/s
>         uretprobe-nop  :    2.109 ± 0.004M/s
>         uretprobe-push :    2.035 ± 0.001M/s
>         uretprobe-ret  :    0.908 ± 0.001M/s
>         uretprobe-nop5 :    3.377 ± 0.009M/s
> 
> I see bit more speed up on Intel (above) compared to AMD. The big nop5
> speed up is partly due to emulating nop5 and partly due to optimization.
> 
> The key speed up we do this for is the USDT switch from nop to nop5:
>         uprobe-nop     :    3.148 ± 0.007M/s
>         uprobe-nop5    :    7.038 ± 0.007M/s
> 

This also looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks!

> Acked-by: Andrii Nakryiko <andrii@kernel.org>
> Acked-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/include/asm/uprobes.h |   7 +
>  arch/x86/kernel/uprobes.c      | 283 ++++++++++++++++++++++++++++++++-
>  include/linux/uprobes.h        |   6 +-
>  kernel/events/uprobes.c        |  16 +-
>  4 files changed, 305 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
> index 678fb546f0a7..1ee2e5115955 100644
> --- a/arch/x86/include/asm/uprobes.h
> +++ b/arch/x86/include/asm/uprobes.h
> @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
>  #define UPROBE_SWBP_INSN		0xcc
>  #define UPROBE_SWBP_INSN_SIZE		   1
>  
> +enum {
> +	ARCH_UPROBE_FLAG_CAN_OPTIMIZE   = 0,
> +	ARCH_UPROBE_FLAG_OPTIMIZE_FAIL  = 1,
> +};
> +
>  struct uprobe_xol_ops;
>  
>  struct arch_uprobe {
> @@ -45,6 +50,8 @@ struct arch_uprobe {
>  			u8	ilen;
>  		}			push;
>  	};
> +
> +	unsigned long flags;
>  };
>  
>  struct arch_uprobe_task {
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index d18e1ae59901..209ce74ab93f 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -18,6 +18,7 @@
>  #include <asm/processor.h>
>  #include <asm/insn.h>
>  #include <asm/mmu_context.h>
> +#include <asm/nops.h>
>  
>  /* Post-execution fixups. */
>  
> @@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
>  	return tramp;
>  }
>  
> -__maybe_unused
>  static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
>  {
>  	struct uprobes_state *state = &current->mm->uprobes_state;
> @@ -891,6 +891,280 @@ static int __init arch_uprobes_init(void)
>  
>  late_initcall(arch_uprobes_init);
>  
> +enum {
> +	EXPECT_SWBP,
> +	EXPECT_CALL,
> +};
> +
> +struct write_opcode_ctx {
> +	unsigned long base;
> +	int expect;
> +};
> +
> +static int is_call_insn(uprobe_opcode_t *insn)
> +{
> +	return *insn == CALL_INSN_OPCODE;
> +}
> +
> +/*
> + * Verification callback used by int3_update uprobe_write calls to make sure
> + * the underlying instruction is as expected - either int3 or call.
> + */
> +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
> +		       int nbytes, void *data)
> +{
> +	struct write_opcode_ctx *ctx = data;
> +	uprobe_opcode_t old_opcode[5];
> +
> +	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
> +
> +	switch (ctx->expect) {
> +	case EXPECT_SWBP:
> +		if (is_swbp_insn(&old_opcode[0]))
> +			return 1;
> +		break;
> +	case EXPECT_CALL:
> +		if (is_call_insn(&old_opcode[0]))
> +			return 1;
> +		break;
> +	}
> +
> +	return -1;
> +}
> +
> +/*
> + * Modify multi-byte instructions by using INT3 breakpoints on SMP.
> + * We completely avoid using stop_machine() here, and achieve the
> + * synchronization using INT3 breakpoints and SMP cross-calls.
> + * (borrowed comment from smp_text_poke_batch_finish)
> + *
> + * The way it is done:
> + *   - Add an INT3 trap to the address that will be patched
> + *   - SMP sync all CPUs
> + *   - Update all but the first byte of the patched range
> + *   - SMP sync all CPUs
> + *   - Replace the first byte (INT3) by the first byte of the replacing opcode
> + *   - SMP sync all CPUs
> + */
> +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> +		       unsigned long vaddr, char *insn, bool optimize)
> +{
> +	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
> +	struct write_opcode_ctx ctx = {
> +		.base = vaddr,
> +	};
> +	int err;
> +
> +	/*
> +	 * Write int3 trap.
> +	 *
> +	 * The swbp_optimize path comes with breakpoint already installed,
> +	 * so we can skip this step for optimize == true.
> +	 */
> +	if (!optimize) {
> +		ctx.expect = EXPECT_CALL;
> +		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
> +				   true /* is_register */, false /* do_update_ref_ctr */,
> +				   &ctx);
> +		if (err)
> +			return err;
> +	}
> +
> +	smp_text_poke_sync_each_cpu();
> +
> +	/* Write all but the first byte of the patched range. */
> +	ctx.expect = EXPECT_SWBP;
> +	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
> +			   true /* is_register */, false /* do_update_ref_ctr */,
> +			   &ctx);
> +	if (err)
> +		return err;
> +
> +	smp_text_poke_sync_each_cpu();
> +
> +	/*
> +	 * Write first byte.
> +	 *
> +	 * The swbp_unoptimize needs to finish uprobe removal together
> +	 * with ref_ctr update, using uprobe_write with proper flags.
> +	 */
> +	err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
> +			   optimize /* is_register */, !optimize /* do_update_ref_ctr */,
> +			   &ctx);
> +	if (err)
> +		return err;
> +
> +	smp_text_poke_sync_each_cpu();
> +	return 0;
> +}
> +
> +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> +			 unsigned long vaddr, unsigned long tramp)
> +{
> +	u8 call[5];
> +
> +	__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
> +			(const void *) tramp, CALL_INSN_SIZE);
> +	return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
> +}
> +
> +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> +			   unsigned long vaddr)
> +{
> +	return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
> +}
> +
> +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
> +{
> +	unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
> +	struct vm_area_struct *vma;
> +	struct page *page;
> +
> +	page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
> +	if (IS_ERR(page))
> +		return PTR_ERR(page);
> +	uprobe_copy_from_page(page, vaddr, dst, len);
> +	put_page(page);
> +	return 0;
> +}
> +
> +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
> +{
> +	struct __packed __arch_relative_insn {
> +		u8 op;
> +		s32 raddr;
> +	} *call = (struct __arch_relative_insn *) insn;
> +
> +	if (!is_call_insn(insn))
> +		return false;
> +	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
> +}
> +
> +static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized)
> +{
> +	uprobe_opcode_t insn[5];
> +	int err;
> +
> +	err = copy_from_vaddr(mm, vaddr, &insn, 5);
> +	if (err)
> +		return err;
> +	*optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr);
> +	return 0;
> +}
> +
> +static bool should_optimize(struct arch_uprobe *auprobe)
> +{
> +	return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
> +		test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
> +}
> +
> +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> +	     unsigned long vaddr)
> +{
> +	if (should_optimize(auprobe)) {
> +		bool optimized = false;
> +		int err;
> +
> +		/*
> +		 * We could race with another thread that already optimized the probe,
> +		 * so let's not overwrite it with int3 again in this case.
> +		 */
> +		err = is_optimized(vma->vm_mm, vaddr, &optimized);
> +		if (err)
> +			return err;
> +		if (optimized)
> +			return 0;
> +	}
> +	return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
> +				   true /* is_register */);
> +}
> +
> +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> +		  unsigned long vaddr)
> +{
> +	if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
> +		struct mm_struct *mm = vma->vm_mm;
> +		bool optimized = false;
> +		int err;
> +
> +		err = is_optimized(mm, vaddr, &optimized);
> +		if (err)
> +			return err;
> +		if (optimized) {
> +			err = swbp_unoptimize(auprobe, vma, vaddr);
> +			WARN_ON_ONCE(err);
> +			return err;
> +		}
> +	}
> +	return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
> +				   false /* is_register */);
> +}
> +
> +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
> +				  unsigned long vaddr)
> +{
> +	struct uprobe_trampoline *tramp;
> +	struct vm_area_struct *vma;
> +	bool new = false;
> +	int err = 0;
> +
> +	vma = find_vma(mm, vaddr);
> +	if (!vma)
> +		return -EINVAL;
> +	tramp = get_uprobe_trampoline(vaddr, &new);
> +	if (!tramp)
> +		return -EINVAL;
> +	err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
> +	if (WARN_ON_ONCE(err) && new)
> +		destroy_uprobe_trampoline(tramp);
> +	return err;
> +}
> +
> +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> +{
> +	struct mm_struct *mm = current->mm;
> +	uprobe_opcode_t insn[5];
> +
> +	/*
> +	 * Do not optimize if shadow stack is enabled, the return address hijack
> +	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
> +	 * the entry uprobe is optimized and the shadow stack crashes the app.
> +	 */
> +	if (shstk_is_enabled())
> +		return;
> +
> +	if (!should_optimize(auprobe))
> +		return;
> +
> +	mmap_write_lock(mm);
> +
> +	/*
> +	 * Check if some other thread already optimized the uprobe for us,
> +	 * if it's the case just go away silently.
> +	 */
> +	if (copy_from_vaddr(mm, vaddr, &insn, 5))
> +		goto unlock;
> +	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
> +		goto unlock;
> +
> +	/*
> +	 * If we fail to optimize the uprobe we set the fail bit so the
> +	 * above should_optimize will fail from now on.
> +	 */
> +	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
> +		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
> +
> +unlock:
> +	mmap_write_unlock(mm);
> +}
> +
> +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> +{
> +	if (memcmp(&auprobe->insn, x86_nops[5], 5))
> +		return false;
> +	/* We can't do cross page atomic writes yet. */
> +	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
> +}
>  #else /* 32-bit: */
>  /*
>   * No RIP-relative addressing on 32-bit
> @@ -904,6 +1178,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
>  static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
>  {
>  }
> +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> +{
> +	return false;
> +}
>  #endif /* CONFIG_X86_64 */
>  
>  struct uprobe_xol_ops {
> @@ -1270,6 +1548,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
>  	if (ret)
>  		return ret;
>  
> +	if (can_optimize(auprobe, addr))
> +		set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
> +
>  	ret = branch_setup_xol_ops(auprobe, &insn);
>  	if (ret != -ENOSYS)
>  		return ret;
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index b6b077cc7d0f..08ef78439d0d 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -192,7 +192,7 @@ struct uprobes_state {
>  };
>  
>  typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
> -				     uprobe_opcode_t *insn, int nbytes);
> +				     uprobe_opcode_t *insn, int nbytes, void *data);
>  
>  extern void __init uprobes_init(void);
>  extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
> @@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
>  extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
>  			       bool is_register);
>  extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
> -			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr);
> +			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
> +			void *data);
>  extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
>  extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
>  extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
> @@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *
>  extern void arch_uprobe_clear_state(struct mm_struct *mm);
>  extern void arch_uprobe_init_state(struct mm_struct *mm);
>  extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
> +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
>  #else /* !CONFIG_UPROBES */
>  struct uprobes_state {
>  };
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index cbba31c0495f..e54081beeab9 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
>  }
>  
>  static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
> -			 int nbytes)
> +			 int nbytes, void *data)
>  {
>  	uprobe_opcode_t old_opcode;
>  	bool is_swbp;
> @@ -492,12 +492,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
>  		bool is_register)
>  {
>  	return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
> -			    verify_opcode, is_register, true /* do_update_ref_ctr */);
> +			    verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
>  }
>  
>  int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
>  		 const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
> -		 uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr)
> +		 uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
> +		 void *data)
>  {
>  	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
>  	struct mm_struct *mm = vma->vm_mm;
> @@ -531,7 +532,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
>  		goto out;
>  	folio = page_folio(page);
>  
> -	ret = verify(page, insn_vaddr, insn, nbytes);
> +	ret = verify(page, insn_vaddr, insn, nbytes, data);
>  	if (ret <= 0) {
>  		folio_put(folio);
>  		goto out;
> @@ -2697,6 +2698,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
>  	return true;
>  }
>  
> +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
> +{
> +}
> +
>  /*
>   * Run handler and ask thread to singlestep.
>   * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
> @@ -2761,6 +2766,9 @@ static void handle_swbp(struct pt_regs *regs)
>  
>  	handler_chain(uprobe, regs);
>  
> +	/* Try to optimize after first hit. */
> +	arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
> +
>  	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
>  		goto out;
>  
> -- 
> 2.50.1
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 2 months, 1 week ago

On Fri, Jul 25, 2025 at 07:13:18PM +0900, Masami Hiramatsu wrote:
> On Sun, 20 Jul 2025 13:21:20 +0200
> Jiri Olsa <jolsa@kernel.org> wrote:
> 
> > Putting together all the previously added pieces to support optimized
> > uprobes on top of 5-byte nop instruction.
> > 
> > The current uprobe execution goes through following:
> > 
> >   - installs breakpoint instruction over original instruction
> >   - exception handler hit and calls related uprobe consumers
> >   - and either simulates original instruction or does out of line single step
> >     execution of it
> >   - returns to user space
> > 
> > The optimized uprobe path does following:
> > 
> >   - checks the original instruction is 5-byte nop (plus other checks)
> >   - adds (or uses existing) user space trampoline with uprobe syscall
> >   - overwrites original instruction (5-byte nop) with call to user space
> >     trampoline
> >   - the user space trampoline executes uprobe syscall that calls related uprobe
> >     consumers
> >   - trampoline returns back to next instruction
> > 
> > This approach won't speed up all uprobes as it's limited to using nop5 as
> > original instruction, but we plan to use nop5 as USDT probe instruction
> > (which currently uses single byte nop) and speed up the USDT probes.
> > 
> > The arch_uprobe_optimize triggers the uprobe optimization and is called after
> > first uprobe hit. I originally had it called on uprobe installation but then
> > it clashed with elf loader, because the user space trampoline was added in a
> > place where loader might need to put elf segments, so I decided to do it after
> > first uprobe hit when loading is done.
> > 
> > The uprobe is un-optimized in arch specific set_orig_insn call.
> > 
> > The instruction overwrite is x86 arch specific and needs to go through 3 updates:
> > (on top of nop5 instruction)
> > 
> >   - write int3 into 1st byte
> >   - write last 4 bytes of the call instruction
> >   - update the call instruction opcode
> > 
> > And cleanup goes though similar reverse stages:
> > 
> >   - overwrite call opcode with breakpoint (int3)
> >   - write last 4 bytes of the nop5 instruction
> >   - write the nop5 first instruction byte
> > 
> > We do not unmap and release uprobe trampoline when it's no longer needed,
> > because there's no easy way to make sure none of the threads is still
> > inside the trampoline. But we do not waste memory, because there's just
> > single page for all the uprobe trampoline mappings.
> > 
> > We do waste frame on page mapping for every 4GB by keeping the uprobe
> > trampoline page mapped, but that seems ok.
> > 
> > We take the benefit from the fact that set_swbp and set_orig_insn are
> > called under mmap_write_lock(mm), so we can use the current instruction
> > as the state the uprobe is in - nop5/breakpoint/call trampoline -
> > and decide the needed action (optimize/un-optimize) based on that.
> > 
> > Attaching the speed up from benchs/run_bench_uprobes.sh script:
> > 
> > current:
> >         usermode-count :  152.604 ± 0.044M/s
> >         syscall-count  :   13.359 ± 0.042M/s
> > -->     uprobe-nop     :    3.229 ± 0.002M/s
> >         uprobe-push    :    3.086 ± 0.004M/s
> >         uprobe-ret     :    1.114 ± 0.004M/s
> >         uprobe-nop5    :    1.121 ± 0.005M/s
> >         uretprobe-nop  :    2.145 ± 0.002M/s
> >         uretprobe-push :    2.070 ± 0.001M/s
> >         uretprobe-ret  :    0.931 ± 0.001M/s
> >         uretprobe-nop5 :    0.957 ± 0.001M/s
> > 
> > after the change:
> >         usermode-count :  152.448 ± 0.244M/s
> >         syscall-count  :   14.321 ± 0.059M/s
> >         uprobe-nop     :    3.148 ± 0.007M/s
> >         uprobe-push    :    2.976 ± 0.004M/s
> >         uprobe-ret     :    1.068 ± 0.003M/s
> > -->     uprobe-nop5    :    7.038 ± 0.007M/s
> >         uretprobe-nop  :    2.109 ± 0.004M/s
> >         uretprobe-push :    2.035 ± 0.001M/s
> >         uretprobe-ret  :    0.908 ± 0.001M/s
> >         uretprobe-nop5 :    3.377 ± 0.009M/s
> > 
> > I see bit more speed up on Intel (above) compared to AMD. The big nop5
> > speed up is partly due to emulating nop5 and partly due to optimization.
> > 
> > The key speed up we do this for is the USDT switch from nop to nop5:
> >         uprobe-nop     :    3.148 ± 0.007M/s
> >         uprobe-nop5    :    7.038 ± 0.007M/s
> > 
> 
> This also looks good to me.
> 
> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

thanks!

Peter, do you have more comments?

thanks,
jirka

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Peter Zijlstra 1 month, 2 weeks ago

On Mon, Jul 28, 2025 at 11:34:56PM +0200, Jiri Olsa wrote:

> Peter, do you have more comments?

I'm not really a fan of this syscall is faster than exception stuff. Yes
it is for current hardware, but I suspect much of this will be a
maintenance burden 'soon'.

Anyway, I'll queue the patches tomorrow. I think the shadow stack thing
wants fixing though. The rest we can prod at whenever.

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 1 month, 2 weeks ago

On Tue, Aug 19, 2025 at 09:17:44PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 28, 2025 at 11:34:56PM +0200, Jiri Olsa wrote:
> 
> > Peter, do you have more comments?
> 
> I'm not really a fan of this syscall is faster than exception stuff. Yes
> it is for current hardware, but I suspect much of this will be a
> maintenance burden 'soon'.
> 
> Anyway, I'll queue the patches tomorrow. I think the shadow stack thing
> wants fixing though. The rest we can prod at whenever.

ok, will send follow up

thanks,
jirka

Re: [PATCHv6 perf/core 10/22] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 1 month, 4 weeks ago

ping, thanks

On Mon, Jul 28, 2025 at 11:34:56PM +0200, Jiri Olsa wrote:
> On Fri, Jul 25, 2025 at 07:13:18PM +0900, Masami Hiramatsu wrote:
> > On Sun, 20 Jul 2025 13:21:20 +0200
> > Jiri Olsa <jolsa@kernel.org> wrote:
> > 
> > > Putting together all the previously added pieces to support optimized
> > > uprobes on top of 5-byte nop instruction.
> > > 
> > > The current uprobe execution goes through following:
> > > 
> > >   - installs breakpoint instruction over original instruction
> > >   - exception handler hit and calls related uprobe consumers
> > >   - and either simulates original instruction or does out of line single step
> > >     execution of it
> > >   - returns to user space
> > > 
> > > The optimized uprobe path does following:
> > > 
> > >   - checks the original instruction is 5-byte nop (plus other checks)
> > >   - adds (or uses existing) user space trampoline with uprobe syscall
> > >   - overwrites original instruction (5-byte nop) with call to user space
> > >     trampoline
> > >   - the user space trampoline executes uprobe syscall that calls related uprobe
> > >     consumers
> > >   - trampoline returns back to next instruction
> > > 
> > > This approach won't speed up all uprobes as it's limited to using nop5 as
> > > original instruction, but we plan to use nop5 as USDT probe instruction
> > > (which currently uses single byte nop) and speed up the USDT probes.
> > > 
> > > The arch_uprobe_optimize triggers the uprobe optimization and is called after
> > > first uprobe hit. I originally had it called on uprobe installation but then
> > > it clashed with elf loader, because the user space trampoline was added in a
> > > place where loader might need to put elf segments, so I decided to do it after
> > > first uprobe hit when loading is done.
> > > 
> > > The uprobe is un-optimized in arch specific set_orig_insn call.
> > > 
> > > The instruction overwrite is x86 arch specific and needs to go through 3 updates:
> > > (on top of nop5 instruction)
> > > 
> > >   - write int3 into 1st byte
> > >   - write last 4 bytes of the call instruction
> > >   - update the call instruction opcode
> > > 
> > > And cleanup goes though similar reverse stages:
> > > 
> > >   - overwrite call opcode with breakpoint (int3)
> > >   - write last 4 bytes of the nop5 instruction
> > >   - write the nop5 first instruction byte
> > > 
> > > We do not unmap and release uprobe trampoline when it's no longer needed,
> > > because there's no easy way to make sure none of the threads is still
> > > inside the trampoline. But we do not waste memory, because there's just
> > > single page for all the uprobe trampoline mappings.
> > > 
> > > We do waste frame on page mapping for every 4GB by keeping the uprobe
> > > trampoline page mapped, but that seems ok.
> > > 
> > > We take the benefit from the fact that set_swbp and set_orig_insn are
> > > called under mmap_write_lock(mm), so we can use the current instruction
> > > as the state the uprobe is in - nop5/breakpoint/call trampoline -
> > > and decide the needed action (optimize/un-optimize) based on that.
> > > 
> > > Attaching the speed up from benchs/run_bench_uprobes.sh script:
> > > 
> > > current:
> > >         usermode-count :  152.604 ± 0.044M/s
> > >         syscall-count  :   13.359 ± 0.042M/s
> > > -->     uprobe-nop     :    3.229 ± 0.002M/s
> > >         uprobe-push    :    3.086 ± 0.004M/s
> > >         uprobe-ret     :    1.114 ± 0.004M/s
> > >         uprobe-nop5    :    1.121 ± 0.005M/s
> > >         uretprobe-nop  :    2.145 ± 0.002M/s
> > >         uretprobe-push :    2.070 ± 0.001M/s
> > >         uretprobe-ret  :    0.931 ± 0.001M/s
> > >         uretprobe-nop5 :    0.957 ± 0.001M/s
> > > 
> > > after the change:
> > >         usermode-count :  152.448 ± 0.244M/s
> > >         syscall-count  :   14.321 ± 0.059M/s
> > >         uprobe-nop     :    3.148 ± 0.007M/s
> > >         uprobe-push    :    2.976 ± 0.004M/s
> > >         uprobe-ret     :    1.068 ± 0.003M/s
> > > -->     uprobe-nop5    :    7.038 ± 0.007M/s
> > >         uretprobe-nop  :    2.109 ± 0.004M/s
> > >         uretprobe-push :    2.035 ± 0.001M/s
> > >         uretprobe-ret  :    0.908 ± 0.001M/s
> > >         uretprobe-nop5 :    3.377 ± 0.009M/s
> > > 
> > > I see bit more speed up on Intel (above) compared to AMD. The big nop5
> > > speed up is partly due to emulating nop5 and partly due to optimization.
> > > 
> > > The key speed up we do this for is the USDT switch from nop to nop5:
> > >         uprobe-nop     :    3.148 ± 0.007M/s
> > >         uprobe-nop5    :    7.038 ± 0.007M/s
> > > 
> > 
> > This also looks good to me.
> > 
> > Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> thanks!
> 
> Peter, do you have more comments?
> 
> thanks,
> jirka

[tip: perf/core] uprobes/x86: Add support to optimize uprobes

Posted by tip-bot2 for Jiri Olsa 1 month, 1 week ago

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     ba2bfc97b4629b10bd8d02b36e04f3932a04cac4
Gitweb:        https://git.kernel.org/tip/ba2bfc97b4629b10bd8d02b36e04f3932a04cac4
Author:        Jiri Olsa <jolsa@kernel.org>
AuthorDate:    Sun, 20 Jul 2025 13:21:20 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 21 Aug 2025 20:09:21 +02:00

uprobes/x86: Add support to optimize uprobes

Putting together all the previously added pieces to support optimized
uprobes on top of 5-byte nop instruction.

The current uprobe execution goes through following:

  - installs breakpoint instruction over original instruction
  - exception handler hit and calls related uprobe consumers
  - and either simulates original instruction or does out of line single step
    execution of it
  - returns to user space

The optimized uprobe path does following:

  - checks the original instruction is 5-byte nop (plus other checks)
  - adds (or uses existing) user space trampoline with uprobe syscall
  - overwrites original instruction (5-byte nop) with call to user space
    trampoline
  - the user space trampoline executes uprobe syscall that calls related uprobe
    consumers
  - trampoline returns back to next instruction

This approach won't speed up all uprobes as it's limited to using nop5 as
original instruction, but we plan to use nop5 as USDT probe instruction
(which currently uses single byte nop) and speed up the USDT probes.

The arch_uprobe_optimize triggers the uprobe optimization and is called after
first uprobe hit. I originally had it called on uprobe installation but then
it clashed with elf loader, because the user space trampoline was added in a
place where loader might need to put elf segments, so I decided to do it after
first uprobe hit when loading is done.

The uprobe is un-optimized in arch specific set_orig_insn call.

The instruction overwrite is x86 arch specific and needs to go through 3 updates:
(on top of nop5 instruction)

  - write int3 into 1st byte
  - write last 4 bytes of the call instruction
  - update the call instruction opcode

And cleanup goes though similar reverse stages:

  - overwrite call opcode with breakpoint (int3)
  - write last 4 bytes of the nop5 instruction
  - write the nop5 first instruction byte

We do not unmap and release uprobe trampoline when it's no longer needed,
because there's no easy way to make sure none of the threads is still
inside the trampoline. But we do not waste memory, because there's just
single page for all the uprobe trampoline mappings.

We do waste frame on page mapping for every 4GB by keeping the uprobe
trampoline page mapped, but that seems ok.

We take the benefit from the fact that set_swbp and set_orig_insn are
called under mmap_write_lock(mm), so we can use the current instruction
as the state the uprobe is in - nop5/breakpoint/call trampoline -
and decide the needed action (optimize/un-optimize) based on that.

Attaching the speed up from benchs/run_bench_uprobes.sh script:

current:
        usermode-count :  152.604 ± 0.044M/s
        syscall-count  :   13.359 ± 0.042M/s
-->     uprobe-nop     :    3.229 ± 0.002M/s
        uprobe-push    :    3.086 ± 0.004M/s
        uprobe-ret     :    1.114 ± 0.004M/s
        uprobe-nop5    :    1.121 ± 0.005M/s
        uretprobe-nop  :    2.145 ± 0.002M/s
        uretprobe-push :    2.070 ± 0.001M/s
        uretprobe-ret  :    0.931 ± 0.001M/s
        uretprobe-nop5 :    0.957 ± 0.001M/s

after the change:
        usermode-count :  152.448 ± 0.244M/s
        syscall-count  :   14.321 ± 0.059M/s
        uprobe-nop     :    3.148 ± 0.007M/s
        uprobe-push    :    2.976 ± 0.004M/s
        uprobe-ret     :    1.068 ± 0.003M/s
-->     uprobe-nop5    :    7.038 ± 0.007M/s
        uretprobe-nop  :    2.109 ± 0.004M/s
        uretprobe-push :    2.035 ± 0.001M/s
        uretprobe-ret  :    0.908 ± 0.001M/s
        uretprobe-nop5 :    3.377 ± 0.009M/s

I see bit more speed up on Intel (above) compared to AMD. The big nop5
speed up is partly due to emulating nop5 and partly due to optimization.

The key speed up we do this for is the USDT switch from nop to nop5:
        uprobe-nop     :    3.148 ± 0.007M/s
        uprobe-nop5    :    7.038 ± 0.007M/s

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20250720112133.244369-11-jolsa@kernel.org
---
 arch/x86/include/asm/uprobes.h |   7 +-
 arch/x86/kernel/uprobes.c      | 283 +++++++++++++++++++++++++++++++-
 include/linux/uprobes.h        |   6 +-
 kernel/events/uprobes.c        |  16 +-
 4 files changed, 305 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb54..1ee2e51 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		   1
 
+enum {
+	ARCH_UPROBE_FLAG_CAN_OPTIMIZE   = 0,
+	ARCH_UPROBE_FLAG_OPTIMIZE_FAIL  = 1,
+};
+
 struct uprobe_xol_ops;
 
 struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
 			u8	ilen;
 		}			push;
 	};
+
+	unsigned long flags;
 };
 
 struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index d18e1ae..209ce74 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/insn.h>
 #include <asm/mmu_context.h>
+#include <asm/nops.h>
 
 /* Post-execution fixups. */
 
@@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
 	return tramp;
 }
 
-__maybe_unused
 static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
 {
 	struct uprobes_state *state = &current->mm->uprobes_state;
@@ -891,6 +891,280 @@ static int __init arch_uprobes_init(void)
 
 late_initcall(arch_uprobes_init);
 
+enum {
+	EXPECT_SWBP,
+	EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+	unsigned long base;
+	int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+	return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+		       int nbytes, void *data)
+{
+	struct write_opcode_ctx *ctx = data;
+	uprobe_opcode_t old_opcode[5];
+
+	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+	switch (ctx->expect) {
+	case EXPECT_SWBP:
+		if (is_swbp_insn(&old_opcode[0]))
+			return 1;
+		break;
+	case EXPECT_CALL:
+		if (is_call_insn(&old_opcode[0]))
+			return 1;
+		break;
+	}
+
+	return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ *   - Add an INT3 trap to the address that will be patched
+ *   - SMP sync all CPUs
+ *   - Update all but the first byte of the patched range
+ *   - SMP sync all CPUs
+ *   - Replace the first byte (INT3) by the first byte of the replacing opcode
+ *   - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+		       unsigned long vaddr, char *insn, bool optimize)
+{
+	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+	struct write_opcode_ctx ctx = {
+		.base = vaddr,
+	};
+	int err;
+
+	/*
+	 * Write int3 trap.
+	 *
+	 * The swbp_optimize path comes with breakpoint already installed,
+	 * so we can skip this step for optimize == true.
+	 */
+	if (!optimize) {
+		ctx.expect = EXPECT_CALL;
+		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+				   true /* is_register */, false /* do_update_ref_ctr */,
+				   &ctx);
+		if (err)
+			return err;
+	}
+
+	smp_text_poke_sync_each_cpu();
+
+	/* Write all but the first byte of the patched range. */
+	ctx.expect = EXPECT_SWBP;
+	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+			   true /* is_register */, false /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
+
+	smp_text_poke_sync_each_cpu();
+
+	/*
+	 * Write first byte.
+	 *
+	 * The swbp_unoptimize needs to finish uprobe removal together
+	 * with ref_ctr update, using uprobe_write with proper flags.
+	 */
+	err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+			   optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
+
+	smp_text_poke_sync_each_cpu();
+	return 0;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+			 unsigned long vaddr, unsigned long tramp)
+{
+	u8 call[5];
+
+	__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+			(const void *) tramp, CALL_INSN_SIZE);
+	return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+			   unsigned long vaddr)
+{
+	return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+	unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+	struct vm_area_struct *vma;
+	struct page *page;
+
+	page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	uprobe_copy_from_page(page, vaddr, dst, len);
+	put_page(page);
+	return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+	struct __packed __arch_relative_insn {
+		u8 op;
+		s32 raddr;
+	} *call = (struct __arch_relative_insn *) insn;
+
+	if (!is_call_insn(insn))
+		return false;
+	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized)
+{
+	uprobe_opcode_t insn[5];
+	int err;
+
+	err = copy_from_vaddr(mm, vaddr, &insn, 5);
+	if (err)
+		return err;
+	*optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+	return 0;
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+	return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+		test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+	     unsigned long vaddr)
+{
+	if (should_optimize(auprobe)) {
+		bool optimized = false;
+		int err;
+
+		/*
+		 * We could race with another thread that already optimized the probe,
+		 * so let's not overwrite it with int3 again in this case.
+		 */
+		err = is_optimized(vma->vm_mm, vaddr, &optimized);
+		if (err)
+			return err;
+		if (optimized)
+			return 0;
+	}
+	return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+				   true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+		  unsigned long vaddr)
+{
+	if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+		struct mm_struct *mm = vma->vm_mm;
+		bool optimized = false;
+		int err;
+
+		err = is_optimized(mm, vaddr, &optimized);
+		if (err)
+			return err;
+		if (optimized) {
+			err = swbp_unoptimize(auprobe, vma, vaddr);
+			WARN_ON_ONCE(err);
+			return err;
+		}
+	}
+	return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+				   false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+				  unsigned long vaddr)
+{
+	struct uprobe_trampoline *tramp;
+	struct vm_area_struct *vma;
+	bool new = false;
+	int err = 0;
+
+	vma = find_vma(mm, vaddr);
+	if (!vma)
+		return -EINVAL;
+	tramp = get_uprobe_trampoline(vaddr, &new);
+	if (!tramp)
+		return -EINVAL;
+	err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+	if (WARN_ON_ONCE(err) && new)
+		destroy_uprobe_trampoline(tramp);
+	return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	uprobe_opcode_t insn[5];
+
+	/*
+	 * Do not optimize if shadow stack is enabled, the return address hijack
+	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
+	 * the entry uprobe is optimized and the shadow stack crashes the app.
+	 */
+	if (shstk_is_enabled())
+		return;
+
+	if (!should_optimize(auprobe))
+		return;
+
+	mmap_write_lock(mm);
+
+	/*
+	 * Check if some other thread already optimized the uprobe for us,
+	 * if it's the case just go away silently.
+	 */
+	if (copy_from_vaddr(mm, vaddr, &insn, 5))
+		goto unlock;
+	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+		goto unlock;
+
+	/*
+	 * If we fail to optimize the uprobe we set the fail bit so the
+	 * above should_optimize will fail from now on.
+	 */
+	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+	mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	if (memcmp(&auprobe->insn, x86_nops[5], 5))
+		return false;
+	/* We can't do cross page atomic writes yet. */
+	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
 #else /* 32-bit: */
 /*
  * No RIP-relative addressing on 32-bit
@@ -904,6 +1178,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 }
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	return false;
+}
 #endif /* CONFIG_X86_64 */
 
 struct uprobe_xol_ops {
@@ -1270,6 +1548,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, 
 	if (ret)
 		return ret;
 
+	if (can_optimize(auprobe, addr))
+		set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
 	ret = branch_setup_xol_ops(auprobe, &insn);
 	if (ret != -ENOSYS)
 		return ret;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b6b077c..08ef784 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -192,7 +192,7 @@ struct uprobes_state {
 };
 
 typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
-				     uprobe_opcode_t *insn, int nbytes);
+				     uprobe_opcode_t *insn, int nbytes, void *data);
 
 extern void __init uprobes_init(void);
 extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
 			       bool is_register);
 extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
-			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr);
+			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+			void *data);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *
 extern void arch_uprobe_clear_state(struct mm_struct *mm);
 extern void arch_uprobe_init_state(struct mm_struct *mm);
 extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index eb07e60..4a194d7 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
 }
 
 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
-			 int nbytes)
+			 int nbytes, void *data)
 {
 	uprobe_opcode_t old_opcode;
 	bool is_swbp;
@@ -491,12 +491,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		bool is_register)
 {
 	return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
-			    verify_opcode, is_register, true /* do_update_ref_ctr */);
+			    verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
 }
 
 int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		 const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
-		 uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr)
+		 uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+		 void *data)
 {
 	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
 	struct mm_struct *mm = vma->vm_mm;
@@ -530,7 +531,7 @@ retry:
 		goto out;
 	folio = page_folio(page);
 
-	ret = verify(page, insn_vaddr, insn, nbytes);
+	ret = verify(page, insn_vaddr, insn, nbytes, data);
 	if (ret <= 0) {
 		folio_put(folio);
 		goto out;
@@ -2696,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
 	return true;
 }
 
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2760,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)
 
 	handler_chain(uprobe, regs);
 
+	/* Try to optimize after first hit. */
+	arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
 	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
 		goto out;