[RFC perf/core 07/11] uprobes/x86: Add support to optimize uprobes

Posted by Jiri Olsa 2 weeks, 4 days ago
Putting together all the previously added pieces to support optimized
uprobes on top of 5-byte nop instruction.

The current uprobe execution goes through the following steps:
  - install a breakpoint instruction over the original instruction
  - hit the exception handler, which calls the related uprobe consumers
  - either simulate the original instruction or single-step it out of line
  - return to user space

The optimized uprobe path:

  - checks that the original instruction is a 5-byte nop (plus other checks)
  - adds (or reuses an existing) user space trampoline and overwrites the
    original instruction (5-byte nop) with a call to the user space trampoline
  - the user space trampoline executes the uprobe syscall, which calls the
    related uprobe consumers
  - the trampoline returns to the next instruction
This approach won't speed up all uprobes, as it is limited to nop5 as the
original instruction, but we could use nop5 as the USDT probe instruction
(which currently uses a single-byte nop) and speed up USDT probes.
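
For example, a probe site emitting a 5-byte nop instead of today's single-byte
nop could look roughly like the user space sketch below (the macro is made up
for illustration and leaves out the ELF note that real sys/sdt.h probes emit):

  #include <stdio.h>

  /* hypothetical probe macro emitting the same 5-byte nop as x86_nops[5] */
  #define PROBE_NOP5() \
          asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00" ::: "memory")

  int main(void)
  {
          PROBE_NOP5();   /* attach the uprobe here; it can then be optimized */
          printf("probe site hit\n");
          return 0;
  }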

This patch overloads the related arch functions in uprobe_write_opcode and
set_orig_insn so they can install the call instruction if needed.

arch_uprobe_optimize triggers the uprobe optimization and is called after the
first uprobe hit. I originally had it called on uprobe installation, but that
clashed with the elf loader, because the user space trampoline was added in a
place where the loader might need to put elf segments, so I decided to do it
after the first uprobe hit, when loading is done.

TODO: release the uprobe trampoline when it's no longer needed.. we might need
to stop all cpus to make sure no user space thread is in the trampoline.. or
we might just keep it, because there's just one 4GB memory region?

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/include/asm/uprobes.h |   7 ++
 arch/x86/kernel/uprobes.c      | 130 +++++++++++++++++++++++++++++++++
 include/linux/uprobes.h        |   1 +
 kernel/events/uprobes.c        |   3 +
 4 files changed, 141 insertions(+)

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..84a75ed748f0 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		   1
 
+enum {
+	ARCH_UPROBE_FLAG_CAN_OPTIMIZE	= 0,
+	ARCH_UPROBE_FLAG_OPTIMIZED	= 1,
+};
+
 struct uprobe_xol_ops;
 
 struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
 			u8	ilen;
 		}			push;
 	};
+
+	unsigned long flags;
 };
 
 struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 02aa4519b677..50ccf24ff42c 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/insn.h>
 #include <asm/mmu_context.h>
+#include <asm/nops.h>
 
 /* Post-execution fixups. */
 
@@ -877,6 +878,33 @@ static const struct uprobe_xol_ops push_xol_ops = {
 	.emulate  = push_emulate_op,
 };
 
+static int is_nop5_insns(uprobe_opcode_t *insn)
+{
+	return !memcmp(insn, x86_nops[5], 5);
+}
+
+static int is_call_insns(uprobe_opcode_t *insn)
+{
+	return *insn == 0xe8;
+}
+
+static void relative_insn(void *dest, void *from, void *to, u8 op)
+{
+	struct __arch_relative_insn {
+		u8 op;
+		s32 raddr;
+	} __packed *insn;
+
+	insn = (struct __arch_relative_insn *)dest;
+	insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+	insn->op = op;
+}
+
+static void relative_call(void *dest, void *from, void *to)
+{
+	relative_insn(dest, from, to, CALL_INSN_OPCODE);
+}
+
 /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
 static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 {
@@ -896,6 +924,10 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 		break;
 
 	case 0x0f:
+		if (is_nop5_insns((uprobe_opcode_t *) &auprobe->insn)) {
+			set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+			break;
+		}
 		if (insn->opcode.nbytes != 2)
 			return -ENOSYS;
 		/*
@@ -1267,3 +1299,101 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
 	else
 		return regs->sp <= ret->stack;
 }
+
+int arch_uprobe_verify_opcode(struct page *page, unsigned long vaddr,
+			      uprobe_opcode_t *new_opcode, void *opt)
+{
+	if (opt) {
+		uprobe_opcode_t old_opcode[5];
+		bool is_call;
+
+		uprobe_copy_from_page(page, vaddr, (uprobe_opcode_t *) &old_opcode, 5);
+		is_call = is_call_insns((uprobe_opcode_t *) &old_opcode);
+
+		if (is_call_insns(new_opcode)) {
+			if (is_call)		/* register: already installed? */
+				return 0;
+		} else {
+			if (!is_call)		/* unregister: was it changed by us? */
+				return 0;
+		}
+
+		return 1;
+	}
+
+	return uprobe_verify_opcode(page, vaddr, new_opcode);
+}
+
+bool arch_uprobe_is_register(uprobe_opcode_t *insn, int len, void *data)
+{
+	return data ? len == 5 && is_call_insns(insn) : is_swbp_insn(insn);
+}
+
+static void __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+				   unsigned long vaddr)
+{
+	struct tramp_area *area = NULL;
+	char call[5];
+
+	/* We can't do cross page atomic writes yet. */
+	if (PAGE_SIZE - (vaddr & ~PAGE_MASK) < 5)
+		goto fail;
+
+	area = get_tramp_area(vaddr);
+	if (!area)
+		goto fail;
+
+	relative_call(call, (void *) vaddr, (void *) area->vaddr);
+	if (uprobe_write_opcode(auprobe, mm, vaddr, call, 5, (void *) 1))
+		goto fail;
+
+	set_bit(ARCH_UPROBE_FLAG_OPTIMIZED, &auprobe->flags);
+	return;
+
+fail:
+	/* Once we fail we never try again. */
+	put_tramp_area(area);
+	clear_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+	if (!test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags))
+		return false;
+	if (test_bit(ARCH_UPROBE_FLAG_OPTIMIZED, &auprobe->flags))
+		return false;
+	return true;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (!should_optimize(auprobe))
+		return;
+
+	mmap_write_lock(mm);
+	if (should_optimize(auprobe))
+		__arch_uprobe_optimize(auprobe, mm, vaddr);
+	mmap_write_unlock(mm);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+	uprobe_opcode_t *insn = (uprobe_opcode_t *) auprobe->insn;
+
+	if (test_bit(ARCH_UPROBE_FLAG_OPTIMIZED, &auprobe->flags))
+		return uprobe_write_opcode(auprobe, mm, vaddr, insn, 5, (void *) 1);
+
+	return uprobe_write_opcode(auprobe, mm, vaddr, insn, UPROBE_SWBP_INSN_SIZE, NULL);
+}
+
+bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr)
+{
+	unsigned long delta;
+
+	/* call instructions size */
+	vaddr += 5;
+	delta = vaddr < vtramp ? vtramp - vaddr : vaddr - vtramp;
+	return delta < 0xffffffff;
+}
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 4024e6ea52a4..42ab29f80220 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -233,6 +233,7 @@ void put_tramp_area(struct tramp_area *area);
 bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr);
 extern void *arch_uprobe_trampoline(unsigned long *psize);
 extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b8399684231c..efe45fcd5d0a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2759,6 +2759,9 @@ static void handle_swbp(struct pt_regs *regs)
 
 	handler_chain(uprobe, regs);
 
+	/* Try to optimize after first hit. */
+	arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
 	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
 		goto out;
 
-- 
2.47.0
Re: [RFC perf/core 07/11] uprobes/x86: Add support to optimize uprobes
Posted by Masami Hiramatsu (Google) 6 days, 3 hours ago
On Tue,  5 Nov 2024 14:34:01 +0100
Jiri Olsa <jolsa@kernel.org> wrote:

> Putting together all the previously added pieces to support optimized
> uprobes on top of 5-byte nop instruction.
> 
> The current uprobe execution goes through following:
>   - installs breakpoint instruction over original instruction
>   - exception handler hit and calls related uprobe consumers
>   - and either simulates original instruction or does out of line single step
>     execution of it
>   - returns to user space
> 
> The optimized uprobe path
> 
>   - checks the original instruction is 5-byte nop (plus other checks)
>   - adds (or uses existing) user space trampoline and overwrites original
>     instruction (5-byte nop) with call to user space trampoline
>   - the user space trampoline executes uprobe syscall that calls related uprobe
>     consumers
>   - trampoline returns back to next instruction
> 
> This approach won't speed up all uprobes as it's limited to using nop5 as
> original instruction, but we could use nop5 as USDT probe instruction (which
> uses single byte nop ATM) and speed up the USDT probes.
> 
> This patch overloads related arch functions in uprobe_write_opcode and
> set_orig_insn so they can install call instruction if needed.
> 
> The arch_uprobe_optimize triggers the uprobe optimization and is called after
> first uprobe hit. I originally had it called on uprobe installation but then
> it clashed with elf loader, because the user space trampoline was added in a
> place where loader might need to put elf segments, so I decided to do it after
> first uprobe hit when loading is done.
> 
> TODO release uprobe trampoline when it's no longer needed.. we might need to
> stop all cpus to make sure no user space thread is in the trampoline.. or we
> might just keep it, because there's just one 4GB memory region?
> 
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/include/asm/uprobes.h |   7 ++
>  arch/x86/kernel/uprobes.c      | 130 +++++++++++++++++++++++++++++++++
>  include/linux/uprobes.h        |   1 +
>  kernel/events/uprobes.c        |   3 +
>  4 files changed, 141 insertions(+)
> 
> diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
> index 678fb546f0a7..84a75ed748f0 100644
> --- a/arch/x86/include/asm/uprobes.h
> +++ b/arch/x86/include/asm/uprobes.h
> @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
>  #define UPROBE_SWBP_INSN		0xcc
>  #define UPROBE_SWBP_INSN_SIZE		   1
>  
> +enum {
> +	ARCH_UPROBE_FLAG_CAN_OPTIMIZE	= 0,
> +	ARCH_UPROBE_FLAG_OPTIMIZED	= 1,
> +};
> +
>  struct uprobe_xol_ops;
>  
>  struct arch_uprobe {
> @@ -45,6 +50,8 @@ struct arch_uprobe {
>  			u8	ilen;
>  		}			push;
>  	};
> +
> +	unsigned long flags;
>  };
>  
>  struct arch_uprobe_task {
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 02aa4519b677..50ccf24ff42c 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -18,6 +18,7 @@
>  #include <asm/processor.h>
>  #include <asm/insn.h>
>  #include <asm/mmu_context.h>
> +#include <asm/nops.h>
>  
>  /* Post-execution fixups. */
>  
> @@ -877,6 +878,33 @@ static const struct uprobe_xol_ops push_xol_ops = {
>  	.emulate  = push_emulate_op,
>  };
>  
> +static int is_nop5_insns(uprobe_opcode_t *insn)
> +{
> +	return !memcmp(insn, x86_nops[5], 5);

Maybe better to use BYTES_NOP5 directly?

> +}
> +
> +static int is_call_insns(uprobe_opcode_t *insn)
> +{
> +	return *insn == 0xe8;

0xe8 -> CALL_INSN_OPCODE

Thank you,



-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>
Re: [RFC perf/core 07/11] uprobes/x86: Add support to optimize uprobes
Posted by Jiri Olsa 6 days, 2 hours ago
On Mon, Nov 18, 2024 at 05:18:08PM +0900, Masami Hiramatsu wrote:
> On Tue,  5 Nov 2024 14:34:01 +0100
> Jiri Olsa <jolsa@kernel.org> wrote:
> 
> > Putting together all the previously added pieces to support optimized
> > uprobes on top of 5-byte nop instruction.
> > 
> > The current uprobe execution goes through following:
> >   - installs breakpoint instruction over original instruction
> >   - exception handler hit and calls related uprobe consumers
> >   - and either simulates original instruction or does out of line single step
> >     execution of it
> >   - returns to user space
> > 
> > The optimized uprobe path
> > 
> >   - checks the original instruction is 5-byte nop (plus other checks)
> >   - adds (or uses existing) user space trampoline and overwrites original
> >     instruction (5-byte nop) with call to user space trampoline
> >   - the user space trampoline executes uprobe syscall that calls related uprobe
> >     consumers
> >   - trampoline returns back to next instruction
> > 
> > This approach won't speed up all uprobes as it's limited to using nop5 as
> > original instruction, but we could use nop5 as USDT probe instruction (which
> > uses single byte nop ATM) and speed up the USDT probes.
> > 
> > This patch overloads related arch functions in uprobe_write_opcode and
> > set_orig_insn so they can install call instruction if needed.
> > 
> > The arch_uprobe_optimize triggers the uprobe optimization and is called after
> > first uprobe hit. I originally had it called on uprobe installation but then
> > it clashed with elf loader, because the user space trampoline was added in a
> > place where loader might need to put elf segments, so I decided to do it after
> > first uprobe hit when loading is done.
> > 
> > TODO release uprobe trampoline when it's no longer needed.. we might need to
> > stop all cpus to make sure no user space thread is in the trampoline.. or we
> > might just keep it, because there's just one 4GB memory region?
> > 
> > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > ---
> >  arch/x86/include/asm/uprobes.h |   7 ++
> >  arch/x86/kernel/uprobes.c      | 130 +++++++++++++++++++++++++++++++++
> >  include/linux/uprobes.h        |   1 +
> >  kernel/events/uprobes.c        |   3 +
> >  4 files changed, 141 insertions(+)
> > 
> > diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
> > index 678fb546f0a7..84a75ed748f0 100644
> > --- a/arch/x86/include/asm/uprobes.h
> > +++ b/arch/x86/include/asm/uprobes.h
> > @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
> >  #define UPROBE_SWBP_INSN		0xcc
> >  #define UPROBE_SWBP_INSN_SIZE		   1
> >  
> > +enum {
> > +	ARCH_UPROBE_FLAG_CAN_OPTIMIZE	= 0,
> > +	ARCH_UPROBE_FLAG_OPTIMIZED	= 1,
> > +};
> > +
> >  struct uprobe_xol_ops;
> >  
> >  struct arch_uprobe {
> > @@ -45,6 +50,8 @@ struct arch_uprobe {
> >  			u8	ilen;
> >  		}			push;
> >  	};
> > +
> > +	unsigned long flags;
> >  };
> >  
> >  struct arch_uprobe_task {
> > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > index 02aa4519b677..50ccf24ff42c 100644
> > --- a/arch/x86/kernel/uprobes.c
> > +++ b/arch/x86/kernel/uprobes.c
> > @@ -18,6 +18,7 @@
> >  #include <asm/processor.h>
> >  #include <asm/insn.h>
> >  #include <asm/mmu_context.h>
> > +#include <asm/nops.h>
> >  
> >  /* Post-execution fixups. */
> >  
> > @@ -877,6 +878,33 @@ static const struct uprobe_xol_ops push_xol_ops = {
> >  	.emulate  = push_emulate_op,
> >  };
> >  
> > +static int is_nop5_insns(uprobe_opcode_t *insn)
> > +{
> > +	return !memcmp(insn, x86_nops[5], 5);
> 
> Maybe better to use BYTES_NOP5 directly?

ok

> 
> > +}
> > +
> > +static int is_call_insns(uprobe_opcode_t *insn)
> > +{
> > +	return *insn == 0xe8;
> 
> 0xe8 -> CALL_INSN_OPCODE

right, thanks

jirka
Re: [RFC perf/core 07/11] uprobes/x86: Add support to optimize uprobes
Posted by Andrii Nakryiko 1 week, 2 days ago
On Tue, Nov 5, 2024 at 5:35 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> Putting together all the previously added pieces to support optimized
> uprobes on top of 5-byte nop instruction.
>
> The current uprobe execution goes through following:
>   - installs breakpoint instruction over original instruction
>   - exception handler hit and calls related uprobe consumers
>   - and either simulates original instruction or does out of line single step
>     execution of it
>   - returns to user space
>
> The optimized uprobe path
>
>   - checks the original instruction is 5-byte nop (plus other checks)
>   - adds (or uses existing) user space trampoline and overwrites original
>     instruction (5-byte nop) with call to user space trampoline
>   - the user space trampoline executes uprobe syscall that calls related uprobe
>     consumers
>   - trampoline returns back to next instruction
>
> This approach won't speed up all uprobes as it's limited to using nop5 as
> original instruction, but we could use nop5 as USDT probe instruction (which
> uses single byte nop ATM) and speed up the USDT probes.

As discussed offline, it's not as simple as just replacing nop1 with
nop5 in USDT definition due to performance considerations on old
kernels (nop5 isn't fast as far as uprobe is concerned), but I think
we'll be able to accommodate transparent "nop1 or nop5" behavior in
user space, we'll just need a careful backwards compatible extension
to USDT definition.

BTW, do you plan to send an optimization for nop5 to avoid
single-stepping them? Ideally we'd just handle any-sized nop, so we
don't have to do this "nop1 or nop5" acrobatics in the future.

>
> This patch overloads related arch functions in uprobe_write_opcode and
> set_orig_insn so they can install call instruction if needed.
>
> The arch_uprobe_optimize triggers the uprobe optimization and is called after
> first uprobe hit. I originally had it called on uprobe installation but then
> it clashed with elf loader, because the user space trampoline was added in a
> place where loader might need to put elf segments, so I decided to do it after
> first uprobe hit when loading is done.

fun... ideally we wouldn't do this lazily, I just came up with another
possible idea, but let's keep all this discussion to another thread
with Peter

>
> TODO release uprobe trampoline when it's no longer needed.. we might need to
> stop all cpus to make sure no user space thread is in the trampoline.. or we
> might just keep it, because there's just one 4GB memory region?

4KB not 4GB, right? Yeah, probably leaving it until process exits is
totally fine.

>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  arch/x86/include/asm/uprobes.h |   7 ++
>  arch/x86/kernel/uprobes.c      | 130 +++++++++++++++++++++++++++++++++
>  include/linux/uprobes.h        |   1 +
>  kernel/events/uprobes.c        |   3 +
>  4 files changed, 141 insertions(+)
>
> diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
> index 678fb546f0a7..84a75ed748f0 100644
> --- a/arch/x86/include/asm/uprobes.h
> +++ b/arch/x86/include/asm/uprobes.h
> @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
>  #define UPROBE_SWBP_INSN               0xcc
>  #define UPROBE_SWBP_INSN_SIZE             1
>
> +enum {
> +       ARCH_UPROBE_FLAG_CAN_OPTIMIZE   = 0,
> +       ARCH_UPROBE_FLAG_OPTIMIZED      = 1,
> +};
> +
>  struct uprobe_xol_ops;
>
>  struct arch_uprobe {
> @@ -45,6 +50,8 @@ struct arch_uprobe {
>                         u8      ilen;
>                 }                       push;
>         };
> +
> +       unsigned long flags;
>  };
>
>  struct arch_uprobe_task {
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 02aa4519b677..50ccf24ff42c 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -18,6 +18,7 @@
>  #include <asm/processor.h>
>  #include <asm/insn.h>
>  #include <asm/mmu_context.h>
> +#include <asm/nops.h>
>
>  /* Post-execution fixups. */
>
> @@ -877,6 +878,33 @@ static const struct uprobe_xol_ops push_xol_ops = {
>         .emulate  = push_emulate_op,
>  };
>
> +static int is_nop5_insns(uprobe_opcode_t *insn)

insns -> insn?

> +{
> +       return !memcmp(insn, x86_nops[5], 5);
> +}
> +
> +static int is_call_insns(uprobe_opcode_t *insn)

ditto, insn, singular?

> +{
> +       return *insn == 0xe8;
> +}
> +
> +static void relative_insn(void *dest, void *from, void *to, u8 op)
> +{
> +       struct __arch_relative_insn {
> +               u8 op;
> +               s32 raddr;
> +       } __packed *insn;
> +
> +       insn = (struct __arch_relative_insn *)dest;
> +       insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
> +       insn->op = op;
> +}
> +
> +static void relative_call(void *dest, void *from, void *to)
> +{
> +       relative_insn(dest, from, to, CALL_INSN_OPCODE);
> +}
> +
>  /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
>  static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
>  {
> @@ -896,6 +924,10 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
>                 break;
>
>         case 0x0f:
> +               if (is_nop5_insns((uprobe_opcode_t *) &auprobe->insn)) {
> +                       set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
> +                       break;
> +               }
>                 if (insn->opcode.nbytes != 2)
>                         return -ENOSYS;
>                 /*
> @@ -1267,3 +1299,101 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
>         else
>                 return regs->sp <= ret->stack;
>  }
> +
> +int arch_uprobe_verify_opcode(struct page *page, unsigned long vaddr,
> +                             uprobe_opcode_t *new_opcode, void *opt)
> +{
> +       if (opt) {
> +               uprobe_opcode_t old_opcode[5];
> +               bool is_call;
> +
> +               uprobe_copy_from_page(page, vaddr, (uprobe_opcode_t *) &old_opcode, 5);
> +               is_call = is_call_insns((uprobe_opcode_t *) &old_opcode);
> +
> +               if (is_call_insns(new_opcode)) {
> +                       if (is_call)            /* register: already installed? */

probably should check that the destination of the call instruction is
what we expect?
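
e.g. something along these lines (untested sketch; tramp_vaddr standing in for
whatever trampoline address was resolved for this mm):

        if (is_call) {
                s32 rel32;
                unsigned long target;

                memcpy(&rel32, &old_opcode[1], sizeof(rel32));
                target = vaddr + 5 + rel32;
                return target == tramp_vaddr ? 0 : -EINVAL;
        }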

> +                               return 0;
> +               } else {
> +                       if (!is_call)           /* unregister: was it changed by us? */
> +                               return 0;
> +               }
> +
> +               return 1;
> +       }
> +
> +       return uprobe_verify_opcode(page, vaddr, new_opcode);
> +}

[...]

> +int set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
> +{
> +       uprobe_opcode_t *insn = (uprobe_opcode_t *) auprobe->insn;
> +
> +       if (test_bit(ARCH_UPROBE_FLAG_OPTIMIZED, &auprobe->flags))
> +               return uprobe_write_opcode(auprobe, mm, vaddr, insn, 5, (void *) 1);
> +
> +       return uprobe_write_opcode(auprobe, mm, vaddr, insn, UPROBE_SWBP_INSN_SIZE, NULL);
> +}
> +
> +bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr)
> +{
> +       unsigned long delta;
> +
> +       /* call instructions size */
> +       vaddr += 5;
> +       delta = vaddr < vtramp ? vtramp - vaddr : vaddr - vtramp;
> +       return delta < 0xffffffff;

isn't immediate a sign extended 32-bit value (that is, int)? wouldn't
this work and be correct:

long delta = (long)(vaddr + 5 - vtramp);
return delta >= INT_MIN && delta <= INT_MAX;

?


> +}
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index 4024e6ea52a4..42ab29f80220 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -233,6 +233,7 @@ void put_tramp_area(struct tramp_area *area);
>  bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr);
>  extern void *arch_uprobe_trampoline(unsigned long *psize);
>  extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
> +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
>  #else /* !CONFIG_UPROBES */
>  struct uprobes_state {
>  };
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index b8399684231c..efe45fcd5d0a 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -2759,6 +2759,9 @@ static void handle_swbp(struct pt_regs *regs)
>
>         handler_chain(uprobe, regs);
>
> +       /* Try to optimize after first hit. */
> +       arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
> +
>         if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
>                 goto out;
>
> --
> 2.47.0
>
Re: [RFC perf/core 07/11] uprobes/x86: Add support to optimize uprobes
Posted by Jiri Olsa 1 week ago
On Thu, Nov 14, 2024 at 03:44:20PM -0800, Andrii Nakryiko wrote:
> On Tue, Nov 5, 2024 at 5:35 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > Putting together all the previously added pieces to support optimized
> > uprobes on top of 5-byte nop instruction.
> >
> > The current uprobe execution goes through following:
> >   - installs breakpoint instruction over original instruction
> >   - exception handler hit and calls related uprobe consumers
> >   - and either simulates original instruction or does out of line single step
> >     execution of it
> >   - returns to user space
> >
> > The optimized uprobe path
> >
> >   - checks the original instruction is 5-byte nop (plus other checks)
> >   - adds (or uses existing) user space trampoline and overwrites original
> >     instruction (5-byte nop) with call to user space trampoline
> >   - the user space trampoline executes uprobe syscall that calls related uprobe
> >     consumers
> >   - trampoline returns back to next instruction
> >
> > This approach won't speed up all uprobes as it's limited to using nop5 as
> > original instruction, but we could use nop5 as USDT probe instruction (which
> > uses single byte nop ATM) and speed up the USDT probes.
> 
> As discussed offline, it's not as simple as just replacing nop1 with
> nop5 in USDT definition due to performance considerations on old
> kernels (nop5 isn't fast as far as uprobe is concerned), but I think
> we'll be able to accommodate transparent "nop1 or nop5" behavior in
> user space, we'll just need a careful backwards compatible extension
> to USDT definition.
> 
> BTW, do you plan to send an optimization for nop5 to avoid
> single-stepping them? Ideally we'd just handle any-sized nop, so we
> don't have to do this "nop1 or nop5" acrobatics in the future.

I'll prepare that, but I'd like to check on the alternative calls
you suggested first

> 
> >
> > This patch overloads related arch functions in uprobe_write_opcode and
> > set_orig_insn so they can install call instruction if needed.
> >
> > The arch_uprobe_optimize triggers the uprobe optimization and is called after
> > first uprobe hit. I originally had it called on uprobe installation but then
> > it clashed with elf loader, because the user space trampoline was added in a
> > place where loader might need to put elf segments, so I decided to do it after
> > first uprobe hit when loading is done.
> 
> fun... ideally we wouldn't do this lazily, I just came up with another
> possible idea, but let's keep all this discussion to another thread
> with Peter
> 
> >
> > TODO release uprobe trampoline when it's no longer needed.. we might need to
> > stop all cpus to make sure no user space thread is in the trampoline.. or we
> > might just keep it, because there's just one 4GB memory region?
> 
> 4KB not 4GB, right? Yeah, probably leaving it until process exits is
> totally fine.

yep, ok

SNIP

> > +#include <asm/nops.h>
> >
> >  /* Post-execution fixups. */
> >
> > @@ -877,6 +878,33 @@ static const struct uprobe_xol_ops push_xol_ops = {
> >         .emulate  = push_emulate_op,
> >  };
> >
> > +static int is_nop5_insns(uprobe_opcode_t *insn)
> 
> insns -> insn?
> 
> > +{
> > +       return !memcmp(insn, x86_nops[5], 5);
> > +}
> > +
> > +static int is_call_insns(uprobe_opcode_t *insn)
> 
> ditto, insn, singular?

ok

SNIP

> > +int arch_uprobe_verify_opcode(struct page *page, unsigned long vaddr,
> > +                             uprobe_opcode_t *new_opcode, void *opt)
> > +{
> > +       if (opt) {
> > +               uprobe_opcode_t old_opcode[5];
> > +               bool is_call;
> > +
> > +               uprobe_copy_from_page(page, vaddr, (uprobe_opcode_t *) &old_opcode, 5);
> > +               is_call = is_call_insns((uprobe_opcode_t *) &old_opcode);
> > +
> > +               if (is_call_insns(new_opcode)) {
> > +                       if (is_call)            /* register: already installed? */
> 
> probably should check that the destination of the call instruction is
> what we expect?

ok

SNIP

> > +bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr)
> > +{
> > +       unsigned long delta;
> > +
> > +       /* call instructions size */
> > +       vaddr += 5;
> > +       delta = vaddr < vtramp ? vtramp - vaddr : vaddr - vtramp;
> > +       return delta < 0xffffffff;
> 
> isn't immediate a sign extended 32-bit value (that is, int)? wouldn't
> this work and be correct:
> 
> long delta = (long)(vaddr + 5 - vtramp);
> return delta >= INT_MIN && delta <= INT_MAX;
> 
> ?

ah, right.. should be a signed value :-\ thanks

jirka