This patch switches uprobes SRCU usage to RCU Tasks Trace flavor, which
is optimized for more lightweight and quick readers (at the expense of
slower writers, which for uprobes is a fine tradeoff) and has better
performance and scalability with the number of CPUs.
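For reference, the read-side API difference driving this switch looks
roughly like the sketch below (illustrative only, not part of the patch;
demo_srcu and the two demo_*_reader() functions are made-up names, the
locking primitives themselves are the real kernel APIs):

  #include <linux/srcu.h>
  #include <linux/rcupdate_trace.h>

  DEFINE_STATIC_SRCU(demo_srcu);

  static void demo_srcu_reader(void)
  {
  	int idx;

  	/* SRCU readers must carry a per-reader index from lock to unlock */
  	idx = srcu_read_lock(&demo_srcu);
  	/* ... dereference SRCU-protected data ... */
  	srcu_read_unlock(&demo_srcu, idx);
  }

  static void demo_tasks_trace_reader(void)
  {
  	/* Tasks Trace RCU readers carry no per-reader state at all */
  	rcu_read_lock_trace();
  	/* ... dereference trace-RCU-protected data ... */
  	rcu_read_unlock_trace();
  }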
Similarly to baseline vs SRCU, we've benchmarked SRCU-based
implementation vs RCU Tasks Trace implementation.
SRCU
====
uprobe-nop ( 1 cpus): 3.276 ± 0.005M/s ( 3.276M/s/cpu)
uprobe-nop ( 2 cpus): 4.125 ± 0.002M/s ( 2.063M/s/cpu)
uprobe-nop ( 4 cpus): 7.713 ± 0.002M/s ( 1.928M/s/cpu)
uprobe-nop ( 8 cpus): 8.097 ± 0.006M/s ( 1.012M/s/cpu)
uprobe-nop (16 cpus): 6.501 ± 0.056M/s ( 0.406M/s/cpu)
uprobe-nop (32 cpus): 4.398 ± 0.084M/s ( 0.137M/s/cpu)
uprobe-nop (64 cpus): 6.452 ± 0.000M/s ( 0.101M/s/cpu)
uretprobe-nop ( 1 cpus): 2.055 ± 0.001M/s ( 2.055M/s/cpu)
uretprobe-nop ( 2 cpus): 2.677 ± 0.000M/s ( 1.339M/s/cpu)
uretprobe-nop ( 4 cpus): 4.561 ± 0.003M/s ( 1.140M/s/cpu)
uretprobe-nop ( 8 cpus): 5.291 ± 0.002M/s ( 0.661M/s/cpu)
uretprobe-nop (16 cpus): 5.065 ± 0.019M/s ( 0.317M/s/cpu)
uretprobe-nop (32 cpus): 3.622 ± 0.003M/s ( 0.113M/s/cpu)
uretprobe-nop (64 cpus): 3.723 ± 0.002M/s ( 0.058M/s/cpu)
RCU Tasks Trace
===============
uprobe-nop ( 1 cpus): 3.396 ± 0.002M/s ( 3.396M/s/cpu)
uprobe-nop ( 2 cpus): 4.271 ± 0.006M/s ( 2.135M/s/cpu)
uprobe-nop ( 4 cpus): 8.499 ± 0.015M/s ( 2.125M/s/cpu)
uprobe-nop ( 8 cpus): 10.355 ± 0.028M/s ( 1.294M/s/cpu)
uprobe-nop (16 cpus): 7.615 ± 0.099M/s ( 0.476M/s/cpu)
uprobe-nop (32 cpus): 4.430 ± 0.007M/s ( 0.138M/s/cpu)
uprobe-nop (64 cpus): 6.887 ± 0.020M/s ( 0.108M/s/cpu)
uretprobe-nop ( 1 cpus): 2.174 ± 0.001M/s ( 2.174M/s/cpu)
uretprobe-nop ( 2 cpus): 2.853 ± 0.001M/s ( 1.426M/s/cpu)
uretprobe-nop ( 4 cpus): 4.913 ± 0.002M/s ( 1.228M/s/cpu)
uretprobe-nop ( 8 cpus): 5.883 ± 0.002M/s ( 0.735M/s/cpu)
uretprobe-nop (16 cpus): 5.147 ± 0.001M/s ( 0.322M/s/cpu)
uretprobe-nop (32 cpus): 3.738 ± 0.008M/s ( 0.117M/s/cpu)
uretprobe-nop (64 cpus): 4.397 ± 0.002M/s ( 0.069M/s/cpu)
Peak throughput for uprobes increases from 8 mln/s to 10.3 mln/s
(+28%!), and for uretprobes from 5.3 mln/s to 5.8 mln/s (+11%); the
smaller uretprobe gain is expected, since there is more work to do on
the uretprobe side.
Even single-threaded (no contention) performance is slightly better:
3.276 mln/s to 3.396 mln/s (+3.7%) for uprobes, and 2.055 mln/s to 2.174
mln/s (+5.8%) for uretprobes.
We also select TASKS_TRACE_RCU for UPROBES in Kconfig due to the new
dependency.
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
arch/Kconfig | 1 +
kernel/events/uprobes.c | 38 ++++++++++++++++----------------------
2 files changed, 17 insertions(+), 22 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd..a0df3f3dc484 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -126,6 +126,7 @@ config KPROBES_ON_FTRACE
config UPROBES
def_bool n
depends on ARCH_SUPPORTS_UPROBES
+ select TASKS_TRACE_RCU
help
Uprobes is the user-space counterpart to kprobes: they
enable instrumentation applications (such as 'perf probe')
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4b7e590dc428..a2e6a57f79f2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/rcupdate_trace.h>
#include <linux/uprobes.h>
@@ -42,8 +43,6 @@ static struct rb_root uprobes_tree = RB_ROOT;
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
-DEFINE_STATIC_SRCU(uprobes_srcu);
-
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
@@ -652,7 +651,7 @@ static void put_uprobe(struct uprobe *uprobe)
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
+ call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
}
static __always_inline
@@ -707,7 +706,7 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
struct rb_node *node;
unsigned int seq;
- lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
+ lockdep_assert(rcu_read_lock_trace_held());
do {
seq = read_seqcount_begin(&uprobes_seqcount);
@@ -935,8 +934,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
ret = consumer_filter(uc, mm);
if (ret)
break;
@@ -1157,7 +1155,7 @@ void uprobe_unregister_sync(void)
* unlucky enough caller can free consumer's memory and cause
* handler_chain() or handle_uretprobe_chain() to do an use-after-free.
*/
- synchronize_srcu(&uprobes_srcu);
+ synchronize_rcu_tasks_trace();
}
EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
@@ -1241,19 +1239,18 @@ EXPORT_SYMBOL_GPL(uprobe_register);
int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
struct uprobe_consumer *con;
- int ret = -ENOENT, srcu_idx;
+ int ret = -ENOENT;
down_write(&uprobe->register_rwsem);
- srcu_idx = srcu_read_lock(&uprobes_srcu);
- list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
if (con == uc) {
ret = register_for_each_vma(uprobe, add ? uc : NULL);
break;
}
}
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
up_write(&uprobe->register_rwsem);
@@ -2123,8 +2120,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
current->utask->auprobe = &uprobe->arch;
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
int rc = 0;
if (uc->handler) {
@@ -2162,15 +2158,13 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
- int srcu_idx;
- srcu_idx = srcu_read_lock(&uprobes_srcu);
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
@@ -2255,13 +2249,13 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int is_swbp, srcu_idx;
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == uprobe_get_trampoline_vaddr())
return uprobe_handle_trampoline(regs);
- srcu_idx = srcu_read_lock(&uprobes_srcu);
+ rcu_read_lock_trace();
uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
@@ -2319,7 +2313,7 @@ static void handle_swbp(struct pt_regs *regs)
out:
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
}
/*
--
2.43.5
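The synchronize_rcu_tasks_trace() in uprobe_unregister_sync() above is
what makes it safe for an unregistering caller to free its consumer's
memory afterwards. A rough sketch of the lifetime rule it enforces
(illustrative only; demo_unregister() is a made-up name, the fields are
the ones touched by the patch):

  static void demo_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
  {
  	down_write(&uprobe->register_rwsem);
  	list_del_rcu(&uc->cons_node);	/* unpublish the consumer */
  	up_write(&uprobe->register_rwsem);

  	/*
  	 * Wait for all in-flight handler_chain() and
  	 * handle_uretprobe_chain() trace-RCU readers to drain; only
  	 * then may the caller free @uc.
  	 */
  	synchronize_rcu_tasks_trace();
  }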
On Tue, 10 Sep 2024 10:43:12 -0700 Andrii Nakryiko <andrii@kernel.org> wrote:

> This patch switches uprobes SRCU usage to RCU Tasks Trace flavor, which
> is optimized for more lightweight and quick readers (at the expense of
> slower writers, which for uprobes is a fine tradeoff) and has better
> performance and scalability with the number of CPUs.

[...]

Looks good to me. Peter, do you have any comment?

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Tue, Sep 10, 2024 at 10:43 AM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> This patch switches uprobes SRCU usage to RCU Tasks Trace flavor, which
> is optimized for more lightweight and quick readers (at the expense of
> slower writers, which for uprobes is a fine tradeoff) and has better
> performance and scalability with the number of CPUs.

[...]

Just in case this slipped through the cracks (and is not just waiting
its turn to be applied), ping. It would be nice to have this patch go in
together with the rest of the uprobe patches from the original patch
set.

Thanks!
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 87195a1ee332add27bd51448c6b54aad551a28f5
Gitweb: https://git.kernel.org/tip/87195a1ee332add27bd51448c6b54aad551a28f5
Author: Andrii Nakryiko <andrii@kernel.org>
AuthorDate: Tue, 10 Sep 2024 10:43:12 -07:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 07 Oct 2024 09:28:42 +02:00
uprobes: switch to RCU Tasks Trace flavor for better performance
This patch switches uprobes SRCU usage to RCU Tasks Trace flavor, which
is optimized for more lightweight and quick readers (at the expense of
slower writers, which for uprobes is a fine tradeoff) and has better
performance and scalability with the number of CPUs.
Similarly to baseline vs SRCU, we've benchmarked SRCU-based
implementation vs RCU Tasks Trace implementation.
SRCU
====
uprobe-nop ( 1 cpus): 3.276 ± 0.005M/s ( 3.276M/s/cpu)
uprobe-nop ( 2 cpus): 4.125 ± 0.002M/s ( 2.063M/s/cpu)
uprobe-nop ( 4 cpus): 7.713 ± 0.002M/s ( 1.928M/s/cpu)
uprobe-nop ( 8 cpus): 8.097 ± 0.006M/s ( 1.012M/s/cpu)
uprobe-nop (16 cpus): 6.501 ± 0.056M/s ( 0.406M/s/cpu)
uprobe-nop (32 cpus): 4.398 ± 0.084M/s ( 0.137M/s/cpu)
uprobe-nop (64 cpus): 6.452 ± 0.000M/s ( 0.101M/s/cpu)
uretprobe-nop ( 1 cpus): 2.055 ± 0.001M/s ( 2.055M/s/cpu)
uretprobe-nop ( 2 cpus): 2.677 ± 0.000M/s ( 1.339M/s/cpu)
uretprobe-nop ( 4 cpus): 4.561 ± 0.003M/s ( 1.140M/s/cpu)
uretprobe-nop ( 8 cpus): 5.291 ± 0.002M/s ( 0.661M/s/cpu)
uretprobe-nop (16 cpus): 5.065 ± 0.019M/s ( 0.317M/s/cpu)
uretprobe-nop (32 cpus): 3.622 ± 0.003M/s ( 0.113M/s/cpu)
uretprobe-nop (64 cpus): 3.723 ± 0.002M/s ( 0.058M/s/cpu)
RCU Tasks Trace
===============
uprobe-nop ( 1 cpus): 3.396 ± 0.002M/s ( 3.396M/s/cpu)
uprobe-nop ( 2 cpus): 4.271 ± 0.006M/s ( 2.135M/s/cpu)
uprobe-nop ( 4 cpus): 8.499 ± 0.015M/s ( 2.125M/s/cpu)
uprobe-nop ( 8 cpus): 10.355 ± 0.028M/s ( 1.294M/s/cpu)
uprobe-nop (16 cpus): 7.615 ± 0.099M/s ( 0.476M/s/cpu)
uprobe-nop (32 cpus): 4.430 ± 0.007M/s ( 0.138M/s/cpu)
uprobe-nop (64 cpus): 6.887 ± 0.020M/s ( 0.108M/s/cpu)
uretprobe-nop ( 1 cpus): 2.174 ± 0.001M/s ( 2.174M/s/cpu)
uretprobe-nop ( 2 cpus): 2.853 ± 0.001M/s ( 1.426M/s/cpu)
uretprobe-nop ( 4 cpus): 4.913 ± 0.002M/s ( 1.228M/s/cpu)
uretprobe-nop ( 8 cpus): 5.883 ± 0.002M/s ( 0.735M/s/cpu)
uretprobe-nop (16 cpus): 5.147 ± 0.001M/s ( 0.322M/s/cpu)
uretprobe-nop (32 cpus): 3.738 ± 0.008M/s ( 0.117M/s/cpu)
uretprobe-nop (64 cpus): 4.397 ± 0.002M/s ( 0.069M/s/cpu)
Peak throughput for uprobes increases from 8 mln/s to 10.3 mln/s
(+28%!), and for uretprobes from 5.3 mln/s to 5.8 mln/s (+11%); the
smaller uretprobe gain is expected, since there is more work to do on
the uretprobe side.
Even single-threaded (no contention) performance is slightly better:
3.276 mln/s to 3.396 mln/s (+3.7%) for uprobes, and 2.055 mln/s to 2.174
mln/s (+5.8%) for uretprobes.
We also select TASKS_TRACE_RCU for UPROBES in Kconfig due to the new
dependency.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20240910174312.3646590-1-andrii@kernel.org
---
arch/Kconfig | 1 +
kernel/events/uprobes.c | 38 ++++++++++++++++----------------------
2 files changed, 17 insertions(+), 22 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 8af374e..5b5c216 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -135,6 +135,7 @@ config KPROBES_ON_FTRACE
config UPROBES
def_bool n
depends on ARCH_SUPPORTS_UPROBES
+ select TASKS_TRACE_RCU
help
Uprobes is the user-space counterpart to kprobes: they
enable instrumentation applications (such as 'perf probe')
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4b52cb2..5106dc1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/rcupdate_trace.h>
#include <linux/uprobes.h>
@@ -42,8 +43,6 @@ static struct rb_root uprobes_tree = RB_ROOT;
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
-DEFINE_STATIC_SRCU(uprobes_srcu);
-
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
@@ -651,7 +650,7 @@ static void put_uprobe(struct uprobe *uprobe)
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
+ call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
}
static __always_inline
@@ -706,7 +705,7 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
struct rb_node *node;
unsigned int seq;
- lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
+ lockdep_assert(rcu_read_lock_trace_held());
do {
seq = read_seqcount_begin(&uprobes_seqcount);
@@ -934,8 +933,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
ret = consumer_filter(uc, mm);
if (ret)
break;
@@ -1156,7 +1154,7 @@ void uprobe_unregister_sync(void)
* unlucky enough caller can free consumer's memory and cause
* handler_chain() or handle_uretprobe_chain() to do an use-after-free.
*/
- synchronize_srcu(&uprobes_srcu);
+ synchronize_rcu_tasks_trace();
}
EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
@@ -1240,19 +1238,18 @@ EXPORT_SYMBOL_GPL(uprobe_register);
int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
struct uprobe_consumer *con;
- int ret = -ENOENT, srcu_idx;
+ int ret = -ENOENT;
down_write(&uprobe->register_rwsem);
- srcu_idx = srcu_read_lock(&uprobes_srcu);
- list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
if (con == uc) {
ret = register_for_each_vma(uprobe, add ? uc : NULL);
break;
}
}
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
up_write(&uprobe->register_rwsem);
@@ -2134,8 +2131,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
current->utask->auprobe = &uprobe->arch;
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
int rc = 0;
if (uc->handler) {
@@ -2173,15 +2169,13 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
- int srcu_idx;
- srcu_idx = srcu_read_lock(&uprobes_srcu);
- list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
- srcu_read_lock_held(&uprobes_srcu)) {
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
@@ -2266,13 +2260,13 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int is_swbp, srcu_idx;
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == uprobe_get_trampoline_vaddr())
return uprobe_handle_trampoline(regs);
- srcu_idx = srcu_read_lock(&uprobes_srcu);
+ rcu_read_lock_trace();
uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
@@ -2330,7 +2324,7 @@ static void handle_swbp(struct pt_regs *regs)
out:
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
- srcu_read_unlock(&uprobes_srcu, srcu_idx);
+ rcu_read_unlock_trace();
}
/*