With handle_swbp() hitting concurrently on (all) CPUs the
uprobe->register_rwsem can get very contended. Replace it with
SRCU+mutex.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 37 +++++++++++++++++++++++--------------
1 file changed, 23 insertions(+), 14 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/srcu.h>
#include <linux/uprobes.h>
@@ -49,6 +50,8 @@ static struct mutex uprobes_mmap_mutex[U
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
+DEFINE_STATIC_SRCU(uprobes_srcu);
+
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -56,7 +59,7 @@ struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
refcount_t ref;
struct rcu_head rcu;
- struct rw_semaphore register_rwsem;
+ struct mutex register_mutex;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
struct uprobe_consumer *consumers;
@@ -750,7 +753,7 @@ static struct uprobe *alloc_uprobe(struc
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
- init_rwsem(&uprobe->register_rwsem);
+ mutex_init(&uprobe->register_mutex);
init_rwsem(&uprobe->consumer_rwsem);
/* add to uprobes_tree, sorted on inode:offset */
@@ -1129,10 +1132,12 @@ void uprobe_unregister(struct inode *ino
if (WARN_ON(!uprobe))
return;
- down_write(&uprobe->register_rwsem);
+ mutex_lock(&uprobe->register_mutex);
__uprobe_unregister(uprobe, uc);
- up_write(&uprobe->register_rwsem);
+ mutex_unlock(&uprobe->register_mutex);
put_uprobe(uprobe);
+
+ synchronize_srcu(&uprobes_srcu);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
@@ -1192,7 +1197,7 @@ static int __uprobe_register(struct inod
* We can race with uprobe_unregister()->delete_uprobe().
* Check uprobe_is_active() and retry if it is false.
*/
- down_write(&uprobe->register_rwsem);
+ mutex_lock(&uprobe->register_mutex);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
consumer_add(uprobe, uc);
@@ -1200,7 +1205,7 @@ static int __uprobe_register(struct inod
if (ret)
__uprobe_unregister(uprobe, uc);
}
- up_write(&uprobe->register_rwsem);
+ mutex_unlock(&uprobe->register_mutex);
put_uprobe(uprobe);
if (unlikely(ret == -EAGAIN))
@@ -1240,12 +1245,12 @@ int uprobe_apply(struct inode *inode, lo
if (WARN_ON(!uprobe))
return ret;
- down_write(&uprobe->register_rwsem);
+ mutex_lock(&uprobe->register_mutex);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
- up_write(&uprobe->register_rwsem);
+ mutex_unlock(&uprobe->register_mutex);
put_uprobe(uprobe);
return ret;
@@ -2087,14 +2092,19 @@ static struct uprobe *find_active_uprobe
return uprobe;
}
+#define for_each_consumer_rcu(pos, head) \
+ for (pos = rcu_dereference_raw(head); pos; \
+ pos = rcu_dereference_raw(pos->next))
+
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ guard(srcu)(&uprobes_srcu);
+
+ for_each_consumer_rcu(uc, uprobe->consumers) {
int rc = 0;
if (uc->handler) {
@@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
}
- up_read(&uprobe->register_rwsem);
}
static void
@@ -2125,12 +2134,12 @@ handle_uretprobe_chain(struct return_ins
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ guard(srcu)(&uprobes_srcu);
+
+ for_each_consumer_rcu(uc, uprobe->consumers) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
- up_read(&uprobe->register_rwsem);
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
On Mon, Jul 08, 2024 at 11:12:46AM +0200, Peter Zijlstra wrote:
> @@ -2087,14 +2092,19 @@ static struct uprobe *find_active_uprobe
> return uprobe;
> }
>
> +#define for_each_consumer_rcu(pos, head) \
> + for (pos = rcu_dereference_raw(head); pos; \
> + pos = rcu_dereference_raw(pos->next))
> +
> static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> {
> struct uprobe_consumer *uc;
> int remove = UPROBE_HANDLER_REMOVE;
> bool need_prep = false; /* prepare return uprobe, when needed */
>
> - down_read(&uprobe->register_rwsem);
> - for (uc = uprobe->consumers; uc; uc = uc->next) {
> + guard(srcu)(&uprobes_srcu);
> +
> + for_each_consumer_rcu(uc, uprobe->consumers) {
> int rc = 0;
>
> if (uc->handler) {
> @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> WARN_ON(!uprobe_is_active(uprobe));
> unapply_uprobe(uprobe, current->mm);
^^^ this remove case needs more thought.
> }
> - up_read(&uprobe->register_rwsem);
> }
On 07/09, Peter Zijlstra wrote:
>
> > + guard(srcu)(&uprobes_srcu);
> > +
> > + for_each_consumer_rcu(uc, uprobe->consumers) {
> > int rc = 0;
> >
> > if (uc->handler) {
> > @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> > WARN_ON(!uprobe_is_active(uprobe));
> > unapply_uprobe(uprobe, current->mm);
>
> ^^^ this remove case needs more thought.
Yeah... that is why the current code doesn't use ->consumer_rwsem, iirc.
And consumer_add/del need some changes. Say, consumer_add() does
uc->next = uprobe->consumers;
uprobe->consumers = uc;
I guess it should do
WRITE_ONCE(uc->next, uprobe->consumers);
rcu_assign_pointer(uprobe->consumers, uc);
Oleg.
On Tue, Jul 09, 2024 at 03:33:49PM +0200, Oleg Nesterov wrote:
> On 07/09, Peter Zijlstra wrote:
> >
> > > + guard(srcu)(&uprobes_srcu);
> > > +
> > > + for_each_consumer_rcu(uc, uprobe->consumers) {
> > > int rc = 0;
> > >
> > > if (uc->handler) {
> > > @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> > > WARN_ON(!uprobe_is_active(uprobe));
> > > unapply_uprobe(uprobe, current->mm);
> >
> > ^^^ this remove case needs more thought.
>
> Yeah... that is why the current code doesn't use ->consumer_rwsem, iirc.
AFAICT something like the below should work. Concurrent
remove_breakpoint() should already be possible today and doesn't appear
to be a problem.
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1260,6 +1260,10 @@ int uprobe_apply(struct inode *inode, lo
return ret;
}
+/*
+ * Can race against uprobe_unregister() / register_for_each_vma(), and relies
+ * on duplicate remove_breakpoint() being a no-op.
+ */
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
VMA_ITERATOR(vmi, mm, 0);
@@ -2101,6 +2105,7 @@ static void handler_chain(struct uprobe
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
+ bool had_handler = false;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
@@ -2115,16 +2120,26 @@ static void handler_chain(struct uprobe
if (uc->ret_handler)
need_prep = true;
+ /*
+ * A single handler that does not mask out REMOVE, means the
+ * probe stays.
+ */
+ had_handler = true;
remove &= rc;
}
+ /*
+ * If there were no handlers called, nobody asked for it to be removed
+ * but also nobody got to mask the value. Fix it up.
+ */
+ if (!had_handler)
+ remove = 0;
+
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
- if (remove && uprobe->consumers) {
- WARN_ON(!uprobe_is_active(uprobe));
+ if (remove)
unapply_uprobe(uprobe, current->mm);
- }
up_read(&uprobe->register_rwsem);
}
On 07/09, Peter Zijlstra wrote:
>
> On Tue, Jul 09, 2024 at 03:33:49PM +0200, Oleg Nesterov wrote:
> > On 07/09, Peter Zijlstra wrote:
> > >
> > > > + guard(srcu)(&uprobes_srcu);
> > > > +
> > > > + for_each_consumer_rcu(uc, uprobe->consumers) {
> > > > int rc = 0;
> > > >
> > > > if (uc->handler) {
> > > > @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> > > > WARN_ON(!uprobe_is_active(uprobe));
> > > > unapply_uprobe(uprobe, current->mm);
> > >
> > > ^^^ this remove case needs more thought.
> >
> > Yeah... that is why the current code doesn't use ->consumer_rwsem, iirc.
>
> AFAICT something like the below should work. Concurrent
> remove_breakpoint() should already be possible today and doesn't appear
> to be a problem.
Sorry, I don't understand how can this patch help. Yes, it removes the
uprobe->consumers != NULL check, but this is minor.
To simplify, suppose we have a single consumer which is not interested
in this task/mm, it returns UPROBE_HANDLER_REMOVE.
For example, event->hw.target != NULL and the current task is the forked
child which hits the breakpoint copied by dup_mmap().
Now. We need to ensure that another (say system-wide) consumer can't come
and call register_for_each_vma() before unapply_uprobe().
But perhaps I missed your point...
Oleg.
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1260,6 +1260,10 @@ int uprobe_apply(struct inode *inode, lo
> return ret;
> }
>
> +/*
> + * Can race against uprobe_unregister() / register_for_each_vma(), and relies
> + * on duplicate remove_breakpoint() being a no-op.
> + */
> static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
> {
> VMA_ITERATOR(vmi, mm, 0);
> @@ -2101,6 +2105,7 @@ static void handler_chain(struct uprobe
> struct uprobe_consumer *uc;
> int remove = UPROBE_HANDLER_REMOVE;
> bool need_prep = false; /* prepare return uprobe, when needed */
> + bool had_handler = false;
>
> down_read(&uprobe->register_rwsem);
> for (uc = uprobe->consumers; uc; uc = uc->next) {
> @@ -2115,16 +2120,26 @@ static void handler_chain(struct uprobe
> if (uc->ret_handler)
> need_prep = true;
>
> + /*
> + * A single handler that does not mask out REMOVE, means the
> + * probe stays.
> + */
> + had_handler = true;
> remove &= rc;
> }
>
> + /*
> + * If there were no handlers called, nobody asked for it to be removed
> + * but also nobody got to mask the value. Fix it up.
> + */
> + if (!had_handler)
> + remove = 0;
> +
> if (need_prep && !remove)
> prepare_uretprobe(uprobe, regs); /* put bp at return */
>
> - if (remove && uprobe->consumers) {
> - WARN_ON(!uprobe_is_active(uprobe));
> + if (remove)
> unapply_uprobe(uprobe, current->mm);
> - }
> up_read(&uprobe->register_rwsem);
> }
>
>
On Tue, Jul 09, 2024 at 05:05:04PM +0200, Oleg Nesterov wrote: > To simplify, suppose we have a single consumer which is not interested > in this task/mm, it returns UPROBE_HANDLER_REMOVE. > > For example, event->hw.target != NULL and the current task is the forked > child which hits the breakpoint copied by dup_mmap(). > > Now. We need to ensure that another (say system-wide) consumer can't come > and call register_for_each_vma() before unapply_uprobe(). > > But perhaps I missed your point... Ooh, I see. I failed to consider that particular case. This is interleaving uprobe_register() and handler_chain(). Silly me only looked at uprobe_unregister() and handler_chain(). Hmm, easiest would be to add a seqcount to register_mutex and simply skip the remove case when odd. Then the handler might get a few extra (unwanted) calls, but it should be able to handle them, they're fundamentally not different from the first one where it says REMOVE. Right?
On 07/09, Peter Zijlstra wrote: > > Hmm, easiest would be to add a seqcount to register_mutex and simply > skip the remove case when odd. probably yes, > Then the handler might get a few extra (unwanted) calls, but it should > be able to handle them, they're fundamentally not different from the > first one where it says REMOVE. Right? plus the handler can never assume that UPROBE_HANDLER_REMOVE won't be ignored, another consumer can want to trace this task. Oleg.
© 2016 - 2026 Red Hat, Inc.