With handle_swbp() hitting concurrently on (all) CPUs the
uprobe->register_rwsem can get very contended. Add an SRCU instance to
cover the consumer list and consumer lifetime.
Since the consumer are externally embedded structures, unregister will
have to suffer a synchronize_srcu().
A notably complication is the UPROBE_HANDLER_REMOVE logic which can
race against uprobe_register() such that it might want to remove a
freshly installer handler that didn't get called. In order to close
this hole, a seqcount is added. With that, the removal path can tell
if anything changed and bail out of the removal.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 60 ++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 50 insertions(+), 10 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/srcu.h>
#include <linux/uprobes.h>
@@ -49,6 +50,11 @@ static struct mutex uprobes_mmap_mutex[U
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
+/*
+ * Covers uprobe->consumers lifetime.
+ */
+DEFINE_STATIC_SRCU(uprobes_srcu);
+
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -57,6 +63,7 @@ struct uprobe {
refcount_t ref;
struct rcu_head rcu;
struct rw_semaphore register_rwsem;
+ seqcount_t register_seq;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
struct uprobe_consumer *consumers;
@@ -760,6 +767,7 @@ static struct uprobe *alloc_uprobe(struc
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
init_rwsem(&uprobe->register_rwsem);
+ seqcount_init(&uprobe->register_seq);
init_rwsem(&uprobe->consumer_rwsem);
/* add to uprobes_tree, sorted on inode:offset */
@@ -782,8 +790,8 @@ static struct uprobe *alloc_uprobe(struc
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
- uc->next = uprobe->consumers;
- uprobe->consumers = uc;
+ WRITE_ONCE(uc->next, uprobe->consumers);
+ rcu_assign_pointer(uprobe->consumers, uc);
up_write(&uprobe->consumer_rwsem);
}
@@ -800,7 +808,7 @@ static bool consumer_del(struct uprobe *
down_write(&uprobe->consumer_rwsem);
for (con = &uprobe->consumers; *con; con = &(*con)->next) {
if (*con == uc) {
- *con = uc->next;
+ WRITE_ONCE(*con, uc->next);
ret = true;
break;
}
@@ -1139,9 +1147,13 @@ void uprobe_unregister(struct inode *ino
return;
down_write(&uprobe->register_rwsem);
+ raw_write_seqcount_begin(&uprobe->register_seq);
__uprobe_unregister(uprobe, uc);
+ raw_write_seqcount_end(&uprobe->register_seq);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
+
+ synchronize_srcu(&uprobes_srcu);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
@@ -1204,10 +1216,12 @@ static int __uprobe_register(struct inod
down_write(&uprobe->register_rwsem);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
+ raw_write_seqcount_begin(&uprobe->register_seq);
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
if (ret)
__uprobe_unregister(uprobe, uc);
+ raw_write_seqcount_end(&uprobe->register_seq);
}
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
@@ -1250,10 +1264,12 @@ int uprobe_apply(struct inode *inode, lo
return ret;
down_write(&uprobe->register_rwsem);
+ raw_write_seqcount_begin(&uprobe->register_seq);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ raw_write_seqcount_end(&uprobe->register_seq);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
@@ -2096,15 +2112,23 @@ static struct uprobe *find_active_uprobe
return uprobe;
}
+#define for_each_consumer_rcu(pos, head) \
+ for (pos = rcu_dereference_raw(head); pos; \
+ pos = rcu_dereference_raw(pos->next))
+
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
bool had_handler = false;
+ unsigned int seq;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ guard(srcu)(&uprobes_srcu);
+
+ seq = raw_read_seqcount_begin(&uprobe->register_seq);
+
+ for_each_consumer_rcu(uc, uprobe->consumers) {
int rc = 0;
if (uc->handler) {
@@ -2134,9 +2158,25 @@ static void handler_chain(struct uprobe
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
- if (remove)
+ if (remove) {
+ /*
+ * Removing uprobes is a slow path, after all, the more probes
+ * you remove, the less probe hits you get.
+ *
+ * This needs to serialize against uprobe_register(), such that
+ * if the above RCU iteration missed a new handler that
+ * would've liked to keep the probe, we don't go uninstall the
+ * probe after it already ran register_for_each_vma().
+ *
+ * The rwsem ensures exclusivity against uprobe_register()
+ * while the seqcount will avoid the removal if anything has
+ * changed since we started.
+ */
+ guard(rwsem_read)(&uprobe->register_rwsem);
+ if (read_seqcount_retry(&uprobe->register_seq, seq))
+ return;
unapply_uprobe(uprobe, current->mm);
- up_read(&uprobe->register_rwsem);
+ }
}
static void
@@ -2145,12 +2185,12 @@ handle_uretprobe_chain(struct return_ins
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ guard(srcu)(&uprobes_srcu);
+
+ for_each_consumer_rcu(uc, uprobe->consumers) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
- up_read(&uprobe->register_rwsem);
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
+ bpf@vger, please cc bpf ML for the next revision, these changes are
very relevant there as well, thanks
On Thu, Jul 11, 2024 at 4:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> With handle_swbp() hitting concurrently on (all) CPUs the
> uprobe->register_rwsem can get very contended. Add an SRCU instance to
> cover the consumer list and consumer lifetime.
>
> Since the consumer are externally embedded structures, unregister will
> have to suffer a synchronize_srcu().
>
> A notably complication is the UPROBE_HANDLER_REMOVE logic which can
> race against uprobe_register() such that it might want to remove a
> freshly installer handler that didn't get called. In order to close
> this hole, a seqcount is added. With that, the removal path can tell
> if anything changed and bail out of the removal.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/events/uprobes.c | 60 ++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 50 insertions(+), 10 deletions(-)
>
[...]
> @@ -800,7 +808,7 @@ static bool consumer_del(struct uprobe *
> down_write(&uprobe->consumer_rwsem);
> for (con = &uprobe->consumers; *con; con = &(*con)->next) {
> if (*con == uc) {
> - *con = uc->next;
> + WRITE_ONCE(*con, uc->next);
I have a dumb and mechanical question.
Above in consumer_add() you are doing WRITE_ONCE() for uc->next
assignment, but rcu_assign_pointer() for uprobe->consumers. Here, you
are doing WRITE_ONCE() for the same operation, if it so happens that
uc == *con == uprobe->consumers. So is rcu_assign_pointer() necessary
in consumer_addr()? If yes, we should have it here as well, no? And if
not, why bother with it in consumer_add()?
> ret = true;
> break;
> }
> @@ -1139,9 +1147,13 @@ void uprobe_unregister(struct inode *ino
> return;
>
> down_write(&uprobe->register_rwsem);
> + raw_write_seqcount_begin(&uprobe->register_seq);
> __uprobe_unregister(uprobe, uc);
> + raw_write_seqcount_end(&uprobe->register_seq);
> up_write(&uprobe->register_rwsem);
> put_uprobe(uprobe);
> +
> + synchronize_srcu(&uprobes_srcu);
> }
> EXPORT_SYMBOL_GPL(uprobe_unregister);
[...]
On Fri, Jul 12, 2024 at 02:06:08PM -0700, Andrii Nakryiko wrote:
> + bpf@vger, please cc bpf ML for the next revision, these changes are
> very relevant there as well, thanks
>
> On Thu, Jul 11, 2024 at 4:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > With handle_swbp() hitting concurrently on (all) CPUs the
> > uprobe->register_rwsem can get very contended. Add an SRCU instance to
> > cover the consumer list and consumer lifetime.
> >
> > Since the consumer are externally embedded structures, unregister will
> > have to suffer a synchronize_srcu().
> >
> > A notably complication is the UPROBE_HANDLER_REMOVE logic which can
> > race against uprobe_register() such that it might want to remove a
> > freshly installer handler that didn't get called. In order to close
> > this hole, a seqcount is added. With that, the removal path can tell
> > if anything changed and bail out of the removal.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > kernel/events/uprobes.c | 60 ++++++++++++++++++++++++++++++++++++++++--------
> > 1 file changed, 50 insertions(+), 10 deletions(-)
> >
>
> [...]
>
> > @@ -800,7 +808,7 @@ static bool consumer_del(struct uprobe *
> > down_write(&uprobe->consumer_rwsem);
> > for (con = &uprobe->consumers; *con; con = &(*con)->next) {
> > if (*con == uc) {
> > - *con = uc->next;
> > + WRITE_ONCE(*con, uc->next);
>
> I have a dumb and mechanical question.
>
> Above in consumer_add() you are doing WRITE_ONCE() for uc->next
> assignment, but rcu_assign_pointer() for uprobe->consumers. Here, you
> are doing WRITE_ONCE() for the same operation, if it so happens that
> uc == *con == uprobe->consumers. So is rcu_assign_pointer() necessary
> in consumer_addr()? If yes, we should have it here as well, no? And if
> not, why bother with it in consumer_add()?
add is a publish and needs to ensure all stores to the object are
ordered before the object is linked in. Note that rcu_assign_pointer()
is basically a fancy way of writing smp_store_release().
del otoh does not publish, it removes and doesn't need ordering.
It does however need to ensure the pointer write itself isn't torn. That
is, without the WRITE_ONCE() the compiler is free to do byte stores in
order to update the 8 byte pointer value (assuming 64bit). This is
pretty dumb, but very much permitted by C and also utterly fatal in the
case where an RCU iteration comes by and reads a half-half pointer
value.
> > ret = true;
> > break;
> > }
> > @@ -1139,9 +1147,13 @@ void uprobe_unregister(struct inode *ino
> > return;
> >
> > down_write(&uprobe->register_rwsem);
> > + raw_write_seqcount_begin(&uprobe->register_seq);
> > __uprobe_unregister(uprobe, uc);
> > + raw_write_seqcount_end(&uprobe->register_seq);
> > up_write(&uprobe->register_rwsem);
> > put_uprobe(uprobe);
> > +
> > + synchronize_srcu(&uprobes_srcu);
> > }
> > EXPORT_SYMBOL_GPL(uprobe_unregister);
>
> [...]
On Mon, Jul 15, 2024 at 4:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jul 12, 2024 at 02:06:08PM -0700, Andrii Nakryiko wrote:
> > + bpf@vger, please cc bpf ML for the next revision, these changes are
> > very relevant there as well, thanks
> >
> > On Thu, Jul 11, 2024 at 4:07 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > With handle_swbp() hitting concurrently on (all) CPUs the
> > > uprobe->register_rwsem can get very contended. Add an SRCU instance to
> > > cover the consumer list and consumer lifetime.
> > >
> > > Since the consumer are externally embedded structures, unregister will
> > > have to suffer a synchronize_srcu().
> > >
> > > A notably complication is the UPROBE_HANDLER_REMOVE logic which can
> > > race against uprobe_register() such that it might want to remove a
> > > freshly installer handler that didn't get called. In order to close
> > > this hole, a seqcount is added. With that, the removal path can tell
> > > if anything changed and bail out of the removal.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > > kernel/events/uprobes.c | 60 ++++++++++++++++++++++++++++++++++++++++--------
> > > 1 file changed, 50 insertions(+), 10 deletions(-)
> > >
> >
> > [...]
> >
> > > @@ -800,7 +808,7 @@ static bool consumer_del(struct uprobe *
> > > down_write(&uprobe->consumer_rwsem);
> > > for (con = &uprobe->consumers; *con; con = &(*con)->next) {
> > > if (*con == uc) {
> > > - *con = uc->next;
> > > + WRITE_ONCE(*con, uc->next);
> >
> > I have a dumb and mechanical question.
> >
> > Above in consumer_add() you are doing WRITE_ONCE() for uc->next
> > assignment, but rcu_assign_pointer() for uprobe->consumers. Here, you
> > are doing WRITE_ONCE() for the same operation, if it so happens that
> > uc == *con == uprobe->consumers. So is rcu_assign_pointer() necessary
> > in consumer_addr()? If yes, we should have it here as well, no? And if
> > not, why bother with it in consumer_add()?
>
> add is a publish and needs to ensure all stores to the object are
> ordered before the object is linked in. Note that rcu_assign_pointer()
> is basically a fancy way of writing smp_store_release().
>
> del otoh does not publish, it removes and doesn't need ordering.
>
> It does however need to ensure the pointer write itself isn't torn. That
> is, without the WRITE_ONCE() the compiler is free to do byte stores in
> order to update the 8 byte pointer value (assuming 64bit). This is
> pretty dumb, but very much permitted by C and also utterly fatal in the
> case where an RCU iteration comes by and reads a half-half pointer
> value.
>
Thanks for elaborating, very helpful! It's basically the same idea as
with rb_find_add_rcu(), got it.
> > > ret = true;
> > > break;
> > > }
> > > @@ -1139,9 +1147,13 @@ void uprobe_unregister(struct inode *ino
> > > return;
> > >
> > > down_write(&uprobe->register_rwsem);
> > > + raw_write_seqcount_begin(&uprobe->register_seq);
> > > __uprobe_unregister(uprobe, uc);
> > > + raw_write_seqcount_end(&uprobe->register_seq);
> > > up_write(&uprobe->register_rwsem);
> > > put_uprobe(uprobe);
> > > +
> > > + synchronize_srcu(&uprobes_srcu);
> > > }
> > > EXPORT_SYMBOL_GPL(uprobe_unregister);
> >
> > [...]
© 2016 - 2025 Red Hat, Inc.