With handle_swbp() triggering concurrently on (all) CPUs, uprobes_treelock
becomes a bottleneck. Avoid the treelock on the lookup path by doing RCU
lookups of the uprobe, validated with a seqcount when the lockless lookup
finds nothing.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 40 ++++++++++++++++++++++++++++++++--------
1 file changed, 32 insertions(+), 8 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
@@ -54,6 +55,7 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem)
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
refcount_t ref;
+ struct rcu_head rcu;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
@@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct
return uprobe;
}
+static void uprobe_free_rcu(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+ kfree(uprobe);
+}
+
static void put_uprobe(struct uprobe *uprobe)
{
if (refcount_dec_and_test(&uprobe->ref)) {
@@ -604,7 +612,7 @@ static void put_uprobe(struct uprobe *up
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- kfree(uprobe);
+ call_rcu(&uprobe->rcu, uprobe_free_rcu);
}
}
@@ -653,7 +661,7 @@ static struct uprobe *__find_uprobe(stru
.inode = inode,
.offset = offset,
};
- struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
+ struct rb_node *node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
if (node)
return get_uprobe(__node_2_uprobe(node));
@@ -667,20 +675,32 @@ static struct uprobe *__find_uprobe(stru
*/
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
- struct uprobe *uprobe;
+ unsigned int seq;
- read_lock(&uprobes_treelock);
- uprobe = __find_uprobe(inode, offset);
- read_unlock(&uprobes_treelock);
+ guard(rcu)();
- return uprobe;
+ do {
+ seq = read_seqcount_begin(&uprobes_seqcount);
+ struct uprobe *uprobe = __find_uprobe(inode, offset);
+ if (uprobe) {
+ /*
+ * Lockless RB-tree lookups are prone to false-negatives.
+ * If they find something, it's good. If they do not find,
+ * it needs to be validated.
+ */
+ return uprobe;
+ }
+ } while (read_seqcount_retry(&uprobes_seqcount, seq));
+
+ /* Really didn't find anything. */
+ return NULL;
}
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct rb_node *node;
- node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
if (node)
return get_uprobe(__node_2_uprobe(node));
@@ -702,7 +722,9 @@ static struct uprobe *insert_uprobe(stru
struct uprobe *u;
write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
+ write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
return u;
@@ -936,7 +958,9 @@ static void delete_uprobe(struct uprobe
return;
write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
rb_erase(&uprobe->rb_node, &uprobes_tree);
+ write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
put_uprobe(uprobe);
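For reference, with the hunks above applied the lookup path reads as a whole
like this (reassembled from the diff; nothing here beyond what the patch adds):

static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        unsigned int seq;

        /* Readers hold only RCU; the seqcount detects concurrent tree updates. */
        guard(rcu)();

        do {
                seq = read_seqcount_begin(&uprobes_seqcount);
                struct uprobe *uprobe = __find_uprobe(inode, offset);
                if (uprobe) {
                        /*
                         * Lockless RB-tree lookups are prone to false-negatives.
                         * If they find something, it's good. If they do not find,
                         * it needs to be validated.
                         */
                        return uprobe;
                }
        } while (read_seqcount_retry(&uprobes_seqcount, seq));

        /* Really didn't find anything. */
        return NULL;
}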
I hate to say this again, but I'll try to read this series later ;)
But let me ask...
On 07/08, Peter Zijlstra wrote:
>
> +static void uprobe_free_rcu(struct rcu_head *rcu)
> +{
> + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> + kfree(uprobe);
> +}
> +
> static void put_uprobe(struct uprobe *uprobe)
> {
> if (refcount_dec_and_test(&uprobe->ref)) {
> @@ -604,7 +612,7 @@ static void put_uprobe(struct uprobe *up
> mutex_lock(&delayed_uprobe_lock);
> delayed_uprobe_remove(uprobe, NULL);
> mutex_unlock(&delayed_uprobe_lock);
> - kfree(uprobe);
> + call_rcu(&uprobe->rcu, uprobe_free_rcu);
kfree_rcu() ?
> static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> {
> - struct uprobe *uprobe;
> + unsigned int seq;
>
> - read_lock(&uprobes_treelock);
> - uprobe = __find_uprobe(inode, offset);
> - read_unlock(&uprobes_treelock);
> + guard(rcu)();
>
> - return uprobe;
> + do {
> + seq = read_seqcount_begin(&uprobes_seqcount);
> + struct uprobe *uprobe = __find_uprobe(inode, offset);
> + if (uprobe) {
> + /*
> + * Lockless RB-tree lookups are prone to false-negatives.
> + * If they find something, it's good.
Is it true in this case?
Suppose we have uprobe U which has no extra refs, so uprobe_unregister()
called by the task X should remove it from uprobes_tree and kfree.
Suppose that the task T hits the breakpoint and enters handle_swbp().
Now,
- X calls find_uprobe(), this increments U->ref from 1 to 2
register_for_each_vma() succeeds
X enters delete_uprobe()
- T calls find_active_uprobe() -> find_uprobe()
read_seqcount_begin() returns an even number
__find_uprobe() -> rb_find_rcu() succeeds
- X continues and returns from delete_uprobe(), U->ref == 1
then it does the final uprobe_unregister()->put_uprobe(U),
refcount_dec_and_test() succeeds, X calls call_rcu(uprobe_free_rcu).
- T does get_uprobe() which changes U->ref from 0 to 1, __find_uprobe()
returns, find_uprobe() doesn't check read_seqcount_retry().
- T returns from find_active_uprobe() and plays with the "soon to be
freed" U.
Looks like __find_uprobe() needs refcount_inc_not_zero() or return NULL, but
I am not sure this is the only problem...
No?
Oleg.
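For illustration only, the refcount_inc_not_zero() variant being hinted at
could look roughly like the sketch below. try_get_uprobe() is a hypothetical
helper, not something in the posted series; the rest follows the lookup shown
in the patch:

/* Hypothetical: take a reference only if the uprobe is still live. */
static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
{
        if (refcount_inc_not_zero(&uprobe->ref))
                return uprobe;
        return NULL;
}

static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
        struct __uprobe_key key = {
                .inode = inode,
                .offset = offset,
        };
        struct rb_node *node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);

        /* Return NULL instead of resurrecting a uprobe whose last ref is gone. */
        if (node)
                return try_get_uprobe(__node_2_uprobe(node));

        return NULL;
}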
On Mon, Jul 08, 2024 at 06:35:45PM +0200, Oleg Nesterov wrote:
> I hate to say this again, but I'll try to read this series later ;)
>
> But let me ask...
:-)
> On 07/08, Peter Zijlstra wrote:
> >
> > +static void uprobe_free_rcu(struct rcu_head *rcu)
> > +{
> > + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> > + kfree(uprobe);
> > +}
> > +
> > static void put_uprobe(struct uprobe *uprobe)
> > {
> > if (refcount_dec_and_test(&uprobe->ref)) {
> > @@ -604,7 +612,7 @@ static void put_uprobe(struct uprobe *up
> > mutex_lock(&delayed_uprobe_lock);
> > delayed_uprobe_remove(uprobe, NULL);
> > mutex_unlock(&delayed_uprobe_lock);
> > - kfree(uprobe);
> > + call_rcu(&uprobe->rcu, uprobe_free_rcu);
>
> kfree_rcu() ?
I can never remember how that works, also this will very soon be
call_srcu().
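For reference, kfree_rcu() takes the object plus the name of its embedded
rcu_head member, so the explicit callback would go away. It has no SRCU
flavour though, which is why an explicit callback is still needed once this
becomes call_srcu(). A minimal sketch of the kfree_rcu() form:

        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(uprobe, NULL);
        mutex_unlock(&delayed_uprobe_lock);
        /* Frees the uprobe after an RCU grace period via the embedded rcu_head. */
        kfree_rcu(uprobe, rcu);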
> > static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> > {
> > - struct uprobe *uprobe;
> > + unsigned int seq;
> >
> > - read_lock(&uprobes_treelock);
> > - uprobe = __find_uprobe(inode, offset);
> > - read_unlock(&uprobes_treelock);
> > + guard(rcu)();
> >
> > - return uprobe;
> > + do {
> > + seq = read_seqcount_begin(&uprobes_seqcount);
> > + struct uprobe *uprobe = __find_uprobe(inode, offset);
> > + if (uprobe) {
> > + /*
> > + * Lockless RB-tree lookups are prone to false-negatives.
> > + * If they find something, it's good.
>
> Is it true in this case?
>
> Suppose we have uprobe U which has no extra refs, so uprobe_unregister()
> called by the task X should remove it from uprobes_tree and kfree.
>
> Suppose that the task T hits the breakpoint and enters handle_swbp().
>
> Now,
>
> - X calls find_uprobe(), this increments U->ref from 1 to 2
>
> register_for_each_vma() succeeds
>
> X enters delete_uprobe()
>
> - T calls find_active_uprobe() -> find_uprobe()
>
> read_seqcount_begin() returns an even number
>
> __find_uprobe() -> rb_find_rcu() succeeds
>
> - X continues and returns from delete_uprobe(), U->ref == 1
>
> then it does the final uprobe_unregister()->put_uprobe(U),
> refcount_dec_and_test() succeeds, X calls call_rcu(uprobe_free_rcu).
>
> - T does get_uprobe() which changes U->ref from 0 to 1, __find_uprobe()
> returns, find_uprobe() doesn't check read_seqcount_retry().
I think you're right. However, this get_uprobe() will go away in a few
patches.
But yeah, I should perhaps have been more careful when splitting things
up :/
On 07/08, Peter Zijlstra wrote:
>
> On Mon, Jul 08, 2024 at 06:35:45PM +0200, Oleg Nesterov wrote:
> >
> > Suppose we have uprobe U which has no extra refs, so uprobe_unregister()
> > called by the task X should remove it from uprobes_tree and kfree.
> >
> > Suppose that the task T hits the breakpoint and enters handle_swbp().
> >
> > Now,
> >
> > - X calls find_uprobe(), this increments U->ref from 1 to 2
> >
> > register_for_each_vma() succeeds
> >
> > X enters delete_uprobe()
> >
> > - T calls find_active_uprobe() -> find_uprobe()
> >
> > read_seqcount_begin() returns an even number
> >
> > __find_uprobe() -> rb_find_rcu() succeeds
> >
> > - X continues and returns from delete_uprobe(), U->ref == 1
> >
> > then it does the final uprobe_unregister()->put_uprobe(U),
> > refcount_dec_and_test() succeeds, X calls call_rcu(uprobe_free_rcu).
> >
> > - T does get_uprobe() which changes U->ref from 0 to 1, __find_uprobe()
> > returns, find_uprobe() doesn't check read_seqcount_retry().
>
> I think you're right. However, this get_uprobe() will go away in a few
> patches.

OK, I am looking at 7/10 perf/uprobe: Convert (some) uprobe->refcount to SRCU

Yes, __find_uprobe() no longer does get_uprobe(). But at first glance we have
the same problem with this change

@@ -1977,7 +1979,7 @@ pre_ssout(struct uprobe *uprobe, struct
                return err;
        }

-       utask->active_uprobe = uprobe;
+       utask->active_uprobe = get_uprobe(uprobe);
        utask->state = UTASK_SSTEP;
        return 0;
 }

from 7/12 above. It can change uprobe->ref from 0 to 1 when
call_srcu(&uprobes_srcu, uprobe_free_rcu) was already scheduled.

Once guard(srcu)(&uprobes_srcu) in handle_swbp() drops the uprobes_srcu lock,
utask->active_uprobe can be freed.

Oleg.
On Tue, Jul 09, 2024 at 04:32:55PM +0200, Oleg Nesterov wrote:

> Once guard(srcu)(&uprobes_srcu) in handle_swbp() drops the uprobes_srcu lock,
> utask->active_uprobe can be freed.

Yeah, I've fixed all those already. It's a bit of churn, adding
inc_not_zero all over the place and then removing it again, but yeah, it
makes the individual patches better.

Let me finish the complete set and I'll push out.
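As a rough sketch of the inc_not_zero churn being described, the pre_ssout()
hunk quoted above could be guarded like this (hypothetical; the error value is
purely illustrative):

        /*
         * Only keep the uprobe across the single-step if it is still live;
         * otherwise the final put may already have scheduled the SRCU free.
         */
        if (!refcount_inc_not_zero(&uprobe->ref))
                return -ESRCH;
        utask->active_uprobe = uprobe;
        utask->state = UTASK_SSTEP;
        return 0;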
On 07/09, Peter Zijlstra wrote:
> On Tue, Jul 09, 2024 at 04:32:55PM +0200, Oleg Nesterov wrote:
>
> > Once guard(srcu)(&uprobes_srcu) in handle_swbp() drops the uprobes_srcu lock,
> > utask->active_uprobe can be freed.
>
> Yeah, I've fixed all those already. It's a bit of churn, adding
> inc_not_zero all over the place
I am wondering if we can move delayed_uprobe_remove() from put_uprobe()
to delete_uprobe()... probably not, I forgot everything.
But if we can, then we can probably do
put_uprobe(uprobe)
{
if (refcount_dec_and_test(&uprobe->ref))
kfree(uprobe);
}
uprobe_put_rcu(struct rcu_head *rcu)
{
uprobe = container_of(...);
put_uprobe(uprobe);
}
delete_uprobe(uprobe)
{
rb_erase(...);
delayed_uprobe_remove(...);
...
call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_put_rcu);
}
and avoid inc_not_zero.
Not sure, I am already exhausted ;)
Oleg.
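Putting that sketch together with the seqcount writer side from the patch, the
no-inc_not_zero scheme might look roughly like this. It assumes
delayed_uprobe_remove() really can move into delete_uprobe(), which is exactly
the part left open above:

static void put_uprobe(struct uprobe *uprobe)
{
        if (refcount_dec_and_test(&uprobe->ref))
                kfree(uprobe);
}

static void uprobe_put_rcu(struct rcu_head *rcu)
{
        struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);

        put_uprobe(uprobe);
}

static void delete_uprobe(struct uprobe *uprobe)
{
        write_lock(&uprobes_treelock);
        write_seqcount_begin(&uprobes_seqcount);
        rb_erase(&uprobe->rb_node, &uprobes_tree);
        write_seqcount_end(&uprobes_seqcount);
        write_unlock(&uprobes_treelock);
        RB_CLEAR_NODE(&uprobe->rb_node);        /* for uprobe_is_active() */

        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(uprobe, NULL);
        mutex_unlock(&delayed_uprobe_lock);

        /*
         * Drop the tree's reference only after an SRCU grace period, so
         * readers that found the uprobe under SRCU still see ref >= 1 and
         * never have to resurrect it from zero.
         */
        call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_put_rcu);
}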