Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
makes it unsuitable to be called from more restricted context like softirq.
Let's make put_uprobe() agnostic to the context in which it is called,
and use work queue to defer the mutex-protected clean up steps.
To avoid unnecessarily increasing the size of struct uprobe, we colocate
work_struct in parallel with rb_node and rcu, both of which are unused
by the time we get to schedule clean up work.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
1 file changed, 27 insertions(+), 3 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a2e6a57f79f2..377bd524bc8b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
#include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
#include <linux/uprobes.h>
@@ -54,14 +55,20 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
#define UPROBE_COPY_INSN 0
struct uprobe {
- struct rb_node rb_node; /* node in the rb tree */
+ union {
+ struct {
+ struct rb_node rb_node; /* node in the rb tree */
+ struct rcu_head rcu;
+ };
+ /* work is used only during freeing, rcu and rb_node are unused at that point */
+ struct work_struct work;
+ };
refcount_t ref;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
- struct rcu_head rcu;
loff_t offset;
loff_t ref_ctr_offset;
unsigned long flags;
@@ -620,11 +627,28 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
+static void uprobe_free_deferred(struct work_struct *work)
+{
+ struct uprobe *uprobe = container_of(work, struct uprobe, work);
+
+ /*
+ * If application munmap(exec_vma) before uprobe_unregister()
+ * gets called, we don't get a chance to remove uprobe from
+ * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ */
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(uprobe, NULL);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ kfree(uprobe);
+}
+
static void uprobe_free_rcu(struct rcu_head *rcu)
{
struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
- kfree(uprobe);
+ INIT_WORK(&uprobe->work, uprobe_free_deferred);
+ schedule_work(&uprobe->work);
}
static void put_uprobe(struct uprobe *uprobe)
--
2.43.5
On 09/09, Andrii Nakryiko wrote:
>
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted context like softirq.
>
> Let's make put_uprobe() agnostic to the context in which it is called,
> and use work queue to defer the mutex-protected clean up steps.
...
> +static void uprobe_free_deferred(struct work_struct *work)
> +{
> + struct uprobe *uprobe = container_of(work, struct uprobe, work);
> +
> + /*
> + * If application munmap(exec_vma) before uprobe_unregister()
> + * gets called, we don't get a chance to remove uprobe from
> + * delayed_uprobe_list from remove_breakpoint(). Do it here.
> + */
> + mutex_lock(&delayed_uprobe_lock);
> + delayed_uprobe_remove(uprobe, NULL);
> + mutex_unlock(&delayed_uprobe_lock);
> +
> + kfree(uprobe);
> +}
> +
> static void uprobe_free_rcu(struct rcu_head *rcu)
> {
> struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
>
> - kfree(uprobe);
> + INIT_WORK(&uprobe->work, uprobe_free_deferred);
> + schedule_work(&uprobe->work);
> }
This is still wrong afaics...
If put_uprobe() can be called from softirq (after the next patch), then
put_uprobe() and all other users of uprobes_treelock should use
write_lock_bh/read_lock_bh to avoid the deadlock.
To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
try to do this later, but I am sure I will never like it, sorry.
Oleg.
On Sun, Sep 15, 2024 at 4:49 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 09/09, Andrii Nakryiko wrote:
> >
> > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > makes it unsuitable to be called from more restricted context like softirq.
> >
> > Let's make put_uprobe() agnostic to the context in which it is called,
> > and use work queue to defer the mutex-protected clean up steps.
>
> ...
>
> > +static void uprobe_free_deferred(struct work_struct *work)
> > +{
> > + struct uprobe *uprobe = container_of(work, struct uprobe, work);
> > +
> > + /*
> > + * If application munmap(exec_vma) before uprobe_unregister()
> > + * gets called, we don't get a chance to remove uprobe from
> > + * delayed_uprobe_list from remove_breakpoint(). Do it here.
> > + */
> > + mutex_lock(&delayed_uprobe_lock);
> > + delayed_uprobe_remove(uprobe, NULL);
> > + mutex_unlock(&delayed_uprobe_lock);
> > +
> > + kfree(uprobe);
> > +}
> > +
> > static void uprobe_free_rcu(struct rcu_head *rcu)
> > {
> > struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> >
> > - kfree(uprobe);
> > + INIT_WORK(&uprobe->work, uprobe_free_deferred);
> > + schedule_work(&uprobe->work);
> > }
>
> This is still wrong afaics...
>
> If put_uprobe() can be called from softirq (after the next patch), then
> put_uprobe() and all other users of uprobes_treelock should use
> write_lock_bh/read_lock_bh to avoid the deadlock.
Ok, I see the problem, that's unfortunate.
I see three ways to handle that:
1) keep put_uprobe() as is, and instead do schedule_work() from the
timer thread to postpone put_uprobe(). (but I'm not a big fan of this)
2) move uprobes_treelock part of put_uprobe() into rcu callback, I
think it has no bearing on correctness, uprobe_is_active() is there
already to handle races between putting uprobe and removing it from
uprobes_tree (I prefer this one over #1 )
3) you might like this the most ;) I think I can simplify
hprobes_expire() from patch #2 to not need put_uprobe() at all, if I
protect uprobe lifetime with non-sleepable
rcu_read_lock()/rcu_read_unlock() and perform try_get_uprobe() as the
very last step after cmpxchg() succeeded.
I'm leaning towards #3, but #2 seems fine to me as well.
>
> To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
> try to do this later, but I am sure I will never like it, sorry.
This might sound rude, but the goal here is not to make you like it :)
The goal is to improve performance with minimal complexity. And I'm
very open to any alternative proposals as to how to make uretprobes
RCU-protected to avoid refcounting in the hot path.
I think #3 proposal above will make it a bit more palatable (but there
is still locklessness, cmpxchg, etc, I see no way around that,
unfortunately).
>
> Oleg.
>
On Tue, Sep 17, 2024 at 1:19 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Sun, Sep 15, 2024 at 4:49 PM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > On 09/09, Andrii Nakryiko wrote:
> > >
> > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > makes it unsuitable to be called from more restricted context like softirq.
> > >
> > > Let's make put_uprobe() agnostic to the context in which it is called,
> > > and use work queue to defer the mutex-protected clean up steps.
> >
> > ...
> >
> > > +static void uprobe_free_deferred(struct work_struct *work)
> > > +{
> > > + struct uprobe *uprobe = container_of(work, struct uprobe, work);
> > > +
> > > + /*
> > > + * If application munmap(exec_vma) before uprobe_unregister()
> > > + * gets called, we don't get a chance to remove uprobe from
> > > + * delayed_uprobe_list from remove_breakpoint(). Do it here.
> > > + */
> > > + mutex_lock(&delayed_uprobe_lock);
> > > + delayed_uprobe_remove(uprobe, NULL);
> > > + mutex_unlock(&delayed_uprobe_lock);
> > > +
> > > + kfree(uprobe);
> > > +}
> > > +
> > > static void uprobe_free_rcu(struct rcu_head *rcu)
> > > {
> > > struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> > >
> > > - kfree(uprobe);
> > > + INIT_WORK(&uprobe->work, uprobe_free_deferred);
> > > + schedule_work(&uprobe->work);
> > > }
> >
> > This is still wrong afaics...
> >
> > If put_uprobe() can be called from softirq (after the next patch), then
> > put_uprobe() and all other users of uprobes_treelock should use
> > write_lock_bh/read_lock_bh to avoid the deadlock.
>
> Ok, I see the problem, that's unfortunate.
>
> I see three ways to handle that:
>
> 1) keep put_uprobe() as is, and instead do schedule_work() from the
> timer thread to postpone put_uprobe(). (but I'm not a big fan of this)
> 2) move uprobes_treelock part of put_uprobe() into rcu callback, I
> think it has no bearing on correctness, uprobe_is_active() is there
> already to handle races between putting uprobe and removing it from
> uprobes_tree (I prefer this one over #1 )
> 3) you might like this the most ;) I think I can simplify
> hprobes_expire() from patch #2 to not need put_uprobe() at all, if I
> protect uprobe lifetime with non-sleepable
> rcu_read_lock()/rcu_read_unlock() and perform try_get_uprobe() as the
> very last step after cmpxchg() succeeded.
>
> I'm leaning towards #3, but #2 seems fine to me as well.
Ok, so just a short update. I don't think #3 works, I do need
try_get_uprobe() before I know for sure that cmpxchg() succeeds. Which
means I'd need a compensating put_uprobe() if cmpxchg() fails. So for
put_uprobe(), I just made it do all the locking in deferred work
callback (which is #2 above), which I think resolved the issue you
pointed out with potential deadlock and removes any limitations on
put_uprobe().
Also, I rewrote the hprobe_consume() and hprobe_expire() in terms of
an explicit state machine with 4 possible states (LEASED, STABLE,
GONE, CONSUMED), which I think makes the logic a bit more
straightforward to follow. Hopefully that will make the change more
palatable for you. I'm probably going to post patches next week,
though.
>
> >
> > To be honest... I simply can't force myself to even try to read 2/3 ;) I'll
> > try to do this later, but I am sure I will never like it, sorry.
>
> This might sound rude, but the goal here is not to make you like it :)
> The goal is to improve performance with minimal complexity. And I'm
> very open to any alternative proposals as to how to make uretprobes
> RCU-protected to avoid refcounting in the hot path.
>
> I think #3 proposal above will make it a bit more palatable (but there
> is still locklessness, cmpxchg, etc, I see no way around that,
> unfortunately).
>
> >
> > Oleg.
> >
On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> makes it unsuitable to be called from more restricted context like softirq.
>
> Let's make put_uprobe() agnostic to the context in which it is called,
> and use work queue to defer the mutex-protected clean up steps.
>
> To avoid unnecessarily increasing the size of struct uprobe, we colocate
> work_struct in parallel with rb_node and rcu, both of which are unused
> by the time we get to schedule clean up work.
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
> kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
> 1 file changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index a2e6a57f79f2..377bd524bc8b 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -27,6 +27,7 @@
> #include <linux/shmem_fs.h>
> #include <linux/khugepaged.h>
> #include <linux/rcupdate_trace.h>
> +#include <linux/workqueue.h>
>
> #include <linux/uprobes.h>
>
> @@ -54,14 +55,20 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
> #define UPROBE_COPY_INSN 0
>
> struct uprobe {
> - struct rb_node rb_node; /* node in the rb tree */
> + union {
> + struct {
> + struct rb_node rb_node; /* node in the rb tree */
> + struct rcu_head rcu;
> + };
> + /* work is used only during freeing, rcu and rb_node are unused at that point */
> + struct work_struct work;
> + };
> refcount_t ref;
> struct rw_semaphore register_rwsem;
> struct rw_semaphore consumer_rwsem;
> struct list_head pending_list;
> struct list_head consumers;
> struct inode *inode; /* Also hold a ref to inode */
> - struct rcu_head rcu;
> loff_t offset;
> loff_t ref_ctr_offset;
> unsigned long flags;
> @@ -620,11 +627,28 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
> return !RB_EMPTY_NODE(&uprobe->rb_node);
> }
>
> +static void uprobe_free_deferred(struct work_struct *work)
> +{
> + struct uprobe *uprobe = container_of(work, struct uprobe, work);
> +
> + /*
> + * If application munmap(exec_vma) before uprobe_unregister()
> + * gets called, we don't get a chance to remove uprobe from
> + * delayed_uprobe_list from remove_breakpoint(). Do it here.
> + */
> + mutex_lock(&delayed_uprobe_lock);
> + delayed_uprobe_remove(uprobe, NULL);
> + mutex_unlock(&delayed_uprobe_lock);
> +
> + kfree(uprobe);
> +}
> +
> static void uprobe_free_rcu(struct rcu_head *rcu)
> {
> struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
>
> - kfree(uprobe);
> + INIT_WORK(&uprobe->work, uprobe_free_deferred);
> + schedule_work(&uprobe->work);
> }
>
> static void put_uprobe(struct uprobe *uprobe)
It seems put_uprobe hunk was lost, since the patch is not doing
what commit log describes.
On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > makes it unsuitable to be called from more restricted context like softirq.
> >
> > Let's make put_uprobe() agnostic to the context in which it is called,
> > and use work queue to defer the mutex-protected clean up steps.
> >
> > To avoid unnecessarily increasing the size of struct uprobe, we colocate
> > work_struct in parallel with rb_node and rcu, both of which are unused
> > by the time we get to schedule clean up work.
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> > kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
> > 1 file changed, 27 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index a2e6a57f79f2..377bd524bc8b 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -27,6 +27,7 @@
> > #include <linux/shmem_fs.h>
> > #include <linux/khugepaged.h>
> > #include <linux/rcupdate_trace.h>
> > +#include <linux/workqueue.h>
> >
> > #include <linux/uprobes.h>
> >
> > @@ -54,14 +55,20 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
> > #define UPROBE_COPY_INSN 0
> >
> > struct uprobe {
> > - struct rb_node rb_node; /* node in the rb tree */
> > + union {
> > + struct {
> > + struct rb_node rb_node; /* node in the rb tree */
> > + struct rcu_head rcu;
> > + };
> > + /* work is used only during freeing, rcu and rb_node are unused at that point */
> > + struct work_struct work;
> > + };
> > refcount_t ref;
> > struct rw_semaphore register_rwsem;
> > struct rw_semaphore consumer_rwsem;
> > struct list_head pending_list;
> > struct list_head consumers;
> > struct inode *inode; /* Also hold a ref to inode */
> > - struct rcu_head rcu;
> > loff_t offset;
> > loff_t ref_ctr_offset;
> > unsigned long flags;
> > @@ -620,11 +627,28 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
> > return !RB_EMPTY_NODE(&uprobe->rb_node);
> > }
> >
> > +static void uprobe_free_deferred(struct work_struct *work)
> > +{
> > + struct uprobe *uprobe = container_of(work, struct uprobe, work);
> > +
> > + /*
> > + * If application munmap(exec_vma) before uprobe_unregister()
> > + * gets called, we don't get a chance to remove uprobe from
> > + * delayed_uprobe_list from remove_breakpoint(). Do it here.
> > + */
> > + mutex_lock(&delayed_uprobe_lock);
> > + delayed_uprobe_remove(uprobe, NULL);
> > + mutex_unlock(&delayed_uprobe_lock);
> > +
> > + kfree(uprobe);
> > +}
> > +
> > static void uprobe_free_rcu(struct rcu_head *rcu)
> > {
> > struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> >
> > - kfree(uprobe);
> > + INIT_WORK(&uprobe->work, uprobe_free_deferred);
> > + schedule_work(&uprobe->work);
> > }
> >
> > static void put_uprobe(struct uprobe *uprobe)
>
> It seems put_uprobe hunk was lost, since the patch is not doing
> what commit log describes.
Hmm, put_uprobe() has:
call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
at the end (see [0], which added that), so we do schedule_work() in
RCU callback, similarly to what we do with bpf_map freeing in the BPF
subsystem.
This patch set is based on the latest tip/perf/core (and also assuming
the RCU Tasks Trace patch that mysteriously disappeared is actually
there, hopefully it will just as magically be restored).
[0] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/core&id=8617408f7a01e94ce1f73e40a7704530e5dfb25c
On Mon, Sep 9, 2024 at 10:13 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > >
> > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > makes it unsuitable to be called from more restricted context like softirq.
> > >
> > > Let's make put_uprobe() agnostic to the context in which it is called,
> > > and use work queue to defer the mutex-protected clean up steps.
> > >
> > > To avoid unnecessarily increasing the size of struct uprobe, we colocate
> > > work_struct in parallel with rb_node and rcu, both of which are unused
> > > by the time we get to schedule clean up work.
> > >
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > ---
> > > kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
> > > 1 file changed, 27 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index a2e6a57f79f2..377bd524bc8b 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -27,6 +27,7 @@
> > > #include <linux/shmem_fs.h>
> > > #include <linux/khugepaged.h>
> > > #include <linux/rcupdate_trace.h>
> > > +#include <linux/workqueue.h>
> > >
> > > #include <linux/uprobes.h>
> > >
> > > @@ -54,14 +55,20 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
> > > #define UPROBE_COPY_INSN 0
> > >
> > > struct uprobe {
> > > - struct rb_node rb_node; /* node in the rb tree */
> > > + union {
> > > + struct {
> > > + struct rb_node rb_node; /* node in the rb tree */
> > > + struct rcu_head rcu;
> > > + };
> > > + /* work is used only during freeing, rcu and rb_node are unused at that point */
> > > + struct work_struct work;
> > > + };
> > > refcount_t ref;
> > > struct rw_semaphore register_rwsem;
> > > struct rw_semaphore consumer_rwsem;
> > > struct list_head pending_list;
> > > struct list_head consumers;
> > > struct inode *inode; /* Also hold a ref to inode */
> > > - struct rcu_head rcu;
> > > loff_t offset;
> > > loff_t ref_ctr_offset;
> > > unsigned long flags;
> > > @@ -620,11 +627,28 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
> > > return !RB_EMPTY_NODE(&uprobe->rb_node);
> > > }
> > >
> > > +static void uprobe_free_deferred(struct work_struct *work)
> > > +{
> > > + struct uprobe *uprobe = container_of(work, struct uprobe, work);
> > > +
> > > + /*
> > > + * If application munmap(exec_vma) before uprobe_unregister()
> > > + * gets called, we don't get a chance to remove uprobe from
> > > + * delayed_uprobe_list from remove_breakpoint(). Do it here.
> > > + */
> > > + mutex_lock(&delayed_uprobe_lock);
> > > + delayed_uprobe_remove(uprobe, NULL);
> > > + mutex_unlock(&delayed_uprobe_lock);
> > > +
> > > + kfree(uprobe);
> > > +}
> > > +
> > > static void uprobe_free_rcu(struct rcu_head *rcu)
> > > {
> > > struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> > >
> > > - kfree(uprobe);
> > > + INIT_WORK(&uprobe->work, uprobe_free_deferred);
> > > + schedule_work(&uprobe->work);
> > > }
> > >
> > > static void put_uprobe(struct uprobe *uprobe)
> >
> > It seems put_uprobe hunk was lost, since the patch is not doing
> > what commit log describes.
>
>
> Hmm, put_uprobe() has:
>
> call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
>
> at the end (see [0], which added that), so we do schedule_work() in
> RCU callback, similarly to what we do with bpf_map freeing in the BPF
> subsystem.
>
> This patch set is based on the latest tip/perf/core (and also assuming
> the RCU Tasks Trace patch that mysteriously disappeared is actually
> there, hopefully it will just as magically be restored).
>
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/core&id=8617408f7a01e94ce1f73e40a7704530e5dfb25c
I'm still not following.
put_uprobe() did delayed_uprobe_remove() and this patch is doing it again.
The commit log implies that mutex+delayed_uprobe_remove should be removed
from put_uprobe(), but that's not what the patch is doing.
On Tue, Sep 10, 2024 at 8:56 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 10:13 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Sep 9, 2024 at 7:52 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Sep 9, 2024 at 3:49 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > >
> > > > Currently put_uprobe() might trigger mutex_lock()/mutex_unlock(), which
> > > > makes it unsuitable to be called from more restricted context like softirq.
> > > >
> > > > Let's make put_uprobe() agnostic to the context in which it is called,
> > > > and use work queue to defer the mutex-protected clean up steps.
> > > >
> > > > To avoid unnecessarily increasing the size of struct uprobe, we colocate
> > > > work_struct in parallel with rb_node and rcu, both of which are unused
> > > > by the time we get to schedule clean up work.
> > > >
> > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > > ---
> > > > kernel/events/uprobes.c | 30 +++++++++++++++++++++++++++---
> > > > 1 file changed, 27 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > index a2e6a57f79f2..377bd524bc8b 100644
> > > > --- a/kernel/events/uprobes.c
> > > > +++ b/kernel/events/uprobes.c
> > > > @@ -27,6 +27,7 @@
> > > > #include <linux/shmem_fs.h>
> > > > #include <linux/khugepaged.h>
> > > > #include <linux/rcupdate_trace.h>
> > > > +#include <linux/workqueue.h>
> > > >
> > > > #include <linux/uprobes.h>
> > > >
> > > > @@ -54,14 +55,20 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
> > > > #define UPROBE_COPY_INSN 0
> > > >
> > > > struct uprobe {
> > > > - struct rb_node rb_node; /* node in the rb tree */
> > > > + union {
> > > > + struct {
> > > > + struct rb_node rb_node; /* node in the rb tree */
> > > > + struct rcu_head rcu;
> > > > + };
> > > > + /* work is used only during freeing, rcu and rb_node are unused at that point */
> > > > + struct work_struct work;
> > > > + };
> > > > refcount_t ref;
> > > > struct rw_semaphore register_rwsem;
> > > > struct rw_semaphore consumer_rwsem;
> > > > struct list_head pending_list;
> > > > struct list_head consumers;
> > > > struct inode *inode; /* Also hold a ref to inode */
> > > > - struct rcu_head rcu;
> > > > loff_t offset;
> > > > loff_t ref_ctr_offset;
> > > > unsigned long flags;
> > > > @@ -620,11 +627,28 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
> > > > return !RB_EMPTY_NODE(&uprobe->rb_node);
> > > > }
> > > >
> > > > +static void uprobe_free_deferred(struct work_struct *work)
> > > > +{
> > > > + struct uprobe *uprobe = container_of(work, struct uprobe, work);
> > > > +
> > > > + /*
> > > > + * If application munmap(exec_vma) before uprobe_unregister()
> > > > + * gets called, we don't get a chance to remove uprobe from
> > > > + * delayed_uprobe_list from remove_breakpoint(). Do it here.
> > > > + */
> > > > + mutex_lock(&delayed_uprobe_lock);
> > > > + delayed_uprobe_remove(uprobe, NULL);
> > > > + mutex_unlock(&delayed_uprobe_lock);
> > > > +
> > > > + kfree(uprobe);
> > > > +}
> > > > +
> > > > static void uprobe_free_rcu(struct rcu_head *rcu)
> > > > {
> > > > struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> > > >
> > > > - kfree(uprobe);
> > > > + INIT_WORK(&uprobe->work, uprobe_free_deferred);
> > > > + schedule_work(&uprobe->work);
> > > > }
> > > >
> > > > static void put_uprobe(struct uprobe *uprobe)
> > >
> > > It seems put_uprobe hunk was lost, since the patch is not doing
> > > what commit log describes.
> >
> >
> > Hmm, put_uprobe() has:
> >
> > call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
> >
> > at the end (see [0], which added that), so we do schedule_work() in
> > RCU callback, similarly to what we do with bpf_map freeing in the BPF
> > subsystem.
> >
> > This patch set is based on the latest tip/perf/core (and also assuming
> > the RCU Tasks Trace patch that mysteriously disappeared is actually
> > there, hopefully it will just as magically be restored).
> >
> > [0] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/core&id=8617408f7a01e94ce1f73e40a7704530e5dfb25c
>
> I'm still not following.
> put_uprobe() did delayed_uprobe_remove() and this patch is doing it again.
>
> The commit log implies that mutex+delayed_uprobe_remove should be removed
> from put_uprobe(), but that's not what the patch is doing.
Ah, that part. You are right, it's my bad. I copied that lock to a
work queue callback, but didn't remove it from the put_uprobe(). Will
fix it in v2, thanks for catching!
© 2016 - 2026 Red Hat, Inc.