For now, fgraph is used for the fprobe even if we only need to trace the
function entry. However, the performance of ftrace is better than that of
fgraph, so we can use ftrace_ops for this case.
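For reference, a minimal sketch (not part of this patch) of an entry-only
fprobe that now takes the ftrace_ops path; it assumes the ftrace_regs-based
fprobe_entry_cb signature used by this tree, and the probe/symbol names are
only examples:

  #include <linux/fprobe.h>

  /* Entry-only probe: no exit_handler, so after this change it is
   * attached via ftrace_ops instead of fgraph. */
  static int sample_entry(struct fprobe *fp, unsigned long entry_ip,
                          unsigned long ret_ip, struct ftrace_regs *fregs,
                          void *entry_data)
  {
          return 0;
  }

  static struct fprobe sample_fprobe = {
          .entry_handler = sample_entry,
          /* .exit_handler left unset on purpose */
  };

  /* e.g.: register_fprobe(&sample_fprobe, "vfs_read", NULL); */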
With this change, the performance of kprobe-multi increases from 54M/s to
69M/s. Before this commit:
$ ./benchs/run_bench_trigger.sh kprobe-multi
kprobe-multi : 54.663 ± 0.493M/s
After this commit:
$ ./benchs/run_bench_trigger.sh kprobe-multi
kprobe-multi : 69.447 ± 0.143M/s
CPU mitigations were disabled during the benchmark runs above.
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
v2:
- add some documentation for fprobe_fgraph_entry, as Masami suggested
- merge the rename of fprobe_entry into the current patch
- use ftrace_test_recursion_trylock() in fprobe_ftrace_entry()
---
kernel/trace/fprobe.c | 104 +++++++++++++++++++++++++++++++++++++-----
1 file changed, 93 insertions(+), 11 deletions(-)
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 99d83c08b9e2..bb02d6d09d6a 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -254,8 +254,9 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
return ret;
}
-static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
- struct ftrace_regs *fregs)
+/* fgraph_ops callback, this processes fprobes which have exit_handler. */
+static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
unsigned long *fgraph_data = NULL;
unsigned long func = trace->func;
@@ -292,7 +293,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
if (node->addr != func)
continue;
fp = READ_ONCE(node->fp);
- if (fp && !fprobe_disabled(fp))
+ if (fp && !fprobe_disabled(fp) && fp->exit_handler)
fp->nmissed++;
}
return 0;
@@ -312,11 +313,11 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
if (node->addr != func)
continue;
fp = READ_ONCE(node->fp);
- if (!fp || fprobe_disabled(fp))
+ if (unlikely(!fp || fprobe_disabled(fp) || !fp->exit_handler))
continue;
data_size = fp->entry_data_size;
- if (data_size && fp->exit_handler)
+ if (data_size)
data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG;
else
data = NULL;
@@ -327,7 +328,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
/* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
- if (!ret && fp->exit_handler) {
+ if (!ret) {
int size_words = SIZE_IN_LONG(data_size);
if (write_fprobe_header(&fgraph_data[used], fp, size_words))
@@ -340,7 +341,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
/* If any exit_handler is set, data must be used. */
return used != 0;
}
-NOKPROBE_SYMBOL(fprobe_entry);
+NOKPROBE_SYMBOL(fprobe_fgraph_entry);
static void fprobe_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops,
@@ -379,11 +380,82 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
NOKPROBE_SYMBOL(fprobe_return);
static struct fgraph_ops fprobe_graph_ops = {
- .entryfunc = fprobe_entry,
+ .entryfunc = fprobe_fgraph_entry,
.retfunc = fprobe_return,
};
static int fprobe_graph_active;
+/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
+static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe_hlist_node *node;
+ struct rhlist_head *head, *pos;
+ struct fprobe *fp;
+ int bit;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ rcu_read_lock();
+ head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
+
+ rhl_for_each_entry_rcu(node, pos, head, hlist) {
+ if (node->addr != ip)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler))
+ continue;
+
+ if (fprobe_shared_with_kprobes(fp))
+ __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL);
+ else
+ __fprobe_handler(ip, parent_ip, fp, fregs, NULL);
+ }
+ rcu_read_unlock();
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_ftrace_entry);
+
+static struct ftrace_ops fprobe_ftrace_ops = {
+ .func = fprobe_ftrace_entry,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+static int fprobe_ftrace_active;
+
+static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ if (!fprobe_ftrace_active) {
+ ret = register_ftrace_function(&fprobe_ftrace_ops);
+ if (ret) {
+ ftrace_free_filter(&fprobe_ftrace_ops);
+ return ret;
+ }
+ }
+ fprobe_ftrace_active++;
+ return 0;
+}
+
+static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
+{
+ lockdep_assert_held(&fprobe_mutex);
+
+ fprobe_ftrace_active--;
+ if (!fprobe_ftrace_active)
+ unregister_ftrace_function(&fprobe_ftrace_ops);
+ if (num)
+ ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
+}
+
/* Add @addrs to the ftrace filter and register fgraph if needed. */
static int fprobe_graph_add_ips(unsigned long *addrs, int num)
{
@@ -498,9 +570,12 @@ static int fprobe_module_callback(struct notifier_block *nb,
} while (node == ERR_PTR(-EAGAIN));
rhashtable_walk_exit(&iter);
- if (alist.index > 0)
+ if (alist.index > 0) {
ftrace_set_filter_ips(&fprobe_graph_ops.ops,
alist.addrs, alist.index, 1, 0);
+ ftrace_set_filter_ips(&fprobe_ftrace_ops,
+ alist.addrs, alist.index, 1, 0);
+ }
mutex_unlock(&fprobe_mutex);
kfree(alist.addrs);
@@ -733,7 +808,11 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
mutex_lock(&fprobe_mutex);
hlist_array = fp->hlist_array;
- ret = fprobe_graph_add_ips(addrs, num);
+ if (fp->exit_handler)
+ ret = fprobe_graph_add_ips(addrs, num);
+ else
+ ret = fprobe_ftrace_add_ips(addrs, num);
+
if (!ret) {
add_fprobe_hash(fp);
for (i = 0; i < hlist_array->size; i++) {
@@ -829,7 +908,10 @@ int unregister_fprobe(struct fprobe *fp)
}
del_fprobe_hash(fp);
- fprobe_graph_remove_ips(addrs, count);
+ if (fp->exit_handler)
+ fprobe_graph_remove_ips(addrs, count);
+ else
+ fprobe_ftrace_remove_ips(addrs, count);
kfree_rcu(hlist_array, rcu);
fp->hlist_array = NULL;
--
2.51.0
Hi Menglong,
I remember why I haven't implemented this.
On Fri, 10 Oct 2025 11:38:46 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:
> +
> +static struct ftrace_ops fprobe_ftrace_ops = {
> + .func = fprobe_ftrace_entry,
> + .flags = FTRACE_OPS_FL_SAVE_REGS,
Actually, this flag is the problem. It can make fprobe fail to register on
architectures which do not support CONFIG_DYNAMIC_FTRACE_WITH_REGS (e.g. arm64, riscv):
* SAVE_REGS - The ftrace_ops wants regs saved at each function called
* and passed to the callback. If this flag is set, but the
* architecture does not support passing regs
* (CONFIG_DYNAMIC_FTRACE_WITH_REGS is not defined), then the
* ftrace_ops will fail to register, unless the next flag
* is set.
fgraph has special entry code for saving ftrace_regs.
So at least we need to fall back to fgraph if the arch does not
support CONFIG_DYNAMIC_FTRACE_WITH_REGS.
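One possible shape of that fallback (illustrative only, not from this patch)
would be to gate the ftrace_ops fast path on the arch's ability to pass regs,
e.g. in register_fprobe_ips():

	/* Only use the ftrace_ops fast path when the arch can pass regs
	 * to the ftrace callback; otherwise keep routing entry-only
	 * fprobes through fgraph.  The same condition would have to be
	 * mirrored in unregister_fprobe(). */
	if (fp->exit_handler ||
	    !IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
		ret = fprobe_graph_add_ips(addrs, num);
	else
		ret = fprobe_ftrace_add_ips(addrs, num);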
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
Hi Menglong,
On Fri, 10 Oct 2025 11:38:46 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:
> For now, fgraph is used for the fprobe, even if we need trace the entry
> only. However, the performance of ftrace is better than fgraph, and we
> can use ftrace_ops for this case.
>
> Then performance of kprobe-multi increases from 54M to 69M. Before this
> commit:
>
> $ ./benchs/run_bench_trigger.sh kprobe-multi
> kprobe-multi : 54.663 ± 0.493M/s
>
> After this commit:
>
> $ ./benchs/run_bench_trigger.sh kprobe-multi
> kprobe-multi : 69.447 ± 0.143M/s
>
> Mitigation is disable during the bench testing above.
>
Thanks for updating!
This looks good to me. Just a nit comment below;
[...]
> @@ -379,11 +380,82 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
> NOKPROBE_SYMBOL(fprobe_return);
>
> static struct fgraph_ops fprobe_graph_ops = {
> - .entryfunc = fprobe_entry,
> + .entryfunc = fprobe_fgraph_entry,
> .retfunc = fprobe_return,
> };
> static int fprobe_graph_active;
>
> +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
> +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
> + struct ftrace_ops *ops, struct ftrace_regs *fregs)
> +{
> + struct fprobe_hlist_node *node;
> + struct rhlist_head *head, *pos;
> + struct fprobe *fp;
> + int bit;
> +
> + bit = ftrace_test_recursion_trylock(ip, parent_ip);
> + if (bit < 0)
> + return;
> +
nit: We'd better explain why we need this here too;
/*
* ftrace_test_recursion_trylock() disables preemption, but
* rhltable_lookup() checks whether rcu_read_lock is held.
* So we take rcu_read_lock() here.
*/
> + rcu_read_lock();
> + head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
> +
> + rhl_for_each_entry_rcu(node, pos, head, hlist) {
> + if (node->addr != ip)
> + break;
> + fp = READ_ONCE(node->fp);
> + if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler))
> + continue;
> +
> + if (fprobe_shared_with_kprobes(fp))
> + __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL);
> + else
> + __fprobe_handler(ip, parent_ip, fp, fregs, NULL);
> + }
> + rcu_read_unlock();
> + ftrace_test_recursion_unlock(bit);
> +}
> +NOKPROBE_SYMBOL(fprobe_ftrace_entry);
Thank you,
> +
> +static struct ftrace_ops fprobe_ftrace_ops = {
> + .func = fprobe_ftrace_entry,
> + .flags = FTRACE_OPS_FL_SAVE_REGS,
> +};
> +static int fprobe_ftrace_active;
> +
> +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
> +{
> + int ret;
> +
> + lockdep_assert_held(&fprobe_mutex);
> +
> + ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0);
> + if (ret)
> + return ret;
> +
> + if (!fprobe_ftrace_active) {
> + ret = register_ftrace_function(&fprobe_ftrace_ops);
> + if (ret) {
> + ftrace_free_filter(&fprobe_ftrace_ops);
> + return ret;
> + }
> + }
> + fprobe_ftrace_active++;
> + return 0;
> +}
> +
> +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
> +{
> + lockdep_assert_held(&fprobe_mutex);
> +
> + fprobe_ftrace_active--;
> + if (!fprobe_ftrace_active)
> + unregister_ftrace_function(&fprobe_ftrace_ops);
> + if (num)
> + ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
> +}
> +
> /* Add @addrs to the ftrace filter and register fgraph if needed. */
> static int fprobe_graph_add_ips(unsigned long *addrs, int num)
> {
> @@ -498,9 +570,12 @@ static int fprobe_module_callback(struct notifier_block *nb,
> } while (node == ERR_PTR(-EAGAIN));
> rhashtable_walk_exit(&iter);
>
> - if (alist.index > 0)
> + if (alist.index > 0) {
> ftrace_set_filter_ips(&fprobe_graph_ops.ops,
> alist.addrs, alist.index, 1, 0);
> + ftrace_set_filter_ips(&fprobe_ftrace_ops,
> + alist.addrs, alist.index, 1, 0);
> + }
> mutex_unlock(&fprobe_mutex);
>
> kfree(alist.addrs);
> @@ -733,7 +808,11 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
> mutex_lock(&fprobe_mutex);
>
> hlist_array = fp->hlist_array;
> - ret = fprobe_graph_add_ips(addrs, num);
> + if (fp->exit_handler)
> + ret = fprobe_graph_add_ips(addrs, num);
> + else
> + ret = fprobe_ftrace_add_ips(addrs, num);
> +
> if (!ret) {
> add_fprobe_hash(fp);
> for (i = 0; i < hlist_array->size; i++) {
> @@ -829,7 +908,10 @@ int unregister_fprobe(struct fprobe *fp)
> }
> del_fprobe_hash(fp);
>
> - fprobe_graph_remove_ips(addrs, count);
> + if (fp->exit_handler)
> + fprobe_graph_remove_ips(addrs, count);
> + else
> + fprobe_ftrace_remove_ips(addrs, count);
>
> kfree_rcu(hlist_array, rcu);
> fp->hlist_array = NULL;
> --
> 2.51.0
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 2025/10/12 12:07, Masami Hiramatsu wrote:
> Hi Menglong,
>
> On Fri, 10 Oct 2025 11:38:46 +0800
> Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> > For now, fgraph is used for the fprobe, even if we need trace the entry
> > only. However, the performance of ftrace is better than fgraph, and we
> > can use ftrace_ops for this case.
> >
> > Then performance of kprobe-multi increases from 54M to 69M. Before this
> > commit:
> >
> > $ ./benchs/run_bench_trigger.sh kprobe-multi
> > kprobe-multi : 54.663 ± 0.493M/s
> >
> > After this commit:
> >
> > $ ./benchs/run_bench_trigger.sh kprobe-multi
> > kprobe-multi : 69.447 ± 0.143M/s
> >
> > Mitigation is disable during the bench testing above.
> >
>
> Thanks for updating!
>
> This looks good to me. Just a nit comment below;
>
> [...]
> > @@ -379,11 +380,82 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
> > NOKPROBE_SYMBOL(fprobe_return);
> >
> > static struct fgraph_ops fprobe_graph_ops = {
> > - .entryfunc = fprobe_entry,
> > + .entryfunc = fprobe_fgraph_entry,
> > .retfunc = fprobe_return,
> > };
> > static int fprobe_graph_active;
> >
> > +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
> > +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
> > + struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > +{
> > + struct fprobe_hlist_node *node;
> > + struct rhlist_head *head, *pos;
> > + struct fprobe *fp;
> > + int bit;
> > +
> > + bit = ftrace_test_recursion_trylock(ip, parent_ip);
> > + if (bit < 0)
> > + return;
> > +
>
> nit: We'd better to explain why we need this here too;
>
> /*
> * ftrace_test_recursion_trylock() disables preemption, but
> * rhltable_lookup() checks whether rcu_read_lock is held.
> * So we take rcu_read_lock() here.
> */
It's very nice! I'll send a V3 now.
Thanks!
Menglong Dong
>
> > + rcu_read_lock();
> > + head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
> > +
> > + rhl_for_each_entry_rcu(node, pos, head, hlist) {
> > + if (node->addr != ip)
> > + break;
> > + fp = READ_ONCE(node->fp);
> > + if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler))
> > + continue;
> > +
> > + if (fprobe_shared_with_kprobes(fp))
> > + __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL);
> > + else
> > + __fprobe_handler(ip, parent_ip, fp, fregs, NULL);
> > + }
> > + rcu_read_unlock();
> > + ftrace_test_recursion_unlock(bit);
> > +}
> > +NOKPROBE_SYMBOL(fprobe_ftrace_entry);
>
> Thank you,
>
> > +
> > +static struct ftrace_ops fprobe_ftrace_ops = {
> > + .func = fprobe_ftrace_entry,
> > + .flags = FTRACE_OPS_FL_SAVE_REGS,
> > +};
> > +static int fprobe_ftrace_active;
> > +
> > +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
> > +{
> > + int ret;
> > +
> > + lockdep_assert_held(&fprobe_mutex);
> > +
> > + ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0);
> > + if (ret)
> > + return ret;
> > +
> > + if (!fprobe_ftrace_active) {
> > + ret = register_ftrace_function(&fprobe_ftrace_ops);
> > + if (ret) {
> > + ftrace_free_filter(&fprobe_ftrace_ops);
> > + return ret;
> > + }
> > + }
> > + fprobe_ftrace_active++;
> > + return 0;
> > +}
> > +
> > +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
> > +{
> > + lockdep_assert_held(&fprobe_mutex);
> > +
> > + fprobe_ftrace_active--;
> > + if (!fprobe_ftrace_active)
> > + unregister_ftrace_function(&fprobe_ftrace_ops);
> > + if (num)
> > + ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
> > +}
> > +
> > /* Add @addrs to the ftrace filter and register fgraph if needed. */
> > static int fprobe_graph_add_ips(unsigned long *addrs, int num)
> > {
> > @@ -498,9 +570,12 @@ static int fprobe_module_callback(struct notifier_block *nb,
> > } while (node == ERR_PTR(-EAGAIN));
> > rhashtable_walk_exit(&iter);
> >
> > - if (alist.index > 0)
> > + if (alist.index > 0) {
> > ftrace_set_filter_ips(&fprobe_graph_ops.ops,
> > alist.addrs, alist.index, 1, 0);
> > + ftrace_set_filter_ips(&fprobe_ftrace_ops,
> > + alist.addrs, alist.index, 1, 0);
> > + }
> > mutex_unlock(&fprobe_mutex);
> >
> > kfree(alist.addrs);
> > @@ -733,7 +808,11 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
> > mutex_lock(&fprobe_mutex);
> >
> > hlist_array = fp->hlist_array;
> > - ret = fprobe_graph_add_ips(addrs, num);
> > + if (fp->exit_handler)
> > + ret = fprobe_graph_add_ips(addrs, num);
> > + else
> > + ret = fprobe_ftrace_add_ips(addrs, num);
> > +
> > if (!ret) {
> > add_fprobe_hash(fp);
> > for (i = 0; i < hlist_array->size; i++) {
> > @@ -829,7 +908,10 @@ int unregister_fprobe(struct fprobe *fp)
> > }
> > del_fprobe_hash(fp);
> >
> > - fprobe_graph_remove_ips(addrs, count);
> > + if (fp->exit_handler)
> > + fprobe_graph_remove_ips(addrs, count);
> > + else
> > + fprobe_ftrace_remove_ips(addrs, count);
> >
> > kfree_rcu(hlist_array, rcu);
> > fp->hlist_array = NULL;
>
>
>