[PATCH v2 1/2] tracing: fprobe: optimization for entry only case

Menglong Dong posted 2 patches 2 months, 1 week ago
There is a newer version of this series
Posted by Menglong Dong 2 months, 1 week ago
For now, fgraph is used for fprobe even if we need to trace only the
function entry. However, the performance of ftrace is better than that
of fgraph, so we can use ftrace_ops for this case.
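
For reference, the entry-only case this targets is an fprobe registered
with an entry_handler and no exit_handler. A minimal, illustrative
sketch (hypothetical names and probed symbol; handler signature as in
the ftrace_regs-based fprobe API):

  #include <linux/fprobe.h>
  #include <linux/module.h>

  /*
   * Entry-only handler: with this patch, such fprobes are driven by
   * ftrace_ops instead of fgraph.
   */
  static int sample_entry(struct fprobe *fp, unsigned long ip,
                          unsigned long ret_ip, struct ftrace_regs *fregs,
                          void *data)
  {
          return 0;
  }

  static struct fprobe sample_fprobe = {
          .entry_handler = sample_entry,
          /* no .exit_handler, so no return hooking is needed */
  };

  static int __init sample_init(void)
  {
          /* trace entries of vfs_read() only */
          return register_fprobe(&sample_fprobe, "vfs_read", NULL);
  }

  static void __exit sample_exit(void)
  {
          unregister_fprobe(&sample_fprobe);
  }

  module_init(sample_init);
  module_exit(sample_exit);
  MODULE_LICENSE("GPL");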

With this change, the performance of kprobe-multi increases from 54M/s
to 69M/s. Before this commit:

  $ ./benchs/run_bench_trigger.sh kprobe-multi
  kprobe-multi   :   54.663 ± 0.493M/s

After this commit:

  $ ./benchs/run_bench_trigger.sh kprobe-multi
  kprobe-multi   :   69.447 ± 0.143M/s

Mitigations are disabled during the benchmark above.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
v2:
- add documentation for fprobe_fgraph_entry as Masami suggested
- merge the rename of fprobe_entry into the current patch
- use ftrace_test_recursion_trylock() in fprobe_ftrace_entry()
---
 kernel/trace/fprobe.c | 104 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 93 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 99d83c08b9e2..bb02d6d09d6a 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -254,8 +254,9 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
 	return ret;
 }
 
-static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
-			struct ftrace_regs *fregs)
+/* fgraph_ops callback, this processes fprobes which have exit_handler. */
+static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+			       struct ftrace_regs *fregs)
 {
 	unsigned long *fgraph_data = NULL;
 	unsigned long func = trace->func;
@@ -292,7 +293,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 				if (node->addr != func)
 					continue;
 				fp = READ_ONCE(node->fp);
-				if (fp && !fprobe_disabled(fp))
+				if (fp && !fprobe_disabled(fp) && fp->exit_handler)
 					fp->nmissed++;
 			}
 			return 0;
@@ -312,11 +313,11 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 		if (node->addr != func)
 			continue;
 		fp = READ_ONCE(node->fp);
-		if (!fp || fprobe_disabled(fp))
+		if (unlikely(!fp || fprobe_disabled(fp) || !fp->exit_handler))
 			continue;
 
 		data_size = fp->entry_data_size;
-		if (data_size && fp->exit_handler)
+		if (data_size)
 			data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG;
 		else
 			data = NULL;
@@ -327,7 +328,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 			ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
 
 		/* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
-		if (!ret && fp->exit_handler) {
+		if (!ret) {
 			int size_words = SIZE_IN_LONG(data_size);
 
 			if (write_fprobe_header(&fgraph_data[used], fp, size_words))
@@ -340,7 +341,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 	/* If any exit_handler is set, data must be used. */
 	return used != 0;
 }
-NOKPROBE_SYMBOL(fprobe_entry);
+NOKPROBE_SYMBOL(fprobe_fgraph_entry);
 
 static void fprobe_return(struct ftrace_graph_ret *trace,
 			  struct fgraph_ops *gops,
@@ -379,11 +380,82 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
 NOKPROBE_SYMBOL(fprobe_return);
 
 static struct fgraph_ops fprobe_graph_ops = {
-	.entryfunc	= fprobe_entry,
+	.entryfunc	= fprobe_fgraph_entry,
 	.retfunc	= fprobe_return,
 };
 static int fprobe_graph_active;
 
+/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
+static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
+	struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+	struct fprobe_hlist_node *node;
+	struct rhlist_head *head, *pos;
+	struct fprobe *fp;
+	int bit;
+
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
+	if (bit < 0)
+		return;
+
+	rcu_read_lock();
+	head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
+
+	rhl_for_each_entry_rcu(node, pos, head, hlist) {
+		if (node->addr != ip)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler))
+			continue;
+
+		if (fprobe_shared_with_kprobes(fp))
+			__fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL);
+		else
+			__fprobe_handler(ip, parent_ip, fp, fregs, NULL);
+	}
+	rcu_read_unlock();
+	ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_ftrace_entry);
+
+static struct ftrace_ops fprobe_ftrace_ops = {
+	.func	= fprobe_ftrace_entry,
+	.flags	= FTRACE_OPS_FL_SAVE_REGS,
+};
+static int fprobe_ftrace_active;
+
+static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
+{
+	int ret;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0);
+	if (ret)
+		return ret;
+
+	if (!fprobe_ftrace_active) {
+		ret = register_ftrace_function(&fprobe_ftrace_ops);
+		if (ret) {
+			ftrace_free_filter(&fprobe_ftrace_ops);
+			return ret;
+		}
+	}
+	fprobe_ftrace_active++;
+	return 0;
+}
+
+static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
+{
+	lockdep_assert_held(&fprobe_mutex);
+
+	fprobe_ftrace_active--;
+	if (!fprobe_ftrace_active)
+		unregister_ftrace_function(&fprobe_ftrace_ops);
+	if (num)
+		ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
+}
+
 /* Add @addrs to the ftrace filter and register fgraph if needed. */
 static int fprobe_graph_add_ips(unsigned long *addrs, int num)
 {
@@ -498,9 +570,12 @@ static int fprobe_module_callback(struct notifier_block *nb,
 	} while (node == ERR_PTR(-EAGAIN));
 	rhashtable_walk_exit(&iter);
 
-	if (alist.index > 0)
+	if (alist.index > 0) {
 		ftrace_set_filter_ips(&fprobe_graph_ops.ops,
 				      alist.addrs, alist.index, 1, 0);
+		ftrace_set_filter_ips(&fprobe_ftrace_ops,
+				      alist.addrs, alist.index, 1, 0);
+	}
 	mutex_unlock(&fprobe_mutex);
 
 	kfree(alist.addrs);
@@ -733,7 +808,11 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	mutex_lock(&fprobe_mutex);
 
 	hlist_array = fp->hlist_array;
-	ret = fprobe_graph_add_ips(addrs, num);
+	if (fp->exit_handler)
+		ret = fprobe_graph_add_ips(addrs, num);
+	else
+		ret = fprobe_ftrace_add_ips(addrs, num);
+
 	if (!ret) {
 		add_fprobe_hash(fp);
 		for (i = 0; i < hlist_array->size; i++) {
@@ -829,7 +908,10 @@ int unregister_fprobe(struct fprobe *fp)
 	}
 	del_fprobe_hash(fp);
 
-	fprobe_graph_remove_ips(addrs, count);
+	if (fp->exit_handler)
+		fprobe_graph_remove_ips(addrs, count);
+	else
+		fprobe_ftrace_remove_ips(addrs, count);
 
 	kfree_rcu(hlist_array, rcu);
 	fp->hlist_array = NULL;
-- 
2.51.0

Re: [PATCH v2 1/2] tracing: fprobe: optimization for entry only case
Posted by Masami Hiramatsu (Google) 2 months ago
Hi Menglong,

I remember why I haven't implemented this.

On Fri, 10 Oct 2025 11:38:46 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:

> +
> +static struct ftrace_ops fprobe_ftrace_ops = {
> +	.func	= fprobe_ftrace_entry,
> +	.flags	= FTRACE_OPS_FL_SAVE_REGS,

Actually, this flag is the problem. This can make fprobe fail on
architectures which do not support CONFIG_DYNAMIC_FTRACE_WITH_REGS
(e.g. arm64, riscv).

 * SAVE_REGS - The ftrace_ops wants regs saved at each function called
 *            and passed to the callback. If this flag is set, but the
 *            architecture does not support passing regs
 *            (CONFIG_DYNAMIC_FTRACE_WITH_REGS is not defined), then the
 *            ftrace_ops will fail to register, unless the next flag
 *            is set.

fgraph has special entry code for saving ftrace_regs, so at least we
need to fall back to fgraph if the arch does not support
CONFIG_DYNAMIC_FTRACE_WITH_REGS.
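
A minimal sketch of that fallback (illustrative only, assuming we just
gate on the arch capability when picking the attach path in
register_fprobe_ips()):

	/* Sketch only: use the ftrace_ops path only when regs can be passed. */
	if (fp->exit_handler || !IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
		ret = fprobe_graph_add_ips(addrs, num);		/* fgraph path */
	else
		ret = fprobe_ftrace_add_ips(addrs, num);	/* ftrace_ops path */

with the symmetric check in unregister_fprobe() when choosing between
fprobe_graph_remove_ips() and fprobe_ftrace_remove_ips().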

Thank you,

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>
Re: [PATCH v2 1/2] tracing: fprobe: optimization for entry only case
Posted by Masami Hiramatsu (Google) 2 months, 1 week ago
Hi Menglong,

On Fri, 10 Oct 2025 11:38:46 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:

> For now, fgraph is used for fprobe even if we need to trace only the
> function entry. However, the performance of ftrace is better than that
> of fgraph, so we can use ftrace_ops for this case.
> 
> With this change, the performance of kprobe-multi increases from 54M/s
> to 69M/s. Before this commit:
> 
>   $ ./benchs/run_bench_trigger.sh kprobe-multi
>   kprobe-multi   :   54.663 ± 0.493M/s
> 
> After this commit:
> 
>   $ ./benchs/run_bench_trigger.sh kprobe-multi
>   kprobe-multi   :   69.447 ± 0.143M/s
> 
> Mitigations are disabled during the benchmark above.
> 

Thanks for updating!

This looks good to me. Just a nit comment below;

[...]
> @@ -379,11 +380,82 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
>  NOKPROBE_SYMBOL(fprobe_return);
>  
>  static struct fgraph_ops fprobe_graph_ops = {
> -	.entryfunc	= fprobe_entry,
> +	.entryfunc	= fprobe_fgraph_entry,
>  	.retfunc	= fprobe_return,
>  };
>  static int fprobe_graph_active;
>  
> +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
> +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
> +	struct ftrace_ops *ops, struct ftrace_regs *fregs)
> +{
> +	struct fprobe_hlist_node *node;
> +	struct rhlist_head *head, *pos;
> +	struct fprobe *fp;
> +	int bit;
> +
> +	bit = ftrace_test_recursion_trylock(ip, parent_ip);
> +	if (bit < 0)
> +		return;
> +

nit: We'd better explain why we need this here too;

	/*
	 * ftrace_test_recursion_trylock() disables preemption, but
	 * rhltable_lookup() checks whether rcu_read_lock() is held.
	 * So we take rcu_read_lock() here.
	 */

> +	rcu_read_lock();
> +	head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params);
> +
> +	rhl_for_each_entry_rcu(node, pos, head, hlist) {
> +		if (node->addr != ip)
> +			break;
> +		fp = READ_ONCE(node->fp);
> +		if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler))
> +			continue;
> +
> +		if (fprobe_shared_with_kprobes(fp))
> +			__fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL);
> +		else
> +			__fprobe_handler(ip, parent_ip, fp, fregs, NULL);
> +	}
> +	rcu_read_unlock();
> +	ftrace_test_recursion_unlock(bit);
> +}
> +NOKPROBE_SYMBOL(fprobe_ftrace_entry);

Thank you,

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>
Re: [PATCH v2 1/2] tracing: fprobe: optimization for entry only case
Posted by Menglong Dong 2 months, 1 week ago
On 2025/10/12 12:07, Masami Hiramatsu wrote:
> Hi Menglong,
> 
> On Fri, 10 Oct 2025 11:38:46 +0800
> Menglong Dong <menglong8.dong@gmail.com> wrote:
> 
> > For now, fgraph is used for fprobe even if we need to trace only the
> > function entry. However, the performance of ftrace is better than that
> > of fgraph, so we can use ftrace_ops for this case.
> > 
> > With this change, the performance of kprobe-multi increases from 54M/s
> > to 69M/s. Before this commit:
> > 
> >   $ ./benchs/run_bench_trigger.sh kprobe-multi
> >   kprobe-multi   :   54.663 ± 0.493M/s
> > 
> > After this commit:
> > 
> >   $ ./benchs/run_bench_trigger.sh kprobe-multi
> >   kprobe-multi   :   69.447 ± 0.143M/s
> > 
> > Mitigations are disabled during the benchmark above.
> > 
> 
> Thanks for updating!
> 
> This looks good to me. Just a nit comment below;
> 
> [...]
> > @@ -379,11 +380,82 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
> >  NOKPROBE_SYMBOL(fprobe_return);
> >  
> >  static struct fgraph_ops fprobe_graph_ops = {
> > -	.entryfunc	= fprobe_entry,
> > +	.entryfunc	= fprobe_fgraph_entry,
> >  	.retfunc	= fprobe_return,
> >  };
> >  static int fprobe_graph_active;
> >  
> > +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */
> > +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
> > +	struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > +{
> > +	struct fprobe_hlist_node *node;
> > +	struct rhlist_head *head, *pos;
> > +	struct fprobe *fp;
> > +	int bit;
> > +
> > +	bit = ftrace_test_recursion_trylock(ip, parent_ip);
> > +	if (bit < 0)
> > +		return;
> > +
> 
> nit: We'd better explain why we need this here too;
> 
> 	/*
> 	 * ftrace_test_recursion_trylock() disables preemption, but
> 	 * rhltable_lookup() checks whether rcu_read_lock() is held.
> 	 * So we take rcu_read_lock() here.
> 	 */

It's very nice! I'll send a V3 now.

Thanks!
Menglong Dong
