[PATCH bpf-next 1/4] fprobe: use rhashtable

Menglong Dong posted 4 patches 2 months, 1 week ago
There is a newer version of this series
[PATCH bpf-next 1/4] fprobe: use rhashtable
Posted by Menglong Dong 2 months, 1 week ago
Currently, all kernel functions that are hooked by an fprobe are added
to the hash table "fprobe_ip_table". Its key is the function address,
and its value is a "struct fprobe_hlist_node".

The hash table has a fixed FPROBE_IP_TABLE_SIZE of 256 buckets, so the
lookup overhead grows linearly once the number of hooked functions
exceeds 256: hooking tens of thousands of kernel functions, for
example, leaves hundreds of nodes in each bucket on average, and every
probe hit has to walk such a chain. When we try to hook all the kernel
functions, the overhead becomes huge.

Therefore, replace the hash table with an rhashtable to reduce the
overhead.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
 include/linux/fprobe.h |   2 +-
 kernel/trace/fprobe.c  | 144 +++++++++++++++++++++++------------------
 2 files changed, 82 insertions(+), 64 deletions(-)

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 702099f08929..0c9b239f5485 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -26,7 +26,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip,
  * @fp: The fprobe which owns this.
  */
 struct fprobe_hlist_node {
-	struct hlist_node	hlist;
+	struct rhash_head	hlist;
 	unsigned long		addr;
 	struct fprobe		*fp;
 };
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index ba7ff14f5339..b3e16303fc6a 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -12,6 +12,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
+#include <linux/rhashtable.h>
 
 #include <asm/fprobe.h>
 
@@ -41,47 +42,47 @@
  *  - RCU hlist traversal under disabling preempt
  */
 static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
-static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static struct rhashtable fprobe_ip_table;
 static DEFINE_MUTEX(fprobe_mutex);
 
-/*
- * Find first fprobe in the hlist. It will be iterated twice in the entry
- * probe, once for correcting the total required size, the second time is
- * calling back the user handlers.
- * Thus the hlist in the fprobe_table must be sorted and new probe needs to
- * be added *before* the first fprobe.
- */
-static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed)
 {
-	struct fprobe_hlist_node *node;
-	struct hlist_head *head;
+	return hash_ptr(*(unsigned long **)data, 32);
+}
 
-	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
-	hlist_for_each_entry_rcu(node, head, hlist,
-				 lockdep_is_held(&fprobe_mutex)) {
-		if (node->addr == ip)
-			return node;
-	}
-	return NULL;
+static int fprobe_node_cmp(struct rhashtable_compare_arg *arg,
+			   const void *ptr)
+{
+	unsigned long key = *(unsigned long *)arg->key;
+	const struct fprobe_hlist_node *n = ptr;
+
+	return n->addr != key;
+}
+
+static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct fprobe_hlist_node *n = data;
+
+	return hash_ptr((void *)n->addr, 32);
 }
-NOKPROBE_SYMBOL(find_first_fprobe_node);
+
+static const struct rhashtable_params fprobe_rht_params = {
+	.head_offset		= offsetof(struct fprobe_hlist_node, hlist),
+	.key_offset		= offsetof(struct fprobe_hlist_node, addr),
+	.key_len		= sizeof_field(struct fprobe_hlist_node, addr),
+	.hashfn			= fprobe_node_hashfn,
+	.obj_hashfn		= fprobe_node_obj_hashfn,
+	.obj_cmpfn		= fprobe_node_cmp,
+	.automatic_shrinking	= true,
+};
 
 /* Node insertion and deletion requires the fprobe_mutex */
 static void insert_fprobe_node(struct fprobe_hlist_node *node)
 {
-	unsigned long ip = node->addr;
-	struct fprobe_hlist_node *next;
-	struct hlist_head *head;
-
 	lockdep_assert_held(&fprobe_mutex);
 
-	next = find_first_fprobe_node(ip);
-	if (next) {
-		hlist_add_before_rcu(&node->hlist, &next->hlist);
-		return;
-	}
-	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
-	hlist_add_head_rcu(&node->hlist, head);
+	rhashtable_insert_fast(&fprobe_ip_table, &node->hlist,
+			       fprobe_rht_params);
 }
 
 /* Return true if there are synonims */
@@ -92,9 +93,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 	/* Avoid double deleting */
 	if (READ_ONCE(node->fp) != NULL) {
 		WRITE_ONCE(node->fp, NULL);
-		hlist_del_rcu(&node->hlist);
+		rhashtable_remove_fast(&fprobe_ip_table, &node->hlist,
+				       fprobe_rht_params);
 	}
-	return !!find_first_fprobe_node(node->addr);
+	return !!rhashtable_lookup_fast(&fprobe_ip_table, &node->addr,
+					fprobe_rht_params);
 }
 
 /* Check existence of the fprobe */
@@ -249,25 +252,28 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
 static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 			struct ftrace_regs *fregs)
 {
-	struct fprobe_hlist_node *node, *first;
+	struct rhash_lock_head __rcu *const *bkt;
+	struct fprobe_hlist_node *node;
 	unsigned long *fgraph_data = NULL;
 	unsigned long func = trace->func;
+	struct bucket_table *tbl;
+	struct rhash_head *head;
 	unsigned long ret_ip;
 	int reserved_words;
 	struct fprobe *fp;
+	unsigned int key;
 	int used, ret;
 
 	if (WARN_ON_ONCE(!fregs))
 		return 0;
 
-	first = node = find_first_fprobe_node(func);
-	if (unlikely(!first))
-		return 0;
-
+	tbl = rht_dereference_rcu(fprobe_ip_table.tbl, &fprobe_ip_table);
+	key = rht_key_hashfn(&fprobe_ip_table, tbl, &func, fprobe_rht_params);
+	bkt = rht_bucket(tbl, key);
 	reserved_words = 0;
-	hlist_for_each_entry_from_rcu(node, hlist) {
+	rht_for_each_entry_rcu_from(node, head, rht_ptr_rcu(bkt), tbl, key, hlist) {
 		if (node->addr != func)
-			break;
+			continue;
 		fp = READ_ONCE(node->fp);
 		if (!fp || !fp->exit_handler)
 			continue;
@@ -278,13 +284,13 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 		reserved_words +=
 			FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
 	}
-	node = first;
 	if (reserved_words) {
 		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
 		if (unlikely(!fgraph_data)) {
-			hlist_for_each_entry_from_rcu(node, hlist) {
+			rht_for_each_entry_rcu_from(node, head, rht_ptr_rcu(bkt),
+						    tbl, key, hlist) {
 				if (node->addr != func)
-					break;
+					continue;
 				fp = READ_ONCE(node->fp);
 				if (fp && !fprobe_disabled(fp))
 					fp->nmissed++;
@@ -299,12 +305,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 	 */
 	ret_ip = ftrace_regs_get_return_address(fregs);
 	used = 0;
-	hlist_for_each_entry_from_rcu(node, hlist) {
+	rht_for_each_entry_rcu_from(node, head, rht_ptr_rcu(bkt), tbl, key, hlist) {
 		int data_size;
 		void *data;
 
 		if (node->addr != func)
-			break;
+			continue;
 		fp = READ_ONCE(node->fp);
 		if (!fp || fprobe_disabled(fp))
 			continue;
@@ -448,25 +454,21 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad
 	return 0;
 }
 
-static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head,
-					struct fprobe_addr_list *alist)
+static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+					 struct fprobe_addr_list *alist)
 {
-	struct fprobe_hlist_node *node;
 	int ret = 0;
 
-	hlist_for_each_entry_rcu(node, head, hlist,
-				 lockdep_is_held(&fprobe_mutex)) {
-		if (!within_module(node->addr, mod))
-			continue;
-		if (delete_fprobe_node(node))
-			continue;
-		/*
-		 * If failed to update alist, just continue to update hlist.
-		 * Therefore, at list user handler will not hit anymore.
-		 */
-		if (!ret)
-			ret = fprobe_addr_list_add(alist, node->addr);
-	}
+	if (!within_module(node->addr, mod))
+		return;
+	if (delete_fprobe_node(node))
+		return;
+	/*
+	 * If failed to update alist, just continue to update hlist.
+	 * Therefore, at list user handler will not hit anymore.
+	 */
+	if (!ret)
+		ret = fprobe_addr_list_add(alist, node->addr);
 }
 
 /* Handle module unloading to manage fprobe_ip_table. */
@@ -474,8 +476,9 @@ static int fprobe_module_callback(struct notifier_block *nb,
 				  unsigned long val, void *data)
 {
 	struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT};
+	struct fprobe_hlist_node *node;
+	struct rhashtable_iter iter;
 	struct module *mod = data;
-	int i;
 
 	if (val != MODULE_STATE_GOING)
 		return NOTIFY_DONE;
@@ -486,8 +489,16 @@ static int fprobe_module_callback(struct notifier_block *nb,
 		return NOTIFY_DONE;
 
 	mutex_lock(&fprobe_mutex);
-	for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
-		fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
+	rhashtable_walk_enter(&fprobe_ip_table, &iter);
+	do {
+		rhashtable_walk_start(&iter);
+
+		while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
+			fprobe_remove_node_in_module(mod, node, &alist);
+
+		rhashtable_walk_stop(&iter);
+	} while (node == ERR_PTR(-EAGAIN));
+	rhashtable_walk_exit(&iter);
 
 	if (alist.index < alist.size && alist.index > 0)
 		ftrace_set_filter_ips(&fprobe_graph_ops.ops,
@@ -819,3 +830,10 @@ int unregister_fprobe(struct fprobe *fp)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(unregister_fprobe);
+
+static int __init fprobe_initcall(void)
+{
+	rhashtable_init(&fprobe_ip_table, &fprobe_rht_params);
+	return 0;
+}
+late_initcall(fprobe_initcall);
-- 
2.50.1
Re: [PATCH bpf-next 1/4] fprobe: use rhashtable
Posted by kernel test robot 2 months, 1 week ago
Hi Menglong,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/fprobe-use-rhashtable/20250728-121631
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20250728041252.441040-2-dongml2%40chinatelecom.cn
patch subject: [PATCH bpf-next 1/4] fprobe: use rhashtable
config: loongarch-allmodconfig (https://download.01.org/0day-ci/archive/20250729/202507291147.Fov8pl4N-lkp@intel.com/config)
compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250729/202507291147.Fov8pl4N-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507291147.Fov8pl4N-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from kernel/trace/fprobe.c:8:
>> include/linux/fprobe.h:29:20: error: field has incomplete type 'struct rhash_head'
      29 |         struct rhash_head       hlist;
         |                                 ^
   include/linux/fprobe.h:29:9: note: forward declaration of 'struct rhash_head'
      29 |         struct rhash_head       hlist;
         |                ^
>> kernel/trace/fprobe.c:71:17: error: initializer element is not a compile-time constant
      71 |         .key_offset             = offsetof(struct fprobe_hlist_node, addr),
         |                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/stddef.h:16:32: note: expanded from macro 'offsetof'
      16 | #define offsetof(TYPE, MEMBER)  __builtin_offsetof(TYPE, MEMBER)
         |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   2 errors generated.
--
   In file included from kernel/trace/trace_fprobe.c:9:
>> include/linux/fprobe.h:29:20: error: field has incomplete type 'struct rhash_head'
      29 |         struct rhash_head       hlist;
         |                                 ^
   include/linux/fprobe.h:29:9: note: forward declaration of 'struct rhash_head'
      29 |         struct rhash_head       hlist;
         |                ^
   1 error generated.


vim +29 include/linux/fprobe.h

    11	
    12	struct fprobe;
    13	typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip,
    14				       unsigned long ret_ip, struct ftrace_regs *regs,
    15				       void *entry_data);
    16	
    17	typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip,
    18				       unsigned long ret_ip, struct ftrace_regs *regs,
    19				       void *entry_data);
    20	
    21	/**
    22	 * struct fprobe_hlist_node - address based hash list node for fprobe.
    23	 *
    24	 * @hlist: The hlist node for address search hash table.
    25	 * @addr: One of the probing address of @fp.
    26	 * @fp: The fprobe which owns this.
    27	 */
    28	struct fprobe_hlist_node {
  > 29		struct rhash_head	hlist;
    30		unsigned long		addr;
    31		struct fprobe		*fp;
    32	};
    33	
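
For context, the first error indicates that fprobe.h only has a forward
declaration of struct rhash_head (the patch adds the rhashtable.h include
to fprobe.c, but not to the header), and the offsetof error appears to be
a knock-on effect of the resulting incomplete struct. A likely fix,
sketched here as an assumption rather than something posted in this
thread, is to include the header in fprobe.h:

/* include/linux/fprobe.h (sketch; the exact include list may differ) */
#include <linux/rhashtable.h>	/* complete definition of struct rhash_head */

struct fprobe_hlist_node {
	struct rhash_head	hlist;	/* now a complete type */
	unsigned long		addr;
	struct fprobe		*fp;
};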

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH bpf-next 1/4] fprobe: use rhashtable
Posted by Jiri Olsa 2 months, 1 week ago
On Mon, Jul 28, 2025 at 12:12:48PM +0800, Menglong Dong wrote:

SNIP

> +static const struct rhashtable_params fprobe_rht_params = {
> +	.head_offset		= offsetof(struct fprobe_hlist_node, hlist),
> +	.key_offset		= offsetof(struct fprobe_hlist_node, addr),
> +	.key_len		= sizeof_field(struct fprobe_hlist_node, addr),
> +	.hashfn			= fprobe_node_hashfn,
> +	.obj_hashfn		= fprobe_node_obj_hashfn,
> +	.obj_cmpfn		= fprobe_node_cmp,
> +	.automatic_shrinking	= true,
> +};
>  
>  /* Node insertion and deletion requires the fprobe_mutex */
>  static void insert_fprobe_node(struct fprobe_hlist_node *node)
>  {
> -	unsigned long ip = node->addr;
> -	struct fprobe_hlist_node *next;
> -	struct hlist_head *head;
> -
>  	lockdep_assert_held(&fprobe_mutex);
>  
> -	next = find_first_fprobe_node(ip);
> -	if (next) {
> -		hlist_add_before_rcu(&node->hlist, &next->hlist);
> -		return;
> -	}
> -	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> -	hlist_add_head_rcu(&node->hlist, head);
> +	rhashtable_insert_fast(&fprobe_ip_table, &node->hlist,
> +			       fprobe_rht_params);

now that rhashtable_insert_fast can fail, I think insert_fprobe_node
needs to be able to fail as well
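
A minimal sketch of that change, assuming the callers are updated to
check the return value (this is not part of the posted patch), could
look like:

static int insert_fprobe_node(struct fprobe_hlist_node *node)
{
	lockdep_assert_held(&fprobe_mutex);

	/* propagate errors (e.g. -ENOMEM) from the rhashtable insert */
	return rhashtable_insert_fast(&fprobe_ip_table, &node->hlist,
				      fprobe_rht_params);
}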

>  }
>  
>  /* Return true if there are synonims */
> @@ -92,9 +93,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
>  	/* Avoid double deleting */
>  	if (READ_ONCE(node->fp) != NULL) {
>  		WRITE_ONCE(node->fp, NULL);
> -		hlist_del_rcu(&node->hlist);
> +		rhashtable_remove_fast(&fprobe_ip_table, &node->hlist,
> +				       fprobe_rht_params);

I guess this one can't fail in here.. ?
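
For reference, rhashtable_remove_fast() returns 0 on success or -ENOENT
when the object is not in the table; since this path holds fprobe_mutex
and is guarded by the fp != NULL check above, a failure here would
indicate a bug. A defensive sketch (not in the patch) could be:

	/* the node must still be in the table here; a failure is a bug */
	WARN_ON_ONCE(rhashtable_remove_fast(&fprobe_ip_table, &node->hlist,
					    fprobe_rht_params));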

jirka

>  	}
> -	return !!find_first_fprobe_node(node->addr);
> +	return !!rhashtable_lookup_fast(&fprobe_ip_table, &node->addr,
> +					fprobe_rht_params);
>  }
>  

SNIP
Re: [PATCH bpf-next 1/4] fprobe: use rhashtable
Posted by Menglong Dong 2 months, 1 week ago
On Mon, Jul 28, 2025 at 9:13 PM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Mon, Jul 28, 2025 at 12:12:48PM +0800, Menglong Dong wrote:
>
> SNIP
>
> > +static const struct rhashtable_params fprobe_rht_params = {
> > +     .head_offset            = offsetof(struct fprobe_hlist_node, hlist),
> > +     .key_offset             = offsetof(struct fprobe_hlist_node, addr),
> > +     .key_len                = sizeof_field(struct fprobe_hlist_node, addr),
> > +     .hashfn                 = fprobe_node_hashfn,
> > +     .obj_hashfn             = fprobe_node_obj_hashfn,
> > +     .obj_cmpfn              = fprobe_node_cmp,
> > +     .automatic_shrinking    = true,
> > +};
> >
> >  /* Node insertion and deletion requires the fprobe_mutex */
> >  static void insert_fprobe_node(struct fprobe_hlist_node *node)
> >  {
> > -     unsigned long ip = node->addr;
> > -     struct fprobe_hlist_node *next;
> > -     struct hlist_head *head;
> > -
> >       lockdep_assert_held(&fprobe_mutex);
> >
> > -     next = find_first_fprobe_node(ip);
> > -     if (next) {
> > -             hlist_add_before_rcu(&node->hlist, &next->hlist);
> > -             return;
> > -     }
> > -     head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> > -     hlist_add_head_rcu(&node->hlist, head);
> > +     rhashtable_insert_fast(&fprobe_ip_table, &node->hlist,
> > +                            fprobe_rht_params);
>
> now that rhashtable_insert_fast can fail, I think insert_fprobe_node
> needs to be able to fail as well

You are right, insert_fprobe_node should return an error and the
callers should handle it properly.

>
> >  }
> >
> >  /* Return true if there are synonims */
> > @@ -92,9 +93,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
> >       /* Avoid double deleting */
> >       if (READ_ONCE(node->fp) != NULL) {
> >               WRITE_ONCE(node->fp, NULL);
> > -             hlist_del_rcu(&node->hlist);
> > +             rhashtable_remove_fast(&fprobe_ip_table, &node->hlist,
> > +                                    fprobe_rht_params);
>
> I guess this one can't fail in here.. ?

Yeah, the only failure case is when the entry doesn't exist in the
hash table.

BTW, the usage of rhltable is similar to that of rhashtable, so this
comment applies to the V2 as well.
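
For readers following along, a rough sketch of what an rhltable-based
variant could look like; this is an assumption about V2, not posted
code, the helper fprobe_for_each_at() is hypothetical, and
fprobe_rht_params.head_offset would need to point at the rhlist_head
member:

static struct rhltable fprobe_ip_table;

struct fprobe_hlist_node {
	struct rhlist_head	hlist;	/* rhlist_head instead of rhash_head */
	unsigned long		addr;
	struct fprobe		*fp;
};

/* nodes sharing the same addr are chained on a single rhlist entry */
static int insert_fprobe_node(struct fprobe_hlist_node *node)
{
	lockdep_assert_held(&fprobe_mutex);
	return rhltable_insert(&fprobe_ip_table, &node->hlist,
			       fprobe_rht_params);
}

/* walk every fprobe attached to @func under RCU */
static void fprobe_for_each_at(unsigned long func)
{
	struct fprobe_hlist_node *node;
	struct rhlist_head *list, *pos;

	rcu_read_lock();
	list = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params);
	rhl_for_each_entry_rcu(node, pos, list, hlist) {
		/* READ_ONCE(node->fp), call the handlers, ... */
	}
	rcu_read_unlock();
}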

Thanks!
Menglong Dong

>
> jirka
>
> >       }
> > -     return !!find_first_fprobe_node(node->addr);
> > +     return !!rhashtable_lookup_fast(&fprobe_ip_table, &node->addr,
> > +                                     fprobe_rht_params);
> >  }
> >
>
> SNIP