[PATCH] tracing: Use queue_rcu_work() to free filters

Steven Rostedt posted 1 patch 4 months ago
kernel/trace/trace_events_filter.c | 28 ++++++++++++++++++++--------
1 file changed, 20 insertions(+), 8 deletions(-)
[PATCH] tracing: Use queue_rcu_work() to free filters
Posted by Steven Rostedt 4 months ago
From: Steven Rostedt <rostedt@goodmis.org>

Freeing of filters requires waiting for both an RCU grace period and an
RCU tasks trace grace period after they have been detached from their
lists. The tasks trace grace period can be quite long, so the freeing of
the filters was moved to use the call_rcu*() routines. The problem with
that is that the callback functions of call_rcu*() are run from soft irq
context and can cause latencies if a callback takes a bit of time.

The filters are freed per event in a system and the syscalls system
contains an event per system call, which can be over 700 events. Freeing 700
filters in a bottom half is undesirable.

Instead, move the freeing to use queue_rcu_work() which is done in task
context.

Link: https://lore.kernel.org/all/9a2f0cd0-1561-4206-8966-f93ccd25927f@paulmck-laptop/

Fixes: a9d0aab5eb33 ("tracing: Fix regression of filter waiting a long time on RCU synchronization")
Suggested-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Note, I added a Fixes tag but not a stable tag, as this is a nice-to-have
that doesn't rise to the level of a critical fix needing a backport or
inclusion in an rc release. If someone wants to backport it, feel free.

 kernel/trace/trace_events_filter.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ea8b364b6818..b6fe8167ef01 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1344,13 +1344,14 @@ struct filter_list {
 
 struct filter_head {
 	struct list_head	list;
-	struct rcu_head		rcu;
+	union {
+		struct rcu_head		rcu;
+		struct rcu_work		rwork;
+	};
 };
 
-
-static void free_filter_list(struct rcu_head *rhp)
+static void free_filter_list(struct filter_head *filter_list)
 {
-	struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
 	struct filter_list *filter_item, *tmp;
 
 	list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
@@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp)
 	kfree(filter_list);
 }
 
+static void free_filter_list_work(struct work_struct *work)
+{
+	struct filter_head *filter_list;
+
+	filter_list = container_of(to_rcu_work(work), struct filter_head, rwork);
+	free_filter_list(filter_list);
+}
+
 static void free_filter_list_tasks(struct rcu_head *rhp)
 {
-	call_rcu(rhp, free_filter_list);
+	struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
+
+	INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
+	queue_rcu_work(system_wq, &filter_list->rwork);
 }
 
 /*
@@ -1462,7 +1474,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
 	tracepoint_synchronize_unregister();
 
 	if (head)
-		free_filter_list(&head->rcu);
+		free_filter_list(head);
 
 	list_for_each_entry(file, &tr->events, list) {
 		if (file->system != dir || !file->filter)
@@ -2307,7 +2319,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
 	return 0;
  fail:
 	/* No call succeeded */
-	free_filter_list(&filter_list->rcu);
+	free_filter_list(filter_list);
 	parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
 	return -EINVAL;
  fail_mem:
@@ -2317,7 +2329,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
 	if (!fail)
 		delay_free_filter(filter_list);
 	else
-		free_filter_list(&filter_list->rcu);
+		free_filter_list(filter_list);
 
 	return -ENOMEM;
 }
-- 
2.47.2
Re: [PATCH] tracing: Use queue_rcu_work() to free filters
Posted by Paul E. McKenney 4 months ago
On Mon, Jun 09, 2025 at 01:17:32PM -0400, Steven Rostedt wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
> 
> Freeing of filters requires to wait for both an RCU grace period as well as
> a RCU task trace wait period after they have been detached from their
> lists. The trace task period can be quite large so the freeing of the
> filters was moved to use the call_rcu*() routines. The problem with that is
> that the callback functions of call_rcu*() is done from a soft irq and can
> cause latencies if the callback takes a bit of time.
> 
> The filters are freed per event in a system and the syscalls system
> contains an event per system call, which can be over 700 events. Freeing 700
> filters in a bottom half is undesirable.
> 
> Instead, move the freeing to use queue_rcu_work() which is done in task
> context.
> 
> Link: https://lore.kernel.org/all/9a2f0cd0-1561-4206-8966-f93ccd25927f@paulmck-laptop/
> 
> Fixes: a9d0aab5eb33 ("tracing: Fix regression of filter waiting a long time on RCU synchronization")
> Suggested-by: "Paul E. McKenney" <paulmck@kernel.org>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

Thank you, and looks good to me.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>

> ---
> Note, I added a Fixes tag but not a stable tag as this is a nice-to-have
> but doesn't hit the level of critical fix to backport or add to an rc
> release. If someone wants to back port it, feel free.
> 
>  kernel/trace/trace_events_filter.c | 28 ++++++++++++++++++++--------
>  1 file changed, 20 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
> index ea8b364b6818..b6fe8167ef01 100644
> --- a/kernel/trace/trace_events_filter.c
> +++ b/kernel/trace/trace_events_filter.c
> @@ -1344,13 +1344,14 @@ struct filter_list {
>  
>  struct filter_head {
>  	struct list_head	list;
> -	struct rcu_head		rcu;
> +	union {
> +		struct rcu_head		rcu;
> +		struct rcu_work		rwork;
> +	};
>  };
>  
> -
> -static void free_filter_list(struct rcu_head *rhp)
> +static void free_filter_list(struct filter_head *filter_list)
>  {
> -	struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
>  	struct filter_list *filter_item, *tmp;
>  
>  	list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
> @@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp)
>  	kfree(filter_list);
>  }
>  
> +static void free_filter_list_work(struct work_struct *work)
> +{
> +	struct filter_head *filter_list;
> +
> +	filter_list = container_of(to_rcu_work(work), struct filter_head, rwork);
> +	free_filter_list(filter_list);
> +}
> +
>  static void free_filter_list_tasks(struct rcu_head *rhp)
>  {
> -	call_rcu(rhp, free_filter_list);
> +	struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
> +
> +	INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
> +	queue_rcu_work(system_wq, &filter_list->rwork);
>  }
>  
>  /*
> @@ -1462,7 +1474,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
>  	tracepoint_synchronize_unregister();
>  
>  	if (head)
> -		free_filter_list(&head->rcu);
> +		free_filter_list(head);
>  
>  	list_for_each_entry(file, &tr->events, list) {
>  		if (file->system != dir || !file->filter)
> @@ -2307,7 +2319,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
>  	return 0;
>   fail:
>  	/* No call succeeded */
> -	free_filter_list(&filter_list->rcu);
> +	free_filter_list(filter_list);
>  	parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
>  	return -EINVAL;
>   fail_mem:
> @@ -2317,7 +2329,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
>  	if (!fail)
>  		delay_free_filter(filter_list);
>  	else
> -		free_filter_list(&filter_list->rcu);
> +		free_filter_list(filter_list);
>  
>  	return -ENOMEM;
>  }
> -- 
> 2.47.2
>