[v1] fgraph: Give ret_stack its own kmem cache

[PATCH] fgraph: Give ret_stack its own kmem cache

Posted by Steven Rostedt 1 month, 1 week ago

From: Steven Rostedt <rostedt@goodmis.org>

The ret_stack (shadow stack used by function graph infrastructure) is
created for every task on the system when function graph is enabled. Give
it its own kmem_cache. This will make it easier to see how much memory is
being used specifically for function graph shadow stacks.

In the future, this size may change and may not be a power of two. Having
its own cache can also keep it from fragmenting memory.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 51e81b299a0d..51849bc42d3a 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -172,6 +172,8 @@ enum {
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
+static struct kmem_cache *fgraph_stack_cachep;
+
 static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
 static unsigned long fgraph_array_bitmask;
 
@@ -1022,8 +1024,11 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
 	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
 	struct task_struct *g, *t;
 
+	if (WARN_ON_ONCE(!fgraph_stack_cachep))
+		return -ENOMEM;
+
 	for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
-		ret_stack_list[i] = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+		ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
 		if (!ret_stack_list[i]) {
 			start = 0;
 			end = i;
@@ -1054,7 +1059,7 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
 	rcu_read_unlock();
 free:
 	for (i = start; i < end; i++)
-		kfree(ret_stack_list[i]);
+		kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]);
 	return ret;
 }
 
@@ -1117,9 +1122,12 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
 	if (ftrace_graph_active) {
 		unsigned long *ret_stack;
 
+		if (WARN_ON_ONCE(!fgraph_stack_cachep))
+			return;
+
 		ret_stack = per_cpu(idle_ret_stack, cpu);
 		if (!ret_stack) {
-			ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+			ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
 			if (!ret_stack)
 				return;
 			per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -1139,7 +1147,10 @@ void ftrace_graph_init_task(struct task_struct *t)
 	if (ftrace_graph_active) {
 		unsigned long *ret_stack;
 
-		ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+		if (WARN_ON_ONCE(!fgraph_stack_cachep))
+			return;
+
+		ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
 		if (!ret_stack)
 			return;
 		graph_init_task(t, ret_stack);
@@ -1154,7 +1165,11 @@ void ftrace_graph_exit_task(struct task_struct *t)
 	/* NULL must become visible to IRQs before we free it: */
 	barrier();
 
-	kfree(ret_stack);
+	if (ret_stack) {
+		if (WARN_ON_ONCE(!fgraph_stack_cachep))
+			return;
+		kmem_cache_free(fgraph_stack_cachep, ret_stack);
+	}
 }
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -1290,6 +1305,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 
 	mutex_lock(&ftrace_lock);
 
+	if (!fgraph_stack_cachep) {
+		fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
+							SHADOW_STACK_SIZE,
+							SHADOW_STACK_SIZE, 0, NULL);
+		if (!fgraph_stack_cachep) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
 	if (!fgraph_array[0]) {
 		/* The array must always have real data on it */
 		for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
-- 
2.45.2

Re: [PATCH] fgraph: Give ret_stack its own kmem cache

Posted by Ryan Roberts 1 month ago

On 19/10/2024 20:27, Steven Rostedt wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
> 
> The ret_stack (shadow stack used by function graph infrastructure) is
> created for every task on the system when function graph is enabled. Give
> it its own kmem_cache. This will make it easier to see how much memory is
> being used specifically for function graph shadow stacks.
> 
> In the future, this size may change and may not be a power of two. Having
> its own cache can also keep it from fragmenting memory.
> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
>  kernel/trace/fgraph.c | 35 ++++++++++++++++++++++++++++++-----
>  1 file changed, 30 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
> index 51e81b299a0d..51849bc42d3a 100644
> --- a/kernel/trace/fgraph.c
> +++ b/kernel/trace/fgraph.c
> @@ -172,6 +172,8 @@ enum {
>  DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
>  int ftrace_graph_active;
>  
> +static struct kmem_cache *fgraph_stack_cachep;
> +
>  static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
>  static unsigned long fgraph_array_bitmask;
>  
> @@ -1022,8 +1024,11 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
>  	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
>  	struct task_struct *g, *t;
>  
> +	if (WARN_ON_ONCE(!fgraph_stack_cachep))
> +		return -ENOMEM;
> +
>  	for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
> -		ret_stack_list[i] = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
> +		ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
>  		if (!ret_stack_list[i]) {
>  			start = 0;
>  			end = i;
> @@ -1054,7 +1059,7 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
>  	rcu_read_unlock();
>  free:
>  	for (i = start; i < end; i++)
> -		kfree(ret_stack_list[i]);
> +		kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]);
>  	return ret;
>  }
>  
> @@ -1117,9 +1122,12 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
>  	if (ftrace_graph_active) {
>  		unsigned long *ret_stack;
>  
> +		if (WARN_ON_ONCE(!fgraph_stack_cachep))
> +			return;
> +
>  		ret_stack = per_cpu(idle_ret_stack, cpu);
>  		if (!ret_stack) {
> -			ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
> +			ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
>  			if (!ret_stack)
>  				return;
>  			per_cpu(idle_ret_stack, cpu) = ret_stack;
> @@ -1139,7 +1147,10 @@ void ftrace_graph_init_task(struct task_struct *t)
>  	if (ftrace_graph_active) {
>  		unsigned long *ret_stack;
>  
> -		ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
> +		if (WARN_ON_ONCE(!fgraph_stack_cachep))
> +			return;
> +
> +		ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
>  		if (!ret_stack)
>  			return;
>  		graph_init_task(t, ret_stack);
> @@ -1154,7 +1165,11 @@ void ftrace_graph_exit_task(struct task_struct *t)
>  	/* NULL must become visible to IRQs before we free it: */
>  	barrier();
>  
> -	kfree(ret_stack);
> +	if (ret_stack) {
> +		if (WARN_ON_ONCE(!fgraph_stack_cachep))
> +			return;
> +		kmem_cache_free(fgraph_stack_cachep, ret_stack);
> +	}
>  }
>  
>  #ifdef CONFIG_DYNAMIC_FTRACE
> @@ -1290,6 +1305,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
>  
>  	mutex_lock(&ftrace_lock);
>  
> +	if (!fgraph_stack_cachep) {
> +		fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
> +							SHADOW_STACK_SIZE,
> +							SHADOW_STACK_SIZE, 0, NULL);

(I don't have any experience with this code, but...) is there any value/need to
destroy the cache in unregister_ftrace_graph()? I guess you would need to
refcount it, so its created on the first call to register and destroyed on the
last call to unregister?

Thanks,
Ryan

> +		if (!fgraph_stack_cachep) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +	}
> +
>  	if (!fgraph_array[0]) {
>  		/* The array must always have real data on it */
>  		for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)

Re: [PATCH] fgraph: Give ret_stack its own kmem cache

Posted by Steven Rostedt 1 month ago

On Mon, 21 Oct 2024 08:58:11 +0100
Ryan Roberts <ryan.roberts@arm.com> wrote:

> > @@ -1290,6 +1305,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
> >  
> >  	mutex_lock(&ftrace_lock);
> >  
> > +	if (!fgraph_stack_cachep)
> > +		fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
> > +							SHADOW_STACK_SIZE,
> > +							SHADOW_STACK_SIZE, 0, NULL);  
> 
> (I don't have any experience with this code, but...) is there any value/need to
> destroy the cache in unregister_ftrace_graph()? I guess you would need to
> refcount it, so its created on the first call to register and destroyed on the
> last call to unregister?

No, we can't destroy it. In fact, we can't destroy the stacks
themselves until the task exits. This is because a function could have
been traced and its return address gets replaced by the fgraph return
code. Then it goes to sleep. For example, say you were tracing poll,
and systemd did a poll and you traced it. Now it may be sleeping
forever, waiting for some input. When it finally wakes up and exits the
function, it will need to get its original return address back.

The ret_stack holds the original return address that is needed when the
function finishes. Thus, its not safe to free it even when tracing is
finished. The callbacks may not be called when tracing is done, but the
ret_stack used to do the tracing will be called long after tracing is
over.

Now I'm looking at being able to free stacks by scanning all the tasks
after tracing is over and if the stack isn't being used (it's easy to
know if it is or not) then we can free it. But for those cases where
they are still being used, then we just have to give up and leave it be.

-- Steve

Re: [PATCH] fgraph: Give ret_stack its own kmem cache

Posted by Ryan Roberts 1 month ago

On 21/10/2024 09:37, Steven Rostedt wrote:
> On Mon, 21 Oct 2024 08:58:11 +0100
> Ryan Roberts <ryan.roberts@arm.com> wrote:
> 
>>> @@ -1290,6 +1305,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
>>>  
>>>  	mutex_lock(&ftrace_lock);
>>>  
>>> +	if (!fgraph_stack_cachep)
>>> +		fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
>>> +							SHADOW_STACK_SIZE,
>>> +							SHADOW_STACK_SIZE, 0, NULL);  
>>
>> (I don't have any experience with this code, but...) is there any value/need to
>> destroy the cache in unregister_ftrace_graph()? I guess you would need to
>> refcount it, so its created on the first call to register and destroyed on the
>> last call to unregister?
> 
> No, we can't destroy it. In fact, we can't destroy the stacks
> themselves until the task exits. This is because a function could have
> been traced and its return address gets replaced by the fgraph return
> code. Then it goes to sleep. For example, say you were tracing poll,
> and systemd did a poll and you traced it. Now it may be sleeping
> forever, waiting for some input. When it finally wakes up and exits the
> function, it will need to get its original return address back.
> 
> The ret_stack holds the original return address that is needed when the
> function finishes. Thus, its not safe to free it even when tracing is
> finished. The callbacks may not be called when tracing is done, but the
> ret_stack used to do the tracing will be called long after tracing is
> over.
> 
> Now I'm looking at being able to free stacks by scanning all the tasks
> after tracing is over and if the stack isn't being used (it's easy to
> know if it is or not) then we can free it. But for those cases where
> they are still being used, then we just have to give up and leave it be.

Ah, gotya. Thanks for the explanation!

> 
> -- Steve