From: Steven Rostedt <rostedt@goodmis.org>
The ret_stack (shadow stack used by function graph infrastructure) is
created for every task on the system when function graph is enabled. Give
it its own kmem_cache. This will make it easier to see how much memory is
being used specifically for function graph shadow stacks.
In the future, this size may change and may not be a power of two. Having
its own cache can also keep it from fragmenting memory.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/fgraph.c | 35 ++++++++++++++++++++++++++++++-----
1 file changed, 30 insertions(+), 5 deletions(-)
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 51e81b299a0d..51849bc42d3a 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -172,6 +172,8 @@ enum {
DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
+static struct kmem_cache *fgraph_stack_cachep;
+
static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
static unsigned long fgraph_array_bitmask;
@@ -1022,8 +1024,11 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
struct task_struct *g, *t;
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return -ENOMEM;
+
for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+ ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack_list[i]) {
start = 0;
end = i;
@@ -1054,7 +1059,7 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
rcu_read_unlock();
free:
for (i = start; i < end; i++)
- kfree(ret_stack_list[i]);
+ kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]);
return ret;
}
@@ -1117,9 +1122,12 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
if (ftrace_graph_active) {
unsigned long *ret_stack;
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+
ret_stack = per_cpu(idle_ret_stack, cpu);
if (!ret_stack) {
- ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -1139,7 +1147,10 @@ void ftrace_graph_init_task(struct task_struct *t)
if (ftrace_graph_active) {
unsigned long *ret_stack;
- ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
graph_init_task(t, ret_stack);
@@ -1154,7 +1165,11 @@ void ftrace_graph_exit_task(struct task_struct *t)
/* NULL must become visible to IRQs before we free it: */
barrier();
- kfree(ret_stack);
+ if (ret_stack) {
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+ kmem_cache_free(fgraph_stack_cachep, ret_stack);
+ }
}
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1290,6 +1305,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
mutex_lock(&ftrace_lock);
+ if (!fgraph_stack_cachep) {
+ fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
+ SHADOW_STACK_SIZE,
+ SHADOW_STACK_SIZE, 0, NULL);
+ if (!fgraph_stack_cachep) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
if (!fgraph_array[0]) {
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
--
2.45.2
On 19/10/2024 20:27, Steven Rostedt wrote:
> @@ -1290,6 +1305,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
>
>  	mutex_lock(&ftrace_lock);
>
> +	if (!fgraph_stack_cachep) {
> +		fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
> +						SHADOW_STACK_SIZE,
> +						SHADOW_STACK_SIZE, 0, NULL);

(I don't have any experience with this code, but...) is there any value/need
to destroy the cache in unregister_ftrace_graph()? I guess you would need to
refcount it, so it's created on the first call to register and destroyed on
the last call to unregister?

Thanks,
Ryan
On Mon, 21 Oct 2024 08:58:11 +0100
Ryan Roberts <ryan.roberts@arm.com> wrote:

> (I don't have any experience with this code, but...) is there any value/need
> to destroy the cache in unregister_ftrace_graph()? I guess you would need to
> refcount it, so it's created on the first call to register and destroyed on
> the last call to unregister?

No, we can't destroy it. In fact, we can't destroy the stacks themselves
until the task exits. This is because a function could have been traced and
its return address gets replaced by the fgraph return code. Then it goes to
sleep. For example, say you were tracing poll, and systemd did a poll and
you traced it. Now it may be sleeping forever, waiting for some input. When
it finally wakes up and exits the function, it will need to get its original
return address back.

The ret_stack holds the original return address that is needed when the
function finishes. Thus, it's not safe to free it even when tracing is
finished. The callbacks may not be called once tracing is done, but the
ret_stack used to do the tracing will still be needed long after tracing is
over.

Now I'm looking at being able to free the stacks by scanning all the tasks
after tracing is over: if a stack isn't being used (it's easy to know
whether it is or not), then we can free it. But for those cases where they
are still being used, we just have to give up and leave it be.

-- Steve
On 21/10/2024 09:37, Steven Rostedt wrote:
> Now I'm looking at being able to free the stacks by scanning all the tasks
> after tracing is over: if a stack isn't being used (it's easy to know
> whether it is or not), then we can free it. But for those cases where they
> are still being used, we just have to give up and leave it be.

Ah, gotya. Thanks for the explanation!
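For reference, a minimal sketch of the "scan all tasks and free unused
stacks" idea Steve describes above. This is not part of the patch:
fgraph_ret_stack_in_use() is a hypothetical helper standing in for whatever
check the real implementation would use to tell whether a task's shadow
stack still holds hijacked return addresses, and real code would also need
to synchronize against tasks still unwinding through the return trampoline.

	/* Sketch only: assumes tracing has already been fully disabled. */
	static void free_unused_ret_stacks(void)
	{
		struct task_struct *g, *t;

		rcu_read_lock();
		for_each_process_thread(g, t) {
			unsigned long *ret_stack = t->ret_stack;

			if (!ret_stack)
				continue;
			/* Leave stacks that still hold original return addresses */
			if (fgraph_ret_stack_in_use(t))
				continue;
			t->ret_stack = NULL;
			/* NULL must become visible to IRQs before we free it */
			barrier();
			kmem_cache_free(fgraph_stack_cachep, ret_stack);
		}
		rcu_read_unlock();
	}

The loop mirrors the existing alloc_retstack_tasklist() pattern of walking
all threads under rcu_read_lock(); stacks that are still in use are simply
skipped and left for ftrace_graph_exit_task() to free when the task exits.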