mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+)
When reading cgroup memory.stat files there is significant work the kernel
has to perform in string formatting the numeric data to text. Once a user
mode program gets this text further work has to be done, converting the
text back to numeric data. This work can be expensive for programs that
periodically sample this data over a large enough fleet.
As an alternative to reading memory.stat, introduce new kfuncs to allow
fetching specific memcg stats from within bpf cgroup iterator programs.
This approach eliminates the conversion work done by both the kernel and
user mode programs. Previously a program could open memory.stat and
repeatedly read from the associated file descriptor (while seeking back to
zero before each subsequent read). That action can now be replaced by
setting up a link to the bpf program once in advance and then reusing it to
invoke the cgroup iterator program each time a read is desired. An example
program can be found here [0].
There is a significant perf benefit when using this approach. In terms of
elapsed time, the kfuncs allow a bpf cgroup iterator program to outperform
the traditional file reading method, saving almost 80% of the time spent in
the kernel.
control: elapsed time
real 0m14.421s
user 0m0.183s
sys 0m14.184s
experiment: elapsed time
real 0m3.250s
user 0m0.225s
sys 0m2.916s
control: perf data
22.24% a.out [kernel.kallsyms] [k] vsnprintf
17.35% a.out [kernel.kallsyms] [k] format_decode
12.60% a.out [kernel.kallsyms] [k] string
12.12% a.out [kernel.kallsyms] [k] number
8.06% a.out [kernel.kallsyms] [k] strlen
5.21% a.out [kernel.kallsyms] [k] memcpy_orig
4.26% a.out [kernel.kallsyms] [k] seq_buf_printf
4.19% a.out [kernel.kallsyms] [k] memory_stat_format
2.53% a.out [kernel.kallsyms] [k] widen_string
1.62% a.out [kernel.kallsyms] [k] put_dec_trunc8
0.99% a.out [kernel.kallsyms] [k] put_dec_full8
0.72% a.out [kernel.kallsyms] [k] put_dec
0.70% a.out [kernel.kallsyms] [k] memcpy
0.60% a.out [kernel.kallsyms] [k] mutex_lock
0.59% a.out [kernel.kallsyms] [k] entry_SYSCALL_64
experiment: perf data
8.17% memcgstat bpf_prog_c6d320d8e5cfb560_query [k] bpf_prog_c6d320d8e5cfb560_query
8.03% memcgstat [kernel.kallsyms] [k] memcg_node_stat_fetch
5.21% memcgstat [kernel.kallsyms] [k] __memcg_slab_post_alloc_hook
3.87% memcgstat [kernel.kallsyms] [k] _raw_spin_lock
3.01% memcgstat [kernel.kallsyms] [k] entry_SYSRETQ_unsafe_stack
2.49% memcgstat [kernel.kallsyms] [k] memcg_vm_event_fetch
2.47% memcgstat [kernel.kallsyms] [k] __memcg_slab_free_hook
2.34% memcgstat [kernel.kallsyms] [k] kmem_cache_free
2.32% memcgstat [kernel.kallsyms] [k] entry_SYSCALL_64
1.92% memcgstat [kernel.kallsyms] [k] mutex_lock
The overhead of string formatting and text conversion on the control side
is eliminated on the experimental side since the values are read directly
through shared memory with the bpf program. The kfunc/bpf approach also
provides flexibility in how this numeric data could be delivered to a user
mode program. It is possible to use a struct for example, with select
memory stat fields instead of an array. This opens up opportunities for
custom serialization as well since it is totally up to the bpf programmer
on how to lay out the data.
The patch also includes a kfunc for flushing stats. This is not required
for fetching stats, since the kernel periodically flushes memcg stats every
2s. It is up to the programmer to decide whether they need the very latest
stats.
[0] https://gist.github.com/inwardvessel/416d629d6930e22954edb094b4e23347
https://gist.github.com/inwardvessel/28e0a9c8bf51ba07fa8516bceeb25669
https://gist.github.com/inwardvessel/b05e1b9ea0f766f4ad78dad178c49703
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
---
mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 67 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..aa8cbf883d71 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -870,6 +870,73 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
}
#endif
+/*
+ * Resolve the mem_cgroup backing @cgrp, or NULL if @cgrp is NULL.
+ * mem_cgroup_from_css() itself returns NULL for a NULL css, so a cgroup
+ * without the memory controller attached also resolves to NULL here.
+ */
+static inline struct mem_cgroup *memcg_from_cgroup(struct cgroup *cgrp)
+{
+	return cgrp ? mem_cgroup_from_css(cgrp->subsys[memory_cgrp_id]) : NULL;
+}
+
+/*
+ * bpf kfunc: flush @cgrp's memcg stats so that subsequent fetch kfuncs
+ * observe up-to-date values.
+ *
+ * Calling this is optional: the kernel already flushes memcg stats
+ * periodically (every 2s per the commit message), so programs that can
+ * tolerate slightly stale counters may skip it. A NULL or
+ * memory-controller-less cgroup is silently ignored.
+ */
+__bpf_kfunc static void memcg_flush_stats(struct cgroup *cgrp)
+{
+	struct mem_cgroup *memcg = memcg_from_cgroup(cgrp);
+
+	if (!memcg)
+		return;
+
+	mem_cgroup_flush_stats(memcg);
+}
+
+/*
+ * bpf kfunc: read one node_stat_item counter of @cgrp's memcg, in the
+ * same (possibly page-size-scaled) units memory.stat reports.
+ *
+ * @item originates from an untrusted bpf program and the verifier does
+ * not range-check enum scalars, so reject out-of-range values before
+ * they are used to index per-item stat tables. Returns 0 for a NULL or
+ * controller-less cgroup and for invalid @item.
+ */
+__bpf_kfunc static unsigned long memcg_node_stat_fetch(struct cgroup *cgrp,
+		enum node_stat_item item)
+{
+	struct mem_cgroup *memcg = memcg_from_cgroup(cgrp);
+
+	if (!memcg)
+		return 0;
+
+	/* bpf callers cannot be trusted to pass a valid enumerator */
+	if ((unsigned int)item >= NR_VM_NODE_STAT_ITEMS)
+		return 0;
+
+	return memcg_page_state_output(memcg, item);
+}
+
+/*
+ * bpf kfunc: read one memcg_stat_item counter of @cgrp's memcg.
+ *
+ * memcg_stat_item enumerators start at NR_VM_NODE_STAT_ITEMS and
+ * memcg_page_state_output() accepts either range, so bounding against
+ * MEMCG_NR_STAT covers all valid inputs. @item comes from an untrusted
+ * bpf program, hence the explicit range check; invalid @item or a
+ * NULL/controller-less cgroup reads as 0.
+ */
+__bpf_kfunc static unsigned long memcg_stat_fetch(struct cgroup *cgrp,
+		enum memcg_stat_item item)
+{
+	struct mem_cgroup *memcg = memcg_from_cgroup(cgrp);
+
+	if (!memcg)
+		return 0;
+
+	/* bpf callers cannot be trusted to pass a valid enumerator */
+	if ((unsigned int)item >= MEMCG_NR_STAT)
+		return 0;
+
+	return memcg_page_state_output(memcg, item);
+}
+
+/*
+ * bpf kfunc: read one vm_event_item counter of @cgrp's memcg.
+ *
+ * As with the stat fetchers, @item is supplied by an untrusted bpf
+ * program and must be range-checked before indexing the per-event
+ * tables. Returns 0 for a NULL/controller-less cgroup or an
+ * out-of-range @item.
+ */
+__bpf_kfunc static unsigned long memcg_vm_event_fetch(struct cgroup *cgrp,
+		enum vm_event_item item)
+{
+	struct mem_cgroup *memcg = memcg_from_cgroup(cgrp);
+
+	if (!memcg)
+		return 0;
+
+	/* bpf callers cannot be trusted to pass a valid enumerator */
+	if ((unsigned int)item >= NR_VM_EVENT_ITEMS)
+		return 0;
+
+	return memcg_events(memcg, item);
+}
+
+/* BTF id set naming the four kfuncs exposed to bpf programs above. */
+BTF_KFUNCS_START(bpf_memcontrol_kfunc_ids)
+BTF_ID_FLAGS(func, memcg_flush_stats)
+BTF_ID_FLAGS(func, memcg_node_stat_fetch)
+BTF_ID_FLAGS(func, memcg_stat_fetch)
+BTF_ID_FLAGS(func, memcg_vm_event_fetch)
+BTF_KFUNCS_END(bpf_memcontrol_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &bpf_memcontrol_kfunc_ids,
+};
+
+/*
+ * Register the kfuncs for tracing-type programs; bpf cgroup iterator
+ * programs (the intended consumers, see commit message) are
+ * BPF_PROG_TYPE_TRACING. Done at late_initcall so BTF is available.
+ */
+static int __init bpf_memcontrol_kfunc_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+					 &bpf_memcontrol_kfunc_set);
+}
+late_initcall(bpf_memcontrol_kfunc_init);
+
+
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
--
2.47.3
© 2016 - 2025 Red Hat, Inc.