To effectively operate with memory cgroups in BPF there is a need
to convert css pointers to memcg pointers. A simple container_of
cast which is used in the kernel code can't be used in BPF because
from the verifier's point of view that's an out-of-bounds memory access.
Introduce helper get/put kfuncs which can be used to get
a refcounted memcg pointer from the css pointer:
- bpf_get_mem_cgroup,
- bpf_put_mem_cgroup.
bpf_get_mem_cgroup() can take both memcg's css and the corresponding
cgroup's "self" css. It allows it to be used with the existing cgroup
iterator which iterates over cgroup tree, not memcg tree.
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
mm/Makefile | 1 +
mm/bpf_memcontrol.c | 88 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+)
create mode 100644 mm/bpf_memcontrol.c
diff --git a/mm/Makefile b/mm/Makefile
index 051e88c699af..2d8f9beb3c71 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
ifdef CONFIG_BPF_SYSCALL
obj-y += bpf_oom.o
+obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
new file mode 100644
index 000000000000..1e46097745cf
--- /dev/null
+++ b/mm/bpf_memcontrol.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Memory Controller-related BPF kfuncs and auxiliary code
+ *
+ * Author: Roman Gushchin <roman.gushchin@linux.dev>
+ */
+
+#include <linux/memcontrol.h>
+#include <linux/bpf.h>
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_mem_cgroup - Get a reference to a memory cgroup
+ * @css: pointer to the css structure
+ *
+ * Returns a pointer to a mem_cgroup structure after bumping
+ * the corresponding css's reference counter.
+ *
+ * It's fine to pass a css which belongs to any cgroup controller,
+ * e.g. unified hierarchy's main css.
+ *
+ * Implements KF_ACQUIRE semantics.
+ */
+__bpf_kfunc struct mem_cgroup *
+bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = NULL;
+ bool rcu_unlock = false;
+
+ if (!root_mem_cgroup)
+ return NULL;
+
+ if (root_mem_cgroup->css.ss != css->ss) {
+ struct cgroup *cgroup = css->cgroup;
+ int ssid = root_mem_cgroup->css.ss->id;
+
+ rcu_read_lock();
+ rcu_unlock = true;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ }
+
+ if (css && css_tryget(css))
+ memcg = container_of(css, struct mem_cgroup, css);
+
+ if (rcu_unlock)
+ rcu_read_unlock();
+
+ return memcg;
+}
+
+/**
+ * bpf_put_mem_cgroup - Put a reference to a memory cgroup
+ * @memcg: memory cgroup to release
+ *
+ * Releases a previously acquired memcg reference.
+ * Implements KF_RELEASE semantics.
+ */
+__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
+{
+ css_put(&memcg->css);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_memcontrol_kfuncs)
+BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
+
+BTF_KFUNCS_END(bpf_memcontrol_kfuncs)
+
+static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_memcontrol_kfuncs,
+};
+
+static int __init bpf_memcontrol_init(void)
+{
+ int err;
+
+ err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &bpf_memcontrol_kfunc_set);
+ if (err)
+ pr_warn("error while registering bpf memcontrol kfuncs: %d", err);
+
+ return err;
+}
+late_initcall(bpf_memcontrol_init);
--
2.51.0
On Mon, Oct 27, 2025 at 04:17:11PM -0700, Roman Gushchin wrote:
> +__bpf_kfunc struct mem_cgroup *
> +bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
> +{
> + struct mem_cgroup *memcg = NULL;
> + bool rcu_unlock = false;
> +
> + if (!root_mem_cgroup)
> + return NULL;
> +
> + if (root_mem_cgroup->css.ss != css->ss) {
> + struct cgroup *cgroup = css->cgroup;
> + int ssid = root_mem_cgroup->css.ss->id;
> +
> + rcu_read_lock();
> + rcu_unlock = true;
> + css = rcu_dereference_raw(cgroup->subsys[ssid]);
Would it make more sense to use cgroup_e_css()?
Thanks.
--
tejun
Tejun Heo <tj@kernel.org> writes:
> On Mon, Oct 27, 2025 at 04:17:11PM -0700, Roman Gushchin wrote:
>> +__bpf_kfunc struct mem_cgroup *
>> +bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
>> +{
>> + struct mem_cgroup *memcg = NULL;
>> + bool rcu_unlock = false;
>> +
>> + if (!root_mem_cgroup)
>> + return NULL;
>> +
>> + if (root_mem_cgroup->css.ss != css->ss) {
>> + struct cgroup *cgroup = css->cgroup;
>> + int ssid = root_mem_cgroup->css.ss->id;
>> +
>> + rcu_read_lock();
>> + rcu_unlock = true;
>> + css = rcu_dereference_raw(cgroup->subsys[ssid]);
>
> Would it make more sense to use cgroup_e_css()?
Good call, will update in the next version.
Thank you!
```
commit ec8e2e1769704b4f69261f36e6aaf035d47c5f0f
Author: Roman Gushchin <roman.gushchin@linux.dev>
mm: introduce BPF kfuncs to deal with memcg pointers
This commit introduces two new BPF kfuncs for converting css pointers
to memcg pointers with proper reference counting: bpf_get_mem_cgroup()
and bpf_put_mem_cgroup().
> diff --git a/mm/Makefile b/mm/Makefile
> index 051e88c69..2d8f9beb3 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -107,6 +107,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
> endif
> ifdef CONFIG_BPF_SYSCALL
> obj-y += bpf_oom.o
> +obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
> endif
> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
> obj-$(CONFIG_GUP_TEST) += gup_test.o
> diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
> new file mode 100644
> index 000000000..1e4609774
> --- /dev/null
> +++ b/mm/bpf_memcontrol.c
> @@ -0,0 +1,88 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Memory Controller-related BPF kfuncs and auxiliary code
> + *
> + * Author: Roman Gushchin <roman.gushchin@linux.dev>
> + */
> +
> +#include <linux/memcontrol.h>
> +#include <linux/bpf.h>
> +
> +__bpf_kfunc_start_defs();
> +
> +/**
> + * bpf_get_mem_cgroup - Get a reference to a memory cgroup
> + * @css: pointer to the css structure
> + *
> + * Returns a pointer to a mem_cgroup structure after bumping
> + * the corresponding css's reference counter.
> + *
> + * It's fine to pass a css which belongs to any cgroup controller,
> + * e.g. unified hierarchy's main css.
> + *
> + * Implements KF_ACQUIRE semantics.
> + */
> +__bpf_kfunc struct mem_cgroup *
> +bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
> +{
> + struct mem_cgroup *memcg = NULL;
> + bool rcu_unlock = false;
> +
> + if (!root_mem_cgroup)
> + return NULL;
> +
> + if (root_mem_cgroup->css.ss != css->ss) {
^^^^^^^
Can this dereference a NULL css pointer? The function checks css for
NULL at line 42 with "if (css && css_tryget(css))", which suggests css
can be NULL. If a BPF program calls this function with a NULL css,
wouldn't it crash here before reaching that check?
> + struct cgroup *cgroup = css->cgroup;
> + int ssid = root_mem_cgroup->css.ss->id;
> +
> + rcu_read_lock();
> + rcu_unlock = true;
> + css = rcu_dereference_raw(cgroup->subsys[ssid]);
> + }
> +
> + if (css && css_tryget(css))
> + memcg = container_of(css, struct mem_cgroup, css);
> +
> + if (rcu_unlock)
> + rcu_read_unlock();
> +
> + return memcg;
> +}
[ ... ]
> +__bpf_kfunc_end_defs();
> +
> +BTF_KFUNCS_START(bpf_memcontrol_kfuncs)
> +BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
> +BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
> +
> +BTF_KFUNCS_END(bpf_memcontrol_kfuncs)
[ ... ]
```
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
In-Reply-To-Subject: `mm: introduce BPF kfuncs to deal with memcg pointers`
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/18859027430
bot+bpf-ci@kernel.org writes:
> ```
> commit ec8e2e1769704b4f69261f36e6aaf035d47c5f0f
> Author: Roman Gushchin <roman.gushchin@linux.dev>
>
> mm: introduce BPF kfuncs to deal with memcg pointers
>
> This commit introduces two new BPF kfuncs for converting css pointers
> to memcg pointers with proper reference counting: bpf_get_mem_cgroup()
> and bpf_put_mem_cgroup().
>
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 051e88c69..2d8f9beb3 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -107,6 +107,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
>> endif
>> ifdef CONFIG_BPF_SYSCALL
>> obj-y += bpf_oom.o
>> +obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
>> endif
>> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
>> obj-$(CONFIG_GUP_TEST) += gup_test.o
>> diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
>> new file mode 100644
>> index 000000000..1e4609774
>> --- /dev/null
>> +++ b/mm/bpf_memcontrol.c
>> @@ -0,0 +1,88 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +/*
>> + * Memory Controller-related BPF kfuncs and auxiliary code
>> + *
>> + * Author: Roman Gushchin <roman.gushchin@linux.dev>
>> + */
>> +
>> +#include <linux/memcontrol.h>
>> +#include <linux/bpf.h>
>> +
>> +__bpf_kfunc_start_defs();
>> +
>> +/**
>> + * bpf_get_mem_cgroup - Get a reference to a memory cgroup
>> + * @css: pointer to the css structure
>> + *
>> + * Returns a pointer to a mem_cgroup structure after bumping
>> + * the corresponding css's reference counter.
>> + *
>> + * It's fine to pass a css which belongs to any cgroup controller,
>> + * e.g. unified hierarchy's main css.
>> + *
>> + * Implements KF_ACQUIRE semantics.
>> + */
>> +__bpf_kfunc struct mem_cgroup *
>> +bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
>> +{
>> + struct mem_cgroup *memcg = NULL;
>> + bool rcu_unlock = false;
>> +
>> + if (!root_mem_cgroup)
>> + return NULL;
>> +
>> + if (root_mem_cgroup->css.ss != css->ss) {
> ^^^^^^^
>
> Can this dereference a NULL css pointer? The function checks css for
> NULL at line 42 with "if (css && css_tryget(css))", which suggests css
> can be NULL. If a BPF program calls this function with a NULL css,
> wouldn't it crash here before reaching that check?
Here css passed as an argument to bpf_get_mem_cgroup() can't be NULL,
because the BPF verifier guarantees that it's a valid css pointer.
However the result of rcu_dereference_raw(cgroup->subsys[ssid]) can be
NULL, this is why the "if (css && css_tryget(css))" check is required
down below.
On Tue, Oct 28, 2025 at 9:11 AM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> bot+bpf-ci@kernel.org writes:
>
> > ```
> > commit ec8e2e1769704b4f69261f36e6aaf035d47c5f0f
> > Author: Roman Gushchin <roman.gushchin@linux.dev>
> >
> > mm: introduce BPF kfuncs to deal with memcg pointers
> >
> > This commit introduces two new BPF kfuncs for converting css pointers
> > to memcg pointers with proper reference counting: bpf_get_mem_cgroup()
> > and bpf_put_mem_cgroup().
> >
> >> diff --git a/mm/Makefile b/mm/Makefile
> >> index 051e88c69..2d8f9beb3 100644
> >> --- a/mm/Makefile
> >> +++ b/mm/Makefile
> >> @@ -107,6 +107,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
> >> endif
> >> ifdef CONFIG_BPF_SYSCALL
> >> obj-y += bpf_oom.o
> >> +obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
> >> endif
> >> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
> >> obj-$(CONFIG_GUP_TEST) += gup_test.o
> >> diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
> >> new file mode 100644
> >> index 000000000..1e4609774
> >> --- /dev/null
> >> +++ b/mm/bpf_memcontrol.c
> >> @@ -0,0 +1,88 @@
> >> +// SPDX-License-Identifier: GPL-2.0-or-later
> >> +/*
> >> + * Memory Controller-related BPF kfuncs and auxiliary code
> >> + *
> >> + * Author: Roman Gushchin <roman.gushchin@linux.dev>
> >> + */
> >> +
> >> +#include <linux/memcontrol.h>
> >> +#include <linux/bpf.h>
> >> +
> >> +__bpf_kfunc_start_defs();
> >> +
> >> +/**
> >> + * bpf_get_mem_cgroup - Get a reference to a memory cgroup
> >> + * @css: pointer to the css structure
> >> + *
> >> + * Returns a pointer to a mem_cgroup structure after bumping
> >> + * the corresponding css's reference counter.
> >> + *
> >> + * It's fine to pass a css which belongs to any cgroup controller,
> >> + * e.g. unified hierarchy's main css.
> >> + *
> >> + * Implements KF_ACQUIRE semantics.
> >> + */
> >> +__bpf_kfunc struct mem_cgroup *
> >> +bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
> >> +{
> >> + struct mem_cgroup *memcg = NULL;
> >> + bool rcu_unlock = false;
> >> +
> >> + if (!root_mem_cgroup)
> >> + return NULL;
> >> +
> >> + if (root_mem_cgroup->css.ss != css->ss) {
> > ^^^^^^^
> >
> > Can this dereference a NULL css pointer? The function checks css for
> > NULL at line 42 with "if (css && css_tryget(css))", which suggests css
> > can be NULL. If a BPF program calls this function with a NULL css,
> > wouldn't it crash here before reaching that check?
>
> Here css passed as an argument to bpf_get_mem_cgroup() can't be NULL,
> because the BPF verifier guarantees that it's a valid css pointer.
>
> However the result of rcu_dereference_raw(cgroup->subsys[ssid]) can be
> NULL, this is why the "if (css && css_tryget(css))" check is required
> down below.
Yeah. Not sure how feasible it is to teach AI about KF_RCU semantics.
On 10/28/25 1:12 PM, Alexei Starovoitov wrote:
> On Tue, Oct 28, 2025 at 9:11 AM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> bot+bpf-ci@kernel.org writes:
>>
>>> ```
>>> commit ec8e2e1769704b4f69261f36e6aaf035d47c5f0f
>>> Author: Roman Gushchin <roman.gushchin@linux.dev>
>>> Can this dereference a NULL css pointer? The function checks css for
>>> NULL at line 42 with "if (css && css_tryget(css))", which suggests css
>>> can be NULL. If a BPF program calls this function with a NULL css,
>>> wouldn't it crash here before reaching that check?
>>
>> Here css passed as an argument to bpf_get_mem_cgroup() can't be NULL,
>> because the BPF verifier guarantees that it's a valid css pointer.
>>
>> However the result of rcu_dereference_raw(cgroup->subsys[ssid]) can be
>> NULL, this is why the "if (css && css_tryget(css))" check is required
>> down below.
>
> Yeah. Not sure how feasible it is to teach AI about KF_RCU semantics.
I pulled it down locally to try and w/semcode it is properly catching this:
False Positives Eliminated
1. EH-001 NULL dereference - css parameter dereferenced without check
- Why false positive: BPF verifier ensures pointer parameters are
non-NULL. All kernel kfuncs follow the same pattern of not checking
parameters for NULL (css_rstat_updated, css_rstat_flush,
bpf_put_mem_cgroup, etc.). The KF_RET_NULL flag controls return value,
not parameter nullability.
My plan is to just have the prompt read Documentation/bpf/kfuncs.rst,
which Eduard suggested. I'll make a bpf kfuncs pattern and do that.
-chris
Chris Mason <clm@meta.com> writes:
> On 10/28/25 1:12 PM, Alexei Starovoitov wrote:
>> On Tue, Oct 28, 2025 at 9:11 AM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>>
>>> bot+bpf-ci@kernel.org writes:
>>>
>>>> ```
>>>> commit ec8e2e1769704b4f69261f36e6aaf035d47c5f0f
>>>> Author: Roman Gushchin <roman.gushchin@linux.dev>
>>>> Can this dereference a NULL css pointer? The function checks css for
>>>> NULL at line 42 with "if (css && css_tryget(css))", which suggests css
>>>> can be NULL. If a BPF program calls this function with a NULL css,
>>>> wouldn't it crash here before reaching that check?
>>>
>>> Here css passed as an argument to bpf_get_mem_cgroup() can't be NULL,
>>> because the BPF verifier guarantees that it's a valid css pointer.
>>>
>>> However the result of rcu_dereference_raw(cgroup->subsys[ssid]) can be
>>> NULL, this is why the "if (css && css_tryget(css))" check is required
>>> down below.
>>
>> Yeah. Not sure how feasible it is to teach AI about KF_RCU semantics.
>
> I pulled it down locally to try and w/semcode it is properly catching this:
>
> False Positives Eliminated
>
> 1. EH-001 NULL dereference - css parameter dereferenced without check
>
> - Why false positive: BPF verifier ensures pointer parameters are
> non-NULL. All kernel kfuncs follow the same pattern of not checking
> parameters for NULL (css_rstat_updated, css_rstat_flush,
> bpf_put_mem_cgroup, etc.). The KF_RET_NULL flag controls return value,
> not parameter nullability.
>
> My plan is to just have the prompt read Documentation/bpf/kfuncs.rst,
> which Eduard suggested. I'll make a bpf kfuncs pattern and do that.
Awesome, thank you!
© 2016 - 2025 Red Hat, Inc.