[v2] mm: BPF OOM

[PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 2 weeks ago

When a struct ops is being attached and a bpf link is created,
allow to pass a cgroup fd using bpf attr, so that struct ops
can be attached to a cgroup instead of globally.

Attached struct ops doesn't hold a reference to the cgroup,
only preserves cgroup id.

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 include/linux/bpf.h         |  1 +
 kernel/bpf/bpf_struct_ops.c | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eae907218188..7205b813e25f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1849,6 +1849,7 @@ struct bpf_struct_ops_link {
 	struct bpf_link link;
 	struct bpf_map __rcu *map;
 	wait_queue_head_t wait_hup;
+	u64 cgroup_id;
 };
 
 struct bpf_link_primer {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 45cc5ee19dc2..58664779a2b6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -13,6 +13,7 @@
 #include <linux/btf_ids.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/poll.h>
+#include <linux/cgroup.h>
 
 struct bpf_struct_ops_value {
 	struct bpf_struct_ops_common_value common;
@@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 	}
 	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
 		      attr->link_create.attach_type);
+#ifdef CONFIG_CGROUPS
+	if (attr->link_create.cgroup.relative_fd) {
+		struct cgroup *cgrp;
+
+		cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
+		if (IS_ERR(cgrp))
+			return PTR_ERR(cgrp);
+
+		link->cgroup_id = cgroup_id(cgrp);
+		cgroup_put(cgrp);
+	}
+#endif /* CONFIG_CGROUPS */
 
 	err = bpf_link_prime(&link->link, &link_primer);
 	if (err)
-- 
2.51.0

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Martin KaFai Lau 3 months, 1 week ago


On 10/27/25 4:17 PM, Roman Gushchin wrote:
> When a struct ops is being attached and a bpf link is created,
> allow to pass a cgroup fd using bpf attr, so that struct ops
> can be attached to a cgroup instead of globally.
> 
> Attached struct ops doesn't hold a reference to the cgroup,
> only preserves cgroup id.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
>   include/linux/bpf.h         |  1 +
>   kernel/bpf/bpf_struct_ops.c | 13 +++++++++++++
>   2 files changed, 14 insertions(+)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index eae907218188..7205b813e25f 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1849,6 +1849,7 @@ struct bpf_struct_ops_link {
>   	struct bpf_link link;
>   	struct bpf_map __rcu *map;
>   	wait_queue_head_t wait_hup;
> +	u64 cgroup_id;
>   };
>   
>   struct bpf_link_primer {
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 45cc5ee19dc2..58664779a2b6 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -13,6 +13,7 @@
>   #include <linux/btf_ids.h>
>   #include <linux/rcupdate_wait.h>
>   #include <linux/poll.h>
> +#include <linux/cgroup.h>
>   
>   struct bpf_struct_ops_value {
>   	struct bpf_struct_ops_common_value common;
> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
>   	}
>   	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
>   		      attr->link_create.attach_type);
> +#ifdef CONFIG_CGROUPS
> +	if (attr->link_create.cgroup.relative_fd) {
> +		struct cgroup *cgrp;
> +
> +		cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
> +		if (IS_ERR(cgrp))
> +			return PTR_ERR(cgrp);
> +
> +		link->cgroup_id = cgroup_id(cgrp);

Not sure storing the cgroup_id or storing the memcg/cgroup pointer is 
better here. Regardless, link->cgroup_id should be cleared in 
bpf_struct_ops_map_link_detach(). The cgroup_id probably is useful to 
bpf_struct_ops_map_link_show_fdinfo().

> +		cgroup_put(cgrp);
> +	}
> +#endif /* CONFIG_CGROUPS */
>   
>   	err = bpf_link_prime(&link->link, &link_primer);
>   	if (err)

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Mon, Oct 27, 2025 at 4:17 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> When a struct ops is being attached and a bpf link is created,
> allow to pass a cgroup fd using bpf attr, so that struct ops
> can be attached to a cgroup instead of globally.
>
> Attached struct ops doesn't hold a reference to the cgroup,
> only preserves cgroup id.

With the current model, when a cgroup is freed, the bpf link still
holds a reference to the struct_ops. Can we make the cgroup to hold
a reference to the struct_ops, so that the struct_ops is freed
automatically when the cgroup is freed?

I think the downside is that we will need an API to remove/change
per cgroup OOM handler.

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Tejun Heo 3 months, 1 week ago

Hello,

On Mon, Oct 27, 2025 at 04:17:05PM -0700, Roman Gushchin wrote:
> @@ -1849,6 +1849,7 @@ struct bpf_struct_ops_link {
>  	struct bpf_link link;
>  	struct bpf_map __rcu *map;
>  	wait_queue_head_t wait_hup;
> +	u64 cgroup_id;
>  };

BTW, for sched_ext sub-sched support, I'm just adding cgroup_id to
struct_ops, which seems to work fine. It'd be nice to align on the same
approach. What are the benefits of doing this through fd?

Thanks.

-- 
tejun

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

Tejun Heo <tj@kernel.org> writes:

> Hello,
>
> On Mon, Oct 27, 2025 at 04:17:05PM -0700, Roman Gushchin wrote:
>> @@ -1849,6 +1849,7 @@ struct bpf_struct_ops_link {
>>  	struct bpf_link link;
>>  	struct bpf_map __rcu *map;
>>  	wait_queue_head_t wait_hup;
>> +	u64 cgroup_id;
>>  };
>
> BTW, for sched_ext sub-sched support, I'm just adding cgroup_id to
> struct_ops, which seems to work fine. It'd be nice to align on the same
> approach. What are the benefits of doing this through fd?

Then you can attach a single struct ops to multiple cgroups (or Idk
sockets or processes or some other objects in the future).
And IMO it's just a more generic solution.

Thanks!

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Tejun Heo 3 months, 1 week ago

On Wed, Oct 29, 2025 at 01:25:52PM -0700, Roman Gushchin wrote:
> > BTW, for sched_ext sub-sched support, I'm just adding cgroup_id to
> > struct_ops, which seems to work fine. It'd be nice to align on the same
> > approach. What are the benefits of doing this through fd?
> 
> Then you can attach a single struct ops to multiple cgroups (or Idk
> sockets or processes or some other objects in the future).
> And IMO it's just a more generic solution.

I'm not very convinced that sharing a single struct_ops instance across
multiple cgroups would be all that useful. If you map this to normal
userspace programs, a given struct_ops instance is package of code and all
the global data (maps). ie. it's not like running the same program multiple
times against different targets. It's more akin to running a single program
instance which can handle multiple targets.

Maybe that's useful in some cases, but that program would have to explicitly
distinguish the cgroups that it's attached to. I have a hard time imagining
use cases where a single struct_ops has to service multiple disjoint cgroups
in the hierarchy and it ends up stepping outside of the usual operation
model of cgroups - commonality being expressed through the hierarchical
structure.

Thanks.

-- 
tejun

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

Hi Tejun,

On Wed, Oct 29, 2025 at 1:36 PM Tejun Heo <tj@kernel.org> wrote:
>
> On Wed, Oct 29, 2025 at 01:25:52PM -0700, Roman Gushchin wrote:
> > > BTW, for sched_ext sub-sched support, I'm just adding cgroup_id to
> > > struct_ops, which seems to work fine. It'd be nice to align on the same
> > > approach. What are the benefits of doing this through fd?
> >
> > Then you can attach a single struct ops to multiple cgroups (or Idk
> > sockets or processes or some other objects in the future).
> > And IMO it's just a more generic solution.
>
> I'm not very convinced that sharing a single struct_ops instance across
> multiple cgroups would be all that useful. If you map this to normal
> userspace programs, a given struct_ops instance is package of code and all
> the global data (maps). ie. it's not like running the same program multiple
> times against different targets. It's more akin to running a single program
> instance which can handle multiple targets.
>
> Maybe that's useful in some cases, but that program would have to explicitly
> distinguish the cgroups that it's attached to. I have a hard time imagining
> use cases where a single struct_ops has to service multiple disjoint cgroups
> in the hierarchy and it ends up stepping outside of the usual operation
> model of cgroups - commonality being expressed through the hierarchical
> structure.

How about we pass a pointer to mem_cgroup (and/or related pointers)
to all the callbacks in the struct_ops? AFAICT, in-kernel _ops structures like
struct file_operations and struct tcp_congestion_ops use this method. And
we can actually implement struct tcp_congestion_ops in BPF. With the
struct tcp_congestion_ops model, the struct_ops map and the struct_ops
link are both shared among multiple instances (sockets).

With this model, the system admin with root access can load a bunch of
available oom handlers, and users in their container can pick a preferred
oom handler for the sub cgroup. AFAICT, the users in the container can
pick the proper OOM handler without CAP_BPF. Does this sound useful
for some cases?

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

Song Liu <song@kernel.org> writes:

> Hi Tejun,
>
> On Wed, Oct 29, 2025 at 1:36 PM Tejun Heo <tj@kernel.org> wrote:
>>
>> On Wed, Oct 29, 2025 at 01:25:52PM -0700, Roman Gushchin wrote:
>> > > BTW, for sched_ext sub-sched support, I'm just adding cgroup_id to
>> > > struct_ops, which seems to work fine. It'd be nice to align on the same
>> > > approach. What are the benefits of doing this through fd?
>> >
>> > Then you can attach a single struct ops to multiple cgroups (or Idk
>> > sockets or processes or some other objects in the future).
>> > And IMO it's just a more generic solution.
>>
>> I'm not very convinced that sharing a single struct_ops instance across
>> multiple cgroups would be all that useful. If you map this to normal
>> userspace programs, a given struct_ops instance is package of code and all
>> the global data (maps). ie. it's not like running the same program multiple
>> times against different targets. It's more akin to running a single program
>> instance which can handle multiple targets.
>>
>> Maybe that's useful in some cases, but that program would have to explicitly
>> distinguish the cgroups that it's attached to. I have a hard time imagining
>> use cases where a single struct_ops has to service multiple disjoint cgroups
>> in the hierarchy and it ends up stepping outside of the usual operation
>> model of cgroups - commonality being expressed through the hierarchical
>> structure.
>
> How about we pass a pointer to mem_cgroup (and/or related pointers)
> to all the callbacks in the struct_ops? AFAICT, in-kernel _ops structures like
> struct file_operations and struct tcp_congestion_ops use this method. And
> we can actually implement struct tcp_congestion_ops in BPF. With the
> struct tcp_congestion_ops model, the struct_ops map and the struct_ops
> link are both shared among multiple instances (sockets).

+1 to this.
I agree it might be debatable when it comes to cgroups, but when it comes to
sockets or similar objects, having a separate struct ops per object
isn't really an option.

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Alexei Starovoitov 3 months, 1 week ago

On Wed, Oct 29, 2025 at 2:53 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Song Liu <song@kernel.org> writes:
>
> > Hi Tejun,
> >
> > On Wed, Oct 29, 2025 at 1:36 PM Tejun Heo <tj@kernel.org> wrote:
> >>
> >> On Wed, Oct 29, 2025 at 01:25:52PM -0700, Roman Gushchin wrote:
> >> > > BTW, for sched_ext sub-sched support, I'm just adding cgroup_id to
> >> > > struct_ops, which seems to work fine. It'd be nice to align on the same
> >> > > approach. What are the benefits of doing this through fd?
> >> >
> >> > Then you can attach a single struct ops to multiple cgroups (or Idk
> >> > sockets or processes or some other objects in the future).
> >> > And IMO it's just a more generic solution.
> >>
> >> I'm not very convinced that sharing a single struct_ops instance across
> >> multiple cgroups would be all that useful. If you map this to normal
> >> userspace programs, a given struct_ops instance is package of code and all
> >> the global data (maps). ie. it's not like running the same program multiple
> >> times against different targets. It's more akin to running a single program
> >> instance which can handle multiple targets.
> >>
> >> Maybe that's useful in some cases, but that program would have to explicitly
> >> distinguish the cgroups that it's attached to. I have a hard time imagining
> >> use cases where a single struct_ops has to service multiple disjoint cgroups
> >> in the hierarchy and it ends up stepping outside of the usual operation
> >> model of cgroups - commonality being expressed through the hierarchical
> >> structure.
> >
> > How about we pass a pointer to mem_cgroup (and/or related pointers)
> > to all the callbacks in the struct_ops? AFAICT, in-kernel _ops structures like
> > struct file_operations and struct tcp_congestion_ops use this method. And
> > we can actually implement struct tcp_congestion_ops in BPF. With the
> > struct tcp_congestion_ops model, the struct_ops map and the struct_ops
> > link are both shared among multiple instances (sockets).
>
> +1 to this.
> I agree it might be debatable when it comes to cgroups, but when it comes to
> sockets or similar objects, having a separate struct ops per object
> isn't really an option.

I think the general bpf philosophy that load and attach are two
separate steps. For struct-ops it's almost there, but not quite.
struct-ops shouldn't be an exception.
The bpf infra should be able to load a set of progs (aka struct-ops)
and attach it with a link to different entities. Like cgroups.
I think sched-ext should do that too. Even if there is no use case
today for the same sched-ext in two different cgroups.
For bpf-oom I can imagine a use case where container management sw
would pre-load struct-ops and then attach it later to different
containers depending on container configs. These container might
be peers in hierarchy, but attaching to their parent won't be
equivalent, since other peers might not need that bpf-oom management.
The "workaround" could be to create another cgroup layer
between parent and container, but that becomes messy, since now
there is a cgroup only for the purpose of attaching bpf-oom to it.

Whether struct-ops link attach is using cgroup_fd or cgroup_id
is debatable. I think FD is cleaner.

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Tejun Heo 3 months, 1 week ago

Hello,

On Wed, Oct 29, 2025 at 03:43:39PM -0700, Alexei Starovoitov wrote:
...
> I think the general bpf philosophy that load and attach are two
> separate steps. For struct-ops it's almost there, but not quite.
> struct-ops shouldn't be an exception.
> The bpf infra should be able to load a set of progs (aka struct-ops)
> and attach it with a link to different entities. Like cgroups.
> I think sched-ext should do that too. Even if there is no use case
> today for the same sched-ext in two different cgroups.

I'm not sure it's just that there's no use case.

- How would recursion work with private stacks? Aren't those attached to
  each BPF program?

- Wouldn't that also complicate attributing kfunc calls to the handle
  instance? If there is one struct_ops per cgroup, the oom kill kfunc can
  look that up and then verify that the struct_ops has authority over the
  target process. Multiple attachments can work too but that'd require
  iterating all attachments, right?

Thanks.

-- 
tejun

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Alexei Starovoitov 3 months, 1 week ago

On Wed, Oct 29, 2025 at 3:53 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Wed, Oct 29, 2025 at 03:43:39PM -0700, Alexei Starovoitov wrote:
> ...
> > I think the general bpf philosophy that load and attach are two
> > separate steps. For struct-ops it's almost there, but not quite.
> > struct-ops shouldn't be an exception.
> > The bpf infra should be able to load a set of progs (aka struct-ops)
> > and attach it with a link to different entities. Like cgroups.
> > I think sched-ext should do that too. Even if there is no use case
> > today for the same sched-ext in two different cgroups.
>
> I'm not sure it's just that there's no use case.

I think there will be a use case for sched-ext as well,
just the current way the scheds are written is too specific.
There is cgroup local storage, so scheds can certainly
store whatever state there.
Potentially we can improve UX further by utilizing __thread on bpf.c
side in some way.

> - How would recursion work with private stacks? Aren't those attached to
>   each BPF program?

yes. private stack is per prog, but why does it matter?
I'm not suggesting that the same prog to be attached at different
levels of the cgroup hierarchy, because such configuration
will indeed trigger recursion prevention logic (with or without private
stack).
But having one logical sched-ext prog set to manage tasks
in container A and in container B makes sense as a use case to me
where A and B are different cgroups.
DSQs can be cgroup scoped too.

> - Wouldn't that also complicate attributing kfunc calls to the handle
>   instance?

you mean the whole prog_assoc stuff ?
That's orthogonal. tracing progs are global so there is
no perfect place to associate them with. struct-ops map
is the best we can do today, but ideally it's run_ctx
that should be per-attachment. Like cookie.

> If there is one struct_ops per cgroup, the oom kill kfunc can
>   look that up and then verify that the struct_ops has authority over the
>   target process. Multiple attachments can work too but that'd require
>   iterating all attachments, right?

Are you talking about bpf_oom_kill_process() kfunc from these patch set?
I don't think it needs any changes. oom context is passed into prog
and passed along to kfunc. Doesn't matter the cgroup origin.

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Tejun Heo 3 months, 1 week ago

Hello,

On Wed, Oct 29, 2025 at 04:53:07PM -0700, Alexei Starovoitov wrote:
...
> > - How would recursion work with private stacks? Aren't those attached to
> >   each BPF program?
> 
> yes. private stack is per prog, but why does it matter?
> I'm not suggesting that the same prog to be attached at different
> levels of the cgroup hierarchy, because such configuration
> will indeed trigger recursion prevention logic (with or without private
> stack).
> But having one logical sched-ext prog set to manage tasks
> in container A and in container B makes sense as a use case to me
> where A and B are different cgroups.
> DSQs can be cgroup scoped too.

I don't know. Maybe, but this is kinda specific and I don't see how this
would be useful in practical sense. Have nothing against using the
mechanism. I can still enforce the same rules from scx side. It just looks
unnecessarily over-designed. Maybe consistency with other BPF progs
justifies it.

> > If there is one struct_ops per cgroup, the oom kill kfunc can
> >   look that up and then verify that the struct_ops has authority over the
> >   target process. Multiple attachments can work too but that'd require
> >   iterating all attachments, right?
> 
> Are you talking about bpf_oom_kill_process() kfunc from these patch set?
> I don't think it needs any changes. oom context is passed into prog
> and passed along to kfunc. Doesn't matter the cgroup origin.

Oh, if there are other mechanisms to enforce boundaries, it's not a problem,
but I can almost guarantee as the framework grows, there will be needs for
kfuncs to identify and verify the callers and handlers communicating with
each other along the hierarchy requiring recursive calls.

Thanks.

-- 
tejun

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Alexei Starovoitov 3 months, 1 week ago

On Wed, Oct 29, 2025 at 5:03 PM Tejun Heo <tj@kernel.org> wrote:
>
> Oh, if there are other mechanisms to enforce boundaries, it's not a problem,
> but I can almost guarantee as the framework grows, there will be needs for
> kfuncs to identify and verify the callers and handlers communicating with
> each other along the hierarchy requiring recursive calls.

tbh I think it's a combination of sched_ext_ops and bpf infra problem.
All of the scx ops are missing "this" pointer which would have
been there if it was a C++ class.
And "this" should be pointing to an instance of class.
If sched-ext progs are attached to different cgroups, then
every attachment would have been a different instance and
different "this".
Then all kfuncs would effectively be declared as helper
methods within a class. In this case within "struct sched_ext_ops"
as functions that ops callback can call but they will
also have implicit "this" that points back to a particular instance.

Special aux__prog and prog_assoc are not exactly pretty
workarounds for lack of "this".

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Yafang Shao 3 months, 1 week ago

On Thu, Oct 30, 2025 at 8:16 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Oct 29, 2025 at 5:03 PM Tejun Heo <tj@kernel.org> wrote:
> >
> > Oh, if there are other mechanisms to enforce boundaries, it's not a problem,
> > but I can almost guarantee as the framework grows, there will be needs for
> > kfuncs to identify and verify the callers and handlers communicating with
> > each other along the hierarchy requiring recursive calls.
>
> tbh I think it's a combination of sched_ext_ops and bpf infra problem.
> All of the scx ops are missing "this" pointer which would have
> been there if it was a C++ class.
> And "this" should be pointing to an instance of class.
> If sched-ext progs are attached to different cgroups, then
> every attachment would have been a different instance and
> different "this".
> Then all kfuncs would effectively be declared as helper
> methods within a class. In this case within "struct sched_ext_ops"
> as functions that ops callback can call but they will
> also have implicit "this" that points back to a particular instance.
>
> Special aux__prog and prog_assoc are not exactly pretty
> workarounds for lack of "this".
>

I also share the concern that supporting the attachment of a single
struct-ops to multiple cgroups appears over-engineered for the current
needs. Given that we do not anticipate a large number of cgroup
attachments in real-world use, implementing such a generalized
mechanism now seems premature. We can always introduce this
functionality later in a backward-compatible manner if concrete use
cases emerge.

That said, if we still decide to move forward with this approach, I
would suggest merging this patch as a standalone change. Doing so
would allow my BPF-THP series to build upon the same mechanism.

-- 
Regards
Yafang

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Tejun Heo 3 months, 1 week ago

Hello,

On Wed, Oct 29, 2025 at 02:18:00PM -0700, Song Liu wrote:
...
> How about we pass a pointer to mem_cgroup (and/or related pointers)
> to all the callbacks in the struct_ops? AFAICT, in-kernel _ops structures like
> struct file_operations and struct tcp_congestion_ops use this method. And
> we can actually implement struct tcp_congestion_ops in BPF. With the
> struct tcp_congestion_ops model, the struct_ops map and the struct_ops
> link are both shared among multiple instances (sockets).
> 
> With this model, the system admin with root access can load a bunch of
> available oom handlers, and users in their container can pick a preferred
> oom handler for the sub cgroup. AFAICT, the users in the container can
> pick the proper OOM handler without CAP_BPF. Does this sound useful
> for some cases?

Doesn't that assume that the programs are more or less stateless? Wouldn't
oom handlers want to track historical information, running averages, which
process expanded the most and so on?

Thanks.

-- 
tejun

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Wed, Oct 29, 2025 at 2:27 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Wed, Oct 29, 2025 at 02:18:00PM -0700, Song Liu wrote:
> ...
> > How about we pass a pointer to mem_cgroup (and/or related pointers)
> > to all the callbacks in the struct_ops? AFAICT, in-kernel _ops structures like
> > struct file_operations and struct tcp_congestion_ops use this method. And
> > we can actually implement struct tcp_congestion_ops in BPF. With the
> > struct tcp_congestion_ops model, the struct_ops map and the struct_ops
> > link are both shared among multiple instances (sockets).
> >
> > With this model, the system admin with root access can load a bunch of
> > available oom handlers, and users in their container can pick a preferred
> > oom handler for the sub cgroup. AFAICT, the users in the container can
> > pick the proper OOM handler without CAP_BPF. Does this sound useful
> > for some cases?
>
> Doesn't that assume that the programs are more or less stateless? Wouldn't
> oom handlers want to track historical information, running averages, which
> process expanded the most and so on?

Yes, this does mean the program needs to store data in some BPF maps.
Do we have concern with the performance of BPF maps?

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Tejun Heo 3 months, 1 week ago

Hello,

On Wed, Oct 29, 2025 at 02:37:38PM -0700, Song Liu wrote:
> On Wed, Oct 29, 2025 at 2:27 PM Tejun Heo <tj@kernel.org> wrote:
> > Doesn't that assume that the programs are more or less stateless? Wouldn't
> > oom handlers want to track historical information, running averages, which
> > process expanded the most and so on?
> 
> Yes, this does mean the program needs to store data in some BPF maps.
> Do we have concern with the performance of BPF maps?

It's just a lot more awkward to do and I have a difficult time thinking up
reasons why one would need to do that. If you attach a single struct_ops
instance to one cgroup, you can use global variables, maps, arena to track
what's happening with the cgroup. If you share the same struct_ops across
multiple cgroups, each operation has to scope per-cgroup states. I can see
how that probably makes sense for sockets but cgroups aren't sockets. There
are a lot fewer cgroups and they are organized in a tree.

Thanks.

-- 
tejun

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Wed, Oct 29, 2025 at 2:45 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Wed, Oct 29, 2025 at 02:37:38PM -0700, Song Liu wrote:
> > On Wed, Oct 29, 2025 at 2:27 PM Tejun Heo <tj@kernel.org> wrote:
> > > Doesn't that assume that the programs are more or less stateless? Wouldn't
> > > oom handlers want to track historical information, running averages, which
> > > process expanded the most and so on?
> >
> > Yes, this does mean the program needs to store data in some BPF maps.
> > Do we have concern with the performance of BPF maps?
>
> It's just a lot more awkward to do and I have a difficult time thinking up
> reasons why one would need to do that. If you attach a single struct_ops
> instance to one cgroup, you can use global variables, maps, arena to track
> what's happening with the cgroup. If you share the same struct_ops across
> multiple cgroups, each operation has to scope per-cgroup states. I can see
> how that probably makes sense for sockets but cgroups aren't sockets. There
> are a lot fewer cgroups and they are organized in a tree.

If the use case is to attach a single struct_ops to a single cgroup, the author
of that BPF program can always ignore the memcg parameter and use
global variables, etc. We waste a register in BPF ISA to save the pointer to
memcg,  but JiT may recover that in native instructions.

OTOH, starting without a memcg parameter, it will be impossible to allow
attaching the same struct_ops to different cgroups. I still think it is a valid
use case that the sysadmin loads a set of OOM handlers for users in the
containers to choose from is a valid use case.

Also, a per cgroup oom handler may need to access the memcg information
anyway. Without a dedicated memcg argument, the user need to fetch it
somewhere else.

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Tejun Heo 3 months, 1 week ago

Hello,

On Wed, Oct 29, 2025 at 09:32:44PM -0700, Song Liu wrote:
> If the use case is to attach a single struct_ops to a single cgroup, the author
> of that BPF program can always ignore the memcg parameter and use
> global variables, etc. We waste a register in BPF ISA to save the pointer to
> memcg,  but JiT may recover that in native instructions.
> 
> OTOH, starting without a memcg parameter, it will be impossible to allow
> attaching the same struct_ops to different cgroups. I still think it is a valid
> use case that the sysadmin loads a set of OOM handlers for users in the
> containers to choose from is a valid use case.

I find something like that being implemented through struct_ops attaching
rather unlikely. Wouldn't it look more like the following?

- Attach a handler at the parent level which implements different policies.

- Child cgroups pick the desired policy using e.g. cgroup xattrs and when
  OOM event happens, the OOM handler attached at the parent implements the
  requested policy.

- If further customization is desired and supported, it's implemented
  through child loading its own OOM handler which operates under the
  parent's OOM handler.

> Also, a per cgroup oom handler may need to access the memcg information
> anyway. Without a dedicated memcg argument, the user need to fetch it
> somewhere else.

An OOM handler attached to a cgroup doesn't just need to handle OOM events
in the cgroup itself. It's responsible for the whole sub-hierarchy. ie. It
will need accessors to reach all those memcgs anyway.

Another thing to consider is that the memcg for a given cgroup can change by
the controller being enabled and disabled. There isn't the one permanent
memcg that a given cgroup is associated with.

Thanks.

-- 
tejun

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Thu, Oct 30, 2025 at 9:14 AM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Wed, Oct 29, 2025 at 09:32:44PM -0700, Song Liu wrote:
> > If the use case is to attach a single struct_ops to a single cgroup, the author
> > of that BPF program can always ignore the memcg parameter and use
> > global variables, etc. We waste a register in BPF ISA to save the pointer to
> > memcg,  but JiT may recover that in native instructions.
> >
> > OTOH, starting without a memcg parameter, it will be impossible to allow
> > attaching the same struct_ops to different cgroups. I still think it is a valid
> > use case that the sysadmin loads a set of OOM handlers for users in the
> > containers to choose from is a valid use case.
>
> I find something like that being implemented through struct_ops attaching
> rather unlikely. Wouldn't it look more like the following?
>
> - Attach a handler at the parent level which implements different policies.
>
> - Child cgroups pick the desired policy using e.g. cgroup xattrs and when
>   OOM event happens, the OOM handler attached at the parent implements the
>   requested policy.

OK, using xattrs is another way to achieve this.

> - If further customization is desired and supported, it's implemented
>   through child loading its own OOM handler which operates under the
>   parent's OOM handler.
>
> > Also, a per cgroup oom handler may need to access the memcg information
> > anyway. Without a dedicated memcg argument, the user need to fetch it
> > somewhere else.
>
> An OOM handler attached to a cgroup doesn't just need to handle OOM events
> in the cgroup itself. It's responsible for the whole sub-hierarchy. ie. It
> will need accessors to reach all those memcgs anyway.
>
> Another thing to consider is that the memcg for a given cgroup can change by
> the controller being enabled and disabled. There isn't the one permanent
> memcg that a given cgroup is associated with.

In the current version, bpf_oom_ops is attached to the memcg. As long as
we feed a pointer to memcg to all struct_ops functions, these functions
can be implemented in a stateless way. I think having the option to do
this stateless implementation will help us in the long term.

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Mon, Oct 27, 2025 at 4:17 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
[...]
>  struct bpf_struct_ops_value {
>         struct bpf_struct_ops_common_value common;
> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
>         }
>         bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
>                       attr->link_create.attach_type);
> +#ifdef CONFIG_CGROUPS
> +       if (attr->link_create.cgroup.relative_fd) {
> +               struct cgroup *cgrp;
> +
> +               cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);

We should use "target_fd" here, not relative_fd.

Also, 0 is a valid fd, so we cannot use target_fd == 0 to attach to
global memcg.

Thanks,
Song

> +               if (IS_ERR(cgrp))
> +                       return PTR_ERR(cgrp);
> +
> +               link->cgroup_id = cgroup_id(cgrp);
> +               cgroup_put(cgrp);
> +       }
> +#endif /* CONFIG_CGROUPS */
>
>         err = bpf_link_prime(&link->link, &link_primer);
>         if (err)
> --
> 2.51.0
>

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

Song Liu <song@kernel.org> writes:

> On Mon, Oct 27, 2025 at 4:17 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> [...]
>>  struct bpf_struct_ops_value {
>>         struct bpf_struct_ops_common_value common;
>> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
>>         }
>>         bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
>>                       attr->link_create.attach_type);
>> +#ifdef CONFIG_CGROUPS
>> +       if (attr->link_create.cgroup.relative_fd) {
>> +               struct cgroup *cgrp;
>> +
>> +               cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
>
> We should use "target_fd" here, not relative_fd.
>
> Also, 0 is a valid fd, so we cannot use target_fd == 0 to attach to
> global memcg.

Yep, but then we need somehow signal there is a cgroup fd passed,
so that struct ops'es which are not attached to cgroups keep working
as previously. And we can't use link_create.attach_type.

Should I use link_create.flags? E.g. something like add new flag

@@ -1224,6 +1224,7 @@ enum bpf_perf_event_type {
 #define BPF_F_AFTER		(1U << 4)
 #define BPF_F_ID		(1U << 5)
 #define BPF_F_PREORDER		(1U << 6)
+#define BPF_F_CGROUP		(1U << 7)
 #define BPF_F_LINK		BPF_F_LINK /* 1 << 13 */
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the

and then do something like this:

int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	<...>
	if (attr->link_create.flags & BPF_F_CGROUP) {
		struct cgroup *cgrp;

		cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
		if (IS_ERR(cgrp)) {
			err = PTR_ERR(cgrp);
			goto err_out;
		}

		link->cgroup_id = cgroup_id(cgrp);
		cgroup_put(cgrp);
	}

Does it sound right?

Thanks

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Thu, Oct 30, 2025 at 10:22 AM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> Song Liu <song@kernel.org> writes:
>
> > On Mon, Oct 27, 2025 at 4:17 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> > [...]
> >>  struct bpf_struct_ops_value {
> >>         struct bpf_struct_ops_common_value common;
> >> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
> >>         }
> >>         bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
> >>                       attr->link_create.attach_type);
> >> +#ifdef CONFIG_CGROUPS
> >> +       if (attr->link_create.cgroup.relative_fd) {
> >> +               struct cgroup *cgrp;
> >> +
> >> +               cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
> >
> > We should use "target_fd" here, not relative_fd.
> >
> > Also, 0 is a valid fd, so we cannot use target_fd == 0 to attach to
> > global memcg.
>
> Yep, but then we need somehow signal there is a cgroup fd passed,
> so that struct ops'es which are not attached to cgroups keep working
> as previously. And we can't use link_create.attach_type.
>
> Should I use link_create.flags? E.g. something like add new flag
>
> @@ -1224,6 +1224,7 @@ enum bpf_perf_event_type {
>  #define BPF_F_AFTER            (1U << 4)
>  #define BPF_F_ID               (1U << 5)
>  #define BPF_F_PREORDER         (1U << 6)
> +#define BPF_F_CGROUP           (1U << 7)
>  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>
>  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
>
> and then do something like this:
>
> int bpf_struct_ops_link_create(union bpf_attr *attr)
> {
>         <...>
>         if (attr->link_create.flags & BPF_F_CGROUP) {
>                 struct cgroup *cgrp;
>
>                 cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
>                 if (IS_ERR(cgrp)) {
>                         err = PTR_ERR(cgrp);
>                         goto err_out;
>                 }
>
>                 link->cgroup_id = cgroup_id(cgrp);
>                 cgroup_put(cgrp);
>         }
>
> Does it sound right?

I believe adding a flag (BPF_F_CGROUP or some other name), is the
right solution for this.

OTOH, I am not sure whether we want to add cgroup fd/id to the
bpf link. I personally prefer the model used by TCP congestion
control: the link attaches the struct_ops to a global list, then each
user picks a struct_ops from the list. But I do agree this might be
an overkill for cgroup use cases.

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Amery Hung 3 months, 1 week ago

On Thu, Oct 30, 2025 at 11:09 AM Song Liu <song@kernel.org> wrote:
>
> On Thu, Oct 30, 2025 at 10:22 AM Roman Gushchin
> <roman.gushchin@linux.dev> wrote:
> >
> > Song Liu <song@kernel.org> writes:
> >
> > > On Mon, Oct 27, 2025 at 4:17 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> > > [...]
> > >>  struct bpf_struct_ops_value {
> > >>         struct bpf_struct_ops_common_value common;
> > >> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
> > >>         }
> > >>         bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
> > >>                       attr->link_create.attach_type);
> > >> +#ifdef CONFIG_CGROUPS
> > >> +       if (attr->link_create.cgroup.relative_fd) {
> > >> +               struct cgroup *cgrp;
> > >> +
> > >> +               cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
> > >
> > > We should use "target_fd" here, not relative_fd.
> > >
> > > Also, 0 is a valid fd, so we cannot use target_fd == 0 to attach to
> > > global memcg.
> >
> > Yep, but then we need somehow signal there is a cgroup fd passed,
> > so that struct ops'es which are not attached to cgroups keep working
> > as previously. And we can't use link_create.attach_type.
> >
> > Should I use link_create.flags? E.g. something like add new flag
> >
> > @@ -1224,6 +1224,7 @@ enum bpf_perf_event_type {
> >  #define BPF_F_AFTER            (1U << 4)
> >  #define BPF_F_ID               (1U << 5)
> >  #define BPF_F_PREORDER         (1U << 6)
> > +#define BPF_F_CGROUP           (1U << 7)
> >  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
> >
> >  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
> >
> > and then do something like this:
> >
> > int bpf_struct_ops_link_create(union bpf_attr *attr)
> > {
> >         <...>
> >         if (attr->link_create.flags & BPF_F_CGROUP) {
> >                 struct cgroup *cgrp;
> >
> >                 cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
> >                 if (IS_ERR(cgrp)) {
> >                         err = PTR_ERR(cgrp);
> >                         goto err_out;
> >                 }
> >
> >                 link->cgroup_id = cgroup_id(cgrp);
> >                 cgroup_put(cgrp);
> >         }
> >
> > Does it sound right?
>
> I believe adding a flag (BPF_F_CGROUP or some other name), is the
> right solution for this.
>
> OTOH, I am not sure whether we want to add cgroup fd/id to the
> bpf link. I personally prefer the model used by TCP congestion
> control: the link attaches the struct_ops to a global list, then each
> user picks a struct_ops from the list. But I do agree this might be
> an overkill for cgroup use cases.

+1.

In TCP congestion control and BPF qdisc's model:

During link_create, both adds the struct_ops to a list, and the
struct_ops can be indexed by name. The struct_ops are not "active" by
this time.
Then, each has their own interface to 'apply' the struct_ops to a
socket or queue: setsockopt() or netlink.

But maybe cgroup-related struct_ops are different.

-Amery

>
> Thanks,
> Song
>

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

Amery Hung <ameryhung@gmail.com> writes:

> On Thu, Oct 30, 2025 at 11:09 AM Song Liu <song@kernel.org> wrote:
>>
>> On Thu, Oct 30, 2025 at 10:22 AM Roman Gushchin
>> <roman.gushchin@linux.dev> wrote:
>> >
>> > Song Liu <song@kernel.org> writes:
>> >
>> > > On Mon, Oct 27, 2025 at 4:17 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>> > > [...]
>> > >>  struct bpf_struct_ops_value {
>> > >>         struct bpf_struct_ops_common_value common;
>> > >> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
>> > >>         }
>> > >>         bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
>> > >>                       attr->link_create.attach_type);
>> > >> +#ifdef CONFIG_CGROUPS
>> > >> +       if (attr->link_create.cgroup.relative_fd) {
>> > >> +               struct cgroup *cgrp;
>> > >> +
>> > >> +               cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
>> > >
>> > > We should use "target_fd" here, not relative_fd.
>> > >
>> > > Also, 0 is a valid fd, so we cannot use target_fd == 0 to attach to
>> > > global memcg.
>> >
>> > Yep, but then we need somehow signal there is a cgroup fd passed,
>> > so that struct ops'es which are not attached to cgroups keep working
>> > as previously. And we can't use link_create.attach_type.
>> >
>> > Should I use link_create.flags? E.g. something like add new flag
>> >
>> > @@ -1224,6 +1224,7 @@ enum bpf_perf_event_type {
>> >  #define BPF_F_AFTER            (1U << 4)
>> >  #define BPF_F_ID               (1U << 5)
>> >  #define BPF_F_PREORDER         (1U << 6)
>> > +#define BPF_F_CGROUP           (1U << 7)
>> >  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>> >
>> >  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
>> >
>> > and then do something like this:
>> >
>> > int bpf_struct_ops_link_create(union bpf_attr *attr)
>> > {
>> >         <...>
>> >         if (attr->link_create.flags & BPF_F_CGROUP) {
>> >                 struct cgroup *cgrp;
>> >
>> >                 cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
>> >                 if (IS_ERR(cgrp)) {
>> >                         err = PTR_ERR(cgrp);
>> >                         goto err_out;
>> >                 }
>> >
>> >                 link->cgroup_id = cgroup_id(cgrp);
>> >                 cgroup_put(cgrp);
>> >         }
>> >
>> > Does it sound right?
>>
>> I believe adding a flag (BPF_F_CGROUP or some other name), is the
>> right solution for this.
>>
>> OTOH, I am not sure whether we want to add cgroup fd/id to the
>> bpf link. I personally prefer the model used by TCP congestion
>> control: the link attaches the struct_ops to a global list, then each
>> user picks a struct_ops from the list. But I do agree this might be
>> an overkill for cgroup use cases.
>
> +1.
>
> In TCP congestion control and BPF qdisc's model:
>
> During link_create, both adds the struct_ops to a list, and the
> struct_ops can be indexed by name. The struct_ops are not "active" by
> this time.
> Then, each has their own interface to 'apply' the struct_ops to a
> socket or queue: setsockopt() or netlink.
>
> But maybe cgroup-related struct_ops are different.

Both tcp congestion and qdisk cases are somewhat different because
there already is a way to select between multiple implementations, bpf
just adds another one. In the oom case, it's not true. As of today,
there is only one (global) oom killer. Of course we can create
interfaces to allow a user make a choice. But the question is do we want
to create such interface for the oom case specifically (and later for
each new case separately), or there is a place for some generalization?


Ok, let me summarize the options we discussed here:

1) Make the attachment details (e.g. cgroup_id) the part of struct ops
itself. The attachment is happening at the reg() time.

  +: It's convenient for complex stateful struct ops'es, because a
      single entity represents a combination of code and data.
  -: No way to attach a single struct ops to multiple entities.

This approach is used by Tejun for per-cgroup sched_ext prototype.

2) Make the attachment details a part of bpf_link creation. The
attachment is still happening at the reg() time.

  +: A single struct ops can be attached to multiple entities.
  -: Implementing stateful struct ops'es is harder and requires passing
     an additional argument (some sort of "self") to all callbacks.

I'm using this approach in the bpf oom proposal.

3) Move the attachment out of .reg() scope entirely. reg() will register
the implementation system-wide and then some 3rd-party interface
(e.g. cgroupfs) should be used to select the implementation.

  +: ?
  -: New hard-coded interfaces might be required to enable bpf-driven
     kernel customization. The "attachment" code is not shared between
     various struct ops cases.
     Implementing stateful struct ops'es is harder and requires passing
     an additional argument (some sort of "self") to all callbacks.

This approach works well for cases when there is already a selection
of implementations (e.g. tcp congestion mechanisms), and bpf is adding
another one.

I personally lean towards 2), but I can easily implement bpf_oom with 1)
and most likely with 3) too, but it's a bit more complicated.

So I guess we all need to come to an agreement which way to go.

Thanks!

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

Hi Roman,

On Thu, Oct 30, 2025 at 12:07 PM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
[...]
> > In TCP congestion control and BPF qdisc's model:
> >
> > During link_create, both adds the struct_ops to a list, and the
> > struct_ops can be indexed by name. The struct_ops are not "active" by
> > this time.
> > Then, each has their own interface to 'apply' the struct_ops to a
> > socket or queue: setsockopt() or netlink.
> >
> > But maybe cgroup-related struct_ops are different.
>
> Both tcp congestion and qdisk cases are somewhat different because
> there already is a way to select between multiple implementations, bpf
> just adds another one. In the oom case, it's not true. As of today,
> there is only one (global) oom killer. Of course we can create
> interfaces to allow a user make a choice. But the question is do we want
> to create such interface for the oom case specifically (and later for
> each new case separately), or there is a place for some generalization?

Agreed that this approach requires a separate mechanism to attach
the struct_ops to an entity.

> Ok, let me summarize the options we discussed here:

Thanks for the summary!

>
> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
> itself. The attachment is happening at the reg() time.
>
>   +: It's convenient for complex stateful struct ops'es, because a
>       single entity represents a combination of code and data.
>   -: No way to attach a single struct ops to multiple entities.
>
> This approach is used by Tejun for per-cgroup sched_ext prototype.
>
> 2) Make the attachment details a part of bpf_link creation. The
> attachment is still happening at the reg() time.
>
>   +: A single struct ops can be attached to multiple entities.
>   -: Implementing stateful struct ops'es is harder and requires passing
>      an additional argument (some sort of "self") to all callbacks.
> I'm using this approach in the bpf oom proposal.
>

I think both 1) and 2) have the following issue. With cgroup_id in
struct_ops or the link, the cgroup_id works more like a filter. The
cgroup doesn't hold any reference to the struct_ops. The bpf link
holds the reference to the struct_ops, so we need to keep the
the link alive, either by keeping an active fd, or by pinning the
link to bpffs. When the cgroup is removed, we need to clean up
the bpf link separately.

> 3) Move the attachment out of .reg() scope entirely. reg() will register
> the implementation system-wide and then some 3rd-party interface
> (e.g. cgroupfs) should be used to select the implementation.
>
>   +: ?
>   -: New hard-coded interfaces might be required to enable bpf-driven
>      kernel customization. The "attachment" code is not shared between
>      various struct ops cases.
>      Implementing stateful struct ops'es is harder and requires passing
>      an additional argument (some sort of "self") to all callbacks.
>
> This approach works well for cases when there is already a selection
> of implementations (e.g. tcp congestion mechanisms), and bpf is adding
> another one.

Another benefit of 3) is that it allows loading an OOM controller in a
kernel module, just like loading a file system in a kernel module. This
is possible with 3) because we paid the cost of adding a new select
attach interface.

A semi-separate topic, option 2) enables attaching a BPF program
to a kernel object (a cgroup here, but could be something else). This
is an interesting idea, and we may find it useful in other cases (attach
a BPF program to a task_struct, etc.).

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Martin KaFai Lau 3 months, 1 week ago


On 10/30/25 2:34 PM, Song Liu wrote:
> Hi Roman,
> 
> On Thu, Oct 30, 2025 at 12:07 PM Roman Gushchin
> <roman.gushchin@linux.dev> wrote:
> [...]
>>> In TCP congestion control and BPF qdisc's model:
>>>
>>> During link_create, both adds the struct_ops to a list, and the
>>> struct_ops can be indexed by name. The struct_ops are not "active" by
>>> this time.
>>> Then, each has their own interface to 'apply' the struct_ops to a
>>> socket or queue: setsockopt() or netlink.
>>>
>>> But maybe cgroup-related struct_ops are different.
>>
>> Both tcp congestion and qdisk cases are somewhat different because
>> there already is a way to select between multiple implementations, bpf
>> just adds another one. In the oom case, it's not true. As of today,
>> there is only one (global) oom killer. Of course we can create
>> interfaces to allow a user make a choice. But the question is do we want
>> to create such interface for the oom case specifically (and later for
>> each new case separately), or there is a place for some generalization?
> 
> Agreed that this approach requires a separate mechanism to attach
> the struct_ops to an entity.
> 
>> Ok, let me summarize the options we discussed here:
> 
> Thanks for the summary!
> 
>>
>> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
>> itself. The attachment is happening at the reg() time.
>>
>>    +: It's convenient for complex stateful struct ops'es, because a
>>        single entity represents a combination of code and data.
>>    -: No way to attach a single struct ops to multiple entities.
>>
>> This approach is used by Tejun for per-cgroup sched_ext prototype.
>>
>> 2) Make the attachment details a part of bpf_link creation. The
>> attachment is still happening at the reg() time.
>>
>>    +: A single struct ops can be attached to multiple entities.
>>    -: Implementing stateful struct ops'es is harder and requires passing
>>       an additional argument (some sort of "self") to all callbacks.
>> I'm using this approach in the bpf oom proposal.
>>
> 
> I think both 1) and 2) have the following issue. With cgroup_id in
> struct_ops or the link, the cgroup_id works more like a filter. The
> cgroup doesn't hold any reference to the struct_ops. The bpf link
> holds the reference to the struct_ops, so we need to keep the
> the link alive, either by keeping an active fd, or by pinning the
> link to bpffs. When the cgroup is removed, we need to clean up
> the bpf link separately.

The link can be detached (struct_ops's unreg) by the user space.

The link can also be detached from the subsystem (cgroup) here.
It was requested by scx:
https://lore.kernel.org/all/20240530065946.979330-7-thinker.li@gmail.com/

Not sure if scx has started using it.

> 
>> 3) Move the attachment out of .reg() scope entirely. reg() will register
>> the implementation system-wide and then some 3rd-party interface
>> (e.g. cgroupfs) should be used to select the implementation.
>>
>>    +: ?
>>    -: New hard-coded interfaces might be required to enable bpf-driven
>>       kernel customization. The "attachment" code is not shared between
>>       various struct ops cases.
>>       Implementing stateful struct ops'es is harder and requires passing
>>       an additional argument (some sort of "self") to all callbacks.
>>
>> This approach works well for cases when there is already a selection
>> of implementations (e.g. tcp congestion mechanisms), and bpf is adding
>> another one.
> 
> Another benefit of 3) is that it allows loading an OOM controller in a
> kernel module, just like loading a file system in a kernel module. This
> is possible with 3) because we paid the cost of adding a new select
> attach interface.
> 
> A semi-separate topic, option 2) enables attaching a BPF program
> to a kernel object (a cgroup here, but could be something else). This
> is an interesting idea, and we may find it useful in other cases (attach
> a BPF program to a task_struct, etc.).

Does it have plan for a pure kernel module oom implementation?
I think the link-to-cgrp support here does not necessary stop the
later write to cgroupfs support if a kernel module oom is indeed needed
in the future.

imo, cgroup-bpf has a eco-system around it, so it is sort of special. bpf user
has expectation on how a bpf prog is attached to a cgroup. The introspection,
auto detachment from the cgroup when the link is gone...etc.

If link-to-cgrp is used, I prefer (2). Stay with one way to attach
to a cgrp. It is also consistent with the current way of attaching a single
bpf prog to a cgroup. It is now attaching a map/set of bpf prog to a cgroup.
The individual struct_ops implementation can decide if it should
allow a struct_ops be attached multiple times.

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Thu, Oct 30, 2025 at 3:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
[...]
>
> The link can be detached (struct_ops's unreg) by the user space.
>
> The link can also be detached from the subsystem (cgroup) here.
> It was requested by scx:
> https://lore.kernel.org/all/20240530065946.979330-7-thinker.li@gmail.com/
>
> Not sure if scx has started using it.

I see. The user space can poll the link fd, and get notified when the
cgroup is removed.

> >
> >> 3) Move the attachment out of .reg() scope entirely. reg() will register
> >> the implementation system-wide and then some 3rd-party interface
> >> (e.g. cgroupfs) should be used to select the implementation.
> >>
> >>    +: ?
> >>    -: New hard-coded interfaces might be required to enable bpf-driven
> >>       kernel customization. The "attachment" code is not shared between
> >>       various struct ops cases.
> >>       Implementing stateful struct ops'es is harder and requires passing
> >>       an additional argument (some sort of "self") to all callbacks.
> >>
> >> This approach works well for cases when there is already a selection
> >> of implementations (e.g. tcp congestion mechanisms), and bpf is adding
> >> another one.
> >
> > Another benefit of 3) is that it allows loading an OOM controller in a
> > kernel module, just like loading a file system in a kernel module. This
> > is possible with 3) because we paid the cost of adding a new select
> > attach interface.
> >
> > A semi-separate topic, option 2) enables attaching a BPF program
> > to a kernel object (a cgroup here, but could be something else). This
> > is an interesting idea, and we may find it useful in other cases (attach
> > a BPF program to a task_struct, etc.).
>
> Does it have plan for a pure kernel module oom implementation?
> I think the link-to-cgrp support here does not necessary stop the
> later write to cgroupfs support if a kernel module oom is indeed needed
> in the future.

I am not aware of use cases to write OOM handlers in modules. Also
agreed that adding attach to cgroup link doesn't stop us from using
modules in the future.

Thanks,
Song

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

Martin KaFai Lau <martin.lau@linux.dev> writes:

> On 10/30/25 2:34 PM, Song Liu wrote:
>> Hi Roman,
>> On Thu, Oct 30, 2025 at 12:07 PM Roman Gushchin
>> <roman.gushchin@linux.dev> wrote:
>> [...]
>>>> In TCP congestion control and BPF qdisc's model:
>>>>
>>>> During link_create, both adds the struct_ops to a list, and the
>>>> struct_ops can be indexed by name. The struct_ops are not "active" by
>>>> this time.
>>>> Then, each has their own interface to 'apply' the struct_ops to a
>>>> socket or queue: setsockopt() or netlink.
>>>>
>>>> But maybe cgroup-related struct_ops are different.
>>>
>>> Both tcp congestion and qdisk cases are somewhat different because
>>> there already is a way to select between multiple implementations, bpf
>>> just adds another one. In the oom case, it's not true. As of today,
>>> there is only one (global) oom killer. Of course we can create
>>> interfaces to allow a user make a choice. But the question is do we want
>>> to create such interface for the oom case specifically (and later for
>>> each new case separately), or there is a place for some generalization?
>> Agreed that this approach requires a separate mechanism to attach
>> the struct_ops to an entity.
>> 
>>> Ok, let me summarize the options we discussed here:
>> Thanks for the summary!
>> 
>>>
>>> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
>>> itself. The attachment is happening at the reg() time.
>>>
>>>    +: It's convenient for complex stateful struct ops'es, because a
>>>        single entity represents a combination of code and data.
>>>    -: No way to attach a single struct ops to multiple entities.
>>>
>>> This approach is used by Tejun for per-cgroup sched_ext prototype.
>>>
>>> 2) Make the attachment details a part of bpf_link creation. The
>>> attachment is still happening at the reg() time.
>>>
>>>    +: A single struct ops can be attached to multiple entities.
>>>    -: Implementing stateful struct ops'es is harder and requires passing
>>>       an additional argument (some sort of "self") to all callbacks.
>>> I'm using this approach in the bpf oom proposal.
>>>
>> I think both 1) and 2) have the following issue. With cgroup_id in
>> struct_ops or the link, the cgroup_id works more like a filter. The
>> cgroup doesn't hold any reference to the struct_ops. The bpf link
>> holds the reference to the struct_ops, so we need to keep the
>> the link alive, either by keeping an active fd, or by pinning the
>> link to bpffs. When the cgroup is removed, we need to clean up
>> the bpf link separately.
>
> The link can be detached (struct_ops's unreg) by the user space.
>
> The link can also be detached from the subsystem (cgroup) here.
> It was requested by scx:
> https://lore.kernel.org/all/20240530065946.979330-7-thinker.li@gmail.com/
>
> Not sure if scx has started using it.
>
>> 
>>> 3) Move the attachment out of .reg() scope entirely. reg() will register
>>> the implementation system-wide and then some 3rd-party interface
>>> (e.g. cgroupfs) should be used to select the implementation.
>>>
>>>    +: ?
>>>    -: New hard-coded interfaces might be required to enable bpf-driven
>>>       kernel customization. The "attachment" code is not shared between
>>>       various struct ops cases.
>>>       Implementing stateful struct ops'es is harder and requires passing
>>>       an additional argument (some sort of "self") to all callbacks.
>>>
>>> This approach works well for cases when there is already a selection
>>> of implementations (e.g. tcp congestion mechanisms), and bpf is adding
>>> another one.
>> Another benefit of 3) is that it allows loading an OOM controller in
>> a
>> kernel module, just like loading a file system in a kernel module. This
>> is possible with 3) because we paid the cost of adding a new select
>> attach interface.
>> A semi-separate topic, option 2) enables attaching a BPF program
>> to a kernel object (a cgroup here, but could be something else). This
>> is an interesting idea, and we may find it useful in other cases (attach
>> a BPF program to a task_struct, etc.).

Yep, task_struct is an attractive target for something like mm-related
policies (THP, NUMA, memory tiers etc).

>
> Does it have plan for a pure kernel module oom implementation?

I highly doubt.

> I think the link-to-cgrp support here does not necessary stop the
> later write to cgroupfs support if a kernel module oom is indeed needed
> in the future.
>
> imo, cgroup-bpf has a eco-system around it, so it is sort of special. bpf user
> has expectation on how a bpf prog is attached to a cgroup. The introspection,
> auto detachment from the cgroup when the link is gone...etc.
>
> If link-to-cgrp is used, I prefer (2). Stay with one way to attach
> to a cgrp. It is also consistent with the current way of attaching a single
> bpf prog to a cgroup. It is now attaching a map/set of bpf prog to a cgroup.
> The individual struct_ops implementation can decide if it should
> allow a struct_ops be attached multiple times.

+1

bpf_st_ops and cgroups. Was: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Alexei Starovoitov 3 months, 1 week ago

On Thu, Oct 30, 2025 at 12:06 PM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> Ok, let me summarize the options we discussed here:
>
> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
> itself. The attachment is happening at the reg() time.
>
>   +: It's convenient for complex stateful struct ops'es, because a
>       single entity represents a combination of code and data.
>   -: No way to attach a single struct ops to multiple entities.
>
> This approach is used by Tejun for per-cgroup sched_ext prototype.

It's wrong. It should adopt bpf_struct_ops_link_create() approach
and use attr->link_create.cgroup.relative_fd to attach.
At that point scx can enforce that it attaches to one cgroup only
if it simplifies things for sched-ext. That's fine.
But api must be link based.
Otherwise cgroup_id inside st_ops all the way from bpf prog
will not be backward compatible if/when people would want
to attach the same sched-ext to multiple cgroups.

> 2) Make the attachment details a part of bpf_link creation. The
> attachment is still happening at the reg() time.
>
>   +: A single struct ops can be attached to multiple entities.
>   -: Implementing stateful struct ops'es is harder and requires passing
>      an additional argument (some sort of "self") to all callbacks.

sched-ext is already suffering from lack of 'this'.
The current workarounds with prog_assoc and aux__prog are not great.
We should learn from that mistake instead of repeating it with bpf-oom.

As far as 'this' I think we should pass
'struct bpf_struct_ops_link *' to all callbacks.
This patch is proposing to have cougrp_id in there.
It can be a pointer to cgroup too. This detail we can change later.

We can brainstorm a way to pass 'link *' in run_ctx,
and have an easy way to access it from ops and from kfuncs
that ops will call.
The existing tracing style bpf_set_run_ctx() should work for bpf-oom,
and 'link *'->cgroup_id->cgrp->memcg will be there for ops
and for kfuncs, but it doesn't quite work for sched-ext as-is
that wants run_ctx to be different for sched-ext-s
attached at different levels of hierarchy.
Maybe additional bpf_set_run_ctx() while traversing
hierarchy will do the trick?
Then we might not even need aux_prog and kf_implicit_args that much.
Though they may be useful on their own though.

> I'm using this approach in the bpf oom proposal.
>
> 3) Move the attachment out of .reg() scope entirely. reg() will register
> the implementation system-wide and then some 3rd-party interface
> (e.g. cgroupfs) should be used to select the implementation.

We went that road with ioctl-s and subsystem specific ways to attach.
All of them sucked. link_create is the only acceptable approach
because it returns FD.

Re: bpf_st_ops and cgroups. Was: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Thu, Oct 30, 2025 at 12:06 PM Roman Gushchin
> <roman.gushchin@linux.dev> wrote:
>>
>> Ok, let me summarize the options we discussed here:
>>
>> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
>> itself. The attachment is happening at the reg() time.
>>
>>   +: It's convenient for complex stateful struct ops'es, because a
>>       single entity represents a combination of code and data.
>>   -: No way to attach a single struct ops to multiple entities.
>>
>> This approach is used by Tejun for per-cgroup sched_ext prototype.
>
> It's wrong. It should adopt bpf_struct_ops_link_create() approach
> and use attr->link_create.cgroup.relative_fd to attach.

This is basically what I have in v2, but Andrii and Song suggested that
I should use attr->link_create.target_fd instead.

I have a slight preference towards attr->link_create.cgroup.relative_fd
because it makes it clear that fd is a cgroup fd and potentially opens
a possibility to e.g. attach struct_ops to individual tasks and
cgroups, but I'm fine with both options.

Also, as Song pointed out, fd==0 is in theory a valid target, so instead of
using the "if (fd) {...}" check we might need a new flag. Idk if it
really makes sense to complicate the code for it.

Can we, please, decide on what's best here?

Re: bpf_st_ops and cgroups. Was: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Alexei Starovoitov 3 months, 1 week ago

On Thu, Oct 30, 2025 at 4:24 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>
> > On Thu, Oct 30, 2025 at 12:06 PM Roman Gushchin
> > <roman.gushchin@linux.dev> wrote:
> >>
> >> Ok, let me summarize the options we discussed here:
> >>
> >> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
> >> itself. The attachment is happening at the reg() time.
> >>
> >>   +: It's convenient for complex stateful struct ops'es, because a
> >>       single entity represents a combination of code and data.
> >>   -: No way to attach a single struct ops to multiple entities.
> >>
> >> This approach is used by Tejun for per-cgroup sched_ext prototype.
> >
> > It's wrong. It should adopt bpf_struct_ops_link_create() approach
> > and use attr->link_create.cgroup.relative_fd to attach.
>
> This is basically what I have in v2, but Andrii and Song suggested that
> I should use attr->link_create.target_fd instead.

Yes. Of course.
link_create.cgroup.relative_fd actually points to a program.
We will need it if/when we add support for mprog style attach.

> I have a slight preference towards attr->link_create.cgroup.relative_fd
> because it makes it clear that fd is a cgroup fd and potentially opens
> a possibility to e.g. attach struct_ops to individual tasks and
> cgroups, but I'm fine with both options.

yeah. The name is confusing. It's not a cgroup fd.

> Also, as Song pointed out, fd==0 is in theory a valid target, so instead of
> using the "if (fd) {...}" check we might need a new flag. Idk if it
> really makes sense to complicate the code for it.

One option is to cgroup_get_from_fd(attr->link_create.target_fd)
and if it's not a cgroup, just ignore it in bpf_struct_ops_link_create()
But a new flag like BPF_F_CGROUP_FD maybe cleaner ?
If we ever attach st_ops to tasks there will be another BPF_F_PID_FD flag ?
Or we may try different supported kinds like bpf_fd_probe_obj() does
and don't bother with flags.

New attach_type-s are not necessary. The type of st_ops itself
reflects the purpose and hook location.

Re: bpf_st_ops and cgroups. Was: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Song Liu 3 months, 1 week ago

On Thu, Oct 30, 2025 at 4:24 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>
> > On Thu, Oct 30, 2025 at 12:06 PM Roman Gushchin
> > <roman.gushchin@linux.dev> wrote:
> >>
> >> Ok, let me summarize the options we discussed here:
> >>
> >> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
> >> itself. The attachment is happening at the reg() time.
> >>
> >>   +: It's convenient for complex stateful struct ops'es, because a
> >>       single entity represents a combination of code and data.
> >>   -: No way to attach a single struct ops to multiple entities.
> >>
> >> This approach is used by Tejun for per-cgroup sched_ext prototype.
> >
> > It's wrong. It should adopt bpf_struct_ops_link_create() approach
> > and use attr->link_create.cgroup.relative_fd to attach.
>
> This is basically what I have in v2, but Andrii and Song suggested that
> I should use attr->link_create.target_fd instead.
>
> I have a slight preference towards attr->link_create.cgroup.relative_fd
> because it makes it clear that fd is a cgroup fd and potentially opens
> a possibility to e.g. attach struct_ops to individual tasks and
> cgroups, but I'm fine with both options.

relative_fd and relative_id have specific meaning. When multiple
programs are attached to the same object (cgroup, socket, etc.),
relative_fd and relative_id (together with BPF_F_BEFORE and
BPF_F_AFTER) are used to specify the order of execution.

>
> Also, as Song pointed out, fd==0 is in theory a valid target, so instead of
> using the "if (fd) {...}" check we might need a new flag. Idk if it
> really makes sense to complicate the code for it.
>
> Can we, please, decide on what's best here?

How about we add a new attach_type BPF_STRUCT_OPS_CGROUP?

Thanks,
Song

Re: bpf_st_ops and cgroups. Was: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Yafang Shao 3 months, 1 week ago

On Fri, Oct 31, 2025 at 2:14 PM Song Liu <song@kernel.org> wrote:
>
> On Thu, Oct 30, 2025 at 4:24 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >
> > > On Thu, Oct 30, 2025 at 12:06 PM Roman Gushchin
> > > <roman.gushchin@linux.dev> wrote:
> > >>
> > >> Ok, let me summarize the options we discussed here:
> > >>
> > >> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
> > >> itself. The attachment is happening at the reg() time.
> > >>
> > >>   +: It's convenient for complex stateful struct ops'es, because a
> > >>       single entity represents a combination of code and data.
> > >>   -: No way to attach a single struct ops to multiple entities.
> > >>
> > >> This approach is used by Tejun for per-cgroup sched_ext prototype.
> > >
> > > It's wrong. It should adopt bpf_struct_ops_link_create() approach
> > > and use attr->link_create.cgroup.relative_fd to attach.
> >
> > This is basically what I have in v2, but Andrii and Song suggested that
> > I should use attr->link_create.target_fd instead.
> >
> > I have a slight preference towards attr->link_create.cgroup.relative_fd
> > because it makes it clear that fd is a cgroup fd and potentially opens
> > a possibility to e.g. attach struct_ops to individual tasks and
> > cgroups, but I'm fine with both options.
>
> relative_fd and relative_id have specific meaning. When multiple
> programs are attached to the same object (cgroup, socket, etc.),
> relative_fd and relative_id (together with BPF_F_BEFORE and
> BPF_F_AFTER) are used to specify the order of execution.
>
> >
> > Also, as Song pointed out, fd==0 is in theory a valid target, so instead of
> > using the "if (fd) {...}" check we might need a new flag. Idk if it
> > really makes sense to complicate the code for it.
> >
> > Can we, please, decide on what's best here?
>
> How about we add a new attach_type BPF_STRUCT_OPS_CGROUP?

I'm concerned that defining a unique BPF_STRUCT_OPS_XXX for each type
might lead to a maintainability challenge. To keep the design clean
and forward-looking, we might want to consider a more generic
abstraction that could easily accommodate other kernel structures
(task_struct, mm_struct, etc.) without duplication.

-- 
Regards
Yafang

Re: bpf_st_ops and cgroups. Was: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Yafang Shao 3 months, 1 week ago

On Fri, Oct 31, 2025 at 7:30 AM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>
> > On Thu, Oct 30, 2025 at 12:06 PM Roman Gushchin
> > <roman.gushchin@linux.dev> wrote:
> >>
> >> Ok, let me summarize the options we discussed here:
> >>
> >> 1) Make the attachment details (e.g. cgroup_id) the part of struct ops
> >> itself. The attachment is happening at the reg() time.
> >>
> >>   +: It's convenient for complex stateful struct ops'es, because a
> >>       single entity represents a combination of code and data.
> >>   -: No way to attach a single struct ops to multiple entities.
> >>
> >> This approach is used by Tejun for per-cgroup sched_ext prototype.
> >
> > It's wrong. It should adopt bpf_struct_ops_link_create() approach
> > and use attr->link_create.cgroup.relative_fd to attach.
>
> This is basically what I have in v2, but Andrii and Song suggested that
> I should use attr->link_create.target_fd instead.
>
> I have a slight preference towards attr->link_create.cgroup.relative_fd
> because it makes it clear that fd is a cgroup fd and potentially opens
> a possibility to e.g. attach struct_ops to individual tasks and
> cgroups, but I'm fine with both options.
>
> Also, as Song pointed out, fd==0 is in theory a valid target, so instead of
> using the "if (fd) {...}" check we might need a new flag.

I recall that Linus has reminded the BPF subsystem not to use `if
(fd)` to check for a valid fd. We should avoid repeating this mistake.
The proper solution is to add a new flag to indicate whether a fd is
valid.

> Idk if it
> really makes sense to complicate the code for it.
>
> Can we, please, decide on what's best here?
>

It seems the only way for us to learn is through practice—even if that
means making mistakes first ;-)

I can imagine a key benefit of a single struct-ops-to-multiple-cgroups
model is the ability to pre-load all required policies. This allows
users the flexibility to attach them on demand, while completely
avoiding the complex lifecycle management of individual links—a major
practical pain point.

-- 
Regards
Yafang

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

Song Liu <song@kernel.org> writes:

> On Mon, Oct 27, 2025 at 4:17 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> [...]
>>  struct bpf_struct_ops_value {
>>         struct bpf_struct_ops_common_value common;
>> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
>>         }
>>         bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
>>                       attr->link_create.attach_type);
>> +#ifdef CONFIG_CGROUPS
>> +       if (attr->link_create.cgroup.relative_fd) {
>> +               struct cgroup *cgrp;
>> +
>> +               cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
>
> We should use "target_fd" here, not relative_fd.

Ok, thanks!

>
> Also, 0 is a valid fd, so we cannot use target_fd == 0 to attach to
> global memcg.

Yep, switching to using root_memcg's fd instead.

Thanks!

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by bot+bpf-ci@kernel.org 3 months, 2 weeks ago

```
commit 31d238f36a0a6a56efa48c847cb776e67ba2f079
Author: Roman Gushchin <roman.gushchin@linux.dev>

bpf: initial support for attaching struct ops to cgroups

When a struct ops is being attached and a bpf link is created, allow to
pass a cgroup fd using bpf attr, so that struct ops can be attached to a
cgroup instead of globally. Attached struct ops doesn't hold a reference
to the cgroup, only preserves cgroup id.

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index eae907218..7205b813e 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1849,6 +1849,7 @@ struct bpf_struct_ops_link {
>  	struct bpf_link link;
>  	struct bpf_map __rcu *map;
>  	wait_queue_head_t wait_hup;
> +	u64 cgroup_id;
>  };
>
>  struct bpf_link_primer {
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 45cc5ee19..58664779a 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -13,6 +13,7 @@
>  #include <linux/btf_ids.h>
>  #include <linux/rcupdate_wait.h>
>  #include <linux/poll.h>
> +#include <linux/cgroup.h>
>
>  struct bpf_struct_ops_value {
>  	struct bpf_struct_ops_common_value common;

[ ... ]

> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
>  	}
>  	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
>  		      attr->link_create.attach_type);
> +#ifdef CONFIG_CGROUPS
> +	if (attr->link_create.cgroup.relative_fd) {
> +		struct cgroup *cgrp;
> +
> +		cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
> +		if (IS_ERR(cgrp))
> +			return PTR_ERR(cgrp);
                        ^^^^^^^^^^^^^^^^^^^^^^

Can this leak the link and map? In bpf_struct_ops_link_create(), the link
is allocated with kzalloc(GFP_USER) and map is obtained with bpf_map_get()
before this point. If cgroup_get_from_fd() fails, the early return skips
the err_out label which does kfree(link) and bpf_map_put(map).

All other error paths in this function use "err = <error>; goto err_out;"
but this one returns directly.

> +
> +		link->cgroup_id = cgroup_id(cgrp);
> +		cgroup_put(cgrp);
> +	}
> +#endif /* CONFIG_CGROUPS */
> +
>  	err = bpf_link_prime(&link->link, &link_primer);
>  	if (err)
>  		goto err_out;


```

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

In-Reply-To-Subject: `bpf: initial support for attaching struct ops to cgroups`
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/18859027430

Re: [PATCH v2 02/23] bpf: initial support for attaching struct ops to cgroups

Posted by Roman Gushchin 3 months, 1 week ago

bot+bpf-ci@kernel.org writes:

> ```
> commit 31d238f36a0a6a56efa48c847cb776e67ba2f079
> Author: Roman Gushchin <roman.gushchin@linux.dev>
>
> bpf: initial support for attaching struct ops to cgroups
>
> When a struct ops is being attached and a bpf link is created, allow to
> pass a cgroup fd using bpf attr, so that struct ops can be attached to a
> cgroup instead of globally. Attached struct ops doesn't hold a reference
> to the cgroup, only preserves cgroup id.
>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index eae907218..7205b813e 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1849,6 +1849,7 @@ struct bpf_struct_ops_link {
>>  	struct bpf_link link;
>>  	struct bpf_map __rcu *map;
>>  	wait_queue_head_t wait_hup;
>> +	u64 cgroup_id;
>>  };
>>
>>  struct bpf_link_primer {
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index 45cc5ee19..58664779a 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -13,6 +13,7 @@
>>  #include <linux/btf_ids.h>
>>  #include <linux/rcupdate_wait.h>
>>  #include <linux/poll.h>
>> +#include <linux/cgroup.h>
>>
>>  struct bpf_struct_ops_value {
>>  	struct bpf_struct_ops_common_value common;
>
> [ ... ]
>
>> @@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
>>  	}
>>  	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
>>  		      attr->link_create.attach_type);
>> +#ifdef CONFIG_CGROUPS
>> +	if (attr->link_create.cgroup.relative_fd) {
>> +		struct cgroup *cgrp;
>> +
>> +		cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
>> +		if (IS_ERR(cgrp))
>> +			return PTR_ERR(cgrp);
>                         ^^^^^^^^^^^^^^^^^^^^^^
>
> Can this leak the link and map? In bpf_struct_ops_link_create(), the link
> is allocated with kzalloc(GFP_USER) and map is obtained with bpf_map_get()
> before this point. If cgroup_get_from_fd() fails, the early return skips
> the err_out label which does kfree(link) and bpf_map_put(map).
>
> All other error paths in this function use "err = <error>; goto err_out;"
> but this one returns directly.

Ok, this is indeed wrong, will fix in the next version.

Thanks!