Introduce a bpf struct ops for implementing custom OOM handling policies.

The struct ops provides the bpf_handle_out_of_memory() callback, which
is expected to return 1 if it was able to free some memory and 0
otherwise.

In the latter case it's guaranteed that the in-kernel OOM killer will
be invoked. Otherwise the kernel also checks the bpf_memory_freed field
of the oom_control structure, which is expected to be set by kfuncs
suitable for releasing memory. It's a safety mechanism which prevents a
bpf program from claiming forward progress without actually releasing
memory. The callback program is sleepable to enable using iterators,
e.g. cgroup iterators.

The callback receives struct oom_control as an argument, so it can
easily filter out OOM's it doesn't want to handle, e.g. global vs
memcg OOM's.

The callback is executed just before the kernel victim task selection
algorithm, so all heuristics and sysctls like panic_on_oom and
sysctl_oom_kill_allocating_task are respected.

The struct ops also has the name field, which allows defining a custom
name for the implemented policy. It's printed in the OOM report in the
oom_policy=<policy> format. "default" is printed if bpf is not used or
the policy name is not specified.
[ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
oom_policy=bpf_test_policy
[ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full)
[ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
[ 112.698167] Call Trace:
[ 112.698177] <TASK>
[ 112.698182] dump_stack_lvl+0x4d/0x70
[ 112.698192] dump_header+0x59/0x1c6
[ 112.698199] oom_kill_process.cold+0x8/0xef
[ 112.698206] bpf_oom_kill_process+0x59/0xb0
[ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313
[ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf
[ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5
[ 112.698240] bpf_handle_oom+0x11a/0x1e0
[ 112.698250] out_of_memory+0xab/0x5c0
[ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110
[ 112.698274] try_charge_memcg+0x4b5/0x7e0
[ 112.698288] charge_memcg+0x2f/0xc0
[ 112.698293] __mem_cgroup_charge+0x30/0xc0
[ 112.698299] do_anonymous_page+0x40f/0xa50
[ 112.698311] __handle_mm_fault+0xbba/0x1140
[ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5
[ 112.698335] handle_mm_fault+0xe6/0x370
[ 112.698343] do_user_addr_fault+0x211/0x6a0
[ 112.698354] exc_page_fault+0x75/0x1d0
[ 112.698363] asm_exc_page_fault+0x26/0x30
[ 112.698366] RIP: 0033:0x7fa97236db00
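
For illustration, a minimal policy handling only memcg OOM's could look
roughly like this (just a sketch, not part of this patch: the section
names follow the usual libbpf struct_ops conventions, the victim
selection and memory release steps are elided, and bpf_oom_kill_process()
is the kfunc visible in the trace above, referenced only in a comment):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* sleepable struct_ops program, so cgroup iterators can be used */
SEC("struct_ops.s/handle_out_of_memory")
int BPF_PROG(test_out_of_memory, struct oom_control *oc)
{
	/* ignore global OOM's, handle only memcg ones */
	if (!oc->memcg)
		return 0;

	/* pick a victim (e.g. with a cgroup iterator) and release memory
	 * via a kfunc which sets oc->bpf_memory_freed, e.g.
	 * bpf_oom_kill_process(); only then does returning 1 count
	 * as forward progress */
	return 0;
}

SEC(".struct_ops.link")
struct bpf_oom_ops test_bpf_oom_ops = {
	.handle_out_of_memory = (void *)test_out_of_memory,
	.name = "bpf_test_policy",
};

char _license[] SEC("license") = "GPL";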
It's possible to load multiple bpf struct ops programs. In the case of
oom, they will be executed one by one in the same order they were
loaded, until one of them returns 1 and bpf_memory_freed is set to 1 -
an indication that the memory was freed. This allows multiple bpf
programs to focus on different types of OOM's - e.g. one program can
handle only memcg OOM's in one memory cgroup. But the filtering is done
in bpf - so it's fully flexible.
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
include/linux/bpf_oom.h | 49 +++++++++++++
include/linux/oom.h | 8 ++
mm/Makefile | 3 +
mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++
mm/oom_kill.c | 22 +++++-
5 files changed, 237 insertions(+), 2 deletions(-)
create mode 100644 include/linux/bpf_oom.h
create mode 100644 mm/bpf_oom.c
diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 000000000000..29cb5ea41d97
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_OOM_H
+#define __BPF_OOM_H
+
+struct bpf_oom;
+struct oom_control;
+
+#define BPF_OOM_NAME_MAX_LEN 64
+
+struct bpf_oom_ops {
+ /**
+ * @handle_out_of_memory: Out of memory bpf handler, called before
+ * the in-kernel OOM killer.
+ * @oc: OOM control structure
+ *
+ * Should return 1 if some memory was freed up, otherwise
+ * the in-kernel OOM killer is invoked.
+ */
+ int (*handle_out_of_memory)(struct oom_control *oc);
+
+ /**
+ * @name: BPF OOM policy name
+ */
+ char name[BPF_OOM_NAME_MAX_LEN];
+
+ /* Private */
+ struct bpf_oom *bpf_oom;
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+/**
+ * @bpf_handle_oom: handle out of memory using bpf programs
+ * @oc: OOM control structure
+ *
+ * Returns true if a bpf oom program was executed, returned 1
+ * and some memory was actually freed.
+ */
+bool bpf_handle_oom(struct oom_control *oc);
+
+#else /* CONFIG_BPF_SYSCALL */
+static inline bool bpf_handle_oom(struct oom_control *oc)
+{
+ return false;
+}
+
+#endif /* CONFIG_BPF_SYSCALL */
+
+#endif /* __BPF_OOM_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1e0fc6931ce9..ef453309b7ea 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -51,6 +51,14 @@ struct oom_control {
/* Used to print the constraint info. */
enum oom_constraint constraint;
+
+#ifdef CONFIG_BPF_SYSCALL
+ /* Used by the bpf oom implementation to mark the forward progress */
+ bool bpf_memory_freed;
+
+ /* Policy name */
+ const char *bpf_policy_name;
+#endif
};
extern struct mutex oom_lock;
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..a714aba03759 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
+ifdef CONFIG_BPF_SYSCALL
+obj-y += bpf_oom.o
+endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
new file mode 100644
index 000000000000..47633046819c
--- /dev/null
+++ b/mm/bpf_oom.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BPF-driven OOM killer customization
+ *
+ * Author: Roman Gushchin <roman.gushchin@linux.dev>
+ */
+
+#include <linux/bpf.h>
+#include <linux/oom.h>
+#include <linux/bpf_oom.h>
+#include <linux/srcu.h>
+
+DEFINE_STATIC_SRCU(bpf_oom_srcu);
+static DEFINE_SPINLOCK(bpf_oom_lock);
+static LIST_HEAD(bpf_oom_handlers);
+
+struct bpf_oom {
+ struct bpf_oom_ops *ops;
+ struct list_head node;
+ struct srcu_struct srcu;
+};
+
+bool bpf_handle_oom(struct oom_control *oc)
+{
+ struct bpf_oom_ops *ops;
+ struct bpf_oom *bpf_oom;
+ int list_idx, idx, ret = 0;
+
+ oc->bpf_memory_freed = false;
+
+ list_idx = srcu_read_lock(&bpf_oom_srcu);
+ list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) {
+ ops = READ_ONCE(bpf_oom->ops);
+ if (!ops || !ops->handle_out_of_memory)
+ continue;
+ idx = srcu_read_lock(&bpf_oom->srcu);
+ oc->bpf_policy_name = ops->name[0] ? &ops->name[0] :
+ "bpf_defined_policy";
+ ret = ops->handle_out_of_memory(oc);
+ oc->bpf_policy_name = NULL;
+ srcu_read_unlock(&bpf_oom->srcu, idx);
+
+ if (ret && oc->bpf_memory_freed)
+ break;
+ }
+ srcu_read_unlock(&bpf_oom_srcu, list_idx);
+
+ return ret && oc->bpf_memory_freed;
+}
+
+static int __handle_out_of_memory(struct oom_control *oc)
+{
+ return 0;
+}
+
+static struct bpf_oom_ops __bpf_oom_ops = {
+ .handle_out_of_memory = __handle_out_of_memory,
+};
+
+static const struct bpf_func_proto *
+bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return tracing_prog_func_proto(func_id, prog);
+}
+
+static bool bpf_oom_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_verifier_ops bpf_oom_verifier_ops = {
+ .get_func_proto = bpf_oom_func_proto,
+ .is_valid_access = bpf_oom_ops_is_valid_access,
+};
+
+static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_oom_ops *ops = kdata;
+ struct bpf_oom *bpf_oom;
+ int ret;
+
+ bpf_oom = kmalloc(sizeof(*bpf_oom), GFP_KERNEL_ACCOUNT);
+ if (!bpf_oom)
+ return -ENOMEM;
+
+ ret = init_srcu_struct(&bpf_oom->srcu);
+ if (ret) {
+ kfree(bpf_oom);
+ return ret;
+ }
+
+ WRITE_ONCE(bpf_oom->ops, ops);
+ ops->bpf_oom = bpf_oom;
+
+ spin_lock(&bpf_oom_lock);
+ list_add_rcu(&bpf_oom->node, &bpf_oom_handlers);
+ spin_unlock(&bpf_oom_lock);
+
+ return 0;
+}
+
+static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_oom_ops *ops = kdata;
+ struct bpf_oom *bpf_oom = ops->bpf_oom;
+
+ WRITE_ONCE(bpf_oom->ops, NULL);
+
+ spin_lock(&bpf_oom_lock);
+ list_del_rcu(&bpf_oom->node);
+ spin_unlock(&bpf_oom_lock);
+
+ synchronize_srcu(&bpf_oom->srcu);
+
+ kfree(bpf_oom);
+}
+
+static int bpf_oom_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct bpf_oom_ops *uops = (const struct bpf_oom_ops *)udata;
+ struct bpf_oom_ops *ops = (struct bpf_oom_ops *)kdata;
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_oom_ops, name):
+ strscpy_pad(ops->name, uops->name, sizeof(ops->name));
+ return 1;
+ }
+ return 0;
+}
+
+static int bpf_oom_ops_init(struct btf *btf)
+{
+ return 0;
+}
+
+static struct bpf_struct_ops bpf_oom_bpf_ops = {
+ .verifier_ops = &bpf_oom_verifier_ops,
+ .reg = bpf_oom_ops_reg,
+ .unreg = bpf_oom_ops_unreg,
+ .init_member = bpf_oom_ops_init_member,
+ .init = bpf_oom_ops_init,
+ .name = "bpf_oom_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_oom_ops
+};
+
+static int __init bpf_oom_struct_ops_init(void)
+{
+ return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops);
+}
+late_initcall(bpf_oom_struct_ops_init);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25923cfec9c6..ad7bd65061d6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,7 @@
#include <linux/mmu_notifier.h>
#include <linux/cred.h>
#include <linux/nmi.h>
+#include <linux/bpf_oom.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = {
[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
+static const char *oom_policy_name(struct oom_control *oc)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ if (oc->bpf_policy_name)
+ return oc->bpf_policy_name;
+#endif
+ return "default";
+}
+
/*
* Determine the type of allocation constraint.
*/
@@ -458,9 +468,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim)
static void dump_header(struct oom_control *oc)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\noom_policy=%s\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
- current->signal->oom_score_adj);
+ current->signal->oom_score_adj,
+ oom_policy_name(oc));
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
@@ -1161,6 +1172,13 @@ bool out_of_memory(struct oom_control *oc)
return true;
}
+ /*
+ * Let bpf handle the OOM first. If it was able to free up some memory,
+ * bail out. Otherwise fall back to the kernel OOM killer.
+ */
+ if (bpf_handle_oom(oc))
+ return true;
+
select_bad_process(oc);
/* Found nothing?!?! */
if (!oc->chosen) {
--
2.50.1
On 8/18/25 10:01 AM, Roman Gushchin wrote:
> Introduce a bpf struct ops for implementing custom OOM handling policies.
>
> [...]
>
> +struct bpf_oom_ops {
> + /**
> + * @handle_out_of_memory: Out of memory bpf handler, called before
> + * the in-kernel OOM killer.
> + * @oc: OOM control structure
> + *
> + * Should return 1 if some memory was freed up, otherwise
> + * the in-kernel OOM killer is invoked.
> + */
> + int (*handle_out_of_memory)(struct oom_control *oc);
I suggest adding "struct bpf_oom *" as the first argument to all
bpf_oom_ops to future-proof. It will allow a bpf_oom kfunc or prog to
refer to the struct_ops instance itself.

Since bpf_oom_ops allows multiple attachments, if a bpf_prog is shared
between two bpf_oom, it will be able to infer which bpf_oom_ops is
calling it from this extra argument.
> [...]
On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> [...]
>
> It's possible to load multiple bpf struct ops programs. In the case of
> oom, they will be executed one by one in the same order they were
> loaded, until one of them returns 1 and bpf_memory_freed is set to 1 -
> an indication that the memory was freed. This allows multiple bpf
> programs to focus on different types of OOM's - e.g. one program can
> handle only memcg OOM's in one memory cgroup. But the filtering is done
> in bpf - so it's fully flexible.

I think a natural question here is ordering. Is this ability to have
multiple OOM programs critical right now?

How is it decided who gets to run before the other? Is it based on
order of attachment (which can be non-deterministic)?
There was a lot of discussion on something similar for tc progs, and we
went with specific flags that capture partial ordering constraints
(instead of priorities that may collide).
https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net
It would be nice if we can find a way of making this consistent.

Another option is to exclude the multiple attachment bit from the
initial version and do this as a follow up, since it probably requires
more discussion.

> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
> [...]
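
For reference, tcx expresses those partial ordering constraints at
attach time with flags plus a relative prog fd; a minimal libbpf sketch
(ifindex and other_prog_fd are placeholders):

	/* attach prog so that it runs before the program at other_prog_fd */
	LIBBPF_OPTS(bpf_tcx_opts, opts,
		.flags = BPF_F_BEFORE,
		.relative_fd = other_prog_fd);

	struct bpf_link *link = bpf_program__attach_tcx(prog, ifindex, &opts);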
Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:

> On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> [...]
>>
>> It's possible to load multiple bpf struct ops programs. In the case of
>> oom, they will be executed one by one in the same order they were
>> loaded, until one of them returns 1 and bpf_memory_freed is set to 1 -
>> an indication that the memory was freed.
>
> I think a natural question here is ordering. Is this ability to have
> multiple OOM programs critical right now?

Good question.
Initially I had only supported a single bpf policy. But then I realized
that likely people would want to have different policies handling
different parts of the cgroup tree. E.g. a global policy and several
policies handling OOMs only in some memory cgroups. So having just a
single policy is likely a no go.

> How is it decided who gets to run before the other? Is it based on
> order of attachment (which can be non-deterministic)?

Yeah, now it's the order of attachment.

> There was a lot of discussion on something similar for tc progs, and
> we went with specific flags that capture partial ordering constraints
> (instead of priorities that may collide).
> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net
> It would be nice if we can find a way of making this consistent.

I'll take a look, thanks!

I hope that my naive approach might be good enough for the start and we
can implement something more sophisticated later, but maybe I'm wrong
here.
On 8/20/25 5:24 PM, Roman Gushchin wrote:
>> How is it decided who gets to run before the other? Is it based on
>> order of attachment (which can be non-deterministic)?
> Yeah, now it's the order of attachment.
>
>> There was a lot of discussion on something similar for tc progs, and
>> we went with specific flags that capture partial ordering constraints
>> (instead of priorities that may collide).
>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net
>> It would be nice if we can find a way of making this consistent.

+1

The cgroup bpf prog has recently added the mprog api support also. If
the simple order of attachment is not enough and needs to have specific
ordering, we should make the bpf struct_ops support the same mprog api
instead of asking each subsystem to create its own.

fyi, another need for struct_ops ordering is to upgrade the
BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the
future. Slide 13 in
https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view
Martin KaFai Lau <martin.lau@linux.dev> writes:

> On 8/20/25 5:24 PM, Roman Gushchin wrote:
>>> How is it decided who gets to run before the other? Is it based on
>>> order of attachment (which can be non-deterministic)?
>> Yeah, now it's the order of attachment.
>
> +1
>
> The cgroup bpf prog has recently added the mprog api support also. If
> the simple order of attachment is not enough and needs to have
> specific ordering, we should make the bpf struct_ops support the same
> mprog api instead of asking each subsystem to create its own.
>
> fyi, another need for struct_ops ordering is to upgrade the
> BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the
> future. Slide 13 in
> https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view

Does it mean it's better now to keep it simple in the context of oom
patches, with the plan to later reuse the generic struct_ops
infrastructure?

Honestly, I believe that the simple order of attachment should be good
enough for quite a while, so I'd not over-complicate this, unless it's
not fixable later.

Thanks!
On 8/25/25 10:00 AM, Roman Gushchin wrote:
> Does it mean it's better now to keep it simple in the context of oom
> patches, with the plan to later reuse the generic struct_ops
> infrastructure?
>
> Honestly, I believe that the simple order of attachment should be good
> enough for quite a while, so I'd not over-complicate this, unless it's
> not fixable later.

I think the simple attachment ordering is fine. Presumably the current
link list in patch 1 can be replaced by the mprog in the future. Other
experts can chime in if I have missed things.

Once it needs to have an ordering api in the future, it should probably
stay with mprog instead of each subsystem creating its own. The
inspection tool (likely a subcmd in bpftool) can also be created to
inspect the struct_ops order of a subsystem.
On Tue, Aug 26, 2025 at 11:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 8/25/25 10:00 AM, Roman Gushchin wrote:
> > Martin KaFai Lau <martin.lau@linux.dev> writes:
> >
> >> On 8/20/25 5:24 PM, Roman Gushchin wrote:
> >>>> How is it decided who gets to run before the other? Is it based on
> >>>> order of attachment (which can be non-deterministic)?
> >>> Yeah, now it's the order of attachment.
> >>>
> >>>> There was a lot of discussion on something similar for tc progs, and
> >>>> we went with specific flags that capture partial ordering constraints
> >>>> (instead of priorities that may collide).
> >>>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net
> >>>> It would be nice if we can find a way of making this consistent.
> >>
> >> +1
> >>
> >> The cgroup bpf prog has recently added the mprog api support also. If
> >> the simple order of attachment is not enough and needs to have
> >> specific ordering, we should make the bpf struct_ops support the same
> >> mprog api instead of asking each subsystem creating its own.
> >>
> >> fyi, another need for struct_ops ordering is to upgrade the
> >> BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the
> >> future. Slide 13 in
> >> https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view
> >
> > Does it mean it's better now to keep it simple in the context of oom
> > patches with the plan to later reuse the generic struct_ops
> > infrastructure?
> >
> > Honestly, I believe that the simple order of attachment should be
> > good enough for quite a while, so I'd not over-complicate this,
> > unless it's not fixable later.
>
> I think the simple attachment ordering is fine. Presumably the current link list
> in patch 1 can be replaced by the mprog in the future. Other experts can chime
> in if I have missed things.
I don't think the proposed approach of:
list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) {
is extensible without breaking things.
Sooner or later people will want bpf-oom handlers to be per
container, so we have to think upfront how to do it.
I would start with one bpf-oom prog per memcg and extend with mprog later.
Effectively placing 'struct bpf_oom_ops *' into oc->memcg,
and having one global bpf_oom_ops when oc->memcg == NULL.
I'm sure other designs are possible, but let's make sure container scope
is designed from the beginning.
mprog-like multi prog behavior per container can be added later.
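
Something along these lines, just as a sketch (memcg->bpf_oom_ops and
global_bpf_oom_ops are hypothetical, nothing like this exists in the
patch yet):

	/* pick the policy attached to the oom'ing memcg, walking up the
	 * hierarchy; fall back to the global policy for global OOMs */
	static struct bpf_oom_ops *bpf_oom_ops_for_oc(struct oom_control *oc)
	{
		struct mem_cgroup *memcg;

		for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
			struct bpf_oom_ops *ops = READ_ONCE(memcg->bpf_oom_ops);

			if (ops)
				return ops;
		}
		return READ_ONCE(global_bpf_oom_ops);
	}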
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> On Tue, Aug 26, 2025 at 11:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> [...]
>
> I don't think the proposed approach of:
> list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) {
> is extensible without breaking things.
> Sooner or later people will want bpf-oom handlers to be per
> container, so we have to think upfront how to do it.
> I would start with one bpf-oom prog per memcg and extend with mprog later.
> Effectively placing 'struct bpf_oom_ops *' into oc->memcg,
> and having one global bpf_oom_ops when oc->memcg == NULL.
> I'm sure other designs are possible, but lets make sure container scope
> is designed from the beginning.
> mprog-like multi prog behavior per container can be added later.
Btw, what's the right way to attach struct ops to a cgroup, if there is
one? Add a cgroup_id field to the struct and use it in the .reg()
callback? Or there is something better?
Thanks
Hello, Roman. How are you?

On Tue, Sep 02, 2025 at 10:31:33AM -0700, Roman Gushchin wrote:
...
> Btw, what's the right way to attach struct ops to a cgroup, if there is
> one? Add a cgroup_id field to the struct and use it in the .reg()
> callback? Or there is something better?

So, I'm trying to do something similar with sched_ext. Right now, I only
have a very rough prototype (I can attach multiple schedulers with
warnings and they even can schedule for several seconds before melting
down). However, the basic pieces may still be useful. The branch is:

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git scx-hier-prototype

There are several pieces:

- cgroup recently grew lifetime notifiers that you can hook into to
  receive on/offline events. This is useful for initializing per-cgroup
  fields and cleaning up when a cgroup dies:

  https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5469

- I'm passing in cgroup_id as an optional field in struct_ops and then,
  in the enable path, looking up the matching cgroup, verifying it can
  attach there and inserting and updating data structures accordingly:

  https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280

- I wanted to be able to group BPF programs together so that the related
  BPF timers, tracing progs and so on can call sched_ext kfuncs to
  operate on the associated scheduler instance. This currently isn't
  possible, so I'm using a really silly hack. I'm hoping we'd be able to
  get something better in the future:

  https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/commit/?h=scx-hier-prototype&id=b459b1f967fe1767783360761042cd36a1a5f2d6

Thanks.

--
tejun
Tejun Heo <tj@kernel.org> writes:
> Hello, Roman. How are you?
Hi Tejun! Thank you for the links...
>
> On Tue, Sep 02, 2025 at 10:31:33AM -0700, Roman Gushchin wrote:
> ...
>> Btw, what's the right way to attach struct ops to a cgroup, if there is
>> one? Add a cgroup_id field to the struct and use it in the .reg()
>> callback? Or there is something better?
>
> So, I'm trying to do something similar with sched_ext. Right now, I only
> have a very rough prototype (I can attach multiple schedulers with warnings
> and they even can schedule for several seconds before melting down).
> However, the basic pieces should may still be useful. The branch is:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git scx-hier-prototype
>
> There are several pieces:
>
> - cgroup recently grew lifetime notifiers that you can hook in there to
> receive on/offline events. This is useful for initializing per-cgroup
> fields and cleaning up when cgroup dies:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5469
This is neat, I might use this for the psi struct ops to give a user a
chance to create new trigger(s) if a new cgroup is created.
>
> - I'm passing in cgroup_id as an optional field in struct_ops and then in
> enable path, look up the matching cgroup, verify it can attach there and
> insert and update data structures accordingly:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280
Yeah, we discussed this option with Martin up in this thread. It doesn't
look like the best possible solution, but maybe it's the best we have at
the moment.
Ideally, I want something like this:
void test_oom(void)
{
	struct test_oom *skel;
	int err, cgroup_fd;

	cgroup_fd = open(...);
	if (cgroup_fd < 0)
		goto cleanup;

	skel = test_oom__open_and_load();
	if (!skel)
		goto cleanup;

	err = test_oom__attach_cgroup(skel, cgroup_fd);
	if (CHECK_FAIL(err))
		goto cleanup;
Hello,
On Wed, Sep 03, 2025 at 04:30:16PM -0700, Roman Gushchin wrote:
...
> > - I'm passing in cgroup_id as an optional field in struct_ops and then in
> > enable path, look up the matching cgroup, verify it can attach there and
> > insert and update data structures accordingly:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280
>
> Yeah, we discussed this option with Martin up in this thread. It doesn't
> look like the best possible solution, but maybe it's the best we have at
> the moment.
>
> Ideally, I want something like this:
>
> void test_oom(void)
> {
> 	struct test_oom *skel;
> 	int err, cgroup_fd;
>
> 	cgroup_fd = open(...);
> 	if (cgroup_fd < 0)
> 		goto cleanup;
>
> 	skel = test_oom__open_and_load();
> 	if (!skel)
> 		goto cleanup;
>
> 	err = test_oom__attach_cgroup(skel, cgroup_fd);
> 	if (CHECK_FAIL(err))
> 		goto cleanup;
Yeah, that'd look better but are there practical differences? The only one I
can think of is fs based permission check but that can be done separately
too.
Thanks.
--
tejun
Tejun Heo <tj@kernel.org> writes:
> Hello,
>
> On Wed, Sep 03, 2025 at 04:30:16PM -0700, Roman Gushchin wrote:
> ...
>> [...]
>
> Yeah, that'd look better but are there practical differences? The only one I
> can think of is fs based permission check but that can be done separately
> too.
The practical difference is that a single struct ops can be attached
to multiple cgroups.
On Thu, Sep 4, 2025 at 7:32 AM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Tejun Heo <tj@kernel.org> writes:
>
> > Hello,
> >
> > On Wed, Sep 03, 2025 at 04:30:16PM -0700, Roman Gushchin wrote:
> > ...
> >> [...]
> >
> > Yeah, that'd look better but are there practical differences? The only one I
> > can think of is fs based permission check but that can be done separately
> > too.
>
> The practical difference is that a single struct ops can be attached
> to multiple cgroups.
+1
Attaching the same scheduler to multiple cgroups also sounds useful.
I feel sched-ext should use cgroup_fd too and do scx_sub_enable() at
attach time instead of load time.
Then scx_sub_disable() can happen at link detach.
Looks more flexible from user pov.
Hello,

On Thu, Sep 04, 2025 at 09:26:45AM -0700, Alexei Starovoitov wrote:
...
> > The practical difference is that a single struct ops can be attached
> > to multiple cgroups.
>
> +1
> Attaching the same scheduler to multiple cgroups also sounds useful.
> I feel sched-ext should use cgroup_fd too and do scx_sub_enable() at
> attach time instead of load time.
> Then scx_sub_disable() can happen at link detach.
> Looks more flexible from user pov.

Nothing wrong with that but I'm not sure that'd have practical
user-noticeable benefits for sched_ext. Also, would it affect how
associated programs can identify which instance they belong to? At least
from sched_ext POV, that's a lot more important than the ability to
attach the same programs in multiple places.

Thanks.

--
tejun
On 9/2/25 10:31 AM, Roman Gushchin wrote:
> Btw, what's the right way to attach struct ops to a cgroup, if there is
> one? Add a cgroup_id field to the struct and use it in the .reg()
Adding a cgroup id/fd field to the struct bpf_oom_ops will make it hard
to attach the same bpf_oom_ops to multiple cgroups.
> callback? Or there is something better?
There is a link_create.target_fd in the "union bpf_attr". The
cgroup_bpf_link_attach() is using it as a cgroup fd. Maybe it can be
used here also. This will limit it to link attach only, meaning the
SEC(".struct_ops.link") is supported but not the older
SEC(".struct_ops"). I think this should be fine.
Martin KaFai Lau <martin.lau@linux.dev> writes:
> On 9/2/25 10:31 AM, Roman Gushchin wrote:
>> Btw, what's the right way to attach struct ops to a cgroup, if there is
>> one? Add a cgroup_id field to the struct and use it in the .reg()
>
> Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to
> attach the same bpf_oom_ops to multiple cgroups.
>
>> callback? Or there is something better?
>
> There is a link_create.target_fd in the "union bpf_attr". The
> cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be
> used here also. This will limit it to link attach only. Meaning the
> SEC(".struct_ops.link") is supported but not the older
> SEC(".struct_ops"). I think this should be fine.
I thought a bit more about it (sorry for the delay):
if we want to be able to attach a single struct ops to multiple cgroups
(and potentially other objects, e.g. sockets), we can't really
use the existing struct ops's bpf_link.
So I guess we need to add a new .attach() function beside .reg()
which will take the existing link and struct bpf_attr as arguments and
return a new bpf_link. And in libbpf we need a corresponding new
bpf_link__attach_cgroup().
Does it sound right?
On 10/3/25 7:00 PM, Roman Gushchin wrote:
> Martin KaFai Lau <martin.lau@linux.dev> writes:
>
>> On 9/2/25 10:31 AM, Roman Gushchin wrote:
>>> Btw, what's the right way to attach struct ops to a cgroup, if there is
>>> one? Add a cgroup_id field to the struct and use it in the .reg()
>>
>> Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to
>> attach the same bpf_oom_ops to multiple cgroups.
>>
>>> callback? Or there is something better?
>>
>> There is a link_create.target_fd in the "union bpf_attr". The
>> cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be
>> used here also. This will limit it to link attach only. Meaning the
>> SEC(".struct_ops.link") is supported but not the older
>> SEC(".struct_ops"). I think this should be fine.
>
> I thought a bit more about it (sorry for the delay):
> if we want to be able to attach a single struct ops to multiple cgroups
> (and potentially other objects, e.g. sockets), we can't really
> use the existing struct ops's bpf_link.
The existing 'struct bpf_struct_ops_link'? yeah, I think it needs to be extended.
>
> So I guess we need to add a new .attach() function beside .reg()
> which will take the existing link and struct bpf_attr as arguments and
> return a new bpf_link. And in libbpf we need a corresponding new
> bpf_link__attach_cgroup().
The target_fd (or cgroup_fd) is in attr. I think it is why the bpf_attr is
needed during link_create (which calls .reg).
Other than link_create, the link_detach and link_update also need to know the
cgroup but the cgroup_fd will not be in the attr. Only link_fd is available.
The cgroup probably needs to be stored in the link. The struct_ops has its
'struct bpf_struct_ops_link'. Potentially a future struct_ops can be attached to
non-cgroup also (e.g. attach to a netns), so maybe adding a 'void *target;' to
the 'struct bpf_struct_ops_link' and pass the attr to .reg(). Note that the link
pointer has already passed to .reg(). Then the subsystem (oom handling here) can
initialize the 'void *target;'.
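
Roughly like this, just a sketch (only the 'void *target' field is the
suggested addition, and the existing fields are abbreviated):

	struct bpf_struct_ops_link {
		struct bpf_link link;
		struct bpf_map __rcu *map;
		/* ... */
		/* new: subsystem-owned attachment target, e.g. a cgroup,
		 * initialized by the subsystem's .reg() from the attr */
		void *target;
	};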
On Fri, Oct 3, 2025 at 7:01 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> [...]
>
> I thought a bit more about it (sorry for the delay):
> if we want to be able to attach a single struct ops to multiple cgroups
> (and potentially other objects, e.g. sockets), we can't really
> use the existing struct ops's bpf_link.
>
> So I guess we need to add a new .attach() function beside .reg()
> which will take the existing link and struct bpf_attr as arguments and
> return a new bpf_link. And in libbpf we need a corresponding new
> bpf_link__attach_cgroup().
>
> Does it sound right?
>
Not really, but I also might be missing some details (I haven't read
the entire thread).
But conceptually, what you describe is not how things work w.r.t. BPF
links and attachment.
You don't attach a link to some hook (e.g., cgroup). You attach either
BPF program or (as in this case) BPF struct_ops map to a hook (i.e.,
cgroup), and get back the BPF link. That BPF link describes that one
attachment of prog/struct_ops to that hook. Each attachment gets its
own BPF link FD.
So, there cannot be bpf_link__attach_cgroup(), but there can be (at
least conceptually) bpf_map__attach_cgroup(), where map is struct_ops
map.
Having said that, we do have bpf_map__attach_struct_ops() already
(it's using BPF_LINK_CREATE command under the hood), and so perhaps
the right way is to have bpf_map__attach_struct_ops_opts() API, which
will accept optional extra attachment parameters which will be passed
into bpf_attr.link_create.struct_ops section of UAPI. That thing can
have target FD, where FD is cgroup/task/whatever we need to specify
attachment target. Just like we do that for BPF program's
BPF_LINK_CREATE, really.
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
> On Fri, Oct 3, 2025 at 7:01 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> [...]
>>
>> I thought a bit more about it (sorry for the delay):
>> if we want to be able to attach a single struct ops to multiple cgroups
>> (and potentially other objects, e.g. sockets), we can't really
>> use the existing struct ops's bpf_link.
>>
>> So I guess we need to add a new .attach() function beside .reg()
>> which will take the existing link and struct bpf_attr as arguments and
>> return a new bpf_link. And in libbpf we need a corresponding new
>> bpf_link__attach_cgroup().
>>
>> Does it sound right?
>>
>
> Not really, but I also might be missing some details (I haven't read
> the entire thread).
>
> But conceptually, what you describe is not how things work w.r.t. BPF
> links and attachment.
>
> You don't attach a link to some hook (e.g., cgroup). You attach either
> BPF program or (as in this case) BPF struct_ops map to a hook (i.e.,
> cgroup), and get back the BPF link. That BPF link describes that one
> attachment of prog/struct_ops to that hook. Each attachment gets its
> own BPF link FD.
>
> So, there cannot be bpf_link__attach_cgroup(), but there can be (at
> least conceptually) bpf_map__attach_cgroup(), where map is struct_ops
> map.
I see...
So basically when a struct ops map is created we have a fd and then
we can attach it (theoretically multiple times) using BPF_LINK_CREATE.
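At the raw syscall level that would be roughly (a sketch: struct_ops
links don't consume link_create.target_fd today, that's the assumed
extension):

union bpf_attr attr = {};
int link_fd;

attr.link_create.map_fd = struct_ops_map_fd;    /* loaded struct_ops map */
attr.link_create.target_fd = cgroup_fd;         /* assumed new meaning */
attr.link_create.attach_type = BPF_STRUCT_OPS;

link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));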
>
> Having said that, we do have bpf_map__attach_struct_ops() already
> (it's using BPF_LINK_CREATE command under the hood), and so perhaps
> the right way is to have bpf_map__attach_struct_ops_opts() API, which
> will accept optional extra attachment parameters which will be passed
> into bpf_attr.link_create.struct_ops section of UAPI. That thing can
> have target FD, where FD is cgroup/task/whatever we need to specify
> attachment target. Just like we do that for BPF program's
> BPF_LINK_CREATE, really.
Yes, this sounds good to me!
Thank you for the clarification.
On Mon, Oct 6, 2025 at 4:52 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Fri, Oct 3, 2025 at 7:01 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >>
> >> Martin KaFai Lau <martin.lau@linux.dev> writes:
> >>
> >> > On 9/2/25 10:31 AM, Roman Gushchin wrote:
> >> >> Btw, what's the right way to attach struct ops to a cgroup, if there is
> >> >> one? Add a cgroup_id field to the struct and use it in the .reg()
> >> >
> >> > Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to
> >> > attach the same bpf_oom_ops to multiple cgroups.
> >> >
> >> >> callback? Or there is something better?
> >> >
> >> > There is a link_create.target_fd in the "union bpf_attr". The
> >> > cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be
> >> > used here also. This will limit it to link attach only. Meaning the
> >> > SEC(".struct_ops.link") is supported but not the older
> >> > SEC(".struct_ops"). I think this should be fine.
> >>
> >> I thought a bit more about it (sorry for the delay):
> >> if we want to be able to attach a single struct ops to multiple cgroups
> >> (and potentially other objects, e.g. sockets), we can't really
> >> use the existing struct ops's bpf_link.
> >>
> >> So I guess we need to add a new .attach() function beside .reg()
> >> which will take the existing link and struct bpf_attr as arguments and
> >> return a new bpf_link. And in libbpf we need a corresponding new
> >> bpf_link__attach_cgroup().
> >>
> >> Does it sound right?
> >>
> >
> > Not really, but I also might be missing some details (I haven't read
> > the entire thread).
> >
> > But conceptually, what you describe is not how things work w.r.t. BPF
> > links and attachment.
> >
> > You don't attach a link to some hook (e.g., cgroup). You attach either
> > BPF program or (as in this case) BPF struct_ops map to a hook (i.e.,
> > cgroup), and get back the BPF link. That BPF link describes that one
> > attachment of prog/struct_ops to that hook. Each attachment gets its
> > own BPF link FD.
> >
> > So, there cannot be bpf_link__attach_cgroup(), but there can be (at
> > least conceptually) bpf_map__attach_cgroup(), where map is struct_ops
> > map.
>
> I see...
> So basically when a struct ops map is created we have a fd and then
> we can attach it (theoretically multiple times) using BPF_LINK_CREATE.
Yes, exactly. The "theoretically" part is true right now because of how
things are wired up internally, but this must be fixable.
>
> >
> > Having said that, we do have bpf_map__attach_struct_ops() already
> > (it's using BPF_LINK_CREATE command under the hood), and so perhaps
> > the right way is to have bpf_map__attach_struct_ops_opts() API, which
> > will accept optional extra attachment parameters which will be passed
> > into bpf_attr.link_create.struct_ops section of UAPI. That thing can
> > have target FD, where FD is cgroup/task/whatever we need to specify
> > attachment target. Just like we do that for BPF program's
> > BPF_LINK_CREATE, really.
>
> Yes, this sounds good to me!
>
> Thank you for the clarification.
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
> On Mon, Oct 6, 2025 at 4:52 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>
>> > On Fri, Oct 3, 2025 at 7:01 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>> >>
>> >> Martin KaFai Lau <martin.lau@linux.dev> writes:
>> >>
>> >> > On 9/2/25 10:31 AM, Roman Gushchin wrote:
>> >> >> Btw, what's the right way to attach struct ops to a cgroup, if there is
>> >> >> one? Add a cgroup_id field to the struct and use it in the .reg()
>> >> >
>> >> > Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to
>> >> > attach the same bpf_oom_ops to multiple cgroups.
>> >> >
>> >> >> callback? Or there is something better?
>> >> >
>> >> > There is a link_create.target_fd in the "union bpf_attr". The
>> >> > cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be
>> >> > used here also. This will limit it to link attach only. Meaning the
>> >> > SEC(".struct_ops.link") is supported but not the older
>> >> > SEC(".struct_ops"). I think this should be fine.
>> >>
>> >> I thought a bit more about it (sorry for the delay):
>> >> if we want to be able to attach a single struct ops to multiple cgroups
>> >> (and potentially other objects, e.g. sockets), we can't really
>> >> use the existing struct ops's bpf_link.
>> >>
>> >> So I guess we need to add a new .attach() function beside .reg()
>> >> which will take the existing link and struct bpf_attr as arguments and
>> >> return a new bpf_link. And in libbpf we need a corresponding new
>> >> bpf_link__attach_cgroup().
>> >>
>> >> Does it sound right?
>> >>
>> >
>> > Not really, but I also might be missing some details (I haven't read
>> > the entire thread).
>> >
>> > But conceptually, what you describe is not how things work w.r.t. BPF
>> > links and attachment.
>> >
>> > You don't attach a link to some hook (e.g., cgroup). You attach either
>> > BPF program or (as in this case) BPF struct_ops map to a hook (i.e.,
>> > cgroup), and get back the BPF link. That BPF link describes that one
>> > attachment of prog/struct_ops to that hook. Each attachment gets its
>> > own BPF link FD.
>> >
>> > So, there cannot be bpf_link__attach_cgroup(), but there can be (at
>> > least conceptually) bpf_map__attach_cgroup(), where map is struct_ops
>> > map.
>>
>> I see...
>> So basically when a struct ops map is created we have a fd and then
>> we can attach it (theoretically multiple times) using BPF_LINK_CREATE.
>
> Yes, exactly. The "theoretically" part is true right now because of how
> things are wired up internally, but this must be fixable.
Ok, one more question: do you think it's better to alter the existing
bpf_struct_ops.reg() callback and add the bpf_attr parameter,
or add a new .attach() callback?
On Mon, Oct 6, 2025 at 5:42 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
[...]
> >> >
> >> > So, there cannot be bpf_link__attach_cgroup(), but there can be (at
> >> > least conceptually) bpf_map__attach_cgroup(), where map is struct_ops
> >> > map.
> >>
> >> I see...
> >> So basically when a struct ops map is created we have a fd and then
> >> we can attach it (theoretically multiple times) using BPF_LINK_CREATE.
> >
> > Yes, exactly. The "theoretically" part is true right now because of how
> > things are wired up internally, but this must be fixable.
>
> Ok, one more question: do you think it's better to alter the existing
> bpf_struct_ops.reg() callback and add the bpf_attr parameter,
> or add a new .attach() callback?

IIUC, bpf_struct_ops_link is just for bpf_struct_ops.reg(). The
attach() operation can be separate, and it doesn't need to be
implemented in the sys_bpf() syscall. BPF TCP congestion control
uses setsockopt() to do the attach(). Current sched_ext does
the attach as part of reg(). Tejun is proposing to use reg() for
sub schedulers [1]. In my earlier patch set for fanotify-bpf, I
was planning to use ioctl on the fanotify fd [2]. I think these
all work for the given use case.

I am not sure what is the best option for the cgroup oom killer. There
are multiple options. Technically, it can even be a sysfs entry.
We can use it as:

# load and pin oom killers first
$ cat /sys/fs/cgroup/user.slice/oom.killer
[oom_a] oom_b oom_c
$ echo oom_b > /sys/fs/cgroup/user.slice/oom.killer
$ cat /sys/fs/cgroup/user.slice/oom.killer
oom_a [oom_b] oom_c

Note that I am not proposing to use sysfs entries for the oom killer.
I just want to say it is an option.

Given attach() can be implemented in different ways, we probably
don't need to add it to bpf_struct_ops. But if that turns out to be
the best option, I would not argue against it. OTOH, I think it is
better to keep reg() and attach() separate, though sched_ext is
using reg() for both operations.

Does this make sense?

Thanks,
Song

[1] https://lore.kernel.org/bpf/20250920005931.2753828-1-tj@kernel.org/
[2] https://lore.kernel.org/bpf/20241114084345.1564165-1-song@kernel.org/
Song Liu <song@kernel.org> writes:

> On Mon, Oct 6, 2025 at 5:42 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> [...]
>> >> >
>> >> > So, there cannot be bpf_link__attach_cgroup(), but there can be (at
>> >> > least conceptually) bpf_map__attach_cgroup(), where map is struct_ops
>> >> > map.
>> >>
>> >> I see...
>> >> So basically when a struct ops map is created we have a fd and then
>> >> we can attach it (theoretically multiple times) using BPF_LINK_CREATE.
>> >
>> > Yes, exactly. The "theoretically" part is true right now because of how
>> > things are wired up internally, but this must be fixable.
>>
>> Ok, one more question: do you think it's better to alter the existing
>> bpf_struct_ops.reg() callback and add the bpf_attr parameter,
>> or add a new .attach() callback?
>
> IIUC, bpf_struct_ops_link is just for bpf_struct_ops.reg(). The
> attach() operation can be separate, and it doesn't need to be
> implemented in the sys_bpf() syscall. BPF TCP congestion control
> uses setsockopt() to do the attach(). Current sched_ext does
> the attach as part of reg(). Tejun is proposing to use reg() for
> sub schedulers [1]. In my earlier patch set for fanotify-bpf, I
> was planning to use ioctl on the fanotify fd [2]. I think these
> all work for the given use case.
>
> I am not sure what is the best option for the cgroup oom killer. There
> are multiple options. Technically, it can even be a sysfs entry.
> We can use it as:
>
> # load and pin oom killers first
> $ cat /sys/fs/cgroup/user.slice/oom.killer
> [oom_a] oom_b oom_c
> $ echo oom_b > /sys/fs/cgroup/user.slice/oom.killer
> $ cat /sys/fs/cgroup/user.slice/oom.killer
> oom_a [oom_b] oom_c

It actually looks nice!
But I expect that most users of bpf_oom won't use it directly,
but through some sort of middleware (e.g. systemd), so Idk if
such a user-oriented interface makes a lot of sense.

> Note that I am not proposing to use sysfs entries for the oom killer.
> I just want to say it is an option.
>
> Given attach() can be implemented in different ways, we probably
> don't need to add it to bpf_struct_ops. But if that turns out to be
> the best option, I would not argue against it. OTOH, I think it is
> better to keep reg() and attach() separate, though sched_ext is
> using reg() for both operations.

I'm leaning towards a similar approach, except that I don't want
to embed cgroup_id into the struct_ops, but keep it in the link,
as Martin suggested. But I need to implement it end-to-end before I can
be sure that it's the best option. Working on it...

>
> Does this make sense?

Yes, thank you for the great summary!
On Tue, Oct 7, 2025 at 7:15 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
[...]
> >
> > I am not sure what is the best option for the cgroup oom killer. There
> > are multiple options. Technically, it can even be a sysfs entry.
> > We can use it as:
> >
> > # load and pin oom killers first
> > $ cat /sys/fs/cgroup/user.slice/oom.killer
> > [oom_a] oom_b oom_c
> > $ echo oom_b > /sys/fs/cgroup/user.slice/oom.killer
> > $ cat /sys/fs/cgroup/user.slice/oom.killer
> > oom_a [oom_b] oom_c
>
> It actually looks nice!
> But I expect that most users of bpf_oom won't use it directly,
> but through some sort of middleware (e.g. systemd), so Idk if
> such a user-oriented interface makes a lot of sense.
>
> > Note that I am not proposing to use sysfs entries for the oom killer.
> > I just want to say it is an option.
> >
> > Given attach() can be implemented in different ways, we probably
> > don't need to add it to bpf_struct_ops. But if that turns out to be
> > the best option, I would not argue against it. OTOH, I think it is
> > better to keep reg() and attach() separate, though sched_ext is
> > using reg() for both operations.
>
> I'm leaning towards a similar approach, except that I don't want
> to embed cgroup_id into the struct_ops, but keep it in the link,
> as Martin suggested. But I need to implement it end-to-end before I can
> be sure that it's the best option. Working on it...

If we add cgroup_id to the link, I guess this means we need the link
(some fd in user space) to hold a reference on the attachment of this
oom struct_ops to this cgroup. Do we also need this link to hold
a reference on the cgroup?

Alternatively, we can have the cgroup hold a reference to this
struct_ops. This way, we don't need a link to hold a reference to the
struct_ops. I think this might be a cleaner design.

Just an idea. If this doesn't make sense, we can revisit this with
the code.

Thanks,
Song
Song Liu <liu.song.linuxdev@gmail.com> writes:

> On Tue, Oct 7, 2025 at 7:15 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> [...]
>> >
>> > I am not sure what is the best option for the cgroup oom killer. There
>> > are multiple options. Technically, it can even be a sysfs entry.
>> > We can use it as:
>> >
>> > # load and pin oom killers first
>> > $ cat /sys/fs/cgroup/user.slice/oom.killer
>> > [oom_a] oom_b oom_c
>> > $ echo oom_b > /sys/fs/cgroup/user.slice/oom.killer
>> > $ cat /sys/fs/cgroup/user.slice/oom.killer
>> > oom_a [oom_b] oom_c
>>
>> It actually looks nice!
>> But I expect that most users of bpf_oom won't use it directly,
>> but through some sort of middleware (e.g. systemd), so Idk if
>> such a user-oriented interface makes a lot of sense.
>>
>> > Note that I am not proposing to use sysfs entries for the oom killer.
>> > I just want to say it is an option.
>> >
>> > Given attach() can be implemented in different ways, we probably
>> > don't need to add it to bpf_struct_ops. But if that turns out to be
>> > the best option, I would not argue against it. OTOH, I think it is
>> > better to keep reg() and attach() separate, though sched_ext is
>> > using reg() for both operations.
>>
>> I'm leaning towards a similar approach, except that I don't want
>> to embed cgroup_id into the struct_ops, but keep it in the link,
>> as Martin suggested. But I need to implement it end-to-end before I can
>> be sure that it's the best option. Working on it...
>
> If we add cgroup_id to the link, I guess this means we need the link
> (some fd in user space) to hold a reference on the attachment of this
> oom struct_ops to this cgroup. Do we also need this link to hold
> a reference on the cgroup?

Not necessarily. I agree that the struct_ops should not hold a
reference to the cgroup, it's better to do the opposite. This is
why the link can have a cgroup_id, not a cgroup pointer.

I think it's similar to Tejun's approach of embedding cgroup_id into
the struct ops, but potentially more flexible.

Thanks!
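To illustrate the idea discussed above (a sketch; the link type and its
fields are made up for this discussion):

struct bpf_oom_link {
        struct bpf_link link;
        struct bpf_oom_ops *ops;
        u64 cgroup_id;  /* identifies the cgroup without pinning it */
};

At OOM time the cgroup_id can be resolved back to a live cgroup (e.g.
via cgroup_get_from_id()), so a dying cgroup isn't kept alive by the
attachment.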
Martin KaFai Lau <martin.lau@linux.dev> writes:
> On 9/2/25 10:31 AM, Roman Gushchin wrote:
>> Btw, what's the right way to attach struct ops to a cgroup, if there is
>> one? Add a cgroup_id field to the struct and use it in the .reg()
>
> Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to
> attach the same bpf_oom_ops to multiple cgroups.
Yeah, this is what I thought too; it doesn't look like an attractive path.
>
>> callback? Or there is something better?
>
> There is a link_create.target_fd in the "union bpf_attr". The
> cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be
> used here also. This will limit it to link attach only. Meaning the
> SEC(".struct_ops.link") is supported but not the older
> SEC(".struct_ops"). I think this should be fine.
I'll take a look, thank you!
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> On Tue, Aug 26, 2025 at 11:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 8/25/25 10:00 AM, Roman Gushchin wrote:
>> > Martin KaFai Lau <martin.lau@linux.dev> writes:
>> >
>> >> On 8/20/25 5:24 PM, Roman Gushchin wrote:
>> >>>> How is it decided who gets to run before the other? Is it based on
>> >>>> order of attachment (which can be non-deterministic)?
>> >>> Yeah, now it's the order of attachment.
>> >>>
>> >>>> There was a lot of discussion on something similar for tc progs, and
>> >>>> we went with specific flags that capture partial ordering constraints
>> >>>> (instead of priorities that may collide).
>> >>>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net
>> >>>> It would be nice if we can find a way of making this consistent.
>> >>
>> >> +1
>> >>
>> >> The cgroup bpf prog has recently added the mprog api support also. If
>> >> the simple order of attachment is not enough and needs to have
>> >> specific ordering, we should make the bpf struct_ops support the same
>> >> mprog api instead of asking each subsystem creating its own.
>> >>
>> >> fyi, another need for struct_ops ordering is to upgrade the
>> >> BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the
>> >> future. Slide 13 in
>> >> https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view
>> >
>> > Does it mean it's better now to keep it simple in the context of oom
>> > patches with the plan to later reuse the generic struct_ops
>> > infrastructure?
>> >
>> > Honestly, I believe that the simple order of attachment should be
>> > good enough for quite a while, so I'd not over-complicate this,
>> > unless it's not fixable later.
>>
>> I think the simple attachment ordering is fine. Presumably the current link list
>> in patch 1 can be replaced by the mprog in the future. Other experts can chime
>> in if I have missed things.
>
> I don't think the proposed approach of:
> list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) {
> is extensible without breaking things.
> Sooner or later people will want bpf-oom handlers to be per
> container, so we have to think upfront how to do it.
> I would start with one bpf-oom prog per memcg and extend with mprog later.
> Effectively placing 'struct bpf_oom_ops *' into oc->memcg,
> and having one global bpf_oom_ops when oc->memcg == NULL.
> I'm sure other designs are possible, but lets make sure container scope
> is designed from the beginning.
> mprog-like multi prog behavior per container can be added later.
Sounds good to me, will implement something like this in the next version.
Thanks!
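For the record, a sketch of the dispatch Alexei describes (assuming a
'bpf_oom_ops' pointer in struct mem_cgroup and a global fallback; all
names are illustrative):

bool bpf_handle_oom(struct oom_control *oc)
{
        struct bpf_oom_ops *ops;

        /* Per-memcg policy if one is attached, global policy otherwise */
        ops = oc->memcg ? READ_ONCE(oc->memcg->bpf_oom_ops) :
                          READ_ONCE(global_bpf_oom_ops);
        if (!ops || !ops->handle_out_of_memory)
                return false;

        oc->bpf_memory_freed = false;
        return ops->handle_out_of_memory(oc) && oc->bpf_memory_freed;
}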
On Thu, 21 Aug 2025 at 02:25, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
>
> > On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >>
> >> Introduce a bpf struct ops for implementing custom OOM handling policies.
> >>
[...]
> >> It's possible to load multiple bpf struct programs. In the case of
> >> oom, they will be executed one by one in the same order they been
> >> loaded until one of them returns 1 and bpf_memory_freed is set to 1
> >> - an indication that the memory was freed. This allows to have
> >> multiple bpf programs to focus on different types of OOM's - e.g.
> >> one program can only handle memcg OOM's in one memory cgroup.
> >> But the filtering is done in bpf - so it's fully flexible.
> >
> > I think a natural question here is ordering. Is this ability to have
> > multiple OOM programs critical right now?
>
> Good question. Initially I had only supported a single bpf policy.
> But then I realized that likely people would want to have different
> policies handling different parts of the cgroup tree.
> E.g. a global policy and several policies handling OOMs only
> in some memory cgroups.
> So having just a single policy is likely a no go.

If the ordering is more to facilitate scoping, would it then be better
to support attaching the policy to specific memcg/cgroup? There is
then one global policy if need be (by attaching to root), but
descendants can have their own which takes precedence; if it doesn't
act, we walk up the hierarchy and find the next handler in the parent
cgroup etc. all the way to the root until one of them returns 1. A
sketch of these semantics follows below.

> [...]
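A sketch of the walk-up semantics described above (assuming a per-memcg
'bpf_oom_ops' pointer; all names are illustrative):

bool bpf_handle_oom(struct oom_control *oc)
{
        struct mem_cgroup *memcg;

        for (memcg = oc->memcg ?: root_mem_cgroup; memcg;
             memcg = parent_mem_cgroup(memcg)) {
                struct bpf_oom_ops *ops = READ_ONCE(memcg->bpf_oom_ops);

                if (ops && ops->handle_out_of_memory(oc) &&
                    oc->bpf_memory_freed)
                        return true;    /* handled, stop walking up */
        }
        return false;   /* fall back to the in-kernel OOM killer */
}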
Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:

> On Thu, 21 Aug 2025 at 02:25, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
>>
>> > On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>> >>
>> >> Introduce a bpf struct ops for implementing custom OOM handling policies.
>> >>
[...]
>> >> It's possible to load multiple bpf struct programs. In the case of
>> >> oom, they will be executed one by one in the same order they been
>> >> loaded until one of them returns 1 and bpf_memory_freed is set to 1
>> >> - an indication that the memory was freed. This allows to have
>> >> multiple bpf programs to focus on different types of OOM's - e.g.
>> >> one program can only handle memcg OOM's in one memory cgroup.
>> >> But the filtering is done in bpf - so it's fully flexible.
>> >
>> > I think a natural question here is ordering. Is this ability to have
>> > multiple OOM programs critical right now?
>>
>> Good question. Initially I had only supported a single bpf policy.
>> But then I realized that likely people would want to have different
>> policies handling different parts of the cgroup tree.
>> E.g. a global policy and several policies handling OOMs only
>> in some memory cgroups.
>> So having just a single policy is likely a no go.
>
> If the ordering is more to facilitate scoping, would it then be better
> to support attaching the policy to specific memcg/cgroup?

Well, it has some advantages and disadvantages. First, it will require
way more infrastructure on the memcg side. Second, the interface is not
super clear: we don't want to have a struct ops per cgroup, I guess.
And in many cases a single policy for all memcgs is just fine, so asking
the user to attach it to all memcgs is just adding toil and creating
all kinds of races.

So I see your point, but I'm not yet convinced, to be honest.

Thanks!
On Wed, Aug 20, 2025 at 7:22 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
>
> > On Thu, 21 Aug 2025 at 02:25, Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >>
> >> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:
> >>
> >> > On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >> >>
> >> >> Introduce a bpf struct ops for implementing custom OOM handling policies.
> >> >>
[...]
> >> >> It's possible to load multiple bpf struct programs. In the case of
> >> >> oom, they will be executed one by one in the same order they been
> >> >> loaded until one of them returns 1 and bpf_memory_freed is set to 1
> >> >> - an indication that the memory was freed. This allows to have
> >> >> multiple bpf programs to focus on different types of OOM's - e.g.
> >> >> one program can only handle memcg OOM's in one memory cgroup.
> >> >> But the filtering is done in bpf - so it's fully flexible.
> >> >
> >> > I think a natural question here is ordering. Is this ability to have
> >> > multiple OOM programs critical right now?
> >>
> >> Good question. Initially I had only supported a single bpf policy.
> >> But then I realized that likely people would want to have different
> >> policies handling different parts of the cgroup tree.
> >> E.g. a global policy and several policies handling OOMs only
> >> in some memory cgroups.
> >> So having just a single policy is likely a no go.
> >
> > If the ordering is more to facilitate scoping, would it then be better
> > to support attaching the policy to specific memcg/cgroup?
>
> Well, it has some advantages and disadvantages. First, it will require
> way more infrastructure on the memcg side. Second, the interface is not
> super clear: we don't want to have a struct ops per cgroup, I guess.
> And in many cases a single policy for all memcgs is just fine, so asking
> the user to attach it to all memcgs is just adding toil and creating
> all kinds of races.
>
> So I see your point, but I'm not yet convinced, to be honest.

I would suggest keeping it simple until we know there is a need to
prioritize between multiple oom-killers.

> Thanks!
On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> Introduce a bpf struct ops for implementing custom OOM handling policies.
>
> The struct ops provides the bpf_handle_out_of_memory() callback,
> which expected to return 1 if it was able to free some memory and 0
> otherwise.
>
> In the latter case it's guaranteed that the in-kernel OOM killer will
> be invoked. Otherwise the kernel also checks the bpf_memory_freed
> field of the oom_control structure, which is expected to be set by
> kfuncs suitable for releasing memory. It's a safety mechanism which
> prevents a bpf program to claim forward progress without actually
> releasing memory. The callback program is sleepable to enable using
> iterators, e.g. cgroup iterators.
>
> The callback receives struct oom_control as an argument, so it can
> easily filter out OOM's it doesn't want to handle, e.g. global vs
> memcg OOM's.
>
> The callback is executed just before the kernel victim task selection
> algorithm, so all heuristics and sysctls like panic on oom,
> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
> are respected.
>
> The struct ops also has the name field, which allows to define a
> custom name for the implemented policy. It's printed in the OOM report
> in the oom_policy=<policy> format. "default" is printed if bpf is not
> used or policy name is not specified.
>
> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
> oom_policy=bpf_test_policy
> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full)
> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
> [ 112.698167] Call Trace:
> [ 112.698177] <TASK>
> [ 112.698182] dump_stack_lvl+0x4d/0x70
> [ 112.698192] dump_header+0x59/0x1c6
> [ 112.698199] oom_kill_process.cold+0x8/0xef
> [ 112.698206] bpf_oom_kill_process+0x59/0xb0
> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313
> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf
> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5
> [ 112.698240] bpf_handle_oom+0x11a/0x1e0
> [ 112.698250] out_of_memory+0xab/0x5c0
> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110
> [ 112.698274] try_charge_memcg+0x4b5/0x7e0
> [ 112.698288] charge_memcg+0x2f/0xc0
> [ 112.698293] __mem_cgroup_charge+0x30/0xc0
> [ 112.698299] do_anonymous_page+0x40f/0xa50
> [ 112.698311] __handle_mm_fault+0xbba/0x1140
> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5
> [ 112.698335] handle_mm_fault+0xe6/0x370
> [ 112.698343] do_user_addr_fault+0x211/0x6a0
> [ 112.698354] exc_page_fault+0x75/0x1d0
> [ 112.698363] asm_exc_page_fault+0x26/0x30
> [ 112.698366] RIP: 0033:0x7fa97236db00
>
> It's possible to load multiple bpf struct programs. In the case of
> oom, they will be executed one by one in the same order they been
> loaded until one of them returns 1 and bpf_memory_freed is set to 1
> - an indication that the memory was freed. This allows to have
> multiple bpf programs to focus on different types of OOM's - e.g.
> one program can only handle memcg OOM's in one memory cgroup.
> But the filtering is done in bpf - so it's fully flexible.
>
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
> include/linux/bpf_oom.h | 49 +++++++++++++
> include/linux/oom.h | 8 ++
> mm/Makefile | 3 +
> mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++
> mm/oom_kill.c | 22 +++++-
> 5 files changed, 237 insertions(+), 2 deletions(-)
> create mode 100644 include/linux/bpf_oom.h
> create mode 100644 mm/bpf_oom.c
>
> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
> new file mode 100644
> index 000000000000..29cb5ea41d97
> --- /dev/null
> +++ b/include/linux/bpf_oom.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +
> +#ifndef __BPF_OOM_H
> +#define __BPF_OOM_H
> +
> +struct bpf_oom;
> +struct oom_control;
> +
> +#define BPF_OOM_NAME_MAX_LEN 64
> +
> +struct bpf_oom_ops {
> + /**
> + * @handle_out_of_memory: Out of memory bpf handler, called before
> + * the in-kernel OOM killer.
> + * @oc: OOM control structure
> + *
> + * Should return 1 if some memory was freed up, otherwise
> + * the in-kernel OOM killer is invoked.
> + */
> + int (*handle_out_of_memory)(struct oom_control *oc);
> +
> + /**
> + * @name: BPF OOM policy name
> + */
> + char name[BPF_OOM_NAME_MAX_LEN];
Why should the name be a part of the ops structure? IMO it's not an
attribute of the operations but rather of the oom handler, which is
represented by bpf_oom here.
> +
> + /* Private */
> + struct bpf_oom *bpf_oom;
> +};
> +
> +#ifdef CONFIG_BPF_SYSCALL
> +/**
> + * @bpf_handle_oom: handle out of memory using bpf programs
> + * @oc: OOM control structure
> + *
> + * Returns true if a bpf oom program was executed, returned 1
> + * and some memory was actually freed.
The above comment is unclear, please clarify.
> + */
> +bool bpf_handle_oom(struct oom_control *oc);
> +
> +#else /* CONFIG_BPF_SYSCALL */
> +static inline bool bpf_handle_oom(struct oom_control *oc)
> +{
> + return false;
> +}
> +
> +#endif /* CONFIG_BPF_SYSCALL */
> +
> +#endif /* __BPF_OOM_H */
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 1e0fc6931ce9..ef453309b7ea 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -51,6 +51,14 @@ struct oom_control {
>
> /* Used to print the constraint info. */
> enum oom_constraint constraint;
> +
> +#ifdef CONFIG_BPF_SYSCALL
> + /* Used by the bpf oom implementation to mark the forward progress */
> + bool bpf_memory_freed;
> +
> + /* Policy name */
> + const char *bpf_policy_name;
> +#endif
> };
>
> extern struct mutex oom_lock;
> diff --git a/mm/Makefile b/mm/Makefile
> index 1a7a11d4933d..a714aba03759 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
> ifdef CONFIG_SWAP
> obj-$(CONFIG_MEMCG) += swap_cgroup.o
> endif
> +ifdef CONFIG_BPF_SYSCALL
> +obj-y += bpf_oom.o
> +endif
> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
> obj-$(CONFIG_GUP_TEST) += gup_test.o
> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
> new file mode 100644
> index 000000000000..47633046819c
> --- /dev/null
> +++ b/mm/bpf_oom.c
> @@ -0,0 +1,157 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * BPF-driven OOM killer customization
> + *
> + * Author: Roman Gushchin <roman.gushchin@linux.dev>
> + */
> +
> +#include <linux/bpf.h>
> +#include <linux/oom.h>
> +#include <linux/bpf_oom.h>
> +#include <linux/srcu.h>
> +
> +DEFINE_STATIC_SRCU(bpf_oom_srcu);
> +static DEFINE_SPINLOCK(bpf_oom_lock);
> +static LIST_HEAD(bpf_oom_handlers);
> +
> +struct bpf_oom {
Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called
bpf_oom_ops->handler.
> + struct bpf_oom_ops *ops;
> + struct list_head node;
> + struct srcu_struct srcu;
> +};
> +
> +bool bpf_handle_oom(struct oom_control *oc)
> +{
> + struct bpf_oom_ops *ops;
> + struct bpf_oom *bpf_oom;
> + int list_idx, idx, ret = 0;
> +
> + oc->bpf_memory_freed = false;
> +
> + list_idx = srcu_read_lock(&bpf_oom_srcu);
> + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) {
> + ops = READ_ONCE(bpf_oom->ops);
> + if (!ops || !ops->handle_out_of_memory)
> + continue;
> + idx = srcu_read_lock(&bpf_oom->srcu);
> + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] :
> + "bpf_defined_policy";
> + ret = ops->handle_out_of_memory(oc);
> + oc->bpf_policy_name = NULL;
> + srcu_read_unlock(&bpf_oom->srcu, idx);
> +
> + if (ret && oc->bpf_memory_freed)
IIUC ret and oc->bpf_memory_freed seem to reflect the same state:
handler successfully freed some memory. Could you please clarify when
they differ?
> + break;
> + }
> + srcu_read_unlock(&bpf_oom_srcu, list_idx);
> +
> + return ret && oc->bpf_memory_freed;
> +}
> +
> +static int __handle_out_of_memory(struct oom_control *oc)
> +{
> + return 0;
> +}
> +
> +static struct bpf_oom_ops __bpf_oom_ops = {
> + .handle_out_of_memory = __handle_out_of_memory,
> +};
> +
> +static const struct bpf_func_proto *
> +bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> + return tracing_prog_func_proto(func_id, prog);
> +}
> +
> +static bool bpf_oom_ops_is_valid_access(int off, int size,
> + enum bpf_access_type type,
> + const struct bpf_prog *prog,
> + struct bpf_insn_access_aux *info)
> +{
> + return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
> +}
> +
> +static const struct bpf_verifier_ops bpf_oom_verifier_ops = {
> + .get_func_proto = bpf_oom_func_proto,
> + .is_valid_access = bpf_oom_ops_is_valid_access,
> +};
> +
> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_oom_ops *ops = kdata;
> + struct bpf_oom *bpf_oom;
> + int ret;
> +
> + bpf_oom = kmalloc(sizeof(*bpf_oom), GFP_KERNEL_ACCOUNT);
> + if (!bpf_oom)
> + return -ENOMEM;
> +
> + ret = init_srcu_struct(&bpf_oom->srcu);
> + if (ret) {
> + kfree(bpf_oom);
> + return ret;
> + }
> +
> + WRITE_ONCE(bpf_oom->ops, ops);
> + ops->bpf_oom = bpf_oom;
> +
> + spin_lock(&bpf_oom_lock);
> + list_add_rcu(&bpf_oom->node, &bpf_oom_handlers);
> + spin_unlock(&bpf_oom_lock);
> +
> + return 0;
> +}
> +
> +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_oom_ops *ops = kdata;
> + struct bpf_oom *bpf_oom = ops->bpf_oom;
> +
> + WRITE_ONCE(bpf_oom->ops, NULL);
> +
> + spin_lock(&bpf_oom_lock);
> + list_del_rcu(&bpf_oom->node);
> + spin_unlock(&bpf_oom_lock);
> +
> + synchronize_srcu(&bpf_oom->srcu);
> +
> + kfree(bpf_oom);
> +}
> +
> +static int bpf_oom_ops_init_member(const struct btf_type *t,
> + const struct btf_member *member,
> + void *kdata, const void *udata)
> +{
> + const struct bpf_oom_ops *uops = (const struct bpf_oom_ops *)udata;
> + struct bpf_oom_ops *ops = (struct bpf_oom_ops *)kdata;
> + u32 moff = __btf_member_bit_offset(t, member) / 8;
> +
> + switch (moff) {
> + case offsetof(struct bpf_oom_ops, name):
> + strscpy_pad(ops->name, uops->name, sizeof(ops->name));
> + return 1;
> + }
> + return 0;
> +}
> +
> +static int bpf_oom_ops_init(struct btf *btf)
> +{
> + return 0;
> +}
> +
> +static struct bpf_struct_ops bpf_oom_bpf_ops = {
> + .verifier_ops = &bpf_oom_verifier_ops,
> + .reg = bpf_oom_ops_reg,
> + .unreg = bpf_oom_ops_unreg,
> + .init_member = bpf_oom_ops_init_member,
> + .init = bpf_oom_ops_init,
> + .name = "bpf_oom_ops",
> + .owner = THIS_MODULE,
> + .cfi_stubs = &__bpf_oom_ops
> +};
> +
> +static int __init bpf_oom_struct_ops_init(void)
> +{
> + return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops);
> +}
> +late_initcall(bpf_oom_struct_ops_init);
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 25923cfec9c6..ad7bd65061d6 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -45,6 +45,7 @@
> #include <linux/mmu_notifier.h>
> #include <linux/cred.h>
> #include <linux/nmi.h>
> +#include <linux/bpf_oom.h>
>
> #include <asm/tlb.h>
> #include "internal.h"
> @@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = {
> [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
> };
>
> +static const char *oom_policy_name(struct oom_control *oc)
> +{
> +#ifdef CONFIG_BPF_SYSCALL
> + if (oc->bpf_policy_name)
> + return oc->bpf_policy_name;
> +#endif
> + return "default";
> +}
> +
> /*
> * Determine the type of allocation constraint.
> */
> @@ -458,9 +468,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim)
>
> static void dump_header(struct oom_control *oc)
> {
> - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
> + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\noom_policy=%s\n",
> current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
> - current->signal->oom_score_adj);
> + current->signal->oom_score_adj,
> + oom_policy_name(oc));
> if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
> pr_warn("COMPACTION is disabled!!!\n");
>
> @@ -1161,6 +1172,13 @@ bool out_of_memory(struct oom_control *oc)
> return true;
> }
>
> + /*
> + * Let bpf handle the OOM first. If it was able to free up some memory,
> + * bail out. Otherwise fall back to the kernel OOM killer.
> + */
> + if (bpf_handle_oom(oc))
> + return true;
> +
> select_bad_process(oc);
> /* Found nothing?!?! */
> if (!oc->chosen) {
> --
> 2.50.1
>
Suren Baghdasaryan <surenb@google.com> writes:
> On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin
> <roman.gushchin@linux.dev> wrote:
>>
>> Introduce a bpf struct ops for implementing custom OOM handling policies.
>>
>> The struct ops provides the bpf_handle_out_of_memory() callback,
>> which expected to return 1 if it was able to free some memory and 0
>> otherwise.
>>
>> In the latter case it's guaranteed that the in-kernel OOM killer will
>> be invoked. Otherwise the kernel also checks the bpf_memory_freed
>> field of the oom_control structure, which is expected to be set by
>> kfuncs suitable for releasing memory. It's a safety mechanism which
>> prevents a bpf program to claim forward progress without actually
>> releasing memory. The callback program is sleepable to enable using
>> iterators, e.g. cgroup iterators.
>>
>> The callback receives struct oom_control as an argument, so it can
>> easily filter out OOM's it doesn't want to handle, e.g. global vs
>> memcg OOM's.
>>
>> The callback is executed just before the kernel victim task selection
>> algorithm, so all heuristics and sysctls like panic on oom,
>> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
>> are respected.
>>
>> The struct ops also has the name field, which allows to define a
>> custom name for the implemented policy. It's printed in the OOM report
>> in the oom_policy=<policy> format. "default" is printed if bpf is not
>> used or policy name is not specified.
>>
>> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
>> oom_policy=bpf_test_policy
>> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full)
>> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
>> [ 112.698167] Call Trace:
>> [ 112.698177] <TASK>
>> [ 112.698182] dump_stack_lvl+0x4d/0x70
>> [ 112.698192] dump_header+0x59/0x1c6
>> [ 112.698199] oom_kill_process.cold+0x8/0xef
>> [ 112.698206] bpf_oom_kill_process+0x59/0xb0
>> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313
>> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf
>> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5
>> [ 112.698240] bpf_handle_oom+0x11a/0x1e0
>> [ 112.698250] out_of_memory+0xab/0x5c0
>> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110
>> [ 112.698274] try_charge_memcg+0x4b5/0x7e0
>> [ 112.698288] charge_memcg+0x2f/0xc0
>> [ 112.698293] __mem_cgroup_charge+0x30/0xc0
>> [ 112.698299] do_anonymous_page+0x40f/0xa50
>> [ 112.698311] __handle_mm_fault+0xbba/0x1140
>> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5
>> [ 112.698335] handle_mm_fault+0xe6/0x370
>> [ 112.698343] do_user_addr_fault+0x211/0x6a0
>> [ 112.698354] exc_page_fault+0x75/0x1d0
>> [ 112.698363] asm_exc_page_fault+0x26/0x30
>> [ 112.698366] RIP: 0033:0x7fa97236db00
>>
>> It's possible to load multiple bpf struct programs. In the case of
>> oom, they will be executed one by one in the same order they been
>> loaded until one of them returns 1 and bpf_memory_freed is set to 1
>> - an indication that the memory was freed. This allows to have
>> multiple bpf programs to focus on different types of OOM's - e.g.
>> one program can only handle memcg OOM's in one memory cgroup.
>> But the filtering is done in bpf - so it's fully flexible.
>>
>> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
>> ---
>> include/linux/bpf_oom.h | 49 +++++++++++++
>> include/linux/oom.h | 8 ++
>> mm/Makefile | 3 +
>> mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++
>> mm/oom_kill.c | 22 +++++-
>> 5 files changed, 237 insertions(+), 2 deletions(-)
>> create mode 100644 include/linux/bpf_oom.h
>> create mode 100644 mm/bpf_oom.c
>>
>> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
>> new file mode 100644
>> index 000000000000..29cb5ea41d97
>> --- /dev/null
>> +++ b/include/linux/bpf_oom.h
>> @@ -0,0 +1,49 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ */
>> +
>> +#ifndef __BPF_OOM_H
>> +#define __BPF_OOM_H
>> +
>> +struct bpf_oom;
>> +struct oom_control;
>> +
>> +#define BPF_OOM_NAME_MAX_LEN 64
>> +
>> +struct bpf_oom_ops {
>> + /**
>> + * @handle_out_of_memory: Out of memory bpf handler, called before
>> + * the in-kernel OOM killer.
>> + * @oc: OOM control structure
>> + *
>> + * Should return 1 if some memory was freed up, otherwise
>> + * the in-kernel OOM killer is invoked.
>> + */
>> + int (*handle_out_of_memory)(struct oom_control *oc);
>> +
>> + /**
>> + * @name: BPF OOM policy name
>> + */
>> + char name[BPF_OOM_NAME_MAX_LEN];
>
> Why should the name be a part of the ops structure? IMO it's not an
> attribute of the operations but rather of the oom handler, which is
> represented by bpf_oom here.
The ops structure describes a user-defined oom policy. Currently
it's just one handler and the policy name. Later additional handlers
can be added, e.g. a handler to control the dmesg output.
bpf_oom is an implementation detail: it's basically an extension
to struct bpf_oom_ops which contains "private" fields required
for the internal machinery.
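A sketch of how the ops structure could grow over time (the second
callback is purely hypothetical):

struct bpf_oom_ops {
        int (*handle_out_of_memory)(struct oom_control *oc);

        /* Hypothetical future extension: customize the OOM report */
        void (*handle_oom_report)(struct oom_control *oc);

        char name[BPF_OOM_NAME_MAX_LEN];

        /* Private */
        struct bpf_oom *bpf_oom;
};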
>
>> +
>> + /* Private */
>> + struct bpf_oom *bpf_oom;
>> +};
>> +
>> +#ifdef CONFIG_BPF_SYSCALL
>> +/**
>> + * @bpf_handle_oom: handle out of memory using bpf programs
>> + * @oc: OOM control structure
>> + *
>> + * Returns true if a bpf oom program was executed, returned 1
>> + * and some memory was actually freed.
>
> The above comment is unclear, please clarify.
Fixed, thanks.
/**
* @bpf_handle_oom: handle out of memory condition using bpf
* @oc: OOM control structure
*
* Returns true if some memory was freed.
*/
bool bpf_handle_oom(struct oom_control *oc);
>
>> + */
>> +bool bpf_handle_oom(struct oom_control *oc);
>> +
>> +#else /* CONFIG_BPF_SYSCALL */
>> +static inline bool bpf_handle_oom(struct oom_control *oc)
>> +{
>> + return false;
>> +}
>> +
>> +#endif /* CONFIG_BPF_SYSCALL */
>> +
>> +#endif /* __BPF_OOM_H */
>> diff --git a/include/linux/oom.h b/include/linux/oom.h
>> index 1e0fc6931ce9..ef453309b7ea 100644
>> --- a/include/linux/oom.h
>> +++ b/include/linux/oom.h
>> @@ -51,6 +51,14 @@ struct oom_control {
>>
>> /* Used to print the constraint info. */
>> enum oom_constraint constraint;
>> +
>> +#ifdef CONFIG_BPF_SYSCALL
>> + /* Used by the bpf oom implementation to mark the forward progress */
>> + bool bpf_memory_freed;
>> +
>> + /* Policy name */
>> + const char *bpf_policy_name;
>> +#endif
>> };
>>
>> extern struct mutex oom_lock;
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 1a7a11d4933d..a714aba03759 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
>> ifdef CONFIG_SWAP
>> obj-$(CONFIG_MEMCG) += swap_cgroup.o
>> endif
>> +ifdef CONFIG_BPF_SYSCALL
>> +obj-y += bpf_oom.o
>> +endif
>> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
>> obj-$(CONFIG_GUP_TEST) += gup_test.o
>> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
>> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
>> new file mode 100644
>> index 000000000000..47633046819c
>> --- /dev/null
>> +++ b/mm/bpf_oom.c
>> @@ -0,0 +1,157 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +/*
>> + * BPF-driven OOM killer customization
>> + *
>> + * Author: Roman Gushchin <roman.gushchin@linux.dev>
>> + */
>> +
>> +#include <linux/bpf.h>
>> +#include <linux/oom.h>
>> +#include <linux/bpf_oom.h>
>> +#include <linux/srcu.h>
>> +
>> +DEFINE_STATIC_SRCU(bpf_oom_srcu);
>> +static DEFINE_SPINLOCK(bpf_oom_lock);
>> +static LIST_HEAD(bpf_oom_handlers);
>> +
>> +struct bpf_oom {
>
> Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called
> bpf_oom_ops->handler.
I don't think it's a handler, it's more like a private part
of bpf_oom_ops. Maybe bpf_oom_impl? Idk
>
>
>> + struct bpf_oom_ops *ops;
>> + struct list_head node;
>> + struct srcu_struct srcu;
>> +};
>> +
>> +bool bpf_handle_oom(struct oom_control *oc)
>> +{
>> + struct bpf_oom_ops *ops;
>> + struct bpf_oom *bpf_oom;
>> + int list_idx, idx, ret = 0;
>> +
>> + oc->bpf_memory_freed = false;
>> +
>> + list_idx = srcu_read_lock(&bpf_oom_srcu);
>> + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) {
>> + ops = READ_ONCE(bpf_oom->ops);
>> + if (!ops || !ops->handle_out_of_memory)
>> + continue;
>> + idx = srcu_read_lock(&bpf_oom->srcu);
>> + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] :
>> + "bpf_defined_policy";
>> + ret = ops->handle_out_of_memory(oc);
>> + oc->bpf_policy_name = NULL;
>> + srcu_read_unlock(&bpf_oom->srcu, idx);
>> +
>> + if (ret && oc->bpf_memory_freed)
>
> IIUC ret and oc->bpf_memory_freed seem to reflect the same state:
> handler successfully freed some memory. Could you please clarify when
> they differ?
The idea here is to provide an additional safety measure:
if the bpf program simply returns 1 without doing anything,
the system won't deadlock.
oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper
(and potentially some other helpers in the future, e.g.
bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf
program directly.
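Roughly this pattern (a sketch, not the actual kfunc from the series;
the signature is a guess):

__bpf_kfunc int bpf_oom_kill_process(struct oom_control *oc,
                                     struct task_struct *task)
{
        if (oom_unkillable_task(task))
                return -EPERM;

        __oom_kill_process(task, "bpf oom policy");

        /*
         * The only place forward progress is recorded: the bpf program
         * can't set bpf_memory_freed directly.
         */
        oc->bpf_memory_freed = true;
        return 0;
}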
On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Suren Baghdasaryan <surenb@google.com> writes:
>
> > On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin
> > <roman.gushchin@linux.dev> wrote:
> >>
> >> [...]
> >> +struct bpf_oom_ops {
> >> + /**
> >> + * @handle_out_of_memory: Out of memory bpf handler, called before
> >> + * the in-kernel OOM killer.
> >> + * @oc: OOM control structure
> >> + *
> >> + * Should return 1 if some memory was freed up, otherwise
> >> + * the in-kernel OOM killer is invoked.
> >> + */
> >> + int (*handle_out_of_memory)(struct oom_control *oc);
> >> +
> >> + /**
> >> + * @name: BPF OOM policy name
> >> + */
> >> + char name[BPF_OOM_NAME_MAX_LEN];
> >
> > Why should the name be a part of ops structure? IMO it's not an
> > attribute of the operations but rather of the oom handler which is
> > represented by bpf_oom here.
>
> The ops structure describes a user-defined oom policy. Currently
> it's just one handler and the policy name. Later additional handlers
> can be added, e.g. a handler to control the dmesg output.
>
> bpf_oom is an implementation detail: it's basically an extension
> to struct bpf_oom_ops which contains "private" fields required
> for the internal machinery.
Ok. I hope we can come up with some more descriptive naming but I
can't think of something good ATM.
>
> >
> >> +
> >> + /* Private */
> >> + struct bpf_oom *bpf_oom;
> >> +};
> >> +
> >> +#ifdef CONFIG_BPF_SYSCALL
> >> +/**
> >> + * @bpf_handle_oom: handle out of memory using bpf programs
> >> + * @oc: OOM control structure
> >> + *
> >> + * Returns true if a bpf oom program was executed, returned 1
> >> + * and some memory was actually freed.
> >
> > The above comment is unclear, please clarify.
>
> Fixed, thanks.
>
> /**
> * @bpf_handle_oom: handle out of memory condition using bpf
> * @oc: OOM control structure
> *
> * Returns true if some memory was freed.
> */
> bool bpf_handle_oom(struct oom_control *oc);
>
>
> >
> >> + */
> >> +bool bpf_handle_oom(struct oom_control *oc);
> >> [...]
> >> +struct bpf_oom {
> >
> > Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called
> > bpf_oom_ops->handler.
>
> I don't think it's a handler, it's more like a private part
> of bpf_oom_ops. Maybe bpf_oom_impl? Idk
Yeah, we need to come up with some nomenclature and name these structs
accordingly. In my mind ops means a structure that contains only
operations, so current naming does not sit well but maybe that's just
me...
>
> >> [...]
> >> + if (ret && oc->bpf_memory_freed)
> >
> > IIUC ret and oc->bpf_memory_freed seem to reflect the same state:
> > handler successfully freed some memory. Could you please clarify when
> > they differ?
>
> The idea here is to provide an additional safety measure:
> if the bpf program simply returns 1 without doing anything,
> the system won't deadlock.
>
> oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper
> (and potentially some other helpers in the future, e.g.
> bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf
> program directly.
I see. Then maybe we use only oc->bpf_memory_freed and
handle_out_of_memory() does not return anything?
On 8/20/25 12:34 PM, Suren Baghdasaryan wrote:
> On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>> Suren Baghdasaryan <surenb@google.com> writes:
>>
>>> On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin
>>> <roman.gushchin@linux.dev> wrote:
>>>> [...]
>>>> +struct bpf_oom {
>>> Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called
>>> bpf_oom_ops->handler.
>> I don't think it's a handler, it's more like a private part
>> of bpf_oom_ops. Maybe bpf_oom_impl? Idk
> Yeah, we need to come up with some nomenclature and name these structs
> accordingly. In my mind ops means a structure that contains only
> operations, so current naming does not sit well but maybe that's just
> me...
Some existing xxx_ops also have non-operation members. E.g.,
tcp_congestion_ops, Qdisc_ops, vfio_device_ops, or tpm_class_ops. Maybe
bpf_oom_ops is okay if that doesn't cause too much confusion?
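
For instance, tcp_congestion_ops mixes callbacks and data (abridged
from include/net/tcp.h; most callbacks omitted):

struct tcp_congestion_ops {
	u32 (*ssthresh)(struct sock *sk);
	void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
	/* ... more callbacks ... */
	char name[TCP_CA_NAME_MAX];
	struct module *owner;
};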
Suren Baghdasaryan <surenb@google.com> writes:
> On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> Suren Baghdasaryan <surenb@google.com> writes:
>>
>> > On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin
>> > <roman.gushchin@linux.dev> wrote:
>> >>
>> >> [...]
>> >> + if (ret && oc->bpf_memory_freed)
>> >
>> > IIUC ret and oc->bpf_memory_freed seem to reflect the same state:
>> > handler successfully freed some memory. Could you please clarify when
>> > they differ?
>>
>> The idea here is to provide an additional safety measure:
>> if the bpf program simply returns 1 without doing anything,
>> the system won't deadlock.
>>
>> oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper
>> (and potentially some other helpers in the future, e.g.
>> bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf
>> program directly.
>
> I see. Then maybe we use only oc->bpf_memory_freed and
> handle_out_of_memory() does not return anything?
Idk, I think it's neat to have the ability to pass control to the
in-kernel OOM killer even after killing a task.
Also, I believe bpf programs have to return an int anyway, so we can
ignore it, but I don't necessarily see the point.
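
To spell out how the two signals combine in the quoted
bpf_handle_oom() loop (a restatement of this version's visible
behavior, not new code):

	ret = ops->handle_out_of_memory(oc);
	if (ret && oc->bpf_memory_freed)
		return true;	/* handled: don't run the kernel killer */
	/*
	 * Every other combination falls through to the next handler:
	 *  - ret == 1, bpf_memory_freed == false: the claim of progress
	 *    is ignored (safety against a handler that freed nothing);
	 *  - ret == 0, bpf_memory_freed == true: memory was freed, but
	 *    the handler defers to the in-kernel OOM killer anyway;
	 *  - ret == 0, bpf_memory_freed == false: nothing was freed.
	 * If no handler succeeds, the in-kernel OOM killer is invoked.
	 */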
On Wed, Aug 20, 2025 at 12:53 PM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> Suren Baghdasaryan <surenb@google.com> writes:
>
> > On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >>
> >> Suren Baghdasaryan <surenb@google.com> writes:
> >>
> >> > On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin
> >> > <roman.gushchin@linux.dev> wrote:
> >> >> [...]
> >> >> + if (ret && oc->bpf_memory_freed)
> >> >
> >> > IIUC ret and oc->bpf_memory_freed seem to reflect the same state:
> >> > handler successfully freed some memory. Could you please clarify when
> >> > they differ?
> >>
> >> The idea here is to provide an additional safety measure:
> >> if the bpf program simply returns 1 without doing anything,
> >> the system won't deadlock.
> >>
> >> oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper
> >> (and potentially some other helpers in the future, e.g.
> >> bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf
> >> program directly.
> >
> > I see. Then maybe we use only oc->bpf_memory_freed and
> > handle_out_of_memory() does not return anything?
>
> Idk, I think it's neat to have the ability to pass control to the
> in-kernel OOM killer even after killing a task.
> Also, I believe bpf programs have to return an int anyway, so we can
> ignore it, but I don't necessarily see the point.
Ok, so you want this return value so a bpf oom-handler can say "even if
something got killed, proceed with the next oom-handler anyway"? I
can't think of a case when someone would do that... In any case, the
use of this return value should be clearly documented.
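
A possible wording for that kernel-doc, distilled from this thread
(the phrasing is a sketch, not from the patch):

/**
 * @handle_out_of_memory: Out of memory bpf handler, called before
 * the in-kernel OOM killer.
 * @oc: OOM control structure
 *
 * Return: 1 to declare the OOM handled, 0 to defer to the next bpf
 * handler and, ultimately, the in-kernel OOM killer. Returning 1 is
 * honored only if a memory-freeing kfunc such as
 * bpf_oom_kill_process() has also set oc->bpf_memory_freed; a bare
 * "return 1" is ignored as a safety measure. Returning 0 after
 * freeing memory is legal and means "proceed with the in-kernel OOM
 * killer anyway".
 */
int (*handle_out_of_memory)(struct oom_control *oc);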