Introduce a bpf struct ops for implementing custom OOM handling policies.
The struct ops provides the bpf_handle_out_of_memory() callback,
which expected to return 1 if it was able to free some memory and 0
otherwise.
In the latter case it's guaranteed that the in-kernel OOM killer will
be invoked. Otherwise the kernel also checks the bpf_memory_freed
field of the oom_control structure, which is expected to be set by
kfuncs suitable for releasing memory. It's a safety mechanism which
prevents a bpf program to claim forward progress without actually
releasing memory. The callback program is sleepable to enable using
iterators, e.g. cgroup iterators.
The callback receives struct oom_control as an argument, so it can
easily filter out OOM's it doesn't want to handle, e.g. global vs
memcg OOM's.
The callback is executed just before the kernel victim task selection
algorithm, so all heuristics and sysctls like panic on oom,
sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
are respected.
The struct ops also has the name field, which allows to define a
custom name for the implemented policy. It's printed in the OOM report
in the oom_policy=<policy> format. "default" is printed if bpf is not
used or policy name is not specified.
[ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
oom_policy=bpf_test_policy
[ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full)
[ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
[ 112.698167] Call Trace:
[ 112.698177] <TASK>
[ 112.698182] dump_stack_lvl+0x4d/0x70
[ 112.698192] dump_header+0x59/0x1c6
[ 112.698199] oom_kill_process.cold+0x8/0xef
[ 112.698206] bpf_oom_kill_process+0x59/0xb0
[ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313
[ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf
[ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5
[ 112.698240] bpf_handle_oom+0x11a/0x1e0
[ 112.698250] out_of_memory+0xab/0x5c0
[ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110
[ 112.698274] try_charge_memcg+0x4b5/0x7e0
[ 112.698288] charge_memcg+0x2f/0xc0
[ 112.698293] __mem_cgroup_charge+0x30/0xc0
[ 112.698299] do_anonymous_page+0x40f/0xa50
[ 112.698311] __handle_mm_fault+0xbba/0x1140
[ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5
[ 112.698335] handle_mm_fault+0xe6/0x370
[ 112.698343] do_user_addr_fault+0x211/0x6a0
[ 112.698354] exc_page_fault+0x75/0x1d0
[ 112.698363] asm_exc_page_fault+0x26/0x30
[ 112.698366] RIP: 0033:0x7fa97236db00
It's possible to load multiple bpf struct programs. In the case of
oom, they will be executed one by one in the same order they been
loaded until one of them returns 1 and bpf_memory_freed is set to 1
- an indication that the memory was freed. This allows to have
multiple bpf programs to focus on different types of OOM's - e.g.
one program can only handle memcg OOM's in one memory cgroup.
But the filtering is done in bpf - so it's fully flexible.
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
include/linux/bpf_oom.h | 49 +++++++++++++
include/linux/oom.h | 8 ++
mm/Makefile | 3 +
mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++
mm/oom_kill.c | 22 +++++-
5 files changed, 237 insertions(+), 2 deletions(-)
create mode 100644 include/linux/bpf_oom.h
create mode 100644 mm/bpf_oom.c
diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 000000000000..29cb5ea41d97
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_OOM_H
+#define __BPF_OOM_H
+
+struct bpf_oom;
+struct oom_control;
+
+#define BPF_OOM_NAME_MAX_LEN 64
+
+struct bpf_oom_ops {
+ /**
+ * @handle_out_of_memory: Out of memory bpf handler, called before
+ * the in-kernel OOM killer.
+ * @oc: OOM control structure
+ *
+ * Should return 1 if some memory was freed up, otherwise
+ * the in-kernel OOM killer is invoked.
+ */
+ int (*handle_out_of_memory)(struct oom_control *oc);
+
+ /**
+ * @name: BPF OOM policy name
+ */
+ char name[BPF_OOM_NAME_MAX_LEN];
+
+ /* Private */
+ struct bpf_oom *bpf_oom;
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+/**
+ * @bpf_handle_oom: handle out of memory using bpf programs
+ * @oc: OOM control structure
+ *
+ * Returns true if a bpf oom program was executed, returned 1
+ * and some memory was actually freed.
+ */
+bool bpf_handle_oom(struct oom_control *oc);
+
+#else /* CONFIG_BPF_SYSCALL */
+static inline bool bpf_handle_oom(struct oom_control *oc)
+{
+ return false;
+}
+
+#endif /* CONFIG_BPF_SYSCALL */
+
+#endif /* __BPF_OOM_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1e0fc6931ce9..ef453309b7ea 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -51,6 +51,14 @@ struct oom_control {
/* Used to print the constraint info. */
enum oom_constraint constraint;
+
+#ifdef CONFIG_BPF_SYSCALL
+ /* Used by the bpf oom implementation to mark the forward progress */
+ bool bpf_memory_freed;
+
+ /* Policy name */
+ const char *bpf_policy_name;
+#endif
};
extern struct mutex oom_lock;
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..a714aba03759 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
+ifdef CONFIG_BPF_SYSCALL
+obj-y += bpf_oom.o
+endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
new file mode 100644
index 000000000000..47633046819c
--- /dev/null
+++ b/mm/bpf_oom.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BPF-driven OOM killer customization
+ *
+ * Author: Roman Gushchin <roman.gushchin@linux.dev>
+ */
+
+#include <linux/bpf.h>
+#include <linux/oom.h>
+#include <linux/bpf_oom.h>
+#include <linux/srcu.h>
+
+DEFINE_STATIC_SRCU(bpf_oom_srcu);
+static DEFINE_SPINLOCK(bpf_oom_lock);
+static LIST_HEAD(bpf_oom_handlers);
+
+struct bpf_oom {
+ struct bpf_oom_ops *ops;
+ struct list_head node;
+ struct srcu_struct srcu;
+};
+
+bool bpf_handle_oom(struct oom_control *oc)
+{
+ struct bpf_oom_ops *ops;
+ struct bpf_oom *bpf_oom;
+ int list_idx, idx, ret = 0;
+
+ oc->bpf_memory_freed = false;
+
+ list_idx = srcu_read_lock(&bpf_oom_srcu);
+ list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) {
+ ops = READ_ONCE(bpf_oom->ops);
+ if (!ops || !ops->handle_out_of_memory)
+ continue;
+ idx = srcu_read_lock(&bpf_oom->srcu);
+ oc->bpf_policy_name = ops->name[0] ? &ops->name[0] :
+ "bpf_defined_policy";
+ ret = ops->handle_out_of_memory(oc);
+ oc->bpf_policy_name = NULL;
+ srcu_read_unlock(&bpf_oom->srcu, idx);
+
+ if (ret && oc->bpf_memory_freed)
+ break;
+ }
+ srcu_read_unlock(&bpf_oom_srcu, list_idx);
+
+ return ret && oc->bpf_memory_freed;
+}
+
+static int __handle_out_of_memory(struct oom_control *oc)
+{
+ return 0;
+}
+
+static struct bpf_oom_ops __bpf_oom_ops = {
+ .handle_out_of_memory = __handle_out_of_memory,
+};
+
+static const struct bpf_func_proto *
+bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return tracing_prog_func_proto(func_id, prog);
+}
+
+static bool bpf_oom_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_verifier_ops bpf_oom_verifier_ops = {
+ .get_func_proto = bpf_oom_func_proto,
+ .is_valid_access = bpf_oom_ops_is_valid_access,
+};
+
+static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_oom_ops *ops = kdata;
+ struct bpf_oom *bpf_oom;
+ int ret;
+
+ bpf_oom = kmalloc(sizeof(*bpf_oom), GFP_KERNEL_ACCOUNT);
+ if (!bpf_oom)
+ return -ENOMEM;
+
+ ret = init_srcu_struct(&bpf_oom->srcu);
+ if (ret) {
+ kfree(bpf_oom);
+ return ret;
+ }
+
+ WRITE_ONCE(bpf_oom->ops, ops);
+ ops->bpf_oom = bpf_oom;
+
+ spin_lock(&bpf_oom_lock);
+ list_add_rcu(&bpf_oom->node, &bpf_oom_handlers);
+ spin_unlock(&bpf_oom_lock);
+
+ return 0;
+}
+
+static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_oom_ops *ops = kdata;
+ struct bpf_oom *bpf_oom = ops->bpf_oom;
+
+ WRITE_ONCE(bpf_oom->ops, NULL);
+
+ spin_lock(&bpf_oom_lock);
+ list_del_rcu(&bpf_oom->node);
+ spin_unlock(&bpf_oom_lock);
+
+ synchronize_srcu(&bpf_oom->srcu);
+
+ kfree(bpf_oom);
+}
+
+static int bpf_oom_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct bpf_oom_ops *uops = (const struct bpf_oom_ops *)udata;
+ struct bpf_oom_ops *ops = (struct bpf_oom_ops *)kdata;
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_oom_ops, name):
+ strscpy_pad(ops->name, uops->name, sizeof(ops->name));
+ return 1;
+ }
+ return 0;
+}
+
+static int bpf_oom_ops_init(struct btf *btf)
+{
+ return 0;
+}
+
+static struct bpf_struct_ops bpf_oom_bpf_ops = {
+ .verifier_ops = &bpf_oom_verifier_ops,
+ .reg = bpf_oom_ops_reg,
+ .unreg = bpf_oom_ops_unreg,
+ .init_member = bpf_oom_ops_init_member,
+ .init = bpf_oom_ops_init,
+ .name = "bpf_oom_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_oom_ops
+};
+
+static int __init bpf_oom_struct_ops_init(void)
+{
+ return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops);
+}
+late_initcall(bpf_oom_struct_ops_init);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25923cfec9c6..ad7bd65061d6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,7 @@
#include <linux/mmu_notifier.h>
#include <linux/cred.h>
#include <linux/nmi.h>
+#include <linux/bpf_oom.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = {
[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
+static const char *oom_policy_name(struct oom_control *oc)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ if (oc->bpf_policy_name)
+ return oc->bpf_policy_name;
+#endif
+ return "default";
+}
+
/*
* Determine the type of allocation constraint.
*/
@@ -458,9 +468,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim)
static void dump_header(struct oom_control *oc)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\noom_policy=%s\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
- current->signal->oom_score_adj);
+ current->signal->oom_score_adj,
+ oom_policy_name(oc));
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
@@ -1161,6 +1172,13 @@ bool out_of_memory(struct oom_control *oc)
return true;
}
+ /*
+ * Let bpf handle the OOM first. If it was able to free up some memory,
+ * bail out. Otherwise fall back to the kernel OOM killer.
+ */
+ if (bpf_handle_oom(oc))
+ return true;
+
select_bad_process(oc);
/* Found nothing?!?! */
if (!oc->chosen) {
--
2.50.1
On 8/18/25 10:01 AM, Roman Gushchin wrote: > Introduce a bpf struct ops for implementing custom OOM handling policies. > > The struct ops provides the bpf_handle_out_of_memory() callback, > which expected to return 1 if it was able to free some memory and 0 > otherwise. > > In the latter case it's guaranteed that the in-kernel OOM killer will > be invoked. Otherwise the kernel also checks the bpf_memory_freed > field of the oom_control structure, which is expected to be set by > kfuncs suitable for releasing memory. It's a safety mechanism which > prevents a bpf program to claim forward progress without actually > releasing memory. The callback program is sleepable to enable using > iterators, e.g. cgroup iterators. > > The callback receives struct oom_control as an argument, so it can > easily filter out OOM's it doesn't want to handle, e.g. global vs > memcg OOM's. > > The callback is executed just before the kernel victim task selection > algorithm, so all heuristics and sysctls like panic on oom, > sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task > are respected. > > The struct ops also has the name field, which allows to define a > custom name for the implemented policy. It's printed in the OOM report > in the oom_policy=<policy> format. "default" is printed if bpf is not > used or policy name is not specified. > > [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 > oom_policy=bpf_test_policy > [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) > [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 > [ 112.698167] Call Trace: > [ 112.698177] <TASK> > [ 112.698182] dump_stack_lvl+0x4d/0x70 > [ 112.698192] dump_header+0x59/0x1c6 > [ 112.698199] oom_kill_process.cold+0x8/0xef > [ 112.698206] bpf_oom_kill_process+0x59/0xb0 > [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 > [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf > [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 > [ 112.698240] bpf_handle_oom+0x11a/0x1e0 > [ 112.698250] out_of_memory+0xab/0x5c0 > [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 > [ 112.698274] try_charge_memcg+0x4b5/0x7e0 > [ 112.698288] charge_memcg+0x2f/0xc0 > [ 112.698293] __mem_cgroup_charge+0x30/0xc0 > [ 112.698299] do_anonymous_page+0x40f/0xa50 > [ 112.698311] __handle_mm_fault+0xbba/0x1140 > [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 > [ 112.698335] handle_mm_fault+0xe6/0x370 > [ 112.698343] do_user_addr_fault+0x211/0x6a0 > [ 112.698354] exc_page_fault+0x75/0x1d0 > [ 112.698363] asm_exc_page_fault+0x26/0x30 > [ 112.698366] RIP: 0033:0x7fa97236db00 > > It's possible to load multiple bpf struct programs. In the case of > oom, they will be executed one by one in the same order they been > loaded until one of them returns 1 and bpf_memory_freed is set to 1 > - an indication that the memory was freed. This allows to have > multiple bpf programs to focus on different types of OOM's - e.g. > one program can only handle memcg OOM's in one memory cgroup. > But the filtering is done in bpf - so it's fully flexible. > > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> > --- > include/linux/bpf_oom.h | 49 +++++++++++++ > include/linux/oom.h | 8 ++ > mm/Makefile | 3 + > mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++ > mm/oom_kill.c | 22 +++++- > 5 files changed, 237 insertions(+), 2 deletions(-) > create mode 100644 include/linux/bpf_oom.h > create mode 100644 mm/bpf_oom.c > > diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h > new file mode 100644 > index 000000000000..29cb5ea41d97 > --- /dev/null > +++ b/include/linux/bpf_oom.h > @@ -0,0 +1,49 @@ > +/* SPDX-License-Identifier: GPL-2.0+ */ > + > +#ifndef __BPF_OOM_H > +#define __BPF_OOM_H > + > +struct bpf_oom; > +struct oom_control; > + > +#define BPF_OOM_NAME_MAX_LEN 64 > + > +struct bpf_oom_ops { > + /** > + * @handle_out_of_memory: Out of memory bpf handler, called before > + * the in-kernel OOM killer. > + * @oc: OOM control structure > + * > + * Should return 1 if some memory was freed up, otherwise > + * the in-kernel OOM killer is invoked. > + */ > + int (*handle_out_of_memory)(struct oom_control *oc); I suggest adding "struct bpf_oom *" as the first argument to all bpf_oom_ops to future-proof. It will allow an bpf_oom kfunc or prog to refer to the struct_ops instance itself. Since bpf_oom_ops allows multiple attachment, if a bpf_prog is shared between two bpf_oom, it will be able to infer which bpf_oom_ops is calling by this extra argument. > + > + /** > + * @name: BPF OOM policy name > + */ > + char name[BPF_OOM_NAME_MAX_LEN]; > + > + /* Private */ > + struct bpf_oom *bpf_oom; > +}; > + > +#ifdef CONFIG_BPF_SYSCALL > +/** > + * @bpf_handle_oom: handle out of memory using bpf programs > + * @oc: OOM control structure > + * > + * Returns true if a bpf oom program was executed, returned 1 > + * and some memory was actually freed. > + */ > +bool bpf_handle_oom(struct oom_control *oc); > + > +#else /* CONFIG_BPF_SYSCALL */ > +static inline bool bpf_handle_oom(struct oom_control *oc) > +{ > + return false; > +} > + > +#endif /* CONFIG_BPF_SYSCALL */ > + > +#endif /* __BPF_OOM_H */ > diff --git a/include/linux/oom.h b/include/linux/oom.h > index 1e0fc6931ce9..ef453309b7ea 100644 > --- a/include/linux/oom.h > +++ b/include/linux/oom.h > @@ -51,6 +51,14 @@ struct oom_control { > > /* Used to print the constraint info. */ > enum oom_constraint constraint; > + > +#ifdef CONFIG_BPF_SYSCALL > + /* Used by the bpf oom implementation to mark the forward progress */ > + bool bpf_memory_freed; > + > + /* Policy name */ > + const char *bpf_policy_name; > +#endif > }; > > extern struct mutex oom_lock; > diff --git a/mm/Makefile b/mm/Makefile > index 1a7a11d4933d..a714aba03759 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > ifdef CONFIG_SWAP > obj-$(CONFIG_MEMCG) += swap_cgroup.o > endif > +ifdef CONFIG_BPF_SYSCALL > +obj-y += bpf_oom.o > +endif > obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o > obj-$(CONFIG_GUP_TEST) += gup_test.o > obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o > diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c > new file mode 100644 > index 000000000000..47633046819c > --- /dev/null > +++ b/mm/bpf_oom.c > @@ -0,0 +1,157 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * BPF-driven OOM killer customization > + * > + * Author: Roman Gushchin <roman.gushchin@linux.dev> > + */ > + > +#include <linux/bpf.h> > +#include <linux/oom.h> > +#include <linux/bpf_oom.h> > +#include <linux/srcu.h> > + > +DEFINE_STATIC_SRCU(bpf_oom_srcu); > +static DEFINE_SPINLOCK(bpf_oom_lock); > +static LIST_HEAD(bpf_oom_handlers); > + > +struct bpf_oom { > + struct bpf_oom_ops *ops; > + struct list_head node; > + struct srcu_struct srcu; > +}; > + > +bool bpf_handle_oom(struct oom_control *oc) > +{ > + struct bpf_oom_ops *ops; > + struct bpf_oom *bpf_oom; > + int list_idx, idx, ret = 0; > + > + oc->bpf_memory_freed = false; > + > + list_idx = srcu_read_lock(&bpf_oom_srcu); > + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { > + ops = READ_ONCE(bpf_oom->ops); > + if (!ops || !ops->handle_out_of_memory) > + continue; > + idx = srcu_read_lock(&bpf_oom->srcu); > + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] : > + "bpf_defined_policy"; > + ret = ops->handle_out_of_memory(oc); > + oc->bpf_policy_name = NULL; > + srcu_read_unlock(&bpf_oom->srcu, idx); > + > + if (ret && oc->bpf_memory_freed) > + break; > + } > + srcu_read_unlock(&bpf_oom_srcu, list_idx); > + > + return ret && oc->bpf_memory_freed; > +} > + > +static int __handle_out_of_memory(struct oom_control *oc) > +{ > + return 0; > +} > + > +static struct bpf_oom_ops __bpf_oom_ops = { > + .handle_out_of_memory = __handle_out_of_memory, > +}; > + > +static const struct bpf_func_proto * > +bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > +{ > + return tracing_prog_func_proto(func_id, prog); > +} > + > +static bool bpf_oom_ops_is_valid_access(int off, int size, > + enum bpf_access_type type, > + const struct bpf_prog *prog, > + struct bpf_insn_access_aux *info) > +{ > + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); > +} > + > +static const struct bpf_verifier_ops bpf_oom_verifier_ops = { > + .get_func_proto = bpf_oom_func_proto, > + .is_valid_access = bpf_oom_ops_is_valid_access, > +}; > + > +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link) > +{ > + struct bpf_oom_ops *ops = kdata; > + struct bpf_oom *bpf_oom; > + int ret; > + > + bpf_oom = kmalloc(sizeof(*bpf_oom), GFP_KERNEL_ACCOUNT); > + if (!bpf_oom) > + return -ENOMEM; > + > + ret = init_srcu_struct(&bpf_oom->srcu); > + if (ret) { > + kfree(bpf_oom); > + return ret; > + } > + > + WRITE_ONCE(bpf_oom->ops, ops); > + ops->bpf_oom = bpf_oom; > + > + spin_lock(&bpf_oom_lock); > + list_add_rcu(&bpf_oom->node, &bpf_oom_handlers); > + spin_unlock(&bpf_oom_lock); > + > + return 0; > +} > + > +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link) > +{ > + struct bpf_oom_ops *ops = kdata; > + struct bpf_oom *bpf_oom = ops->bpf_oom; > + > + WRITE_ONCE(bpf_oom->ops, NULL); > + > + spin_lock(&bpf_oom_lock); > + list_del_rcu(&bpf_oom->node); > + spin_unlock(&bpf_oom_lock); > + > + synchronize_srcu(&bpf_oom->srcu); > + > + kfree(bpf_oom); > +} > + > +static int bpf_oom_ops_init_member(const struct btf_type *t, > + const struct btf_member *member, > + void *kdata, const void *udata) > +{ > + const struct bpf_oom_ops *uops = (const struct bpf_oom_ops *)udata; > + struct bpf_oom_ops *ops = (struct bpf_oom_ops *)kdata; > + u32 moff = __btf_member_bit_offset(t, member) / 8; > + > + switch (moff) { > + case offsetof(struct bpf_oom_ops, name): > + strscpy_pad(ops->name, uops->name, sizeof(ops->name)); > + return 1; > + } > + return 0; > +} > + > +static int bpf_oom_ops_init(struct btf *btf) > +{ > + return 0; > +} > + > +static struct bpf_struct_ops bpf_oom_bpf_ops = { > + .verifier_ops = &bpf_oom_verifier_ops, > + .reg = bpf_oom_ops_reg, > + .unreg = bpf_oom_ops_unreg, > + .init_member = bpf_oom_ops_init_member, > + .init = bpf_oom_ops_init, > + .name = "bpf_oom_ops", > + .owner = THIS_MODULE, > + .cfi_stubs = &__bpf_oom_ops > +}; > + > +static int __init bpf_oom_struct_ops_init(void) > +{ > + return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops); > +} > +late_initcall(bpf_oom_struct_ops_init); > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index 25923cfec9c6..ad7bd65061d6 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -45,6 +45,7 @@ > #include <linux/mmu_notifier.h> > #include <linux/cred.h> > #include <linux/nmi.h> > +#include <linux/bpf_oom.h> > > #include <asm/tlb.h> > #include "internal.h" > @@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = { > [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", > }; > > +static const char *oom_policy_name(struct oom_control *oc) > +{ > +#ifdef CONFIG_BPF_SYSCALL > + if (oc->bpf_policy_name) > + return oc->bpf_policy_name; > +#endif > + return "default"; > +} > + > /* > * Determine the type of allocation constraint. > */ > @@ -458,9 +468,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) > > static void dump_header(struct oom_control *oc) > { > - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", > + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\noom_policy=%s\n", > current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, > - current->signal->oom_score_adj); > + current->signal->oom_score_adj, > + oom_policy_name(oc)); > if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) > pr_warn("COMPACTION is disabled!!!\n"); > > @@ -1161,6 +1172,13 @@ bool out_of_memory(struct oom_control *oc) > return true; > } > > + /* > + * Let bpf handle the OOM first. If it was able to free up some memory, > + * bail out. Otherwise fall back to the kernel OOM killer. > + */ > + if (bpf_handle_oom(oc)) > + return true; > + > select_bad_process(oc); > /* Found nothing?!?! */ > if (!oc->chosen) {
On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Introduce a bpf struct ops for implementing custom OOM handling policies. > > The struct ops provides the bpf_handle_out_of_memory() callback, > which expected to return 1 if it was able to free some memory and 0 > otherwise. > > In the latter case it's guaranteed that the in-kernel OOM killer will > be invoked. Otherwise the kernel also checks the bpf_memory_freed > field of the oom_control structure, which is expected to be set by > kfuncs suitable for releasing memory. It's a safety mechanism which > prevents a bpf program to claim forward progress without actually > releasing memory. The callback program is sleepable to enable using > iterators, e.g. cgroup iterators. > > The callback receives struct oom_control as an argument, so it can > easily filter out OOM's it doesn't want to handle, e.g. global vs > memcg OOM's. > > The callback is executed just before the kernel victim task selection > algorithm, so all heuristics and sysctls like panic on oom, > sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task > are respected. > > The struct ops also has the name field, which allows to define a > custom name for the implemented policy. It's printed in the OOM report > in the oom_policy=<policy> format. "default" is printed if bpf is not > used or policy name is not specified. > > [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 > oom_policy=bpf_test_policy > [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) > [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 > [ 112.698167] Call Trace: > [ 112.698177] <TASK> > [ 112.698182] dump_stack_lvl+0x4d/0x70 > [ 112.698192] dump_header+0x59/0x1c6 > [ 112.698199] oom_kill_process.cold+0x8/0xef > [ 112.698206] bpf_oom_kill_process+0x59/0xb0 > [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 > [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf > [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 > [ 112.698240] bpf_handle_oom+0x11a/0x1e0 > [ 112.698250] out_of_memory+0xab/0x5c0 > [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 > [ 112.698274] try_charge_memcg+0x4b5/0x7e0 > [ 112.698288] charge_memcg+0x2f/0xc0 > [ 112.698293] __mem_cgroup_charge+0x30/0xc0 > [ 112.698299] do_anonymous_page+0x40f/0xa50 > [ 112.698311] __handle_mm_fault+0xbba/0x1140 > [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 > [ 112.698335] handle_mm_fault+0xe6/0x370 > [ 112.698343] do_user_addr_fault+0x211/0x6a0 > [ 112.698354] exc_page_fault+0x75/0x1d0 > [ 112.698363] asm_exc_page_fault+0x26/0x30 > [ 112.698366] RIP: 0033:0x7fa97236db00 > > It's possible to load multiple bpf struct programs. In the case of > oom, they will be executed one by one in the same order they been > loaded until one of them returns 1 and bpf_memory_freed is set to 1 > - an indication that the memory was freed. This allows to have > multiple bpf programs to focus on different types of OOM's - e.g. > one program can only handle memcg OOM's in one memory cgroup. > But the filtering is done in bpf - so it's fully flexible. I think a natural question here is ordering. Is this ability to have multiple OOM programs critical right now? How is it decided who gets to run before the other? Is it based on order of attachment (which can be non-deterministic)? There was a lot of discussion on something similar for tc progs, and we went with specific flags that capture partial ordering constraints (instead of priorities that may collide). https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net It would be nice if we can find a way of making this consistent. Another option is to exclude the multiple attachment bit from the initial version and do this as a follow up, since it probably requires more discussion. > > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> > --- > [...]
Kumar Kartikeya Dwivedi <memxor@gmail.com> writes: > On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote: >> >> Introduce a bpf struct ops for implementing custom OOM handling policies. >> >> The struct ops provides the bpf_handle_out_of_memory() callback, >> which expected to return 1 if it was able to free some memory and 0 >> otherwise. >> >> In the latter case it's guaranteed that the in-kernel OOM killer will >> be invoked. Otherwise the kernel also checks the bpf_memory_freed >> field of the oom_control structure, which is expected to be set by >> kfuncs suitable for releasing memory. It's a safety mechanism which >> prevents a bpf program to claim forward progress without actually >> releasing memory. The callback program is sleepable to enable using >> iterators, e.g. cgroup iterators. >> >> The callback receives struct oom_control as an argument, so it can >> easily filter out OOM's it doesn't want to handle, e.g. global vs >> memcg OOM's. >> >> The callback is executed just before the kernel victim task selection >> algorithm, so all heuristics and sysctls like panic on oom, >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task >> are respected. >> >> The struct ops also has the name field, which allows to define a >> custom name for the implemented policy. It's printed in the OOM report >> in the oom_policy=<policy> format. "default" is printed if bpf is not >> used or policy name is not specified. >> >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 >> oom_policy=bpf_test_policy >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 >> [ 112.698167] Call Trace: >> [ 112.698177] <TASK> >> [ 112.698182] dump_stack_lvl+0x4d/0x70 >> [ 112.698192] dump_header+0x59/0x1c6 >> [ 112.698199] oom_kill_process.cold+0x8/0xef >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 >> [ 112.698250] out_of_memory+0xab/0x5c0 >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 >> [ 112.698288] charge_memcg+0x2f/0xc0 >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 >> [ 112.698299] do_anonymous_page+0x40f/0xa50 >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 >> [ 112.698335] handle_mm_fault+0xe6/0x370 >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 >> [ 112.698354] exc_page_fault+0x75/0x1d0 >> [ 112.698363] asm_exc_page_fault+0x26/0x30 >> [ 112.698366] RIP: 0033:0x7fa97236db00 >> >> It's possible to load multiple bpf struct programs. In the case of >> oom, they will be executed one by one in the same order they been >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 >> - an indication that the memory was freed. This allows to have >> multiple bpf programs to focus on different types of OOM's - e.g. >> one program can only handle memcg OOM's in one memory cgroup. >> But the filtering is done in bpf - so it's fully flexible. > > I think a natural question here is ordering. Is this ability to have > multiple OOM programs critical right now? Good question. Initially I had only supported a single bpf policy. But then I realized that likely people would want to have different policies handling different parts of the cgroup tree. E.g. a global policy and several policies handling OOMs only in some memory cgroups. So having just a single policy is likely a no go. > How is it decided who gets to run before the other? Is it based on > order of attachment (which can be non-deterministic)? Yeah, now it's the order of attachment. > There was a lot of discussion on something similar for tc progs, and > we went with specific flags that capture partial ordering constraints > (instead of priorities that may collide). > https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net > It would be nice if we can find a way of making this consistent. I'll take a look, thanks! I hope that my naive approach might be good enough for the start and we can implement something more sophisticated later, but maybe I'm wrong here.
On 8/20/25 5:24 PM, Roman Gushchin wrote: >> How is it decided who gets to run before the other? Is it based on >> order of attachment (which can be non-deterministic)? > Yeah, now it's the order of attachment. > >> There was a lot of discussion on something similar for tc progs, and >> we went with specific flags that capture partial ordering constraints >> (instead of priorities that may collide). >> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net >> It would be nice if we can find a way of making this consistent. +1 The cgroup bpf prog has recently added the mprog api support also. If the simple order of attachment is not enough and needs to have specific ordering, we should make the bpf struct_ops support the same mprog api instead of asking each subsystem creating its own. fyi, another need for struct_ops ordering is to upgrade the BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the future. Slide 13 in https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 8/20/25 5:24 PM, Roman Gushchin wrote: >>> How is it decided who gets to run before the other? Is it based on >>> order of attachment (which can be non-deterministic)? >> Yeah, now it's the order of attachment. >> >>> There was a lot of discussion on something similar for tc progs, and >>> we went with specific flags that capture partial ordering constraints >>> (instead of priorities that may collide). >>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net >>> It would be nice if we can find a way of making this consistent. > > +1 > > The cgroup bpf prog has recently added the mprog api support also. If > the simple order of attachment is not enough and needs to have > specific ordering, we should make the bpf struct_ops support the same > mprog api instead of asking each subsystem creating its own. > > fyi, another need for struct_ops ordering is to upgrade the > BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the > future. Slide 13 in > https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view Does it mean it's better now to keep it simple in the context of oom patches with the plan to later reuse the generic struct_ops infrastructure? Honestly, I believe that the simple order of attachment should be good enough for quite a while, so I'd not over-complicate this, unless it's not fixable later. Thanks!
On 8/25/25 10:00 AM, Roman Gushchin wrote: > Martin KaFai Lau <martin.lau@linux.dev> writes: > >> On 8/20/25 5:24 PM, Roman Gushchin wrote: >>>> How is it decided who gets to run before the other? Is it based on >>>> order of attachment (which can be non-deterministic)? >>> Yeah, now it's the order of attachment. >>> >>>> There was a lot of discussion on something similar for tc progs, and >>>> we went with specific flags that capture partial ordering constraints >>>> (instead of priorities that may collide). >>>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net >>>> It would be nice if we can find a way of making this consistent. >> >> +1 >> >> The cgroup bpf prog has recently added the mprog api support also. If >> the simple order of attachment is not enough and needs to have >> specific ordering, we should make the bpf struct_ops support the same >> mprog api instead of asking each subsystem creating its own. >> >> fyi, another need for struct_ops ordering is to upgrade the >> BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the >> future. Slide 13 in >> https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view > > Does it mean it's better now to keep it simple in the context of oom > patches with the plan to later reuse the generic struct_ops > infrastructure? > > Honestly, I believe that the simple order of attachment should be > good enough for quite a while, so I'd not over-complicate this, > unless it's not fixable later. I think the simple attachment ordering is fine. Presumably the current link list in patch 1 can be replaced by the mprog in the future. Other experts can chime in if I have missed things. Once it needs to have an ordering api in the future, it should probably stay with mprog instead of each subsystem creating its own. The inspection tool (likely a subcmd in bpftool) can also be created to inspect the struct_ops order of a subsystem.
On Tue, Aug 26, 2025 at 11:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 8/25/25 10:00 AM, Roman Gushchin wrote: > > Martin KaFai Lau <martin.lau@linux.dev> writes: > > > >> On 8/20/25 5:24 PM, Roman Gushchin wrote: > >>>> How is it decided who gets to run before the other? Is it based on > >>>> order of attachment (which can be non-deterministic)? > >>> Yeah, now it's the order of attachment. > >>> > >>>> There was a lot of discussion on something similar for tc progs, and > >>>> we went with specific flags that capture partial ordering constraints > >>>> (instead of priorities that may collide). > >>>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net > >>>> It would be nice if we can find a way of making this consistent. > >> > >> +1 > >> > >> The cgroup bpf prog has recently added the mprog api support also. If > >> the simple order of attachment is not enough and needs to have > >> specific ordering, we should make the bpf struct_ops support the same > >> mprog api instead of asking each subsystem creating its own. > >> > >> fyi, another need for struct_ops ordering is to upgrade the > >> BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the > >> future. Slide 13 in > >> https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view > > > > Does it mean it's better now to keep it simple in the context of oom > > patches with the plan to later reuse the generic struct_ops > > infrastructure? > > > > Honestly, I believe that the simple order of attachment should be > > good enough for quite a while, so I'd not over-complicate this, > > unless it's not fixable later. > > I think the simple attachment ordering is fine. Presumably the current link list > in patch 1 can be replaced by the mprog in the future. Other experts can chime > in if I have missed things. I don't think the proposed approach of: list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { is extensible without breaking things. Sooner or later people will want bpf-oom handlers to be per container, so we have to think upfront how to do it. I would start with one bpf-oom prog per memcg and extend with mprog later. Effectively placing 'struct bpf_oom_ops *' into oc->memcg, and having one global bpf_oom_ops when oc->memcg == NULL. I'm sure other designs are possible, but lets make sure container scope is designed from the beginning. mprog-like multi prog behavior per container can be added later.
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes: > On Tue, Aug 26, 2025 at 11:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 8/25/25 10:00 AM, Roman Gushchin wrote: >> > Martin KaFai Lau <martin.lau@linux.dev> writes: >> > >> >> On 8/20/25 5:24 PM, Roman Gushchin wrote: >> >>>> How is it decided who gets to run before the other? Is it based on >> >>>> order of attachment (which can be non-deterministic)? >> >>> Yeah, now it's the order of attachment. >> >>> >> >>>> There was a lot of discussion on something similar for tc progs, and >> >>>> we went with specific flags that capture partial ordering constraints >> >>>> (instead of priorities that may collide). >> >>>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net >> >>>> It would be nice if we can find a way of making this consistent. >> >> >> >> +1 >> >> >> >> The cgroup bpf prog has recently added the mprog api support also. If >> >> the simple order of attachment is not enough and needs to have >> >> specific ordering, we should make the bpf struct_ops support the same >> >> mprog api instead of asking each subsystem creating its own. >> >> >> >> fyi, another need for struct_ops ordering is to upgrade the >> >> BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the >> >> future. Slide 13 in >> >> https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view >> > >> > Does it mean it's better now to keep it simple in the context of oom >> > patches with the plan to later reuse the generic struct_ops >> > infrastructure? >> > >> > Honestly, I believe that the simple order of attachment should be >> > good enough for quite a while, so I'd not over-complicate this, >> > unless it's not fixable later. >> >> I think the simple attachment ordering is fine. Presumably the current link list >> in patch 1 can be replaced by the mprog in the future. Other experts can chime >> in if I have missed things. > > I don't think the proposed approach of: > list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { > is extensible without breaking things. > Sooner or later people will want bpf-oom handlers to be per > container, so we have to think upfront how to do it. > I would start with one bpf-oom prog per memcg and extend with mprog later. > Effectively placing 'struct bpf_oom_ops *' into oc->memcg, > and having one global bpf_oom_ops when oc->memcg == NULL. > I'm sure other designs are possible, but lets make sure container scope > is designed from the beginning. > mprog-like multi prog behavior per container can be added later. Btw, what's the right way to attach struct ops to a cgroup, if there is one? Add a cgroup_id field to the struct and use it in the .reg() callback? Or there is something better? Thanks
Hello, Roman. How are you? On Tue, Sep 02, 2025 at 10:31:33AM -0700, Roman Gushchin wrote: ... > Btw, what's the right way to attach struct ops to a cgroup, if there is > one? Add a cgroup_id field to the struct and use it in the .reg() > callback? Or there is something better? So, I'm trying to do something similar with sched_ext. Right now, I only have a very rough prototype (I can attach multiple schedulers with warnings and they even can schedule for several seconds before melting down). However, the basic pieces should may still be useful. The branch is: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git scx-hier-prototype There are several pieces: - cgroup recently grew lifetime notifiers that you can hook in there to receive on/offline events. This is useful for initializing per-cgroup fields and cleaning up when cgroup dies: https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5469 - I'm passing in cgroup_id as an optional field in struct_ops and then in enable path, look up the matching cgroup, verify it can attach there and insert and update data structures accordingly: https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280 - I wanted to be able to group BPF programs together so that the related BPF timers, tracing progs and so on can call sched_ext kfuncs to operate on the associated scheduler instance. This currently isn't possible, so I'm using a really silly hack. I'm hoping we'd be able to get something better in the future: https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/commit/?h=scx-hier-prototype&id=b459b1f967fe1767783360761042cd36a1a5f2d6 Thanks. -- tejun
Tejun Heo <tj@kernel.org> writes: > Hello, Roman. How are you? Hi Tejun! Thank you for the links... > > On Tue, Sep 02, 2025 at 10:31:33AM -0700, Roman Gushchin wrote: > ... >> Btw, what's the right way to attach struct ops to a cgroup, if there is >> one? Add a cgroup_id field to the struct and use it in the .reg() >> callback? Or there is something better? > > So, I'm trying to do something similar with sched_ext. Right now, I only > have a very rough prototype (I can attach multiple schedulers with warnings > and they even can schedule for several seconds before melting down). > However, the basic pieces should may still be useful. The branch is: > > git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git scx-hier-prototype > > There are several pieces: > > - cgroup recently grew lifetime notifiers that you can hook in there to > receive on/offline events. This is useful for initializing per-cgroup > fields and cleaning up when cgroup dies: > > https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5469 This is neat, I might use this for the psi struct ops to give a user a chance to create new trigger(s) if a new cgroup is created. > > - I'm passing in cgroup_id as an optional field in struct_ops and then in > enable path, look up the matching cgroup, verify it can attach there and > insert and update data structures accordingly: > > https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280 Yeah, we discussed this option with Martin up in this thread. It doesn't look as the best possible solution, but maybe the best we have at the moment. Ideally, I want something like this: void test_oom(void) { struct test_oom *skel; int err, cgroup_fd; cgroup_fd = open(...); if (cgroup_fd < 0) goto cleanup; skel = test_oom__open_and_load(); if (!skel) goto cleanup; err = test_oom__attach_cgroup(skel, cgroup_fd); if (CHECK_FAIL(err)) goto cleanup;
Hello, On Wed, Sep 03, 2025 at 04:30:16PM -0700, Roman Gushchin wrote: ... > > - I'm passing in cgroup_id as an optional field in struct_ops and then in > > enable path, look up the matching cgroup, verify it can attach there and > > insert and update data structures accordingly: > > > > https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280 > > Yeah, we discussed this option with Martin up in this thread. It doesn't > look as the best possible solution, but maybe the best we have at the moment. > > Ideally, I want something like this: > > void test_oom(void) > { > struct test_oom *skel; > int err, cgroup_fd; > > cgroup_fd = open(...); > if (cgroup_fd < 0) > goto cleanup; > > skel = test_oom__open_and_load(); > if (!skel) > goto cleanup; > > err = test_oom__attach_cgroup(skel, cgroup_fd); > if (CHECK_FAIL(err)) > goto cleanup; Yeah, that'd look better but are there practical differences? The only one I can think of is fs based permission check but that can be done separately too. Thanks. -- tejun
Tejun Heo <tj@kernel.org> writes: > Hello, > > On Wed, Sep 03, 2025 at 04:30:16PM -0700, Roman Gushchin wrote: > ... >> > - I'm passing in cgroup_id as an optional field in struct_ops and then in >> > enable path, look up the matching cgroup, verify it can attach there and >> > insert and update data structures accordingly: >> > >> > https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280 >> >> Yeah, we discussed this option with Martin up in this thread. It doesn't >> look as the best possible solution, but maybe the best we have at the moment. >> >> Ideally, I want something like this: >> >> void test_oom(void) >> { >> struct test_oom *skel; >> int err, cgroup_fd; >> >> cgroup_fd = open(...); >> if (cgroup_fd < 0) >> goto cleanup; >> >> skel = test_oom__open_and_load(); >> if (!skel) >> goto cleanup; >> >> err = test_oom__attach_cgroup(skel, cgroup_fd); >> if (CHECK_FAIL(err)) >> goto cleanup; > > Yeah, that'd look better but are there practical differences? The only one I > can think of is fs based permission check but that can be done separately > too. The practical difference is that a single struct ops can be attached to multiple cgroups.
On Thu, Sep 4, 2025 at 7:32 AM Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Tejun Heo <tj@kernel.org> writes: > > > Hello, > > > > On Wed, Sep 03, 2025 at 04:30:16PM -0700, Roman Gushchin wrote: > > ... > >> > - I'm passing in cgroup_id as an optional field in struct_ops and then in > >> > enable path, look up the matching cgroup, verify it can attach there and > >> > insert and update data structures accordingly: > >> > > >> > https://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git/tree/kernel/sched/ext.c?h=scx-hier-prototype#n5280 > >> > >> Yeah, we discussed this option with Martin up in this thread. It doesn't > >> look as the best possible solution, but maybe the best we have at the moment. > >> > >> Ideally, I want something like this: > >> > >> void test_oom(void) > >> { > >> struct test_oom *skel; > >> int err, cgroup_fd; > >> > >> cgroup_fd = open(...); > >> if (cgroup_fd < 0) > >> goto cleanup; > >> > >> skel = test_oom__open_and_load(); > >> if (!skel) > >> goto cleanup; > >> > >> err = test_oom__attach_cgroup(skel, cgroup_fd); > >> if (CHECK_FAIL(err)) > >> goto cleanup; > > > > Yeah, that'd look better but are there practical differences? The only one I > > can think of is fs based permission check but that can be done separately > > too. > > The practical difference is that a single struct ops can be attached > to multiple cgroups. +1 Attaching the same scheduler to multiple cgroups also sounds useful. I feel sched-ext should use cgroup_fd too and do scx_sub_enable() at attach time instead of load time. Then scx_sub_disable() can happen at link detach. Looks more flexible from user pov.
Hello, On Thu, Sep 04, 2025 at 09:26:45AM -0700, Alexei Starovoitov wrote: ... > > >> err = test_oom__attach_cgroup(skel, cgroup_fd); > > >> if (CHECK_FAIL(err)) > > >> goto cleanup; > > > > > > Yeah, that'd look better but are there practical differences? The only one I > > > can think of is fs based permission check but that can be done separately > > > too. > > > > The practical difference is that a single struct ops can be attached > > to multiple cgroups. > > +1 > Attaching the same scheduler to multiple cgroups also sounds useful. > I feel sched-ext should use cgroup_fd too and do scx_sub_enable() at > attach time instead of load time. > Then scx_sub_disable() can happen at link detach. > Looks more flexible from user pov. Nothing wrong with that but I'm not sure that'd have practical user-noticeable benefits for sched_ext. Also, would it affect how associated programs can identify which instance they belong to? At least from sched_ext POV, that's a lot more important than the ability to attach the same programs in multiple places. Thanks. -- tejun
On 9/2/25 10:31 AM, Roman Gushchin wrote: > Btw, what's the right way to attach struct ops to a cgroup, if there is > one? Add a cgroup_id field to the struct and use it in the .reg() Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to attach the same bpf_oom_ops to multiple cgroups. > callback? Or there is something better? There is a link_create.target_fd in the "union bpf_attr". The cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be used here also. This will limit it to link attach only. Meaning the SEC(".struct_ops.link") is supported but not the older SEC(".struct_ops"). I think this should be fine.
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 9/2/25 10:31 AM, Roman Gushchin wrote: >> Btw, what's the right way to attach struct ops to a cgroup, if there is >> one? Add a cgroup_id field to the struct and use it in the .reg() > > Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to > attach the same bpf_oom_ops to multiple cgroups. > >> callback? Or there is something better? > > There is a link_create.target_fd in the "union bpf_attr". The > cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be > used here also. This will limit it to link attach only. Meaning the > SEC(".struct_ops.link") is supported but not the older > SEC(".struct_ops"). I think this should be fine. I thought a bit more about it (sorry for the delay): if we want to be able to attach a single struct ops to multiple cgroups (and potentially other objects, e.g. sockets), we can't really use the existing struct ops's bpf_link. So I guess we need to add a new .attach() function beside .reg() which will take the existing link and struct bpf_attr as arguments and return a new bpf_link. And in libbpf we need a corresponding new bpf_link__attach_cgroup(). Does it sound right?
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 9/2/25 10:31 AM, Roman Gushchin wrote: >> Btw, what's the right way to attach struct ops to a cgroup, if there is >> one? Add a cgroup_id field to the struct and use it in the .reg() > > Adding a cgroup id/fd field to the struct bpf_oom_ops will be hard to > attach the same bpf_oom_ops to multiple cgroups. Yeah, this is what I thought too, it doesn't look as an attractive path. > >> callback? Or there is something better? > > There is a link_create.target_fd in the "union bpf_attr". The > cgroup_bpf_link_attach() is using it as cgroup fd. May be it can be > used here also. This will limit it to link attach only. Meaning the > SEC(".struct_ops.link") is supported but not the older > SEC(".struct_ops"). I think this should be fine. I'll take a look, thank you!
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes: > On Tue, Aug 26, 2025 at 11:01 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 8/25/25 10:00 AM, Roman Gushchin wrote: >> > Martin KaFai Lau <martin.lau@linux.dev> writes: >> > >> >> On 8/20/25 5:24 PM, Roman Gushchin wrote: >> >>>> How is it decided who gets to run before the other? Is it based on >> >>>> order of attachment (which can be non-deterministic)? >> >>> Yeah, now it's the order of attachment. >> >>> >> >>>> There was a lot of discussion on something similar for tc progs, and >> >>>> we went with specific flags that capture partial ordering constraints >> >>>> (instead of priorities that may collide). >> >>>> https://lore.kernel.org/all/20230719140858.13224-2-daniel@iogearbox.net >> >>>> It would be nice if we can find a way of making this consistent. >> >> >> >> +1 >> >> >> >> The cgroup bpf prog has recently added the mprog api support also. If >> >> the simple order of attachment is not enough and needs to have >> >> specific ordering, we should make the bpf struct_ops support the same >> >> mprog api instead of asking each subsystem creating its own. >> >> >> >> fyi, another need for struct_ops ordering is to upgrade the >> >> BPF_PROG_TYPE_SOCK_OPS api to struct_ops for easier extension in the >> >> future. Slide 13 in >> >> https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view >> > >> > Does it mean it's better now to keep it simple in the context of oom >> > patches with the plan to later reuse the generic struct_ops >> > infrastructure? >> > >> > Honestly, I believe that the simple order of attachment should be >> > good enough for quite a while, so I'd not over-complicate this, >> > unless it's not fixable later. >> >> I think the simple attachment ordering is fine. Presumably the current link list >> in patch 1 can be replaced by the mprog in the future. Other experts can chime >> in if I have missed things. > > I don't think the proposed approach of: > list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { > is extensible without breaking things. > Sooner or later people will want bpf-oom handlers to be per > container, so we have to think upfront how to do it. > I would start with one bpf-oom prog per memcg and extend with mprog later. > Effectively placing 'struct bpf_oom_ops *' into oc->memcg, > and having one global bpf_oom_ops when oc->memcg == NULL. > I'm sure other designs are possible, but lets make sure container scope > is designed from the beginning. > mprog-like multi prog behavior per container can be added later. Sounds good to me, will implement something like this in the next version. Thanks!
On Thu, 21 Aug 2025 at 02:25, Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Kumar Kartikeya Dwivedi <memxor@gmail.com> writes: > > > On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote: > >> > >> Introduce a bpf struct ops for implementing custom OOM handling policies. > >> > >> The struct ops provides the bpf_handle_out_of_memory() callback, > >> which expected to return 1 if it was able to free some memory and 0 > >> otherwise. > >> > >> In the latter case it's guaranteed that the in-kernel OOM killer will > >> be invoked. Otherwise the kernel also checks the bpf_memory_freed > >> field of the oom_control structure, which is expected to be set by > >> kfuncs suitable for releasing memory. It's a safety mechanism which > >> prevents a bpf program to claim forward progress without actually > >> releasing memory. The callback program is sleepable to enable using > >> iterators, e.g. cgroup iterators. > >> > >> The callback receives struct oom_control as an argument, so it can > >> easily filter out OOM's it doesn't want to handle, e.g. global vs > >> memcg OOM's. > >> > >> The callback is executed just before the kernel victim task selection > >> algorithm, so all heuristics and sysctls like panic on oom, > >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task > >> are respected. > >> > >> The struct ops also has the name field, which allows to define a > >> custom name for the implemented policy. It's printed in the OOM report > >> in the oom_policy=<policy> format. "default" is printed if bpf is not > >> used or policy name is not specified. > >> > >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 > >> oom_policy=bpf_test_policy > >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) > >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 > >> [ 112.698167] Call Trace: > >> [ 112.698177] <TASK> > >> [ 112.698182] dump_stack_lvl+0x4d/0x70 > >> [ 112.698192] dump_header+0x59/0x1c6 > >> [ 112.698199] oom_kill_process.cold+0x8/0xef > >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 > >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 > >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf > >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 > >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 > >> [ 112.698250] out_of_memory+0xab/0x5c0 > >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 > >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 > >> [ 112.698288] charge_memcg+0x2f/0xc0 > >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 > >> [ 112.698299] do_anonymous_page+0x40f/0xa50 > >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 > >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 > >> [ 112.698335] handle_mm_fault+0xe6/0x370 > >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 > >> [ 112.698354] exc_page_fault+0x75/0x1d0 > >> [ 112.698363] asm_exc_page_fault+0x26/0x30 > >> [ 112.698366] RIP: 0033:0x7fa97236db00 > >> > >> It's possible to load multiple bpf struct programs. In the case of > >> oom, they will be executed one by one in the same order they been > >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 > >> - an indication that the memory was freed. This allows to have > >> multiple bpf programs to focus on different types of OOM's - e.g. > >> one program can only handle memcg OOM's in one memory cgroup. > >> But the filtering is done in bpf - so it's fully flexible. > > > > I think a natural question here is ordering. Is this ability to have > > multiple OOM programs critical right now? > > Good question. Initially I had only supported a single bpf policy. > But then I realized that likely people would want to have different > policies handling different parts of the cgroup tree. > E.g. a global policy and several policies handling OOMs only > in some memory cgroups. > So having just a single policy is likely a no go. If the ordering is more to facilitate scoping, would it then be better to support attaching the policy to specific memcg/cgroup? There is then one global policy if need be (by attaching to root), but descendants can have their own which takes precedence, if it doesn't act, we walk up the hierarchy and find the next handler in the parent cgroup etc. all the way to the root until one of them returns 1. > > [...]
Kumar Kartikeya Dwivedi <memxor@gmail.com> writes: > On Thu, 21 Aug 2025 at 02:25, Roman Gushchin <roman.gushchin@linux.dev> wrote: >> >> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes: >> >> > On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote: >> >> >> >> Introduce a bpf struct ops for implementing custom OOM handling policies. >> >> >> >> The struct ops provides the bpf_handle_out_of_memory() callback, >> >> which expected to return 1 if it was able to free some memory and 0 >> >> otherwise. >> >> >> >> In the latter case it's guaranteed that the in-kernel OOM killer will >> >> be invoked. Otherwise the kernel also checks the bpf_memory_freed >> >> field of the oom_control structure, which is expected to be set by >> >> kfuncs suitable for releasing memory. It's a safety mechanism which >> >> prevents a bpf program to claim forward progress without actually >> >> releasing memory. The callback program is sleepable to enable using >> >> iterators, e.g. cgroup iterators. >> >> >> >> The callback receives struct oom_control as an argument, so it can >> >> easily filter out OOM's it doesn't want to handle, e.g. global vs >> >> memcg OOM's. >> >> >> >> The callback is executed just before the kernel victim task selection >> >> algorithm, so all heuristics and sysctls like panic on oom, >> >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task >> >> are respected. >> >> >> >> The struct ops also has the name field, which allows to define a >> >> custom name for the implemented policy. It's printed in the OOM report >> >> in the oom_policy=<policy> format. "default" is printed if bpf is not >> >> used or policy name is not specified. >> >> >> >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 >> >> oom_policy=bpf_test_policy >> >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) >> >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 >> >> [ 112.698167] Call Trace: >> >> [ 112.698177] <TASK> >> >> [ 112.698182] dump_stack_lvl+0x4d/0x70 >> >> [ 112.698192] dump_header+0x59/0x1c6 >> >> [ 112.698199] oom_kill_process.cold+0x8/0xef >> >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 >> >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 >> >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf >> >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 >> >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 >> >> [ 112.698250] out_of_memory+0xab/0x5c0 >> >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 >> >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 >> >> [ 112.698288] charge_memcg+0x2f/0xc0 >> >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 >> >> [ 112.698299] do_anonymous_page+0x40f/0xa50 >> >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 >> >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 >> >> [ 112.698335] handle_mm_fault+0xe6/0x370 >> >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 >> >> [ 112.698354] exc_page_fault+0x75/0x1d0 >> >> [ 112.698363] asm_exc_page_fault+0x26/0x30 >> >> [ 112.698366] RIP: 0033:0x7fa97236db00 >> >> >> >> It's possible to load multiple bpf struct programs. In the case of >> >> oom, they will be executed one by one in the same order they been >> >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 >> >> - an indication that the memory was freed. This allows to have >> >> multiple bpf programs to focus on different types of OOM's - e.g. >> >> one program can only handle memcg OOM's in one memory cgroup. >> >> But the filtering is done in bpf - so it's fully flexible. >> > >> > I think a natural question here is ordering. Is this ability to have >> > multiple OOM programs critical right now? >> >> Good question. Initially I had only supported a single bpf policy. >> But then I realized that likely people would want to have different >> policies handling different parts of the cgroup tree. >> E.g. a global policy and several policies handling OOMs only >> in some memory cgroups. >> So having just a single policy is likely a no go. > > If the ordering is more to facilitate scoping, would it then be better > to support attaching the policy to specific memcg/cgroup? Well, it has some advantages and disadvantages. First, it will require way more infrastructure on the memcg side. Second, the interface is not super clear: we don't want to have a struct ops per cgroup, I guess. And in many case a single policy for all memcgs is just fine, so asking the user to attach it to all memcgs is just adding a toil and creating all kinds of races. So I see your point, but I'm not yet convinced, to be honest. Thanks!
On Wed, Aug 20, 2025 at 7:22 PM Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Kumar Kartikeya Dwivedi <memxor@gmail.com> writes: > > > On Thu, 21 Aug 2025 at 02:25, Roman Gushchin <roman.gushchin@linux.dev> wrote: > >> > >> Kumar Kartikeya Dwivedi <memxor@gmail.com> writes: > >> > >> > On Mon, 18 Aug 2025 at 19:01, Roman Gushchin <roman.gushchin@linux.dev> wrote: > >> >> > >> >> Introduce a bpf struct ops for implementing custom OOM handling policies. > >> >> > >> >> The struct ops provides the bpf_handle_out_of_memory() callback, > >> >> which expected to return 1 if it was able to free some memory and 0 > >> >> otherwise. > >> >> > >> >> In the latter case it's guaranteed that the in-kernel OOM killer will > >> >> be invoked. Otherwise the kernel also checks the bpf_memory_freed > >> >> field of the oom_control structure, which is expected to be set by > >> >> kfuncs suitable for releasing memory. It's a safety mechanism which > >> >> prevents a bpf program to claim forward progress without actually > >> >> releasing memory. The callback program is sleepable to enable using > >> >> iterators, e.g. cgroup iterators. > >> >> > >> >> The callback receives struct oom_control as an argument, so it can > >> >> easily filter out OOM's it doesn't want to handle, e.g. global vs > >> >> memcg OOM's. > >> >> > >> >> The callback is executed just before the kernel victim task selection > >> >> algorithm, so all heuristics and sysctls like panic on oom, > >> >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task > >> >> are respected. > >> >> > >> >> The struct ops also has the name field, which allows to define a > >> >> custom name for the implemented policy. It's printed in the OOM report > >> >> in the oom_policy=<policy> format. "default" is printed if bpf is not > >> >> used or policy name is not specified. > >> >> > >> >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 > >> >> oom_policy=bpf_test_policy > >> >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) > >> >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 > >> >> [ 112.698167] Call Trace: > >> >> [ 112.698177] <TASK> > >> >> [ 112.698182] dump_stack_lvl+0x4d/0x70 > >> >> [ 112.698192] dump_header+0x59/0x1c6 > >> >> [ 112.698199] oom_kill_process.cold+0x8/0xef > >> >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 > >> >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 > >> >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf > >> >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 > >> >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 > >> >> [ 112.698250] out_of_memory+0xab/0x5c0 > >> >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 > >> >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 > >> >> [ 112.698288] charge_memcg+0x2f/0xc0 > >> >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 > >> >> [ 112.698299] do_anonymous_page+0x40f/0xa50 > >> >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 > >> >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 > >> >> [ 112.698335] handle_mm_fault+0xe6/0x370 > >> >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 > >> >> [ 112.698354] exc_page_fault+0x75/0x1d0 > >> >> [ 112.698363] asm_exc_page_fault+0x26/0x30 > >> >> [ 112.698366] RIP: 0033:0x7fa97236db00 > >> >> > >> >> It's possible to load multiple bpf struct programs. In the case of > >> >> oom, they will be executed one by one in the same order they been > >> >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 > >> >> - an indication that the memory was freed. This allows to have > >> >> multiple bpf programs to focus on different types of OOM's - e.g. > >> >> one program can only handle memcg OOM's in one memory cgroup. > >> >> But the filtering is done in bpf - so it's fully flexible. > >> > > >> > I think a natural question here is ordering. Is this ability to have > >> > multiple OOM programs critical right now? > >> > >> Good question. Initially I had only supported a single bpf policy. > >> But then I realized that likely people would want to have different > >> policies handling different parts of the cgroup tree. > >> E.g. a global policy and several policies handling OOMs only > >> in some memory cgroups. > >> So having just a single policy is likely a no go. > > > > If the ordering is more to facilitate scoping, would it then be better > > to support attaching the policy to specific memcg/cgroup? > > Well, it has some advantages and disadvantages. First, it will require > way more infrastructure on the memcg side. Second, the interface is not > super clear: we don't want to have a struct ops per cgroup, I guess. > And in many case a single policy for all memcgs is just fine, so asking > the user to attach it to all memcgs is just adding a toil and creating > all kinds of races. > So I see your point, but I'm not yet convinced, to be honest. I would suggest keeping it simple until we know there is a need to prioritize between multiple oom-killers. > > Thanks! >
On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Introduce a bpf struct ops for implementing custom OOM handling policies. > > The struct ops provides the bpf_handle_out_of_memory() callback, > which expected to return 1 if it was able to free some memory and 0 > otherwise. > > In the latter case it's guaranteed that the in-kernel OOM killer will > be invoked. Otherwise the kernel also checks the bpf_memory_freed > field of the oom_control structure, which is expected to be set by > kfuncs suitable for releasing memory. It's a safety mechanism which > prevents a bpf program to claim forward progress without actually > releasing memory. The callback program is sleepable to enable using > iterators, e.g. cgroup iterators. > > The callback receives struct oom_control as an argument, so it can > easily filter out OOM's it doesn't want to handle, e.g. global vs > memcg OOM's. > > The callback is executed just before the kernel victim task selection > algorithm, so all heuristics and sysctls like panic on oom, > sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task > are respected. > > The struct ops also has the name field, which allows to define a > custom name for the implemented policy. It's printed in the OOM report > in the oom_policy=<policy> format. "default" is printed if bpf is not > used or policy name is not specified. > > [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 > oom_policy=bpf_test_policy > [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) > [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 > [ 112.698167] Call Trace: > [ 112.698177] <TASK> > [ 112.698182] dump_stack_lvl+0x4d/0x70 > [ 112.698192] dump_header+0x59/0x1c6 > [ 112.698199] oom_kill_process.cold+0x8/0xef > [ 112.698206] bpf_oom_kill_process+0x59/0xb0 > [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 > [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf > [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 > [ 112.698240] bpf_handle_oom+0x11a/0x1e0 > [ 112.698250] out_of_memory+0xab/0x5c0 > [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 > [ 112.698274] try_charge_memcg+0x4b5/0x7e0 > [ 112.698288] charge_memcg+0x2f/0xc0 > [ 112.698293] __mem_cgroup_charge+0x30/0xc0 > [ 112.698299] do_anonymous_page+0x40f/0xa50 > [ 112.698311] __handle_mm_fault+0xbba/0x1140 > [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 > [ 112.698335] handle_mm_fault+0xe6/0x370 > [ 112.698343] do_user_addr_fault+0x211/0x6a0 > [ 112.698354] exc_page_fault+0x75/0x1d0 > [ 112.698363] asm_exc_page_fault+0x26/0x30 > [ 112.698366] RIP: 0033:0x7fa97236db00 > > It's possible to load multiple bpf struct programs. In the case of > oom, they will be executed one by one in the same order they been > loaded until one of them returns 1 and bpf_memory_freed is set to 1 > - an indication that the memory was freed. This allows to have > multiple bpf programs to focus on different types of OOM's - e.g. > one program can only handle memcg OOM's in one memory cgroup. > But the filtering is done in bpf - so it's fully flexible. > > Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> > --- > include/linux/bpf_oom.h | 49 +++++++++++++ > include/linux/oom.h | 8 ++ > mm/Makefile | 3 + > mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++ > mm/oom_kill.c | 22 +++++- > 5 files changed, 237 insertions(+), 2 deletions(-) > create mode 100644 include/linux/bpf_oom.h > create mode 100644 mm/bpf_oom.c > > diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h > new file mode 100644 > index 000000000000..29cb5ea41d97 > --- /dev/null > +++ b/include/linux/bpf_oom.h > @@ -0,0 +1,49 @@ > +/* SPDX-License-Identifier: GPL-2.0+ */ > + > +#ifndef __BPF_OOM_H > +#define __BPF_OOM_H > + > +struct bpf_oom; > +struct oom_control; > + > +#define BPF_OOM_NAME_MAX_LEN 64 > + > +struct bpf_oom_ops { > + /** > + * @handle_out_of_memory: Out of memory bpf handler, called before > + * the in-kernel OOM killer. > + * @oc: OOM control structure > + * > + * Should return 1 if some memory was freed up, otherwise > + * the in-kernel OOM killer is invoked. > + */ > + int (*handle_out_of_memory)(struct oom_control *oc); > + > + /** > + * @name: BPF OOM policy name > + */ > + char name[BPF_OOM_NAME_MAX_LEN]; Why should the name be a part of ops structure? IMO it's not an attribute of the operations but rather of the oom handler which is represented by bpf_oom here. > + > + /* Private */ > + struct bpf_oom *bpf_oom; > +}; > + > +#ifdef CONFIG_BPF_SYSCALL > +/** > + * @bpf_handle_oom: handle out of memory using bpf programs > + * @oc: OOM control structure > + * > + * Returns true if a bpf oom program was executed, returned 1 > + * and some memory was actually freed. The above comment is unclear, please clarify. > + */ > +bool bpf_handle_oom(struct oom_control *oc); > + > +#else /* CONFIG_BPF_SYSCALL */ > +static inline bool bpf_handle_oom(struct oom_control *oc) > +{ > + return false; > +} > + > +#endif /* CONFIG_BPF_SYSCALL */ > + > +#endif /* __BPF_OOM_H */ > diff --git a/include/linux/oom.h b/include/linux/oom.h > index 1e0fc6931ce9..ef453309b7ea 100644 > --- a/include/linux/oom.h > +++ b/include/linux/oom.h > @@ -51,6 +51,14 @@ struct oom_control { > > /* Used to print the constraint info. */ > enum oom_constraint constraint; > + > +#ifdef CONFIG_BPF_SYSCALL > + /* Used by the bpf oom implementation to mark the forward progress */ > + bool bpf_memory_freed; > + > + /* Policy name */ > + const char *bpf_policy_name; > +#endif > }; > > extern struct mutex oom_lock; > diff --git a/mm/Makefile b/mm/Makefile > index 1a7a11d4933d..a714aba03759 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > ifdef CONFIG_SWAP > obj-$(CONFIG_MEMCG) += swap_cgroup.o > endif > +ifdef CONFIG_BPF_SYSCALL > +obj-y += bpf_oom.o > +endif > obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o > obj-$(CONFIG_GUP_TEST) += gup_test.o > obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o > diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c > new file mode 100644 > index 000000000000..47633046819c > --- /dev/null > +++ b/mm/bpf_oom.c > @@ -0,0 +1,157 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * BPF-driven OOM killer customization > + * > + * Author: Roman Gushchin <roman.gushchin@linux.dev> > + */ > + > +#include <linux/bpf.h> > +#include <linux/oom.h> > +#include <linux/bpf_oom.h> > +#include <linux/srcu.h> > + > +DEFINE_STATIC_SRCU(bpf_oom_srcu); > +static DEFINE_SPINLOCK(bpf_oom_lock); > +static LIST_HEAD(bpf_oom_handlers); > + > +struct bpf_oom { Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called bpf_oom_ops->handler. > + struct bpf_oom_ops *ops; > + struct list_head node; > + struct srcu_struct srcu; > +}; > + > +bool bpf_handle_oom(struct oom_control *oc) > +{ > + struct bpf_oom_ops *ops; > + struct bpf_oom *bpf_oom; > + int list_idx, idx, ret = 0; > + > + oc->bpf_memory_freed = false; > + > + list_idx = srcu_read_lock(&bpf_oom_srcu); > + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { > + ops = READ_ONCE(bpf_oom->ops); > + if (!ops || !ops->handle_out_of_memory) > + continue; > + idx = srcu_read_lock(&bpf_oom->srcu); > + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] : > + "bpf_defined_policy"; > + ret = ops->handle_out_of_memory(oc); > + oc->bpf_policy_name = NULL; > + srcu_read_unlock(&bpf_oom->srcu, idx); > + > + if (ret && oc->bpf_memory_freed) IIUC ret and oc->bpf_memory_freed seem to reflect the same state: handler successfully freed some memory. Could you please clarify when they differ? > + break; > + } > + srcu_read_unlock(&bpf_oom_srcu, list_idx); > + > + return ret && oc->bpf_memory_freed; > +} > + > +static int __handle_out_of_memory(struct oom_control *oc) > +{ > + return 0; > +} > + > +static struct bpf_oom_ops __bpf_oom_ops = { > + .handle_out_of_memory = __handle_out_of_memory, > +}; > + > +static const struct bpf_func_proto * > +bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > +{ > + return tracing_prog_func_proto(func_id, prog); > +} > + > +static bool bpf_oom_ops_is_valid_access(int off, int size, > + enum bpf_access_type type, > + const struct bpf_prog *prog, > + struct bpf_insn_access_aux *info) > +{ > + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); > +} > + > +static const struct bpf_verifier_ops bpf_oom_verifier_ops = { > + .get_func_proto = bpf_oom_func_proto, > + .is_valid_access = bpf_oom_ops_is_valid_access, > +}; > + > +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link) > +{ > + struct bpf_oom_ops *ops = kdata; > + struct bpf_oom *bpf_oom; > + int ret; > + > + bpf_oom = kmalloc(sizeof(*bpf_oom), GFP_KERNEL_ACCOUNT); > + if (!bpf_oom) > + return -ENOMEM; > + > + ret = init_srcu_struct(&bpf_oom->srcu); > + if (ret) { > + kfree(bpf_oom); > + return ret; > + } > + > + WRITE_ONCE(bpf_oom->ops, ops); > + ops->bpf_oom = bpf_oom; > + > + spin_lock(&bpf_oom_lock); > + list_add_rcu(&bpf_oom->node, &bpf_oom_handlers); > + spin_unlock(&bpf_oom_lock); > + > + return 0; > +} > + > +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link) > +{ > + struct bpf_oom_ops *ops = kdata; > + struct bpf_oom *bpf_oom = ops->bpf_oom; > + > + WRITE_ONCE(bpf_oom->ops, NULL); > + > + spin_lock(&bpf_oom_lock); > + list_del_rcu(&bpf_oom->node); > + spin_unlock(&bpf_oom_lock); > + > + synchronize_srcu(&bpf_oom->srcu); > + > + kfree(bpf_oom); > +} > + > +static int bpf_oom_ops_init_member(const struct btf_type *t, > + const struct btf_member *member, > + void *kdata, const void *udata) > +{ > + const struct bpf_oom_ops *uops = (const struct bpf_oom_ops *)udata; > + struct bpf_oom_ops *ops = (struct bpf_oom_ops *)kdata; > + u32 moff = __btf_member_bit_offset(t, member) / 8; > + > + switch (moff) { > + case offsetof(struct bpf_oom_ops, name): > + strscpy_pad(ops->name, uops->name, sizeof(ops->name)); > + return 1; > + } > + return 0; > +} > + > +static int bpf_oom_ops_init(struct btf *btf) > +{ > + return 0; > +} > + > +static struct bpf_struct_ops bpf_oom_bpf_ops = { > + .verifier_ops = &bpf_oom_verifier_ops, > + .reg = bpf_oom_ops_reg, > + .unreg = bpf_oom_ops_unreg, > + .init_member = bpf_oom_ops_init_member, > + .init = bpf_oom_ops_init, > + .name = "bpf_oom_ops", > + .owner = THIS_MODULE, > + .cfi_stubs = &__bpf_oom_ops > +}; > + > +static int __init bpf_oom_struct_ops_init(void) > +{ > + return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops); > +} > +late_initcall(bpf_oom_struct_ops_init); > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index 25923cfec9c6..ad7bd65061d6 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -45,6 +45,7 @@ > #include <linux/mmu_notifier.h> > #include <linux/cred.h> > #include <linux/nmi.h> > +#include <linux/bpf_oom.h> > > #include <asm/tlb.h> > #include "internal.h" > @@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = { > [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", > }; > > +static const char *oom_policy_name(struct oom_control *oc) > +{ > +#ifdef CONFIG_BPF_SYSCALL > + if (oc->bpf_policy_name) > + return oc->bpf_policy_name; > +#endif > + return "default"; > +} > + > /* > * Determine the type of allocation constraint. > */ > @@ -458,9 +468,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) > > static void dump_header(struct oom_control *oc) > { > - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", > + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\noom_policy=%s\n", > current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, > - current->signal->oom_score_adj); > + current->signal->oom_score_adj, > + oom_policy_name(oc)); > if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) > pr_warn("COMPACTION is disabled!!!\n"); > > @@ -1161,6 +1172,13 @@ bool out_of_memory(struct oom_control *oc) > return true; > } > > + /* > + * Let bpf handle the OOM first. If it was able to free up some memory, > + * bail out. Otherwise fall back to the kernel OOM killer. > + */ > + if (bpf_handle_oom(oc)) > + return true; > + > select_bad_process(oc); > /* Found nothing?!?! */ > if (!oc->chosen) { > -- > 2.50.1 >
Suren Baghdasaryan <surenb@google.com> writes: > On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin > <roman.gushchin@linux.dev> wrote: >> >> Introduce a bpf struct ops for implementing custom OOM handling policies. >> >> The struct ops provides the bpf_handle_out_of_memory() callback, >> which expected to return 1 if it was able to free some memory and 0 >> otherwise. >> >> In the latter case it's guaranteed that the in-kernel OOM killer will >> be invoked. Otherwise the kernel also checks the bpf_memory_freed >> field of the oom_control structure, which is expected to be set by >> kfuncs suitable for releasing memory. It's a safety mechanism which >> prevents a bpf program to claim forward progress without actually >> releasing memory. The callback program is sleepable to enable using >> iterators, e.g. cgroup iterators. >> >> The callback receives struct oom_control as an argument, so it can >> easily filter out OOM's it doesn't want to handle, e.g. global vs >> memcg OOM's. >> >> The callback is executed just before the kernel victim task selection >> algorithm, so all heuristics and sysctls like panic on oom, >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task >> are respected. >> >> The struct ops also has the name field, which allows to define a >> custom name for the implemented policy. It's printed in the OOM report >> in the oom_policy=<policy> format. "default" is printed if bpf is not >> used or policy name is not specified. >> >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 >> oom_policy=bpf_test_policy >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 >> [ 112.698167] Call Trace: >> [ 112.698177] <TASK> >> [ 112.698182] dump_stack_lvl+0x4d/0x70 >> [ 112.698192] dump_header+0x59/0x1c6 >> [ 112.698199] oom_kill_process.cold+0x8/0xef >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 >> [ 112.698250] out_of_memory+0xab/0x5c0 >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 >> [ 112.698288] charge_memcg+0x2f/0xc0 >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 >> [ 112.698299] do_anonymous_page+0x40f/0xa50 >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 >> [ 112.698335] handle_mm_fault+0xe6/0x370 >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 >> [ 112.698354] exc_page_fault+0x75/0x1d0 >> [ 112.698363] asm_exc_page_fault+0x26/0x30 >> [ 112.698366] RIP: 0033:0x7fa97236db00 >> >> It's possible to load multiple bpf struct programs. In the case of >> oom, they will be executed one by one in the same order they been >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 >> - an indication that the memory was freed. This allows to have >> multiple bpf programs to focus on different types of OOM's - e.g. >> one program can only handle memcg OOM's in one memory cgroup. >> But the filtering is done in bpf - so it's fully flexible. >> >> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> >> --- >> include/linux/bpf_oom.h | 49 +++++++++++++ >> include/linux/oom.h | 8 ++ >> mm/Makefile | 3 + >> mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++ >> mm/oom_kill.c | 22 +++++- >> 5 files changed, 237 insertions(+), 2 deletions(-) >> create mode 100644 include/linux/bpf_oom.h >> create mode 100644 mm/bpf_oom.c >> >> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h >> new file mode 100644 >> index 000000000000..29cb5ea41d97 >> --- /dev/null >> +++ b/include/linux/bpf_oom.h >> @@ -0,0 +1,49 @@ >> +/* SPDX-License-Identifier: GPL-2.0+ */ >> + >> +#ifndef __BPF_OOM_H >> +#define __BPF_OOM_H >> + >> +struct bpf_oom; >> +struct oom_control; >> + >> +#define BPF_OOM_NAME_MAX_LEN 64 >> + >> +struct bpf_oom_ops { >> + /** >> + * @handle_out_of_memory: Out of memory bpf handler, called before >> + * the in-kernel OOM killer. >> + * @oc: OOM control structure >> + * >> + * Should return 1 if some memory was freed up, otherwise >> + * the in-kernel OOM killer is invoked. >> + */ >> + int (*handle_out_of_memory)(struct oom_control *oc); >> + >> + /** >> + * @name: BPF OOM policy name >> + */ >> + char name[BPF_OOM_NAME_MAX_LEN]; > > Why should the name be a part of ops structure? IMO it's not an > attribute of the operations but rather of the oom handler which is > represented by bpf_oom here. The ops structure describes a user-defined oom policy. Currently it's just one handler and the policy name. Later additional handlers can be added, e.g. a handler to control the dmesg output. bpf_oom is an implementation detail: it's basically an extension to struct bpf_oom_ops which contains "private" fields required for the internal machinery. > >> + >> + /* Private */ >> + struct bpf_oom *bpf_oom; >> +}; >> + >> +#ifdef CONFIG_BPF_SYSCALL >> +/** >> + * @bpf_handle_oom: handle out of memory using bpf programs >> + * @oc: OOM control structure >> + * >> + * Returns true if a bpf oom program was executed, returned 1 >> + * and some memory was actually freed. > > The above comment is unclear, please clarify. Fixed, thanks. /** * @bpf_handle_oom: handle out of memory condition using bpf * @oc: OOM control structure * * Returns true if some memory was freed. */ bool bpf_handle_oom(struct oom_control *oc); > >> + */ >> +bool bpf_handle_oom(struct oom_control *oc); >> + >> +#else /* CONFIG_BPF_SYSCALL */ >> +static inline bool bpf_handle_oom(struct oom_control *oc) >> +{ >> + return false; >> +} >> + >> +#endif /* CONFIG_BPF_SYSCALL */ >> + >> +#endif /* __BPF_OOM_H */ >> diff --git a/include/linux/oom.h b/include/linux/oom.h >> index 1e0fc6931ce9..ef453309b7ea 100644 >> --- a/include/linux/oom.h >> +++ b/include/linux/oom.h >> @@ -51,6 +51,14 @@ struct oom_control { >> >> /* Used to print the constraint info. */ >> enum oom_constraint constraint; >> + >> +#ifdef CONFIG_BPF_SYSCALL >> + /* Used by the bpf oom implementation to mark the forward progress */ >> + bool bpf_memory_freed; >> + >> + /* Policy name */ >> + const char *bpf_policy_name; >> +#endif >> }; >> >> extern struct mutex oom_lock; >> diff --git a/mm/Makefile b/mm/Makefile >> index 1a7a11d4933d..a714aba03759 100644 >> --- a/mm/Makefile >> +++ b/mm/Makefile >> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o >> ifdef CONFIG_SWAP >> obj-$(CONFIG_MEMCG) += swap_cgroup.o >> endif >> +ifdef CONFIG_BPF_SYSCALL >> +obj-y += bpf_oom.o >> +endif >> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o >> obj-$(CONFIG_GUP_TEST) += gup_test.o >> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o >> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c >> new file mode 100644 >> index 000000000000..47633046819c >> --- /dev/null >> +++ b/mm/bpf_oom.c >> @@ -0,0 +1,157 @@ >> +// SPDX-License-Identifier: GPL-2.0-or-later >> +/* >> + * BPF-driven OOM killer customization >> + * >> + * Author: Roman Gushchin <roman.gushchin@linux.dev> >> + */ >> + >> +#include <linux/bpf.h> >> +#include <linux/oom.h> >> +#include <linux/bpf_oom.h> >> +#include <linux/srcu.h> >> + >> +DEFINE_STATIC_SRCU(bpf_oom_srcu); >> +static DEFINE_SPINLOCK(bpf_oom_lock); >> +static LIST_HEAD(bpf_oom_handlers); >> + >> +struct bpf_oom { > > Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called > bpf_oom_ops->handler. I don't think it's a handler, it's more like a private part of bpf_oom_ops. Maybe bpf_oom_impl? Idk > > >> + struct bpf_oom_ops *ops; >> + struct list_head node; >> + struct srcu_struct srcu; >> +}; >> + >> +bool bpf_handle_oom(struct oom_control *oc) >> +{ >> + struct bpf_oom_ops *ops; >> + struct bpf_oom *bpf_oom; >> + int list_idx, idx, ret = 0; >> + >> + oc->bpf_memory_freed = false; >> + >> + list_idx = srcu_read_lock(&bpf_oom_srcu); >> + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { >> + ops = READ_ONCE(bpf_oom->ops); >> + if (!ops || !ops->handle_out_of_memory) >> + continue; >> + idx = srcu_read_lock(&bpf_oom->srcu); >> + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] : >> + "bpf_defined_policy"; >> + ret = ops->handle_out_of_memory(oc); >> + oc->bpf_policy_name = NULL; >> + srcu_read_unlock(&bpf_oom->srcu, idx); >> + >> + if (ret && oc->bpf_memory_freed) > > IIUC ret and oc->bpf_memory_freed seem to reflect the same state: > handler successfully freed some memory. Could you please clarify when > they differ? The idea here is to provide an additional safety measure: if the bpf program simple returns 1 without doing anything, the system won't deadlock. oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper (and potentially some other helpers in the future, e.g. bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf program directly.
On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Suren Baghdasaryan <surenb@google.com> writes: > > > On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin > > <roman.gushchin@linux.dev> wrote: > >> > >> Introduce a bpf struct ops for implementing custom OOM handling policies. > >> > >> The struct ops provides the bpf_handle_out_of_memory() callback, > >> which expected to return 1 if it was able to free some memory and 0 > >> otherwise. > >> > >> In the latter case it's guaranteed that the in-kernel OOM killer will > >> be invoked. Otherwise the kernel also checks the bpf_memory_freed > >> field of the oom_control structure, which is expected to be set by > >> kfuncs suitable for releasing memory. It's a safety mechanism which > >> prevents a bpf program to claim forward progress without actually > >> releasing memory. The callback program is sleepable to enable using > >> iterators, e.g. cgroup iterators. > >> > >> The callback receives struct oom_control as an argument, so it can > >> easily filter out OOM's it doesn't want to handle, e.g. global vs > >> memcg OOM's. > >> > >> The callback is executed just before the kernel victim task selection > >> algorithm, so all heuristics and sysctls like panic on oom, > >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task > >> are respected. > >> > >> The struct ops also has the name field, which allows to define a > >> custom name for the implemented policy. It's printed in the OOM report > >> in the oom_policy=<policy> format. "default" is printed if bpf is not > >> used or policy name is not specified. > >> > >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 > >> oom_policy=bpf_test_policy > >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) > >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 > >> [ 112.698167] Call Trace: > >> [ 112.698177] <TASK> > >> [ 112.698182] dump_stack_lvl+0x4d/0x70 > >> [ 112.698192] dump_header+0x59/0x1c6 > >> [ 112.698199] oom_kill_process.cold+0x8/0xef > >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 > >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 > >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf > >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 > >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 > >> [ 112.698250] out_of_memory+0xab/0x5c0 > >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 > >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 > >> [ 112.698288] charge_memcg+0x2f/0xc0 > >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 > >> [ 112.698299] do_anonymous_page+0x40f/0xa50 > >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 > >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 > >> [ 112.698335] handle_mm_fault+0xe6/0x370 > >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 > >> [ 112.698354] exc_page_fault+0x75/0x1d0 > >> [ 112.698363] asm_exc_page_fault+0x26/0x30 > >> [ 112.698366] RIP: 0033:0x7fa97236db00 > >> > >> It's possible to load multiple bpf struct programs. In the case of > >> oom, they will be executed one by one in the same order they been > >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 > >> - an indication that the memory was freed. This allows to have > >> multiple bpf programs to focus on different types of OOM's - e.g. > >> one program can only handle memcg OOM's in one memory cgroup. > >> But the filtering is done in bpf - so it's fully flexible. > >> > >> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> > >> --- > >> include/linux/bpf_oom.h | 49 +++++++++++++ > >> include/linux/oom.h | 8 ++ > >> mm/Makefile | 3 + > >> mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++ > >> mm/oom_kill.c | 22 +++++- > >> 5 files changed, 237 insertions(+), 2 deletions(-) > >> create mode 100644 include/linux/bpf_oom.h > >> create mode 100644 mm/bpf_oom.c > >> > >> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h > >> new file mode 100644 > >> index 000000000000..29cb5ea41d97 > >> --- /dev/null > >> +++ b/include/linux/bpf_oom.h > >> @@ -0,0 +1,49 @@ > >> +/* SPDX-License-Identifier: GPL-2.0+ */ > >> + > >> +#ifndef __BPF_OOM_H > >> +#define __BPF_OOM_H > >> + > >> +struct bpf_oom; > >> +struct oom_control; > >> + > >> +#define BPF_OOM_NAME_MAX_LEN 64 > >> + > >> +struct bpf_oom_ops { > >> + /** > >> + * @handle_out_of_memory: Out of memory bpf handler, called before > >> + * the in-kernel OOM killer. > >> + * @oc: OOM control structure > >> + * > >> + * Should return 1 if some memory was freed up, otherwise > >> + * the in-kernel OOM killer is invoked. > >> + */ > >> + int (*handle_out_of_memory)(struct oom_control *oc); > >> + > >> + /** > >> + * @name: BPF OOM policy name > >> + */ > >> + char name[BPF_OOM_NAME_MAX_LEN]; > > > > Why should the name be a part of ops structure? IMO it's not an > > attribute of the operations but rather of the oom handler which is > > represented by bpf_oom here. > > The ops structure describes a user-defined oom policy. Currently > it's just one handler and the policy name. Later additional handlers > can be added, e.g. a handler to control the dmesg output. > > bpf_oom is an implementation detail: it's basically an extension > to struct bpf_oom_ops which contains "private" fields required > for the internal machinery. Ok. I hope we can come up with some more descriptive naming but I can't think of something good ATM. > > > > >> + > >> + /* Private */ > >> + struct bpf_oom *bpf_oom; > >> +}; > >> + > >> +#ifdef CONFIG_BPF_SYSCALL > >> +/** > >> + * @bpf_handle_oom: handle out of memory using bpf programs > >> + * @oc: OOM control structure > >> + * > >> + * Returns true if a bpf oom program was executed, returned 1 > >> + * and some memory was actually freed. > > > > The above comment is unclear, please clarify. > > Fixed, thanks. > > /** > * @bpf_handle_oom: handle out of memory condition using bpf > * @oc: OOM control structure > * > * Returns true if some memory was freed. > */ > bool bpf_handle_oom(struct oom_control *oc); > > > > > >> + */ > >> +bool bpf_handle_oom(struct oom_control *oc); > >> + > >> +#else /* CONFIG_BPF_SYSCALL */ > >> +static inline bool bpf_handle_oom(struct oom_control *oc) > >> +{ > >> + return false; > >> +} > >> + > >> +#endif /* CONFIG_BPF_SYSCALL */ > >> + > >> +#endif /* __BPF_OOM_H */ > >> diff --git a/include/linux/oom.h b/include/linux/oom.h > >> index 1e0fc6931ce9..ef453309b7ea 100644 > >> --- a/include/linux/oom.h > >> +++ b/include/linux/oom.h > >> @@ -51,6 +51,14 @@ struct oom_control { > >> > >> /* Used to print the constraint info. */ > >> enum oom_constraint constraint; > >> + > >> +#ifdef CONFIG_BPF_SYSCALL > >> + /* Used by the bpf oom implementation to mark the forward progress */ > >> + bool bpf_memory_freed; > >> + > >> + /* Policy name */ > >> + const char *bpf_policy_name; > >> +#endif > >> }; > >> > >> extern struct mutex oom_lock; > >> diff --git a/mm/Makefile b/mm/Makefile > >> index 1a7a11d4933d..a714aba03759 100644 > >> --- a/mm/Makefile > >> +++ b/mm/Makefile > >> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > >> ifdef CONFIG_SWAP > >> obj-$(CONFIG_MEMCG) += swap_cgroup.o > >> endif > >> +ifdef CONFIG_BPF_SYSCALL > >> +obj-y += bpf_oom.o > >> +endif > >> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o > >> obj-$(CONFIG_GUP_TEST) += gup_test.o > >> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o > >> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c > >> new file mode 100644 > >> index 000000000000..47633046819c > >> --- /dev/null > >> +++ b/mm/bpf_oom.c > >> @@ -0,0 +1,157 @@ > >> +// SPDX-License-Identifier: GPL-2.0-or-later > >> +/* > >> + * BPF-driven OOM killer customization > >> + * > >> + * Author: Roman Gushchin <roman.gushchin@linux.dev> > >> + */ > >> + > >> +#include <linux/bpf.h> > >> +#include <linux/oom.h> > >> +#include <linux/bpf_oom.h> > >> +#include <linux/srcu.h> > >> + > >> +DEFINE_STATIC_SRCU(bpf_oom_srcu); > >> +static DEFINE_SPINLOCK(bpf_oom_lock); > >> +static LIST_HEAD(bpf_oom_handlers); > >> + > >> +struct bpf_oom { > > > > Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called > > bpf_oom_ops->handler. > > I don't think it's a handler, it's more like a private part > of bpf_oom_ops. Maybe bpf_oom_impl? Idk Yeah, we need to come up with some nomenclature and name these structs accordingly. In my mind ops means a structure that contains only operations, so current naming does not sit well but maybe that's just me... > > > > > > >> + struct bpf_oom_ops *ops; > >> + struct list_head node; > >> + struct srcu_struct srcu; > >> +}; > >> + > >> +bool bpf_handle_oom(struct oom_control *oc) > >> +{ > >> + struct bpf_oom_ops *ops; > >> + struct bpf_oom *bpf_oom; > >> + int list_idx, idx, ret = 0; > >> + > >> + oc->bpf_memory_freed = false; > >> + > >> + list_idx = srcu_read_lock(&bpf_oom_srcu); > >> + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { > >> + ops = READ_ONCE(bpf_oom->ops); > >> + if (!ops || !ops->handle_out_of_memory) > >> + continue; > >> + idx = srcu_read_lock(&bpf_oom->srcu); > >> + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] : > >> + "bpf_defined_policy"; > >> + ret = ops->handle_out_of_memory(oc); > >> + oc->bpf_policy_name = NULL; > >> + srcu_read_unlock(&bpf_oom->srcu, idx); > >> + > >> + if (ret && oc->bpf_memory_freed) > > > > IIUC ret and oc->bpf_memory_freed seem to reflect the same state: > > handler successfully freed some memory. Could you please clarify when > > they differ? > > The idea here is to provide an additional safety measure: > if the bpf program simple returns 1 without doing anything, > the system won't deadlock. > > oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper > (and potentially some other helpers in the future, e.g. > bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf > program directly. I see. Then maybe we use only oc->bpf_memory_freed and handle_out_of_memory() does not return anything?
On 8/20/25 12:34 PM, Suren Baghdasaryan wrote: > On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote: >> Suren Baghdasaryan <surenb@google.com> writes: >> >>> On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin >>> <roman.gushchin@linux.dev> wrote: >>>> Introduce a bpf struct ops for implementing custom OOM handling policies. >>>> >>>> The struct ops provides the bpf_handle_out_of_memory() callback, >>>> which expected to return 1 if it was able to free some memory and 0 >>>> otherwise. >>>> >>>> In the latter case it's guaranteed that the in-kernel OOM killer will >>>> be invoked. Otherwise the kernel also checks the bpf_memory_freed >>>> field of the oom_control structure, which is expected to be set by >>>> kfuncs suitable for releasing memory. It's a safety mechanism which >>>> prevents a bpf program to claim forward progress without actually >>>> releasing memory. The callback program is sleepable to enable using >>>> iterators, e.g. cgroup iterators. >>>> >>>> The callback receives struct oom_control as an argument, so it can >>>> easily filter out OOM's it doesn't want to handle, e.g. global vs >>>> memcg OOM's. >>>> >>>> The callback is executed just before the kernel victim task selection >>>> algorithm, so all heuristics and sysctls like panic on oom, >>>> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task >>>> are respected. >>>> >>>> The struct ops also has the name field, which allows to define a >>>> custom name for the implemented policy. It's printed in the OOM report >>>> in the oom_policy=<policy> format. "default" is printed if bpf is not >>>> used or policy name is not specified. >>>> >>>> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 >>>> oom_policy=bpf_test_policy >>>> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) >>>> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 >>>> [ 112.698167] Call Trace: >>>> [ 112.698177] <TASK> >>>> [ 112.698182] dump_stack_lvl+0x4d/0x70 >>>> [ 112.698192] dump_header+0x59/0x1c6 >>>> [ 112.698199] oom_kill_process.cold+0x8/0xef >>>> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 >>>> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 >>>> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf >>>> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 >>>> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 >>>> [ 112.698250] out_of_memory+0xab/0x5c0 >>>> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 >>>> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 >>>> [ 112.698288] charge_memcg+0x2f/0xc0 >>>> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 >>>> [ 112.698299] do_anonymous_page+0x40f/0xa50 >>>> [ 112.698311] __handle_mm_fault+0xbba/0x1140 >>>> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 >>>> [ 112.698335] handle_mm_fault+0xe6/0x370 >>>> [ 112.698343] do_user_addr_fault+0x211/0x6a0 >>>> [ 112.698354] exc_page_fault+0x75/0x1d0 >>>> [ 112.698363] asm_exc_page_fault+0x26/0x30 >>>> [ 112.698366] RIP: 0033:0x7fa97236db00 >>>> >>>> It's possible to load multiple bpf struct programs. In the case of >>>> oom, they will be executed one by one in the same order they been >>>> loaded until one of them returns 1 and bpf_memory_freed is set to 1 >>>> - an indication that the memory was freed. This allows to have >>>> multiple bpf programs to focus on different types of OOM's - e.g. >>>> one program can only handle memcg OOM's in one memory cgroup. >>>> But the filtering is done in bpf - so it's fully flexible. >>>> >>>> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> >>>> --- >>>> include/linux/bpf_oom.h | 49 +++++++++++++ >>>> include/linux/oom.h | 8 ++ >>>> mm/Makefile | 3 + >>>> mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++ >>>> mm/oom_kill.c | 22 +++++- >>>> 5 files changed, 237 insertions(+), 2 deletions(-) >>>> create mode 100644 include/linux/bpf_oom.h >>>> create mode 100644 mm/bpf_oom.c >>>> >>>> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h >>>> new file mode 100644 >>>> index 000000000000..29cb5ea41d97 >>>> --- /dev/null >>>> +++ b/include/linux/bpf_oom.h >>>> @@ -0,0 +1,49 @@ >>>> +/* SPDX-License-Identifier: GPL-2.0+ */ >>>> + >>>> +#ifndef __BPF_OOM_H >>>> +#define __BPF_OOM_H >>>> + >>>> +struct bpf_oom; >>>> +struct oom_control; >>>> + >>>> +#define BPF_OOM_NAME_MAX_LEN 64 >>>> + >>>> +struct bpf_oom_ops { >>>> + /** >>>> + * @handle_out_of_memory: Out of memory bpf handler, called before >>>> + * the in-kernel OOM killer. >>>> + * @oc: OOM control structure >>>> + * >>>> + * Should return 1 if some memory was freed up, otherwise >>>> + * the in-kernel OOM killer is invoked. >>>> + */ >>>> + int (*handle_out_of_memory)(struct oom_control *oc); >>>> + >>>> + /** >>>> + * @name: BPF OOM policy name >>>> + */ >>>> + char name[BPF_OOM_NAME_MAX_LEN]; >>> Why should the name be a part of ops structure? IMO it's not an >>> attribute of the operations but rather of the oom handler which is >>> represented by bpf_oom here. >> The ops structure describes a user-defined oom policy. Currently >> it's just one handler and the policy name. Later additional handlers >> can be added, e.g. a handler to control the dmesg output. >> >> bpf_oom is an implementation detail: it's basically an extension >> to struct bpf_oom_ops which contains "private" fields required >> for the internal machinery. > Ok. I hope we can come up with some more descriptive naming but I > can't think of something good ATM. > >>>> + >>>> + /* Private */ >>>> + struct bpf_oom *bpf_oom; >>>> +}; >>>> + >>>> +#ifdef CONFIG_BPF_SYSCALL >>>> +/** >>>> + * @bpf_handle_oom: handle out of memory using bpf programs >>>> + * @oc: OOM control structure >>>> + * >>>> + * Returns true if a bpf oom program was executed, returned 1 >>>> + * and some memory was actually freed. >>> The above comment is unclear, please clarify. >> Fixed, thanks. >> >> /** >> * @bpf_handle_oom: handle out of memory condition using bpf >> * @oc: OOM control structure >> * >> * Returns true if some memory was freed. >> */ >> bool bpf_handle_oom(struct oom_control *oc); >> >> >>>> + */ >>>> +bool bpf_handle_oom(struct oom_control *oc); >>>> + >>>> +#else /* CONFIG_BPF_SYSCALL */ >>>> +static inline bool bpf_handle_oom(struct oom_control *oc) >>>> +{ >>>> + return false; >>>> +} >>>> + >>>> +#endif /* CONFIG_BPF_SYSCALL */ >>>> + >>>> +#endif /* __BPF_OOM_H */ >>>> diff --git a/include/linux/oom.h b/include/linux/oom.h >>>> index 1e0fc6931ce9..ef453309b7ea 100644 >>>> --- a/include/linux/oom.h >>>> +++ b/include/linux/oom.h >>>> @@ -51,6 +51,14 @@ struct oom_control { >>>> >>>> /* Used to print the constraint info. */ >>>> enum oom_constraint constraint; >>>> + >>>> +#ifdef CONFIG_BPF_SYSCALL >>>> + /* Used by the bpf oom implementation to mark the forward progress */ >>>> + bool bpf_memory_freed; >>>> + >>>> + /* Policy name */ >>>> + const char *bpf_policy_name; >>>> +#endif >>>> }; >>>> >>>> extern struct mutex oom_lock; >>>> diff --git a/mm/Makefile b/mm/Makefile >>>> index 1a7a11d4933d..a714aba03759 100644 >>>> --- a/mm/Makefile >>>> +++ b/mm/Makefile >>>> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o >>>> ifdef CONFIG_SWAP >>>> obj-$(CONFIG_MEMCG) += swap_cgroup.o >>>> endif >>>> +ifdef CONFIG_BPF_SYSCALL >>>> +obj-y += bpf_oom.o >>>> +endif >>>> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o >>>> obj-$(CONFIG_GUP_TEST) += gup_test.o >>>> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o >>>> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c >>>> new file mode 100644 >>>> index 000000000000..47633046819c >>>> --- /dev/null >>>> +++ b/mm/bpf_oom.c >>>> @@ -0,0 +1,157 @@ >>>> +// SPDX-License-Identifier: GPL-2.0-or-later >>>> +/* >>>> + * BPF-driven OOM killer customization >>>> + * >>>> + * Author: Roman Gushchin <roman.gushchin@linux.dev> >>>> + */ >>>> + >>>> +#include <linux/bpf.h> >>>> +#include <linux/oom.h> >>>> +#include <linux/bpf_oom.h> >>>> +#include <linux/srcu.h> >>>> + >>>> +DEFINE_STATIC_SRCU(bpf_oom_srcu); >>>> +static DEFINE_SPINLOCK(bpf_oom_lock); >>>> +static LIST_HEAD(bpf_oom_handlers); >>>> + >>>> +struct bpf_oom { >>> Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called >>> bpf_oom_ops->handler. >> I don't think it's a handler, it's more like a private part >> of bpf_oom_ops. Maybe bpf_oom_impl? Idk > Yeah, we need to come up with some nomenclature and name these structs > accordingly. In my mind ops means a structure that contains only > operations, so current naming does not sit well but maybe that's just > me... Some existing xxx_ops also have non-operation members. E.g., tcp_congestion_ops, Qdisc_ops, vfio_device_ops, or tpm_class_ops. Maybe bpf_oom_ops is okay if that doesn't cause too much confusion? > >>> >>>> + struct bpf_oom_ops *ops; >>>> + struct list_head node; >>>> + struct srcu_struct srcu; >>>> +}; >>>> + >>>> +bool bpf_handle_oom(struct oom_control *oc) >>>> +{ >>>> + struct bpf_oom_ops *ops; >>>> + struct bpf_oom *bpf_oom; >>>> + int list_idx, idx, ret = 0; >>>> + >>>> + oc->bpf_memory_freed = false; >>>> + >>>> + list_idx = srcu_read_lock(&bpf_oom_srcu); >>>> + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { >>>> + ops = READ_ONCE(bpf_oom->ops); >>>> + if (!ops || !ops->handle_out_of_memory) >>>> + continue; >>>> + idx = srcu_read_lock(&bpf_oom->srcu); >>>> + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] : >>>> + "bpf_defined_policy"; >>>> + ret = ops->handle_out_of_memory(oc); >>>> + oc->bpf_policy_name = NULL; >>>> + srcu_read_unlock(&bpf_oom->srcu, idx); >>>> + >>>> + if (ret && oc->bpf_memory_freed) >>> IIUC ret and oc->bpf_memory_freed seem to reflect the same state: >>> handler successfully freed some memory. Could you please clarify when >>> they differ? >> The idea here is to provide an additional safety measure: >> if the bpf program simple returns 1 without doing anything, >> the system won't deadlock. >> >> oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper >> (and potentially some other helpers in the future, e.g. >> bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf >> program directly. > I see. Then maybe we use only oc->bpf_memory_freed and > handle_out_of_memory() does not return anything?
Suren Baghdasaryan <surenb@google.com> writes: > On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote: >> >> Suren Baghdasaryan <surenb@google.com> writes: >> >> > On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin >> > <roman.gushchin@linux.dev> wrote: >> >> >> >> Introduce a bpf struct ops for implementing custom OOM handling policies. >> >> >> >> The struct ops provides the bpf_handle_out_of_memory() callback, >> >> which expected to return 1 if it was able to free some memory and 0 >> >> otherwise. >> >> >> >> In the latter case it's guaranteed that the in-kernel OOM killer will >> >> be invoked. Otherwise the kernel also checks the bpf_memory_freed >> >> field of the oom_control structure, which is expected to be set by >> >> kfuncs suitable for releasing memory. It's a safety mechanism which >> >> prevents a bpf program to claim forward progress without actually >> >> releasing memory. The callback program is sleepable to enable using >> >> iterators, e.g. cgroup iterators. >> >> >> >> The callback receives struct oom_control as an argument, so it can >> >> easily filter out OOM's it doesn't want to handle, e.g. global vs >> >> memcg OOM's. >> >> >> >> The callback is executed just before the kernel victim task selection >> >> algorithm, so all heuristics and sysctls like panic on oom, >> >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task >> >> are respected. >> >> >> >> The struct ops also has the name field, which allows to define a >> >> custom name for the implemented policy. It's printed in the OOM report >> >> in the oom_policy=<policy> format. "default" is printed if bpf is not >> >> used or policy name is not specified. >> >> >> >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 >> >> oom_policy=bpf_test_policy >> >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) >> >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 >> >> [ 112.698167] Call Trace: >> >> [ 112.698177] <TASK> >> >> [ 112.698182] dump_stack_lvl+0x4d/0x70 >> >> [ 112.698192] dump_header+0x59/0x1c6 >> >> [ 112.698199] oom_kill_process.cold+0x8/0xef >> >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 >> >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 >> >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf >> >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 >> >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 >> >> [ 112.698250] out_of_memory+0xab/0x5c0 >> >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 >> >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 >> >> [ 112.698288] charge_memcg+0x2f/0xc0 >> >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 >> >> [ 112.698299] do_anonymous_page+0x40f/0xa50 >> >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 >> >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 >> >> [ 112.698335] handle_mm_fault+0xe6/0x370 >> >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 >> >> [ 112.698354] exc_page_fault+0x75/0x1d0 >> >> [ 112.698363] asm_exc_page_fault+0x26/0x30 >> >> [ 112.698366] RIP: 0033:0x7fa97236db00 >> >> >> >> It's possible to load multiple bpf struct programs. In the case of >> >> oom, they will be executed one by one in the same order they been >> >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 >> >> - an indication that the memory was freed. This allows to have >> >> multiple bpf programs to focus on different types of OOM's - e.g. >> >> one program can only handle memcg OOM's in one memory cgroup. >> >> But the filtering is done in bpf - so it's fully flexible. >> >> >> >> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> >> >> --- >> >> include/linux/bpf_oom.h | 49 +++++++++++++ >> >> include/linux/oom.h | 8 ++ >> >> mm/Makefile | 3 + >> >> mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++ >> >> mm/oom_kill.c | 22 +++++- >> >> 5 files changed, 237 insertions(+), 2 deletions(-) >> >> create mode 100644 include/linux/bpf_oom.h >> >> create mode 100644 mm/bpf_oom.c >> >> >> >> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h >> >> new file mode 100644 >> >> index 000000000000..29cb5ea41d97 >> >> --- /dev/null >> >> +++ b/include/linux/bpf_oom.h >> >> @@ -0,0 +1,49 @@ >> >> +/* SPDX-License-Identifier: GPL-2.0+ */ >> >> + >> >> +#ifndef __BPF_OOM_H >> >> +#define __BPF_OOM_H >> >> + >> >> +struct bpf_oom; >> >> +struct oom_control; >> >> + >> >> +#define BPF_OOM_NAME_MAX_LEN 64 >> >> + >> >> +struct bpf_oom_ops { >> >> + /** >> >> + * @handle_out_of_memory: Out of memory bpf handler, called before >> >> + * the in-kernel OOM killer. >> >> + * @oc: OOM control structure >> >> + * >> >> + * Should return 1 if some memory was freed up, otherwise >> >> + * the in-kernel OOM killer is invoked. >> >> + */ >> >> + int (*handle_out_of_memory)(struct oom_control *oc); >> >> + >> >> + /** >> >> + * @name: BPF OOM policy name >> >> + */ >> >> + char name[BPF_OOM_NAME_MAX_LEN]; >> > >> > Why should the name be a part of ops structure? IMO it's not an >> > attribute of the operations but rather of the oom handler which is >> > represented by bpf_oom here. >> >> The ops structure describes a user-defined oom policy. Currently >> it's just one handler and the policy name. Later additional handlers >> can be added, e.g. a handler to control the dmesg output. >> >> bpf_oom is an implementation detail: it's basically an extension >> to struct bpf_oom_ops which contains "private" fields required >> for the internal machinery. > > Ok. I hope we can come up with some more descriptive naming but I > can't think of something good ATM. > >> >> > >> >> + >> >> + /* Private */ >> >> + struct bpf_oom *bpf_oom; >> >> +}; >> >> + >> >> +#ifdef CONFIG_BPF_SYSCALL >> >> +/** >> >> + * @bpf_handle_oom: handle out of memory using bpf programs >> >> + * @oc: OOM control structure >> >> + * >> >> + * Returns true if a bpf oom program was executed, returned 1 >> >> + * and some memory was actually freed. >> > >> > The above comment is unclear, please clarify. >> >> Fixed, thanks. >> >> /** >> * @bpf_handle_oom: handle out of memory condition using bpf >> * @oc: OOM control structure >> * >> * Returns true if some memory was freed. >> */ >> bool bpf_handle_oom(struct oom_control *oc); >> >> >> > >> >> + */ >> >> +bool bpf_handle_oom(struct oom_control *oc); >> >> + >> >> +#else /* CONFIG_BPF_SYSCALL */ >> >> +static inline bool bpf_handle_oom(struct oom_control *oc) >> >> +{ >> >> + return false; >> >> +} >> >> + >> >> +#endif /* CONFIG_BPF_SYSCALL */ >> >> + >> >> +#endif /* __BPF_OOM_H */ >> >> diff --git a/include/linux/oom.h b/include/linux/oom.h >> >> index 1e0fc6931ce9..ef453309b7ea 100644 >> >> --- a/include/linux/oom.h >> >> +++ b/include/linux/oom.h >> >> @@ -51,6 +51,14 @@ struct oom_control { >> >> >> >> /* Used to print the constraint info. */ >> >> enum oom_constraint constraint; >> >> + >> >> +#ifdef CONFIG_BPF_SYSCALL >> >> + /* Used by the bpf oom implementation to mark the forward progress */ >> >> + bool bpf_memory_freed; >> >> + >> >> + /* Policy name */ >> >> + const char *bpf_policy_name; >> >> +#endif >> >> }; >> >> >> >> extern struct mutex oom_lock; >> >> diff --git a/mm/Makefile b/mm/Makefile >> >> index 1a7a11d4933d..a714aba03759 100644 >> >> --- a/mm/Makefile >> >> +++ b/mm/Makefile >> >> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o >> >> ifdef CONFIG_SWAP >> >> obj-$(CONFIG_MEMCG) += swap_cgroup.o >> >> endif >> >> +ifdef CONFIG_BPF_SYSCALL >> >> +obj-y += bpf_oom.o >> >> +endif >> >> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o >> >> obj-$(CONFIG_GUP_TEST) += gup_test.o >> >> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o >> >> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c >> >> new file mode 100644 >> >> index 000000000000..47633046819c >> >> --- /dev/null >> >> +++ b/mm/bpf_oom.c >> >> @@ -0,0 +1,157 @@ >> >> +// SPDX-License-Identifier: GPL-2.0-or-later >> >> +/* >> >> + * BPF-driven OOM killer customization >> >> + * >> >> + * Author: Roman Gushchin <roman.gushchin@linux.dev> >> >> + */ >> >> + >> >> +#include <linux/bpf.h> >> >> +#include <linux/oom.h> >> >> +#include <linux/bpf_oom.h> >> >> +#include <linux/srcu.h> >> >> + >> >> +DEFINE_STATIC_SRCU(bpf_oom_srcu); >> >> +static DEFINE_SPINLOCK(bpf_oom_lock); >> >> +static LIST_HEAD(bpf_oom_handlers); >> >> + >> >> +struct bpf_oom { >> > >> > Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called >> > bpf_oom_ops->handler. >> >> I don't think it's a handler, it's more like a private part >> of bpf_oom_ops. Maybe bpf_oom_impl? Idk > > Yeah, we need to come up with some nomenclature and name these structs > accordingly. In my mind ops means a structure that contains only > operations, so current naming does not sit well but maybe that's just > me... > >> >> > >> > >> >> + struct bpf_oom_ops *ops; >> >> + struct list_head node; >> >> + struct srcu_struct srcu; >> >> +}; >> >> + >> >> +bool bpf_handle_oom(struct oom_control *oc) >> >> +{ >> >> + struct bpf_oom_ops *ops; >> >> + struct bpf_oom *bpf_oom; >> >> + int list_idx, idx, ret = 0; >> >> + >> >> + oc->bpf_memory_freed = false; >> >> + >> >> + list_idx = srcu_read_lock(&bpf_oom_srcu); >> >> + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { >> >> + ops = READ_ONCE(bpf_oom->ops); >> >> + if (!ops || !ops->handle_out_of_memory) >> >> + continue; >> >> + idx = srcu_read_lock(&bpf_oom->srcu); >> >> + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] : >> >> + "bpf_defined_policy"; >> >> + ret = ops->handle_out_of_memory(oc); >> >> + oc->bpf_policy_name = NULL; >> >> + srcu_read_unlock(&bpf_oom->srcu, idx); >> >> + >> >> + if (ret && oc->bpf_memory_freed) >> > >> > IIUC ret and oc->bpf_memory_freed seem to reflect the same state: >> > handler successfully freed some memory. Could you please clarify when >> > they differ? >> >> The idea here is to provide an additional safety measure: >> if the bpf program simple returns 1 without doing anything, >> the system won't deadlock. >> >> oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper >> (and potentially some other helpers in the future, e.g. >> bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf >> program directly. > > I see. Then maybe we use only oc->bpf_memory_freed and > handle_out_of_memory() does not return anything? Idk, I think it's neat to have an ability to pass to the in-kernel OOM killer even after killing a task. Also, I believe, bpf programs have to return an int anyway, so we can ignore it, but I don't necessary see the point.
On Wed, Aug 20, 2025 at 12:53 PM Roman Gushchin <roman.gushchin@linux.dev> wrote: > > Suren Baghdasaryan <surenb@google.com> writes: > > > On Tue, Aug 19, 2025 at 1:06 PM Roman Gushchin <roman.gushchin@linux.dev> wrote: > >> > >> Suren Baghdasaryan <surenb@google.com> writes: > >> > >> > On Mon, Aug 18, 2025 at 10:01 AM Roman Gushchin > >> > <roman.gushchin@linux.dev> wrote: > >> >> > >> >> Introduce a bpf struct ops for implementing custom OOM handling policies. > >> >> > >> >> The struct ops provides the bpf_handle_out_of_memory() callback, > >> >> which expected to return 1 if it was able to free some memory and 0 > >> >> otherwise. > >> >> > >> >> In the latter case it's guaranteed that the in-kernel OOM killer will > >> >> be invoked. Otherwise the kernel also checks the bpf_memory_freed > >> >> field of the oom_control structure, which is expected to be set by > >> >> kfuncs suitable for releasing memory. It's a safety mechanism which > >> >> prevents a bpf program to claim forward progress without actually > >> >> releasing memory. The callback program is sleepable to enable using > >> >> iterators, e.g. cgroup iterators. > >> >> > >> >> The callback receives struct oom_control as an argument, so it can > >> >> easily filter out OOM's it doesn't want to handle, e.g. global vs > >> >> memcg OOM's. > >> >> > >> >> The callback is executed just before the kernel victim task selection > >> >> algorithm, so all heuristics and sysctls like panic on oom, > >> >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task > >> >> are respected. > >> >> > >> >> The struct ops also has the name field, which allows to define a > >> >> custom name for the implemented policy. It's printed in the OOM report > >> >> in the oom_policy=<policy> format. "default" is printed if bpf is not > >> >> used or policy name is not specified. > >> >> > >> >> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 > >> >> oom_policy=bpf_test_policy > >> >> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) > >> >> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 > >> >> [ 112.698167] Call Trace: > >> >> [ 112.698177] <TASK> > >> >> [ 112.698182] dump_stack_lvl+0x4d/0x70 > >> >> [ 112.698192] dump_header+0x59/0x1c6 > >> >> [ 112.698199] oom_kill_process.cold+0x8/0xef > >> >> [ 112.698206] bpf_oom_kill_process+0x59/0xb0 > >> >> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313 > >> >> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf > >> >> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5 > >> >> [ 112.698240] bpf_handle_oom+0x11a/0x1e0 > >> >> [ 112.698250] out_of_memory+0xab/0x5c0 > >> >> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110 > >> >> [ 112.698274] try_charge_memcg+0x4b5/0x7e0 > >> >> [ 112.698288] charge_memcg+0x2f/0xc0 > >> >> [ 112.698293] __mem_cgroup_charge+0x30/0xc0 > >> >> [ 112.698299] do_anonymous_page+0x40f/0xa50 > >> >> [ 112.698311] __handle_mm_fault+0xbba/0x1140 > >> >> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5 > >> >> [ 112.698335] handle_mm_fault+0xe6/0x370 > >> >> [ 112.698343] do_user_addr_fault+0x211/0x6a0 > >> >> [ 112.698354] exc_page_fault+0x75/0x1d0 > >> >> [ 112.698363] asm_exc_page_fault+0x26/0x30 > >> >> [ 112.698366] RIP: 0033:0x7fa97236db00 > >> >> > >> >> It's possible to load multiple bpf struct programs. In the case of > >> >> oom, they will be executed one by one in the same order they been > >> >> loaded until one of them returns 1 and bpf_memory_freed is set to 1 > >> >> - an indication that the memory was freed. This allows to have > >> >> multiple bpf programs to focus on different types of OOM's - e.g. > >> >> one program can only handle memcg OOM's in one memory cgroup. > >> >> But the filtering is done in bpf - so it's fully flexible. > >> >> > >> >> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> > >> >> --- > >> >> include/linux/bpf_oom.h | 49 +++++++++++++ > >> >> include/linux/oom.h | 8 ++ > >> >> mm/Makefile | 3 + > >> >> mm/bpf_oom.c | 157 ++++++++++++++++++++++++++++++++++++++++ > >> >> mm/oom_kill.c | 22 +++++- > >> >> 5 files changed, 237 insertions(+), 2 deletions(-) > >> >> create mode 100644 include/linux/bpf_oom.h > >> >> create mode 100644 mm/bpf_oom.c > >> >> > >> >> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h > >> >> new file mode 100644 > >> >> index 000000000000..29cb5ea41d97 > >> >> --- /dev/null > >> >> +++ b/include/linux/bpf_oom.h > >> >> @@ -0,0 +1,49 @@ > >> >> +/* SPDX-License-Identifier: GPL-2.0+ */ > >> >> + > >> >> +#ifndef __BPF_OOM_H > >> >> +#define __BPF_OOM_H > >> >> + > >> >> +struct bpf_oom; > >> >> +struct oom_control; > >> >> + > >> >> +#define BPF_OOM_NAME_MAX_LEN 64 > >> >> + > >> >> +struct bpf_oom_ops { > >> >> + /** > >> >> + * @handle_out_of_memory: Out of memory bpf handler, called before > >> >> + * the in-kernel OOM killer. > >> >> + * @oc: OOM control structure > >> >> + * > >> >> + * Should return 1 if some memory was freed up, otherwise > >> >> + * the in-kernel OOM killer is invoked. > >> >> + */ > >> >> + int (*handle_out_of_memory)(struct oom_control *oc); > >> >> + > >> >> + /** > >> >> + * @name: BPF OOM policy name > >> >> + */ > >> >> + char name[BPF_OOM_NAME_MAX_LEN]; > >> > > >> > Why should the name be a part of ops structure? IMO it's not an > >> > attribute of the operations but rather of the oom handler which is > >> > represented by bpf_oom here. > >> > >> The ops structure describes a user-defined oom policy. Currently > >> it's just one handler and the policy name. Later additional handlers > >> can be added, e.g. a handler to control the dmesg output. > >> > >> bpf_oom is an implementation detail: it's basically an extension > >> to struct bpf_oom_ops which contains "private" fields required > >> for the internal machinery. > > > > Ok. I hope we can come up with some more descriptive naming but I > > can't think of something good ATM. > > > >> > >> > > >> >> + > >> >> + /* Private */ > >> >> + struct bpf_oom *bpf_oom; > >> >> +}; > >> >> + > >> >> +#ifdef CONFIG_BPF_SYSCALL > >> >> +/** > >> >> + * @bpf_handle_oom: handle out of memory using bpf programs > >> >> + * @oc: OOM control structure > >> >> + * > >> >> + * Returns true if a bpf oom program was executed, returned 1 > >> >> + * and some memory was actually freed. > >> > > >> > The above comment is unclear, please clarify. > >> > >> Fixed, thanks. > >> > >> /** > >> * @bpf_handle_oom: handle out of memory condition using bpf > >> * @oc: OOM control structure > >> * > >> * Returns true if some memory was freed. > >> */ > >> bool bpf_handle_oom(struct oom_control *oc); > >> > >> > >> > > >> >> + */ > >> >> +bool bpf_handle_oom(struct oom_control *oc); > >> >> + > >> >> +#else /* CONFIG_BPF_SYSCALL */ > >> >> +static inline bool bpf_handle_oom(struct oom_control *oc) > >> >> +{ > >> >> + return false; > >> >> +} > >> >> + > >> >> +#endif /* CONFIG_BPF_SYSCALL */ > >> >> + > >> >> +#endif /* __BPF_OOM_H */ > >> >> diff --git a/include/linux/oom.h b/include/linux/oom.h > >> >> index 1e0fc6931ce9..ef453309b7ea 100644 > >> >> --- a/include/linux/oom.h > >> >> +++ b/include/linux/oom.h > >> >> @@ -51,6 +51,14 @@ struct oom_control { > >> >> > >> >> /* Used to print the constraint info. */ > >> >> enum oom_constraint constraint; > >> >> + > >> >> +#ifdef CONFIG_BPF_SYSCALL > >> >> + /* Used by the bpf oom implementation to mark the forward progress */ > >> >> + bool bpf_memory_freed; > >> >> + > >> >> + /* Policy name */ > >> >> + const char *bpf_policy_name; > >> >> +#endif > >> >> }; > >> >> > >> >> extern struct mutex oom_lock; > >> >> diff --git a/mm/Makefile b/mm/Makefile > >> >> index 1a7a11d4933d..a714aba03759 100644 > >> >> --- a/mm/Makefile > >> >> +++ b/mm/Makefile > >> >> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > >> >> ifdef CONFIG_SWAP > >> >> obj-$(CONFIG_MEMCG) += swap_cgroup.o > >> >> endif > >> >> +ifdef CONFIG_BPF_SYSCALL > >> >> +obj-y += bpf_oom.o > >> >> +endif > >> >> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o > >> >> obj-$(CONFIG_GUP_TEST) += gup_test.o > >> >> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o > >> >> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c > >> >> new file mode 100644 > >> >> index 000000000000..47633046819c > >> >> --- /dev/null > >> >> +++ b/mm/bpf_oom.c > >> >> @@ -0,0 +1,157 @@ > >> >> +// SPDX-License-Identifier: GPL-2.0-or-later > >> >> +/* > >> >> + * BPF-driven OOM killer customization > >> >> + * > >> >> + * Author: Roman Gushchin <roman.gushchin@linux.dev> > >> >> + */ > >> >> + > >> >> +#include <linux/bpf.h> > >> >> +#include <linux/oom.h> > >> >> +#include <linux/bpf_oom.h> > >> >> +#include <linux/srcu.h> > >> >> + > >> >> +DEFINE_STATIC_SRCU(bpf_oom_srcu); > >> >> +static DEFINE_SPINLOCK(bpf_oom_lock); > >> >> +static LIST_HEAD(bpf_oom_handlers); > >> >> + > >> >> +struct bpf_oom { > >> > > >> > Perhaps bpf_oom_handler ? Then bpf_oom_ops->bpf_oom could be called > >> > bpf_oom_ops->handler. > >> > >> I don't think it's a handler, it's more like a private part > >> of bpf_oom_ops. Maybe bpf_oom_impl? Idk > > > > Yeah, we need to come up with some nomenclature and name these structs > > accordingly. In my mind ops means a structure that contains only > > operations, so current naming does not sit well but maybe that's just > > me... > > > >> > >> > > >> > > >> >> + struct bpf_oom_ops *ops; > >> >> + struct list_head node; > >> >> + struct srcu_struct srcu; > >> >> +}; > >> >> + > >> >> +bool bpf_handle_oom(struct oom_control *oc) > >> >> +{ > >> >> + struct bpf_oom_ops *ops; > >> >> + struct bpf_oom *bpf_oom; > >> >> + int list_idx, idx, ret = 0; > >> >> + > >> >> + oc->bpf_memory_freed = false; > >> >> + > >> >> + list_idx = srcu_read_lock(&bpf_oom_srcu); > >> >> + list_for_each_entry_srcu(bpf_oom, &bpf_oom_handlers, node, false) { > >> >> + ops = READ_ONCE(bpf_oom->ops); > >> >> + if (!ops || !ops->handle_out_of_memory) > >> >> + continue; > >> >> + idx = srcu_read_lock(&bpf_oom->srcu); > >> >> + oc->bpf_policy_name = ops->name[0] ? &ops->name[0] : > >> >> + "bpf_defined_policy"; > >> >> + ret = ops->handle_out_of_memory(oc); > >> >> + oc->bpf_policy_name = NULL; > >> >> + srcu_read_unlock(&bpf_oom->srcu, idx); > >> >> + > >> >> + if (ret && oc->bpf_memory_freed) > >> > > >> > IIUC ret and oc->bpf_memory_freed seem to reflect the same state: > >> > handler successfully freed some memory. Could you please clarify when > >> > they differ? > >> > >> The idea here is to provide an additional safety measure: > >> if the bpf program simple returns 1 without doing anything, > >> the system won't deadlock. > >> > >> oc->bpf_memory_freed is set by the bpf_oom_kill_process() helper > >> (and potentially some other helpers in the future, e.g. > >> bpf_oom_rm_tmpfs_file()) and can't be modified by the bpf > >> program directly. > > > > I see. Then maybe we use only oc->bpf_memory_freed and > > handle_out_of_memory() does not return anything? > > Idk, I think it's neat to have an ability to pass to the in-kernel > OOM killer even after killing a task. > Also, I believe, bpf programs have to return an int anyway, > so we can ignore it, but I don't necessary see the point. Ok, so you want this parameter for the bpf oom-handler to say "even if something got killed, proceed with the next oom-handler anyway?". I can't think of a case when someone would do that... In any case, the use for this return value should be clearly documented. >
© 2016 - 2025 Red Hat, Inc.