Introduce a bpf struct ops for implementing custom OOM handling
policies.
It's possible to load one bpf_oom_ops for the system and one
bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
cgroup tree is traversed from the OOM'ing memcg up to the root and
corresponding BPF OOM handlers are executed until some memory is
freed. If no memory is freed, the kernel OOM killer is invoked.
The struct ops provides the bpf_handle_out_of_memory() callback,
which is expected to return 1 if it was able to free some memory and 0
otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed
field of the oom_control structure, which is expected to be set by
kfuncs suitable for releasing memory. If both are set, the OOM is
considered handled; otherwise the next OOM handler in the chain
(e.g. a BPF OOM handler attached to the parent cgroup or the in-kernel
OOM killer) is executed.
The bpf_handle_out_of_memory() callback program is sleepable to enable
the use of iterators, e.g. cgroup iterators. The callback receives struct
oom_control as an argument, so it can determine the scope of the OOM
event: whether it is a memcg-wide or system-wide OOM.
The callback is executed just before the kernel victim task selection
algorithm, so all heuristics and sysctls like panic on oom,
sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
are respected.
The BPF OOM struct ops provides the handle_cgroup_offline() callback,
which is useful for releasing the attached struct ops when the
corresponding cgroup is deleted.
The struct ops also has the name field, which allows defining a
custom name for the implemented policy. It's printed in the OOM report
in the oom_policy=<policy> format. "default" is printed if bpf is not
used or the policy name is not specified.
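For illustration, a minimal BPF-side implementation of the struct ops
could look like the sketch below (the kfuncs actually releasing memory,
e.g. bpf_oom_kill_process(), are provided by later patches in the
series, so this sketch simply falls back to the in-kernel OOM killer):

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("struct_ops.s/handle_out_of_memory")
int BPF_PROG(test_out_of_memory, struct bpf_oom_ctx *ctx, struct oom_control *oc)
{
	/*
	 * A real policy would pick a victim here and release memory via
	 * a kfunc such as bpf_oom_kill_process(), which also sets
	 * oc->bpf_memory_freed. Returning 1 without that flag being set
	 * is not treated as a handled OOM.
	 */
	return 0;	/* nothing freed, use the next handler in the chain */
}

SEC("struct_ops.s/handle_cgroup_offline")
void BPF_PROG(test_cgroup_offline, struct bpf_oom_ctx *ctx, u64 cgroup_id)
{
	/* release any per-cgroup state kept by the policy */
}

SEC(".struct_ops.link")
struct bpf_oom_ops test_bpf_oom = {
	.handle_out_of_memory	= (void *)test_out_of_memory,
	.handle_cgroup_offline	= (void *)test_cgroup_offline,
	.name			= "bpf_test_policy",
};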
[ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
oom_policy=bpf_test_policy
[ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full)
[ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
[ 112.698167] Call Trace:
[ 112.698177] <TASK>
[ 112.698182] dump_stack_lvl+0x4d/0x70
[ 112.698192] dump_header+0x59/0x1c6
[ 112.698199] oom_kill_process.cold+0x8/0xef
[ 112.698206] bpf_oom_kill_process+0x59/0xb0
[ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313
[ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf
[ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5
[ 112.698240] bpf_handle_oom+0x11a/0x1e0
[ 112.698250] out_of_memory+0xab/0x5c0
[ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110
[ 112.698274] try_charge_memcg+0x4b5/0x7e0
[ 112.698288] charge_memcg+0x2f/0xc0
[ 112.698293] __mem_cgroup_charge+0x30/0xc0
[ 112.698299] do_anonymous_page+0x40f/0xa50
[ 112.698311] __handle_mm_fault+0xbba/0x1140
[ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5
[ 112.698335] handle_mm_fault+0xe6/0x370
[ 112.698343] do_user_addr_fault+0x211/0x6a0
[ 112.698354] exc_page_fault+0x75/0x1d0
[ 112.698363] asm_exc_page_fault+0x26/0x30
[ 112.698366] RIP: 0033:0x7fa97236db00
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
include/linux/bpf_oom.h | 74 ++++++++++
include/linux/memcontrol.h | 5 +
include/linux/oom.h | 8 ++
mm/Makefile | 3 +
mm/bpf_oom.c | 272 +++++++++++++++++++++++++++++++++++++
mm/memcontrol.c | 2 +
mm/oom_kill.c | 22 ++-
7 files changed, 384 insertions(+), 2 deletions(-)
create mode 100644 include/linux/bpf_oom.h
create mode 100644 mm/bpf_oom.c
diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 000000000000..18c32a5a068b
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_OOM_H
+#define __BPF_OOM_H
+
+struct oom_control;
+
+#define BPF_OOM_NAME_MAX_LEN 64
+
+struct bpf_oom_ctx {
+ /*
+ * If bpf_oom_ops is attached to a cgroup, id of this cgroup.
+ * 0 otherwise.
+ */
+ u64 cgroup_id;
+};
+
+struct bpf_oom_ops {
+ /**
+ * @handle_out_of_memory: Out of memory bpf handler, called before
+ * the in-kernel OOM killer.
+ * @ctx: Execution context
+ * @oc: OOM control structure
+ *
+ * Should return 1 if some memory was freed up, otherwise
+ * the in-kernel OOM killer is invoked.
+ */
+ int (*handle_out_of_memory)(struct bpf_oom_ctx *ctx, struct oom_control *oc);
+
+ /**
+ * @handle_cgroup_offline: Cgroup offline callback
+ * @ctx: Execution context
+ * @cgroup_id: Id of deleted cgroup
+ *
+ * Called if the cgroup with the attached bpf_oom_ops is deleted.
+ */
+ void (*handle_cgroup_offline)(struct bpf_oom_ctx *ctx, u64 cgroup_id);
+
+ /**
+ * @name: BPF OOM policy name
+ */
+ char name[BPF_OOM_NAME_MAX_LEN];
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+/**
+ * @bpf_handle_oom: handle out of memory condition using bpf
+ * @oc: OOM control structure
+ *
+ * Returns true if some memory was freed.
+ */
+bool bpf_handle_oom(struct oom_control *oc);
+
+
+/**
+ * @bpf_oom_memcg_offline: handle memcg offlining
+ * @memcg: Memory cgroup is offlined
+ *
+ * When a memory cgroup is about to be deleted and there is an
+ * attached BPF OOM structure, it has to be detached.
+ */
+void bpf_oom_memcg_offline(struct mem_cgroup *memcg);
+
+#else /* CONFIG_BPF_SYSCALL */
+static inline bool bpf_handle_oom(struct oom_control *oc)
+{
+ return false;
+}
+
+static inline void bpf_oom_memcg_offline(struct mem_cgroup *memcg) {}
+
+#endif /* CONFIG_BPF_SYSCALL */
+
+#endif /* __BPF_OOM_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 50d851ff3f27..39a6c7c8735b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
+struct bpf_oom_ops;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -226,6 +227,10 @@ struct mem_cgroup {
*/
bool oom_group;
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_oom_ops *bpf_oom;
+#endif
+
int swappiness;
/* memory.events and memory.events.local */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7b02bc1d0a7e..721087952d04 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -51,6 +51,14 @@ struct oom_control {
/* Used to print the constraint info. */
enum oom_constraint constraint;
+
+#ifdef CONFIG_BPF_SYSCALL
+ /* Used by the bpf oom implementation to mark the forward progress */
+ bool bpf_memory_freed;
+
+ /* Policy name */
+ const char *bpf_policy_name;
+#endif
};
extern struct mutex oom_lock;
diff --git a/mm/Makefile b/mm/Makefile
index 21abb3353550..051e88c699af 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
+ifdef CONFIG_BPF_SYSCALL
+obj-y += bpf_oom.o
+endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
new file mode 100644
index 000000000000..c4d09ed9d541
--- /dev/null
+++ b/mm/bpf_oom.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BPF-driven OOM killer customization
+ *
+ * Author: Roman Gushchin <roman.gushchin@linux.dev>
+ */
+
+#include <linux/bpf.h>
+#include <linux/oom.h>
+#include <linux/bpf_oom.h>
+#include <linux/srcu.h>
+#include <linux/cgroup.h>
+#include <linux/memcontrol.h>
+
+DEFINE_STATIC_SRCU(bpf_oom_srcu);
+static struct bpf_oom_ops *system_bpf_oom;
+
+#ifdef CONFIG_MEMCG
+static u64 memcg_cgroup_id(struct mem_cgroup *memcg)
+{
+ return cgroup_id(memcg->css.cgroup);
+}
+
+static struct bpf_oom_ops **bpf_oom_memcg_ops_ptr(struct mem_cgroup *memcg)
+{
+ return &memcg->bpf_oom;
+}
+#else /* CONFIG_MEMCG */
+static u64 memcg_cgroup_id(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static struct bpf_oom_ops **bpf_oom_memcg_ops_ptr(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
+#endif
+
+static int bpf_ops_handle_oom(struct bpf_oom_ops *bpf_oom_ops,
+ struct mem_cgroup *memcg,
+ struct oom_control *oc)
+{
+ struct bpf_oom_ctx exec_ctx;
+ int ret;
+
+ if (IS_ENABLED(CONFIG_MEMCG) && memcg)
+ exec_ctx.cgroup_id = memcg_cgroup_id(memcg);
+ else
+ exec_ctx.cgroup_id = 0;
+
+ oc->bpf_policy_name = &bpf_oom_ops->name[0];
+ oc->bpf_memory_freed = false;
+ ret = bpf_oom_ops->handle_out_of_memory(&exec_ctx, oc);
+ oc->bpf_policy_name = NULL;
+
+ return ret;
+}
+
+bool bpf_handle_oom(struct oom_control *oc)
+{
+ struct bpf_oom_ops *bpf_oom_ops = NULL;
+ struct mem_cgroup __maybe_unused *memcg;
+ int idx, ret = 0;
+
+ /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
+ idx = srcu_read_lock(&bpf_oom_srcu);
+
+#ifdef CONFIG_MEMCG
+ /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
+ for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
+ bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
+ if (!bpf_oom_ops)
+ continue;
+
+ /* Call BPF OOM handler */
+ ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc);
+ if (ret && oc->bpf_memory_freed)
+ goto exit;
+ }
+#endif /* CONFIG_MEMCG */
+
+ /*
+ * System-wide OOM or per-memcg BPF OOM handler wasn't successful?
+ * Try system_bpf_oom.
+ */
+ bpf_oom_ops = READ_ONCE(system_bpf_oom);
+ if (!bpf_oom_ops)
+ goto exit;
+
+ /* Call BPF OOM handler */
+ ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc);
+exit:
+ srcu_read_unlock(&bpf_oom_srcu, idx);
+ return ret && oc->bpf_memory_freed;
+}
+
+static int __handle_out_of_memory(struct bpf_oom_ctx *exec_ctx,
+ struct oom_control *oc)
+{
+ return 0;
+}
+
+static void __handle_cgroup_offline(struct bpf_oom_ctx *exec_ctx, u64 cgroup_id)
+{
+}
+
+static struct bpf_oom_ops __bpf_oom_ops = {
+ .handle_out_of_memory = __handle_out_of_memory,
+ .handle_cgroup_offline = __handle_cgroup_offline,
+};
+
+static const struct bpf_func_proto *
+bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return tracing_prog_func_proto(func_id, prog);
+}
+
+static bool bpf_oom_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_verifier_ops bpf_oom_verifier_ops = {
+ .get_func_proto = bpf_oom_func_proto,
+ .is_valid_access = bpf_oom_ops_is_valid_access,
+};
+
+static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
+ struct bpf_oom_ops *bpf_oom_ops = kdata;
+ struct mem_cgroup *memcg = NULL;
+ int err = 0;
+
+ if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
+ /* Attach to a memory cgroup? */
+ memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
+ if (IS_ERR_OR_NULL(memcg))
+ return PTR_ERR(memcg);
+ bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
+ } else {
+ /* System-wide OOM handler */
+ bpf_oom_ops_ptr = &system_bpf_oom;
+ }
+
+ /* Another struct ops attached? */
+ if (READ_ONCE(*bpf_oom_ops_ptr)) {
+ err = -EBUSY;
+ goto exit;
+ }
+
+ /* Expose bpf_oom_ops structure */
+ WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops);
+exit:
+ mem_cgroup_put(memcg);
+ return err;
+}
+
+static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
+ struct bpf_oom_ops *bpf_oom_ops = kdata;
+ struct mem_cgroup *memcg = NULL;
+
+ if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
+ /* Detach from a memory cgroup? */
+ memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
+ if (IS_ERR_OR_NULL(memcg))
+ goto exit;
+ bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
+ } else {
+ /* System-wide OOM handler */
+ bpf_oom_ops_ptr = &system_bpf_oom;
+ }
+
+ /* Hide bpf_oom_ops from new callers */
+ if (!WARN_ON(READ_ONCE(*bpf_oom_ops_ptr) != bpf_oom_ops))
+ WRITE_ONCE(*bpf_oom_ops_ptr, NULL);
+
+ mem_cgroup_put(memcg);
+
+exit:
+ /* Release bpf_oom_ops after a srcu grace period */
+ synchronize_srcu(&bpf_oom_srcu);
+}
+
+#ifdef CONFIG_MEMCG
+void bpf_oom_memcg_offline(struct mem_cgroup *memcg)
+{
+ struct bpf_oom_ops *bpf_oom_ops;
+ struct bpf_oom_ctx exec_ctx;
+ u64 cgrp_id;
+ int idx;
+
+ /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
+ idx = srcu_read_lock(&bpf_oom_srcu);
+
+ bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
+ WRITE_ONCE(memcg->bpf_oom, NULL);
+
+ if (bpf_oom_ops && bpf_oom_ops->handle_cgroup_offline) {
+ cgrp_id = cgroup_id(memcg->css.cgroup);
+ exec_ctx.cgroup_id = cgrp_id;
+ bpf_oom_ops->handle_cgroup_offline(&exec_ctx, cgrp_id);
+ }
+
+ srcu_read_unlock(&bpf_oom_srcu, idx);
+}
+#endif /* CONFIG_MEMCG */
+
+static int bpf_oom_ops_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_oom_ops, handle_out_of_memory):
+ if (!prog)
+ return -EINVAL;
+ break;
+ }
+
+ return 0;
+}
+
+static int bpf_oom_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct bpf_oom_ops *uops = udata;
+ struct bpf_oom_ops *ops = kdata;
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_oom_ops, name):
+ if (uops->name[0])
+ strscpy_pad(ops->name, uops->name, sizeof(ops->name));
+ else
+ strscpy_pad(ops->name, "bpf_defined_policy");
+ return 1;
+ }
+ return 0;
+}
+
+static int bpf_oom_ops_init(struct btf *btf)
+{
+ return 0;
+}
+
+static struct bpf_struct_ops bpf_oom_bpf_ops = {
+ .verifier_ops = &bpf_oom_verifier_ops,
+ .reg = bpf_oom_ops_reg,
+ .unreg = bpf_oom_ops_unreg,
+ .check_member = bpf_oom_ops_check_member,
+ .init_member = bpf_oom_ops_init_member,
+ .init = bpf_oom_ops_init,
+ .name = "bpf_oom_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_oom_ops
+};
+
+static int __init bpf_oom_struct_ops_init(void)
+{
+ return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops);
+}
+late_initcall(bpf_oom_struct_ops_init);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5d27cd5372aa..d44c1f293e16 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
#include <linux/seq_buf.h>
#include <linux/sched/isolation.h>
#include <linux/kmemleak.h>
+#include <linux/bpf_oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -3885,6 +3886,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
zswap_memcg_offline_cleanup(memcg);
+ bpf_oom_memcg_offline(memcg);
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c145b0feecc1..d05ec0f84087 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,7 @@
#include <linux/mmu_notifier.h>
#include <linux/cred.h>
#include <linux/nmi.h>
+#include <linux/bpf_oom.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = {
[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
+static const char *oom_policy_name(struct oom_control *oc)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ if (oc->bpf_policy_name)
+ return oc->bpf_policy_name;
+#endif
+ return "default";
+}
+
/*
* Determine the type of allocation constraint.
*/
@@ -458,9 +468,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim)
static void dump_header(struct oom_control *oc)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\noom_policy=%s\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
- current->signal->oom_score_adj);
+ current->signal->oom_score_adj,
+ oom_policy_name(oc));
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
@@ -1167,6 +1178,13 @@ bool out_of_memory(struct oom_control *oc)
return true;
}
+ /*
+ * Let bpf handle the OOM first. If it was able to free up some memory,
+ * bail out. Otherwise fall back to the kernel OOM killer.
+ */
+ if (bpf_handle_oom(oc))
+ return true;
+
select_bad_process(oc);
/* Found nothing?!?! */
if (!oc->chosen) {
--
2.51.0
On Mon 27-10-25 16:17:09, Roman Gushchin wrote:
> Introduce a bpf struct ops for implementing custom OOM handling
> policies.
>
> It's possible to load one bpf_oom_ops for the system and one
> bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
> cgroup tree is traversed from the OOM'ing memcg up to the root and
> corresponding BPF OOM handlers are executed until some memory is
> freed. If no memory is freed, the kernel OOM killer is invoked.

Do you have any usecase in mind where parent memcg oom handler decides
to not kill or cannot kill anything and hand over upwards in the
hierarchy?

> The struct ops provides the bpf_handle_out_of_memory() callback,
> which expected to return 1 if it was able to free some memory and 0
> otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed
> field of the oom_control structure, which is expected to be set by
> kfuncs suitable for releasing memory. If both are set, OOM is
> considered handled, otherwise the next OOM handler in the chain
> (e.g. BPF OOM attached to the parent cgroup or the in-kernel OOM
> killer) is executed.

Could you explain why do we need both? Why is not bpf_memory_freed
return value sufficient?

> The bpf_handle_out_of_memory() callback program is sleepable to enable
> using iterators, e.g. cgroup iterators. The callback receives struct
> oom_control as an argument, so it can determine the scope of the OOM
> event: if this is a memcg-wide or system-wide OOM.

This could be tricky because it might introduce a subtle and hard to
debug lock dependency chain. lock(a); allocation() -> oom -> lock(a).
Sleepable locks should be only allowed in trylock mode.

> The callback is executed just before the kernel victim task selection
> algorithm, so all heuristics and sysctls like panic on oom,
> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
> are respected.

I guess you meant to say and sysctl_panic_on_oom.

> BPF OOM struct ops provides the handle_cgroup_offline() callback
> which is good for releasing struct ops if the corresponding cgroup
> is gone.

What kind of synchronization is expected between handle_cgroup_offline
and bpf_handle_out_of_memory?

> The struct ops also has the name field, which allows to define a
> custom name for the implemented policy. It's printed in the OOM report
> in the oom_policy=<policy> format. "default" is printed if bpf is not
> used or policy name is not specified.

oom_handler seems like a better fit but nothing I would insist on. Also
I would just print it if there is an actual handler so that existing
users who do not use bpf oom killers do not need to change their
parsers.

Other than that this looks reasonable to me.
--
Michal Hocko
SUSE Labs
Michal Hocko <mhocko@suse.com> writes:
> On Mon 27-10-25 16:17:09, Roman Gushchin wrote:
>> Introduce a bpf struct ops for implementing custom OOM handling
>> policies.
>>
>> It's possible to load one bpf_oom_ops for the system and one
>> bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
>> cgroup tree is traversed from the OOM'ing memcg up to the root and
>> corresponding BPF OOM handlers are executed until some memory is
>> freed. If no memory is freed, the kernel OOM killer is invoked.
>
> Do you have any usecase in mind where parent memcg oom handler decides
> to not kill or cannot kill anything and hand over upwards in the
> hierarchy?

I believe that in most cases bpf handlers will handle ooms themselves,
but because strictly speaking I don't have control over what bpf
programs do or do not, the kernel should provide the fallback mechanism.
This is a common practice with bpf, e.g. sched_ext falls back to
CFS/EEVDF in case something is wrong.

Specifically to OOM case, I believe someone might want to use bpf
programs just for monitoring/collecting some information, without
trying to actually free some memory.

>> The struct ops provides the bpf_handle_out_of_memory() callback,
>> which expected to return 1 if it was able to free some memory and 0
>> otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed
>> field of the oom_control structure, which is expected to be set by
>> kfuncs suitable for releasing memory. If both are set, OOM is
>> considered handled, otherwise the next OOM handler in the chain
>> (e.g. BPF OOM attached to the parent cgroup or the in-kernel OOM
>> killer) is executed.
>
> Could you explain why do we need both? Why is not bpf_memory_freed
> return value sufficient?

Strictly speaking, bpf_memory_freed should be enough, but because
bpf programs have to return an int and there is no additional cost
to add this option (pass to next or in-kernel oom handler), I thought
it's not a bad idea. If you feel strongly otherwise, I can ignore
the return value on rely on bpf_memory_freed only.

>
>> The bpf_handle_out_of_memory() callback program is sleepable to enable
>> using iterators, e.g. cgroup iterators. The callback receives struct
>> oom_control as an argument, so it can determine the scope of the OOM
>> event: if this is a memcg-wide or system-wide OOM.
>
> This could be tricky because it might introduce a subtle and hard to
> debug lock dependency chain. lock(a); allocation() -> oom -> lock(a).
> Sleepable locks should be only allowed in trylock mode.

Agree, but it's achieved by controlling the context where oom can be
declared (e.g. in bpf_psi case it's done from a work context).

>
>> The callback is executed just before the kernel victim task selection
>> algorithm, so all heuristics and sysctls like panic on oom,
>> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
>> are respected.
>
> I guess you meant to say and sysctl_panic_on_oom.

Yep, fixed.

>
>> BPF OOM struct ops provides the handle_cgroup_offline() callback
>> which is good for releasing struct ops if the corresponding cgroup
>> is gone.
>
> What kind of synchronization is expected between handle_cgroup_offline
> and bpf_handle_out_of_memory?

You mean from a user's perspective? E.g. can these two callbacks run in
parallel? Currently yes, but it's a good question, I haven't thought
about it, maybe it's better to synchronize them.
Internally both rely on srcu to pin bpf_oom_ops in memory.

>
>> The struct ops also has the name field, which allows to define a
>> custom name for the implemented policy. It's printed in the OOM report
>> in the oom_policy=<policy> format. "default" is printed if bpf is not
>> used or policy name is not specified.
>
> oom_handler seems like a better fit but nothing I would insist on. Also
> I would just print it if there is an actual handler so that existing
> users who do not use bpf oom killers do not need to change their
> parsers.

Sure, works for me too.

>
> Other than that this looks reasonable to me.

Sound great, thank you for taking a look!
On Sun 02-11-25 13:36:25, Roman Gushchin wrote:
> Michal Hocko <mhocko@suse.com> writes:
>
> > On Mon 27-10-25 16:17:09, Roman Gushchin wrote:
> >> Introduce a bpf struct ops for implementing custom OOM handling
> >> policies.
> >>
> >> It's possible to load one bpf_oom_ops for the system and one
> >> bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
> >> cgroup tree is traversed from the OOM'ing memcg up to the root and
> >> corresponding BPF OOM handlers are executed until some memory is
> >> freed. If no memory is freed, the kernel OOM killer is invoked.
> >
> > Do you have any usecase in mind where parent memcg oom handler decides
> > to not kill or cannot kill anything and hand over upwards in the
> > hierarchy?
>
> I believe that in most cases bpf handlers will handle ooms themselves,
> but because strictly speaking I don't have control over what bpf
> programs do or do not, the kernel should provide the fallback mechanism.
> This is a common practice with bpf, e.g. sched_ext falls back to
> CFS/EEVDF in case something is wrong.
We do have fallback mechanism - the kernel oom handling. For that we do
not need to pass to parent handler. Please note that I am not opposing
this but I would like to understand thinking behind and hopefully start
with a simpler model and then extend it later than go with a more
complex one initially and then corner ourselves with weird side
effects.
> Specifically to OOM case, I believe someone might want to use bpf
> programs just for monitoring/collecting some information, without
> trying to actually free some memory.
>
> >> The struct ops provides the bpf_handle_out_of_memory() callback,
> >> which expected to return 1 if it was able to free some memory and 0
> >> otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed
> >> field of the oom_control structure, which is expected to be set by
> >> kfuncs suitable for releasing memory. If both are set, OOM is
> >> considered handled, otherwise the next OOM handler in the chain
> >> (e.g. BPF OOM attached to the parent cgroup or the in-kernel OOM
> >> killer) is executed.
> >
> > Could you explain why do we need both? Why is not bpf_memory_freed
> > return value sufficient?
>
> Strictly speaking, bpf_memory_freed should be enough, but because
> bpf programs have to return an int and there is no additional cost
> to add this option (pass to next or in-kernel oom handler), I thought
> it's not a bad idea. If you feel strongly otherwise, I can ignore
> the return value on rely on bpf_memory_freed only.
No, I do not feel strongly one way or the other but I would like to
understand thinking behind that. My slight preference would be to have a
single return status that clearly describe the intention. If you want to
have more flexible chaining semantic then an enum { IGNORED, HANDLED,
PASS_TO_PARENT, ...} would be both more flexible, extensible and easier
to understand.
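Something along these lines, purely as an illustration (the names are
made up and not part of the posted patch):

enum bpf_oom_status {
	BPF_OOM_IGNORED,	/* did nothing, try the next handler */
	BPF_OOM_HANDLED,	/* memory was freed, stop here */
	BPF_OOM_PASS_TO_PARENT,	/* defer to a handler up the cgroup tree */
	BPF_OOM_PASS_TO_KERNEL,	/* go straight to the in-kernel OOM killer */
};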
> >> The bpf_handle_out_of_memory() callback program is sleepable to enable
> >> using iterators, e.g. cgroup iterators. The callback receives struct
> >> oom_control as an argument, so it can determine the scope of the OOM
> >> event: if this is a memcg-wide or system-wide OOM.
> >
> > This could be tricky because it might introduce a subtle and hard to
> > debug lock dependency chain. lock(a); allocation() -> oom -> lock(a).
> > Sleepable locks should be only allowed in trylock mode.
>
> Agree, but it's achieved by controlling the context where oom can be
> declared (e.g. in bpf_psi case it's done from a work context).
but out_of_memory is any sleepable context. So this is a real problem.
> >> The callback is executed just before the kernel victim task selection
> >> algorithm, so all heuristics and sysctls like panic on oom,
> >> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
> >> are respected.
> >
> > I guess you meant to say and sysctl_panic_on_oom.
>
> Yep, fixed.
> >
> >> BPF OOM struct ops provides the handle_cgroup_offline() callback
> >> which is good for releasing struct ops if the corresponding cgroup
> >> is gone.
> >
> > What kind of synchronization is expected between handle_cgroup_offline
> > and bpf_handle_out_of_memory?
>
> You mean from a user's perspective?
I mean from bpf handler writer POV
> E.g. can these two callbacks run in
> parallel? Currently yes, but it's a good question, I haven't thought
> about it, maybe it's better to synchronize them.
> Internally both rely on srcu to pin bpf_oom_ops in memory.
This should be really documented.
--
Michal Hocko
SUSE Labs
Michal Hocko <mhocko@suse.com> writes:
> On Sun 02-11-25 13:36:25, Roman Gushchin wrote:
>> Michal Hocko <mhocko@suse.com> writes:
>>
>> > On Mon 27-10-25 16:17:09, Roman Gushchin wrote:
>> >> Introduce a bpf struct ops for implementing custom OOM handling
>> >> policies.
>> >>
>> >> It's possible to load one bpf_oom_ops for the system and one
>> >> bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
>> >> cgroup tree is traversed from the OOM'ing memcg up to the root and
>> >> corresponding BPF OOM handlers are executed until some memory is
>> >> freed. If no memory is freed, the kernel OOM killer is invoked.
>> >
>> > Do you have any usecase in mind where parent memcg oom handler decides
>> > to not kill or cannot kill anything and hand over upwards in the
>> > hierarchy?
>>
>> I believe that in most cases bpf handlers will handle ooms themselves,
>> but because strictly speaking I don't have control over what bpf
>> programs do or do not, the kernel should provide the fallback mechanism.
>> This is a common practice with bpf, e.g. sched_ext falls back to
>> CFS/EEVDF in case something is wrong.
>
> We do have fallback mechanism - the kernel oom handling. For that we do
> not need to pass to parent handler. Please note that I am not opposing
> this but I would like to understand thinking behind and hopefully start
> with a simpler model and then extend it later than go with a more
> complex one initially and then corner ourselves with weird side
> effects.
>
>> Specifically to OOM case, I believe someone might want to use bpf
>> programs just for monitoring/collecting some information, without
>> trying to actually free some memory.
>>
>> >> The struct ops provides the bpf_handle_out_of_memory() callback,
>> >> which expected to return 1 if it was able to free some memory and 0
>> >> otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed
>> >> field of the oom_control structure, which is expected to be set by
>> >> kfuncs suitable for releasing memory. If both are set, OOM is
>> >> considered handled, otherwise the next OOM handler in the chain
>> >> (e.g. BPF OOM attached to the parent cgroup or the in-kernel OOM
>> >> killer) is executed.
>> >
>> > Could you explain why do we need both? Why is not bpf_memory_freed
>> > return value sufficient?
>>
>> Strictly speaking, bpf_memory_freed should be enough, but because
>> bpf programs have to return an int and there is no additional cost
>> to add this option (pass to next or in-kernel oom handler), I thought
>> it's not a bad idea. If you feel strongly otherwise, I can ignore
>> the return value on rely on bpf_memory_freed only.
>
> No, I do not feel strongly one way or the other but I would like to
> understand thinking behind that. My slight preference would be to have a
> single return status that clearly describe the intention. If you want to
> have more flexible chaining semantic then an enum { IGNORED, HANDLED,
> PASS_TO_PARENT, ...} would be both more flexible, extensible and easier
> to understand.
The thinking is simple:
1) Most users will have a single global bpf oom policy, which basically
replaces the in-kernel oom killer.
2) If there are standalone containers, they might want to do the same on
their level. And the "host" system doesn't directly control it.
3) If for some reason the inner oom handler fails to free up some
memory, there are two potential fallback options: call the in-kernel oom
killer for that memory cgroup or call an upper level bpf oom killer, if
there is one.
I think the latter is more logical and less surprising. Imagine you're
running multiple containers and some of them implement their own bpf oom
logic and some don't. Why would we treat them differently if their bpf
logic fails?
Re a single return value: I can absolutely specify return values as an
enum, my point is that unlike the kernel code we can't fully trust the
value returned from a bpf program, this is why the second check is in
place.
Can we just ignore the returned value and rely on the freed_memory flag?
Sure, but I don't think it buys us anything.
Also, I have to admit that I don't have an immediate production use case
for nested oom handlers (I'm fine with a global one), but it was asked
by Alexei Starovoitov. And I agree with him that the containerized case
will come up soon, so it's better to think of it in advance.
>> >> The bpf_handle_out_of_memory() callback program is sleepable to enable
>> >> using iterators, e.g. cgroup iterators. The callback receives struct
>> >> oom_control as an argument, so it can determine the scope of the OOM
>> >> event: if this is a memcg-wide or system-wide OOM.
>> >
>> > This could be tricky because it might introduce a subtle and hard to
>> > debug lock dependency chain. lock(a); allocation() -> oom -> lock(a).
>> > Sleepable locks should be only allowed in trylock mode.
>>
>> Agree, but it's achieved by controlling the context where oom can be
>> declared (e.g. in bpf_psi case it's done from a work context).
>
> but out_of_memory is any sleepable context. So this is a real problem.
We need to restrict both:
1) where from bpf_out_of_memory() can be called (already done, as of now
only from bpf_psi callback, which is safe).
2) which kfuncs are available to bpf oom handlers (only those, which are
not trying to grab unsafe locks) - I'll double check it in the next version.
Thank you!
On Mon 03-11-25 17:45:09, Roman Gushchin wrote:
> Michal Hocko <mhocko@suse.com> writes:
>
> > On Sun 02-11-25 13:36:25, Roman Gushchin wrote:
> >> Michal Hocko <mhocko@suse.com> writes:
[...]
> > No, I do not feel strongly one way or the other but I would like to
> > understand thinking behind that. My slight preference would be to have a
> > single return status that clearly describe the intention. If you want to
> > have more flexible chaining semantic then an enum { IGNORED, HANDLED,
> > PASS_TO_PARENT, ...} would be both more flexible, extensible and easier
> > to understand.
>
> The thinking is simple:
> 1) Most users will have a single global bpf oom policy, which basically
> replaces the in-kernel oom killer.
> 2) If there are standalone containers, they might want to do the same on
> their level. And the "host" system doesn't directly control it.
> 3) If for some reason the inner oom handler fails to free up some
> memory, there are two potential fallback options: call the in-kernel oom
> killer for that memory cgroup or call an upper level bpf oom killer, if
> there is one.
>
> I think the latter is more logical and less surprising. Imagine you're
> running multiple containers and some of them implement their own bpf oom
> logic and some don't. Why would we treat them differently if their bpf
> logic fails?
I think both approaches are valid and it should be the actual handler to
tell what to do next. If the handler would prefer the in-kernel fallback
it should be able to enforce that rather than a potentially unknown bpf
handler up the chain.
> Re a single return value: I can absolutely specify return values as an
> enum, my point is that unlike the kernel code we can't fully trust the
> value returned from a bpf program, this is why the second check is in
> place.
I do not understand this. Could you elaborate? Why we cannot trust the
return value but we can trust a combination of the return value and a
state stored in a helper structure?
> Can we just ignore the returned value and rely on the freed_memory flag?
I do not think having a single freed_memory flag is more helpful. This
is just a number that cannot say much more than a memory has been freed.
It is not really important whether and how much memory bpf handler
believes it has freed. It is much more important to note whether it
believes it is done, it needs assistance from a different handler up the
chain or just pass over to the in-kernel implementation.
> Sure, but I don't think it buys us anything.
>
> Also, I have to admit that I don't have an immediate production use case
> for nested oom handlers (I'm fine with a global one), but it was asked
> by Alexei Starovoitov. And I agree with him that the containerized case
> will come up soon, so it's better to think of it in advance.
I agree it is good to be prepared for that.
> >> >> The bpf_handle_out_of_memory() callback program is sleepable to enable
> >> >> using iterators, e.g. cgroup iterators. The callback receives struct
> >> >> oom_control as an argument, so it can determine the scope of the OOM
> >> >> event: if this is a memcg-wide or system-wide OOM.
> >> >
> >> > This could be tricky because it might introduce a subtle and hard to
> >> > debug lock dependency chain. lock(a); allocation() -> oom -> lock(a).
> >> > Sleepable locks should be only allowed in trylock mode.
> >>
> >> Agree, but it's achieved by controlling the context where oom can be
> >> declared (e.g. in bpf_psi case it's done from a work context).
> >
> > but out_of_memory is any sleepable context. So this is a real problem.
>
> We need to restrict both:
> 1) where from bpf_out_of_memory() can be called (already done, as of now
> only from bpf_psi callback, which is safe).
> 2) which kfuncs are available to bpf oom handlers (only those, which are
> not trying to grab unsafe locks) - I'll double check it in the next version.
OK. All I am trying to say is that only safe sleepable locks are
trylocks and that should be documented because I do not think it can be
enforced
--
Michal Hocko
SUSE Labs
Michal Hocko <mhocko@suse.com> writes:
> On Mon 03-11-25 17:45:09, Roman Gushchin wrote:
>> Michal Hocko <mhocko@suse.com> writes:
>>
>> > On Sun 02-11-25 13:36:25, Roman Gushchin wrote:
>> >> Michal Hocko <mhocko@suse.com> writes:
> [...]
>> > No, I do not feel strongly one way or the other but I would like to
>> > understand thinking behind that. My slight preference would be to have a
>> > single return status that clearly describe the intention. If you want to
>> > have more flexible chaining semantic then an enum { IGNORED, HANDLED,
>> > PASS_TO_PARENT, ...} would be both more flexible, extensible and easier
>> > to understand.
>>
>> The thinking is simple:
>> 1) Most users will have a single global bpf oom policy, which basically
>> replaces the in-kernel oom killer.
>> 2) If there are standalone containers, they might want to do the same on
>> their level. And the "host" system doesn't directly control it.
>> 3) If for some reason the inner oom handler fails to free up some
>> memory, there are two potential fallback options: call the in-kernel oom
>> killer for that memory cgroup or call an upper level bpf oom killer, if
>> there is one.
>>
>> I think the latter is more logical and less surprising. Imagine you're
>> running multiple containers and some of them implement their own bpf oom
>> logic and some don't. Why would we treat them differently if their bpf
>> logic fails?
>
> I think both approaches are valid and it should be the actual handler to
> tell what to do next. If the handler would prefer the in-kernel fallback
> it should be able to enforce that rather than a potentially unknown bpf
> handler up the chain.
The counter-argument is that cgroups are hierarchical and higher level
cgroups should be able to enforce the desired behavior for their
sub-trees. I'm not sure what's more important here and have to think
more about it.
Do you have an example when it might be important for container to not
pass to a higher level bpf handler?
>
>> Re a single return value: I can absolutely specify return values as an
>> enum, my point is that unlike the kernel code we can't fully trust the
>> value returned from a bpf program, this is why the second check is in
>> place.
>
> I do not understand this. Could you elaborate? Why we cannot trust the
> return value but we can trust a combination of the return value and a
> state stored in a helper structure?
Imagine bpf program which does nothing and simple returns 1. Imagine
it's loaded as a system-wide oom handler. This will effectively disable
the oom killer and lead to a potential deadlock on memory.
But it's a perfectly valid bpf program.
This is something I want to avoid (and it's a common practice with other
bpf programs).
What I do I also rely on the value of the oom control's field, which is
not accessible to the bpf program for write directly, but can be changed
by calling certain helper functions, e.g. bpf_oom_kill_process.
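E.g. a do-nothing handler like this (a sketch) returns "success" but is
still not trusted, because no kfunc has set oc->bpf_memory_freed:

SEC("struct_ops.s/handle_out_of_memory")
int BPF_PROG(noop_out_of_memory, struct bpf_oom_ctx *ctx, struct oom_control *oc)
{
	return 1;	/* claims success without freeing anything */
}

Since bpf_oom_kill_process() (or a similar kfunc) never ran,
oc->bpf_memory_freed stays false and bpf_handle_oom() falls through to
the next handler in the chain.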
>> Can we just ignore the returned value and rely on the freed_memory flag?
>
> I do not think having a single freed_memory flag is more helpful. This
> is just a number that cannot say much more than a memory has been freed.
> It is not really important whether and how much memory bpf handler
> believes it has freed. It is much more important to note whether it
> believes it is done, it needs assistance from a different handler up the
> chain or just pass over to the in-kernel implementation.
Btw in general in a containerized environment a bpf handler knows
nothing about bpf programs up in the cgroup hierarchy... So it only
knows whether it was able to free some memory or not.
>
>> Sure, but I don't think it bus us anything.
>>
>> Also, I have to admit that I don't have an immediate production use case
>> for nested oom handlers (I'm fine with a global one), but it was asked
>> by Alexei Starovoitov. And I agree with him that the containerized case
>> will come up soon, so it's better to think of it in advance.
>
> I agree it is good to be prepared for that.
>
>> >> >> The bpf_handle_out_of_memory() callback program is sleepable to enable
>> >> >> using iterators, e.g. cgroup iterators. The callback receives struct
>> >> >> oom_control as an argument, so it can determine the scope of the OOM
>> >> >> event: if this is a memcg-wide or system-wide OOM.
>> >> >
>> >> > This could be tricky because it might introduce a subtle and hard to
>> >> > debug lock dependency chain. lock(a); allocation() -> oom -> lock(a).
>> >> > Sleepable locks should be only allowed in trylock mode.
>> >>
>> >> Agree, but it's achieved by controlling the context where oom can be
>> >> declared (e.g. in bpf_psi case it's done from a work context).
>> >
>> > but out_of_memory is any sleepable context. So this is a real problem.
>>
>> We need to restrict both:
>> 1) where from bpf_out_of_memory() can be called (already done, as of now
>> only from bpf_psi callback, which is safe).
>> 2) which kfuncs are available to bpf oom handlers (only those, which are
>> not trying to grab unsafe locks) - I'll double check it in the next version.
>
> OK. All I am trying to say is that only safe sleepable locks are
> trylocks and that should be documented because I do not think it can be
> enforced
It can! Not directly, but by controlling which kfuncs/helpers are
available to bpf programs.
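E.g. the kfuncs exposed to these handlers can be registered as a
dedicated id set, roughly like this (a sketch; the actual set and flags
are defined by the follow-up patches):

BTF_KFUNCS_START(bpf_oom_kfuncs)
BTF_ID_FLAGS(func, bpf_oom_kill_process, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_oom_kfuncs)

static const struct btf_kfunc_id_set bpf_oom_kfunc_set = {
	.owner	= THIS_MODULE,
	.set	= &bpf_oom_kfuncs,
	/* a .filter callback can further limit these kfuncs to
	 * programs attached to bpf_oom_ops */
};

/* e.g. from bpf_oom_struct_ops_init() */
err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_oom_kfunc_set);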
I agree with you in principle re locks and necessary precaution here.
Thanks!
On Tue 04-11-25 10:14:05, Roman Gushchin wrote:
> Michal Hocko <mhocko@suse.com> writes:
>
> > On Mon 03-11-25 17:45:09, Roman Gushchin wrote:
> >> Michal Hocko <mhocko@suse.com> writes:
> >>
> >> > On Sun 02-11-25 13:36:25, Roman Gushchin wrote:
> >> >> Michal Hocko <mhocko@suse.com> writes:
> > [...]
> >> > No, I do not feel strongly one way or the other but I would like to
> >> > understand thinking behind that. My slight preference would be to have a
> >> > single return status that clearly describe the intention. If you want to
> >> > have more flexible chaining semantic then an enum { IGNORED, HANDLED,
> >> > PASS_TO_PARENT, ...} would be both more flexible, extensible and easier
> >> > to understand.
> >>
> >> The thinking is simple:
> >> 1) Most users will have a single global bpf oom policy, which basically
> >> replaces the in-kernel oom killer.
> >> 2) If there are standalone containers, they might want to do the same on
> >> their level. And the "host" system doesn't directly control it.
> >> 3) If for some reason the inner oom handler fails to free up some
> >> memory, there are two potential fallback options: call the in-kernel oom
> >> killer for that memory cgroup or call an upper level bpf oom killer, if
> >> there is one.
> >>
> >> I think the latter is more logical and less surprising. Imagine you're
> >> running multiple containers and some of them implement their own bpf oom
> >> logic and some don't. Why would we treat them differently if their bpf
> >> logic fails?
> >
> > I think both approaches are valid and it should be the actual handler to
> > tell what to do next. If the handler would prefer the in-kernel fallback
> > it should be able to enforce that rather than a potentially unknown bpf
> > handler up the chain.
>
> The counter-argument is that cgroups are hierarchical and higher level
> cgroups should be able to enforce the desired behavior for their
> sub-trees. I'm not sure what's more important here and have to think
> more about it.
Right and they can enforce that through their limits - hence oom.
> Do you have an example when it might be important for container to not
> pass to a higher level bpf handler?
Nothing really specific. I am still trying to wrap my head around what
level of flexibility is necessary here. My initial thought would be to
just deal with it in the scope of the bpf handler and fall back to the
kernel implementation if it cannot deal with the situation. Since you
brought that up you made me think.
I know that we do not provide a userspace-like no-regression policy to
BPF programs but it would still be good to have a way to add new
potential fallback policies without breaking existing handlers.
> >> Re a single return value: I can absolutely specify return values as an
> >> enum, my point is that unlike the kernel code we can't fully trust the
> >> value returned from a bpf program, this is why the second check is in
> >> place.
> >
> > I do not understand this. Could you elaborate? Why we cannot trust the
> > return value but we can trust a combination of the return value and a
> > state stored in a helper structure?
>
> Imagine bpf program which does nothing and simple returns 1. Imagine
> it's loaded as a system-wide oom handler. This will effectively disable
> the oom killer and lead to a potential deadlock on memory.
> But it's a perfectly valid bpf program.
> This is something I want to avoid (and it's a common practice with other
> bpf programs).
>
> What I do I also rely on the value of the oom control's field, which is
> not accessible to the bpf program for write directly, but can be changed
> by calling certain helper functions, e.g. bpf_oom_kill_process.
OK, now I can see your point. You want to have a line of defense in the
trusted BPF-facing interface. This makes sense to me. Maybe it would be
good to call that out more explicitly. Something like
The BPF OOM infrastructure only trusts BPF handlers which are using
pre-selected functions to free up memory, e.g. bpf_oom_kill_process. Those
will set an internal state not available to those handlers directly.
BPF handler return value is ignored if that state is not set.
I would rather call this differently to freed_memory as the actual
memory might be freed asynchronously (e.g. oom_reaper) and this is more
about conformity/trust than actual physical memory being freed. I do not
care much about naming as long as this is clearly documented though,
including the set of functions that form that prescribed API.
[...]
> > OK. All I am trying to say is that only safe sleepable locks are
> > trylocks and that should be documented because I do not think it can be
> > enforced
>
> It can! Not directly, but by controlling which kfuncs/helpers are
> available to bpf programs.
OK, I see. This is better than relying only on having this documented.
--
Michal Hocko
SUSE Labs
On Tue, Oct 28, 2025 at 7:22 AM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Introduce a bpf struct ops for implementing custom OOM handling
> policies.
>
> It's possible to load one bpf_oom_ops for the system and one
> bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
> cgroup tree is traversed from the OOM'ing memcg up to the root and
> corresponding BPF OOM handlers are executed until some memory is
> freed. If no memory is freed, the kernel OOM killer is invoked.
>
> [...]
>
> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
> + struct bpf_oom_ops *bpf_oom_ops = kdata;
> + struct mem_cgroup *memcg = NULL;
> + int err = 0;
> +
> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
> + /* Attach to a memory cgroup? */
> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
> + if (IS_ERR_OR_NULL(memcg))
> + return PTR_ERR(memcg);
> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
> + } else {
> + /* System-wide OOM handler */
> + bpf_oom_ops_ptr = &system_bpf_oom;
> + }
> +
> + /* Another struct ops attached? */
> + if (READ_ONCE(*bpf_oom_ops_ptr)) {
> + err = -EBUSY;
> + goto exit;
> + }
> +
> + /* Expose bpf_oom_ops structure */
> + WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops);
The mechanism for propagating this pointer to child cgroups isn't
clear. Would an explicit installation in every cgroup be required?
This approach seems impractical for production environments, where
cgroups are often created dynamically.
--
Regards
Yafang
Yafang Shao <laoar.shao@gmail.com> writes:
> On Tue, Oct 28, 2025 at 7:22 AM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> Introduce a bpf struct ops for implementing custom OOM handling
>> policies.
>>
>> It's possible to load one bpf_oom_ops for the system and one
>> bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
>> cgroup tree is traversed from the OOM'ing memcg up to the root and
>> corresponding BPF OOM handlers are executed until some memory is
>> freed. If no memory is freed, the kernel OOM killer is invoked.
>>
>> The struct ops provides the bpf_handle_out_of_memory() callback,
>> which expected to return 1 if it was able to free some memory and 0
>> otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed
>> field of the oom_control structure, which is expected to be set by
>> kfuncs suitable for releasing memory. If both are set, OOM is
>> considered handled, otherwise the next OOM handler in the chain
>> (e.g. BPF OOM attached to the parent cgroup or the in-kernel OOM
>> killer) is executed.
>>
>> The bpf_handle_out_of_memory() callback program is sleepable to enable
>> using iterators, e.g. cgroup iterators. The callback receives struct
>> oom_control as an argument, so it can determine the scope of the OOM
>> event: if this is a memcg-wide or system-wide OOM.
>>
>> The callback is executed just before the kernel victim task selection
>> algorithm, so all heuristics and sysctls like panic on oom,
>> sysctl_oom_kill_allocating_task and sysctl_oom_kill_allocating_task
>> are respected.
>>
>> BPF OOM struct ops provides the handle_cgroup_offline() callback
>> which is good for releasing struct ops if the corresponding cgroup
>> is gone.
>>
>> The struct ops also has the name field, which allows to define a
>> custom name for the implemented policy. It's printed in the OOM report
>> in the oom_policy=<policy> format. "default" is printed if bpf is not
>> used or policy name is not specified.
>>
>> [ 112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0
>> oom_policy=bpf_test_policy
>> [ 112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full)
>> [ 112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
>> [ 112.698167] Call Trace:
>> [ 112.698177] <TASK>
>> [ 112.698182] dump_stack_lvl+0x4d/0x70
>> [ 112.698192] dump_header+0x59/0x1c6
>> [ 112.698199] oom_kill_process.cold+0x8/0xef
>> [ 112.698206] bpf_oom_kill_process+0x59/0xb0
>> [ 112.698216] bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313
>> [ 112.698229] bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf
>> [ 112.698236] ? srso_alias_return_thunk+0x5/0xfbef5
>> [ 112.698240] bpf_handle_oom+0x11a/0x1e0
>> [ 112.698250] out_of_memory+0xab/0x5c0
>> [ 112.698258] mem_cgroup_out_of_memory+0xbc/0x110
>> [ 112.698274] try_charge_memcg+0x4b5/0x7e0
>> [ 112.698288] charge_memcg+0x2f/0xc0
>> [ 112.698293] __mem_cgroup_charge+0x30/0xc0
>> [ 112.698299] do_anonymous_page+0x40f/0xa50
>> [ 112.698311] __handle_mm_fault+0xbba/0x1140
>> [ 112.698317] ? srso_alias_return_thunk+0x5/0xfbef5
>> [ 112.698335] handle_mm_fault+0xe6/0x370
>> [ 112.698343] do_user_addr_fault+0x211/0x6a0
>> [ 112.698354] exc_page_fault+0x75/0x1d0
>> [ 112.698363] asm_exc_page_fault+0x26/0x30
>> [ 112.698366] RIP: 0033:0x7fa97236db00
>>
>> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
>> ---
>> include/linux/bpf_oom.h | 74 ++++++++++
>> include/linux/memcontrol.h | 5 +
>> include/linux/oom.h | 8 ++
>> mm/Makefile | 3 +
>> mm/bpf_oom.c | 272 +++++++++++++++++++++++++++++++++++++
>> mm/memcontrol.c | 2 +
>> mm/oom_kill.c | 22 ++-
>> 7 files changed, 384 insertions(+), 2 deletions(-)
>> create mode 100644 include/linux/bpf_oom.h
>> create mode 100644 mm/bpf_oom.c
>>
>> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
>> new file mode 100644
>> index 000000000000..18c32a5a068b
>> --- /dev/null
>> +++ b/include/linux/bpf_oom.h
>> @@ -0,0 +1,74 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ */
>> +
>> +#ifndef __BPF_OOM_H
>> +#define __BPF_OOM_H
>> +
>> +struct oom_control;
>> +
>> +#define BPF_OOM_NAME_MAX_LEN 64
>> +
>> +struct bpf_oom_ctx {
>> + /*
>> + * If bpf_oom_ops is attached to a cgroup, id of this cgroup.
>> + * 0 otherwise.
>> + */
>> + u64 cgroup_id;
>> +};
>> +
>> +struct bpf_oom_ops {
>> + /**
>> + * @handle_out_of_memory: Out of memory bpf handler, called before
>> + * the in-kernel OOM killer.
>> + * @ctx: Execution context
>> + * @oc: OOM control structure
>> + *
>> + * Should return 1 if some memory was freed up, otherwise
>> + * the in-kernel OOM killer is invoked.
>> + */
>> + int (*handle_out_of_memory)(struct bpf_oom_ctx *ctx, struct oom_control *oc);
>> +
>> + /**
>> + * @handle_cgroup_offline: Cgroup offline callback
>> + * @ctx: Execution context
>> + * @cgroup_id: Id of deleted cgroup
>> + *
>> + * Called if the cgroup with the attached bpf_oom_ops is deleted.
>> + */
>> + void (*handle_cgroup_offline)(struct bpf_oom_ctx *ctx, u64 cgroup_id);
>> +
>> + /**
>> + * @name: BPF OOM policy name
>> + */
>> + char name[BPF_OOM_NAME_MAX_LEN];
>> +};
>> +
>> +#ifdef CONFIG_BPF_SYSCALL
>> +/**
>> + * @bpf_handle_oom: handle out of memory condition using bpf
>> + * @oc: OOM control structure
>> + *
>> + * Returns true if some memory was freed.
>> + */
>> +bool bpf_handle_oom(struct oom_control *oc);
>> +
>> +
>> +/**
>> + * @bpf_oom_memcg_offline: handle memcg offlining
>> + * @memcg: Memory cgroup is offlined
>> + *
>> + * When a memory cgroup is about to be deleted and there is an
>> + * attached BPF OOM structure, it has to be detached.
>> + */
>> +void bpf_oom_memcg_offline(struct mem_cgroup *memcg);
>> +
>> +#else /* CONFIG_BPF_SYSCALL */
>> +static inline bool bpf_handle_oom(struct oom_control *oc)
>> +{
>> + return false;
>> +}
>> +
>> +static inline void bpf_oom_memcg_offline(struct mem_cgroup *memcg) {}
>> +
>> +#endif /* CONFIG_BPF_SYSCALL */
>> +
>> +#endif /* __BPF_OOM_H */
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 50d851ff3f27..39a6c7c8735b 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -29,6 +29,7 @@ struct obj_cgroup;
>> struct page;
>> struct mm_struct;
>> struct kmem_cache;
>> +struct bpf_oom_ops;
>>
>> /* Cgroup-specific page state, on top of universal node page state */
>> enum memcg_stat_item {
>> @@ -226,6 +227,10 @@ struct mem_cgroup {
>> */
>> bool oom_group;
>>
>> +#ifdef CONFIG_BPF_SYSCALL
>> + struct bpf_oom_ops *bpf_oom;
>> +#endif
>> +
>> int swappiness;
>>
>> /* memory.events and memory.events.local */
>> diff --git a/include/linux/oom.h b/include/linux/oom.h
>> index 7b02bc1d0a7e..721087952d04 100644
>> --- a/include/linux/oom.h
>> +++ b/include/linux/oom.h
>> @@ -51,6 +51,14 @@ struct oom_control {
>>
>> /* Used to print the constraint info. */
>> enum oom_constraint constraint;
>> +
>> +#ifdef CONFIG_BPF_SYSCALL
>> + /* Used by the bpf oom implementation to mark the forward progress */
>> + bool bpf_memory_freed;
>> +
>> + /* Policy name */
>> + const char *bpf_policy_name;
>> +#endif
>> };
>>
>> extern struct mutex oom_lock;
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 21abb3353550..051e88c699af 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
>> ifdef CONFIG_SWAP
>> obj-$(CONFIG_MEMCG) += swap_cgroup.o
>> endif
>> +ifdef CONFIG_BPF_SYSCALL
>> +obj-y += bpf_oom.o
>> +endif
>> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
>> obj-$(CONFIG_GUP_TEST) += gup_test.o
>> obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
>> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
>> new file mode 100644
>> index 000000000000..c4d09ed9d541
>> --- /dev/null
>> +++ b/mm/bpf_oom.c
>> @@ -0,0 +1,272 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +/*
>> + * BPF-driven OOM killer customization
>> + *
>> + * Author: Roman Gushchin <roman.gushchin@linux.dev>
>> + */
>> +
>> +#include <linux/bpf.h>
>> +#include <linux/oom.h>
>> +#include <linux/bpf_oom.h>
>> +#include <linux/srcu.h>
>> +#include <linux/cgroup.h>
>> +#include <linux/memcontrol.h>
>> +
>> +DEFINE_STATIC_SRCU(bpf_oom_srcu);
>> +static struct bpf_oom_ops *system_bpf_oom;
>> +
>> +#ifdef CONFIG_MEMCG
>> +static u64 memcg_cgroup_id(struct mem_cgroup *memcg)
>> +{
>> + return cgroup_id(memcg->css.cgroup);
>> +}
>> +
>> +static struct bpf_oom_ops **bpf_oom_memcg_ops_ptr(struct mem_cgroup *memcg)
>> +{
>> + return &memcg->bpf_oom;
>> +}
>> +#else /* CONFIG_MEMCG */
>> +static u64 memcg_cgroup_id(struct mem_cgroup *memcg)
>> +{
>> + return 0;
>> +}
>> +static struct bpf_oom_ops **bpf_oom_memcg_ops_ptr(struct mem_cgroup *memcg)
>> +{
>> + return NULL;
>> +}
>> +#endif
>> +
>> +static int bpf_ops_handle_oom(struct bpf_oom_ops *bpf_oom_ops,
>> + struct mem_cgroup *memcg,
>> + struct oom_control *oc)
>> +{
>> + struct bpf_oom_ctx exec_ctx;
>> + int ret;
>> +
>> + if (IS_ENABLED(CONFIG_MEMCG) && memcg)
>> + exec_ctx.cgroup_id = memcg_cgroup_id(memcg);
>> + else
>> + exec_ctx.cgroup_id = 0;
>> +
>> + oc->bpf_policy_name = &bpf_oom_ops->name[0];
>> + oc->bpf_memory_freed = false;
>> + ret = bpf_oom_ops->handle_out_of_memory(&exec_ctx, oc);
>> + oc->bpf_policy_name = NULL;
>> +
>> + return ret;
>> +}
>> +
>> +bool bpf_handle_oom(struct oom_control *oc)
>> +{
>> + struct bpf_oom_ops *bpf_oom_ops = NULL;
>> + struct mem_cgroup __maybe_unused *memcg;
>> + int idx, ret = 0;
>> +
>> + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
>> + idx = srcu_read_lock(&bpf_oom_srcu);
>> +
>> +#ifdef CONFIG_MEMCG
>> + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
>> + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
>> + bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
>> + if (!bpf_oom_ops)
>> + continue;
>> +
>> + /* Call BPF OOM handler */
>> + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc);
>> + if (ret && oc->bpf_memory_freed)
>> + goto exit;
>> + }
>> +#endif /* CONFIG_MEMCG */
>> +
>> + /*
>> + * System-wide OOM or per-memcg BPF OOM handler wasn't successful?
>> + * Try system_bpf_oom.
>> + */
>> + bpf_oom_ops = READ_ONCE(system_bpf_oom);
>> + if (!bpf_oom_ops)
>> + goto exit;
>> +
>> + /* Call BPF OOM handler */
>> + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc);
>> +exit:
>> + srcu_read_unlock(&bpf_oom_srcu, idx);
>> + return ret && oc->bpf_memory_freed;
>> +}
>> +
>> +static int __handle_out_of_memory(struct bpf_oom_ctx *exec_ctx,
>> + struct oom_control *oc)
>> +{
>> + return 0;
>> +}
>> +
>> +static void __handle_cgroup_offline(struct bpf_oom_ctx *exec_ctx, u64 cgroup_id)
>> +{
>> +}
>> +
>> +static struct bpf_oom_ops __bpf_oom_ops = {
>> + .handle_out_of_memory = __handle_out_of_memory,
>> + .handle_cgroup_offline = __handle_cgroup_offline,
>> +};
>> +
>> +static const struct bpf_func_proto *
>> +bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>> +{
>> + return tracing_prog_func_proto(func_id, prog);
>> +}
>> +
>> +static bool bpf_oom_ops_is_valid_access(int off, int size,
>> + enum bpf_access_type type,
>> + const struct bpf_prog *prog,
>> + struct bpf_insn_access_aux *info)
>> +{
>> + return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
>> +}
>> +
>> +static const struct bpf_verifier_ops bpf_oom_verifier_ops = {
>> + .get_func_proto = bpf_oom_func_proto,
>> + .is_valid_access = bpf_oom_ops_is_valid_access,
>> +};
>> +
>> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
>> +{
>> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
>> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
>> + struct bpf_oom_ops *bpf_oom_ops = kdata;
>> + struct mem_cgroup *memcg = NULL;
>> + int err = 0;
>> +
>> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
>> + /* Attach to a memory cgroup? */
>> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
>> + if (IS_ERR_OR_NULL(memcg))
>> + return PTR_ERR(memcg);
>> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
>> + } else {
>> + /* System-wide OOM handler */
>> + bpf_oom_ops_ptr = &system_bpf_oom;
>> + }
>> +
>> + /* Another struct ops attached? */
>> + if (READ_ONCE(*bpf_oom_ops_ptr)) {
>> + err = -EBUSY;
>> + goto exit;
>> + }
>> +
>> + /* Expose bpf_oom_ops structure */
>> + WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops);
>
> The mechanism for propagating this pointer to child cgroups isn't
> clear. Would an explicit installation in every cgroup be required?
> This approach seems impractical for production environments, where
> cgroups are often created dynamically.
There is no need to propagate it. Instead, the cgroup tree is traversed
up to the root when an OOM is happening and the closest bpf_oom_ops is used.
Obviously, unlike some other cases of attaching bpf progs to cgroups,
OOMs cannot be that frequent, so there is no need to optimize for speed
here.
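For illustration, a BPF-side policy could look roughly like this. This is
only a sketch: it assumes standard libbpf struct_ops conventions, and the
bpf_oom_kill_process() kfunc mentioned in the comment is simply taken from
the stack trace quoted above; everything else is illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

/* Sleepable OOM handler: return 1 if memory was freed, 0 to defer to the
 * next handler in the chain (the parent cgroup's ops or the kernel OOM
 * killer).
 */
SEC("struct_ops.s/handle_out_of_memory")
int BPF_PROG(test_out_of_memory, struct bpf_oom_ctx *exec_ctx, struct oom_control *oc)
{
	/* A real policy would pick a victim here and release memory through
	 * a suitable kfunc (e.g. bpf_oom_kill_process(), visible in the
	 * stack trace above), which also sets oc->bpf_memory_freed.
	 */
	return 0;
}

SEC(".struct_ops.link")
struct bpf_oom_ops test_policy = {
	.handle_out_of_memory = (void *)test_out_of_memory,
	.name = "bpf_test_policy",
};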
On 10/27/25 4:17 PM, Roman Gushchin wrote:
> diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
> new file mode 100644
> index 000000000000..18c32a5a068b
> --- /dev/null
> +++ b/include/linux/bpf_oom.h
> @@ -0,0 +1,74 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +
> +#ifndef __BPF_OOM_H
> +#define __BPF_OOM_H
> +
> +struct oom_control;
> +
> +#define BPF_OOM_NAME_MAX_LEN 64
> +
> +struct bpf_oom_ctx {
> + /*
> + * If bpf_oom_ops is attached to a cgroup, id of this cgroup.
> + * 0 otherwise.
> + */
> + u64 cgroup_id;
> +};
A function argument can be added to the ops (e.g. handle_out_of_memory)
in the future. afaict, that won't disrupt existing bpf progs as long as
the ordering of the existing arguments does not change.
If it goes down the 'struct bpf_oom_ctx' abstraction path, all future
new members of 'struct bpf_oom_ctx' will need to be initialized even if
they are not useful for most of the existing ops.
For the networking use cases, I am quite sure the wrapping is unnecessary.
I will leave it as food for thought for this use case.
> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
link could be NULL here. Return -EOPNOTSUPP for the legacy kdata
registration path that does not use the link API.
In the future, we should enforce that a link must be used in
bpf_struct_ops.c, except for a few of the existing struct_ops kernel users.
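I.e. something like this at the top of the callback (a fragment only, with
the rest of the registration path unchanged):

	/* Legacy (non-link) struct_ops registration passes a NULL link */
	if (!link)
		return -EOPNOTSUPP;

	ops_link = container_of(link, struct bpf_struct_ops_link, link);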
> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
> + struct bpf_oom_ops *bpf_oom_ops = kdata;
> + struct mem_cgroup *memcg = NULL;
> + int err = 0;
> +
> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
> + /* Attach to a memory cgroup? */
> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
> + if (IS_ERR_OR_NULL(memcg))
> + return PTR_ERR(memcg);
> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
> + } else {
> + /* System-wide OOM handler */
> + bpf_oom_ops_ptr = &system_bpf_oom;
> + }
> +
> + /* Another struct ops attached? */
> + if (READ_ONCE(*bpf_oom_ops_ptr)) {
> + err = -EBUSY;
> + goto exit;
> + }
> +
> + /* Expose bpf_oom_ops structure */
> + WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops);
> +exit:
> + mem_cgroup_put(memcg);
> + return err;
> +}
> +
> +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
> + struct bpf_oom_ops *bpf_oom_ops = kdata;
> + struct mem_cgroup *memcg = NULL;
> +
> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
> + /* Detach from a memory cgroup? */
> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
> + if (IS_ERR_OR_NULL(memcg))
> + goto exit;
> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
> + } else {
> + /* System-wide OOM handler */
> + bpf_oom_ops_ptr = &system_bpf_oom;
> + }
> +
> + /* Hide bpf_oom_ops from new callers */
> + if (!WARN_ON(READ_ONCE(*bpf_oom_ops_ptr) != bpf_oom_ops))
> + WRITE_ONCE(*bpf_oom_ops_ptr, NULL);
> +
> + mem_cgroup_put(memcg);
> +
> +exit:
> + /* Release bpf_oom_ops after a srcu grace period */
> + synchronize_srcu(&bpf_oom_srcu);
> +}
> +
> +#ifdef CONFIG_MEMCG
> +void bpf_oom_memcg_offline(struct mem_cgroup *memcg)
Is this called when the memcg/cgroup is going away? I think it should also
call bpf_struct_ops_map_link_detach() (through link->ops->detach [1]). That
will notify user space, which may be polling on the link fd, and it will
also end up calling the bpf_oom_ops_unreg() above.
[1]
https://lore.kernel.org/all/20240530065946.979330-7-thinker.li@gmail.com/
> +{
> + struct bpf_oom_ops *bpf_oom_ops;
> + struct bpf_oom_ctx exec_ctx;
> + u64 cgrp_id;
> + int idx;
> +
> + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
> + idx = srcu_read_lock(&bpf_oom_srcu);
> +
> + bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
> + WRITE_ONCE(memcg->bpf_oom, NULL);
> +
> + if (bpf_oom_ops && bpf_oom_ops->handle_cgroup_offline) {
> + cgrp_id = cgroup_id(memcg->css.cgroup);
> + exec_ctx.cgroup_id = cgrp_id;
> + bpf_oom_ops->handle_cgroup_offline(&exec_ctx, cgrp_id);
> + }
> +
> + srcu_read_unlock(&bpf_oom_srcu, idx);
> +}
On Mon, Oct 27, 2025 at 4:18 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
[...]
> +
> +struct bpf_oom_ops {
> + /**
> + * @handle_out_of_memory: Out of memory bpf handler, called before
> + * the in-kernel OOM killer.
> + * @ctx: Execution context
> + * @oc: OOM control structure
> + *
> + * Should return 1 if some memory was freed up, otherwise
> + * the in-kernel OOM killer is invoked.
> + */
> + int (*handle_out_of_memory)(struct bpf_oom_ctx *ctx, struct oom_control *oc);
> +
> + /**
> + * @handle_cgroup_offline: Cgroup offline callback
> + * @ctx: Execution context
> + * @cgroup_id: Id of deleted cgroup
> + *
> + * Called if the cgroup with the attached bpf_oom_ops is deleted.
> + */
> + void (*handle_cgroup_offline)(struct bpf_oom_ctx *ctx, u64 cgroup_id);
handle_out_of_memory() and handle_cgroup_offline() take a bpf_oom_ctx,
which is just a cgroup_id for now. Shall we pass in a struct mem_cgroup
instead, which should be easier to use?
Thanks,
Song
> +
> + /**
> + * @name: BPF OOM policy name
> + */
> + char name[BPF_OOM_NAME_MAX_LEN];
> +};
> +
> +#ifdef CONFIG_BPF_SYSCALL
> +/**
> + * @bpf_handle_oom: handle out of memory condition using bpf
> + * @oc: OOM control structure
> + *
> + * Returns true if some memory was freed.
> + */
> +bool bpf_handle_oom(struct oom_control *oc);
> +
Song Liu <song@kernel.org> writes:
> On Mon, Oct 27, 2025 at 4:18 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> [...]
>> +
>> +struct bpf_oom_ops {
>> + /**
>> + * @handle_out_of_memory: Out of memory bpf handler, called before
>> + * the in-kernel OOM killer.
>> + * @ctx: Execution context
>> + * @oc: OOM control structure
>> + *
>> + * Should return 1 if some memory was freed up, otherwise
>> + * the in-kernel OOM killer is invoked.
>> + */
>> + int (*handle_out_of_memory)(struct bpf_oom_ctx *ctx, struct oom_control *oc);
>> +
>> + /**
>> + * @handle_cgroup_offline: Cgroup offline callback
>> + * @ctx: Execution context
>> + * @cgroup_id: Id of deleted cgroup
>> + *
>> + * Called if the cgroup with the attached bpf_oom_ops is deleted.
>> + */
>> + void (*handle_cgroup_offline)(struct bpf_oom_ctx *ctx, u64 cgroup_id);
>
> handle_out_of_memory() and handle_cgroup_offline() takes bpf_oom_ctx,
> which is just cgroup_id for now. Shall we pass in struct mem_cgroup, which
> should be easier to use?
I want it to be easier to extend, which is why I went with a structure.
But I can pass a memcg pointer instead of a cgroup_id, not a problem.
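E.g. something along these lines (just a sketch of the direction, not a
final interface):

struct bpf_oom_ctx {
	/* Memory cgroup the bpf_oom_ops is attached to, or NULL for the
	 * system-wide handler.
	 */
	struct mem_cgroup *memcg;
};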
Thanks!
>
> Thanks,
> Song
>
>> +
>> + /**
>> + * @name: BPF OOM policy name
>> + */
>> + char name[BPF_OOM_NAME_MAX_LEN];
>> +};
>> +
>> +#ifdef CONFIG_BPF_SYSCALL
>> +/**
>> + * @bpf_handle_oom: handle out of memory condition using bpf
>> + * @oc: OOM control structure
>> + *
>> + * Returns true if some memory was freed.
>> + */
>> +bool bpf_handle_oom(struct oom_control *oc);
>> +
On Mon, Oct 27, 2025 at 4:18 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> +bool bpf_handle_oom(struct oom_control *oc)
> +{
> + struct bpf_oom_ops *bpf_oom_ops = NULL;
> + struct mem_cgroup __maybe_unused *memcg;
> + int idx, ret = 0;
> +
> + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
> + idx = srcu_read_lock(&bpf_oom_srcu);
> +
> +#ifdef CONFIG_MEMCG
> + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
> + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
> + bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
> + if (!bpf_oom_ops)
> + continue;
> +
> + /* Call BPF OOM handler */
> + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc);
> + if (ret && oc->bpf_memory_freed)
> + goto exit;
> + }
> +#endif /* CONFIG_MEMCG */
> +
> + /*
> + * System-wide OOM or per-memcg BPF OOM handler wasn't successful?
> + * Try system_bpf_oom.
> + */
> + bpf_oom_ops = READ_ONCE(system_bpf_oom);
> + if (!bpf_oom_ops)
> + goto exit;
> +
> + /* Call BPF OOM handler */
> + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc);
> +exit:
> + srcu_read_unlock(&bpf_oom_srcu, idx);
> + return ret && oc->bpf_memory_freed;
> +}
...
> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
> + struct bpf_oom_ops *bpf_oom_ops = kdata;
> + struct mem_cgroup *memcg = NULL;
> + int err = 0;
> +
> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
> + /* Attach to a memory cgroup? */
> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
> + if (IS_ERR_OR_NULL(memcg))
> + return PTR_ERR(memcg);
> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
> + } else {
> + /* System-wide OOM handler */
> + bpf_oom_ops_ptr = &system_bpf_oom;
> + }
I don't like the fallback and special case of cgroup_id == 0.
imo it would be cleaner to require CONFIG_MEMCG for this feature
and only allow attaching to a cgroup.
There is always a root cgroup that can be attached to, and that
handler would act as the "system wide" oom handler.
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> On Mon, Oct 27, 2025 at 4:18 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> +bool bpf_handle_oom(struct oom_control *oc)
>> +{
>> + struct bpf_oom_ops *bpf_oom_ops = NULL;
>> + struct mem_cgroup __maybe_unused *memcg;
>> + int idx, ret = 0;
>> +
>> + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
>> + idx = srcu_read_lock(&bpf_oom_srcu);
>> +
>> +#ifdef CONFIG_MEMCG
>> + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
>> + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
>> + bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
>> + if (!bpf_oom_ops)
>> + continue;
>> +
>> + /* Call BPF OOM handler */
>> + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc);
>> + if (ret && oc->bpf_memory_freed)
>> + goto exit;
>> + }
>> +#endif /* CONFIG_MEMCG */
>> +
>> + /*
>> + * System-wide OOM or per-memcg BPF OOM handler wasn't successful?
>> + * Try system_bpf_oom.
>> + */
>> + bpf_oom_ops = READ_ONCE(system_bpf_oom);
>> + if (!bpf_oom_ops)
>> + goto exit;
>> +
>> + /* Call BPF OOM handler */
>> + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc);
>> +exit:
>> + srcu_read_unlock(&bpf_oom_srcu, idx);
>> + return ret && oc->bpf_memory_freed;
>> +}
>
> ...
>
>> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
>> +{
>> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
>> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
>> + struct bpf_oom_ops *bpf_oom_ops = kdata;
>> + struct mem_cgroup *memcg = NULL;
>> + int err = 0;
>> +
>> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
>> + /* Attach to a memory cgroup? */
>> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
>> + if (IS_ERR_OR_NULL(memcg))
>> + return PTR_ERR(memcg);
>> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
>> + } else {
>> + /* System-wide OOM handler */
>> + bpf_oom_ops_ptr = &system_bpf_oom;
>> + }
>
> I don't like the fallback and special case of cgroup_id == 0.
> imo it would be cleaner to require CONFIG_MEMCG for this feature
> and only allow attach to a cgroup.
> There is always a root cgroup that can be attached to and that
> handler will be acting as "system wide" oom handler.
I thought about it, but then it can't be used on !CONFIG_MEMCG
configurations, and also before cgroupfs is mounted, the root cgroup is
created, etc. This is why system-wide things are often handled in a
special way, e.g. by PSI (grep system_group_pcpu).
I think supporting !CONFIG_MEMCG configurations might be useful for
some very stripped-down VMs, for example.
On Tue, Oct 28, 2025 at 11:42 AM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>
> > On Mon, Oct 27, 2025 at 4:18 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >>
> >> +bool bpf_handle_oom(struct oom_control *oc)
> >> +{
> >> + struct bpf_oom_ops *bpf_oom_ops = NULL;
> >> + struct mem_cgroup __maybe_unused *memcg;
> >> + int idx, ret = 0;
> >> +
> >> + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
> >> + idx = srcu_read_lock(&bpf_oom_srcu);
> >> +
> >> +#ifdef CONFIG_MEMCG
> >> + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
> >> + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
> >> + bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
> >> + if (!bpf_oom_ops)
> >> + continue;
> >> +
> >> + /* Call BPF OOM handler */
> >> + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc);
> >> + if (ret && oc->bpf_memory_freed)
> >> + goto exit;
> >> + }
> >> +#endif /* CONFIG_MEMCG */
> >> +
> >> + /*
> >> + * System-wide OOM or per-memcg BPF OOM handler wasn't successful?
> >> + * Try system_bpf_oom.
> >> + */
> >> + bpf_oom_ops = READ_ONCE(system_bpf_oom);
> >> + if (!bpf_oom_ops)
> >> + goto exit;
> >> +
> >> + /* Call BPF OOM handler */
> >> + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc);
> >> +exit:
> >> + srcu_read_unlock(&bpf_oom_srcu, idx);
> >> + return ret && oc->bpf_memory_freed;
> >> +}
> >
> > ...
> >
> >> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
> >> +{
> >> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
> >> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
> >> + struct bpf_oom_ops *bpf_oom_ops = kdata;
> >> + struct mem_cgroup *memcg = NULL;
> >> + int err = 0;
> >> +
> >> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
> >> + /* Attach to a memory cgroup? */
> >> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
> >> + if (IS_ERR_OR_NULL(memcg))
> >> + return PTR_ERR(memcg);
> >> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
> >> + } else {
> >> + /* System-wide OOM handler */
> >> + bpf_oom_ops_ptr = &system_bpf_oom;
> >> + }
> >
> > I don't like the fallback and special case of cgroup_id == 0.
> > imo it would be cleaner to require CONFIG_MEMCG for this feature
> > and only allow attach to a cgroup.
> > There is always a root cgroup that can be attached to and that
> > handler will be acting as "system wide" oom handler.
>
> I thought about it, but then it can't be used on !CONFIG_MEMCG
> configurations and also before cgroupfs is mounted, root cgroup
> is created etc.
before that bpf isn't viable either, and oom is certainly not an issue.
> This is why system-wide things are often handled in a
> special way, e.g. in by PSI (grep system_group_pcpu).
>
> I think supporting !CONFIG_MEMCG configurations might be useful for
> some very stripped down VM's, for example.
I thought I wouldn't need to convince the guy who converted bpf map
allocations to memcg accounting, which made memcg pretty much mandatory
for the bpf subsystem :)
I think the following is long overdue:
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index eb3de35734f0..af60be6d3d41 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -34,6 +34,7 @@ config BPF_SYSCALL
select NET_SOCK_MSG if NET
select NET_XGRESS if NET
select PAGE_POOL if NET
+ depends on MEMCG
default n
With this we can clean up a ton of code.
Let's not add more hacks just because some weird thing
still wants !MEMCG. If they do, they will survive without bpf.
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> On Tue, Oct 28, 2025 at 11:42 AM Roman Gushchin
> <roman.gushchin@linux.dev> wrote:
>>
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>>
>> > On Mon, Oct 27, 2025 at 4:18 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>> >>
>> >> +bool bpf_handle_oom(struct oom_control *oc)
>> >> +{
>> >> + struct bpf_oom_ops *bpf_oom_ops = NULL;
>> >> + struct mem_cgroup __maybe_unused *memcg;
>> >> + int idx, ret = 0;
>> >> +
>> >> + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
>> >> + idx = srcu_read_lock(&bpf_oom_srcu);
>> >> +
>> >> +#ifdef CONFIG_MEMCG
>> >> + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
>> >> + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) {
>> >> + bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
>> >> + if (!bpf_oom_ops)
>> >> + continue;
>> >> +
>> >> + /* Call BPF OOM handler */
>> >> + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc);
>> >> + if (ret && oc->bpf_memory_freed)
>> >> + goto exit;
>> >> + }
>> >> +#endif /* CONFIG_MEMCG */
>> >> +
>> >> + /*
>> >> + * System-wide OOM or per-memcg BPF OOM handler wasn't successful?
>> >> + * Try system_bpf_oom.
>> >> + */
>> >> + bpf_oom_ops = READ_ONCE(system_bpf_oom);
>> >> + if (!bpf_oom_ops)
>> >> + goto exit;
>> >> +
>> >> + /* Call BPF OOM handler */
>> >> + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc);
>> >> +exit:
>> >> + srcu_read_unlock(&bpf_oom_srcu, idx);
>> >> + return ret && oc->bpf_memory_freed;
>> >> +}
>> >
>> > ...
>> >
>> >> +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
>> >> +{
>> >> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
>> >> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
>> >> + struct bpf_oom_ops *bpf_oom_ops = kdata;
>> >> + struct mem_cgroup *memcg = NULL;
>> >> + int err = 0;
>> >> +
>> >> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
>> >> + /* Attach to a memory cgroup? */
>> >> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
>> >> + if (IS_ERR_OR_NULL(memcg))
>> >> + return PTR_ERR(memcg);
>> >> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
>> >> + } else {
>> >> + /* System-wide OOM handler */
>> >> + bpf_oom_ops_ptr = &system_bpf_oom;
>> >> + }
>> >
>> > I don't like the fallback and special case of cgroup_id == 0.
>> > imo it would be cleaner to require CONFIG_MEMCG for this feature
>> > and only allow attach to a cgroup.
>> > There is always a root cgroup that can be attached to and that
>> > handler will be acting as "system wide" oom handler.
>>
>> I thought about it, but then it can't be used on !CONFIG_MEMCG
>> configurations and also before cgroupfs is mounted, root cgroup
>> is created etc.
>
> before that bpf isn't viable either, and oom is certainly not an issue.
>
>> This is why system-wide things are often handled in a
>> special way, e.g. in by PSI (grep system_group_pcpu).
>>
>> I think supporting !CONFIG_MEMCG configurations might be useful for
>> some very stripped down VM's, for example.
>
> I thought I wouldn't need to convince the guy who converted bpf maps
> to memcg and it made it pretty much mandatory for the bpf subsystem :)
> I think the following is long overdue:
> diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
> index eb3de35734f0..af60be6d3d41 100644
> --- a/kernel/bpf/Kconfig
> +++ b/kernel/bpf/Kconfig
> @@ -34,6 +34,7 @@ config BPF_SYSCALL
> select NET_SOCK_MSG if NET
> select NET_XGRESS if NET
> select PAGE_POOL if NET
> + depends on MEMCG
> default n
>
> With this we can cleanup a ton of code.
> Let's not add more hacks just because some weird thing
> still wants !MEMCG. If they do, they will survive without bpf.
Ok, this is bold, but why not?
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
I guess you are going to land it separately?
```
commit 16c40fd36bbe32c63f44388b5613db55c6b27557
Author: Roman Gushchin <roman.gushchin@linux.dev>
mm: introduce BPF struct ops for OOM handling
This patch introduces a BPF struct ops mechanism for custom OOM handling
policies. It allows attaching bpf_oom_ops system-wide or per-memcg, with
handlers executed before the kernel OOM killer.
> diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
> new file mode 100644
> index 000000000..c4d09ed9d
> --- /dev/null
> +++ b/mm/bpf_oom.c
[ ... ]
> @@ -0,0 +1,272 @@
[ ... ]
> +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link);
> + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL;
> + struct bpf_oom_ops *bpf_oom_ops = kdata;
> + struct mem_cgroup *memcg = NULL;
> +
> + if (IS_ENABLED(CONFIG_MEMCG) && ops_link->cgroup_id) {
> + /* Detach from a memory cgroup? */
> + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
> + if (IS_ERR_OR_NULL(memcg))
> + goto exit;
> + bpf_oom_ops_ptr = bpf_oom_memcg_ops_ptr(memcg);
> + } else {
> + /* System-wide OOM handler */
> + bpf_oom_ops_ptr = &system_bpf_oom;
> + }
> +
> + /* Hide bpf_oom_ops from new callers */
> + if (!WARN_ON(READ_ONCE(*bpf_oom_ops_ptr) != bpf_oom_ops))
> + WRITE_ONCE(*bpf_oom_ops_ptr, NULL);
> +
> + mem_cgroup_put(memcg);
> +
> +exit:
> + /* Release bpf_oom_ops after a srcu grace period */
> + synchronize_srcu(&bpf_oom_srcu);
> +}
> +
> +#ifdef CONFIG_MEMCG
> +void bpf_oom_memcg_offline(struct mem_cgroup *memcg)
> +{
> + struct bpf_oom_ops *bpf_oom_ops;
> + struct bpf_oom_ctx exec_ctx;
> + u64 cgrp_id;
> + int idx;
> +
> + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */
> + idx = srcu_read_lock(&bpf_oom_srcu);
> +
> + bpf_oom_ops = READ_ONCE(memcg->bpf_oom);
> + WRITE_ONCE(memcg->bpf_oom, NULL);
Can bpf_oom_ops_unreg() race with bpf_oom_memcg_offline()? If userspace
closes the BPF struct ops link FD while or after the cgroup is being
deleted, both functions could run concurrently on the same memcg:
Thread 1 (unreg):                        Thread 2 (offline):
mem_cgroup_get_from_ino()
  [succeeds, cgroup exists]
                                         srcu_read_lock()
                                         READ_ONCE(memcg->bpf_oom) [gets ops]
                                         WRITE_ONCE(memcg->bpf_oom, NULL)
READ_ONCE(*bpf_oom_ops_ptr)
  [sees NULL]
WARN_ON() triggers
The WARN_ON in bpf_oom_ops_unreg() assumes *bpf_oom_ops_ptr matches
bpf_oom_ops, but bpf_oom_memcg_offline() can clear it first during
concurrent execution.
> +
> + if (bpf_oom_ops && bpf_oom_ops->handle_cgroup_offline) {
> + cgrp_id = cgroup_id(memcg->css.cgroup);
> + exec_ctx.cgroup_id = cgrp_id;
> + bpf_oom_ops->handle_cgroup_offline(&exec_ctx, cgrp_id);
> + }
> +
> + srcu_read_unlock(&bpf_oom_srcu, idx);
> +}
> +#endif /* CONFIG_MEMCG */
[ ... ]
```
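If this is considered a real race, one way to tolerate it in
bpf_oom_ops_unreg() (purely a sketch, not part of the patch) is to let
whichever path clears the pointer first win instead of warning:

	/* Tolerate bpf_oom_memcg_offline() having already cleared the
	 * pointer for this memcg; only clear it if it still points to us.
	 */
	cmpxchg(bpf_oom_ops_ptr, bpf_oom_ops, NULL);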
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
In-Reply-To-Subject: `mm: introduce BPF struct ops for OOM handling`
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/18859027430