[PATCH v1 10/14] bpf: selftests: bpf OOM handler test

Roman Gushchin posted 14 patches 1 month, 2 weeks ago
[PATCH v1 10/14] bpf: selftests: bpf OOM handler test
Posted by Roman Gushchin 1 month, 2 weeks ago
Implement a pseudo-realistic test for the OOM handling
functionality.

The OOM handling policy which is implemented in bpf is to
kill all tasks belonging to the biggest leaf cgroup, which
doesn't contain unkillable tasks (tasks with oom_score_adj
set to -1000). Pagecache size is excluded from the accounting.

The test creates a hierarchy of memory cgroups, causes an
OOM at the top level, checks that the expected process will be
killed and checks memcg's oom statistics.

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 .../selftests/bpf/prog_tests/test_oom.c       | 229 ++++++++++++++++++
 tools/testing/selftests/bpf/progs/test_oom.c  | 108 +++++++++
 2 files changed, 337 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_oom.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_oom.c

diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom.c b/tools/testing/selftests/bpf/prog_tests/test_oom.c
new file mode 100644
index 000000000000..eaeb14a9d18f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_oom.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <bpf/bpf.h>
+
+#include "cgroup_helpers.h"
+#include "test_oom.skel.h"
+
+struct cgroup_desc {
+	const char *path;
+	int fd;
+	unsigned long long id;
+	int pid;
+	size_t target;
+	size_t max;
+	int oom_score_adj;
+	bool victim;
+};
+
+#define MB (1024 * 1024)
+#define OOM_SCORE_ADJ_MIN	(-1000)
+#define OOM_SCORE_ADJ_MAX	1000
+
+static struct cgroup_desc cgroups[] = {
+	{ .path = "/oom_test", .max = 80 * MB},
+	{ .path = "/oom_test/cg1", .target = 10 * MB,
+	  .oom_score_adj = OOM_SCORE_ADJ_MAX },
+	{ .path = "/oom_test/cg2", .target = 40 * MB,
+	  .oom_score_adj = OOM_SCORE_ADJ_MIN },
+	{ .path = "/oom_test/cg3" },
+	{ .path = "/oom_test/cg3/cg4", .target = 30 * MB,
+	  .victim = true },
+	{ .path = "/oom_test/cg3/cg5", .target = 20 * MB },
+};
+
+static int spawn_task(struct cgroup_desc *desc)
+{
+	char *ptr;
+	int pid;
+
+	pid = fork();
+	if (pid < 0)
+		return pid;
+
+	if (pid > 0) {
+		/* parent */
+		desc->pid = pid;
+		return 0;
+	}
+
+	/* child */
+	if (desc->oom_score_adj) {
+		char buf[64];
+		int fd = open("/proc/self/oom_score_adj", O_WRONLY);
+
+		if (fd < 0)
+			return -1;
+
+		snprintf(buf, sizeof(buf), "%d", desc->oom_score_adj);
+		write(fd, buf, sizeof(buf));
+		close(fd);
+	}
+
+	ptr = (char *)malloc(desc->target);
+	if (!ptr)
+		return -ENOMEM;
+
+	memset(ptr, 'a', desc->target);
+
+	while (1)
+		sleep(1000);
+
+	return 0;
+}
+
+static void setup_environment(void)
+{
+	int i, err;
+
+	err = setup_cgroup_environment();
+	if (!ASSERT_OK(err, "setup_cgroup_environment"))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
+		cgroups[i].fd = create_and_get_cgroup(cgroups[i].path);
+		if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup"))
+			goto cleanup;
+
+		cgroups[i].id = get_cgroup_id(cgroups[i].path);
+		if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id"))
+			goto cleanup;
+
+		/* Freeze the top-level cgroup */
+		if (i == 0) {
+			/* Freeze the top-level cgroup */
+			err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1");
+			if (!ASSERT_OK(err, "freeze cgroup"))
+				goto cleanup;
+		}
+
+		/* Recursively enable the memory controller */
+		if (!cgroups[i].target) {
+
+			err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control",
+						"+memory");
+			if (!ASSERT_OK(err, "enable memory controller"))
+				goto cleanup;
+		}
+
+		/* Set memory.max */
+		if (cgroups[i].max) {
+			char buf[256];
+
+			snprintf(buf, sizeof(buf), "%lu", cgroups[i].max);
+			err = write_cgroup_file(cgroups[i].path, "memory.max", buf);
+			if (!ASSERT_OK(err, "set memory.max"))
+				goto cleanup;
+
+			snprintf(buf, sizeof(buf), "0");
+			write_cgroup_file(cgroups[i].path, "memory.swap.max", buf);
+
+		}
+
+		/* Spawn tasks creating memory pressure */
+		if (cgroups[i].target) {
+			char buf[256];
+
+			err = spawn_task(&cgroups[i]);
+			if (!ASSERT_OK(err, "spawn task"))
+				goto cleanup;
+
+			snprintf(buf, sizeof(buf), "%d", cgroups[i].pid);
+			err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf);
+			if (!ASSERT_OK(err, "put child into a cgroup"))
+				goto cleanup;
+		}
+	}
+
+	return;
+
+cleanup:
+	cleanup_cgroup_environment();
+}
+
+static int run_and_wait_for_oom(void)
+{
+	int ret = -1;
+	bool first = true;
+	char buf[4096] = {};
+	size_t size;
+
+	/* Unfreeze the top-level cgroup */
+	ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0");
+	if (!ASSERT_OK(ret, "freeze cgroup"))
+		return -1;
+
+	for (;;) {
+		int i, status;
+		pid_t pid = wait(&status);
+
+		if (pid == -1) {
+			if (errno == EINTR)
+				continue;
+			/* ECHILD */
+			break;
+		}
+
+		if (!first)
+			continue;
+
+		first = false;
+
+		/* Check which process was terminated first */
+		for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
+			if (!ASSERT_OK(cgroups[i].victim !=
+				       (pid == cgroups[i].pid),
+				       "correct process was killed")) {
+				ret = -1;
+				break;
+			}
+
+			if (!cgroups[i].victim)
+				continue;
+
+			/* Check the memcg oom counter */
+			size = read_cgroup_file(cgroups[i].path,
+						"memory.events",
+						buf, sizeof(buf));
+			if (!ASSERT_OK(size <= 0, "read memory.events")) {
+				ret = -1;
+				break;
+			}
+
+			if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL,
+				       "oom_kill count check")) {
+				ret = -1;
+				break;
+			}
+		}
+
+		/* Kill all remaining tasks */
+		for (i = 0; i < ARRAY_SIZE(cgroups); i++)
+			if (cgroups[i].pid && cgroups[i].pid != pid)
+				kill(cgroups[i].pid, SIGKILL);
+	}
+
+	return ret;
+}
+
+void test_oom(void)
+{
+	struct test_oom *skel;
+	int err;
+
+	setup_environment();
+
+	skel = test_oom__open_and_load();
+	err = test_oom__attach(skel);
+	if (CHECK_FAIL(err))
+		goto cleanup;
+
+	/* Unfreeze all child tasks and create the memory pressure */
+	err = run_and_wait_for_oom();
+	CHECK_FAIL(err);
+
+cleanup:
+	cleanup_cgroup_environment();
+	test_oom__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_oom.c b/tools/testing/selftests/bpf/progs/test_oom.c
new file mode 100644
index 000000000000..ca83563fc9a8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_oom.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define OOM_SCORE_ADJ_MIN	(-1000)
+
+void bpf_rcu_read_lock(void) __ksym;
+void bpf_rcu_read_unlock(void) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+struct mem_cgroup *bpf_get_root_mem_cgroup(void) __ksym;
+struct mem_cgroup *bpf_get_mem_cgroup(struct cgroup_subsys_state *css) __ksym;
+void bpf_put_mem_cgroup(struct mem_cgroup *memcg) __ksym;
+int bpf_oom_kill_process(struct oom_control *oc, struct task_struct *task,
+			 const char *message__str) __ksym;
+
+static bool mem_cgroup_killable(struct mem_cgroup *memcg)
+{
+	struct task_struct *task;
+	bool ret = true;
+
+	bpf_for_each(css_task, task, &memcg->css, CSS_TASK_ITER_PROCS)
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+			return false;
+
+	return ret;
+}
+
+/*
+ * Find the largest leaf cgroup (ignoring page cache) without unkillable tasks
+ * and kill all belonging tasks.
+ */
+SEC("struct_ops.s/handle_out_of_memory")
+int BPF_PROG(test_out_of_memory, struct oom_control *oc)
+{
+	struct task_struct *task;
+	struct mem_cgroup *root_memcg = oc->memcg;
+	struct mem_cgroup *memcg, *victim = NULL;
+	struct cgroup_subsys_state *css_pos;
+	unsigned long usage, max_usage = 0;
+	unsigned long pagecache = 0;
+	int ret = 0;
+
+	if (root_memcg)
+		root_memcg = bpf_get_mem_cgroup(&root_memcg->css);
+	else
+		root_memcg = bpf_get_root_mem_cgroup();
+
+	if (!root_memcg)
+		return 0;
+
+	bpf_rcu_read_lock();
+	bpf_for_each(css, css_pos, &root_memcg->css, BPF_CGROUP_ITER_DESCENDANTS_POST) {
+		if (css_pos->cgroup->nr_descendants + css_pos->cgroup->nr_dying_descendants)
+			continue;
+
+		memcg = bpf_get_mem_cgroup(css_pos);
+		if (!memcg)
+			continue;
+
+		usage = bpf_mem_cgroup_usage(memcg);
+		pagecache = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES);
+
+		if (usage > pagecache)
+			usage -= pagecache;
+		else
+			usage = 0;
+
+		if ((usage > max_usage) && mem_cgroup_killable(memcg)) {
+			max_usage = usage;
+			if (victim)
+				bpf_put_mem_cgroup(victim);
+			victim = bpf_get_mem_cgroup(&memcg->css);
+		}
+
+		bpf_put_mem_cgroup(memcg);
+	}
+	bpf_rcu_read_unlock();
+
+	if (!victim)
+		goto exit;
+
+	bpf_for_each(css_task, task, &victim->css, CSS_TASK_ITER_PROCS) {
+		struct task_struct *t = bpf_task_acquire(task);
+
+		if (t) {
+			if (!bpf_task_is_oom_victim(task))
+				bpf_oom_kill_process(oc, task, "bpf oom test");
+			bpf_task_release(t);
+			ret = 1;
+		}
+	}
+
+	bpf_put_mem_cgroup(victim);
+exit:
+	bpf_put_mem_cgroup(root_memcg);
+
+	return ret;
+}
+
+SEC(".struct_ops.link")
+struct bpf_oom_ops test_bpf_oom = {
+	.name = "bpf_test_policy",
+	.handle_out_of_memory = (void *)test_out_of_memory,
+};
-- 
2.50.1
Re: [PATCH v1 10/14] bpf: selftests: bpf OOM handler test
Posted by Andrii Nakryiko 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 10:05 AM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> Implement a pseudo-realistic test for the OOM handling
> functionality.
>
> The OOM handling policy which is implemented in bpf is to
> kill all tasks belonging to the biggest leaf cgroup, which
> doesn't contain unkillable tasks (tasks with oom_score_adj
> set to -1000). Pagecache size is excluded from the accounting.
>
> The test creates a hierarchy of memory cgroups, causes an
> OOM at the top level, checks that the expected process will be
> killed and checks memcg's oom statistics.
>
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
>  .../selftests/bpf/prog_tests/test_oom.c       | 229 ++++++++++++++++++
>  tools/testing/selftests/bpf/progs/test_oom.c  | 108 +++++++++
>  2 files changed, 337 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/test_oom.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_oom.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom.c b/tools/testing/selftests/bpf/prog_tests/test_oom.c
> new file mode 100644
> index 000000000000..eaeb14a9d18f
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/test_oom.c
> @@ -0,0 +1,229 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <test_progs.h>
> +#include <bpf/btf.h>
> +#include <bpf/bpf.h>
> +
> +#include "cgroup_helpers.h"
> +#include "test_oom.skel.h"
> +
> +struct cgroup_desc {
> +       const char *path;
> +       int fd;
> +       unsigned long long id;
> +       int pid;
> +       size_t target;
> +       size_t max;
> +       int oom_score_adj;
> +       bool victim;
> +};
> +
> +#define MB (1024 * 1024)
> +#define OOM_SCORE_ADJ_MIN      (-1000)
> +#define OOM_SCORE_ADJ_MAX      1000
> +
> +static struct cgroup_desc cgroups[] = {
> +       { .path = "/oom_test", .max = 80 * MB},
> +       { .path = "/oom_test/cg1", .target = 10 * MB,
> +         .oom_score_adj = OOM_SCORE_ADJ_MAX },
> +       { .path = "/oom_test/cg2", .target = 40 * MB,
> +         .oom_score_adj = OOM_SCORE_ADJ_MIN },
> +       { .path = "/oom_test/cg3" },
> +       { .path = "/oom_test/cg3/cg4", .target = 30 * MB,
> +         .victim = true },
> +       { .path = "/oom_test/cg3/cg5", .target = 20 * MB },
> +};
> +
> +static int spawn_task(struct cgroup_desc *desc)
> +{
> +       char *ptr;
> +       int pid;
> +
> +       pid = fork();
> +       if (pid < 0)
> +               return pid;
> +
> +       if (pid > 0) {
> +               /* parent */
> +               desc->pid = pid;
> +               return 0;
> +       }
> +
> +       /* child */
> +       if (desc->oom_score_adj) {
> +               char buf[64];
> +               int fd = open("/proc/self/oom_score_adj", O_WRONLY);
> +
> +               if (fd < 0)
> +                       return -1;
> +
> +               snprintf(buf, sizeof(buf), "%d", desc->oom_score_adj);
> +               write(fd, buf, sizeof(buf));
> +               close(fd);
> +       }
> +
> +       ptr = (char *)malloc(desc->target);
> +       if (!ptr)
> +               return -ENOMEM;
> +
> +       memset(ptr, 'a', desc->target);
> +
> +       while (1)
> +               sleep(1000);
> +
> +       return 0;
> +}
> +
> +static void setup_environment(void)
> +{
> +       int i, err;
> +
> +       err = setup_cgroup_environment();
> +       if (!ASSERT_OK(err, "setup_cgroup_environment"))
> +               goto cleanup;
> +
> +       for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
> +               cgroups[i].fd = create_and_get_cgroup(cgroups[i].path);
> +               if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup"))
> +                       goto cleanup;
> +
> +               cgroups[i].id = get_cgroup_id(cgroups[i].path);
> +               if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id"))
> +                       goto cleanup;
> +
> +               /* Freeze the top-level cgroup */
> +               if (i == 0) {
> +                       /* Freeze the top-level cgroup */
> +                       err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1");
> +                       if (!ASSERT_OK(err, "freeze cgroup"))
> +                               goto cleanup;
> +               }
> +
> +               /* Recursively enable the memory controller */
> +               if (!cgroups[i].target) {
> +
> +                       err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control",
> +                                               "+memory");
> +                       if (!ASSERT_OK(err, "enable memory controller"))
> +                               goto cleanup;
> +               }
> +
> +               /* Set memory.max */
> +               if (cgroups[i].max) {
> +                       char buf[256];
> +
> +                       snprintf(buf, sizeof(buf), "%lu", cgroups[i].max);
> +                       err = write_cgroup_file(cgroups[i].path, "memory.max", buf);
> +                       if (!ASSERT_OK(err, "set memory.max"))
> +                               goto cleanup;
> +
> +                       snprintf(buf, sizeof(buf), "0");
> +                       write_cgroup_file(cgroups[i].path, "memory.swap.max", buf);
> +
> +               }
> +
> +               /* Spawn tasks creating memory pressure */
> +               if (cgroups[i].target) {
> +                       char buf[256];
> +
> +                       err = spawn_task(&cgroups[i]);
> +                       if (!ASSERT_OK(err, "spawn task"))
> +                               goto cleanup;
> +
> +                       snprintf(buf, sizeof(buf), "%d", cgroups[i].pid);
> +                       err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf);
> +                       if (!ASSERT_OK(err, "put child into a cgroup"))
> +                               goto cleanup;
> +               }
> +       }
> +
> +       return;
> +
> +cleanup:
> +       cleanup_cgroup_environment();
> +}
> +
> +static int run_and_wait_for_oom(void)
> +{
> +       int ret = -1;
> +       bool first = true;
> +       char buf[4096] = {};
> +       size_t size;
> +
> +       /* Unfreeze the top-level cgroup */
> +       ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0");
> +       if (!ASSERT_OK(ret, "freeze cgroup"))
> +               return -1;
> +
> +       for (;;) {
> +               int i, status;
> +               pid_t pid = wait(&status);
> +
> +               if (pid == -1) {
> +                       if (errno == EINTR)
> +                               continue;
> +                       /* ECHILD */
> +                       break;
> +               }
> +
> +               if (!first)
> +                       continue;
> +
> +               first = false;
> +
> +               /* Check which process was terminated first */
> +               for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
> +                       if (!ASSERT_OK(cgroups[i].victim !=
> +                                      (pid == cgroups[i].pid),
> +                                      "correct process was killed")) {
> +                               ret = -1;
> +                               break;
> +                       }
> +
> +                       if (!cgroups[i].victim)
> +                               continue;
> +
> +                       /* Check the memcg oom counter */
> +                       size = read_cgroup_file(cgroups[i].path,
> +                                               "memory.events",
> +                                               buf, sizeof(buf));
> +                       if (!ASSERT_OK(size <= 0, "read memory.events")) {
> +                               ret = -1;
> +                               break;
> +                       }
> +
> +                       if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL,
> +                                      "oom_kill count check")) {
> +                               ret = -1;
> +                               break;
> +                       }
> +               }
> +
> +               /* Kill all remaining tasks */
> +               for (i = 0; i < ARRAY_SIZE(cgroups); i++)
> +                       if (cgroups[i].pid && cgroups[i].pid != pid)
> +                               kill(cgroups[i].pid, SIGKILL);
> +       }
> +
> +       return ret;
> +}
> +
> +void test_oom(void)
> +{
> +       struct test_oom *skel;
> +       int err;
> +
> +       setup_environment();
> +
> +       skel = test_oom__open_and_load();
> +       err = test_oom__attach(skel);
> +       if (CHECK_FAIL(err))
> +               goto cleanup;
> +
> +       /* Unfreeze all child tasks and create the memory pressure */
> +       err = run_and_wait_for_oom();
> +       CHECK_FAIL(err);
> +
> +cleanup:
> +       cleanup_cgroup_environment();
> +       test_oom__destroy(skel);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_oom.c b/tools/testing/selftests/bpf/progs/test_oom.c
> new file mode 100644
> index 000000000000..ca83563fc9a8
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_oom.c
> @@ -0,0 +1,108 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +#define OOM_SCORE_ADJ_MIN      (-1000)
> +
> +void bpf_rcu_read_lock(void) __ksym;
> +void bpf_rcu_read_unlock(void) __ksym;
> +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
> +void bpf_task_release(struct task_struct *p) __ksym;
> +struct mem_cgroup *bpf_get_root_mem_cgroup(void) __ksym;
> +struct mem_cgroup *bpf_get_mem_cgroup(struct cgroup_subsys_state *css) __ksym;
> +void bpf_put_mem_cgroup(struct mem_cgroup *memcg) __ksym;
> +int bpf_oom_kill_process(struct oom_control *oc, struct task_struct *task,
> +                        const char *message__str) __ksym;

These declarations should come from vmlinux.h; if you don't get them,
you might not have a recent enough version of pahole.

At the very least these should all be __ksym __weak, not just __ksym
(though I'd rather not add them at all).

[...]
Re: [PATCH v1 10/14] bpf: selftests: bpf OOM handler test
Posted by Roman Gushchin 1 month, 2 weeks ago
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Mon, Aug 18, 2025 at 10:05 AM Roman Gushchin
> <roman.gushchin@linux.dev> wrote:
>>
>> Implement a pseudo-realistic test for the OOM handling
>> functionality.
>>
>> The OOM handling policy which is implemented in bpf is to
>> kill all tasks belonging to the biggest leaf cgroup, which
>> doesn't contain unkillable tasks (tasks with oom_score_adj
>> set to -1000). Pagecache size is excluded from the accounting.
>>
>> The test creates a hierarchy of memory cgroups, causes an
>> OOM at the top level, checks that the expected process will be
>> killed and checks memcg's oom statistics.
>>
>> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
>> ---
>>  .../selftests/bpf/prog_tests/test_oom.c       | 229 ++++++++++++++++++
>>  tools/testing/selftests/bpf/progs/test_oom.c  | 108 +++++++++
>>  2 files changed, 337 insertions(+)
>>  create mode 100644 tools/testing/selftests/bpf/prog_tests/test_oom.c
>>  create mode 100644 tools/testing/selftests/bpf/progs/test_oom.c
>>
>> diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom.c b/tools/testing/selftests/bpf/prog_tests/test_oom.c
>> new file mode 100644
>> index 000000000000..eaeb14a9d18f
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/prog_tests/test_oom.c
>> @@ -0,0 +1,229 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +#include <test_progs.h>
>> +#include <bpf/btf.h>
>> +#include <bpf/bpf.h>
>> +
>> +#include "cgroup_helpers.h"
>> +#include "test_oom.skel.h"
>> +
>> +struct cgroup_desc {
>> +       const char *path;
>> +       int fd;
>> +       unsigned long long id;
>> +       int pid;
>> +       size_t target;
>> +       size_t max;
>> +       int oom_score_adj;
>> +       bool victim;
>> +};
>> +
>> +#define MB (1024 * 1024)
>> +#define OOM_SCORE_ADJ_MIN      (-1000)
>> +#define OOM_SCORE_ADJ_MAX      1000
>> +
>> +static struct cgroup_desc cgroups[] = {
>> +       { .path = "/oom_test", .max = 80 * MB},
>> +       { .path = "/oom_test/cg1", .target = 10 * MB,
>> +         .oom_score_adj = OOM_SCORE_ADJ_MAX },
>> +       { .path = "/oom_test/cg2", .target = 40 * MB,
>> +         .oom_score_adj = OOM_SCORE_ADJ_MIN },
>> +       { .path = "/oom_test/cg3" },
>> +       { .path = "/oom_test/cg3/cg4", .target = 30 * MB,
>> +         .victim = true },
>> +       { .path = "/oom_test/cg3/cg5", .target = 20 * MB },
>> +};
>> +
>> +static int spawn_task(struct cgroup_desc *desc)
>> +{
>> +       char *ptr;
>> +       int pid;
>> +
>> +       pid = fork();
>> +       if (pid < 0)
>> +               return pid;
>> +
>> +       if (pid > 0) {
>> +               /* parent */
>> +               desc->pid = pid;
>> +               return 0;
>> +       }
>> +
>> +       /* child */
>> +       if (desc->oom_score_adj) {
>> +               char buf[64];
>> +               int fd = open("/proc/self/oom_score_adj", O_WRONLY);
>> +
>> +               if (fd < 0)
>> +                       return -1;
>> +
>> +               snprintf(buf, sizeof(buf), "%d", desc->oom_score_adj);
>> +               write(fd, buf, sizeof(buf));
>> +               close(fd);
>> +       }
>> +
>> +       ptr = (char *)malloc(desc->target);
>> +       if (!ptr)
>> +               return -ENOMEM;
>> +
>> +       memset(ptr, 'a', desc->target);
>> +
>> +       while (1)
>> +               sleep(1000);
>> +
>> +       return 0;
>> +}
>> +
>> +static void setup_environment(void)
>> +{
>> +       int i, err;
>> +
>> +       err = setup_cgroup_environment();
>> +       if (!ASSERT_OK(err, "setup_cgroup_environment"))
>> +               goto cleanup;
>> +
>> +       for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
>> +               cgroups[i].fd = create_and_get_cgroup(cgroups[i].path);
>> +               if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup"))
>> +                       goto cleanup;
>> +
>> +               cgroups[i].id = get_cgroup_id(cgroups[i].path);
>> +               if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id"))
>> +                       goto cleanup;
>> +
>> +               /* Freeze the top-level cgroup */
>> +               if (i == 0) {
>> +                       /* Freeze the top-level cgroup */
>> +                       err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1");
>> +                       if (!ASSERT_OK(err, "freeze cgroup"))
>> +                               goto cleanup;
>> +               }
>> +
>> +               /* Recursively enable the memory controller */
>> +               if (!cgroups[i].target) {
>> +
>> +                       err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control",
>> +                                               "+memory");
>> +                       if (!ASSERT_OK(err, "enable memory controller"))
>> +                               goto cleanup;
>> +               }
>> +
>> +               /* Set memory.max */
>> +               if (cgroups[i].max) {
>> +                       char buf[256];
>> +
>> +                       snprintf(buf, sizeof(buf), "%lu", cgroups[i].max);
>> +                       err = write_cgroup_file(cgroups[i].path, "memory.max", buf);
>> +                       if (!ASSERT_OK(err, "set memory.max"))
>> +                               goto cleanup;
>> +
>> +                       snprintf(buf, sizeof(buf), "0");
>> +                       write_cgroup_file(cgroups[i].path, "memory.swap.max", buf);
>> +
>> +               }
>> +
>> +               /* Spawn tasks creating memory pressure */
>> +               if (cgroups[i].target) {
>> +                       char buf[256];
>> +
>> +                       err = spawn_task(&cgroups[i]);
>> +                       if (!ASSERT_OK(err, "spawn task"))
>> +                               goto cleanup;
>> +
>> +                       snprintf(buf, sizeof(buf), "%d", cgroups[i].pid);
>> +                       err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf);
>> +                       if (!ASSERT_OK(err, "put child into a cgroup"))
>> +                               goto cleanup;
>> +               }
>> +       }
>> +
>> +       return;
>> +
>> +cleanup:
>> +       cleanup_cgroup_environment();
>> +}
>> +
>> +static int run_and_wait_for_oom(void)
>> +{
>> +       int ret = -1;
>> +       bool first = true;
>> +       char buf[4096] = {};
>> +       size_t size;
>> +
>> +       /* Unfreeze the top-level cgroup */
>> +       ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0");
>> +       if (!ASSERT_OK(ret, "freeze cgroup"))
>> +               return -1;
>> +
>> +       for (;;) {
>> +               int i, status;
>> +               pid_t pid = wait(&status);
>> +
>> +               if (pid == -1) {
>> +                       if (errno == EINTR)
>> +                               continue;
>> +                       /* ECHILD */
>> +                       break;
>> +               }
>> +
>> +               if (!first)
>> +                       continue;
>> +
>> +               first = false;
>> +
>> +               /* Check which process was terminated first */
>> +               for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
>> +                       if (!ASSERT_OK(cgroups[i].victim !=
>> +                                      (pid == cgroups[i].pid),
>> +                                      "correct process was killed")) {
>> +                               ret = -1;
>> +                               break;
>> +                       }
>> +
>> +                       if (!cgroups[i].victim)
>> +                               continue;
>> +
>> +                       /* Check the memcg oom counter */
>> +                       size = read_cgroup_file(cgroups[i].path,
>> +                                               "memory.events",
>> +                                               buf, sizeof(buf));
>> +                       if (!ASSERT_OK(size <= 0, "read memory.events")) {
>> +                               ret = -1;
>> +                               break;
>> +                       }
>> +
>> +                       if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL,
>> +                                      "oom_kill count check")) {
>> +                               ret = -1;
>> +                               break;
>> +                       }
>> +               }
>> +
>> +               /* Kill all remaining tasks */
>> +               for (i = 0; i < ARRAY_SIZE(cgroups); i++)
>> +                       if (cgroups[i].pid && cgroups[i].pid != pid)
>> +                               kill(cgroups[i].pid, SIGKILL);
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>> +void test_oom(void)
>> +{
>> +       struct test_oom *skel;
>> +       int err;
>> +
>> +       setup_environment();
>> +
>> +       skel = test_oom__open_and_load();
>> +       err = test_oom__attach(skel);
>> +       if (CHECK_FAIL(err))
>> +               goto cleanup;
>> +
>> +       /* Unfreeze all child tasks and create the memory pressure */
>> +       err = run_and_wait_for_oom();
>> +       CHECK_FAIL(err);
>> +
>> +cleanup:
>> +       cleanup_cgroup_environment();
>> +       test_oom__destroy(skel);
>> +}
>> diff --git a/tools/testing/selftests/bpf/progs/test_oom.c b/tools/testing/selftests/bpf/progs/test_oom.c
>> new file mode 100644
>> index 000000000000..ca83563fc9a8
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/test_oom.c
>> @@ -0,0 +1,108 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +#include "vmlinux.h"
>> +#include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_tracing.h>
>> +
>> +char _license[] SEC("license") = "GPL";
>> +
>> +#define OOM_SCORE_ADJ_MIN      (-1000)
>> +
>> +void bpf_rcu_read_lock(void) __ksym;
>> +void bpf_rcu_read_unlock(void) __ksym;
>> +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
>> +void bpf_task_release(struct task_struct *p) __ksym;
>> +struct mem_cgroup *bpf_get_root_mem_cgroup(void) __ksym;
>> +struct mem_cgroup *bpf_get_mem_cgroup(struct cgroup_subsys_state *css) __ksym;
>> +void bpf_put_mem_cgroup(struct mem_cgroup *memcg) __ksym;
>> +int bpf_oom_kill_process(struct oom_control *oc, struct task_struct *task,
>> +                        const char *message__str) __ksym;
>
> These declarations should come from vmlinux.h; if you don't get them,
> you might not have a recent enough version of pahole.

Indeed. Fixed, thanks!
Re: [PATCH v1 10/14] bpf: selftests: bpf OOM handler test
Posted by Kumar Kartikeya Dwivedi 1 month, 2 weeks ago
On Mon, 18 Aug 2025 at 19:02, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Implement a pseudo-realistic test for the OOM handling
> functionality.
>
> The OOM handling policy which is implemented in bpf is to
> kill all tasks belonging to the biggest leaf cgroup, which
> doesn't contain unkillable tasks (tasks with oom_score_adj
> set to -1000). Pagecache size is excluded from the accounting.
>
> The test creates a hierarchy of memory cgroups, causes an
> OOM at the top level, checks that the expected process will be
> killed and checks memcg's oom statistics.
>
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
>  [...]
> +
> +/*
> + * Find the largest leaf cgroup (ignoring page cache) without unkillable tasks
> + * and kill all belonging tasks.
> + */
> +SEC("struct_ops.s/handle_out_of_memory")
> +int BPF_PROG(test_out_of_memory, struct oom_control *oc)
> +{
> +       struct task_struct *task;
> +       struct mem_cgroup *root_memcg = oc->memcg;
> +       struct mem_cgroup *memcg, *victim = NULL;
> +       struct cgroup_subsys_state *css_pos;
> +       unsigned long usage, max_usage = 0;
> +       unsigned long pagecache = 0;
> +       int ret = 0;
> +
> +       if (root_memcg)
> +               root_memcg = bpf_get_mem_cgroup(&root_memcg->css);
> +       else
> +               root_memcg = bpf_get_root_mem_cgroup();
> +
> +       if (!root_memcg)
> +               return 0;
> +
> +       bpf_rcu_read_lock();
> +       bpf_for_each(css, css_pos, &root_memcg->css, BPF_CGROUP_ITER_DESCENDANTS_POST) {
> +               if (css_pos->cgroup->nr_descendants + css_pos->cgroup->nr_dying_descendants)
> +                       continue;
> +
> +               memcg = bpf_get_mem_cgroup(css_pos);
> +               if (!memcg)
> +                       continue;
> +
> +               usage = bpf_mem_cgroup_usage(memcg);
> +               pagecache = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES);
> +
> +               if (usage > pagecache)
> +                       usage -= pagecache;
> +               else
> +                       usage = 0;
> +
> +               if ((usage > max_usage) && mem_cgroup_killable(memcg)) {
> +                       max_usage = usage;
> +                       if (victim)
> +                               bpf_put_mem_cgroup(victim);
> +                       victim = bpf_get_mem_cgroup(&memcg->css);
> +               }
> +
> +               bpf_put_mem_cgroup(memcg);
> +       }
> +       bpf_rcu_read_unlock();
> +
> +       if (!victim)
> +               goto exit;
> +
> +       bpf_for_each(css_task, task, &victim->css, CSS_TASK_ITER_PROCS) {
> +               struct task_struct *t = bpf_task_acquire(task);
> +
> +               if (t) {
> +                       if (!bpf_task_is_oom_victim(task))
> +                               bpf_oom_kill_process(oc, task, "bpf oom test");

Is there a scenario where we want to invoke bpf_oom_kill_process when
the task is not an oom victim?
Would it be better to subsume this check in the kfunc itself?

> +                       bpf_task_release(t);
> +                       ret = 1;
> +               }
> +       }
> +
> +       bpf_put_mem_cgroup(victim);
> +exit:
> +       bpf_put_mem_cgroup(root_memcg);
> +
> +       return ret;
> +}
> +
> +SEC(".struct_ops.link")
> +struct bpf_oom_ops test_bpf_oom = {
> +       .name = "bpf_test_policy",
> +       .handle_out_of_memory = (void *)test_out_of_memory,
> +};
> --
> 2.50.1
>
Re: [PATCH v1 10/14] bpf: selftests: bpf OOM handler test
Posted by Roman Gushchin 1 month, 2 weeks ago
Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:

> On Mon, 18 Aug 2025 at 19:02, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>>
>> Implement a pseudo-realistic test for the OOM handling
>> functionality.
>>
>> The OOM handling policy which is implemented in bpf is to
>> kill all tasks belonging to the biggest leaf cgroup, which
>> doesn't contain unkillable tasks (tasks with oom_score_adj
>> set to -1000). Pagecache size is excluded from the accounting.
>>
>> The test creates a hierarchy of memory cgroups, causes an
>> OOM at the top level, checks that the expected process will be
>> killed and checks memcg's oom statistics.
>>
>> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
>> ---
>>  [...]
>> +
>> +/*
>> + * Find the largest leaf cgroup (ignoring page cache) without unkillable tasks
>> + * and kill all belonging tasks.
>> + */
>> +SEC("struct_ops.s/handle_out_of_memory")
>> +int BPF_PROG(test_out_of_memory, struct oom_control *oc)
>> +{
>> +       struct task_struct *task;
>> +       struct mem_cgroup *root_memcg = oc->memcg;
>> +       struct mem_cgroup *memcg, *victim = NULL;
>> +       struct cgroup_subsys_state *css_pos;
>> +       unsigned long usage, max_usage = 0;
>> +       unsigned long pagecache = 0;
>> +       int ret = 0;
>> +
>> +       if (root_memcg)
>> +               root_memcg = bpf_get_mem_cgroup(&root_memcg->css);
>> +       else
>> +               root_memcg = bpf_get_root_mem_cgroup();
>> +
>> +       if (!root_memcg)
>> +               return 0;
>> +
>> +       bpf_rcu_read_lock();
>> +       bpf_for_each(css, css_pos, &root_memcg->css, BPF_CGROUP_ITER_DESCENDANTS_POST) {
>> +               if (css_pos->cgroup->nr_descendants + css_pos->cgroup->nr_dying_descendants)
>> +                       continue;
>> +
>> +               memcg = bpf_get_mem_cgroup(css_pos);
>> +               if (!memcg)
>> +                       continue;
>> +
>> +               usage = bpf_mem_cgroup_usage(memcg);
>> +               pagecache = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES);
>> +
>> +               if (usage > pagecache)
>> +                       usage -= pagecache;
>> +               else
>> +                       usage = 0;
>> +
>> +               if ((usage > max_usage) && mem_cgroup_killable(memcg)) {
>> +                       max_usage = usage;
>> +                       if (victim)
>> +                               bpf_put_mem_cgroup(victim);
>> +                       victim = bpf_get_mem_cgroup(&memcg->css);
>> +               }
>> +
>> +               bpf_put_mem_cgroup(memcg);
>> +       }
>> +       bpf_rcu_read_unlock();
>> +
>> +       if (!victim)
>> +               goto exit;
>> +
>> +       bpf_for_each(css_task, task, &victim->css, CSS_TASK_ITER_PROCS) {
>> +               struct task_struct *t = bpf_task_acquire(task);
>> +
>> +               if (t) {
>> +                       if (!bpf_task_is_oom_victim(task))
>> +                               bpf_oom_kill_process(oc, task, "bpf oom test");
>
> Is there a scenario where we want to invoke bpf_oom_kill_process when
> the task is not an oom victim?

Not really, but...

> Would it be better to subsume this check in the kfunc itself?

bpf_task_is_oom_victim() is useful by itself, because if we see
a task which is about to be killed, we can likely simply bail out.
Let me adjust the test to reflect it.