From: Hui Zhu <zhuhui@kylinos.cn>

Add a comprehensive selftest suite for the `memcg_bpf_ops`
functionality. These tests validate that BPF programs can correctly
influence memory cgroup throttling behavior by implementing the new
hooks.

The test suite is added in `prog_tests/memcg_ops.c` and covers
several key scenarios:

1. `test_memcg_ops_over_high`:
Verifies that a BPF program can trigger throttling on a low-priority
cgroup by returning a delay from the `get_high_delay_ms` hook when a
high-priority cgroup is under pressure.

2. `test_memcg_ops_below_low_over_high`:
Tests the combination of the `below_low` and `get_high_delay_ms`
hooks, ensuring they work together as expected.

3. `test_memcg_ops_below_min_over_high`:
Validates the interaction between the `below_min` and
`get_high_delay_ms` hooks.

The test framework sets up a cgroup hierarchy with high and low
priority groups, attaches BPF programs, runs memory-intensive
workloads, and asserts that the observed throttling (measured by
workload execution time) matches expectations.

The BPF program (`progs/memcg_ops.c`) uses a tracepoint on
`memcg:count_memcg_events` (specifically PGFAULT) to detect memory
pressure and trigger the appropriate hooks in response. This test
suite provides essential validation for the new memory control
mechanisms.

Signed-off-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Hui Zhu <zhuhui@kylinos.cn>
---
 MAINTAINERS                                    |   2 +
 .../selftests/bpf/prog_tests/memcg_ops.c       | 537 ++++++++++++++++++
 tools/testing/selftests/bpf/progs/memcg_ops.c  | 129 +++++
 3 files changed, 668 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_ops.c
 create mode 100644 tools/testing/selftests/bpf/progs/memcg_ops.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 491d567f7dc8..7e07bb330eae 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6471,6 +6471,8 @@ F:	mm/memcontrol-v1.h
 F:	mm/page_counter.c
 F:	mm/swap_cgroup.c
 F:	samples/cgroup/*
+F:	tools/testing/selftests/bpf/prog_tests/memcg_ops.c
+F:	tools/testing/selftests/bpf/progs/memcg_ops.c
 F:	tools/testing/selftests/cgroup/memcg_protection.m
 F:	tools/testing/selftests/cgroup/test_hugetlb_memcg.c
 F:	tools/testing/selftests/cgroup/test_kmem.c
diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c
new file mode 100644
index 000000000000..9a8d16296f2d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory controller eBPF struct ops test
+ */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "cgroup_helpers.h"
+
+struct local_config {
+	u64 threshold;
+	u64 high_cgroup_id;
+	bool use_below_low;
+	bool use_below_min;
+	unsigned int over_high_ms;
+} local_config;
+
+#include "memcg_ops.skel.h"
+
+#define TRIGGER_THRESHOLD 1
+#define OVER_HIGH_MS 2000
+#define FILE_SIZE (64 * 1024 * 1024ul)
+#define BUFFER_SIZE (4096)
+#define CG_LIMIT (120 * 1024 * 1024ul)
+
+#define CG_DIR "/memcg_ops_test"
+#define CG_HIGH_DIR CG_DIR "/high"
+#define CG_LOW_DIR CG_DIR "/low"
+
+static int
+setup_cgroup(int *high_cgroup_id, int *low_cgroup_fd, int *high_cgroup_fd)
+{
+	int ret;
+	char limit_buf[20];
+
+	ret = setup_cgroup_environment();
+	if (!ASSERT_OK(ret, "setup_cgroup_environment"))
+		goto cleanup;
+
+	ret = create_and_get_cgroup(CG_DIR);
+	if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_DIR))
+		goto cleanup;
+	close(ret);
+	ret = enable_controllers(CG_DIR, "memory");
+	if (!ASSERT_OK(ret, "enable_controllers"))
+		goto cleanup;
+	snprintf(limit_buf, 20, "%ld", CG_LIMIT);
+	ret = write_cgroup_file(CG_DIR, "memory.max", limit_buf);
+	if (!ASSERT_OK(ret, "write_cgroup_file memory.max"))
+		goto cleanup;
+	ret = write_cgroup_file(CG_DIR, "memory.swap.max", "0");
+	if (!ASSERT_OK(ret, "write_cgroup_file memory.swap.max"))
+		goto cleanup;
+
+	ret = create_and_get_cgroup(CG_HIGH_DIR);
+	if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_HIGH_DIR))
+		goto cleanup;
+	if (high_cgroup_fd)
+		*high_cgroup_fd = ret;
+	else
+		close(ret);
+	ret = (int)get_cgroup_id(CG_HIGH_DIR);
+	if (!ASSERT_GE(ret, 0, "get_cgroup_id"))
+		goto cleanup;
+	*high_cgroup_id = ret;
+
+	ret = create_and_get_cgroup(CG_LOW_DIR);
+	if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_LOW_DIR))
+		goto cleanup;
+	if (low_cgroup_fd)
+		*low_cgroup_fd = ret;
+	else
+		close(ret);
+
+	return 0;
+
+cleanup:
+	cleanup_cgroup_environment();
+	return -1;
+}
+
+int write_file(const char *filename)
+{
+	int ret = -1;
+	size_t written = 0;
+	char *buffer;
+	FILE *fp;
+
+	fp = fopen(filename, "wb");
+	if (!fp)
+		goto out;
+
+	buffer = malloc(BUFFER_SIZE);
+	if (!buffer)
+		goto cleanup_fp;
+
+	memset(buffer, 'A', BUFFER_SIZE);
+
+	while (written < FILE_SIZE) {
+		size_t to_write = (FILE_SIZE - written < BUFFER_SIZE) ?
+				  (FILE_SIZE - written) :
+				  BUFFER_SIZE;
+
+		if (fwrite(buffer, 1, to_write, fp) != to_write)
+			goto cleanup;
+		written += to_write;
+	}
+
+	ret = 0;
+cleanup:
+	free(buffer);
+cleanup_fp:
+	fclose(fp);
+out:
+	return ret;
+}
+
+int read_file(const char *filename, int iterations)
+{
+	int ret = -1;
+	long page_size = sysconf(_SC_PAGESIZE);
+	char *p;
+	char *map;
+	size_t i;
+	int fd;
+	struct stat sb;
+
+	fd = open(filename, O_RDONLY);
+	if (fd == -1)
+		goto out;
+
+	if (fstat(fd, &sb) == -1)
+		goto cleanup_fd;
+
+	if (sb.st_size != FILE_SIZE) {
+		fprintf(stderr, "File size mismatch: expected %ld, got %ld\n",
+			FILE_SIZE, sb.st_size);
+		goto cleanup_fd;
+	}
+
+	map = mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (map == MAP_FAILED)
+		goto cleanup_fd;
+
+	for (int iter = 0; iter < iterations; iter++) {
+		for (i = 0; i < FILE_SIZE; i += page_size) {
+			/* access a byte to trigger page fault */
+			p = &map[i];
+			__asm__ __volatile__("" : : "r"(p) : "memory");
+		}
+
+		if (env.verbosity >= VERBOSE_NORMAL)
+			printf("%s %d %d done\n", __func__, getpid(), iter);
+	}
+
+	if (munmap(map, FILE_SIZE) == -1)
+		goto cleanup_fd;
+
+	ret = 0;
+
+cleanup_fd:
+	close(fd);
+out:
+	return ret;
+}
+
+static void
+real_test_memcg_ops_child_work(const char *cgroup_path,
+			       char *data_filename,
+			       char *time_filename,
+			       int read_times)
+{
+	struct timeval start, end;
+	double elapsed;
+	FILE *fp;
+
+	if (!ASSERT_OK(join_parent_cgroup(cgroup_path), "join_parent_cgroup"))
+		goto out;
+
+	if (env.verbosity >= VERBOSE_NORMAL)
+		printf("%s %d begin\n", __func__, getpid());
+
+	gettimeofday(&start, NULL);
+
+	if (!ASSERT_OK(write_file(data_filename), "write_file"))
+		goto out;
+
+	if (env.verbosity >= VERBOSE_NORMAL)
+		printf("%s %d write_file done\n", __func__, getpid());
+
+	if (!ASSERT_OK(read_file(data_filename, read_times), "read_file"))
+		goto out;
+
+	gettimeofday(&end, NULL);
+
+	elapsed = (end.tv_sec - start.tv_sec) +
+		  (end.tv_usec - start.tv_usec) / 1000000.0;
+
+	if (env.verbosity >= VERBOSE_NORMAL)
+		printf("%s %d end %.6f\n", __func__, getpid(), elapsed);
+
+	fp = fopen(time_filename, "w");
+	if (!ASSERT_OK_PTR(fp, "fopen"))
+		goto out;
+	fprintf(fp, "%.6f", elapsed);
+	fclose(fp);
+
+out:
+	exit(0);
+}
+
+static int get_time(char *time_filename, double *time)
+{
+	int ret = -1;
+	FILE *fp;
+	char buf[64];
+
+	fp = fopen(time_filename, "r");
+	if (!ASSERT_OK_PTR(fp, "fopen"))
+		goto out;
+
+	if (!ASSERT_OK_PTR(fgets(buf, sizeof(buf), fp), "fgets"))
+		goto cleanup;
+
+	if (sscanf(buf, "%lf", time) < 0) {
+		PRINT_FAIL("sscanf %s", buf);
+		goto cleanup;
+	}
+
+	ret = 0;
+cleanup:
+	fclose(fp);
+out:
+	return ret;
+}
+
+static void real_test_memcg_ops(int read_times)
+{
+	int ret;
+	char data_file1[] = "/tmp/test_data_XXXXXX";
+	char data_file2[] = "/tmp/test_data_XXXXXX";
+	char time_file1[] = "/tmp/test_time_XXXXXX";
+	char time_file2[] = "/tmp/test_time_XXXXXX";
+	pid_t pid1, pid2;
+	double time1, time2;
+
+	ret = mkstemp(data_file1);
+	if (!ASSERT_GT(ret, 0, "mkstemp"))
+		return;
+	close(ret);
+	ret = mkstemp(data_file2);
+	if (!ASSERT_GT(ret, 0, "mkstemp"))
+		goto cleanup_data_file1;
+	close(ret);
+	ret = mkstemp(time_file1);
+	if (!ASSERT_GT(ret, 0, "mkstemp"))
+		goto cleanup_data_file2;
+	close(ret);
+	ret = mkstemp(time_file2);
+	if (!ASSERT_GT(ret, 0, "mkstemp"))
+		goto cleanup_time_file1;
+	close(ret);
+
+	pid1 = fork();
+	if (!ASSERT_GE(pid1, 0, "fork"))
+		goto cleanup;
+	if (pid1 == 0)
+		real_test_memcg_ops_child_work(CG_LOW_DIR,
+					       data_file1,
+					       time_file1,
+					       read_times);
+
+	pid2 = fork();
+	if (!ASSERT_GE(pid1, 0, "fork"))
+		goto cleanup;
+	if (pid2 == 0)
+		real_test_memcg_ops_child_work(CG_HIGH_DIR,
+					       data_file2,
+					       time_file2,
+					       read_times);
+
+	ret = waitpid(pid1, NULL, 0);
+	if (!ASSERT_GT(ret, 0, "waitpid"))
+		goto cleanup;
+
+	ret = waitpid(pid2, NULL, 0);
+	if (!ASSERT_GT(ret, 0, "waitpid"))
+		goto cleanup;
+
+	if (get_time(time_file1, &time1))
+		goto cleanup;
+
+	if (get_time(time_file2, &time2))
+		goto cleanup;
+
+	if (time1 < time2 || time1 - time2 <= 1)
+		PRINT_FAIL("low fast compare time1=%f, time2=%f",
+			   time1, time2);
+
+cleanup:
+	unlink(time_file2);
+cleanup_time_file1:
+	unlink(time_file1);
+cleanup_data_file2:
+	unlink(data_file2);
+cleanup_data_file1:
+	unlink(data_file1);
+}
+
+void test_memcg_ops_over_high(void)
+{
+	int err, map_fd;
+	struct memcg_ops *skel;
+	struct bpf_map *map;
+	size_t bss_sz;
+	struct memcg_ops__bss *bss_data;
+	__u32 key = 0;
+	struct bpf_program *prog = NULL;
+	struct bpf_link *link = NULL, *link2 = NULL;
+	DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
+	int high_cgroup_id, low_cgroup_fd = -1;
+
+	err = setup_cgroup(&high_cgroup_id, &low_cgroup_fd, NULL);
+	if (!ASSERT_OK(err, "setup_cgroup"))
+		goto out;
+
+	skel = memcg_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, ".bss");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss"))
+		goto out;
+
+	map_fd = bpf_map__fd(map);
+	bss_sz = bpf_map__value_size(map);
+	bss_data = malloc(bpf_map__value_size(map));
+	if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))"))
+		goto out;
+	memset(bss_data, 0, sizeof(struct local_config));
+	bss_data->local_config.high_cgroup_id = high_cgroup_id;
+	bss_data->local_config.threshold = TRIGGER_THRESHOLD;
+	bss_data->local_config.use_below_low = false;
+	bss_data->local_config.use_below_min = false;
+	bss_data->local_config.over_high_ms = OVER_HIGH_MS;
+	err = bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST);
+	free(bss_data);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto out;
+
+	prog = bpf_object__find_program_by_name(skel->obj,
+						"handle_count_memcg_events");
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto out;
+
+	link = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, "low_mcg_ops");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name low_mcg_ops"))
+		goto out;
+
+	opts.relative_fd = low_cgroup_fd;
+	link2 = bpf_map__attach_struct_ops_opts(map, &opts);
+	if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts"))
+		goto out;
+
+	real_test_memcg_ops(5);
+
+out:
+	bpf_link__destroy(link);
+	bpf_link__destroy(link2);
+	memcg_ops__detach(skel);
+	close(low_cgroup_fd);
+	cleanup_cgroup_environment();
+}
+
+void test_memcg_ops_below_low_over_high(void)
+{
+	int err, map_fd;
+	struct memcg_ops *skel;
+	struct bpf_map *map;
+	size_t bss_sz;
+	struct memcg_ops__bss *bss_data;
+	__u32 key = 0;
+	struct bpf_program *prog = NULL;
+	struct bpf_link *link = NULL, *link_high = NULL, *link_low = NULL;
+	DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
+	int high_cgroup_id, high_cgroup_fd = -1, low_cgroup_fd = -1;
+
+	err = setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd);
+	if (!ASSERT_OK(err, "setup_cgroup"))
+		goto out;
+
+	skel = memcg_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, ".bss");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss"))
+		goto out;
+
+	map_fd = bpf_map__fd(map);
+	bss_sz = bpf_map__value_size(map);
+	bss_data = malloc(bpf_map__value_size(map));
+	if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))"))
+		goto out;
+	memset(bss_data, 0, sizeof(struct local_config));
+	bss_data->local_config.high_cgroup_id = high_cgroup_id;
+	bss_data->local_config.threshold = TRIGGER_THRESHOLD;
+	bss_data->local_config.use_below_low = true;
+	bss_data->local_config.use_below_min = false;
+	bss_data->local_config.over_high_ms = OVER_HIGH_MS;
+	err = bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST);
+	free(bss_data);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto out;
+
+	prog = bpf_object__find_program_by_name(skel->obj,
+						"handle_count_memcg_events");
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto out;
+
+	link = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, "high_mcg_ops");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops"))
+		goto out;
+	opts.relative_fd = high_cgroup_fd;
+	link_high = bpf_map__attach_struct_ops_opts(map, &opts);
+	if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, "low_mcg_ops");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops"))
+		goto out;
+	opts.relative_fd = low_cgroup_fd;
+	link_low = bpf_map__attach_struct_ops_opts(map, &opts);
+	if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts"))
+		goto out;
+
+	real_test_memcg_ops(50);
+
+out:
+	bpf_link__destroy(link);
+	bpf_link__destroy(link_high);
+	bpf_link__destroy(link_low);
+	memcg_ops__detach(skel);
+	close(high_cgroup_fd);
+	close(low_cgroup_fd);
+	cleanup_cgroup_environment();
+}
+
+void test_memcg_ops_below_min_over_high(void)
+{
+	int err, map_fd;
+	struct memcg_ops *skel;
+	struct bpf_map *map;
+	size_t bss_sz;
+	struct memcg_ops__bss *bss_data;
+	__u32 key = 0;
+	struct bpf_program *prog = NULL;
+	struct bpf_link *link = NULL, *link_high = NULL, *link_low = NULL;
+	DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
+	int high_cgroup_id, high_cgroup_fd = -1, low_cgroup_fd = -1;
+
+	err = setup_cgroup(&high_cgroup_id, &low_cgroup_fd, &high_cgroup_fd);
+	if (!ASSERT_OK(err, "setup_cgroup"))
+		goto out;
+
+	skel = memcg_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "memcg_ops__open_and_load"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, ".bss");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss"))
+		goto out;
+
+	map_fd = bpf_map__fd(map);
+	bss_sz = bpf_map__value_size(map);
+	bss_data = malloc(bpf_map__value_size(map));
+	if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))"))
+		goto out;
+	memset(bss_data, 0, sizeof(struct local_config));
+	bss_data->local_config.high_cgroup_id = high_cgroup_id;
+	bss_data->local_config.threshold = TRIGGER_THRESHOLD;
+	bss_data->local_config.use_below_low = false;
+	bss_data->local_config.use_below_min = true;
+	bss_data->local_config.over_high_ms = OVER_HIGH_MS;
+	err = bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST);
+	free(bss_data);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto out;
+
+	prog = bpf_object__find_program_by_name(skel->obj,
+						"handle_count_memcg_events");
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto out;
+
+	link = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, "high_mcg_ops");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops"))
+		goto out;
+	opts.relative_fd = high_cgroup_fd;
+	link_high = bpf_map__attach_struct_ops_opts(map, &opts);
+	if (!ASSERT_OK_PTR(link_high, "bpf_map__attach_struct_ops_opts"))
+		goto out;
+
+	map = bpf_object__find_map_by_name(skel->obj, "low_mcg_ops");
+	if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops"))
+		goto out;
+	opts.relative_fd = low_cgroup_fd;
+	link_low = bpf_map__attach_struct_ops_opts(map, &opts);
+	if (!ASSERT_OK_PTR(link_low, "bpf_map__attach_struct_ops_opts"))
+		goto out;
+
+	real_test_memcg_ops(50);
+
+out:
+	bpf_link__destroy(link);
+	bpf_link__destroy(link_high);
+	bpf_link__destroy(link_low);
+	memcg_ops__detach(skel);
+	close(high_cgroup_fd);
+	close(low_cgroup_fd);
+	cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/progs/memcg_ops.c b/tools/testing/selftests/bpf/progs/memcg_ops.c
new file mode 100644
index 000000000000..44087a206a61
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/memcg_ops.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define ONE_SECOND_NS 1000000000
+
+struct local_config {
+	u64 threshold;
+	u64 high_cgroup_id;
+	bool use_below_low;
+	bool use_below_min;
+	unsigned int over_high_ms;
+} local_config;
+
+struct AggregationData {
+	u64 sum;
+	u64 window_start_ts;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct AggregationData);
+} aggregation_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, u64);
+} trigger_ts_map SEC(".maps");
+
+SEC("tp/memcg/count_memcg_events")
+int
+handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx)
+{
+	u32 key = 0;
+	struct AggregationData *data;
+	u64 current_ts;
+
+	if (ctx->id != local_config.high_cgroup_id ||
+	    (ctx->item != PGFAULT))
+		goto out;
+
+	data = bpf_map_lookup_elem(&aggregation_map, &key);
+	if (!data)
+		goto out;
+
+	current_ts = bpf_ktime_get_ns();
+
+	if (current_ts - data->window_start_ts < ONE_SECOND_NS) {
+		data->sum += ctx->val;
+	} else {
+		data->window_start_ts = current_ts;
+		data->sum = ctx->val;
+	}
+
+	if (data->sum > local_config.threshold) {
+		bpf_map_update_elem(&trigger_ts_map, &key, &current_ts,
+				    BPF_ANY);
+		data->sum = 0;
+		data->window_start_ts = current_ts;
+	}
+
+out:
+	return 0;
+}
+
+static bool need_threshold(void)
+{
+	u32 key = 0;
+	u64 *trigger_ts;
+	bool ret = false;
+
+	trigger_ts = bpf_map_lookup_elem(&trigger_ts_map, &key);
+	if (!trigger_ts || *trigger_ts == 0)
+		goto out;
+
+	u64 current_ts = bpf_ktime_get_ns();
+
+	if (current_ts - *trigger_ts < ONE_SECOND_NS)
+		ret = true;
+
+out:
+	return ret;
+}
+
+SEC("struct_ops/below_low")
+unsigned int below_low_impl(struct mem_cgroup *memcg)
+{
+	if (!local_config.use_below_low)
+		return false;
+
+	return need_threshold();
+}
+
+SEC("struct_ops/below_min")
+unsigned int below_min_impl(struct mem_cgroup *memcg)
+{
+	if (!local_config.use_below_min)
+		return false;
+
+	return need_threshold();
+}
+
+SEC("struct_ops/get_high_delay_ms")
+unsigned int get_high_delay_ms_impl(struct mem_cgroup *memcg)
+{
+	if (local_config.over_high_ms && need_threshold())
+		return local_config.over_high_ms;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct memcg_bpf_ops high_mcg_ops = {
+	.below_low = (void *)below_low_impl,
+	.below_min = (void *)below_min_impl,
+};
+
+SEC(".struct_ops.link")
+struct memcg_bpf_ops low_mcg_ops = {
+	.get_high_delay_ms = (void *)get_high_delay_ms_impl,
+};
+
+char LICENSE[] SEC("license") = "GPL";
--
2.43.0

Hi Hui,

On 1/23/26 1:00 AM, Hui Zhu wrote:
> From: Hui Zhu <zhuhui@kylinos.cn>
>
> Add a comprehensive selftest suite for the `memcg_bpf_ops`
> functionality. These tests validate that BPF programs can correctly
> influence memory cgroup throttling behavior by implementing the new
> hooks.
>
> The test suite is added in `prog_tests/memcg_ops.c` and covers
> several key scenarios:
>
> 1. `test_memcg_ops_over_high`:
> Verifies that a BPF program can trigger throttling on a low-priority
> cgroup by returning a delay from the `get_high_delay_ms` hook when a
> high-priority cgroup is under pressure.
>
> 2. `test_memcg_ops_below_low_over_high`:
> Tests the combination of the `below_low` and `get_high_delay_ms`
> hooks, ensuring they work together as expected.
>
> 3. `test_memcg_ops_below_min_over_high`:
> Validates the interaction between the `below_min` and
> `get_high_delay_ms` hooks.
>
> The test framework sets up a cgroup hierarchy with high and low
> priority groups, attaches BPF programs, runs memory-intensive
> workloads, and asserts that the observed throttling (measured by
> workload execution time) matches expectations.
>
> The BPF program (`progs/memcg_ops.c`) uses a tracepoint on
> `memcg:count_memcg_events` (specifically PGFAULT) to detect memory
> pressure and trigger the appropriate hooks in response. This test
> suite provides essential validation for the new memory control
> mechanisms.
>
> Signed-off-by: Geliang Tang <geliang@kernel.org>
> Signed-off-by: Hui Zhu <zhuhui@kylinos.cn>
> ---
[..]
> diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c
> new file mode 100644
> index 000000000000..9a8d16296f2d
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c
> @@ -0,0 +1,537 @@
[..]
> +
> +static void
> +real_test_memcg_ops_child_work(const char *cgroup_path,
> +			       char *data_filename,
> +			       char *time_filename,
> +			       int read_times)
> +{
> +	struct timeval start, end;
> +	double elapsed;
> +	FILE *fp;
> +
> +	if (!ASSERT_OK(join_parent_cgroup(cgroup_path), "join_parent_cgroup"))
> +		goto out;
> +
> +	if (env.verbosity >= VERBOSE_NORMAL)
> +		printf("%s %d begin\n", __func__, getpid());
> +
> +	gettimeofday(&start, NULL);
> +
> +	if (!ASSERT_OK(write_file(data_filename), "write_file"))
> +		goto out;
> +
> +	if (env.verbosity >= VERBOSE_NORMAL)
> +		printf("%s %d write_file done\n", __func__, getpid());
> +
> +	if (!ASSERT_OK(read_file(data_filename, read_times), "read_file"))
> +		goto out;
> +
> +	gettimeofday(&end, NULL);
> +
> +	elapsed = (end.tv_sec - start.tv_sec) +
> +		  (end.tv_usec - start.tv_usec) / 1000000.0;
> +
> +	if (env.verbosity >= VERBOSE_NORMAL)
> +		printf("%s %d end %.6f\n", __func__, getpid(), elapsed);
> +
> +	fp = fopen(time_filename, "w");
> +	if (!ASSERT_OK_PTR(fp, "fopen"))
> +		goto out;
> +	fprintf(fp, "%.6f", elapsed);
> +	fclose(fp);
> +
> +out:
> +	exit(0);
> +}
> +
[..]
> +static void real_test_memcg_ops(int read_times)
> +{
> +	int ret;
> +	char data_file1[] = "/tmp/test_data_XXXXXX";
> +	char data_file2[] = "/tmp/test_data_XXXXXX";
> +	char time_file1[] = "/tmp/test_time_XXXXXX";
> +	char time_file2[] = "/tmp/test_time_XXXXXX";
> +	pid_t pid1, pid2;
> +	double time1, time2;
> +
> +	ret = mkstemp(data_file1);
> +	if (!ASSERT_GT(ret, 0, "mkstemp"))
> +		return;
> +	close(ret);
> +	ret = mkstemp(data_file2);
> +	if (!ASSERT_GT(ret, 0, "mkstemp"))
> +		goto cleanup_data_file1;
> +	close(ret);
> +	ret = mkstemp(time_file1);
> +	if (!ASSERT_GT(ret, 0, "mkstemp"))
> +		goto cleanup_data_file2;
> +	close(ret);
> +	ret = mkstemp(time_file2);
> +	if (!ASSERT_GT(ret, 0, "mkstemp"))
> +		goto cleanup_time_file1;
> +	close(ret);
> +
> +	pid1 = fork();
> +	if (!ASSERT_GE(pid1, 0, "fork"))
> +		goto cleanup;
> +	if (pid1 == 0)
> +		real_test_memcg_ops_child_work(CG_LOW_DIR,
> +					       data_file1,
> +					       time_file1,
> +					       read_times);
Would it be better to call exit() after real_test_memcg_ops_child_work()
instead of within it? This way the fork/exit/wait logic is contained in
the same scope making the lifetimes easier to track. I had to go back
and search for the call to exit() since at a glance this function
appears to proceed to call fork() and waitpid() from within both parent
and child procs (though it really does not).
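
Something like this (an untested sketch, reusing the helper's current
signature) would keep the child's lifetime visible at the call site:

	if (pid1 == 0) {
		real_test_memcg_ops_child_work(CG_LOW_DIR, data_file1,
					       time_file1, read_times);
		exit(0);
	}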

On 1/24/26 4:47 AM, "JP Kobryn" <inwardvessel@gmail.com> wrote:
>
> [..]
>
> Would it be better to call exit() after real_test_memcg_ops_child_work()
> instead of within it? This way the fork/exit/wait logic is contained in
> the same scope making the lifetimes easier to track. I had to go back
> and search for the call to exit() since at a glance this function
> appears to proceed to call fork() and waitpid() from within both parent
> and child procs (though it really does not).
>
I will fix it.

Best,
Hui

> diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c
> new file mode 100644
> index 000000000000..9a8d16296f2d
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c
[ ... ]
> +static void real_test_memcg_ops(int read_times)
> +{
> +	int ret;
> +	char data_file1[] = "/tmp/test_data_XXXXXX";
> +	char data_file2[] = "/tmp/test_data_XXXXXX";
> +	char time_file1[] = "/tmp/test_time_XXXXXX";
> +	char time_file2[] = "/tmp/test_time_XXXXXX";
> +	pid_t pid1, pid2;
> +	double time1, time2;
[ ... ]
> +	pid1 = fork();
> +	if (!ASSERT_GE(pid1, 0, "fork"))
> +		goto cleanup;
> +	if (pid1 == 0)
> +		real_test_memcg_ops_child_work(CG_LOW_DIR,
> +					       data_file1,
> +					       time_file1,
> +					       read_times);
> +
> +	pid2 = fork();
> +	if (!ASSERT_GE(pid1, 0, "fork"))
                       ^^^^
Should this check pid2 instead of pid1? After assigning pid2 = fork(),
checking pid1 means a failed second fork() would go undetected if the
first fork() succeeded.
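
A minimal fix (sketch) would be to test the pid just assigned:

	pid2 = fork();
	if (!ASSERT_GE(pid2, 0, "fork"))
		goto cleanup;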
> +		goto cleanup;
> +	if (pid2 == 0)
> +		real_test_memcg_ops_child_work(CG_HIGH_DIR,
> +					       data_file2,
> +					       time_file2,
> +					       read_times);
[ ... ]
> +void test_memcg_ops_over_high(void)
> +{
> +	int err, map_fd;
> +	struct memcg_ops *skel;
        ^^^^^^^^^^^^^^^
The skel variable is not initialized to NULL here. If setup_cgroup()
fails early and jumps to the out label, memcg_ops__detach(skel) would
be called with an uninitialized pointer.
Also, the cleanup path calls memcg_ops__detach() but never calls
memcg_ops__destroy(). Would this leak the skeleton object and its
associated BPF resources?
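
A sketch of the usual pattern (assuming the generated skeleton helpers,
which are NULL-safe) would address both points:

	struct memcg_ops *skel = NULL;
	...
out:
	bpf_link__destroy(link);
	bpf_link__destroy(link2);
	/* frees the object, maps and skeleton links; safe when skel is NULL */
	memcg_ops__destroy(skel);
	close(low_cgroup_fd);
	cleanup_cgroup_environment();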
> +	struct bpf_map *map;
> +	size_t bss_sz;
> +	struct memcg_ops__bss *bss_data;
> +	__u32 key = 0;
> +	struct bpf_program *prog = NULL;
> +	struct bpf_link *link = NULL, *link2 = NULL;
> +	DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
> +	int high_cgroup_id, low_cgroup_fd = -1;
> +
> +	err = setup_cgroup(&high_cgroup_id, &low_cgroup_fd, NULL);
> +	if (!ASSERT_OK(err, "setup_cgroup"))
> +		goto out;
> +
> +	skel = memcg_ops__open_and_load();
[ ... ]
> +out:
> +	bpf_link__destroy(link);
> +	bpf_link__destroy(link2);
> +	memcg_ops__detach(skel);
> +	close(low_cgroup_fd);
> +	cleanup_cgroup_environment();
> +}
[ ... ]
> +void test_memcg_ops_below_low_over_high(void)
> +{
> +	int err, map_fd;
> +	struct memcg_ops *skel;
Same issue here - skel is not initialized to NULL.
[ ... ]
> +out:
> +	bpf_link__destroy(link);
> +	bpf_link__destroy(link_high);
> +	bpf_link__destroy(link_low);
> +	memcg_ops__detach(skel);
[ ... ]
> +void test_memcg_ops_below_min_over_high(void)
> +{
> +	int err, map_fd;
> +	struct memcg_ops *skel;
Same issue here as well - skel is not initialized to NULL.
[ ... ]
> +out:
> +	bpf_link__destroy(link);
> +	bpf_link__destroy(link_high);
> +	bpf_link__destroy(link_low);
> +	memcg_ops__detach(skel);
> +	close(high_cgroup_fd);
> +	close(low_cgroup_fd);
> +	cleanup_cgroup_environment();
> +}
---
AI reviewed your patch. Please fix the bugs or reply by email explaining why they are not bugs.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/21280790825